diff --git a/.changelog/unreleased/improvements/ibc-telemetry/3366-configurable-histogram-buckets.md b/.changelog/unreleased/improvements/ibc-telemetry/3366-configurable-histogram-buckets.md new file mode 100644 index 0000000000..b27829b142 --- /dev/null +++ b/.changelog/unreleased/improvements/ibc-telemetry/3366-configurable-histogram-buckets.md @@ -0,0 +1,6 @@ +- Add two new configurations for the telemetry `buckets`: + - `latency_submitted` used to specify the range and number of + buckets for the `tx_latency_submitted` metric. + - `latency_confirmed` used to specify the range and number of + buckets for the `tx_latency_confirmed` metric. + ([#3366](https://github.com/informalsystems/hermes/issues/3366)) \ No newline at end of file diff --git a/Cargo.lock b/Cargo.lock index 1ebc4a4812..464f6da00e 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1671,6 +1671,7 @@ dependencies = [ "serde_json", "tendermint", "tokio", + "tracing", ] [[package]] diff --git a/config.toml b/config.toml index 0331d6e677..949d67d2c7 100644 --- a/config.toml +++ b/config.toml @@ -110,6 +110,19 @@ host = '127.0.0.1' # by the telemetry service. Default: 3001 port = 3001 +[telemetry.buckets] +# Specify the range of the 10 histogram buckets in ms for the `tx_latency_submitted` metric. +# Default: { start = 500, end = 10000, buckets = 10 } +# The default will give the following buckets: +# [500, 2450, 4400, 6350, 8300, 10250, 12200, 14150, 16100, 18050, 20000] +# latency_submitted = { start = 500, end = 20000, buckets = 10 } + +# Specify the range of the 10 histogram buckets in ms for the `tx_latency_confirmed` metric. +# Default: { start = 1000, end = 20000, buckets = 10 } +# The default will give the following buckets: +# [1000, 3900, 6800, 9700, 12600, 15500, 18400, 21300, 24200, 27100, 30000] +# latency_confirmed = { start = 1000, end = 30000, buckets = 10 } + # A chains section includes parameters related to a chain and the full node to which # the relayer can send transactions and queries. diff --git a/crates/relayer-cli/src/commands/start.rs b/crates/relayer-cli/src/commands/start.rs index e75685cd8c..7f7d57596a 100644 --- a/crates/relayer-cli/src/commands/start.rs +++ b/crates/relayer-cli/src/commands/start.rs @@ -193,7 +193,12 @@ fn spawn_telemetry_server(config: &Config) { let _span = tracing::error_span!("telemetry").entered(); - let state = ibc_telemetry::global(); + let state = ibc_telemetry::init( + config.telemetry.buckets.latency_submitted.range.clone(), + config.telemetry.buckets.latency_submitted.buckets, + config.telemetry.buckets.latency_confirmed.range.clone(), + config.telemetry.buckets.latency_confirmed.buckets, + ); let telemetry = config.telemetry.clone(); if !telemetry.enabled { diff --git a/crates/relayer/src/config.rs b/crates/relayer/src/config.rs index f6754c34e5..92fe601cbf 100644 --- a/crates/relayer/src/config.rs +++ b/crates/relayer/src/config.rs @@ -19,6 +19,7 @@ use std::{ fs, fs::File, io::Write, + ops::Range, path::{Path, PathBuf}, }; use tendermint::block::Height as BlockHeight; @@ -204,6 +205,26 @@ pub mod default { pub fn max_grpc_decoding_size() -> Byte { Byte::from_bytes(33554432) } + + pub fn latency_submitted() -> HistogramConfig { + HistogramConfig { + range: Range { + start: 500, + end: 20000, + }, + buckets: 10, + } + } + + pub fn latency_confirmed() -> HistogramConfig { + HistogramConfig { + range: Range { + start: 1000, + end: 30000, + }, + buckets: 10, + } + } } #[derive(Clone, Debug, Default, Deserialize, Serialize)] @@ -385,6 +406,60 @@ pub struct TelemetryConfig { pub enabled: bool, pub host: String, pub port: u16, + #[serde(default = "HistogramBuckets::default")] + pub buckets: HistogramBuckets, +} + +#[derive(Clone, Debug, Deserialize, Serialize)] +pub struct HistogramBuckets { + #[serde(default = "default::latency_submitted")] + pub latency_submitted: HistogramConfig, + #[serde(default = "default::latency_confirmed")] + pub latency_confirmed: HistogramConfig, +} + +impl Default for HistogramBuckets { + fn default() -> Self { + Self { + latency_submitted: default::latency_submitted(), + latency_confirmed: default::latency_confirmed(), + } + } +} + +#[derive(Clone, Debug, Deserialize, Serialize)] +#[serde(try_from = "HistogramRangeUnchecked")] +pub struct HistogramConfig { + #[serde(flatten)] + pub range: Range, + pub buckets: u64, +} + +impl TryFrom for HistogramConfig { + type Error = String; + + fn try_from(value: HistogramRangeUnchecked) -> Result { + if value.start > value.end { + return Err(format!( + "histogram range min `{}` must be smaller or equal than max `{}`", + value.start, value.end + )); + } + Ok(Self { + range: Range { + start: value.start, + end: value.end, + }, + buckets: value.buckets, + }) + } +} + +#[derive(Clone, Debug, Deserialize, Serialize)] +pub struct HistogramRangeUnchecked { + start: u64, + end: u64, + buckets: u64, } /// Default values for the telemetry configuration. @@ -396,6 +471,7 @@ impl Default for TelemetryConfig { enabled: false, host: "127.0.0.1".to_string(), port: 3001, + buckets: HistogramBuckets::default(), } } } @@ -661,6 +737,28 @@ mod tests { dbg!(config); } + #[test] + fn parse_valid_telemetry() { + let path = concat!( + env!("CARGO_MANIFEST_DIR"), + "/tests/config/fixtures/relayer_conf_example_valid_telemetry.toml" + ); + + let config = load(path).expect("could not parse config"); + + dbg!(config); + } + + #[test] + fn parse_invalid_telemetry() { + let path = concat!( + env!("CARGO_MANIFEST_DIR"), + "/tests/config/fixtures/relayer_conf_example_invalid_telemetry.toml" + ); + + assert!(load(path).is_err()); + } + #[test] fn serialize_valid_config() { let path = concat!( diff --git a/crates/relayer/tests/config/fixtures/relayer_conf_example_invalid_telemetry.toml b/crates/relayer/tests/config/fixtures/relayer_conf_example_invalid_telemetry.toml new file mode 100644 index 0000000000..3e44c53a7d --- /dev/null +++ b/crates/relayer/tests/config/fixtures/relayer_conf_example_invalid_telemetry.toml @@ -0,0 +1,70 @@ +[global] +log_level = 'error' + +[mode] + +[mode.clients] +enabled = true +refresh = true +misbehaviour = true + +[mode.connections] +enabled = false + +[mode.channels] +enabled = false + +[mode.packets] +enabled = true +clear_interval = 100 +clear_on_start = true +tx_confirmation = true + +[telemetry] +enabled = true +host = '127.0.0.1' +port = 3001 + +[telemetry.buckets] +latency_submitted = { start = 5000, end = 1000, buckets = 10 } # start can't be smaller than end +latency_confirmed = { start = 5000, end = 10000, buckets = 10 } + +[[chains]] +id = 'chain_A' +rpc_addr = 'http://127.0.0.1:26657' +grpc_addr = 'http://127.0.0.1:9090' +event_source = { mode = 'push', url = 'ws://localhost:26657/websocket' } +rpc_timeout = '10s' +account_prefix = 'cosmos' +key_name = 'testkey' +store_prefix = 'ibc' +max_gas = 200000 +gas_price = { price = 0.001, denom = 'stake' } +max_msg_num = 4 +max_tx_size = 1048576 +clock_drift = '5s' +trusting_period = '14days' +trust_threshold = { numerator = '1', denominator = '3' } +address_type = { derivation = 'cosmos' } + +[chains.packet_filter] +policy = 'allow' +list = [ + ['ica*', '*'], + ['transfer', 'channel-0'], +] + +[[chains]] +id = 'chain_B' +rpc_addr = 'http://127.0.0.1:26557' +grpc_addr = 'http://127.0.0.1:9090' +event_source = { mode = 'push', url = 'ws://localhost:26557/websocket' } +rpc_timeout = '10s' +account_prefix = 'cosmos' +key_name = 'testkey' +store_prefix = 'ibc' +gas_price = { price = 0.001, denom = 'stake' } +clock_drift = '5s' +trusting_period = '14days' +trust_threshold = { numerator = '1', denominator = '3' } +address_type = { derivation = 'ethermint', proto_type = { pk_type = '/injective.crypto.v1beta1.ethsecp256k1.PubKey' } } diff --git a/crates/relayer/tests/config/fixtures/relayer_conf_example_valid_telemetry.toml b/crates/relayer/tests/config/fixtures/relayer_conf_example_valid_telemetry.toml new file mode 100644 index 0000000000..ca0ae487ec --- /dev/null +++ b/crates/relayer/tests/config/fixtures/relayer_conf_example_valid_telemetry.toml @@ -0,0 +1,70 @@ +[global] +log_level = 'error' + +[mode] + +[mode.clients] +enabled = true +refresh = true +misbehaviour = true + +[mode.connections] +enabled = false + +[mode.channels] +enabled = false + +[mode.packets] +enabled = true +clear_interval = 100 +clear_on_start = true +tx_confirmation = true + +[telemetry] +enabled = true +host = '127.0.0.1' +port = 3001 + +[telemetry.buckets] +latency_submitted = { start = 5000, end = 10000, buckets = 10 } +latency_confirmed = { start = 5000, end = 10000, buckets = 10 } + +[[chains]] +id = 'chain_A' +rpc_addr = 'http://127.0.0.1:26657' +grpc_addr = 'http://127.0.0.1:9090' +event_source = { mode = 'push', url = 'ws://localhost:26657/websocket' } +rpc_timeout = '10s' +account_prefix = 'cosmos' +key_name = 'testkey' +store_prefix = 'ibc' +max_gas = 200000 +gas_price = { price = 0.001, denom = 'stake' } +max_msg_num = 4 +max_tx_size = 1048576 +clock_drift = '5s' +trusting_period = '14days' +trust_threshold = { numerator = '1', denominator = '3' } +address_type = { derivation = 'cosmos' } + +[chains.packet_filter] +policy = 'allow' +list = [ + ['ica*', '*'], + ['transfer', 'channel-0'], +] + +[[chains]] +id = 'chain_B' +rpc_addr = 'http://127.0.0.1:26557' +grpc_addr = 'http://127.0.0.1:9090' +event_source = { mode = 'push', url = 'ws://localhost:26557/websocket' } +rpc_timeout = '10s' +account_prefix = 'cosmos' +key_name = 'testkey' +store_prefix = 'ibc' +gas_price = { price = 0.001, denom = 'stake' } +clock_drift = '5s' +trusting_period = '14days' +trust_threshold = { numerator = '1', denominator = '3' } +address_type = { derivation = 'ethermint', proto_type = { pk_type = '/injective.crypto.v1beta1.ethsecp256k1.PubKey' } } diff --git a/crates/telemetry/Cargo.toml b/crates/telemetry/Cargo.toml index ca00b5b01f..7c02ccbd8e 100644 --- a/crates/telemetry/Cargo.toml +++ b/crates/telemetry/Cargo.toml @@ -25,6 +25,7 @@ serde_json = "1.0.94" serde = "1.0.164" axum = "0.6.18" tokio = "1.26.0" +tracing = "0.1.36" [dependencies.tendermint] version = "0.32.0" diff --git a/crates/telemetry/src/lib.rs b/crates/telemetry/src/lib.rs index 40fa56bfce..aa2c8498ce 100644 --- a/crates/telemetry/src/lib.rs +++ b/crates/telemetry/src/lib.rs @@ -5,21 +5,71 @@ pub mod state; use std::error::Error; use std::net::{SocketAddr, ToSocketAddrs}; +use std::ops::Range; use std::sync::Arc; -use once_cell::sync::Lazy; +use once_cell::sync::OnceCell; use tokio::task::JoinHandle; +use tracing::{debug, warn}; pub use crate::state::TelemetryState; -pub fn new_state() -> Arc { - Arc::new(TelemetryState::default()) +pub fn new_state( + tx_latency_submitted_range: Range, + tx_latency_submitted_buckets: u64, + tx_latency_confirmed_range: Range, + tx_latency_confirmed_buckets: u64, +) -> Arc { + Arc::new(TelemetryState::new( + tx_latency_submitted_range, + tx_latency_submitted_buckets, + tx_latency_confirmed_range, + tx_latency_confirmed_buckets, + )) } -static GLOBAL_STATE: Lazy> = Lazy::new(new_state); +static GLOBAL_STATE: OnceCell> = OnceCell::new(); + +pub fn init( + tx_latency_submitted_range: Range, + tx_latency_submitted_buckets: u64, + tx_latency_confirmed_range: Range, + tx_latency_confirmed_buckets: u64, +) -> &'static Arc { + let new_state = new_state( + tx_latency_submitted_range, + tx_latency_submitted_buckets, + tx_latency_confirmed_range, + tx_latency_confirmed_buckets, + ); + match GLOBAL_STATE.set(new_state) { + Ok(_) => debug!("initialised telemetry global state"), + Err(_) => debug!("telemetry global state was already set"), + } + GLOBAL_STATE.get().unwrap() +} pub fn global() -> &'static Arc { - &GLOBAL_STATE + match GLOBAL_STATE.get() { + Some(state) => state, + None => { + warn!( + "global telemetry state not set, will initialize it using default histogram ranges" + ); + init( + Range { + start: 500, + end: 10000, + }, + 10, + Range { + start: 1000, + end: 20000, + }, + 10, + ) + } + } } pub type BoxError = Box; diff --git a/crates/telemetry/src/state.rs b/crates/telemetry/src/state.rs index dc959a6bde..15a51f44dc 100644 --- a/crates/telemetry/src/state.rs +++ b/crates/telemetry/src/state.rs @@ -1,5 +1,6 @@ use core::fmt::{Display, Error as FmtError, Formatter}; use std::{ + ops::Range, sync::Mutex, time::{Duration, Instant}, }; @@ -195,6 +196,175 @@ pub struct TelemetryState { } impl TelemetryState { + pub fn new( + tx_latency_submitted_range: Range, + tx_latency_submitted_buckets: u64, + tx_latency_confirmed_range: Range, + tx_latency_confirmed_buckets: u64, + ) -> Self { + use opentelemetry::sdk::export::metrics::aggregation; + use opentelemetry::sdk::metrics::{controllers, processors}; + + let controller = controllers::basic(processors::factory( + CustomAggregatorSelector::new( + tx_latency_submitted_range, + tx_latency_submitted_buckets, + tx_latency_confirmed_range, + tx_latency_confirmed_buckets, + ), + aggregation::cumulative_temporality_selector(), + )) + .build(); + + let exporter = opentelemetry_prometheus::ExporterBuilder::new(controller).init(); + + let meter = global::meter("hermes"); + + Self { + exporter, + + workers: meter + .i64_up_down_counter("workers") + .with_description("Number of workers") + .init(), + + client_updates_submitted: meter + .u64_counter("client_updates_submitted") + .with_description("Number of client update messages submitted") + .init(), + + client_misbehaviours_submitted: meter + .u64_counter("client_misbehaviours_submitted") + .with_description("Number of misbehaviours detected and submitted") + .init(), + + receive_packets_confirmed: meter + .u64_counter("receive_packets_confirmed") + .with_description("Number of confirmed receive packets. Available if relayer runs with Tx confirmation enabled") + .init(), + + acknowledgment_packets_confirmed: meter + .u64_counter("acknowledgment_packets_confirmed") + .with_description("Number of confirmed acknowledgment packets. Available if relayer runs with Tx confirmation enabled") + .init(), + + timeout_packets_confirmed: meter + .u64_counter("timeout_packets_confirmed") + .with_description("Number of confirmed timeout packets. Available if relayer runs with Tx confirmation enabled") + .init(), + + queries: meter + .u64_counter("queries") + .with_description( + "Number of queries submitted by Hermes", + ) + .init(), + + queries_cache_hits: meter + .u64_counter("queries_cache_hits") + .with_description("Number of cache hits for queries submitted by Hermes") + .init(), + + ws_reconnect: meter + .u64_counter("ws_reconnect") + .with_description("Number of times Hermes reconnected to the websocket endpoint") + .init(), + + ws_events: meter + .u64_counter("ws_events") + .with_description("How many IBC events did Hermes receive via the websocket subscription") + .init(), + + messages_submitted: meter + .u64_counter("messages_submitted") + .with_description("Number of messages submitted to a specific chain") + .init(), + + wallet_balance: meter + .f64_observable_gauge("wallet_balance") + .with_description("The balance of each wallet Hermes uses per chain. Please note that when converting the balance to f64 a loss in precision might be introduced in the displayed value") + .init(), + + send_packet_events: meter + .u64_counter("send_packet_events") + .with_description("Number of SendPacket events received") + .init(), + + acknowledgement_events: meter + .u64_counter("acknowledgement_events") + .with_description("Number of WriteAcknowledgement events received") + .init(), + + timeout_events: meter + .u64_counter("timeout_events") + .with_description("Number of TimeoutPacket events received") + .init(), + + cleared_send_packet_events: meter + .u64_counter("cleared_send_packet_events") + .with_description("Number of SendPacket events received during the initial and periodic clearing") + .init(), + + cleared_acknowledgment_events: meter + .u64_counter("cleared_acknowledgment_events") + .with_description("Number of WriteAcknowledgement events received during the initial and periodic clearing") + .init(), + + tx_latency_submitted: meter + .u64_observable_gauge("tx_latency_submitted") + .with_unit(Unit::new("milliseconds")) + .with_description("The latency for all transactions submitted to a specific chain, \ + i.e. the difference between the moment when Hermes received a batch of events \ + and when it submitted the corresponding transaction(s). Milliseconds.") + .init(), + + tx_latency_confirmed: meter + .u64_observable_gauge("tx_latency_confirmed") + .with_unit(Unit::new("milliseconds")) + .with_description("The latency for all transactions submitted & confirmed to a specific chain, \ + i.e. the difference between the moment when Hermes received a batch of events \ + until the corresponding transaction(s) were confirmed. Milliseconds.") + .init(), + + in_flight_events: moka::sync::Cache::builder() + .time_to_live(Duration::from_secs(60 * 60)) // Remove entries after 1 hour + .time_to_idle(Duration::from_secs(30 * 60)) // Remove entries if they have been idle for 30 minutes + .build(), + + backlogs: DashMap::new(), + + backlog_oldest_sequence: meter + .u64_observable_gauge("backlog_oldest_sequence") + .with_description("Sequence number of the oldest SendPacket event in the backlog") + .init(), + + backlog_oldest_timestamp: meter + .u64_observable_gauge("backlog_oldest_timestamp") + .with_unit(Unit::new("seconds")) + .with_description("Local timestamp for the oldest SendPacket event in the backlog") + .init(), + + backlog_size: meter + .u64_observable_gauge("backlog_size") + .with_description("Total number of SendPacket events in the backlog") + .init(), + + fee_amounts: meter + .u64_counter("ics29_fee_amounts") + .with_description("Total amount received from ICS29 fees") + .init(), + + visible_fee_addresses: DashSet::new(), + + cached_fees: Mutex::new(Vec::new()), + + period_fees: meter + .u64_observable_gauge("ics29_period_fees") + .with_description("Amount of ICS29 fees rewarded over the past 7 days") + .init(), + } + } + /// Gather the metrics for export pub fn gather(&self) -> Vec { self.exporter.registry().gather() @@ -881,7 +1051,51 @@ use opentelemetry::sdk::metrics::aggregators::{histogram, last_value, sum}; use opentelemetry::sdk::metrics::sdk_api::Descriptor; #[derive(Debug)] -struct CustomAggregatorSelector; +struct CustomAggregatorSelector { + tx_latency_submitted_range: Range, + tx_latency_submitted_buckets: u64, + tx_latency_confirmed_range: Range, + tx_latency_confirmed_buckets: u64, +} + +impl CustomAggregatorSelector { + pub fn new( + tx_latency_submitted_range: Range, + tx_latency_submitted_buckets: u64, + tx_latency_confirmed_range: Range, + tx_latency_confirmed_buckets: u64, + ) -> Self { + Self { + tx_latency_submitted_range, + tx_latency_submitted_buckets, + tx_latency_confirmed_range, + tx_latency_confirmed_buckets, + } + } + + pub fn get_submitted_range(&self) -> Vec { + build_histogram_buckets( + self.tx_latency_submitted_range.start, + self.tx_latency_submitted_range.end, + self.tx_latency_submitted_buckets, + ) + } + + pub fn get_confirmed_range(&self) -> Vec { + build_histogram_buckets( + self.tx_latency_confirmed_range.start, + self.tx_latency_confirmed_range.end, + self.tx_latency_confirmed_buckets, + ) + } +} + +fn build_histogram_buckets(start: u64, end: u64, buckets: u64) -> Vec { + let step = (end - start) / buckets; + (0..=buckets) + .map(|i| (start + i * step) as f64) + .collect::>() +} impl AggregatorSelector for CustomAggregatorSelector { fn aggregator_for(&self, descriptor: &Descriptor) -> Option> { @@ -893,175 +1107,10 @@ impl AggregatorSelector for CustomAggregatorSelector { // Prometheus' supports only collector for histogram, sum, and last value aggregators. // https://docs.rs/opentelemetry-prometheus/0.10.0/src/opentelemetry_prometheus/lib.rs.html#411-418 // TODO: Once quantile sketches are supported, replace histograms with that. - "tx_latency_submitted" => Some(Arc::new(histogram(&[ - 200.0, 500.0, 1000.0, 2000.0, 5000.0, 10000.0, - ]))), - "tx_latency_confirmed" => Some(Arc::new(histogram(&[ - 1000.0, 5000.0, 9000.0, 13000.0, 17000.0, 20000.0, - ]))), + "tx_latency_submitted" => Some(Arc::new(histogram(&self.get_submitted_range()))), + "tx_latency_confirmed" => Some(Arc::new(histogram(&self.get_confirmed_range()))), "ics29_period_fees" => Some(Arc::new(last_value())), _ => Some(Arc::new(sum())), } } } - -impl Default for TelemetryState { - fn default() -> Self { - use opentelemetry::sdk::export::metrics::aggregation; - use opentelemetry::sdk::metrics::{controllers, processors}; - - let controller = controllers::basic(processors::factory( - CustomAggregatorSelector, - aggregation::cumulative_temporality_selector(), - )) - .build(); - - let exporter = opentelemetry_prometheus::ExporterBuilder::new(controller).init(); - - let meter = global::meter("hermes"); - - Self { - exporter, - - workers: meter - .i64_up_down_counter("workers") - .with_description("Number of workers") - .init(), - - client_updates_submitted: meter - .u64_counter("client_updates_submitted") - .with_description("Number of client update messages submitted") - .init(), - - client_misbehaviours_submitted: meter - .u64_counter("client_misbehaviours_submitted") - .with_description("Number of misbehaviours detected and submitted") - .init(), - - receive_packets_confirmed: meter - .u64_counter("receive_packets_confirmed") - .with_description("Number of confirmed receive packets. Available if relayer runs with Tx confirmation enabled") - .init(), - - acknowledgment_packets_confirmed: meter - .u64_counter("acknowledgment_packets_confirmed") - .with_description("Number of confirmed acknowledgment packets. Available if relayer runs with Tx confirmation enabled") - .init(), - - timeout_packets_confirmed: meter - .u64_counter("timeout_packets_confirmed") - .with_description("Number of confirmed timeout packets. Available if relayer runs with Tx confirmation enabled") - .init(), - - queries: meter - .u64_counter("queries") - .with_description( - "Number of queries submitted by Hermes", - ) - .init(), - - queries_cache_hits: meter - .u64_counter("queries_cache_hits") - .with_description("Number of cache hits for queries submitted by Hermes") - .init(), - - ws_reconnect: meter - .u64_counter("ws_reconnect") - .with_description("Number of times Hermes reconnected to the websocket endpoint") - .init(), - - ws_events: meter - .u64_counter("ws_events") - .with_description("How many IBC events did Hermes receive via the websocket subscription") - .init(), - - messages_submitted: meter - .u64_counter("messages_submitted") - .with_description("Number of messages submitted to a specific chain") - .init(), - - wallet_balance: meter - .f64_observable_gauge("wallet_balance") - .with_description("The balance of each wallet Hermes uses per chain. Please note that when converting the balance to f64 a loss in precision might be introduced in the displayed value") - .init(), - - send_packet_events: meter - .u64_counter("send_packet_events") - .with_description("Number of SendPacket events received") - .init(), - - acknowledgement_events: meter - .u64_counter("acknowledgement_events") - .with_description("Number of WriteAcknowledgement events received") - .init(), - - timeout_events: meter - .u64_counter("timeout_events") - .with_description("Number of TimeoutPacket events received") - .init(), - - cleared_send_packet_events: meter - .u64_counter("cleared_send_packet_events") - .with_description("Number of SendPacket events received during the initial and periodic clearing") - .init(), - - cleared_acknowledgment_events: meter - .u64_counter("cleared_acknowledgment_events") - .with_description("Number of WriteAcknowledgement events received during the initial and periodic clearing") - .init(), - - tx_latency_submitted: meter - .u64_observable_gauge("tx_latency_submitted") - .with_unit(Unit::new("milliseconds")) - .with_description("The latency for all transactions submitted to a specific chain, \ - i.e. the difference between the moment when Hermes received a batch of events \ - and when it submitted the corresponding transaction(s). Milliseconds.") - .init(), - - tx_latency_confirmed: meter - .u64_observable_gauge("tx_latency_confirmed") - .with_unit(Unit::new("milliseconds")) - .with_description("The latency for all transactions submitted & confirmed to a specific chain, \ - i.e. the difference between the moment when Hermes received a batch of events \ - until the corresponding transaction(s) were confirmed. Milliseconds.") - .init(), - - in_flight_events: moka::sync::Cache::builder() - .time_to_live(Duration::from_secs(60 * 60)) // Remove entries after 1 hour - .time_to_idle(Duration::from_secs(30 * 60)) // Remove entries if they have been idle for 30 minutes - .build(), - - backlogs: DashMap::new(), - - backlog_oldest_sequence: meter - .u64_observable_gauge("backlog_oldest_sequence") - .with_description("Sequence number of the oldest SendPacket event in the backlog") - .init(), - - backlog_oldest_timestamp: meter - .u64_observable_gauge("backlog_oldest_timestamp") - .with_unit(Unit::new("seconds")) - .with_description("Local timestamp for the oldest SendPacket event in the backlog") - .init(), - - backlog_size: meter - .u64_observable_gauge("backlog_size") - .with_description("Total number of SendPacket events in the backlog") - .init(), - - fee_amounts: meter - .u64_counter("ics29_fee_amounts") - .with_description("Total amount received from ICS29 fees") - .init(), - - visible_fee_addresses: DashSet::new(), - - cached_fees: Mutex::new(Vec::new()), - - period_fees: meter - .u64_observable_gauge("ics29_period_fees") - .with_description("Amount of ICS29 fees rewarded over the past 7 days") - .init(), - } - } -} diff --git a/guide/src/assets/grafana_template.json b/guide/src/assets/grafana_template.json index 67564590db..2d7e91454a 100644 --- a/guide/src/assets/grafana_template.json +++ b/guide/src/assets/grafana_template.json @@ -363,7 +363,7 @@ "uid": "${DS_PROMETHEUS}" }, "editorMode": "builder", - "expr": "delta(client_updates_submitted{job=\"hermes\"}[1m])", + "expr": "delta(client_updates_submitted_total{job=\"hermes\"}[1m])", "legendFormat": "{{src_chain}} -> {{dst_chain}}:{{client}} ({{instance}})", "range": true, "refId": "A" @@ -453,13 +453,13 @@ "uid": "${DS_PROMETHEUS}" }, "editorMode": "builder", - "expr": "delta(total_messages_submitted{job=\"hermes\"}[1m])", + "expr": "delta(messages_submitted_total{job=\"hermes\"}[1m])", "legendFormat": "{{instance}}:{{chain}}", "range": true, "refId": "A" } ], - "title": "total_messages_submitted over 1 minute", + "title": "messages_submitted over 1 minute", "type": "timeseries" }, { @@ -1227,7 +1227,7 @@ "uid": "${DS_PROMETHEUS}" }, "editorMode": "builder", - "expr": "delta(receive_packets_confirmed{job=\"hermes\"}[1m])", + "expr": "delta(receive_packets_confirmed_total{job=\"hermes\"}[1m])", "legendFormat": "{{src_chain}}:{{src_channel}}/{{src_port}} ({{instance}})", "range": true, "refId": "A" @@ -1317,7 +1317,7 @@ "uid": "${DS_PROMETHEUS}" }, "editorMode": "builder", - "expr": "delta(timeout_packets_confirmed{job=\"hermes\"}[1m])", + "expr": "delta(timeout_packets_confirmed_total{job=\"hermes\"}[1m])", "legendFormat": "{{src_chain}}:{{src_channel}}/{{src_port}} ({{instance}})", "range": true, "refId": "A" @@ -1407,7 +1407,7 @@ "uid": "${DS_PROMETHEUS}" }, "editorMode": "builder", - "expr": "delta(acknowledgment_packets_confirmed{job=\"hermes\"}[1m])", + "expr": "delta(acknowledgment_packets_confirmed_total{job=\"hermes\"}[1m])", "legendFormat": "{{src_chain}}:{{src_channel}}/{{src_port}} ({{instance}})", "range": true, "refId": "A" @@ -1520,7 +1520,7 @@ "uid": "${DS_PROMETHEUS}" }, "editorMode": "builder", - "expr": "delta(send_packet_events{job=\"hermes\"}[1m])", + "expr": "delta(send_packet_events_total{job=\"hermes\"}[1m])", "interval": "", "legendFormat": "{{chain}}:{{channel}}/{{port}}-{{counterparty}} ({{instance}})", "range": true, @@ -1611,7 +1611,7 @@ "uid": "${DS_PROMETHEUS}" }, "editorMode": "builder", - "expr": "delta(acknowledgement_events{job=\"hermes\"}[1m])", + "expr": "delta(acknowledgement_events_total{job=\"hermes\"}[1m])", "legendFormat": "{{chain}}:{{channel}}/{{port}}-{{counterparty}} ({{instance}})", "range": true, "refId": "A" @@ -1701,7 +1701,7 @@ "uid": "${DS_PROMETHEUS}" }, "editorMode": "builder", - "expr": "delta(timeout_events{job=\"hermes\"}[1m])", + "expr": "delta(timeout_events_total{job=\"hermes\"}[1m])", "legendFormat": "{{chain}}:{{channel}}/{{port}}-{{counterparty}} ({{instance}})", "range": true, "refId": "A" @@ -1791,7 +1791,7 @@ "uid": "${DS_PROMETHEUS}" }, "editorMode": "builder", - "expr": "delta(ws_events{job=\"hermes\"}[1m])", + "expr": "delta(ws_events_total{job=\"hermes\"}[1m])", "interval": "", "legendFormat": "{{instance}}:{{chain}}", "range": true, @@ -1882,7 +1882,7 @@ "uid": "${DS_PROMETHEUS}" }, "editorMode": "builder", - "expr": "delta(ws_reconnect{job=\"hermes\"}[1m])", + "expr": "delta(ws_reconnect_total{job=\"hermes\"}[1m])", "legendFormat": "{{instance}}:{{chain}}", "range": true, "refId": "A" @@ -1972,7 +1972,7 @@ "uid": "${DS_PROMETHEUS}" }, "editorMode": "code", - "expr": "sum(delta(queries{job=\"hermes\"}[1m])) by(instance,chain)", + "expr": "sum(delta(queries_total{job=\"hermes\"}[1m])) by(instance,chain)", "legendFormat": "{{instance}}:{{chain}}", "range": true, "refId": "A" @@ -2078,7 +2078,7 @@ "uid": "${DS_PROMETHEUS}" }, "editorMode": "builder", - "expr": "delta(cleared_acknowledgment_events{job=\"hermes\"}[1m])", + "expr": "delta(cleared_acknowledgment_events_total{job=\"hermes\"}[1m])", "legendFormat": "{{chain}}:{{channel}}/{{port}}-{{counterparty}} ({{instance}})", "range": true, "refId": "A" @@ -2168,7 +2168,7 @@ "uid": "${DS_PROMETHEUS}" }, "editorMode": "builder", - "expr": "delta(cleared_send_packet_events{job=\"hermes\"}[1m])", + "expr": "delta(cleared_send_packet_events_total{job=\"hermes\"}[1m])", "legendFormat": "{{chain}}:{{channel}}/{{port}}-{{counterparty}} ({{instance}})", "range": true, "refId": "A" @@ -2349,7 +2349,7 @@ "uid": "${DS_PROMETHEUS}" }, "editorMode": "builder", - "expr": "sum by(chain, instance) (delta(queries_cache_hits{job=\"hermes\"}[1m]))", + "expr": "sum by(chain, instance) (delta(queries_cache_hits_total{job=\"hermes\"}[1m]))", "legendFormat": "{{instance}}:{{chain}}", "range": true, "refId": "A" @@ -2358,12 +2358,144 @@ "title": "queries_cache_hits over 1 minute", "type": "timeseries" }, + { + "description": "Indicates the latency rate for all transactions submitted from ibc-0", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "none" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 98 + }, + "id": 63, + "options": { + "displayMode": "gradient", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showUnfilled": true + }, + "pluginVersion": "9.0.7", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "editorMode": "builder", + "exemplar": true, + "expr": "increase(tx_latency_submitted_bucket{chain=\"ibc-0\"}[$__range])", + "format": "heatmap", + "instant": false, + "interval": "", + "legendFormat": "{{le}}", + "range": true, + "refId": "A" + } + ], + "title": "tx_latency_submitted histogram ibc-0", + "transformations": [], + "type": "bargauge" + }, + { + "description": "Indicates the latency rate for all transactions confirmed from ibc-0", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "none" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 98 + }, + "id": 64, + "options": { + "displayMode": "gradient", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showUnfilled": true + }, + "pluginVersion": "9.0.7", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "editorMode": "builder", + "exemplar": true, + "expr": "increase(tx_latency_confirmed_bucket{chain=\"ibc-0\"}[$__range])", + "format": "heatmap", + "instant": false, + "interval": "", + "legendFormat": "{{le}}", + "range": true, + "refId": "A" + } + ], + "title": "tx_latency_confirmed_bucket histogram ibc-0", + "transformations": [], + "type": "bargauge" + }, { "gridPos": { "h": 2, "w": 24, "x": 0, - "y": 98 + "y": 106 }, "id": 58, "options": { @@ -2454,7 +2586,7 @@ "uid": "${DS_PROMETHEUS}" }, "editorMode": "builder", - "expr": "delta(client_misbehaviours_submitted[5m])", + "expr": "delta(client_misbehaviours_submitted_total[5m])", "legendFormat": "{{src_chain}}-{{dst_chain}}:{{client}} ({{instance}})", "range": true, "refId": "A" diff --git a/guide/src/documentation/telemetry/index.md b/guide/src/documentation/telemetry/index.md index bb67d4f72e..b0d8f926d1 100644 --- a/guide/src/documentation/telemetry/index.md +++ b/guide/src/documentation/telemetry/index.md @@ -13,9 +13,12 @@ The telemetry service is not active by default, and must be enabled in Hermes' c ```toml [telemetry] -enabled = true # default = false -host = '127.0.0.1' # default value -port = 3001 # default value +enabled = true # default = false +host = '127.0.0.1' # default value +port = 3001 # default value +[telemetry.buckets] # default value +latency_submitted = { start = 5000, end = 10000, buckets = 10 } # default value +latency_confirmed = { start = 5000, end = 10000, buckets = 10 } # default value ``` Please see the [relevant section for *Configuration*](../configuration/index.md) for more general details about Hermes configuration options. diff --git a/guide/src/documentation/telemetry/operators.md b/guide/src/documentation/telemetry/operators.md index 04ed8c37f5..e98d903f05 100644 --- a/guide/src/documentation/telemetry/operators.md +++ b/guide/src/documentation/telemetry/operators.md @@ -59,6 +59,19 @@ into a full node's mempool. **A note on wallet balances.** For the `wallet_balance`, we convert from a String into a f64, which can lead to a loss in precision in the displayed value. +**latency histogram** +The `tx_latency_submitted` and `tx_latency_confirmed` are displayed with histogram buckets which each contain the number of values less or equal to their bucket label. This means that if there are 5 buckets with label `500`, `2000`, `3000`, `4000` and `5000` and 2 `tx_latency_submitted` were recorded of respectively `1800ms` and `3100ms` then the `tx_latency_submitted` will look like this: + +```text +tx_latency_submitted_bucket{chain="ibc-0",channel="channel-0",counterparty="ibc-1",port="transfer",service_name="unknown_service",otel_scope_name="hermes",otel_scope_version="",le="500"} 0 +tx_latency_submitted_bucket{chain="ibc-0",channel="channel-0",counterparty="ibc-1",port="transfer",service_name="unknown_service",otel_scope_name="hermes",otel_scope_version="",le="2000"} 1 +tx_latency_submitted_bucket{chain="ibc-0",channel="channel-0",counterparty="ibc-1",port="transfer",service_name="unknown_service",otel_scope_name="hermes",otel_scope_version="",le="3000"} 1 +tx_latency_submitted_bucket{chain="ibc-0",channel="channel-0",counterparty="ibc-1",port="transfer",service_name="unknown_service",otel_scope_name="hermes",otel_scope_version="",le="4000"} 2 +tx_latency_submitted_bucket{chain="ibc-0",channel="channel-0",counterparty="ibc-1",port="transfer",service_name="unknown_service",otel_scope_name="hermes",otel_scope_version="",le="5000"} 2 +tx_latency_submitted_bucket{chain="ibc-1",channel="channel-0",counterparty="ibc-0",port="transfer",service_name="unknown_service",otel_scope_name="hermes",otel_scope_version="",le="+Inf"} 2 +``` +The range of the buckets can be configured using the `latency_submitted` and `latency_confirmed` seen [here](./index.md) + ## Are Hermes transactions successful?