From 9875f783ca3ce5032e73f2c93ea24b63f1eaf6c1 Mon Sep 17 00:00:00 2001 From: Luca Joss Date: Wed, 31 May 2023 11:14:40 +0200 Subject: [PATCH 01/13] Add new histogram range configuration for telemetry --- config.toml | 30 ++++++++ crates/relayer/src/config.rs | 62 +++++++++++++++++ ...elayer_conf_example_invalid_telemetry.toml | 68 +++++++++++++++++++ .../relayer_conf_example_valid_telemetry.toml | 68 +++++++++++++++++++ 4 files changed, 228 insertions(+) create mode 100644 crates/relayer/tests/config/fixtures/relayer_conf_example_invalid_telemetry.toml create mode 100644 crates/relayer/tests/config/fixtures/relayer_conf_example_valid_telemetry.toml diff --git a/config.toml b/config.toml index 079a72d25c..eb623d2e00 100644 --- a/config.toml +++ b/config.toml @@ -110,6 +110,36 @@ host = '127.0.0.1' # by the telemetry service. Default: 3001 port = 3001 +# Specify the range of the 10 histogram buckets in ms for the `tx_latency_submitted` metric. +# Default: { min = 500, max = 10000 } which will result in the following buckets: +# tx_latency_submitted_bucket{...,le="950"} 0 +# tx_latency_submitted_bucket{...,le="1900"} 1 +# tx_latency_submitted_bucket{...,le="2850"} 1 +# tx_latency_submitted_bucket{...,le="3800"} 1 +# tx_latency_submitted_bucket{...,le="4750"} 1 +# tx_latency_submitted_bucket{...,le="5700"} 1 +# tx_latency_submitted_bucket{...,le="6650"} 1 +# tx_latency_submitted_bucket{...,le="7600"} 1 +# tx_latency_submitted_bucket{...,le="8550"} 1 +# tx_latency_submitted_bucket{...,le="9500"} 1 +# tx_latency_submitted_bucket{...,le="+Inf"} 1 +# submitted_range = { min = 5000, max = 10000 } + +# Specify the range of the 10 histogram buckets in ms for the `tx_latency_confirmed` metric. +# Default: { min = 1000, max = 20000 } which will result in the following buckets: +# tx_latency_confirmed_bucket{...,le="1900"} 0 +# tx_latency_confirmed_bucket{...,le="3800"} 0 +# tx_latency_confirmed_bucket{...,le="5700"} 0 +# tx_latency_confirmed_bucket{...,le="7600"} 0 +# tx_latency_confirmed_bucket{...,le="9500"} 0 +# tx_latency_confirmed_bucket{...,le="11400"} 0 +# tx_latency_confirmed_bucket{...,le="13300"} 0 +# tx_latency_confirmed_bucket{...,le="15200"} 0 +# tx_latency_confirmed_bucket{...,le="17100"} 0 +# tx_latency_confirmed_bucket{...,le="19000"} 0 +# tx_latency_confirmed_bucket{...,le="+Inf"} 1 +confirmed_range = { min = 1000, max = 20000 } + # A chains section includes parameters related to a chain and the full node to which # the relayer can send transactions and queries. diff --git a/crates/relayer/src/config.rs b/crates/relayer/src/config.rs index 78379362fb..d0547a2320 100644 --- a/crates/relayer/src/config.rs +++ b/crates/relayer/src/config.rs @@ -381,6 +381,38 @@ pub struct TelemetryConfig { pub enabled: bool, pub host: String, pub port: u16, + pub submitted_range: HistogramRange, + pub confirmed_range: HistogramRange, +} + +#[derive(Clone, Debug, Deserialize, Serialize)] +#[serde(try_from = "HistogramRangeUnchecked")] +pub struct HistogramRange { + pub min: u64, + pub max: u64, +} + +impl TryFrom for HistogramRange { + type Error = String; + + fn try_from(value: HistogramRangeUnchecked) -> Result { + if value.min > value.max { + return Err(format!( + "histogram range min `{}` must be smaller or equal than max `{}`", + value.min, value.max + )); + } + Ok(Self { + min: value.min, + max: value.max, + }) + } +} + +#[derive(Clone, Debug, Deserialize, Serialize)] +pub struct HistogramRangeUnchecked { + min: u64, + max: u64, } /// Default values for the telemetry configuration. @@ -392,6 +424,14 @@ impl Default for TelemetryConfig { enabled: false, host: "127.0.0.1".to_string(), port: 3001, + submitted_range: HistogramRange { + min: 500, + max: 10000, + }, + confirmed_range: HistogramRange { + min: 1000, + max: 20000, + }, } } } @@ -623,6 +663,28 @@ mod tests { dbg!(config); } + #[test] + fn parse_valid_telemetry() { + let path = concat!( + env!("CARGO_MANIFEST_DIR"), + "/tests/config/fixtures/relayer_conf_example_valid_telemetry.toml" + ); + + let config = load(path).expect("could not parse config"); + + dbg!(config); + } + + #[test] + fn parse_invalid_telemetry() { + let path = concat!( + env!("CARGO_MANIFEST_DIR"), + "/tests/config/fixtures/relayer_conf_example_invalid_telemetry.toml" + ); + + assert!(load(path).is_err()); + } + #[test] fn serialize_valid_config() { let path = concat!( diff --git a/crates/relayer/tests/config/fixtures/relayer_conf_example_invalid_telemetry.toml b/crates/relayer/tests/config/fixtures/relayer_conf_example_invalid_telemetry.toml new file mode 100644 index 0000000000..8c6de39610 --- /dev/null +++ b/crates/relayer/tests/config/fixtures/relayer_conf_example_invalid_telemetry.toml @@ -0,0 +1,68 @@ +[global] +log_level = 'error' + +[mode] + +[mode.clients] +enabled = true +refresh = true +misbehaviour = true + +[mode.connections] +enabled = false + +[mode.channels] +enabled = false + +[mode.packets] +enabled = true +clear_interval = 100 +clear_on_start = true +tx_confirmation = true + +[telemetry] +enabled = true +host = '127.0.0.1' +port = 3001 +submitted_range = { min = 10000, max = 5000 } +confirmed_range = { min = 10000, max = 20000 } + +[[chains]] +id = 'chain_A' +rpc_addr = 'http://127.0.0.1:26657' +grpc_addr = 'http://127.0.0.1:9090' +websocket_addr = 'ws://localhost:26657/websocket' +rpc_timeout = '10s' +account_prefix = 'cosmos' +key_name = 'testkey' +store_prefix = 'ibc' +max_gas = 200000 +gas_price = { price = 0.001, denom = 'stake' } +max_msg_num = 4 +max_tx_size = 1048576 +clock_drift = '5s' +trusting_period = '14days' +trust_threshold = { numerator = '1', denominator = '3' } +address_type = { derivation = 'cosmos' } + +[chains.packet_filter] +policy = 'allow' +list = [ + ['ica*', '*'], + ['transfer', 'channel-0'], +] + +[[chains]] +id = 'chain_B' +rpc_addr = 'http://127.0.0.1:26557' +grpc_addr = 'http://127.0.0.1:9090' +websocket_addr = 'ws://localhost:26557/websocket' +rpc_timeout = '10s' +account_prefix = 'cosmos' +key_name = 'testkey' +store_prefix = 'ibc' +gas_price = { price = 0.001, denom = 'stake' } +clock_drift = '5s' +trusting_period = '14days' +trust_threshold = { numerator = '1', denominator = '3' } +address_type = { derivation = 'ethermint', proto_type = { pk_type = '/injective.crypto.v1beta1.ethsecp256k1.PubKey' } } \ No newline at end of file diff --git a/crates/relayer/tests/config/fixtures/relayer_conf_example_valid_telemetry.toml b/crates/relayer/tests/config/fixtures/relayer_conf_example_valid_telemetry.toml new file mode 100644 index 0000000000..003dcceed5 --- /dev/null +++ b/crates/relayer/tests/config/fixtures/relayer_conf_example_valid_telemetry.toml @@ -0,0 +1,68 @@ +[global] +log_level = 'error' + +[mode] + +[mode.clients] +enabled = true +refresh = true +misbehaviour = true + +[mode.connections] +enabled = false + +[mode.channels] +enabled = false + +[mode.packets] +enabled = true +clear_interval = 100 +clear_on_start = true +tx_confirmation = true + +[telemetry] +enabled = true +host = '127.0.0.1' +port = 3001 +submitted_range = { min = 5000, max = 10000 } +confirmed_range = { min = 10000, max = 20000 } + +[[chains]] +id = 'chain_A' +rpc_addr = 'http://127.0.0.1:26657' +grpc_addr = 'http://127.0.0.1:9090' +websocket_addr = 'ws://localhost:26657/websocket' +rpc_timeout = '10s' +account_prefix = 'cosmos' +key_name = 'testkey' +store_prefix = 'ibc' +max_gas = 200000 +gas_price = { price = 0.001, denom = 'stake' } +max_msg_num = 4 +max_tx_size = 1048576 +clock_drift = '5s' +trusting_period = '14days' +trust_threshold = { numerator = '1', denominator = '3' } +address_type = { derivation = 'cosmos' } + +[chains.packet_filter] +policy = 'allow' +list = [ + ['ica*', '*'], + ['transfer', 'channel-0'], +] + +[[chains]] +id = 'chain_B' +rpc_addr = 'http://127.0.0.1:26557' +grpc_addr = 'http://127.0.0.1:9090' +websocket_addr = 'ws://localhost:26557/websocket' +rpc_timeout = '10s' +account_prefix = 'cosmos' +key_name = 'testkey' +store_prefix = 'ibc' +gas_price = { price = 0.001, denom = 'stake' } +clock_drift = '5s' +trusting_period = '14days' +trust_threshold = { numerator = '1', denominator = '3' } +address_type = { derivation = 'ethermint', proto_type = { pk_type = '/injective.crypto.v1beta1.ethsecp256k1.PubKey' } } \ No newline at end of file From 9192c79adfe3022b77d8b9f2bffcbf42e4cdcd6b Mon Sep 17 00:00:00 2001 From: Luca Joss Date: Wed, 31 May 2023 11:15:08 +0200 Subject: [PATCH 02/13] Use histogram range configuration for telemetry --- Cargo.lock | 1 + crates/relayer-cli/src/commands/start.rs | 7 +- crates/telemetry/Cargo.toml | 1 + crates/telemetry/src/lib.rs | 53 +++++- crates/telemetry/src/state.rs | 233 ++++++++++++++++++++++- 5 files changed, 287 insertions(+), 8 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index cb148701b4..c5c8907661 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1671,6 +1671,7 @@ dependencies = [ "serde_json", "tendermint", "tokio", + "tracing", ] [[package]] diff --git a/crates/relayer-cli/src/commands/start.rs b/crates/relayer-cli/src/commands/start.rs index e75685cd8c..0e44b5f6b9 100644 --- a/crates/relayer-cli/src/commands/start.rs +++ b/crates/relayer-cli/src/commands/start.rs @@ -193,7 +193,12 @@ fn spawn_telemetry_server(config: &Config) { let _span = tracing::error_span!("telemetry").entered(); - let state = ibc_telemetry::global(); + let state = ibc_telemetry::init( + config.telemetry.submitted_range.min, + config.telemetry.submitted_range.max, + config.telemetry.confirmed_range.min, + config.telemetry.confirmed_range.max, + ); let telemetry = config.telemetry.clone(); if !telemetry.enabled { diff --git a/crates/telemetry/Cargo.toml b/crates/telemetry/Cargo.toml index afa32f9338..fd9a12c079 100644 --- a/crates/telemetry/Cargo.toml +++ b/crates/telemetry/Cargo.toml @@ -25,6 +25,7 @@ serde_json = "1.0.94" serde = "1.0.149" axum = "0.6.18" tokio = "1.26.0" +tracing = "0.1.36" [dependencies.tendermint] version = "0.32.0" diff --git a/crates/telemetry/src/lib.rs b/crates/telemetry/src/lib.rs index 40fa56bfce..958c4132b8 100644 --- a/crates/telemetry/src/lib.rs +++ b/crates/telemetry/src/lib.rs @@ -7,19 +7,62 @@ use std::error::Error; use std::net::{SocketAddr, ToSocketAddrs}; use std::sync::Arc; -use once_cell::sync::Lazy; +use once_cell::sync::OnceCell; use tokio::task::JoinHandle; +use tracing::{debug, warn}; pub use crate::state::TelemetryState; -pub fn new_state() -> Arc { - Arc::new(TelemetryState::default()) +pub fn new_state( + tx_latency_submitted_min: u64, + tx_latency_submitted_max: u64, + tx_latency_confirmed_min: u64, + tx_latency_confirmed_max: u64, +) -> Arc { + Arc::new(TelemetryState::new( + tx_latency_submitted_min, + tx_latency_submitted_max, + tx_latency_confirmed_min, + tx_latency_confirmed_max, + )) } -static GLOBAL_STATE: Lazy> = Lazy::new(new_state); +static GLOBAL_STATE: OnceCell> = OnceCell::new(); + +pub fn init( + tx_latency_submitted_min: u64, + tx_latency_submitted_max: u64, + tx_latency_confirmed_min: u64, + tx_latency_confirmed_max: u64, +) -> &'static Arc { + let new_state = new_state( + tx_latency_submitted_min, + tx_latency_submitted_max, + tx_latency_confirmed_min, + tx_latency_confirmed_max, + ); + match GLOBAL_STATE.set(new_state) { + Ok(_) => debug!("initialised telemetry global state"), + Err(_) => debug!("telemetry global state was already set"), + } + GLOBAL_STATE.get().unwrap() +} pub fn global() -> &'static Arc { - &GLOBAL_STATE + match GLOBAL_STATE.get() { + Some(state) => state, + None => { + warn!( + "global telemetry state not set, will initialize it using default histogram ranges" + ); + let new_state = new_state(500, 10000, 1000, 20000); + match GLOBAL_STATE.set(new_state) { + Ok(_) => debug!("initialised telemetry global state"), + Err(_) => debug!("telemetry global state was already set"), + } + GLOBAL_STATE.get().unwrap() + } + } } pub type BoxError = Box; diff --git a/crates/telemetry/src/state.rs b/crates/telemetry/src/state.rs index dc959a6bde..3cfc112a46 100644 --- a/crates/telemetry/src/state.rs +++ b/crates/telemetry/src/state.rs @@ -195,6 +195,175 @@ pub struct TelemetryState { } impl TelemetryState { + pub fn new( + tx_latency_submitted_min: u64, + tx_latency_submitted_max: u64, + tx_latency_confirmed_min: u64, + tx_latency_confirmed_max: u64, + ) -> Self { + use opentelemetry::sdk::export::metrics::aggregation; + use opentelemetry::sdk::metrics::{controllers, processors}; + + let controller = controllers::basic(processors::factory( + RangeCustomAggregatorSelector::new( + tx_latency_submitted_min, + tx_latency_submitted_max, + tx_latency_confirmed_min, + tx_latency_confirmed_max, + ), + aggregation::cumulative_temporality_selector(), + )) + .build(); + + let exporter = opentelemetry_prometheus::ExporterBuilder::new(controller).init(); + + let meter = global::meter("hermes"); + + Self { + exporter, + + workers: meter + .i64_up_down_counter("workers") + .with_description("Number of workers") + .init(), + + client_updates_submitted: meter + .u64_counter("client_updates_submitted") + .with_description("Number of client update messages submitted") + .init(), + + client_misbehaviours_submitted: meter + .u64_counter("client_misbehaviours_submitted") + .with_description("Number of misbehaviours detected and submitted") + .init(), + + receive_packets_confirmed: meter + .u64_counter("receive_packets_confirmed") + .with_description("Number of confirmed receive packets. Available if relayer runs with Tx confirmation enabled") + .init(), + + acknowledgment_packets_confirmed: meter + .u64_counter("acknowledgment_packets_confirmed") + .with_description("Number of confirmed acknowledgment packets. Available if relayer runs with Tx confirmation enabled") + .init(), + + timeout_packets_confirmed: meter + .u64_counter("timeout_packets_confirmed") + .with_description("Number of confirmed timeout packets. Available if relayer runs with Tx confirmation enabled") + .init(), + + queries: meter + .u64_counter("queries") + .with_description( + "Number of queries submitted by Hermes", + ) + .init(), + + queries_cache_hits: meter + .u64_counter("queries_cache_hits") + .with_description("Number of cache hits for queries submitted by Hermes") + .init(), + + ws_reconnect: meter + .u64_counter("ws_reconnect") + .with_description("Number of times Hermes reconnected to the websocket endpoint") + .init(), + + ws_events: meter + .u64_counter("ws_events") + .with_description("How many IBC events did Hermes receive via the websocket subscription") + .init(), + + messages_submitted: meter + .u64_counter("messages_submitted") + .with_description("Number of messages submitted to a specific chain") + .init(), + + wallet_balance: meter + .f64_observable_gauge("wallet_balance") + .with_description("The balance of each wallet Hermes uses per chain. Please note that when converting the balance to f64 a loss in precision might be introduced in the displayed value") + .init(), + + send_packet_events: meter + .u64_counter("send_packet_events") + .with_description("Number of SendPacket events received") + .init(), + + acknowledgement_events: meter + .u64_counter("acknowledgement_events") + .with_description("Number of WriteAcknowledgement events received") + .init(), + + timeout_events: meter + .u64_counter("timeout_events") + .with_description("Number of TimeoutPacket events received") + .init(), + + cleared_send_packet_events: meter + .u64_counter("cleared_send_packet_events") + .with_description("Number of SendPacket events received during the initial and periodic clearing") + .init(), + + cleared_acknowledgment_events: meter + .u64_counter("cleared_acknowledgment_events") + .with_description("Number of WriteAcknowledgement events received during the initial and periodic clearing") + .init(), + + tx_latency_submitted: meter + .u64_observable_gauge("tx_latency_submitted") + .with_unit(Unit::new("milliseconds")) + .with_description("The latency for all transactions submitted to a specific chain, \ + i.e. the difference between the moment when Hermes received a batch of events \ + and when it submitted the corresponding transaction(s). Milliseconds.") + .init(), + + tx_latency_confirmed: meter + .u64_observable_gauge("tx_latency_confirmed") + .with_unit(Unit::new("milliseconds")) + .with_description("The latency for all transactions submitted & confirmed to a specific chain, \ + i.e. the difference between the moment when Hermes received a batch of events \ + until the corresponding transaction(s) were confirmed. Milliseconds.") + .init(), + + in_flight_events: moka::sync::Cache::builder() + .time_to_live(Duration::from_secs(60 * 60)) // Remove entries after 1 hour + .time_to_idle(Duration::from_secs(30 * 60)) // Remove entries if they have been idle for 30 minutes + .build(), + + backlogs: DashMap::new(), + + backlog_oldest_sequence: meter + .u64_observable_gauge("backlog_oldest_sequence") + .with_description("Sequence number of the oldest SendPacket event in the backlog") + .init(), + + backlog_oldest_timestamp: meter + .u64_observable_gauge("backlog_oldest_timestamp") + .with_unit(Unit::new("seconds")) + .with_description("Local timestamp for the oldest SendPacket event in the backlog") + .init(), + + backlog_size: meter + .u64_observable_gauge("backlog_size") + .with_description("Total number of SendPacket events in the backlog") + .init(), + + fee_amounts: meter + .u64_counter("ics29_fee_amounts") + .with_description("Total amount received from ICS29 fees") + .init(), + + visible_fee_addresses: DashSet::new(), + + cached_fees: Mutex::new(Vec::new()), + + period_fees: meter + .u64_observable_gauge("ics29_period_fees") + .with_description("Amount of ICS29 fees rewarded over the past 7 days") + .init(), + } + } + /// Gather the metrics for export pub fn gather(&self) -> Vec { self.exporter.registry().gather() @@ -880,6 +1049,64 @@ use opentelemetry::sdk::metrics::aggregators::Aggregator; use opentelemetry::sdk::metrics::aggregators::{histogram, last_value, sum}; use opentelemetry::sdk::metrics::sdk_api::Descriptor; +#[derive(Debug)] +struct RangeCustomAggregatorSelector { + tx_latency_submitted_min: u64, + tx_latency_submitted_max: u64, + tx_latency_confirmed_min: u64, + tx_latency_confirmed_max: u64, +} + +impl RangeCustomAggregatorSelector { + pub fn new( + tx_latency_submitted_min: u64, + tx_latency_submitted_max: u64, + tx_latency_confirmed_min: u64, + tx_latency_confirmed_max: u64, + ) -> Self { + Self { + tx_latency_submitted_min, + tx_latency_submitted_max, + tx_latency_confirmed_min, + tx_latency_confirmed_max, + } + } + + pub fn get_submitted_range(&self) -> Vec { + let step = (self.tx_latency_submitted_max - self.tx_latency_submitted_min) / 10; + (self.tx_latency_submitted_min..=self.tx_latency_submitted_max) + .filter(|e| e % step == 0) + .map(|e| e as f64) + .collect::>() + } + + pub fn get_confirmed_range(&self) -> Vec { + let step = (self.tx_latency_confirmed_max - self.tx_latency_confirmed_min) / 10; + (self.tx_latency_confirmed_min..=self.tx_latency_confirmed_max) + .filter(|e| e % step == 0) + .map(|e| e as f64) + .collect::>() + } +} + +impl AggregatorSelector for RangeCustomAggregatorSelector { + fn aggregator_for(&self, descriptor: &Descriptor) -> Option> { + match descriptor.name() { + "wallet_balance" => Some(Arc::new(last_value())), + "backlog_oldest_sequence" => Some(Arc::new(last_value())), + "backlog_oldest_timestamp" => Some(Arc::new(last_value())), + "backlog_size" => Some(Arc::new(last_value())), + // Prometheus' supports only collector for histogram, sum, and last value aggregators. + // https://docs.rs/opentelemetry-prometheus/0.10.0/src/opentelemetry_prometheus/lib.rs.html#411-418 + // TODO: Once quantile sketches are supported, replace histograms with that. + "tx_latency_submitted" => Some(Arc::new(histogram(&self.get_submitted_range()))), + "tx_latency_confirmed" => Some(Arc::new(histogram(&self.get_confirmed_range()))), + "ics29_period_fees" => Some(Arc::new(last_value())), + _ => Some(Arc::new(sum())), + } + } +} + #[derive(Debug)] struct CustomAggregatorSelector; @@ -894,10 +1121,12 @@ impl AggregatorSelector for CustomAggregatorSelector { // https://docs.rs/opentelemetry-prometheus/0.10.0/src/opentelemetry_prometheus/lib.rs.html#411-418 // TODO: Once quantile sketches are supported, replace histograms with that. "tx_latency_submitted" => Some(Arc::new(histogram(&[ - 200.0, 500.0, 1000.0, 2000.0, 5000.0, 10000.0, + 200.0, 500.0, 1000.0, 2000.0, 5000.0, 6000.0, 7000.0, 8000.0, 9000.0, 10000.0, + 12000.0, 14000.0, 16000.0, ]))), "tx_latency_confirmed" => Some(Arc::new(histogram(&[ - 1000.0, 5000.0, 9000.0, 13000.0, 17000.0, 20000.0, + 1000.0, 5000.0, 9000.0, 13000.0, 17000.0, 20000.0, 23000.0, 25000.0, 27000.0, + 30000.0, ]))), "ics29_period_fees" => Some(Arc::new(last_value())), _ => Some(Arc::new(sum())), From e2e4240ddb44a40643abc21a1a8a1784093f92ab Mon Sep 17 00:00:00 2001 From: Luca Joss Date: Wed, 31 May 2023 11:19:54 +0200 Subject: [PATCH 03/13] Add changelog entry --- .../ibc-telemetry/3366-configurable-histogram-buckets.md | 6 ++++++ 1 file changed, 6 insertions(+) create mode 100644 .changelog/unreleased/improvements/ibc-telemetry/3366-configurable-histogram-buckets.md diff --git a/.changelog/unreleased/improvements/ibc-telemetry/3366-configurable-histogram-buckets.md b/.changelog/unreleased/improvements/ibc-telemetry/3366-configurable-histogram-buckets.md new file mode 100644 index 0000000000..ef9ddab624 --- /dev/null +++ b/.changelog/unreleased/improvements/ibc-telemetry/3366-configurable-histogram-buckets.md @@ -0,0 +1,6 @@ +- Add two new configurations for the telemetry: + - `submitted_range` used to specify the range of the histogram + buckets for the `tx_latency_submitted` metric. + - `confirmed_range` used to specify the range of the histogram + buckets for the `tx_latency_confirmed` metric. + ([#3366](https://github.com/informalsystems/hermes/issues/3366)) \ No newline at end of file From a5cbdbc9610fff661f05928710fd549fae03ca65 Mon Sep 17 00:00:00 2001 From: Luca Joss Date: Wed, 31 May 2023 11:33:48 +0200 Subject: [PATCH 04/13] Add additional information in the guide for latency metrics --- guide/src/documentation/telemetry/index.md | 8 +++++--- guide/src/documentation/telemetry/operators.md | 13 +++++++++++++ 2 files changed, 18 insertions(+), 3 deletions(-) diff --git a/guide/src/documentation/telemetry/index.md b/guide/src/documentation/telemetry/index.md index bb67d4f72e..a270f73031 100644 --- a/guide/src/documentation/telemetry/index.md +++ b/guide/src/documentation/telemetry/index.md @@ -13,9 +13,11 @@ The telemetry service is not active by default, and must be enabled in Hermes' c ```toml [telemetry] -enabled = true # default = false -host = '127.0.0.1' # default value -port = 3001 # default value +enabled = true # default = false +host = '127.0.0.1' # default value +port = 3001 # default value +submitted_range = { min = 500, max = 10000 } # default value +confirmed_range = { min = 1000, max = 20000 } # default value ``` Please see the [relevant section for *Configuration*](../configuration/index.md) for more general details about Hermes configuration options. diff --git a/guide/src/documentation/telemetry/operators.md b/guide/src/documentation/telemetry/operators.md index 04ed8c37f5..2a44c5f2ab 100644 --- a/guide/src/documentation/telemetry/operators.md +++ b/guide/src/documentation/telemetry/operators.md @@ -59,6 +59,19 @@ into a full node's mempool. **A note on wallet balances.** For the `wallet_balance`, we convert from a String into a f64, which can lead to a loss in precision in the displayed value. +**latency histogram** +The `tx_latency_submitted` and `tx_latency_confirmed` are displayed with histogram buckets which each contain the number of values less or equal to their bucket label. This means that if there are 5 buckets with label `500`, `2000`, `3000`, `4000` and `5000` and 2 `tx_latency_submitted` were recorded of respectively `1800ms` and `3100ms` then the `tx_latency_submitted` will look like this: + +```text +tx_latency_submitted_bucket{chain="ibc-0",channel="channel-0",counterparty="ibc-1",port="transfer",service_name="unknown_service",otel_scope_name="hermes",otel_scope_version="",le="500"} 0 +tx_latency_submitted_bucket{chain="ibc-0",channel="channel-0",counterparty="ibc-1",port="transfer",service_name="unknown_service",otel_scope_name="hermes",otel_scope_version="",le="2000"} 1 +tx_latency_submitted_bucket{chain="ibc-0",channel="channel-0",counterparty="ibc-1",port="transfer",service_name="unknown_service",otel_scope_name="hermes",otel_scope_version="",le="3000"} 1 +tx_latency_submitted_bucket{chain="ibc-0",channel="channel-0",counterparty="ibc-1",port="transfer",service_name="unknown_service",otel_scope_name="hermes",otel_scope_version="",le="4000"} 2 +tx_latency_submitted_bucket{chain="ibc-0",channel="channel-0",counterparty="ibc-1",port="transfer",service_name="unknown_service",otel_scope_name="hermes",otel_scope_version="",le="5000"} 2 +tx_latency_submitted_bucket{chain="ibc-1",channel="channel-0",counterparty="ibc-0",port="transfer",service_name="unknown_service",otel_scope_name="hermes",otel_scope_version="",le="+Inf"} 2 +``` +The range of the buckets can be configured using the `submitted_range` and `confirmed_range` seen [here](./index.md) + ## Are Hermes transactions successful? From ac5458891f81359aa66c0eb2902c8bcf6b757cb6 Mon Sep 17 00:00:00 2001 From: Luca Joss Date: Wed, 31 May 2023 11:43:36 +0200 Subject: [PATCH 05/13] Add default for histogram bucket configurations --- crates/relayer/src/config.rs | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/crates/relayer/src/config.rs b/crates/relayer/src/config.rs index d0547a2320..9581128f76 100644 --- a/crates/relayer/src/config.rs +++ b/crates/relayer/src/config.rs @@ -200,6 +200,20 @@ pub mod default { pub fn max_grpc_decoding_size() -> Byte { Byte::from_bytes(33554432) } + + pub fn submitted_range() -> HistogramRange { + HistogramRange { + min: 500, + max: 10000, + } + } + + pub fn confirmed_range() -> HistogramRange { + HistogramRange { + min: 1000, + max: 20000, + } + } } #[derive(Clone, Debug, Default, Deserialize, Serialize)] @@ -381,7 +395,9 @@ pub struct TelemetryConfig { pub enabled: bool, pub host: String, pub port: u16, + #[serde(default = "default::submitted_range")] pub submitted_range: HistogramRange, + #[serde(default = "default::confirmed_range")] pub confirmed_range: HistogramRange, } From 04dfccd1c2bf7762270c069f058b9b6a654ba9f0 Mon Sep 17 00:00:00 2001 From: Luca Joss Date: Wed, 31 May 2023 15:45:48 +0200 Subject: [PATCH 06/13] Improve histogram configuration --- .../3366-configurable-histogram-buckets.md | 6 +- config.toml | 35 +-- crates/relayer-cli/src/commands/start.rs | 8 +- crates/relayer/src/config.rs | 86 ++++-- ...elayer_conf_example_invalid_telemetry.toml | 6 +- .../relayer_conf_example_valid_telemetry.toml | 6 +- crates/telemetry/src/lib.rs | 51 ++-- crates/telemetry/src/state.rs | 265 ++++-------------- guide/src/documentation/telemetry/index.md | 11 +- 9 files changed, 163 insertions(+), 311 deletions(-) diff --git a/.changelog/unreleased/improvements/ibc-telemetry/3366-configurable-histogram-buckets.md b/.changelog/unreleased/improvements/ibc-telemetry/3366-configurable-histogram-buckets.md index ef9ddab624..b27829b142 100644 --- a/.changelog/unreleased/improvements/ibc-telemetry/3366-configurable-histogram-buckets.md +++ b/.changelog/unreleased/improvements/ibc-telemetry/3366-configurable-histogram-buckets.md @@ -1,6 +1,6 @@ -- Add two new configurations for the telemetry: - - `submitted_range` used to specify the range of the histogram +- Add two new configurations for the telemetry `buckets`: + - `latency_submitted` used to specify the range and number of buckets for the `tx_latency_submitted` metric. - - `confirmed_range` used to specify the range of the histogram + - `latency_confirmed` used to specify the range and number of buckets for the `tx_latency_confirmed` metric. ([#3366](https://github.com/informalsystems/hermes/issues/3366)) \ No newline at end of file diff --git a/config.toml b/config.toml index eb623d2e00..e4f226f8a6 100644 --- a/config.toml +++ b/config.toml @@ -110,35 +110,18 @@ host = '127.0.0.1' # by the telemetry service. Default: 3001 port = 3001 +[telemetry.buckets] # Specify the range of the 10 histogram buckets in ms for the `tx_latency_submitted` metric. -# Default: { min = 500, max = 10000 } which will result in the following buckets: -# tx_latency_submitted_bucket{...,le="950"} 0 -# tx_latency_submitted_bucket{...,le="1900"} 1 -# tx_latency_submitted_bucket{...,le="2850"} 1 -# tx_latency_submitted_bucket{...,le="3800"} 1 -# tx_latency_submitted_bucket{...,le="4750"} 1 -# tx_latency_submitted_bucket{...,le="5700"} 1 -# tx_latency_submitted_bucket{...,le="6650"} 1 -# tx_latency_submitted_bucket{...,le="7600"} 1 -# tx_latency_submitted_bucket{...,le="8550"} 1 -# tx_latency_submitted_bucket{...,le="9500"} 1 -# tx_latency_submitted_bucket{...,le="+Inf"} 1 -# submitted_range = { min = 5000, max = 10000 } +# Default: { min = 500, max = 10000, buckets = 10 } +# The default will give the following buckets: +# [500, 1450, 2400, 3350, 4300, 5250, 6200, 7150, 8100, 9050, 10000] +# latency_submitted = { min = 500, max = 10000, buckets = 10 } # Specify the range of the 10 histogram buckets in ms for the `tx_latency_confirmed` metric. -# Default: { min = 1000, max = 20000 } which will result in the following buckets: -# tx_latency_confirmed_bucket{...,le="1900"} 0 -# tx_latency_confirmed_bucket{...,le="3800"} 0 -# tx_latency_confirmed_bucket{...,le="5700"} 0 -# tx_latency_confirmed_bucket{...,le="7600"} 0 -# tx_latency_confirmed_bucket{...,le="9500"} 0 -# tx_latency_confirmed_bucket{...,le="11400"} 0 -# tx_latency_confirmed_bucket{...,le="13300"} 0 -# tx_latency_confirmed_bucket{...,le="15200"} 0 -# tx_latency_confirmed_bucket{...,le="17100"} 0 -# tx_latency_confirmed_bucket{...,le="19000"} 0 -# tx_latency_confirmed_bucket{...,le="+Inf"} 1 -confirmed_range = { min = 1000, max = 20000 } +# Default: { min = 1000, max = 20000, buckets = 10 } +# The default will give the following buckets: +# [1000, 2900, 4800, 6700, 8600, 10500, 12400, 14300, 16200, 18100, 20000] +# latency_confirmed = { min = 1000, max = 20000, buckets = 10 } # A chains section includes parameters related to a chain and the full node to which diff --git a/crates/relayer-cli/src/commands/start.rs b/crates/relayer-cli/src/commands/start.rs index 0e44b5f6b9..7f7d57596a 100644 --- a/crates/relayer-cli/src/commands/start.rs +++ b/crates/relayer-cli/src/commands/start.rs @@ -194,10 +194,10 @@ fn spawn_telemetry_server(config: &Config) { let _span = tracing::error_span!("telemetry").entered(); let state = ibc_telemetry::init( - config.telemetry.submitted_range.min, - config.telemetry.submitted_range.max, - config.telemetry.confirmed_range.min, - config.telemetry.confirmed_range.max, + config.telemetry.buckets.latency_submitted.range.clone(), + config.telemetry.buckets.latency_submitted.buckets, + config.telemetry.buckets.latency_confirmed.range.clone(), + config.telemetry.buckets.latency_confirmed.buckets, ); let telemetry = config.telemetry.clone(); diff --git a/crates/relayer/src/config.rs b/crates/relayer/src/config.rs index 9581128f76..da3ea8901b 100644 --- a/crates/relayer/src/config.rs +++ b/crates/relayer/src/config.rs @@ -19,6 +19,7 @@ use std::{ fs, fs::File, io::Write, + ops::Range, path::{Path, PathBuf}, }; use tendermint::block::Height as BlockHeight; @@ -201,17 +202,22 @@ pub mod default { Byte::from_bytes(33554432) } - pub fn submitted_range() -> HistogramRange { - HistogramRange { - min: 500, - max: 10000, - } - } - - pub fn confirmed_range() -> HistogramRange { - HistogramRange { - min: 1000, - max: 20000, + pub fn buckets() -> HistogramBuckets { + HistogramBuckets { + latency_submitted: HistogramConfig { + range: Range { + start: 500, + end: 10000, + }, + buckets: 10, + }, + latency_confirmed: HistogramConfig { + range: Range { + start: 1000, + end: 20000, + }, + buckets: 10, + }, } } } @@ -395,20 +401,45 @@ pub struct TelemetryConfig { pub enabled: bool, pub host: String, pub port: u16, - #[serde(default = "default::submitted_range")] - pub submitted_range: HistogramRange, - #[serde(default = "default::confirmed_range")] - pub confirmed_range: HistogramRange, + #[serde(default = "default::buckets")] + pub buckets: HistogramBuckets, +} + +#[derive(Clone, Debug, Deserialize, Serialize)] +pub struct HistogramBuckets { + pub latency_submitted: HistogramConfig, + pub latency_confirmed: HistogramConfig, +} + +impl Default for HistogramBuckets { + fn default() -> Self { + Self { + latency_submitted: HistogramConfig { + range: Range { + start: 500, + end: 10000, + }, + buckets: 10, + }, + latency_confirmed: HistogramConfig { + range: Range { + start: 1000, + end: 20000, + }, + buckets: 10, + }, + } + } } #[derive(Clone, Debug, Deserialize, Serialize)] #[serde(try_from = "HistogramRangeUnchecked")] -pub struct HistogramRange { - pub min: u64, - pub max: u64, +pub struct HistogramConfig { + pub range: Range, + pub buckets: u64, } -impl TryFrom for HistogramRange { +impl TryFrom for HistogramConfig { type Error = String; fn try_from(value: HistogramRangeUnchecked) -> Result { @@ -419,8 +450,11 @@ impl TryFrom for HistogramRange { )); } Ok(Self { - min: value.min, - max: value.max, + range: Range { + start: value.min, + end: value.max, + }, + buckets: value.buckets, }) } } @@ -429,6 +463,7 @@ impl TryFrom for HistogramRange { pub struct HistogramRangeUnchecked { min: u64, max: u64, + buckets: u64, } /// Default values for the telemetry configuration. @@ -440,14 +475,7 @@ impl Default for TelemetryConfig { enabled: false, host: "127.0.0.1".to_string(), port: 3001, - submitted_range: HistogramRange { - min: 500, - max: 10000, - }, - confirmed_range: HistogramRange { - min: 1000, - max: 20000, - }, + buckets: HistogramBuckets::default(), } } } diff --git a/crates/relayer/tests/config/fixtures/relayer_conf_example_invalid_telemetry.toml b/crates/relayer/tests/config/fixtures/relayer_conf_example_invalid_telemetry.toml index 8c6de39610..bd366065e2 100644 --- a/crates/relayer/tests/config/fixtures/relayer_conf_example_invalid_telemetry.toml +++ b/crates/relayer/tests/config/fixtures/relayer_conf_example_invalid_telemetry.toml @@ -24,8 +24,10 @@ tx_confirmation = true enabled = true host = '127.0.0.1' port = 3001 -submitted_range = { min = 10000, max = 5000 } -confirmed_range = { min = 10000, max = 20000 } + +[telemetry.buckets] +latency_submitted = { min = 5000, max = 1000, buckets = 10 } # min can't be smaller than max +latency_confirmed = { min = 5000, max = 10000, buckets = 10 } [[chains]] id = 'chain_A' diff --git a/crates/relayer/tests/config/fixtures/relayer_conf_example_valid_telemetry.toml b/crates/relayer/tests/config/fixtures/relayer_conf_example_valid_telemetry.toml index 003dcceed5..5d68a1fb25 100644 --- a/crates/relayer/tests/config/fixtures/relayer_conf_example_valid_telemetry.toml +++ b/crates/relayer/tests/config/fixtures/relayer_conf_example_valid_telemetry.toml @@ -24,8 +24,10 @@ tx_confirmation = true enabled = true host = '127.0.0.1' port = 3001 -submitted_range = { min = 5000, max = 10000 } -confirmed_range = { min = 10000, max = 20000 } + +[telemetry.buckets] +latency_submitted = { min = 5000, max = 10000, buckets = 10 } +latency_confirmed = { min = 5000, max = 10000, buckets = 10 } [[chains]] id = 'chain_A' diff --git a/crates/telemetry/src/lib.rs b/crates/telemetry/src/lib.rs index 958c4132b8..aa2c8498ce 100644 --- a/crates/telemetry/src/lib.rs +++ b/crates/telemetry/src/lib.rs @@ -5,6 +5,7 @@ pub mod state; use std::error::Error; use std::net::{SocketAddr, ToSocketAddrs}; +use std::ops::Range; use std::sync::Arc; use once_cell::sync::OnceCell; @@ -14,32 +15,32 @@ use tracing::{debug, warn}; pub use crate::state::TelemetryState; pub fn new_state( - tx_latency_submitted_min: u64, - tx_latency_submitted_max: u64, - tx_latency_confirmed_min: u64, - tx_latency_confirmed_max: u64, + tx_latency_submitted_range: Range, + tx_latency_submitted_buckets: u64, + tx_latency_confirmed_range: Range, + tx_latency_confirmed_buckets: u64, ) -> Arc { Arc::new(TelemetryState::new( - tx_latency_submitted_min, - tx_latency_submitted_max, - tx_latency_confirmed_min, - tx_latency_confirmed_max, + tx_latency_submitted_range, + tx_latency_submitted_buckets, + tx_latency_confirmed_range, + tx_latency_confirmed_buckets, )) } static GLOBAL_STATE: OnceCell> = OnceCell::new(); pub fn init( - tx_latency_submitted_min: u64, - tx_latency_submitted_max: u64, - tx_latency_confirmed_min: u64, - tx_latency_confirmed_max: u64, + tx_latency_submitted_range: Range, + tx_latency_submitted_buckets: u64, + tx_latency_confirmed_range: Range, + tx_latency_confirmed_buckets: u64, ) -> &'static Arc { let new_state = new_state( - tx_latency_submitted_min, - tx_latency_submitted_max, - tx_latency_confirmed_min, - tx_latency_confirmed_max, + tx_latency_submitted_range, + tx_latency_submitted_buckets, + tx_latency_confirmed_range, + tx_latency_confirmed_buckets, ); match GLOBAL_STATE.set(new_state) { Ok(_) => debug!("initialised telemetry global state"), @@ -55,12 +56,18 @@ pub fn global() -> &'static Arc { warn!( "global telemetry state not set, will initialize it using default histogram ranges" ); - let new_state = new_state(500, 10000, 1000, 20000); - match GLOBAL_STATE.set(new_state) { - Ok(_) => debug!("initialised telemetry global state"), - Err(_) => debug!("telemetry global state was already set"), - } - GLOBAL_STATE.get().unwrap() + init( + Range { + start: 500, + end: 10000, + }, + 10, + Range { + start: 1000, + end: 20000, + }, + 10, + ) } } } diff --git a/crates/telemetry/src/state.rs b/crates/telemetry/src/state.rs index 3cfc112a46..d0064c7cc4 100644 --- a/crates/telemetry/src/state.rs +++ b/crates/telemetry/src/state.rs @@ -1,5 +1,6 @@ use core::fmt::{Display, Error as FmtError, Formatter}; use std::{ + ops::Range, sync::Mutex, time::{Duration, Instant}, }; @@ -196,20 +197,20 @@ pub struct TelemetryState { impl TelemetryState { pub fn new( - tx_latency_submitted_min: u64, - tx_latency_submitted_max: u64, - tx_latency_confirmed_min: u64, - tx_latency_confirmed_max: u64, + tx_latency_submitted_range: Range, + tx_latency_submitted_buckets: u64, + tx_latency_confirmed_range: Range, + tx_latency_confirmed_buckets: u64, ) -> Self { use opentelemetry::sdk::export::metrics::aggregation; use opentelemetry::sdk::metrics::{controllers, processors}; let controller = controllers::basic(processors::factory( - RangeCustomAggregatorSelector::new( - tx_latency_submitted_min, - tx_latency_submitted_max, - tx_latency_confirmed_min, - tx_latency_confirmed_max, + CustomAggregatorSelector::new( + tx_latency_submitted_range, + tx_latency_submitted_buckets, + tx_latency_confirmed_range, + tx_latency_confirmed_buckets, ), aggregation::cumulative_temporality_selector(), )) @@ -1050,46 +1051,62 @@ use opentelemetry::sdk::metrics::aggregators::{histogram, last_value, sum}; use opentelemetry::sdk::metrics::sdk_api::Descriptor; #[derive(Debug)] -struct RangeCustomAggregatorSelector { - tx_latency_submitted_min: u64, - tx_latency_submitted_max: u64, - tx_latency_confirmed_min: u64, - tx_latency_confirmed_max: u64, +struct CustomAggregatorSelector { + tx_latency_submitted_range: Range, + tx_latency_submitted_buckets: u64, + tx_latency_confirmed_range: Range, + tx_latency_confirmed_buckets: u64, } -impl RangeCustomAggregatorSelector { +impl CustomAggregatorSelector { pub fn new( - tx_latency_submitted_min: u64, - tx_latency_submitted_max: u64, - tx_latency_confirmed_min: u64, - tx_latency_confirmed_max: u64, + tx_latency_submitted_range: Range, + tx_latency_submitted_buckets: u64, + tx_latency_confirmed_range: Range, + tx_latency_confirmed_buckets: u64, ) -> Self { Self { - tx_latency_submitted_min, - tx_latency_submitted_max, - tx_latency_confirmed_min, - tx_latency_confirmed_max, + tx_latency_submitted_range, + tx_latency_submitted_buckets, + tx_latency_confirmed_range, + tx_latency_confirmed_buckets, } } pub fn get_submitted_range(&self) -> Vec { - let step = (self.tx_latency_submitted_max - self.tx_latency_submitted_min) / 10; - (self.tx_latency_submitted_min..=self.tx_latency_submitted_max) - .filter(|e| e % step == 0) + let step = (self.tx_latency_submitted_range.end - self.tx_latency_submitted_range.start) + / self.tx_latency_submitted_buckets; + (0..=self.tx_latency_submitted_buckets) + .scan(self.tx_latency_submitted_range.start, |state, _| { + let current_value = *state; + if current_value >= self.tx_latency_submitted_range.end { + return Some(self.tx_latency_submitted_range.end); + } + *state += step; + Some(current_value) + }) .map(|e| e as f64) .collect::>() } pub fn get_confirmed_range(&self) -> Vec { - let step = (self.tx_latency_confirmed_max - self.tx_latency_confirmed_min) / 10; - (self.tx_latency_confirmed_min..=self.tx_latency_confirmed_max) - .filter(|e| e % step == 0) + let step = (self.tx_latency_confirmed_range.end - self.tx_latency_confirmed_range.start) + / self.tx_latency_confirmed_buckets; + (0..=self.tx_latency_confirmed_buckets) + .scan(self.tx_latency_confirmed_range.start, |state, _| { + let current_value = *state; + if current_value >= self.tx_latency_confirmed_range.end { + return Some(self.tx_latency_confirmed_range.end); + } + *state += step; + Some(current_value) + }) .map(|e| e as f64) .collect::>() } } -impl AggregatorSelector for RangeCustomAggregatorSelector { +impl AggregatorSelector for CustomAggregatorSelector { fn aggregator_for(&self, descriptor: &Descriptor) -> Option> { match descriptor.name() { "wallet_balance" => Some(Arc::new(last_value())), @@ -1106,191 +1123,3 @@ impl AggregatorSelector for RangeCustomAggregatorSelector { } } } - -#[derive(Debug)] -struct CustomAggregatorSelector; - -impl AggregatorSelector for CustomAggregatorSelector { - fn aggregator_for(&self, descriptor: &Descriptor) -> Option> { - match descriptor.name() { - "wallet_balance" => Some(Arc::new(last_value())), - "backlog_oldest_sequence" => Some(Arc::new(last_value())), - "backlog_oldest_timestamp" => Some(Arc::new(last_value())), - "backlog_size" => Some(Arc::new(last_value())), - // Prometheus' supports only collector for histogram, sum, and last value aggregators. - // https://docs.rs/opentelemetry-prometheus/0.10.0/src/opentelemetry_prometheus/lib.rs.html#411-418 - // TODO: Once quantile sketches are supported, replace histograms with that. - "tx_latency_submitted" => Some(Arc::new(histogram(&[ - 200.0, 500.0, 1000.0, 2000.0, 5000.0, 6000.0, 7000.0, 8000.0, 9000.0, 10000.0, - 12000.0, 14000.0, 16000.0, - ]))), - "tx_latency_confirmed" => Some(Arc::new(histogram(&[ - 1000.0, 5000.0, 9000.0, 13000.0, 17000.0, 20000.0, 23000.0, 25000.0, 27000.0, - 30000.0, - ]))), - "ics29_period_fees" => Some(Arc::new(last_value())), - _ => Some(Arc::new(sum())), - } - } -} - -impl Default for TelemetryState { - fn default() -> Self { - use opentelemetry::sdk::export::metrics::aggregation; - use opentelemetry::sdk::metrics::{controllers, processors}; - - let controller = controllers::basic(processors::factory( - CustomAggregatorSelector, - aggregation::cumulative_temporality_selector(), - )) - .build(); - - let exporter = opentelemetry_prometheus::ExporterBuilder::new(controller).init(); - - let meter = global::meter("hermes"); - - Self { - exporter, - - workers: meter - .i64_up_down_counter("workers") - .with_description("Number of workers") - .init(), - - client_updates_submitted: meter - .u64_counter("client_updates_submitted") - .with_description("Number of client update messages submitted") - .init(), - - client_misbehaviours_submitted: meter - .u64_counter("client_misbehaviours_submitted") - .with_description("Number of misbehaviours detected and submitted") - .init(), - - receive_packets_confirmed: meter - .u64_counter("receive_packets_confirmed") - .with_description("Number of confirmed receive packets. Available if relayer runs with Tx confirmation enabled") - .init(), - - acknowledgment_packets_confirmed: meter - .u64_counter("acknowledgment_packets_confirmed") - .with_description("Number of confirmed acknowledgment packets. Available if relayer runs with Tx confirmation enabled") - .init(), - - timeout_packets_confirmed: meter - .u64_counter("timeout_packets_confirmed") - .with_description("Number of confirmed timeout packets. Available if relayer runs with Tx confirmation enabled") - .init(), - - queries: meter - .u64_counter("queries") - .with_description( - "Number of queries submitted by Hermes", - ) - .init(), - - queries_cache_hits: meter - .u64_counter("queries_cache_hits") - .with_description("Number of cache hits for queries submitted by Hermes") - .init(), - - ws_reconnect: meter - .u64_counter("ws_reconnect") - .with_description("Number of times Hermes reconnected to the websocket endpoint") - .init(), - - ws_events: meter - .u64_counter("ws_events") - .with_description("How many IBC events did Hermes receive via the websocket subscription") - .init(), - - messages_submitted: meter - .u64_counter("messages_submitted") - .with_description("Number of messages submitted to a specific chain") - .init(), - - wallet_balance: meter - .f64_observable_gauge("wallet_balance") - .with_description("The balance of each wallet Hermes uses per chain. Please note that when converting the balance to f64 a loss in precision might be introduced in the displayed value") - .init(), - - send_packet_events: meter - .u64_counter("send_packet_events") - .with_description("Number of SendPacket events received") - .init(), - - acknowledgement_events: meter - .u64_counter("acknowledgement_events") - .with_description("Number of WriteAcknowledgement events received") - .init(), - - timeout_events: meter - .u64_counter("timeout_events") - .with_description("Number of TimeoutPacket events received") - .init(), - - cleared_send_packet_events: meter - .u64_counter("cleared_send_packet_events") - .with_description("Number of SendPacket events received during the initial and periodic clearing") - .init(), - - cleared_acknowledgment_events: meter - .u64_counter("cleared_acknowledgment_events") - .with_description("Number of WriteAcknowledgement events received during the initial and periodic clearing") - .init(), - - tx_latency_submitted: meter - .u64_observable_gauge("tx_latency_submitted") - .with_unit(Unit::new("milliseconds")) - .with_description("The latency for all transactions submitted to a specific chain, \ - i.e. the difference between the moment when Hermes received a batch of events \ - and when it submitted the corresponding transaction(s). Milliseconds.") - .init(), - - tx_latency_confirmed: meter - .u64_observable_gauge("tx_latency_confirmed") - .with_unit(Unit::new("milliseconds")) - .with_description("The latency for all transactions submitted & confirmed to a specific chain, \ - i.e. the difference between the moment when Hermes received a batch of events \ - until the corresponding transaction(s) were confirmed. Milliseconds.") - .init(), - - in_flight_events: moka::sync::Cache::builder() - .time_to_live(Duration::from_secs(60 * 60)) // Remove entries after 1 hour - .time_to_idle(Duration::from_secs(30 * 60)) // Remove entries if they have been idle for 30 minutes - .build(), - - backlogs: DashMap::new(), - - backlog_oldest_sequence: meter - .u64_observable_gauge("backlog_oldest_sequence") - .with_description("Sequence number of the oldest SendPacket event in the backlog") - .init(), - - backlog_oldest_timestamp: meter - .u64_observable_gauge("backlog_oldest_timestamp") - .with_unit(Unit::new("seconds")) - .with_description("Local timestamp for the oldest SendPacket event in the backlog") - .init(), - - backlog_size: meter - .u64_observable_gauge("backlog_size") - .with_description("Total number of SendPacket events in the backlog") - .init(), - - fee_amounts: meter - .u64_counter("ics29_fee_amounts") - .with_description("Total amount received from ICS29 fees") - .init(), - - visible_fee_addresses: DashSet::new(), - - cached_fees: Mutex::new(Vec::new()), - - period_fees: meter - .u64_observable_gauge("ics29_period_fees") - .with_description("Amount of ICS29 fees rewarded over the past 7 days") - .init(), - } - } -} diff --git a/guide/src/documentation/telemetry/index.md b/guide/src/documentation/telemetry/index.md index a270f73031..5933d149b3 100644 --- a/guide/src/documentation/telemetry/index.md +++ b/guide/src/documentation/telemetry/index.md @@ -13,11 +13,12 @@ The telemetry service is not active by default, and must be enabled in Hermes' c ```toml [telemetry] -enabled = true # default = false -host = '127.0.0.1' # default value -port = 3001 # default value -submitted_range = { min = 500, max = 10000 } # default value -confirmed_range = { min = 1000, max = 20000 } # default value +enabled = true # default = false +host = '127.0.0.1' # default value +port = 3001 # default value +[telemetry.buckets] # default value +latency_submitted = { min = 5000, max = 10000, buckets = 10 } # default value +latency_confirmed = { min = 5000, max = 10000, buckets = 10 } # default value ``` Please see the [relevant section for *Configuration*](../configuration/index.md) for more general details about Hermes configuration options. From b0325050ac8eb982392f5ee0bce38adeb39bd3f0 Mon Sep 17 00:00:00 2001 From: Luca Joss Date: Wed, 31 May 2023 15:53:34 +0200 Subject: [PATCH 07/13] Fix guide typo --- guide/src/documentation/telemetry/operators.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/guide/src/documentation/telemetry/operators.md b/guide/src/documentation/telemetry/operators.md index 2a44c5f2ab..e98d903f05 100644 --- a/guide/src/documentation/telemetry/operators.md +++ b/guide/src/documentation/telemetry/operators.md @@ -70,7 +70,7 @@ tx_latency_submitted_bucket{chain="ibc-0",channel="channel-0",counterparty="ibc- tx_latency_submitted_bucket{chain="ibc-0",channel="channel-0",counterparty="ibc-1",port="transfer",service_name="unknown_service",otel_scope_name="hermes",otel_scope_version="",le="5000"} 2 tx_latency_submitted_bucket{chain="ibc-1",channel="channel-0",counterparty="ibc-0",port="transfer",service_name="unknown_service",otel_scope_name="hermes",otel_scope_version="",le="+Inf"} 2 ``` -The range of the buckets can be configured using the `submitted_range` and `confirmed_range` seen [here](./index.md) +The range of the buckets can be configured using the `latency_submitted` and `latency_confirmed` seen [here](./index.md) ## Are Hermes transactions successful? From 3a11ef7ebd2d06964873ca8654c6437f99f8c274 Mon Sep 17 00:00:00 2001 From: Luca Joss Date: Wed, 31 May 2023 16:30:25 +0200 Subject: [PATCH 08/13] Fix histogram config tests and cleaned code --- crates/relayer/src/config.rs | 49 ++++++++++++++--------------------- crates/telemetry/src/state.rs | 43 ++++++++++++------------------ 2 files changed, 37 insertions(+), 55 deletions(-) diff --git a/crates/relayer/src/config.rs b/crates/relayer/src/config.rs index da3ea8901b..9cc6d39bb4 100644 --- a/crates/relayer/src/config.rs +++ b/crates/relayer/src/config.rs @@ -202,22 +202,23 @@ pub mod default { Byte::from_bytes(33554432) } - pub fn buckets() -> HistogramBuckets { - HistogramBuckets { - latency_submitted: HistogramConfig { - range: Range { - start: 500, - end: 10000, - }, - buckets: 10, + pub fn latency_submitted() -> HistogramConfig { + HistogramConfig { + range: Range { + start: 500, + end: 10000, }, - latency_confirmed: HistogramConfig { - range: Range { - start: 1000, - end: 20000, - }, - buckets: 10, + buckets: 10, + } + } + + pub fn latency_confirmed() -> HistogramConfig { + HistogramConfig { + range: Range { + start: 1000, + end: 20000, }, + buckets: 10, } } } @@ -401,33 +402,23 @@ pub struct TelemetryConfig { pub enabled: bool, pub host: String, pub port: u16, - #[serde(default = "default::buckets")] + #[serde(default = "HistogramBuckets::default")] pub buckets: HistogramBuckets, } #[derive(Clone, Debug, Deserialize, Serialize)] pub struct HistogramBuckets { + #[serde(default = "default::latency_submitted")] pub latency_submitted: HistogramConfig, + #[serde(default = "default::latency_confirmed")] pub latency_confirmed: HistogramConfig, } impl Default for HistogramBuckets { fn default() -> Self { Self { - latency_submitted: HistogramConfig { - range: Range { - start: 500, - end: 10000, - }, - buckets: 10, - }, - latency_confirmed: HistogramConfig { - range: Range { - start: 1000, - end: 20000, - }, - buckets: 10, - }, + latency_submitted: default::latency_submitted(), + latency_confirmed: default::latency_confirmed(), } } } diff --git a/crates/telemetry/src/state.rs b/crates/telemetry/src/state.rs index d0064c7cc4..15a51f44dc 100644 --- a/crates/telemetry/src/state.rs +++ b/crates/telemetry/src/state.rs @@ -1074,38 +1074,29 @@ impl CustomAggregatorSelector { } pub fn get_submitted_range(&self) -> Vec { - let step = (self.tx_latency_submitted_range.end - self.tx_latency_submitted_range.start) - / self.tx_latency_submitted_buckets; - (0..=self.tx_latency_submitted_buckets) - .scan(self.tx_latency_submitted_range.start, |state, _| { - let current_value = *state; - if current_value >= self.tx_latency_submitted_range.end { - return Some(self.tx_latency_submitted_range.end); - } - *state += step; - Some(current_value) - }) - .map(|e| e as f64) - .collect::>() + build_histogram_buckets( + self.tx_latency_submitted_range.start, + self.tx_latency_submitted_range.end, + self.tx_latency_submitted_buckets, + ) } pub fn get_confirmed_range(&self) -> Vec { - let step = (self.tx_latency_confirmed_range.end - self.tx_latency_confirmed_range.start) - / self.tx_latency_confirmed_buckets; - (0..=self.tx_latency_confirmed_buckets) - .scan(self.tx_latency_confirmed_range.start, |state, _| { - let current_value = *state; - if current_value >= self.tx_latency_confirmed_range.end { - return Some(self.tx_latency_confirmed_range.end); - } - *state += step; - Some(current_value) - }) - .map(|e| e as f64) - .collect::>() + build_histogram_buckets( + self.tx_latency_confirmed_range.start, + self.tx_latency_confirmed_range.end, + self.tx_latency_confirmed_buckets, + ) } } +fn build_histogram_buckets(start: u64, end: u64, buckets: u64) -> Vec { + let step = (end - start) / buckets; + (0..=buckets) + .map(|i| (start + i * step) as f64) + .collect::>() +} + impl AggregatorSelector for CustomAggregatorSelector { fn aggregator_for(&self, descriptor: &Descriptor) -> Option> { match descriptor.name() { From 07cbbaf81c21aacfa756f597b43d387df022a559 Mon Sep 17 00:00:00 2001 From: Luca Joss Date: Wed, 31 May 2023 17:37:16 +0200 Subject: [PATCH 09/13] Rename 'min'/'max' to 'start'/'end' --- config.toml | 8 ++++---- crates/relayer/src/config.rs | 13 +++++++------ .../relayer_conf_example_invalid_telemetry.toml | 4 ++-- .../relayer_conf_example_valid_telemetry.toml | 4 ++-- guide/src/documentation/telemetry/index.md | 4 ++-- 5 files changed, 17 insertions(+), 16 deletions(-) diff --git a/config.toml b/config.toml index e4f226f8a6..b1e0799d7d 100644 --- a/config.toml +++ b/config.toml @@ -112,16 +112,16 @@ port = 3001 [telemetry.buckets] # Specify the range of the 10 histogram buckets in ms for the `tx_latency_submitted` metric. -# Default: { min = 500, max = 10000, buckets = 10 } +# Default: { start = 500, end = 10000, buckets = 10 } # The default will give the following buckets: # [500, 1450, 2400, 3350, 4300, 5250, 6200, 7150, 8100, 9050, 10000] -# latency_submitted = { min = 500, max = 10000, buckets = 10 } +# latency_submitted = { start = 500, end = 10000, buckets = 10 } # Specify the range of the 10 histogram buckets in ms for the `tx_latency_confirmed` metric. -# Default: { min = 1000, max = 20000, buckets = 10 } +# Default: { start = 1000, end = 20000, buckets = 10 } # The default will give the following buckets: # [1000, 2900, 4800, 6700, 8600, 10500, 12400, 14300, 16200, 18100, 20000] -# latency_confirmed = { min = 1000, max = 20000, buckets = 10 } +# latency_confirmed = { start = 1000, end = 20000, buckets = 10 } # A chains section includes parameters related to a chain and the full node to which diff --git a/crates/relayer/src/config.rs b/crates/relayer/src/config.rs index 9cc6d39bb4..587f60ae40 100644 --- a/crates/relayer/src/config.rs +++ b/crates/relayer/src/config.rs @@ -426,6 +426,7 @@ impl Default for HistogramBuckets { #[derive(Clone, Debug, Deserialize, Serialize)] #[serde(try_from = "HistogramRangeUnchecked")] pub struct HistogramConfig { + #[serde(flatten)] pub range: Range, pub buckets: u64, } @@ -434,16 +435,16 @@ impl TryFrom for HistogramConfig { type Error = String; fn try_from(value: HistogramRangeUnchecked) -> Result { - if value.min > value.max { + if value.start > value.end { return Err(format!( "histogram range min `{}` must be smaller or equal than max `{}`", - value.min, value.max + value.start, value.end )); } Ok(Self { range: Range { - start: value.min, - end: value.max, + start: value.start, + end: value.end, }, buckets: value.buckets, }) @@ -452,8 +453,8 @@ impl TryFrom for HistogramConfig { #[derive(Clone, Debug, Deserialize, Serialize)] pub struct HistogramRangeUnchecked { - min: u64, - max: u64, + start: u64, + end: u64, buckets: u64, } diff --git a/crates/relayer/tests/config/fixtures/relayer_conf_example_invalid_telemetry.toml b/crates/relayer/tests/config/fixtures/relayer_conf_example_invalid_telemetry.toml index bd366065e2..7bb211b7c3 100644 --- a/crates/relayer/tests/config/fixtures/relayer_conf_example_invalid_telemetry.toml +++ b/crates/relayer/tests/config/fixtures/relayer_conf_example_invalid_telemetry.toml @@ -26,8 +26,8 @@ host = '127.0.0.1' port = 3001 [telemetry.buckets] -latency_submitted = { min = 5000, max = 1000, buckets = 10 } # min can't be smaller than max -latency_confirmed = { min = 5000, max = 10000, buckets = 10 } +latency_submitted = { start = 5000, end = 1000, buckets = 10 } # start can't be smaller than end +latency_confirmed = { start = 5000, end = 10000, buckets = 10 } [[chains]] id = 'chain_A' diff --git a/crates/relayer/tests/config/fixtures/relayer_conf_example_valid_telemetry.toml b/crates/relayer/tests/config/fixtures/relayer_conf_example_valid_telemetry.toml index 5d68a1fb25..25f0c4683f 100644 --- a/crates/relayer/tests/config/fixtures/relayer_conf_example_valid_telemetry.toml +++ b/crates/relayer/tests/config/fixtures/relayer_conf_example_valid_telemetry.toml @@ -26,8 +26,8 @@ host = '127.0.0.1' port = 3001 [telemetry.buckets] -latency_submitted = { min = 5000, max = 10000, buckets = 10 } -latency_confirmed = { min = 5000, max = 10000, buckets = 10 } +latency_submitted = { start = 5000, end = 10000, buckets = 10 } +latency_confirmed = { start = 5000, end = 10000, buckets = 10 } [[chains]] id = 'chain_A' diff --git a/guide/src/documentation/telemetry/index.md b/guide/src/documentation/telemetry/index.md index 5933d149b3..b0d8f926d1 100644 --- a/guide/src/documentation/telemetry/index.md +++ b/guide/src/documentation/telemetry/index.md @@ -17,8 +17,8 @@ enabled = true # default = fals host = '127.0.0.1' # default value port = 3001 # default value [telemetry.buckets] # default value -latency_submitted = { min = 5000, max = 10000, buckets = 10 } # default value -latency_confirmed = { min = 5000, max = 10000, buckets = 10 } # default value +latency_submitted = { start = 5000, end = 10000, buckets = 10 } # default value +latency_confirmed = { start = 5000, end = 10000, buckets = 10 } # default value ``` Please see the [relevant section for *Configuration*](../configuration/index.md) for more general details about Hermes configuration options. From f4c2d4d9f2bf9789ae9fd16ad37acad0231441eb Mon Sep 17 00:00:00 2001 From: Luca Joss Date: Wed, 7 Jun 2023 13:11:56 +0200 Subject: [PATCH 10/13] Update and improve grafana template --- guide/src/assets/grafana_template.json | 166 ++++++++++++++++++++++--- 1 file changed, 149 insertions(+), 17 deletions(-) diff --git a/guide/src/assets/grafana_template.json b/guide/src/assets/grafana_template.json index 67564590db..4372b9a1b1 100644 --- a/guide/src/assets/grafana_template.json +++ b/guide/src/assets/grafana_template.json @@ -363,7 +363,7 @@ "uid": "${DS_PROMETHEUS}" }, "editorMode": "builder", - "expr": "delta(client_updates_submitted{job=\"hermes\"}[1m])", + "expr": "delta(client_updates_submitted_total{job=\"hermes\"}[1m])", "legendFormat": "{{src_chain}} -> {{dst_chain}}:{{client}} ({{instance}})", "range": true, "refId": "A" @@ -453,13 +453,13 @@ "uid": "${DS_PROMETHEUS}" }, "editorMode": "builder", - "expr": "delta(total_messages_submitted{job=\"hermes\"}[1m])", + "expr": "delta(messages_submitted_total{job=\"hermes\"}[1m])", "legendFormat": "{{instance}}:{{chain}}", "range": true, "refId": "A" } ], - "title": "total_messages_submitted over 1 minute", + "title": "messages_submitted over 1 minute", "type": "timeseries" }, { @@ -1227,7 +1227,7 @@ "uid": "${DS_PROMETHEUS}" }, "editorMode": "builder", - "expr": "delta(receive_packets_confirmed{job=\"hermes\"}[1m])", + "expr": "delta(receive_packets_confirmed_total{job=\"hermes\"}[1m])", "legendFormat": "{{src_chain}}:{{src_channel}}/{{src_port}} ({{instance}})", "range": true, "refId": "A" @@ -1317,7 +1317,7 @@ "uid": "${DS_PROMETHEUS}" }, "editorMode": "builder", - "expr": "delta(timeout_packets_confirmed{job=\"hermes\"}[1m])", + "expr": "delta(timeout_packets_confirmed_total{job=\"hermes\"}[1m])", "legendFormat": "{{src_chain}}:{{src_channel}}/{{src_port}} ({{instance}})", "range": true, "refId": "A" @@ -1407,7 +1407,7 @@ "uid": "${DS_PROMETHEUS}" }, "editorMode": "builder", - "expr": "delta(acknowledgment_packets_confirmed{job=\"hermes\"}[1m])", + "expr": "delta(acknowledgment_packets_confirmed_total{job=\"hermes\"}[1m])", "legendFormat": "{{src_chain}}:{{src_channel}}/{{src_port}} ({{instance}})", "range": true, "refId": "A" @@ -1520,7 +1520,7 @@ "uid": "${DS_PROMETHEUS}" }, "editorMode": "builder", - "expr": "delta(send_packet_events{job=\"hermes\"}[1m])", + "expr": "delta(send_packet_events_total{job=\"hermes\"}[1m])", "interval": "", "legendFormat": "{{chain}}:{{channel}}/{{port}}-{{counterparty}} ({{instance}})", "range": true, @@ -1611,7 +1611,7 @@ "uid": "${DS_PROMETHEUS}" }, "editorMode": "builder", - "expr": "delta(acknowledgement_events{job=\"hermes\"}[1m])", + "expr": "delta(acknowledgement_events_total{job=\"hermes\"}[1m])", "legendFormat": "{{chain}}:{{channel}}/{{port}}-{{counterparty}} ({{instance}})", "range": true, "refId": "A" @@ -1701,7 +1701,7 @@ "uid": "${DS_PROMETHEUS}" }, "editorMode": "builder", - "expr": "delta(timeout_events{job=\"hermes\"}[1m])", + "expr": "delta(timeout_events_total{job=\"hermes\"}[1m])", "legendFormat": "{{chain}}:{{channel}}/{{port}}-{{counterparty}} ({{instance}})", "range": true, "refId": "A" @@ -1791,7 +1791,7 @@ "uid": "${DS_PROMETHEUS}" }, "editorMode": "builder", - "expr": "delta(ws_events{job=\"hermes\"}[1m])", + "expr": "delta(ws_events_total{job=\"hermes\"}[1m])", "interval": "", "legendFormat": "{{instance}}:{{chain}}", "range": true, @@ -1882,7 +1882,7 @@ "uid": "${DS_PROMETHEUS}" }, "editorMode": "builder", - "expr": "delta(ws_reconnect{job=\"hermes\"}[1m])", + "expr": "delta(ws_reconnect_total{job=\"hermes\"}[1m])", "legendFormat": "{{instance}}:{{chain}}", "range": true, "refId": "A" @@ -1972,7 +1972,7 @@ "uid": "${DS_PROMETHEUS}" }, "editorMode": "code", - "expr": "sum(delta(queries{job=\"hermes\"}[1m])) by(instance,chain)", + "expr": "sum(delta(queries_total{job=\"hermes\"}[1m])) by(instance,chain)", "legendFormat": "{{instance}}:{{chain}}", "range": true, "refId": "A" @@ -2078,7 +2078,7 @@ "uid": "${DS_PROMETHEUS}" }, "editorMode": "builder", - "expr": "delta(cleared_acknowledgment_events{job=\"hermes\"}[1m])", + "expr": "delta(cleared_acknowledgment_events_total{job=\"hermes\"}[1m])", "legendFormat": "{{chain}}:{{channel}}/{{port}}-{{counterparty}} ({{instance}})", "range": true, "refId": "A" @@ -2168,7 +2168,7 @@ "uid": "${DS_PROMETHEUS}" }, "editorMode": "builder", - "expr": "delta(cleared_send_packet_events{job=\"hermes\"}[1m])", + "expr": "delta(cleared_send_packet_events_total{job=\"hermes\"}[1m])", "legendFormat": "{{chain}}:{{channel}}/{{port}}-{{counterparty}} ({{instance}})", "range": true, "refId": "A" @@ -2349,7 +2349,7 @@ "uid": "${DS_PROMETHEUS}" }, "editorMode": "builder", - "expr": "sum by(chain, instance) (delta(queries_cache_hits{job=\"hermes\"}[1m]))", + "expr": "sum by(chain, instance) (delta(queries_cache_hits_total{job=\"hermes\"}[1m]))", "legendFormat": "{{instance}}:{{chain}}", "range": true, "refId": "A" @@ -2358,12 +2358,144 @@ "title": "queries_cache_hits over 1 minute", "type": "timeseries" }, + { + "description": "Indicates the latency rate for all transactions submitted from ibc-0", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "none" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 98 + }, + "id": 63, + "options": { + "displayMode": "gradient", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showUnfilled": true + }, + "pluginVersion": "9.0.7", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "editorMode": "builder", + "exemplar": true, + "expr": "tx_latency_submitted_bucket{chain=\"ibc-0\"}", + "format": "heatmap", + "instant": false, + "interval": "", + "legendFormat": "{{le}}", + "range": true, + "refId": "A" + } + ], + "title": "tx_latency_submitted histogram ibc-0", + "transformations": [], + "type": "bargauge" + }, + { + "description": "Indicates the latency rate for all transactions confirmed from ibc-0", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "none" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 98 + }, + "id": 64, + "options": { + "displayMode": "gradient", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showUnfilled": true + }, + "pluginVersion": "9.0.7", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "editorMode": "builder", + "exemplar": true, + "expr": "tx_latency_confirmed_bucket{chain=\"ibc-0\"}", + "format": "heatmap", + "instant": false, + "interval": "", + "legendFormat": "{{le}}", + "range": true, + "refId": "A" + } + ], + "title": "tx_latency_confirmed_bucket histogram ibc-0", + "transformations": [], + "type": "bargauge" + }, { "gridPos": { "h": 2, "w": 24, "x": 0, - "y": 98 + "y": 106 }, "id": 58, "options": { @@ -2454,7 +2586,7 @@ "uid": "${DS_PROMETHEUS}" }, "editorMode": "builder", - "expr": "delta(client_misbehaviours_submitted[5m])", + "expr": "delta(client_misbehaviours_submitted_total[5m])", "legendFormat": "{{src_chain}}-{{dst_chain}}:{{client}} ({{instance}})", "range": true, "refId": "A" From a29c9ccc114638488150d051b937dd1a728cf496 Mon Sep 17 00:00:00 2001 From: Luca Joss Date: Wed, 7 Jun 2023 13:52:47 +0200 Subject: [PATCH 11/13] Improve Grafana template --- guide/src/assets/grafana_template.json | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/guide/src/assets/grafana_template.json b/guide/src/assets/grafana_template.json index 4372b9a1b1..2d7e91454a 100644 --- a/guide/src/assets/grafana_template.json +++ b/guide/src/assets/grafana_template.json @@ -2411,7 +2411,7 @@ }, "editorMode": "builder", "exemplar": true, - "expr": "tx_latency_submitted_bucket{chain=\"ibc-0\"}", + "expr": "increase(tx_latency_submitted_bucket{chain=\"ibc-0\"}[$__range])", "format": "heatmap", "instant": false, "interval": "", @@ -2477,7 +2477,7 @@ }, "editorMode": "builder", "exemplar": true, - "expr": "tx_latency_confirmed_bucket{chain=\"ibc-0\"}", + "expr": "increase(tx_latency_confirmed_bucket{chain=\"ibc-0\"}[$__range])", "format": "heatmap", "instant": false, "interval": "", From 938d25f2f4a2cceff3bc931f42f4dbc1ea515504 Mon Sep 17 00:00:00 2001 From: Romain Ruetschi Date: Thu, 8 Jun 2023 11:43:37 +0200 Subject: [PATCH 12/13] Fix config file fixtures --- .../fixtures/relayer_conf_example_invalid_telemetry.toml | 6 +++--- .../fixtures/relayer_conf_example_valid_telemetry.toml | 6 +++--- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/crates/relayer/tests/config/fixtures/relayer_conf_example_invalid_telemetry.toml b/crates/relayer/tests/config/fixtures/relayer_conf_example_invalid_telemetry.toml index 7bb211b7c3..3e44c53a7d 100644 --- a/crates/relayer/tests/config/fixtures/relayer_conf_example_invalid_telemetry.toml +++ b/crates/relayer/tests/config/fixtures/relayer_conf_example_invalid_telemetry.toml @@ -33,7 +33,7 @@ latency_confirmed = { start = 5000, end = 10000, buckets = 10 } id = 'chain_A' rpc_addr = 'http://127.0.0.1:26657' grpc_addr = 'http://127.0.0.1:9090' -websocket_addr = 'ws://localhost:26657/websocket' +event_source = { mode = 'push', url = 'ws://localhost:26657/websocket' } rpc_timeout = '10s' account_prefix = 'cosmos' key_name = 'testkey' @@ -58,7 +58,7 @@ list = [ id = 'chain_B' rpc_addr = 'http://127.0.0.1:26557' grpc_addr = 'http://127.0.0.1:9090' -websocket_addr = 'ws://localhost:26557/websocket' +event_source = { mode = 'push', url = 'ws://localhost:26557/websocket' } rpc_timeout = '10s' account_prefix = 'cosmos' key_name = 'testkey' @@ -67,4 +67,4 @@ gas_price = { price = 0.001, denom = 'stake' } clock_drift = '5s' trusting_period = '14days' trust_threshold = { numerator = '1', denominator = '3' } -address_type = { derivation = 'ethermint', proto_type = { pk_type = '/injective.crypto.v1beta1.ethsecp256k1.PubKey' } } \ No newline at end of file +address_type = { derivation = 'ethermint', proto_type = { pk_type = '/injective.crypto.v1beta1.ethsecp256k1.PubKey' } } diff --git a/crates/relayer/tests/config/fixtures/relayer_conf_example_valid_telemetry.toml b/crates/relayer/tests/config/fixtures/relayer_conf_example_valid_telemetry.toml index 25f0c4683f..ca0ae487ec 100644 --- a/crates/relayer/tests/config/fixtures/relayer_conf_example_valid_telemetry.toml +++ b/crates/relayer/tests/config/fixtures/relayer_conf_example_valid_telemetry.toml @@ -33,7 +33,7 @@ latency_confirmed = { start = 5000, end = 10000, buckets = 10 } id = 'chain_A' rpc_addr = 'http://127.0.0.1:26657' grpc_addr = 'http://127.0.0.1:9090' -websocket_addr = 'ws://localhost:26657/websocket' +event_source = { mode = 'push', url = 'ws://localhost:26657/websocket' } rpc_timeout = '10s' account_prefix = 'cosmos' key_name = 'testkey' @@ -58,7 +58,7 @@ list = [ id = 'chain_B' rpc_addr = 'http://127.0.0.1:26557' grpc_addr = 'http://127.0.0.1:9090' -websocket_addr = 'ws://localhost:26557/websocket' +event_source = { mode = 'push', url = 'ws://localhost:26557/websocket' } rpc_timeout = '10s' account_prefix = 'cosmos' key_name = 'testkey' @@ -67,4 +67,4 @@ gas_price = { price = 0.001, denom = 'stake' } clock_drift = '5s' trusting_period = '14days' trust_threshold = { numerator = '1', denominator = '3' } -address_type = { derivation = 'ethermint', proto_type = { pk_type = '/injective.crypto.v1beta1.ethsecp256k1.PubKey' } } \ No newline at end of file +address_type = { derivation = 'ethermint', proto_type = { pk_type = '/injective.crypto.v1beta1.ethsecp256k1.PubKey' } } From 4a1ef09b3ed6274ce235ebc67bba15016f9042b6 Mon Sep 17 00:00:00 2001 From: Romain Ruetschi Date: Wed, 14 Jun 2023 14:13:36 +0200 Subject: [PATCH 13/13] Increase upper bound on default buckets to 20s and 30s for submitted and confirmed latencies --- config.toml | 8 ++++---- crates/relayer/src/config.rs | 4 ++-- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/config.toml b/config.toml index 664fb73b71..949d67d2c7 100644 --- a/config.toml +++ b/config.toml @@ -114,14 +114,14 @@ port = 3001 # Specify the range of the 10 histogram buckets in ms for the `tx_latency_submitted` metric. # Default: { start = 500, end = 10000, buckets = 10 } # The default will give the following buckets: -# [500, 1450, 2400, 3350, 4300, 5250, 6200, 7150, 8100, 9050, 10000] -# latency_submitted = { start = 500, end = 10000, buckets = 10 } +# [500, 2450, 4400, 6350, 8300, 10250, 12200, 14150, 16100, 18050, 20000] +# latency_submitted = { start = 500, end = 20000, buckets = 10 } # Specify the range of the 10 histogram buckets in ms for the `tx_latency_confirmed` metric. # Default: { start = 1000, end = 20000, buckets = 10 } # The default will give the following buckets: -# [1000, 2900, 4800, 6700, 8600, 10500, 12400, 14300, 16200, 18100, 20000] -# latency_confirmed = { start = 1000, end = 20000, buckets = 10 } +# [1000, 3900, 6800, 9700, 12600, 15500, 18400, 21300, 24200, 27100, 30000] +# latency_confirmed = { start = 1000, end = 30000, buckets = 10 } # A chains section includes parameters related to a chain and the full node to which diff --git a/crates/relayer/src/config.rs b/crates/relayer/src/config.rs index 68e552b813..92fe601cbf 100644 --- a/crates/relayer/src/config.rs +++ b/crates/relayer/src/config.rs @@ -210,7 +210,7 @@ pub mod default { HistogramConfig { range: Range { start: 500, - end: 10000, + end: 20000, }, buckets: 10, } @@ -220,7 +220,7 @@ pub mod default { HistogramConfig { range: Range { start: 1000, - end: 20000, + end: 30000, }, buckets: 10, }