diff --git a/.changelog/unreleased/features/ibc-telemetry/3845-add-simulate-errors-metric.md b/.changelog/unreleased/features/ibc-telemetry/3845-add-simulate-errors-metric.md new file mode 100644 index 0000000000..7e83a36b32 --- /dev/null +++ b/.changelog/unreleased/features/ibc-telemetry/3845-add-simulate-errors-metric.md @@ -0,0 +1,11 @@ +- Added a new Prometheus metric `simulate_errors` for tracking when a transaction simulation fails, with the following labels: + * `recoverable` (can the execution continue if this happened?) + * `account` (account from which the tx was sent) + * `error_description` (description of the error) + ([\#3845](https://github.com/informalsystems/hermes/issues/3845)) + + ``` + # HELP simulate_errors_total Number of errors observed by Hermes when simulating a Tx + # TYPE simulate_errors_total counter + simulate_errors_total{account="osmo17ndx5qfku28ymxgmq6zq4a6d02dvpfjjul0hyh",error_description="Unknown error",recoverable="false",service_name="unknown_service",otel_scope_name="hermes",otel_scope_version=""} 4 + ``` diff --git a/crates/relayer/src/chain/cosmos/estimate.rs b/crates/relayer/src/chain/cosmos/estimate.rs index 87a18a26b4..79772e92c8 100644 --- a/crates/relayer/src/chain/cosmos/estimate.rs +++ b/crates/relayer/src/chain/cosmos/estimate.rs @@ -14,6 +14,7 @@ use crate::chain::cosmos::types::gas::GasConfig; use crate::config::types::Memo; use crate::error::Error; use crate::keyring::Secp256k1KeyPair; +use crate::telemetry; use crate::util::pretty::PrettyFee; pub async fn estimate_tx_fees( @@ -51,6 +52,7 @@ pub async fn estimate_tx_fees( &config.rpc_address, &config.chain_id, tx, + account, ) .await?; @@ -63,6 +65,7 @@ async fn estimate_fee_with_tx( rpc_address: &Url, chain_id: &ChainId, tx: Tx, + account: &Account, ) -> Result { let estimated_gas = { crate::time!( @@ -72,7 +75,7 @@ async fn estimate_fee_with_tx( } ); - estimate_gas_with_tx(gas_config, grpc_address, tx).await + estimate_gas_with_tx(gas_config, grpc_address, tx, account).await }?; if estimated_gas > gas_config.max_gas { @@ -112,6 +115,7 @@ async fn estimate_gas_with_tx( gas_config: &GasConfig, grpc_address: &Uri, tx: Tx, + account: &Account, ) -> Result { let simulated_gas = send_tx_simulate(grpc_address, tx) .await @@ -147,6 +151,13 @@ async fn estimate_gas_with_tx( e.detail() ); + telemetry!( + simulate_errors, + &account.address.to_string(), + true, + get_error_text(&e), + ); + Ok(gas_config.default_gas) } @@ -155,6 +166,14 @@ async fn estimate_gas_with_tx( "failed to simulate tx. propagating error to caller: {}", e.detail() ); + + telemetry!( + simulate_errors, + &account.address.to_string(), + false, + get_error_text(&e), + ); + // Propagate the error, the retrying mechanism at caller may catch & retry. Err(e) } @@ -175,3 +194,12 @@ fn can_recover_from_simulation_failure(e: &Error) -> bool { _ => false, } } + +fn get_error_text(e: &Error) -> String { + use crate::error::ErrorDetail::*; + + match e.detail() { + GrpcStatus(detail) => detail.status.code().to_string(), + detail => detail.to_string(), + } +} diff --git a/crates/telemetry/src/state.rs b/crates/telemetry/src/state.rs index bf48381d1f..465a367044 100644 --- a/crates/telemetry/src/state.rs +++ b/crates/telemetry/src/state.rs @@ -201,6 +201,9 @@ pub struct TelemetryState { /// Number of errors observed by Hermes when broadcasting a Tx broadcast_errors: Counter, + /// Number of errors observed by Hermes when simulating a Tx + simulate_errors: Counter, + /// The EIP-1559 base fee queried dynamic_gas_queried_fees: ObservableGauge, @@ -394,6 +397,13 @@ impl TelemetryState { ) .init(), + simulate_errors: meter + .u64_counter("simulate_errors") + .with_description( + "Number of errors observed by Hermes when simulating a Tx", + ) + .init(), + dynamic_gas_queried_fees: meter .f64_observable_gauge("dynamic_gas_queried_fees") .with_description("The EIP-1559 base fee queried") @@ -1160,6 +1170,20 @@ impl TelemetryState { self.broadcast_errors.add(&cx, 1, labels); } + /// Add an error and its description to the list of errors observed after simulating + /// a Tx with a specific account. + pub fn simulate_errors(&self, address: &String, recoverable: bool, error_description: String) { + let cx = Context::current(); + + let labels = &[ + KeyValue::new("account", address.to_string()), + KeyValue::new("recoverable", recoverable.to_string()), + KeyValue::new("error_description", error_description.to_owned()), + ]; + + self.simulate_errors.add(&cx, 1, labels); + } + pub fn dynamic_gas_queried_fees(&self, chain_id: &ChainId, amount: f64) { let cx = Context::current(); diff --git a/guide/src/documentation/telemetry/operators.md b/guide/src/documentation/telemetry/operators.md index 2496b6e91a..dd0d137d2e 100644 --- a/guide/src/documentation/telemetry/operators.md +++ b/guide/src/documentation/telemetry/operators.md @@ -142,6 +142,7 @@ If this metric is increasing, it signals that the packet queue is increasing and | `cleared_send_packet_count_total`  | Number of SendPacket events received during the initial and periodic clearing, per chain, counterparty chain, channel and port | `u64` Counter | Packet workers enabled, and periodic packet clearing or clear on start enabled | | `cleared_acknowledgment_count_total` | Number of WriteAcknowledgement events received during the initial and periodic clearing, per chain, counterparty chain, channel and port | `u64` Counter | Packet workers enabled, and periodic packet clearing or clear on start enabled | | `broadcast_errors_total` | Number of errors observed by Hermes when broadcasting a Tx, per error type and account | `u64` Counter | Packet workers enabled | +| `simulate_errors_total` | Number of errors observed by Hermes when simulating a Tx, per error type, account and whether the error is recoverable or not | `u64` Counter | Packet workers enabled | | `filtered_packets` | Number of ICS-20 packets filtered because the memo and/or the receiver fields were exceeding the configured limits | `u64` Counter | Packet workers enabled, and `ics20_max_memo_size` and/or `ics20_max_receiver_size` enabled | Notes: