Skip to content

Commit

Permalink
feat: add simulate errors metrics (#3846)
Browse files Browse the repository at this point in the history
* feat: add simulate errors metrics

* feat: add error message

* chore: add docs

* chore: add unclog entry

* chore: cargo fmt

* Update .changelog/unreleased/features/3845-add-simulate-errors-metric.md

Co-authored-by: Luca Joss <43531661+ljoss17@users.noreply.github.com>
Signed-off-by: Sergey <83376337+freak12techno@users.noreply.github.com>

* chore: renamed unreleased file

* Update changelog entry

---------

Signed-off-by: Sergey <83376337+freak12techno@users.noreply.github.com>
Co-authored-by: Romain Ruetschi <romain@informal.systems>
Co-authored-by: Luca Joss <43531661+ljoss17@users.noreply.github.com>
  • Loading branch information
3 people authored Feb 21, 2024
1 parent 8736bba commit 5512d17
Show file tree
Hide file tree
Showing 4 changed files with 65 additions and 1 deletion.
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
- Added a new Prometheus metric `simulate_errors` for tracking when a transaction simulation fails, with the following labels:
* `recoverable` (can the execution continue if this happened?)
* `account` (account from which the tx was sent)
* `error_description` (description of the error)
([\#3845](https://github.com/informalsystems/hermes/issues/3845))

```
# HELP simulate_errors_total Number of errors observed by Hermes when simulating a Tx
# TYPE simulate_errors_total counter
simulate_errors_total{account="osmo17ndx5qfku28ymxgmq6zq4a6d02dvpfjjul0hyh",error_description="Unknown error",recoverable="false",service_name="unknown_service",otel_scope_name="hermes",otel_scope_version=""} 4
```
30 changes: 29 additions & 1 deletion crates/relayer/src/chain/cosmos/estimate.rs
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ use crate::chain::cosmos::types::gas::GasConfig;
use crate::config::types::Memo;
use crate::error::Error;
use crate::keyring::Secp256k1KeyPair;
use crate::telemetry;
use crate::util::pretty::PrettyFee;

pub async fn estimate_tx_fees(
Expand Down Expand Up @@ -51,6 +52,7 @@ pub async fn estimate_tx_fees(
&config.rpc_address,
&config.chain_id,
tx,
account,
)
.await?;

Expand All @@ -63,6 +65,7 @@ async fn estimate_fee_with_tx(
rpc_address: &Url,
chain_id: &ChainId,
tx: Tx,
account: &Account,
) -> Result<Fee, Error> {
let estimated_gas = {
crate::time!(
Expand All @@ -72,7 +75,7 @@ async fn estimate_fee_with_tx(
}

);
estimate_gas_with_tx(gas_config, grpc_address, tx).await
estimate_gas_with_tx(gas_config, grpc_address, tx, account).await
}?;

if estimated_gas > gas_config.max_gas {
Expand Down Expand Up @@ -112,6 +115,7 @@ async fn estimate_gas_with_tx(
gas_config: &GasConfig,
grpc_address: &Uri,
tx: Tx,
account: &Account,
) -> Result<u64, Error> {
let simulated_gas = send_tx_simulate(grpc_address, tx)
.await
Expand Down Expand Up @@ -147,6 +151,13 @@ async fn estimate_gas_with_tx(
e.detail()
);

telemetry!(
simulate_errors,
&account.address.to_string(),
true,
get_error_text(&e),
);

Ok(gas_config.default_gas)
}

Expand All @@ -155,6 +166,14 @@ async fn estimate_gas_with_tx(
"failed to simulate tx. propagating error to caller: {}",
e.detail()
);

telemetry!(
simulate_errors,
&account.address.to_string(),
false,
get_error_text(&e),
);

// Propagate the error, the retrying mechanism at caller may catch & retry.
Err(e)
}
Expand All @@ -175,3 +194,12 @@ fn can_recover_from_simulation_failure(e: &Error) -> bool {
_ => false,
}
}

fn get_error_text(e: &Error) -> String {
use crate::error::ErrorDetail::*;

match e.detail() {
GrpcStatus(detail) => detail.status.code().to_string(),
detail => detail.to_string(),
}
}
24 changes: 24 additions & 0 deletions crates/telemetry/src/state.rs
Original file line number Diff line number Diff line change
Expand Up @@ -201,6 +201,9 @@ pub struct TelemetryState {
/// Number of errors observed by Hermes when broadcasting a Tx
broadcast_errors: Counter<u64>,

/// Number of errors observed by Hermes when simulating a Tx
simulate_errors: Counter<u64>,

/// The EIP-1559 base fee queried
dynamic_gas_queried_fees: ObservableGauge<f64>,

Expand Down Expand Up @@ -394,6 +397,13 @@ impl TelemetryState {
)
.init(),

simulate_errors: meter
.u64_counter("simulate_errors")
.with_description(
"Number of errors observed by Hermes when simulating a Tx",
)
.init(),

dynamic_gas_queried_fees: meter
.f64_observable_gauge("dynamic_gas_queried_fees")
.with_description("The EIP-1559 base fee queried")
Expand Down Expand Up @@ -1160,6 +1170,20 @@ impl TelemetryState {
self.broadcast_errors.add(&cx, 1, labels);
}

/// Add an error and its description to the list of errors observed after simulating
/// a Tx with a specific account.
pub fn simulate_errors(&self, address: &String, recoverable: bool, error_description: String) {
let cx = Context::current();

let labels = &[
KeyValue::new("account", address.to_string()),
KeyValue::new("recoverable", recoverable.to_string()),
KeyValue::new("error_description", error_description.to_owned()),
];

self.simulate_errors.add(&cx, 1, labels);
}

pub fn dynamic_gas_queried_fees(&self, chain_id: &ChainId, amount: f64) {
let cx = Context::current();

Expand Down
1 change: 1 addition & 0 deletions guide/src/documentation/telemetry/operators.md
Original file line number Diff line number Diff line change
Expand Up @@ -142,6 +142,7 @@ If this metric is increasing, it signals that the packet queue is increasing and
| `cleared_send_packet_count_total`  | Number of SendPacket events received during the initial and periodic clearing, per chain, counterparty chain, channel and port | `u64` Counter | Packet workers enabled, and periodic packet clearing or clear on start enabled |
| `cleared_acknowledgment_count_total` | Number of WriteAcknowledgement events received during the initial and periodic clearing, per chain, counterparty chain, channel and port | `u64` Counter | Packet workers enabled, and periodic packet clearing or clear on start enabled |
| `broadcast_errors_total` | Number of errors observed by Hermes when broadcasting a Tx, per error type and account | `u64` Counter | Packet workers enabled |
| `simulate_errors_total` | Number of errors observed by Hermes when simulating a Tx, per error type, account and whether the error is recoverable or not | `u64` Counter | Packet workers enabled |
| `filtered_packets` | Number of ICS-20 packets filtered because the memo and/or the receiver fields were exceeding the configured limits | `u64` Counter | Packet workers enabled, and `ics20_max_memo_size` and/or `ics20_max_receiver_size` enabled |

Notes:
Expand Down

0 comments on commit 5512d17

Please sign in to comment.