Skip to content

Commit

Permalink
Export TX failures to prometheus metrics (counter) (#1240)
Browse files Browse the repository at this point in the history
* export tx failures to prometheus

* change label to `cause`
  • Loading branch information
boojamya authored Jul 25, 2023
1 parent 107d3f5 commit 3c78287
Show file tree
Hide file tree
Showing 3 changed files with 57 additions and 0 deletions.
11 changes: 11 additions & 0 deletions docs/advanced_usage.md
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,17 @@
If you started `rly` with the default `--debug-addr` argument,
you can use `http://$IP:5183/relayer/metrics` as a target for your prometheus scraper.


Exported metrics:

| **Exported Metric** | **Description** | **Type** |
|:----------------------------------:|:----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------:|:--------:|
| cosmos_relayer_observed_packets | The total number of observed packets | Counter |
| cosmos_relayer_relayed_packets | The total number of relayed packets | Counter |
| cosmos_relayer_chain_latest_height | The current height of the chain | Gauge |
| cosmos_relayer_wallet_balance | The current balance for the relayer's wallet | Gauge |
| cosmos_relayer_fees_spent | The amount of fees spent from the relayer's wallet | Gauge |
| cosmos_relayer_tx_errors_total | <br>The total number of tx failures broken up into categories.<br>Categories:<br> - "packet messages are redundant"<br> - "insufficient funds"<br> - "invalid coins"<br> - "out of gas"<br> - "incorrect account sequence" <br><br> "Tx Failure" is the catch-all bucket | Counter |

**Example metrics**

```
Expand Down
36 changes: 36 additions & 0 deletions relayer/processor/message_processor.go
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ import (
"sync"
"time"

sdkerrors "github.com/cosmos/cosmos-sdk/types/errors"
chantypes "github.com/cosmos/ibc-go/v7/modules/core/04-channel/types"
ibcexported "github.com/cosmos/ibc-go/v7/modules/core/exported"
"github.com/cosmos/relayer/v2/relayer/provider"
Expand All @@ -33,6 +34,9 @@ type messageProcessor struct {
isLocalhost bool
}

// promErrorCatagories lists the sentinel errors that get their own value of
// the `cause` label on the tx failure Prometheus counter. An error that does
// not match any entry is labeled "Tx Failure" instead.
var promErrorCatagories = []error{
	chantypes.ErrRedundantTx,
	sdkerrors.ErrInsufficientFunds,
	sdkerrors.ErrInvalidCoins,
	sdkerrors.ErrOutOfGas,
	sdkerrors.ErrWrongSequence,
}

// trackMessage stores the message tracker in the correct slice and index based on the type.
func (mp *messageProcessor) trackMessage(tracker messageToTrack, i int) {
switch t := tracker.(type) {
Expand Down Expand Up @@ -361,6 +365,16 @@ func (mp *messageProcessor) sendClientUpdate(
zap.String("dst_client_id", dst.info.ClientID),
zap.Error(err),
)

for _, promError := range promErrorCatagories {
if mp.metrics != nil {
if errors.Is(err, promError) {
mp.metrics.IncTxFailure(src.info.PathName, src.info.ChainID, promError.Error())
} else {
mp.metrics.IncTxFailure(src.info.PathName, src.info.ChainID, "Tx Failure")
}
}
}
return
}
dst.log.Debug("Client update broadcast completed")
Expand Down Expand Up @@ -430,6 +444,17 @@ func (mp *messageProcessor) sendBatchMessages(
zap.String("dst_client_id", dst.info.ClientID),
zap.Error(err),
}

for _, promError := range promErrorCatagories {
if mp.metrics != nil {
if errors.Is(err, promError) {
mp.metrics.IncTxFailure(src.info.PathName, src.info.ChainID, promError.Error())
} else {
mp.metrics.IncTxFailure(src.info.PathName, src.info.ChainID, "Tx Failure")
}
}
}

if errors.Is(err, chantypes.ErrRedundantTx) {
mp.log.Debug("Redundant message(s)", errFields...)
return
Expand Down Expand Up @@ -490,6 +515,17 @@ func (mp *messageProcessor) sendSingleMessage(
zap.String("src_client_id", src.info.ClientID),
zap.String("dst_client_id", dst.info.ClientID),
}

for _, promError := range promErrorCatagories {
if mp.metrics != nil {
if errors.Is(err, promError) {
mp.metrics.IncTxFailure(src.info.PathName, src.info.ChainID, promError.Error())
} else {
mp.metrics.IncTxFailure(src.info.PathName, src.info.ChainID, "Tx Failure")
}
}
}

errFields = append(errFields, zap.Object("msg", tracker))
errFields = append(errFields, zap.Error(err))
if errors.Is(err, chantypes.ErrRedundantTx) {
Expand Down
10 changes: 10 additions & 0 deletions relayer/processor/metrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ type PrometheusMetrics struct {
LatestHeightGauge *prometheus.GaugeVec
WalletBalance *prometheus.GaugeVec
FeesSpent *prometheus.GaugeVec
TxFailureError *prometheus.CounterVec
BlockQueryFailure *prometheus.CounterVec
ClientExpiration *prometheus.GaugeVec
}
Expand Down Expand Up @@ -46,9 +47,14 @@ func (m *PrometheusMetrics) IncBlockQueryFailure(chain, err string) {
m.BlockQueryFailure.WithLabelValues(chain, err).Inc()
}

// IncTxFailure adds one to the TxFailureError counter series selected by the
// given label values, passed in the order path, chain, errDesc.
func (m *PrometheusMetrics) IncTxFailure(path, chain, errDesc string) {
	failures := m.TxFailureError.WithLabelValues(path, chain, errDesc)
	failures.Inc()
}

func NewPrometheusMetrics() *PrometheusMetrics {
packetLabels := []string{"path", "chain", "channel", "port", "type"}
heightLabels := []string{"chain"}
txFailureLabels := []string{"path", "chain", "cause"}
blockQueryFailureLabels := []string{"chain", "type"}
walletLabels := []string{"chain", "gas_price", "key", "address", "denom"}
clientExpirationLables := []string{"path_name", "chain", "client_id", "trusting_period"}
Expand Down Expand Up @@ -76,6 +82,10 @@ func NewPrometheusMetrics() *PrometheusMetrics {
Name: "cosmos_relayer_fees_spent",
Help: "The amount of fees spent from the relayer's wallet",
}, walletLabels),
TxFailureError: registerer.NewCounterVec(prometheus.CounterOpts{
Name: "cosmos_relayer_tx_errors_total",
Help: "The total number of tx failures broken up into categories. See https://github.com/cosmos/relayer/blob/main/docs/advanced_usage.md#monitoring for list of catagories. 'Tx Failure' is the catch-all category",
}, txFailureLabels),
BlockQueryFailure: registerer.NewCounterVec(prometheus.CounterOpts{
Name: "cosmos_relayer_block_query_errors_total",
Help: "The total number of block query failures. The failures are separated into two catagories: 'RPC Client' and 'IBC Header'",
Expand Down

0 comments on commit 3c78287

Please sign in to comment.