Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Export TX failures to prometheus metrics (counter) #1240

Merged
merged 3 commits into from
Jul 25, 2023
Merged
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 11 additions & 0 deletions docs/advanced_usage.md
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,17 @@
If you started `rly` with the default `--debug-addr` argument,
you can use `http://$IP:7597/relayer/metrics` as a target for your prometheus scraper.


Exported metrics:

| **Exported Metric** | **Description** | **Type** |
|:----------------------------------:|:----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------:|:--------:|
| cosmos_relayer_observed_packets | The total number of observed packets | Counter |
| cosmos_relayer_relayed_packets | The total number of relayed packets | Counter |
| cosmos_relayer_chain_latest_height | The current height of the chain | Gauge |
| cosmos_relayer_wallet_balance | The current balance for the relayer's wallet | Gauge |
| cosmos_relayer_fees_spent | The amount of fees spent from the relayer's wallet | Gauge |
| cosmos_relayer_tx_errors_total | <br>The total number of tx failures broken up into categories, reported in the `cause` label.<br>Categories:<br> - "packet messages are redundant"<br> - "insufficient funds"<br> - "invalid coins"<br> - "out of gas"<br> - "incorrect account sequence" <br><br> "Tx Failure" is the catch-all bucket | Counter |

**Example metrics**

```
Expand Down
36 changes: 36 additions & 0 deletions relayer/processor/message_processor.go
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ import (
"sync"
"time"

sdkerrors "github.com/cosmos/cosmos-sdk/types/errors"
chantypes "github.com/cosmos/ibc-go/v7/modules/core/04-channel/types"
ibcexported "github.com/cosmos/ibc-go/v7/modules/core/exported"
"github.com/cosmos/relayer/v2/relayer/provider"
Expand All @@ -33,6 +34,9 @@ type messageProcessor struct {
isLocalhost bool
}

// promErrorCatagories lists the tx error categories reported to the Prometheus tx failure counter.
// An error that does not fall into any of these categories is labeled as "Tx Failure".
var promErrorCatagories = []error{
	chantypes.ErrRedundantTx,
	sdkerrors.ErrInsufficientFunds,
	sdkerrors.ErrInvalidCoins,
	sdkerrors.ErrOutOfGas,
	sdkerrors.ErrWrongSequence,
}

// trackMessage stores the message tracker in the correct slice and index based on the type.
func (mp *messageProcessor) trackMessage(tracker messageToTrack, i int) {
switch t := tracker.(type) {
Expand Down Expand Up @@ -354,6 +358,16 @@ func (mp *messageProcessor) sendClientUpdate(
zap.String("dst_client_id", dst.info.ClientID),
zap.Error(err),
)

for _, promError := range promErrorCatagories {
if mp.metrics != nil {
if errors.Is(err, promError) {
mp.metrics.IncTxFailure(src.info.PathName, src.info.ChainID, promError.Error())
} else {
mp.metrics.IncTxFailure(src.info.PathName, src.info.ChainID, "Tx Failure")
}
}
}
return
}
dst.log.Debug("Client update broadcast completed")
Expand Down Expand Up @@ -423,6 +437,17 @@ func (mp *messageProcessor) sendBatchMessages(
zap.String("dst_client_id", dst.info.ClientID),
zap.Error(err),
}

for _, promError := range promErrorCatagories {
if mp.metrics != nil {
if errors.Is(err, promError) {
mp.metrics.IncTxFailure(src.info.PathName, src.info.ChainID, promError.Error())
} else {
mp.metrics.IncTxFailure(src.info.PathName, src.info.ChainID, "Tx Failure")
}
}
}

if errors.Is(err, chantypes.ErrRedundantTx) {
mp.log.Debug("Redundant message(s)", errFields...)
return
Expand Down Expand Up @@ -483,6 +508,17 @@ func (mp *messageProcessor) sendSingleMessage(
zap.String("src_client_id", src.info.ClientID),
zap.String("dst_client_id", dst.info.ClientID),
}

for _, promError := range promErrorCatagories {
if mp.metrics != nil {
if errors.Is(err, promError) {
mp.metrics.IncTxFailure(src.info.PathName, src.info.ChainID, promError.Error())
} else {
mp.metrics.IncTxFailure(src.info.PathName, src.info.ChainID, "Tx Failure")
}
}
}

errFields = append(errFields, zap.Object("msg", tracker))
errFields = append(errFields, zap.Error(err))
if errors.Is(err, chantypes.ErrRedundantTx) {
Expand Down
10 changes: 10 additions & 0 deletions relayer/processor/metrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ type PrometheusMetrics struct {
LatestHeightGauge *prometheus.GaugeVec
WalletBalance *prometheus.GaugeVec
FeesSpent *prometheus.GaugeVec
TxFailureError *prometheus.CounterVec
}

func (m *PrometheusMetrics) AddPacketsObserved(path, chain, channel, port, eventType string, count int) {
Expand All @@ -34,10 +35,15 @@ func (m *PrometheusMetrics) SetFeesSpent(chain, key, address, denom string, amou
m.FeesSpent.WithLabelValues(chain, key, address, denom).Set(amount)
}

// IncTxFailure adds one to the TxFailureError counter series selected by the
// label values path, chain and errDesc, passed in that order.
func (m *PrometheusMetrics) IncTxFailure(path, chain, errDesc string) {
	failureCounter := m.TxFailureError.WithLabelValues(path, chain, errDesc)
	failureCounter.Inc()
}

func NewPrometheusMetrics() *PrometheusMetrics {
packetLabels := []string{"path", "chain", "channel", "port", "type"}
heightLabels := []string{"chain"}
walletLabels := []string{"chain", "key", "address", "denom"}
txFailureLabels := []string{"path", "chain", "cause"}
registry := prometheus.NewRegistry()
registerer := promauto.With(registry)
return &PrometheusMetrics{
Expand All @@ -62,5 +68,9 @@ func NewPrometheusMetrics() *PrometheusMetrics {
Name: "cosmos_relayer_fees_spent",
Help: "The amount of fees spent from the relayer's wallet",
}, walletLabels),
TxFailureError: registerer.NewCounterVec(prometheus.CounterOpts{
Name: "cosmos_relayer_tx_errors_total",
Help: "The total number of tx failures broken up into categories. See https://github.com/cosmos/relayer/blob/main/docs/advanced_usage.md#monitoring for list of catagories. 'Tx Failure' is the catch-all category",
}, txFailureLabels),
}
}
Loading