Remove user from state key metric value (#5453)
* Implementing Bucket index sync status file (#5446)

* Implementing Bucket index sync status

Signed-off-by: Alan Protasio <alanprot@gmail.com>

* fixing bug when returning from cache

Signed-off-by: Alan Protasio <alanprot@gmail.com>

* Addressing some comments

Signed-off-by: Alan Protasio <alanprot@gmail.com>

* Changelog

Signed-off-by: Alan Protasio <alanprot@gmail.com>

---------

Signed-off-by: Alan Protasio <alanprot@gmail.com>
Signed-off-by: Yijie Qin <qinyijie@amazon.com>

* Implementing multi level index cache (#5451)

Signed-off-by: Yijie Qin <qinyijie@amazon.com>

* remove the user from am state replication key

Signed-off-by: Yijie Qin <qinyijie@amazon.com>

* expose the key label

Signed-off-by: Yijie Qin <qinyijie@amazon.com>

* add Changelog

Signed-off-by: Yijie Qin <qinyijie@amazon.com>

* fix comments

Signed-off-by: Yijie Qin <qinyijie@amazon.com>

* change to use type

Signed-off-by: Yijie Qin <qinyijie@amazon.com>

* address comment

Signed-off-by: Yijie Qin <qinyijie@amazon.com>

* fix comment

Signed-off-by: Yijie Qin <qinyijie@amazon.com>

* use type instead of key

Signed-off-by: Yijie Qin <qinyijie@amazon.com>

---------

Signed-off-by: Alan Protasio <alanprot@gmail.com>
Signed-off-by: Yijie Qin <qinyijie@amazon.com>
Co-authored-by: Alan Protasio <approtas@amazon.com>
qinxx108 and alanprot authored Jul 13, 2023
1 parent 241fa91 commit b3cab52
Showing 5 changed files with 96 additions and 26 deletions.
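
At a high level, this commit stops using the full state replication key (which embeds the tenant id, e.g. "nflog:user-1") as a metric label and keeps only its type prefix, so label cardinality no longer grows with the number of tenants. Below is a minimal, self-contained sketch of that idea, not the Cortex code itself, assuming replication keys follow the "<type>:<userID>" convention seen in the tests in this diff; the helper name stateTypeFromKey is illustrative only.

package main

import (
	"fmt"
	"strings"

	"github.com/prometheus/client_golang/prometheus"
)

// stateTypeFromKey mirrors the helper this commit adds: replication keys look
// like "<type>:<userID>", and only the part before the first ':' is used as
// the metric label value.
func stateTypeFromKey(key string) string {
	if i := strings.IndexByte(key, ':'); i >= 0 {
		return key[:i]
	}
	return key
}

func main() {
	merges := prometheus.NewCounterVec(prometheus.CounterOpts{
		Name: "alertmanager_partial_state_merges_total",
		Help: "Number of times we have received a partial state to merge for a key.",
	}, []string{"type"})

	// Two tenants with the same state type now share one series instead of
	// producing one series per "<type>:<userID>" key.
	merges.WithLabelValues(stateTypeFromKey("nflog:user-1")).Inc()
	merges.WithLabelValues(stateTypeFromKey("nflog:user-2")).Inc()

	fmt.Println(stateTypeFromKey("nflog:user-1")) // prints "nflog"
}

With the previous "key" label, the two Inc calls above would have produced two separate series, one per tenant.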
1 change: 1 addition & 0 deletions CHANGELOG.md
@@ -52,6 +52,7 @@
* [BUGFIX] Store Gateway: Fix bug in store gateway ring comparison logic. #5426
* [BUGFIX] Ring: Fix bug in consistency of Get func in a scaling zone-aware ring. #5429
* [BUGFIX] Query Frontend: Fix bug of failing to cancel downstream request context in query frontend v2 mode (query scheduler enabled). #5447
* [BUGFIX] Alertmanager: Remove the user id from state replication key metric label value. #5453

## 1.15.1 2023-04-26

16 changes: 8 additions & 8 deletions pkg/alertmanager/alertmanager_metrics.go
@@ -168,19 +168,19 @@ func newAlertmanagerMetrics() *alertmanagerMetrics {
partialMerges: prometheus.NewDesc(
"cortex_alertmanager_partial_state_merges_total",
"Number of times we have received a partial state to merge for a key.",
[]string{"user"}, nil),
[]string{"user", "type"}, nil),
partialMergesFailed: prometheus.NewDesc(
"cortex_alertmanager_partial_state_merges_failed_total",
"Number of times we have failed to merge a partial state received for a key.",
[]string{"user"}, nil),
[]string{"user", "type"}, nil),
replicationTotal: prometheus.NewDesc(
"cortex_alertmanager_state_replication_total",
"Number of times we have tried to replicate a state to other alertmanagers",
[]string{"user"}, nil),
[]string{"user", "type"}, nil),
replicationFailed: prometheus.NewDesc(
"cortex_alertmanager_state_replication_failed_total",
"Number of times we have failed to replicate a state to other alertmanagers",
[]string{"user"}, nil),
[]string{"user", "type"}, nil),
fetchReplicaStateTotal: prometheus.NewDesc(
"cortex_alertmanager_state_fetch_replica_state_total",
"Number of times we have tried to read and merge the full state from another replica.",
@@ -317,10 +317,10 @@ func (m *alertmanagerMetrics) Collect(out chan<- prometheus.Metric) {

data.SendMaxOfGaugesPerUser(out, m.configHashValue, "alertmanager_config_hash")

data.SendSumOfCountersPerUser(out, m.partialMerges, "alertmanager_partial_state_merges_total")
data.SendSumOfCountersPerUser(out, m.partialMergesFailed, "alertmanager_partial_state_merges_failed_total")
data.SendSumOfCountersPerUser(out, m.replicationTotal, "alertmanager_state_replication_total")
data.SendSumOfCountersPerUser(out, m.replicationFailed, "alertmanager_state_replication_failed_total")
data.SendSumOfCountersPerUserWithLabels(out, m.partialMerges, "alertmanager_partial_state_merges_total", "type")
data.SendSumOfCountersPerUserWithLabels(out, m.partialMergesFailed, "alertmanager_partial_state_merges_failed_total", "type")
data.SendSumOfCountersPerUserWithLabels(out, m.replicationTotal, "alertmanager_state_replication_total", "type")
data.SendSumOfCountersPerUserWithLabels(out, m.replicationFailed, "alertmanager_state_replication_failed_total", "type")
data.SendSumOfCounters(out, m.fetchReplicaStateTotal, "alertmanager_state_fetch_replica_state_total")
data.SendSumOfCounters(out, m.fetchReplicaStateFailed, "alertmanager_state_fetch_replica_state_failed_total")
data.SendSumOfCounters(out, m.initialSyncTotal, "alertmanager_state_initial_sync_total")
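
The Collect changes above switch these four metrics from SendSumOfCountersPerUser to SendSumOfCountersPerUserWithLabels so that the per-tenant sums keep the new "type" label when they are re-exported with the cortex_ prefix. A rough sketch of what that means for the exposed series, using plain Go maps rather than the Cortex util helpers and values borrowed from the tests below:

package main

import "fmt"

func main() {
	// Per-tenant values of alertmanager_partial_state_merges_total, keyed by
	// user and then by the value of the "type" label (as in the tests below).
	perTenant := map[string]map[string]float64{
		"user1": {"nfl": 3},
		"user2": {"nfl": 30},
	}

	// Summing per (user, type) pair keeps both labels on the aggregated
	// cortex_ metric instead of collapsing "type" away.
	for user, byType := range perTenant {
		for typ, v := range byType {
			fmt.Printf("cortex_alertmanager_partial_state_merges_total{type=%q,user=%q} %g\n", typ, user, v)
		}
	}
}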
56 changes: 56 additions & 0 deletions pkg/alertmanager/alertmanager_metrics_test.go
@@ -211,6 +211,17 @@ func TestAlertmanagerMetricsStore(t *testing.T) {
cortex_alertmanager_notifications_total{integration="wechat",user="user2"} 20
cortex_alertmanager_notifications_total{integration="wechat",user="user3"} 200
# HELP cortex_alertmanager_partial_state_merges_failed_total Number of times we have failed to merge a partial state received for a key.
# TYPE cortex_alertmanager_partial_state_merges_failed_total counter
cortex_alertmanager_partial_state_merges_failed_total{type="nfl",user="user1"} 2
cortex_alertmanager_partial_state_merges_failed_total{type="nfl",user="user2"} 20
cortex_alertmanager_partial_state_merges_failed_total{type="nfl",user="user3"} 200
# HELP cortex_alertmanager_partial_state_merges_total Number of times we have received a partial state to merge for a key.
# TYPE cortex_alertmanager_partial_state_merges_total counter
cortex_alertmanager_partial_state_merges_total{type="nfl",user="user1"} 3
cortex_alertmanager_partial_state_merges_total{type="nfl",user="user2"} 30
cortex_alertmanager_partial_state_merges_total{type="nfl",user="user3"} 300
# HELP cortex_alertmanager_silences How many silences by state.
# TYPE cortex_alertmanager_silences gauge
cortex_alertmanager_silences{state="active",user="user1"} 1
@@ -506,6 +517,17 @@ func TestAlertmanagerMetricsRemoval(t *testing.T) {
cortex_alertmanager_notifications_total{integration="wechat",user="user2"} 20
cortex_alertmanager_notifications_total{integration="wechat",user="user3"} 200
# HELP cortex_alertmanager_partial_state_merges_failed_total Number of times we have failed to merge a partial state received for a key.
# TYPE cortex_alertmanager_partial_state_merges_failed_total counter
cortex_alertmanager_partial_state_merges_failed_total{type="nfl",user="user1"} 2
cortex_alertmanager_partial_state_merges_failed_total{type="nfl",user="user2"} 20
cortex_alertmanager_partial_state_merges_failed_total{type="nfl",user="user3"} 200
# HELP cortex_alertmanager_partial_state_merges_total Number of times we have received a partial state to merge for a key.
# TYPE cortex_alertmanager_partial_state_merges_total counter
cortex_alertmanager_partial_state_merges_total{type="nfl",user="user1"} 3
cortex_alertmanager_partial_state_merges_total{type="nfl",user="user2"} 30
cortex_alertmanager_partial_state_merges_total{type="nfl",user="user3"} 300
# HELP cortex_alertmanager_silences How many silences by state.
# TYPE cortex_alertmanager_silences gauge
cortex_alertmanager_silences{state="active",user="user1"} 1
@@ -758,6 +780,15 @@ func TestAlertmanagerMetricsRemoval(t *testing.T) {
cortex_alertmanager_notifications_total{integration="wechat",user="user1"} 2
cortex_alertmanager_notifications_total{integration="wechat",user="user2"} 20
# HELP cortex_alertmanager_partial_state_merges_failed_total Number of times we have failed to merge a partial state received for a key.
# TYPE cortex_alertmanager_partial_state_merges_failed_total counter
cortex_alertmanager_partial_state_merges_failed_total{type="nfl",user="user1"} 2
cortex_alertmanager_partial_state_merges_failed_total{type="nfl",user="user2"} 20
# HELP cortex_alertmanager_partial_state_merges_total Number of times we have received a partial state to merge for a key.
# TYPE cortex_alertmanager_partial_state_merges_total counter
cortex_alertmanager_partial_state_merges_total{type="nfl",user="user1"} 3
cortex_alertmanager_partial_state_merges_total{type="nfl",user="user2"} 30
# HELP cortex_alertmanager_silences How many silences by state.
# TYPE cortex_alertmanager_silences gauge
cortex_alertmanager_silences{state="active",user="user1"} 1
@@ -898,6 +929,10 @@ func populateAlertmanager(base float64) *prometheus.Registry {
lm.size.Set(100 * base)
lm.insertFailures.Add(7 * base)

sr := newStateReplicationMetrics(reg)
sr.partialStateMergesFailed.WithLabelValues("nfl").Add(base * 2)
sr.partialStateMergesTotal.WithLabelValues("nfl").Add(base * 3)

return reg
}

@@ -1130,3 +1165,24 @@ func newLimiterMetrics(r prometheus.Registerer) *limiterMetrics {
insertFailures: insertAlertFailures,
}
}

type stateReplicationMetrics struct {
partialStateMergesTotal *prometheus.CounterVec
partialStateMergesFailed *prometheus.CounterVec
}

func newStateReplicationMetrics(r prometheus.Registerer) *stateReplicationMetrics {
partialStateMergesTotal := promauto.With(r).NewCounterVec(prometheus.CounterOpts{
Name: "alertmanager_partial_state_merges_total",
Help: "Number of times we have received a partial state to merge for a key.",
}, []string{"type"})
partialStateMergesFailed := promauto.With(r).NewCounterVec(prometheus.CounterOpts{
Name: "alertmanager_partial_state_merges_failed_total",
Help: "Number of times we have failed to merge a partial state received for a key.",
}, []string{"type"})

return &stateReplicationMetrics{
partialStateMergesTotal: partialStateMergesTotal,
partialStateMergesFailed: partialStateMergesFailed,
}
}
39 changes: 26 additions & 13 deletions pkg/alertmanager/state_replication.go
@@ -3,6 +3,7 @@ package alertmanager
import (
"context"
"fmt"
"strings"
"sync"
"time"

@@ -79,19 +80,19 @@ func newReplicatedStates(userID string, rf int, re Replicator, st alertstore.Ale
partialStateMergesTotal: promauto.With(r).NewCounterVec(prometheus.CounterOpts{
Name: "alertmanager_partial_state_merges_total",
Help: "Number of times we have received a partial state to merge for a key.",
}, []string{"key"}),
}, []string{"type"}),
partialStateMergesFailed: promauto.With(r).NewCounterVec(prometheus.CounterOpts{
Name: "alertmanager_partial_state_merges_failed_total",
Help: "Number of times we have failed to merge a partial state received for a key.",
}, []string{"key"}),
}, []string{"type"}),
stateReplicationTotal: promauto.With(r).NewCounterVec(prometheus.CounterOpts{
Name: "alertmanager_state_replication_total",
Help: "Number of times we have tried to replicate a state to other alertmanagers.",
}, []string{"key"}),
}, []string{"type"}),
stateReplicationFailed: promauto.With(r).NewCounterVec(prometheus.CounterOpts{
Name: "alertmanager_state_replication_failed_total",
Help: "Number of times we have failed to replicate a state to other alertmanagers.",
}, []string{"key"}),
}, []string{"type"}),
fetchReplicaStateTotal: promauto.With(r).NewCounter(prometheus.CounterOpts{
Name: "alertmanager_state_fetch_replica_state_total",
Help: "Number of times we have tried to read and merge the full state from another replica.",
@@ -131,10 +132,11 @@ func (s *state) AddState(key string, cs cluster.State, _ prometheus.Registerer)

s.states[key] = cs

s.partialStateMergesTotal.WithLabelValues(key)
s.partialStateMergesFailed.WithLabelValues(key)
s.stateReplicationTotal.WithLabelValues(key)
s.stateReplicationFailed.WithLabelValues(key)
stateType := getStateTypeFromKey(key)
s.partialStateMergesTotal.WithLabelValues(stateType)
s.partialStateMergesFailed.WithLabelValues(stateType)
s.stateReplicationTotal.WithLabelValues(stateType)
s.stateReplicationFailed.WithLabelValues(stateType)

return &stateChannel{
s: s,
@@ -144,18 +146,19 @@ func (s *state) AddState(key string, cs cluster.State, _ prometheus.Registerer)

// MergePartialState merges a received partial message with an internal state.
func (s *state) MergePartialState(p *clusterpb.Part) error {
s.partialStateMergesTotal.WithLabelValues(p.Key).Inc()
stateType := getStateTypeFromKey(p.Key)
s.partialStateMergesTotal.WithLabelValues(stateType).Inc()

s.mtx.Lock()
defer s.mtx.Unlock()
st, ok := s.states[p.Key]
if !ok {
s.partialStateMergesFailed.WithLabelValues(p.Key).Inc()
s.partialStateMergesFailed.WithLabelValues(stateType).Inc()
return fmt.Errorf("key not found while merging")
}

if err := st.Merge(p.Data); err != nil {
s.partialStateMergesFailed.WithLabelValues(p.Key).Inc()
s.partialStateMergesFailed.WithLabelValues(stateType).Inc()
return err
}

@@ -285,9 +288,10 @@ func (s *state) running(ctx context.Context) error {
return nil
}

s.stateReplicationTotal.WithLabelValues(p.Key).Inc()
stateType := getStateTypeFromKey(p.Key)
s.stateReplicationTotal.WithLabelValues(stateType).Inc()
if err := s.replicator.ReplicateStateForUser(ctx, s.userID, p); err != nil {
s.stateReplicationFailed.WithLabelValues(p.Key).Inc()
s.stateReplicationFailed.WithLabelValues(stateType).Inc()
level.Error(s.logger).Log("msg", "failed to replicate state to other alertmanagers", "user", s.userID, "key", p.Key, "err", err)
}
case <-ctx.Done():
@@ -314,3 +318,12 @@ type stateChannel struct {
func (c *stateChannel) Broadcast(b []byte) {
c.s.broadcast(c.key, b)
}

// getStateTypeFromKey extracts the state type from the state key (the part before the first ':').
func getStateTypeFromKey(key string) string {
index := strings.IndexByte(key, ':')
if index < 0 {
return key
}
return key[:index]
}
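
For illustration only, since no such test is part of this commit, a table-style check of getStateTypeFromKey could look like the following, assuming it sits in the alertmanager package next to the function and uses the standard testing package:

// Hypothetical table test (not in this commit) showing the expected behavior
// of getStateTypeFromKey for keys with and without a ':' separator.
func TestGetStateTypeFromKey(t *testing.T) {
	cases := map[string]string{
		"nflog:user-1": "nflog",
		"nfl:user-2":   "nfl",
		"nflog":        "nflog", // no separator: the whole key is returned
	}
	for key, want := range cases {
		if got := getStateTypeFromKey(key); got != want {
			t.Errorf("getStateTypeFromKey(%q) = %q, want %q", key, got, want)
		}
	}
}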
10 changes: 5 additions & 5 deletions pkg/alertmanager/state_replication_test.go
@@ -143,7 +143,7 @@ func TestStateReplication(t *testing.T) {
require.NoError(t, s.WaitReady(ctx))
}

ch := s.AddState("nflog", &fakeState{}, reg)
ch := s.AddState("nflog:user-1", &fakeState{}, reg)

part := tt.message
d, err := part.Marshal()
@@ -166,10 +166,10 @@ alertmanager_state_fetch_replica_state_failed_total 0
alertmanager_state_fetch_replica_state_total 1
# HELP alertmanager_partial_state_merges_failed_total Number of times we have failed to merge a partial state received for a key.
# TYPE alertmanager_partial_state_merges_failed_total counter
alertmanager_partial_state_merges_failed_total{key="nflog"} 0
alertmanager_partial_state_merges_failed_total{type="nflog"} 0
# HELP alertmanager_partial_state_merges_total Number of times we have received a partial state to merge for a key.
# TYPE alertmanager_partial_state_merges_total counter
alertmanager_partial_state_merges_total{key="nflog"} 0
alertmanager_partial_state_merges_total{type="nflog"} 0
# HELP alertmanager_state_initial_sync_completed_total Number of times we have completed syncing initial state for each possible outcome.
# TYPE alertmanager_state_initial_sync_completed_total counter
alertmanager_state_initial_sync_completed_total{outcome="failed"} 0
@@ -181,10 +181,10 @@ alertmanager_state_initial_sync_completed_total{outcome="user-not-found"} 0
alertmanager_state_initial_sync_total 1
# HELP alertmanager_state_replication_failed_total Number of times we have failed to replicate a state to other alertmanagers.
# TYPE alertmanager_state_replication_failed_total counter
alertmanager_state_replication_failed_total{key="nflog"} 0
alertmanager_state_replication_failed_total{type="nflog"} 0
# HELP alertmanager_state_replication_total Number of times we have tried to replicate a state to other alertmanagers.
# TYPE alertmanager_state_replication_total counter
alertmanager_state_replication_total{key="nflog"} 1
alertmanager_state_replication_total{type="nflog"} 1
`),
"alertmanager_state_fetch_replica_state_failed_total",
"alertmanager_state_fetch_replica_state_total",
