Skip to content

Commit

Permalink
Merge pull request #13974 from prometheus/measure-restore-time-rules
Browse files: browse the repository at this point in the history
Rule Manager: Add `rule_group_last_restore_duration_seconds` to measure restore time per rule group
  • Loading branch information
gotjosh authored Apr 24, 2024
2 parents d15869a + 5beb2fe commit 4ac7806
Show file tree
Hide file tree
Showing 2 changed files with 27 additions and 12 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
## unreleased

* [CHANGE] TSDB: Fix the predicate checking for blocks which are beyond the retention period to include the ones right at the retention boundary. #9633
* [ENHANCEMENT] Rules: Add `rule_group_last_restore_duration_seconds` to measure the time it takes to restore a rule group. #13974

## 2.51.2 / 2024-04-09

Expand Down
38 changes: 26 additions & 12 deletions rules/group.go
Original file line number Diff line number Diff line change
Expand Up @@ -230,7 +230,11 @@ func (g *Group) run(ctx context.Context) {
g.evalIterationFunc(ctx, g, evalTimestamp)
}

g.RestoreForState(time.Now())
restoreStartTime := time.Now()
g.RestoreForState(restoreStartTime)
totalRestoreTimeSeconds := time.Since(restoreStartTime).Seconds()
g.metrics.GroupLastRestoreDuration.WithLabelValues(GroupKey(g.file, g.name)).Set(totalRestoreTimeSeconds)
level.Debug(g.logger).Log("msg", "'for' state restoration completed", "duration_seconds", totalRestoreTimeSeconds)
g.shouldRestore = false
}

Expand Down Expand Up @@ -779,17 +783,18 @@ const namespace = "prometheus"

// Metrics for rule evaluation.
type Metrics struct {
EvalDuration prometheus.Summary
IterationDuration prometheus.Summary
IterationsMissed *prometheus.CounterVec
IterationsScheduled *prometheus.CounterVec
EvalTotal *prometheus.CounterVec
EvalFailures *prometheus.CounterVec
GroupInterval *prometheus.GaugeVec
GroupLastEvalTime *prometheus.GaugeVec
GroupLastDuration *prometheus.GaugeVec
GroupRules *prometheus.GaugeVec
GroupSamples *prometheus.GaugeVec
EvalDuration prometheus.Summary
IterationDuration prometheus.Summary
IterationsMissed *prometheus.CounterVec
IterationsScheduled *prometheus.CounterVec
EvalTotal *prometheus.CounterVec
EvalFailures *prometheus.CounterVec
GroupInterval *prometheus.GaugeVec
GroupLastEvalTime *prometheus.GaugeVec
GroupLastDuration *prometheus.GaugeVec
GroupLastRestoreDuration *prometheus.GaugeVec
GroupRules *prometheus.GaugeVec
GroupSamples *prometheus.GaugeVec
}

// NewGroupMetrics creates a new instance of Metrics and registers it with the provided registerer,
Expand Down Expand Up @@ -865,6 +870,14 @@ func NewGroupMetrics(reg prometheus.Registerer) *Metrics {
},
[]string{"rule_group"},
),
GroupLastRestoreDuration: prometheus.NewGaugeVec(
prometheus.GaugeOpts{
Namespace: namespace,
Name: "rule_group_last_restore_duration_seconds",
Help: "The duration of the last alert rules alerts restoration using the `ALERTS_FOR_STATE` series.",
},
[]string{"rule_group"},
),
GroupRules: prometheus.NewGaugeVec(
prometheus.GaugeOpts{
Namespace: namespace,
Expand Down Expand Up @@ -894,6 +907,7 @@ func NewGroupMetrics(reg prometheus.Registerer) *Metrics {
m.GroupInterval,
m.GroupLastEvalTime,
m.GroupLastDuration,
m.GroupLastRestoreDuration,
m.GroupRules,
m.GroupSamples,
)
Expand Down

0 comments on commit 4ac7806

Please sign in to comment.