Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Alertmanager: Update Alertmanager to commit 80b3cb0 #7384

Merged
merged 6 commits into from
Feb 15, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@

### Grafana Mimir

* [ENHANCEMENT] Alertmanager: Adds metric `cortex_alertmanager_notifications_suppressed_total` that counts the total number of notifications suppressed for being silenced, inhibited, outside of active time intervals or within muted time intervals. #7384
* [CHANGE] Alertmanager: Deprecates the `v1` API. All `v1` API endpoints now respond with a JSON deprecation notice and a status code of `410`. All endpoints have a `v2` equivalent. The list of endpoints is: #7103
* `<alertmanager-web.external-url>/api/v1/alerts`
* `<alertmanager-web.external-url>/api/v1/receivers`
Expand Down
2 changes: 1 addition & 1 deletion go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@ require (
github.com/opentracing-contrib/go-stdlib v1.0.0
github.com/opentracing/opentracing-go v1.2.1-0.20220228012449-10b1cf09e00b
github.com/pkg/errors v0.9.1
github.com/prometheus/alertmanager v0.26.1-0.20240208095903-f69a5086657b
github.com/prometheus/alertmanager v0.26.1-0.20240215111258-80b3cb072fbd
github.com/prometheus/client_golang v1.18.0
github.com/prometheus/client_model v0.5.0
github.com/prometheus/common v0.46.0
Expand Down
4 changes: 2 additions & 2 deletions go.sum
Original file line number Diff line number Diff line change
Expand Up @@ -769,8 +769,8 @@ github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2 h1:Jamvg5psRI
github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
github.com/posener/complete v1.1.1/go.mod h1:em0nMJCgc9GFtwrmVmEMR/ZL6WyhyjMBndrE9hABlRI=
github.com/posener/complete v1.2.3/go.mod h1:WZIdtGGp+qx0sLrYKtIRAruyNpv6hFCicSgv7Sy7s/s=
github.com/prometheus/alertmanager v0.26.1-0.20240208095903-f69a5086657b h1:qdLfwUabfhvvvOhnObLgRfXo5wq2V3pZSdLhUMgN4QE=
github.com/prometheus/alertmanager v0.26.1-0.20240208095903-f69a5086657b/go.mod h1:8Ia/R3urPmbzJ8OsdvmZvIprDwvwmYCmUbwBL+jlPOE=
github.com/prometheus/alertmanager v0.26.1-0.20240215111258-80b3cb072fbd h1:41+1zd8AibDiY4xov0REU1rNW+Kg+ioVVQRynAXRvZg=
github.com/prometheus/alertmanager v0.26.1-0.20240215111258-80b3cb072fbd/go.mod h1:8Ia/R3urPmbzJ8OsdvmZvIprDwvwmYCmUbwBL+jlPOE=
github.com/prometheus/client_golang v0.9.1/go.mod h1:7SWBe2y4D6OKWSNQJUaRYU/AaXPKyh/dDVn+NZz0KFw=
github.com/prometheus/client_golang v1.0.0/go.mod h1:db9x61etRT2tGnBNRi70OPL5FsnadC4Ky3P0J6CfImo=
github.com/prometheus/client_golang v1.4.0/go.mod h1:e9GMxYsXl05ICDXkRhurwBS4Q3OK1iX/F2sw+iXX5zU=
Expand Down
6 changes: 3 additions & 3 deletions pkg/alertmanager/alertmanager_config_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -102,11 +102,11 @@ inhibit_rules:
},
expected: []string{
`level=debug user=4 msg="Parsing with UTF-8 matchers parser, with fallback to classic matchers parser" input="foo=" origin=test`,
`level=warn user=4 msg="Alertmanager is moving to a new parser for labels and matchers, and this input is incompatible. Alertmanager has instead parsed the input using the old matchers parser as a fallback. To make this input compatible with the new parser please make sure all regular expressions and values are double-quoted. If you are still seeing this message please open an issue." input="foo=" origin=test err="end of input: expected label value" suggestion="foo=\"\""`,
`level=warn user=4 msg="Alertmanager is moving to a new parser for labels and matchers, and this input is incompatible. Alertmanager has instead parsed the input using the classic matchers parser as a fallback. To make this input compatible with the UTF-8 matchers parser please make sure all regular expressions and values are double-quoted. If you are still seeing this message please open an issue." input="foo=" origin=test err="end of input: expected label value" suggestion="foo=\"\""`,
`level=debug user=4 msg="Parsing with UTF-8 matchers parser, with fallback to classic matchers parser" input="bar=" origin=test`,
`level=warn user=4 msg="Alertmanager is moving to a new parser for labels and matchers, and this input is incompatible. Alertmanager has instead parsed the input using the old matchers parser as a fallback. To make this input compatible with the new parser please make sure all regular expressions and values are double-quoted. If you are still seeing this message please open an issue." input="bar=" origin=test err="end of input: expected label value" suggestion="bar=\"\""`,
`level=warn user=4 msg="Alertmanager is moving to a new parser for labels and matchers, and this input is incompatible. Alertmanager has instead parsed the input using the classic matchers parser as a fallback. To make this input compatible with the UTF-8 matchers parser please make sure all regular expressions and values are double-quoted. If you are still seeing this message please open an issue." input="bar=" origin=test err="end of input: expected label value" suggestion="bar=\"\""`,
`level=debug user=4 msg="Parsing with UTF-8 matchers parser, with fallback to classic matchers parser" input="baz=" origin=test`,
`level=warn user=4 msg="Alertmanager is moving to a new parser for labels and matchers, and this input is incompatible. Alertmanager has instead parsed the input using the old matchers parser as a fallback. To make this input compatible with the new parser please make sure all regular expressions and values are double-quoted. If you are still seeing this message please open an issue." input="baz=" origin=test err="end of input: expected label value" suggestion="baz=\"\""`,
`level=warn user=4 msg="Alertmanager is moving to a new parser for labels and matchers, and this input is incompatible. Alertmanager has instead parsed the input using the classic matchers parser as a fallback. To make this input compatible with the UTF-8 matchers parser please make sure all regular expressions and values are double-quoted. If you are still seeing this message please open an issue." input="baz=" origin=test err="end of input: expected label value" suggestion="baz=\"\""`,
},
}, {
name: "config contains disagreement",
Expand Down
7 changes: 7 additions & 0 deletions pkg/alertmanager/alertmanager_metrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@ type alertmanagerMetrics struct {
numFailedNotifications *prometheus.Desc
numNotificationRequestsTotal *prometheus.Desc
numNotificationRequestsFailedTotal *prometheus.Desc
numNotificationSuppressedTotal *prometheus.Desc
notificationLatencySeconds *prometheus.Desc

// exported metrics, gathered from Alertmanager nflog
Expand Down Expand Up @@ -107,6 +108,10 @@ func newAlertmanagerMetrics() *alertmanagerMetrics {
"cortex_alertmanager_notification_requests_failed_total",
"The total number of failed notification requests.",
[]string{"user", "integration"}, nil),
numNotificationSuppressedTotal: prometheus.NewDesc(
"cortex_alertmanager_notifications_suppressed_total",
"The total number of notifications suppressed for being silenced, inhibited, outside of active time intervals or within muted time intervals.",
[]string{"user", "reason"}, nil),
notificationLatencySeconds: prometheus.NewDesc(
"cortex_alertmanager_notification_latency_seconds",
"The latency of notifications in seconds.",
Expand Down Expand Up @@ -287,6 +292,7 @@ func (m *alertmanagerMetrics) Describe(out chan<- *prometheus.Desc) {
out <- m.numFailedNotifications
out <- m.numNotificationRequestsTotal
out <- m.numNotificationRequestsFailedTotal
out <- m.numNotificationSuppressedTotal
out <- m.notificationLatencySeconds
out <- m.markerAlerts
out <- m.nflogGCDuration
Expand Down Expand Up @@ -339,6 +345,7 @@ func (m *alertmanagerMetrics) Collect(out chan<- prometheus.Metric) {
data.SendSumOfCountersPerTenant(out, m.numFailedNotifications, "alertmanager_notifications_failed_total", dskit_metrics.WithLabels("integration", "reason"), dskit_metrics.WithSkipZeroValueMetrics)
data.SendSumOfCountersPerTenant(out, m.numNotificationRequestsTotal, "alertmanager_notification_requests_total", dskit_metrics.WithLabels("integration"), dskit_metrics.WithSkipZeroValueMetrics)
data.SendSumOfCountersPerTenant(out, m.numNotificationRequestsFailedTotal, "alertmanager_notification_requests_failed_total", dskit_metrics.WithLabels("integration"), dskit_metrics.WithSkipZeroValueMetrics)
data.SendSumOfCountersPerTenant(out, m.numNotificationSuppressedTotal, "alertmanager_notifications_suppressed_total", dskit_metrics.WithLabels("reason"), dskit_metrics.WithSkipZeroValueMetrics)
data.SendSumOfHistograms(out, m.notificationLatencySeconds, "alertmanager_notification_latency_seconds")
data.SendSumOfGaugesPerTenantWithLabels(out, m.markerAlerts, "alertmanager_alerts", "state")

Expand Down
45 changes: 45 additions & 0 deletions pkg/alertmanager/alertmanager_metrics_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -145,6 +145,17 @@ func TestAlertmanagerMetricsStore(t *testing.T) {
cortex_alertmanager_notifications_failed_total{integration="telegram",reason="clientError",user="user1"} 9
cortex_alertmanager_notifications_failed_total{integration="telegram",reason="clientError",user="user2"} 90
cortex_alertmanager_notifications_failed_total{integration="telegram",reason="clientError",user="user3"} 900
# HELP cortex_alertmanager_notifications_suppressed_total The total number of notifications suppressed for being silenced, inhibited, outside of active time intervals or within muted time intervals.
# TYPE cortex_alertmanager_notifications_suppressed_total counter
cortex_alertmanager_notifications_suppressed_total{reason="active_time_interval",user="user1"} 3
cortex_alertmanager_notifications_suppressed_total{reason="active_time_interval",user="user2"} 30
cortex_alertmanager_notifications_suppressed_total{reason="active_time_interval",user="user3"} 300
cortex_alertmanager_notifications_suppressed_total{reason="inhibition",user="user1"} 1
cortex_alertmanager_notifications_suppressed_total{reason="inhibition",user="user2"} 10
cortex_alertmanager_notifications_suppressed_total{reason="inhibition",user="user3"} 100
cortex_alertmanager_notifications_suppressed_total{reason="mute_time_interval",user="user1"} 2
cortex_alertmanager_notifications_suppressed_total{reason="mute_time_interval",user="user2"} 20
cortex_alertmanager_notifications_suppressed_total{reason="mute_time_interval",user="user3"} 200
# HELP cortex_alertmanager_notification_requests_total The total number of attempted notification requests.
# TYPE cortex_alertmanager_notification_requests_total counter
cortex_alertmanager_notification_requests_total{integration="opsgenie",user="user1"} 5
Expand Down Expand Up @@ -531,6 +542,18 @@ func TestAlertmanagerMetricsRemoval(t *testing.T) {
cortex_alertmanager_notifications_failed_total{integration="telegram",reason="clientError",user="user2"} 90
cortex_alertmanager_notifications_failed_total{integration="telegram",reason="clientError",user="user3"} 900

# HELP cortex_alertmanager_notifications_suppressed_total The total number of notifications suppressed for being silenced, inhibited, outside of active time intervals or within muted time intervals.
# TYPE cortex_alertmanager_notifications_suppressed_total counter
cortex_alertmanager_notifications_suppressed_total{reason="active_time_interval",user="user1"} 3
cortex_alertmanager_notifications_suppressed_total{reason="active_time_interval",user="user2"} 30
cortex_alertmanager_notifications_suppressed_total{reason="active_time_interval",user="user3"} 300
cortex_alertmanager_notifications_suppressed_total{reason="inhibition",user="user1"} 1
cortex_alertmanager_notifications_suppressed_total{reason="inhibition",user="user2"} 10
cortex_alertmanager_notifications_suppressed_total{reason="inhibition",user="user3"} 100
cortex_alertmanager_notifications_suppressed_total{reason="mute_time_interval",user="user1"} 2
cortex_alertmanager_notifications_suppressed_total{reason="mute_time_interval",user="user2"} 20
cortex_alertmanager_notifications_suppressed_total{reason="mute_time_interval",user="user3"} 200

# HELP cortex_alertmanager_notifications_total The total number of attempted notifications.
# TYPE cortex_alertmanager_notifications_total counter
cortex_alertmanager_notifications_total{integration="opsgenie",user="user1"} 5
Expand Down Expand Up @@ -822,6 +845,15 @@ func TestAlertmanagerMetricsRemoval(t *testing.T) {
cortex_alertmanager_notifications_failed_total{integration="telegram",reason="clientError",user="user1"} 9
cortex_alertmanager_notifications_failed_total{integration="telegram",reason="clientError",user="user2"} 90

# HELP cortex_alertmanager_notifications_suppressed_total The total number of notifications suppressed for being silenced, inhibited, outside of active time intervals or within muted time intervals.
# TYPE cortex_alertmanager_notifications_suppressed_total counter
cortex_alertmanager_notifications_suppressed_total{reason="active_time_interval",user="user1"} 3
cortex_alertmanager_notifications_suppressed_total{reason="active_time_interval",user="user2"} 30
cortex_alertmanager_notifications_suppressed_total{reason="inhibition",user="user1"} 1
cortex_alertmanager_notifications_suppressed_total{reason="inhibition",user="user2"} 10
cortex_alertmanager_notifications_suppressed_total{reason="mute_time_interval",user="user1"} 2
cortex_alertmanager_notifications_suppressed_total{reason="mute_time_interval",user="user2"} 20

# HELP cortex_alertmanager_notifications_total The total number of attempted notifications.
# TYPE cortex_alertmanager_notifications_total counter
cortex_alertmanager_notifications_total{integration="opsgenie",user="user1"} 5
Expand Down Expand Up @@ -989,6 +1021,9 @@ func populateAlertmanager(base float64) *prometheus.Registry {
nm.numNotificationRequestsFailedTotal.WithLabelValues(integration).Add(base * float64(i))
nm.notificationLatencySeconds.WithLabelValues(integration).Observe(base * float64(i) * 0.025)
}
for i, reason := range possibleSuppressedReason {
nm.numNotificationSuppressedTotal.WithLabelValues(reason).Add(base * float64(i))
}

m := newMarkerMetrics(reg)
m.alerts.WithLabelValues(string(types.AlertStateActive)).Add(base)
Expand Down Expand Up @@ -1157,6 +1192,7 @@ type notifyMetrics struct {
numTotalFailedNotifications *prometheus.CounterVec
numNotificationRequestsTotal *prometheus.CounterVec
numNotificationRequestsFailedTotal *prometheus.CounterVec
numNotificationSuppressedTotal *prometheus.CounterVec
notificationLatencySeconds *prometheus.HistogramVec
}

Expand All @@ -1182,6 +1218,11 @@ func newNotifyMetrics(r prometheus.Registerer) *notifyMetrics {
Name: "notification_requests_failed_total",
Help: "The total number of failed notification requests.",
}, []string{"integration"}),
numNotificationSuppressedTotal: promauto.With(r).NewCounterVec(prometheus.CounterOpts{
Namespace: "alertmanager",
Name: "notifications_suppressed_total",
Help: "The total number of notifications suppressed for being silenced, inhibited, outside of active time intervals or within muted time intervals.",
}, []string{"reason"}),
notificationLatencySeconds: promauto.With(r).NewHistogramVec(prometheus.HistogramOpts{
Namespace: "alertmanager",
Name: "notification_latency_seconds",
Expand Down Expand Up @@ -1210,12 +1251,16 @@ func newNotifyMetrics(r prometheus.Registerer) *notifyMetrics {
m.numTotalFailedNotifications.WithLabelValues(integration, reason)
}
}
for _, reason := range possibleSuppressedReason {
m.numNotificationSuppressedTotal.WithLabelValues(reason)
}
return m
}

// Label values that newNotifyMetrics pre-registers on its "reason"-labelled
// notification counters.
var (
	// possibleFailureReasonCategory enumerates the possible failure reason
	// categories. Copied from github.com/prometheus/alertmanager/notify/util.go.
	possibleFailureReasonCategory = []string{
		notify.DefaultReason.String(),
		notify.ClientErrorReason.String(),
		notify.ServerErrorReason.String(),
	}

	// possibleSuppressedReason enumerates the values used for the "reason"
	// label of the notifications_suppressed_total counter. The order is
	// significant for the tests: populateAlertmanager scales each counter by
	// the index of its reason in this slice.
	possibleSuppressedReason = []string{
		notify.SuppressedReasonSilence,
		notify.SuppressedReasonInhibition,
		notify.SuppressedReasonMuteTimeInterval,
		notify.SuppressedReasonActiveTimeInterval,
	}
)

type markerMetrics struct {
alerts *prometheus.GaugeVec
Expand Down

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Loading
Loading