diff --git a/notify/notify.go b/notify/notify.go index d6c741fe47..933aa4a841 100644 --- a/notify/notify.go +++ b/notify/notify.go @@ -55,7 +55,7 @@ const MinTimeout = 10 * time.Second // returns an error if unsuccessful and a flag whether the error is // recoverable. This information is useful for a retry logic. type Notifier interface { - Notify(context.Context, ...*types.Alert) (bool, error) + Notify(context.Context, ...*types.Alert) (bool, bool, error) } // Integration wraps a notifier and its configuration to be uniquely identified @@ -78,7 +78,7 @@ func NewIntegration(notifier Notifier, rs ResolvedSender, name string, idx int) } // Notify implements the Notifier interface. -func (i *Integration) Notify(ctx context.Context, alerts ...*types.Alert) (bool, error) { +func (i *Integration) Notify(ctx context.Context, alerts ...*types.Alert) (bool, bool, error) { return i.notifier.Notify(ctx, alerts...) } @@ -246,6 +246,7 @@ type NotificationLog interface { type Metrics struct { numNotifications *prometheus.CounterVec numTotalFailedNotifications *prometheus.CounterVec + numTotal4xxFailedNotifications *prometheus.CounterVec numNotificationRequestsTotal *prometheus.CounterVec numNotificationRequestsFailedTotal *prometheus.CounterVec notificationLatencySeconds *prometheus.HistogramVec @@ -263,6 +264,11 @@ func NewMetrics(r prometheus.Registerer) *Metrics { Name: "notifications_failed_total", Help: "The total number of failed notifications.", }, []string{"integration"}), + numTotal4xxFailedNotifications: prometheus.NewCounterVec(prometheus.CounterOpts{ + Namespace: "alertmanager", + Name: "notifications_4xx_failed_total", + Help: "The total number of failed 4xx notifications.", + }, []string{"integration"}), numNotificationRequestsTotal: prometheus.NewCounterVec(prometheus.CounterOpts{ Namespace: "alertmanager", Name: "notification_requests_total", @@ -294,12 +300,13 @@ func NewMetrics(r prometheus.Registerer) *Metrics { } { m.numNotifications.WithLabelValues(integration) m.numTotalFailedNotifications.WithLabelValues(integration) + m.numTotal4xxFailedNotifications.WithLabelValues(integration) m.numNotificationRequestsTotal.WithLabelValues(integration) m.numNotificationRequestsFailedTotal.WithLabelValues(integration) m.notificationLatencySeconds.WithLabelValues(integration) } r.MustRegister( - m.numNotifications, m.numTotalFailedNotifications, + m.numNotifications, m.numTotalFailedNotifications, m.numTotal4xxFailedNotifications, m.numNotificationRequestsTotal, m.numNotificationRequestsFailedTotal, m.notificationLatencySeconds, ) @@ -719,11 +726,14 @@ func (r RetryStage) exec(ctx context.Context, l log.Logger, alerts ...*types.Ale select { case <-tick.C: now := time.Now() - retry, err := r.integration.Notify(ctx, sent...) + retry, is4xx, err := r.integration.Notify(ctx, sent...) r.metrics.notificationLatencySeconds.WithLabelValues(r.integration.Name()).Observe(time.Since(now).Seconds()) r.metrics.numNotificationRequestsTotal.WithLabelValues(r.integration.Name()).Inc() if err != nil { r.metrics.numNotificationRequestsFailedTotal.WithLabelValues(r.integration.Name()).Inc() + if is4xx { + r.metrics.numTotal4xxFailedNotifications.WithLabelValues(r.integration.Name()).Inc() + } if !retry { return ctx, alerts, errors.Wrapf(err, "%s/%s: notify retry canceled due to unrecoverable error after %d attempts", r.groupName, r.integration.String(), i) } diff --git a/notify/notify_test.go b/notify/notify_test.go index a78fbf74f6..8c99e24d5f 100644 --- a/notify/notify_test.go +++ b/notify/notify_test.go @@ -19,11 +19,14 @@ import ( "fmt" "io" "reflect" + "strconv" "testing" "time" + "github.com/aws/aws-sdk-go/aws/awserr" "github.com/go-kit/log" "github.com/prometheus/client_golang/prometheus" + prom_testutil "github.com/prometheus/client_golang/prometheus/testutil" "github.com/prometheus/common/model" "github.com/stretchr/testify/require" "gopkg.in/yaml.v2" @@ -42,9 +45,9 @@ func (s sendResolved) SendResolved() bool { return bool(s) } -type notifierFunc func(ctx context.Context, alerts ...*types.Alert) (bool, error) +type notifierFunc func(ctx context.Context, alerts ...*types.Alert) (bool, bool, error) -func (f notifierFunc) Notify(ctx context.Context, alerts ...*types.Alert) (bool, error) { +func (f notifierFunc) Notify(ctx context.Context, alerts ...*types.Alert) (bool, bool, error) { return f(ctx, alerts...) } @@ -380,13 +383,13 @@ func TestRetryStageWithError(t *testing.T) { fail, retry := true, true sent := []*types.Alert{} i := Integration{ - notifier: notifierFunc(func(ctx context.Context, alerts ...*types.Alert) (bool, error) { + notifier: notifierFunc(func(ctx context.Context, alerts ...*types.Alert) (bool, bool, error) { if fail { fail = false - return retry, errors.New("fail to deliver notification") + return retry, false, errors.New("fail to deliver notification") } sent = append(sent, alerts...) - return false, nil + return false, false, nil }), rs: sendResolved(false), } @@ -425,9 +428,9 @@ func TestRetryStageWithError(t *testing.T) { func TestRetryStageNoResolved(t *testing.T) { sent := []*types.Alert{} i := Integration{ - notifier: notifierFunc(func(ctx context.Context, alerts ...*types.Alert) (bool, error) { + notifier: notifierFunc(func(ctx context.Context, alerts ...*types.Alert) (bool, bool, error) { sent = append(sent, alerts...) - return false, nil + return false, false, nil }), rs: sendResolved(false), } @@ -479,9 +482,9 @@ func TestRetryStageNoResolved(t *testing.T) { func TestRetryStageSendResolved(t *testing.T) { sent := []*types.Alert{} i := Integration{ - notifier: notifierFunc(func(ctx context.Context, alerts ...*types.Alert) (bool, error) { + notifier: notifierFunc(func(ctx context.Context, alerts ...*types.Alert) (bool, bool, error) { sent = append(sent, alerts...) - return false, nil + return false, false, nil }), rs: sendResolved(true), } @@ -913,3 +916,74 @@ func BenchmarkHashAlert(b *testing.B) { hashAlert(alert) } } + +func TestRetryStageWithErrorCode(t *testing.T) { + + testcases := map[string]struct { + errorcode int + expected4xxCount int + }{ + + "for 400": {errorcode: 400, expected4xxCount: 1}, + "for 401": {errorcode: 401, expected4xxCount: 1}, + "for 403": {errorcode: 403, expected4xxCount: 1}, + "for 404": {errorcode: 404, expected4xxCount: 1}, + "for 413": {errorcode: 413, expected4xxCount: 1}, + "for 422": {errorcode: 422, expected4xxCount: 1}, + "for 429": {errorcode: 429, expected4xxCount: 1}, + "for 500": {errorcode: 500, expected4xxCount: 0}, + "for 502": {errorcode: 502, expected4xxCount: 0}, + } + for _, testData := range testcases { + fail, retry := true, true + sent := []*types.Alert{} + testData := testData + is4xx := Check4xxStatus(testData.errorcode) + i := Integration{ + notifier: notifierFunc(func(ctx context.Context, alerts ...*types.Alert) (bool, bool, error) { + if fail { + fail = false + return retry, is4xx, awserr.New(strconv.Itoa(testData.errorcode), "Not found", errors.New("fail to deliver notification")) + } + sent = append(sent, alerts...) + return false, is4xx, nil + }), + rs: sendResolved(false), + } + r := RetryStage{ + integration: i, + metrics: NewMetrics(prometheus.NewRegistry()), + } + + alerts := []*types.Alert{ + { + Alert: model.Alert{ + EndsAt: time.Now().Add(time.Hour), + }, + }, + } + + ctx := context.Background() + ctx = WithFiringAlerts(ctx, []uint64{0}) + + // Notify with a recoverable error should retry and succeed. + resctx, res, err := r.Exec(ctx, log.NewNopLogger(), alerts...) + var counter = r.metrics.numTotal4xxFailedNotifications + require.Equal(t, testData.expected4xxCount, int(prom_testutil.ToFloat64(counter.WithLabelValues(r.integration.Name())))) + + require.Nil(t, err) + require.Equal(t, alerts, res) + require.Equal(t, alerts, sent) + require.NotNil(t, resctx) + + // Notify with an unrecoverable error should fail. + sent = sent[:0] + fail = true + retry = false + + resctx, _, err = r.Exec(ctx, log.NewNopLogger(), alerts...) + require.NotNil(t, err) + require.NotNil(t, resctx) + + } +} diff --git a/notify/sns/sns.go b/notify/sns/sns.go index 1dfd42b4a5..2ecd2099f9 100644 --- a/notify/sns/sns.go +++ b/notify/sns/sns.go @@ -60,37 +60,46 @@ func New(c *config.SNSConfig, t *template.Template, l log.Logger, httpOpts ...co }, nil } -func (n *Notifier) Notify(ctx context.Context, alert ...*types.Alert) (bool, error) { +func (n *Notifier) Notify(ctx context.Context, alert ...*types.Alert) (bool, bool, error) { var ( - err error - data = notify.GetTemplateData(ctx, n.tmpl, alert, n.logger) - tmpl = notify.TmplText(n.tmpl, data, &err) + err error + data = notify.GetTemplateData(ctx, n.tmpl, alert, n.logger) + tmpl = notify.TmplText(n.tmpl, data, &err) + is4xx = false + retry = false ) client, err := n.createSNSClient(tmpl) if err != nil { if e, ok := err.(awserr.RequestFailure); ok { - return n.retrier.Check(e.StatusCode(), strings.NewReader(e.Message())) + is4xx := notify.Check4xxStatus(e.StatusCode()) + retry, err = n.retrier.Check(e.StatusCode(), strings.NewReader(e.Message())) + return retry, is4xx, err } - return true, err + return true, is4xx, err } publishInput, err := n.createPublishInput(ctx, tmpl) if err != nil { - return true, err + if e, ok := err.(awserr.RequestFailure); ok { + is4xx = notify.Check4xxStatus(e.StatusCode()) + } + return true, is4xx, err } publishOutput, err := client.Publish(publishInput) if err != nil { if e, ok := err.(awserr.RequestFailure); ok { - return n.retrier.Check(e.StatusCode(), strings.NewReader(e.Message())) + is4xx = notify.Check4xxStatus(e.StatusCode()) + retry, err = n.retrier.Check(e.StatusCode(), strings.NewReader(e.Message())) + return retry, is4xx, err } - return true, err + return true, is4xx, err } level.Debug(n.logger).Log("msg", "SNS message successfully published", "message_id", publishOutput.MessageId, "sequence number", publishOutput.SequenceNumber) - return false, nil + return false, is4xx, nil } func (n *Notifier) createSNSClient(tmpl func(string) string) (*sns.SNS, error) { diff --git a/notify/util.go b/notify/util.go index fb6efe4c07..67606b6567 100644 --- a/notify/util.go +++ b/notify/util.go @@ -210,3 +210,11 @@ func (r *Retrier) Check(statusCode int, body io.Reader) (bool, error) { } return retry, errors.New(s) } + +func Check4xxStatus(statusCode int) bool { + var is4xx = false + if statusCode/100 == 4 { + is4xx = true + } + return is4xx +}