Stn/correctly mark api silences (#1733)
* Update alert status on every GET to alerts

Signed-off-by: stuart nelson <stuartnelson3@gmail.com>
stuartnelson3 authored Feb 18, 2019
1 parent da6e2a8 commit 51eebbe
Showing 5 changed files with 232 additions and 13 deletions.
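
In outline, the change threads a status-refresh callback from main.go through api.Update into API v2, so that every GET to /api/v2/alerts re-evaluates silences and inhibitions for each alert before its status is read. The sketch below is illustrative only, not Alertmanager code: the types are simplified stand-ins for model.LabelSet, types.Marker and the v2 API struct.

package main

import "fmt"

// labelSet stands in for model.LabelSet.
type labelSet map[string]string

// setAlertStatusFn mirrors the callback type added to API v2: given an
// alert's labels, re-evaluate silences/inhibitions and update the marker.
type setAlertStatusFn func(labelSet) error

// apiV2 stands in for the v2 API struct; status maps alert name -> state.
type apiV2 struct {
	setAlertStatus setAlertStatusFn
	status         map[string]string
}

// update mirrors api.Update: the callback is (re)wired on every config reload.
func (a *apiV2) update(set setAlertStatusFn) { a.setAlertStatus = set }

// getAlerts mirrors getAlertsHandler: refresh each alert's status first,
// then read it, so the response reflects silences created moments ago.
func (a *apiV2) getAlerts(alerts []labelSet) {
	for _, lbls := range alerts {
		if err := a.setAlertStatus(lbls); err != nil {
			fmt.Println("set alert status failed:", err)
			continue
		}
		fmt.Printf("%s -> %s\n", lbls["alertname"], a.status[lbls["alertname"]])
	}
}

func main() {
	api := &apiV2{status: map[string]string{}}
	// In main.go this closure queries the silence store and the inhibitor;
	// here it just pretends a silence matches alerts named "test1".
	api.update(func(lbls labelSet) error {
		if lbls["alertname"] == "test1" {
			api.status[lbls["alertname"]] = "suppressed"
		} else {
			api.status[lbls["alertname"]] = "active"
		}
		return nil
	})
	api.getAlerts([]labelSet{{"alertname": "test1"}, {"alertname": "other"}})
}
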
10 changes: 6 additions & 4 deletions api/api.go
@@ -84,7 +84,8 @@ func (o Options) validate() error {
return nil
}

// New creates a new API object combining all API versions.
// New creates a new API object combining all API versions. Note that an Update
// call is also needed to get the APIs into an operational state.
func New(opts Options) (*API, error) {
if err := opts.validate(); err != nil {
return nil, fmt.Errorf("invalid API options: %s", err)
@@ -183,13 +184,14 @@ func (api *API) Register(r *route.Router, routePrefix string) *http.ServeMux {
return mux
}

// Update config and resolve timeout of each API.
func (api *API) Update(cfg *config.Config, resolveTimeout time.Duration) error {
// Update config and resolve timeout of each API. APIv2 also needs
// setAlertStatus to be updated.
func (api *API) Update(cfg *config.Config, resolveTimeout time.Duration, setAlertStatus func(model.LabelSet) error) error {
if err := api.v1.Update(cfg, resolveTimeout); err != nil {
return err
}

return api.v2.Update(cfg, resolveTimeout)
return api.v2.Update(cfg, resolveTimeout, setAlertStatus)
}

func (api *API) limitHandler(h http.Handler) http.Handler {
22 changes: 18 additions & 4 deletions api/v2/api.go
@@ -56,23 +56,31 @@ type API struct {
getAlertStatus getAlertStatusFn
uptime time.Time

// mtx protects resolveTimeout, alertmanagerConfig and route.
// mtx protects resolveTimeout, alertmanagerConfig, setAlertStatus and route.
mtx sync.RWMutex
// resolveTimeout represents the default resolve timeout that an alert is
// assigned if no end time is specified.
resolveTimeout time.Duration
alertmanagerConfig *config.Config
route *dispatch.Route
setAlertStatus setAlertStatusFn

logger log.Logger

Handler http.Handler
}

type getAlertStatusFn func(prometheus_model.Fingerprint) types.AlertStatus
type setAlertStatusFn func(prometheus_model.LabelSet) error

// NewAPI returns a new Alertmanager API v2
func NewAPI(alerts provider.Alerts, sf getAlertStatusFn, silences *silence.Silences, peer *cluster.Peer, l log.Logger) (*API, error) {
func NewAPI(
alerts provider.Alerts,
sf getAlertStatusFn,
silences *silence.Silences,
peer *cluster.Peer,
l log.Logger,
) (*API, error) {
api := API{
alerts: alerts,
getAlertStatus: sf,
@@ -114,14 +122,15 @@ func NewAPI(alerts provider.Alerts, sf getAlertStatusFn, silences *silence.Silen
return &api, nil
}

// Update sets the configuration string to a new value.
func (api *API) Update(cfg *config.Config, resolveTimeout time.Duration) error {
// Update sets the API struct members that may change between reloads of alertmanager.
func (api *API) Update(cfg *config.Config, resolveTimeout time.Duration, setAlertStatus setAlertStatusFn) error {
api.mtx.Lock()
defer api.mtx.Unlock()

api.resolveTimeout = resolveTimeout
api.alertmanagerConfig = cfg
api.route = dispatch.NewRoute(cfg.Route, nil)
api.setAlertStatus = setAlertStatus
return nil
}

@@ -252,6 +261,11 @@ func (api *API) getAlertsHandler(params alert_ops.GetAlertsParams) middleware.Re
continue
}

// Set alert's current status based on its label set.
if err := api.setAlertStatus(a.Labels); err != nil {
level.Error(api.logger).Log("msg", "set alert status failed", "err", err)
}
// Get alert's current status after seeing if it is suppressed.
status := api.getAlertStatus(a.Fingerprint())

if !*params.Active && status.State == types.AlertStateActive {
37 changes: 32 additions & 5 deletions cmd/alertmanager/main.go
@@ -34,6 +34,7 @@ import (
"github.com/go-kit/kit/log/level"
"github.com/prometheus/client_golang/prometheus"
"github.com/prometheus/client_golang/prometheus/promhttp"
"github.com/prometheus/common/model"
"github.com/prometheus/common/promlog"
promlogflag "github.com/prometheus/common/promlog/flag"
"github.com/prometheus/common/route"
@@ -324,11 +325,6 @@ func run() int {

hash = md5HashAsMetricValue(plainCfg)

err = api.Update(conf, time.Duration(conf.Global.ResolveTimeout))
if err != nil {
return err
}

tmpl, err = template.FromGlobs(conf.Templates...)
if err != nil {
return err
@@ -350,6 +346,12 @@
peer,
logger,
)

err = api.Update(conf, time.Duration(conf.Global.ResolveTimeout), setAlertStatus(inhibitor, marker, silences))
if err != nil {
return err
}

disp = dispatch.NewDispatcher(alerts, dispatch.NewRoute(conf.Route, nil), pipeline, marker, timeoutFunc, logger)

go disp.Run()
@@ -476,3 +478,28 @@ func md5HashAsMetricValue(data []byte) float64 {
copy(bytes, smallSum)
return float64(binary.LittleEndian.Uint64(bytes))
}

func setAlertStatus(inhibitor *inhibit.Inhibitor, marker types.Marker, silences *silence.Silences) func(model.LabelSet) error {
return func(labels model.LabelSet) error {
inhibitor.Mutes(labels)
// TODO(beorn7): The following code is almost exactly replicated in notify/notify.go.
sils, err := silences.Query(
silence.QState(types.SilenceStateActive),
silence.QMatches(labels),
)
if err != nil {
return fmt.Errorf("failed to query silences: %v", err)
}

if len(sils) > 0 {
ids := make([]string, len(sils))
for i, s := range sils {
ids[i] = s.Id
}
marker.SetSilenced(labels.Fingerprint(), ids...)
} else {
marker.SetSilenced(labels.Fingerprint())
}
return nil
}
}
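
The else branch above matters: Alertmanager's marker clears an alert's silenced markers when SetSilenced is called with no silence IDs, which is what lets a deleted silence show up as active on the very next GET. Below is a stand-alone illustration of that clear-versus-set behaviour; the marker type is a stand-in map, not Alertmanager's types.Marker.

package main

import "fmt"

// marker stands in for types.Marker, mapping a fingerprint to silence IDs.
type marker map[string][]string

// setSilenced mimics Marker.SetSilenced: with no IDs it clears the silenced
// state, which is why the closure above needs its explicit else branch.
func (m marker) setSilenced(fp string, ids ...string) {
	if len(ids) == 0 {
		delete(m, fp)
		return
	}
	m[fp] = ids
}

func main() {
	m := marker{}
	m.setSilenced("fp1", "silence-a") // a silence matches: mark the alert
	fmt.Println(m)                    // map[fp1:[silence-a]]
	m.setSilenced("fp1")              // the silence was deleted: clear it
	fmt.Println(m)                    // map[]
}
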
9 changes: 9 additions & 0 deletions test/with_api_v2/acceptance.go
@@ -289,6 +289,11 @@ func (amc *AlertmanagerCluster) Start() error {
return nil
}

// Members returns the underlying slice of cluster members.
func (amc *AlertmanagerCluster) Members() []*Alertmanager {
return amc.ams
}

// Start the alertmanager and wait until it is ready to receive.
func (am *Alertmanager) Start(additionalArg []string) error {
am.t.Helper()
@@ -555,6 +560,10 @@ func (am *Alertmanager) GenericAPIV2Call(at float64, f func()) {
am.t.Do(at, f)
}

func (am *Alertmanager) Client() *apiclient.Alertmanager {
return am.clientV2
}

func (am *Alertmanager) getURL(path string) string {
return fmt.Sprintf("http://%s%s%s", am.apiAddr, am.opts.RoutePrefix, path)
}
167 changes: 167 additions & 0 deletions test/with_api_v2/acceptance/api_test.go
@@ -0,0 +1,167 @@
// Copyright 2018 Prometheus Team
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package test

import (
"fmt"
"testing"
"time"

"github.com/go-openapi/strfmt"
a "github.com/prometheus/alertmanager/test/with_api_v2"
"github.com/prometheus/alertmanager/test/with_api_v2/api_v2_client/client/alert"
"github.com/prometheus/alertmanager/test/with_api_v2/api_v2_client/client/silence"
"github.com/prometheus/alertmanager/test/with_api_v2/api_v2_client/models"
"github.com/prometheus/common/model"
"github.com/stretchr/testify/require"
)

// TestAlertGetReturnsCurrentAlertStatus checks that querying the API returns the
// current status of each alert, i.e. if it is silenced or inhibited.
func TestAlertGetReturnsCurrentAlertStatus(t *testing.T) {
t.Parallel()

conf := `
route:
receiver: "default"
group_by: []
group_wait: 1s
group_interval: 10m
repeat_interval: 1h
inhibit_rules:
- source_match:
severity: 'critical'
target_match:
severity: 'warning'
equal: ['alertname']
receivers:
- name: "default"
webhook_configs:
- url: 'http://%s'
`

at := a.NewAcceptanceTest(t, &a.AcceptanceOpts{
Tolerance: 1 * time.Second,
})
co := at.Collector("webhook")
wh := a.NewWebhook(co)

amc := at.AlertmanagerCluster(fmt.Sprintf(conf, wh.Address()), 1)
require.NoError(t, amc.Start())
defer amc.Terminate()

am := amc.Members()[0]

labelName := "alertname"
labelValue := "test1"

now := time.Now()
startsAt := strfmt.DateTime(now)
endsAt := strfmt.DateTime(now.Add(5 * time.Minute))

labels := models.LabelSet(map[string]string{labelName: labelValue, "severity": "warning"})
fp := model.LabelSet{model.LabelName(labelName): model.LabelValue(labelValue), "severity": "warning"}.Fingerprint()
pa := &models.PostableAlert{
StartsAt: startsAt,
EndsAt: endsAt,
Alert: models.Alert{Labels: labels},
}
alertParams := alert.NewPostAlertsParams()
alertParams.Alerts = models.PostableAlerts{pa}
_, err := am.Client().Alert.PostAlerts(alertParams)
require.NoError(t, err)

resp, err := am.Client().Alert.GetAlerts(nil)
require.NoError(t, err)
// No silence has been created and no inhibiting alert has been sent, so
// the alert should be active.
for _, alert := range resp.Payload {
require.Equal(t, models.AlertStatusStateActive, *alert.Status.State)
}

// Wait for group_wait, so that we are in the group_interval period,
// when the pipeline won't update the alert's status.
time.Sleep(2 * time.Second)

// Create silence and verify that the alert is immediately marked
// silenced via the API.
silenceParams := silence.NewPostSilencesParams()

cm := "a"
isRegex := false
ps := &models.PostableSilence{
Silence: models.Silence{
StartsAt: &startsAt,
EndsAt: &endsAt,
Comment: &cm,
CreatedBy: &cm,
Matchers: models.Matchers{
&models.Matcher{Name: &labelName, Value: &labelValue, IsRegex: &isRegex},
},
},
}
silenceParams.Silence = ps
silenceResp, err := am.Client().Silence.PostSilences(silenceParams)
require.NoError(t, err)
silenceID := silenceResp.Payload.SilenceID

resp, err = am.Client().Alert.GetAlerts(nil)
require.NoError(t, err)
for _, alert := range resp.Payload {
require.Equal(t, models.AlertStatusStateSuppressed, *alert.Status.State)
require.Equal(t, fp.String(), *alert.Fingerprint)
require.Equal(t, 1, len(alert.Status.SilencedBy))
require.Equal(t, silenceID, alert.Status.SilencedBy[0])
}

// Create inhibiting alert and verify that original alert is
// immediately marked as inhibited.
labels["severity"] = "critical"
_, err = am.Client().Alert.PostAlerts(alertParams)
require.NoError(t, err)

inhibitingFP := model.LabelSet{model.LabelName(labelName): model.LabelValue(labelValue), "severity": "critical"}.Fingerprint()

resp, err = am.Client().Alert.GetAlerts(nil)
require.NoError(t, err)
for _, alert := range resp.Payload {
require.Equal(t, 1, len(alert.Status.SilencedBy))
require.Equal(t, silenceID, alert.Status.SilencedBy[0])
if fp.String() == *alert.Fingerprint {
require.Equal(t, models.AlertStatusStateSuppressed, *alert.Status.State)
require.Equal(t, fp.String(), *alert.Fingerprint)
require.Equal(t, 1, len(alert.Status.InhibitedBy))
require.Equal(t, inhibitingFP.String(), alert.Status.InhibitedBy[0])
}
}

deleteParams := silence.NewDeleteSilenceParams().WithSilenceID(strfmt.UUID(silenceID))
_, err = am.Client().Silence.DeleteSilence(deleteParams)
require.NoError(t, err)

resp, err = am.Client().Alert.GetAlerts(nil)
require.NoError(t, err)
// The silence has been deleted, so the inhibiting alert should be active.
// The original alert should still be inhibited.
for _, alert := range resp.Payload {
require.Equal(t, 0, len(alert.Status.SilencedBy))
if inhibitingFP.String() == *alert.Fingerprint {
require.Equal(t, models.AlertStatusStateActive, *alert.Status.State)
} else {
require.Equal(t, models.AlertStatusStateSuppressed, *alert.Status.State)
}
}
}
