Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Distributed alert checks to prevent high load spikes #2249

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions cmd/bosun/conf/conf.go
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,7 @@ type SystemConfProvider interface {

GetCheckFrequency() time.Duration
GetDefaultRunEvery() int
GetAlertCheckDistribution() string
GetUnknownThreshold() int
GetMinGroupSize() int

Expand Down
25 changes: 18 additions & 7 deletions cmd/bosun/conf/system.go
Original file line number Diff line number Diff line change
Expand Up @@ -32,9 +32,10 @@ type SystemConf struct {
InternetProxy string
MinGroupSize int

UnknownThreshold int
CheckFrequency Duration // Time between alert checks: 5m
DefaultRunEvery int // Default number of check intervals to run each alert: 1
UnknownThreshold int
CheckFrequency Duration // Time between alert checks: 5m
DefaultRunEvery int // Default number of check intervals to run each alert: 1
AlertCheckDistribution string // Method used to distribute alert checks. No distribution if equal to ""

DBConf DBConf

Expand Down Expand Up @@ -223,10 +224,11 @@ const (
// newSystemConf returns a system conf with default values set
func newSystemConf() *SystemConf {
return &SystemConf{
Scheme: "http",
CheckFrequency: Duration{Duration: time.Minute * 5},
DefaultRunEvery: 1,
HTTPListen: defaultHTTPListen,
Scheme: "http",
CheckFrequency: Duration{Duration: time.Minute * 5},
DefaultRunEvery: 1,
HTTPListen: defaultHTTPListen,
AlertCheckDistribution: "",
DBConf: DBConf{
LedisDir: "ledis_data",
LedisBindAddr: "127.0.0.1:9565",
Expand Down Expand Up @@ -269,6 +271,10 @@ func loadSystemConfig(conf string, isFileName bool) (*SystemConf, error) {
return sc, fmt.Errorf("undecoded fields in system configuration: %v", decodeMeta.Undecoded())
}

if sc.GetAlertCheckDistribution() != "" && sc.GetAlertCheckDistribution() != "simple" {
return sc, fmt.Errorf("invalid value %v for AlertCheckDistribution", sc.GetAlertCheckDistribution())
}

// iterate over each host
for hostPrefix, value := range sc.ElasticConf {
if value.SimpleClient && value.ClientOptions.Enabled {
Expand Down Expand Up @@ -401,6 +407,11 @@ func (sc *SystemConf) GetDefaultRunEvery() int {
return sc.DefaultRunEvery
}

// GetAlertCheckDistribution returns the name of the method used to distribute
// alert checks, as stored in the AlertCheckDistribution field. An empty string
// means that no distribution is applied.
func (sc *SystemConf) GetAlertCheckDistribution() string {
	return sc.AlertCheckDistribution
}

// GetUnknownThreshold returns the threshold in which multiple unknown alerts in a check iteration
// should be grouped into a single notification
func (sc *SystemConf) GetUnknownThreshold() int {
Expand Down
19 changes: 17 additions & 2 deletions cmd/bosun/sched/alertRunner.go
Original file line number Diff line number Diff line change
Expand Up @@ -19,16 +19,31 @@ func (s *Schedule) Run() error {
type alertCh struct {
ch chan<- *checkContext
modulo int
shift int // used to distribute alert runs
}
chs := []alertCh{}

// Every alert gets a small shift in time.
// This way the alerts with the same period are not fired
// simultaneously, but are distributed.
circular_shifts := make(map[int]int) // the map is *run period* -> *time shift to add*
for _, a := range s.RuleConf.GetAlerts() {
ch := make(chan *checkContext, 1)
re := a.RunEvery
if re == 0 {
re = s.SystemConf.GetDefaultRunEvery()
}
go s.runAlert(a, ch)
chs = append(chs, alertCh{ch: ch, modulo: re})

if s.SystemConf.GetAlertCheckDistribution() == "simple" { // only apply shifts if the respective option is set
chs = append(chs, alertCh{ch: ch, modulo: re, shift: circular_shifts[re]})
} else {
// there are no shifts if option is off
chs = append(chs, alertCh{ch: ch, modulo: re, shift: 0})
}

// the shifts for a given period range 0..(period - 1)
circular_shifts[re] = (circular_shifts[re] + 1) % re
}
i := 0
for {
Expand All @@ -40,7 +55,7 @@ func (s *Schedule) Run() error {
ctx := &checkContext{utcNow(), cache.New(0)}
s.LastCheck = utcNow()
for _, a := range chs {
if i%a.modulo != 0 {
if (i+a.shift)%a.modulo != 0 {
continue
}
// Put on channel. If that fails, the alert is backed up pretty bad.
Expand Down
7 changes: 7 additions & 0 deletions docs/system_configuration.md
Original file line number Diff line number Diff line change
Expand Up @@ -103,6 +103,13 @@ frequent as every "1m", and others that run less often (any multiple of
Example:
`DefaultRunEvery = 5`

### AlertCheckDistribution
Selects the algorithm used to distribute alert checks in order to reduce system load spikes. By default there is no distribution: if several checks have the same period, they all run at the same points in time. This default is used when the option is not specified or is set to an empty string.

The only alternative value is `simple`. When it is specified, alert checks with the same period are uniformly distributed on second marks.

Example: `AlertCheckDistribution = "simple"`

### RuleFilePath
Path to the file containing definitions of alerts, macros, lookups,
templates, notifications, and global variables which are [documented
Expand Down