Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Track plan rejection history and automatically mark clients as ineligible #13421

Merged
merged 14 commits into from
Jul 12, 2022
Merged
Show file tree
Hide file tree
Changes from 11 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 7 additions & 0 deletions .changelog/13421.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
```release-note:improvement
core: automatically mark clients with recurring plan rejections as ineligible
```

```release-note:improvement
metrics: emit `nomad.nomad.plan.rejection_tracker.node_score` metric for the number of times a node had a plan rejection within the past time window
```
12 changes: 12 additions & 0 deletions command/agent/agent.go
Original file line number Diff line number Diff line change
Expand Up @@ -436,6 +436,18 @@ func convertServerConfig(agentConfig *Config) (*nomad.Config, error) {
return nil, fmt.Errorf("deploy_query_rate_limit must be greater than 0")
}

// Set plan rejection tracker configuration.
if planRejectConf := agentConfig.Server.PlanRejectionTracker; planRejectConf != nil {
conf.NodePlanRejectionEnabled = planRejectConf.Enabled
conf.NodePlanRejectionThreshold = planRejectConf.NodeThreshold

if planRejectConf.NodeWindow == 0 {
return nil, fmt.Errorf("plan_rejection_tracker.node_window must be greater than 0")
} else {
conf.NodePlanRejectionWindow = planRejectConf.NodeWindow
}
}

// Add Enterprise license configs
conf.LicenseEnv = agentConfig.Server.LicenseEnv
conf.LicensePath = agentConfig.Server.LicensePath
Expand Down
74 changes: 74 additions & 0 deletions command/agent/agent_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -365,6 +365,80 @@ func TestAgent_ServerConfig_Limits_OK(t *testing.T) {
}
}

func TestAgent_ServerConfig_PlanRejectionTracker(t *testing.T) {
ci.Parallel(t)

cases := []struct {
name string
trackerConfig *PlanRejectionTracker
expectedConfig *PlanRejectionTracker
expectedErr string
}{
{
name: "default",
trackerConfig: nil,
expectedConfig: &PlanRejectionTracker{
NodeThreshold: 100,
NodeWindow: 5 * time.Minute,
},
expectedErr: "",
},
{
name: "valid config",
trackerConfig: &PlanRejectionTracker{
Enabled: true,
NodeThreshold: 123,
NodeWindow: 17 * time.Minute,
},
expectedConfig: &PlanRejectionTracker{
Enabled: true,
NodeThreshold: 123,
NodeWindow: 17 * time.Minute,
},
expectedErr: "",
},
{
name: "invalid node window",
trackerConfig: &PlanRejectionTracker{
NodeThreshold: 123,
},
expectedErr: "node_window must be greater than 0",
},
}

for _, tc := range cases {
t.Run(tc.name, func(t *testing.T) {
config := DevConfig(nil)
require.NoError(t, config.normalizeAddrs())

if tc.trackerConfig != nil {
config.Server.PlanRejectionTracker = tc.trackerConfig
}

serverConfig, err := convertServerConfig(config)

if tc.expectedErr != "" {
require.Error(t, err)
require.Contains(t, err.Error(), tc.expectedErr)
} else {
require.NoError(t, err)
require.Equal(t,
tc.expectedConfig.Enabled,
serverConfig.NodePlanRejectionEnabled,
)
require.Equal(t,
tc.expectedConfig.NodeThreshold,
serverConfig.NodePlanRejectionThreshold,
)
require.Equal(t,
tc.expectedConfig.NodeWindow,
serverConfig.NodePlanRejectionWindow,
)
}
})
}
}

func TestAgent_ServerConfig_RaftMultiplier_Ok(t *testing.T) {
ci.Parallel(t)

Expand Down
59 changes: 59 additions & 0 deletions command/agent/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -503,6 +503,10 @@ type ServerConfig struct {
// This value is ignored.
DefaultSchedulerConfig *structs.SchedulerConfiguration `hcl:"default_scheduler_config"`

// PlanRejectionTracker configures the node plan rejection tracker that
// detects potentially bad nodes.
PlanRejectionTracker *PlanRejectionTracker `hcl:"plan_rejection_tracker"`

// EnableEventBroker configures whether this server's state store
// will generate events for its event stream.
EnableEventBroker *bool `hcl:"enable_event_broker"`
Expand Down Expand Up @@ -548,6 +552,53 @@ type RaftBoltConfig struct {
NoFreelistSync bool `hcl:"no_freelist_sync"`
}

// PlanRejectionTracker is used in servers to configure the plan rejection
// tracker.
type PlanRejectionTracker struct {
// Enabled controls if the plan rejection tracker is active or not.
Enabled bool `hcl:"enabled"`
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If we ever want to default this to true we'll have to change the type to a pointer (*bool) to differentiate unset vs explicitly-false. That shouldn't be a problem though since unlike objects sent over RPCs we never have to worry about an agent on the new version sending a nil to an agent still expecting a true or false.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Good point, I may as well do the work now 😄
f5936f0


// NodeThreshold is the number of times a node can have plan rejections
// before it is marked as ineligible.
NodeThreshold int `hcl:"node_threshold"`

// NodeWindow is the time window used to track active plan rejections for
// nodes.
NodeWindow time.Duration
NodeWindowHCL string `hcl:"node_window" json:"-"`
Comment on lines +563 to +568
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm not sure about these names, but I wanted to hedge against us potentially needing to track plan rejections for other things, like evals or jobs. Feel free to modify them.


// ExtraKeysHCL is used by hcl to surface unexpected keys
ExtraKeysHCL []string `hcl:",unusedKeys" json:"-"`
}

func (p *PlanRejectionTracker) Merge(b *PlanRejectionTracker) *PlanRejectionTracker {
if p == nil {
return b
}

result := *p

if b == nil {
return &result
}

if b.Enabled {
result.Enabled = true
}

if b.NodeThreshold != 0 {
result.NodeThreshold = b.NodeThreshold
}

if b.NodeWindow != 0 {
result.NodeWindow = b.NodeWindow
}
if b.NodeWindowHCL != "" {
result.NodeWindowHCL = b.NodeWindowHCL
}
return &result
}

// Search is used in servers to configure search API options.
type Search struct {
// FuzzyEnabled toggles whether the FuzzySearch API is enabled. If not
Expand Down Expand Up @@ -985,6 +1036,10 @@ func DefaultConfig() *Config {
EventBufferSize: helper.IntToPtr(100),
RaftProtocol: 3,
StartJoin: []string{},
PlanRejectionTracker: &PlanRejectionTracker{
NodeThreshold: 100,
NodeWindow: 5 * time.Minute,
},
ServerJoin: &ServerJoin{
RetryJoin: []string{},
RetryInterval: 30 * time.Second,
Expand Down Expand Up @@ -1586,6 +1641,10 @@ func (s *ServerConfig) Merge(b *ServerConfig) *ServerConfig {
result.EventBufferSize = b.EventBufferSize
}

if b.PlanRejectionTracker != nil {
result.PlanRejectionTracker = result.PlanRejectionTracker.Merge(b.PlanRejectionTracker)
}

if b.DefaultSchedulerConfig != nil {
c := *b.DefaultSchedulerConfig
result.DefaultSchedulerConfig = &c
Expand Down
8 changes: 6 additions & 2 deletions command/agent/config_parse.go
Original file line number Diff line number Diff line change
Expand Up @@ -43,9 +43,12 @@ func ParseConfigFile(path string) (*Config, error) {
VaultRetry: &client.RetryConfig{},
},
},
Server: &ServerConfig{
PlanRejectionTracker: &PlanRejectionTracker{},
ServerJoin: &ServerJoin{},
},
ACL: &ACLConfig{},
Audit: &config.AuditConfig{},
Server: &ServerConfig{ServerJoin: &ServerJoin{}},
Consul: &config.ConsulConfig{},
Autopilot: &config.AutopilotConfig{},
Telemetry: &Telemetry{},
Expand All @@ -54,7 +57,7 @@ func ParseConfigFile(path string) (*Config, error) {

err = hcl.Decode(c, buf.String())
if err != nil {
return nil, err
return nil, fmt.Errorf("failed to decode HCL file %s: %w", path, err)
}

// convert strings to time.Durations
Expand All @@ -66,6 +69,7 @@ func ParseConfigFile(path string) (*Config, error) {
{"server.heartbeat_grace", &c.Server.HeartbeatGrace, &c.Server.HeartbeatGraceHCL, nil},
{"server.min_heartbeat_ttl", &c.Server.MinHeartbeatTTL, &c.Server.MinHeartbeatTTLHCL, nil},
{"server.failover_heartbeat_ttl", &c.Server.FailoverHeartbeatTTL, &c.Server.FailoverHeartbeatTTLHCL, nil},
{"server.plan_rejection_tracker.node_window", &c.Server.PlanRejectionTracker.NodeWindow, &c.Server.PlanRejectionTracker.NodeWindowHCL, nil},
{"server.retry_interval", &c.Server.RetryInterval, &c.Server.RetryIntervalHCL, nil},
{"server.server_join.retry_interval", &c.Server.ServerJoin.RetryInterval, &c.Server.ServerJoin.RetryIntervalHCL, nil},
{"consul.timeout", &c.Consul.Timeout, &c.Consul.TimeoutHCL, nil},
Expand Down
19 changes: 19 additions & 0 deletions command/agent/config_parse_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -126,6 +126,12 @@ var basicConfig = &Config{
EncryptKey: "abc",
EnableEventBroker: helper.BoolToPtr(false),
EventBufferSize: helper.IntToPtr(200),
PlanRejectionTracker: &PlanRejectionTracker{
Enabled: true,
NodeThreshold: 100,
NodeWindow: 41 * time.Minute,
NodeWindowHCL: "41m",
},
ServerJoin: &ServerJoin{
RetryJoin: []string{"1.1.1.1", "2.2.2.2"},
RetryInterval: time.Duration(15) * time.Second,
Expand Down Expand Up @@ -540,6 +546,9 @@ func (c *Config) addDefaults() {
if c.Server.ServerJoin == nil {
c.Server.ServerJoin = &ServerJoin{}
}
if c.Server.PlanRejectionTracker == nil {
c.Server.PlanRejectionTracker = &PlanRejectionTracker{}
}
}

// Tests for a panic parsing json with an object of exactly
Expand Down Expand Up @@ -617,6 +626,11 @@ var sample0 = &Config{
RetryJoin: []string{"10.0.0.101", "10.0.0.102", "10.0.0.103"},
EncryptKey: "sHck3WL6cxuhuY7Mso9BHA==",
ServerJoin: &ServerJoin{},
PlanRejectionTracker: &PlanRejectionTracker{
NodeThreshold: 100,
NodeWindow: 31 * time.Minute,
NodeWindowHCL: "31m",
},
},
ACL: &ACLConfig{
Enabled: true,
Expand Down Expand Up @@ -707,6 +721,11 @@ var sample1 = &Config{
RetryJoin: []string{"10.0.0.101", "10.0.0.102", "10.0.0.103"},
EncryptKey: "sHck3WL6cxuhuY7Mso9BHA==",
ServerJoin: &ServerJoin{},
PlanRejectionTracker: &PlanRejectionTracker{
NodeThreshold: 100,
NodeWindow: 31 * time.Minute,
NodeWindowHCL: "31m",
},
},
ACL: &ACLConfig{
Enabled: true,
Expand Down
10 changes: 10 additions & 0 deletions command/agent/config_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -148,6 +148,11 @@ func TestConfig_Merge(t *testing.T) {
UpgradeVersion: "foo",
EnableEventBroker: helper.BoolToPtr(false),
EventBufferSize: helper.IntToPtr(0),
PlanRejectionTracker: &PlanRejectionTracker{
Enabled: true,
NodeThreshold: 100,
NodeWindow: 11 * time.Minute,
},
},
ACL: &ACLConfig{
Enabled: true,
Expand Down Expand Up @@ -343,6 +348,11 @@ func TestConfig_Merge(t *testing.T) {
UpgradeVersion: "bar",
EnableEventBroker: helper.BoolToPtr(true),
EventBufferSize: helper.IntToPtr(100),
PlanRejectionTracker: &PlanRejectionTracker{
Enabled: true,
NodeThreshold: 100,
NodeWindow: 11 * time.Minute,
},
},
ACL: &ACLConfig{
Enabled: true,
Expand Down
6 changes: 6 additions & 0 deletions command/agent/testdata/basic.hcl
Original file line number Diff line number Diff line change
Expand Up @@ -133,6 +133,12 @@ server {
enable_event_broker = false
event_buffer_size = 200

plan_rejection_tracker {
enabled = true
node_threshold = 100
node_window = "41m"
}

server_join {
retry_join = ["1.1.1.1", "2.2.2.2"]
retry_max = 3
Expand Down
5 changes: 5 additions & 0 deletions command/agent/testdata/basic.json
Original file line number Diff line number Diff line change
Expand Up @@ -277,6 +277,11 @@
"node_gc_threshold": "12h",
"non_voting_server": true,
"num_schedulers": 2,
"plan_rejection_tracker": {
"enabled": true,
"node_threshold": 100,
"node_window": "41m"
},
"raft_protocol": 3,
"raft_multiplier": 4,
"redundancy_zone": "foo",
Expand Down
4 changes: 4 additions & 0 deletions command/agent/testdata/sample0.json
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,10 @@
"bootstrap_expect": 3,
"enabled": true,
"encrypt": "sHck3WL6cxuhuY7Mso9BHA==",
"plan_rejection_tracker": {
"node_threshold": 100,
"node_window": "31m"
},
"retry_join": [
"10.0.0.101",
"10.0.0.102",
Expand Down
4 changes: 4 additions & 0 deletions command/agent/testdata/sample1/sample0.json
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,10 @@
"bootstrap_expect": 3,
"enabled": true,
"encrypt": "sHck3WL6cxuhuY7Mso9BHA==",
"plan_rejection_tracker": {
"node_threshold": 100,
"node_window": "31m"
},
"retry_join": [
"10.0.0.101",
"10.0.0.102",
Expand Down
14 changes: 14 additions & 0 deletions nomad/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -232,6 +232,17 @@ type Config struct {
// additional delay is selected from this range randomly.
EvalFailedFollowupDelayRange time.Duration

// NodePlanRejectionEnabled controls if node rejection tracker is enabled.
NodePlanRejectionEnabled bool

// NodePlanRejectionThreshold is the number of times a node must have a
// plan rejection before it is set as ineligible.
NodePlanRejectionThreshold int

// NodePlanRejectionWindow is the time window used to track plan
// rejections for nodes.
NodePlanRejectionWindow time.Duration

// MinHeartbeatTTL is the minimum time between heartbeats.
// This is used as a floor to prevent excessive updates.
MinHeartbeatTTL time.Duration
Expand Down Expand Up @@ -395,6 +406,9 @@ func DefaultConfig() *Config {
MaxHeartbeatsPerSecond: 50.0,
HeartbeatGrace: 10 * time.Second,
FailoverHeartbeatTTL: 300 * time.Second,
NodePlanRejectionEnabled: false,
NodePlanRejectionThreshold: 15,
NodePlanRejectionWindow: 10 * time.Minute,
ConsulConfig: config.DefaultConsulConfig(),
VaultConfig: config.DefaultVaultConfig(),
RPCHoldTimeout: 5 * time.Second,
Expand Down
Loading