Track plan rejection history and automatically mark clients as ineligible #13421
New changelog entry (`@@ -0,0 +1,7 @@`):

```release-note:improvement
core: automatically mark clients with recurring plan rejections as ineligible
```

```release-note:improvement
metrics: emit `nomad.nomad.plan.rejection_tracker.node_score` metric for the number of times a node had a plan rejection within the past time window
```
```go
@@ -503,6 +503,10 @@ type ServerConfig struct {
	// This value is ignored.
	DefaultSchedulerConfig *structs.SchedulerConfiguration `hcl:"default_scheduler_config"`

	// PlanRejectionTracker configures the node plan rejection tracker that
	// detects potentially bad nodes.
	PlanRejectionTracker *PlanRejectionTracker `hcl:"plan_rejection_tracker"`

	// EnableEventBroker configures whether this server's state store
	// will generate events for its event stream.
	EnableEventBroker *bool `hcl:"enable_event_broker"`

@@ -548,6 +552,46 @@ type RaftBoltConfig struct {
	NoFreelistSync bool `hcl:"no_freelist_sync"`
}

// PlanRejectionTracker is used in servers to configure the plan rejection
// tracker.
type PlanRejectionTracker struct {
	// NodeThreshold is the number of times a node can have plan rejections
	// before it is marked as ineligible.
	NodeThreshold int `hcl:"node_threshold"`

	// NodeWindow is the time window used to track active plan rejections for
	// nodes.
	NodeWindow    time.Duration
	NodeWindowHCL string `hcl:"node_window" json:"-"`

	// ExtraKeysHCL is used by hcl to surface unexpected keys.
	ExtraKeysHCL []string `hcl:",unusedKeys" json:"-"`
}

func (p *PlanRejectionTracker) Merge(b *PlanRejectionTracker) *PlanRejectionTracker {
	if p == nil {
		return b
	}

	result := *p

	if b == nil {
		return &result
	}

	if b.NodeThreshold != 0 {
		result.NodeThreshold = b.NodeThreshold
	}

	if b.NodeWindow != 0 {
		result.NodeWindow = b.NodeWindow
	}
	if b.NodeWindowHCL != "" {
		result.NodeWindowHCL = b.NodeWindowHCL
	}
	return &result
}

// Search is used in servers to configure search API options.
type Search struct {
	// FuzzyEnabled toggles whether the FuzzySearch API is enabled. If not
```
```go
@@ -985,6 +1029,10 @@ func DefaultConfig() *Config {
		EventBufferSize: helper.IntToPtr(100),
		RaftProtocol:    3,
		StartJoin:       []string{},
		PlanRejectionTracker: &PlanRejectionTracker{
			NodeThreshold: 15,
			NodeWindow:    10 * time.Minute,
		},
```
**Review comment:** I'm also not sure about these defaults. The workers will retry the plan with exponential back-off, so depending on the cluster load and eval queue size it may take a while for the node to reach the threshold. So I picked these values based on a threshold number that is likely to indicate a real problem (it could maybe even be a little higher?), not just normal plan rejections, and a time window that encompasses a significant part of the rejection history but that will probably have expired by the time operators have detected, drained, and restarted the client. A short time window risks the threshold never being hit, and a large one risks a normal plan rejection triggering ineligibility since the node score can still be high.

**Reply:** Agreed. Let's try to dig up some real world … If we moved this into …

**Reply:** From our out-of-band discussion: let's leave this in the config file. If you're fiddling with it live, you can just mark bad nodes ineligible with existing commands and fiddle with these settings after your cluster is healthy. Hopefully we can find good enough defaults (not to mention root cause fixes!) that no one needs to worry about this anyway.
||
ServerJoin: &ServerJoin{ | ||
RetryJoin: []string{}, | ||
RetryInterval: 30 * time.Second, | ||
|
```go
@@ -1586,6 +1634,10 @@ func (s *ServerConfig) Merge(b *ServerConfig) *ServerConfig {
	result.EventBufferSize = b.EventBufferSize
	}

	if b.PlanRejectionTracker != nil {
		result.PlanRejectionTracker = result.PlanRejectionTracker.Merge(b.PlanRejectionTracker)
	}

	if b.DefaultSchedulerConfig != nil {
		c := *b.DefaultSchedulerConfig
		result.DefaultSchedulerConfig = &c
```
**Review comment:** I'm not sure about these names, but I wanted to hedge against us potentially needing to track plan rejections for other things, like evals or jobs. Feel free to modify them.