diff --git a/api/allocations.go b/api/allocations.go index 68047ee5b462..fc035ebb16ce 100644 --- a/api/allocations.go +++ b/api/allocations.go @@ -81,6 +81,7 @@ type Allocation struct { Metrics *AllocationMetric DesiredStatus string DesiredDescription string + DesiredTransition DesiredTransition ClientStatus string ClientDescription string TaskStates map[string]*TaskState @@ -205,3 +206,17 @@ type RescheduleEvent struct { // PrevNodeID is the node ID of the previous allocation PrevNodeID string } + +// DesiredTransition is used to mark an allocation as having a desired state +// transition. This information can be used by the scheduler to make the +// correct decision. +type DesiredTransition struct { + // Migrate is used to indicate that this allocation should be stopped and + // migrated to another node. + Migrate *bool +} + +// ShouldMigrate returns whether the transition object dictates a migration. +func (d DesiredTransition) ShouldMigrate() bool { + return d.Migrate != nil && *d.Migrate +} diff --git a/api/allocations_test.go b/api/allocations_test.go index dd5ae333bd1a..5eb5508bb69f 100644 --- a/api/allocations_test.go +++ b/api/allocations_test.go @@ -239,3 +239,10 @@ func TestAllocations_RescheduleInfo(t *testing.T) { } } + +func TestAllocations_ShouldMigrate(t *testing.T) { + t.Parallel() + require.True(t, DesiredTransition{Migrate: helper.BoolToPtr(true)}.ShouldMigrate()) + require.False(t, DesiredTransition{}.ShouldMigrate()) + require.False(t, DesiredTransition{Migrate: helper.BoolToPtr(false)}.ShouldMigrate()) +} diff --git a/api/jobs.go b/api/jobs.go index 9e3227af49e8..5fcecf403871 100644 --- a/api/jobs.go +++ b/api/jobs.go @@ -559,6 +559,7 @@ type Job struct { ParameterizedJob *ParameterizedJobConfig Payload []byte Reschedule *ReschedulePolicy + Migrate *MigrateStrategy Meta map[string]string VaultToken *string `mapstructure:"vault_token"` Status *string diff --git a/api/jobs_test.go b/api/jobs_test.go index edf045a3cde1..f14bd7b49fd8 100644 --- a/api/jobs_test.go +++ b/api/jobs_test.go @@ -12,41 +12,34 @@ import ( "github.com/hashicorp/nomad/testutil" "github.com/kr/pretty" "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" ) func TestJobs_Register(t *testing.T) { t.Parallel() + require := require.New(t) + c, s := makeClient(t, nil, nil) defer s.Stop() jobs := c.Jobs() // Listing jobs before registering returns nothing resp, qm, err := jobs.List(nil) - if err != nil { - t.Fatalf("err: %s", err) - } + require.Nil(err) assertQueryMeta(t, qm) - if n := len(resp); n != 0 { - t.Fatalf("expected 0 jobs, got: %d", n) - } + require.Emptyf(resp, "expected 0 jobs, got: %d", len(resp)) // Create a job and attempt to register it job := testJob() resp2, wm, err := jobs.Register(job, nil) - if err != nil { - t.Fatalf("err: %s", err) - } - if resp2 == nil || resp2.EvalID == "" { - t.Fatalf("missing eval id") - } + require.Nil(err) + require.NotNil(resp2) + require.NotEmpty(resp2.EvalID) assertWriteMeta(t, wm) // Query the jobs back out again - resp, qm, err = jobs.List(nil) - if err != nil { - t.Fatalf("err: %s", err) - } - assertQueryMeta(t, qm) + resp, _, err = jobs.List(nil) + require.Nil(err) // Check that we got the expected response if len(resp) != 1 || resp[0].ID != *job.ID { @@ -141,6 +134,7 @@ func TestJobs_Canonicalize(t *testing.T) { MaxDelay: helper.TimeToPtr(1 * time.Hour), Unlimited: helper.BoolToPtr(true), }, + Migrate: DefaultMigrateStrategy(), Tasks: []*Task{ { KillTimeout: helper.TimeToPtr(5 * time.Second), @@ -211,6 +205,7 @@ func 
TestJobs_Canonicalize(t *testing.T) { MaxDelay: helper.TimeToPtr(1 * time.Hour), Unlimited: helper.BoolToPtr(true), }, + Migrate: DefaultMigrateStrategy(), Tasks: []*Task{ { Name: "task1", @@ -363,6 +358,7 @@ func TestJobs_Canonicalize(t *testing.T) { AutoRevert: helper.BoolToPtr(false), Canary: helper.IntToPtr(0), }, + Migrate: DefaultMigrateStrategy(), Tasks: []*Task{ { Name: "redis", @@ -576,6 +572,7 @@ func TestJobs_Canonicalize(t *testing.T) { AutoRevert: helper.BoolToPtr(true), Canary: helper.IntToPtr(1), }, + Migrate: DefaultMigrateStrategy(), Tasks: []*Task{ { Name: "task1", @@ -616,6 +613,7 @@ func TestJobs_Canonicalize(t *testing.T) { AutoRevert: helper.BoolToPtr(false), Canary: helper.IntToPtr(0), }, + Migrate: DefaultMigrateStrategy(), Tasks: []*Task{ { Name: "task1", diff --git a/api/nodes.go b/api/nodes.go index 549eeea66639..76e25594f775 100644 --- a/api/nodes.go +++ b/api/nodes.go @@ -3,8 +3,9 @@ package api import ( "fmt" "sort" - "strconv" "time" + + "github.com/hashicorp/nomad/nomad/structs" ) // Nodes is used to query node-related API endpoints @@ -42,10 +43,57 @@ func (n *Nodes) Info(nodeID string, q *QueryOptions) (*Node, *QueryMeta, error) return &resp, qm, nil } -// ToggleDrain is used to toggle drain mode on/off for a given node. -func (n *Nodes) ToggleDrain(nodeID string, drain bool, q *WriteOptions) (*WriteMeta, error) { - drainArg := strconv.FormatBool(drain) - wm, err := n.client.write("/v1/node/"+nodeID+"/drain?enable="+drainArg, nil, nil, q) +// NodeUpdateDrainRequest is used to update the drain specification for a node. +type NodeUpdateDrainRequest struct { + // NodeID is the node to update the drain specification for. + NodeID string + + // DrainSpec is the drain specification to set for the node. A nil DrainSpec + // will disable draining. + DrainSpec *DrainSpec + + // MarkEligible marks the node as eligible for scheduling if removing + // the drain strategy. + MarkEligible bool +} + +// UpdateDrain is used to update the drain strategy for a given node. If +// markEligible is true and the drain is being removed, the node will be marked +// as eligible for scheduling. +func (n *Nodes) UpdateDrain(nodeID string, spec *DrainSpec, markEligible bool, q *WriteOptions) (*WriteMeta, error) { + req := &NodeUpdateDrainRequest{ + NodeID: nodeID, + DrainSpec: spec, + MarkEligible: markEligible, + } + + wm, err := n.client.write("/v1/node/"+nodeID+"/drain", req, nil, q) + if err != nil { + return nil, err + } + return wm, nil +} + +// NodeUpdateEligibilityRequest is used to update the scheduling eligibility of a node. +type NodeUpdateEligibilityRequest struct { + // NodeID is the node to update the scheduling eligibility of. + NodeID string + Eligibility string +} + +// ToggleEligibility is used to update the scheduling eligibility of the node +func (n *Nodes) ToggleEligibility(nodeID string, eligible bool, q *WriteOptions) (*WriteMeta, error) { + e := structs.NodeSchedulingEligible + if !eligible { + e = structs.NodeSchedulingIneligible + } + + req := &NodeUpdateEligibilityRequest{ + NodeID: nodeID, + Eligibility: e, + } + + wm, err := n.client.write("/v1/node/"+nodeID+"/eligibility", req, nil, q) if err != nil { return nil, err } @@ -108,25 +156,48 @@ type DriverInfo struct { // Node is used to deserialize a node entry.
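For context on how the new client surface above is meant to be driven, here is a minimal, hypothetical sketch of an external caller. It is not part of this diff: the node ID, deadline, and error handling are illustrative, and the only assumptions are the signatures introduced above (UpdateDrain, ToggleEligibility, DrainSpec) plus the existing api.NewClient/DefaultConfig helpers.

package main

import (
	"log"
	"time"

	"github.com/hashicorp/nomad/api"
)

func main() {
	client, err := api.NewClient(api.DefaultConfig())
	if err != nil {
		log.Fatal(err)
	}
	nodes := client.Nodes()
	nodeID := "example-node-id" // illustrative placeholder

	// Start a drain with a 15 minute deadline, leaving system jobs running.
	spec := &api.DrainSpec{Deadline: 15 * time.Minute, IgnoreSystemJobs: true}
	if _, err := nodes.UpdateDrain(nodeID, spec, false, nil); err != nil {
		log.Fatal(err)
	}

	// A nil DrainSpec cancels the drain; markEligible=true restores the
	// node's scheduling eligibility at the same time.
	if _, err := nodes.UpdateDrain(nodeID, nil, true, nil); err != nil {
		log.Fatal(err)
	}

	// Eligibility can also be toggled on its own, without draining.
	if _, err := nodes.ToggleEligibility(nodeID, false, nil); err != nil {
		log.Fatal(err)
	}
}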
type Node struct { - ID string - Datacenter string - Name string - HTTPAddr string - TLSEnabled bool - Attributes map[string]string - Resources *Resources - Reserved *Resources - Links map[string]string - Meta map[string]string - NodeClass string - Drain bool - Status string - StatusDescription string - StatusUpdatedAt int64 - Events []*NodeEvent - Drivers map[string]*DriverInfo - CreateIndex uint64 - ModifyIndex uint64 + ID string + Datacenter string + Name string + HTTPAddr string + TLSEnabled bool + Attributes map[string]string + Resources *Resources + Reserved *Resources + Links map[string]string + Meta map[string]string + NodeClass string + Drain bool + DrainStrategy *DrainStrategy + SchedulingEligibility string + Status string + StatusDescription string + StatusUpdatedAt int64 + Events []*NodeEvent + Drivers map[string]*DriverInfo + CreateIndex uint64 + ModifyIndex uint64 +} + +// DrainStrategy describes a Node's drain behavior. +type DrainStrategy struct { + // DrainSpec is the user declared drain specification + DrainSpec + + // ForceDeadline is the deadline time for the drain after which drains will + // be forced + ForceDeadline time.Time +} + +// DrainSpec describes a Node's drain behavior. +type DrainSpec struct { + // Deadline is the duration after StartTime when the remaining + // allocations on a draining Node should be told to stop. + Deadline time.Duration + + // IgnoreSystemJobs allows systems jobs to remain on the node even though it + // has been marked for draining. + IgnoreSystemJobs bool } const ( @@ -181,17 +252,18 @@ type HostDiskStats struct { // NodeListStub is a subset of information returned during // node list operations. type NodeListStub struct { - Address string - ID string - Datacenter string - Name string - NodeClass string - Version string - Drain bool - Status string - StatusDescription string - CreateIndex uint64 - ModifyIndex uint64 + Address string + ID string + Datacenter string + Name string + NodeClass string + Version string + Drain bool + SchedulingEligibility string + Status string + StatusDescription string + CreateIndex uint64 + ModifyIndex uint64 } // NodeIndexSort reverse sorts nodes by CreateIndex diff --git a/api/nodes_test.go b/api/nodes_test.go index 06b960746942..4945b3f99c76 100644 --- a/api/nodes_test.go +++ b/api/nodes_test.go @@ -142,6 +142,7 @@ func TestNodes_Info(t *testing.T) { func TestNodes_ToggleDrain(t *testing.T) { t.Parallel() + require := require.New(t) c, s := makeClient(t, nil, func(c *testutil.TestServerConfig) { c.DevMode = true }) @@ -166,15 +167,80 @@ func TestNodes_ToggleDrain(t *testing.T) { // Check for drain mode out, _, err := nodes.Info(nodeID, nil) - if err != nil { - t.Fatalf("err: %s", err) - } + require.Nil(err) if out.Drain { t.Fatalf("drain mode should be off") } // Toggle it on - wm, err := nodes.ToggleDrain(nodeID, true, nil) + spec := &DrainSpec{ + Deadline: 10 * time.Second, + } + wm, err := nodes.UpdateDrain(nodeID, spec, false, nil) + require.Nil(err) + assertWriteMeta(t, wm) + + // Check again + out, _, err = nodes.Info(nodeID, nil) + require.Nil(err) + if out.SchedulingEligibility != structs.NodeSchedulingIneligible { + t.Fatalf("bad eligibility: %v vs %v", out.SchedulingEligibility, structs.NodeSchedulingIneligible) + } + + // Toggle off again + wm, err = nodes.UpdateDrain(nodeID, nil, true, nil) + require.Nil(err) + assertWriteMeta(t, wm) + + // Check again + out, _, err = nodes.Info(nodeID, nil) + require.Nil(err) + if out.Drain { + t.Fatalf("drain mode should be off") + } + if 
out.DrainStrategy != nil { + t.Fatalf("drain strategy should be unset") + } + if out.SchedulingEligibility != structs.NodeSchedulingEligible { + t.Fatalf("should be eligible") + } +} + +func TestNodes_ToggleEligibility(t *testing.T) { + t.Parallel() + c, s := makeClient(t, nil, func(c *testutil.TestServerConfig) { + c.DevMode = true + }) + defer s.Stop() + nodes := c.Nodes() + + // Wait for node registration and get the ID + var nodeID string + testutil.WaitForResult(func() (bool, error) { + out, _, err := nodes.List(nil) + if err != nil { + return false, err + } + if n := len(out); n != 1 { + return false, fmt.Errorf("expected 1 node, got: %d", n) + } + nodeID = out[0].ID + return true, nil + }, func(err error) { + t.Fatalf("err: %s", err) + }) + + // Check for eligibility + out, _, err := nodes.Info(nodeID, nil) + if err != nil { + t.Fatalf("err: %s", err) + } + if out.SchedulingEligibility != structs.NodeSchedulingEligible { + t.Fatalf("node should be eligible") + } + + // Toggle it off + wm, err := nodes.ToggleEligibility(nodeID, false, nil) if err != nil { t.Fatalf("err: %s", err) } @@ -185,12 +251,12 @@ func TestNodes_ToggleDrain(t *testing.T) { if err != nil { t.Fatalf("err: %s", err) } - if !out.Drain { - t.Fatalf("drain mode should be on") + if out.SchedulingEligibility != structs.NodeSchedulingIneligible { + t.Fatalf("bad eligibility: %v vs %v", out.SchedulingEligibility, structs.NodeSchedulingIneligible) } - // Toggle off again - wm, err = nodes.ToggleDrain(nodeID, false, nil) + // Toggle on + wm, err = nodes.ToggleEligibility(nodeID, true, nil) if err != nil { t.Fatalf("err: %s", err) } @@ -201,8 +267,8 @@ func TestNodes_ToggleDrain(t *testing.T) { if err != nil { t.Fatalf("err: %s", err) } - if out.Drain { - t.Fatalf("drain mode should be off") + if out.SchedulingEligibility != structs.NodeSchedulingEligible { + t.Fatalf("bad eligibility: %v vs %v", out.SchedulingEligibility, structs.NodeSchedulingEligible) } } diff --git a/api/tasks.go b/api/tasks.go index 047afccaf0a3..47b502d57558 100644 --- a/api/tasks.go +++ b/api/tasks.go @@ -284,6 +284,67 @@ func (e *EphemeralDisk) Canonicalize() { } } +// MigrateStrategy describes how allocations for a task group should be +// migrated between nodes (eg when draining). 
+type MigrateStrategy struct { + MaxParallel *int `mapstructure:"max_parallel"` + HealthCheck *string `mapstructure:"health_check"` + MinHealthyTime *time.Duration `mapstructure:"min_healthy_time"` + HealthyDeadline *time.Duration `mapstructure:"healthy_deadline"` +} + +func DefaultMigrateStrategy() *MigrateStrategy { + return &MigrateStrategy{ + MaxParallel: helper.IntToPtr(1), + HealthCheck: helper.StringToPtr("checks"), + MinHealthyTime: helper.TimeToPtr(10 * time.Second), + HealthyDeadline: helper.TimeToPtr(5 * time.Minute), + } +} + +func (m *MigrateStrategy) Canonicalize() { + if m == nil { + return + } + defaults := DefaultMigrateStrategy() + if m.MaxParallel == nil { + m.MaxParallel = defaults.MaxParallel + } + if m.HealthCheck == nil { + m.HealthCheck = defaults.HealthCheck + } + if m.MinHealthyTime == nil { + m.MinHealthyTime = defaults.MinHealthyTime + } + if m.HealthyDeadline == nil { + m.HealthyDeadline = defaults.HealthyDeadline + } +} + +func (m *MigrateStrategy) Merge(o *MigrateStrategy) { + if o.MaxParallel != nil { + m.MaxParallel = o.MaxParallel + } + if o.HealthCheck != nil { + m.HealthCheck = o.HealthCheck + } + if o.MinHealthyTime != nil { + m.MinHealthyTime = o.MinHealthyTime + } + if o.HealthyDeadline != nil { + m.HealthyDeadline = o.HealthyDeadline + } +} + +func (m *MigrateStrategy) Copy() *MigrateStrategy { + if m == nil { + return nil + } + nm := new(MigrateStrategy) + *nm = *m + return nm +} + // TaskGroup is the unit of scheduling. type TaskGroup struct { Name *string @@ -294,6 +355,7 @@ type TaskGroup struct { ReschedulePolicy *ReschedulePolicy EphemeralDisk *EphemeralDisk Update *UpdateStrategy + Migrate *MigrateStrategy Meta map[string]string } @@ -377,6 +439,26 @@ func (g *TaskGroup) Canonicalize(job *Job) { } g.ReschedulePolicy = defaultReschedulePolicy + // Merge the migrate strategy from the job + if jm, tm := job.Migrate != nil, g.Migrate != nil; jm && tm { + jobMigrate := job.Migrate.Copy() + jobMigrate.Merge(g.Migrate) + g.Migrate = jobMigrate + } else if jm { + jobMigrate := job.Migrate.Copy() + g.Migrate = jobMigrate + } + + // Merge with default reschedule policy + if *job.Type == "service" { + defaultMigrateStrategy := &MigrateStrategy{} + defaultMigrateStrategy.Canonicalize() + if g.Migrate != nil { + defaultMigrateStrategy.Merge(g.Migrate) + } + g.Migrate = defaultMigrateStrategy + } + var defaultRestartPolicy *RestartPolicy switch *job.Type { case "service", "system": diff --git a/api/tasks_test.go b/api/tasks_test.go index 3280507ad591..d72acc179bf6 100644 --- a/api/tasks_test.go +++ b/api/tasks_test.go @@ -430,6 +430,158 @@ func TestTaskGroup_Canonicalize_ReschedulePolicy(t *testing.T) { } } +// Verifies that migrate strategy is merged correctly +func TestTaskGroup_Canonicalize_MigrateStrategy(t *testing.T) { + type testCase struct { + desc string + jobType string + jobMigrate *MigrateStrategy + taskMigrate *MigrateStrategy + expected *MigrateStrategy + } + + testCases := []testCase{ + { + desc: "Default batch", + jobType: "batch", + jobMigrate: nil, + taskMigrate: nil, + expected: nil, + }, + { + desc: "Default service", + jobType: "service", + jobMigrate: nil, + taskMigrate: nil, + expected: &MigrateStrategy{ + MaxParallel: helper.IntToPtr(1), + HealthCheck: helper.StringToPtr("checks"), + MinHealthyTime: helper.TimeToPtr(10 * time.Second), + HealthyDeadline: helper.TimeToPtr(5 * time.Minute), + }, + }, + { + desc: "Empty job migrate strategy", + jobType: "service", + jobMigrate: &MigrateStrategy{ + MaxParallel: helper.IntToPtr(0), + 
HealthCheck: helper.StringToPtr(""), + MinHealthyTime: helper.TimeToPtr(0), + HealthyDeadline: helper.TimeToPtr(0), + }, + taskMigrate: nil, + expected: &MigrateStrategy{ + MaxParallel: helper.IntToPtr(0), + HealthCheck: helper.StringToPtr(""), + MinHealthyTime: helper.TimeToPtr(0), + HealthyDeadline: helper.TimeToPtr(0), + }, + }, + { + desc: "Inherit from job", + jobType: "service", + jobMigrate: &MigrateStrategy{ + MaxParallel: helper.IntToPtr(3), + HealthCheck: helper.StringToPtr("checks"), + MinHealthyTime: helper.TimeToPtr(2), + HealthyDeadline: helper.TimeToPtr(2), + }, + taskMigrate: nil, + expected: &MigrateStrategy{ + MaxParallel: helper.IntToPtr(3), + HealthCheck: helper.StringToPtr("checks"), + MinHealthyTime: helper.TimeToPtr(2), + HealthyDeadline: helper.TimeToPtr(2), + }, + }, + { + desc: "Set in task", + jobType: "service", + jobMigrate: nil, + taskMigrate: &MigrateStrategy{ + MaxParallel: helper.IntToPtr(3), + HealthCheck: helper.StringToPtr("checks"), + MinHealthyTime: helper.TimeToPtr(2), + HealthyDeadline: helper.TimeToPtr(2), + }, + expected: &MigrateStrategy{ + MaxParallel: helper.IntToPtr(3), + HealthCheck: helper.StringToPtr("checks"), + MinHealthyTime: helper.TimeToPtr(2), + HealthyDeadline: helper.TimeToPtr(2), + }, + }, + { + desc: "Merge from job", + jobType: "service", + jobMigrate: &MigrateStrategy{ + MaxParallel: helper.IntToPtr(11), + }, + taskMigrate: &MigrateStrategy{ + HealthCheck: helper.StringToPtr("checks"), + MinHealthyTime: helper.TimeToPtr(2), + HealthyDeadline: helper.TimeToPtr(2), + }, + expected: &MigrateStrategy{ + MaxParallel: helper.IntToPtr(11), + HealthCheck: helper.StringToPtr("checks"), + MinHealthyTime: helper.TimeToPtr(2), + HealthyDeadline: helper.TimeToPtr(2), + }, + }, + { + desc: "Override from group", + jobType: "service", + jobMigrate: &MigrateStrategy{ + MaxParallel: helper.IntToPtr(11), + }, + taskMigrate: &MigrateStrategy{ + MaxParallel: helper.IntToPtr(5), + HealthCheck: helper.StringToPtr("checks"), + MinHealthyTime: helper.TimeToPtr(2), + HealthyDeadline: helper.TimeToPtr(2), + }, + expected: &MigrateStrategy{ + MaxParallel: helper.IntToPtr(5), + HealthCheck: helper.StringToPtr("checks"), + MinHealthyTime: helper.TimeToPtr(2), + HealthyDeadline: helper.TimeToPtr(2), + }, + }, + { + desc: "Parallel from job, defaulting", + jobType: "service", + jobMigrate: &MigrateStrategy{ + MaxParallel: helper.IntToPtr(5), + }, + taskMigrate: nil, + expected: &MigrateStrategy{ + MaxParallel: helper.IntToPtr(5), + HealthCheck: helper.StringToPtr("checks"), + MinHealthyTime: helper.TimeToPtr(10 * time.Second), + HealthyDeadline: helper.TimeToPtr(5 * time.Minute), + }, + }, + } + + for _, tc := range testCases { + t.Run(tc.desc, func(t *testing.T) { + job := &Job{ + ID: helper.StringToPtr("test"), + Migrate: tc.jobMigrate, + Type: helper.StringToPtr(tc.jobType), + } + job.Canonicalize() + tg := &TaskGroup{ + Name: helper.StringToPtr("foo"), + Migrate: tc.taskMigrate, + } + tg.Canonicalize(job) + assert.Equal(t, tc.expected, tg.Migrate) + }) + } +} + // TestService_CheckRestart asserts Service.CheckRestart settings are properly // inherited by Checks. 
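To make the precedence exercised by the table above concrete, a small sketch mirroring the test setup (in-package names and the helper package as used in these tests; the IDs and values are illustrative, not part of this diff): a job-level migrate block supplies MaxParallel, the group supplies HealthCheck, and Canonicalize fills the remaining fields from DefaultMigrateStrategy.

	job := &Job{
		ID:      helper.StringToPtr("example"),
		Type:    helper.StringToPtr("service"),
		Migrate: &MigrateStrategy{MaxParallel: helper.IntToPtr(2)},
	}
	job.Canonicalize()

	tg := &TaskGroup{
		Name:    helper.StringToPtr("web"),
		Migrate: &MigrateStrategy{HealthCheck: helper.StringToPtr("task_states")},
	}
	tg.Canonicalize(job)

	// tg.Migrate now has MaxParallel=2 (from the job), HealthCheck="task_states"
	// (from the group), and MinHealthyTime/HealthyDeadline filled from
	// DefaultMigrateStrategy (10s and 5m respectively).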
func TestService_CheckRestart(t *testing.T) { diff --git a/api/util_test.go b/api/util_test.go index 9aceee0bfdad..c6f99018e4ce 100644 --- a/api/util_test.go +++ b/api/util_test.go @@ -7,6 +7,7 @@ import ( ) func assertQueryMeta(t *testing.T, qm *QueryMeta) { + t.Helper() if qm.LastIndex == 0 { t.Fatalf("bad index: %d", qm.LastIndex) } @@ -16,6 +17,7 @@ func assertQueryMeta(t *testing.T, qm *QueryMeta) { } func assertWriteMeta(t *testing.T, wm *WriteMeta) { + t.Helper() if wm.LastIndex == 0 { t.Fatalf("bad index: %d", wm.LastIndex) } diff --git a/client/alloc_runner_health_watcher.go b/client/alloc_runner_health_watcher.go index ba94763b555e..b57f9c46e94f 100644 --- a/client/alloc_runner_health_watcher.go +++ b/client/alloc_runner_health_watcher.go @@ -31,7 +31,22 @@ func (r *AllocRunner) watchHealth(ctx context.Context) { // See if we should watch the allocs health alloc := r.Alloc() - if alloc.DeploymentID == "" || alloc.DeploymentStatus.IsHealthy() || alloc.DeploymentStatus.IsUnhealthy() { + if alloc.DeploymentStatus.IsHealthy() || alloc.DeploymentStatus.IsUnhealthy() { + // No need to watch health as it's already set + return + } + + // Neither deployments nor migrations care about system jobs so never + // watch their health + if alloc.Job.Type == structs.JobTypeSystem { + return + } + + isDeploy := alloc.DeploymentID != "" + + // Migrations don't consider the health of batch jobs so only watch + // batch health during deployments + if !isDeploy && alloc.Job.Type == structs.JobTypeBatch { return } @@ -39,7 +54,9 @@ func (r *AllocRunner) watchHealth(ctx context.Context) { if tg == nil { r.logger.Printf("[ERR] client.alloc_watcher: failed to lookup allocation's task group. Exiting watcher") return - } else if tg.Update == nil || tg.Update.HealthCheck == structs.UpdateStrategyHealthCheck_Manual { + } + + if isDeploy && (tg.Update == nil || tg.Update.HealthCheck == structs.UpdateStrategyHealthCheck_Manual) { return } @@ -47,14 +64,36 @@ func (r *AllocRunner) watchHealth(ctx context.Context) { l := r.allocBroadcast.Listen() defer l.Close() + // Define the deadline, health method, min healthy time from the + // deployment if this is a deployment; otherwise from the migration + // strategy. + var deadline time.Time + var useChecks bool + var minHealthyTime time.Duration + + if isDeploy { + deadline = time.Now().Add(tg.Update.HealthyDeadline) + minHealthyTime = tg.Update.MinHealthyTime + useChecks = tg.Update.HealthCheck == structs.UpdateStrategyHealthCheck_Checks + } else { + strategy := tg.Migrate + if strategy == nil { + // For backwards compat with pre-0.8 allocations that + // don't have a migrate strategy set. 
+ strategy = structs.DefaultMigrateStrategy() + } + deadline = time.Now().Add(strategy.HealthyDeadline) + minHealthyTime = strategy.MinHealthyTime + useChecks = strategy.HealthCheck == structs.MigrateStrategyHealthChecks + } + // Create a new context with the health deadline - deadline := time.Now().Add(tg.Update.HealthyDeadline) healthCtx, healthCtxCancel := context.WithDeadline(ctx, deadline) defer healthCtxCancel() - r.logger.Printf("[DEBUG] client.alloc_watcher: deadline (%v) for alloc %q is at %v", tg.Update.HealthyDeadline, alloc.ID, deadline) + r.logger.Printf("[DEBUG] client.alloc_watcher: deadline for alloc %q is at %v (deploy=%t checks=%t)", alloc.ID, deadline, isDeploy, useChecks) // Create the health tracker object - tracker := newAllocHealthTracker(healthCtx, r.logger, alloc, l, r.consulClient) + tracker := newAllocHealthTracker(healthCtx, r.logger, alloc, l, r.consulClient, minHealthyTime, useChecks) tracker.Start() allocHealthy := false @@ -77,8 +116,8 @@ func (r *AllocRunner) watchHealth(ctx context.Context) { r.allocHealth = helper.BoolToPtr(allocHealthy) r.allocLock.Unlock() - // We are unhealthy so emit task events explaining why - if !allocHealthy { + // If deployment is unhealthy emit task events explaining why + if !allocHealthy && isDeploy { r.taskLock.RLock() for task, event := range tracker.TaskEvents() { if tr, ok := r.tasks[task]; ok { @@ -107,6 +146,13 @@ type allocHealthTracker struct { // tg is the task group we are tracking tg *structs.TaskGroup + // minHealthyTime is the duration an alloc must remain healthy to be + // considered healthy + minHealthyTime time.Duration + + // useChecks specifies whether to use Consul healh checks or not + useChecks bool + // consulCheckCount is the number of checks the task group will attempt to // register consulCheckCount int @@ -146,16 +192,19 @@ type allocHealthTracker struct { // alloc listener and consul API object are given so that the watcher can detect // health changes. func newAllocHealthTracker(parentCtx context.Context, logger *log.Logger, alloc *structs.Allocation, - allocUpdates *cstructs.AllocListener, consulClient ConsulServiceAPI) *allocHealthTracker { + allocUpdates *cstructs.AllocListener, consulClient ConsulServiceAPI, + minHealthyTime time.Duration, useChecks bool) *allocHealthTracker { a := &allocHealthTracker{ - logger: logger, - healthy: make(chan bool, 1), - allocStopped: make(chan struct{}), - alloc: alloc, - tg: alloc.Job.LookupTaskGroup(alloc.TaskGroup), - allocUpdates: allocUpdates, - consulClient: consulClient, + logger: logger, + healthy: make(chan bool, 1), + allocStopped: make(chan struct{}), + alloc: alloc, + tg: alloc.Job.LookupTaskGroup(alloc.TaskGroup), + minHealthyTime: minHealthyTime, + useChecks: useChecks, + allocUpdates: allocUpdates, + consulClient: consulClient, } a.taskHealth = make(map[string]*taskHealthState, len(a.tg.Tasks)) @@ -176,7 +225,7 @@ func newAllocHealthTracker(parentCtx context.Context, logger *log.Logger, alloc // Start starts the watcher. 
func (a *allocHealthTracker) Start() { go a.watchTaskEvents() - if a.tg.Update.HealthCheck == structs.UpdateStrategyHealthCheck_Checks { + if a.useChecks { go a.watchConsulEvents() } } @@ -210,7 +259,8 @@ func (a *allocHealthTracker) TaskEvents() map[string]string { // Go through are task information and build the event map for task, state := range a.taskHealth { - if e, ok := state.event(deadline, a.tg.Update); ok { + useChecks := a.tg.Update.HealthCheck == structs.UpdateStrategyHealthCheck_Checks + if e, ok := state.event(deadline, a.tg.Update.MinHealthyTime, useChecks); ok { events[task] = e } } @@ -227,7 +277,7 @@ func (a *allocHealthTracker) setTaskHealth(healthy, terminal bool) { // If we are marked healthy but we also require Consul to be healthy and it // isn't yet, return, unless the task is terminal - requireConsul := a.tg.Update.HealthCheck == structs.UpdateStrategyHealthCheck_Checks && a.consulCheckCount > 0 + requireConsul := a.useChecks && a.consulCheckCount > 0 if !terminal && healthy && requireConsul && !a.checksHealthy { return } @@ -337,7 +387,7 @@ func (a *allocHealthTracker) watchTaskEvents() { // Set the timer since all tasks are started if !latestStartTime.IsZero() { allStartedTime = latestStartTime - healthyTimer.Reset(a.tg.Update.MinHealthyTime) + healthyTimer.Reset(a.minHealthyTime) } } @@ -453,7 +503,7 @@ OUTER: } primed = true - healthyTimer.Reset(a.tg.Update.MinHealthyTime) + healthyTimer.Reset(a.minHealthyTime) } } } @@ -470,7 +520,7 @@ type taskHealthState struct { // event takes the deadline time for the allocation to be healthy and the update // strategy of the group. It returns true if the task has contributed to the // allocation being unhealthy and if so, an event description of why. -func (t *taskHealthState) event(deadline time.Time, update *structs.UpdateStrategy) (string, bool) { +func (t *taskHealthState) event(deadline time.Time, minHealthyTime time.Duration, useChecks bool) (string, bool) { requireChecks := false desiredChecks := 0 for _, s := range t.task.Services { @@ -479,7 +529,7 @@ func (t *taskHealthState) event(deadline time.Time, update *structs.UpdateStrate desiredChecks += nc } } - requireChecks = requireChecks && update.HealthCheck == structs.UpdateStrategyHealthCheck_Checks + requireChecks = requireChecks && useChecks if t.state != nil { if t.state.Failed { @@ -490,8 +540,8 @@ func (t *taskHealthState) event(deadline time.Time, update *structs.UpdateStrate } // We are running so check if we have been running long enough - if t.state.StartedAt.Add(update.MinHealthyTime).After(deadline) { - return fmt.Sprintf("Task not running for min_healthy_time of %v by deadline", update.MinHealthyTime), true + if t.state.StartedAt.Add(minHealthyTime).After(deadline) { + return fmt.Sprintf("Task not running for min_healthy_time of %v by deadline", minHealthyTime), true } } diff --git a/client/alloc_runner_test.go b/client/alloc_runner_test.go index 0ade0ba39dba..b2927f86eb9f 100644 --- a/client/alloc_runner_test.go +++ b/client/alloc_runner_test.go @@ -168,7 +168,7 @@ func TestAllocRunner_DeploymentHealth_Unhealthy_Deadline(t *testing.T) { // Make the task block task := ar.alloc.Job.TaskGroups[0].Tasks[0] task.Driver = "mock_driver" - task.Config["start_block_for"] = "2s" + task.Config["start_block_for"] = "4s" task.Config["run_for"] = "10s" // Make the alloc be part of a deployment diff --git a/client/driver/mock_driver.go b/client/driver/mock_driver.go index 09a86f72deda..ffa6b09774ef 100644 --- a/client/driver/mock_driver.go +++ 
b/client/driver/mock_driver.go @@ -379,7 +379,7 @@ func (h *mockDriverHandle) Signal(s os.Signal) error { // Kill kills a mock task func (h *mockDriverHandle) Kill() error { - h.logger.Printf("[DEBUG] driver.mock: killing task %q after kill timeout: %v", h.taskName, h.killTimeout) + h.logger.Printf("[DEBUG] driver.mock: killing task %q after %s or kill timeout: %v", h.taskName, h.killAfter, h.killTimeout) select { case <-h.doneCh: case <-time.After(h.killAfter): diff --git a/client/driver/mock_driver_testing.go b/client/driver/mock_driver_testing.go index 1b1e861a8915..8a712205e4aa 100644 --- a/client/driver/mock_driver_testing.go +++ b/client/driver/mock_driver_testing.go @@ -1,4 +1,4 @@ -//+build nomad_test +// +build nomad_test package driver diff --git a/client/testing.go b/client/testing.go index a86728365abe..4043da298738 100644 --- a/client/testing.go +++ b/client/testing.go @@ -21,6 +21,10 @@ func TestClient(t testing.T, cb func(c *config.Config)) *Client { }, } + // Loosen GC threshold + conf.GCDiskUsageThreshold = 98.0 + conf.GCInodeUsageThreshold = 98.0 + // Tighten the fingerprinter timeouts if conf.Options == nil { conf.Options = make(map[string]string) diff --git a/command/agent/fs_endpoint_test.go b/command/agent/fs_endpoint_test.go index f59bbd953b0f..9be39497ce9d 100644 --- a/command/agent/fs_endpoint_test.go +++ b/command/agent/fs_endpoint_test.go @@ -437,11 +437,10 @@ func TestHTTP_FS_Logs_Follow(t *testing.T) { req, err := http.NewRequest("GET", path, p) require.Nil(err) respW := httptest.NewRecorder() - doneCh := make(chan struct{}) + errCh := make(chan error) go func() { - _, err = s.Server.Logs(respW, req) - require.Nil(err) - close(doneCh) + _, err := s.Server.Logs(respW, req) + errCh <- err }() out := "" @@ -458,8 +457,8 @@ func TestHTTP_FS_Logs_Follow(t *testing.T) { }) select { - case <-doneCh: - t.Fatal("shouldn't close") + case err := <-errCh: + t.Fatalf("shouldn't exit: %v", err) case <-time.After(1 * time.Second): } diff --git a/command/agent/job_endpoint.go b/command/agent/job_endpoint.go index 840fb1feeda9..ce1605728740 100644 --- a/command/agent/job_endpoint.go +++ b/command/agent/job_endpoint.go @@ -649,6 +649,15 @@ func ApiTgToStructsTG(taskGroup *api.TaskGroup, tg *structs.TaskGroup) { } } + if taskGroup.Migrate != nil { + tg.Migrate = &structs.MigrateStrategy{ + MaxParallel: *taskGroup.Migrate.MaxParallel, + HealthCheck: *taskGroup.Migrate.HealthCheck, + MinHealthyTime: *taskGroup.Migrate.MinHealthyTime, + HealthyDeadline: *taskGroup.Migrate.HealthyDeadline, + } + } + tg.EphemeralDisk = &structs.EphemeralDisk{ Sticky: *taskGroup.EphemeralDisk.Sticky, SizeMB: *taskGroup.EphemeralDisk.SizeMB, diff --git a/command/agent/job_endpoint_test.go b/command/agent/job_endpoint_test.go index f59acaaf2eef..3e730a96957d 100644 --- a/command/agent/job_endpoint_test.go +++ b/command/agent/job_endpoint_test.go @@ -942,8 +942,7 @@ func TestHTTP_JobDispatch(t *testing.T) { t.Parallel() httpTest(t, nil, func(s *TestAgent) { // Create the parameterized job - job := mock.Job() - job.Type = "batch" + job := mock.BatchJob() job.ParameterizedJob = &structs.ParameterizedJobConfig{} args := structs.JobRegisterRequest{ @@ -1179,6 +1178,12 @@ func TestJobs_ApiJobToStructsJob(t *testing.T) { Unlimited: helper.BoolToPtr(true), MaxDelay: helper.TimeToPtr(20 * time.Minute), }, + Migrate: &api.MigrateStrategy{ + MaxParallel: helper.IntToPtr(12), + HealthCheck: helper.StringToPtr("task_events"), + MinHealthyTime: helper.TimeToPtr(12 * time.Hour), + HealthyDeadline: helper.TimeToPtr(12 * 
time.Hour), + }, EphemeralDisk: &api.EphemeralDisk{ SizeMB: helper.IntToPtr(100), Sticky: helper.BoolToPtr(true), @@ -1395,6 +1400,12 @@ func TestJobs_ApiJobToStructsJob(t *testing.T) { Unlimited: true, MaxDelay: 20 * time.Minute, }, + Migrate: &structs.MigrateStrategy{ + MaxParallel: 12, + HealthCheck: "task_events", + MinHealthyTime: 12 * time.Hour, + HealthyDeadline: 12 * time.Hour, + }, EphemeralDisk: &structs.EphemeralDisk{ SizeMB: 100, Sticky: true, diff --git a/command/agent/node_endpoint.go b/command/agent/node_endpoint.go index fd396a67c40f..bad4fc445b4d 100644 --- a/command/agent/node_endpoint.go +++ b/command/agent/node_endpoint.go @@ -4,7 +4,9 @@ import ( "net/http" "strconv" "strings" + "time" + "github.com/hashicorp/nomad/api" "github.com/hashicorp/nomad/nomad/structs" ) @@ -42,6 +44,9 @@ func (s *HTTPServer) NodeSpecificRequest(resp http.ResponseWriter, req *http.Req case strings.HasSuffix(path, "/drain"): nodeName := strings.TrimSuffix(path, "/drain") return s.nodeToggleDrain(resp, req, nodeName) + case strings.HasSuffix(path, "/eligibility"): + nodeName := strings.TrimSuffix(path, "/eligibility") + return s.nodeToggleEligibility(resp, req, nodeName) case strings.HasSuffix(path, "/purge"): nodeName := strings.TrimSuffix(path, "/purge") return s.nodePurge(resp, req, nodeName) @@ -101,19 +106,42 @@ func (s *HTTPServer) nodeToggleDrain(resp http.ResponseWriter, req *http.Request return nil, CodedError(405, ErrInvalidMethod) } - // Get the enable value + var drainRequest api.NodeUpdateDrainRequest + + // COMPAT: Remove in 0.9. Allow the old style enable query param. + // Get the enable parameter enableRaw := req.URL.Query().Get("enable") - if enableRaw == "" { - return nil, CodedError(400, "missing enable value") - } - enable, err := strconv.ParseBool(enableRaw) - if err != nil { - return nil, CodedError(400, "invalid enable value") + var enable bool + if enableRaw != "" { + var err error + enable, err = strconv.ParseBool(enableRaw) + if err != nil { + return nil, CodedError(400, "invalid enable value") + } + + // Use the force drain to have it keep the same behavior as old clients. 
+ if enable { + drainRequest.DrainSpec = &api.DrainSpec{ + Deadline: -1 * time.Second, + } + } + } else { + if err := decodeBody(req, &drainRequest); err != nil { + return nil, CodedError(400, err.Error()) + } } args := structs.NodeUpdateDrainRequest{ - NodeID: nodeID, - Drain: enable, + NodeID: nodeID, + MarkEligible: drainRequest.MarkEligible, + } + if drainRequest.DrainSpec != nil { + args.DrainStrategy = &structs.DrainStrategy{ + DrainSpec: structs.DrainSpec{ + Deadline: drainRequest.DrainSpec.Deadline, + IgnoreSystemJobs: drainRequest.DrainSpec.IgnoreSystemJobs, + }, + } } s.parseWriteRequest(req, &args.WriteRequest) @@ -125,6 +153,26 @@ func (s *HTTPServer) nodeToggleDrain(resp http.ResponseWriter, req *http.Request return out, nil } +func (s *HTTPServer) nodeToggleEligibility(resp http.ResponseWriter, req *http.Request, + nodeID string) (interface{}, error) { + if req.Method != "PUT" && req.Method != "POST" { + return nil, CodedError(405, ErrInvalidMethod) + } + + var drainRequest structs.NodeUpdateEligibilityRequest + if err := decodeBody(req, &drainRequest); err != nil { + return nil, CodedError(400, err.Error()) + } + s.parseWriteRequest(req, &drainRequest.WriteRequest) + + var out structs.GenericResponse + if err := s.agent.RPC("Node.UpdateEligibility", &drainRequest, &out); err != nil { + return nil, err + } + setIndex(resp, out.Index) + return nil, nil +} + func (s *HTTPServer) nodeQuery(resp http.ResponseWriter, req *http.Request, nodeID string) (interface{}, error) { if req.Method != "GET" { diff --git a/command/agent/node_endpoint_test.go b/command/agent/node_endpoint_test.go index a5566adc19fb..6b3d96c44ed4 100644 --- a/command/agent/node_endpoint_test.go +++ b/command/agent/node_endpoint_test.go @@ -4,10 +4,13 @@ import ( "net/http" "net/http/httptest" "testing" + "time" + "github.com/hashicorp/nomad/api" "github.com/hashicorp/nomad/nomad/mock" "github.com/hashicorp/nomad/nomad/structs" "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" ) func TestHTTP_NodesList(t *testing.T) { @@ -238,6 +241,7 @@ func TestHTTP_NodeAllocations(t *testing.T) { func TestHTTP_NodeDrain(t *testing.T) { t.Parallel() + require := require.New(t) httpTest(t, nil, func(s *TestAgent) { // Create the node node := mock.Node() @@ -246,45 +250,106 @@ func TestHTTP_NodeDrain(t *testing.T) { WriteRequest: structs.WriteRequest{Region: "global"}, } var resp structs.NodeUpdateResponse - if err := s.Agent.RPC("Node.Register", &args, &resp); err != nil { - t.Fatalf("err: %v", err) - } + require.Nil(s.Agent.RPC("Node.Register", &args, &resp)) - // Directly manipulate the state - state := s.Agent.server.State() - alloc1 := mock.Alloc() - alloc1.NodeID = node.ID - if err := state.UpsertJobSummary(999, mock.JobSummary(alloc1.JobID)); err != nil { - t.Fatal(err) - } - err := state.UpsertAllocs(1000, []*structs.Allocation{alloc1}) - if err != nil { - t.Fatalf("err: %v", err) + drainReq := api.NodeUpdateDrainRequest{ + NodeID: node.ID, + DrainSpec: &api.DrainSpec{ + Deadline: 10 * time.Second, + }, } // Make the HTTP request - req, err := http.NewRequest("POST", "/v1/node/"+node.ID+"/drain?enable=1", nil) - if err != nil { - t.Fatalf("err: %v", err) - } + buf := encodeReq(drainReq) + req, err := http.NewRequest("POST", "/v1/node/"+node.ID+"/drain", buf) + require.Nil(err) respW := httptest.NewRecorder() // Make the request obj, err := s.Server.NodeSpecificRequest(respW, req) - if err != nil { - t.Fatalf("err: %v", err) - } + require.Nil(err) // Check for the index - if 
respW.HeaderMap.Get("X-Nomad-Index") == "" { - t.Fatalf("missing index") - } + require.NotZero(respW.HeaderMap.Get("X-Nomad-Index")) // Check the response - upd := obj.(structs.NodeDrainUpdateResponse) - if len(upd.EvalIDs) == 0 { - t.Fatalf("bad: %v", upd) + _, ok := obj.(structs.NodeDrainUpdateResponse) + require.True(ok) + + // Check that the node has been updated + state := s.Agent.server.State() + out, err := state.NodeByID(nil, node.ID) + require.Nil(err) + require.True(out.Drain) + require.NotNil(out.DrainStrategy) + require.Equal(10*time.Second, out.DrainStrategy.Deadline) + + // Make the HTTP request to unset drain + drainReq.DrainSpec = nil + buf = encodeReq(drainReq) + req, err = http.NewRequest("POST", "/v1/node/"+node.ID+"/drain", buf) + require.Nil(err) + respW = httptest.NewRecorder() + + // Make the request + _, err = s.Server.NodeSpecificRequest(respW, req) + require.Nil(err) + + out, err = state.NodeByID(nil, node.ID) + require.Nil(err) + require.False(out.Drain) + require.Nil(out.DrainStrategy) + }) +} + +func TestHTTP_NodeEligible(t *testing.T) { + t.Parallel() + require := require.New(t) + httpTest(t, nil, func(s *TestAgent) { + // Create the node + node := mock.Node() + args := structs.NodeRegisterRequest{ + Node: node, + WriteRequest: structs.WriteRequest{Region: "global"}, } + var resp structs.NodeUpdateResponse + require.Nil(s.Agent.RPC("Node.Register", &args, &resp)) + + drainReq := api.NodeUpdateEligibilityRequest{ + NodeID: node.ID, + Eligibility: structs.NodeSchedulingIneligible, + } + + // Make the HTTP request + buf := encodeReq(drainReq) + req, err := http.NewRequest("POST", "/v1/node/"+node.ID+"/eligibility", buf) + require.Nil(err) + respW := httptest.NewRecorder() + + // Make the request + _, err = s.Server.NodeSpecificRequest(respW, req) + require.Nil(err) + + // Check for the index + require.NotZero(respW.HeaderMap.Get("X-Nomad-Index")) + + // Check that the node has been updated + state := s.Agent.server.State() + out, err := state.NodeByID(nil, node.ID) + require.Nil(err) + require.Equal(structs.NodeSchedulingIneligible, out.SchedulingEligibility) + + // Make the HTTP request to set something invalid + drainReq.Eligibility = "foo" + buf = encodeReq(drainReq) + req, err = http.NewRequest("POST", "/v1/node/"+node.ID+"/eligibility", buf) + require.Nil(err) + respW = httptest.NewRecorder() + + // Make the request + _, err = s.Server.NodeSpecificRequest(respW, req) + require.NotNil(err) + require.Contains(err.Error(), "invalid") }) } diff --git a/command/node.go b/command/node.go new file mode 100644 index 000000000000..36436d9b7868 --- /dev/null +++ b/command/node.go @@ -0,0 +1,19 @@ +package command + +import "github.com/mitchellh/cli" + +type NodeCommand struct { + Meta +} + +func (f *NodeCommand) Help() string { + return "This command is accessed by using one of the subcommands below." 
+} + +func (f *NodeCommand) Synopsis() string { + return "Interact with nodes" +} + +func (f *NodeCommand) Run(args []string) int { + return cli.RunResultHelp +} diff --git a/command/node_drain.go b/command/node_drain.go index b40757b7c90b..9f170c76e082 100644 --- a/command/node_drain.go +++ b/command/node_drain.go @@ -3,18 +3,27 @@ package command import ( "fmt" "strings" + "time" + "github.com/hashicorp/nomad/api" "github.com/hashicorp/nomad/api/contexts" + "github.com/hashicorp/nomad/nomad/structs" "github.com/posener/complete" ) +var ( + // defaultDrainDuration is the default drain duration if it is not specified + // explicitly + defaultDrainDuration = 1 * time.Hour +) + type NodeDrainCommand struct { Meta } func (c *NodeDrainCommand) Help() string { helpText := ` -Usage: nomad node-drain [options] +Usage: nomad node drain [options] Toggles node draining on a specified node. It is required that either -enable or -disable is specified, but not both. @@ -32,8 +41,32 @@ Node Drain Options: -enable Enable draining for the specified node. + -deadline + Set the deadline by which all allocations must be moved off the node. + Remaining allocations after the deadline are forced removed from the node. + If unspecified, a default deadline of one hour is applied. + + -detach + Return immediately instead of entering monitor mode. + + -force + Force remove allocations off the node immediately. + + -no-deadline + No deadline allows the allocations to drain off the node without being force + stopped after a certain deadline. + + -ignore-system + Ignore system allows the drain to complete without stopping system job + allocations. By default system jobs are stopped last. + + -keep-ineligible + Keep ineligible will maintain the node's scheduling ineligibility even if + the drain is being disabled. This is useful when an existing drain is being + cancelled but additional scheduling on the node is not desired. + -self - Query the status of the local node. + Set the drain status of the local node. -yes Automatic yes to prompts. 
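To illustrate how the flags above are intended to combine, a few hypothetical invocations (not part of this diff; <node-id> is a placeholder):

  # Drain with an explicit deadline and monitor until the drain completes
  nomad node drain -enable -deadline 30m <node-id>

  # Force an immediate drain and return without monitoring
  nomad node drain -enable -force -detach <node-id>

  # Cancel a drain but leave the node ineligible for new placements
  nomad node drain -disable -keep-ineligible <node-id>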
@@ -48,10 +81,16 @@ func (c *NodeDrainCommand) Synopsis() string { func (c *NodeDrainCommand) AutocompleteFlags() complete.Flags { return mergeAutocompleteFlags(c.Meta.AutocompleteFlags(FlagSetClient), complete.Flags{ - "-disable": complete.PredictNothing, - "-enable": complete.PredictNothing, - "-self": complete.PredictNothing, - "-yes": complete.PredictNothing, + "-disable": complete.PredictNothing, + "-enable": complete.PredictNothing, + "-deadline": complete.PredictAnything, + "-detach": complete.PredictNothing, + "-force": complete.PredictNothing, + "-no-deadline": complete.PredictNothing, + "-ignore-system": complete.PredictNothing, + "-keep-ineligible": complete.PredictNothing, + "-self": complete.PredictNothing, + "-yes": complete.PredictNothing, }) } @@ -71,12 +110,20 @@ func (c *NodeDrainCommand) AutocompleteArgs() complete.Predictor { } func (c *NodeDrainCommand) Run(args []string) int { - var enable, disable, self, autoYes bool + var enable, disable, detach, force, + noDeadline, ignoreSystem, keepIneligible, self, autoYes bool + var deadline string flags := c.Meta.FlagSet("node-drain", FlagSetClient) flags.Usage = func() { c.Ui.Output(c.Help()) } flags.BoolVar(&enable, "enable", false, "Enable drain mode") flags.BoolVar(&disable, "disable", false, "Disable drain mode") + flags.StringVar(&deadline, "deadline", "", "Deadline after which allocations are force stopped") + flags.BoolVar(&detach, "detach", false, "") + flags.BoolVar(&force, "force", false, "Force immediate drain") + flags.BoolVar(&noDeadline, "no-deadline", false, "Drain node with no deadline") + flags.BoolVar(&ignoreSystem, "ignore-system", false, "Do not drain system job allocations from the node") + flags.BoolVar(&keepIneligible, "keep-ineligible", false, "Do not update the nodes scheduling eligibility") flags.BoolVar(&self, "self", false, "") flags.BoolVar(&autoYes, "yes", false, "Automatic yes to prompts.") @@ -93,10 +140,46 @@ func (c *NodeDrainCommand) Run(args []string) int { // Check that we got a node ID args = flags.Args() if l := len(args); self && l != 0 || !self && l != 1 { - c.Ui.Error(c.Help()) + c.Ui.Error("Node ID must be specified if -self isn't being used") return 1 } + // Validate a compatible set of flags were set + if disable && (deadline != "" || force || noDeadline || ignoreSystem) { + c.Ui.Error("-disable can't be combined with flags configuring drain strategy") + return 1 + } + if deadline != "" && (force || noDeadline) { + c.Ui.Error("-deadline can't be combined with -force or -no-deadline") + return 1 + } + if force && noDeadline { + c.Ui.Error("-force and -no-deadline are mutually exclusive") + return 1 + } + + // Parse the duration + var d time.Duration + if force { + d = -1 * time.Second + } else if noDeadline { + d = 0 + } else if deadline != "" { + dur, err := time.ParseDuration(deadline) + if err != nil { + c.Ui.Error(fmt.Sprintf("Failed to parse deadline %q: %v", deadline, err)) + return 1 + } + if dur <= 0 { + c.Ui.Error("A positive drain duration must be given") + return 1 + } + + d = dur + } else { + d = defaultDrainDuration + } + // Get the HTTP client client, err := c.Meta.Client() if err != nil { @@ -134,21 +217,8 @@ func (c *NodeDrainCommand) Run(args []string) int { return 1 } if len(nodes) > 1 { - // Format the nodes list that matches the prefix so that the user - // can create a more specific request - out := make([]string, len(nodes)+1) - out[0] = "ID|Datacenter|Name|Class|Drain|Status" - for i, node := range nodes { - out[i+1] = fmt.Sprintf("%s|%s|%s|%s|%v|%s", - 
node.ID, - node.Datacenter, - node.Name, - node.NodeClass, - node.Drain, - node.Status) - } - // Dump the output - c.Ui.Error(fmt.Sprintf("Prefix matched multiple nodes\n\n%s", formatList(out))) + c.Ui.Error(fmt.Sprintf("Prefix matched multiple nodes\n\n%s", + formatNodeStubList(nodes, true))) return 1 } @@ -186,10 +256,175 @@ func (c *NodeDrainCommand) Run(args []string) int { } } + var spec *api.DrainSpec + if enable { + spec = &api.DrainSpec{ + Deadline: d, + IgnoreSystemJobs: ignoreSystem, + } + } + // Toggle node draining - if _, err := client.Nodes().ToggleDrain(node.ID, enable, nil); err != nil { - c.Ui.Error(fmt.Sprintf("Error toggling drain mode: %s", err)) + meta, err := client.Nodes().UpdateDrain(node.ID, spec, !keepIneligible, nil) + if err != nil { + c.Ui.Error(fmt.Sprintf("Error updating drain specification: %s", err)) return 1 } + + c.Ui.Output(fmt.Sprintf("Node %q drain strategy set", node.ID)) + + if enable && !detach { + if err := monitorDrain(c.Ui.Output, client.Nodes(), node.ID, meta.LastIndex); err != nil { + c.Ui.Error(fmt.Sprintf("Error monitoring drain: %v", err)) + return 1 + } + + c.Ui.Output(fmt.Sprintf("Node %q drain complete", nodeID)) + } + return 0 } + +// monitorDrain monitors the node being drained and exits when the node has +// finished draining. +func monitorDrain(output func(string), nodeClient *api.Nodes, nodeID string, index uint64) error { + doneCh := make(chan struct{}) + defer close(doneCh) + + // Errors from either goroutine are sent here + errCh := make(chan error, 1) + + // Monitor node changes and close chan when drain is complete + nodeCh := make(chan struct{}) + go func() { + for { + q := api.QueryOptions{ + AllowStale: true, + WaitIndex: index, + } + node, meta, err := nodeClient.Info(nodeID, &q) + if err != nil { + select { + case errCh <- err: + case <-doneCh: + } + return + } + + if node.DrainStrategy == nil { + close(nodeCh) + return + } + + // Drain still ongoing + index = meta.LastIndex + } + }() + + // Monitor alloc changes + allocCh := make(chan string, 1) + go func() { + allocs, meta, err := nodeClient.Allocations(nodeID, nil) + if err != nil { + select { + case errCh <- err: + case <-doneCh: + } + return + } + + initial := make(map[string]*api.Allocation, len(allocs)) + for _, a := range allocs { + initial[a.ID] = a + } + + for { + q := api.QueryOptions{ + AllowStale: true, + WaitIndex: meta.LastIndex, + } + + allocs, meta, err = nodeClient.Allocations(nodeID, &q) + if err != nil { + select { + case errCh <- err: + case <-doneCh: + } + return + } + + for _, a := range allocs { + // Get previous version of alloc + orig, ok := initial[a.ID] + + // Update local alloc state + initial[a.ID] = a + + migrating := a.DesiredTransition.ShouldMigrate() + + msg := "" + switch { + case !ok: + // Should only be possible if response + // from initial Allocations call was + // stale. 
No need to output + + case orig.ClientStatus != a.ClientStatus: + // Alloc status has changed; output + msg = fmt.Sprintf("status %s -> %s", orig.ClientStatus, a.ClientStatus) + + case migrating && !orig.DesiredTransition.ShouldMigrate(): + // Alloc was marked for migration + msg = "marked for migration" + case migrating && (orig.DesiredStatus != a.DesiredStatus) && a.DesiredStatus == structs.AllocDesiredStatusStop: + // Alloc has already been marked for migration and is now being stopped + msg = "draining" + case a.NextAllocation != "" && orig.NextAllocation == "": + // Alloc has been replaced by another allocation + msg = fmt.Sprintf("replaced by allocation %q", a.NextAllocation) + } + + if msg != "" { + select { + case allocCh <- fmt.Sprintf("Alloc %q %s", a.ID, msg): + case <-doneCh: + return + } + } + } + } + }() + + done := false + for !done { + select { + case err := <-errCh: + return err + case <-nodeCh: + done = true + case msg := <-allocCh: + output(msg) + } + } + + // Loop on alloc messages for a bit longer as we may have gotten the + // "node done" first (since the watchers run concurrently the events + // may be received out of order) + deadline := 250 * time.Millisecond + timer := time.NewTimer(deadline) + for { + select { + case err := <-errCh: + return err + case msg := <-allocCh: + output(msg) + if !timer.Stop() { + <-timer.C + } + timer.Reset(deadline) + case <-timer.C: + // No events within deadline, exit + return nil + } + } +} diff --git a/command/node_drain_test.go b/command/node_drain_test.go index 241845ab4878..01c8b12532bd 100644 --- a/command/node_drain_test.go +++ b/command/node_drain_test.go @@ -4,11 +4,16 @@ import ( "fmt" "strings" "testing" + "time" + "github.com/hashicorp/nomad/api" + "github.com/hashicorp/nomad/command/agent" + "github.com/hashicorp/nomad/helper" "github.com/hashicorp/nomad/testutil" "github.com/mitchellh/cli" "github.com/posener/complete" "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" ) func TestNodeDrainCommand_Implements(t *testing.T) { @@ -16,6 +21,173 @@ func TestNodeDrainCommand_Implements(t *testing.T) { var _ cli.Command = &NodeDrainCommand{} } +func TestNodeDrainCommand_Detach(t *testing.T) { + t.Parallel() + require := require.New(t) + server, client, url := testServer(t, true, func(c *agent.Config) { + c.NodeName = "drain_detach_node" + }) + defer server.Shutdown() + + // Wait for a node to appear + var nodeID string + testutil.WaitForResult(func() (bool, error) { + nodes, _, err := client.Nodes().List(nil) + if err != nil { + return false, err + } + if len(nodes) == 0 { + return false, fmt.Errorf("missing node") + } + nodeID = nodes[0].ID + return true, nil + }, func(err error) { + t.Fatalf("err: %s", err) + }) + + // Register a job to create an alloc to drain that will block draining + job := &api.Job{ + ID: helper.StringToPtr("mock_service"), + Name: helper.StringToPtr("mock_service"), + Datacenters: []string{"dc1"}, + TaskGroups: []*api.TaskGroup{ + { + Name: helper.StringToPtr("mock_group"), + Tasks: []*api.Task{ + { + Name: "mock_task", + Driver: "mock_driver", + Config: map[string]interface{}{ + "run_for": "10m", + "exit_after": "10m", + }, + }, + }, + }, + }, + } + + _, _, err := client.Jobs().Register(job, nil) + require.Nil(err) + + testutil.WaitForResult(func() (bool, error) { + allocs, _, err := client.Nodes().Allocations(nodeID, nil) + if err != nil { + return false, err + } + return len(allocs) > 0, fmt.Errorf("no allocs") + }, func(err error) { + t.Fatalf("err: %v", err) + }) + + ui := 
new(cli.MockUi) + cmd := &NodeDrainCommand{Meta: Meta{Ui: ui}} + if code := cmd.Run([]string{"-address=" + url, "-self", "-enable", "-detach"}); code != 0 { + t.Fatalf("expected exit 0, got: %d", code) + } + + out := ui.OutputWriter.String() + expected := "drain strategy set" + require.Contains(out, expected) + + node, _, err := client.Nodes().Info(nodeID, nil) + require.Nil(err) + require.NotNil(node.DrainStrategy) +} + +func TestNodeDrainCommand_Monitor(t *testing.T) { + t.Parallel() + require := require.New(t) + server, client, url := testServer(t, true, func(c *agent.Config) { + c.NodeName = "drain_monitor_node" + }) + defer server.Shutdown() + + // Wait for a node to appear + var nodeID string + testutil.WaitForResult(func() (bool, error) { + nodes, _, err := client.Nodes().List(nil) + if err != nil { + return false, err + } + if len(nodes) == 0 { + return false, fmt.Errorf("missing node") + } + nodeID = nodes[0].ID + return true, nil + }, func(err error) { + t.Fatalf("err: %s", err) + }) + + // Register a job to create an alloc to drain + count := 3 + job := &api.Job{ + ID: helper.StringToPtr("mock_service"), + Name: helper.StringToPtr("mock_service"), + Datacenters: []string{"dc1"}, + TaskGroups: []*api.TaskGroup{ + { + Name: helper.StringToPtr("mock_group"), + Count: &count, + Migrate: &api.MigrateStrategy{ + MaxParallel: helper.IntToPtr(1), + HealthCheck: helper.StringToPtr("task_states"), + MinHealthyTime: helper.TimeToPtr(10 * time.Millisecond), + HealthyDeadline: helper.TimeToPtr(5 * time.Minute), + }, + Tasks: []*api.Task{ + { + Name: "mock_task", + Driver: "mock_driver", + Config: map[string]interface{}{ + "run_for": "10m", + }, + }, + }, + }, + }, + } + + _, _, err := client.Jobs().Register(job, nil) + require.Nil(err) + + var allocs []*api.Allocation + testutil.WaitForResult(func() (bool, error) { + allocs, _, err = client.Nodes().Allocations(nodeID, nil) + if err != nil { + return false, err + } + if len(allocs) != count { + return false, fmt.Errorf("number of allocs %d != count (%d)", len(allocs), count) + } + for _, a := range allocs { + if a.ClientStatus != "running" { + return false, fmt.Errorf("alloc %q still not running: %s", a.ID, a.ClientStatus) + } + } + return true, nil + }, func(err error) { + t.Fatalf("err: %v", err) + }) + + ui := new(cli.MockUi) + cmd := &NodeDrainCommand{Meta: Meta{Ui: ui}} + args := []string{"-address=" + url, "-self", "-enable", "-deadline", "1s"} + t.Logf("Running: %v", args) + if code := cmd.Run(args); code != 0 { + t.Fatalf("expected exit 0, got: %d", code) + } + + out := ui.OutputWriter.String() + t.Logf("Output:\n%s", out) + + require.Contains(out, "drain complete") + for _, a := range allocs { + require.Contains(out, fmt.Sprintf("Alloc %q marked for migration", a.ID)) + require.Contains(out, fmt.Sprintf("Alloc %q draining", a.ID)) + } +} + func TestNodeDrainCommand_Fails(t *testing.T) { t.Parallel() srv, _, url := testServer(t, false, nil) @@ -85,6 +257,49 @@ func TestNodeDrainCommand_Fails(t *testing.T) { if out := ui.ErrorWriter.String(); !strings.Contains(out, "No node(s) with prefix or id") { t.Fatalf("expected not exist error, got: %s", out) } + ui.ErrorWriter.Reset() + + // Fail on disable being used with drain strategy flags + for _, flag := range []string{"-force", "-no-deadline", "-ignore-system"} { + if code := cmd.Run([]string{"-address=" + url, "-disable", flag, "12345678-abcd-efab-cdef-123456789abc"}); code != 1 { + t.Fatalf("expected exit 1, got: %d", code) + } + if out := ui.ErrorWriter.String(); !strings.Contains(out, 
"combined with flags configuring drain strategy") { + t.Fatalf("got: %s", out) + } + ui.ErrorWriter.Reset() + } + + // Fail on setting a deadline plus deadline modifying flags + for _, flag := range []string{"-force", "-no-deadline"} { + if code := cmd.Run([]string{"-address=" + url, "-enable", "-deadline=10s", flag, "12345678-abcd-efab-cdef-123456789abc"}); code != 1 { + t.Fatalf("expected exit 1, got: %d", code) + } + if out := ui.ErrorWriter.String(); !strings.Contains(out, "deadline can't be combined with") { + t.Fatalf("got: %s", out) + } + ui.ErrorWriter.Reset() + } + + // Fail on setting a force and no deadline + if code := cmd.Run([]string{"-address=" + url, "-enable", "-force", "-no-deadline", "12345678-abcd-efab-cdef-123456789abc"}); code != 1 { + t.Fatalf("expected exit 1, got: %d", code) + } + if out := ui.ErrorWriter.String(); !strings.Contains(out, "mutually exclusive") { + t.Fatalf("got: %s", out) + } + ui.ErrorWriter.Reset() + + // Fail on setting a bad deadline + for _, flag := range []string{"-deadline=0s", "-deadline=-1s"} { + if code := cmd.Run([]string{"-address=" + url, "-enable", flag, "12345678-abcd-efab-cdef-123456789abc"}); code != 1 { + t.Fatalf("expected exit 1, got: %d", code) + } + if out := ui.ErrorWriter.String(); !strings.Contains(out, "positive") { + t.Fatalf("got: %s", out) + } + ui.ErrorWriter.Reset() + } } func TestNodeDrainCommand_AutocompleteArgs(t *testing.T) { diff --git a/command/node_eligibility.go b/command/node_eligibility.go new file mode 100644 index 000000000000..a3fe5f802cfd --- /dev/null +++ b/command/node_eligibility.go @@ -0,0 +1,155 @@ +package command + +import ( + "fmt" + "strings" + + "github.com/hashicorp/nomad/api/contexts" + "github.com/posener/complete" +) + +type NodeEligibilityCommand struct { + Meta +} + +func (c *NodeEligibilityCommand) Help() string { + helpText := ` +Usage: nomad node eligibility [options] + + Toggles the nodes scheduling eligibility. When a node is marked as ineligible, + no new allocations will be placed on it but existing allocations will remain. + To remove existing allocations, use the node drain command. + + It is required that either -enable or -disable is specified, but not both. + The -self flag is useful to set the scheduling eligibility of the local node. + +General Options: + + ` + generalOptionsUsage() + ` + +Node Eligibility Options: + + -disable + Mark the specified node as ineligible for new allocations. + + -enable + Mark the specified node as eligible for new allocations. + + -self + Set the eligibility of the local node. 
+` + return strings.TrimSpace(helpText) +} + +func (c *NodeEligibilityCommand) Synopsis() string { + return "Toggle scheduling eligibility for a given node" +} + +func (c *NodeEligibilityCommand) AutocompleteFlags() complete.Flags { + return mergeAutocompleteFlags(c.Meta.AutocompleteFlags(FlagSetClient), + complete.Flags{ + "-disable": complete.PredictNothing, + "-enable": complete.PredictNothing, + "-self": complete.PredictNothing, + }) +} + +func (c *NodeEligibilityCommand) AutocompleteArgs() complete.Predictor { + return complete.PredictFunc(func(a complete.Args) []string { + client, err := c.Meta.Client() + if err != nil { + return nil + } + + resp, _, err := client.Search().PrefixSearch(a.Last, contexts.Nodes, nil) + if err != nil { + return []string{} + } + return resp.Matches[contexts.Nodes] + }) +} + +func (c *NodeEligibilityCommand) Run(args []string) int { + var enable, disable, self bool + + flags := c.Meta.FlagSet("node-eligibility", FlagSetClient) + flags.Usage = func() { c.Ui.Output(c.Help()) } + flags.BoolVar(&enable, "enable", false, "Mark node as eligibile for scheduling") + flags.BoolVar(&disable, "disable", false, "Mark node as ineligibile for scheduling") + flags.BoolVar(&self, "self", false, "") + + if err := flags.Parse(args); err != nil { + return 1 + } + + // Check that we got either enable or disable, but not both. + if (enable && disable) || (!enable && !disable) { + c.Ui.Error(c.Help()) + return 1 + } + + // Check that we got a node ID + args = flags.Args() + if l := len(args); self && l != 0 || !self && l != 1 { + c.Ui.Error("Node ID must be specified if -self isn't being used") + return 1 + } + + // Get the HTTP client + client, err := c.Meta.Client() + if err != nil { + c.Ui.Error(fmt.Sprintf("Error initializing client: %s", err)) + return 1 + } + + // If -self flag is set then determine the current node. 
+ var nodeID string + if !self { + nodeID = args[0] + } else { + var err error + if nodeID, err = getLocalNodeID(client); err != nil { + c.Ui.Error(err.Error()) + return 1 + } + } + + // Check if node exists + if len(nodeID) == 1 { + c.Ui.Error(fmt.Sprintf("Identifier must contain at least two characters.")) + return 1 + } + + nodeID = sanitizeUUIDPrefix(nodeID) + nodes, _, err := client.Nodes().PrefixList(nodeID) + if err != nil { + c.Ui.Error(fmt.Sprintf("Error updating scheduling eligibility: %s", err)) + return 1 + } + // Return error if no nodes are found + if len(nodes) == 0 { + c.Ui.Error(fmt.Sprintf("No node(s) with prefix or id %q found", nodeID)) + return 1 + } + if len(nodes) > 1 { + c.Ui.Error(fmt.Sprintf("Prefix matched multiple nodes\n\n%s", + formatNodeStubList(nodes, true))) + return 1 + } + + // Prefix lookup matched a single node + node, _, err := client.Nodes().Info(nodes[0].ID, nil) + if err != nil { + c.Ui.Error(fmt.Sprintf("Error updating scheduling eligibility: %s", err)) + return 1 + } + + // Toggle node eligibility + if _, err := client.Nodes().ToggleEligibility(node.ID, enable, nil); err != nil { + c.Ui.Error(fmt.Sprintf("Error updating scheduling eligibility: %s", err)) + return 1 + } + + c.Ui.Output(fmt.Sprintf("Node %q scheduling eligibility set", node.ID)) + return 0 +} diff --git a/command/node_eligibility_test.go b/command/node_eligibility_test.go new file mode 100644 index 000000000000..6fbb3c91d8ff --- /dev/null +++ b/command/node_eligibility_test.go @@ -0,0 +1,125 @@ +package command + +import ( + "fmt" + "strings" + "testing" + + "github.com/hashicorp/nomad/testutil" + "github.com/mitchellh/cli" + "github.com/posener/complete" + "github.com/stretchr/testify/assert" +) + +func TestNodeEligibilityCommand_Implements(t *testing.T) { + t.Parallel() + var _ cli.Command = &NodeEligibilityCommand{} +} + +func TestNodeEligibilityCommand_Fails(t *testing.T) { + t.Parallel() + srv, _, url := testServer(t, false, nil) + defer srv.Shutdown() + + ui := new(cli.MockUi) + cmd := &NodeEligibilityCommand{Meta: Meta{Ui: ui}} + + // Fails on misuse + if code := cmd.Run([]string{"some", "bad", "args"}); code != 1 { + t.Fatalf("expected exit code 1, got: %d", code) + } + if out := ui.ErrorWriter.String(); !strings.Contains(out, cmd.Help()) { + t.Fatalf("expected help output, got: %s", out) + } + ui.ErrorWriter.Reset() + + // Fails on connection failure + if code := cmd.Run([]string{"-address=nope", "-enable", "12345678-abcd-efab-cdef-123456789abc"}); code != 1 { + t.Fatalf("expected exit code 1, got: %d", code) + } + expected := "Error updating scheduling eligibility" + if out := ui.ErrorWriter.String(); !strings.Contains(out, expected) { + t.Fatalf("expected %q, got: %s", expected, out) + } + ui.ErrorWriter.Reset() + + // Fails on non-existent node + if code := cmd.Run([]string{"-address=" + url, "-enable", "12345678-abcd-efab-cdef-123456789abc"}); code != 1 { + t.Fatalf("expected exit 1, got: %d", code) + } + if out := ui.ErrorWriter.String(); !strings.Contains(out, "No node(s) with prefix or id") { + t.Fatalf("expected not exist error, got: %s", out) + } + ui.ErrorWriter.Reset() + + // Fails if both enable and disable specified + if code := cmd.Run([]string{"-enable", "-disable", "12345678-abcd-efab-cdef-123456789abc"}); code != 1 { + t.Fatalf("expected exit 1, got: %d", code) + } + if out := ui.ErrorWriter.String(); !strings.Contains(out, cmd.Help()) { + t.Fatalf("expected help output, got: %s", out) + } + ui.ErrorWriter.Reset() + + // Fails if neither enable or disable 
specified + if code := cmd.Run([]string{"12345678-abcd-efab-cdef-123456789abc"}); code != 1 { + t.Fatalf("expected exit 1, got: %d", code) + } + if out := ui.ErrorWriter.String(); !strings.Contains(out, cmd.Help()) { + t.Fatalf("expected help output, got: %s", out) + } + ui.ErrorWriter.Reset() + + // Fail on identifier with too few characters + if code := cmd.Run([]string{"-address=" + url, "-enable", "1"}); code != 1 { + t.Fatalf("expected exit 1, got: %d", code) + } + if out := ui.ErrorWriter.String(); !strings.Contains(out, "must contain at least two characters.") { + t.Fatalf("expected too few characters error, got: %s", out) + } + ui.ErrorWriter.Reset() + + // Identifiers with uneven length should produce a query result + if code := cmd.Run([]string{"-address=" + url, "-enable", "123"}); code != 1 { + t.Fatalf("expected exit 1, got: %d", code) + } + if out := ui.ErrorWriter.String(); !strings.Contains(out, "No node(s) with prefix or id") { + t.Fatalf("expected not exist error, got: %s", out) + } + ui.ErrorWriter.Reset() +} + +func TestNodeEligibilityCommand_AutocompleteArgs(t *testing.T) { + assert := assert.New(t) + t.Parallel() + + srv, client, url := testServer(t, true, nil) + defer srv.Shutdown() + + // Wait for a node to appear + var nodeID string + testutil.WaitForResult(func() (bool, error) { + nodes, _, err := client.Nodes().List(nil) + if err != nil { + return false, err + } + if len(nodes) == 0 { + return false, fmt.Errorf("missing node") + } + nodeID = nodes[0].ID + return true, nil + }, func(err error) { + t.Fatalf("err: %s", err) + }) + + ui := new(cli.MockUi) + cmd := &NodeEligibilityCommand{Meta: Meta{Ui: ui, flagAddress: url}} + + prefix := nodeID[:len(nodeID)-5] + args := complete.Args{Last: prefix} + predictor := cmd.AutocompleteArgs() + + res := predictor.Predict(args) + assert.Equal(1, len(res)) + assert.Equal(nodeID, res[0]) +} diff --git a/command/node_status.go b/command/node_status.go index cbce475346a9..68c72342b11f 100644 --- a/command/node_status.go +++ b/command/node_status.go @@ -37,7 +37,7 @@ type NodeStatusCommand struct { func (c *NodeStatusCommand) Help() string { helpText := ` -Usage: nomad node-status [options] +Usage: nomad node status [options] Display status information about a given node. 
The list of nodes returned includes only nodes which jobs may be scheduled to, and @@ -183,7 +183,7 @@ func (c *NodeStatusCommand) Run(args []string) int { out[0] += "Address|Version|" } - out[0] += "Drain|Status" + out[0] += "Drain|Eligibility|Status" if c.list_allocs { out[0] += "|Running Allocs" @@ -199,9 +199,11 @@ func (c *NodeStatusCommand) Run(args []string) int { out[i+1] += fmt.Sprintf("|%s|%s", node.Address, node.Version) } - out[i+1] += fmt.Sprintf("|%v|%s", + out[i+1] += fmt.Sprintf("|%v|%s|%s", node.Drain, + node.SchedulingEligibility, node.Status) + if c.list_allocs { numAllocs, err := getRunningAllocs(client, node.ID) if err != nil { @@ -246,23 +248,12 @@ func (c *NodeStatusCommand) Run(args []string) int { return 1 } if len(nodes) > 1 { - // Format the nodes list that matches the prefix so that the user - // can create a more specific request - out := make([]string, len(nodes)+1) - out[0] = "ID|DC|Name|Class|Drain|Status" - for i, node := range nodes { - out[i+1] = fmt.Sprintf("%s|%s|%s|%s|%v|%s", - limit(node.ID, c.length), - node.Datacenter, - node.Name, - node.NodeClass, - node.Drain, - node.Status) - } // Dump the output - c.Ui.Error(fmt.Sprintf("Prefix matched multiple nodes\n\n%s", formatList(out))) + c.Ui.Error(fmt.Sprintf("Prefix matched multiple nodes\n\n%s", + formatNodeStubList(nodes, c.verbose))) return 1 } + // Prefix lookup matched a single node node, _, err := client.Nodes().Info(nodes[0].ID, nil) if err != nil { @@ -313,6 +304,7 @@ func (c *NodeStatusCommand) formatNode(client *api.Client, node *api.Node) int { fmt.Sprintf("Class|%s", node.NodeClass), fmt.Sprintf("DC|%s", node.Datacenter), fmt.Sprintf("Drain|%v", node.Drain), + fmt.Sprintf("Eligibility|%s", node.SchedulingEligibility), fmt.Sprintf("Status|%s", node.Status), fmt.Sprintf("Drivers|%s", strings.Join(nodeDrivers(node), ",")), } @@ -637,3 +629,33 @@ func getHostResources(hostStats *api.HostStats, node *api.Node) ([]string, error } return resources, nil } + +// formatNodeStubList is used to return a table format of a list of node stubs. 
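// The rendered table mirrors the header built below; for a two-node prefix match it
// looks roughly like this (illustrative values only):
//
//    ID        DC   Name    Class   Drain  Eligibility  Status
//    f7b1fe6e  dc1  node-1  <none>  false  eligible     ready
//    b5bc1dcd  dc1  node-2  <none>  false  eligible     ready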
+func formatNodeStubList(nodes []*api.NodeListStub, verbose bool) string { + // Return error if no nodes are found + if len(nodes) == 0 { + return "" + } + // Truncate the id unless full length is requested + length := shortId + if verbose { + length = fullId + } + + // Format the nodes list that matches the prefix so that the user + // can create a more specific request + out := make([]string, len(nodes)+1) + out[0] = "ID|DC|Name|Class|Drain|Eligibility|Status" + for i, node := range nodes { + out[i+1] = fmt.Sprintf("%s|%s|%s|%s|%v|%s|%s", + limit(node.ID, length), + node.Datacenter, + node.Name, + node.NodeClass, + node.Drain, + node.SchedulingEligibility, + node.Status) + } + + return formatList(out) +} diff --git a/commands.go b/commands.go index 75155948bd21..0b3a422f0348 100644 --- a/commands.go +++ b/commands.go @@ -258,17 +258,36 @@ func Commands(metaPtr *command.Meta) map[string]cli.CommandFactory { Meta: meta, }, nil }, + "node": func() (cli.Command, error) { + return &command.NodeCommand{ + Meta: meta, + }, nil + }, "node-drain": func() (cli.Command, error) { return &command.NodeDrainCommand{ Meta: meta, }, nil }, + "node drain": func() (cli.Command, error) { + return &command.NodeDrainCommand{ + Meta: meta, + }, nil + }, + "node eligibility": func() (cli.Command, error) { + return &command.NodeEligibilityCommand{ + Meta: meta, + }, nil + }, "node-status": func() (cli.Command, error) { return &command.NodeStatusCommand{ Meta: meta, }, nil }, - + "node status": func() (cli.Command, error) { + return &command.NodeStatusCommand{ + Meta: meta, + }, nil + }, "operator": func() (cli.Command, error) { return &command.OperatorCommand{ Meta: meta, diff --git a/helper/testlog/testlog.go b/helper/testlog/testlog.go index b72fcfb28bef..709bd9d54745 100644 --- a/helper/testlog/testlog.go +++ b/helper/testlog/testlog.go @@ -6,8 +6,14 @@ package testlog import ( "io" "log" + "os" ) +// UseStdout returns true if NOMAD_TEST_STDOUT=1 and sends logs to stdout. +func UseStdout() bool { + return os.Getenv("NOMAD_TEST_STDOUT") == "1" +} + // LogPrinter is the methods of testing.T (or testing.B) needed by the test // logger. type LogPrinter interface { @@ -27,11 +33,17 @@ func (w *writer) Write(p []byte) (n int, err error) { // NewWriter creates a new io.Writer backed by a Logger. func NewWriter(t LogPrinter) io.Writer { + if UseStdout() { + return os.Stdout + } return &writer{t} } // New returns a new test logger. 
See https://golang.org/pkg/log/#New func New(t LogPrinter, prefix string, flag int) *log.Logger { + if UseStdout() { + return log.New(os.Stdout, prefix, flag) + } return log.New(&writer{t}, prefix, flag) } diff --git a/jobspec/parse.go b/jobspec/parse.go index d6f235e05f26..e56161cd4c40 100644 --- a/jobspec/parse.go +++ b/jobspec/parse.go @@ -104,11 +104,12 @@ func parseJob(result *api.Job, list *ast.ObjectList) error { } delete(m, "constraint") delete(m, "meta") - delete(m, "update") - delete(m, "periodic") - delete(m, "vault") + delete(m, "migrate") delete(m, "parameterized") + delete(m, "periodic") delete(m, "reschedule") + delete(m, "update") + delete(m, "vault") // Set the ID and name to the object key result.ID = helper.StringToPtr(obj.Keys[0].Token.Value().(string)) @@ -132,19 +133,20 @@ func parseJob(result *api.Job, list *ast.ObjectList) error { "all_at_once", "constraint", "datacenters", - "parameterized", "group", "id", "meta", + "migrate", "name", "namespace", + "parameterized", "periodic", "priority", "region", + "reschedule", "task", "type", "update", - "reschedule", "vault", "vault_token", } @@ -187,6 +189,13 @@ func parseJob(result *api.Job, list *ast.ObjectList) error { } } + // If we have a migration strategy, then parse that + if o := listVal.Filter("migrate"); len(o.Items) > 0 { + if err := parseMigrate(&result.Migrate, o); err != nil { + return multierror.Prefix(err, "migrate ->") + } + } + // Parse out meta fields. These are in HCL as a list so we need // to iterate over them and merge them. if metaO := listVal.Filter("meta"); len(metaO.Items) > 0 { @@ -285,6 +294,7 @@ func parseGroups(result *api.Job, list *ast.ObjectList) error { "update", "reschedule", "vault", + "migrate", } if err := helper.CheckHCLKeys(listVal, valid); err != nil { return multierror.Prefix(err, fmt.Sprintf("'%s' ->", n)) @@ -301,6 +311,7 @@ func parseGroups(result *api.Job, list *ast.ObjectList) error { delete(m, "ephemeral_disk") delete(m, "update") delete(m, "vault") + delete(m, "migrate") // Build the group with the basic decode var g api.TaskGroup @@ -344,6 +355,13 @@ func parseGroups(result *api.Job, list *ast.ObjectList) error { } } + // If we have a migration strategy, then parse that + if o := listVal.Filter("migrate"); len(o.Items) > 0 { + if err := parseMigrate(&g.Migrate, o); err != nil { + return multierror.Prefix(err, "migrate ->") + } + } + // Parse out meta fields. These are in HCL as a list so we need // to iterate over them and merge them. 
if metaO := listVal.Filter("meta"); len(metaO.Items) > 0 { @@ -1320,6 +1338,42 @@ func parseUpdate(result **api.UpdateStrategy, list *ast.ObjectList) error { return dec.Decode(m) } +func parseMigrate(result **api.MigrateStrategy, list *ast.ObjectList) error { + list = list.Elem() + if len(list.Items) > 1 { + return fmt.Errorf("only one 'migrate' block allowed") + } + + // Get our resource object + o := list.Items[0] + + var m map[string]interface{} + if err := hcl.DecodeObject(&m, o.Val); err != nil { + return err + } + + // Check for invalid keys + valid := []string{ + "max_parallel", + "health_check", + "min_healthy_time", + "healthy_deadline", + } + if err := helper.CheckHCLKeys(o.Val, valid); err != nil { + return err + } + + dec, err := mapstructure.NewDecoder(&mapstructure.DecoderConfig{ + DecodeHook: mapstructure.StringToTimeDurationHookFunc(), + WeaklyTypedInput: true, + Result: result, + }) + if err != nil { + return err + } + return dec.Decode(m) +} + func parsePeriodic(result **api.PeriodicConfig, list *ast.ObjectList) error { list = list.Elem() if len(list.Items) > 1 { diff --git a/jobspec/parse_test.go b/jobspec/parse_test.go index dbf1200570eb..1275cd51c90f 100644 --- a/jobspec/parse_test.go +++ b/jobspec/parse_test.go @@ -110,6 +110,12 @@ func TestParse(t *testing.T) { AutoRevert: helper.BoolToPtr(false), Canary: helper.IntToPtr(2), }, + Migrate: &api.MigrateStrategy{ + MaxParallel: helper.IntToPtr(2), + HealthCheck: helper.StringToPtr("task_states"), + MinHealthyTime: helper.TimeToPtr(11 * time.Second), + HealthyDeadline: helper.TimeToPtr(11 * time.Minute), + }, Tasks: []*api.Task{ { Name: "binstore", @@ -735,6 +741,44 @@ func TestParse(t *testing.T) { }, false, }, + { + "migrate-job.hcl", + &api.Job{ + ID: helper.StringToPtr("foo"), + Name: helper.StringToPtr("foo"), + Type: helper.StringToPtr("batch"), + Datacenters: []string{"dc1"}, + Migrate: &api.MigrateStrategy{ + MaxParallel: helper.IntToPtr(2), + HealthCheck: helper.StringToPtr("task_states"), + MinHealthyTime: helper.TimeToPtr(11 * time.Second), + HealthyDeadline: helper.TimeToPtr(11 * time.Minute), + }, + TaskGroups: []*api.TaskGroup{ + { + Name: helper.StringToPtr("bar"), + Count: helper.IntToPtr(3), + Migrate: &api.MigrateStrategy{ + MaxParallel: helper.IntToPtr(3), + HealthCheck: helper.StringToPtr("checks"), + MinHealthyTime: helper.TimeToPtr(1 * time.Second), + HealthyDeadline: helper.TimeToPtr(1 * time.Minute), + }, + Tasks: []*api.Task{ + { + Name: "bar", + Driver: "raw_exec", + Config: map[string]interface{}{ + "command": "bash", + "args": []interface{}{"-c", "echo hi"}, + }, + }, + }, + }, + }, + }, + false, + }, } for _, tc := range cases { diff --git a/jobspec/test-fixtures/basic.hcl b/jobspec/test-fixtures/basic.hcl index 9942e3dfc34c..2b3f973aa9c4 100644 --- a/jobspec/test-fixtures/basic.hcl +++ b/jobspec/test-fixtures/basic.hcl @@ -67,6 +67,13 @@ job "binstore-storagelocker" { canary = 2 } + migrate { + max_parallel = 2 + health_check = "task_states" + min_healthy_time = "11s" + healthy_deadline = "11m" + } + task "binstore" { driver = "docker" user = "bob" diff --git a/jobspec/test-fixtures/migrate-job.hcl b/jobspec/test-fixtures/migrate-job.hcl new file mode 100644 index 000000000000..5ec05e6b5141 --- /dev/null +++ b/jobspec/test-fixtures/migrate-job.hcl @@ -0,0 +1,28 @@ +job "foo" { + datacenters = ["dc1"] + type = "batch" + migrate { + max_parallel = 2 + health_check = "task_states" + min_healthy_time = "11s" + healthy_deadline = "11m" + } + + group "bar" { + count = 3 + task "bar" { + driver = 
"raw_exec" + config { + command = "bash" + args = ["-c", "echo hi"] + } + } + + migrate { + max_parallel = 3 + health_check = "checks" + min_healthy_time = "1s" + healthy_deadline = "1m" + } + } +} diff --git a/main.go b/main.go index 4fe38fd6a998..f482ca2838bd 100644 --- a/main.go +++ b/main.go @@ -37,6 +37,7 @@ func RunCustom(args []string, commands map[string]cli.CommandFactory) int { case "quota list", "quota delete", "quota apply", "quota status", "quota inspect", "quota init": case "operator raft", "operator raft list-peers", "operator raft remove-peer": case "acl policy", "acl policy apply", "acl token", "acl token create": + case "node-drain", "node-status": default: commandsInclude = append(commandsInclude, k) } @@ -46,7 +47,7 @@ func RunCustom(args []string, commands map[string]cli.CommandFactory) int { // users should not be running should be placed here, versus hiding // subcommands from the main help, which should be filtered out of the // commands above. - hidden := []string{"check", "executor", "syslog"} + hidden := []string{"check", "executor", "syslog", "node-drain", "node-status"} cli := &cli.CLI{ Name: "nomad", diff --git a/nomad/alloc_endpoint.go b/nomad/alloc_endpoint.go index 033a1a0103aa..405136ca8cc1 100644 --- a/nomad/alloc_endpoint.go +++ b/nomad/alloc_endpoint.go @@ -1,6 +1,7 @@ package nomad import ( + "fmt" "time" "github.com/armon/go-metrics" @@ -200,3 +201,35 @@ func (a *Alloc) GetAllocs(args *structs.AllocsGetRequest, } return a.srv.blockingRPC(&opts) } + +// UpdateDesiredTransition is used to update the desired transitions of an +// allocation. +func (a *Alloc) UpdateDesiredTransition(args *structs.AllocUpdateDesiredTransitionRequest, reply *structs.GenericResponse) error { + if done, err := a.srv.forward("Alloc.UpdateDesiredTransition", args, args, reply); done { + return err + } + defer metrics.MeasureSince([]string{"nomad", "alloc", "update_desired_transition"}, time.Now()) + + // Check that it is a management token. 
+ if aclObj, err := a.srv.ResolveToken(args.AuthToken); err != nil { + return err + } else if aclObj != nil && !aclObj.IsManagement() { + return structs.ErrPermissionDenied + } + + // Ensure at least a single alloc + if len(args.Allocs) == 0 { + return fmt.Errorf("must update at least one allocation") + } + + // Commit this update via Raft + _, index, err := a.srv.raftApply(structs.AllocUpdateDesiredTransitionRequestType, args) + if err != nil { + a.srv.logger.Printf("[ERR] nomad.allocs: AllocUpdateDesiredTransitionRequest failed: %v", err) + return err + } + + // Setup the response + reply.Index = index + return nil +} diff --git a/nomad/alloc_endpoint_test.go b/nomad/alloc_endpoint_test.go index abb36178681c..5d309d7c3b96 100644 --- a/nomad/alloc_endpoint_test.go +++ b/nomad/alloc_endpoint_test.go @@ -7,11 +7,13 @@ import ( "github.com/hashicorp/net-rpc-msgpackrpc" "github.com/hashicorp/nomad/acl" + "github.com/hashicorp/nomad/helper" "github.com/hashicorp/nomad/helper/uuid" "github.com/hashicorp/nomad/nomad/mock" "github.com/hashicorp/nomad/nomad/structs" "github.com/hashicorp/nomad/testutil" "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" ) func TestAllocEndpoint_List(t *testing.T) { @@ -481,3 +483,87 @@ func TestAllocEndpoint_GetAllocs_Blocking(t *testing.T) { t.Fatalf("bad: %#v", resp.Allocs) } } + +func TestAllocEndpoint_UpdateDesiredTransition(t *testing.T) { + t.Parallel() + require := require.New(t) + + s1, _ := TestACLServer(t, nil) + defer s1.Shutdown() + codec := rpcClient(t, s1) + testutil.WaitForLeader(t, s1.RPC) + + // Create the register request + alloc := mock.Alloc() + alloc2 := mock.Alloc() + state := s1.fsm.State() + require.Nil(state.UpsertJobSummary(998, mock.JobSummary(alloc.JobID))) + require.Nil(state.UpsertJobSummary(999, mock.JobSummary(alloc2.JobID))) + require.Nil(state.UpsertAllocs(1000, []*structs.Allocation{alloc, alloc2})) + + t1 := &structs.DesiredTransition{ + Migrate: helper.BoolToPtr(true), + } + + // Update the allocs desired status + get := &structs.AllocUpdateDesiredTransitionRequest{ + Allocs: map[string]*structs.DesiredTransition{ + alloc.ID: t1, + alloc2.ID: t1, + }, + Evals: []*structs.Evaluation{ + { + ID: uuid.Generate(), + Namespace: alloc.Namespace, + Priority: alloc.Job.Priority, + Type: alloc.Job.Type, + TriggeredBy: structs.EvalTriggerNodeDrain, + JobID: alloc.Job.ID, + JobModifyIndex: alloc.Job.ModifyIndex, + Status: structs.EvalStatusPending, + }, + { + ID: uuid.Generate(), + Namespace: alloc2.Namespace, + Priority: alloc2.Job.Priority, + Type: alloc2.Job.Type, + TriggeredBy: structs.EvalTriggerNodeDrain, + JobID: alloc2.Job.ID, + JobModifyIndex: alloc2.Job.ModifyIndex, + Status: structs.EvalStatusPending, + }, + }, + WriteRequest: structs.WriteRequest{ + Region: "global", + }, + } + + // Try without permissions + var resp structs.GenericResponse + err := msgpackrpc.CallWithCodec(codec, "Alloc.UpdateDesiredTransition", get, &resp) + require.NotNil(err) + require.True(structs.IsErrPermissionDenied(err)) + + // Try with permissions + get.WriteRequest.AuthToken = s1.getLeaderAcl() + var resp2 structs.GenericResponse + require.Nil(msgpackrpc.CallWithCodec(codec, "Alloc.UpdateDesiredTransition", get, &resp2)) + require.NotZero(resp2.Index) + + // Look up the allocations + out1, err := state.AllocByID(nil, alloc.ID) + require.Nil(err) + out2, err := state.AllocByID(nil, alloc.ID) + require.Nil(err) + e1, err := state.EvalByID(nil, get.Evals[0].ID) + require.Nil(err) + e2, err := state.EvalByID(nil, 
get.Evals[1].ID) + require.Nil(err) + + require.NotNil(out1.DesiredTransition.Migrate) + require.NotNil(out2.DesiredTransition.Migrate) + require.NotNil(e1) + require.NotNil(e2) + require.True(*out1.DesiredTransition.Migrate) + require.True(*out2.DesiredTransition.Migrate) +} diff --git a/nomad/deploymentwatcher/deployments_watcher.go b/nomad/deploymentwatcher/deployments_watcher.go index d9aab78770fb..a88a1de67f93 100644 --- a/nomad/deploymentwatcher/deployments_watcher.go +++ b/nomad/deploymentwatcher/deployments_watcher.go @@ -102,7 +102,7 @@ func NewDeploymentsWatcher(logger *log.Logger, // SetEnabled is used to control if the watcher is enabled. The watcher // should only be enabled on the active leader. When being enabled the state is // passed in as it is no longer valid once a leader election has taken place. -func (w *Watcher) SetEnabled(enabled bool, state *state.StateStore) error { +func (w *Watcher) SetEnabled(enabled bool, state *state.StateStore) { w.l.Lock() defer w.l.Unlock() @@ -120,8 +120,6 @@ func (w *Watcher) SetEnabled(enabled bool, state *state.StateStore) error { if enabled && !wasEnabled { go w.watchDeployments(w.ctx) } - - return nil } // flush is used to clear the state of the watcher diff --git a/nomad/drainer/drain_heap.go b/nomad/drainer/drain_heap.go new file mode 100644 index 000000000000..2d0a1506e052 --- /dev/null +++ b/nomad/drainer/drain_heap.go @@ -0,0 +1,162 @@ +package drainer + +import ( + "context" + "sync" + "time" +) + +// DrainDeadlineNotifier allows batch notification of nodes that have reached +// their drain deadline. +type DrainDeadlineNotifier interface { + // NextBatch returns the next batch of nodes that have reached their + // deadline. + NextBatch() <-chan []string + + // Remove removes the given node from being tracked for a deadline. + Remove(nodeID string) + + // Watch marks the given node for being watched for its deadline. + Watch(nodeID string, deadline time.Time) +} + +// deadlineHeap implements the DrainDeadlineNotifier and is backed by a min-heap +// to efficiently determine the next deadlining node. It also supports +// coalescing several deadlines into a single emission. +type deadlineHeap struct { + ctx context.Context + coalesceWindow time.Duration + batch chan []string + nodes map[string]time.Time + trigger chan struct{} + mu sync.Mutex +} + +// NewDeadlineHeap returns a new deadline heap that coalesces for the given +// duration and will stop watching when the passed context is cancelled. 
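// A minimal usage sketch (illustrative only; assumes a caller in this package):
//
//    ctx, cancel := context.WithCancel(context.Background())
//    defer cancel()
//    notifier := NewDeadlineHeap(ctx, NodeDeadlineCoalesceWindow)
//    notifier.Watch("node-1", time.Now().Add(time.Hour))
//    select {
//    case batch := <-notifier.NextBatch():
//        // batch holds every watched node whose deadline fell within the coalesce window
//    case <-ctx.Done():
//    }
//
// Watch may be called again for the same node to move its deadline, and Remove drops a
// node from consideration entirely.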
+func NewDeadlineHeap(ctx context.Context, coalesceWindow time.Duration) *deadlineHeap { + d := &deadlineHeap{ + ctx: ctx, + coalesceWindow: coalesceWindow, + batch: make(chan []string), + nodes: make(map[string]time.Time, 64), + trigger: make(chan struct{}, 1), + } + + go d.watch() + return d +} + +func (d *deadlineHeap) watch() { + timer := time.NewTimer(0) + timer.Stop() + select { + case <-timer.C: + default: + } + + var nextDeadline time.Time + defer timer.Stop() + + for { + select { + case <-d.ctx.Done(): + return + case <-timer.C: + if nextDeadline.IsZero() { + continue + } + + var batch []string + + d.mu.Lock() + for nodeID, nodeDeadline := range d.nodes { + if !nodeDeadline.After(nextDeadline) { + batch = append(batch, nodeID) + delete(d.nodes, nodeID) + } + } + d.mu.Unlock() + + if len(batch) > 0 { + // Send the batch + select { + case d.batch <- batch: + case <-d.ctx.Done(): + return + } + } + + case <-d.trigger: + } + + // Calculate the next deadline + deadline, ok := d.calculateNextDeadline() + if !ok { + continue + } + + if !deadline.Equal(nextDeadline) { + timer.Reset(deadline.Sub(time.Now())) + nextDeadline = deadline + } + } +} + +// calculateNextDeadline returns the next deadline in which to scan for +// deadlined nodes. It applies the coalesce window. +func (d *deadlineHeap) calculateNextDeadline() (time.Time, bool) { + d.mu.Lock() + defer d.mu.Unlock() + + if len(d.nodes) == 0 { + return time.Time{}, false + } + + // Calculate the new timer value + var deadline time.Time + for _, v := range d.nodes { + if deadline.IsZero() || v.Before(deadline) { + deadline = v + } + } + + var maxWithinWindow time.Time + coalescedDeadline := deadline.Add(d.coalesceWindow) + for _, nodeDeadline := range d.nodes { + if nodeDeadline.Before(coalescedDeadline) { + if maxWithinWindow.IsZero() || nodeDeadline.After(maxWithinWindow) { + maxWithinWindow = nodeDeadline + } + } + } + + return maxWithinWindow, true +} + +// NextBatch returns the next batch of nodes to be drained. 
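// Note that the batch channel is unbuffered and the send in watch is guarded by
// ctx.Done, so a batch is only delivered while a consumer (such as the NodeDrainer run
// loop) is actively receiving; nodes whose deadlines fall within the same coalesce
// window arrive together as a single slice.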
+func (d *deadlineHeap) NextBatch() <-chan []string { + return d.batch +} + +func (d *deadlineHeap) Remove(nodeID string) { + d.mu.Lock() + defer d.mu.Unlock() + delete(d.nodes, nodeID) + + select { + case d.trigger <- struct{}{}: + default: + } +} + +func (d *deadlineHeap) Watch(nodeID string, deadline time.Time) { + d.mu.Lock() + defer d.mu.Unlock() + d.nodes[nodeID] = deadline + + select { + case d.trigger <- struct{}{}: + default: + } +} diff --git a/nomad/drainer/drain_heap_test.go b/nomad/drainer/drain_heap_test.go new file mode 100644 index 000000000000..02108e1dfa0e --- /dev/null +++ b/nomad/drainer/drain_heap_test.go @@ -0,0 +1,149 @@ +package drainer + +import ( + "context" + "testing" + "time" + + "github.com/stretchr/testify/require" +) + +func TestDeadlineHeap_Interface(t *testing.T) { + t.Parallel() + require := require.New(t) + h := NewDeadlineHeap(context.Background(), 1*time.Second) + require.Implements((*DrainDeadlineNotifier)(nil), h) +} + +func TestDeadlineHeap_WatchAndGet(t *testing.T) { + t.Parallel() + require := require.New(t) + h := NewDeadlineHeap(context.Background(), 1*time.Second) + + now := time.Now() + nodeID := "1" + wait := 10 * time.Millisecond + deadline := now.Add(wait) + h.Watch(nodeID, deadline) + + var batch []string + select { + case batch = <-h.NextBatch(): + case <-time.After(2 * wait): + t.Fatal("timeout") + } + + require.Len(batch, 1) + require.Equal(nodeID, batch[0]) +} + +func TestDeadlineHeap_WatchThenUpdateAndGet(t *testing.T) { + t.Parallel() + require := require.New(t) + h := NewDeadlineHeap(context.Background(), 1*time.Second) + + now := time.Now() + nodeID := "1" + wait := 10 * time.Millisecond + deadline := now.Add(wait) + + // Initially watch way in the future + h.Watch(nodeID, now.Add(24*time.Hour)) + + // Rewatch + h.Watch(nodeID, deadline) + + var batch []string + select { + case batch = <-h.NextBatch(): + case <-time.After(2 * wait): + t.Fatal("timeout") + } + + require.Len(batch, 1) + require.Equal(nodeID, batch[0]) +} + +func TestDeadlineHeap_MultiwatchAndDelete(t *testing.T) { + t.Parallel() + require := require.New(t) + h := NewDeadlineHeap(context.Background(), 1*time.Second) + + now := time.Now() + wait := 50 * time.Millisecond + deadline := now.Add(wait) + + nodeID1 := "1" + nodeID2 := "2" + h.Watch(nodeID1, deadline) + h.Watch(nodeID2, deadline) + + time.Sleep(1 * time.Millisecond) + h.Remove(nodeID2) + + var batch []string + select { + case batch = <-h.NextBatch(): + case <-time.After(2 * wait): + t.Fatal("timeout") + } + + require.Len(batch, 1) + require.Equal(nodeID1, batch[0]) +} + +func TestDeadlineHeap_WatchCoalesce(t *testing.T) { + t.Parallel() + require := require.New(t) + h := NewDeadlineHeap(context.Background(), 100*time.Millisecond) + + now := time.Now() + + group1 := map[string]time.Time{ + "1": now.Add(5 * time.Millisecond), + "2": now.Add(10 * time.Millisecond), + "3": now.Add(20 * time.Millisecond), + "4": now.Add(100 * time.Millisecond), + } + + group2 := map[string]time.Time{ + "10": now.Add(350 * time.Millisecond), + "11": now.Add(360 * time.Millisecond), + } + + for _, g := range []map[string]time.Time{group1, group2} { + for n, d := range g { + h.Watch(n, d) + } + } + + var batch []string + select { + case batch = <-h.NextBatch(): + case <-time.After(1 * time.Second): + t.Fatal("timeout") + } + + require.Len(batch, len(group1)) + for nodeID := range group1 { + require.Contains(batch, nodeID) + } + batch = nil + + select { + case batch = <-h.NextBatch(): + case <-time.After(2 * time.Second): + 
t.Fatal("timeout") + } + + require.Len(batch, len(group2)) + for nodeID := range group2 { + require.Contains(batch, nodeID) + } + + select { + case <-h.NextBatch(): + t.Fatal("unexpected batch") + case <-time.After(100 * time.Millisecond): + } +} diff --git a/nomad/drainer/drain_testing.go b/nomad/drainer/drain_testing.go new file mode 100644 index 000000000000..5af351fe819f --- /dev/null +++ b/nomad/drainer/drain_testing.go @@ -0,0 +1,45 @@ +package drainer + +import ( + "sync" + + "github.com/hashicorp/nomad/nomad/structs" +) + +type MockNodeTrackerEvent struct { + NodeUpdate *structs.Node + NodeRemove string +} + +type MockNodeTracker struct { + Nodes map[string]*structs.Node + Events []*MockNodeTrackerEvent + sync.Mutex +} + +func NewMockNodeTracker() *MockNodeTracker { + return &MockNodeTracker{ + Nodes: make(map[string]*structs.Node), + Events: make([]*MockNodeTrackerEvent, 0, 16), + } +} + +func (m *MockNodeTracker) TrackedNodes() map[string]*structs.Node { + m.Lock() + defer m.Unlock() + return m.Nodes +} + +func (m *MockNodeTracker) Remove(nodeID string) { + m.Lock() + defer m.Unlock() + delete(m.Nodes, nodeID) + m.Events = append(m.Events, &MockNodeTrackerEvent{NodeRemove: nodeID}) +} + +func (m *MockNodeTracker) Update(node *structs.Node) { + m.Lock() + defer m.Unlock() + m.Nodes[node.ID] = node + m.Events = append(m.Events, &MockNodeTrackerEvent{NodeUpdate: node}) +} diff --git a/nomad/drainer/drainer.go b/nomad/drainer/drainer.go new file mode 100644 index 000000000000..46dcad696d4c --- /dev/null +++ b/nomad/drainer/drainer.go @@ -0,0 +1,380 @@ +package drainer + +import ( + "context" + "log" + "sync" + "time" + + "github.com/hashicorp/nomad/helper" + "github.com/hashicorp/nomad/helper/uuid" + "github.com/hashicorp/nomad/nomad/state" + "github.com/hashicorp/nomad/nomad/structs" + "golang.org/x/time/rate" +) + +var ( + // stateReadErrorDelay is the delay to apply before retrying reading state + // when there is an error + stateReadErrorDelay = 1 * time.Second +) + +const ( + // LimitStateQueriesPerSecond is the number of state queries allowed per + // second + LimitStateQueriesPerSecond = 100.0 + + // BatchUpdateInterval is how long we wait to batch updates + BatchUpdateInterval = 1 * time.Second + + // NodeDeadlineCoalesceWindow is the duration in which deadlining nodes will + // be coalesced together + NodeDeadlineCoalesceWindow = 5 * time.Second +) + +// RaftApplier contains methods for applying the raft requests required by the +// NodeDrainer. +type RaftApplier interface { + AllocUpdateDesiredTransition(allocs map[string]*structs.DesiredTransition, evals []*structs.Evaluation) (uint64, error) + NodesDrainComplete(nodes []string) (uint64, error) +} + +// NodeTracker is the interface to notify an object that is tracking draining +// nodes of changes +type NodeTracker interface { + // TrackedNodes returns all the nodes that are currently tracked as + // draining. + TrackedNodes() map[string]*structs.Node + + // Remove removes a node from the draining set. + Remove(nodeID string) + + // Update either updates the specification of a draining node or tracks the + // node as draining. 
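// Implementations of this interface include the NodeDrainer itself, which flush hands
// to the node watcher factory, and the MockNodeTracker test double above.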
+ Update(node *structs.Node) +} + +// DrainingJobWatcherFactory returns a new DrainingJobWatcher +type DrainingJobWatcherFactory func(context.Context, *rate.Limiter, *state.StateStore, *log.Logger) DrainingJobWatcher + +// DrainingNodeWatcherFactory returns a new DrainingNodeWatcher +type DrainingNodeWatcherFactory func(context.Context, *rate.Limiter, *state.StateStore, *log.Logger, NodeTracker) DrainingNodeWatcher + +// DrainDeadlineNotifierFactory returns a new DrainDeadlineNotifier +type DrainDeadlineNotifierFactory func(context.Context) DrainDeadlineNotifier + +// GetDrainingJobWatcher returns a draining job watcher +func GetDrainingJobWatcher(ctx context.Context, limiter *rate.Limiter, state *state.StateStore, logger *log.Logger) DrainingJobWatcher { + return NewDrainingJobWatcher(ctx, limiter, state, logger) +} + +// GetDeadlineNotifier returns a node deadline notifier with default coalescing. +func GetDeadlineNotifier(ctx context.Context) DrainDeadlineNotifier { + return NewDeadlineHeap(ctx, NodeDeadlineCoalesceWindow) +} + +// GetNodeWatcherFactory returns a DrainingNodeWatcherFactory +func GetNodeWatcherFactory() DrainingNodeWatcherFactory { + return func(ctx context.Context, limiter *rate.Limiter, state *state.StateStore, logger *log.Logger, tracker NodeTracker) DrainingNodeWatcher { + return NewNodeDrainWatcher(ctx, limiter, state, logger, tracker) + } +} + +// allocMigrateBatcher is used to batch allocation updates. +type allocMigrateBatcher struct { + // updates holds pending client status updates for allocations + updates []*structs.Allocation + + // updateFuture is used to wait for the pending batch update + // to complete. This may be nil if no batch is pending. + updateFuture *structs.BatchFuture + + // updateTimer is the timer that will trigger the next batch + // update, and may be nil if there is no batch pending. + updateTimer *time.Timer + + batchWindow time.Duration + + // synchronizes access to the updates list, the future and the timer. + sync.Mutex +} + +// NodeDrainerConfig is used to configure a new node drainer. +type NodeDrainerConfig struct { + Logger *log.Logger + Raft RaftApplier + JobFactory DrainingJobWatcherFactory + NodeFactory DrainingNodeWatcherFactory + DrainDeadlineFactory DrainDeadlineNotifierFactory + + // StateQueriesPerSecond configures the query limit against the state store + // that is allowed by the node drainer. + StateQueriesPerSecond float64 + + // BatchUpdateInterval is the interval in which allocation updates are + // batched. + BatchUpdateInterval time.Duration +} + +// NodeDrainer is used to orchestrate migrating allocations off of draining +// nodes. +type NodeDrainer struct { + enabled bool + logger *log.Logger + + // nodes is the set of draining nodes + nodes map[string]*drainingNode + + // nodeWatcher watches for nodes to transition in and out of drain state. + nodeWatcher DrainingNodeWatcher + nodeFactory DrainingNodeWatcherFactory + + // jobWatcher watches draining jobs and emits desired drains and notifies + // when migrations take place. + jobWatcher DrainingJobWatcher + jobFactory DrainingJobWatcherFactory + + // deadlineNotifier notifies when nodes reach their drain deadline. + deadlineNotifier DrainDeadlineNotifier + deadlineNotifierFactory DrainDeadlineNotifierFactory + + // state is the state that is watched for state changes. 
+ state *state.StateStore + + // queryLimiter is used to limit the rate of blocking queries + queryLimiter *rate.Limiter + + // raft is a shim around the raft messages necessary for draining + raft RaftApplier + + // batcher is used to batch alloc migrations. + batcher allocMigrateBatcher + + // ctx and exitFn are used to cancel the watcher + ctx context.Context + exitFn context.CancelFunc + + l sync.RWMutex +} + +// NewNodeDrainer returns a new new node drainer. The node drainer is +// responsible for marking allocations on draining nodes with a desired +// migration transition, updating the drain strategy on nodes when they are +// complete and creating evaluations for the system to react to these changes. +func NewNodeDrainer(c *NodeDrainerConfig) *NodeDrainer { + return &NodeDrainer{ + raft: c.Raft, + logger: c.Logger, + jobFactory: c.JobFactory, + nodeFactory: c.NodeFactory, + deadlineNotifierFactory: c.DrainDeadlineFactory, + queryLimiter: rate.NewLimiter(rate.Limit(c.StateQueriesPerSecond), 100), + batcher: allocMigrateBatcher{ + batchWindow: c.BatchUpdateInterval, + }, + } +} + +// SetEnabled will start or stop the node draining goroutine depending on the +// enabled boolean. +func (n *NodeDrainer) SetEnabled(enabled bool, state *state.StateStore) { + n.l.Lock() + defer n.l.Unlock() + + // If we are starting now or have a new state, init state and start the + // run loop + n.enabled = enabled + if enabled { + n.flush(state) + go n.run(n.ctx) + } else if !enabled && n.exitFn != nil { + n.exitFn() + } +} + +// flush is used to clear the state of the watcher +func (n *NodeDrainer) flush(state *state.StateStore) { + // Cancel anything that may be running. + if n.exitFn != nil { + n.exitFn() + } + + // Store the new state + if state != nil { + n.state = state + } + + n.ctx, n.exitFn = context.WithCancel(context.Background()) + n.jobWatcher = n.jobFactory(n.ctx, n.queryLimiter, n.state, n.logger) + n.nodeWatcher = n.nodeFactory(n.ctx, n.queryLimiter, n.state, n.logger, n) + n.deadlineNotifier = n.deadlineNotifierFactory(n.ctx) + n.nodes = make(map[string]*drainingNode, 32) +} + +// run is a long lived event handler that receives changes from the relevant +// watchers and takes action based on them. +func (n *NodeDrainer) run(ctx context.Context) { + for { + select { + case <-n.ctx.Done(): + return + case nodes := <-n.deadlineNotifier.NextBatch(): + n.handleDeadlinedNodes(nodes) + case req := <-n.jobWatcher.Drain(): + n.handleJobAllocDrain(req) + case allocs := <-n.jobWatcher.Migrated(): + n.handleMigratedAllocs(allocs) + } + } +} + +// handleDeadlinedNodes handles a set of nodes reaching their drain deadline. +// The handler detects the remaining allocations on the nodes and immediately +// marks them for migration. +func (n *NodeDrainer) handleDeadlinedNodes(nodes []string) { + // Retrieve the set of allocations that will be force stopped. + n.l.RLock() + var forceStop []*structs.Allocation + for _, node := range nodes { + draining, ok := n.nodes[node] + if !ok { + n.logger.Printf("[DEBUG] nomad.node_drainer: skipping untracked deadlined node %q", node) + continue + } + + allocs, err := draining.DeadlineAllocs() + if err != nil { + n.logger.Printf("[ERR] nomad.node_drainer: failed to retrive allocs on deadlined node %q: %v", node, err) + continue + } + + forceStop = append(forceStop, allocs...) + } + n.l.RUnlock() + n.batchDrainAllocs(forceStop) +} + +// handleJobAllocDrain handles marking a set of allocations as having a desired +// transition to drain. 
The handler blocks till the changes to the allocation +// have occurred. +func (n *NodeDrainer) handleJobAllocDrain(req *DrainRequest) { + index, err := n.batchDrainAllocs(req.Allocs) + req.Resp.Respond(index, err) +} + +// handleMigratedAllocs checks to see if any nodes can be considered done +// draining based on the set of allocations that have migrated because of an +// ongoing drain for a job. +func (n *NodeDrainer) handleMigratedAllocs(allocs []*structs.Allocation) { + // Determine the set of nodes that were effected + nodes := make(map[string]struct{}) + for _, alloc := range allocs { + nodes[alloc.NodeID] = struct{}{} + } + + // For each node, check if it is now done + n.l.RLock() + var done []string + for node := range nodes { + draining, ok := n.nodes[node] + if !ok { + continue + } + + isDone, err := draining.IsDone() + if err != nil { + n.logger.Printf("[ERR] nomad.drain: checking if node %q is done draining: %v", node, err) + continue + } + + if !isDone { + continue + } + + done = append(done, node) + } + n.l.RUnlock() + + // Submit the node transistions in a sharded form to ensure a reasonable + // Raft transaction size. + for _, nodes := range partitionIds(done) { + if _, err := n.raft.NodesDrainComplete(nodes); err != nil { + n.logger.Printf("[ERR] nomad.drain: failed to unset drain for nodes: %v", err) + } + } +} + +// batchDrainAllocs is used to batch the draining of allocations. It will block +// until the batch is complete. +func (n *NodeDrainer) batchDrainAllocs(allocs []*structs.Allocation) (uint64, error) { + // Add this to the batch + n.batcher.Lock() + n.batcher.updates = append(n.batcher.updates, allocs...) + + // Start a new batch if none + future := n.batcher.updateFuture + if future == nil { + future = structs.NewBatchFuture() + n.batcher.updateFuture = future + n.batcher.updateTimer = time.AfterFunc(n.batcher.batchWindow, func() { + // Get the pending updates + n.batcher.Lock() + updates := n.batcher.updates + future := n.batcher.updateFuture + n.batcher.updates = nil + n.batcher.updateFuture = nil + n.batcher.updateTimer = nil + n.batcher.Unlock() + + // Perform the batch update + n.drainAllocs(future, updates) + }) + } + n.batcher.Unlock() + + if err := future.Wait(); err != nil { + return 0, err + } + + return future.Index(), nil +} + +// drainAllocs is a non batch, marking of the desired transition to migrate for +// the set of allocations. It will also create the necessary evaluations for the +// affected jobs. 
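// Concretely (a sketch of the flow below): every alloc gets DesiredTransition{Migrate: true},
// one pending node-drain evaluation is created per affected job, and the combined set is
// applied through partitionAllocDrain so that no single Raft transaction carries more
// than maxIdsPerTxn IDs.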
+func (n *NodeDrainer) drainAllocs(future *structs.BatchFuture, allocs []*structs.Allocation) { + // Compute the effected jobs and make the transition map + jobs := make(map[string]*structs.Allocation, 4) + transistions := make(map[string]*structs.DesiredTransition, len(allocs)) + for _, alloc := range allocs { + transistions[alloc.ID] = &structs.DesiredTransition{ + Migrate: helper.BoolToPtr(true), + } + jobs[alloc.JobID] = alloc + } + + evals := make([]*structs.Evaluation, 0, len(jobs)) + for job, alloc := range jobs { + evals = append(evals, &structs.Evaluation{ + ID: uuid.Generate(), + Namespace: alloc.Namespace, + Priority: alloc.Job.Priority, + Type: alloc.Job.Type, + TriggeredBy: structs.EvalTriggerNodeDrain, + JobID: job, + Status: structs.EvalStatusPending, + }) + } + + // Commit this update via Raft + var finalIndex uint64 + for _, u := range partitionAllocDrain(transistions, evals) { + index, err := n.raft.AllocUpdateDesiredTransition(u.Transistions, u.Evals) + if err != nil { + future.Respond(index, err) + } + finalIndex = index + } + + future.Respond(finalIndex, nil) +} diff --git a/nomad/drainer/drainer_util.go b/nomad/drainer/drainer_util.go new file mode 100644 index 000000000000..09d026235aa0 --- /dev/null +++ b/nomad/drainer/drainer_util.go @@ -0,0 +1,93 @@ +package drainer + +import ( + "github.com/hashicorp/nomad/nomad/structs" +) + +var ( + // maxIdsPerTxn is the maximum number of IDs that can be included in a + // single Raft transaction. This is to ensure that the Raft message does not + // become too large. + maxIdsPerTxn = (1024 * 256) / 36 // 0.25 MB of ids. +) + +// partitionIds takes a set of IDs and returns a partitioned view of them such +// that no batch would result in an overly large raft transaction. +func partitionIds(ids []string) [][]string { + index := 0 + total := len(ids) + var partitions [][]string + for remaining := total - index; remaining > 0; remaining = total - index { + if remaining < maxIdsPerTxn { + partitions = append(partitions, ids[index:]) + break + } else { + partitions = append(partitions, ids[index:index+maxIdsPerTxn]) + index += maxIdsPerTxn + } + } + + return partitions +} + +// transistionTuple is used to group desired transistions and evals +type transistionTuple struct { + Transistions map[string]*structs.DesiredTransition + Evals []*structs.Evaluation +} + +// partitionAllocDrain returns a list of alloc transistions and evals to apply +// in a single raft transaction.This is necessary to ensure that the Raft +// transaction does not become too large. 
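// For illustration, mirroring the unit test for this helper: with maxIdsPerTxn lowered
// to 2, three transitions {a, b, c} and three evals are split into three tuples:
//
//    tuple 1: transitions a, b          (budget exhausted, no evals)
//    tuple 2: transition  c + one eval
//    tuple 3: the two remaining evals
//
// Transitions are always packed before evals within each tuple's budget.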
+func partitionAllocDrain(transistions map[string]*structs.DesiredTransition, + evals []*structs.Evaluation) []*transistionTuple { + + // Determine a stable ordering of the transistioning allocs + allocs := make([]string, 0, len(transistions)) + for id := range transistions { + allocs = append(allocs, id) + } + + var requests []*transistionTuple + submittedEvals, submittedTrans := 0, 0 + for submittedEvals != len(evals) || submittedTrans != len(transistions) { + req := &transistionTuple{ + Transistions: make(map[string]*structs.DesiredTransition), + } + requests = append(requests, req) + available := maxIdsPerTxn + + // Add the allocs first + if remaining := len(allocs) - submittedTrans; remaining > 0 { + if remaining <= available { + for _, id := range allocs[submittedTrans:] { + req.Transistions[id] = transistions[id] + } + available -= remaining + submittedTrans += remaining + } else { + for _, id := range allocs[submittedTrans : submittedTrans+available] { + req.Transistions[id] = transistions[id] + } + submittedTrans += available + + // Exhausted space so skip adding evals + continue + } + + } + + // Add the evals + if remaining := len(evals) - submittedEvals; remaining > 0 { + if remaining <= available { + req.Evals = evals[submittedEvals:] + submittedEvals += remaining + } else { + req.Evals = evals[submittedEvals : submittedEvals+available] + submittedEvals += available + } + } + } + + return requests +} diff --git a/nomad/drainer/drainer_util_test.go b/nomad/drainer/drainer_util_test.go new file mode 100644 index 000000000000..ee2f4a79f508 --- /dev/null +++ b/nomad/drainer/drainer_util_test.go @@ -0,0 +1,54 @@ +package drainer + +import ( + "testing" + + "github.com/hashicorp/nomad/nomad/structs" + "github.com/stretchr/testify/require" +) + +func TestDrainer_PartitionAllocDrain(t *testing.T) { + // Set the max ids per reap to something lower. + old := maxIdsPerTxn + defer func() { maxIdsPerTxn = old }() + maxIdsPerTxn = 2 + + require := require.New(t) + transistions := map[string]*structs.DesiredTransition{"a": nil, "b": nil, "c": nil} + evals := []*structs.Evaluation{nil, nil, nil} + requests := partitionAllocDrain(transistions, evals) + require.Len(requests, 3) + + first := requests[0] + require.Len(first.Transistions, 2) + require.Len(first.Evals, 0) + + second := requests[1] + require.Len(second.Transistions, 1) + require.Len(second.Evals, 1) + + third := requests[2] + require.Len(third.Transistions, 0) + require.Len(third.Evals, 2) +} + +func TestDrainer_PartitionIds(t *testing.T) { + require := require.New(t) + + // Set the max ids per reap to something lower. 
+ old := maxIdsPerTxn + defer func() { maxIdsPerTxn = old }() + maxIdsPerTxn = 2 + + ids := []string{"1", "2", "3", "4", "5"} + requests := partitionIds(ids) + require.Len(requests, 3) + require.Len(requests[0], 2) + require.Len(requests[1], 2) + require.Len(requests[2], 1) + require.Equal(requests[0][0], ids[0]) + require.Equal(requests[0][1], ids[1]) + require.Equal(requests[1][0], ids[2]) + require.Equal(requests[1][1], ids[3]) + require.Equal(requests[2][0], ids[4]) +} diff --git a/nomad/drainer/draining_node.go b/nomad/drainer/draining_node.go new file mode 100644 index 000000000000..af5c094b8089 --- /dev/null +++ b/nomad/drainer/draining_node.go @@ -0,0 +1,154 @@ +package drainer + +import ( + "fmt" + "sync" + "time" + + "github.com/hashicorp/nomad/nomad/state" + "github.com/hashicorp/nomad/nomad/structs" +) + +type drainingNode struct { + state *state.StateStore + node *structs.Node + l sync.RWMutex +} + +func NewDrainingNode(node *structs.Node, state *state.StateStore) *drainingNode { + return &drainingNode{ + state: state, + node: node, + } +} + +func (n *drainingNode) GetNode() *structs.Node { + n.l.Lock() + defer n.l.Unlock() + return n.node +} + +func (n *drainingNode) Update(node *structs.Node) { + n.l.Lock() + defer n.l.Unlock() + n.node = node +} + +// DeadlineTime returns if the node has a deadline and if so what it is +func (n *drainingNode) DeadlineTime() (bool, time.Time) { + n.l.RLock() + defer n.l.RUnlock() + + // Should never happen + if n.node == nil || n.node.DrainStrategy == nil { + return false, time.Time{} + } + + return n.node.DrainStrategy.DeadlineTime() +} + +// IsDone returns if the node is done draining +func (n *drainingNode) IsDone() (bool, error) { + n.l.RLock() + defer n.l.RUnlock() + + // Should never happen + if n.node == nil || n.node.DrainStrategy == nil { + return false, fmt.Errorf("node doesn't have a drain strategy set") + } + + // Grab the relevant drain info + ignoreSystem := n.node.DrainStrategy.IgnoreSystemJobs + + // Retrieve the allocs on the node + allocs, err := n.state.AllocsByNode(nil, n.node.ID) + if err != nil { + return false, err + } + + for _, alloc := range allocs { + // Skip system if configured to + if alloc.Job.Type == structs.JobTypeSystem && ignoreSystem { + continue + } + + // If there is a non-terminal we aren't done + if !alloc.TerminalStatus() { + return false, nil + } + } + + return true, nil +} + +// TODO test that we return the right thing given the strategies +// DeadlineAllocs returns the set of allocations that should be drained given a +// node is at its deadline +func (n *drainingNode) DeadlineAllocs() ([]*structs.Allocation, error) { + n.l.RLock() + defer n.l.RUnlock() + + // Should never happen + if n.node == nil || n.node.DrainStrategy == nil { + return nil, fmt.Errorf("node doesn't have a drain strategy set") + } + + // Grab the relevant drain info + inf, _ := n.node.DrainStrategy.DeadlineTime() + if inf { + return nil, nil + } + ignoreSystem := n.node.DrainStrategy.IgnoreSystemJobs + + // Retrieve the allocs on the node + allocs, err := n.state.AllocsByNode(nil, n.node.ID) + if err != nil { + return nil, err + } + + var drain []*structs.Allocation + for _, alloc := range allocs { + // Nothing to do on a terminal allocation + if alloc.TerminalStatus() { + continue + } + + // Skip system if configured to + if alloc.Job.Type == structs.JobTypeSystem && ignoreSystem { + continue + } + + drain = append(drain, alloc) + } + + return drain, nil +} + +// RunningServices returns the set of jobs on the node +func (n 
*drainingNode) RunningServices() ([]structs.NamespacedID, error) { + n.l.RLock() + defer n.l.RUnlock() + + // Retrieve the allocs on the node + allocs, err := n.state.AllocsByNode(nil, n.node.ID) + if err != nil { + return nil, err + } + + jobIDs := make(map[structs.NamespacedID]struct{}) + var jobs []structs.NamespacedID + for _, alloc := range allocs { + if alloc.TerminalStatus() || alloc.Job.Type != structs.JobTypeService { + continue + } + + jns := structs.NamespacedID{Namespace: alloc.Namespace, ID: alloc.JobID} + if _, ok := jobIDs[jns]; ok { + continue + } + jobIDs[jns] = struct{}{} + jobs = append(jobs, jns) + } + + return jobs, nil +} diff --git a/nomad/drainer/watch_jobs.go b/nomad/drainer/watch_jobs.go new file mode 100644 index 000000000000..93232aeb40e9 --- /dev/null +++ b/nomad/drainer/watch_jobs.go @@ -0,0 +1,474 @@ +package drainer + +import ( + "context" + "fmt" + "log" + "sync" + "time" + + memdb "github.com/hashicorp/go-memdb" + "github.com/hashicorp/nomad/helper" + "github.com/hashicorp/nomad/nomad/state" + "github.com/hashicorp/nomad/nomad/structs" + "golang.org/x/time/rate" +) + +type DrainRequest struct { + Allocs []*structs.Allocation + Resp *structs.BatchFuture +} + +func NewDrainRequest(allocs []*structs.Allocation) *DrainRequest { + return &DrainRequest{ + Allocs: allocs, + Resp: structs.NewBatchFuture(), + } +} + +// DrainingJobWatcher is the interface for watching a job drain +type DrainingJobWatcher interface { + // RegisterJob is used to start watching a draining job + RegisterJobs(job []structs.NamespacedID) + + // Drain is used to emit allocations that should be drained. + Drain() <-chan *DrainRequest + + // Migrated is allocations for draining jobs that have transitioned to + // stop. There is no guarantee that duplicates won't be published. + Migrated() <-chan []*structs.Allocation +} + +// drainingJobWatcher is used to watch draining jobs and emit events when +// draining allocations have replacements +type drainingJobWatcher struct { + ctx context.Context + logger *log.Logger + + // state is the state that is watched for state changes. + state *state.StateStore + + // limiter is used to limit the rate of blocking queries + limiter *rate.Limiter + + // jobs is the set of tracked jobs. + jobs map[structs.NamespacedID]struct{} + + // queryCtx is used to cancel a blocking query. + queryCtx context.Context + queryCancel context.CancelFunc + + // drainCh and migratedCh are used to emit allocations + drainCh chan *DrainRequest + migratedCh chan []*structs.Allocation + + l sync.RWMutex +} + +// NewDrainingJobWatcher returns a new job watcher. The caller is expected to +// cancel the context to clean up the drainer. +func NewDrainingJobWatcher(ctx context.Context, limiter *rate.Limiter, state *state.StateStore, logger *log.Logger) *drainingJobWatcher { + + // Create a context that can cancel the blocking query so that when a new + // job gets registered it is handled. + queryCtx, queryCancel := context.WithCancel(ctx) + + w := &drainingJobWatcher{ + ctx: ctx, + queryCtx: queryCtx, + queryCancel: queryCancel, + limiter: limiter, + logger: logger, + state: state, + jobs: make(map[structs.NamespacedID]struct{}, 64), + drainCh: make(chan *DrainRequest), + migratedCh: make(chan []*structs.Allocation), + } + + go w.watch() + return w +} + +// RegisterJob marks the given job as draining and adds it to being watched. 
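// Registering cancels the in-flight blocking query, so the watch loop below re-runs
// immediately and begins emitting drains for the new jobs' allocations; jobs that are
// already tracked are skipped. An illustrative call:
//
//    w.RegisterJobs([]structs.NamespacedID{{Namespace: "default", ID: "example-job"}})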
+func (w *drainingJobWatcher) RegisterJobs(jobs []structs.NamespacedID) { + w.l.Lock() + defer w.l.Unlock() + + updated := false + for _, jns := range jobs { + if _, ok := w.jobs[jns]; ok { + continue + } + + // Add the job and cancel the context + w.logger.Printf("[TRACE] nomad.drain.job_watcher: registering job %v", jns) + w.jobs[jns] = struct{}{} + updated = true + } + + if updated { + w.queryCancel() + + // Create a new query context + w.queryCtx, w.queryCancel = context.WithCancel(w.ctx) + } +} + +// Drain returns the channel that emits allocations to drain. +func (w *drainingJobWatcher) Drain() <-chan *DrainRequest { + return w.drainCh +} + +// Migrated returns the channel that emits allocations for draining jobs that +// have been migrated. +func (w *drainingJobWatcher) Migrated() <-chan []*structs.Allocation { + return w.migratedCh +} + +// deregisterJob removes the job from being watched. +func (w *drainingJobWatcher) deregisterJob(jobID, namespace string) { + w.l.Lock() + defer w.l.Unlock() + jns := structs.NamespacedID{ + ID: jobID, + Namespace: namespace, + } + delete(w.jobs, jns) + w.logger.Printf("[TRACE] nomad.drain.job_watcher: deregistering job %v", jns) +} + +// watch is the long lived watching routine that detects job drain changes. +func (w *drainingJobWatcher) watch() { + waitIndex := uint64(1) + for { + w.logger.Printf("[TRACE] nomad.drain.job_watcher: getting job allocs at index %d", waitIndex) + jobAllocs, index, err := w.getJobAllocs(w.getQueryCtx(), waitIndex) + w.logger.Printf("[TRACE] nomad.drain.job_watcher: got job allocs %d at index %d: %v", len(jobAllocs), waitIndex, err) + if err != nil { + if err == context.Canceled { + // Determine if it is a cancel or a shutdown + select { + case <-w.ctx.Done(): + w.logger.Printf("[TRACE] nomad.drain.job_watcher: shutting down") + return + default: + // The query context was cancelled + continue + } + } + + w.logger.Printf("[ERR] nomad.drain.job_watcher: error watching job allocs updates at index %d: %v", waitIndex, err) + select { + case <-w.ctx.Done(): + w.logger.Printf("[TRACE] nomad.drain.job_watcher: shutting down") + return + case <-time.After(stateReadErrorDelay): + continue + } + } + + lastHandled := waitIndex + waitIndex = index + + // Snapshot the state store + snap, err := w.state.Snapshot() + if err != nil { + w.logger.Printf("[WARN] nomad.drain.job_watcher: failed to snapshot statestore: %v", err) + continue + } + + currentJobs := w.drainingJobs() + var allDrain, allMigrated []*structs.Allocation + for jns, allocs := range jobAllocs { + // Check if the job is still registered + if _, ok := currentJobs[jns]; !ok { + w.logger.Printf("[TRACE] nomad.drain.job_watcher: skipping job %v as it is no longer registered for draining", jns) + continue + } + + w.logger.Printf("[TRACE] nomad.drain.job_watcher: handling job %v", jns) + + // Lookup the job + job, err := snap.JobByID(nil, jns.Namespace, jns.ID) + if err != nil { + w.logger.Printf("[WARN] nomad.drain.job_watcher: failed to lookup job %v: %v", jns, err) + continue + } + + // Ignore purged jobs + if job == nil { + w.logger.Printf("[TRACE] nomad.drain.job_watcher: ignoring garbage collected job %q", jns) + w.deregisterJob(jns.ID, jns.Namespace) + continue + } + + // Ignore all non-service jobs + if job.Type != structs.JobTypeService { + w.deregisterJob(job.ID, job.Namespace) + continue + } + + result, err := handleJob(snap, job, allocs, lastHandled) + if err != nil { + w.logger.Printf("[ERR] nomad.drain.job_watcher: handling drain for job %v failed: %v", jns, 
err) + continue + } + + w.logger.Printf("[TRACE] nomad.drain.job_watcher: result for job %v: %v", jns, result) + + allDrain = append(allDrain, result.drain...) + allMigrated = append(allMigrated, result.migrated...) + + // Stop tracking this job + if result.done { + w.deregisterJob(job.ID, job.Namespace) + } + } + + if len(allDrain) != 0 { + // Create the request + req := NewDrainRequest(allDrain) + w.logger.Printf("[TRACE] nomad.drain.job_watcher: sending drain request for %d allocs", len(allDrain)) + + select { + case w.drainCh <- req: + case <-w.ctx.Done(): + w.logger.Printf("[TRACE] nomad.drain.job_watcher: shutting down") + return + } + + // Wait for the request to be committed + select { + case <-req.Resp.WaitCh(): + case <-w.ctx.Done(): + w.logger.Printf("[TRACE] nomad.drain.job_watcher: shutting down") + return + } + + // See if it successfully committed + if err := req.Resp.Error(); err != nil { + w.logger.Printf("[ERR] nomad.drain.job_watcher: failed to transition allocations: %v", err) + } + + // Wait until the new index + if index := req.Resp.Index(); index > waitIndex { + waitIndex = index + } + } + + if len(allMigrated) != 0 { + w.logger.Printf("[TRACE] nomad.drain.job_watcher: sending migrated for %d allocs", len(allMigrated)) + select { + case w.migratedCh <- allMigrated: + case <-w.ctx.Done(): + w.logger.Printf("[TRACE] nomad.drain.job_watcher: shutting down") + return + } + } + } +} + +// jobResult is the set of actions to take for a draining job given its current +// state. +type jobResult struct { + // drain is the set of allocations to emit for draining. + drain []*structs.Allocation + + // migrated is the set of allocations to emit as migrated + migrated []*structs.Allocation + + // done marks whether the job has been fully drained. + done bool +} + +// newJobResult returns a jobResult with done=true. It is the responsibility of +// callers to set done=false when a remaining drainable alloc is found. +func newJobResult() *jobResult { + return &jobResult{ + done: true, + } +} + +func (r *jobResult) String() string { + return fmt.Sprintf("Drain %d ; Migrate %d ; Done %v", len(r.drain), len(r.migrated), r.done) +} + +// handleJob takes the state of a draining job and returns the desired actions. +func handleJob(snap *state.StateSnapshot, job *structs.Job, allocs []*structs.Allocation, lastHandledIndex uint64) (*jobResult, error) { + r := newJobResult() + taskGroups := make(map[string]*structs.TaskGroup, len(job.TaskGroups)) + for _, tg := range job.TaskGroups { + if tg.Migrate != nil { + // TODO handle the upgrade path + // Only capture the groups that have a migrate strategy + taskGroups[tg.Name] = tg + } + } + + // Sort the allocations by TG + tgAllocs := make(map[string][]*structs.Allocation, len(taskGroups)) + for _, alloc := range allocs { + if _, ok := taskGroups[alloc.TaskGroup]; !ok { + continue + } + + tgAllocs[alloc.TaskGroup] = append(tgAllocs[alloc.TaskGroup], alloc) + } + + for name, tg := range taskGroups { + allocs := tgAllocs[name] + if err := handleTaskGroup(snap, tg, allocs, lastHandledIndex, r); err != nil { + return nil, fmt.Errorf("drain for task group %q failed: %v", name, err) + } + } + + return r, nil +} + +// handleTaskGroup takes the state of a draining task group and computes the desired actions. 
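+//
+// The batch size follows the group's migrate strategy: numToDrain is
+// healthy - (count - max_parallel), capped at the number of drainable
+// allocations. For example, with count=10 and max_parallel=3, 10 healthy
+// allocations allow 3 drains, 8 healthy allow 1, and 7 or fewer allow none.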
+func handleTaskGroup(snap *state.StateSnapshot, tg *structs.TaskGroup, + allocs []*structs.Allocation, lastHandledIndex uint64, result *jobResult) error { + + // Determine how many allocations can be drained + drainingNodes := make(map[string]bool, 4) + healthy := 0 + remainingDrainingAlloc := false + var drainable []*structs.Allocation + + for _, alloc := range allocs { + // Check if the alloc is on a draining node. + onDrainingNode, ok := drainingNodes[alloc.NodeID] + if !ok { + // Look up the node + node, err := snap.NodeByID(nil, alloc.NodeID) + if err != nil { + return err + } + + onDrainingNode = node.DrainStrategy != nil + drainingNodes[node.ID] = onDrainingNode + } + + // Check if the alloc should be considered migrated. A migrated + // allocation is one that is terminal, is on a draining + // allocation, and has only happened since our last handled index to + // avoid emitting many duplicate migrate events. + if alloc.TerminalStatus() && + onDrainingNode && + alloc.ModifyIndex > lastHandledIndex { + result.migrated = append(result.migrated, alloc) + continue + } + + // If the alloc is running and has its deployment status set, it is + // considered healthy from a migration standpoint. + if !alloc.TerminalStatus() && + alloc.DeploymentStatus != nil && + alloc.DeploymentStatus.Healthy != nil { + healthy++ + } + + // An alloc can't be considered for migration if: + // - It isn't on a draining node + // - It is already terminal + if !onDrainingNode || alloc.TerminalStatus() { + continue + } + + // Capture the fact that there is an allocation that is still draining + // for this job. + remainingDrainingAlloc = true + + // If we haven't marked this allocation for migration already, capture + // it as eligible for draining. + if !alloc.DesiredTransition.ShouldMigrate() { + drainable = append(drainable, alloc) + } + } + + // Update the done status + if remainingDrainingAlloc { + result.done = false + } + + // Determine how many we can drain + thresholdCount := tg.Count - tg.Migrate.MaxParallel + numToDrain := healthy - thresholdCount + numToDrain = helper.IntMin(len(drainable), numToDrain) + if numToDrain <= 0 { + return nil + } + + result.drain = append(result.drain, drainable[0:numToDrain]...) + return nil +} + +// getJobAllocs returns all allocations for draining jobs +func (w *drainingJobWatcher) getJobAllocs(ctx context.Context, minIndex uint64) (map[structs.NamespacedID][]*structs.Allocation, uint64, error) { + if err := w.limiter.Wait(ctx); err != nil { + return nil, 0, err + } + + resp, index, err := w.state.BlockingQuery(w.getJobAllocsImpl, minIndex, ctx) + if err != nil { + return nil, 0, err + } + if resp == nil { + return nil, index, nil + } + + return resp.(map[structs.NamespacedID][]*structs.Allocation), index, nil +} + +// getJobAllocsImpl returns a map of draining jobs to their allocations. +func (w *drainingJobWatcher) getJobAllocsImpl(ws memdb.WatchSet, state *state.StateStore) (interface{}, uint64, error) { + index, err := state.Index("allocs") + if err != nil { + return nil, 0, err + } + + // Capture the draining jobs. + draining := w.drainingJobs() + l := len(draining) + if l == 0 { + return nil, index, nil + } + + // Capture the allocs for each draining job. 
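+	// Passing the memdb watch set into AllocsByJob registers interest in these
+	// allocations, so the enclosing blocking query wakes when any of them change.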
+ resp := make(map[structs.NamespacedID][]*structs.Allocation, l) + for jns := range draining { + allocs, err := state.AllocsByJob(ws, jns.Namespace, jns.ID, false) + if err != nil { + return nil, index, err + } + + resp[jns] = allocs + } + + return resp, index, nil +} + +// drainingJobs captures the set of draining jobs. +func (w *drainingJobWatcher) drainingJobs() map[structs.NamespacedID]struct{} { + w.l.RLock() + defer w.l.RUnlock() + + l := len(w.jobs) + if l == 0 { + return nil + } + + draining := make(map[structs.NamespacedID]struct{}, l) + for k := range w.jobs { + draining[k] = struct{}{} + } + + return draining +} + +// getQueryCtx is a helper for getting the query context. +func (w *drainingJobWatcher) getQueryCtx() context.Context { + w.l.RLock() + defer w.l.RUnlock() + return w.queryCtx +} diff --git a/nomad/drainer/watch_jobs_test.go b/nomad/drainer/watch_jobs_test.go new file mode 100644 index 000000000000..be90ed13d42f --- /dev/null +++ b/nomad/drainer/watch_jobs_test.go @@ -0,0 +1,618 @@ +package drainer + +import ( + "context" + "testing" + "time" + + "github.com/hashicorp/nomad/helper" + "github.com/hashicorp/nomad/helper/testlog" + "github.com/hashicorp/nomad/nomad/mock" + "github.com/hashicorp/nomad/nomad/state" + "github.com/hashicorp/nomad/nomad/structs" + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" + "golang.org/x/time/rate" +) + +func testNodes(t *testing.T, state *state.StateStore) (drainingNode, runningNode *structs.Node) { + n1 := mock.Node() + n1.Name = "draining" + n1.DrainStrategy = &structs.DrainStrategy{ + DrainSpec: structs.DrainSpec{ + Deadline: time.Minute, + }, + ForceDeadline: time.Now().Add(time.Minute), + } + require.Nil(t, state.UpsertNode(100, n1)) + + // Create a non-draining node + n2 := mock.Node() + n2.Name = "running" + require.Nil(t, state.UpsertNode(101, n2)) + return n1, n2 +} + +func testDrainingJobWatcher(t *testing.T, state *state.StateStore) (*drainingJobWatcher, context.CancelFunc) { + t.Helper() + + limiter := rate.NewLimiter(100.0, 100) + logger := testlog.Logger(t) + ctx, cancel := context.WithCancel(context.Background()) + w := NewDrainingJobWatcher(ctx, limiter, state, logger) + return w, cancel +} + +// TestDrainingJobWatcher_Interface is a compile-time assertion that we +// implement the intended interface. +func TestDrainingJobWatcher_Interface(t *testing.T) { + w, cancel := testDrainingJobWatcher(t, state.TestStateStore(t)) + cancel() + var _ DrainingJobWatcher = w +} + +// asertJobWatcherOps asserts a certain number of allocs are drained and/or +// migrated by the job watcher. 
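+// It returns the last DrainRequest and migrated allocations it received so
+// callers can acknowledge the drain, e.g. drains.Resp.Respond(index, nil),
+// before driving the next step of the test.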
+func assertJobWatcherOps(t *testing.T, jw DrainingJobWatcher, drained, migrated int) ( + *DrainRequest, []*structs.Allocation) { + t.Helper() + var ( + drains *DrainRequest + migrations []*structs.Allocation + drainsChecked, migrationsChecked bool + ) + for { + select { + case drains = <-jw.Drain(): + ids := make([]string, len(drains.Allocs)) + for i, a := range drains.Allocs { + ids[i] = a.JobID[:6] + ":" + a.ID[:6] + } + t.Logf("draining %d allocs: %v", len(ids), ids) + require.False(t, drainsChecked, "drains already received") + drainsChecked = true + require.Lenf(t, drains.Allocs, drained, + "expected %d drains but found %d", drained, len(drains.Allocs)) + case migrations = <-jw.Migrated(): + ids := make([]string, len(migrations)) + for i, a := range migrations { + ids[i] = a.JobID[:6] + ":" + a.ID[:6] + } + t.Logf("migrating %d allocs: %v", len(ids), ids) + require.False(t, migrationsChecked, "migrations already received") + migrationsChecked = true + require.Lenf(t, migrations, migrated, + "expected %d migrations but found %d", migrated, len(migrations)) + case <-time.After(10 * time.Millisecond): + if !drainsChecked && drained > 0 { + t.Fatalf("expected %d drains but none happened", drained) + } + if !migrationsChecked && migrated > 0 { + t.Fatalf("expected %d migrations but none happened", migrated) + } + return drains, migrations + } + } +} + +// TestDrainingJobWatcher_DrainJobs asserts DrainingJobWatcher batches +// allocation changes from multiple jobs. +func TestDrainingJobWatcher_DrainJobs(t *testing.T) { + t.Parallel() + require := require.New(t) + + state := state.TestStateStore(t) + jobWatcher, cancelWatcher := testDrainingJobWatcher(t, state) + defer cancelWatcher() + drainingNode, runningNode := testNodes(t, state) + + var index uint64 = 101 + count := 8 + + newAlloc := func(node *structs.Node, job *structs.Job) *structs.Allocation { + a := mock.Alloc() + a.JobID = job.ID + a.Job = job + a.TaskGroup = job.TaskGroups[0].Name + a.NodeID = node.ID + return a + } + + // 2 jobs with count 10, max parallel 3 + jnss := make([]structs.NamespacedID, 2) + jobs := make([]*structs.Job, 2) + for i := 0; i < 2; i++ { + job := mock.Job() + jobs[i] = job + jnss[i] = structs.NamespacedID{Namespace: job.Namespace, ID: job.ID} + job.TaskGroups[0].Migrate.MaxParallel = 3 + job.TaskGroups[0].Count = count + require.Nil(state.UpsertJob(index, job)) + index++ + + var allocs []*structs.Allocation + for i := 0; i < count; i++ { + a := newAlloc(drainingNode, job) + a.DeploymentStatus = &structs.AllocDeploymentStatus{ + Healthy: helper.BoolToPtr(true), + } + allocs = append(allocs, a) + } + + require.Nil(state.UpsertAllocs(index, allocs)) + index++ + + } + + // Only register jobs with watcher after creating all data models as + // once the watcher starts we need to track the index carefully for + // updating the batch future + jobWatcher.RegisterJobs(jnss) + + // Expect a first batch of MaxParallel allocs from each job + drains, _ := assertJobWatcherOps(t, jobWatcher, 6, 0) + + // Fake migrating the drained allocs by starting new ones and stopping + // the old ones + drainedAllocs := make([]*structs.Allocation, len(drains.Allocs)) + for i, a := range drains.Allocs { + a.DesiredTransition.Migrate = helper.BoolToPtr(true) + + // create a copy so we can reuse this slice + drainedAllocs[i] = a.Copy() + } + require.Nil(state.UpsertAllocs(index, drainedAllocs)) + drains.Resp.Respond(index, nil) + index++ + + // Just setting ShouldMigrate should not cause any further drains + assertJobWatcherOps(t, 
jobWatcher, 0, 0) + + // Proceed our fake migration along by creating new allocs and stopping + // old ones + replacements := make([]*structs.Allocation, len(drainedAllocs)) + updates := make([]*structs.Allocation, 0, len(drainedAllocs)*2) + for i, a := range drainedAllocs { + // Stop drained allocs + a.DesiredTransition.Migrate = nil + a.DesiredStatus = structs.AllocDesiredStatusStop + + // Create a replacement + replacement := mock.Alloc() + replacement.JobID = a.Job.ID + replacement.Job = a.Job + replacement.TaskGroup = a.TaskGroup + replacement.NodeID = runningNode.ID + // start in pending state with no health status + + updates = append(updates, a, replacement) + replacements[i] = replacement.Copy() + } + require.Nil(state.UpsertAllocs(index, updates)) + index++ + + // The drained allocs stopping cause migrations but no new drains + // because the replacements have not started + assertJobWatcherOps(t, jobWatcher, 0, 6) + + // Finally kickoff further drain activity by "starting" replacements + for _, a := range replacements { + a.ClientStatus = structs.AllocClientStatusRunning + a.DeploymentStatus = &structs.AllocDeploymentStatus{ + Healthy: helper.BoolToPtr(true), + } + } + require.Nil(state.UpsertAllocs(index, replacements)) + index++ + + require.NotEmpty(jobWatcher.drainingJobs()) + + // 6 new drains + drains, _ = assertJobWatcherOps(t, jobWatcher, 6, 0) + + // Fake migrations once more to finish the drain + drainedAllocs = make([]*structs.Allocation, len(drains.Allocs)) + for i, a := range drains.Allocs { + a.DesiredTransition.Migrate = helper.BoolToPtr(true) + + // create a copy so we can reuse this slice + drainedAllocs[i] = a.Copy() + } + require.Nil(state.UpsertAllocs(index, drainedAllocs)) + drains.Resp.Respond(index, nil) + index++ + + assertJobWatcherOps(t, jobWatcher, 0, 0) + + replacements = make([]*structs.Allocation, len(drainedAllocs)) + updates = make([]*structs.Allocation, 0, len(drainedAllocs)*2) + for i, a := range drainedAllocs { + a.DesiredTransition.Migrate = nil + a.DesiredStatus = structs.AllocDesiredStatusStop + + replacement := newAlloc(runningNode, a.Job) + updates = append(updates, a, replacement) + replacements[i] = replacement.Copy() + } + require.Nil(state.UpsertAllocs(index, updates)) + index++ + + assertJobWatcherOps(t, jobWatcher, 0, 6) + + for _, a := range replacements { + a.ClientStatus = structs.AllocClientStatusRunning + a.DeploymentStatus = &structs.AllocDeploymentStatus{ + Healthy: helper.BoolToPtr(true), + } + } + require.Nil(state.UpsertAllocs(index, replacements)) + index++ + + require.NotEmpty(jobWatcher.drainingJobs()) + + // Final 4 new drains + drains, _ = assertJobWatcherOps(t, jobWatcher, 4, 0) + + // Fake migrations once more to finish the drain + drainedAllocs = make([]*structs.Allocation, len(drains.Allocs)) + for i, a := range drains.Allocs { + a.DesiredTransition.Migrate = helper.BoolToPtr(true) + + // create a copy so we can reuse this slice + drainedAllocs[i] = a.Copy() + } + require.Nil(state.UpsertAllocs(index, drainedAllocs)) + drains.Resp.Respond(index, nil) + index++ + + assertJobWatcherOps(t, jobWatcher, 0, 0) + + replacements = make([]*structs.Allocation, len(drainedAllocs)) + updates = make([]*structs.Allocation, 0, len(drainedAllocs)*2) + for i, a := range drainedAllocs { + a.DesiredTransition.Migrate = nil + a.DesiredStatus = structs.AllocDesiredStatusStop + + replacement := newAlloc(runningNode, a.Job) + updates = append(updates, a, replacement) + replacements[i] = replacement.Copy() + } + 
require.Nil(state.UpsertAllocs(index, updates)) + index++ + + assertJobWatcherOps(t, jobWatcher, 0, 4) + + for _, a := range replacements { + a.ClientStatus = structs.AllocClientStatusRunning + a.DeploymentStatus = &structs.AllocDeploymentStatus{ + Healthy: helper.BoolToPtr(true), + } + } + require.Nil(state.UpsertAllocs(index, replacements)) + + // No jobs should be left! + require.Empty(jobWatcher.drainingJobs()) +} + +// DrainingJobWatcher tests: +// TODO Test that the watcher cancels its query when a new job is registered + +// handleTaskGroupTestCase is the test case struct for TestHandleTaskGroup +// +// Two nodes will be initialized: one draining and one running. +type handleTaskGroupTestCase struct { + // Name of test + Name string + + // Expectations + ExpectedDrained int + ExpectedMigrated int + ExpectedDone bool + + // Count overrides the default count of 10 if set + Count int + + // MaxParallel overrides the default max_parallel of 1 if set + MaxParallel int + + // AddAlloc will be called 10 times to create test allocs + // + // Allocs default to be healthy on the draining node + AddAlloc func(i int, a *structs.Allocation, drainingID, runningID string) +} + +func TestHandeTaskGroup_Table(t *testing.T) { + cases := []handleTaskGroupTestCase{ + { + // All allocs on draining node + Name: "AllDraining", + ExpectedDrained: 1, + ExpectedMigrated: 0, + ExpectedDone: false, + }, + { + // All allocs on non-draining node + Name: "AllNonDraining", + ExpectedDrained: 0, + ExpectedMigrated: 0, + ExpectedDone: true, + AddAlloc: func(i int, a *structs.Allocation, drainingID, runningID string) { + a.NodeID = runningID + }, + }, + { + // Some allocs on non-draining node but not healthy + Name: "SomeNonDrainingUnhealthy", + ExpectedDrained: 0, + ExpectedMigrated: 0, + ExpectedDone: false, + AddAlloc: func(i int, a *structs.Allocation, drainingID, runningID string) { + if i%2 == 0 { + a.NodeID = runningID + a.DeploymentStatus = nil + } + }, + }, + { + // One draining, other allocs on non-draining node and healthy + Name: "OneDraining", + ExpectedDrained: 1, + ExpectedMigrated: 0, + ExpectedDone: false, + AddAlloc: func(i int, a *structs.Allocation, drainingID, runningID string) { + if i != 0 { + a.NodeID = runningID + } + }, + }, + { + // One already draining, other allocs on non-draining node and healthy + Name: "OneAlreadyDraining", + ExpectedDrained: 0, + ExpectedMigrated: 0, + ExpectedDone: false, + AddAlloc: func(i int, a *structs.Allocation, drainingID, runningID string) { + if i == 0 { + a.DesiredTransition.Migrate = helper.BoolToPtr(true) + return + } + a.NodeID = runningID + }, + }, + { + // One already drained, other allocs on non-draining node and healthy + Name: "OneAlreadyDrained", + ExpectedDrained: 0, + ExpectedMigrated: 1, + ExpectedDone: true, + AddAlloc: func(i int, a *structs.Allocation, drainingID, runningID string) { + if i == 0 { + a.DesiredStatus = structs.AllocDesiredStatusStop + return + } + a.NodeID = runningID + }, + }, + { + // All allocs are terminl, nothing to be drained + Name: "AllMigrating", + ExpectedDrained: 0, + ExpectedMigrated: 10, + ExpectedDone: true, + AddAlloc: func(i int, a *structs.Allocation, drainingID, runningID string) { + a.DesiredStatus = structs.AllocDesiredStatusStop + }, + }, + { + // All allocs may be drained at once + Name: "AllAtOnce", + ExpectedDrained: 10, + ExpectedMigrated: 0, + ExpectedDone: false, + MaxParallel: 10, + }, + { + // Drain 2 + Name: "Drain2", + ExpectedDrained: 2, + ExpectedMigrated: 0, + ExpectedDone: false, + 
MaxParallel: 2, + }, + { + // One on new node, one drained, and one draining + ExpectedDrained: 1, + ExpectedMigrated: 1, + MaxParallel: 2, + AddAlloc: func(i int, a *structs.Allocation, drainingID, runningID string) { + switch i { + case 0: + // One alloc on running node + a.NodeID = runningID + case 1: + // One alloc already migrated + a.DesiredStatus = structs.AllocDesiredStatusStop + } + }, + }, + { + // 8 on new node, one drained, and one draining + ExpectedDrained: 1, + ExpectedMigrated: 1, + MaxParallel: 2, + AddAlloc: func(i int, a *structs.Allocation, drainingID, runningID string) { + switch i { + case 0, 1, 2, 3, 4, 5, 6, 7: + a.NodeID = runningID + case 8: + a.DesiredStatus = structs.AllocDesiredStatusStop + } + }, + }, + { + // 5 on new node, two drained, and three draining + ExpectedDrained: 3, + ExpectedMigrated: 2, + MaxParallel: 5, + AddAlloc: func(i int, a *structs.Allocation, drainingID, runningID string) { + switch i { + case 0, 1, 2, 3, 4: + a.NodeID = runningID + case 8, 9: + a.DesiredStatus = structs.AllocDesiredStatusStop + } + }, + }, + { + // Not all on new node have health set + Name: "PendingHealth", + ExpectedDrained: 1, + ExpectedMigrated: 1, + MaxParallel: 3, + AddAlloc: func(i int, a *structs.Allocation, drainingID, runningID string) { + switch i { + case 0: + // Deployment status UNset for 1 on new node + a.NodeID = runningID + a.DeploymentStatus = nil + case 1, 2, 3, 4: + // Deployment status set for 4 on new node + a.NodeID = runningID + case 9: + a.DesiredStatus = structs.AllocDesiredStatusStop + } + }, + }, + { + // 5 max parallel - 1 migrating - 2 with unset health = 2 drainable + Name: "PendingHealthHigherMax", + ExpectedDrained: 2, + ExpectedMigrated: 1, + MaxParallel: 5, + AddAlloc: func(i int, a *structs.Allocation, drainingID, runningID string) { + switch i { + case 0, 1: + // Deployment status UNset for 2 on new node + a.NodeID = runningID + a.DeploymentStatus = nil + case 2, 3, 4: + // Deployment status set for 3 on new node + a.NodeID = runningID + case 9: + a.DesiredStatus = structs.AllocDesiredStatusStop + } + }, + }, + } + + for _, testCase := range cases { + t.Run(testCase.Name, func(t *testing.T) { + testHandleTaskGroup(t, testCase) + }) + } +} + +func testHandleTaskGroup(t *testing.T, tc handleTaskGroupTestCase) { + t.Parallel() + require := require.New(t) + assert := assert.New(t) + + // Create nodes + state := state.TestStateStore(t) + drainingNode, runningNode := testNodes(t, state) + + job := mock.Job() + job.TaskGroups[0].Count = 10 + if tc.Count > 0 { + job.TaskGroups[0].Count = tc.Count + } + if tc.MaxParallel > 0 { + job.TaskGroups[0].Migrate.MaxParallel = tc.MaxParallel + } + require.Nil(state.UpsertJob(102, job)) + + var allocs []*structs.Allocation + for i := 0; i < 10; i++ { + a := mock.Alloc() + a.JobID = job.ID + a.Job = job + a.TaskGroup = job.TaskGroups[0].Name + + // Default to being healthy on the draining node + a.NodeID = drainingNode.ID + a.DeploymentStatus = &structs.AllocDeploymentStatus{ + Healthy: helper.BoolToPtr(true), + } + if tc.AddAlloc != nil { + tc.AddAlloc(i, a, drainingNode.ID, runningNode.ID) + } + allocs = append(allocs, a) + } + + require.Nil(state.UpsertAllocs(103, allocs)) + snap, err := state.Snapshot() + require.Nil(err) + + res := newJobResult() + require.Nil(handleTaskGroup(snap, job.TaskGroups[0], allocs, 102, res)) + assert.Lenf(res.drain, tc.ExpectedDrained, "Drain expected %d but found: %d", + tc.ExpectedDrained, len(res.drain)) + assert.Lenf(res.migrated, tc.ExpectedMigrated, "Migrate 
expected %d but found: %d", + tc.ExpectedMigrated, len(res.migrated)) + assert.Equal(tc.ExpectedDone, res.done) +} + +func TestHandleTaskGroup_Migrations(t *testing.T) { + t.Parallel() + require := require.New(t) + + // Create a draining node + state := state.TestStateStore(t) + n := mock.Node() + n.DrainStrategy = &structs.DrainStrategy{ + DrainSpec: structs.DrainSpec{ + Deadline: 5 * time.Minute, + }, + ForceDeadline: time.Now().Add(1 * time.Minute), + } + require.Nil(state.UpsertNode(100, n)) + + job := mock.Job() + require.Nil(state.UpsertJob(101, job)) + + // Create 10 done allocs + var allocs []*structs.Allocation + for i := 0; i < 10; i++ { + a := mock.Alloc() + a.Job = job + a.TaskGroup = job.TaskGroups[0].Name + a.NodeID = n.ID + a.DeploymentStatus = &structs.AllocDeploymentStatus{ + Healthy: helper.BoolToPtr(false), + } + + if i%2 == 0 { + a.DesiredStatus = structs.AllocDesiredStatusStop + } else { + a.ClientStatus = structs.AllocClientStatusFailed + } + allocs = append(allocs, a) + } + require.Nil(state.UpsertAllocs(102, allocs)) + + snap, err := state.Snapshot() + require.Nil(err) + + // Handle before and after indexes + res := newJobResult() + require.Nil(handleTaskGroup(snap, job.TaskGroups[0], allocs, 101, res)) + require.Empty(res.drain) + require.Len(res.migrated, 10) + require.True(res.done) + + res = newJobResult() + require.Nil(handleTaskGroup(snap, job.TaskGroups[0], allocs, 103, res)) + require.Empty(res.drain) + require.Empty(res.migrated) + require.True(res.done) +} diff --git a/nomad/drainer/watch_nodes.go b/nomad/drainer/watch_nodes.go new file mode 100644 index 000000000000..97c6cf8b24ce --- /dev/null +++ b/nomad/drainer/watch_nodes.go @@ -0,0 +1,231 @@ +package drainer + +import ( + "context" + "log" + "time" + + memdb "github.com/hashicorp/go-memdb" + "github.com/hashicorp/nomad/nomad/state" + "github.com/hashicorp/nomad/nomad/structs" + "golang.org/x/time/rate" +) + +// DrainingNodeWatcher is the interface for watching for draining nodes. +type DrainingNodeWatcher interface{} + +// TrackedNodes returns the set of tracked nodes +func (n *NodeDrainer) TrackedNodes() map[string]*structs.Node { + n.l.RLock() + defer n.l.RUnlock() + + t := make(map[string]*structs.Node, len(n.nodes)) + for n, d := range n.nodes { + t[n] = d.GetNode() + } + + return t +} + +// Remove removes the given node from being tracked +func (n *NodeDrainer) Remove(nodeID string) { + n.l.Lock() + defer n.l.Unlock() + + // TODO test the notifier is updated + // Remove it from being tracked and remove it from the dealiner + delete(n.nodes, nodeID) + n.deadlineNotifier.Remove(nodeID) +} + +// Update updates the node, either updating the tracked version or starting to +// track the node. +func (n *NodeDrainer) Update(node *structs.Node) { + n.l.Lock() + defer n.l.Unlock() + + if node == nil { + return + } + + draining, ok := n.nodes[node.ID] + if !ok { + draining = NewDrainingNode(node, n.state) + n.nodes[node.ID] = draining + } else { + // Update it + draining.Update(node) + } + + // TODO test the notifier is updated + if inf, deadline := node.DrainStrategy.DeadlineTime(); !inf { + n.deadlineNotifier.Watch(node.ID, deadline) + } else { + // There is an infinite deadline so it shouldn't be tracked for + // deadlining + n.deadlineNotifier.Remove(node.ID) + } + + // TODO Test this + // Register interest in the draining jobs. 
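+	// RunningServices only returns service jobs that still have non-terminal
+	// allocations on this node, so only those jobs are handed to the job watcher.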
+ jobs, err := draining.RunningServices() + if err != nil { + n.logger.Printf("[ERR] nomad.drain: error retrieving services on node %q: %v", node.ID, err) + return + } + n.logger.Printf("[TRACE] nomad.drain: node %q has %d services on it", node.ID, len(jobs)) + n.jobWatcher.RegisterJobs(jobs) + + // TODO Test at this layer as well that a node drain on a node without + // allocs immediately gets unmarked as draining + // Check if the node is done such that if an operator drains a node with + // nothing on it we unset drain + done, err := draining.IsDone() + if err != nil { + n.logger.Printf("[ERR] nomad.drain: failed to check if node %q is done draining: %v", node.ID, err) + return + } + + if done { + index, err := n.raft.NodesDrainComplete([]string{node.ID}) + if err != nil { + n.logger.Printf("[ERR] nomad.drain: failed to unset drain for node %q: %v", node.ID, err) + } else { + n.logger.Printf("[INFO] nomad.drain: node %q completed draining at index %d", node.ID, index) + } + } +} + +// nodeDrainWatcher is used to watch nodes that are entering, leaving or +// changing their drain strategy. +type nodeDrainWatcher struct { + ctx context.Context + logger *log.Logger + + // state is the state that is watched for state changes. + state *state.StateStore + + // limiter is used to limit the rate of blocking queries + limiter *rate.Limiter + + // tracker is the object that is tracking the nodes and provides us with the + // needed callbacks + tracker NodeTracker +} + +// NewNodeDrainWatcher returns a new node drain watcher. +func NewNodeDrainWatcher(ctx context.Context, limiter *rate.Limiter, state *state.StateStore, logger *log.Logger, tracker NodeTracker) *nodeDrainWatcher { + w := &nodeDrainWatcher{ + ctx: ctx, + limiter: limiter, + logger: logger, + tracker: tracker, + state: state, + } + + go w.watch() + return w +} + +// watch is the long lived watching routine that detects node changes. 
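+//
+// Each pass compares the blocking-query results against the tracker's view:
+// newly draining nodes are tracked, nodes whose drain was lifted or that no
+// longer exist are removed, and nodes whose drain strategy changed are updated.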
+func (w *nodeDrainWatcher) watch() { + nindex := uint64(1) + for { + w.logger.Printf("[TRACE] nomad.drain.node_watcher: getting nodes at index %d", nindex) + nodes, index, err := w.getNodes(nindex) + w.logger.Printf("[TRACE] nomad.drain.node_watcher: got nodes %d at index %d: %v", len(nodes), nindex, err) + if err != nil { + if err == context.Canceled { + w.logger.Printf("[TRACE] nomad.drain.node_watcher: shutting down") + return + } + + w.logger.Printf("[ERR] nomad.drain.node_watcher: error watching node updates at index %d: %v", nindex, err) + select { + case <-w.ctx.Done(): + w.logger.Printf("[TRACE] nomad.drain.node_watcher: shutting down") + return + case <-time.After(stateReadErrorDelay): + continue + } + } + + // update index for next run + nindex = index + + tracked := w.tracker.TrackedNodes() + for nodeID, node := range nodes { + newDraining := node.DrainStrategy != nil + currentNode, tracked := tracked[nodeID] + + switch { + // If the node is tracked but not draining, untrack + case tracked && !newDraining: + w.logger.Printf("[TRACE] nomad.drain.node_watcher: tracked node %q is no longer draining", nodeID) + w.tracker.Remove(nodeID) + + // If the node is not being tracked but is draining, track + case !tracked && newDraining: + w.logger.Printf("[TRACE] nomad.drain.node_watcher: untracked node %q is draining", nodeID) + w.tracker.Update(node) + + // If the node is being tracked but has changed, update: + case tracked && newDraining && !currentNode.DrainStrategy.Equal(node.DrainStrategy): + w.logger.Printf("[TRACE] nomad.drain.node_watcher: tracked node %q has updated drain", nodeID) + w.tracker.Update(node) + default: + w.logger.Printf("[TRACE] nomad.drain.node_watcher: node %q at index %v: tracked %v, draining %v", nodeID, node.ModifyIndex, tracked, newDraining) + } + + // TODO(schmichael) handle the case of a lost node + } + + for nodeID := range tracked { + if _, ok := nodes[nodeID]; !ok { + w.logger.Printf("[TRACE] nomad.drain.node_watcher: tracked node %q is no longer exists", nodeID) + w.tracker.Remove(nodeID) + } + } + } +} + +// getNodes returns all nodes blocking until the nodes are after the given index. +func (w *nodeDrainWatcher) getNodes(minIndex uint64) (map[string]*structs.Node, uint64, error) { + if err := w.limiter.Wait(w.ctx); err != nil { + return nil, 0, err + } + + resp, index, err := w.state.BlockingQuery(w.getNodesImpl, minIndex, w.ctx) + if err != nil { + return nil, 0, err + } + + return resp.(map[string]*structs.Node), index, nil +} + +// getNodesImpl is used to get nodes from the state store, returning the set of +// nodes and the given index. 
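+// Every node is returned, not just draining ones, so the caller can also
+// observe drains being lifted and nodes being deleted.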
+func (w *nodeDrainWatcher) getNodesImpl(ws memdb.WatchSet, state *state.StateStore) (interface{}, uint64, error) { + iter, err := state.Nodes(ws) + if err != nil { + return nil, 0, err + } + + index, err := state.Index("nodes") + if err != nil { + return nil, 0, err + } + + resp := make(map[string]*structs.Node, 64) + for { + raw := iter.Next() + if raw == nil { + break + } + + node := raw.(*structs.Node) + resp[node.ID] = node + } + + return resp, index, nil +} diff --git a/nomad/drainer/watch_nodes_test.go b/nomad/drainer/watch_nodes_test.go new file mode 100644 index 000000000000..476c7a39bb50 --- /dev/null +++ b/nomad/drainer/watch_nodes_test.go @@ -0,0 +1,190 @@ +package drainer + +import ( + "context" + "testing" + "time" + + "github.com/hashicorp/nomad/helper/testlog" + "github.com/hashicorp/nomad/nomad/mock" + "github.com/hashicorp/nomad/nomad/state" + "github.com/hashicorp/nomad/nomad/structs" + "github.com/hashicorp/nomad/testutil" + "github.com/stretchr/testify/require" + "golang.org/x/time/rate" +) + +func testNodeDrainWatcher(t *testing.T) (*nodeDrainWatcher, *state.StateStore, *MockNodeTracker) { + t.Helper() + + sconfig := &state.StateStoreConfig{ + LogOutput: testlog.NewWriter(t), + Region: "global", + } + state, err := state.NewStateStore(sconfig) + if err != nil { + t.Fatalf("failed to create state store: %v", err) + } + + limiter := rate.NewLimiter(100.0, 100) + logger := testlog.Logger(t) + m := NewMockNodeTracker() + w := NewNodeDrainWatcher(context.Background(), limiter, state, logger, m) + return w, state, m +} + +func TestNodeDrainWatcher_Interface(t *testing.T) { + t.Parallel() + require := require.New(t) + w, _, _ := testNodeDrainWatcher(t) + require.Implements((*DrainingNodeWatcher)(nil), w) +} + +func TestNodeDrainWatcher_AddDraining(t *testing.T) { + t.Parallel() + require := require.New(t) + _, state, m := testNodeDrainWatcher(t) + + // Create two nodes, one draining and one not draining + n1, n2 := mock.Node(), mock.Node() + n2.DrainStrategy = &structs.DrainStrategy{ + DrainSpec: structs.DrainSpec{ + Deadline: time.Hour, + }, + ForceDeadline: time.Now().Add(time.Hour), + } + + require.Nil(state.UpsertNode(100, n1)) + require.Nil(state.UpsertNode(101, n2)) + + testutil.WaitForResult(func() (bool, error) { + return len(m.Events) == 1, nil + }, func(err error) { + t.Fatal("No node drain events") + }) + + tracked := m.TrackedNodes() + require.NotContains(tracked, n1.ID) + require.Contains(tracked, n2.ID) + require.Equal(n2, tracked[n2.ID]) + +} + +func TestNodeDrainWatcher_Remove(t *testing.T) { + t.Parallel() + require := require.New(t) + _, state, m := testNodeDrainWatcher(t) + + // Create a draining node + n := mock.Node() + n.DrainStrategy = &structs.DrainStrategy{ + DrainSpec: structs.DrainSpec{ + Deadline: time.Hour, + }, + ForceDeadline: time.Now().Add(time.Hour), + } + + // Wait for it to be tracked + require.Nil(state.UpsertNode(100, n)) + testutil.WaitForResult(func() (bool, error) { + return len(m.Events) == 1, nil + }, func(err error) { + t.Fatal("No node drain events") + }) + + tracked := m.TrackedNodes() + require.Contains(tracked, n.ID) + require.Equal(n, tracked[n.ID]) + + // Change the node to be not draining and wait for it to be untracked + require.Nil(state.UpdateNodeDrain(101, n.ID, nil, false)) + testutil.WaitForResult(func() (bool, error) { + return len(m.Events) == 2, nil + }, func(err error) { + t.Fatal("No new node drain events") + }) + + tracked = m.TrackedNodes() + require.NotContains(tracked, n.ID) +} + +func 
TestNodeDrainWatcher_Remove_Nonexistent(t *testing.T) { + t.Parallel() + require := require.New(t) + _, state, m := testNodeDrainWatcher(t) + + // Create a draining node + n := mock.Node() + n.DrainStrategy = &structs.DrainStrategy{ + DrainSpec: structs.DrainSpec{ + Deadline: time.Hour, + }, + ForceDeadline: time.Now().Add(time.Hour), + } + + // Wait for it to be tracked + require.Nil(state.UpsertNode(100, n)) + testutil.WaitForResult(func() (bool, error) { + return len(m.Events) == 1, nil + }, func(err error) { + t.Fatal("No node drain events") + }) + + tracked := m.TrackedNodes() + require.Contains(tracked, n.ID) + require.Equal(n, tracked[n.ID]) + + // Delete the node + require.Nil(state.DeleteNode(101, n.ID)) + testutil.WaitForResult(func() (bool, error) { + return len(m.Events) == 2, nil + }, func(err error) { + t.Fatal("No new node drain events") + }) + + tracked = m.TrackedNodes() + require.NotContains(tracked, n.ID) +} + +func TestNodeDrainWatcher_Update(t *testing.T) { + t.Parallel() + require := require.New(t) + _, state, m := testNodeDrainWatcher(t) + + // Create a draining node + n := mock.Node() + n.DrainStrategy = &structs.DrainStrategy{ + DrainSpec: structs.DrainSpec{ + Deadline: time.Hour, + }, + ForceDeadline: time.Now().Add(time.Hour), + } + + // Wait for it to be tracked + require.Nil(state.UpsertNode(100, n)) + testutil.WaitForResult(func() (bool, error) { + return len(m.Events) == 1, nil + }, func(err error) { + t.Fatal("No node drain events") + }) + + tracked := m.TrackedNodes() + require.Contains(tracked, n.ID) + require.Equal(n, tracked[n.ID]) + + // Change the node to have a new spec + s2 := n.DrainStrategy.Copy() + s2.Deadline += time.Hour + require.Nil(state.UpdateNodeDrain(101, n.ID, s2, false)) + + // Wait for it to be updated + testutil.WaitForResult(func() (bool, error) { + return len(m.Events) == 2, nil + }, func(err error) { + t.Fatal("No new node drain events") + }) + + tracked = m.TrackedNodes() + require.Contains(tracked, n.ID) + require.Equal(s2, tracked[n.ID].DrainStrategy) +} diff --git a/nomad/drainer_int_test.go b/nomad/drainer_int_test.go new file mode 100644 index 000000000000..f71363a0d03b --- /dev/null +++ b/nomad/drainer_int_test.go @@ -0,0 +1,325 @@ +package nomad + +import ( + "context" + "fmt" + "log" + "net/rpc" + "testing" + "time" + + memdb "github.com/hashicorp/go-memdb" + msgpackrpc "github.com/hashicorp/net-rpc-msgpackrpc" + "github.com/hashicorp/nomad/helper" + "github.com/hashicorp/nomad/nomad/mock" + "github.com/hashicorp/nomad/nomad/state" + "github.com/hashicorp/nomad/nomad/structs" + "github.com/hashicorp/nomad/testutil" + "github.com/stretchr/testify/require" +) + +func allocPromoter(t *testing.T, ctx context.Context, + state *state.StateStore, codec rpc.ClientCodec, nodeID string, + logger *log.Logger) { + t.Helper() + + nindex := uint64(1) + for { + allocs, index, err := getNodeAllocs(ctx, state, nodeID, nindex) + if err != nil { + if err == context.Canceled { + return + } + + t.Fatalf("failed to get node allocs: %v", err) + } + nindex = index + + // For each alloc that doesn't have its deployment status set, set it + var updates []*structs.Allocation + for _, alloc := range allocs { + if alloc.DeploymentStatus != nil && alloc.DeploymentStatus.Healthy != nil { + continue + } + + newAlloc := alloc.Copy() + newAlloc.DeploymentStatus = &structs.AllocDeploymentStatus{ + Healthy: helper.BoolToPtr(true), + } + updates = append(updates, newAlloc) + logger.Printf("Marked deployment health for alloc %q", alloc.ID) + } + + if 
len(updates) == 0 { + continue + } + + // Send the update + req := &structs.AllocUpdateRequest{ + Alloc: updates, + WriteRequest: structs.WriteRequest{Region: "global"}, + } + var resp structs.NodeAllocsResponse + if err := msgpackrpc.CallWithCodec(codec, "Node.UpdateAlloc", req, &resp); err != nil { + if ctx.Err() == context.Canceled { + return + } else { + require.Nil(t, err) + } + } + } +} + +func getNodeAllocs(ctx context.Context, state *state.StateStore, nodeID string, index uint64) ([]*structs.Allocation, uint64, error) { + resp, index, err := state.BlockingQuery(getNodeAllocsImpl(nodeID), index, ctx) + if err != nil { + return nil, 0, err + } + if err := ctx.Err(); err != nil { + return nil, 0, err + } + + return resp.([]*structs.Allocation), index, nil +} + +func getNodeAllocsImpl(nodeID string) func(ws memdb.WatchSet, state *state.StateStore) (interface{}, uint64, error) { + return func(ws memdb.WatchSet, state *state.StateStore) (interface{}, uint64, error) { + // Capture all the allocations + allocs, err := state.AllocsByNode(ws, nodeID) + if err != nil { + return nil, 0, err + } + + // Use the last index that affected the jobs table + index, err := state.Index("allocs") + if err != nil { + return nil, index, err + } + + return allocs, index, nil + } +} + +func TestDrainer_Simple_ServiceOnly(t *testing.T) { + t.Parallel() + require := require.New(t) + s1 := TestServer(t, nil) + defer s1.Shutdown() + codec := rpcClient(t, s1) + testutil.WaitForLeader(t, s1.RPC) + + // Create two nodes + n1, n2 := mock.Node(), mock.Node() + nodeReg := &structs.NodeRegisterRequest{ + Node: n1, + WriteRequest: structs.WriteRequest{Region: "global"}, + } + var nodeResp structs.NodeUpdateResponse + require.Nil(msgpackrpc.CallWithCodec(codec, "Node.Register", nodeReg, &nodeResp)) + + // Create a job that runs on just one + job := mock.Job() + job.TaskGroups[0].Count = 2 + req := &structs.JobRegisterRequest{ + Job: job, + WriteRequest: structs.WriteRequest{ + Region: "global", + Namespace: job.Namespace, + }, + } + + // Fetch the response + var resp structs.JobRegisterResponse + require.Nil(msgpackrpc.CallWithCodec(codec, "Job.Register", req, &resp)) + require.NotZero(resp.Index) + + // Wait for the two allocations to be placed + state := s1.State() + testutil.WaitForResult(func() (bool, error) { + allocs, err := state.AllocsByJob(nil, job.Namespace, job.ID, false) + if err != nil { + return false, err + } + return len(allocs) == 2, fmt.Errorf("got %d allocs", len(allocs)) + }, func(err error) { + t.Fatalf("err: %v", err) + }) + + // Create the second node + nodeReg = &structs.NodeRegisterRequest{ + Node: n2, + WriteRequest: structs.WriteRequest{Region: "global"}, + } + require.Nil(msgpackrpc.CallWithCodec(codec, "Node.Register", nodeReg, &nodeResp)) + + // Drain the first node + drainReq := &structs.NodeUpdateDrainRequest{ + NodeID: n1.ID, + DrainStrategy: &structs.DrainStrategy{ + DrainSpec: structs.DrainSpec{ + Deadline: 10 * time.Minute, + }, + }, + WriteRequest: structs.WriteRequest{Region: "global"}, + } + var drainResp structs.NodeDrainUpdateResponse + require.Nil(msgpackrpc.CallWithCodec(codec, "Node.UpdateDrain", drainReq, &drainResp)) + + // Wait for the allocs to be replaced + ctx, cancel := context.WithCancel(context.Background()) + defer cancel() + go allocPromoter(t, ctx, state, codec, n1.ID, s1.logger) + go allocPromoter(t, ctx, state, codec, n2.ID, s1.logger) + + testutil.WaitForResult(func() (bool, error) { + allocs, err := state.AllocsByNode(nil, n2.ID) + if err != nil { + return 
false, err + } + return len(allocs) == 2, fmt.Errorf("got %d allocs", len(allocs)) + }, func(err error) { + t.Fatalf("err: %v", err) + }) + + // Check that the node drain is removed + testutil.WaitForResult(func() (bool, error) { + node, err := state.NodeByID(nil, n1.ID) + if err != nil { + return false, err + } + return node.DrainStrategy == nil, fmt.Errorf("has drain strategy still set") + }, func(err error) { + t.Fatalf("err: %v", err) + }) +} + +func TestDrainer_Simple_ServiceOnly_Deadline(t *testing.T) { + t.Parallel() + require := require.New(t) + s1 := TestServer(t, nil) + defer s1.Shutdown() + codec := rpcClient(t, s1) + testutil.WaitForLeader(t, s1.RPC) + + // Create a node + n1 := mock.Node() + nodeReg := &structs.NodeRegisterRequest{ + Node: n1, + WriteRequest: structs.WriteRequest{Region: "global"}, + } + var nodeResp structs.NodeUpdateResponse + require.Nil(msgpackrpc.CallWithCodec(codec, "Node.Register", nodeReg, &nodeResp)) + + // Create a job that runs on just one + job := mock.Job() + job.Update = *structs.DefaultUpdateStrategy + job.Update.Stagger = 30 * time.Second + job.TaskGroups[0].Count = 2 + req := &structs.JobRegisterRequest{ + Job: job, + WriteRequest: structs.WriteRequest{ + Region: "global", + Namespace: job.Namespace, + }, + } + + // Fetch the response + var resp structs.JobRegisterResponse + require.Nil(msgpackrpc.CallWithCodec(codec, "Job.Register", req, &resp)) + require.NotZero(resp.Index) + + // Wait for the two allocations to be placed + state := s1.State() + testutil.WaitForResult(func() (bool, error) { + allocs, err := state.AllocsByJob(nil, job.Namespace, job.ID, false) + if err != nil { + return false, err + } + return len(allocs) == 2, fmt.Errorf("got %d allocs", len(allocs)) + }, func(err error) { + t.Fatalf("err: %v", err) + }) + + // Drain the node + drainReq := &structs.NodeUpdateDrainRequest{ + NodeID: n1.ID, + DrainStrategy: &structs.DrainStrategy{ + DrainSpec: structs.DrainSpec{ + Deadline: 1 * time.Second, + }, + }, + WriteRequest: structs.WriteRequest{Region: "global"}, + } + var drainResp structs.NodeDrainUpdateResponse + require.Nil(msgpackrpc.CallWithCodec(codec, "Node.UpdateDrain", drainReq, &drainResp)) + + // Wait for the allocs to be stopped + testutil.WaitForResult(func() (bool, error) { + allocs, err := state.AllocsByNode(nil, n1.ID) + if err != nil { + return false, err + } + for _, alloc := range allocs { + if alloc.DesiredStatus != structs.AllocDesiredStatusStop { + return false, fmt.Errorf("got desired status %v", alloc.DesiredStatus) + } + } + return true, nil + }, func(err error) { + t.Fatalf("err: %v", err) + }) + + // Check that the node drain is removed + testutil.WaitForResult(func() (bool, error) { + node, err := state.NodeByID(nil, n1.ID) + if err != nil { + return false, err + } + return node.DrainStrategy == nil, fmt.Errorf("has drain strategy still set") + }, func(err error) { + t.Fatalf("err: %v", err) + }) +} + +func TestDrainer_DrainEmptyNode(t *testing.T) { + t.Parallel() + require := require.New(t) + s1 := TestServer(t, nil) + defer s1.Shutdown() + codec := rpcClient(t, s1) + testutil.WaitForLeader(t, s1.RPC) + + // Create a node + n1 := mock.Node() + nodeReg := &structs.NodeRegisterRequest{ + Node: n1, + WriteRequest: structs.WriteRequest{Region: "global"}, + } + var nodeResp structs.NodeUpdateResponse + require.Nil(msgpackrpc.CallWithCodec(codec, "Node.Register", nodeReg, &nodeResp)) + + // Drain the node + drainReq := &structs.NodeUpdateDrainRequest{ + NodeID: n1.ID, + DrainStrategy: &structs.DrainStrategy{ + 
DrainSpec: structs.DrainSpec{ + Deadline: 10 * time.Minute, + }, + }, + WriteRequest: structs.WriteRequest{Region: "global"}, + } + var drainResp structs.NodeDrainUpdateResponse + require.Nil(msgpackrpc.CallWithCodec(codec, "Node.UpdateDrain", drainReq, &drainResp)) + + // Check that the node drain is removed + state := s1.State() + testutil.WaitForResult(func() (bool, error) { + node, err := state.NodeByID(nil, n1.ID) + if err != nil { + return false, err + } + return node.DrainStrategy == nil, fmt.Errorf("has drain strategy still set") + }, func(err error) { + t.Fatalf("err: %v", err) + }) +} diff --git a/nomad/drainer_shims.go b/nomad/drainer_shims.go new file mode 100644 index 000000000000..0eb8c43a27b3 --- /dev/null +++ b/nomad/drainer_shims.go @@ -0,0 +1,47 @@ +package nomad + +import "github.com/hashicorp/nomad/nomad/structs" + +// drainerShim implements the drainer.RaftApplier interface required by the +// NodeDrainer. +type drainerShim struct { + s *Server +} + +func (d drainerShim) NodesDrainComplete(nodes []string) (uint64, error) { + args := &structs.BatchNodeUpdateDrainRequest{ + Updates: make(map[string]*structs.DrainUpdate, len(nodes)), + WriteRequest: structs.WriteRequest{Region: d.s.config.Region}, + } + + update := &structs.DrainUpdate{} + for _, node := range nodes { + args.Updates[node] = update + } + + resp, index, err := d.s.raftApply(structs.BatchNodeUpdateDrainRequestType, args) + return d.convertApplyErrors(resp, index, err) +} + +func (d drainerShim) AllocUpdateDesiredTransition(allocs map[string]*structs.DesiredTransition, evals []*structs.Evaluation) (uint64, error) { + args := &structs.AllocUpdateDesiredTransitionRequest{ + Allocs: allocs, + Evals: evals, + WriteRequest: structs.WriteRequest{Region: d.s.config.Region}, + } + resp, index, err := d.s.raftApply(structs.AllocUpdateDesiredTransitionRequestType, args) + return d.convertApplyErrors(resp, index, err) +} + +// convertApplyErrors parses the results of a raftApply and returns the index at +// which it was applied and any error that occurred. Raft Apply returns two +// separate errors, Raft library errors and user returned errors from the FSM. +// This helper, joins the errors by inspecting the applyResponse for an error. +func (d drainerShim) convertApplyErrors(applyResp interface{}, index uint64, err error) (uint64, error) { + if applyResp != nil { + if fsmErr, ok := applyResp.(error); ok && fsmErr != nil { + return index, fsmErr + } + } + return index, err +} diff --git a/nomad/fsm.go b/nomad/fsm.go index 21a785b6750f..afe726eede39 100644 --- a/nomad/fsm.go +++ b/nomad/fsm.go @@ -240,6 +240,12 @@ func (n *nomadFSM) Apply(log *raft.Log) interface{} { return n.applyUpsertNodeEvent(buf[1:], log.Index) case structs.JobBatchDeregisterRequestType: return n.applyBatchDeregisterJob(buf[1:], log.Index) + case structs.AllocUpdateDesiredTransitionRequestType: + return n.applyAllocUpdateDesiredTransition(buf[1:], log.Index) + case structs.NodeUpdateEligibilityRequestType: + return n.applyNodeEligibilityUpdate(buf[1:], log.Index) + case structs.BatchNodeUpdateDrainRequestType: + return n.applyBatchDrainUpdate(buf[1:], log.Index) } // Check enterprise only message types. 
@@ -326,13 +332,56 @@ func (n *nomadFSM) applyDrainUpdate(buf []byte, index uint64) interface{} { panic(fmt.Errorf("failed to decode request: %v", err)) } - if err := n.state.UpdateNodeDrain(index, req.NodeID, req.Drain); err != nil { + if err := n.state.UpdateNodeDrain(index, req.NodeID, req.DrainStrategy, req.MarkEligible); err != nil { n.logger.Printf("[ERR] nomad.fsm: UpdateNodeDrain failed: %v", err) return err } return nil } +func (n *nomadFSM) applyBatchDrainUpdate(buf []byte, index uint64) interface{} { + defer metrics.MeasureSince([]string{"nomad", "fsm", "batch_node_drain_update"}, time.Now()) + var req structs.BatchNodeUpdateDrainRequest + if err := structs.Decode(buf, &req); err != nil { + panic(fmt.Errorf("failed to decode request: %v", err)) + } + + if err := n.state.BatchUpdateNodeDrain(index, req.Updates); err != nil { + n.logger.Printf("[ERR] nomad.fsm: BatchUpdateNodeDrain failed: %v", err) + return err + } + return nil +} + +func (n *nomadFSM) applyNodeEligibilityUpdate(buf []byte, index uint64) interface{} { + defer metrics.MeasureSince([]string{"nomad", "fsm", "node_eligibility_update"}, time.Now()) + var req structs.NodeUpdateEligibilityRequest + if err := structs.Decode(buf, &req); err != nil { + panic(fmt.Errorf("failed to decode request: %v", err)) + } + + // Lookup the existing node + node, err := n.state.NodeByID(nil, req.NodeID) + if err != nil { + n.logger.Printf("[ERR] nomad.fsm: UpdateNodeEligibility failed to lookup node %q: %v", req.NodeID, err) + return err + } + + if err := n.state.UpdateNodeEligibility(index, req.NodeID, req.Eligibility); err != nil { + n.logger.Printf("[ERR] nomad.fsm: UpdateNodeEligibility failed: %v", err) + return err + } + + // Unblock evals for the nodes computed node class if it is in a ready + // state. + if node != nil && node.SchedulingEligibility == structs.NodeSchedulingIneligible && + req.Eligibility == structs.NodeSchedulingEligible { + n.blockedEvals.Unblock(node.ComputedClass, index) + } + + return nil +} + func (n *nomadFSM) applyUpsertJob(buf []byte, index uint64) interface{} { defer metrics.MeasureSince([]string{"nomad", "fsm", "register_job"}, time.Now()) var req structs.JobRegisterRequest @@ -651,6 +700,27 @@ func (n *nomadFSM) applyAllocClientUpdate(buf []byte, index uint64) interface{} return nil } +// applyAllocUpdateDesiredTransition is used to update the desired transitions +// of a set of allocations. 
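+//
+// A request typically pairs the transition map with evaluations for the
+// affected jobs, for example (allocID and eval are illustrative):
+//
+//	structs.AllocUpdateDesiredTransitionRequest{
+//		Allocs: map[string]*structs.DesiredTransition{
+//			allocID: {Migrate: helper.BoolToPtr(true)},
+//		},
+//		Evals: []*structs.Evaluation{eval},
+//	}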
+func (n *nomadFSM) applyAllocUpdateDesiredTransition(buf []byte, index uint64) interface{} { + defer metrics.MeasureSince([]string{"nomad", "fsm", "alloc_update_desired_transition"}, time.Now()) + var req structs.AllocUpdateDesiredTransitionRequest + if err := structs.Decode(buf, &req); err != nil { + panic(fmt.Errorf("failed to decode request: %v", err)) + } + + if err := n.state.UpdateAllocsDesiredTransitions(index, req.Allocs, req.Evals); err != nil { + n.logger.Printf("[ERR] nomad.fsm: UpdateAllocsDesiredTransitions failed: %v", err) + return err + } + + if err := n.upsertEvals(index, req.Evals); err != nil { + n.logger.Printf("[ERR] nomad.fsm: AllocUpdateDesiredTransition failed to upsert %d eval(s): %v", len(req.Evals), err) + return err + } + return nil +} + // applyReconcileSummaries reconciles summaries for all the jobs func (n *nomadFSM) applyReconcileSummaries(buf []byte, index uint64) interface{} { if err := n.state.ReconcileJobSummaries(index); err != nil { diff --git a/nomad/fsm_test.go b/nomad/fsm_test.go index 5c2ed08cb112..ed8cf2df5944 100644 --- a/nomad/fsm_test.go +++ b/nomad/fsm_test.go @@ -12,6 +12,7 @@ import ( "github.com/google/go-cmp/cmp" memdb "github.com/hashicorp/go-memdb" "github.com/hashicorp/nomad/helper" + "github.com/hashicorp/nomad/helper/uuid" "github.com/hashicorp/nomad/nomad/mock" "github.com/hashicorp/nomad/nomad/state" "github.com/hashicorp/nomad/nomad/structs" @@ -277,8 +278,9 @@ func TestFSM_UpdateNodeStatus(t *testing.T) { }) } -func TestFSM_UpdateNodeDrain(t *testing.T) { +func TestFSM_BatchUpdateNodeDrain(t *testing.T) { t.Parallel() + require := require.New(t) fsm := testFSM(t) node := mock.Node() @@ -286,38 +288,188 @@ func TestFSM_UpdateNodeDrain(t *testing.T) { Node: node, } buf, err := structs.Encode(structs.NodeRegisterRequestType, req) - if err != nil { - t.Fatalf("err: %v", err) - } + require.Nil(err) resp := fsm.Apply(makeLog(buf)) - if resp != nil { - t.Fatalf("resp: %v", resp) + require.Nil(resp) + + strategy := &structs.DrainStrategy{ + DrainSpec: structs.DrainSpec{ + Deadline: 10 * time.Second, + }, + } + req2 := structs.BatchNodeUpdateDrainRequest{ + Updates: map[string]*structs.DrainUpdate{ + node.ID: { + DrainStrategy: strategy, + }, + }, + } + buf, err = structs.Encode(structs.BatchNodeUpdateDrainRequestType, req2) + require.Nil(err) + + resp = fsm.Apply(makeLog(buf)) + require.Nil(resp) + + // Verify drain is set + ws := memdb.NewWatchSet() + node, err = fsm.State().NodeByID(ws, req.Node.ID) + require.Nil(err) + require.True(node.Drain) + require.Equal(node.DrainStrategy, strategy) +} + +func TestFSM_UpdateNodeDrain(t *testing.T) { + t.Parallel() + require := require.New(t) + fsm := testFSM(t) + + node := mock.Node() + req := structs.NodeRegisterRequest{ + Node: node, } + buf, err := structs.Encode(structs.NodeRegisterRequestType, req) + require.Nil(err) + resp := fsm.Apply(makeLog(buf)) + require.Nil(resp) + + strategy := &structs.DrainStrategy{ + DrainSpec: structs.DrainSpec{ + Deadline: 10 * time.Second, + }, + } req2 := structs.NodeUpdateDrainRequest{ - NodeID: node.ID, - Drain: true, + NodeID: node.ID, + DrainStrategy: strategy, } buf, err = structs.Encode(structs.NodeUpdateDrainRequestType, req2) - if err != nil { - t.Fatalf("err: %v", err) - } + require.Nil(err) resp = fsm.Apply(makeLog(buf)) - if resp != nil { - t.Fatalf("resp: %v", resp) - } + require.Nil(resp) // Verify we are NOT registered ws := memdb.NewWatchSet() node, err = fsm.State().NodeByID(ws, req.Node.ID) - if err != nil { - t.Fatalf("err: %v", err) + 
require.Nil(err) + require.True(node.Drain) + require.Equal(node.DrainStrategy, strategy) +} + +func TestFSM_UpdateNodeEligibility(t *testing.T) { + t.Parallel() + require := require.New(t) + fsm := testFSM(t) + + node := mock.Node() + req := structs.NodeRegisterRequest{ + Node: node, } - if !node.Drain { - t.Fatalf("bad node: %#v", node) + buf, err := structs.Encode(structs.NodeRegisterRequestType, req) + require.Nil(err) + + resp := fsm.Apply(makeLog(buf)) + require.Nil(resp) + + // Set the eligibility + req2 := structs.NodeUpdateEligibilityRequest{ + NodeID: node.ID, + Eligibility: structs.NodeSchedulingIneligible, + } + buf, err = structs.Encode(structs.NodeUpdateEligibilityRequestType, req2) + require.Nil(err) + + resp = fsm.Apply(makeLog(buf)) + require.Nil(resp) + + // Lookup the node and check + node, err = fsm.State().NodeByID(nil, req.Node.ID) + require.Nil(err) + require.Equal(node.SchedulingEligibility, structs.NodeSchedulingIneligible) + + // Update the drain + strategy := &structs.DrainStrategy{ + DrainSpec: structs.DrainSpec{ + Deadline: 10 * time.Second, + }, + } + req3 := structs.NodeUpdateDrainRequest{ + NodeID: node.ID, + DrainStrategy: strategy, + } + buf, err = structs.Encode(structs.NodeUpdateDrainRequestType, req3) + require.Nil(err) + resp = fsm.Apply(makeLog(buf)) + require.Nil(resp) + + // Try forcing eligibility + req4 := structs.NodeUpdateEligibilityRequest{ + NodeID: node.ID, + Eligibility: structs.NodeSchedulingEligible, + } + buf, err = structs.Encode(structs.NodeUpdateEligibilityRequestType, req4) + require.Nil(err) + + resp = fsm.Apply(makeLog(buf)) + require.NotNil(resp) + err, ok := resp.(error) + require.True(ok) + require.Contains(err.Error(), "draining") +} + +func TestFSM_UpdateNodeEligibility_Unblock(t *testing.T) { + t.Parallel() + require := require.New(t) + fsm := testFSM(t) + + node := mock.Node() + req := structs.NodeRegisterRequest{ + Node: node, + } + buf, err := structs.Encode(structs.NodeRegisterRequestType, req) + require.Nil(err) + + resp := fsm.Apply(makeLog(buf)) + require.Nil(resp) + + // Set the eligibility + req2 := structs.NodeUpdateEligibilityRequest{ + NodeID: node.ID, + Eligibility: structs.NodeSchedulingIneligible, + } + buf, err = structs.Encode(structs.NodeUpdateEligibilityRequestType, req2) + require.Nil(err) + + resp = fsm.Apply(makeLog(buf)) + require.Nil(resp) + + // Mark an eval as blocked. + eval := mock.Eval() + eval.ClassEligibility = map[string]bool{node.ComputedClass: true} + fsm.blockedEvals.Block(eval) + + // Set eligible + req4 := structs.NodeUpdateEligibilityRequest{ + NodeID: node.ID, + Eligibility: structs.NodeSchedulingEligible, } + buf, err = structs.Encode(structs.NodeUpdateEligibilityRequestType, req4) + require.Nil(err) + + resp = fsm.Apply(makeLog(buf)) + require.Nil(resp) + + // Verify the eval was unblocked. 
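+	// The unblock happens asynchronously in the blocked evals tracker, so poll
+	// the stats rather than asserting immediately.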
+ testutil.WaitForResult(func() (bool, error) { + bStats := fsm.blockedEvals.Stats() + if bStats.TotalBlocked != 0 { + return false, fmt.Errorf("bad: %#v", bStats) + } + return true, nil + }, func(err error) { + t.Fatalf("err: %s", err) + }) } func TestFSM_RegisterJob(t *testing.T) { @@ -1241,6 +1393,63 @@ func TestFSM_UpdateAllocFromClient(t *testing.T) { require.Equal(eval, res) } +func TestFSM_UpdateAllocDesiredTransition(t *testing.T) { + t.Parallel() + fsm := testFSM(t) + state := fsm.State() + require := require.New(t) + + alloc := mock.Alloc() + alloc2 := mock.Alloc() + alloc2.Job = alloc.Job + alloc2.JobID = alloc.JobID + state.UpsertJobSummary(9, mock.JobSummary(alloc.JobID)) + state.UpsertAllocs(10, []*structs.Allocation{alloc, alloc2}) + + t1 := &structs.DesiredTransition{ + Migrate: helper.BoolToPtr(true), + } + + eval := &structs.Evaluation{ + ID: uuid.Generate(), + Namespace: alloc.Namespace, + Priority: alloc.Job.Priority, + Type: alloc.Job.Type, + TriggeredBy: structs.EvalTriggerNodeDrain, + JobID: alloc.Job.ID, + JobModifyIndex: alloc.Job.ModifyIndex, + Status: structs.EvalStatusPending, + } + req := structs.AllocUpdateDesiredTransitionRequest{ + Allocs: map[string]*structs.DesiredTransition{ + alloc.ID: t1, + alloc2.ID: t1, + }, + Evals: []*structs.Evaluation{eval}, + } + buf, err := structs.Encode(structs.AllocUpdateDesiredTransitionRequestType, req) + require.Nil(err) + + resp := fsm.Apply(makeLog(buf)) + require.Nil(resp) + + // Verify we are registered + ws := memdb.NewWatchSet() + out1, err := fsm.State().AllocByID(ws, alloc.ID) + require.Nil(err) + out2, err := fsm.State().AllocByID(ws, alloc2.ID) + require.Nil(err) + evalOut, err := fsm.State().EvalByID(ws, eval.ID) + require.Nil(err) + require.NotNil(evalOut) + require.Equal(eval.ID, evalOut.ID) + + require.NotNil(out1.DesiredTransition.Migrate) + require.NotNil(out2.DesiredTransition.Migrate) + require.True(*out1.DesiredTransition.Migrate) + require.True(*out2.DesiredTransition.Migrate) +} + func TestFSM_UpsertVaultAccessor(t *testing.T) { t.Parallel() fsm := testFSM(t) diff --git a/nomad/job_endpoint_test.go b/nomad/job_endpoint_test.go index 7d0d0e770831..9182cc872754 100644 --- a/nomad/job_endpoint_test.go +++ b/nomad/job_endpoint_test.go @@ -421,8 +421,7 @@ func TestJobEndpoint_Register_ParameterizedJob(t *testing.T) { testutil.WaitForLeader(t, s1.RPC) // Create the register request for a parameterized job. 
- job := mock.Job() - job.Type = structs.JobTypeBatch + job := mock.BatchJob() job.ParameterizedJob = &structs.ParameterizedJobConfig{} req := &structs.JobRegisterRequest{ Job: job, @@ -1423,8 +1422,7 @@ func TestJobEndpoint_Evaluate_ParameterizedJob(t *testing.T) { testutil.WaitForLeader(t, s1.RPC) // Create the register request - job := mock.Job() - job.Type = structs.JobTypeBatch + job := mock.BatchJob() job.ParameterizedJob = &structs.ParameterizedJobConfig{} req := &structs.JobRegisterRequest{ Job: job, @@ -1751,8 +1749,7 @@ func TestJobEndpoint_Deregister_ParameterizedJob(t *testing.T) { testutil.WaitForLeader(t, s1.RPC) // Create the register request - job := mock.Job() - job.Type = structs.JobTypeBatch + job := mock.BatchJob() job.ParameterizedJob = &structs.ParameterizedJobConfig{} reg := &structs.JobRegisterRequest{ Job: job, @@ -3958,8 +3955,7 @@ func TestJobEndpoint_Dispatch_ACL(t *testing.T) { state := s1.fsm.State() // Create a parameterized job - job := mock.Job() - job.Type = structs.JobTypeBatch + job := mock.BatchJob() job.ParameterizedJob = &structs.ParameterizedJobConfig{} err := state.UpsertJob(400, job) require.Nil(err) @@ -4027,34 +4023,29 @@ func TestJobEndpoint_Dispatch(t *testing.T) { t.Parallel() // No requirements - d1 := mock.Job() - d1.Type = structs.JobTypeBatch + d1 := mock.BatchJob() d1.ParameterizedJob = &structs.ParameterizedJobConfig{} // Require input data - d2 := mock.Job() - d2.Type = structs.JobTypeBatch + d2 := mock.BatchJob() d2.ParameterizedJob = &structs.ParameterizedJobConfig{ Payload: structs.DispatchPayloadRequired, } // Disallow input data - d3 := mock.Job() - d3.Type = structs.JobTypeBatch + d3 := mock.BatchJob() d3.ParameterizedJob = &structs.ParameterizedJobConfig{ Payload: structs.DispatchPayloadForbidden, } // Require meta - d4 := mock.Job() - d4.Type = structs.JobTypeBatch + d4 := mock.BatchJob() d4.ParameterizedJob = &structs.ParameterizedJobConfig{ MetaRequired: []string{"foo", "bar"}, } // Optional meta - d5 := mock.Job() - d5.Type = structs.JobTypeBatch + d5 := mock.BatchJob() d5.ParameterizedJob = &structs.ParameterizedJobConfig{ MetaOptional: []string{"foo", "bar"}, } @@ -4063,8 +4054,7 @@ func TestJobEndpoint_Dispatch(t *testing.T) { d6 := mock.PeriodicJob() d6.ParameterizedJob = &structs.ParameterizedJobConfig{} - d7 := mock.Job() - d7.Type = structs.JobTypeBatch + d7 := mock.BatchJob() d7.ParameterizedJob = &structs.ParameterizedJobConfig{} d7.Stop = true diff --git a/nomad/leader.go b/nomad/leader.go index 51aa737b3099..f65a22477727 100644 --- a/nomad/leader.go +++ b/nomad/leader.go @@ -199,9 +199,10 @@ func (s *Server) establishLeadership(stopCh chan struct{}) error { s.blockedEvals.SetTimetable(s.fsm.TimeTable()) // Enable the deployment watcher, since we are now the leader - if err := s.deploymentWatcher.SetEnabled(true, s.State()); err != nil { - return err - } + s.deploymentWatcher.SetEnabled(true, s.State()) + + // Enable the NodeDrainer + s.nodeDrainer.SetEnabled(true, s.State()) // Restore the eval broker state if err := s.restoreEvals(); err != nil { @@ -673,9 +674,10 @@ func (s *Server) revokeLeadership() error { s.vault.SetActive(false) // Disable the deployment watcher as it is only useful as a leader. - if err := s.deploymentWatcher.SetEnabled(false, nil); err != nil { - return err - } + s.deploymentWatcher.SetEnabled(false, nil) + + // Disable the node drainer + s.nodeDrainer.SetEnabled(false, nil) // Disable any enterprise systems required. 
if err := s.revokeEnterpriseLeadership(); err != nil { diff --git a/nomad/mock/mock.go b/nomad/mock/mock.go index 3a8588b9cbad..fc12adbb1618 100644 --- a/nomad/mock/mock.go +++ b/nomad/mock/mock.go @@ -54,8 +54,9 @@ func Node() *structs.Node { "database": "mysql", "version": "5.6", }, - NodeClass: "linux-medium-pci", - Status: structs.NodeStatusReady, + NodeClass: "linux-medium-pci", + Status: structs.NodeStatusReady, + SchedulingEligibility: structs.NodeSchedulingEligible, } node.ComputeClass() return node @@ -97,6 +98,7 @@ func Job() *structs.Job { Delay: 5 * time.Second, DelayFunction: "linear", }, + Migrate: structs.DefaultMigrateStrategy(), Tasks: []*structs.Task{ { Name: "web", @@ -167,6 +169,72 @@ func Job() *structs.Job { return job } +func BatchJob() *structs.Job { + job := &structs.Job{ + Region: "global", + ID: uuid.Generate(), + Name: "batch-job", + Namespace: structs.DefaultNamespace, + Type: structs.JobTypeBatch, + Priority: 50, + AllAtOnce: false, + Datacenters: []string{"dc1"}, + TaskGroups: []*structs.TaskGroup{ + { + Name: "worker", + Count: 10, + EphemeralDisk: &structs.EphemeralDisk{ + SizeMB: 150, + }, + RestartPolicy: &structs.RestartPolicy{ + Attempts: 3, + Interval: 10 * time.Minute, + Delay: 1 * time.Minute, + Mode: structs.RestartPolicyModeDelay, + }, + ReschedulePolicy: &structs.ReschedulePolicy{ + Attempts: 2, + Interval: 10 * time.Minute, + Delay: 5 * time.Second, + DelayFunction: "linear", + }, + Tasks: []*structs.Task{ + { + Name: "worker", + Driver: "mock_driver", + Config: map[string]interface{}{ + "run_for": "500ms", + }, + Env: map[string]string{ + "FOO": "bar", + }, + LogConfig: structs.DefaultLogConfig(), + Resources: &structs.Resources{ + CPU: 100, + MemoryMB: 100, + Networks: []*structs.NetworkResource{ + { + MBits: 50, + }, + }, + }, + Meta: map[string]string{ + "foo": "bar", + }, + }, + }, + }, + }, + Status: structs.JobStatusPending, + Version: 0, + CreateIndex: 43, + ModifyIndex: 99, + JobModifyIndex: 99, + } + job.Canonicalize() + return job +} + func SystemJob() *structs.Job { job := &structs.Job{ Region: "global", @@ -194,6 +262,10 @@ func SystemJob() *structs.Job { Delay: 1 * time.Minute, Mode: structs.RestartPolicyModeDelay, }, + ReschedulePolicy: &structs.ReschedulePolicy{ + Attempts: 2, + Interval: 10 * time.Minute, + }, EphemeralDisk: structs.DefaultEphemeralDisk(), Tasks: []*structs.Task{ { @@ -238,6 +310,7 @@ func PeriodicJob() *structs.Job { Spec: "*/30 * * * *", } job.Status = structs.JobStatusRunning + job.TaskGroups[0].Migrate = nil return job } diff --git a/nomad/node_endpoint.go b/nomad/node_endpoint.go index 12fffbce2a5e..e8726a2f4125 100644 --- a/nomad/node_endpoint.go +++ b/nomad/node_endpoint.go @@ -41,7 +41,7 @@ type Node struct { // updateFuture is used to wait for the pending batch update // to complete. This may be nil if no batch is pending. - updateFuture *batchFuture + updateFuture *structs.BatchFuture // updateTimer is the timer that will trigger the next batch // update, and may be nil if there is no batch pending. 
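The batch-update machinery above now goes through the exported structs.BatchFuture (its definition moves into nomad/structs later in this diff). A minimal sketch of the wait/respond pattern it provides; the sleeping goroutine is only a stand-in for the real batched Raft apply:

package main

import (
	"fmt"
	"time"

	"github.com/hashicorp/nomad/nomad/structs"
)

func main() {
	bf := structs.NewBatchFuture()

	// Producer side: once the batched update has been applied, unblock any
	// waiters with the resulting Raft index (or an error).
	go func() {
		time.Sleep(10 * time.Millisecond) // stand-in for the real batch apply
		bf.Respond(1000, nil)
	}()

	// Consumer side: block until the batch is committed, then read the index.
	if err := bf.Wait(); err != nil {
		fmt.Println("batch update failed:", err)
		return
	}
	fmt.Println("batch committed at index", bf.Index())
}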
@@ -87,6 +87,11 @@ func (n *Node) Register(args *structs.NodeRegisterRequest, reply *structs.NodeUp return fmt.Errorf("invalid status for node") } + // Default to eligible for scheduling if unset + if args.Node.SchedulingEligibility == "" { + args.Node.SchedulingEligibility = structs.NodeSchedulingEligible + } + // Set the timestamp when the node is registered args.Node.StatusUpdatedAt = time.Now().Unix() @@ -428,29 +433,90 @@ func (n *Node) UpdateDrain(args *structs.NodeUpdateDrainRequest, return fmt.Errorf("node not found") } - // Update the timestamp to - node.StatusUpdatedAt = time.Now().Unix() + // COMPAT: Remove in 0.9. Attempt to upgrade the request if it is of the old + // format. + if args.Drain && args.DrainStrategy == nil { + args.DrainStrategy = &structs.DrainStrategy{ + DrainSpec: structs.DrainSpec{ + Deadline: -1 * time.Second, // Force drain + }, + } + } + + // Mark the deadline time + if args.DrainStrategy != nil && args.DrainStrategy.Deadline.Nanoseconds() > 0 { + args.DrainStrategy.ForceDeadline = time.Now().Add(args.DrainStrategy.Deadline) + } // Commit this update via Raft - var index uint64 - if node.Drain != args.Drain { - _, index, err = n.srv.raftApply(structs.NodeUpdateDrainRequestType, args) - if err != nil { - n.srv.logger.Printf("[ERR] nomad.client: drain update failed: %v", err) - return err - } - reply.NodeModifyIndex = index + _, index, err := n.srv.raftApply(structs.NodeUpdateDrainRequestType, args) + if err != nil { + n.srv.logger.Printf("[ERR] nomad.client: drain update failed: %v", err) + return err } + reply.NodeModifyIndex = index - // Always attempt to create Node evaluations because there may be a System - // job registered that should be evaluated. - evalIDs, evalIndex, err := n.createNodeEvals(args.NodeID, index) + // Set the reply index + reply.Index = index + return nil +} + +// UpdateEligibility is used to update the scheduling eligibility of a node +func (n *Node) UpdateEligibility(args *structs.NodeUpdateEligibilityRequest, + reply *structs.GenericResponse) error { + if done, err := n.srv.forward("Node.UpdateEligibility", args, args, reply); done { + return err + } + defer metrics.MeasureSince([]string{"nomad", "client", "update_eligibility"}, time.Now()) + + // Check node write permissions + if aclObj, err := n.srv.ResolveToken(args.AuthToken); err != nil { + return err + } else if aclObj != nil && !aclObj.AllowNodeWrite() { + return structs.ErrPermissionDenied + } + + // Verify the arguments + if args.NodeID == "" { + return fmt.Errorf("missing node ID for setting scheduling eligibility") + } + + // Look for the node + snap, err := n.srv.fsm.State().Snapshot() if err != nil { - n.srv.logger.Printf("[ERR] nomad.client: eval creation failed: %v", err) return err } - reply.EvalIDs = evalIDs - reply.EvalCreateIndex = evalIndex + ws := memdb.NewWatchSet() + node, err := snap.NodeByID(ws, args.NodeID) + if err != nil { + return err + } + if node == nil { + return fmt.Errorf("node not found") + } + + if node.DrainStrategy != nil && args.Eligibility == structs.NodeSchedulingEligible { + return fmt.Errorf("can not set node's scheduling eligibility to eligible while it is draining") + } + + switch args.Eligibility { + case structs.NodeSchedulingEligible, structs.NodeSchedulingIneligible: + default: + return fmt.Errorf("invalid scheduling eligibility %q", args.Eligibility) + } + + // Commit this update via Raft + outErr, index, err := n.srv.raftApply(structs.NodeUpdateEligibilityRequestType, args) + if err != nil { + n.srv.logger.Printf("[ERR] 
nomad.client: eligibility update failed: %v", err) + return err + } + if outErr != nil { + if err, ok := outErr.(error); ok && err != nil { + n.srv.logger.Printf("[ERR] nomad.client: eligibility update failed: %v", err) + return err + } + } // Set the reply index reply.Index = index @@ -817,7 +883,7 @@ func (n *Node) UpdateAlloc(args *structs.AllocUpdateRequest, reply *structs.Gene // Ensure that evals aren't set from client RPCs // We create them here before the raft update if len(args.Evals) != 0 { - return fmt.Errorf("evals field must not be set ") + return fmt.Errorf("evals field must not be set") } // Update modified timestamp for client initiated allocation updates @@ -867,7 +933,7 @@ func (n *Node) UpdateAlloc(args *structs.AllocUpdateRequest, reply *structs.Gene // Start a new batch if none future := n.updateFuture if future == nil { - future = NewBatchFuture() + future = structs.NewBatchFuture() n.updateFuture = future n.updateTimer = time.AfterFunc(batchUpdateInterval, func() { // Get the pending updates @@ -896,7 +962,7 @@ func (n *Node) UpdateAlloc(args *structs.AllocUpdateRequest, reply *structs.Gene } // batchUpdate is used to update all the allocations -func (n *Node) batchUpdate(future *batchFuture, updates []*structs.Allocation, evals []*structs.Evaluation) { +func (n *Node) batchUpdate(future *structs.BatchFuture, updates []*structs.Allocation, evals []*structs.Evaluation) { // Prepare the batch update batch := &structs.AllocUpdateRequest{ Alloc: updates, @@ -1100,38 +1166,6 @@ func (n *Node) createNodeEvals(nodeID string, nodeIndex uint64) ([]string, uint6 return evalIDs, evalIndex, nil } -// batchFuture is used to wait on a batch update to complete -type batchFuture struct { - doneCh chan struct{} - err error - index uint64 -} - -// NewBatchFuture creates a new batch future -func NewBatchFuture() *batchFuture { - return &batchFuture{ - doneCh: make(chan struct{}), - } -} - -// Wait is used to block for the future to complete and returns the error -func (b *batchFuture) Wait() error { - <-b.doneCh - return b.err -} - -// Index is used to return the index of the batch, only after Wait() -func (b *batchFuture) Index() uint64 { - return b.index -} - -// Respond is used to unblock the future -func (b *batchFuture) Respond(index uint64, err error) { - b.index = index - b.err = err - close(b.doneCh) -} - // DeriveVaultToken is used by the clients to request wrapped Vault tokens for // tasks func (n *Node) DeriveVaultToken(args *structs.DeriveVaultTokenRequest, diff --git a/nomad/node_endpoint_test.go b/nomad/node_endpoint_test.go index 375ca8731cb3..4c278766cbc7 100644 --- a/nomad/node_endpoint_test.go +++ b/nomad/node_endpoint_test.go @@ -747,11 +747,15 @@ func TestClientEndpoint_UpdateStatus_HeartbeatOnly_Advertise(t *testing.T) { func TestClientEndpoint_UpdateDrain(t *testing.T) { t.Parallel() + require := require.New(t) s1 := TestServer(t, nil) defer s1.Shutdown() codec := rpcClient(t, s1) testutil.WaitForLeader(t, s1.RPC) + // Disable drainer to prevent drain from completing during test + s1.nodeDrainer.SetEnabled(false, nil) + // Create the register request node := mock.Node() reg := &structs.NodeRegisterRequest{ @@ -761,34 +765,36 @@ func TestClientEndpoint_UpdateDrain(t *testing.T) { // Fetch the response var resp structs.NodeUpdateResponse - if err := msgpackrpc.CallWithCodec(codec, "Node.Register", reg, &resp); err != nil { - t.Fatalf("err: %v", err) + require.Nil(msgpackrpc.CallWithCodec(codec, "Node.Register", reg, &resp)) + + beforeUpdate := time.Now() + strategy 
:= &structs.DrainStrategy{ + DrainSpec: structs.DrainSpec{ + Deadline: 10 * time.Second, + }, } // Update the status dereg := &structs.NodeUpdateDrainRequest{ - NodeID: node.ID, - Drain: true, - WriteRequest: structs.WriteRequest{Region: "global"}, + NodeID: node.ID, + DrainStrategy: strategy, + WriteRequest: structs.WriteRequest{Region: "global"}, } var resp2 structs.NodeDrainUpdateResponse - if err := msgpackrpc.CallWithCodec(codec, "Node.UpdateDrain", dereg, &resp2); err != nil { - t.Fatalf("err: %v", err) - } - if resp2.Index == 0 { - t.Fatalf("bad index: %d", resp2.Index) - } + require.Nil(msgpackrpc.CallWithCodec(codec, "Node.UpdateDrain", dereg, &resp2)) + require.NotZero(resp2.Index) // Check for the node in the FSM state := s1.fsm.State() ws := memdb.NewWatchSet() out, err := state.NodeByID(ws, node.ID) - if err != nil { - t.Fatalf("err: %v", err) - } - if !out.Drain { - t.Fatalf("bad: %#v", out) - } + require.Nil(err) + require.True(out.Drain) + require.Equal(strategy.Deadline, out.DrainStrategy.Deadline) + // before+deadline should be before the forced deadline + require.True(beforeUpdate.Add(strategy.Deadline).Before(out.DrainStrategy.ForceDeadline)) + // now+deadline should be after the forced deadline + require.True(time.Now().Add(strategy.Deadline).After(out.DrainStrategy.ForceDeadline)) } func TestClientEndpoint_UpdateDrain_ACL(t *testing.T) { @@ -797,13 +803,13 @@ func TestClientEndpoint_UpdateDrain_ACL(t *testing.T) { defer s1.Shutdown() codec := rpcClient(t, s1) testutil.WaitForLeader(t, s1.RPC) - assert := assert.New(t) + require := require.New(t) // Create the node node := mock.Node() state := s1.fsm.State() - assert.Nil(state.UpsertNode(1, node), "UpsertNode") + require.Nil(state.UpsertNode(1, node), "UpsertNode") // Create the policy and tokens validToken := mock.CreatePolicyAndToken(t, state, 1001, "test-valid", mock.NodePolicy(acl.PolicyWrite)) @@ -811,22 +817,26 @@ func TestClientEndpoint_UpdateDrain_ACL(t *testing.T) { // Update the status without a token and expect failure dereg := &structs.NodeUpdateDrainRequest{ - NodeID: node.ID, - Drain: true, + NodeID: node.ID, + DrainStrategy: &structs.DrainStrategy{ + DrainSpec: structs.DrainSpec{ + Deadline: 10 * time.Second, + }, + }, WriteRequest: structs.WriteRequest{Region: "global"}, } { var resp structs.NodeDrainUpdateResponse err := msgpackrpc.CallWithCodec(codec, "Node.UpdateDrain", dereg, &resp) - assert.NotNil(err, "RPC") - assert.Equal(err.Error(), structs.ErrPermissionDenied.Error()) + require.NotNil(err, "RPC") + require.Equal(err.Error(), structs.ErrPermissionDenied.Error()) } // Try with a valid token dereg.AuthToken = validToken.SecretID { var resp structs.NodeDrainUpdateResponse - assert.Nil(msgpackrpc.CallWithCodec(codec, "Node.UpdateDrain", dereg, &resp), "RPC") + require.Nil(msgpackrpc.CallWithCodec(codec, "Node.UpdateDrain", dereg, &resp), "RPC") } // Try with a invalid token @@ -834,15 +844,15 @@ func TestClientEndpoint_UpdateDrain_ACL(t *testing.T) { { var resp structs.NodeDrainUpdateResponse err := msgpackrpc.CallWithCodec(codec, "Node.UpdateDrain", dereg, &resp) - assert.NotNil(err, "RPC") - assert.Equal(err.Error(), structs.ErrPermissionDenied.Error()) + require.NotNil(err, "RPC") + require.Equal(err.Error(), structs.ErrPermissionDenied.Error()) } // Try with a root token dereg.AuthToken = root.SecretID { var resp structs.NodeDrainUpdateResponse - assert.Nil(msgpackrpc.CallWithCodec(codec, "Node.UpdateDrain", dereg, &resp), "RPC") + require.Nil(msgpackrpc.CallWithCodec(codec, "Node.UpdateDrain", 
dereg, &resp), "RPC") } } @@ -854,6 +864,7 @@ func TestClientEndpoint_Drain_Down(t *testing.T) { defer s1.Shutdown() codec := rpcClient(t, s1) testutil.WaitForLeader(t, s1.RPC) + require := require.New(t) // Register a node node := mock.Node() @@ -863,9 +874,7 @@ func TestClientEndpoint_Drain_Down(t *testing.T) { } // Fetch the response var resp structs.NodeUpdateResponse - if err := msgpackrpc.CallWithCodec(codec, "Node.Register", reg, &resp); err != nil { - t.Fatalf("err: %v", err) - } + require.Nil(msgpackrpc.CallWithCodec(codec, "Node.Register", reg, &resp)) // Register a service job var jobResp structs.JobRegisterResponse @@ -878,15 +887,12 @@ func TestClientEndpoint_Drain_Down(t *testing.T) { Namespace: job.Namespace, }, } - if err := msgpackrpc.CallWithCodec(codec, "Job.Register", jobReq, &jobResp); err != nil { - t.Fatalf("err: %v", err) - } + require.Nil(msgpackrpc.CallWithCodec(codec, "Job.Register", jobReq, &jobResp)) // Register a system job var jobResp1 structs.JobRegisterResponse - job1 := mock.Job() + job1 := mock.SystemJob() job1.TaskGroups[0].Count = 1 - job1.Type = structs.JobTypeSystem jobReq1 := &structs.JobRegisterRequest{ Job: job1, WriteRequest: structs.WriteRequest{ @@ -894,9 +900,7 @@ func TestClientEndpoint_Drain_Down(t *testing.T) { Namespace: job1.Namespace, }, } - if err := msgpackrpc.CallWithCodec(codec, "Job.Register", jobReq1, &jobResp1); err != nil { - t.Fatalf("err: %v", err) - } + require.Nil(msgpackrpc.CallWithCodec(codec, "Job.Register", jobReq1, &jobResp1)) // Wait for the scheduler to create an allocation testutil.WaitForResult(func() (bool, error) { @@ -916,14 +920,16 @@ func TestClientEndpoint_Drain_Down(t *testing.T) { // Drain the node dereg := &structs.NodeUpdateDrainRequest{ - NodeID: node.ID, - Drain: true, + NodeID: node.ID, + DrainStrategy: &structs.DrainStrategy{ + DrainSpec: structs.DrainSpec{ + Deadline: -1 * time.Second, + }, + }, WriteRequest: structs.WriteRequest{Region: "global"}, } var resp2 structs.NodeDrainUpdateResponse - if err := msgpackrpc.CallWithCodec(codec, "Node.UpdateDrain", dereg, &resp2); err != nil { - t.Fatalf("err: %v", err) - } + require.Nil(msgpackrpc.CallWithCodec(codec, "Node.UpdateDrain", dereg, &resp2)) // Mark the node as down node.Status = structs.NodeStatusDown @@ -931,9 +937,7 @@ func TestClientEndpoint_Drain_Down(t *testing.T) { Node: node, WriteRequest: structs.WriteRequest{Region: "global"}, } - if err := msgpackrpc.CallWithCodec(codec, "Node.Register", reg, &resp); err != nil { - t.Fatalf("err: %v", err) - } + require.Nil(msgpackrpc.CallWithCodec(codec, "Node.Register", reg, &resp)) // Ensure that the allocation has transitioned to lost testutil.WaitForResult(func() (bool, error) { @@ -956,7 +960,7 @@ func TestClientEndpoint_Drain_Down(t *testing.T) { ModifyIndex: summary.ModifyIndex, } if !reflect.DeepEqual(summary, expectedSummary) { - return false, fmt.Errorf("expected: %#v, actual: %#v", expectedSummary, summary) + return false, fmt.Errorf("Service: expected: %#v, actual: %#v", expectedSummary, summary) } summary1, err := s1.fsm.state.JobSummaryByID(ws, job1.Namespace, job1.ID) @@ -976,7 +980,7 @@ func TestClientEndpoint_Drain_Down(t *testing.T) { ModifyIndex: summary1.ModifyIndex, } if !reflect.DeepEqual(summary1, expectedSummary1) { - return false, fmt.Errorf("expected: %#v, actual: %#v", expectedSummary1, summary1) + return false, fmt.Errorf("System: expected: %#v, actual: %#v", expectedSummary1, summary1) } return true, nil }, func(err error) { @@ -984,6 +988,97 @@ func 
TestClientEndpoint_Drain_Down(t *testing.T) { }) } +func TestClientEndpoint_UpdateEligibility(t *testing.T) { + t.Parallel() + require := require.New(t) + s1 := TestServer(t, nil) + defer s1.Shutdown() + codec := rpcClient(t, s1) + testutil.WaitForLeader(t, s1.RPC) + + // Create the register request + node := mock.Node() + reg := &structs.NodeRegisterRequest{ + Node: node, + WriteRequest: structs.WriteRequest{Region: "global"}, + } + + // Fetch the response + var resp structs.NodeUpdateResponse + require.Nil(msgpackrpc.CallWithCodec(codec, "Node.Register", reg, &resp)) + + // Update the eligibility + dereg := &structs.NodeUpdateEligibilityRequest{ + NodeID: node.ID, + Eligibility: structs.NodeSchedulingIneligible, + WriteRequest: structs.WriteRequest{Region: "global"}, + } + var resp2 structs.GenericResponse + require.Nil(msgpackrpc.CallWithCodec(codec, "Node.UpdateEligibility", dereg, &resp2)) + require.NotZero(resp2.Index) + + // Check for the node in the FSM + state := s1.fsm.State() + out, err := state.NodeByID(nil, node.ID) + require.Nil(err) + require.Equal(out.SchedulingEligibility, structs.NodeSchedulingIneligible) +} + +func TestClientEndpoint_UpdateEligibility_ACL(t *testing.T) { + t.Parallel() + s1, root := TestACLServer(t, nil) + defer s1.Shutdown() + codec := rpcClient(t, s1) + testutil.WaitForLeader(t, s1.RPC) + require := require.New(t) + + // Create the node + node := mock.Node() + state := s1.fsm.State() + + require.Nil(state.UpsertNode(1, node), "UpsertNode") + + // Create the policy and tokens + validToken := mock.CreatePolicyAndToken(t, state, 1001, "test-valid", mock.NodePolicy(acl.PolicyWrite)) + invalidToken := mock.CreatePolicyAndToken(t, state, 1003, "test-invalid", mock.NodePolicy(acl.PolicyRead)) + + // Update the status without a token and expect failure + dereg := &structs.NodeUpdateEligibilityRequest{ + NodeID: node.ID, + Eligibility: structs.NodeSchedulingIneligible, + WriteRequest: structs.WriteRequest{Region: "global"}, + } + { + var resp structs.GenericResponse + err := msgpackrpc.CallWithCodec(codec, "Node.UpdateEligibility", dereg, &resp) + require.NotNil(err, "RPC") + require.Equal(err.Error(), structs.ErrPermissionDenied.Error()) + } + + // Try with a valid token + dereg.AuthToken = validToken.SecretID + { + var resp structs.GenericResponse + require.Nil(msgpackrpc.CallWithCodec(codec, "Node.UpdateEligibility", dereg, &resp), "RPC") + } + + // Try with a invalid token + dereg.AuthToken = invalidToken.SecretID + { + var resp structs.GenericResponse + err := msgpackrpc.CallWithCodec(codec, "Node.UpdateEligibility", dereg, &resp) + require.NotNil(err, "RPC") + require.Equal(err.Error(), structs.ErrPermissionDenied.Error()) + } + + // Try with a root token + dereg.AuthToken = root.SecretID + { + var resp structs.GenericResponse + require.Nil(msgpackrpc.CallWithCodec(codec, "Node.UpdateEligibility", dereg, &resp), "RPC") + } +} + func TestClientEndpoint_GetNode(t *testing.T) { t.Parallel() s1 := TestServer(t, nil) @@ -1888,7 +1983,7 @@ func TestClientEndpoint_BatchUpdate(t *testing.T) { clientAlloc.ClientStatus = structs.AllocClientStatusFailed // Call to do the batch update - bf := NewBatchFuture() + bf := structs.NewBatchFuture() endpoint := s1.staticEndpoints.Node endpoint.batchUpdate(bf, []*structs.Allocation{clientAlloc}, nil) if err := bf.Wait(); err != nil { @@ -2344,15 +2439,18 @@ func TestClientEndpoint_ListNodes_Blocking(t *testing.T) { codec := rpcClient(t, s1) testutil.WaitForLeader(t, s1.RPC) + // Disable drainer to prevent drain from 
completing during test + s1.nodeDrainer.SetEnabled(false, nil) + // Create the node node := mock.Node() // Node upsert triggers watches - time.AfterFunc(100*time.Millisecond, func() { - if err := state.UpsertNode(2, node); err != nil { - t.Fatalf("err: %v", err) - } + errCh := make(chan error, 1) + timer := time.AfterFunc(100*time.Millisecond, func() { + errCh <- state.UpsertNode(2, node) }) + defer timer.Stop() req := &structs.NodeListRequest{ QueryOptions: structs.QueryOptions{ @@ -2366,6 +2464,10 @@ func TestClientEndpoint_ListNodes_Blocking(t *testing.T) { t.Fatalf("err: %v", err) } + if err := <-errCh; err != nil { + t.Fatalf("error from timer: %v", err) + } + if elapsed := time.Since(start); elapsed < 100*time.Millisecond { t.Fatalf("should block (returned in %s) %#v", elapsed, resp) } @@ -2378,9 +2480,12 @@ func TestClientEndpoint_ListNodes_Blocking(t *testing.T) { // Node drain updates trigger watches. time.AfterFunc(100*time.Millisecond, func() { - if err := state.UpdateNodeDrain(3, node.ID, true); err != nil { - t.Fatalf("err: %v", err) + s := &structs.DrainStrategy{ + DrainSpec: structs.DrainSpec{ + Deadline: 10 * time.Second, + }, } + errCh <- state.UpdateNodeDrain(3, node.ID, s, false) }) req.MinQueryIndex = 2 @@ -2390,6 +2495,10 @@ func TestClientEndpoint_ListNodes_Blocking(t *testing.T) { t.Fatalf("err: %v", err) } + if err := <-errCh; err != nil { + t.Fatalf("error from timer: %v", err) + } + if elapsed := time.Since(start); elapsed < 100*time.Millisecond { t.Fatalf("should block (returned in %s) %#v", elapsed, resp2) } @@ -2402,23 +2511,25 @@ func TestClientEndpoint_ListNodes_Blocking(t *testing.T) { // Node status update triggers watches time.AfterFunc(100*time.Millisecond, func() { - if err := state.UpdateNodeStatus(4, node.ID, structs.NodeStatusDown); err != nil { - t.Fatalf("err: %v", err) - } + errCh <- state.UpdateNodeStatus(40, node.ID, structs.NodeStatusDown) }) - req.MinQueryIndex = 3 + req.MinQueryIndex = 38 var resp3 structs.NodeListResponse start = time.Now() if err := msgpackrpc.CallWithCodec(codec, "Node.List", req, &resp3); err != nil { t.Fatalf("err: %v", err) } + if err := <-errCh; err != nil { + t.Fatalf("error from timer: %v", err) + } + if elapsed := time.Since(start); elapsed < 100*time.Millisecond { t.Fatalf("should block (returned in %s) %#v", elapsed, resp3) } - if resp3.Index != 4 { - t.Fatalf("Bad index: %d %d", resp3.Index, 4) + if resp3.Index != 40 { + t.Fatalf("Bad index: %d %d", resp3.Index, 40) } if len(resp3.Nodes) != 1 || resp3.Nodes[0].Status != structs.NodeStatusDown { t.Fatalf("bad: %#v", resp3.Nodes) @@ -2426,57 +2537,31 @@ func TestClientEndpoint_ListNodes_Blocking(t *testing.T) { // Node delete triggers watches. 
time.AfterFunc(100*time.Millisecond, func() { - if err := state.DeleteNode(5, node.ID); err != nil { - t.Fatalf("err: %v", err) - } + errCh <- state.DeleteNode(50, node.ID) }) - req.MinQueryIndex = 4 + req.MinQueryIndex = 45 var resp4 structs.NodeListResponse start = time.Now() if err := msgpackrpc.CallWithCodec(codec, "Node.List", req, &resp4); err != nil { t.Fatalf("err: %v", err) } + if err := <-errCh; err != nil { + t.Fatalf("error from timer: %v", err) + } + if elapsed := time.Since(start); elapsed < 100*time.Millisecond { t.Fatalf("should block (returned in %s) %#v", elapsed, resp4) } - if resp4.Index != 5 { - t.Fatalf("Bad index: %d %d", resp4.Index, 5) + if resp4.Index != 50 { + t.Fatalf("Bad index: %d %d", resp4.Index, 50) } if len(resp4.Nodes) != 0 { t.Fatalf("bad: %#v", resp4.Nodes) } } -func TestBatchFuture(t *testing.T) { - t.Parallel() - bf := NewBatchFuture() - - // Async respond to the future - expect := fmt.Errorf("testing") - go func() { - time.Sleep(10 * time.Millisecond) - bf.Respond(1000, expect) - }() - - // Block for the result - start := time.Now() - err := bf.Wait() - diff := time.Since(start) - if diff < 5*time.Millisecond { - t.Fatalf("too fast") - } - - // Check the results - if err != expect { - t.Fatalf("bad: %s", err) - } - if bf.Index() != 1000 { - t.Fatalf("bad: %d", bf.Index()) - } -} - func TestClientEndpoint_DeriveVaultToken_Bad(t *testing.T) { t.Parallel() s1 := TestServer(t, nil) diff --git a/nomad/plan_apply.go b/nomad/plan_apply.go index 8e988232318d..089af0f5853a 100644 --- a/nomad/plan_apply.go +++ b/nomad/plan_apply.go @@ -415,7 +415,10 @@ func evaluateNodePlan(snap *state.StateSnapshot, plan *structs.Plan, nodeID stri return false, "node does not exist", nil } else if node.Status != structs.NodeStatusReady { return false, "node is not ready for placements", nil + } else if node.SchedulingEligibility == structs.NodeSchedulingIneligible { + return false, "node is not eligible for draining", nil } else if node.Drain { + // Deprecate in favor of scheduling eligibility and remove post-0.8 return false, "node is draining", nil } diff --git a/nomad/rpc_test.go b/nomad/rpc_test.go index c876c6adb1df..ec885cc652c8 100644 --- a/nomad/rpc_test.go +++ b/nomad/rpc_test.go @@ -30,7 +30,7 @@ func rpcClient(t *testing.T, s *Server) rpc.ClientCodec { if err != nil { t.Fatalf("err: %v", err) } - // Write the Consul RPC byte to set the mode + // Write the Nomad RPC byte to set the mode conn.Write([]byte{byte(pool.RpcNomad)}) return pool.NewClientCodec(conn) } diff --git a/nomad/server.go b/nomad/server.go index 68789da4a259..b69e0a022571 100644 --- a/nomad/server.go +++ b/nomad/server.go @@ -27,6 +27,7 @@ import ( "github.com/hashicorp/nomad/helper/stats" "github.com/hashicorp/nomad/helper/tlsutil" "github.com/hashicorp/nomad/nomad/deploymentwatcher" + "github.com/hashicorp/nomad/nomad/drainer" "github.com/hashicorp/nomad/nomad/state" "github.com/hashicorp/nomad/nomad/structs" "github.com/hashicorp/nomad/nomad/structs/config" @@ -172,6 +173,9 @@ type Server struct { // make the required calls to continue to transition the deployment. deploymentWatcher *deploymentwatcher.Watcher + // nodeDrainer is used to drain allocations from nodes. 
+ nodeDrainer *drainer.NodeDrainer + // evalBroker is used to manage the in-progress evaluations // that are waiting to be brokered to a sub-scheduler evalBroker *EvalBroker @@ -355,6 +359,9 @@ func NewServer(config *Config, consulCatalog consul.CatalogAPI, logger *log.Logg return nil, fmt.Errorf("failed to create deployment watcher: %v", err) } + // Setup the node drainer. + s.setupNodeDrainer() + // Setup the enterprise state if err := s.setupEnterprise(config); err != nil { return nil, err @@ -880,6 +887,23 @@ func (s *Server) setupDeploymentWatcher() error { return nil } +// setupNodeDrainer creates a node drainer which will be enabled when a server +// becomes a leader. +func (s *Server) setupNodeDrainer() { + // Create a shim around Raft requests + shim := drainerShim{s} + c := &drainer.NodeDrainerConfig{ + Logger: s.logger, + Raft: shim, + JobFactory: drainer.GetDrainingJobWatcher, + NodeFactory: drainer.GetNodeWatcherFactory(), + DrainDeadlineFactory: drainer.GetDeadlineNotifier, + StateQueriesPerSecond: drainer.LimitStateQueriesPerSecond, + BatchUpdateInterval: drainer.BatchUpdateInterval, + } + s.nodeDrainer = drainer.NewNodeDrainer(c) +} + // setupVaultClient is used to set up the Vault API client. func (s *Server) setupVaultClient() error { v, err := NewVaultClient(s.config.VaultConfig, s.logger, s.purgeVaultAccessors) diff --git a/nomad/state/state_store.go b/nomad/state/state_store.go index 6156a3c75020..6e4f3978db65 100644 --- a/nomad/state/state_store.go +++ b/nomad/state/state_store.go @@ -509,7 +509,7 @@ func (s *StateStore) DeleteDeployment(index uint64, deploymentIDs []string) erro // UpsertNode is used to register a node or update a node definition // This is assumed to be triggered by the client, so we retain the value -// of drain which is set by the scheduler. +// of drain/eligibility which is set by the scheduler. 
func (s *StateStore) UpsertNode(index uint64, node *structs.Node) error { txn := s.db.Txn(true) defer txn.Abort() @@ -525,10 +525,13 @@ func (s *StateStore) UpsertNode(index uint64, node *structs.Node) error { exist := existing.(*structs.Node) node.CreateIndex = exist.CreateIndex node.ModifyIndex = index - node.Drain = exist.Drain // Retain the drain mode // Retain node events that have already been set on the node node.Events = exist.Events + + node.Drain = exist.Drain // Retain the drain mode + node.SchedulingEligibility = exist.SchedulingEligibility // Retain the eligibility + node.DrainStrategy = exist.DrainStrategy // Retain the drain strategy } else { // Because this is the first time the node is being registered, we should // also create a node registration event @@ -596,8 +599,7 @@ func (s *StateStore) UpdateNodeStatus(index uint64, nodeID, status string) error // Copy the existing node existingNode := existing.(*structs.Node) - copyNode := new(structs.Node) - *copyNode = *existingNode + copyNode := existingNode.Copy() // Update the status in the copy copyNode.Status = status @@ -615,10 +617,34 @@ func (s *StateStore) UpdateNodeStatus(index uint64, nodeID, status string) error return nil } +// BatchUpdateNodeDrain is used to update the drain of a node set of nodes +func (s *StateStore) BatchUpdateNodeDrain(index uint64, updates map[string]*structs.DrainUpdate) error { + txn := s.db.Txn(true) + defer txn.Abort() + for node, update := range updates { + if err := s.updateNodeDrainImpl(txn, index, node, update.DrainStrategy, update.MarkEligible); err != nil { + return err + } + } + txn.Commit() + return nil +} + // UpdateNodeDrain is used to update the drain of a node -func (s *StateStore) UpdateNodeDrain(index uint64, nodeID string, drain bool) error { +func (s *StateStore) UpdateNodeDrain(index uint64, nodeID string, + drain *structs.DrainStrategy, markEligible bool) error { + txn := s.db.Txn(true) defer txn.Abort() + if err := s.updateNodeDrainImpl(txn, index, nodeID, drain, markEligible); err != nil { + return err + } + txn.Commit() + return nil +} + +func (s *StateStore) updateNodeDrainImpl(txn *memdb.Txn, index uint64, nodeID string, + drain *structs.DrainStrategy, markEligible bool) error { // Lookup the node existing, err := txn.First("nodes", "id", nodeID) @@ -634,7 +660,53 @@ func (s *StateStore) UpdateNodeDrain(index uint64, nodeID string, drain bool) er copyNode := existingNode.Copy() // Update the drain in the copy - copyNode.Drain = drain + copyNode.Drain = drain != nil // COMPAT: Remove in Nomad 0.9 + copyNode.DrainStrategy = drain + if drain != nil { + copyNode.SchedulingEligibility = structs.NodeSchedulingIneligible + } else if markEligible { + copyNode.SchedulingEligibility = structs.NodeSchedulingEligible + } + + copyNode.ModifyIndex = index + + // Insert the node + if err := txn.Insert("nodes", copyNode); err != nil { + return fmt.Errorf("node update failed: %v", err) + } + if err := txn.Insert("index", &IndexEntry{"nodes", index}); err != nil { + return fmt.Errorf("index update failed: %v", err) + } + + return nil +} + +// UpdateNodeEligibility is used to update the scheduling eligibility of a node +func (s *StateStore) UpdateNodeEligibility(index uint64, nodeID string, eligibility string) error { + + txn := s.db.Txn(true) + defer txn.Abort() + + // Lookup the node + existing, err := txn.First("nodes", "id", nodeID) + if err != nil { + return fmt.Errorf("node lookup failed: %v", err) + } + if existing == nil { + return fmt.Errorf("node not found") + } + + // Copy 
the existing node + existingNode := existing.(*structs.Node) + copyNode := existingNode.Copy() + + // Check if this is a valid action + if copyNode.DrainStrategy != nil && eligibility == structs.NodeSchedulingEligible { + return fmt.Errorf("can not set node's scheduling eligibility to eligible while it is draining") + } + + // Update the eligibility in the copy + copyNode.SchedulingEligibility = eligibility copyNode.ModifyIndex = index // Insert the node @@ -1996,6 +2068,65 @@ func (s *StateStore) upsertAllocsImpl(index uint64, allocs []*structs.Allocation return nil } +// UpdateAllocsDesiredTransitions is used to update a set of allocations +// desired transitions. +func (s *StateStore) UpdateAllocsDesiredTransitions(index uint64, allocs map[string]*structs.DesiredTransition, + evals []*structs.Evaluation) error { + + txn := s.db.Txn(true) + defer txn.Abort() + + // Handle each of the updated allocations + for id, transition := range allocs { + if err := s.nestedUpdateAllocDesiredTransition(txn, index, id, transition); err != nil { + return err + } + } + + // Update the indexes + if err := txn.Insert("index", &IndexEntry{"allocs", index}); err != nil { + return fmt.Errorf("index update failed: %v", err) + } + + txn.Commit() + return nil +} + +// nestedUpdateAllocDesiredTransition is used to nest an update of an +// allocations desired transition +func (s *StateStore) nestedUpdateAllocDesiredTransition( + txn *memdb.Txn, index uint64, allocID string, + transition *structs.DesiredTransition) error { + + // Look for existing alloc + existing, err := txn.First("allocs", "id", allocID) + if err != nil { + return fmt.Errorf("alloc lookup failed: %v", err) + } + + // Nothing to do if this does not exist + if existing == nil { + return nil + } + exist := existing.(*structs.Allocation) + + // Copy everything from the existing allocation + copyAlloc := exist.Copy() + + // Merge the desired transitions + copyAlloc.DesiredTransition.Merge(transition) + + // Update the modify index + copyAlloc.ModifyIndex = index + + // Update the allocation + if err := txn.Insert("allocs", copyAlloc); err != nil { + return fmt.Errorf("alloc insert failed: %v", err) + } + + return nil +} + // AllocByID is used to lookup an allocation by its ID func (s *StateStore) AllocByID(ws memdb.WatchSet, id string) (*structs.Allocation, error) { txn := s.db.Txn(false) diff --git a/nomad/state/state_store_test.go b/nomad/state/state_store_test.go index d176e178b9a9..9f13dd10bf36 100644 --- a/nomad/state/state_store_test.go +++ b/nomad/state/state_store_test.go @@ -698,54 +698,86 @@ func TestStateStore_UpdateNodeStatus_Node(t *testing.T) { } } -func TestStateStore_UpdateNodeDrain_Node(t *testing.T) { +func TestStateStore_BatchUpdateNodeDrain(t *testing.T) { + require := require.New(t) state := testStateStore(t) - node := mock.Node() - err := state.UpsertNode(1000, node) - if err != nil { - t.Fatalf("err: %v", err) - } + n1, n2 := mock.Node(), mock.Node() + require.Nil(state.UpsertNode(1000, n1)) + require.Nil(state.UpsertNode(1001, n2)) // Create a watchset so we can test that update node drain fires the watch ws := memdb.NewWatchSet() - if _, err := state.NodeByID(ws, node.ID); err != nil { - t.Fatalf("bad: %v", err) - } + _, err := state.NodeByID(ws, n1.ID) + require.Nil(err) - err = state.UpdateNodeDrain(1001, node.ID, true) - if err != nil { - t.Fatalf("err: %v", err) + expectedDrain := &structs.DrainStrategy{ + DrainSpec: structs.DrainSpec{ + Deadline: -1 * time.Second, + }, } - if !watchFired(ws) { - t.Fatalf("bad") + update 
:= map[string]*structs.DrainUpdate{ + n1.ID: { + DrainStrategy: expectedDrain, + }, + n2.ID: { + DrainStrategy: expectedDrain, + }, } - ws = memdb.NewWatchSet() - out, err := state.NodeByID(ws, node.ID) - if err != nil { - t.Fatalf("err: %v", err) - } + require.Nil(state.BatchUpdateNodeDrain(1002, update)) + require.True(watchFired(ws)) - if !out.Drain { - t.Fatalf("bad: %#v", out) - } - if out.ModifyIndex != 1001 { - t.Fatalf("bad: %#v", out) + ws = memdb.NewWatchSet() + for _, id := range []string{n1.ID, n2.ID} { + out, err := state.NodeByID(ws, id) + require.Nil(err) + require.True(out.Drain) + require.NotNil(out.DrainStrategy) + require.Equal(out.DrainStrategy, expectedDrain) + require.EqualValues(1002, out.ModifyIndex) } index, err := state.Index("nodes") - if err != nil { - t.Fatalf("err: %v", err) - } - if index != 1001 { - t.Fatalf("bad: %d", index) - } + require.Nil(err) + require.EqualValues(1002, index) + require.False(watchFired(ws)) +} - if watchFired(ws) { - t.Fatalf("bad") +func TestStateStore_UpdateNodeDrain_Node(t *testing.T) { + require := require.New(t) + state := testStateStore(t) + node := mock.Node() + + require.Nil(state.UpsertNode(1000, node)) + + // Create a watchset so we can test that update node drain fires the watch + ws := memdb.NewWatchSet() + _, err := state.NodeByID(ws, node.ID) + require.Nil(err) + + expectedDrain := &structs.DrainStrategy{ + DrainSpec: structs.DrainSpec{ + Deadline: -1 * time.Second, + }, } + + require.Nil(state.UpdateNodeDrain(1001, node.ID, expectedDrain, false)) + require.True(watchFired(ws)) + + ws = memdb.NewWatchSet() + out, err := state.NodeByID(ws, node.ID) + require.Nil(err) + require.True(out.Drain) + require.NotNil(out.DrainStrategy) + require.Equal(out.DrainStrategy, expectedDrain) + require.EqualValues(1001, out.ModifyIndex) + + index, err := state.Index("nodes") + require.Nil(err) + require.EqualValues(1001, index) + require.False(watchFired(ws)) } func TestStateStore_AddSingleNodeEvent(t *testing.T) { @@ -837,6 +869,89 @@ func TestStateStore_NodeEvents_RetentionWindow(t *testing.T) { require.Equal(uint64(20), out.Events[len(out.Events)-1].CreateIndex) } +func TestStateStore_UpdateNodeDrain_ResetEligiblity(t *testing.T) { + require := require.New(t) + state := testStateStore(t) + node := mock.Node() + require.Nil(state.UpsertNode(1000, node)) + + // Create a watchset so we can test that update node drain fires the watch + ws := memdb.NewWatchSet() + _, err := state.NodeByID(ws, node.ID) + require.Nil(err) + + drain := &structs.DrainStrategy{ + DrainSpec: structs.DrainSpec{ + Deadline: -1 * time.Second, + }, + } + + require.Nil(state.UpdateNodeDrain(1001, node.ID, drain, false)) + require.True(watchFired(ws)) + + // Remove the drain + require.Nil(state.UpdateNodeDrain(1002, node.ID, nil, true)) + + ws = memdb.NewWatchSet() + out, err := state.NodeByID(ws, node.ID) + require.Nil(err) + require.False(out.Drain) + require.Nil(out.DrainStrategy) + require.Equal(out.SchedulingEligibility, structs.NodeSchedulingEligible) + require.EqualValues(1002, out.ModifyIndex) + + index, err := state.Index("nodes") + require.Nil(err) + require.EqualValues(1002, index) + require.False(watchFired(ws)) +} + +func TestStateStore_UpdateNodeEligibility(t *testing.T) { + require := require.New(t) + state := testStateStore(t) + node := mock.Node() + + err := state.UpsertNode(1000, node) + if err != nil { + t.Fatalf("err: %v", err) + } + + expectedEligibility := structs.NodeSchedulingIneligible + + // Create a watchset so we can test that update node 
drain fires the watch + ws := memdb.NewWatchSet() + if _, err := state.NodeByID(ws, node.ID); err != nil { + t.Fatalf("bad: %v", err) + } + + require.Nil(state.UpdateNodeEligibility(1001, node.ID, expectedEligibility)) + require.True(watchFired(ws)) + + ws = memdb.NewWatchSet() + out, err := state.NodeByID(ws, node.ID) + require.Nil(err) + require.Equal(out.SchedulingEligibility, expectedEligibility) + require.EqualValues(1001, out.ModifyIndex) + + index, err := state.Index("nodes") + require.Nil(err) + require.EqualValues(1001, index) + require.False(watchFired(ws)) + + // Set a drain strategy + expectedDrain := &structs.DrainStrategy{ + DrainSpec: structs.DrainSpec{ + Deadline: -1 * time.Second, + }, + } + require.Nil(state.UpdateNodeDrain(1002, node.ID, expectedDrain, false)) + + // Try to set the node to eligible + err = state.UpdateNodeEligibility(1003, node.ID, structs.NodeSchedulingEligible) + require.NotNil(err) + require.Contains(err.Error(), "while it is draining") +} + func TestStateStore_Nodes(t *testing.T) { state := testStateStore(t) var nodes []*structs.Node @@ -3823,6 +3938,69 @@ func TestStateStore_UpdateAlloc_NoJob(t *testing.T) { } } +func TestStateStore_UpdateAllocDesiredTransition(t *testing.T) { + t.Parallel() + require := require.New(t) + + state := testStateStore(t) + alloc := mock.Alloc() + + require.Nil(state.UpsertJob(999, alloc.Job)) + require.Nil(state.UpsertAllocs(1000, []*structs.Allocation{alloc})) + + t1 := &structs.DesiredTransition{ + Migrate: helper.BoolToPtr(true), + } + t2 := &structs.DesiredTransition{ + Migrate: helper.BoolToPtr(false), + } + eval := &structs.Evaluation{ + ID: uuid.Generate(), + Namespace: alloc.Namespace, + Priority: alloc.Job.Priority, + Type: alloc.Job.Type, + TriggeredBy: structs.EvalTriggerNodeDrain, + JobID: alloc.Job.ID, + JobModifyIndex: alloc.Job.ModifyIndex, + Status: structs.EvalStatusPending, + } + evals := []*structs.Evaluation{eval} + + m := map[string]*structs.DesiredTransition{alloc.ID: t1} + require.Nil(state.UpdateAllocsDesiredTransitions(1001, m, evals)) + + ws := memdb.NewWatchSet() + out, err := state.AllocByID(ws, alloc.ID) + require.Nil(err) + require.NotNil(out.DesiredTransition.Migrate) + require.True(*out.DesiredTransition.Migrate) + require.EqualValues(1000, out.CreateIndex) + require.EqualValues(1001, out.ModifyIndex) + + index, err := state.Index("allocs") + require.Nil(err) + require.EqualValues(1001, index) + + m = map[string]*structs.DesiredTransition{alloc.ID: t2} + require.Nil(state.UpdateAllocsDesiredTransitions(1002, m, evals)) + + ws = memdb.NewWatchSet() + out, err = state.AllocByID(ws, alloc.ID) + require.Nil(err) + require.NotNil(out.DesiredTransition.Migrate) + require.False(*out.DesiredTransition.Migrate) + require.EqualValues(1000, out.CreateIndex) + require.EqualValues(1002, out.ModifyIndex) + + index, err = state.Index("allocs") + require.Nil(err) + require.EqualValues(1002, index) + + // Try with a bogus alloc id + m = map[string]*structs.DesiredTransition{uuid.Generate(): t2} + require.Nil(state.UpdateAllocsDesiredTransitions(1003, m, evals)) +} + func TestStateStore_JobSummary(t *testing.T) { state := testStateStore(t) diff --git a/nomad/state/testing.go b/nomad/state/testing.go index 69509714d179..ee7dce1d6c7f 100644 --- a/nomad/state/testing.go +++ b/nomad/state/testing.go @@ -1,14 +1,13 @@ package state import ( - "os" - + "github.com/hashicorp/nomad/helper/testlog" "github.com/mitchellh/go-testing-interface" ) func TestStateStore(t testing.T) *StateStore { config := 
&StateStoreConfig{ - LogOutput: os.Stderr, + LogOutput: testlog.NewWriter(t), Region: "global", } state, err := NewStateStore(config) diff --git a/nomad/structs/structs.go b/nomad/structs/structs.go index 54c89fb95e20..5f455e58d3e9 100644 --- a/nomad/structs/structs.go +++ b/nomad/structs/structs.go @@ -78,6 +78,9 @@ const ( AutopilotRequestType UpsertNodeEventsType JobBatchDeregisterRequestType + AllocUpdateDesiredTransitionRequestType + NodeUpdateEligibilityRequestType + BatchNodeUpdateDrainRequestType ) const ( @@ -153,6 +156,10 @@ type NamespacedID struct { Namespace string } +func (n NamespacedID) String() string { + return fmt.Sprintf("", n.Namespace, n.ID) +} + // RPCInfo is used to describe common information about query type RPCInfo interface { RequestRegion() string @@ -301,10 +308,38 @@ type NodeUpdateStatusRequest struct { WriteRequest } -// NodeUpdateDrainRequest is used for updating the drain status +// NodeUpdateDrainRequest is used for updating the drain strategy type NodeUpdateDrainRequest struct { - NodeID string - Drain bool + NodeID string + Drain bool // TODO Deprecate + DrainStrategy *DrainStrategy + + // MarkEligible marks the node as eligible if removing the drain strategy. + MarkEligible bool + WriteRequest +} + +// BatchNodeUpdateDrainRequest is used for updating the drain strategy for a +// batch of nodes +type BatchNodeUpdateDrainRequest struct { + // Updates is a mapping of nodes to their updated drain strategy + Updates map[string]*DrainUpdate + WriteRequest +} + +// DrainUpdate is used to update the drain of a node +type DrainUpdate struct { + // DrainStrategy is the new strategy for the node + DrainStrategy *DrainStrategy + + // MarkEligible marks the node as eligible if removing the drain strategy. + MarkEligible bool +} + +// NodeUpdateEligibilityRequest is used for updating the scheduling eligibility +type NodeUpdateEligibilityRequest struct { + NodeID string + Eligibility string WriteRequest } @@ -573,6 +608,19 @@ type AllocUpdateRequest struct { WriteRequest } +// AllocUpdateDesiredTransitionRequest is used to submit changes to allocations +// desired transition state. +type AllocUpdateDesiredTransitionRequest struct { + // Allocs is the mapping of allocation ids to their desired state + // transition + Allocs map[string]*DesiredTransition + + // Evals is the set of evaluations to create + Evals []*Evaluation + + WriteRequest +} + // AllocListRequest is used to request a list of allocations type AllocListRequest struct { QueryOptions @@ -857,10 +905,13 @@ type NodeUpdateResponse struct { // NodeDrainUpdateResponse is used to respond to a node drain update type NodeDrainUpdateResponse struct { - EvalIDs []string - EvalCreateIndex uint64 NodeModifyIndex uint64 QueryMeta + + // Deprecated in Nomad 0.8 as an evaluation is not immediately created but + // is instead handled by the drainer. + EvalIDs []string + EvalCreateIndex uint64 } // NodeAllocsResponse is used to return allocs for a single node @@ -1163,6 +1214,88 @@ func ValidNodeStatus(status string) bool { } } +const ( + // NodeSchedulingEligible and Ineligible marks the node as eligible or not, + // respectively, for receiving allocations. This is orthoginal to the node + // status being ready. + NodeSchedulingEligible = "eligible" + NodeSchedulingIneligible = "ineligible" +) + +// DrainSpec describes a Node's desired drain behavior. +type DrainSpec struct { + // Deadline is the duration after StartTime when the remaining + // allocations on a draining Node should be told to stop. 
+ Deadline time.Duration + + // IgnoreSystemJobs allows systems jobs to remain on the node even though it + // has been marked for draining. + IgnoreSystemJobs bool +} + +// DrainStrategy describes a Node's drain behavior. +type DrainStrategy struct { + // DrainSpec is the user declared drain specification + DrainSpec + + // ForceDeadline is the deadline time for the drain after which drains will + // be forced + ForceDeadline time.Time +} + +func (d *DrainStrategy) Copy() *DrainStrategy { + if d == nil { + return nil + } + + nd := new(DrainStrategy) + *nd = *d + return nd +} + +// DeadlineTime returns a boolean whether the drain strategy allows an infinite +// duration or otherwise the deadline time. The force drain is captured by the +// deadline time being in the past. +func (d *DrainStrategy) DeadlineTime() (infinite bool, deadline time.Time) { + // Treat the nil case as a force drain so during an upgrade where a node may + // not have a drain strategy but has Drain set to true, it is treated as a + // force to mimick old behavior. + if d == nil { + return false, time.Time{} + } + + ns := d.Deadline.Nanoseconds() + switch { + case ns < 0: // Force + return false, time.Time{} + case ns == 0: // Infinite + return true, time.Time{} + default: + return false, d.ForceDeadline + } +} + +func (d *DrainStrategy) Equal(o *DrainStrategy) bool { + if d == nil && o == nil { + return true + } else if o != nil && d == nil { + return false + } else if d != nil && o == nil { + return false + } + + // Compare values + if d.ForceDeadline != o.ForceDeadline { + return false + } else if d.Deadline != o.Deadline { + return false + } else if d.IgnoreSystemJobs != o.IgnoreSystemJobs { + return false + } + + return true +} + // Node is a representation of a schedulable client node type Node struct { // ID is a unique identifier for the node. It can be constructed @@ -1222,11 +1355,21 @@ type Node struct { // attributes and capabilities. ComputedClass string + // COMPAT: Remove in Nomad 0.9 // Drain is controlled by the servers, and not the client. // If true, no jobs will be scheduled to this node, and existing - // allocations will be drained. + // allocations will be drained. Superceded by DrainStrategy in Nomad + // 0.8 but kept for backward compat. Drain bool + // DrainStrategy determines the node's draining behavior. Will be nil + // when Drain=false. + DrainStrategy *DrainStrategy + + // SchedulingEligibility determines whether this node will receive new + // placements. 
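// Sketch for orientation, assuming only the DrainSpec/DrainStrategy definitions
// above (this function is not part of the patch): the three Deadline cases that
// DeadlineTime distinguishes.
func exampleDeadlineTime(now time.Time) {
	force := &DrainStrategy{DrainSpec: DrainSpec{Deadline: -1 * time.Second}}
	noDeadline := &DrainStrategy{DrainSpec: DrainSpec{Deadline: 0}}
	timed := &DrainStrategy{
		DrainSpec: DrainSpec{Deadline: 10 * time.Second},
		// ForceDeadline is stamped by Node.UpdateDrain when Deadline > 0.
		ForceDeadline: now.Add(10 * time.Second),
	}

	// force:      infinite=false, zero deadline => remaining allocs stopped immediately
	// noDeadline: infinite=true,  zero deadline => drain with no time limit
	// timed:      infinite=false, deadline=now+10s
	_, _ = force.DeadlineTime()
	_, _ = noDeadline.DeadlineTime()
	_, _ = timed.DeadlineTime()
}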
+ SchedulingEligibility string + // Status of this node Status string @@ -1249,9 +1392,10 @@ type Node struct { ModifyIndex uint64 } -// Ready returns if the node is ready for running allocations +// Ready returns true if the node is ready for running allocations func (n *Node) Ready() bool { - return n.Status == NodeStatusReady && !n.Drain + // Drain is checked directly to support pre-0.8 Node data + return n.Status == NodeStatusReady && !n.Drain && n.SchedulingEligibility == NodeSchedulingEligible } func (n *Node) Copy() *Node { @@ -1266,6 +1410,7 @@ func (n *Node) Copy() *Node { nn.Links = helper.CopyMapStringString(nn.Links) nn.Meta = helper.CopyMapStringString(nn.Meta) nn.Events = copyNodeEvents(n.Events) + nn.DrainStrategy = nn.DrainStrategy.Copy() return nn } @@ -1300,34 +1445,36 @@ func (n *Node) Stub() *NodeListStub { addr, _, _ := net.SplitHostPort(n.HTTPAddr) return &NodeListStub{ - Address: addr, - ID: n.ID, - Datacenter: n.Datacenter, - Name: n.Name, - NodeClass: n.NodeClass, - Version: n.Attributes["nomad.version"], - Drain: n.Drain, - Status: n.Status, - StatusDescription: n.StatusDescription, - CreateIndex: n.CreateIndex, - ModifyIndex: n.ModifyIndex, + Address: addr, + ID: n.ID, + Datacenter: n.Datacenter, + Name: n.Name, + NodeClass: n.NodeClass, + Version: n.Attributes["nomad.version"], + Drain: n.Drain, + SchedulingEligibility: n.SchedulingEligibility, + Status: n.Status, + StatusDescription: n.StatusDescription, + CreateIndex: n.CreateIndex, + ModifyIndex: n.ModifyIndex, } } // NodeListStub is used to return a subset of job information // for the job list type NodeListStub struct { - Address string - ID string - Datacenter string - Name string - NodeClass string - Version string - Drain bool - Status string - StatusDescription string - CreateIndex uint64 - ModifyIndex uint64 + Address string + ID string + Datacenter string + Name string + NodeClass string + Version string + Drain bool + SchedulingEligibility string + Status string + StatusDescription string + CreateIndex uint64 + ModifyIndex uint64 } // Networks defined for a task on the Resources struct. @@ -2898,6 +3045,64 @@ func NewReschedulePolicy(jobType string) *ReschedulePolicy { return nil } +const ( + MigrateStrategyHealthChecks = "checks" + MigrateStrategyHealthStates = "task_states" +) + +type MigrateStrategy struct { + MaxParallel int + HealthCheck string + MinHealthyTime time.Duration + HealthyDeadline time.Duration +} + +// DefaultMigrateStrategy is used for backwards compat with pre-0.8 Allocations +// that lack an update strategy. 
+// +// This function should match its counterpart in api/tasks.go +func DefaultMigrateStrategy() *MigrateStrategy { + return &MigrateStrategy{ + MaxParallel: 1, + HealthCheck: MigrateStrategyHealthChecks, + MinHealthyTime: 10 * time.Second, + HealthyDeadline: 5 * time.Minute, + } +} + +func (m *MigrateStrategy) Validate() error { + var mErr multierror.Error + + if m.MaxParallel < 0 { + multierror.Append(&mErr, fmt.Errorf("MaxParallel must be >= 0 but found %d", m.MaxParallel)) + } + + switch m.HealthCheck { + case MigrateStrategyHealthChecks, MigrateStrategyHealthStates: + // ok + case "": + if m.MaxParallel > 0 { + multierror.Append(&mErr, fmt.Errorf("Missing HealthCheck")) + } + default: + multierror.Append(&mErr, fmt.Errorf("Invalid HealthCheck: %q", m.HealthCheck)) + } + + if m.MinHealthyTime < 0 { + multierror.Append(&mErr, fmt.Errorf("MinHealthyTime is %s and must be >= 0", m.MinHealthyTime)) + } + + if m.HealthyDeadline < 0 { + multierror.Append(&mErr, fmt.Errorf("HealthyDeadline is %s and must be >= 0", m.HealthyDeadline)) + } + + if m.MinHealthyTime > m.HealthyDeadline { + multierror.Append(&mErr, fmt.Errorf("MinHealthyTime must be less than HealthyDeadline")) + } + + return mErr.ErrorOrNil() +} + // TaskGroup is an atomic unit of placement. Each task group belongs to // a job and may contain any number of tasks. A task group support running // in many replicas using the same configuration.. @@ -2912,6 +3117,9 @@ type TaskGroup struct { // Update is used to control the update strategy for this task group Update *UpdateStrategy + // Migrate is used to control the migration strategy for this task group + Migrate *MigrateStrategy + // Constraints can be specified at a task group level and apply to // all the tasks contained. Constraints []*Constraint @@ -3059,6 +3267,20 @@ func (tg *TaskGroup) Validate(j *Job) error { } } + // Validate the migration strategy + switch j.Type { + case JobTypeService: + if tg.Migrate != nil { + if err := tg.Migrate.Validate(); err != nil { + mErr.Errors = append(mErr.Errors, err) + } + } + default: + if tg.Migrate != nil { + mErr.Errors = append(mErr.Errors, fmt.Errorf("Job type %q does not allow migrate block", j.Type)) + } + } + // Check for duplicate tasks, that there is only leader task if any, // and no duplicated static ports tasks := make(map[string]int) @@ -5211,6 +5433,28 @@ func (re *RescheduleEvent) Copy() *RescheduleEvent { return copy } +// DesiredTransition is used to mark an allocation as having a desired state +// transition. This information can be used by the scheduler to make the +// correct decision. +type DesiredTransition struct { + // Migrate is used to indicate that this allocation should be stopped and + // migrated to another node. + Migrate *bool +} + +// Merge merges the two desired transitions, preferring the values from the +// passed in object. +func (d *DesiredTransition) Merge(o *DesiredTransition) { + if o.Migrate != nil { + d.Migrate = o.Migrate + } +} + +// ShouldMigrate returns whether the transition object dictates a migration. +func (d *DesiredTransition) ShouldMigrate() bool { + return d.Migrate != nil && *d.Migrate +} + const ( AllocDesiredStatusRun = "run" // Allocation should run AllocDesiredStatusStop = "stop" // Allocation should stop @@ -5272,6 +5516,10 @@ type Allocation struct { // DesiredStatusDescription is meant to provide more human useful information DesiredDescription string + // DesiredTransition is used to indicate that a state transition + // is desired for a given reason. 
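+ // Migrate is currently the only transition carried here; the scheduler + // consults it via ShouldMigrate in filterByTainted and diffAllocs (see the + // scheduler changes below) when deciding which allocations to move off a + // draining node.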
+ DesiredTransition DesiredTransition + // Status of the allocation on the client ClientStatus string @@ -5837,6 +6085,7 @@ const ( EvalTriggerJobRegister = "job-register" EvalTriggerJobDeregister = "job-deregister" EvalTriggerPeriodicJob = "periodic-job" + EvalTriggerNodeDrain = "node-drain" EvalTriggerNodeUpdate = "node-update" EvalTriggerScheduled = "scheduled" EvalTriggerRollingUpdate = "rolling-update" @@ -6770,3 +7019,45 @@ type ACLTokenUpsertResponse struct { Tokens []*ACLToken WriteMeta } + +// BatchFuture is used to wait on a batch update to complete +type BatchFuture struct { + doneCh chan struct{} + err error + index uint64 +} + +// NewBatchFuture creates a new batch future +func NewBatchFuture() *BatchFuture { + return &BatchFuture{ + doneCh: make(chan struct{}), + } +} + +// Wait is used to block for the future to complete and returns the error +func (b *BatchFuture) Wait() error { + <-b.doneCh + return b.err +} + +// WaitCh is used to block for the future to complete +func (b *BatchFuture) WaitCh() <-chan struct{} { + return b.doneCh +} + +// Error is used to return the error of the batch, only after Wait() +func (b *BatchFuture) Error() error { + return b.err +} + +// Index is used to return the index of the batch, only after Wait() +func (b *BatchFuture) Index() uint64 { + return b.index +} + +// Respond is used to unblock the future +func (b *BatchFuture) Respond(index uint64, err error) { + b.index = index + b.err = err + close(b.doneCh) +} diff --git a/nomad/structs/structs_test.go b/nomad/structs/structs_test.go index f3cbe3d055e0..9df3d9e4f706 100644 --- a/nomad/structs/structs_test.go +++ b/nomad/structs/structs_test.go @@ -3597,3 +3597,31 @@ func TestNetworkResourcesEquals(t *testing.T) { require.Equal(testCase.expected, first.Equals(second), testCase.errorMsg) } } + +func TestBatchFuture(t *testing.T) { + t.Parallel() + bf := NewBatchFuture() + + // Async respond to the future + expect := fmt.Errorf("testing") + go func() { + time.Sleep(10 * time.Millisecond) + bf.Respond(1000, expect) + }() + + // Block for the result + start := time.Now() + err := bf.Wait() + diff := time.Since(start) + if diff < 5*time.Millisecond { + t.Fatalf("too fast") + } + + // Check the results + if err != expect { + t.Fatalf("bad: %s", err) + } + if bf.Index() != 1000 { + t.Fatalf("bad: %d", bf.Index()) + } +} diff --git a/nomad/worker.go b/nomad/worker.go index 209d0b2938f8..6908188fbaf2 100644 --- a/nomad/worker.go +++ b/nomad/worker.go @@ -327,7 +327,7 @@ SUBMIT: } return nil, nil, err } else { - w.logger.Printf("[DEBUG] worker: submitted plan for evaluation %s", plan.EvalID) + w.logger.Printf("[DEBUG] worker: submitted plan at index %d for evaluation %s", resp.Index, plan.EvalID) w.backoffReset() } diff --git a/scheduler/generic_sched.go b/scheduler/generic_sched.go index 94dbc8a4b60f..6b812740ce9b 100644 --- a/scheduler/generic_sched.go +++ b/scheduler/generic_sched.go @@ -76,10 +76,7 @@ type GenericScheduler struct { ctx *EvalContext stack *GenericStack - // Deprecated, was used in pre Nomad 0.7 rolling update stanza and in node draining prior to Nomad 0.8 - followupEvalWait time.Duration - nextEval *structs.Evaluation - followUpEvals []*structs.Evaluation + followUpEvals []*structs.Evaluation deployment *structs.Deployment @@ -117,14 +114,15 @@ func (s *GenericScheduler) Process(eval *structs.Evaluation) error { // Verify the evaluation trigger reason is understood switch eval.TriggeredBy { - case structs.EvalTriggerJobRegister, structs.EvalTriggerNodeUpdate, - 
structs.EvalTriggerJobDeregister, structs.EvalTriggerRollingUpdate, + case structs.EvalTriggerJobRegister, structs.EvalTriggerJobDeregister, + structs.EvalTriggerNodeDrain, structs.EvalTriggerNodeUpdate, + structs.EvalTriggerRollingUpdate, structs.EvalTriggerPeriodicJob, structs.EvalTriggerMaxPlans, structs.EvalTriggerDeploymentWatcher, structs.EvalTriggerRetryFailedAlloc: default: desc := fmt.Sprintf("scheduler cannot handle '%s' evaluation reason", eval.TriggeredBy) - return setStatus(s.logger, s.planner, s.eval, s.nextEval, s.blocked, + return setStatus(s.logger, s.planner, s.eval, nil, s.blocked, s.failedTGAllocs, structs.EvalStatusFailed, desc, s.queuedAllocs, s.deployment.GetID()) } @@ -143,7 +141,7 @@ func (s *GenericScheduler) Process(eval *structs.Evaluation) error { if err := s.createBlockedEval(true); err != nil { mErr.Errors = append(mErr.Errors, err) } - if err := setStatus(s.logger, s.planner, s.eval, s.nextEval, s.blocked, + if err := setStatus(s.logger, s.planner, s.eval, nil, s.blocked, s.failedTGAllocs, statusErr.EvalStatus, err.Error(), s.queuedAllocs, s.deployment.GetID()); err != nil { mErr.Errors = append(mErr.Errors, err) @@ -165,7 +163,7 @@ func (s *GenericScheduler) Process(eval *structs.Evaluation) error { } // Update the status to complete - return setStatus(s.logger, s.planner, s.eval, s.nextEval, s.blocked, + return setStatus(s.logger, s.planner, s.eval, nil, s.blocked, s.failedTGAllocs, structs.EvalStatusComplete, "", s.queuedAllocs, s.deployment.GetID()) } @@ -258,16 +256,6 @@ func (s *GenericScheduler) process() (bool, error) { return true, nil } - // If we need a followup eval and we haven't created one, do so. - if s.followupEvalWait != 0 && s.nextEval == nil { - s.nextEval = s.eval.NextRollingEval(s.followupEvalWait) - if err := s.planner.CreateEval(s.nextEval); err != nil { - s.logger.Printf("[ERR] sched: %#v failed to make next eval for rolling migration: %v", s.eval, err) - return false, err - } - s.logger.Printf("[DEBUG] sched: %#v: rolling migration limit reached, next eval '%s' created", s.eval, s.nextEval.ID) - } - // Create follow up evals for any delayed reschedule eligible allocations if len(s.followUpEvals) > 0 { for _, eval := range s.followUpEvals { @@ -352,16 +340,13 @@ func (s *GenericScheduler) computeJobAllocs() error { s.plan.Deployment = results.deployment s.plan.DeploymentUpdates = results.deploymentUpdates - // Store the the follow up eval wait duration. If set this will trigger a - // follow up eval to handle node draining. - s.followupEvalWait = results.followupEvalWait - // Store all the follow up evaluations from rescheduled allocations if len(results.desiredFollowupEvals) > 0 { for _, evals := range results.desiredFollowupEvals { s.followUpEvals = append(s.followUpEvals, evals...) } } + // Update the stored deployment if results.deployment != nil { s.deployment = results.deployment diff --git a/scheduler/generic_sched_test.go b/scheduler/generic_sched_test.go index 5b21034eb9cb..fd677f952db3 100644 --- a/scheduler/generic_sched_test.go +++ b/scheduler/generic_sched_test.go @@ -2211,6 +2211,7 @@ func TestServiceSched_NodeDown(t *testing.T) { // Register a node node := mock.Node() + node.Status = structs.NodeStatusDown noErr(t, h.State.UpsertNode(h.NextIndex(), node)) // Generate a fake job with allocations and an update policy. 
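For orientation, the drain-related test updates in this file share a common shape: allocations to be moved are flagged with DesiredTransition.Migrate, and the scheduler is driven by an evaluation it now accepts for drains. The following is a minimal sketch, not part of this diff, built only from helpers that appear in the surrounding tests; the test name and the choice of the node-drain trigger are illustrative.

func TestServiceSched_NodeDrain_Sketch(t *testing.T) {
	h := NewHarness(t)

	// One draining node plus a spare node to receive the migrated work
	node := mock.Node()
	node.Drain = true
	noErr(t, h.State.UpsertNode(h.NextIndex(), node))
	noErr(t, h.State.UpsertNode(h.NextIndex(), mock.Node()))

	// An allocation on the draining node, explicitly flagged for migration
	alloc := mock.Alloc()
	alloc.NodeID = node.ID
	alloc.DesiredTransition.Migrate = helper.BoolToPtr(true)
	noErr(t, h.State.UpsertJob(h.NextIndex(), alloc.Job))
	noErr(t, h.State.UpsertAllocs(h.NextIndex(), []*structs.Allocation{alloc}))

	// A drain evaluation; node-drain is now an accepted trigger for the
	// generic scheduler
	eval := &structs.Evaluation{
		Namespace:   structs.DefaultNamespace,
		ID:          uuid.Generate(),
		Priority:    50,
		TriggeredBy: structs.EvalTriggerNodeDrain,
		JobID:       alloc.JobID,
		NodeID:      node.ID,
		Status:      structs.EvalStatusPending,
	}
	noErr(t, h.State.UpsertEvals(h.NextIndex(), []*structs.Evaluation{eval}))

	// Process the evaluation; the reconciler should stop the flagged
	// allocation and place a replacement on the spare node
	noErr(t, h.Process(NewServiceScheduler, eval))
}

The explicit Migrate flag is what lets the removed stagger/followupEvalWait machinery go away: instead of the reconciler inferring migrations from node drain state and rate-limiting them with follow-up evals, the drain logic marks each allocation and filterByTainted simply migrates whatever is flagged.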
@@ -2235,18 +2236,19 @@ func TestServiceSched_NodeDown(t *testing.T) { allocs[9].DesiredStatus = structs.AllocDesiredStatusRun allocs[9].ClientStatus = structs.AllocClientStatusComplete - noErr(t, h.State.UpsertAllocs(h.NextIndex(), allocs)) - // Mark some allocs as running - ws := memdb.NewWatchSet() for i := 0; i < 4; i++ { - out, _ := h.State.AllocByID(ws, allocs[i].ID) + out := allocs[i] out.ClientStatus = structs.AllocClientStatusRunning - noErr(t, h.State.UpdateAllocsFromClient(h.NextIndex(), []*structs.Allocation{out})) } - // Mark the node as down - noErr(t, h.State.UpdateNodeStatus(h.NextIndex(), node.ID, structs.NodeStatusDown)) + // Mark appropriate allocs for migration + for i := 0; i < 7; i++ { + out := allocs[i] + out.DesiredTransition.Migrate = helper.BoolToPtr(true) + } + + noErr(t, h.State.UpsertAllocs(h.NextIndex(), allocs)) // Create a mock evaluation to deal with drain eval := &structs.Evaluation{ @@ -2365,6 +2367,7 @@ func TestServiceSched_NodeDrain(t *testing.T) { alloc.JobID = job.ID alloc.NodeID = node.ID alloc.Name = fmt.Sprintf("my-job.web[%d]", i) + alloc.DesiredTransition.Migrate = helper.BoolToPtr(true) allocs = append(allocs, alloc) } noErr(t, h.State.UpsertAllocs(h.NextIndex(), allocs)) @@ -2447,9 +2450,10 @@ func TestServiceSched_NodeDrain_Down(t *testing.T) { // Set the desired state of the allocs to stop var stop []*structs.Allocation - for i := 0; i < 10; i++ { + for i := 0; i < 6; i++ { newAlloc := allocs[i].Copy() newAlloc.ClientStatus = structs.AllocDesiredStatusStop + newAlloc.DesiredTransition.Migrate = helper.BoolToPtr(true) stop = append(stop, newAlloc) } noErr(t, h.State.UpsertAllocs(h.NextIndex(), stop)) @@ -2466,7 +2470,7 @@ func TestServiceSched_NodeDrain_Down(t *testing.T) { // Mark some of the allocations as complete var complete []*structs.Allocation for i := 6; i < 10; i++ { - newAlloc := stop[i].Copy() + newAlloc := allocs[i].Copy() newAlloc.TaskStates = make(map[string]*structs.TaskState) newAlloc.TaskStates["web"] = &structs.TaskState{ State: structs.TaskStateDead, @@ -2552,6 +2556,7 @@ func TestServiceSched_NodeDrain_Queued_Allocations(t *testing.T) { alloc.JobID = job.ID alloc.NodeID = node.ID alloc.Name = fmt.Sprintf("my-job.web[%d]", i) + alloc.DesiredTransition.Migrate = helper.BoolToPtr(true) allocs = append(allocs, alloc) } noErr(t, h.State.UpsertAllocs(h.NextIndex(), allocs)) @@ -2583,88 +2588,6 @@ func TestServiceSched_NodeDrain_Queued_Allocations(t *testing.T) { } } -func TestServiceSched_NodeDrain_UpdateStrategy(t *testing.T) { - h := NewHarness(t) - - // Register a draining node - node := mock.Node() - node.Drain = true - noErr(t, h.State.UpsertNode(h.NextIndex(), node)) - - // Create some nodes - for i := 0; i < 10; i++ { - node := mock.Node() - noErr(t, h.State.UpsertNode(h.NextIndex(), node)) - } - - // Generate a fake job with allocations and an update policy. 
- job := mock.Job() - mp := 5 - u := structs.DefaultUpdateStrategy.Copy() - u.MaxParallel = mp - u.Stagger = time.Second - job.TaskGroups[0].Update = u - - noErr(t, h.State.UpsertJob(h.NextIndex(), job)) - - var allocs []*structs.Allocation - for i := 0; i < 10; i++ { - alloc := mock.Alloc() - alloc.Job = job - alloc.JobID = job.ID - alloc.NodeID = node.ID - alloc.Name = fmt.Sprintf("my-job.web[%d]", i) - allocs = append(allocs, alloc) - } - noErr(t, h.State.UpsertAllocs(h.NextIndex(), allocs)) - - // Create a mock evaluation to deal with drain - eval := &structs.Evaluation{ - Namespace: structs.DefaultNamespace, - ID: uuid.Generate(), - Priority: 50, - TriggeredBy: structs.EvalTriggerNodeUpdate, - JobID: job.ID, - NodeID: node.ID, - Status: structs.EvalStatusPending, - } - noErr(t, h.State.UpsertEvals(h.NextIndex(), []*structs.Evaluation{eval})) - - // Process the evaluation - err := h.Process(NewServiceScheduler, eval) - if err != nil { - t.Fatalf("err: %v", err) - } - - // Ensure a single plan - if len(h.Plans) != 1 { - t.Fatalf("bad: %#v", h.Plans) - } - plan := h.Plans[0] - - // Ensure the plan evicted all allocs - if len(plan.NodeUpdate[node.ID]) != mp { - t.Fatalf("bad: %#v", plan) - } - - // Ensure the plan allocated - var planned []*structs.Allocation - for _, allocList := range plan.NodeAllocation { - planned = append(planned, allocList...) - } - if len(planned) != mp { - t.Fatalf("bad: %#v", plan) - } - - // Ensure there is a followup eval. - if len(h.CreateEvals) != 1 || - h.CreateEvals[0].TriggeredBy != structs.EvalTriggerRollingUpdate { - t.Fatalf("bad: %#v", h.CreateEvals) - } - - h.AssertEvalStatus(t, structs.EvalStatusComplete) -} - func TestServiceSched_RetryLimit(t *testing.T) { h := NewHarness(t) h.Planner = &RejectPlan{h} @@ -3755,6 +3678,7 @@ func TestBatchSched_NodeDrain_Running_OldJob(t *testing.T) { // Create an update job job2 := job.Copy() job2.TaskGroups[0].Tasks[0].Env = map[string]string{"foo": "bar"} + job2.Version++ noErr(t, h.State.UpsertJob(h.NextIndex(), job2)) // Create a mock evaluation to register the job @@ -4021,10 +3945,10 @@ func TestServiceSched_NodeDrain_Sticky(t *testing.T) { // Create an alloc on the draining node alloc := mock.Alloc() alloc.Name = "my-job.web[0]" - alloc.DesiredStatus = structs.AllocDesiredStatusStop alloc.NodeID = node.ID alloc.Job.TaskGroups[0].Count = 1 alloc.Job.TaskGroups[0].EphemeralDisk.Sticky = true + alloc.DesiredTransition.Migrate = helper.BoolToPtr(true) noErr(t, h.State.UpsertJob(h.NextIndex(), alloc.Job)) noErr(t, h.State.UpsertAllocs(h.NextIndex(), []*structs.Allocation{alloc})) diff --git a/scheduler/reconcile.go b/scheduler/reconcile.go index 3bfd1a89e14d..b7b936defdca 100644 --- a/scheduler/reconcile.go +++ b/scheduler/reconcile.go @@ -99,11 +99,6 @@ type reconcileResults struct { // task group. desiredTGUpdates map[string]*structs.DesiredUpdates - // followupEvalWait is set if there should be a followup eval run after the - // given duration - // Deprecated, the delay strategy that sets this is not available after nomad 0.7.0 - followupEvalWait time.Duration - // desiredFollowupEvals is the map of follow up evaluations to create per task group // This is used to create a delayed evaluation for rescheduling failed allocations. 
desiredFollowupEvals map[string][]*structs.Evaluation @@ -131,9 +126,6 @@ func (r *reconcileResults) GoString() string { base += fmt.Sprintf("\nDeployment Update for ID %q: Status %q; Description %q", u.DeploymentID, u.Status, u.StatusDescription) } - if r.followupEvalWait != 0 { - base += fmt.Sprintf("\nFollowup Eval in %v", r.followupEvalWait) - } for tg, u := range r.desiredTGUpdates { base += fmt.Sprintf("\nDesired Changes for %q: %#v", tg, u) } @@ -444,7 +436,6 @@ func (a *allocReconciler) computeGroup(group string, all allocSet) bool { if deploymentPlaceReady { // Do all destructive updates min := helper.IntMin(len(destructive), limit) - limit -= min desiredChanges.DestructiveUpdate += uint64(min) desiredChanges.Ignore += uint64(len(destructive) - min) for _, alloc := range destructive.nameOrder()[:min] { @@ -461,16 +452,12 @@ func (a *allocReconciler) computeGroup(group string, all allocSet) bool { // Calculate the allowed number of changes and set the desired changes // accordingly. - min := helper.IntMin(len(migrate), limit) if !a.deploymentFailed && !a.deploymentPaused { - desiredChanges.Migrate += uint64(min) - desiredChanges.Ignore += uint64(len(migrate) - min) + desiredChanges.Migrate += uint64(len(migrate)) } else { desiredChanges.Stop += uint64(len(migrate)) } - followup := false - migrated := 0 for _, alloc := range migrate.nameOrder() { // If the deployment is failed or paused, don't replace it, just mark as stop. if a.deploymentFailed || a.deploymentPaused { @@ -481,12 +468,6 @@ func (a *allocReconciler) computeGroup(group string, all allocSet) bool { continue } - if migrated >= limit { - followup = true - break - } - - migrated++ a.result.stop = append(a.result.stop, allocStopResult{ alloc: alloc, statusDescription: allocMigrating, @@ -499,11 +480,6 @@ func (a *allocReconciler) computeGroup(group string, all allocSet) bool { }) } - // We need to create a followup evaluation. - if followup && strategy != nil && a.result.followupEvalWait < strategy.Stagger { - a.result.followupEvalWait = strategy.Stagger - } - // Create a new deployment if necessary if !existingDeployment && strategy != nil && dstate.DesiredTotal != 0 { // A previous group may have made the deployment already diff --git a/scheduler/reconcile_test.go b/scheduler/reconcile_test.go index 34f6eddbfa0c..604347fa5bd9 100644 --- a/scheduler/reconcile_test.go +++ b/scheduler/reconcile_test.go @@ -75,7 +75,6 @@ Update stanza Tests: √ Failed deployment cancels non-promoted task groups √ Failed deployment and updated job works √ Finished deployment gets marked as complete -√ The stagger is correctly calculated when it is applied across multiple task groups. √ Change job change while scaling up √ Update the job when all allocations from the previous job haven't been placed yet. √ Paused or failed deployment doesn't do any rescheduling of failed allocs @@ -306,7 +305,6 @@ type resultExpectation struct { inplace int stop int desiredTGUpdates map[string]*structs.DesiredUpdates - followupEvalWait time.Duration } func assertResults(t *testing.T, r *reconcileResults, exp *resultExpectation) { @@ -342,9 +340,6 @@ func assertResults(t *testing.T, r *reconcileResults, exp *resultExpectation) { if l := len(r.desiredTGUpdates); l != len(exp.desiredTGUpdates) { t.Fatalf("Expected %d task group desired tg updates annotations; got %d", len(exp.desiredTGUpdates), l) } - if r.followupEvalWait != exp.followupEvalWait { - t.Fatalf("Unexpected followup eval wait time. 
Got %v; want %v", r.followupEvalWait, exp.followupEvalWait) - } // Check the desired updates happened for group, desired := range exp.desiredTGUpdates { @@ -927,6 +922,7 @@ func TestReconciler_DrainNode(t *testing.T) { for i := 0; i < 2; i++ { n := mock.Node() n.ID = allocs[i].NodeID + allocs[i].DesiredTransition.Migrate = helper.BoolToPtr(true) n.Drain = true tainted[n.ID] = n } @@ -979,6 +975,7 @@ func TestReconciler_DrainNode_ScaleUp(t *testing.T) { for i := 0; i < 2; i++ { n := mock.Node() n.ID = allocs[i].NodeID + allocs[i].DesiredTransition.Migrate = helper.BoolToPtr(true) n.Drain = true tainted[n.ID] = n } @@ -1032,6 +1029,7 @@ func TestReconciler_DrainNode_ScaleDown(t *testing.T) { for i := 0; i < 3; i++ { n := mock.Node() n.ID = allocs[i].NodeID + allocs[i].DesiredTransition.Migrate = helper.BoolToPtr(true) n.Drain = true tainted[n.ID] = n } @@ -2213,6 +2211,7 @@ func TestReconciler_PausedOrFailedDeployment_Migrations(t *testing.T) { for i := 0; i < 3; i++ { n := mock.Node() n.ID = allocs[i].NodeID + allocs[i].DesiredTransition.Migrate = helper.BoolToPtr(true) n.Drain = true tainted[n.ID] = n } @@ -2286,6 +2285,7 @@ func TestReconciler_DrainNode_Canary(t *testing.T) { tainted := make(map[string]*structs.Node, 1) n := mock.Node() n.ID = allocs[11].NodeID + allocs[11].DesiredTransition.Migrate = helper.BoolToPtr(true) n.Drain = true tainted[n.ID] = n @@ -3025,6 +3025,7 @@ func TestReconciler_TaintedNode_RollingUpgrade(t *testing.T) { n.Status = structs.NodeStatusDown } else { n.Drain = true + allocs[2+i].DesiredTransition.Migrate = helper.BoolToPtr(true) } tainted[n.ID] = n } @@ -3037,24 +3038,23 @@ func TestReconciler_TaintedNode_RollingUpgrade(t *testing.T) { assertResults(t, r, &resultExpectation{ createDeployment: nil, deploymentUpdates: nil, - place: 2, + place: 3, destructive: 2, - stop: 2, - followupEvalWait: 31 * time.Second, + stop: 3, desiredTGUpdates: map[string]*structs.DesiredUpdates{ job.TaskGroups[0].Name: { Place: 1, // Place the lost Stop: 1, // Stop the lost - Migrate: 1, // Migrate the tainted + Migrate: 2, // Migrate the tainted DestructiveUpdate: 2, - Ignore: 6, + Ignore: 5, }, }, }) assertNamesHaveIndexes(t, intRange(8, 9), destructiveResultsToNames(r.destructiveUpdate)) - assertNamesHaveIndexes(t, intRange(0, 1), placeResultsToNames(r.place)) - assertNamesHaveIndexes(t, intRange(0, 1), stopResultsToNames(r.stop)) + assertNamesHaveIndexes(t, intRange(0, 2), placeResultsToNames(r.place)) + assertNamesHaveIndexes(t, intRange(0, 2), stopResultsToNames(r.stop)) } // Tests the reconciler handles a failed deployment and only replaces lost @@ -3110,6 +3110,7 @@ func TestReconciler_FailedDeployment_PlacementLost(t *testing.T) { n.Status = structs.NodeStatusDown } else { n.Drain = true + allocs[6+i].DesiredTransition.Migrate = helper.BoolToPtr(true) } tainted[n.ID] = n } @@ -3125,7 +3126,6 @@ func TestReconciler_FailedDeployment_PlacementLost(t *testing.T) { place: 1, // Only replace the lost node inplace: 0, stop: 2, - followupEvalWait: 0, // Since the deployment is failed, there should be no followup desiredTGUpdates: map[string]*structs.DesiredUpdates{ job.TaskGroups[0].Name: { Place: 1, @@ -3406,72 +3406,6 @@ func TestReconciler_MarkDeploymentComplete(t *testing.T) { }) } -// Tests the reconciler picks the maximum of the staggers when multiple task -// groups are under going node drains. 
-func TestReconciler_TaintedNode_MultiGroups(t *testing.T) { - // Create a job with two task groups - job := mock.Job() - job.TaskGroups[0].Update = noCanaryUpdate - job.TaskGroups = append(job.TaskGroups, job.TaskGroups[0].Copy()) - job.TaskGroups[1].Name = "two" - job.TaskGroups[1].Update.Stagger = 100 * time.Second - - // Create the allocations - var allocs []*structs.Allocation - for j := 0; j < 2; j++ { - for i := 0; i < 10; i++ { - alloc := mock.Alloc() - alloc.Job = job - alloc.JobID = job.ID - alloc.NodeID = uuid.Generate() - alloc.Name = structs.AllocName(job.ID, job.TaskGroups[j].Name, uint(i)) - alloc.TaskGroup = job.TaskGroups[j].Name - allocs = append(allocs, alloc) - } - } - - // Build a map of tainted nodes - tainted := make(map[string]*structs.Node, 15) - for i := 0; i < 15; i++ { - n := mock.Node() - n.ID = allocs[i].NodeID - n.Drain = true - tainted[n.ID] = n - } - - reconciler := NewAllocReconciler(testLogger(), allocUpdateFnIgnore, false, job.ID, job, nil, allocs, tainted) - r := reconciler.Compute() - - // Assert the correct results - assertResults(t, r, &resultExpectation{ - createDeployment: nil, - deploymentUpdates: nil, - place: 8, - inplace: 0, - stop: 8, - followupEvalWait: 100 * time.Second, - desiredTGUpdates: map[string]*structs.DesiredUpdates{ - job.TaskGroups[0].Name: { - Place: 0, - Stop: 0, - Migrate: 4, - DestructiveUpdate: 0, - Ignore: 6, - }, - job.TaskGroups[1].Name: { - Place: 0, - Stop: 0, - Migrate: 4, - DestructiveUpdate: 0, - Ignore: 6, - }, - }, - }) - - assertNamesHaveIndexes(t, intRange(0, 3, 0, 3), placeResultsToNames(r.place)) - assertNamesHaveIndexes(t, intRange(0, 3, 0, 3), stopResultsToNames(r.stop)) -} - // Tests the reconciler handles changing a job such that a deployment is created // while doing a scale up but as the second eval. func TestReconciler_JobChange_ScaleUp_SecondEval(t *testing.T) { diff --git a/scheduler/reconcile_util.go b/scheduler/reconcile_util.go index db3a5ff1e3d5..a7b0b814120f 100644 --- a/scheduler/reconcile_util.go +++ b/scheduler/reconcile_util.go @@ -199,30 +199,33 @@ func (a allocSet) filterByTainted(nodes map[string]*structs.Node) (untainted, mi migrate = make(map[string]*structs.Allocation) lost = make(map[string]*structs.Allocation) for _, alloc := range a { - n, ok := nodes[alloc.NodeID] - if !ok { + // Terminal allocs are always untainted as they should never be migrated + if alloc.TerminalStatus() { untainted[alloc.ID] = alloc continue } - // If the job is batch and finished successfully, the fact that the - // node is tainted does not mean it should be migrated or marked as - // lost as the work was already successfully finished. However for - // service/system jobs, tasks should never complete. The check of - // batch type, defends against client bugs. 
- if alloc.Job.Type == structs.JobTypeBatch && alloc.RanSuccessfully() { - untainted[alloc.ID] = alloc + // Non-terminal allocs that should migrate should always migrate + if alloc.DesiredTransition.ShouldMigrate() { + migrate[alloc.ID] = alloc continue } - if !alloc.TerminalStatus() { - if n == nil || n.TerminalStatus() { - lost[alloc.ID] = alloc - } else { - migrate[alloc.ID] = alloc - } - } else { + + n, ok := nodes[alloc.NodeID] + if !ok { + // Node is untainted so alloc is untainted untainted[alloc.ID] = alloc + continue } + + // Allocs on GC'd (nil) or lost nodes are Lost + if n == nil || n.TerminalStatus() { + lost[alloc.ID] = alloc + continue + } + + // All other allocs are untainted + untainted[alloc.ID] = alloc } return } diff --git a/scheduler/reconcile_util_test.go b/scheduler/reconcile_util_test.go index 3b45a55ed6d5..6905b26fbbd9 100644 --- a/scheduler/reconcile_util_test.go +++ b/scheduler/reconcile_util_test.go @@ -3,7 +3,9 @@ package scheduler import ( "testing" + "github.com/hashicorp/nomad/helper" "github.com/hashicorp/nomad/nomad/structs" + "github.com/stretchr/testify/require" ) // Test that we properly create the bitmap even when the alloc set includes an @@ -29,3 +31,100 @@ func TestBitmapFrom(t *testing.T) { t.Fatalf("got %d; want %d", act, exp) } } + +func TestAllocSet_filterByTainted(t *testing.T) { + require := require.New(t) + + nodes := map[string]*structs.Node{ + "draining": { + ID: "draining", + Drain: true, + }, + "lost": { + ID: "lost", + Status: structs.NodeStatusDown, + }, + "nil": nil, + "normal": { + ID: "normal", + Status: structs.NodeStatusReady, + }, + } + + batchJob := &structs.Job{ + Type: structs.JobTypeBatch, + } + + allocs := allocSet{ + // Non-terminal alloc with migrate=true should migrate on a draining node + "migrating1": { + ID: "migrating1", + ClientStatus: structs.AllocClientStatusRunning, + DesiredTransition: structs.DesiredTransition{helper.BoolToPtr(true)}, + Job: batchJob, + NodeID: "draining", + }, + // Non-terminal alloc with migrate=true should migrate on an unknown node + "migrating2": { + ID: "migrating2", + ClientStatus: structs.AllocClientStatusRunning, + DesiredTransition: structs.DesiredTransition{helper.BoolToPtr(true)}, + Job: batchJob, + NodeID: "nil", + }, + "untainted1": { + ID: "untainted1", + ClientStatus: structs.AllocClientStatusRunning, + Job: batchJob, + NodeID: "normal", + }, + // Terminal allocs are always untainted + "untainted2": { + ID: "untainted2", + ClientStatus: structs.AllocClientStatusComplete, + Job: batchJob, + NodeID: "normal", + }, + // Terminal allocs are always untainted, even on draining nodes + "untainted3": { + ID: "untainted3", + ClientStatus: structs.AllocClientStatusComplete, + Job: batchJob, + NodeID: "draining", + }, + // Terminal allocs are always untainted, even on lost nodes + "untainted4": { + ID: "untainted4", + ClientStatus: structs.AllocClientStatusComplete, + Job: batchJob, + NodeID: "lost", + }, + // Non-terminal allocs on lost nodes are lost + "lost1": { + ID: "lost1", + ClientStatus: structs.AllocClientStatusPending, + Job: batchJob, + NodeID: "lost", + }, + // Non-terminal allocs on lost nodes are lost + "lost2": { + ID: "lost2", + ClientStatus: structs.AllocClientStatusRunning, + Job: batchJob, + NodeID: "lost", + }, + } + + untainted, migrate, lost := allocs.filterByTainted(nodes) + require.Len(untainted, 4) + require.Contains(untainted, "untainted1") + require.Contains(untainted, "untainted2") + require.Contains(untainted, "untainted3") + require.Contains(untainted, 
"untainted4") + require.Len(migrate, 2) + require.Contains(migrate, "migrating1") + require.Contains(migrate, "migrating2") + require.Len(lost, 2) + require.Contains(lost, "lost1") + require.Contains(lost, "lost2") +} diff --git a/scheduler/system_sched.go b/scheduler/system_sched.go index d30608c8b724..4fa2d20f673a 100644 --- a/scheduler/system_sched.go +++ b/scheduler/system_sched.go @@ -62,7 +62,7 @@ func (s *SystemScheduler) Process(eval *structs.Evaluation) error { switch eval.TriggeredBy { case structs.EvalTriggerJobRegister, structs.EvalTriggerNodeUpdate, structs.EvalTriggerJobDeregister, structs.EvalTriggerRollingUpdate, - structs.EvalTriggerDeploymentWatcher: + structs.EvalTriggerDeploymentWatcher, structs.EvalTriggerNodeDrain: default: desc := fmt.Sprintf("scheduler cannot handle '%s' evaluation reason", eval.TriggeredBy) diff --git a/scheduler/system_sched_test.go b/scheduler/system_sched_test.go index 8cd1a0c6474a..3d78b7061366 100644 --- a/scheduler/system_sched_test.go +++ b/scheduler/system_sched_test.go @@ -7,6 +7,7 @@ import ( "time" memdb "github.com/hashicorp/go-memdb" + "github.com/hashicorp/nomad/helper" "github.com/hashicorp/nomad/helper/uuid" "github.com/hashicorp/nomad/nomad/mock" "github.com/hashicorp/nomad/nomad/structs" @@ -971,6 +972,7 @@ func TestSystemSched_NodeDown(t *testing.T) { alloc.JobID = job.ID alloc.NodeID = node.ID alloc.Name = "my-job.web[0]" + alloc.DesiredTransition.Migrate = helper.BoolToPtr(true) noErr(t, h.State.UpsertAllocs(h.NextIndex(), []*structs.Allocation{alloc})) // Create a mock evaluation to deal with drain @@ -1099,6 +1101,7 @@ func TestSystemSched_NodeDrain(t *testing.T) { alloc.JobID = job.ID alloc.NodeID = node.ID alloc.Name = "my-job.web[0]" + alloc.DesiredTransition.Migrate = helper.BoolToPtr(true) noErr(t, h.State.UpsertAllocs(h.NextIndex(), []*structs.Allocation{alloc})) // Create a mock evaluation to deal with drain @@ -1412,6 +1415,7 @@ func TestSystemSched_PlanWithDrainedNode(t *testing.T) { alloc.JobID = job.ID alloc.NodeID = node.ID alloc.Name = "my-job.web[0]" + alloc.DesiredTransition.Migrate = helper.BoolToPtr(true) alloc.TaskGroup = "web" alloc2 := mock.Alloc() diff --git a/scheduler/testing.go b/scheduler/testing.go index a04b99ce860c..47a6caaeb004 100644 --- a/scheduler/testing.go +++ b/scheduler/testing.go @@ -2,12 +2,11 @@ package scheduler import ( "fmt" - "log" - "os" "sync" "time" memdb "github.com/hashicorp/go-memdb" + "github.com/hashicorp/nomad/helper/testlog" "github.com/hashicorp/nomad/nomad/state" "github.com/hashicorp/nomad/nomad/structs" "github.com/mitchellh/go-testing-interface" @@ -40,6 +39,7 @@ func (r *RejectPlan) ReblockEval(*structs.Evaluation) error { // store copy and provides the planner interface. It can be extended for various // testing uses or for invoking the scheduler without side effects. type Harness struct { + t testing.T State *state.StateStore Planner Planner @@ -58,6 +58,7 @@ type Harness struct { func NewHarness(t testing.T) *Harness { state := state.TestStateStore(t) h := &Harness{ + t: t, State: state, nextIndex: 1, } @@ -68,6 +69,7 @@ func NewHarness(t testing.T) *Harness { // purposes. func NewHarnessWithState(t testing.T, state *state.StateStore) *Harness { return &Harness{ + t: t, State: state, nextIndex: 1, } @@ -201,7 +203,7 @@ func (h *Harness) Snapshot() State { // Scheduler is used to return a new scheduler from // a snapshot of current state using the harness for planning. 
func (h *Harness) Scheduler(factory Factory) Scheduler { - logger := log.New(os.Stderr, "", log.LstdFlags) + logger := testlog.Logger(h.t) return factory(logger, h.Snapshot(), h) } diff --git a/scheduler/util.go b/scheduler/util.go index 17b7942accda..c0943e126380 100644 --- a/scheduler/util.go +++ b/scheduler/util.go @@ -104,20 +104,26 @@ func diffAllocs(job *structs.Job, taintedNodes map[string]*structs.Node, goto IGNORE } - if node == nil || node.TerminalStatus() { - result.lost = append(result.lost, allocTuple{ - Name: name, - TaskGroup: tg, - Alloc: exist, - }) + if !exist.TerminalStatus() { + if node == nil || node.TerminalStatus() { + result.lost = append(result.lost, allocTuple{ + Name: name, + TaskGroup: tg, + Alloc: exist, + }) + } else if exist.DesiredTransition.ShouldMigrate() { + result.migrate = append(result.migrate, allocTuple{ + Name: name, + TaskGroup: tg, + Alloc: exist, + }) + } else { + goto IGNORE + } } else { - // This is the drain case - result.migrate = append(result.migrate, allocTuple{ - Name: name, - TaskGroup: tg, - Alloc: exist, - }) + goto IGNORE } + continue } @@ -249,6 +255,9 @@ func readyNodesInDCs(state State, dcs []string) ([]*structs.Node, map[string]int if node.Drain { continue } + if node.SchedulingEligibility != structs.NodeSchedulingEligible { + continue + } if _, ok := dcMap[node.Datacenter]; !ok { continue } diff --git a/scheduler/util_test.go b/scheduler/util_test.go index cb96e83ea283..7fde4fa65718 100644 --- a/scheduler/util_test.go +++ b/scheduler/util_test.go @@ -7,6 +7,7 @@ import ( "reflect" "testing" + "github.com/hashicorp/nomad/helper" "github.com/hashicorp/nomad/helper/uuid" "github.com/hashicorp/nomad/nomad/mock" "github.com/hashicorp/nomad/nomad/state" @@ -90,6 +91,9 @@ func TestDiffAllocs(t *testing.T) { NodeID: "drainNode", Name: "my-job.web[2]", Job: oldJob, + DesiredTransition: structs.DesiredTransition{ + Migrate: helper.BoolToPtr(true), + }, }, // Mark the 4th lost { @@ -219,6 +223,9 @@ func TestDiffSystemAllocs(t *testing.T) { NodeID: drainNode.ID, Name: "my-job.web[0]", Job: oldJob, + DesiredTransition: structs.DesiredTransition{ + Migrate: helper.BoolToPtr(true), + }, }, // Mark as lost on a dead node {