From 95b3b6eb0251070f40a4196b4d535ea691499aad Mon Sep 17 00:00:00 2001 From: Michael Schurter Date: Tue, 23 Jan 2018 16:47:00 -0800 Subject: [PATCH 01/79] drain: initial drainv2 structs and impl --- api/tasks.go | 40 ++ client/alloc_runner_health_watcher.go | 85 ++- jobspec/parse.go | 45 ++ jobspec/parse_test.go | 6 + jobspec/test-fixtures/basic.hcl | 7 + nomad/drain.go | 752 ++++++++++++++++++++++++++ nomad/drain_test.go | 216 ++++++++ nomad/leader.go | 3 + nomad/mock/mock.go | 1 + nomad/node_endpoint.go | 5 + nomad/plan_apply.go | 3 + nomad/state/state_store.go | 12 + nomad/structs/structs.go | 178 +++++- scheduler/generic_sched.go | 5 +- scheduler/util.go | 10 +- testutil/rpcapi/rcpapi.go | 114 ++++ 16 files changed, 1433 insertions(+), 49 deletions(-) create mode 100644 nomad/drain.go create mode 100644 nomad/drain_test.go create mode 100644 testutil/rpcapi/rcpapi.go diff --git a/api/tasks.go b/api/tasks.go index 047afccaf0a3..f7d3d9fb0737 100644 --- a/api/tasks.go +++ b/api/tasks.go @@ -284,6 +284,43 @@ func (e *EphemeralDisk) Canonicalize() { } } +// MigrateStrategy describes how allocations for a task group should be +// migrated between nodes (eg when draining). +type MigrateStrategy struct { + MaxParallel *int `mapstructure:"max_parallel"` + HealthCheck *string `mapstructure:"health_check"` + MinHealthyTime *time.Duration `mapstructure:"min_healthy_time"` + HealthyDeadline *time.Duration `mapstructure:"healthy_deadline"` +} + +func DefaultMigrateStrategy() *MigrateStrategy { + return &MigrateStrategy{ + MaxParallel: helper.IntToPtr(1), + HealthCheck: helper.StringToPtr("checks"), + MinHealthyTime: helper.TimeToPtr(10 * time.Second), + HealthyDeadline: helper.TimeToPtr(5 * time.Minute), + } +} + +func (m *MigrateStrategy) Canonicalize() { + if m == nil { + return + } + defaults := DefaultMigrateStrategy() + if m.MaxParallel == nil { + m.MaxParallel = defaults.MaxParallel + } + if m.HealthCheck == nil { + m.HealthCheck = defaults.HealthCheck + } + if m.MinHealthyTime == nil { + m.MinHealthyTime = defaults.MinHealthyTime + } + if m.HealthyDeadline == nil { + m.HealthyDeadline = defaults.HealthyDeadline + } +} + // TaskGroup is the unit of scheduling. 
type TaskGroup struct { Name *string @@ -294,6 +331,7 @@ type TaskGroup struct { ReschedulePolicy *ReschedulePolicy EphemeralDisk *EphemeralDisk Update *UpdateStrategy + Migrate *MigrateStrategy Meta map[string]string } @@ -377,6 +415,8 @@ func (g *TaskGroup) Canonicalize(job *Job) { } g.ReschedulePolicy = defaultReschedulePolicy + g.Migrate.Canonicalize() + var defaultRestartPolicy *RestartPolicy switch *job.Type { case "service", "system": diff --git a/client/alloc_runner_health_watcher.go b/client/alloc_runner_health_watcher.go index ba94763b555e..db9164740319 100644 --- a/client/alloc_runner_health_watcher.go +++ b/client/alloc_runner_health_watcher.go @@ -31,7 +31,17 @@ func (r *AllocRunner) watchHealth(ctx context.Context) { // See if we should watch the allocs health alloc := r.Alloc() - if alloc.DeploymentID == "" || alloc.DeploymentStatus.IsHealthy() || alloc.DeploymentStatus.IsUnhealthy() { + if alloc.Job.Type == structs.JobTypeSystem || alloc.DeploymentStatus.IsHealthy() || alloc.DeploymentStatus.IsUnhealthy() { + // Neither deployments nor migrations apply to system jobs and + // we don't need to track allocations which already have a + // status + return + } + + isDeploy := alloc.DeploymentID != "" + + if isDeploy && alloc.Job.Type != structs.JobTypeService { + // Deployments don't track non-Service jobs return } @@ -39,7 +49,8 @@ func (r *AllocRunner) watchHealth(ctx context.Context) { if tg == nil { r.logger.Printf("[ERR] client.alloc_watcher: failed to lookup allocation's task group. Exiting watcher") return - } else if tg.Update == nil || tg.Update.HealthCheck == structs.UpdateStrategyHealthCheck_Manual { + } + if isDeploy && (tg.Update == nil || tg.Update.HealthCheck == structs.UpdateStrategyHealthCheck_Manual) { return } @@ -47,14 +58,36 @@ func (r *AllocRunner) watchHealth(ctx context.Context) { l := r.allocBroadcast.Listen() defer l.Close() + // Define the deadline, health method, min healthy time from the + // deployment if this is a deployment; otherwise from the migration + // strategy. + var deadline time.Time + var useChecks bool + var minHealthyTime time.Duration + + if isDeploy { + deadline = time.Now().Add(tg.Update.HealthyDeadline) + minHealthyTime = tg.Update.MinHealthyTime + useChecks = tg.Update.HealthCheck == structs.UpdateStrategyHealthCheck_Checks + } else { + strategy := tg.Migrate + if strategy == nil { + // For backwards compat with pre-0.8 allocations that + // don't have a migrate strategy set. 
+ strategy = structs.DefaultMigrateStrategy() + } + deadline = time.Now().Add(strategy.HealthyDeadline) + minHealthyTime = strategy.MinHealthyTime + useChecks = strategy.HealthCheck == structs.MigrateStrategyHealthChecks + } + // Create a new context with the health deadline - deadline := time.Now().Add(tg.Update.HealthyDeadline) healthCtx, healthCtxCancel := context.WithDeadline(ctx, deadline) defer healthCtxCancel() - r.logger.Printf("[DEBUG] client.alloc_watcher: deadline (%v) for alloc %q is at %v", tg.Update.HealthyDeadline, alloc.ID, deadline) + r.logger.Printf("[DEBUG] client.alloc_watcher: deadline for alloc %q is at %v (deploy=%t checks=%t)", alloc.ID, deadline, isDeploy, useChecks) // Create the health tracker object - tracker := newAllocHealthTracker(healthCtx, r.logger, alloc, l, r.consulClient) + tracker := newAllocHealthTracker(healthCtx, r.logger, alloc, l, r.consulClient, minHealthyTime, useChecks) tracker.Start() allocHealthy := false @@ -77,8 +110,8 @@ func (r *AllocRunner) watchHealth(ctx context.Context) { r.allocHealth = helper.BoolToPtr(allocHealthy) r.allocLock.Unlock() - // We are unhealthy so emit task events explaining why - if !allocHealthy { + // If deployment is unhealthy emit task events explaining why + if !allocHealthy && isDeploy { r.taskLock.RLock() for task, event := range tracker.TaskEvents() { if tr, ok := r.tasks[task]; ok { @@ -107,6 +140,13 @@ type allocHealthTracker struct { // tg is the task group we are tracking tg *structs.TaskGroup + // minHealthyTime is the duration an alloc must remain healthy to be + // considered healthy + minHealthyTime time.Duration + + // useChecks specifies whether to use Consul healh checks or not + useChecks bool + // consulCheckCount is the number of checks the task group will attempt to // register consulCheckCount int @@ -146,7 +186,8 @@ type allocHealthTracker struct { // alloc listener and consul API object are given so that the watcher can detect // health changes. func newAllocHealthTracker(parentCtx context.Context, logger *log.Logger, alloc *structs.Allocation, - allocUpdates *cstructs.AllocListener, consulClient ConsulServiceAPI) *allocHealthTracker { + allocUpdates *cstructs.AllocListener, consulClient ConsulServiceAPI, + minHealthyTime time.Duration, useChecks bool) *allocHealthTracker { a := &allocHealthTracker{ logger: logger, @@ -154,8 +195,11 @@ func newAllocHealthTracker(parentCtx context.Context, logger *log.Logger, alloc allocStopped: make(chan struct{}), alloc: alloc, tg: alloc.Job.LookupTaskGroup(alloc.TaskGroup), - allocUpdates: allocUpdates, - consulClient: consulClient, + //FIXME should i wrap all these parameters up in a struct? + minHealthyTime: minHealthyTime, + useChecks: useChecks, + allocUpdates: allocUpdates, + consulClient: consulClient, } a.taskHealth = make(map[string]*taskHealthState, len(a.tg.Tasks)) @@ -176,7 +220,7 @@ func newAllocHealthTracker(parentCtx context.Context, logger *log.Logger, alloc // Start starts the watcher. func (a *allocHealthTracker) Start() { go a.watchTaskEvents() - if a.tg.Update.HealthCheck == structs.UpdateStrategyHealthCheck_Checks { + if a.useChecks { go a.watchConsulEvents() } } @@ -210,7 +254,9 @@ func (a *allocHealthTracker) TaskEvents() map[string]string { // Go through are task information and build the event map for task, state := range a.taskHealth { - if e, ok := state.event(deadline, a.tg.Update); ok { + //FIXME skip this for migrations? 
+ useChecks := a.tg.Update.HealthCheck == structs.UpdateStrategyHealthCheck_Checks + if e, ok := state.event(deadline, a.tg.Update.MinHealthyTime, useChecks); ok { events[task] = e } } @@ -227,7 +273,7 @@ func (a *allocHealthTracker) setTaskHealth(healthy, terminal bool) { // If we are marked healthy but we also require Consul to be healthy and it // isn't yet, return, unless the task is terminal - requireConsul := a.tg.Update.HealthCheck == structs.UpdateStrategyHealthCheck_Checks && a.consulCheckCount > 0 + requireConsul := a.useChecks && a.consulCheckCount > 0 if !terminal && healthy && requireConsul && !a.checksHealthy { return } @@ -337,7 +383,7 @@ func (a *allocHealthTracker) watchTaskEvents() { // Set the timer since all tasks are started if !latestStartTime.IsZero() { allStartedTime = latestStartTime - healthyTimer.Reset(a.tg.Update.MinHealthyTime) + healthyTimer.Reset(a.minHealthyTime) } } @@ -453,7 +499,7 @@ OUTER: } primed = true - healthyTimer.Reset(a.tg.Update.MinHealthyTime) + healthyTimer.Reset(a.minHealthyTime) } } } @@ -470,7 +516,7 @@ type taskHealthState struct { // event takes the deadline time for the allocation to be healthy and the update // strategy of the group. It returns true if the task has contributed to the // allocation being unhealthy and if so, an event description of why. -func (t *taskHealthState) event(deadline time.Time, update *structs.UpdateStrategy) (string, bool) { +func (t *taskHealthState) event(deadline time.Time, minHealthyTime time.Duration, useChecks bool) (string, bool) { requireChecks := false desiredChecks := 0 for _, s := range t.task.Services { @@ -479,7 +525,7 @@ func (t *taskHealthState) event(deadline time.Time, update *structs.UpdateStrate desiredChecks += nc } } - requireChecks = requireChecks && update.HealthCheck == structs.UpdateStrategyHealthCheck_Checks + requireChecks = requireChecks && useChecks if t.state != nil { if t.state.Failed { @@ -490,8 +536,9 @@ func (t *taskHealthState) event(deadline time.Time, update *structs.UpdateStrate } // We are running so check if we have been running long enough - if t.state.StartedAt.Add(update.MinHealthyTime).After(deadline) { - return fmt.Sprintf("Task not running for min_healthy_time of %v by deadline", update.MinHealthyTime), true + //FIXME need minHealthyTime here + if t.state.StartedAt.Add(minHealthyTime).After(deadline) { + return fmt.Sprintf("Task not running for min_healthy_time of %v by deadline", minHealthyTime), true } } diff --git a/jobspec/parse.go b/jobspec/parse.go index d6f235e05f26..4bfebc9099aa 100644 --- a/jobspec/parse.go +++ b/jobspec/parse.go @@ -285,6 +285,7 @@ func parseGroups(result *api.Job, list *ast.ObjectList) error { "update", "reschedule", "vault", + "migrate", } if err := helper.CheckHCLKeys(listVal, valid); err != nil { return multierror.Prefix(err, fmt.Sprintf("'%s' ->", n)) @@ -301,6 +302,7 @@ func parseGroups(result *api.Job, list *ast.ObjectList) error { delete(m, "ephemeral_disk") delete(m, "update") delete(m, "vault") + delete(m, "migrate") // Build the group with the basic decode var g api.TaskGroup @@ -344,6 +346,13 @@ func parseGroups(result *api.Job, list *ast.ObjectList) error { } } + // If we have a migration strategy, then parse that + if o := listVal.Filter("migrate"); len(o.Items) > 0 { + if err := parseMigrate(&g.Migrate, o); err != nil { + return multierror.Prefix(err, "migrate ->") + } + } + // Parse out meta fields. These are in HCL as a list so we need // to iterate over them and merge them. 
if metaO := listVal.Filter("meta"); len(metaO.Items) > 0 { @@ -1320,6 +1329,42 @@ func parseUpdate(result **api.UpdateStrategy, list *ast.ObjectList) error { return dec.Decode(m) } +func parseMigrate(result **api.MigrateStrategy, list *ast.ObjectList) error { + list = list.Elem() + if len(list.Items) > 1 { + return fmt.Errorf("only one 'migrate' block allowed") + } + + // Get our resource object + o := list.Items[0] + + var m map[string]interface{} + if err := hcl.DecodeObject(&m, o.Val); err != nil { + return err + } + + // Check for invalid keys + valid := []string{ + "max_parallel", + "health_check", + "min_healthy_time", + "healthy_deadline", + } + if err := helper.CheckHCLKeys(o.Val, valid); err != nil { + return err + } + + dec, err := mapstructure.NewDecoder(&mapstructure.DecoderConfig{ + DecodeHook: mapstructure.StringToTimeDurationHookFunc(), + WeaklyTypedInput: true, + Result: result, + }) + if err != nil { + return err + } + return dec.Decode(m) +} + func parsePeriodic(result **api.PeriodicConfig, list *ast.ObjectList) error { list = list.Elem() if len(list.Items) > 1 { diff --git a/jobspec/parse_test.go b/jobspec/parse_test.go index dbf1200570eb..c3989a68ca94 100644 --- a/jobspec/parse_test.go +++ b/jobspec/parse_test.go @@ -110,6 +110,12 @@ func TestParse(t *testing.T) { AutoRevert: helper.BoolToPtr(false), Canary: helper.IntToPtr(2), }, + Migrate: &api.MigrateStrategy{ + MaxParallel: helper.IntToPtr(2), + HealthCheck: helper.StringToPtr("task_states"), + MinHealthyTime: helper.TimeToPtr(11 * time.Second), + HealthyDeadline: helper.TimeToPtr(11 * time.Minute), + }, Tasks: []*api.Task{ { Name: "binstore", diff --git a/jobspec/test-fixtures/basic.hcl b/jobspec/test-fixtures/basic.hcl index 9942e3dfc34c..2b3f973aa9c4 100644 --- a/jobspec/test-fixtures/basic.hcl +++ b/jobspec/test-fixtures/basic.hcl @@ -67,6 +67,13 @@ job "binstore-storagelocker" { canary = 2 } + migrate { + max_parallel = 2 + health_check = "task_states" + min_healthy_time = "11s" + healthy_deadline = "11m" + } + task "binstore" { driver = "docker" user = "bob" diff --git a/nomad/drain.go b/nomad/drain.go new file mode 100644 index 000000000000..a1dc99972029 --- /dev/null +++ b/nomad/drain.go @@ -0,0 +1,752 @@ +package nomad + +import ( + "context" + "log" + "strings" + "sync" + "time" + + memdb "github.com/hashicorp/go-memdb" + "github.com/hashicorp/nomad/helper/uuid" + "github.com/hashicorp/nomad/nomad/state" + "github.com/hashicorp/nomad/nomad/structs" +) + +// drainingJob contains the Job and allocations for that job meant to be used +// when collecting all allocations for a job with at least one allocation on a +// draining node. +// +// This allows the MaxParallel calculation to take the entire job's allocation +// state into account. FIXME is that even useful? +type drainingJob struct { + job *structs.Job + allocs []*structs.Allocation +} + +// drainingAlloc contains a conservative deadline an alloc has to be healthy by +// before it should stopped being watched and replaced. 
+type drainingAlloc struct { + // LastModified+MigrateStrategy.HealthyDeadline + deadline time.Time + + // Task Group key + tgKey string +} + +func newDrainingAlloc(a *structs.Allocation, deadline time.Time) drainingAlloc { + return drainingAlloc{ + deadline: deadline, + tgKey: makeTaskGroupKey(a), + } +} + +// makeTaskGroupKey returns a unique key for an allocation's task group +func makeTaskGroupKey(a *structs.Allocation) string { + return strings.Join([]string{a.Namespace, a.JobID, a.TaskGroup}, "-") +} + +// stopAllocs tracks allocs to drain by a unique TG key +type stopAllocs struct { + perTaskGroup map[string]int + allocBatch []*structs.Allocation + + // namespace+jobid -> Job + jobBatch map[string]*structs.Job +} + +//FIXME this method does an awful lot +func (s *stopAllocs) add(j *structs.Job, a *structs.Allocation) { + // Increment the counter for how many allocs in this task group are being stopped + tgKey := makeTaskGroupKey(a) + s.perTaskGroup[tgKey]++ + + // Update the allocation + a.ModifyTime = time.Now().UnixNano() + a.DesiredStatus = structs.AllocDesiredStatusStop + + // Add alloc to the allocation batch + s.allocBatch = append(s.allocBatch, a) + + // Add job to the job batch + jobKey := strings.Join([]string{j.Namespace, j.ID}, "-") + s.jobBatch[jobKey] = j +} + +// startNodeDrainer should be called in establishLeadership by the leader. +func (s *Server) startNodeDrainer(stopCh chan struct{}) { + state := s.fsm.State() + + ctx, cancel := context.WithCancel(context.Background()) + defer cancel() + go func() { + select { + case <-stopCh: + cancel() + case <-ctx.Done(): + } + }() + + nodes, nodesIndex, drainingAllocs, allocsIndex := initDrainer(s.logger, state) + + // Wait for a node's drain deadline to expire + var nextDeadline time.Time + for _, node := range nodes { + if nextDeadline.IsZero() { + nextDeadline = node.DrainStrategy.DeadlineTime() + continue + } + if deadline := node.DrainStrategy.DeadlineTime(); deadline.Before(nextDeadline) { + nextDeadline = deadline + } + + } + deadlineTimer := time.NewTimer(time.Until(nextDeadline)) + + // Watch for nodes to start or stop draining + nodeWatcher := newNodeWatcher(s.logger, nodes, nodesIndex, state) + go nodeWatcher.run(ctx) + + // Watch for drained allocations to be replaced + prevAllocs := newPrevAllocWatcher(s.logger, drainingAllocs, allocsIndex, state) + go prevAllocs.run(ctx) + + for { + //TODO this method of async node updates means we could make + //migration decisions on out of date information. 
the worst + //possible outcome of this is that an allocation could be + //stopped on a node that recently had its drain cancelled which + //doesn't seem like that bad of a pathological case + select { + case nodes = <-nodeWatcher.nodesCh: + // update draining nodes + //TODO remove allocs from draining list with node ids not in this map + s.logger.Printf("[TRACE] nomad.drain: running due to node change (%d nodes draining)", len(nodes)) + case drainedID := <-prevAllocs.allocsCh: + // drained alloc has been replaced + //TODO instead of modifying a view of draining allocs here created a shared map like prevallocs + delete(drainingAllocs, drainedID) + s.logger.Printf("[TRACE] nomad.drain: running due to alloc change (%s replaced)", drainedID) + case when := <-deadlineTimer.C: + // deadline for a node was reached + s.logger.Printf("[TRACE] nomad.drain: running due to deadline reached (at %s)", when) + case <-ctx.Done(): + // exit + return + } + + // Tracks nodes that are done draining + doneNodes := map[string]*structs.Node{} + + //TODO work from a state snapshot? perhaps from a last update + //index? I can't think of why this would be beneficial as this + //entire process runs asynchronously with the fsm/scheduler/etc + snapshot, err := state.Snapshot() + if err != nil { + //FIXME + panic(err) + } + now := time.Now() // for determing deadlines in a consistent way + + // namespace -> job id -> {job, allocs} + // Collect all allocs for all jobs with at least one + // alloc on a draining node. + // Invariants: + // - No system jobs + // - No batch jobs unless their node's deadline is reached + // - No entries with 0 allocs + //TODO could this be a helper method on prevAllocWatcher + drainable := map[string]map[string]*drainingJob{} + + // Collect all drainable jobs + for nodeID, node := range nodes { + allocs, err := snapshot.AllocsByNode(nil, nodeID) + if err != nil { + //FIXME + panic(err) + } + + // track number of allocs left on this node to be drained + allocsLeft := false + for _, alloc := range allocs { + if _, ok := drainable[alloc.Namespace]; !ok { + // namespace does not exist + drainable[alloc.Namespace] = make(map[string]*drainingJob) + } + + if _, ok := drainable[alloc.Namespace][alloc.JobID]; ok { + // already found + continue + } + + // job does not found yet + job, err := snapshot.JobByID(nil, alloc.Namespace, alloc.JobID) + if err != nil { + //FIXME + panic(err) + } + //TODO check for job == nil? + + // Don't bother collecting system jobs + if job.Type == structs.JobTypeSystem { + continue + } + + // If a drainable alloc isn't yet stopping this + // node has allocs left to be drained + if !alloc.TerminalStatus() { + allocsLeft = true + } + + // Don't bother collecting batch jobs for nodes that haven't hit their deadline + if job.Type == structs.JobTypeBatch && node.DrainStrategy.DeadlineTime().After(now) { + continue + } + + jobAllocs, err := snapshot.AllocsByJob(nil, alloc.Namespace, alloc.JobID, true) + if err != nil { + //FIXME + panic(err) + } + + drainable[alloc.Namespace][alloc.JobID] = &drainingJob{ + job: job, + allocs: jobAllocs, + } + } + + // if node has no allocs, it's done draining! 
+ if !allocsLeft { + delete(nodes, nodeID) + doneNodes[nodeID] = node + } + } + + // Initialize stoplist with a count of allocs already draining per task group + //TODO wrap this up in a new func + stoplist := &stopAllocs{ + perTaskGroup: make(map[string]int, len(drainingAllocs)), + allocBatch: make([]*structs.Allocation, len(drainingAllocs)), + jobBatch: make(map[string]*structs.Job), + } + // initialize perTaskGroup to be the number of total *currently draining* allocations per task group + for _, a := range drainingAllocs { + stoplist.perTaskGroup[a.tgKey]++ + } + + // deadlineNodes is a map of node IDs that have reached their + // deadline and allocs that will be stopped due to deadline + deadlineNodes := map[string]int{} + + //TODO build drain list considering deadline & max_parallel + for _, drainingJobs := range drainable { + for _, drainingJob := range drainingJobs { + for _, alloc := range drainingJob.allocs { + // Already draining/dead allocs don't need to be drained + if alloc.TerminalStatus() { + continue + } + + node, ok := nodes[alloc.NodeID] + if !ok { + // Alloc's node is not draining so not elligible for draining! + continue + } + + if node.DrainStrategy.DeadlineTime().Before(now) { + s.logger.Printf("[TRACE] nomad.drain: draining job %s alloc %s from node %s due to node's drain deadline", drainingJob.job.Name, alloc.ID[:6], alloc.NodeID[:6]) + // Alloc's Node has reached its deadline + stoplist.add(drainingJob.job, alloc) + + deadlineNodes[node.ID]++ + + //FIXME purge from watchlist? + continue + } + + // Batch jobs are only stopped when the node + // deadline is reached which has already been + // done. + if drainingJob.job.Type == structs.JobTypeBatch { + continue + } + + // Stop allocs with count=1, max_parallel==0, or draining how many allocs are + // already draining for this task + // group, drain and track this alloc + tgKey := makeTaskGroupKey(alloc) + + //FIXME change this to be based off of the sum(deploymentstatus!=nil && clientstatus==running) for this task group + if tg.Migrate.MaxParallel > stoplist.perTaskGroup[tgKey] { + s.logger.Printf("[TRACE] nomad.drain: draining job %s alloc %s from node %s due to max parallel", drainingJob.job.Name, alloc.ID[:6], alloc.NodeID[:6]) + // More migrations are allowed, add to stoplist + stoplist.add(drainingJob.job, alloc) + + // Also add to prevAllocWatcher + prevAllocs.watch(alloc.ID) + } + } + } + } + + // log drains due to node deadlines + for nodeID, remaining := range deadlineNodes { + s.logger.Printf("[DEBUG] nomad.drain: node %s drain deadline reached; stopping %d remaining allocs", nodeID, remaining) + } + + if len(stoplist.allocBatch) > 0 { + s.logger.Printf("[DEBUG] nomad.drain: stopping %d alloc(s) for %d job(s)", len(stoplist.allocBatch), len(stoplist.jobBatch)) + + // Stop allocs in stoplist and add them to drainingAllocs + prevAllocWatcher + batch := &structs.AllocUpdateRequest{ + Alloc: stoplist.allocBatch, + WriteRequest: structs.WriteRequest{Region: s.config.Region}, + } + + // Commit this update via Raft + //TODO Not the right request + _, index, err := s.raftApply(structs.AllocClientUpdateRequestType, batch) + if err != nil { + //FIXME + panic(err) + } + + //TODO i bet there's something useful to do with this index + _ = index + + // Reevaluate affected jobs + evals := make([]*structs.Evaluation, 0, len(stoplist.jobBatch)) + for _, job := range stoplist.jobBatch { + evals = append(evals, &structs.Evaluation{ + ID: uuid.Generate(), + Namespace: job.Namespace, + Priority: job.Priority, + Type: job.Type, 
+ TriggeredBy: structs.EvalTriggerNodeDrain, + JobID: job.ID, + JobModifyIndex: job.ModifyIndex, + Status: structs.EvalStatusPending, + }) + } + + evalUpdate := &structs.EvalUpdateRequest{ + Evals: evals, + WriteRequest: structs.WriteRequest{Region: s.config.Region}, + } + + // Commit this evaluation via Raft + _, _, err = s.raftApply(structs.EvalUpdateRequestType, evalUpdate) + if err != nil { + //FIXME + panic(err) + } + } + + // Unset drain for nodes done draining + for nodeID, node := range doneNodes { + args := structs.NodeUpdateDrainRequest{ + NodeID: nodeID, + Drain: false, + WriteRequest: structs.WriteRequest{Region: s.config.Region}, + } + + _, _, err := s.raftApply(structs.NodeUpdateDrainRequestType, &args) + if err != nil { + s.logger.Printf("[ERR] nomad.drain: failed to unset drain for: %v", err) + //FIXME + panic(err) + } + s.logger.Printf("[INFO] nomad.drain: node %s (%s) completed draining", nodeID, node.Name) + } + } +} + +// nodeWatcher watches for nodes to start or stop draining +type nodeWatcher struct { + index uint64 + nodes map[string]*structs.Node + nodesCh chan map[string]*structs.Node + state *state.StateStore + logger *log.Logger +} + +func newNodeWatcher(logger *log.Logger, nodes map[string]*structs.Node, index uint64, state *state.StateStore) *nodeWatcher { + return &nodeWatcher{ + nodes: nodes, + nodesCh: make(chan map[string]*structs.Node), + index: index, + state: state, + logger: logger, + } +} + +func (n *nodeWatcher) run(ctx context.Context) { + // Trigger an initial drain pass if there are already nodes draining + //FIXME this is unneccessary if a node has reached a deadline + n.logger.Printf("[TRACE] nomad.drain: initial draining nodes: %d", len(n.nodes)) + if len(n.nodes) > 0 { + n.nodesCh <- n.nodes + } + + for { + //FIXME it seems possible for this to return a nil error and a 0 index, what to do in that case? + resp, index, err := n.state.BlockingQuery(n.queryNodeDrain, n.index, ctx) + if err != nil { + if err == context.Canceled { + n.logger.Printf("[TRACE] nomad.drain: draining node watcher shutting down") + return + } + n.logger.Printf("[ERR] nomad.drain: error blocking on node updates at index %d: %v", n.index, err) + return + } + + // update index for next run + n.index = index + + changed := false + newNodes := resp.([]*structs.Node) + n.logger.Printf("[TRACE] nomad.drain: %d nodes to consider", len(newNodes)) //FIXME remove + for _, newNode := range newNodes { + if _, ok := n.nodes[newNode.ID]; ok { + // Node was draining + if !newNode.Drain { + // Node stopped draining + delete(n.nodes, newNode.ID) + changed = true + } else { + // Update deadline + n.nodes[newNode.ID] = newNode + //FIXME set changed if it changed? + //changed = true + } + } else { + // Node was not draining + if newNode.Drain { + // Node started draining + n.nodes[newNode.ID] = newNode + changed = true + } + } + } + + // Send a copy of the draining nodes if there were changes + if !changed { + continue + } + + nodesCopy := make(map[string]*structs.Node, len(n.nodes)) + for k, v := range n.nodes { + nodesCopy[k] = v + } + + select { + case n.nodesCh <- nodesCopy: + case <-ctx.Done(): + return + } + } +} + +func (n *nodeWatcher) queryNodeDrain(ws memdb.WatchSet, state *state.StateStore) (interface{}, uint64, error) { + iter, err := state.Nodes(ws) + if err != nil { + return nil, 0, err + } + + index, err := state.Index("nodes") + if err != nil { + return nil, 0, err + } + + //FIXME initial cap? 
+ resp := make([]*structs.Node, 0, 1) + + for { + raw := iter.Next() + if raw == nil { + break + } + + node := raw.(*structs.Node) + resp = append(resp, node) + } + + return resp, index, nil +} + +// prevAllocWatcher monitors allocation updates for allocations which replace +// draining allocations. +type prevAllocWatcher struct { + // watchList is a map of alloc ids to look for in PreviousAllocation + // fields of new allocs + watchList map[string]struct{} + watchListMu sync.Mutex + + state *state.StateStore + + // allocIndex to start watching from + allocIndex uint64 + + // allocsCh is sent Allocation.IDs as they're removed from the watchList + allocsCh chan string + + logger *log.Logger +} + +// newPrevAllocWatcher creates a new prevAllocWatcher watching drainingAllocs +// from allocIndex in the state store. Must call run to start watching. +func newPrevAllocWatcher(logger *log.Logger, drainingAllocs map[string]drainingAlloc, allocIndex uint64, + state *state.StateStore) *prevAllocWatcher { + + watchList := make(map[string]struct{}, len(drainingAllocs)) + for allocID := range drainingAllocs { + watchList[allocID] = struct{}{} + } + + return &prevAllocWatcher{ + watchList: watchList, + state: state, + allocIndex: allocIndex, + allocsCh: make(chan string, 8), //FIXME 8? really? what should this be + logger: logger, + } +} + +// watch for an allocation ID to be replaced. +func (p *prevAllocWatcher) watch(allocID string) { + p.watchListMu.Lock() + defer p.watchListMu.Unlock() + p.watchList[allocID] = struct{}{} +} + +// run the prevAllocWatcher and send replaced draining alloc IDs on allocsCh. +func (p *prevAllocWatcher) run(ctx context.Context) { + // index to watch from + var resp interface{} + var err error + + for { + //FIXME it seems possible for this to return a nil error and a 0 index, what to do in that case? + resp, p.allocIndex, err = p.state.BlockingQuery(p.queryPrevAlloc, p.allocIndex, ctx) + if err != nil { + if err == context.Canceled { + p.logger.Printf("[TRACE] nomad.drain: previous allocation watcher shutting down") + return + } + p.logger.Printf("[ERR] nomad.drain: error blocking on alloc updates: %v", err) + return + } + + allocIDs := resp.([]string) + for _, id := range allocIDs { + select { + case p.allocsCh <- id: + case <-ctx.Done(): + return + } + } + } +} + +// queryPrevAlloc is the BlockingQuery func for scanning for replacement allocs +func (p *prevAllocWatcher) queryPrevAlloc(ws memdb.WatchSet, state *state.StateStore) (interface{}, uint64, error) { + iter, err := state.Allocs(ws) + if err != nil { + return nil, 0, err + } + + index, err := state.Index("allocs") + if err != nil { + return nil, 0, err + } + + //FIXME do fine grained locking around watclist mutations? 
+ p.watchListMu.Lock() + defer p.watchListMu.Unlock() + + resp := make([]string, 0, len(p.watchList)) + + for { + raw := iter.Next() + if raw == nil { + break + } + + alloc := raw.(*structs.Allocation) + _, ok := p.watchList[alloc.PreviousAllocation] + if !ok { + // PreviousAllocation not in watchList, skip it + continue + } + + // If the migration health is set on the replacement alloc we can stop watching the drained alloc + if alloc.DeploymentStatus.IsHealthy() || alloc.DeploymentStatus.IsUnhealthy() { + delete(p.watchList, alloc.PreviousAllocation) + resp = append(resp, alloc.PreviousAllocation) + } + } + + return resp, index, nil +} + +// initDrainer initializes the node drainer state and returns a list of +// draining nodes as well as allocs that are draining that should be watched +// for a replacement. +func initDrainer(logger *log.Logger, state *state.StateStore) (map[string]*structs.Node, uint64, map[string]drainingAlloc, uint64) { + // StateStore.Snapshot never returns an error so don't bother checking it + snapshot, _ := state.Snapshot() + now := time.Now() + + iter, err := snapshot.Nodes(nil) + if err != nil { + logger.Printf("[ERR] nomad.drain: error iterating nodes: %v", err) + panic(err) //FIXME + } + + // map of draining nodes keyed by node ID + nodes := map[string]*structs.Node{} + + //FIXME rollup by composite namespace+job.ID+tg key? + // List of draining allocs by namespace and job: namespace -> job.ID -> alloc.ID -> *Allocation + allocsByNS := map[string]map[string]map[string]*structs.Allocation{} + + for { + raw := iter.Next() + if raw == nil { + break + } + + // Filter on datacenter and status + node := raw.(*structs.Node) + if !node.Drain { + continue + } + + // Track draining node + nodes[node.ID] = node + + // No point in tracking draining allocs as the deadline has been reached + if node.DrainStrategy.DeadlineTime().Before(now) { + continue + } + + allocs, err := snapshot.AllocsByNode(nil, node.ID) + if err != nil { + logger.Printf("[ERR] nomad.drain: error iterating allocs for node %q: %v", node.ID, err) + panic(err) //FIXME + } + + for _, alloc := range allocs { + //FIXME is it safe to assume the drainer set the desired status to stop? 
+ if alloc.DesiredStatus == structs.AllocDesiredStatusStop { + if allocsByJob, ok := allocsByNS[alloc.Namespace]; ok { + if allocs, ok := allocsByJob[alloc.JobID]; ok { + allocs[alloc.ID] = alloc + } else { + // First alloc for job + allocsByJob[alloc.JobID] = map[string]*structs.Allocation{alloc.ID: alloc} + } + } else { + // First alloc in namespace + allocsByNS[alloc.Namespace] = map[string]map[string]*structs.Allocation{ + alloc.JobID: map[string]*structs.Allocation{alloc.ID: alloc}, + } + } + } + } + } + + // drainingAllocs is the list of all allocations that are currently + // draining and waiting for a replacement + drainingAllocs := map[string]drainingAlloc{} + + for ns, allocsByJobs := range allocsByNS { + for jobID, allocs := range allocsByJobs { + for allocID, alloc := range allocs { + job, err := snapshot.JobByID(nil, ns, jobID) + if err != nil { + logger.Printf("[ERR] nomad.drain: error getting job %q for alloc %q: %v", alloc.JobID, allocID, err) + //FIXME + panic(err) + } + + // Don't track drains for stopped or gc'd jobs + if job == nil || job.Status == structs.JobStatusDead { + continue + } + + jobAllocs, err := snapshot.AllocsByJob(nil, ns, jobID, true) + if err != nil { + //FIXME + panic(err) + } + + // Remove drained allocs for replacement allocs + for _, alloc := range jobAllocs { + if alloc.DeploymentStatus.IsHealthy() || alloc.DeploymentStatus.IsUnhealthy() { + delete(allocs, alloc.PreviousAllocation) + } + } + + //FIXME why are we doing a nested loop over allocs? + // Any remaining allocs need to be tracked + for allocID, alloc := range allocs { + tg := job.LookupTaskGroup(alloc.TaskGroup) + if tg == nil { + logger.Printf("[DEBUG] nomad.drain: unable to find task group %q for alloc %q", alloc.TaskGroup, allocID) + continue + } + + if tg.Migrate == nil { + // No migrate strategy so don't track + continue + } + + //FIXME Remove this? ModifyTime is not updated as expected + + // alloc.ModifyTime + HealthyDeadline is >= the + // healthy deadline for the allocation, so we + // can stop tracking it at that time. + deadline := time.Unix(0, alloc.ModifyTime).Add(tg.Migrate.HealthyDeadline) + + if deadline.After(now) { + // deadline already reached; don't bother tracking + continue + } + + // Draining allocation hasn't been replaced or + // reached its deadline; track it! + drainingAllocs[allocID] = newDrainingAlloc(alloc, deadline) + } + } + } + } + + nodesIndex, _ := snapshot.Index("nodes") + if nodesIndex == 0 { + nodesIndex = 1 + } + allocsIndex, _ := snapshot.Index("allocs") + if allocsIndex == 0 { + allocsIndex = 1 + } + return nodes, nodesIndex, drainingAllocs, allocsIndex +} diff --git a/nomad/drain_test.go b/nomad/drain_test.go new file mode 100644 index 000000000000..bf1ec875de3a --- /dev/null +++ b/nomad/drain_test.go @@ -0,0 +1,216 @@ +package nomad + +import ( + "fmt" + "sort" + "strings" + "testing" + "time" + + msgpackrpc "github.com/hashicorp/net-rpc-msgpackrpc" + "github.com/hashicorp/nomad/client" + "github.com/hashicorp/nomad/client/config" + "github.com/hashicorp/nomad/helper/testlog" + "github.com/hashicorp/nomad/nomad/mock" + "github.com/hashicorp/nomad/nomad/structs" + "github.com/hashicorp/nomad/testutil" + "github.com/hashicorp/nomad/testutil/rpcapi" + "github.com/stretchr/testify/require" +) + +// TestNodeDrainer_SimpleDrain asserts that draining when there are two nodes +// moves allocs from the draining node to the other node. 
+func TestNodeDrainer_SimpleDrain(t *testing.T) { + require := require.New(t) + server := TestServer(t, nil) + defer server.Shutdown() + + testutil.WaitForLeader(t, server.RPC) + + // Setup 2 Nodes: A & B; A has allocs and is draining + + // Create mock jobs + state := server.fsm.State() + + serviceJob := mock.Job() + serviceJob.Name = "service-job" + serviceJob.Type = structs.JobTypeService + serviceJob.TaskGroups[0].Migrate = &structs.MigrateStrategy{ + MaxParallel: 1, + HealthCheck: structs.MigrateStrategyHealthStates, + MinHealthyTime: time.Millisecond, + HealthyDeadline: 2 * time.Second, + } + serviceJob.TaskGroups[0].Tasks[0].Driver = "mock_driver" + serviceJob.TaskGroups[0].Tasks[0].Resources = structs.MinResources() + serviceJob.TaskGroups[0].Tasks[0].Config = map[string]interface{}{ + "run_for": "10m", + "kill_after": "1ms", + } + serviceJob.TaskGroups[0].Tasks[0].Services = nil + + systemJob := mock.SystemJob() + systemJob.Name = "system-job" + systemJob.Type = structs.JobTypeSystem + //FIXME hack until system job reschedule policy validation is fixed + systemJob.TaskGroups[0].ReschedulePolicy = &structs.ReschedulePolicy{Attempts: 1, Interval: time.Minute} + systemJob.TaskGroups[0].Tasks[0].Driver = "mock_driver" + systemJob.TaskGroups[0].Tasks[0].Config = map[string]interface{}{ + "run_for": "10m", + "kill_after": "1ms", + } + systemJob.TaskGroups[0].Tasks[0].Resources = structs.MinResources() + systemJob.TaskGroups[0].Tasks[0].Services = nil + + batchJob := mock.Job() + batchJob.Name = "batch-job" + batchJob.Type = structs.JobTypeBatch + batchJob.TaskGroups[0].Name = "batch-group" + batchJob.TaskGroups[0].Migrate = nil + batchJob.TaskGroups[0].Tasks[0].Name = "batch-task" + batchJob.TaskGroups[0].Tasks[0].Driver = "mock_driver" + batchJob.TaskGroups[0].Tasks[0].Config = map[string]interface{}{ + "run_for": "10m", + "kill_after": "1ms", + "exit_code": 13, // set nonzero exit code to cause rescheduling + } + batchJob.TaskGroups[0].Tasks[0].Resources = structs.MinResources() + batchJob.TaskGroups[0].Tasks[0].Services = nil + + // Start node 1 + c1 := client.TestClient(t, func(conf *config.Config) { + conf.LogOutput = testlog.NewWriter(t) + conf.Servers = []string{server.config.RPCAddr.String()} + }) + defer c1.Shutdown() + + // Start jobs so they all get placed on node 1 + codec := rpcClient(t, server) + for _, job := range []*structs.Job{systemJob, serviceJob, batchJob} { + req := &structs.JobRegisterRequest{ + Job: job.Copy(), + WriteRequest: structs.WriteRequest{ + Region: "global", + Namespace: job.Namespace, + }, + } + + // Fetch the response + var resp structs.JobRegisterResponse + require.Nil(msgpackrpc.CallWithCodec(codec, "Job.Register", req, &resp)) + require.NotZero(resp.Index) + } + + // Wait for jobs to start on c1 + rpc := rpcapi.NewRPC(codec) + testutil.WaitForResult(func() (bool, error) { + resp, err := rpc.NodeGetAllocs(c1.NodeID()) + if err != nil { + return false, err + } + + system, batch, service := 0, 0, 0 + for _, alloc := range resp.Allocs { + if alloc.ClientStatus != structs.AllocClientStatusRunning { + return false, fmt.Errorf("alloc %s for job %s not running: %s", alloc.ID, alloc.Job.Name, alloc.ClientStatus) + } + switch alloc.JobID { + case batchJob.ID: + batch++ + case serviceJob.ID: + service++ + case systemJob.ID: + system++ + } + } + // 1 system + 10 batch + 10 service = 21 + if system+batch+service != 21 { + return false, fmt.Errorf("wrong number of allocs: system %d/1, batch %d/10, service %d/10", system, batch, service) + } + return true, nil + 
}, func(err error) { + if resp, err := rpc.NodeGetAllocs(c1.NodeID()); err == nil { + for i, alloc := range resp.Allocs { + t.Logf("%d alloc %s job %s status %s", i, alloc.ID, alloc.Job.Name, alloc.ClientStatus) + } + } + t.Fatalf("failed waiting for all allocs to start: %v", err) + }) + + // Start draining node 1 + //FIXME update drain rpc to skip fsm manipulation and use api + node, err := state.NodeByID(nil, c1.NodeID()) + require.Nil(err) + require.Nil(state.UpdateNodeDrain(node.ModifyIndex+1, node.ID, true)) + + // Start node 2 + c2 := client.TestClient(t, func(conf *config.Config) { + conf.NetworkSpeed = 10000 + conf.Servers = []string{server.config.RPCAddr.String()} + }) + defer c2.Shutdown() + + // Wait for services to be migrated + testutil.WaitForResult(func() (bool, error) { + resp, err := rpc.NodeGetAllocs(c2.NodeID()) + if err != nil { + return false, err + } + + system, batch, service := 0, 0, 0 + for _, alloc := range resp.Allocs { + if alloc.ClientStatus != structs.AllocClientStatusRunning { + return false, fmt.Errorf("alloc %s for job %s not running: %s", alloc.ID, alloc.Job.Name, alloc.ClientStatus) + } + switch alloc.JobID { + case batchJob.ID: + batch++ + case serviceJob.ID: + service++ + case systemJob.ID: + system++ + } + } + // 1 system + 10 batch + 10 service = 21 + if system+batch+service != 21 { + return false, fmt.Errorf("wrong number of allocs: system %d/1, batch %d/10, service %d/10", system, batch, service) + } + return true, nil + }, func(err error) { + if resp, err := rpc.NodeGetAllocs(c2.NodeID()); err == nil { + for i, alloc := range resp.Allocs { + t.Logf("%d alloc %s job %s status %s prev %s", i, alloc.ID, alloc.Job.Name, alloc.ClientStatus, alloc.PreviousAllocation) + } + } + t.Fatalf("failed waiting for all allocs to start: %v", err) + }) + + // Wait for all service allocs to be replaced + jobs, err := rpc.JobList() + require.Nil(err) + t.Logf("%d jobs", len(jobs.Jobs)) + for _, job := range jobs.Jobs { + t.Logf("job: %s status: %s %s", job.Name, job.Status, job.StatusDescription) + } + + allocs, err := rpc.AllocAll() + require.Nil(err) + + sort.Slice(allocs, func(i, j int) bool { + r := strings.Compare(allocs[i].Job.Name, allocs[j].Job.Name) + switch { + case r < 0: + return true + case r == 0: + return allocs[i].ModifyIndex < allocs[j].ModifyIndex + case r > 0: + return false + } + panic("unreachable") + }) + + t.Logf("%d allocs", len(allocs)) + for _, alloc := range allocs { + t.Logf("job: %s node: %s alloc: %s desired: %s actual: %s replaces: %s", alloc.Job.Name, alloc.NodeID[:6], alloc.ID, alloc.DesiredStatus, alloc.ClientStatus, alloc.PreviousAllocation) + } +} diff --git a/nomad/leader.go b/nomad/leader.go index 51aa737b3099..b81b65d23232 100644 --- a/nomad/leader.go +++ b/nomad/leader.go @@ -267,6 +267,9 @@ func (s *Server) establishLeadership(stopCh chan struct{}) error { go s.replicateACLTokens(stopCh) } + // Start Node Drainer + go s.startNodeDrainer(stopCh) + // Setup any enterprise systems required. 
if err := s.establishEnterpriseLeadership(stopCh); err != nil { return err diff --git a/nomad/mock/mock.go b/nomad/mock/mock.go index 3a8588b9cbad..1d96e556b39b 100644 --- a/nomad/mock/mock.go +++ b/nomad/mock/mock.go @@ -97,6 +97,7 @@ func Job() *structs.Job { Delay: 5 * time.Second, DelayFunction: "linear", }, + Migrate: structs.DefaultMigrateStrategy(), Tasks: []*structs.Task{ { Name: "web", diff --git a/nomad/node_endpoint.go b/nomad/node_endpoint.go index 12fffbce2a5e..3ef43ccf6903 100644 --- a/nomad/node_endpoint.go +++ b/nomad/node_endpoint.go @@ -87,6 +87,11 @@ func (n *Node) Register(args *structs.NodeRegisterRequest, reply *structs.NodeUp return fmt.Errorf("invalid status for node") } + // Default to eligible for scheduling if unset + if args.Node.SchedulingEligibility == "" { + args.Node.SchedulingEligibility = structs.NodeSchedulingEligible + } + // Set the timestamp when the node is registered args.Node.StatusUpdatedAt = time.Now().Unix() diff --git a/nomad/plan_apply.go b/nomad/plan_apply.go index 8e988232318d..089af0f5853a 100644 --- a/nomad/plan_apply.go +++ b/nomad/plan_apply.go @@ -415,7 +415,10 @@ func evaluateNodePlan(snap *state.StateSnapshot, plan *structs.Plan, nodeID stri return false, "node does not exist", nil } else if node.Status != structs.NodeStatusReady { return false, "node is not ready for placements", nil + } else if node.SchedulingEligibility == structs.NodeSchedulingIneligible { + return false, "node is not eligible for draining", nil } else if node.Drain { + // Deprecate in favor of scheduling eligibility and remove post-0.8 return false, "node is draining", nil } diff --git a/nomad/state/state_store.go b/nomad/state/state_store.go index 6156a3c75020..67a02f348976 100644 --- a/nomad/state/state_store.go +++ b/nomad/state/state_store.go @@ -6,6 +6,7 @@ import ( "io" "log" "sort" + "time" "github.com/hashicorp/go-memdb" multierror "github.com/hashicorp/go-multierror" @@ -635,6 +636,17 @@ func (s *StateStore) UpdateNodeDrain(index uint64, nodeID string, drain bool) er // Update the drain in the copy copyNode.Drain = drain + //FIXME + if drain { + copyNode.DrainStrategy = &structs.DrainStrategy{ + StartTime: time.Now().UnixNano(), + Deadline: 10 * time.Second, + } + copyNode.SchedulingEligibility = structs.NodeSchedulingIneligible + } else { + copyNode.DrainStrategy = nil + copyNode.SchedulingEligibility = structs.NodeSchedulingEligible + } copyNode.ModifyIndex = index // Insert the node diff --git a/nomad/structs/structs.go b/nomad/structs/structs.go index 54c89fb95e20..68975ec69963 100644 --- a/nomad/structs/structs.go +++ b/nomad/structs/structs.go @@ -1142,6 +1142,7 @@ const ( // ShouldDrainNode checks if a given node status should trigger an // evaluation. Some states don't require any further action. +//TODO(schmichael) Update for drainv2?! func ShouldDrainNode(status string) bool { switch status { case NodeStatusInit, NodeStatusReady: @@ -1163,6 +1164,44 @@ func ValidNodeStatus(status string) bool { } } +const ( + NodeSchedulingEligible = "eligbile" + NodeSchedulingIneligible = "ineligible" +) + +// DrainStrategy describes a Node's drain behavior. +type DrainStrategy struct { + // StartTime as nanoseconds since Unix epoch indicating when a drain + // began for deadline calcuations. + StartTime int64 + + // Deadline is the duration after StartTime when the remaining + // allocations on a draining Node should be told to stop. 
+ Deadline time.Duration +} + +func (d *DrainStrategy) Copy() *DrainStrategy { + if d == nil { + return nil + } + + nd := new(DrainStrategy) + *nd = *d + return nd +} + +// DeadlineTime returns the Time this drain's deadline will be reached or the +// zero value for Time if DrainStrategy is nil or Duration is <= 0. +func (d *DrainStrategy) DeadlineTime() time.Time { + if d == nil { + return time.Time{} + } + if d.Deadline <= 0 { + return time.Time{} + } + return time.Unix(0, d.StartTime).Add(d.Deadline) +} + // Node is a representation of a schedulable client node type Node struct { // ID is a unique identifier for the node. It can be constructed @@ -1224,9 +1263,18 @@ type Node struct { // Drain is controlled by the servers, and not the client. // If true, no jobs will be scheduled to this node, and existing - // allocations will be drained. + // allocations will be drained. Superceded by DrainStrategy in Nomad + // 0.8 but kept for backward compat. Drain bool + // DrainStrategy determines the node's draining behavior. Will be nil + // when Drain=false. + DrainStrategy *DrainStrategy + + // SchedulingEligibility determines whether this node will receive new + // placements. + SchedulingEligibility string + // Status of this node Status string @@ -1249,9 +1297,10 @@ type Node struct { ModifyIndex uint64 } -// Ready returns if the node is ready for running allocations +// Ready returns true if the node is ready for running allocations func (n *Node) Ready() bool { - return n.Status == NodeStatusReady && !n.Drain + // Drain is checked directly to support pre-0.8 Node data + return n.Status == NodeStatusReady && !n.Drain && n.SchedulingEligibility == NodeSchedulingEligible } func (n *Node) Copy() *Node { @@ -1261,6 +1310,7 @@ func (n *Node) Copy() *Node { nn := new(Node) *nn = *n nn.Attributes = helper.CopyMapStringString(nn.Attributes) + nn.DrainStrategy = nn.DrainStrategy.Copy() nn.Resources = nn.Resources.Copy() nn.Reserved = nn.Reserved.Copy() nn.Links = helper.CopyMapStringString(nn.Links) @@ -1300,34 +1350,36 @@ func (n *Node) Stub() *NodeListStub { addr, _, _ := net.SplitHostPort(n.HTTPAddr) return &NodeListStub{ - Address: addr, - ID: n.ID, - Datacenter: n.Datacenter, - Name: n.Name, - NodeClass: n.NodeClass, - Version: n.Attributes["nomad.version"], - Drain: n.Drain, - Status: n.Status, - StatusDescription: n.StatusDescription, - CreateIndex: n.CreateIndex, - ModifyIndex: n.ModifyIndex, + Address: addr, + ID: n.ID, + Datacenter: n.Datacenter, + Name: n.Name, + NodeClass: n.NodeClass, + Version: n.Attributes["nomad.version"], + Drain: n.Drain, + SchedulingEligibility: n.SchedulingEligibility, + Status: n.Status, + StatusDescription: n.StatusDescription, + CreateIndex: n.CreateIndex, + ModifyIndex: n.ModifyIndex, } } // NodeListStub is used to return a subset of job information // for the job list type NodeListStub struct { - Address string - ID string - Datacenter string - Name string - NodeClass string - Version string - Drain bool - Status string - StatusDescription string - CreateIndex uint64 - ModifyIndex uint64 + Address string + ID string + Datacenter string + Name string + NodeClass string + Version string + Drain bool + SchedulingEligibility string + Status string + StatusDescription string + CreateIndex uint64 + ModifyIndex uint64 } // Networks defined for a task on the Resources struct. 
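
For reference, a minimal sketch (not part of the patch) of how the DrainStrategy added above is meant to be consulted. It relies only on what this diff shows: DeadlineTime() returns the zero Time for a nil strategy or a non-positive Deadline, and otherwise StartTime (Unix nanoseconds) plus Deadline; the node drainer earlier in the patch makes the same Before/After comparisons against time.Now().

package main

import (
	"fmt"
	"time"

	"github.com/hashicorp/nomad/nomad/structs"
)

func main() {
	// A node that began draining now and must finish within 10 seconds.
	strategy := &structs.DrainStrategy{
		StartTime: time.Now().UnixNano(), // nanoseconds since the Unix epoch
		Deadline:  10 * time.Second,
	}

	// Equivalent to time.Unix(0, StartTime).Add(Deadline); the zero Time
	// would mean "no deadline" (nil strategy or Deadline <= 0).
	deadline := strategy.DeadlineTime()

	if !deadline.IsZero() && deadline.Before(time.Now()) {
		// Deadline reached: the drainer stops any remaining allocs on the
		// node regardless of the task groups' migrate strategies.
		fmt.Println("drain deadline reached")
	}
}
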
@@ -2898,6 +2950,64 @@ func NewReschedulePolicy(jobType string) *ReschedulePolicy { return nil } +const ( + MigrateStrategyHealthChecks = "checks" + MigrateStrategyHealthStates = "task_states" +) + +type MigrateStrategy struct { + MaxParallel int + HealthCheck string + MinHealthyTime time.Duration + HealthyDeadline time.Duration +} + +// DefaultMigrateStrategy is used for backwards compat with pre-0.8 Allocations +// that lack an update strategy. +// +// This function should match its counterpart in api/tasks.go +func DefaultMigrateStrategy() *MigrateStrategy { + return &MigrateStrategy{ + MaxParallel: 1, + HealthCheck: MigrateStrategyHealthChecks, + MinHealthyTime: 10 * time.Second, + HealthyDeadline: 5 * time.Minute, + } +} + +func (m *MigrateStrategy) Validate() error { + var mErr multierror.Error + + if m.MaxParallel < 0 { + multierror.Append(&mErr, fmt.Errorf("MaxParallel must be >= 0 but found %d", m.MaxParallel)) + } + + switch m.HealthCheck { + case MigrateStrategyHealthChecks, MigrateStrategyHealthStates: + // ok + case "": + if m.MaxParallel > 0 { + multierror.Append(&mErr, fmt.Errorf("Missing HealthCheck")) + } + default: + multierror.Append(&mErr, fmt.Errorf("Invalid HealthCheck: %q", m.HealthCheck)) + } + + if m.MinHealthyTime < 0 { + multierror.Append(&mErr, fmt.Errorf("MinHealthyTime is %s and must be >= 0", m.MinHealthyTime)) + } + + if m.HealthyDeadline < 0 { + multierror.Append(&mErr, fmt.Errorf("HealthyDeadline is %s and must be >= 0", m.HealthyDeadline)) + } + + if m.MinHealthyTime > m.HealthyDeadline { + multierror.Append(&mErr, fmt.Errorf("MinHealthyTime must be less than HealthyDeadline")) + } + + return mErr.ErrorOrNil() +} + // TaskGroup is an atomic unit of placement. Each task group belongs to // a job and may contain any number of tasks. A task group support running // in many replicas using the same configuration.. @@ -2912,6 +3022,9 @@ type TaskGroup struct { // Update is used to control the update strategy for this task group Update *UpdateStrategy + // Migrate is used to control the migration strategy for this task group + Migrate *MigrateStrategy + // Constraints can be specified at a task group level and apply to // all the tasks contained. 
Constraints []*Constraint @@ -3059,6 +3172,20 @@ func (tg *TaskGroup) Validate(j *Job) error { } } + // Validate the migration strategy + switch j.Type { + case JobTypeService: + if tg.Count == 1 && tg.Migrate != nil { + mErr.Errors = append(mErr.Errors, fmt.Errorf("Task Group %v should not have a migration strategy with a count = 1", tg.Name)) + } else if err := tg.Migrate.Validate(); err != nil { + mErr.Errors = append(mErr.Errors, err) + } + default: + if tg.Migrate != nil { + mErr.Errors = append(mErr.Errors, fmt.Errorf("Job type %q does not allow migrate block", j.Type)) + } + } + // Check for duplicate tasks, that there is only leader task if any, // and no duplicated static ports tasks := make(map[string]int) @@ -5837,6 +5964,7 @@ const ( EvalTriggerJobRegister = "job-register" EvalTriggerJobDeregister = "job-deregister" EvalTriggerPeriodicJob = "periodic-job" + EvalTriggerNodeDrain = "node-drain" EvalTriggerNodeUpdate = "node-update" EvalTriggerScheduled = "scheduled" EvalTriggerRollingUpdate = "rolling-update" diff --git a/scheduler/generic_sched.go b/scheduler/generic_sched.go index 94dbc8a4b60f..32758359b8c4 100644 --- a/scheduler/generic_sched.go +++ b/scheduler/generic_sched.go @@ -117,8 +117,9 @@ func (s *GenericScheduler) Process(eval *structs.Evaluation) error { // Verify the evaluation trigger reason is understood switch eval.TriggeredBy { - case structs.EvalTriggerJobRegister, structs.EvalTriggerNodeUpdate, - structs.EvalTriggerJobDeregister, structs.EvalTriggerRollingUpdate, + case structs.EvalTriggerJobRegister, structs.EvalTriggerJobDeregister, + structs.EvalTriggerNodeDrain, structs.EvalTriggerNodeUpdate, + structs.EvalTriggerRollingUpdate, structs.EvalTriggerPeriodicJob, structs.EvalTriggerMaxPlans, structs.EvalTriggerDeploymentWatcher, structs.EvalTriggerRetryFailedAlloc: default: diff --git a/scheduler/util.go b/scheduler/util.go index 17b7942accda..3417356014b6 100644 --- a/scheduler/util.go +++ b/scheduler/util.go @@ -249,6 +249,9 @@ func readyNodesInDCs(state State, dcs []string) ([]*structs.Node, map[string]int if node.Drain { continue } + if node.SchedulingEligibility != structs.NodeSchedulingEligible { + continue + } if _, ok := dcMap[node.Datacenter]; !ok { continue } @@ -315,9 +318,10 @@ func taintedNodes(state State, allocs []*structs.Allocation) (map[string]*struct out[alloc.NodeID] = nil continue } - if structs.ShouldDrainNode(node.Status) || node.Drain { - out[alloc.NodeID] = node - } + //FIXME is this right? + //if structs.ShouldDrainNode(node.Status) || node.Drain { + // out[alloc.NodeID] = node + //} } return out, nil } diff --git a/testutil/rpcapi/rcpapi.go b/testutil/rpcapi/rcpapi.go new file mode 100644 index 000000000000..71e5be057ea0 --- /dev/null +++ b/testutil/rpcapi/rcpapi.go @@ -0,0 +1,114 @@ +package rpcapi + +import ( + "net/rpc" + + msgpackrpc "github.com/hashicorp/net-rpc-msgpackrpc" + "github.com/hashicorp/nomad/nomad/structs" +) + +type RPC struct { + Region string + Namespace string + codec rpc.ClientCodec +} + +func NewRPC(codec rpc.ClientCodec) *RPC { + return &RPC{ + Region: "global", + Namespace: structs.DefaultNamespace, + codec: codec, + } +} + +// AllocAll calls Alloc.List + Alloc.GetAllocs to return all allocs. 
+func (r *RPC) AllocAll() ([]*structs.Allocation, error) { + listResp, err := r.AllocList() + if err != nil { + return nil, err + } + + ids := make([]string, 0, len(listResp.Allocations)) + for _, a := range listResp.Allocations { + ids = append(ids, a.ID) + } + + allocsResp, err := r.AllocGetAllocs(ids) + if err != nil { + return nil, err + } + return allocsResp.Allocs, nil +} + +// Alloc.List RPC +func (r *RPC) AllocList() (*structs.AllocListResponse, error) { + get := &structs.AllocListRequest{ + QueryOptions: structs.QueryOptions{ + Region: r.Region, + Namespace: r.Namespace, + }, + } + + var resp structs.AllocListResponse + if err := msgpackrpc.CallWithCodec(r.codec, "Alloc.List", get, &resp); err != nil { + return nil, err + } + return &resp, nil +} + +// Alloc.GetAllocs RPC +func (r *RPC) AllocGetAllocs(ids []string) (*structs.AllocsGetResponse, error) { + get := &structs.AllocsGetRequest{ + AllocIDs: ids, + QueryOptions: structs.QueryOptions{ + Region: r.Region, + Namespace: r.Namespace, + }, + } + var resp structs.AllocsGetResponse + if err := msgpackrpc.CallWithCodec(r.codec, "Alloc.GetAllocs", get, &resp); err != nil { + return nil, err + } + return &resp, nil +} + +// Job.List RPC +func (r *RPC) JobList() (*structs.JobListResponse, error) { + get := &structs.JobListRequest{ + QueryOptions: structs.QueryOptions{ + Region: r.Region, + Namespace: r.Namespace, + }, + } + + var resp structs.JobListResponse + if err := msgpackrpc.CallWithCodec(r.codec, "Job.List", get, &resp); err != nil { + return nil, err + } + return &resp, nil +} + +// Node.List RPC +func (r *RPC) NodeList() (*structs.NodeListResponse, error) { + get := &structs.NodeListRequest{ + QueryOptions: structs.QueryOptions{Region: r.Region}, + } + var resp structs.NodeListResponse + if err := msgpackrpc.CallWithCodec(r.codec, "Node.List", get, &resp); err != nil { + return nil, err + } + return &resp, nil +} + +// Node.GetAllocs RPC +func (r *RPC) NodeGetAllocs(nodeID string) (*structs.NodeAllocsResponse, error) { + get := &structs.NodeSpecificRequest{ + NodeID: nodeID, + QueryOptions: structs.QueryOptions{Region: r.Region}, + } + var resp structs.NodeAllocsResponse + if err := msgpackrpc.CallWithCodec(r.codec, "Node.GetAllocs", get, &resp); err != nil { + return nil, err + } + return &resp, nil +} From 587d4e264b5c41a72e682e5dffaecadbc9259b0f Mon Sep 17 00:00:00 2001 From: Michael Schurter Date: Fri, 16 Feb 2018 15:07:49 -0800 Subject: [PATCH 02/79] testlog: override testlogger with envvar --- helper/testlog/testlog.go | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/helper/testlog/testlog.go b/helper/testlog/testlog.go index b72fcfb28bef..709bd9d54745 100644 --- a/helper/testlog/testlog.go +++ b/helper/testlog/testlog.go @@ -6,8 +6,14 @@ package testlog import ( "io" "log" + "os" ) +// UseStdout returns true if NOMAD_TEST_STDOUT=1 and sends logs to stdout. +func UseStdout() bool { + return os.Getenv("NOMAD_TEST_STDOUT") == "1" +} + // LogPrinter is the methods of testing.T (or testing.B) needed by the test // logger. type LogPrinter interface { @@ -27,11 +33,17 @@ func (w *writer) Write(p []byte) (n int, err error) { // NewWriter creates a new io.Writer backed by a Logger. func NewWriter(t LogPrinter) io.Writer { + if UseStdout() { + return os.Stdout + } return &writer{t} } // New returns a new test logger. 
See https://golang.org/pkg/log/#New func New(t LogPrinter, prefix string, flag int) *log.Logger { + if UseStdout() { + return log.New(os.Stdout, prefix, flag) + } return log.New(&writer{t}, prefix, flag) } From 91e8fd098f6614bb333db3e9b96d3870c055ee67 Mon Sep 17 00:00:00 2001 From: Michael Schurter Date: Mon, 19 Feb 2018 17:14:54 -0800 Subject: [PATCH 03/79] mock_driver: improve Kill() logging --- client/driver/mock_driver.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/client/driver/mock_driver.go b/client/driver/mock_driver.go index 09a86f72deda..ffa6b09774ef 100644 --- a/client/driver/mock_driver.go +++ b/client/driver/mock_driver.go @@ -379,7 +379,7 @@ func (h *mockDriverHandle) Signal(s os.Signal) error { // Kill kills a mock task func (h *mockDriverHandle) Kill() error { - h.logger.Printf("[DEBUG] driver.mock: killing task %q after kill timeout: %v", h.taskName, h.killTimeout) + h.logger.Printf("[DEBUG] driver.mock: killing task %q after %s or kill timeout: %v", h.taskName, h.killAfter, h.killTimeout) select { case <-h.doneCh: case <-time.After(h.killAfter): From 48d637dad191075f2da6a1d1d72810f10ca7442b Mon Sep 17 00:00:00 2001 From: Alex Dadgar Date: Wed, 21 Feb 2018 10:58:04 -0800 Subject: [PATCH 04/79] RPC, FSM, State Store for marking DesiredTransistion fix build tag --- api/allocations.go | 10 +++ client/driver/mock_driver_testing.go | 2 +- nomad/alloc_endpoint.go | 33 ++++++++ nomad/alloc_endpoint_test.go | 58 ++++++++++++++ nomad/fsm.go | 18 +++++ nomad/fsm_test.go | 42 +++++++++++ nomad/mock/mock.go | 5 +- nomad/node_endpoint.go | 2 +- nomad/state/state_store.go | 57 ++++++++++++++ nomad/state/state_store_test.go | 52 +++++++++++++ nomad/structs/structs.go | 37 +++++++++ scheduler/generic_sched_test.go | 108 ++++----------------------- scheduler/reconcile.go | 1 + scheduler/reconcile_test.go | 8 ++ scheduler/reconcile_util.go | 5 +- scheduler/system_sched.go | 2 +- scheduler/system_sched_test.go | 4 + scheduler/testing.go | 8 +- scheduler/util.go | 37 +++++---- scheduler/util_test.go | 7 ++ 20 files changed, 379 insertions(+), 117 deletions(-) diff --git a/api/allocations.go b/api/allocations.go index 68047ee5b462..89206dadee0b 100644 --- a/api/allocations.go +++ b/api/allocations.go @@ -81,6 +81,7 @@ type Allocation struct { Metrics *AllocationMetric DesiredStatus string DesiredDescription string + DesiredTransistion DesiredTransistion ClientStatus string ClientDescription string TaskStates map[string]*TaskState @@ -205,3 +206,12 @@ type RescheduleEvent struct { // PrevNodeID is the node ID of the previous allocation PrevNodeID string } + +// DesiredTransistion is used to mark an allocation as having a desired state +// transistion. This information can be used by the scheduler to make the +// correct decision. +type DesiredTransistion struct { + // Migrate is used to indicate that this allocation should be stopped and + // migrated to another node. 
+ Migrate *bool +} diff --git a/client/driver/mock_driver_testing.go b/client/driver/mock_driver_testing.go index 1b1e861a8915..8a712205e4aa 100644 --- a/client/driver/mock_driver_testing.go +++ b/client/driver/mock_driver_testing.go @@ -1,4 +1,4 @@ -//+build nomad_test +// +build nomad_test package driver diff --git a/nomad/alloc_endpoint.go b/nomad/alloc_endpoint.go index 033a1a0103aa..a7f5e3bdc2ac 100644 --- a/nomad/alloc_endpoint.go +++ b/nomad/alloc_endpoint.go @@ -1,6 +1,7 @@ package nomad import ( + "fmt" "time" "github.com/armon/go-metrics" @@ -200,3 +201,35 @@ func (a *Alloc) GetAllocs(args *structs.AllocsGetRequest, } return a.srv.blockingRPC(&opts) } + +// UpdateDesiredTransistion is used to update the desired transistions of an +// allocation. +func (a *Alloc) UpdateDesiredTransistion(args *structs.AllocUpdateDesiredTransistionRequest, reply *structs.GenericResponse) error { + if done, err := a.srv.forward("Alloc.UpdateDesiredTransistion", args, args, reply); done { + return err + } + defer metrics.MeasureSince([]string{"nomad", "alloc", "update_desired_transistion"}, time.Now()) + + // Check that it is a management token. + if aclObj, err := a.srv.ResolveToken(args.AuthToken); err != nil { + return err + } else if aclObj != nil && !aclObj.IsManagement() { + return structs.ErrPermissionDenied + } + + // Ensure at least a single alloc + if len(args.Allocs) == 0 { + return fmt.Errorf("must update at least one allocation") + } + + // Commit this update via Raft + _, index, err := a.srv.raftApply(structs.AllocUpdateDesiredTransistionRequestType, args) + if err != nil { + a.srv.logger.Printf("[ERR] nomad.allocs: AllocUpdateDesiredTransistionRequest failed: %v", err) + return err + } + + // Setup the response + reply.Index = index + return nil +} diff --git a/nomad/alloc_endpoint_test.go b/nomad/alloc_endpoint_test.go index abb36178681c..f898f2b7dd9f 100644 --- a/nomad/alloc_endpoint_test.go +++ b/nomad/alloc_endpoint_test.go @@ -7,11 +7,13 @@ import ( "github.com/hashicorp/net-rpc-msgpackrpc" "github.com/hashicorp/nomad/acl" + "github.com/hashicorp/nomad/helper" "github.com/hashicorp/nomad/helper/uuid" "github.com/hashicorp/nomad/nomad/mock" "github.com/hashicorp/nomad/nomad/structs" "github.com/hashicorp/nomad/testutil" "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" ) func TestAllocEndpoint_List(t *testing.T) { @@ -481,3 +483,59 @@ func TestAllocEndpoint_GetAllocs_Blocking(t *testing.T) { t.Fatalf("bad: %#v", resp.Allocs) } } + +func TestAllocEndpoint_UpdateDesiredTransistion(t *testing.T) { + t.Parallel() + require := require.New(t) + + s1, _ := TestACLServer(t, nil) + defer s1.Shutdown() + codec := rpcClient(t, s1) + testutil.WaitForLeader(t, s1.RPC) + + // Create the register request + alloc := mock.Alloc() + alloc2 := mock.Alloc() + state := s1.fsm.State() + require.Nil(state.UpsertJobSummary(998, mock.JobSummary(alloc.JobID))) + require.Nil(state.UpsertJobSummary(999, mock.JobSummary(alloc2.JobID))) + require.Nil(state.UpsertAllocs(1000, []*structs.Allocation{alloc, alloc2})) + + t1 := &structs.DesiredTransistion{ + Migrate: helper.BoolToPtr(true), + } + + // Update the allocs desired status + get := &structs.AllocUpdateDesiredTransistionRequest{ + Allocs: map[string]*structs.DesiredTransistion{ + alloc.ID: t1, + alloc2.ID: t1, + }, + WriteRequest: structs.WriteRequest{ + Region: "global", + }, + } + + // Try without permissions + var resp structs.GenericResponse + err := msgpackrpc.CallWithCodec(codec, "Alloc.UpdateDesiredTransistion", get, 
&resp) + require.NotNil(err) + require.True(structs.IsErrPermissionDenied(err)) + + // Try with permissions + get.WriteRequest.AuthToken = s1.getLeaderAcl() + var resp2 structs.GenericResponse + require.Nil(msgpackrpc.CallWithCodec(codec, "Alloc.UpdateDesiredTransistion", get, &resp2)) + require.NotZero(resp2.Index) + + // Look up the allocations + out1, err := state.AllocByID(nil, alloc.ID) + require.Nil(err) + out2, err := state.AllocByID(nil, alloc.ID) + require.Nil(err) + + require.NotNil(out1.DesiredTransistion.Migrate) + require.NotNil(out2.DesiredTransistion.Migrate) + require.True(*out1.DesiredTransistion.Migrate) + require.True(*out2.DesiredTransistion.Migrate) +} diff --git a/nomad/fsm.go b/nomad/fsm.go index 21a785b6750f..a1d9113cada2 100644 --- a/nomad/fsm.go +++ b/nomad/fsm.go @@ -240,6 +240,8 @@ func (n *nomadFSM) Apply(log *raft.Log) interface{} { return n.applyUpsertNodeEvent(buf[1:], log.Index) case structs.JobBatchDeregisterRequestType: return n.applyBatchDeregisterJob(buf[1:], log.Index) + case structs.AllocUpdateDesiredTransistionRequestType: + return n.applyAllocUpdateDesiredTransition(buf[1:], log.Index) } // Check enterprise only message types. @@ -651,6 +653,22 @@ func (n *nomadFSM) applyAllocClientUpdate(buf []byte, index uint64) interface{} return nil } +// applyAllocUpdateDesiredTransition is used to update the desired transistions +// of a set of allocations. +func (n *nomadFSM) applyAllocUpdateDesiredTransition(buf []byte, index uint64) interface{} { + defer metrics.MeasureSince([]string{"nomad", "fsm", "alloc_update_desired_transistion"}, time.Now()) + var req structs.AllocUpdateDesiredTransistionRequest + if err := structs.Decode(buf, &req); err != nil { + panic(fmt.Errorf("failed to decode request: %v", err)) + } + + if err := n.state.UpdateAllocsDesiredTransistions(index, req.Allocs); err != nil { + n.logger.Printf("[ERR] nomad.fsm: UpdateAllocsDesiredTransistions failed: %v", err) + return err + } + return nil +} + // applyReconcileSummaries reconciles summaries for all the jobs func (n *nomadFSM) applyReconcileSummaries(buf []byte, index uint64) interface{} { if err := n.state.ReconcileJobSummaries(index); err != nil { diff --git a/nomad/fsm_test.go b/nomad/fsm_test.go index 5c2ed08cb112..a04f1cd2f1c1 100644 --- a/nomad/fsm_test.go +++ b/nomad/fsm_test.go @@ -1241,6 +1241,48 @@ func TestFSM_UpdateAllocFromClient(t *testing.T) { require.Equal(eval, res) } +func TestFSM_UpdateAllocDesiredTransistion(t *testing.T) { + t.Parallel() + fsm := testFSM(t) + state := fsm.State() + require := require.New(t) + + alloc := mock.Alloc() + alloc2 := mock.Alloc() + alloc2.Job = alloc.Job + alloc2.JobID = alloc.JobID + state.UpsertJobSummary(9, mock.JobSummary(alloc.JobID)) + state.UpsertAllocs(10, []*structs.Allocation{alloc, alloc2}) + + t1 := &structs.DesiredTransistion{ + Migrate: helper.BoolToPtr(true), + } + + req := structs.AllocUpdateDesiredTransistionRequest{ + Allocs: map[string]*structs.DesiredTransistion{ + alloc.ID: t1, + alloc2.ID: t1, + }, + } + buf, err := structs.Encode(structs.AllocUpdateDesiredTransistionRequestType, req) + require.Nil(err) + + resp := fsm.Apply(makeLog(buf)) + require.Nil(resp) + + // Verify we are registered + ws := memdb.NewWatchSet() + out1, err := fsm.State().AllocByID(ws, alloc.ID) + require.Nil(err) + out2, err := fsm.State().AllocByID(ws, alloc2.ID) + require.Nil(err) + + require.NotNil(out1.DesiredTransistion.Migrate) + require.NotNil(out2.DesiredTransistion.Migrate) + require.True(*out1.DesiredTransistion.Migrate) + 
require.True(*out2.DesiredTransistion.Migrate) +} + func TestFSM_UpsertVaultAccessor(t *testing.T) { t.Parallel() fsm := testFSM(t) diff --git a/nomad/mock/mock.go b/nomad/mock/mock.go index 1d96e556b39b..6c2a3f42e0a3 100644 --- a/nomad/mock/mock.go +++ b/nomad/mock/mock.go @@ -54,8 +54,9 @@ func Node() *structs.Node { "database": "mysql", "version": "5.6", }, - NodeClass: "linux-medium-pci", - Status: structs.NodeStatusReady, + NodeClass: "linux-medium-pci", + Status: structs.NodeStatusReady, + SchedulingEligibility: structs.NodeSchedulingEligible, } node.ComputeClass() return node diff --git a/nomad/node_endpoint.go b/nomad/node_endpoint.go index 3ef43ccf6903..182817392bdf 100644 --- a/nomad/node_endpoint.go +++ b/nomad/node_endpoint.go @@ -822,7 +822,7 @@ func (n *Node) UpdateAlloc(args *structs.AllocUpdateRequest, reply *structs.Gene // Ensure that evals aren't set from client RPCs // We create them here before the raft update if len(args.Evals) != 0 { - return fmt.Errorf("evals field must not be set ") + return fmt.Errorf("evals field must not be set") } // Update modified timestamp for client initiated allocation updates diff --git a/nomad/state/state_store.go b/nomad/state/state_store.go index 67a02f348976..1c67327ae4ca 100644 --- a/nomad/state/state_store.go +++ b/nomad/state/state_store.go @@ -2008,6 +2008,63 @@ func (s *StateStore) upsertAllocsImpl(index uint64, allocs []*structs.Allocation return nil } +// UpdateAllocsDesiredTransistions is used to update a set of allocations +// desired transistions. +func (s *StateStore) UpdateAllocsDesiredTransistions(index uint64, allocs map[string]*structs.DesiredTransistion) error { + txn := s.db.Txn(true) + defer txn.Abort() + + // Handle each of the updated allocations + for id, transistion := range allocs { + if err := s.nestedUpdateAllocDesiredTransition(txn, index, id, transistion); err != nil { + return err + } + } + + // Update the indexes + if err := txn.Insert("index", &IndexEntry{"allocs", index}); err != nil { + return fmt.Errorf("index update failed: %v", err) + } + + txn.Commit() + return nil +} + +// nestedUpdateAllocDesiredTransition is used to nest an update of an +// allocations desired transistion +func (s *StateStore) nestedUpdateAllocDesiredTransition( + txn *memdb.Txn, index uint64, allocID string, + transistion *structs.DesiredTransistion) error { + + // Look for existing alloc + existing, err := txn.First("allocs", "id", allocID) + if err != nil { + return fmt.Errorf("alloc lookup failed: %v", err) + } + + // Nothing to do if this does not exist + if existing == nil { + return nil + } + exist := existing.(*structs.Allocation) + + // Copy everything from the existing allocation + copyAlloc := exist.Copy() + + // Merge the desired transistions + copyAlloc.DesiredTransistion.Merge(transistion) + + // Update the modify index + copyAlloc.ModifyIndex = index + + // Update the allocation + if err := txn.Insert("allocs", copyAlloc); err != nil { + return fmt.Errorf("alloc insert failed: %v", err) + } + + return nil +} + // AllocByID is used to lookup an allocation by its ID func (s *StateStore) AllocByID(ws memdb.WatchSet, id string) (*structs.Allocation, error) { txn := s.db.Txn(false) diff --git a/nomad/state/state_store_test.go b/nomad/state/state_store_test.go index d176e178b9a9..4fd2173f94cf 100644 --- a/nomad/state/state_store_test.go +++ b/nomad/state/state_store_test.go @@ -3823,6 +3823,58 @@ func TestStateStore_UpdateAlloc_NoJob(t *testing.T) { } } +func TestStateStore_UpdateAllocDesiredTransistion(t *testing.T) { + 
t.Parallel() + require := require.New(t) + + state := testStateStore(t) + alloc := mock.Alloc() + + require.Nil(state.UpsertJob(999, alloc.Job)) + require.Nil(state.UpsertAllocs(1000, []*structs.Allocation{alloc})) + + t1 := &structs.DesiredTransistion{ + Migrate: helper.BoolToPtr(true), + } + t2 := &structs.DesiredTransistion{ + Migrate: helper.BoolToPtr(false), + } + + m := map[string]*structs.DesiredTransistion{alloc.ID: t1} + require.Nil(state.UpdateAllocsDesiredTransistions(1001, m)) + + ws := memdb.NewWatchSet() + out, err := state.AllocByID(ws, alloc.ID) + require.Nil(err) + require.NotNil(out.DesiredTransistion.Migrate) + require.True(*out.DesiredTransistion.Migrate) + require.EqualValues(1000, out.CreateIndex) + require.EqualValues(1001, out.ModifyIndex) + + index, err := state.Index("allocs") + require.Nil(err) + require.EqualValues(1001, index) + + m = map[string]*structs.DesiredTransistion{alloc.ID: t2} + require.Nil(state.UpdateAllocsDesiredTransistions(1002, m)) + + ws = memdb.NewWatchSet() + out, err = state.AllocByID(ws, alloc.ID) + require.Nil(err) + require.NotNil(out.DesiredTransistion.Migrate) + require.False(*out.DesiredTransistion.Migrate) + require.EqualValues(1000, out.CreateIndex) + require.EqualValues(1002, out.ModifyIndex) + + index, err = state.Index("allocs") + require.Nil(err) + require.EqualValues(1002, index) + + // Try with a bogus alloc id + m = map[string]*structs.DesiredTransistion{uuid.Generate(): t2} + require.Nil(state.UpdateAllocsDesiredTransistions(1003, m)) +} + func TestStateStore_JobSummary(t *testing.T) { state := testStateStore(t) diff --git a/nomad/structs/structs.go b/nomad/structs/structs.go index 68975ec69963..e50921c27cb5 100644 --- a/nomad/structs/structs.go +++ b/nomad/structs/structs.go @@ -78,6 +78,7 @@ const ( AutopilotRequestType UpsertNodeEventsType JobBatchDeregisterRequestType + AllocUpdateDesiredTransistionRequestType ) const ( @@ -573,6 +574,16 @@ type AllocUpdateRequest struct { WriteRequest } +// AllocUpdateDesiredTransistionRequest is used to submit changes to allocations +// desired transistion state. +type AllocUpdateDesiredTransistionRequest struct { + // Allocs is the mapping of allocation ids to their desired state + // transistion + Allocs map[string]*DesiredTransistion + + WriteRequest +} + // AllocListRequest is used to request a list of allocations type AllocListRequest struct { QueryOptions @@ -5338,6 +5349,28 @@ func (re *RescheduleEvent) Copy() *RescheduleEvent { return copy } +// DesiredTransistion is used to mark an allocation as having a desired state +// transistion. This information can be used by the scheduler to make the +// correct decision. +type DesiredTransistion struct { + // Migrate is used to indicate that this allocation should be stopped and + // migrated to another node. + Migrate *bool +} + +// Merge merges the two desired transitions, preferring the values from the +// passed in object. +func (d *DesiredTransistion) Merge(o *DesiredTransistion) { + if o.Migrate != nil { + d.Migrate = o.Migrate + } +} + +// ShouldMigrate returns whether the transistion object dictates a migration. 
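To make the merge semantics concrete: Merge only copies fields that are non-nil on the incoming transition, and ShouldMigrate (defined just below) treats a nil Migrate pointer as false. A minimal sketch inside a test body, using helper.BoolToPtr as the tests in this change already do (the identifier is still spelled DesiredTransistion at this point in the series; a later patch renames it):

    d := &structs.DesiredTransistion{}
    d.Merge(&structs.DesiredTransistion{Migrate: helper.BoolToPtr(true)})
    _ = d.ShouldMigrate() // true: Migrate was set by the merge

    d.Merge(&structs.DesiredTransistion{}) // nil Migrate, so the earlier value is kept
    _ = d.ShouldMigrate() // still true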
+func (d *DesiredTransistion) ShouldMigrate() bool { + return d.Migrate != nil && *d.Migrate +} + const ( AllocDesiredStatusRun = "run" // Allocation should run AllocDesiredStatusStop = "stop" // Allocation should stop @@ -5399,6 +5432,10 @@ type Allocation struct { // DesiredStatusDescription is meant to provide more human useful information DesiredDescription string + // DesiredTransistion is used to indicate that a state transistion + // is desired for a given reason. + DesiredTransistion DesiredTransistion + // Status of the allocation on the client ClientStatus string diff --git a/scheduler/generic_sched_test.go b/scheduler/generic_sched_test.go index 5b21034eb9cb..d1bbf4710334 100644 --- a/scheduler/generic_sched_test.go +++ b/scheduler/generic_sched_test.go @@ -2211,6 +2211,7 @@ func TestServiceSched_NodeDown(t *testing.T) { // Register a node node := mock.Node() + node.Status = structs.NodeStatusDown noErr(t, h.State.UpsertNode(h.NextIndex(), node)) // Generate a fake job with allocations and an update policy. @@ -2235,18 +2236,19 @@ func TestServiceSched_NodeDown(t *testing.T) { allocs[9].DesiredStatus = structs.AllocDesiredStatusRun allocs[9].ClientStatus = structs.AllocClientStatusComplete - noErr(t, h.State.UpsertAllocs(h.NextIndex(), allocs)) - // Mark some allocs as running - ws := memdb.NewWatchSet() for i := 0; i < 4; i++ { - out, _ := h.State.AllocByID(ws, allocs[i].ID) + out := allocs[i] out.ClientStatus = structs.AllocClientStatusRunning - noErr(t, h.State.UpdateAllocsFromClient(h.NextIndex(), []*structs.Allocation{out})) } - // Mark the node as down - noErr(t, h.State.UpdateNodeStatus(h.NextIndex(), node.ID, structs.NodeStatusDown)) + // Mark appropriate allocs for migration + for i := 0; i < 7; i++ { + out := allocs[i] + out.DesiredTransistion.Migrate = helper.BoolToPtr(true) + } + + noErr(t, h.State.UpsertAllocs(h.NextIndex(), allocs)) // Create a mock evaluation to deal with drain eval := &structs.Evaluation{ @@ -2365,6 +2367,7 @@ func TestServiceSched_NodeDrain(t *testing.T) { alloc.JobID = job.ID alloc.NodeID = node.ID alloc.Name = fmt.Sprintf("my-job.web[%d]", i) + alloc.DesiredTransistion.Migrate = helper.BoolToPtr(true) allocs = append(allocs, alloc) } noErr(t, h.State.UpsertAllocs(h.NextIndex(), allocs)) @@ -2447,9 +2450,10 @@ func TestServiceSched_NodeDrain_Down(t *testing.T) { // Set the desired state of the allocs to stop var stop []*structs.Allocation - for i := 0; i < 10; i++ { + for i := 0; i < 6; i++ { newAlloc := allocs[i].Copy() newAlloc.ClientStatus = structs.AllocDesiredStatusStop + newAlloc.DesiredTransistion.Migrate = helper.BoolToPtr(true) stop = append(stop, newAlloc) } noErr(t, h.State.UpsertAllocs(h.NextIndex(), stop)) @@ -2466,7 +2470,7 @@ func TestServiceSched_NodeDrain_Down(t *testing.T) { // Mark some of the allocations as complete var complete []*structs.Allocation for i := 6; i < 10; i++ { - newAlloc := stop[i].Copy() + newAlloc := allocs[i].Copy() newAlloc.TaskStates = make(map[string]*structs.TaskState) newAlloc.TaskStates["web"] = &structs.TaskState{ State: structs.TaskStateDead, @@ -2552,6 +2556,7 @@ func TestServiceSched_NodeDrain_Queued_Allocations(t *testing.T) { alloc.JobID = job.ID alloc.NodeID = node.ID alloc.Name = fmt.Sprintf("my-job.web[%d]", i) + alloc.DesiredTransistion.Migrate = helper.BoolToPtr(true) allocs = append(allocs, alloc) } noErr(t, h.State.UpsertAllocs(h.NextIndex(), allocs)) @@ -2583,88 +2588,6 @@ func TestServiceSched_NodeDrain_Queued_Allocations(t *testing.T) { } } -func 
TestServiceSched_NodeDrain_UpdateStrategy(t *testing.T) { - h := NewHarness(t) - - // Register a draining node - node := mock.Node() - node.Drain = true - noErr(t, h.State.UpsertNode(h.NextIndex(), node)) - - // Create some nodes - for i := 0; i < 10; i++ { - node := mock.Node() - noErr(t, h.State.UpsertNode(h.NextIndex(), node)) - } - - // Generate a fake job with allocations and an update policy. - job := mock.Job() - mp := 5 - u := structs.DefaultUpdateStrategy.Copy() - u.MaxParallel = mp - u.Stagger = time.Second - job.TaskGroups[0].Update = u - - noErr(t, h.State.UpsertJob(h.NextIndex(), job)) - - var allocs []*structs.Allocation - for i := 0; i < 10; i++ { - alloc := mock.Alloc() - alloc.Job = job - alloc.JobID = job.ID - alloc.NodeID = node.ID - alloc.Name = fmt.Sprintf("my-job.web[%d]", i) - allocs = append(allocs, alloc) - } - noErr(t, h.State.UpsertAllocs(h.NextIndex(), allocs)) - - // Create a mock evaluation to deal with drain - eval := &structs.Evaluation{ - Namespace: structs.DefaultNamespace, - ID: uuid.Generate(), - Priority: 50, - TriggeredBy: structs.EvalTriggerNodeUpdate, - JobID: job.ID, - NodeID: node.ID, - Status: structs.EvalStatusPending, - } - noErr(t, h.State.UpsertEvals(h.NextIndex(), []*structs.Evaluation{eval})) - - // Process the evaluation - err := h.Process(NewServiceScheduler, eval) - if err != nil { - t.Fatalf("err: %v", err) - } - - // Ensure a single plan - if len(h.Plans) != 1 { - t.Fatalf("bad: %#v", h.Plans) - } - plan := h.Plans[0] - - // Ensure the plan evicted all allocs - if len(plan.NodeUpdate[node.ID]) != mp { - t.Fatalf("bad: %#v", plan) - } - - // Ensure the plan allocated - var planned []*structs.Allocation - for _, allocList := range plan.NodeAllocation { - planned = append(planned, allocList...) - } - if len(planned) != mp { - t.Fatalf("bad: %#v", plan) - } - - // Ensure there is a followup eval. - if len(h.CreateEvals) != 1 || - h.CreateEvals[0].TriggeredBy != structs.EvalTriggerRollingUpdate { - t.Fatalf("bad: %#v", h.CreateEvals) - } - - h.AssertEvalStatus(t, structs.EvalStatusComplete) -} - func TestServiceSched_RetryLimit(t *testing.T) { h := NewHarness(t) h.Planner = &RejectPlan{h} @@ -3755,6 +3678,7 @@ func TestBatchSched_NodeDrain_Running_OldJob(t *testing.T) { // Create an update job job2 := job.Copy() job2.TaskGroups[0].Tasks[0].Env = map[string]string{"foo": "bar"} + job2.Version++ noErr(t, h.State.UpsertJob(h.NextIndex(), job2)) // Create a mock evaluation to register the job @@ -4021,10 +3945,10 @@ func TestServiceSched_NodeDrain_Sticky(t *testing.T) { // Create an alloc on the draining node alloc := mock.Alloc() alloc.Name = "my-job.web[0]" - alloc.DesiredStatus = structs.AllocDesiredStatusStop alloc.NodeID = node.ID alloc.Job.TaskGroups[0].Count = 1 alloc.Job.TaskGroups[0].EphemeralDisk.Sticky = true + alloc.DesiredTransistion.Migrate = helper.BoolToPtr(true) noErr(t, h.State.UpsertJob(h.NextIndex(), alloc.Job)) noErr(t, h.State.UpsertAllocs(h.NextIndex(), []*structs.Allocation{alloc})) diff --git a/scheduler/reconcile.go b/scheduler/reconcile.go index 3bfd1a89e14d..cdc375510750 100644 --- a/scheduler/reconcile.go +++ b/scheduler/reconcile.go @@ -499,6 +499,7 @@ func (a *allocReconciler) computeGroup(group string, all allocSet) bool { }) } + // TODO Deprecate // We need to create a followup evaluation. 
if followup && strategy != nil && a.result.followupEvalWait < strategy.Stagger { a.result.followupEvalWait = strategy.Stagger diff --git a/scheduler/reconcile_test.go b/scheduler/reconcile_test.go index 34f6eddbfa0c..a9188fa42ee5 100644 --- a/scheduler/reconcile_test.go +++ b/scheduler/reconcile_test.go @@ -927,6 +927,7 @@ func TestReconciler_DrainNode(t *testing.T) { for i := 0; i < 2; i++ { n := mock.Node() n.ID = allocs[i].NodeID + allocs[i].DesiredTransistion.Migrate = helper.BoolToPtr(true) n.Drain = true tainted[n.ID] = n } @@ -979,6 +980,7 @@ func TestReconciler_DrainNode_ScaleUp(t *testing.T) { for i := 0; i < 2; i++ { n := mock.Node() n.ID = allocs[i].NodeID + allocs[i].DesiredTransistion.Migrate = helper.BoolToPtr(true) n.Drain = true tainted[n.ID] = n } @@ -1032,6 +1034,7 @@ func TestReconciler_DrainNode_ScaleDown(t *testing.T) { for i := 0; i < 3; i++ { n := mock.Node() n.ID = allocs[i].NodeID + allocs[i].DesiredTransistion.Migrate = helper.BoolToPtr(true) n.Drain = true tainted[n.ID] = n } @@ -2213,6 +2216,7 @@ func TestReconciler_PausedOrFailedDeployment_Migrations(t *testing.T) { for i := 0; i < 3; i++ { n := mock.Node() n.ID = allocs[i].NodeID + allocs[i].DesiredTransistion.Migrate = helper.BoolToPtr(true) n.Drain = true tainted[n.ID] = n } @@ -2286,6 +2290,7 @@ func TestReconciler_DrainNode_Canary(t *testing.T) { tainted := make(map[string]*structs.Node, 1) n := mock.Node() n.ID = allocs[11].NodeID + allocs[11].DesiredTransistion.Migrate = helper.BoolToPtr(true) n.Drain = true tainted[n.ID] = n @@ -3025,6 +3030,7 @@ func TestReconciler_TaintedNode_RollingUpgrade(t *testing.T) { n.Status = structs.NodeStatusDown } else { n.Drain = true + allocs[2+i].DesiredTransistion.Migrate = helper.BoolToPtr(true) } tainted[n.ID] = n } @@ -3110,6 +3116,7 @@ func TestReconciler_FailedDeployment_PlacementLost(t *testing.T) { n.Status = structs.NodeStatusDown } else { n.Drain = true + allocs[6+i].DesiredTransistion.Migrate = helper.BoolToPtr(true) } tainted[n.ID] = n } @@ -3435,6 +3442,7 @@ func TestReconciler_TaintedNode_MultiGroups(t *testing.T) { for i := 0; i < 15; i++ { n := mock.Node() n.ID = allocs[i].NodeID + allocs[i].DesiredTransistion.Migrate = helper.BoolToPtr(true) n.Drain = true tainted[n.ID] = n } diff --git a/scheduler/reconcile_util.go b/scheduler/reconcile_util.go index db3a5ff1e3d5..fc8d619fb661 100644 --- a/scheduler/reconcile_util.go +++ b/scheduler/reconcile_util.go @@ -214,11 +214,14 @@ func (a allocSet) filterByTainted(nodes map[string]*structs.Node) (untainted, mi untainted[alloc.ID] = alloc continue } + if !alloc.TerminalStatus() { if n == nil || n.TerminalStatus() { lost[alloc.ID] = alloc - } else { + } else if alloc.DesiredTransistion.ShouldMigrate() { migrate[alloc.ID] = alloc + } else { + untainted[alloc.ID] = alloc } } else { untainted[alloc.ID] = alloc diff --git a/scheduler/system_sched.go b/scheduler/system_sched.go index d30608c8b724..4fa2d20f673a 100644 --- a/scheduler/system_sched.go +++ b/scheduler/system_sched.go @@ -62,7 +62,7 @@ func (s *SystemScheduler) Process(eval *structs.Evaluation) error { switch eval.TriggeredBy { case structs.EvalTriggerJobRegister, structs.EvalTriggerNodeUpdate, structs.EvalTriggerJobDeregister, structs.EvalTriggerRollingUpdate, - structs.EvalTriggerDeploymentWatcher: + structs.EvalTriggerDeploymentWatcher, structs.EvalTriggerNodeDrain: default: desc := fmt.Sprintf("scheduler cannot handle '%s' evaluation reason", eval.TriggeredBy) diff --git a/scheduler/system_sched_test.go b/scheduler/system_sched_test.go index 
8cd1a0c6474a..7303ea1708df 100644 --- a/scheduler/system_sched_test.go +++ b/scheduler/system_sched_test.go @@ -7,6 +7,7 @@ import ( "time" memdb "github.com/hashicorp/go-memdb" + "github.com/hashicorp/nomad/helper" "github.com/hashicorp/nomad/helper/uuid" "github.com/hashicorp/nomad/nomad/mock" "github.com/hashicorp/nomad/nomad/structs" @@ -971,6 +972,7 @@ func TestSystemSched_NodeDown(t *testing.T) { alloc.JobID = job.ID alloc.NodeID = node.ID alloc.Name = "my-job.web[0]" + alloc.DesiredTransistion.Migrate = helper.BoolToPtr(true) noErr(t, h.State.UpsertAllocs(h.NextIndex(), []*structs.Allocation{alloc})) // Create a mock evaluation to deal with drain @@ -1099,6 +1101,7 @@ func TestSystemSched_NodeDrain(t *testing.T) { alloc.JobID = job.ID alloc.NodeID = node.ID alloc.Name = "my-job.web[0]" + alloc.DesiredTransistion.Migrate = helper.BoolToPtr(true) noErr(t, h.State.UpsertAllocs(h.NextIndex(), []*structs.Allocation{alloc})) // Create a mock evaluation to deal with drain @@ -1412,6 +1415,7 @@ func TestSystemSched_PlanWithDrainedNode(t *testing.T) { alloc.JobID = job.ID alloc.NodeID = node.ID alloc.Name = "my-job.web[0]" + alloc.DesiredTransistion.Migrate = helper.BoolToPtr(true) alloc.TaskGroup = "web" alloc2 := mock.Alloc() diff --git a/scheduler/testing.go b/scheduler/testing.go index a04b99ce860c..47a6caaeb004 100644 --- a/scheduler/testing.go +++ b/scheduler/testing.go @@ -2,12 +2,11 @@ package scheduler import ( "fmt" - "log" - "os" "sync" "time" memdb "github.com/hashicorp/go-memdb" + "github.com/hashicorp/nomad/helper/testlog" "github.com/hashicorp/nomad/nomad/state" "github.com/hashicorp/nomad/nomad/structs" "github.com/mitchellh/go-testing-interface" @@ -40,6 +39,7 @@ func (r *RejectPlan) ReblockEval(*structs.Evaluation) error { // store copy and provides the planner interface. It can be extended for various // testing uses or for invoking the scheduler without side effects. type Harness struct { + t testing.T State *state.StateStore Planner Planner @@ -58,6 +58,7 @@ type Harness struct { func NewHarness(t testing.T) *Harness { state := state.TestStateStore(t) h := &Harness{ + t: t, State: state, nextIndex: 1, } @@ -68,6 +69,7 @@ func NewHarness(t testing.T) *Harness { // purposes. func NewHarnessWithState(t testing.T, state *state.StateStore) *Harness { return &Harness{ + t: t, State: state, nextIndex: 1, } @@ -201,7 +203,7 @@ func (h *Harness) Snapshot() State { // Scheduler is used to return a new scheduler from // a snapshot of current state using the harness for planning. 
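Further down in this patch the test harness switches its scheduler logger to testlog, and scheduler/util.go reworks diffAllocs so that an allocation on a tainted node is only migrated when the drainer has explicitly marked it. The new classification can be restated as a small sketch (the function name is illustrative; the real code uses gotos inside diffAllocs, and node is the entry from taintedNodes, nil when the node record no longer exists):

    func classify(alloc *structs.Allocation, node *structs.Node) string {
        switch {
        case alloc.TerminalStatus():
            return "ignore" // already terminal, nothing to move
        case node == nil || node.TerminalStatus():
            return "lost" // node is gone or down
        case alloc.DesiredTransistion.ShouldMigrate():
            return "migrate" // drainer asked for this alloc to move
        default:
            return "ignore" // on a draining node but not yet told to move
        }
    }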
func (h *Harness) Scheduler(factory Factory) Scheduler { - logger := log.New(os.Stderr, "", log.LstdFlags) + logger := testlog.Logger(h.t) return factory(logger, h.Snapshot(), h) } diff --git a/scheduler/util.go b/scheduler/util.go index 3417356014b6..fcac79d1c87e 100644 --- a/scheduler/util.go +++ b/scheduler/util.go @@ -104,20 +104,26 @@ func diffAllocs(job *structs.Job, taintedNodes map[string]*structs.Node, goto IGNORE } - if node == nil || node.TerminalStatus() { - result.lost = append(result.lost, allocTuple{ - Name: name, - TaskGroup: tg, - Alloc: exist, - }) + if !exist.TerminalStatus() { + if node == nil || node.TerminalStatus() { + result.lost = append(result.lost, allocTuple{ + Name: name, + TaskGroup: tg, + Alloc: exist, + }) + } else if exist.DesiredTransistion.ShouldMigrate() { + result.migrate = append(result.migrate, allocTuple{ + Name: name, + TaskGroup: tg, + Alloc: exist, + }) + } else { + goto IGNORE + } } else { - // This is the drain case - result.migrate = append(result.migrate, allocTuple{ - Name: name, - TaskGroup: tg, - Alloc: exist, - }) + goto IGNORE } + continue } @@ -318,10 +324,9 @@ func taintedNodes(state State, allocs []*structs.Allocation) (map[string]*struct out[alloc.NodeID] = nil continue } - //FIXME is this right? - //if structs.ShouldDrainNode(node.Status) || node.Drain { - // out[alloc.NodeID] = node - //} + if structs.ShouldDrainNode(node.Status) || node.Drain { + out[alloc.NodeID] = node + } } return out, nil } diff --git a/scheduler/util_test.go b/scheduler/util_test.go index cb96e83ea283..f2b339d38eff 100644 --- a/scheduler/util_test.go +++ b/scheduler/util_test.go @@ -7,6 +7,7 @@ import ( "reflect" "testing" + "github.com/hashicorp/nomad/helper" "github.com/hashicorp/nomad/helper/uuid" "github.com/hashicorp/nomad/nomad/mock" "github.com/hashicorp/nomad/nomad/state" @@ -90,6 +91,9 @@ func TestDiffAllocs(t *testing.T) { NodeID: "drainNode", Name: "my-job.web[2]", Job: oldJob, + DesiredTransistion: structs.DesiredTransistion{ + Migrate: helper.BoolToPtr(true), + }, }, // Mark the 4th lost { @@ -219,6 +223,9 @@ func TestDiffSystemAllocs(t *testing.T) { NodeID: drainNode.ID, Name: "my-job.web[0]", Job: oldJob, + DesiredTransistion: structs.DesiredTransistion{ + Migrate: helper.BoolToPtr(true), + }, }, // Mark as lost on a dead node { From 7deabe958d43720009bba6223db2e2b4f50dc39f Mon Sep 17 00:00:00 2001 From: Michael Schurter Date: Wed, 21 Feb 2018 17:22:06 -0800 Subject: [PATCH 05/79] drainer: switch to job based watching --- nomad/drain.go | 455 +++++++++++++++++++++----------------------- nomad/drain_test.go | 6 +- 2 files changed, 223 insertions(+), 238 deletions(-) diff --git a/nomad/drain.go b/nomad/drain.go index a1dc99972029..01732db1448a 100644 --- a/nomad/drain.go +++ b/nomad/drain.go @@ -13,6 +13,12 @@ import ( "github.com/hashicorp/nomad/nomad/structs" ) +// jobKey is a tuple of namespace+jobid for use as a map key by job +type jobKey struct { + ns string + jobid string +} + // drainingJob contains the Job and allocations for that job meant to be used // when collecting all allocations for a job with at least one allocation on a // draining node. 
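The jobKey type above is what lets this patch drop the nested namespace -> job -> alloc maps: it is a small comparable struct, so it can be used directly as a map key and compared with ==. A minimal sketch of the intended shape (the job ID is illustrative):

    drainable := map[jobKey]*drainingJob{}
    k := jobKey{ns: structs.DefaultNamespace, jobid: "example"}
    if _, ok := drainable[k]; !ok {
        drainable[k] = &drainingJob{ /* job plus its current allocs */ }
    }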
@@ -48,19 +54,14 @@ func makeTaskGroupKey(a *structs.Allocation) string { // stopAllocs tracks allocs to drain by a unique TG key type stopAllocs struct { - perTaskGroup map[string]int - allocBatch []*structs.Allocation + allocBatch []*structs.Allocation // namespace+jobid -> Job - jobBatch map[string]*structs.Job + jobBatch map[jobKey]*structs.Job } //FIXME this method does an awful lot func (s *stopAllocs) add(j *structs.Job, a *structs.Allocation) { - // Increment the counter for how many allocs in this task group are being stopped - tgKey := makeTaskGroupKey(a) - s.perTaskGroup[tgKey]++ - // Update the allocation a.ModifyTime = time.Now().UnixNano() a.DesiredStatus = structs.AllocDesiredStatusStop @@ -69,8 +70,7 @@ func (s *stopAllocs) add(j *structs.Job, a *structs.Allocation) { s.allocBatch = append(s.allocBatch, a) // Add job to the job batch - jobKey := strings.Join([]string{j.Namespace, j.ID}, "-") - s.jobBatch[jobKey] = j + s.jobBatch[jobKey{a.Namespace, a.JobID}] = j } // startNodeDrainer should be called in establishLeadership by the leader. @@ -87,7 +87,7 @@ func (s *Server) startNodeDrainer(stopCh chan struct{}) { } }() - nodes, nodesIndex, drainingAllocs, allocsIndex := initDrainer(s.logger, state) + nodes, nodesIndex, drainingJobs, allocsIndex := initDrainer(s.logger, state) // Wait for a node's drain deadline to expire var nextDeadline time.Time @@ -108,8 +108,9 @@ func (s *Server) startNodeDrainer(stopCh chan struct{}) { go nodeWatcher.run(ctx) // Watch for drained allocations to be replaced - prevAllocs := newPrevAllocWatcher(s.logger, drainingAllocs, allocsIndex, state) - go prevAllocs.run(ctx) + // Watch for changes in allocs for jobs with allocs on draining nodes + jobWatcher := newJobWatcher(s.logger, drainingJobs, allocsIndex, state) + go jobWatcher.run(ctx) for { //TODO this method of async node updates means we could make @@ -117,16 +118,43 @@ func (s *Server) startNodeDrainer(stopCh chan struct{}) { //possible outcome of this is that an allocation could be //stopped on a node that recently had its drain cancelled which //doesn't seem like that bad of a pathological case + s.logger.Printf("[TRACE] nomad.drain: LOOP next deadline: %s (%s)", nextDeadline, time.Until(nextDeadline)) select { case nodes = <-nodeWatcher.nodesCh: // update draining nodes - //TODO remove allocs from draining list with node ids not in this map s.logger.Printf("[TRACE] nomad.drain: running due to node change (%d nodes draining)", len(nodes)) - case drainedID := <-prevAllocs.allocsCh: - // drained alloc has been replaced - //TODO instead of modifying a view of draining allocs here created a shared map like prevallocs - delete(drainingAllocs, drainedID) - s.logger.Printf("[TRACE] nomad.drain: running due to alloc change (%s replaced)", drainedID) + + // update deadline timer + changed := false + for _, n := range nodes { + if nextDeadline.IsZero() { + nextDeadline = n.DrainStrategy.DeadlineTime() + changed = true + continue + } + + if deadline := n.DrainStrategy.DeadlineTime(); deadline.Before(nextDeadline) { + nextDeadline = deadline + changed = true + } + } + + // if changed reset the timer + if changed { + s.logger.Printf("[TRACE] nomad.drain: new node deadline: %s", nextDeadline) + if !deadlineTimer.Stop() { + // timer may have been recv'd in a + // previous loop, so don't block + select { + case <-deadlineTimer.C: + default: + } + } + deadlineTimer.Reset(time.Until(nextDeadline)) + } + + case jobs := <-jobWatcher.WaitCh(): + s.logger.Printf("[TRACE] nomad.drain: running due to alloc 
change (%d jobs updated)", len(jobs)) case when := <-deadlineTimer.C: // deadline for a node was reached s.logger.Printf("[TRACE] nomad.drain: running due to deadline reached (at %s)", when) @@ -148,7 +176,7 @@ func (s *Server) startNodeDrainer(stopCh chan struct{}) { } now := time.Now() // for determing deadlines in a consistent way - // namespace -> job id -> {job, allocs} + // job key -> {job, allocs} // Collect all allocs for all jobs with at least one // alloc on a draining node. // Invariants: @@ -156,7 +184,15 @@ func (s *Server) startNodeDrainer(stopCh chan struct{}) { // - No batch jobs unless their node's deadline is reached // - No entries with 0 allocs //TODO could this be a helper method on prevAllocWatcher - drainable := map[string]map[string]*drainingJob{} + drainable := map[jobKey]*drainingJob{} + + // track jobs we've looked up before and know we shouldn't + // consider for draining eg system jobs + skipJob := map[jobKey]struct{}{} + + // track number of "up" allocs per task group (not terminal and + // have a deployment status) + upPerTG := map[string]int{} // Collect all drainable jobs for nodeID, node := range nodes { @@ -169,37 +205,45 @@ func (s *Server) startNodeDrainer(stopCh chan struct{}) { // track number of allocs left on this node to be drained allocsLeft := false for _, alloc := range allocs { - if _, ok := drainable[alloc.Namespace]; !ok { - // namespace does not exist - drainable[alloc.Namespace] = make(map[string]*drainingJob) - } + jobkey := jobKey{alloc.Namespace, alloc.JobID} - if _, ok := drainable[alloc.Namespace][alloc.JobID]; ok { + if _, ok := drainable[jobkey]; ok { // already found continue } + if _, ok := skipJob[jobkey]; ok { + // already looked up and skipped + continue + } + // job does not found yet job, err := snapshot.JobByID(nil, alloc.Namespace, alloc.JobID) if err != nil { //FIXME panic(err) } - //TODO check for job == nil? // Don't bother collecting system jobs if job.Type == structs.JobTypeSystem { + skipJob[jobkey] = struct{}{} + s.logger.Printf("[TRACE] nomad.drain: skipping system job %s", job.Name) continue } - // If a drainable alloc isn't yet stopping this - // node has allocs left to be drained + // If alloc isn't yet terminal this node has + // allocs left to be drained if !alloc.TerminalStatus() { - allocsLeft = true + if !allocsLeft { + s.logger.Printf("[TRACE] nomad.drain: node %s has allocs left to drain", nodeID[:6]) + allocsLeft = true + } } // Don't bother collecting batch jobs for nodes that haven't hit their deadline if job.Type == structs.JobTypeBatch && node.DrainStrategy.DeadlineTime().After(now) { + s.logger.Printf("[TRACE] nomad.drain: not draining batch job %s because deadline isn't for %s", job.Name, node.DrainStrategy.DeadlineTime().Sub(now)) + skipJob[jobkey] = struct{}{} continue } @@ -209,100 +253,109 @@ func (s *Server) startNodeDrainer(stopCh chan struct{}) { panic(err) } - drainable[alloc.Namespace][alloc.JobID] = &drainingJob{ + // Count the number of down (terminal or nil deployment status) per task group + if job.Type == structs.JobTypeService { + n := 0 + for _, a := range jobAllocs { + if !a.TerminalStatus() && a.DeploymentStatus != nil { + upPerTG[makeTaskGroupKey(a)]++ + n++ + } + } + s.logger.Printf("[TRACE] nomad.drain: job %s has %d task groups running", job.Name, n) + } + + drainable[jobkey] = &drainingJob{ job: job, allocs: jobAllocs, } + + jobWatcher.watch(jobkey, nodeID) } // if node has no allocs, it's done draining! 
if !allocsLeft { + s.logger.Printf("[TRACE] nomad.drain: node %s has no more allocs left to drain", nodeID) + jobWatcher.nodeDone(nodeID) delete(nodes, nodeID) doneNodes[nodeID] = node } } - // Initialize stoplist with a count of allocs already draining per task group - //TODO wrap this up in a new func + // stoplist are the allocations to stop and their jobs to emit + // evaluations for stoplist := &stopAllocs{ - perTaskGroup: make(map[string]int, len(drainingAllocs)), - allocBatch: make([]*structs.Allocation, len(drainingAllocs)), - jobBatch: make(map[string]*structs.Job), - } - // initialize perTaskGroup to be the number of total *currently draining* allocations per task group - for _, a := range drainingAllocs { - stoplist.perTaskGroup[a.tgKey]++ + allocBatch: make([]*structs.Allocation, 0, len(drainable)), + jobBatch: make(map[jobKey]*structs.Job), } // deadlineNodes is a map of node IDs that have reached their // deadline and allocs that will be stopped due to deadline deadlineNodes := map[string]int{} - //TODO build drain list considering deadline & max_parallel - for _, drainingJobs := range drainable { - for _, drainingJob := range drainingJobs { - for _, alloc := range drainingJob.allocs { - // Already draining/dead allocs don't need to be drained - if alloc.TerminalStatus() { - continue - } + // build drain list considering deadline & max_parallel + for _, drainingJob := range drainable { + for _, alloc := range drainingJob.allocs { + // Already draining/dead allocs don't need to be drained + if alloc.TerminalStatus() { + continue + } - node, ok := nodes[alloc.NodeID] - if !ok { - // Alloc's node is not draining so not elligible for draining! - continue - } + node, ok := nodes[alloc.NodeID] + if !ok { + // Alloc's node is not draining so not elligible for draining! + continue + } - if node.DrainStrategy.DeadlineTime().Before(now) { - s.logger.Printf("[TRACE] nomad.drain: draining job %s alloc %s from node %s due to node's drain deadline", drainingJob.job.Name, alloc.ID[:6], alloc.NodeID[:6]) - // Alloc's Node has reached its deadline - stoplist.add(drainingJob.job, alloc) + tgKey := makeTaskGroupKey(alloc) - deadlineNodes[node.ID]++ + if node.DrainStrategy.DeadlineTime().Before(now) { + s.logger.Printf("[TRACE] nomad.drain: draining job %s alloc %s from node %s due to node's drain deadline", drainingJob.job.Name, alloc.ID[:6], alloc.NodeID[:6]) + // Alloc's Node has reached its deadline + stoplist.add(drainingJob.job, alloc) + upPerTG[tgKey]-- - //FIXME purge from watchlist? - continue - } + deadlineNodes[node.ID]++ + continue + } - // Batch jobs are only stopped when the node - // deadline is reached which has already been - // done. - if drainingJob.job.Type == structs.JobTypeBatch { - continue - } + // Batch jobs are only stopped when the node + // deadline is reached which has already been + // done. 
+ if drainingJob.job.Type == structs.JobTypeBatch { + continue + } - // Stop allocs with count=1, max_parallel==0, or draining how many allocs are - // already draining for this task - // group, drain and track this alloc - tgKey := makeTaskGroupKey(alloc) + s.logger.Printf("[TRACE] nomad.drain: considering job %s alloc %s count %d maxp %d up %d", + drainingJob.job.Name, alloc.ID[:6], tg.Count, tg.Migrate.MaxParallel, upPerTG[tgKey]) - //FIXME change this to be based off of the sum(deploymentstatus!=nil && clientstatus==running) for this task group - if tg.Migrate.MaxParallel > stoplist.perTaskGroup[tgKey] { - s.logger.Printf("[TRACE] nomad.drain: draining job %s alloc %s from node %s due to max parallel", drainingJob.job.Name, alloc.ID[:6], alloc.NodeID[:6]) - // More migrations are allowed, add to stoplist - stoplist.add(drainingJob.job, alloc) + // Count - MaxParalell = minimum number of allocations that must be "up" + minUp := (tg.Count - tg.Migrate.MaxParallel) - // Also add to prevAllocWatcher - prevAllocs.watch(alloc.ID) - } + // If minimum is < the current number up it is safe to stop one. + if minUp < upPerTG[tgKey] { + s.logger.Printf("[TRACE] nomad.drain: draining job %s alloc %s from node %s due to max parallel", drainingJob.job.Name, alloc.ID[:6], alloc.NodeID[:6]) + // More migrations are allowed, add to stoplist + stoplist.add(drainingJob.job, alloc) + upPerTG[tgKey]-- } } } @@ -310,6 +363,7 @@ func (s *Server) startNodeDrainer(stopCh chan struct{}) { // log drains due to node deadlines for nodeID, remaining := range deadlineNodes { s.logger.Printf("[DEBUG] nomad.drain: node %s drain deadline reached; stopping %d remaining allocs", nodeID, remaining) + jobWatcher.nodeDone(nodeID) } if len(stoplist.allocBatch) > 0 { @@ -425,17 +479,16 @@ func (n *nodeWatcher) run(ctx context.Context) { newNodes := resp.([]*structs.Node) n.logger.Printf("[TRACE] nomad.drain: %d nodes to consider", len(newNodes)) //FIXME remove for _, newNode := range newNodes { - if _, ok := n.nodes[newNode.ID]; ok { - // Node was draining + if existingNode, ok := n.nodes[newNode.ID]; ok { + // Node was draining, see if it has changed if !newNode.Drain { // Node stopped draining delete(n.nodes, newNode.ID) changed = true - } else { + } else if !newNode.DrainStrategy.DeadlineTime().Equal(existingNode.DrainStrategy.DeadlineTime()) { // Update deadline n.nodes[newNode.ID] = newNode - //FIXME set changed if it changed? - //changed = true + changed = true } } else { // Node was not draining @@ -492,73 +545,78 @@ func (n *nodeWatcher) queryNodeDrain(ws memdb.WatchSet, state *state.StateStore) return resp, index, nil } -// prevAllocWatcher monitors allocation updates for allocations which replace -// draining allocations. -type prevAllocWatcher struct { - // watchList is a map of alloc ids to look for in PreviousAllocation - // fields of new allocs - watchList map[string]struct{} - watchListMu sync.Mutex +type jobWatcher struct { + // allocsIndex to start watching from + allocsIndex uint64 - state *state.StateStore + // job -> node.ID + jobs map[jobKey]string + jobsMu sync.Mutex - // allocIndex to start watching from - allocIndex uint64 + jobsCh chan map[jobKey]struct{} - // allocsCh is sent Allocation.IDs as they're removed from the watchList - allocsCh chan string + state *state.StateStore logger *log.Logger } -// newPrevAllocWatcher creates a new prevAllocWatcher watching drainingAllocs -// from allocIndex in the state store. Must call run to start watching. 
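The minUp arithmetic above is the core of max_parallel enforcement during a drain: Count minus Migrate.MaxParallel is the number of allocations in a task group that must stay up, and an alloc is only added to the stoplist while the up count is still above that floor. With assumed, illustrative numbers:

    count, maxParallel, up := 3, 1, 3 // one task group
    minUp := count - maxParallel      // 2 allocations must remain up
    for minUp < up {                  // same per-alloc check as the drain loop
        up--                          // stoplist.add(...) then upPerTG[tgKey]--
    }
    // after one pass up == 2, so no further stops are scheduled until
    // replacements report healthy and the count climbs back above minUp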
-func newPrevAllocWatcher(logger *log.Logger, drainingAllocs map[string]drainingAlloc, allocIndex uint64, - state *state.StateStore) *prevAllocWatcher { - - watchList := make(map[string]struct{}, len(drainingAllocs)) - for allocID := range drainingAllocs { - watchList[allocID] = struct{}{} +func newJobWatcher(logger *log.Logger, jobs map[jobKey]string, allocsIndex uint64, state *state.StateStore) *jobWatcher { + return &jobWatcher{ + allocsIndex: allocsIndex, + logger: logger, + jobs: jobs, + jobsCh: make(chan map[jobKey]struct{}), + state: state, } +} + +func (j *jobWatcher) watch(k jobKey, nodeID string) { + j.logger.Printf("[TRACE] nomad.drain: watching job %s on draining node %s", k.jobid, nodeID[:6]) + j.jobsMu.Lock() + j.jobs[k] = nodeID + j.jobsMu.Unlock() +} - return &prevAllocWatcher{ - watchList: watchList, - state: state, - allocIndex: allocIndex, - allocsCh: make(chan string, 8), //FIXME 8? really? what should this be - logger: logger, +func (j *jobWatcher) nodeDone(nodeID string) { + j.jobsMu.Lock() + defer j.jobsMu.Unlock() + for k, v := range j.jobs { + if v == nodeID { + j.logger.Printf("[TRACE] nomad.drain: UNwatching job %s on done draining node %s", k.jobid, nodeID[:6]) + delete(j.jobs, k) + } } } -// watch for an allocation ID to be replaced. -func (p *prevAllocWatcher) watch(allocID string) { - p.watchListMu.Lock() - defer p.watchListMu.Unlock() - p.watchList[allocID] = struct{}{} +func (j *jobWatcher) WaitCh() <-chan map[jobKey]struct{} { + return j.jobsCh } -// run the prevAllocWatcher and send replaced draining alloc IDs on allocsCh. -func (p *prevAllocWatcher) run(ctx context.Context) { - // index to watch from +func (j *jobWatcher) run(ctx context.Context) { var resp interface{} var err error for { + //FIXME have watchAllocs create a closure and give it a copy of j.jobs to remove locking? //FIXME it seems possible for this to return a nil error and a 0 index, what to do in that case? - resp, p.allocIndex, err = p.state.BlockingQuery(p.queryPrevAlloc, p.allocIndex, ctx) + var newIndex uint64 + resp, newIndex, err = j.state.BlockingQuery(j.watchAllocs, j.allocsIndex, ctx) if err != nil { if err == context.Canceled { - p.logger.Printf("[TRACE] nomad.drain: previous allocation watcher shutting down") + j.logger.Printf("[TRACE] nomad.drain: job watcher shutting down") return } - p.logger.Printf("[ERR] nomad.drain: error blocking on alloc updates: %v", err) + j.logger.Printf("[ERR] nomad.drain: error blocking on alloc updates: %v", err) return } - allocIDs := resp.([]string) - for _, id := range allocIDs { + j.logger.Printf("[TRACE] nomad.drain: job watcher old index: %d new index: %d", j.allocsIndex, newIndex) + j.allocsIndex = newIndex + + changedJobs := resp.(map[jobKey]struct{}) + if len(changedJobs) > 0 { select { - case p.allocsCh <- id: + case j.jobsCh <- changedJobs: case <-ctx.Done(): return } @@ -566,8 +624,7 @@ func (p *prevAllocWatcher) run(ctx context.Context) { } } -// queryPrevAlloc is the BlockingQuery func for scanning for replacement allocs -func (p *prevAllocWatcher) queryPrevAlloc(ws memdb.WatchSet, state *state.StateStore) (interface{}, uint64, error) { +func (j *jobWatcher) watchAllocs(ws memdb.WatchSet, state *state.StateStore) (interface{}, uint64, error) { iter, err := state.Allocs(ws) if err != nil { return nil, 0, err @@ -578,11 +635,10 @@ func (p *prevAllocWatcher) queryPrevAlloc(ws memdb.WatchSet, state *state.StateS return nil, 0, err } - //FIXME do fine grained locking around watclist mutations? 
- p.watchListMu.Lock() - defer p.watchListMu.Unlock() + skipped := 0 - resp := make([]string, 0, len(p.watchList)) + // job ids + resp := map[jobKey]struct{}{} for { raw := iter.Next() @@ -591,26 +647,35 @@ func (p *prevAllocWatcher) queryPrevAlloc(ws memdb.WatchSet, state *state.StateS } alloc := raw.(*structs.Allocation) - _, ok := p.watchList[alloc.PreviousAllocation] + + j.jobsMu.Lock() + _, ok := j.jobs[jobKey{alloc.Namespace, alloc.JobID}] + j.jobsMu.Unlock() + if !ok { - // PreviousAllocation not in watchList, skip it + // alloc is not part of a draining job + skipped++ continue } - // If the migration health is set on the replacement alloc we can stop watching the drained alloc + // don't wake drain loop if alloc hasn't updated its health if alloc.DeploymentStatus.IsHealthy() || alloc.DeploymentStatus.IsUnhealthy() { - delete(p.watchList, alloc.PreviousAllocation) - resp = append(resp, alloc.PreviousAllocation) + j.logger.Printf("[TRACE] nomad.drain: job watcher found alloc %s - deployment status: %t", alloc.ID[:6], *alloc.DeploymentStatus.Healthy) + resp[jobKey{alloc.Namespace, alloc.JobID}] = struct{}{} + } else { + j.logger.Printf("[TRACE] nomad.drain: job watcher ignoring alloc %s - no deployment status", alloc.ID[:6]) } } + j.logger.Printf("[TRACE] nomad.drain: job watcher ignoring %d allocs - not part of draining job at index %d", skipped, index) + return resp, index, nil } // initDrainer initializes the node drainer state and returns a list of // draining nodes as well as allocs that are draining that should be watched // for a replacement. -func initDrainer(logger *log.Logger, state *state.StateStore) (map[string]*structs.Node, uint64, map[string]drainingAlloc, uint64) { +func initDrainer(logger *log.Logger, state *state.StateStore) (map[string]*structs.Node, uint64, map[jobKey]string, uint64) { // StateStore.Snapshot never returns an error so don't bother checking it snapshot, _ := state.Snapshot() now := time.Now() @@ -624,9 +689,8 @@ func initDrainer(logger *log.Logger, state *state.StateStore) (map[string]*struc // map of draining nodes keyed by node ID nodes := map[string]*structs.Node{} - //FIXME rollup by composite namespace+job.ID+tg key? - // List of draining allocs by namespace and job: namespace -> job.ID -> alloc.ID -> *Allocation - allocsByNS := map[string]map[string]map[string]*structs.Allocation{} + // map of draining job IDs keyed by {namespace, job id} -> node.ID + jobs := map[jobKey]string{} for { raw := iter.Next() @@ -655,88 +719,7 @@ func initDrainer(logger *log.Logger, state *state.StateStore) (map[string]*struc } for _, alloc := range allocs { - //FIXME is it safe to assume the drainer set the desired status to stop? 
- if alloc.DesiredStatus == structs.AllocDesiredStatusStop { - if allocsByJob, ok := allocsByNS[alloc.Namespace]; ok { - if allocs, ok := allocsByJob[alloc.JobID]; ok { - allocs[alloc.ID] = alloc - } else { - // First alloc for job - allocsByJob[alloc.JobID] = map[string]*structs.Allocation{alloc.ID: alloc} - } - } else { - // First alloc in namespace - allocsByNS[alloc.Namespace] = map[string]map[string]*structs.Allocation{ - alloc.JobID: map[string]*structs.Allocation{alloc.ID: alloc}, - } - } - } - } - } - - // drainingAllocs is the list of all allocations that are currently - // draining and waiting for a replacement - drainingAllocs := map[string]drainingAlloc{} - - for ns, allocsByJobs := range allocsByNS { - for jobID, allocs := range allocsByJobs { - for allocID, alloc := range allocs { - job, err := snapshot.JobByID(nil, ns, jobID) - if err != nil { - logger.Printf("[ERR] nomad.drain: error getting job %q for alloc %q: %v", alloc.JobID, allocID, err) - //FIXME - panic(err) - } - - // Don't track drains for stopped or gc'd jobs - if job == nil || job.Status == structs.JobStatusDead { - continue - } - - jobAllocs, err := snapshot.AllocsByJob(nil, ns, jobID, true) - if err != nil { - //FIXME - panic(err) - } - - // Remove drained allocs for replacement allocs - for _, alloc := range jobAllocs { - if alloc.DeploymentStatus.IsHealthy() || alloc.DeploymentStatus.IsUnhealthy() { - delete(allocs, alloc.PreviousAllocation) - } - } - - //FIXME why are we doing a nested loop over allocs? - // Any remaining allocs need to be tracked - for allocID, alloc := range allocs { - tg := job.LookupTaskGroup(alloc.TaskGroup) - if tg == nil { - logger.Printf("[DEBUG] nomad.drain: unable to find task group %q for alloc %q", alloc.TaskGroup, allocID) - continue - } - - if tg.Migrate == nil { - // No migrate strategy so don't track - continue - } - - //FIXME Remove this? ModifyTime is not updated as expected - - // alloc.ModifyTime + HealthyDeadline is >= the - // healthy deadline for the allocation, so we - // can stop tracking it at that time. - deadline := time.Unix(0, alloc.ModifyTime).Add(tg.Migrate.HealthyDeadline) - - if deadline.After(now) { - // deadline already reached; don't bother tracking - continue - } - - // Draining allocation hasn't been replaced or - // reached its deadline; track it! 
- drainingAllocs[allocID] = newDrainingAlloc(alloc, deadline) - } - } + jobs[jobKey{alloc.Namespace, alloc.JobID}] = node.ID } } @@ -748,5 +731,5 @@ func initDrainer(logger *log.Logger, state *state.StateStore) (map[string]*struc if allocsIndex == 0 { allocsIndex = 1 } - return nodes, nodesIndex, drainingAllocs, allocsIndex + return nodes, nodesIndex, jobs, allocsIndex } diff --git a/nomad/drain_test.go b/nomad/drain_test.go index bf1ec875de3a..e611fbdee2cb 100644 --- a/nomad/drain_test.go +++ b/nomad/drain_test.go @@ -62,6 +62,7 @@ func TestNodeDrainer_SimpleDrain(t *testing.T) { systemJob.TaskGroups[0].Tasks[0].Resources = structs.MinResources() systemJob.TaskGroups[0].Tasks[0].Services = nil + // Batch job will run until the node's drain deadline is reached batchJob := mock.Job() batchJob.Name = "batch-job" batchJob.Type = structs.JobTypeBatch @@ -134,6 +135,7 @@ func TestNodeDrainer_SimpleDrain(t *testing.T) { t.Logf("%d alloc %s job %s status %s", i, alloc.ID, alloc.Job.Name, alloc.ClientStatus) } } + server.logger.Println("----------------------------------------------------------------------quitting--------------------------------------------------------") t.Fatalf("failed waiting for all allocs to start: %v", err) }) @@ -182,10 +184,10 @@ func TestNodeDrainer_SimpleDrain(t *testing.T) { t.Logf("%d alloc %s job %s status %s prev %s", i, alloc.ID, alloc.Job.Name, alloc.ClientStatus, alloc.PreviousAllocation) } } - t.Fatalf("failed waiting for all allocs to start: %v", err) + server.logger.Println("----------------------------------------------------------------------quitting--------------------------------------------------------") + t.Errorf("failed waiting for all allocs to migrate: %v", err) }) - // Wait for all service allocs to be replaced jobs, err := rpc.JobList() require.Nil(err) t.Logf("%d jobs", len(jobs.Jobs)) From 832b1d5694465f86fbdcafeba4e4ccfd0f749ad5 Mon Sep 17 00:00:00 2001 From: Michael Schurter Date: Thu, 22 Feb 2018 17:38:44 -0800 Subject: [PATCH 06/79] switch to new raft DesiredTransition message --- api/allocations.go | 8 +-- nomad/alloc_endpoint.go | 12 ++--- nomad/alloc_endpoint_test.go | 48 +++++++++++++---- nomad/drain.go | 92 +++++++++++---------------------- nomad/drain_test.go | 16 ++++-- nomad/fsm.go | 17 +++--- nomad/fsm_test.go | 32 ++++++++---- nomad/state/state_store.go | 23 +++++---- nomad/state/state_store_test.go | 37 ++++++++----- nomad/structs/structs.go | 31 ++++++----- scheduler/generic_sched_test.go | 10 ++-- scheduler/reconcile_test.go | 16 +++--- scheduler/reconcile_util.go | 2 +- scheduler/system_sched_test.go | 6 +-- scheduler/util.go | 2 +- scheduler/util_test.go | 4 +- testutil/rpcapi/rcpapi.go | 28 ++++++++++ 17 files changed, 228 insertions(+), 156 deletions(-) diff --git a/api/allocations.go b/api/allocations.go index 89206dadee0b..c3759806741f 100644 --- a/api/allocations.go +++ b/api/allocations.go @@ -81,7 +81,7 @@ type Allocation struct { Metrics *AllocationMetric DesiredStatus string DesiredDescription string - DesiredTransistion DesiredTransistion + DesiredTransition DesiredTransition ClientStatus string ClientDescription string TaskStates map[string]*TaskState @@ -207,10 +207,10 @@ type RescheduleEvent struct { PrevNodeID string } -// DesiredTransistion is used to mark an allocation as having a desired state -// transistion. This information can be used by the scheduler to make the +// DesiredTransition is used to mark an allocation as having a desired state +// transition. 
This information can be used by the scheduler to make the // correct decision. -type DesiredTransistion struct { +type DesiredTransition struct { // Migrate is used to indicate that this allocation should be stopped and // migrated to another node. Migrate *bool diff --git a/nomad/alloc_endpoint.go b/nomad/alloc_endpoint.go index a7f5e3bdc2ac..405136ca8cc1 100644 --- a/nomad/alloc_endpoint.go +++ b/nomad/alloc_endpoint.go @@ -202,13 +202,13 @@ func (a *Alloc) GetAllocs(args *structs.AllocsGetRequest, return a.srv.blockingRPC(&opts) } -// UpdateDesiredTransistion is used to update the desired transistions of an +// UpdateDesiredTransition is used to update the desired transitions of an // allocation. -func (a *Alloc) UpdateDesiredTransistion(args *structs.AllocUpdateDesiredTransistionRequest, reply *structs.GenericResponse) error { - if done, err := a.srv.forward("Alloc.UpdateDesiredTransistion", args, args, reply); done { +func (a *Alloc) UpdateDesiredTransition(args *structs.AllocUpdateDesiredTransitionRequest, reply *structs.GenericResponse) error { + if done, err := a.srv.forward("Alloc.UpdateDesiredTransition", args, args, reply); done { return err } - defer metrics.MeasureSince([]string{"nomad", "alloc", "update_desired_transistion"}, time.Now()) + defer metrics.MeasureSince([]string{"nomad", "alloc", "update_desired_transition"}, time.Now()) // Check that it is a management token. if aclObj, err := a.srv.ResolveToken(args.AuthToken); err != nil { @@ -223,9 +223,9 @@ func (a *Alloc) UpdateDesiredTransistion(args *structs.AllocUpdateDesiredTransis } // Commit this update via Raft - _, index, err := a.srv.raftApply(structs.AllocUpdateDesiredTransistionRequestType, args) + _, index, err := a.srv.raftApply(structs.AllocUpdateDesiredTransitionRequestType, args) if err != nil { - a.srv.logger.Printf("[ERR] nomad.allocs: AllocUpdateDesiredTransistionRequest failed: %v", err) + a.srv.logger.Printf("[ERR] nomad.allocs: AllocUpdateDesiredTransitionRequest failed: %v", err) return err } diff --git a/nomad/alloc_endpoint_test.go b/nomad/alloc_endpoint_test.go index f898f2b7dd9f..5d309d7c3b96 100644 --- a/nomad/alloc_endpoint_test.go +++ b/nomad/alloc_endpoint_test.go @@ -484,7 +484,7 @@ func TestAllocEndpoint_GetAllocs_Blocking(t *testing.T) { } } -func TestAllocEndpoint_UpdateDesiredTransistion(t *testing.T) { +func TestAllocEndpoint_UpdateDesiredTransition(t *testing.T) { t.Parallel() require := require.New(t) @@ -501,16 +501,38 @@ func TestAllocEndpoint_UpdateDesiredTransistion(t *testing.T) { require.Nil(state.UpsertJobSummary(999, mock.JobSummary(alloc2.JobID))) require.Nil(state.UpsertAllocs(1000, []*structs.Allocation{alloc, alloc2})) - t1 := &structs.DesiredTransistion{ + t1 := &structs.DesiredTransition{ Migrate: helper.BoolToPtr(true), } // Update the allocs desired status - get := &structs.AllocUpdateDesiredTransistionRequest{ - Allocs: map[string]*structs.DesiredTransistion{ + get := &structs.AllocUpdateDesiredTransitionRequest{ + Allocs: map[string]*structs.DesiredTransition{ alloc.ID: t1, alloc2.ID: t1, }, + Evals: []*structs.Evaluation{ + { + ID: uuid.Generate(), + Namespace: alloc.Namespace, + Priority: alloc.Job.Priority, + Type: alloc.Job.Type, + TriggeredBy: structs.EvalTriggerNodeDrain, + JobID: alloc.Job.ID, + JobModifyIndex: alloc.Job.ModifyIndex, + Status: structs.EvalStatusPending, + }, + { + ID: uuid.Generate(), + Namespace: alloc2.Namespace, + Priority: alloc2.Job.Priority, + Type: alloc2.Job.Type, + TriggeredBy: structs.EvalTriggerNodeDrain, + JobID: alloc2.Job.ID, + 
JobModifyIndex: alloc2.Job.ModifyIndex, + Status: structs.EvalStatusPending, + }, + }, WriteRequest: structs.WriteRequest{ Region: "global", }, @@ -518,14 +540,14 @@ func TestAllocEndpoint_UpdateDesiredTransistion(t *testing.T) { // Try without permissions var resp structs.GenericResponse - err := msgpackrpc.CallWithCodec(codec, "Alloc.UpdateDesiredTransistion", get, &resp) + err := msgpackrpc.CallWithCodec(codec, "Alloc.UpdateDesiredTransition", get, &resp) require.NotNil(err) require.True(structs.IsErrPermissionDenied(err)) // Try with permissions get.WriteRequest.AuthToken = s1.getLeaderAcl() var resp2 structs.GenericResponse - require.Nil(msgpackrpc.CallWithCodec(codec, "Alloc.UpdateDesiredTransistion", get, &resp2)) + require.Nil(msgpackrpc.CallWithCodec(codec, "Alloc.UpdateDesiredTransition", get, &resp2)) require.NotZero(resp2.Index) // Look up the allocations @@ -533,9 +555,15 @@ func TestAllocEndpoint_UpdateDesiredTransistion(t *testing.T) { require.Nil(err) out2, err := state.AllocByID(nil, alloc.ID) require.Nil(err) + e1, err := state.EvalByID(nil, get.Evals[0].ID) + require.Nil(err) + e2, err := state.EvalByID(nil, get.Evals[1].ID) + require.Nil(err) - require.NotNil(out1.DesiredTransistion.Migrate) - require.NotNil(out2.DesiredTransistion.Migrate) - require.True(*out1.DesiredTransistion.Migrate) - require.True(*out2.DesiredTransistion.Migrate) + require.NotNil(out1.DesiredTransition.Migrate) + require.NotNil(out2.DesiredTransition.Migrate) + require.NotNil(e1) + require.NotNil(e2) + require.True(*out1.DesiredTransition.Migrate) + require.True(*out2.DesiredTransition.Migrate) } diff --git a/nomad/drain.go b/nomad/drain.go index 01732db1448a..f0e1dd59b89f 100644 --- a/nomad/drain.go +++ b/nomad/drain.go @@ -8,6 +8,7 @@ import ( "time" memdb "github.com/hashicorp/go-memdb" + "github.com/hashicorp/nomad/helper" "github.com/hashicorp/nomad/helper/uuid" "github.com/hashicorp/nomad/nomad/state" "github.com/hashicorp/nomad/nomad/structs" @@ -54,20 +55,17 @@ func makeTaskGroupKey(a *structs.Allocation) string { // stopAllocs tracks allocs to drain by a unique TG key type stopAllocs struct { - allocBatch []*structs.Allocation + allocBatch map[string]*structs.DesiredTransition // namespace+jobid -> Job jobBatch map[jobKey]*structs.Job } -//FIXME this method does an awful lot func (s *stopAllocs) add(j *structs.Job, a *structs.Allocation) { - // Update the allocation - a.ModifyTime = time.Now().UnixNano() - a.DesiredStatus = structs.AllocDesiredStatusStop - - // Add alloc to the allocation batch - s.allocBatch = append(s.allocBatch, a) + // Add the desired migration transition to the batch + s.allocBatch[a.ID] = &structs.DesiredTransition{ + Migrate: helper.BoolToPtr(true), + } // Add job to the job batch s.jobBatch[jobKey{a.Namespace, a.JobID}] = j @@ -204,6 +202,7 @@ func (s *Server) startNodeDrainer(stopCh chan struct{}) { // track number of allocs left on this node to be drained allocsLeft := false + deadlineReached := node.DrainStrategy.DeadlineTime().Before(now) for _, alloc := range allocs { jobkey := jobKey{alloc.Namespace, alloc.JobID} @@ -224,13 +223,6 @@ func (s *Server) startNodeDrainer(stopCh chan struct{}) { panic(err) } - // Don't bother collecting system jobs - if job.Type == structs.JobTypeSystem { - skipJob[jobkey] = struct{}{} - s.logger.Printf("[TRACE] nomad.drain: skipping system job %s", job.Name) - continue - } - // If alloc isn't yet terminal this node has // allocs left to be drained if !alloc.TerminalStatus() { @@ -240,9 +232,10 @@ func (s *Server) 
startNodeDrainer(stopCh chan struct{}) { } } - // Don't bother collecting batch jobs for nodes that haven't hit their deadline - if job.Type == structs.JobTypeBatch && node.DrainStrategy.DeadlineTime().After(now) { - s.logger.Printf("[TRACE] nomad.drain: not draining batch job %s because deadline isn't for %s", job.Name, node.DrainStrategy.DeadlineTime().Sub(now)) + // Don't bother collecting system/batch jobs for nodes that haven't hit their deadline + if job.Type != structs.JobTypeService && !deadlineReached { + s.logger.Printf("[TRACE] nomad.drain: not draining %s job %s because deadline isn't for %s", + job.Type, job.Name, node.DrainStrategy.DeadlineTime().Sub(now)) skipJob[jobkey] = struct{}{} continue } @@ -273,26 +266,21 @@ func (s *Server) startNodeDrainer(stopCh chan struct{}) { jobWatcher.watch(jobkey, nodeID) } - // if node has no allocs, it's done draining! - if !allocsLeft { - s.logger.Printf("[TRACE] nomad.drain: node %s has no more allocs left to drain", nodeID) + // if node has no allocs or has hit its deadline, it's done draining! + if !allocsLeft || deadlineReached { + s.logger.Printf("[TRACE] nomad.drain: node %s has no more allocs left to drain or has reached deadline", nodeID) jobWatcher.nodeDone(nodeID) - delete(nodes, nodeID) doneNodes[nodeID] = node } } - // stoplist are the allocations to stop and their jobs to emit + // stoplist are the allocations to migrate and their jobs to emit // evaluations for stoplist := &stopAllocs{ - allocBatch: make([]*structs.Allocation, 0, len(drainable)), + allocBatch: make(map[string]*structs.DesiredTransition), jobBatch: make(map[jobKey]*structs.Job), } - // deadlineNodes is a map of node IDs that have reached their - // deadline and allocs that will be stopped due to deadline - deadlineNodes := map[string]int{} - // build drain list considering deadline & max_parallel for _, drainingJob := range drainable { for _, alloc := range drainingJob.allocs { @@ -315,14 +303,13 @@ func (s *Server) startNodeDrainer(stopCh chan struct{}) { stoplist.add(drainingJob.job, alloc) upPerTG[tgKey]-- - deadlineNodes[node.ID]++ continue } - // Batch jobs are only stopped when the node - // deadline is reached which has already been - // done. - if drainingJob.job.Type == structs.JobTypeBatch { + // Batch/System jobs are only stopped when the + // node deadline is reached which has already + // been done. 
+ if drainingJob.job.Type != structs.JobTypeService { continue } @@ -360,32 +347,9 @@ func (s *Server) startNodeDrainer(stopCh chan struct{}) { } } - // log drains due to node deadlines - for nodeID, remaining := range deadlineNodes { - s.logger.Printf("[DEBUG] nomad.drain: node %s drain deadline reached; stopping %d remaining allocs", nodeID, remaining) - jobWatcher.nodeDone(nodeID) - } - if len(stoplist.allocBatch) > 0 { s.logger.Printf("[DEBUG] nomad.drain: stopping %d alloc(s) for %d job(s)", len(stoplist.allocBatch), len(stoplist.jobBatch)) - // Stop allocs in stoplist and add them to drainingAllocs + prevAllocWatcher - batch := &structs.AllocUpdateRequest{ - Alloc: stoplist.allocBatch, - WriteRequest: structs.WriteRequest{Region: s.config.Region}, - } - - // Commit this update via Raft - //TODO Not the right request - _, index, err := s.raftApply(structs.AllocClientUpdateRequestType, batch) - if err != nil { - //FIXME - panic(err) - } - - //TODO i bet there's something useful to do with this index - _ = index - // Reevaluate affected jobs evals := make([]*structs.Evaluation, 0, len(stoplist.jobBatch)) for _, job := range stoplist.jobBatch { @@ -401,17 +365,23 @@ func (s *Server) startNodeDrainer(stopCh chan struct{}) { }) } - evalUpdate := &structs.EvalUpdateRequest{ + // Send raft request + batch := &structs.AllocUpdateDesiredTransitionRequest{ + Allocs: stoplist.allocBatch, Evals: evals, WriteRequest: structs.WriteRequest{Region: s.config.Region}, } - // Commit this evaluation via Raft - _, _, err = s.raftApply(structs.EvalUpdateRequestType, evalUpdate) + // Commit this update via Raft + //TODO Not the right request + _, index, err := s.raftApply(structs.AllocUpdateDesiredTransitionRequestType, batch) if err != nil { //FIXME panic(err) } + + //TODO i bet there's something useful to do with this index + _ = index } // Unset drain for nodes done draining @@ -429,6 +399,7 @@ func (s *Server) startNodeDrainer(stopCh chan struct{}) { panic(err) } s.logger.Printf("[INFO] nomad.drain: node %s (%s) completed draining", nodeID, node.Name) + delete(nodes, nodeID) } } } @@ -529,8 +500,7 @@ func (n *nodeWatcher) queryNodeDrain(ws memdb.WatchSet, state *state.StateStore) return nil, 0, err } - //FIXME initial cap? 
- resp := make([]*structs.Node, 0, 1) + resp := make([]*structs.Node, 0, 8) for { raw := iter.Next() diff --git a/nomad/drain_test.go b/nomad/drain_test.go index e611fbdee2cb..0b343549e7ce 100644 --- a/nomad/drain_test.go +++ b/nomad/drain_test.go @@ -15,6 +15,8 @@ import ( "github.com/hashicorp/nomad/nomad/structs" "github.com/hashicorp/nomad/testutil" "github.com/hashicorp/nomad/testutil/rpcapi" + "github.com/kr/pretty" + "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" ) @@ -188,9 +190,16 @@ func TestNodeDrainer_SimpleDrain(t *testing.T) { t.Errorf("failed waiting for all allocs to migrate: %v", err) }) + node1, err := rpc.NodeGet(c1.NodeID()) + assert := assert.New(t) + require.Nil(err) + assert.False(node1.Node.Drain) + assert.Nil(node1.Node.DrainStrategy) + assert.Equal(structs.NodeSchedulingIneligible, node1.Node.SchedulingEligibility) + jobs, err := rpc.JobList() require.Nil(err) - t.Logf("%d jobs", len(jobs.Jobs)) + t.Logf("--> %d jobs", len(jobs.Jobs)) for _, job := range jobs.Jobs { t.Logf("job: %s status: %s %s", job.Name, job.Status, job.StatusDescription) } @@ -211,8 +220,9 @@ func TestNodeDrainer_SimpleDrain(t *testing.T) { panic("unreachable") }) - t.Logf("%d allocs", len(allocs)) + t.Logf("--> %d allocs", len(allocs)) for _, alloc := range allocs { - t.Logf("job: %s node: %s alloc: %s desired: %s actual: %s replaces: %s", alloc.Job.Name, alloc.NodeID[:6], alloc.ID, alloc.DesiredStatus, alloc.ClientStatus, alloc.PreviousAllocation) + t.Logf("job: %s node: %s alloc: %s desired_status: %s desired_transition: %s actual: %s replaces: %s", + alloc.Job.Name, alloc.NodeID[:6], alloc.ID[:6], alloc.DesiredStatus, pretty.Sprint(alloc.DesiredTransition.Migrate), alloc.ClientStatus, alloc.PreviousAllocation) } } diff --git a/nomad/fsm.go b/nomad/fsm.go index a1d9113cada2..c8babc50ddb2 100644 --- a/nomad/fsm.go +++ b/nomad/fsm.go @@ -240,7 +240,7 @@ func (n *nomadFSM) Apply(log *raft.Log) interface{} { return n.applyUpsertNodeEvent(buf[1:], log.Index) case structs.JobBatchDeregisterRequestType: return n.applyBatchDeregisterJob(buf[1:], log.Index) - case structs.AllocUpdateDesiredTransistionRequestType: + case structs.AllocUpdateDesiredTransitionRequestType: return n.applyAllocUpdateDesiredTransition(buf[1:], log.Index) } @@ -653,17 +653,22 @@ func (n *nomadFSM) applyAllocClientUpdate(buf []byte, index uint64) interface{} return nil } -// applyAllocUpdateDesiredTransition is used to update the desired transistions +// applyAllocUpdateDesiredTransition is used to update the desired transitions // of a set of allocations. 
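As a rough illustration of the request path wired up above, the following hypothetical sketch shows how a caller in the nomad package (such as the drainer) could mark one allocation for migration and enqueue its drain evaluation in a single Raft entry using the request type added in this patch. markAllocForMigration is not part of the change; the job and allocID arguments are assumed to come from the drainer's state snapshot.

package nomad

import (
	"github.com/hashicorp/nomad/helper"
	"github.com/hashicorp/nomad/helper/uuid"
	"github.com/hashicorp/nomad/nomad/structs"
)

// markAllocForMigration (hypothetical) batches a DesiredTransition update with
// the evaluation that will reschedule the work, then commits both via Raft so
// the FSM handler that follows persists them in one apply.
func markAllocForMigration(s *Server, job *structs.Job, allocID string) error {
	req := &structs.AllocUpdateDesiredTransitionRequest{
		Allocs: map[string]*structs.DesiredTransition{
			allocID: {Migrate: helper.BoolToPtr(true)},
		},
		Evals: []*structs.Evaluation{{
			ID:          uuid.Generate(),
			Namespace:   job.Namespace,
			Priority:    job.Priority,
			Type:        job.Type,
			TriggeredBy: structs.EvalTriggerNodeDrain,
			JobID:       job.ID,
			Status:      structs.EvalStatusPending,
		}},
		WriteRequest: structs.WriteRequest{Region: s.config.Region},
	}
	// Commit the update; the index could be used for blocking queries.
	_, _, err := s.raftApply(structs.AllocUpdateDesiredTransitionRequestType, req)
	return err
}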
func (n *nomadFSM) applyAllocUpdateDesiredTransition(buf []byte, index uint64) interface{} { - defer metrics.MeasureSince([]string{"nomad", "fsm", "alloc_update_desired_transistion"}, time.Now()) - var req structs.AllocUpdateDesiredTransistionRequest + defer metrics.MeasureSince([]string{"nomad", "fsm", "alloc_update_desired_transition"}, time.Now()) + var req structs.AllocUpdateDesiredTransitionRequest if err := structs.Decode(buf, &req); err != nil { panic(fmt.Errorf("failed to decode request: %v", err)) } - if err := n.state.UpdateAllocsDesiredTransistions(index, req.Allocs); err != nil { - n.logger.Printf("[ERR] nomad.fsm: UpdateAllocsDesiredTransistions failed: %v", err) + if err := n.state.UpdateAllocsDesiredTransitions(index, req.Allocs, req.Evals); err != nil { + n.logger.Printf("[ERR] nomad.fsm: UpdateAllocsDesiredTransitions failed: %v", err) + return err + } + + if err := n.upsertEvals(index, req.Evals); err != nil { + n.logger.Printf("[ERR] nomad.fsm: AllocUpdateDesiredTransition failed to upsert %d eval(s): %v", len(req.Evals), err) return err } return nil diff --git a/nomad/fsm_test.go b/nomad/fsm_test.go index a04f1cd2f1c1..a61a9e84fa49 100644 --- a/nomad/fsm_test.go +++ b/nomad/fsm_test.go @@ -12,6 +12,7 @@ import ( "github.com/google/go-cmp/cmp" memdb "github.com/hashicorp/go-memdb" "github.com/hashicorp/nomad/helper" + "github.com/hashicorp/nomad/helper/uuid" "github.com/hashicorp/nomad/nomad/mock" "github.com/hashicorp/nomad/nomad/state" "github.com/hashicorp/nomad/nomad/structs" @@ -1241,7 +1242,7 @@ func TestFSM_UpdateAllocFromClient(t *testing.T) { require.Equal(eval, res) } -func TestFSM_UpdateAllocDesiredTransistion(t *testing.T) { +func TestFSM_UpdateAllocDesiredTransition(t *testing.T) { t.Parallel() fsm := testFSM(t) state := fsm.State() @@ -1254,17 +1255,28 @@ func TestFSM_UpdateAllocDesiredTransistion(t *testing.T) { state.UpsertJobSummary(9, mock.JobSummary(alloc.JobID)) state.UpsertAllocs(10, []*structs.Allocation{alloc, alloc2}) - t1 := &structs.DesiredTransistion{ + t1 := &structs.DesiredTransition{ Migrate: helper.BoolToPtr(true), } - req := structs.AllocUpdateDesiredTransistionRequest{ - Allocs: map[string]*structs.DesiredTransistion{ + eval := &structs.Evaluation{ + ID: uuid.Generate(), + Namespace: alloc.Namespace, + Priority: alloc.Job.Priority, + Type: alloc.Job.Type, + TriggeredBy: structs.EvalTriggerNodeDrain, + JobID: alloc.Job.ID, + JobModifyIndex: alloc.Job.ModifyIndex, + Status: structs.EvalStatusPending, + } + req := structs.AllocUpdateDesiredTransitionRequest{ + Allocs: map[string]*structs.DesiredTransition{ alloc.ID: t1, alloc2.ID: t1, }, + Evals: []*structs.Evaluation{eval}, } - buf, err := structs.Encode(structs.AllocUpdateDesiredTransistionRequestType, req) + buf, err := structs.Encode(structs.AllocUpdateDesiredTransitionRequestType, req) require.Nil(err) resp := fsm.Apply(makeLog(buf)) @@ -1276,11 +1288,13 @@ func TestFSM_UpdateAllocDesiredTransistion(t *testing.T) { require.Nil(err) out2, err := fsm.State().AllocByID(ws, alloc2.ID) require.Nil(err) + _, err = fsm.State().EvalByID(ws, eval.ID) + require.Nil(err) - require.NotNil(out1.DesiredTransistion.Migrate) - require.NotNil(out2.DesiredTransistion.Migrate) - require.True(*out1.DesiredTransistion.Migrate) - require.True(*out2.DesiredTransistion.Migrate) + require.NotNil(out1.DesiredTransition.Migrate) + require.NotNil(out2.DesiredTransition.Migrate) + require.True(*out1.DesiredTransition.Migrate) + require.True(*out2.DesiredTransition.Migrate) } func TestFSM_UpsertVaultAccessor(t 
*testing.T) { diff --git a/nomad/state/state_store.go b/nomad/state/state_store.go index 1c67327ae4ca..90af315012f5 100644 --- a/nomad/state/state_store.go +++ b/nomad/state/state_store.go @@ -644,8 +644,9 @@ func (s *StateStore) UpdateNodeDrain(index uint64, nodeID string, drain bool) er } copyNode.SchedulingEligibility = structs.NodeSchedulingIneligible } else { + // When stopping a drain unset the strategy but leave the node + // ineligible for scheduling copyNode.DrainStrategy = nil - copyNode.SchedulingEligibility = structs.NodeSchedulingEligible } copyNode.ModifyIndex = index @@ -2008,15 +2009,17 @@ func (s *StateStore) upsertAllocsImpl(index uint64, allocs []*structs.Allocation return nil } -// UpdateAllocsDesiredTransistions is used to update a set of allocations -// desired transistions. -func (s *StateStore) UpdateAllocsDesiredTransistions(index uint64, allocs map[string]*structs.DesiredTransistion) error { +// UpdateAllocsDesiredTransitions is used to update a set of allocations +// desired transitions. +func (s *StateStore) UpdateAllocsDesiredTransitions(index uint64, allocs map[string]*structs.DesiredTransition, + evals []*structs.Evaluation) error { + txn := s.db.Txn(true) defer txn.Abort() // Handle each of the updated allocations - for id, transistion := range allocs { - if err := s.nestedUpdateAllocDesiredTransition(txn, index, id, transistion); err != nil { + for id, transition := range allocs { + if err := s.nestedUpdateAllocDesiredTransition(txn, index, id, transition); err != nil { return err } } @@ -2031,10 +2034,10 @@ func (s *StateStore) UpdateAllocsDesiredTransistions(index uint64, allocs map[st } // nestedUpdateAllocDesiredTransition is used to nest an update of an -// allocations desired transistion +// allocations desired transition func (s *StateStore) nestedUpdateAllocDesiredTransition( txn *memdb.Txn, index uint64, allocID string, - transistion *structs.DesiredTransistion) error { + transition *structs.DesiredTransition) error { // Look for existing alloc existing, err := txn.First("allocs", "id", allocID) @@ -2051,8 +2054,8 @@ func (s *StateStore) nestedUpdateAllocDesiredTransition( // Copy everything from the existing allocation copyAlloc := exist.Copy() - // Merge the desired transistions - copyAlloc.DesiredTransistion.Merge(transistion) + // Merge the desired transitions + copyAlloc.DesiredTransition.Merge(transition) // Update the modify index copyAlloc.ModifyIndex = index diff --git a/nomad/state/state_store_test.go b/nomad/state/state_store_test.go index 4fd2173f94cf..bac9839c298f 100644 --- a/nomad/state/state_store_test.go +++ b/nomad/state/state_store_test.go @@ -3823,7 +3823,7 @@ func TestStateStore_UpdateAlloc_NoJob(t *testing.T) { } } -func TestStateStore_UpdateAllocDesiredTransistion(t *testing.T) { +func TestStateStore_UpdateAllocDesiredTransition(t *testing.T) { t.Parallel() require := require.New(t) @@ -3833,21 +3833,32 @@ func TestStateStore_UpdateAllocDesiredTransistion(t *testing.T) { require.Nil(state.UpsertJob(999, alloc.Job)) require.Nil(state.UpsertAllocs(1000, []*structs.Allocation{alloc})) - t1 := &structs.DesiredTransistion{ + t1 := &structs.DesiredTransition{ Migrate: helper.BoolToPtr(true), } - t2 := &structs.DesiredTransistion{ + t2 := &structs.DesiredTransition{ Migrate: helper.BoolToPtr(false), } + eval := &structs.Evaluation{ + ID: uuid.Generate(), + Namespace: alloc.Namespace, + Priority: alloc.Job.Priority, + Type: alloc.Job.Type, + TriggeredBy: structs.EvalTriggerNodeDrain, + JobID: alloc.Job.ID, + JobModifyIndex: 
alloc.Job.ModifyIndex, + Status: structs.EvalStatusPending, + } + evals := []*structs.Evaluation{eval} - m := map[string]*structs.DesiredTransistion{alloc.ID: t1} - require.Nil(state.UpdateAllocsDesiredTransistions(1001, m)) + m := map[string]*structs.DesiredTransition{alloc.ID: t1} + require.Nil(state.UpdateAllocsDesiredTransitions(1001, m, evals)) ws := memdb.NewWatchSet() out, err := state.AllocByID(ws, alloc.ID) require.Nil(err) - require.NotNil(out.DesiredTransistion.Migrate) - require.True(*out.DesiredTransistion.Migrate) + require.NotNil(out.DesiredTransition.Migrate) + require.True(*out.DesiredTransition.Migrate) require.EqualValues(1000, out.CreateIndex) require.EqualValues(1001, out.ModifyIndex) @@ -3855,14 +3866,14 @@ func TestStateStore_UpdateAllocDesiredTransistion(t *testing.T) { require.Nil(err) require.EqualValues(1001, index) - m = map[string]*structs.DesiredTransistion{alloc.ID: t2} - require.Nil(state.UpdateAllocsDesiredTransistions(1002, m)) + m = map[string]*structs.DesiredTransition{alloc.ID: t2} + require.Nil(state.UpdateAllocsDesiredTransitions(1002, m, evals)) ws = memdb.NewWatchSet() out, err = state.AllocByID(ws, alloc.ID) require.Nil(err) - require.NotNil(out.DesiredTransistion.Migrate) - require.False(*out.DesiredTransistion.Migrate) + require.NotNil(out.DesiredTransition.Migrate) + require.False(*out.DesiredTransition.Migrate) require.EqualValues(1000, out.CreateIndex) require.EqualValues(1002, out.ModifyIndex) @@ -3871,8 +3882,8 @@ func TestStateStore_UpdateAllocDesiredTransistion(t *testing.T) { require.EqualValues(1002, index) // Try with a bogus alloc id - m = map[string]*structs.DesiredTransistion{uuid.Generate(): t2} - require.Nil(state.UpdateAllocsDesiredTransistions(1003, m)) + m = map[string]*structs.DesiredTransition{uuid.Generate(): t2} + require.Nil(state.UpdateAllocsDesiredTransitions(1003, m, evals)) } func TestStateStore_JobSummary(t *testing.T) { diff --git a/nomad/structs/structs.go b/nomad/structs/structs.go index e50921c27cb5..6f6a98a6fb70 100644 --- a/nomad/structs/structs.go +++ b/nomad/structs/structs.go @@ -78,7 +78,7 @@ const ( AutopilotRequestType UpsertNodeEventsType JobBatchDeregisterRequestType - AllocUpdateDesiredTransistionRequestType + AllocUpdateDesiredTransitionRequestType ) const ( @@ -574,12 +574,15 @@ type AllocUpdateRequest struct { WriteRequest } -// AllocUpdateDesiredTransistionRequest is used to submit changes to allocations -// desired transistion state. -type AllocUpdateDesiredTransistionRequest struct { +// AllocUpdateDesiredTransitionRequest is used to submit changes to allocations +// desired transition state. +type AllocUpdateDesiredTransitionRequest struct { // Allocs is the mapping of allocation ids to their desired state - // transistion - Allocs map[string]*DesiredTransistion + // transition + Allocs map[string]*DesiredTransition + + // Evals is the set of evaluations to create + Evals []*Evaluation WriteRequest } @@ -5349,10 +5352,10 @@ func (re *RescheduleEvent) Copy() *RescheduleEvent { return copy } -// DesiredTransistion is used to mark an allocation as having a desired state -// transistion. This information can be used by the scheduler to make the +// DesiredTransition is used to mark an allocation as having a desired state +// transition. This information can be used by the scheduler to make the // correct decision. -type DesiredTransistion struct { +type DesiredTransition struct { // Migrate is used to indicate that this allocation should be stopped and // migrated to another node. 
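A small self-contained sketch of the intended semantics of this type, assuming the Merge and ShouldMigrate helpers defined just below: merging prefers fields set on the incoming transition and leaves unset fields untouched.

package main

import (
	"fmt"

	"github.com/hashicorp/nomad/helper"
	"github.com/hashicorp/nomad/nomad/structs"
)

func main() {
	// No transition requested yet.
	tr := structs.DesiredTransition{}

	// A drain marks the alloc for migration; Merge prefers the incoming value.
	tr.Merge(&structs.DesiredTransition{Migrate: helper.BoolToPtr(true)})
	fmt.Println(tr.ShouldMigrate()) // true

	// Merging a transition with Migrate unset leaves the existing flag intact.
	tr.Merge(&structs.DesiredTransition{})
	fmt.Println(tr.ShouldMigrate()) // still true
}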
Migrate *bool @@ -5360,14 +5363,14 @@ type DesiredTransistion struct { // Merge merges the two desired transitions, preferring the values from the // passed in object. -func (d *DesiredTransistion) Merge(o *DesiredTransistion) { +func (d *DesiredTransition) Merge(o *DesiredTransition) { if o.Migrate != nil { d.Migrate = o.Migrate } } -// ShouldMigrate returns whether the transistion object dictates a migration. -func (d *DesiredTransistion) ShouldMigrate() bool { +// ShouldMigrate returns whether the transition object dictates a migration. +func (d *DesiredTransition) ShouldMigrate() bool { return d.Migrate != nil && *d.Migrate } @@ -5432,9 +5435,9 @@ type Allocation struct { // DesiredStatusDescription is meant to provide more human useful information DesiredDescription string - // DesiredTransistion is used to indicate that a state transistion + // DesiredTransition is used to indicate that a state transition // is desired for a given reason. - DesiredTransistion DesiredTransistion + DesiredTransition DesiredTransition // Status of the allocation on the client ClientStatus string diff --git a/scheduler/generic_sched_test.go b/scheduler/generic_sched_test.go index d1bbf4710334..fd677f952db3 100644 --- a/scheduler/generic_sched_test.go +++ b/scheduler/generic_sched_test.go @@ -2245,7 +2245,7 @@ func TestServiceSched_NodeDown(t *testing.T) { // Mark appropriate allocs for migration for i := 0; i < 7; i++ { out := allocs[i] - out.DesiredTransistion.Migrate = helper.BoolToPtr(true) + out.DesiredTransition.Migrate = helper.BoolToPtr(true) } noErr(t, h.State.UpsertAllocs(h.NextIndex(), allocs)) @@ -2367,7 +2367,7 @@ func TestServiceSched_NodeDrain(t *testing.T) { alloc.JobID = job.ID alloc.NodeID = node.ID alloc.Name = fmt.Sprintf("my-job.web[%d]", i) - alloc.DesiredTransistion.Migrate = helper.BoolToPtr(true) + alloc.DesiredTransition.Migrate = helper.BoolToPtr(true) allocs = append(allocs, alloc) } noErr(t, h.State.UpsertAllocs(h.NextIndex(), allocs)) @@ -2453,7 +2453,7 @@ func TestServiceSched_NodeDrain_Down(t *testing.T) { for i := 0; i < 6; i++ { newAlloc := allocs[i].Copy() newAlloc.ClientStatus = structs.AllocDesiredStatusStop - newAlloc.DesiredTransistion.Migrate = helper.BoolToPtr(true) + newAlloc.DesiredTransition.Migrate = helper.BoolToPtr(true) stop = append(stop, newAlloc) } noErr(t, h.State.UpsertAllocs(h.NextIndex(), stop)) @@ -2556,7 +2556,7 @@ func TestServiceSched_NodeDrain_Queued_Allocations(t *testing.T) { alloc.JobID = job.ID alloc.NodeID = node.ID alloc.Name = fmt.Sprintf("my-job.web[%d]", i) - alloc.DesiredTransistion.Migrate = helper.BoolToPtr(true) + alloc.DesiredTransition.Migrate = helper.BoolToPtr(true) allocs = append(allocs, alloc) } noErr(t, h.State.UpsertAllocs(h.NextIndex(), allocs)) @@ -3948,7 +3948,7 @@ func TestServiceSched_NodeDrain_Sticky(t *testing.T) { alloc.NodeID = node.ID alloc.Job.TaskGroups[0].Count = 1 alloc.Job.TaskGroups[0].EphemeralDisk.Sticky = true - alloc.DesiredTransistion.Migrate = helper.BoolToPtr(true) + alloc.DesiredTransition.Migrate = helper.BoolToPtr(true) noErr(t, h.State.UpsertJob(h.NextIndex(), alloc.Job)) noErr(t, h.State.UpsertAllocs(h.NextIndex(), []*structs.Allocation{alloc})) diff --git a/scheduler/reconcile_test.go b/scheduler/reconcile_test.go index a9188fa42ee5..a00471fba603 100644 --- a/scheduler/reconcile_test.go +++ b/scheduler/reconcile_test.go @@ -927,7 +927,7 @@ func TestReconciler_DrainNode(t *testing.T) { for i := 0; i < 2; i++ { n := mock.Node() n.ID = allocs[i].NodeID - allocs[i].DesiredTransistion.Migrate = 
helper.BoolToPtr(true) + allocs[i].DesiredTransition.Migrate = helper.BoolToPtr(true) n.Drain = true tainted[n.ID] = n } @@ -980,7 +980,7 @@ func TestReconciler_DrainNode_ScaleUp(t *testing.T) { for i := 0; i < 2; i++ { n := mock.Node() n.ID = allocs[i].NodeID - allocs[i].DesiredTransistion.Migrate = helper.BoolToPtr(true) + allocs[i].DesiredTransition.Migrate = helper.BoolToPtr(true) n.Drain = true tainted[n.ID] = n } @@ -1034,7 +1034,7 @@ func TestReconciler_DrainNode_ScaleDown(t *testing.T) { for i := 0; i < 3; i++ { n := mock.Node() n.ID = allocs[i].NodeID - allocs[i].DesiredTransistion.Migrate = helper.BoolToPtr(true) + allocs[i].DesiredTransition.Migrate = helper.BoolToPtr(true) n.Drain = true tainted[n.ID] = n } @@ -2216,7 +2216,7 @@ func TestReconciler_PausedOrFailedDeployment_Migrations(t *testing.T) { for i := 0; i < 3; i++ { n := mock.Node() n.ID = allocs[i].NodeID - allocs[i].DesiredTransistion.Migrate = helper.BoolToPtr(true) + allocs[i].DesiredTransition.Migrate = helper.BoolToPtr(true) n.Drain = true tainted[n.ID] = n } @@ -2290,7 +2290,7 @@ func TestReconciler_DrainNode_Canary(t *testing.T) { tainted := make(map[string]*structs.Node, 1) n := mock.Node() n.ID = allocs[11].NodeID - allocs[11].DesiredTransistion.Migrate = helper.BoolToPtr(true) + allocs[11].DesiredTransition.Migrate = helper.BoolToPtr(true) n.Drain = true tainted[n.ID] = n @@ -3030,7 +3030,7 @@ func TestReconciler_TaintedNode_RollingUpgrade(t *testing.T) { n.Status = structs.NodeStatusDown } else { n.Drain = true - allocs[2+i].DesiredTransistion.Migrate = helper.BoolToPtr(true) + allocs[2+i].DesiredTransition.Migrate = helper.BoolToPtr(true) } tainted[n.ID] = n } @@ -3116,7 +3116,7 @@ func TestReconciler_FailedDeployment_PlacementLost(t *testing.T) { n.Status = structs.NodeStatusDown } else { n.Drain = true - allocs[6+i].DesiredTransistion.Migrate = helper.BoolToPtr(true) + allocs[6+i].DesiredTransition.Migrate = helper.BoolToPtr(true) } tainted[n.ID] = n } @@ -3442,7 +3442,7 @@ func TestReconciler_TaintedNode_MultiGroups(t *testing.T) { for i := 0; i < 15; i++ { n := mock.Node() n.ID = allocs[i].NodeID - allocs[i].DesiredTransistion.Migrate = helper.BoolToPtr(true) + allocs[i].DesiredTransition.Migrate = helper.BoolToPtr(true) n.Drain = true tainted[n.ID] = n } diff --git a/scheduler/reconcile_util.go b/scheduler/reconcile_util.go index fc8d619fb661..5527aecb4ffc 100644 --- a/scheduler/reconcile_util.go +++ b/scheduler/reconcile_util.go @@ -218,7 +218,7 @@ func (a allocSet) filterByTainted(nodes map[string]*structs.Node) (untainted, mi if !alloc.TerminalStatus() { if n == nil || n.TerminalStatus() { lost[alloc.ID] = alloc - } else if alloc.DesiredTransistion.ShouldMigrate() { + } else if alloc.DesiredTransition.ShouldMigrate() { migrate[alloc.ID] = alloc } else { untainted[alloc.ID] = alloc diff --git a/scheduler/system_sched_test.go b/scheduler/system_sched_test.go index 7303ea1708df..3d78b7061366 100644 --- a/scheduler/system_sched_test.go +++ b/scheduler/system_sched_test.go @@ -972,7 +972,7 @@ func TestSystemSched_NodeDown(t *testing.T) { alloc.JobID = job.ID alloc.NodeID = node.ID alloc.Name = "my-job.web[0]" - alloc.DesiredTransistion.Migrate = helper.BoolToPtr(true) + alloc.DesiredTransition.Migrate = helper.BoolToPtr(true) noErr(t, h.State.UpsertAllocs(h.NextIndex(), []*structs.Allocation{alloc})) // Create a mock evaluation to deal with drain @@ -1101,7 +1101,7 @@ func TestSystemSched_NodeDrain(t *testing.T) { alloc.JobID = job.ID alloc.NodeID = node.ID alloc.Name = "my-job.web[0]" - 
alloc.DesiredTransistion.Migrate = helper.BoolToPtr(true) + alloc.DesiredTransition.Migrate = helper.BoolToPtr(true) noErr(t, h.State.UpsertAllocs(h.NextIndex(), []*structs.Allocation{alloc})) // Create a mock evaluation to deal with drain @@ -1415,7 +1415,7 @@ func TestSystemSched_PlanWithDrainedNode(t *testing.T) { alloc.JobID = job.ID alloc.NodeID = node.ID alloc.Name = "my-job.web[0]" - alloc.DesiredTransistion.Migrate = helper.BoolToPtr(true) + alloc.DesiredTransition.Migrate = helper.BoolToPtr(true) alloc.TaskGroup = "web" alloc2 := mock.Alloc() diff --git a/scheduler/util.go b/scheduler/util.go index fcac79d1c87e..c0943e126380 100644 --- a/scheduler/util.go +++ b/scheduler/util.go @@ -111,7 +111,7 @@ func diffAllocs(job *structs.Job, taintedNodes map[string]*structs.Node, TaskGroup: tg, Alloc: exist, }) - } else if exist.DesiredTransistion.ShouldMigrate() { + } else if exist.DesiredTransition.ShouldMigrate() { result.migrate = append(result.migrate, allocTuple{ Name: name, TaskGroup: tg, diff --git a/scheduler/util_test.go b/scheduler/util_test.go index f2b339d38eff..7fde4fa65718 100644 --- a/scheduler/util_test.go +++ b/scheduler/util_test.go @@ -91,7 +91,7 @@ func TestDiffAllocs(t *testing.T) { NodeID: "drainNode", Name: "my-job.web[2]", Job: oldJob, - DesiredTransistion: structs.DesiredTransistion{ + DesiredTransition: structs.DesiredTransition{ Migrate: helper.BoolToPtr(true), }, }, @@ -223,7 +223,7 @@ func TestDiffSystemAllocs(t *testing.T) { NodeID: drainNode.ID, Name: "my-job.web[0]", Job: oldJob, - DesiredTransistion: structs.DesiredTransistion{ + DesiredTransition: structs.DesiredTransition{ Migrate: helper.BoolToPtr(true), }, }, diff --git a/testutil/rpcapi/rcpapi.go b/testutil/rpcapi/rcpapi.go index 71e5be057ea0..795123fdabcc 100644 --- a/testutil/rpcapi/rcpapi.go +++ b/testutil/rpcapi/rcpapi.go @@ -72,6 +72,21 @@ func (r *RPC) AllocGetAllocs(ids []string) (*structs.AllocsGetResponse, error) { return &resp, nil } +// Eval.List RPC +func (r *RPC) EvalList() (*structs.EvalListResponse, error) { + get := &structs.EvalListRequest{ + QueryOptions: structs.QueryOptions{ + Region: r.Region, + Namespace: r.Namespace, + }, + } + var resp structs.EvalListResponse + if err := msgpackrpc.CallWithCodec(r.codec, "Eval.List", get, &resp); err != nil { + return nil, err + } + return &resp, nil +} + // Job.List RPC func (r *RPC) JobList() (*structs.JobListResponse, error) { get := &structs.JobListRequest{ @@ -112,3 +127,16 @@ func (r *RPC) NodeGetAllocs(nodeID string) (*structs.NodeAllocsResponse, error) } return &resp, nil } + +// Node.GetNode RPC +func (r *RPC) NodeGet(nodeID string) (*structs.SingleNodeResponse, error) { + get := &structs.NodeSpecificRequest{ + NodeID: nodeID, + QueryOptions: structs.QueryOptions{Region: r.Region}, + } + var resp structs.SingleNodeResponse + if err := msgpackrpc.CallWithCodec(r.codec, "Node.GetNode", get, &resp); err != nil { + return nil, err + } + return &resp, nil +} From a466f97cbafce2c720c1732b49d3301848a1ad1d Mon Sep 17 00:00:00 2001 From: Michael Schurter Date: Fri, 23 Feb 2018 16:45:57 -0800 Subject: [PATCH 07/79] scheduler: migrate non-terminal migrating allocs filterByTainted node should always migrate non-terminal migrating allocs --- scheduler/reconcile_util.go | 36 ++++++------ scheduler/reconcile_util_test.go | 99 ++++++++++++++++++++++++++++++++ 2 files changed, 117 insertions(+), 18 deletions(-) diff --git a/scheduler/reconcile_util.go b/scheduler/reconcile_util.go index 5527aecb4ffc..a7b0b814120f 100644 --- 
a/scheduler/reconcile_util.go +++ b/scheduler/reconcile_util.go @@ -199,33 +199,33 @@ func (a allocSet) filterByTainted(nodes map[string]*structs.Node) (untainted, mi migrate = make(map[string]*structs.Allocation) lost = make(map[string]*structs.Allocation) for _, alloc := range a { - n, ok := nodes[alloc.NodeID] - if !ok { + // Terminal allocs are always untainted as they should never be migrated + if alloc.TerminalStatus() { untainted[alloc.ID] = alloc continue } - // If the job is batch and finished successfully, the fact that the - // node is tainted does not mean it should be migrated or marked as - // lost as the work was already successfully finished. However for - // service/system jobs, tasks should never complete. The check of - // batch type, defends against client bugs. - if alloc.Job.Type == structs.JobTypeBatch && alloc.RanSuccessfully() { - untainted[alloc.ID] = alloc + // Non-terminal allocs that should migrate should always migrate + if alloc.DesiredTransition.ShouldMigrate() { + migrate[alloc.ID] = alloc continue } - if !alloc.TerminalStatus() { - if n == nil || n.TerminalStatus() { - lost[alloc.ID] = alloc - } else if alloc.DesiredTransition.ShouldMigrate() { - migrate[alloc.ID] = alloc - } else { - untainted[alloc.ID] = alloc - } - } else { + n, ok := nodes[alloc.NodeID] + if !ok { + // Node is untainted so alloc is untainted untainted[alloc.ID] = alloc + continue + } + + // Allocs on GC'd (nil) or lost nodes are Lost + if n == nil || n.TerminalStatus() { + lost[alloc.ID] = alloc + continue } + + // All other allocs are untainted + untainted[alloc.ID] = alloc } return } diff --git a/scheduler/reconcile_util_test.go b/scheduler/reconcile_util_test.go index 3b45a55ed6d5..6d85dfb811ed 100644 --- a/scheduler/reconcile_util_test.go +++ b/scheduler/reconcile_util_test.go @@ -3,7 +3,9 @@ package scheduler import ( "testing" + "github.com/hashicorp/nomad/helper" "github.com/hashicorp/nomad/nomad/structs" + "github.com/stretchr/testify/require" ) // Test that we properly create the bitmap even when the alloc set includes an @@ -29,3 +31,100 @@ func TestBitmapFrom(t *testing.T) { t.Fatalf("got %d; want %d", act, exp) } } + +func TestAllocSet_filterByTainted(t *testing.T) { + require := require.New(t) + + nodes := map[string]*structs.Node{ + "draining": &structs.Node{ + ID: "draining", + Drain: true, + }, + "lost": &structs.Node{ + ID: "lost", + Status: structs.NodeStatusDown, + }, + "nil": nil, + "normal": &structs.Node{ + ID: "normal", + Status: structs.NodeStatusReady, + }, + } + + batchJob := &structs.Job{ + Type: structs.JobTypeBatch, + } + + allocs := allocSet{ + // Non-terminal alloc with migrate=true should migrate on a draining node + "migrating1": { + ID: "migrating1", + ClientStatus: structs.AllocClientStatusRunning, + DesiredTransition: structs.DesiredTransition{helper.BoolToPtr(true)}, + Job: batchJob, + NodeID: "draining", + }, + // Non-terminal alloc with migrate=true should migrate on an unknown node + "migrating2": { + ID: "migrating2", + ClientStatus: structs.AllocClientStatusRunning, + DesiredTransition: structs.DesiredTransition{helper.BoolToPtr(true)}, + Job: batchJob, + NodeID: "nil", + }, + "untainted1": { + ID: "untainted1", + ClientStatus: structs.AllocClientStatusRunning, + Job: batchJob, + NodeID: "normal", + }, + // Terminal allocs are always untainted + "untainted2": { + ID: "untainted2", + ClientStatus: structs.AllocClientStatusComplete, + Job: batchJob, + NodeID: "normal", + }, + // Terminal allocs are always untainted, even on draining nodes + 
"untainted3": { + ID: "untainted3", + ClientStatus: structs.AllocClientStatusComplete, + Job: batchJob, + NodeID: "draining", + }, + // Terminal allocs are always untainted, even on lost nodes + "untainted4": { + ID: "untainted4", + ClientStatus: structs.AllocClientStatusComplete, + Job: batchJob, + NodeID: "lost", + }, + // Non-terminal allocs on lost nodes are lost + "lost1": { + ID: "lost1", + ClientStatus: structs.AllocClientStatusPending, + Job: batchJob, + NodeID: "lost", + }, + // Non-terminal allocs on lost nodes are lost + "lost2": { + ID: "lost2", + ClientStatus: structs.AllocClientStatusRunning, + Job: batchJob, + NodeID: "lost", + }, + } + + untainted, migrate, lost := allocs.filterByTainted(nodes) + require.Len(untainted, 4) + require.Contains(untainted, "untainted1") + require.Contains(untainted, "untainted2") + require.Contains(untainted, "untainted3") + require.Contains(untainted, "untainted4") + require.Len(migrate, 2) + require.Contains(migrate, "migrating1") + require.Contains(migrate, "migrating2") + require.Len(lost, 2) + require.Contains(lost, "lost1") + require.Contains(lost, "lost2") +} From 116c28c77c42dcb036fce2610c5e4b56dcf3ef35 Mon Sep 17 00:00:00 2001 From: Michael Schurter Date: Mon, 26 Feb 2018 15:01:27 -0800 Subject: [PATCH 08/79] improve drain fsm/statestore tests --- nomad/fsm_test.go | 4 +- nomad/state/state_store_test.go | 81 ++++++++++++++++++++------------- 2 files changed, 52 insertions(+), 33 deletions(-) diff --git a/nomad/fsm_test.go b/nomad/fsm_test.go index a61a9e84fa49..90ba21f14e8e 100644 --- a/nomad/fsm_test.go +++ b/nomad/fsm_test.go @@ -1288,8 +1288,10 @@ func TestFSM_UpdateAllocDesiredTransition(t *testing.T) { require.Nil(err) out2, err := fsm.State().AllocByID(ws, alloc2.ID) require.Nil(err) - _, err = fsm.State().EvalByID(ws, eval.ID) + evalOut, err := fsm.State().EvalByID(ws, eval.ID) require.Nil(err) + require.NotNil(evalOut) + require.Equal(eval.ID, evalOut.ID) require.NotNil(out1.DesiredTransition.Migrate) require.NotNil(out2.DesiredTransition.Migrate) diff --git a/nomad/state/state_store_test.go b/nomad/state/state_store_test.go index bac9839c298f..73c86bbd9f59 100644 --- a/nomad/state/state_store_test.go +++ b/nomad/state/state_store_test.go @@ -699,52 +699,69 @@ func TestStateStore_UpdateNodeStatus_Node(t *testing.T) { } func TestStateStore_UpdateNodeDrain_Node(t *testing.T) { + require := require.New(t) state := testStateStore(t) node := mock.Node() - err := state.UpsertNode(1000, node) - if err != nil { - t.Fatalf("err: %v", err) - } + require.Nil(state.UpsertNode(1000, node)) // Create a watchset so we can test that update node drain fires the watch ws := memdb.NewWatchSet() - if _, err := state.NodeByID(ws, node.ID); err != nil { - t.Fatalf("bad: %v", err) - } - err = state.UpdateNodeDrain(1001, node.ID, true) - if err != nil { - t.Fatalf("err: %v", err) - } + // Assert initial node state + { + out, err := state.NodeByID(ws, node.ID) + require.Nil(err) - if !watchFired(ws) { - t.Fatalf("bad") + require.False(out.Drain) + require.Nil(out.DrainStrategy) + require.Equal(structs.NodeSchedulingEligible, out.SchedulingEligibility) + if out.ModifyIndex != 1000 { + t.Fatalf("expected ModifyIndex=1000, found %d", out.ModifyIndex) + } } - ws = memdb.NewWatchSet() - out, err := state.NodeByID(ws, node.ID) - if err != nil { - t.Fatalf("err: %v", err) - } + // Start draining + { + require.Nil(state.UpdateNodeDrain(1001, node.ID, true)) + require.True(watchFired(ws)) - if !out.Drain { - t.Fatalf("bad: %#v", out) - } - if out.ModifyIndex != 
1001 { - t.Fatalf("bad: %#v", out) - } + ws = memdb.NewWatchSet() + out, err := state.NodeByID(ws, node.ID) + require.Nil(err) - index, err := state.Index("nodes") - if err != nil { - t.Fatalf("err: %v", err) - } - if index != 1001 { - t.Fatalf("bad: %d", index) + require.True(out.Drain) + require.NotNil(out.DrainStrategy) + require.Equal(structs.NodeSchedulingIneligible, out.SchedulingEligibility) + if out.ModifyIndex != 1001 { + t.Fatalf("expected ModifyIndex=1001, found %d", out.ModifyIndex) + } + + index, err := state.Index("nodes") + require.Nil(err) + if index != 1001 { + t.Fatalf("expected index=1001, found %d", index) + } + + require.False(watchFired(ws)) } - if watchFired(ws) { - t.Fatalf("bad") + // Stop draining (no need to retest watch behavior) + { + require.Nil(state.UpdateNodeDrain(1002, node.ID, false)) + + out, err := state.NodeByID(nil, node.ID) + require.Nil(err) + + require.False(out.Drain) + require.Nil(out.DrainStrategy) + if out.ModifyIndex != 1002 { + t.Fatalf("expected ModifyIndex=1002, found %d", out.ModifyIndex) + } + + // Scheduling eligibility should *not* flip back to eligible after + // draining stops. + require.Equal(structs.NodeSchedulingIneligible, out.SchedulingEligibility) } } From 1773de9e30b6ac9be6e3a839154b27097c8b655f Mon Sep 17 00:00:00 2001 From: Alex Dadgar Date: Fri, 23 Feb 2018 10:42:43 -0800 Subject: [PATCH 09/79] Node.Drain takes strategy --- nomad/drain_test.go | 5 +- nomad/fsm.go | 2 +- nomad/fsm_test.go | 34 +++++----- nomad/mock/mock.go | 5 ++ nomad/node_endpoint.go | 21 ++----- nomad/node_endpoint_test.go | 107 +++++++++++++++----------------- nomad/state/state_store.go | 21 +++---- nomad/state/state_store_test.go | 73 ++++++---------------- nomad/structs/structs.go | 33 +++++++--- 9 files changed, 131 insertions(+), 170 deletions(-) diff --git a/nomad/drain_test.go b/nomad/drain_test.go index 0b343549e7ce..c47e0d401548 100644 --- a/nomad/drain_test.go +++ b/nomad/drain_test.go @@ -143,9 +143,12 @@ func TestNodeDrainer_SimpleDrain(t *testing.T) { // Start draining node 1 //FIXME update drain rpc to skip fsm manipulation and use api + strategy := &structs.DrainStrategy{ + Deadline: -1 * time.Second, + } node, err := state.NodeByID(nil, c1.NodeID()) require.Nil(err) - require.Nil(state.UpdateNodeDrain(node.ModifyIndex+1, node.ID, true)) + require.Nil(state.UpdateNodeDrain(node.ModifyIndex+1, node.ID, strategy, 101)) // Start node 2 c2 := client.TestClient(t, func(conf *config.Config) { diff --git a/nomad/fsm.go b/nomad/fsm.go index c8babc50ddb2..58d1527514a1 100644 --- a/nomad/fsm.go +++ b/nomad/fsm.go @@ -328,7 +328,7 @@ func (n *nomadFSM) applyDrainUpdate(buf []byte, index uint64) interface{} { panic(fmt.Errorf("failed to decode request: %v", err)) } - if err := n.state.UpdateNodeDrain(index, req.NodeID, req.Drain); err != nil { + if err := n.state.UpdateNodeDrain(index, req.NodeID, req.DrainStrategy, req.UpdateTime); err != nil { n.logger.Printf("[ERR] nomad.fsm: UpdateNodeDrain failed: %v", err) return err } diff --git a/nomad/fsm_test.go b/nomad/fsm_test.go index 90ba21f14e8e..b9fc4845da82 100644 --- a/nomad/fsm_test.go +++ b/nomad/fsm_test.go @@ -280,6 +280,7 @@ func TestFSM_UpdateNodeStatus(t *testing.T) { func TestFSM_UpdateNodeDrain(t *testing.T) { t.Parallel() + require := require.New(t) fsm := testFSM(t) node := mock.Node() @@ -287,38 +288,31 @@ func TestFSM_UpdateNodeDrain(t *testing.T) { Node: node, } buf, err := structs.Encode(structs.NodeRegisterRequestType, req) - if err != nil { - t.Fatalf("err: %v", err) - } + 
require.Nil(err) resp := fsm.Apply(makeLog(buf)) - if resp != nil { - t.Fatalf("resp: %v", resp) - } + require.Nil(resp) + strategy := &structs.DrainStrategy{ + Deadline: 10 * time.Second, + } req2 := structs.NodeUpdateDrainRequest{ - NodeID: node.ID, - Drain: true, + NodeID: node.ID, + DrainStrategy: strategy, + UpdateTime: 101, } buf, err = structs.Encode(structs.NodeUpdateDrainRequestType, req2) - if err != nil { - t.Fatalf("err: %v", err) - } + require.Nil(err) resp = fsm.Apply(makeLog(buf)) - if resp != nil { - t.Fatalf("resp: %v", resp) - } + require.Nil(resp) // Verify we are NOT registered ws := memdb.NewWatchSet() node, err = fsm.State().NodeByID(ws, req.Node.ID) - if err != nil { - t.Fatalf("err: %v", err) - } - if !node.Drain { - t.Fatalf("bad node: %#v", node) - } + require.Nil(err) + require.True(node.Drain) + require.Equal(node.DrainStrategy, strategy) } func TestFSM_RegisterJob(t *testing.T) { diff --git a/nomad/mock/mock.go b/nomad/mock/mock.go index 6c2a3f42e0a3..aef9c475f011 100644 --- a/nomad/mock/mock.go +++ b/nomad/mock/mock.go @@ -196,6 +196,10 @@ func SystemJob() *structs.Job { Delay: 1 * time.Minute, Mode: structs.RestartPolicyModeDelay, }, + ReschedulePolicy: &structs.ReschedulePolicy{ + Attempts: 2, + Interval: 10 * time.Minute, + }, EphemeralDisk: structs.DefaultEphemeralDisk(), Tasks: []*structs.Task{ { @@ -240,6 +244,7 @@ func PeriodicJob() *structs.Job { Spec: "*/30 * * * *", } job.Status = structs.JobStatusRunning + job.TaskGroups[0].Migrate = nil return job } diff --git a/nomad/node_endpoint.go b/nomad/node_endpoint.go index 182817392bdf..2631939ad13d 100644 --- a/nomad/node_endpoint.go +++ b/nomad/node_endpoint.go @@ -434,28 +434,15 @@ func (n *Node) UpdateDrain(args *structs.NodeUpdateDrainRequest, } // Update the timestamp to - node.StatusUpdatedAt = time.Now().Unix() + args.UpdateTime = time.Now().Unix() // Commit this update via Raft - var index uint64 - if node.Drain != args.Drain { - _, index, err = n.srv.raftApply(structs.NodeUpdateDrainRequestType, args) - if err != nil { - n.srv.logger.Printf("[ERR] nomad.client: drain update failed: %v", err) - return err - } - reply.NodeModifyIndex = index - } - - // Always attempt to create Node evaluations because there may be a System - // job registered that should be evaluated. 
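For context, a minimal client-side sketch of the reworked endpoint, mirroring the tests later in this patch: the request now carries a DrainStrategy rather than a bare Drain boolean, and the server no longer creates node evaluations inline. The codec argument and the ten-minute deadline are assumptions for illustration.

package drainexample

import (
	"net/rpc"
	"time"

	msgpackrpc "github.com/hashicorp/net-rpc-msgpackrpc"
	"github.com/hashicorp/nomad/nomad/structs"
)

// drainNode (hypothetical) asks the servers to drain nodeID with a deadline.
func drainNode(codec rpc.ClientCodec, nodeID string) error {
	req := &structs.NodeUpdateDrainRequest{
		NodeID: nodeID,
		DrainStrategy: &structs.DrainStrategy{
			Deadline: 10 * time.Minute, // assumed deadline
		},
		WriteRequest: structs.WriteRequest{Region: "global"},
	}
	var resp structs.NodeDrainUpdateResponse
	return msgpackrpc.CallWithCodec(codec, "Node.UpdateDrain", req, &resp)
}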
- evalIDs, evalIndex, err := n.createNodeEvals(args.NodeID, index) + _, index, err := n.srv.raftApply(structs.NodeUpdateDrainRequestType, args) if err != nil { - n.srv.logger.Printf("[ERR] nomad.client: eval creation failed: %v", err) + n.srv.logger.Printf("[ERR] nomad.client: drain update failed: %v", err) return err } - reply.EvalIDs = evalIDs - reply.EvalCreateIndex = evalIndex + reply.NodeModifyIndex = index // Set the reply index reply.Index = index diff --git a/nomad/node_endpoint_test.go b/nomad/node_endpoint_test.go index 375ca8731cb3..0de46ed22c61 100644 --- a/nomad/node_endpoint_test.go +++ b/nomad/node_endpoint_test.go @@ -747,6 +747,7 @@ func TestClientEndpoint_UpdateStatus_HeartbeatOnly_Advertise(t *testing.T) { func TestClientEndpoint_UpdateDrain(t *testing.T) { t.Parallel() + require := require.New(t) s1 := TestServer(t, nil) defer s1.Shutdown() codec := rpcClient(t, s1) @@ -761,34 +762,29 @@ func TestClientEndpoint_UpdateDrain(t *testing.T) { // Fetch the response var resp structs.NodeUpdateResponse - if err := msgpackrpc.CallWithCodec(codec, "Node.Register", reg, &resp); err != nil { - t.Fatalf("err: %v", err) + require.Nil(msgpackrpc.CallWithCodec(codec, "Node.Register", reg, &resp)) + + strategy := &structs.DrainStrategy{ + Deadline: 10 * time.Second, } // Update the status dereg := &structs.NodeUpdateDrainRequest{ - NodeID: node.ID, - Drain: true, - WriteRequest: structs.WriteRequest{Region: "global"}, + NodeID: node.ID, + DrainStrategy: strategy, + WriteRequest: structs.WriteRequest{Region: "global"}, } var resp2 structs.NodeDrainUpdateResponse - if err := msgpackrpc.CallWithCodec(codec, "Node.UpdateDrain", dereg, &resp2); err != nil { - t.Fatalf("err: %v", err) - } - if resp2.Index == 0 { - t.Fatalf("bad index: %d", resp2.Index) - } + require.Nil(msgpackrpc.CallWithCodec(codec, "Node.UpdateDrain", dereg, &resp2)) + require.NotZero(resp2.Index) // Check for the node in the FSM state := s1.fsm.State() ws := memdb.NewWatchSet() out, err := state.NodeByID(ws, node.ID) - if err != nil { - t.Fatalf("err: %v", err) - } - if !out.Drain { - t.Fatalf("bad: %#v", out) - } + require.Nil(err) + require.True(out.Drain) + require.Equal(strategy, out.DrainStrategy) } func TestClientEndpoint_UpdateDrain_ACL(t *testing.T) { @@ -797,13 +793,13 @@ func TestClientEndpoint_UpdateDrain_ACL(t *testing.T) { defer s1.Shutdown() codec := rpcClient(t, s1) testutil.WaitForLeader(t, s1.RPC) - assert := assert.New(t) + require := require.New(t) // Create the node node := mock.Node() state := s1.fsm.State() - assert.Nil(state.UpsertNode(1, node), "UpsertNode") + require.Nil(state.UpsertNode(1, node), "UpsertNode") // Create the policy and tokens validToken := mock.CreatePolicyAndToken(t, state, 1001, "test-valid", mock.NodePolicy(acl.PolicyWrite)) @@ -811,22 +807,24 @@ func TestClientEndpoint_UpdateDrain_ACL(t *testing.T) { // Update the status without a token and expect failure dereg := &structs.NodeUpdateDrainRequest{ - NodeID: node.ID, - Drain: true, + NodeID: node.ID, + DrainStrategy: &structs.DrainStrategy{ + Deadline: 10 * time.Second, + }, WriteRequest: structs.WriteRequest{Region: "global"}, } { var resp structs.NodeDrainUpdateResponse err := msgpackrpc.CallWithCodec(codec, "Node.UpdateDrain", dereg, &resp) - assert.NotNil(err, "RPC") - assert.Equal(err.Error(), structs.ErrPermissionDenied.Error()) + require.NotNil(err, "RPC") + require.Equal(err.Error(), structs.ErrPermissionDenied.Error()) } // Try with a valid token dereg.AuthToken = validToken.SecretID { var resp 
structs.NodeDrainUpdateResponse - assert.Nil(msgpackrpc.CallWithCodec(codec, "Node.UpdateDrain", dereg, &resp), "RPC") + require.Nil(msgpackrpc.CallWithCodec(codec, "Node.UpdateDrain", dereg, &resp), "RPC") } // Try with a invalid token @@ -834,15 +832,15 @@ func TestClientEndpoint_UpdateDrain_ACL(t *testing.T) { { var resp structs.NodeDrainUpdateResponse err := msgpackrpc.CallWithCodec(codec, "Node.UpdateDrain", dereg, &resp) - assert.NotNil(err, "RPC") - assert.Equal(err.Error(), structs.ErrPermissionDenied.Error()) + require.NotNil(err, "RPC") + require.Equal(err.Error(), structs.ErrPermissionDenied.Error()) } // Try with a root token dereg.AuthToken = root.SecretID { var resp structs.NodeDrainUpdateResponse - assert.Nil(msgpackrpc.CallWithCodec(codec, "Node.UpdateDrain", dereg, &resp), "RPC") + require.Nil(msgpackrpc.CallWithCodec(codec, "Node.UpdateDrain", dereg, &resp), "RPC") } } @@ -854,6 +852,7 @@ func TestClientEndpoint_Drain_Down(t *testing.T) { defer s1.Shutdown() codec := rpcClient(t, s1) testutil.WaitForLeader(t, s1.RPC) + require := require.New(t) // Register a node node := mock.Node() @@ -863,9 +862,7 @@ func TestClientEndpoint_Drain_Down(t *testing.T) { } // Fetch the response var resp structs.NodeUpdateResponse - if err := msgpackrpc.CallWithCodec(codec, "Node.Register", reg, &resp); err != nil { - t.Fatalf("err: %v", err) - } + require.Nil(msgpackrpc.CallWithCodec(codec, "Node.Register", reg, &resp)) // Register a service job var jobResp structs.JobRegisterResponse @@ -878,15 +875,12 @@ func TestClientEndpoint_Drain_Down(t *testing.T) { Namespace: job.Namespace, }, } - if err := msgpackrpc.CallWithCodec(codec, "Job.Register", jobReq, &jobResp); err != nil { - t.Fatalf("err: %v", err) - } + require.Nil(msgpackrpc.CallWithCodec(codec, "Job.Register", jobReq, &jobResp)) // Register a system job var jobResp1 structs.JobRegisterResponse - job1 := mock.Job() + job1 := mock.SystemJob() job1.TaskGroups[0].Count = 1 - job1.Type = structs.JobTypeSystem jobReq1 := &structs.JobRegisterRequest{ Job: job1, WriteRequest: structs.WriteRequest{ @@ -894,9 +888,7 @@ func TestClientEndpoint_Drain_Down(t *testing.T) { Namespace: job1.Namespace, }, } - if err := msgpackrpc.CallWithCodec(codec, "Job.Register", jobReq1, &jobResp1); err != nil { - t.Fatalf("err: %v", err) - } + require.Nil(msgpackrpc.CallWithCodec(codec, "Job.Register", jobReq1, &jobResp1)) // Wait for the scheduler to create an allocation testutil.WaitForResult(func() (bool, error) { @@ -916,14 +908,14 @@ func TestClientEndpoint_Drain_Down(t *testing.T) { // Drain the node dereg := &structs.NodeUpdateDrainRequest{ - NodeID: node.ID, - Drain: true, + NodeID: node.ID, + DrainStrategy: &structs.DrainStrategy{ + Deadline: -1 * time.Second, + }, WriteRequest: structs.WriteRequest{Region: "global"}, } var resp2 structs.NodeDrainUpdateResponse - if err := msgpackrpc.CallWithCodec(codec, "Node.UpdateDrain", dereg, &resp2); err != nil { - t.Fatalf("err: %v", err) - } + require.Nil(msgpackrpc.CallWithCodec(codec, "Node.UpdateDrain", dereg, &resp2)) // Mark the node as down node.Status = structs.NodeStatusDown @@ -931,9 +923,7 @@ func TestClientEndpoint_Drain_Down(t *testing.T) { Node: node, WriteRequest: structs.WriteRequest{Region: "global"}, } - if err := msgpackrpc.CallWithCodec(codec, "Node.Register", reg, &resp); err != nil { - t.Fatalf("err: %v", err) - } + require.Nil(msgpackrpc.CallWithCodec(codec, "Node.Register", reg, &resp)) // Ensure that the allocation has transitioned to lost testutil.WaitForResult(func() (bool, error) { @@ 
-956,7 +946,7 @@ func TestClientEndpoint_Drain_Down(t *testing.T) { ModifyIndex: summary.ModifyIndex, } if !reflect.DeepEqual(summary, expectedSummary) { - return false, fmt.Errorf("expected: %#v, actual: %#v", expectedSummary, summary) + return false, fmt.Errorf("Service: expected: %#v, actual: %#v", expectedSummary, summary) } summary1, err := s1.fsm.state.JobSummaryByID(ws, job1.Namespace, job1.ID) @@ -976,7 +966,7 @@ func TestClientEndpoint_Drain_Down(t *testing.T) { ModifyIndex: summary1.ModifyIndex, } if !reflect.DeepEqual(summary1, expectedSummary1) { - return false, fmt.Errorf("expected: %#v, actual: %#v", expectedSummary1, summary1) + return false, fmt.Errorf("System: expected: %#v, actual: %#v", expectedSummary1, summary1) } return true, nil }, func(err error) { @@ -2378,7 +2368,10 @@ func TestClientEndpoint_ListNodes_Blocking(t *testing.T) { // Node drain updates trigger watches. time.AfterFunc(100*time.Millisecond, func() { - if err := state.UpdateNodeDrain(3, node.ID, true); err != nil { + s := &structs.DrainStrategy{ + Deadline: 10 * time.Second, + } + if err := state.UpdateNodeDrain(3, node.ID, s, 101); err != nil { t.Fatalf("err: %v", err) } }) @@ -2402,12 +2395,12 @@ func TestClientEndpoint_ListNodes_Blocking(t *testing.T) { // Node status update triggers watches time.AfterFunc(100*time.Millisecond, func() { - if err := state.UpdateNodeStatus(4, node.ID, structs.NodeStatusDown); err != nil { + if err := state.UpdateNodeStatus(40, node.ID, structs.NodeStatusDown); err != nil { t.Fatalf("err: %v", err) } }) - req.MinQueryIndex = 3 + req.MinQueryIndex = 38 var resp3 structs.NodeListResponse start = time.Now() if err := msgpackrpc.CallWithCodec(codec, "Node.List", req, &resp3); err != nil { @@ -2417,8 +2410,8 @@ func TestClientEndpoint_ListNodes_Blocking(t *testing.T) { if elapsed := time.Since(start); elapsed < 100*time.Millisecond { t.Fatalf("should block (returned in %s) %#v", elapsed, resp3) } - if resp3.Index != 4 { - t.Fatalf("Bad index: %d %d", resp3.Index, 4) + if resp3.Index != 40 { + t.Fatalf("Bad index: %d %d", resp3.Index, 40) } if len(resp3.Nodes) != 1 || resp3.Nodes[0].Status != structs.NodeStatusDown { t.Fatalf("bad: %#v", resp3.Nodes) @@ -2426,12 +2419,12 @@ func TestClientEndpoint_ListNodes_Blocking(t *testing.T) { // Node delete triggers watches. 
time.AfterFunc(100*time.Millisecond, func() { - if err := state.DeleteNode(5, node.ID); err != nil { + if err := state.DeleteNode(50, node.ID); err != nil { t.Fatalf("err: %v", err) } }) - req.MinQueryIndex = 4 + req.MinQueryIndex = 45 var resp4 structs.NodeListResponse start = time.Now() if err := msgpackrpc.CallWithCodec(codec, "Node.List", req, &resp4); err != nil { @@ -2441,8 +2434,8 @@ func TestClientEndpoint_ListNodes_Blocking(t *testing.T) { if elapsed := time.Since(start); elapsed < 100*time.Millisecond { t.Fatalf("should block (returned in %s) %#v", elapsed, resp4) } - if resp4.Index != 5 { - t.Fatalf("Bad index: %d %d", resp4.Index, 5) + if resp4.Index != 50 { + t.Fatalf("Bad index: %d %d", resp4.Index, 50) } if len(resp4.Nodes) != 0 { t.Fatalf("bad: %#v", resp4.Nodes) diff --git a/nomad/state/state_store.go b/nomad/state/state_store.go index 90af315012f5..45c595d04b36 100644 --- a/nomad/state/state_store.go +++ b/nomad/state/state_store.go @@ -6,7 +6,6 @@ import ( "io" "log" "sort" - "time" "github.com/hashicorp/go-memdb" multierror "github.com/hashicorp/go-multierror" @@ -617,7 +616,9 @@ func (s *StateStore) UpdateNodeStatus(index uint64, nodeID, status string) error } // UpdateNodeDrain is used to update the drain of a node -func (s *StateStore) UpdateNodeDrain(index uint64, nodeID string, drain bool) error { +func (s *StateStore) UpdateNodeDrain(index uint64, nodeID string, + drain *structs.DrainStrategy, updateTime int64) error { + txn := s.db.Txn(true) defer txn.Abort() @@ -635,20 +636,18 @@ func (s *StateStore) UpdateNodeDrain(index uint64, nodeID string, drain bool) er copyNode := existingNode.Copy() // Update the drain in the copy - copyNode.Drain = drain - //FIXME - if drain { - copyNode.DrainStrategy = &structs.DrainStrategy{ - StartTime: time.Now().UnixNano(), - Deadline: 10 * time.Second, - } - copyNode.SchedulingEligibility = structs.NodeSchedulingIneligible - } else { + copyNode.Drain = drain != nil // COMPAT: Remove in Nomad 0.9 + copyNode.DrainStrategy = drain + if drain == nil { // When stopping a drain unset the strategy but leave the node // ineligible for scheduling copyNode.DrainStrategy = nil + } else { + copyNode.SchedulingEligibility = structs.NodeSchedulingIneligible } + copyNode.ModifyIndex = index + copyNode.StatusUpdatedAt = updateTime // Insert the node if err := txn.Insert("nodes", copyNode); err != nil { diff --git a/nomad/state/state_store_test.go b/nomad/state/state_store_test.go index 73c86bbd9f59..af2e8cceb9dd 100644 --- a/nomad/state/state_store_test.go +++ b/nomad/state/state_store_test.go @@ -705,64 +705,31 @@ func TestStateStore_UpdateNodeDrain_Node(t *testing.T) { require.Nil(state.UpsertNode(1000, node)) - // Create a watchset so we can test that update node drain fires the watch - ws := memdb.NewWatchSet() - - // Assert initial node state - { - out, err := state.NodeByID(ws, node.ID) - require.Nil(err) - - require.False(out.Drain) - require.Nil(out.DrainStrategy) - require.Equal(structs.NodeSchedulingEligible, out.SchedulingEligibility) - if out.ModifyIndex != 1000 { - t.Fatalf("expected ModifyIndex=1000, found %d", out.ModifyIndex) - } - } - - // Start draining - { - require.Nil(state.UpdateNodeDrain(1001, node.ID, true)) - require.True(watchFired(ws)) - - ws = memdb.NewWatchSet() - out, err := state.NodeByID(ws, node.ID) - require.Nil(err) - - require.True(out.Drain) - require.NotNil(out.DrainStrategy) - require.Equal(structs.NodeSchedulingIneligible, out.SchedulingEligibility) - if out.ModifyIndex != 1001 { - t.Fatalf("expected 
ModifyIndex=1001, found %d", out.ModifyIndex) - } - - index, err := state.Index("nodes") - require.Nil(err) - if index != 1001 { - t.Fatalf("expected index=1001, found %d", index) - } - - require.False(watchFired(ws)) + expectedTime := int64(101) + expectedDrain := &structs.DrainStrategy{ + Deadline: 10 * time.Second, } - // Stop draining (no need to retest watch behavior) - { - require.Nil(state.UpdateNodeDrain(1002, node.ID, false)) + // Create a watchset so we can test that update node drain fires the watch + ws := memdb.NewWatchSet() + _, err := state.NodeByID(ws, node.ID) + require.Nil(err) - out, err := state.NodeByID(nil, node.ID) - require.Nil(err) + require.Nil(state.UpdateNodeDrain(1001, node.ID, expectedDrain, expectedTime)) + require.True(watchFired(ws)) - require.False(out.Drain) - require.Nil(out.DrainStrategy) - if out.ModifyIndex != 1002 { - t.Fatalf("expected ModifyIndex=1002, found %d", out.ModifyIndex) - } + ws = memdb.NewWatchSet() + out, err := state.NodeByID(ws, node.ID) + require.Nil(err) + require.True(out.Drain) + require.NotNil(out.DrainStrategy) + require.Equal(out.DrainStrategy, expectedDrain) + require.EqualValues(1001, out.ModifyIndex) - // Scheduling eligibility should *not* flip back to eligible after - // draining stops. - require.Equal(structs.NodeSchedulingIneligible, out.SchedulingEligibility) - } + index, err := state.Index("nodes") + require.Nil(err) + require.EqualValues(1001, index) + require.False(watchFired(ws)) } func TestStateStore_AddSingleNodeEvent(t *testing.T) { diff --git a/nomad/structs/structs.go b/nomad/structs/structs.go index 6f6a98a6fb70..12c279e561f6 100644 --- a/nomad/structs/structs.go +++ b/nomad/structs/structs.go @@ -302,10 +302,12 @@ type NodeUpdateStatusRequest struct { WriteRequest } -// NodeUpdateDrainRequest is used for updating the drain status +// NodeUpdateDrainRequest is used for updating the drain strategy type NodeUpdateDrainRequest struct { - NodeID string - Drain bool + NodeID string + Drain bool // TODO Deprecate + DrainStrategy *DrainStrategy + UpdateTime int64 WriteRequest } @@ -871,10 +873,13 @@ type NodeUpdateResponse struct { // NodeDrainUpdateResponse is used to respond to a node drain update type NodeDrainUpdateResponse struct { - EvalIDs []string - EvalCreateIndex uint64 NodeModifyIndex uint64 QueryMeta + + // Deprecated in Nomad 0.8 as an evaluation is not immediately created but + // is instead handled by the drainer. + EvalIDs []string + EvalCreateIndex uint64 } // NodeAllocsResponse is used to return allocs for a single node @@ -1179,6 +1184,9 @@ func ValidNodeStatus(status string) bool { } const ( + // NodeSchedulingEligible and Ineligible marks the node as eligible or not, + // respectively, for receiving allocations. This is orthoginal to the node + // status being ready. NodeSchedulingEligible = "eligbile" NodeSchedulingIneligible = "ineligible" ) @@ -1192,6 +1200,10 @@ type DrainStrategy struct { // Deadline is the duration after StartTime when the remaining // allocations on a draining Node should be told to stop. Deadline time.Duration + + // IgnoreSystemJobs allows systems jobs to remain on the node even though it + // has been marked for draining. + IgnoreSystemJobs bool } func (d *DrainStrategy) Copy() *DrainStrategy { @@ -1275,6 +1287,7 @@ type Node struct { // attributes and capabilities. ComputedClass string + // COMPAT: Remove in Nomad 0.9 // Drain is controlled by the servers, and not the client. 
// If true, no jobs will be scheduled to this node, and existing // allocations will be drained. Superceded by DrainStrategy in Nomad @@ -1324,12 +1337,12 @@ func (n *Node) Copy() *Node { nn := new(Node) *nn = *n nn.Attributes = helper.CopyMapStringString(nn.Attributes) - nn.DrainStrategy = nn.DrainStrategy.Copy() nn.Resources = nn.Resources.Copy() nn.Reserved = nn.Reserved.Copy() nn.Links = helper.CopyMapStringString(nn.Links) nn.Meta = helper.CopyMapStringString(nn.Meta) nn.Events = copyNodeEvents(n.Events) + nn.DrainStrategy = nn.DrainStrategy.Copy() return nn } @@ -3189,10 +3202,10 @@ func (tg *TaskGroup) Validate(j *Job) error { // Validate the migration strategy switch j.Type { case JobTypeService: - if tg.Count == 1 && tg.Migrate != nil { - mErr.Errors = append(mErr.Errors, fmt.Errorf("Task Group %v should not have a migration strategy with a count = 1", tg.Name)) - } else if err := tg.Migrate.Validate(); err != nil { - mErr.Errors = append(mErr.Errors, err) + if tg.Migrate != nil { + if err := tg.Migrate.Validate(); err != nil { + mErr.Errors = append(mErr.Errors, err) + } } default: if tg.Migrate != nil { From 2bdeacebffc4e9c3aab8706799512ee67b2aff2d Mon Sep 17 00:00:00 2001 From: Alex Dadgar Date: Fri, 23 Feb 2018 15:56:36 -0800 Subject: [PATCH 10/79] Drain cli, api, http --- api/nodes.go | 80 ++++++++++++++++------- api/nodes_test.go | 14 +++-- command/agent/node_endpoint.go | 22 ++++--- command/agent/node_endpoint_test.go | 68 ++++++++++++-------- command/node.go | 19 ++++++ command/node_drain.go | 98 ++++++++++++++++++++++++++--- command/node_drain_test.go | 43 +++++++++++++ command/node_status.go | 2 +- commands.go | 16 ++++- main.go | 1 + nomad/structs/structs.go | 18 ++++-- 11 files changed, 298 insertions(+), 83 deletions(-) create mode 100644 command/node.go diff --git a/api/nodes.go b/api/nodes.go index 549eeea66639..4868fef7cfd4 100644 --- a/api/nodes.go +++ b/api/nodes.go @@ -3,7 +3,6 @@ package api import ( "fmt" "sort" - "strconv" "time" ) @@ -42,10 +41,24 @@ func (n *Nodes) Info(nodeID string, q *QueryOptions) (*Node, *QueryMeta, error) return &resp, qm, nil } -// ToggleDrain is used to toggle drain mode on/off for a given node. -func (n *Nodes) ToggleDrain(nodeID string, drain bool, q *WriteOptions) (*WriteMeta, error) { - drainArg := strconv.FormatBool(drain) - wm, err := n.client.write("/v1/node/"+nodeID+"/drain?enable="+drainArg, nil, nil, q) +// NodeUpdateDrainRequest is used to update the drain specification for a node. +type NodeUpdateDrainRequest struct { + // NodeID is the node to update the drain specification for. + NodeID string + + // DrainSpec is the drain specification to set for the node. A nil DrainSpec + // will disable draining. + DrainSpec *DrainSpec +} + +// UpdateDrain is used to update the drain strategy for a given node. +func (n *Nodes) UpdateDrain(nodeID string, spec *DrainSpec, q *WriteOptions) (*WriteMeta, error) { + req := &NodeUpdateDrainRequest{ + NodeID: nodeID, + DrainSpec: spec, + } + + wm, err := n.client.write("/v1/node/"+nodeID+"/drain", req, nil, q) if err != nil { return nil, err } @@ -108,25 +121,44 @@ type DriverInfo struct { // Node is used to deserialize a node entry. 
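// Drain state is exposed through the Drain flag together with the
// DrainStrategy and SchedulingEligibility fields below.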
type Node struct { - ID string - Datacenter string - Name string - HTTPAddr string - TLSEnabled bool - Attributes map[string]string - Resources *Resources - Reserved *Resources - Links map[string]string - Meta map[string]string - NodeClass string - Drain bool - Status string - StatusDescription string - StatusUpdatedAt int64 - Events []*NodeEvent - Drivers map[string]*DriverInfo - CreateIndex uint64 - ModifyIndex uint64 + ID string + Datacenter string + Name string + HTTPAddr string + TLSEnabled bool + Attributes map[string]string + Resources *Resources + Reserved *Resources + Links map[string]string + Meta map[string]string + NodeClass string + Drain bool + DrainStrategy *DrainStrategy + SchedulingEligibility string + Status string + StatusDescription string + StatusUpdatedAt int64 + Events []*NodeEvent + Drivers map[string]*DriverInfo + CreateIndex uint64 + ModifyIndex uint64 +} + +// DrainStrategy describes a Node's drain behavior. +type DrainStrategy struct { + // DrainSpec is the user declared drain specification + DrainSpec +} + +// DrainSpec describes a Node's drain behavior. +type DrainSpec struct { + // Deadline is the duration after StartTime when the remaining + // allocations on a draining Node should be told to stop. + Deadline time.Duration + + // IgnoreSystemJobs allows systems jobs to remain on the node even though it + // has been marked for draining. + IgnoreSystemJobs bool } const ( diff --git a/api/nodes_test.go b/api/nodes_test.go index 06b960746942..e2c0a3c78136 100644 --- a/api/nodes_test.go +++ b/api/nodes_test.go @@ -174,7 +174,10 @@ func TestNodes_ToggleDrain(t *testing.T) { } // Toggle it on - wm, err := nodes.ToggleDrain(nodeID, true, nil) + spec := &DrainSpec{ + Deadline: 10 * time.Second, + } + wm, err := nodes.UpdateDrain(nodeID, spec, nil) if err != nil { t.Fatalf("err: %s", err) } @@ -185,12 +188,12 @@ func TestNodes_ToggleDrain(t *testing.T) { if err != nil { t.Fatalf("err: %s", err) } - if !out.Drain { - t.Fatalf("drain mode should be on") + if out.SchedulingEligibility != structs.NodeSchedulingIneligible { + t.Fatalf("bad eligibility: %v vs %v", out.SchedulingEligibility, structs.NodeSchedulingIneligible) } // Toggle off again - wm, err = nodes.ToggleDrain(nodeID, false, nil) + wm, err = nodes.UpdateDrain(nodeID, nil, nil) if err != nil { t.Fatalf("err: %s", err) } @@ -204,6 +207,9 @@ func TestNodes_ToggleDrain(t *testing.T) { if out.Drain { t.Fatalf("drain mode should be off") } + if out.DrainStrategy != nil { + t.Fatalf("drain strategy should be unset") + } } func TestNodes_Allocations(t *testing.T) { diff --git a/command/agent/node_endpoint.go b/command/agent/node_endpoint.go index fd396a67c40f..b22850873a56 100644 --- a/command/agent/node_endpoint.go +++ b/command/agent/node_endpoint.go @@ -2,9 +2,9 @@ package agent import ( "net/http" - "strconv" "strings" + "github.com/hashicorp/nomad/api" "github.com/hashicorp/nomad/nomad/structs" ) @@ -101,19 +101,21 @@ func (s *HTTPServer) nodeToggleDrain(resp http.ResponseWriter, req *http.Request return nil, CodedError(405, ErrInvalidMethod) } - // Get the enable value - enableRaw := req.URL.Query().Get("enable") - if enableRaw == "" { - return nil, CodedError(400, "missing enable value") - } - enable, err := strconv.ParseBool(enableRaw) - if err != nil { - return nil, CodedError(400, "invalid enable value") + var drainRequest api.NodeUpdateDrainRequest + if err := decodeBody(req, &drainRequest); err != nil { + return nil, CodedError(400, err.Error()) } args := structs.NodeUpdateDrainRequest{ NodeID: nodeID, - 
Drain: enable, + } + if drainRequest.DrainSpec != nil { + args.DrainStrategy = &structs.DrainStrategy{ + DrainSpec: structs.DrainSpec{ + Deadline: drainRequest.DrainSpec.Deadline, + IgnoreSystemJobs: drainRequest.DrainSpec.IgnoreSystemJobs, + }, + } } s.parseWriteRequest(req, &args.WriteRequest) diff --git a/command/agent/node_endpoint_test.go b/command/agent/node_endpoint_test.go index a5566adc19fb..ac1bd00b7286 100644 --- a/command/agent/node_endpoint_test.go +++ b/command/agent/node_endpoint_test.go @@ -4,10 +4,13 @@ import ( "net/http" "net/http/httptest" "testing" + "time" + "github.com/hashicorp/nomad/api" "github.com/hashicorp/nomad/nomad/mock" "github.com/hashicorp/nomad/nomad/structs" "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" ) func TestHTTP_NodesList(t *testing.T) { @@ -238,6 +241,7 @@ func TestHTTP_NodeAllocations(t *testing.T) { func TestHTTP_NodeDrain(t *testing.T) { t.Parallel() + require := require.New(t) httpTest(t, nil, func(s *TestAgent) { // Create the node node := mock.Node() @@ -246,45 +250,55 @@ func TestHTTP_NodeDrain(t *testing.T) { WriteRequest: structs.WriteRequest{Region: "global"}, } var resp structs.NodeUpdateResponse - if err := s.Agent.RPC("Node.Register", &args, &resp); err != nil { - t.Fatalf("err: %v", err) - } + require.Nil(s.Agent.RPC("Node.Register", &args, &resp)) - // Directly manipulate the state - state := s.Agent.server.State() - alloc1 := mock.Alloc() - alloc1.NodeID = node.ID - if err := state.UpsertJobSummary(999, mock.JobSummary(alloc1.JobID)); err != nil { - t.Fatal(err) - } - err := state.UpsertAllocs(1000, []*structs.Allocation{alloc1}) - if err != nil { - t.Fatalf("err: %v", err) + drainReq := api.NodeUpdateDrainRequest{ + NodeID: node.ID, + DrainSpec: &api.DrainSpec{ + Deadline: 10 * time.Second, + }, } // Make the HTTP request - req, err := http.NewRequest("POST", "/v1/node/"+node.ID+"/drain?enable=1", nil) - if err != nil { - t.Fatalf("err: %v", err) - } + buf := encodeReq(drainReq) + req, err := http.NewRequest("POST", "/v1/node/"+node.ID+"/drain", buf) + require.Nil(err) respW := httptest.NewRecorder() // Make the request obj, err := s.Server.NodeSpecificRequest(respW, req) - if err != nil { - t.Fatalf("err: %v", err) - } + require.Nil(err) // Check for the index - if respW.HeaderMap.Get("X-Nomad-Index") == "" { - t.Fatalf("missing index") - } + require.NotZero(respW.HeaderMap.Get("X-Nomad-Index")) // Check the response - upd := obj.(structs.NodeDrainUpdateResponse) - if len(upd.EvalIDs) == 0 { - t.Fatalf("bad: %v", upd) - } + _, ok := obj.(structs.NodeDrainUpdateResponse) + require.True(ok) + + // Check that the node has been updated + state := s.Agent.server.State() + out, err := state.NodeByID(nil, node.ID) + require.Nil(err) + require.True(out.Drain) + require.NotNil(out.DrainStrategy) + require.Equal(10*time.Second, out.DrainStrategy.Deadline) + + // Make the HTTP request to unset drain + drainReq.DrainSpec = nil + buf = encodeReq(drainReq) + req, err = http.NewRequest("POST", "/v1/node/"+node.ID+"/drain", buf) + require.Nil(err) + respW = httptest.NewRecorder() + + // Make the request + obj, err = s.Server.NodeSpecificRequest(respW, req) + require.Nil(err) + + out, err = state.NodeByID(nil, node.ID) + require.Nil(err) + require.False(out.Drain) + require.Nil(out.DrainStrategy) }) } diff --git a/command/node.go b/command/node.go new file mode 100644 index 000000000000..36436d9b7868 --- /dev/null +++ b/command/node.go @@ -0,0 +1,19 @@ +package command + +import "github.com/mitchellh/cli" + +type 
NodeCommand struct { + Meta +} + +func (f *NodeCommand) Help() string { + return "This command is accessed by using one of the subcommands below." +} + +func (f *NodeCommand) Synopsis() string { + return "Interact with nodes" +} + +func (f *NodeCommand) Run(args []string) int { + return cli.RunResultHelp +} diff --git a/command/node_drain.go b/command/node_drain.go index b40757b7c90b..c27068b97e59 100644 --- a/command/node_drain.go +++ b/command/node_drain.go @@ -3,18 +3,26 @@ package command import ( "fmt" "strings" + "time" + "github.com/hashicorp/nomad/api" "github.com/hashicorp/nomad/api/contexts" "github.com/posener/complete" ) +var ( + // defaultDrainDuration is the default drain duration if it is not specified + // explicitly + defaultDrainDuration = 1 * time.Hour +) + type NodeDrainCommand struct { Meta } func (c *NodeDrainCommand) Help() string { helpText := ` -Usage: nomad node-drain [options] +Usage: nomad node drain [options] Toggles node draining on a specified node. It is required that either -enable or -disable is specified, but not both. @@ -32,8 +40,24 @@ Node Drain Options: -enable Enable draining for the specified node. + -deadline + Set the deadline by which all allocations must be moved off the node. + Remaining allocations after the deadline are forced removed from the node. + If unspecified, a default deadline of one hour is applied. + + -force + Force remove allocations off the node immediately. + + -no-deadline + No deadline allows the allocations to drain off the node without being force + stopped after a certain deadline. + + -ignore-system + Ignore system allows the drain to complete without stopping system job + allocations. + -self - Query the status of the local node. + Set the drain status of the local node. -yes Automatic yes to prompts. 
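As a rough sketch only (not part of this patch), the flag combinations above are expected to reduce to an api.DrainSpec along the following lines. buildDrainSpec is a hypothetical helper that mirrors the deadline handling in Run further down and assumes the api import and defaultDrainDuration defined in this file:

// buildDrainSpec shows how the drain flags map to a drain specification:
// -force yields a negative deadline (immediate drain), -no-deadline yields a
// zero deadline, an explicit -deadline is used as given, and the default
// deadline is one hour.
func buildDrainSpec(force, noDeadline, ignoreSystem bool, deadline time.Duration) *api.DrainSpec {
	d := defaultDrainDuration
	switch {
	case force:
		d = -1 * time.Second
	case noDeadline:
		d = 0
	case deadline > 0:
		d = deadline
	}
	return &api.DrainSpec{
		Deadline:         d,
		IgnoreSystemJobs: ignoreSystem,
	}
}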
@@ -48,10 +72,14 @@ func (c *NodeDrainCommand) Synopsis() string { func (c *NodeDrainCommand) AutocompleteFlags() complete.Flags { return mergeAutocompleteFlags(c.Meta.AutocompleteFlags(FlagSetClient), complete.Flags{ - "-disable": complete.PredictNothing, - "-enable": complete.PredictNothing, - "-self": complete.PredictNothing, - "-yes": complete.PredictNothing, + "-disable": complete.PredictNothing, + "-enable": complete.PredictNothing, + "-deadline": complete.PredictAnything, + "-force": complete.PredictNothing, + "-no-deadline": complete.PredictNothing, + "-ignore-system": complete.PredictNothing, + "-self": complete.PredictNothing, + "-yes": complete.PredictNothing, }) } @@ -71,12 +99,18 @@ func (c *NodeDrainCommand) AutocompleteArgs() complete.Predictor { } func (c *NodeDrainCommand) Run(args []string) int { - var enable, disable, self, autoYes bool + var enable, disable, force, + noDeadline, ignoreSystem, self, autoYes bool + var deadline string flags := c.Meta.FlagSet("node-drain", FlagSetClient) flags.Usage = func() { c.Ui.Output(c.Help()) } flags.BoolVar(&enable, "enable", false, "Enable drain mode") flags.BoolVar(&disable, "disable", false, "Disable drain mode") + flags.StringVar(&deadline, "deadline", "", "Deadline after which allocations are force stopped") + flags.BoolVar(&force, "force", false, "Force immediate drain") + flags.BoolVar(&noDeadline, "no-deadline", false, "Drain node with no deadline") + flags.BoolVar(&ignoreSystem, "ignore-system", false, "Do not drain system job allocations from the node") flags.BoolVar(&self, "self", false, "") flags.BoolVar(&autoYes, "yes", false, "Automatic yes to prompts.") @@ -93,10 +127,46 @@ func (c *NodeDrainCommand) Run(args []string) int { // Check that we got a node ID args = flags.Args() if l := len(args); self && l != 0 || !self && l != 1 { - c.Ui.Error(c.Help()) + c.Ui.Error("Node ID must be specified if -self isn't being used") + return 1 + } + + // Validate a compatible set of flags were set + if disable && (deadline != "" || force || noDeadline || ignoreSystem) { + c.Ui.Error("-disable can't be combined with flags configuring drain strategy") + return 1 + } + if deadline != "" && (force || noDeadline) { + c.Ui.Error("-deadline can't be combined with -force or -no-deadline") + return 1 + } + if force && noDeadline { + c.Ui.Error("-force and -no-deadline are mutually exclusive") return 1 } + // Parse the duration + var d time.Duration + if force { + d = -1 * time.Second + } else if noDeadline { + d = 0 + } else if deadline != "" { + dur, err := time.ParseDuration(deadline) + if err != nil { + c.Ui.Error(fmt.Sprintf("Failed to parse deadline %q: %v", deadline, err)) + return 1 + } + if dur <= 0 { + c.Ui.Error("A positive drain duration must be given") + return 1 + } + + d = dur + } else { + d = defaultDrainDuration + } + // Get the HTTP client client, err := c.Meta.Client() if err != nil { @@ -186,9 +256,17 @@ func (c *NodeDrainCommand) Run(args []string) int { } } + var spec *api.DrainSpec + if enable { + spec = &api.DrainSpec{ + Deadline: d, + IgnoreSystemJobs: ignoreSystem, + } + } + // Toggle node draining - if _, err := client.Nodes().ToggleDrain(node.ID, enable, nil); err != nil { - c.Ui.Error(fmt.Sprintf("Error toggling drain mode: %s", err)) + if _, err := client.Nodes().UpdateDrain(node.ID, spec, nil); err != nil { + c.Ui.Error(fmt.Sprintf("Error updating drain specification: %s", err)) return 1 } return 0 diff --git a/command/node_drain_test.go b/command/node_drain_test.go index 241845ab4878..20f63d95f571 100644 --- 
a/command/node_drain_test.go +++ b/command/node_drain_test.go @@ -85,6 +85,49 @@ func TestNodeDrainCommand_Fails(t *testing.T) { if out := ui.ErrorWriter.String(); !strings.Contains(out, "No node(s) with prefix or id") { t.Fatalf("expected not exist error, got: %s", out) } + ui.ErrorWriter.Reset() + + // Fail on disable being used with drain strategy flags + for _, flag := range []string{"-force", "-no-deadline", "-ignore-system"} { + if code := cmd.Run([]string{"-address=" + url, "-disable", flag, "12345678-abcd-efab-cdef-123456789abc"}); code != 1 { + t.Fatalf("expected exit 1, got: %d", code) + } + if out := ui.ErrorWriter.String(); !strings.Contains(out, "combined with flags configuring drain strategy") { + t.Fatalf("got: %s", out) + } + ui.ErrorWriter.Reset() + } + + // Fail on setting a deadline plus deadline modifying flags + for _, flag := range []string{"-force", "-no-deadline"} { + if code := cmd.Run([]string{"-address=" + url, "-enable", "-deadline=10s", flag, "12345678-abcd-efab-cdef-123456789abc"}); code != 1 { + t.Fatalf("expected exit 1, got: %d", code) + } + if out := ui.ErrorWriter.String(); !strings.Contains(out, "deadline can't be combined with") { + t.Fatalf("got: %s", out) + } + ui.ErrorWriter.Reset() + } + + // Fail on setting a force and no deadline + if code := cmd.Run([]string{"-address=" + url, "-enable", "-force", "-no-deadline", "12345678-abcd-efab-cdef-123456789abc"}); code != 1 { + t.Fatalf("expected exit 1, got: %d", code) + } + if out := ui.ErrorWriter.String(); !strings.Contains(out, "mutually exclusive") { + t.Fatalf("got: %s", out) + } + ui.ErrorWriter.Reset() + + // Fail on setting a bad deadline + for _, flag := range []string{"-deadline=0s", "-deadline=-1s"} { + if code := cmd.Run([]string{"-address=" + url, "-enable", flag, "12345678-abcd-efab-cdef-123456789abc"}); code != 1 { + t.Fatalf("expected exit 1, got: %d", code) + } + if out := ui.ErrorWriter.String(); !strings.Contains(out, "positive") { + t.Fatalf("got: %s", out) + } + ui.ErrorWriter.Reset() + } } func TestNodeDrainCommand_AutocompleteArgs(t *testing.T) { diff --git a/command/node_status.go b/command/node_status.go index cbce475346a9..b347b6b78853 100644 --- a/command/node_status.go +++ b/command/node_status.go @@ -37,7 +37,7 @@ type NodeStatusCommand struct { func (c *NodeStatusCommand) Help() string { helpText := ` -Usage: nomad node-status [options] +Usage: nomad node status [options] Display status information about a given node. 
The list of nodes returned includes only nodes which jobs may be scheduled to, and diff --git a/commands.go b/commands.go index 75155948bd21..9e27664af896 100644 --- a/commands.go +++ b/commands.go @@ -258,17 +258,31 @@ func Commands(metaPtr *command.Meta) map[string]cli.CommandFactory { Meta: meta, }, nil }, + "node": func() (cli.Command, error) { + return &command.NodeCommand{ + Meta: meta, + }, nil + }, "node-drain": func() (cli.Command, error) { return &command.NodeDrainCommand{ Meta: meta, }, nil }, + "node drain": func() (cli.Command, error) { + return &command.NodeDrainCommand{ + Meta: meta, + }, nil + }, "node-status": func() (cli.Command, error) { return &command.NodeStatusCommand{ Meta: meta, }, nil }, - + "node status": func() (cli.Command, error) { + return &command.NodeStatusCommand{ + Meta: meta, + }, nil + }, "operator": func() (cli.Command, error) { return &command.OperatorCommand{ Meta: meta, diff --git a/main.go b/main.go index 4fe38fd6a998..3c178e145709 100644 --- a/main.go +++ b/main.go @@ -37,6 +37,7 @@ func RunCustom(args []string, commands map[string]cli.CommandFactory) int { case "quota list", "quota delete", "quota apply", "quota status", "quota inspect", "quota init": case "operator raft", "operator raft list-peers", "operator raft remove-peer": case "acl policy", "acl policy apply", "acl token", "acl token create": + case "node-drain", "node-status": default: commandsInclude = append(commandsInclude, k) } diff --git a/nomad/structs/structs.go b/nomad/structs/structs.go index 12c279e561f6..eec0964bef6f 100644 --- a/nomad/structs/structs.go +++ b/nomad/structs/structs.go @@ -1191,12 +1191,8 @@ const ( NodeSchedulingIneligible = "ineligible" ) -// DrainStrategy describes a Node's drain behavior. -type DrainStrategy struct { - // StartTime as nanoseconds since Unix epoch indicating when a drain - // began for deadline calcuations. - StartTime int64 - +// DrainSpec describes a Node's desired drain behavior. +type DrainSpec struct { // Deadline is the duration after StartTime when the remaining // allocations on a draining Node should be told to stop. Deadline time.Duration @@ -1206,6 +1202,16 @@ type DrainStrategy struct { IgnoreSystemJobs bool } +// DrainStrategy describes a Node's drain behavior. +type DrainStrategy struct { + // DrainSpec is the user declared drain specification + DrainSpec + + // StartTime as nanoseconds since Unix epoch indicating when a drain + // began for deadline calcuations. 
+ StartTime int64 +} + func (d *DrainStrategy) Copy() *DrainStrategy { if d == nil { return nil From 762db7c5d71f199167fa2f7be04963f525ba8a94 Mon Sep 17 00:00:00 2001 From: Alex Dadgar Date: Mon, 26 Feb 2018 14:34:32 -0800 Subject: [PATCH 11/79] Fix tests --- nomad/drain_test.go | 4 +++- nomad/fsm_test.go | 4 +++- nomad/node_endpoint_test.go | 16 ++++++++++++---- nomad/state/state_store_test.go | 4 +++- 4 files changed, 21 insertions(+), 7 deletions(-) diff --git a/nomad/drain_test.go b/nomad/drain_test.go index c47e0d401548..13465ede0763 100644 --- a/nomad/drain_test.go +++ b/nomad/drain_test.go @@ -144,7 +144,9 @@ func TestNodeDrainer_SimpleDrain(t *testing.T) { // Start draining node 1 //FIXME update drain rpc to skip fsm manipulation and use api strategy := &structs.DrainStrategy{ - Deadline: -1 * time.Second, + DrainSpec: structs.DrainSpec{ + Deadline: -1 * time.Second, + }, } node, err := state.NodeByID(nil, c1.NodeID()) require.Nil(err) diff --git a/nomad/fsm_test.go b/nomad/fsm_test.go index b9fc4845da82..87c3a8b44d3c 100644 --- a/nomad/fsm_test.go +++ b/nomad/fsm_test.go @@ -294,7 +294,9 @@ func TestFSM_UpdateNodeDrain(t *testing.T) { require.Nil(resp) strategy := &structs.DrainStrategy{ - Deadline: 10 * time.Second, + DrainSpec: structs.DrainSpec{ + Deadline: 10 * time.Second, + }, } req2 := structs.NodeUpdateDrainRequest{ NodeID: node.ID, diff --git a/nomad/node_endpoint_test.go b/nomad/node_endpoint_test.go index 0de46ed22c61..5ce3cc2728a2 100644 --- a/nomad/node_endpoint_test.go +++ b/nomad/node_endpoint_test.go @@ -765,7 +765,9 @@ func TestClientEndpoint_UpdateDrain(t *testing.T) { require.Nil(msgpackrpc.CallWithCodec(codec, "Node.Register", reg, &resp)) strategy := &structs.DrainStrategy{ - Deadline: 10 * time.Second, + DrainSpec: structs.DrainSpec{ + Deadline: 10 * time.Second, + }, } // Update the status @@ -809,7 +811,9 @@ func TestClientEndpoint_UpdateDrain_ACL(t *testing.T) { dereg := &structs.NodeUpdateDrainRequest{ NodeID: node.ID, DrainStrategy: &structs.DrainStrategy{ - Deadline: 10 * time.Second, + DrainSpec: structs.DrainSpec{ + Deadline: 10 * time.Second, + }, }, WriteRequest: structs.WriteRequest{Region: "global"}, } @@ -910,7 +914,9 @@ func TestClientEndpoint_Drain_Down(t *testing.T) { dereg := &structs.NodeUpdateDrainRequest{ NodeID: node.ID, DrainStrategy: &structs.DrainStrategy{ - Deadline: -1 * time.Second, + DrainSpec: structs.DrainSpec{ + Deadline: -1 * time.Second, + }, }, WriteRequest: structs.WriteRequest{Region: "global"}, } @@ -2369,7 +2375,9 @@ func TestClientEndpoint_ListNodes_Blocking(t *testing.T) { // Node drain updates trigger watches. 
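	// The strategy's deadline is carried on its embedded DrainSpec.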
time.AfterFunc(100*time.Millisecond, func() { s := &structs.DrainStrategy{ - Deadline: 10 * time.Second, + DrainSpec: structs.DrainSpec{ + Deadline: 10 * time.Second, + }, } if err := state.UpdateNodeDrain(3, node.ID, s, 101); err != nil { t.Fatalf("err: %v", err) diff --git a/nomad/state/state_store_test.go b/nomad/state/state_store_test.go index af2e8cceb9dd..3d97cf7f541b 100644 --- a/nomad/state/state_store_test.go +++ b/nomad/state/state_store_test.go @@ -707,7 +707,9 @@ func TestStateStore_UpdateNodeDrain_Node(t *testing.T) { expectedTime := int64(101) expectedDrain := &structs.DrainStrategy{ - Deadline: 10 * time.Second, + DrainSpec: structs.DrainSpec{ + Deadline: -1 * time.Second, + }, } // Create a watchset so we can test that update node drain fires the watch From fba20fd58d1784605e46432e9ddfc918a71e853f Mon Sep 17 00:00:00 2001 From: Alex Dadgar Date: Mon, 26 Feb 2018 15:06:01 -0800 Subject: [PATCH 12/79] Remove update time --- nomad/drain_test.go | 2 +- nomad/fsm.go | 2 +- nomad/fsm_test.go | 1 - nomad/node_endpoint.go | 3 --- nomad/node_endpoint_test.go | 2 +- nomad/state/state_store.go | 4 +--- nomad/state/state_store_test.go | 13 ++++++------- nomad/structs/structs.go | 1 - 8 files changed, 10 insertions(+), 18 deletions(-) diff --git a/nomad/drain_test.go b/nomad/drain_test.go index 13465ede0763..9bae27fe38d2 100644 --- a/nomad/drain_test.go +++ b/nomad/drain_test.go @@ -150,7 +150,7 @@ func TestNodeDrainer_SimpleDrain(t *testing.T) { } node, err := state.NodeByID(nil, c1.NodeID()) require.Nil(err) - require.Nil(state.UpdateNodeDrain(node.ModifyIndex+1, node.ID, strategy, 101)) + require.Nil(state.UpdateNodeDrain(node.ModifyIndex+1, node.ID, strategy)) // Start node 2 c2 := client.TestClient(t, func(conf *config.Config) { diff --git a/nomad/fsm.go b/nomad/fsm.go index 58d1527514a1..a946d523fd0d 100644 --- a/nomad/fsm.go +++ b/nomad/fsm.go @@ -328,7 +328,7 @@ func (n *nomadFSM) applyDrainUpdate(buf []byte, index uint64) interface{} { panic(fmt.Errorf("failed to decode request: %v", err)) } - if err := n.state.UpdateNodeDrain(index, req.NodeID, req.DrainStrategy, req.UpdateTime); err != nil { + if err := n.state.UpdateNodeDrain(index, req.NodeID, req.DrainStrategy); err != nil { n.logger.Printf("[ERR] nomad.fsm: UpdateNodeDrain failed: %v", err) return err } diff --git a/nomad/fsm_test.go b/nomad/fsm_test.go index 87c3a8b44d3c..922ba2ca61bb 100644 --- a/nomad/fsm_test.go +++ b/nomad/fsm_test.go @@ -301,7 +301,6 @@ func TestFSM_UpdateNodeDrain(t *testing.T) { req2 := structs.NodeUpdateDrainRequest{ NodeID: node.ID, DrainStrategy: strategy, - UpdateTime: 101, } buf, err = structs.Encode(structs.NodeUpdateDrainRequestType, req2) require.Nil(err) diff --git a/nomad/node_endpoint.go b/nomad/node_endpoint.go index 2631939ad13d..2aa02bfce191 100644 --- a/nomad/node_endpoint.go +++ b/nomad/node_endpoint.go @@ -433,9 +433,6 @@ func (n *Node) UpdateDrain(args *structs.NodeUpdateDrainRequest, return fmt.Errorf("node not found") } - // Update the timestamp to - args.UpdateTime = time.Now().Unix() - // Commit this update via Raft _, index, err := n.srv.raftApply(structs.NodeUpdateDrainRequestType, args) if err != nil { diff --git a/nomad/node_endpoint_test.go b/nomad/node_endpoint_test.go index 5ce3cc2728a2..7649cc2dbe06 100644 --- a/nomad/node_endpoint_test.go +++ b/nomad/node_endpoint_test.go @@ -2379,7 +2379,7 @@ func TestClientEndpoint_ListNodes_Blocking(t *testing.T) { Deadline: 10 * time.Second, }, } - if err := state.UpdateNodeDrain(3, node.ID, s, 101); err != nil { + if err := 
state.UpdateNodeDrain(3, node.ID, s); err != nil { t.Fatalf("err: %v", err) } }) diff --git a/nomad/state/state_store.go b/nomad/state/state_store.go index 45c595d04b36..cfef5e4831d8 100644 --- a/nomad/state/state_store.go +++ b/nomad/state/state_store.go @@ -616,8 +616,7 @@ func (s *StateStore) UpdateNodeStatus(index uint64, nodeID, status string) error } // UpdateNodeDrain is used to update the drain of a node -func (s *StateStore) UpdateNodeDrain(index uint64, nodeID string, - drain *structs.DrainStrategy, updateTime int64) error { +func (s *StateStore) UpdateNodeDrain(index uint64, nodeID string, drain *structs.DrainStrategy) error { txn := s.db.Txn(true) defer txn.Abort() @@ -647,7 +646,6 @@ func (s *StateStore) UpdateNodeDrain(index uint64, nodeID string, } copyNode.ModifyIndex = index - copyNode.StatusUpdatedAt = updateTime // Insert the node if err := txn.Insert("nodes", copyNode); err != nil { diff --git a/nomad/state/state_store_test.go b/nomad/state/state_store_test.go index 3d97cf7f541b..81e17eaabe4f 100644 --- a/nomad/state/state_store_test.go +++ b/nomad/state/state_store_test.go @@ -705,19 +705,18 @@ func TestStateStore_UpdateNodeDrain_Node(t *testing.T) { require.Nil(state.UpsertNode(1000, node)) - expectedTime := int64(101) + // Create a watchset so we can test that update node drain fires the watch + ws := memdb.NewWatchSet() + _, err := state.NodeByID(ws, node.ID) + require.Nil(err) + expectedDrain := &structs.DrainStrategy{ DrainSpec: structs.DrainSpec{ Deadline: -1 * time.Second, }, } - // Create a watchset so we can test that update node drain fires the watch - ws := memdb.NewWatchSet() - _, err := state.NodeByID(ws, node.ID) - require.Nil(err) - - require.Nil(state.UpdateNodeDrain(1001, node.ID, expectedDrain, expectedTime)) + require.Nil(state.UpdateNodeDrain(1001, node.ID, expectedDrain)) require.True(watchFired(ws)) ws = memdb.NewWatchSet() diff --git a/nomad/structs/structs.go b/nomad/structs/structs.go index eec0964bef6f..d28c89fa8985 100644 --- a/nomad/structs/structs.go +++ b/nomad/structs/structs.go @@ -307,7 +307,6 @@ type NodeUpdateDrainRequest struct { NodeID string Drain bool // TODO Deprecate DrainStrategy *DrainStrategy - UpdateTime int64 WriteRequest } From a7833bc609ff110ca215c30e9f927d224c88d757 Mon Sep 17 00:00:00 2001 From: Alex Dadgar Date: Tue, 27 Feb 2018 09:40:17 -0800 Subject: [PATCH 13/79] Upgrade path --- command/agent/node_endpoint.go | 26 ++++++++++++++++++++++++-- nomad/node_endpoint.go | 10 ++++++++++ 2 files changed, 34 insertions(+), 2 deletions(-) diff --git a/command/agent/node_endpoint.go b/command/agent/node_endpoint.go index b22850873a56..1a937447eb04 100644 --- a/command/agent/node_endpoint.go +++ b/command/agent/node_endpoint.go @@ -2,7 +2,9 @@ package agent import ( "net/http" + "strconv" "strings" + "time" "github.com/hashicorp/nomad/api" "github.com/hashicorp/nomad/nomad/structs" @@ -102,8 +104,28 @@ func (s *HTTPServer) nodeToggleDrain(resp http.ResponseWriter, req *http.Request } var drainRequest api.NodeUpdateDrainRequest - if err := decodeBody(req, &drainRequest); err != nil { - return nil, CodedError(400, err.Error()) + + // COMPAT: Remove in 0.9. Allow the old style enable query param. + // Get the enable parameter + enableRaw := req.URL.Query().Get("enable") + var enable bool + if enableRaw != "" { + var err error + enable, err = strconv.ParseBool(enableRaw) + if err != nil { + return nil, CodedError(400, "invalid enable value") + } + + // Use the force drain to have it keep the same behavior as old clients. 
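		// When enable=false the DrainSpec is left nil, which the server
		// applies as ending any active drain; the node's scheduling
		// eligibility is not changed.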
+ if enable { + drainRequest.DrainSpec = &api.DrainSpec{ + Deadline: -1 * time.Second, + } + } + } else { + if err := decodeBody(req, &drainRequest); err != nil { + return nil, CodedError(400, err.Error()) + } } args := structs.NodeUpdateDrainRequest{ diff --git a/nomad/node_endpoint.go b/nomad/node_endpoint.go index 2aa02bfce191..082491e89370 100644 --- a/nomad/node_endpoint.go +++ b/nomad/node_endpoint.go @@ -433,6 +433,16 @@ func (n *Node) UpdateDrain(args *structs.NodeUpdateDrainRequest, return fmt.Errorf("node not found") } + // COMPAT: Remove in 0.9. Attempt to upgrade the request if it is of the old + // format. + if args.Drain && args.DrainStrategy == nil { + args.DrainStrategy = &structs.DrainStrategy{ + DrainSpec: structs.DrainSpec{ + Deadline: -1 * time.Second, // Force drain + }, + } + } + // Commit this update via Raft _, index, err := n.srv.raftApply(structs.NodeUpdateDrainRequestType, args) if err != nil { From 5c101de72581ce6382b87deebc470fddff3fa5af Mon Sep 17 00:00:00 2001 From: Alex Dadgar Date: Tue, 27 Feb 2018 09:41:55 -0800 Subject: [PATCH 14/79] flag comment --- command/node_drain.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/command/node_drain.go b/command/node_drain.go index c27068b97e59..0b92a0a8e990 100644 --- a/command/node_drain.go +++ b/command/node_drain.go @@ -54,7 +54,7 @@ Node Drain Options: -ignore-system Ignore system allows the drain to complete without stopping system job - allocations. + allocations. By default system jobs are stopped last. -self Set the drain status of the local node. From dcafa8b46027178a4bb602462f4b209f5a9fe253 Mon Sep 17 00:00:00 2001 From: Alex Dadgar Date: Mon, 26 Feb 2018 16:34:42 -0800 Subject: [PATCH 15/79] RPC/FSM/State Store for Eligibility --- nomad/fsm.go | 16 ++++++ nomad/fsm_test.go | 61 ++++++++++++++++++++++ nomad/node_endpoint.go | 56 ++++++++++++++++++++ nomad/node_endpoint_test.go | 91 +++++++++++++++++++++++++++++++++ nomad/state/state_store.go | 46 ++++++++++++++++- nomad/state/state_store_test.go | 46 +++++++++++++++++ nomad/structs/structs.go | 8 +++ 7 files changed, 322 insertions(+), 2 deletions(-) diff --git a/nomad/fsm.go b/nomad/fsm.go index a946d523fd0d..7df2582dbb72 100644 --- a/nomad/fsm.go +++ b/nomad/fsm.go @@ -242,6 +242,8 @@ func (n *nomadFSM) Apply(log *raft.Log) interface{} { return n.applyBatchDeregisterJob(buf[1:], log.Index) case structs.AllocUpdateDesiredTransitionRequestType: return n.applyAllocUpdateDesiredTransition(buf[1:], log.Index) + case structs.NodeUpdateEligibilityRequestType: + return n.applyNodeEligibilityUpdate(buf[1:], log.Index) } // Check enterprise only message types. 
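	// Note: NodeUpdateEligibilityRequestType is appended after the existing
	// MessageType constants so previously written Raft log entries keep their
	// numeric values.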
@@ -335,6 +337,20 @@ func (n *nomadFSM) applyDrainUpdate(buf []byte, index uint64) interface{} { return nil } +func (n *nomadFSM) applyNodeEligibilityUpdate(buf []byte, index uint64) interface{} { + defer metrics.MeasureSince([]string{"nomad", "fsm", "node_eligibility_update"}, time.Now()) + var req structs.NodeUpdateEligibilityRequest + if err := structs.Decode(buf, &req); err != nil { + panic(fmt.Errorf("failed to decode request: %v", err)) + } + + if err := n.state.UpdateNodeEligibility(index, req.NodeID, req.Eligibility); err != nil { + n.logger.Printf("[ERR] nomad.fsm: UpdateNodeEligibility failed: %v", err) + return err + } + return nil +} + func (n *nomadFSM) applyUpsertJob(buf []byte, index uint64) interface{} { defer metrics.MeasureSince([]string{"nomad", "fsm", "register_job"}, time.Now()) var req structs.JobRegisterRequest diff --git a/nomad/fsm_test.go b/nomad/fsm_test.go index 922ba2ca61bb..b834b432f5ff 100644 --- a/nomad/fsm_test.go +++ b/nomad/fsm_test.go @@ -316,6 +316,67 @@ func TestFSM_UpdateNodeDrain(t *testing.T) { require.Equal(node.DrainStrategy, strategy) } +func TestFSM_UpdateNodeEligibility(t *testing.T) { + t.Parallel() + require := require.New(t) + fsm := testFSM(t) + + node := mock.Node() + req := structs.NodeRegisterRequest{ + Node: node, + } + buf, err := structs.Encode(structs.NodeRegisterRequestType, req) + require.Nil(err) + + resp := fsm.Apply(makeLog(buf)) + require.Nil(resp) + + // Set the eligibility + req2 := structs.NodeUpdateEligibilityRequest{ + NodeID: node.ID, + Eligibility: structs.NodeSchedulingIneligible, + } + buf, err = structs.Encode(structs.NodeUpdateEligibilityRequestType, req2) + require.Nil(err) + + resp = fsm.Apply(makeLog(buf)) + require.Nil(resp) + + // Lookup the node and check + node, err = fsm.State().NodeByID(nil, req.Node.ID) + require.Nil(err) + require.Equal(node.SchedulingEligibility, structs.NodeSchedulingIneligible) + + // Update the drain + strategy := &structs.DrainStrategy{ + DrainSpec: structs.DrainSpec{ + Deadline: 10 * time.Second, + }, + } + req3 := structs.NodeUpdateDrainRequest{ + NodeID: node.ID, + DrainStrategy: strategy, + } + buf, err = structs.Encode(structs.NodeUpdateDrainRequestType, req3) + require.Nil(err) + resp = fsm.Apply(makeLog(buf)) + require.Nil(resp) + + // Try forcing eligibility + req4 := structs.NodeUpdateEligibilityRequest{ + NodeID: node.ID, + Eligibility: structs.NodeSchedulingEligible, + } + buf, err = structs.Encode(structs.NodeUpdateEligibilityRequestType, req4) + require.Nil(err) + + resp = fsm.Apply(makeLog(buf)) + require.NotNil(resp) + err, ok := resp.(error) + require.True(ok) + require.Contains(err.Error(), "draining") +} + func TestFSM_RegisterJob(t *testing.T) { t.Parallel() fsm := testFSM(t) diff --git a/nomad/node_endpoint.go b/nomad/node_endpoint.go index 082491e89370..f46de16618df 100644 --- a/nomad/node_endpoint.go +++ b/nomad/node_endpoint.go @@ -456,6 +456,62 @@ func (n *Node) UpdateDrain(args *structs.NodeUpdateDrainRequest, return nil } +// UpdateEligibility is used to update the scheduling eligibility of a node +func (n *Node) UpdateEligibility(args *structs.NodeUpdateEligibilityRequest, + reply *structs.GenericResponse) error { + if done, err := n.srv.forward("Node.UpdateEligibility", args, args, reply); done { + return err + } + defer metrics.MeasureSince([]string{"nomad", "client", "update_eligibility"}, time.Now()) + + // Check node write permissions + if aclObj, err := n.srv.ResolveToken(args.AuthToken); err != nil { + return err + } else if aclObj != nil && 
!aclObj.AllowNodeWrite() { + return structs.ErrPermissionDenied + } + + // Verify the arguments + if args.NodeID == "" { + return fmt.Errorf("missing node ID for setting scheduling eligibility") + } + + // Look for the node + snap, err := n.srv.fsm.State().Snapshot() + if err != nil { + return err + } + ws := memdb.NewWatchSet() + node, err := snap.NodeByID(ws, args.NodeID) + if err != nil { + return err + } + if node == nil { + return fmt.Errorf("node not found") + } + + if node.DrainStrategy != nil && args.Eligibility == structs.NodeSchedulingEligible { + return fmt.Errorf("can not set node's scheduling eligibility to eligible while it is draining") + } + + // Commit this update via Raft + outErr, index, err := n.srv.raftApply(structs.NodeUpdateEligibilityRequestType, args) + if err != nil { + n.srv.logger.Printf("[ERR] nomad.client: eligibility update failed: %v", err) + return err + } + if outErr != nil { + if err, ok := outErr.(error); ok && err != nil { + n.srv.logger.Printf("[ERR] nomad.client: eligibility update failed: %v", err) + return err + } + } + + // Set the reply index + reply.Index = index + return nil +} + // Evaluate is used to force a re-evaluation of the node func (n *Node) Evaluate(args *structs.NodeEvaluateRequest, reply *structs.NodeUpdateResponse) error { if done, err := n.srv.forward("Node.Evaluate", args, args, reply); done { diff --git a/nomad/node_endpoint_test.go b/nomad/node_endpoint_test.go index 7649cc2dbe06..87c418d0d8a9 100644 --- a/nomad/node_endpoint_test.go +++ b/nomad/node_endpoint_test.go @@ -980,6 +980,97 @@ func TestClientEndpoint_Drain_Down(t *testing.T) { }) } +func TestClientEndpoint_UpdateEligibility(t *testing.T) { + t.Parallel() + require := require.New(t) + s1 := TestServer(t, nil) + defer s1.Shutdown() + codec := rpcClient(t, s1) + testutil.WaitForLeader(t, s1.RPC) + + // Create the register request + node := mock.Node() + reg := &structs.NodeRegisterRequest{ + Node: node, + WriteRequest: structs.WriteRequest{Region: "global"}, + } + + // Fetch the response + var resp structs.NodeUpdateResponse + require.Nil(msgpackrpc.CallWithCodec(codec, "Node.Register", reg, &resp)) + + // Update the eligibility + dereg := &structs.NodeUpdateEligibilityRequest{ + NodeID: node.ID, + Eligibility: structs.NodeSchedulingIneligible, + WriteRequest: structs.WriteRequest{Region: "global"}, + } + var resp2 structs.GenericResponse + require.Nil(msgpackrpc.CallWithCodec(codec, "Node.UpdateEligibility", dereg, &resp2)) + require.NotZero(resp2.Index) + + // Check for the node in the FSM + state := s1.fsm.State() + out, err := state.NodeByID(nil, node.ID) + require.Nil(err) + require.Equal(out.SchedulingEligibility, structs.NodeSchedulingIneligible) +} + +func TestClientEndpoint_UpdateEligibility_ACL(t *testing.T) { + t.Parallel() + s1, root := TestACLServer(t, nil) + defer s1.Shutdown() + codec := rpcClient(t, s1) + testutil.WaitForLeader(t, s1.RPC) + require := require.New(t) + + // Create the node + node := mock.Node() + state := s1.fsm.State() + + require.Nil(state.UpsertNode(1, node), "UpsertNode") + + // Create the policy and tokens + validToken := mock.CreatePolicyAndToken(t, state, 1001, "test-valid", mock.NodePolicy(acl.PolicyWrite)) + invalidToken := mock.CreatePolicyAndToken(t, state, 1003, "test-invalid", mock.NodePolicy(acl.PolicyRead)) + + // Update the status without a token and expect failure + dereg := &structs.NodeUpdateEligibilityRequest{ + NodeID: node.ID, + Eligibility: structs.NodeSchedulingIneligible, + WriteRequest: 
structs.WriteRequest{Region: "global"}, + } + { + var resp structs.GenericResponse + err := msgpackrpc.CallWithCodec(codec, "Node.UpdateEligibility", dereg, &resp) + require.NotNil(err, "RPC") + require.Equal(err.Error(), structs.ErrPermissionDenied.Error()) + } + + // Try with a valid token + dereg.AuthToken = validToken.SecretID + { + var resp structs.GenericResponse + require.Nil(msgpackrpc.CallWithCodec(codec, "Node.UpdateEligibility", dereg, &resp), "RPC") + } + + // Try with a invalid token + dereg.AuthToken = invalidToken.SecretID + { + var resp structs.GenericResponse + err := msgpackrpc.CallWithCodec(codec, "Node.UpdateEligibility", dereg, &resp) + require.NotNil(err, "RPC") + require.Equal(err.Error(), structs.ErrPermissionDenied.Error()) + } + + // Try with a root token + dereg.AuthToken = root.SecretID + { + var resp structs.GenericResponse + require.Nil(msgpackrpc.CallWithCodec(codec, "Node.UpdateEligibility", dereg, &resp), "RPC") + } +} + func TestClientEndpoint_GetNode(t *testing.T) { t.Parallel() s1 := TestServer(t, nil) diff --git a/nomad/state/state_store.go b/nomad/state/state_store.go index cfef5e4831d8..e48a940e86b5 100644 --- a/nomad/state/state_store.go +++ b/nomad/state/state_store.go @@ -509,7 +509,7 @@ func (s *StateStore) DeleteDeployment(index uint64, deploymentIDs []string) erro // UpsertNode is used to register a node or update a node definition // This is assumed to be triggered by the client, so we retain the value -// of drain which is set by the scheduler. +// of drain/eligibility which is set by the scheduler. func (s *StateStore) UpsertNode(index uint64, node *structs.Node) error { txn := s.db.Txn(true) defer txn.Abort() @@ -525,10 +525,12 @@ func (s *StateStore) UpsertNode(index uint64, node *structs.Node) error { exist := existing.(*structs.Node) node.CreateIndex = exist.CreateIndex node.ModifyIndex = index - node.Drain = exist.Drain // Retain the drain mode // Retain node events that have already been set on the node node.Events = exist.Events + + node.Drain = exist.Drain // Retain the drain mode + node.SchedulingEligibility = exist.SchedulingEligibility // Retain the eligibility } else { // Because this is the first time the node is being registered, we should // also create a node registration event @@ -659,6 +661,46 @@ func (s *StateStore) UpdateNodeDrain(index uint64, nodeID string, drain *structs return nil } +// UpdateNodeEligibility is used to update the scheduling eligibility of a node +func (s *StateStore) UpdateNodeEligibility(index uint64, nodeID string, eligibility string) error { + + txn := s.db.Txn(true) + defer txn.Abort() + + // Lookup the node + existing, err := txn.First("nodes", "id", nodeID) + if err != nil { + return fmt.Errorf("node lookup failed: %v", err) + } + if existing == nil { + return fmt.Errorf("node not found") + } + + // Copy the existing node + existingNode := existing.(*structs.Node) + copyNode := existingNode.Copy() + + // Check if this is a valid action + if copyNode.DrainStrategy != nil && eligibility == structs.NodeSchedulingEligible { + return fmt.Errorf("can not set node's scheduling eligibility to eligible while it is draining") + } + + // Update the eligibility in the copy + copyNode.SchedulingEligibility = eligibility + copyNode.ModifyIndex = index + + // Insert the node + if err := txn.Insert("nodes", copyNode); err != nil { + return fmt.Errorf("node update failed: %v", err) + } + if err := txn.Insert("index", &IndexEntry{"nodes", index}); err != nil { + return fmt.Errorf("index update failed: %v", err) + 
} + + txn.Commit() + return nil +} + // UpsertNodeEvents adds the node events to the nodes, rotating events as // necessary. func (s *StateStore) UpsertNodeEvents(index uint64, nodeEvents map[string][]*structs.NodeEvent) error { diff --git a/nomad/state/state_store_test.go b/nomad/state/state_store_test.go index 81e17eaabe4f..1bf1467deda5 100644 --- a/nomad/state/state_store_test.go +++ b/nomad/state/state_store_test.go @@ -822,6 +822,52 @@ func TestStateStore_NodeEvents_RetentionWindow(t *testing.T) { require.Equal(uint64(20), out.Events[len(out.Events)-1].CreateIndex) } +func TestStateStore_UpdateNodeEligibility(t *testing.T) { + require := require.New(t) + state := testStateStore(t) + node := mock.Node() + + err := state.UpsertNode(1000, node) + if err != nil { + t.Fatalf("err: %v", err) + } + + expectedEligibility := structs.NodeSchedulingIneligible + + // Create a watchset so we can test that update node drain fires the watch + ws := memdb.NewWatchSet() + if _, err := state.NodeByID(ws, node.ID); err != nil { + t.Fatalf("bad: %v", err) + } + + require.Nil(state.UpdateNodeEligibility(1001, node.ID, expectedEligibility)) + require.True(watchFired(ws)) + + ws = memdb.NewWatchSet() + out, err := state.NodeByID(ws, node.ID) + require.Nil(err) + require.Equal(out.SchedulingEligibility, expectedEligibility) + require.EqualValues(1001, out.ModifyIndex) + + index, err := state.Index("nodes") + require.Nil(err) + require.EqualValues(1001, index) + require.False(watchFired(ws)) + + // Set a drain strategy + expectedDrain := &structs.DrainStrategy{ + DrainSpec: structs.DrainSpec{ + Deadline: -1 * time.Second, + }, + } + require.Nil(state.UpdateNodeDrain(1002, node.ID, expectedDrain)) + + // Try to set the node to eligible + err = state.UpdateNodeEligibility(1003, node.ID, structs.NodeSchedulingEligible) + require.NotNil(err) + require.Contains(err.Error(), "while it is draining") +} + func TestStateStore_Nodes(t *testing.T) { state := testStateStore(t) var nodes []*structs.Node diff --git a/nomad/structs/structs.go b/nomad/structs/structs.go index d28c89fa8985..e1d9b077752d 100644 --- a/nomad/structs/structs.go +++ b/nomad/structs/structs.go @@ -79,6 +79,7 @@ const ( UpsertNodeEventsType JobBatchDeregisterRequestType AllocUpdateDesiredTransitionRequestType + NodeUpdateEligibilityRequestType ) const ( @@ -310,6 +311,13 @@ type NodeUpdateDrainRequest struct { WriteRequest } +// NodeUpdateEligibilityRequest is used for updating the scheduling eligibility +type NodeUpdateEligibilityRequest struct { + NodeID string + Eligibility string + WriteRequest +} + // NodeEvaluateRequest is used to re-evaluate the node type NodeEvaluateRequest struct { NodeID string From 0fb9ba7732742555b91dd68577c50f0109a87f5e Mon Sep 17 00:00:00 2001 From: Alex Dadgar Date: Tue, 27 Feb 2018 12:59:27 -0800 Subject: [PATCH 16/79] HTTP and API --- api/nodes.go | 28 ++++++++++++ api/nodes_test.go | 66 +++++++++++++++++++++++++++++ command/agent/node_endpoint.go | 23 ++++++++++ command/agent/node_endpoint_test.go | 51 ++++++++++++++++++++++ nomad/node_endpoint.go | 6 +++ 5 files changed, 174 insertions(+) diff --git a/api/nodes.go b/api/nodes.go index 4868fef7cfd4..94fc206ce5ee 100644 --- a/api/nodes.go +++ b/api/nodes.go @@ -4,6 +4,8 @@ import ( "fmt" "sort" "time" + + "github.com/hashicorp/nomad/nomad/structs" ) // Nodes is used to query node-related API endpoints @@ -65,6 +67,32 @@ func (n *Nodes) UpdateDrain(nodeID string, spec *DrainSpec, q *WriteOptions) (*W return wm, nil } +// NodeUpdateEligibilityRequest is used to update 
the drain specification for a node. +type NodeUpdateEligibilityRequest struct { + // NodeID is the node to update the drain specification for. + NodeID string + Eligibility string +} + +// ToggleEligibility is used to update the scheduling eligibility of the node +func (n *Nodes) ToggleEligibility(nodeID string, eligible bool, q *WriteOptions) (*WriteMeta, error) { + e := structs.NodeSchedulingEligible + if !eligible { + e = structs.NodeSchedulingIneligible + } + + req := &NodeUpdateEligibilityRequest{ + NodeID: nodeID, + Eligibility: e, + } + + wm, err := n.client.write("/v1/node/"+nodeID+"/eligibility", req, nil, q) + if err != nil { + return nil, err + } + return wm, nil +} + // Allocations is used to return the allocations associated with a node. func (n *Nodes) Allocations(nodeID string, q *QueryOptions) ([]*Allocation, *QueryMeta, error) { var resp []*Allocation diff --git a/api/nodes_test.go b/api/nodes_test.go index e2c0a3c78136..22d61c4011af 100644 --- a/api/nodes_test.go +++ b/api/nodes_test.go @@ -212,6 +212,72 @@ func TestNodes_ToggleDrain(t *testing.T) { } } +func TestNodes_ToggleEligibility(t *testing.T) { + t.Parallel() + c, s := makeClient(t, nil, func(c *testutil.TestServerConfig) { + c.DevMode = true + }) + defer s.Stop() + nodes := c.Nodes() + + // Wait for node registration and get the ID + var nodeID string + testutil.WaitForResult(func() (bool, error) { + out, _, err := nodes.List(nil) + if err != nil { + return false, err + } + if n := len(out); n != 1 { + return false, fmt.Errorf("expected 1 node, got: %d", n) + } + nodeID = out[0].ID + return true, nil + }, func(err error) { + t.Fatalf("err: %s", err) + }) + + // Check for eligibility + out, _, err := nodes.Info(nodeID, nil) + if err != nil { + t.Fatalf("err: %s", err) + } + if out.SchedulingEligibility != structs.NodeSchedulingEligible { + t.Fatalf("node should be eligible") + } + + // Toggle it off + wm, err := nodes.ToggleEligibility(nodeID, false, nil) + if err != nil { + t.Fatalf("err: %s", err) + } + assertWriteMeta(t, wm) + + // Check again + out, _, err = nodes.Info(nodeID, nil) + if err != nil { + t.Fatalf("err: %s", err) + } + if out.SchedulingEligibility != structs.NodeSchedulingIneligible { + t.Fatalf("bad eligibility: %v vs %v", out.SchedulingEligibility, structs.NodeSchedulingIneligible) + } + + // Toggle on + wm, err = nodes.ToggleEligibility(nodeID, true, nil) + if err != nil { + t.Fatalf("err: %s", err) + } + assertWriteMeta(t, wm) + + // Check again + out, _, err = nodes.Info(nodeID, nil) + if err != nil { + t.Fatalf("err: %s", err) + } + if out.SchedulingEligibility != structs.NodeSchedulingEligible { + t.Fatalf("bad eligibility: %v vs %v", out.SchedulingEligibility, structs.NodeSchedulingEligible) + } +} + func TestNodes_Allocations(t *testing.T) { t.Parallel() c, s := makeClient(t, nil, nil) diff --git a/command/agent/node_endpoint.go b/command/agent/node_endpoint.go index 1a937447eb04..a86df751c1ab 100644 --- a/command/agent/node_endpoint.go +++ b/command/agent/node_endpoint.go @@ -44,6 +44,9 @@ func (s *HTTPServer) NodeSpecificRequest(resp http.ResponseWriter, req *http.Req case strings.HasSuffix(path, "/drain"): nodeName := strings.TrimSuffix(path, "/drain") return s.nodeToggleDrain(resp, req, nodeName) + case strings.HasSuffix(path, "/eligibility"): + nodeName := strings.TrimSuffix(path, "/eligibility") + return s.nodeToggleEligibility(resp, req, nodeName) case strings.HasSuffix(path, "/purge"): nodeName := strings.TrimSuffix(path, "/purge") return s.nodePurge(resp, req, nodeName) @@ -149,6 
+152,26 @@ func (s *HTTPServer) nodeToggleDrain(resp http.ResponseWriter, req *http.Request return out, nil } +func (s *HTTPServer) nodeToggleEligibility(resp http.ResponseWriter, req *http.Request, + nodeID string) (interface{}, error) { + if req.Method != "PUT" && req.Method != "POST" { + return nil, CodedError(405, ErrInvalidMethod) + } + + var drainRequest structs.NodeUpdateEligibilityRequest + if err := decodeBody(req, &drainRequest); err != nil { + return nil, CodedError(400, err.Error()) + } + s.parseWriteRequest(req, &drainRequest.WriteRequest) + + var out structs.GenericResponse + if err := s.agent.RPC("Node.UpdateEligibility", &drainRequest, &out); err != nil { + return nil, err + } + setIndex(resp, out.Index) + return nil, nil +} + func (s *HTTPServer) nodeQuery(resp http.ResponseWriter, req *http.Request, nodeID string) (interface{}, error) { if req.Method != "GET" { diff --git a/command/agent/node_endpoint_test.go b/command/agent/node_endpoint_test.go index ac1bd00b7286..e208f59b72e8 100644 --- a/command/agent/node_endpoint_test.go +++ b/command/agent/node_endpoint_test.go @@ -302,6 +302,57 @@ func TestHTTP_NodeDrain(t *testing.T) { }) } +func TestHTTP_NodeEligble(t *testing.T) { + t.Parallel() + require := require.New(t) + httpTest(t, nil, func(s *TestAgent) { + // Create the node + node := mock.Node() + args := structs.NodeRegisterRequest{ + Node: node, + WriteRequest: structs.WriteRequest{Region: "global"}, + } + var resp structs.NodeUpdateResponse + require.Nil(s.Agent.RPC("Node.Register", &args, &resp)) + + drainReq := api.NodeUpdateEligibilityRequest{ + NodeID: node.ID, + Eligibility: structs.NodeSchedulingIneligible, + } + + // Make the HTTP request + buf := encodeReq(drainReq) + req, err := http.NewRequest("POST", "/v1/node/"+node.ID+"/eligibility", buf) + require.Nil(err) + respW := httptest.NewRecorder() + + // Make the request + _, err = s.Server.NodeSpecificRequest(respW, req) + require.Nil(err) + + // Check for the index + require.NotZero(respW.HeaderMap.Get("X-Nomad-Index")) + + // Check that the node has been updated + state := s.Agent.server.State() + out, err := state.NodeByID(nil, node.ID) + require.Nil(err) + require.Equal(structs.NodeSchedulingIneligible, out.SchedulingEligibility) + + // Make the HTTP request to set something invalid + drainReq.Eligibility = "foo" + buf = encodeReq(drainReq) + req, err = http.NewRequest("POST", "/v1/node/"+node.ID+"/eligibility", buf) + require.Nil(err) + respW = httptest.NewRecorder() + + // Make the request + _, err = s.Server.NodeSpecificRequest(respW, req) + require.NotNil(err) + require.Contains(err.Error(), "invalid") + }) +} + func TestHTTP_NodePurge(t *testing.T) { t.Parallel() httpTest(t, nil, func(s *TestAgent) { diff --git a/nomad/node_endpoint.go b/nomad/node_endpoint.go index f46de16618df..5cf5aa587d7e 100644 --- a/nomad/node_endpoint.go +++ b/nomad/node_endpoint.go @@ -494,6 +494,12 @@ func (n *Node) UpdateEligibility(args *structs.NodeUpdateEligibilityRequest, return fmt.Errorf("can not set node's scheduling eligibility to eligible while it is draining") } + switch args.Eligibility { + case structs.NodeSchedulingEligible, structs.NodeSchedulingIneligible: + default: + return fmt.Errorf("invalid scheduling eligibility %q", args.Eligibility) + } + // Commit this update via Raft outErr, index, err := n.srv.raftApply(structs.NodeUpdateEligibilityRequestType, args) if err != nil { From 378c56629405f5c1dcd9dcf1e871a84b109e9ef7 Mon Sep 17 00:00:00 2001 From: Alex Dadgar Date: Tue, 27 Feb 2018 13:54:27 -0800 
Subject: [PATCH 17/79] node eligibility command --- command/node_drain.go | 2 + command/node_eligibility.go | 168 +++++++++++++++++++++++++++++++ command/node_eligibility_test.go | 124 +++++++++++++++++++++++ commands.go | 5 + main.go | 2 +- 5 files changed, 300 insertions(+), 1 deletion(-) create mode 100644 command/node_eligibility.go create mode 100644 command/node_eligibility_test.go diff --git a/command/node_drain.go b/command/node_drain.go index 0b92a0a8e990..18bd695c4256 100644 --- a/command/node_drain.go +++ b/command/node_drain.go @@ -269,5 +269,7 @@ func (c *NodeDrainCommand) Run(args []string) int { c.Ui.Error(fmt.Sprintf("Error updating drain specification: %s", err)) return 1 } + + c.Ui.Output(fmt.Sprintf("Node %q drain strategy set", node.ID)) return 0 } diff --git a/command/node_eligibility.go b/command/node_eligibility.go new file mode 100644 index 000000000000..2db14ddc29b8 --- /dev/null +++ b/command/node_eligibility.go @@ -0,0 +1,168 @@ +package command + +import ( + "fmt" + "strings" + + "github.com/hashicorp/nomad/api/contexts" + "github.com/posener/complete" +) + +type NodeEligibilityCommand struct { + Meta +} + +func (c *NodeEligibilityCommand) Help() string { + helpText := ` +Usage: nomad node eligibility [options] + + Toggles the nodes scheduling eligibility. When a node is marked as ineligible, + no new allocations will be placed on it but existing allocations will remain. + To remove existing allocations, use the node drain command. + + It is required that either -enable or -disable is specified, but not both. + The -self flag is useful to drain the local node. + +General Options: + + ` + generalOptionsUsage() + ` + +Node Eligibility Options: + + -disable + Mark the specified node as ineligible for new allocations. + + -enable + Mark the specified node as eligible for new allocations. + + -self + Set the eligibility of the local node. +` + return strings.TrimSpace(helpText) +} + +func (c *NodeEligibilityCommand) Synopsis() string { + return "Toggle scheduling eligibility for a given node" +} + +func (c *NodeEligibilityCommand) AutocompleteFlags() complete.Flags { + return mergeAutocompleteFlags(c.Meta.AutocompleteFlags(FlagSetClient), + complete.Flags{ + "-disable": complete.PredictNothing, + "-enable": complete.PredictNothing, + "-self": complete.PredictNothing, + }) +} + +func (c *NodeEligibilityCommand) AutocompleteArgs() complete.Predictor { + return complete.PredictFunc(func(a complete.Args) []string { + client, err := c.Meta.Client() + if err != nil { + return nil + } + + resp, _, err := client.Search().PrefixSearch(a.Last, contexts.Nodes, nil) + if err != nil { + return []string{} + } + return resp.Matches[contexts.Nodes] + }) +} + +func (c *NodeEligibilityCommand) Run(args []string) int { + var enable, disable, self bool + + flags := c.Meta.FlagSet("node-eligibility", FlagSetClient) + flags.Usage = func() { c.Ui.Output(c.Help()) } + flags.BoolVar(&enable, "enable", false, "Mark node as eligibile for scheduling") + flags.BoolVar(&disable, "disable", false, "Mark node as ineligibile for scheduling") + flags.BoolVar(&self, "self", false, "") + + if err := flags.Parse(args); err != nil { + return 1 + } + + // Check that we got either enable or disable, but not both. 
+ if (enable && disable) || (!enable && !disable) { + c.Ui.Error(c.Help()) + return 1 + } + + // Check that we got a node ID + args = flags.Args() + if l := len(args); self && l != 0 || !self && l != 1 { + c.Ui.Error("Node ID must be specified if -self isn't being used") + return 1 + } + + // Get the HTTP client + client, err := c.Meta.Client() + if err != nil { + c.Ui.Error(fmt.Sprintf("Error initializing client: %s", err)) + return 1 + } + + // If -self flag is set then determine the current node. + var nodeID string + if !self { + nodeID = args[0] + } else { + var err error + if nodeID, err = getLocalNodeID(client); err != nil { + c.Ui.Error(err.Error()) + return 1 + } + } + + // Check if node exists + if len(nodeID) == 1 { + c.Ui.Error(fmt.Sprintf("Identifier must contain at least two characters.")) + return 1 + } + + nodeID = sanatizeUUIDPrefix(nodeID) + nodes, _, err := client.Nodes().PrefixList(nodeID) + if err != nil { + c.Ui.Error(fmt.Sprintf("Error toggling drain mode: %s", err)) + return 1 + } + // Return error if no nodes are found + if len(nodes) == 0 { + c.Ui.Error(fmt.Sprintf("No node(s) with prefix or id %q found", nodeID)) + return 1 + } + if len(nodes) > 1 { + // Format the nodes list that matches the prefix so that the user + // can create a more specific request + out := make([]string, len(nodes)+1) + out[0] = "ID|Datacenter|Name|Class|Drain|Status" + for i, node := range nodes { + out[i+1] = fmt.Sprintf("%s|%s|%s|%s|%v|%s", + node.ID, + node.Datacenter, + node.Name, + node.NodeClass, + node.Drain, + node.Status) + } + // Dump the output + c.Ui.Error(fmt.Sprintf("Prefix matched multiple nodes\n\n%s", formatList(out))) + return 1 + } + + // Prefix lookup matched a single node + node, _, err := client.Nodes().Info(nodes[0].ID, nil) + if err != nil { + c.Ui.Error(fmt.Sprintf("Error toggling drain mode: %s", err)) + return 1 + } + + // Toggle node eligibility + if _, err := client.Nodes().ToggleEligibility(node.ID, enable, nil); err != nil { + c.Ui.Error(fmt.Sprintf("Error updating scheduling eligibility: %s", err)) + return 1 + } + + c.Ui.Output(fmt.Sprintf("Node %q scheduling eligibility set", node.ID)) + return 0 +} diff --git a/command/node_eligibility_test.go b/command/node_eligibility_test.go new file mode 100644 index 000000000000..3129fe86a19b --- /dev/null +++ b/command/node_eligibility_test.go @@ -0,0 +1,124 @@ +package command + +import ( + "fmt" + "strings" + "testing" + + "github.com/hashicorp/nomad/testutil" + "github.com/mitchellh/cli" + "github.com/posener/complete" + "github.com/stretchr/testify/assert" +) + +func TestNodeEligibilityCommand_Implements(t *testing.T) { + t.Parallel() + var _ cli.Command = &NodeEligibilityCommand{} +} + +func TestNodeEligibilityCommand_Fails(t *testing.T) { + t.Parallel() + srv, _, url := testServer(t, false, nil) + defer srv.Shutdown() + + ui := new(cli.MockUi) + cmd := &NodeEligibilityCommand{Meta: Meta{Ui: ui}} + + // Fails on misuse + if code := cmd.Run([]string{"some", "bad", "args"}); code != 1 { + t.Fatalf("expected exit code 1, got: %d", code) + } + if out := ui.ErrorWriter.String(); !strings.Contains(out, cmd.Help()) { + t.Fatalf("expected help output, got: %s", out) + } + ui.ErrorWriter.Reset() + + // Fails on connection failure + if code := cmd.Run([]string{"-address=nope", "-enable", "12345678-abcd-efab-cdef-123456789abc"}); code != 1 { + t.Fatalf("expected exit code 1, got: %d", code) + } + if out := ui.ErrorWriter.String(); !strings.Contains(out, "Error toggling") { + t.Fatalf("expected failed toggle error, got: 
%s", out) + } + ui.ErrorWriter.Reset() + + // Fails on non-existent node + if code := cmd.Run([]string{"-address=" + url, "-enable", "12345678-abcd-efab-cdef-123456789abc"}); code != 1 { + t.Fatalf("expected exit 1, got: %d", code) + } + if out := ui.ErrorWriter.String(); !strings.Contains(out, "No node(s) with prefix or id") { + t.Fatalf("expected not exist error, got: %s", out) + } + ui.ErrorWriter.Reset() + + // Fails if both enable and disable specified + if code := cmd.Run([]string{"-enable", "-disable", "12345678-abcd-efab-cdef-123456789abc"}); code != 1 { + t.Fatalf("expected exit 1, got: %d", code) + } + if out := ui.ErrorWriter.String(); !strings.Contains(out, cmd.Help()) { + t.Fatalf("expected help output, got: %s", out) + } + ui.ErrorWriter.Reset() + + // Fails if neither enable or disable specified + if code := cmd.Run([]string{"12345678-abcd-efab-cdef-123456789abc"}); code != 1 { + t.Fatalf("expected exit 1, got: %d", code) + } + if out := ui.ErrorWriter.String(); !strings.Contains(out, cmd.Help()) { + t.Fatalf("expected help output, got: %s", out) + } + ui.ErrorWriter.Reset() + + // Fail on identifier with too few characters + if code := cmd.Run([]string{"-address=" + url, "-enable", "1"}); code != 1 { + t.Fatalf("expected exit 1, got: %d", code) + } + if out := ui.ErrorWriter.String(); !strings.Contains(out, "must contain at least two characters.") { + t.Fatalf("expected too few characters error, got: %s", out) + } + ui.ErrorWriter.Reset() + + // Identifiers with uneven length should produce a query result + if code := cmd.Run([]string{"-address=" + url, "-enable", "123"}); code != 1 { + t.Fatalf("expected exit 1, got: %d", code) + } + if out := ui.ErrorWriter.String(); !strings.Contains(out, "No node(s) with prefix or id") { + t.Fatalf("expected not exist error, got: %s", out) + } + ui.ErrorWriter.Reset() +} + +func TestNodeEligibilityCommand_AutocompleteArgs(t *testing.T) { + assert := assert.New(t) + t.Parallel() + + srv, client, url := testServer(t, true, nil) + defer srv.Shutdown() + + // Wait for a node to appear + var nodeID string + testutil.WaitForResult(func() (bool, error) { + nodes, _, err := client.Nodes().List(nil) + if err != nil { + return false, err + } + if len(nodes) == 0 { + return false, fmt.Errorf("missing node") + } + nodeID = nodes[0].ID + return true, nil + }, func(err error) { + t.Fatalf("err: %s", err) + }) + + ui := new(cli.MockUi) + cmd := &NodeEligibilityCommand{Meta: Meta{Ui: ui, flagAddress: url}} + + prefix := nodeID[:len(nodeID)-5] + args := complete.Args{Last: prefix} + predictor := cmd.AutocompleteArgs() + + res := predictor.Predict(args) + assert.Equal(1, len(res)) + assert.Equal(nodeID, res[0]) +} diff --git a/commands.go b/commands.go index 9e27664af896..0b3a422f0348 100644 --- a/commands.go +++ b/commands.go @@ -273,6 +273,11 @@ func Commands(metaPtr *command.Meta) map[string]cli.CommandFactory { Meta: meta, }, nil }, + "node eligibility": func() (cli.Command, error) { + return &command.NodeEligibilityCommand{ + Meta: meta, + }, nil + }, "node-status": func() (cli.Command, error) { return &command.NodeStatusCommand{ Meta: meta, diff --git a/main.go b/main.go index 3c178e145709..f482ca2838bd 100644 --- a/main.go +++ b/main.go @@ -47,7 +47,7 @@ func RunCustom(args []string, commands map[string]cli.CommandFactory) int { // users should not be running should be placed here, versus hiding // subcommands from the main help, which should be filtered out of the // commands above. 
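The command above is invoked as "nomad node eligibility -enable|-disable <node-id>" (or with -self for the local node) and ultimately drives the /v1/node/<id>/eligibility endpoint added earlier in this series. For completeness, here is a rough, untested sketch of performing the same toggle directly from Go through the api package; the node ID is hypothetical and error handling is minimal:

package main

import (
	"fmt"
	"log"

	"github.com/hashicorp/nomad/api"
)

func main() {
	// Connect to the local agent (honors NOMAD_ADDR and related settings).
	client, err := api.NewClient(api.DefaultConfig())
	if err != nil {
		log.Fatal(err)
	}

	// Hypothetical node ID; use a real UUID from node status output.
	nodeID := "12345678-abcd-efab-cdef-123456789abc"

	// false marks the node ineligible for new placements; existing
	// allocations keep running until the node is drained.
	if _, err := client.Nodes().ToggleEligibility(nodeID, false, nil); err != nil {
		log.Fatal(err)
	}
	fmt.Printf("node %s is now ineligible for scheduling\n", nodeID)
}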
- hidden := []string{"check", "executor", "syslog"} + hidden := []string{"check", "executor", "syslog", "node-drain", "node-status"} cli := &cli.CLI{ Name: "nomad", From d6399cb733e764cd6fa0e5328868b91e361a85e9 Mon Sep 17 00:00:00 2001 From: Alex Dadgar Date: Tue, 27 Feb 2018 14:00:55 -0800 Subject: [PATCH 18/79] Add eligibility to node view --- api/nodes.go | 23 ++++++++++++----------- command/node_status.go | 12 ++++++++---- 2 files changed, 20 insertions(+), 15 deletions(-) diff --git a/api/nodes.go b/api/nodes.go index 94fc206ce5ee..37adb8fc34e5 100644 --- a/api/nodes.go +++ b/api/nodes.go @@ -241,17 +241,18 @@ type HostDiskStats struct { // NodeListStub is a subset of information returned during // node list operations. type NodeListStub struct { - Address string - ID string - Datacenter string - Name string - NodeClass string - Version string - Drain bool - Status string - StatusDescription string - CreateIndex uint64 - ModifyIndex uint64 + Address string + ID string + Datacenter string + Name string + NodeClass string + Version string + Drain bool + SchedulingEligibility string + Status string + StatusDescription string + CreateIndex uint64 + ModifyIndex uint64 } // NodeIndexSort reverse sorts nodes by CreateIndex diff --git a/command/node_status.go b/command/node_status.go index b347b6b78853..c59b8d7e6f8e 100644 --- a/command/node_status.go +++ b/command/node_status.go @@ -183,7 +183,7 @@ func (c *NodeStatusCommand) Run(args []string) int { out[0] += "Address|Version|" } - out[0] += "Drain|Status" + out[0] += "Drain|Eligibility|Status" if c.list_allocs { out[0] += "|Running Allocs" @@ -199,9 +199,11 @@ func (c *NodeStatusCommand) Run(args []string) int { out[i+1] += fmt.Sprintf("|%s|%s", node.Address, node.Version) } - out[i+1] += fmt.Sprintf("|%v|%s", + out[i+1] += fmt.Sprintf("|%v|%s|%s", node.Drain, + node.SchedulingEligibility, node.Status) + if c.list_allocs { numAllocs, err := getRunningAllocs(client, node.ID) if err != nil { @@ -249,14 +251,15 @@ func (c *NodeStatusCommand) Run(args []string) int { // Format the nodes list that matches the prefix so that the user // can create a more specific request out := make([]string, len(nodes)+1) - out[0] = "ID|DC|Name|Class|Drain|Status" + out[0] = "ID|DC|Name|Class|Drain|Eligibility|Status" for i, node := range nodes { - out[i+1] = fmt.Sprintf("%s|%s|%s|%s|%v|%s", + out[i+1] = fmt.Sprintf("%s|%s|%s|%s|%v|%s|%s", limit(node.ID, c.length), node.Datacenter, node.Name, node.NodeClass, node.Drain, + node.SchedulingEligibility, node.Status) } // Dump the output @@ -313,6 +316,7 @@ func (c *NodeStatusCommand) formatNode(client *api.Client, node *api.Node) int { fmt.Sprintf("Class|%s", node.NodeClass), fmt.Sprintf("DC|%s", node.Datacenter), fmt.Sprintf("Drain|%v", node.Drain), + fmt.Sprintf("Eligibility|%s", node.SchedulingEligibility), fmt.Sprintf("Status|%s", node.Status), fmt.Sprintf("Drivers|%s", strings.Join(nodeDrivers(node), ",")), } From 451b77d5d761e58d5c877b2cbbc6b97b1d2bddf1 Mon Sep 17 00:00:00 2001 From: Alex Dadgar Date: Tue, 27 Feb 2018 14:08:29 -0800 Subject: [PATCH 19/79] Unblock evals once eligible --- nomad/fsm.go | 15 +++++++++++++ nomad/fsm_test.go | 54 +++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 69 insertions(+) diff --git a/nomad/fsm.go b/nomad/fsm.go index 7df2582dbb72..b377f09b3fef 100644 --- a/nomad/fsm.go +++ b/nomad/fsm.go @@ -344,10 +344,25 @@ func (n *nomadFSM) applyNodeEligibilityUpdate(buf []byte, index uint64) interfac panic(fmt.Errorf("failed to decode request: %v", err)) } + // Lookup 
the existing node + node, err := n.state.NodeByID(nil, req.NodeID) + if err != nil { + n.logger.Printf("[ERR] nomad.fsm: UpdateNodeEligibility failed to lookup node %q: %v", req.NodeID, err) + return err + } + if err := n.state.UpdateNodeEligibility(index, req.NodeID, req.Eligibility); err != nil { n.logger.Printf("[ERR] nomad.fsm: UpdateNodeEligibility failed: %v", err) return err } + + // Unblock evals for the nodes computed node class if it is in a ready + // state. + if node != nil && node.SchedulingEligibility == structs.NodeSchedulingIneligible && + req.Eligibility == structs.NodeSchedulingEligible { + n.blockedEvals.Unblock(node.ComputedClass, index) + } + return nil } diff --git a/nomad/fsm_test.go b/nomad/fsm_test.go index b834b432f5ff..9f8ed205a77e 100644 --- a/nomad/fsm_test.go +++ b/nomad/fsm_test.go @@ -377,6 +377,60 @@ func TestFSM_UpdateNodeEligibility(t *testing.T) { require.Contains(err.Error(), "draining") } +func TestFSM_UpdateNodeEligibility_Unblock(t *testing.T) { + t.Parallel() + require := require.New(t) + fsm := testFSM(t) + + node := mock.Node() + req := structs.NodeRegisterRequest{ + Node: node, + } + buf, err := structs.Encode(structs.NodeRegisterRequestType, req) + require.Nil(err) + + resp := fsm.Apply(makeLog(buf)) + require.Nil(resp) + + // Set the eligibility + req2 := structs.NodeUpdateEligibilityRequest{ + NodeID: node.ID, + Eligibility: structs.NodeSchedulingIneligible, + } + buf, err = structs.Encode(structs.NodeUpdateEligibilityRequestType, req2) + require.Nil(err) + + resp = fsm.Apply(makeLog(buf)) + require.Nil(resp) + + // Mark an eval as blocked. + eval := mock.Eval() + eval.ClassEligibility = map[string]bool{node.ComputedClass: true} + fsm.blockedEvals.Block(eval) + + // Set eligible + req4 := structs.NodeUpdateEligibilityRequest{ + NodeID: node.ID, + Eligibility: structs.NodeSchedulingEligible, + } + buf, err = structs.Encode(structs.NodeUpdateEligibilityRequestType, req4) + require.Nil(err) + + resp = fsm.Apply(makeLog(buf)) + require.Nil(resp) + + // Verify the eval was unblocked. 
+ testutil.WaitForResult(func() (bool, error) { + bStats := fsm.blockedEvals.Stats() + if bStats.TotalBlocked != 0 { + return false, fmt.Errorf("bad: %#v", bStats) + } + return true, nil + }, func(err error) { + t.Fatalf("err: %s", err) + }) +} + func TestFSM_RegisterJob(t *testing.T) { t.Parallel() fsm := testFSM(t) From a96c3374e2abe08022ce716368fd0377977f41d3 Mon Sep 17 00:00:00 2001 From: Alex Dadgar Date: Tue, 27 Feb 2018 14:20:28 -0800 Subject: [PATCH 20/79] Fix retaining the drain --- nomad/state/state_store.go | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/nomad/state/state_store.go b/nomad/state/state_store.go index e48a940e86b5..ef6a51754167 100644 --- a/nomad/state/state_store.go +++ b/nomad/state/state_store.go @@ -531,6 +531,7 @@ func (s *StateStore) UpsertNode(index uint64, node *structs.Node) error { node.Drain = exist.Drain // Retain the drain mode node.SchedulingEligibility = exist.SchedulingEligibility // Retain the eligibility + node.DrainStrategy = exist.DrainStrategy // Retain the drain strategy } else { // Because this is the first time the node is being registered, we should // also create a node registration event @@ -598,8 +599,7 @@ func (s *StateStore) UpdateNodeStatus(index uint64, nodeID, status string) error // Copy the existing node existingNode := existing.(*structs.Node) - copyNode := new(structs.Node) - *copyNode = *existingNode + copyNode := existingNode.Copy() // Update the status in the copy copyNode.Status = status @@ -639,11 +639,7 @@ func (s *StateStore) UpdateNodeDrain(index uint64, nodeID string, drain *structs // Update the drain in the copy copyNode.Drain = drain != nil // COMPAT: Remove in Nomad 0.9 copyNode.DrainStrategy = drain - if drain == nil { - // When stopping a drain unset the strategy but leave the node - // ineligible for scheduling - copyNode.DrainStrategy = nil - } else { + if drain != nil { copyNode.SchedulingEligibility = structs.NodeSchedulingIneligible } From d65ae92dfa180a46eafe9f6a44ddf74e2cb61c49 Mon Sep 17 00:00:00 2001 From: Alex Dadgar Date: Tue, 27 Feb 2018 14:43:35 -0800 Subject: [PATCH 21/79] Small refactor and cleanups --- command/node_drain.go | 17 ++----------- command/node_eligibility.go | 23 ++++-------------- command/node_status.go | 48 +++++++++++++++++++++++++------------ 3 files changed, 40 insertions(+), 48 deletions(-) diff --git a/command/node_drain.go b/command/node_drain.go index 18bd695c4256..f6475c7bedd4 100644 --- a/command/node_drain.go +++ b/command/node_drain.go @@ -204,21 +204,8 @@ func (c *NodeDrainCommand) Run(args []string) int { return 1 } if len(nodes) > 1 { - // Format the nodes list that matches the prefix so that the user - // can create a more specific request - out := make([]string, len(nodes)+1) - out[0] = "ID|Datacenter|Name|Class|Drain|Status" - for i, node := range nodes { - out[i+1] = fmt.Sprintf("%s|%s|%s|%s|%v|%s", - node.ID, - node.Datacenter, - node.Name, - node.NodeClass, - node.Drain, - node.Status) - } - // Dump the output - c.Ui.Error(fmt.Sprintf("Prefix matched multiple nodes\n\n%s", formatList(out))) + c.Ui.Error(fmt.Sprintf("Prefix matched multiple nodes\n\n%s", + formatNodeStubList(nodes, true))) return 1 } diff --git a/command/node_eligibility.go b/command/node_eligibility.go index 2db14ddc29b8..b0bcbc35bccf 100644 --- a/command/node_eligibility.go +++ b/command/node_eligibility.go @@ -21,7 +21,7 @@ Usage: nomad node eligibility [options] To remove existing allocations, use the node drain command. 
It is required that either -enable or -disable is specified, but not both. - The -self flag is useful to drain the local node. + The -self flag is useful to set the scheduling eligibility of the local node. General Options: @@ -123,7 +123,7 @@ func (c *NodeEligibilityCommand) Run(args []string) int { nodeID = sanatizeUUIDPrefix(nodeID) nodes, _, err := client.Nodes().PrefixList(nodeID) if err != nil { - c.Ui.Error(fmt.Sprintf("Error toggling drain mode: %s", err)) + c.Ui.Error(fmt.Sprintf("Error updating scheduling eligibility: %s", err)) return 1 } // Return error if no nodes are found @@ -132,28 +132,15 @@ func (c *NodeEligibilityCommand) Run(args []string) int { return 1 } if len(nodes) > 1 { - // Format the nodes list that matches the prefix so that the user - // can create a more specific request - out := make([]string, len(nodes)+1) - out[0] = "ID|Datacenter|Name|Class|Drain|Status" - for i, node := range nodes { - out[i+1] = fmt.Sprintf("%s|%s|%s|%s|%v|%s", - node.ID, - node.Datacenter, - node.Name, - node.NodeClass, - node.Drain, - node.Status) - } - // Dump the output - c.Ui.Error(fmt.Sprintf("Prefix matched multiple nodes\n\n%s", formatList(out))) + c.Ui.Error(fmt.Sprintf("Prefix matched multiple nodes\n\n%s", + formatNodeStubList(nodes, true))) return 1 } // Prefix lookup matched a single node node, _, err := client.Nodes().Info(nodes[0].ID, nil) if err != nil { - c.Ui.Error(fmt.Sprintf("Error toggling drain mode: %s", err)) + c.Ui.Error(fmt.Sprintf("Error updating scheduling eligibility: %s", err)) return 1 } diff --git a/command/node_status.go b/command/node_status.go index c59b8d7e6f8e..68c72342b11f 100644 --- a/command/node_status.go +++ b/command/node_status.go @@ -248,24 +248,12 @@ func (c *NodeStatusCommand) Run(args []string) int { return 1 } if len(nodes) > 1 { - // Format the nodes list that matches the prefix so that the user - // can create a more specific request - out := make([]string, len(nodes)+1) - out[0] = "ID|DC|Name|Class|Drain|Eligibility|Status" - for i, node := range nodes { - out[i+1] = fmt.Sprintf("%s|%s|%s|%s|%v|%s|%s", - limit(node.ID, c.length), - node.Datacenter, - node.Name, - node.NodeClass, - node.Drain, - node.SchedulingEligibility, - node.Status) - } // Dump the output - c.Ui.Error(fmt.Sprintf("Prefix matched multiple nodes\n\n%s", formatList(out))) + c.Ui.Error(fmt.Sprintf("Prefix matched multiple nodes\n\n%s", + formatNodeStubList(nodes, c.verbose))) return 1 } + // Prefix lookup matched a single node node, _, err := client.Nodes().Info(nodes[0].ID, nil) if err != nil { @@ -641,3 +629,33 @@ func getHostResources(hostStats *api.HostStats, node *api.Node) ([]string, error } return resources, nil } + +// formatNodeStubList is used to return a table format of a list of node stubs. 
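With the shared helper below, a multi-node prefix match is rendered once as a pipe-delimited table rather than being rebuilt at each call site. As a purely illustrative example of the output (all values hypothetical), the header row is ID|DC|Name|Class|Drain|Eligibility|Status and a data row might read f7476465|dc1|client-1|linux|false|eligible|ready, with the full node ID shown only when verbose output is requested.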
+func formatNodeStubList(nodes []*api.NodeListStub, verbose bool) string { + // Return error if no nodes are found + if len(nodes) == 0 { + return "" + } + // Truncate the id unless full length is requested + length := shortId + if verbose { + length = fullId + } + + // Format the nodes list that matches the prefix so that the user + // can create a more specific request + out := make([]string, len(nodes)+1) + out[0] = "ID|DC|Name|Class|Drain|Eligibility|Status" + for i, node := range nodes { + out[i+1] = fmt.Sprintf("%s|%s|%s|%s|%v|%s|%s", + limit(node.ID, length), + node.Datacenter, + node.Name, + node.NodeClass, + node.Drain, + node.SchedulingEligibility, + node.Status) + } + + return formatList(out) +} From 7d58209927da49d09ad7b27edc87eb70eb6af75d Mon Sep 17 00:00:00 2001 From: Alex Dadgar Date: Tue, 27 Feb 2018 14:46:40 -0800 Subject: [PATCH 22/79] code review --- command/agent/node_endpoint_test.go | 2 +- command/node_eligibility.go | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/command/agent/node_endpoint_test.go b/command/agent/node_endpoint_test.go index e208f59b72e8..19ff6e64cc1e 100644 --- a/command/agent/node_endpoint_test.go +++ b/command/agent/node_endpoint_test.go @@ -302,7 +302,7 @@ func TestHTTP_NodeDrain(t *testing.T) { }) } -func TestHTTP_NodeEligble(t *testing.T) { +func TestHTTP_NodeEligible(t *testing.T) { t.Parallel() require := require.New(t) httpTest(t, nil, func(s *TestAgent) { diff --git a/command/node_eligibility.go b/command/node_eligibility.go index b0bcbc35bccf..a3fe5f802cfd 100644 --- a/command/node_eligibility.go +++ b/command/node_eligibility.go @@ -120,7 +120,7 @@ func (c *NodeEligibilityCommand) Run(args []string) int { return 1 } - nodeID = sanatizeUUIDPrefix(nodeID) + nodeID = sanitizeUUIDPrefix(nodeID) nodes, _, err := client.Nodes().PrefixList(nodeID) if err != nil { c.Ui.Error(fmt.Sprintf("Error updating scheduling eligibility: %s", err)) From 5be32632049e4326eaa53b7d9db3a3ed821e5c82 Mon Sep 17 00:00:00 2001 From: Michael Schurter Date: Mon, 26 Feb 2018 16:28:10 -0800 Subject: [PATCH 23/79] refactor drainer into a subpkg --- .../deploymentwatcher/deployments_watcher.go | 4 +- nomad/{ => drainer}/drain.go | 141 +++++++++++------- nomad/{ => drainer}/drain_test.go | 36 ++++- nomad/drainer_shims.go | 30 ++++ nomad/leader.go | 25 +++- nomad/rpc_test.go | 2 +- nomad/server.go | 16 ++ 7 files changed, 176 insertions(+), 78 deletions(-) rename nomad/{ => drainer}/drain.go (85%) rename nomad/{ => drainer}/drain_test.go (89%) create mode 100644 nomad/drainer_shims.go diff --git a/nomad/deploymentwatcher/deployments_watcher.go b/nomad/deploymentwatcher/deployments_watcher.go index d9aab78770fb..a88a1de67f93 100644 --- a/nomad/deploymentwatcher/deployments_watcher.go +++ b/nomad/deploymentwatcher/deployments_watcher.go @@ -102,7 +102,7 @@ func NewDeploymentsWatcher(logger *log.Logger, // SetEnabled is used to control if the watcher is enabled. The watcher // should only be enabled on the active leader. When being enabled the state is // passed in as it is no longer valid once a leader election has taken place. 
-func (w *Watcher) SetEnabled(enabled bool, state *state.StateStore) error { +func (w *Watcher) SetEnabled(enabled bool, state *state.StateStore) { w.l.Lock() defer w.l.Unlock() @@ -120,8 +120,6 @@ func (w *Watcher) SetEnabled(enabled bool, state *state.StateStore) error { if enabled && !wasEnabled { go w.watchDeployments(w.ctx) } - - return nil } // flush is used to clear the state of the watcher diff --git a/nomad/drain.go b/nomad/drainer/drain.go similarity index 85% rename from nomad/drain.go rename to nomad/drainer/drain.go index f0e1dd59b89f..e0c386f4056e 100644 --- a/nomad/drain.go +++ b/nomad/drainer/drain.go @@ -1,4 +1,4 @@ -package nomad +package drainer import ( "context" @@ -71,21 +71,67 @@ func (s *stopAllocs) add(j *structs.Job, a *structs.Allocation) { s.jobBatch[jobKey{a.Namespace, a.JobID}] = j } -// startNodeDrainer should be called in establishLeadership by the leader. -func (s *Server) startNodeDrainer(stopCh chan struct{}) { - state := s.fsm.State() +// RaftApplier contains methods for applying the raft requests required by the +// NodeDrainer. +type RaftApplier interface { + AllocUpdateDesiredTransition(allocs map[string]*structs.DesiredTransition, evals []*structs.Evaluation) error + NodeDrainComplete(nodeID string) error +} - ctx, cancel := context.WithCancel(context.Background()) - defer cancel() - go func() { - select { - case <-stopCh: +type nodeDrainerState struct { + enabled bool + state *state.StateStore +} + +type NodeDrainer struct { + enabledCh chan nodeDrainerState + + raft RaftApplier + + logger *log.Logger +} + +func NewNodeDrainer(logger *log.Logger, raft RaftApplier) *NodeDrainer { + return &NodeDrainer{ + enabledCh: make(chan nodeDrainerState), + raft: raft, + logger: logger, + } +} + +func (n *NodeDrainer) SetEnabled(enabled bool, state *state.StateStore) { + n.enabledCh <- nodeDrainerState{enabled, state} +} + +//FIXME never exits +func (n *NodeDrainer) Run() { + running := false + var ctx context.Context + cancel := func() {} + for s := range n.enabledCh { + switch { + case s.enabled && running: + // Already running + continue + case !s.enabled && !running: + // Already stopped + continue + case !s.enabled && running: + // Stop running node drainer cancel() - case <-ctx.Done(): + running = false + case s.enabled && !running: + // Start running node drainer + ctx, cancel = context.WithCancel(context.Background()) + go n.nodeDrainer(ctx, s.state) + running = true } - }() + } +} - nodes, nodesIndex, drainingJobs, allocsIndex := initDrainer(s.logger, state) +// nodeDrainer should be called in establishLeadership by the leader. 
+func (n *NodeDrainer) nodeDrainer(ctx context.Context, state *state.StateStore) { + nodes, nodesIndex, drainingJobs, allocsIndex := initDrainer(n.logger, state) // Wait for a node's drain deadline to expire var nextDeadline time.Time @@ -102,12 +148,12 @@ func (s *Server) startNodeDrainer(stopCh chan struct{}) { deadlineTimer := time.NewTimer(time.Until(nextDeadline)) // Watch for nodes to start or stop draining - nodeWatcher := newNodeWatcher(s.logger, nodes, nodesIndex, state) + nodeWatcher := newNodeWatcher(n.logger, nodes, nodesIndex, state) go nodeWatcher.run(ctx) // Watch for drained allocations to be replaced // Watch for changes in allocs for jobs with allocs on draining nodes - jobWatcher := newJobWatcher(s.logger, drainingJobs, allocsIndex, state) + jobWatcher := newJobWatcher(n.logger, drainingJobs, allocsIndex, state) go jobWatcher.run(ctx) for { @@ -116,11 +162,11 @@ func (s *Server) startNodeDrainer(stopCh chan struct{}) { //possible outcome of this is that an allocation could be //stopped on a node that recently had its drain cancelled which //doesn't seem like that bad of a pathological case - s.logger.Printf("[TRACE] nomad.drain: LOOP next deadline: %s (%s)", nextDeadline, time.Until(nextDeadline)) + n.logger.Printf("[TRACE] nomad.drain: LOOP next deadline: %s (%s)", nextDeadline, time.Until(nextDeadline)) select { case nodes = <-nodeWatcher.nodesCh: // update draining nodes - s.logger.Printf("[TRACE] nomad.drain: running due to node change (%d nodes draining)", len(nodes)) + n.logger.Printf("[TRACE] nomad.drain: running due to node change (%d nodes draining)", len(nodes)) // update deadline timer changed := false @@ -139,7 +185,7 @@ func (s *Server) startNodeDrainer(stopCh chan struct{}) { // if changed reset the timer if changed { - s.logger.Printf("[TRACE] nomad.drain: new node deadline: %s", nextDeadline) + n.logger.Printf("[TRACE] nomad.drain: new node deadline: %s", nextDeadline) if !deadlineTimer.Stop() { // timer may have been recv'd in a // previous loop, so don't block @@ -152,10 +198,10 @@ func (s *Server) startNodeDrainer(stopCh chan struct{}) { } case jobs := <-jobWatcher.WaitCh(): - s.logger.Printf("[TRACE] nomad.drain: running due to alloc change (%d jobs updated)", len(jobs)) + n.logger.Printf("[TRACE] nomad.drain: running due to alloc change (%d jobs updated)", len(jobs)) case when := <-deadlineTimer.C: // deadline for a node was reached - s.logger.Printf("[TRACE] nomad.drain: running due to deadline reached (at %s)", when) + n.logger.Printf("[TRACE] nomad.drain: running due to deadline reached (at %s)", when) case <-ctx.Done(): // exit return @@ -164,15 +210,13 @@ func (s *Server) startNodeDrainer(stopCh chan struct{}) { // Tracks nodes that are done draining doneNodes := map[string]*structs.Node{} - //TODO work from a state snapshot? perhaps from a last update - //index? 
I can't think of why this would be beneficial as this - //entire process runs asynchronously with the fsm/scheduler/etc + // Capture state (statestore and time) to do consistent comparisons snapshot, err := state.Snapshot() if err != nil { //FIXME panic(err) } - now := time.Now() // for determing deadlines in a consistent way + now := time.Now() // job key -> {job, allocs} // Collect all allocs for all jobs with at least one @@ -227,14 +271,14 @@ func (s *Server) startNodeDrainer(stopCh chan struct{}) { // allocs left to be drained if !alloc.TerminalStatus() { if !allocsLeft { - s.logger.Printf("[TRACE] nomad.drain: node %s has allocs left to drain", nodeID[:6]) + n.logger.Printf("[TRACE] nomad.drain: node %s has allocs left to drain", nodeID[:6]) allocsLeft = true } } // Don't bother collecting system/batch jobs for nodes that haven't hit their deadline if job.Type != structs.JobTypeService && !deadlineReached { - s.logger.Printf("[TRACE] nomad.drain: not draining %s job %s because deadline isn't for %s", + n.logger.Printf("[TRACE] nomad.drain: not draining %s job %s because deadline isn't for %s", job.Type, job.Name, node.DrainStrategy.DeadlineTime().Sub(now)) skipJob[jobkey] = struct{}{} continue @@ -248,14 +292,14 @@ func (s *Server) startNodeDrainer(stopCh chan struct{}) { // Count the number of down (terminal or nil deployment status) per task group if job.Type == structs.JobTypeService { - n := 0 + num := 0 for _, a := range jobAllocs { if !a.TerminalStatus() && a.DeploymentStatus != nil { upPerTG[makeTaskGroupKey(a)]++ - n++ + num++ } } - s.logger.Printf("[TRACE] nomad.drain: job %s has %d task groups running", job.Name, n) + n.logger.Printf("[TRACE] nomad.drain: job %s has %d task groups running", job.Name, num) } drainable[jobkey] = &drainingJob{ @@ -268,7 +312,7 @@ func (s *Server) startNodeDrainer(stopCh chan struct{}) { // if node has no allocs or has hit its deadline, it's done draining! 
if !allocsLeft || deadlineReached { - s.logger.Printf("[TRACE] nomad.drain: node %s has no more allocs left to drain or has reached deadline", nodeID) + n.logger.Printf("[TRACE] nomad.drain: node %s has no more allocs left to drain or has reached deadline", nodeID) jobWatcher.nodeDone(nodeID) doneNodes[nodeID] = node } @@ -298,7 +342,7 @@ func (s *Server) startNodeDrainer(stopCh chan struct{}) { tgKey := makeTaskGroupKey(alloc) if node.DrainStrategy.DeadlineTime().Before(now) { - s.logger.Printf("[TRACE] nomad.drain: draining job %s alloc %s from node %s due to node's drain deadline", drainingJob.job.Name, alloc.ID[:6], alloc.NodeID[:6]) + n.logger.Printf("[TRACE] nomad.drain: draining job %s alloc %s from node %s due to node's drain deadline", drainingJob.job.Name, alloc.ID[:6], alloc.NodeID[:6]) // Alloc's Node has reached its deadline stoplist.add(drainingJob.job, alloc) upPerTG[tgKey]-- @@ -319,19 +363,19 @@ func (s *Server) startNodeDrainer(stopCh chan struct{}) { // Only 1, drain if tg.Count == 1 { - s.logger.Printf("[TRACE] nomad.drain: draining job %s alloc %s from node %s due to count=1", drainingJob.job.Name, alloc.ID[:6], alloc.NodeID[:6]) + n.logger.Printf("[TRACE] nomad.drain: draining job %s alloc %s from node %s due to count=1", drainingJob.job.Name, alloc.ID[:6], alloc.NodeID[:6]) stoplist.add(drainingJob.job, alloc) continue } // No migrate strategy or a max parallel of 0 mean force draining if tg.Migrate == nil || tg.Migrate.MaxParallel == 0 { - s.logger.Printf("[TRACE] nomad.drain: draining job %s alloc %s from node %s due to force drain", drainingJob.job.Name, alloc.ID[:6], alloc.NodeID[:6]) + n.logger.Printf("[TRACE] nomad.drain: draining job %s alloc %s from node %s due to force drain", drainingJob.job.Name, alloc.ID[:6], alloc.NodeID[:6]) stoplist.add(drainingJob.job, alloc) continue } - s.logger.Printf("[TRACE] nomad.drain: considering job %s alloc %s count %d maxp %d up %d", + n.logger.Printf("[TRACE] nomad.drain: considering job %s alloc %s count %d maxp %d up %d", drainingJob.job.Name, alloc.ID[:6], tg.Count, tg.Migrate.MaxParallel, upPerTG[tgKey]) // Count - MaxParalell = minimum number of allocations that must be "up" @@ -339,7 +383,7 @@ func (s *Server) startNodeDrainer(stopCh chan struct{}) { // If minimum is < the current number up it is safe to stop one. 
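To make that arithmetic concrete with hypothetical numbers: a task group with count 3 and a migrate max_parallel of 1 gives minUp = 3 - 1 = 2, so while 3 allocations are up one more may be stopped, but once only 2 remain the group is at its floor and no further allocations are drained from this node until replacements become healthy and the up count rises again.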
if minUp < upPerTG[tgKey] { - s.logger.Printf("[TRACE] nomad.drain: draining job %s alloc %s from node %s due to max parallel", drainingJob.job.Name, alloc.ID[:6], alloc.NodeID[:6]) + n.logger.Printf("[TRACE] nomad.drain: draining job %s alloc %s from node %s due to max parallel", drainingJob.job.Name, alloc.ID[:6], alloc.NodeID[:6]) // More migrations are allowed, add to stoplist stoplist.add(drainingJob.job, alloc) upPerTG[tgKey]-- @@ -348,7 +392,7 @@ func (s *Server) startNodeDrainer(stopCh chan struct{}) { } if len(stoplist.allocBatch) > 0 { - s.logger.Printf("[DEBUG] nomad.drain: stopping %d alloc(s) for %d job(s)", len(stoplist.allocBatch), len(stoplist.jobBatch)) + n.logger.Printf("[DEBUG] nomad.drain: stopping %d alloc(s) for %d job(s)", len(stoplist.allocBatch), len(stoplist.jobBatch)) // Reevaluate affected jobs evals := make([]*structs.Evaluation, 0, len(stoplist.jobBatch)) @@ -365,40 +409,21 @@ func (s *Server) startNodeDrainer(stopCh chan struct{}) { }) } - // Send raft request - batch := &structs.AllocUpdateDesiredTransitionRequest{ - Allocs: stoplist.allocBatch, - Evals: evals, - WriteRequest: structs.WriteRequest{Region: s.config.Region}, - } - // Commit this update via Raft - //TODO Not the right request - _, index, err := s.raftApply(structs.AllocUpdateDesiredTransitionRequestType, batch) - if err != nil { + if err := n.raft.AllocUpdateDesiredTransition(stoplist.allocBatch, evals); err != nil { //FIXME panic(err) } - - //TODO i bet there's something useful to do with this index - _ = index } // Unset drain for nodes done draining for nodeID, node := range doneNodes { - args := structs.NodeUpdateDrainRequest{ - NodeID: nodeID, - Drain: false, - WriteRequest: structs.WriteRequest{Region: s.config.Region}, - } - - _, _, err := s.raftApply(structs.NodeUpdateDrainRequestType, &args) - if err != nil { - s.logger.Printf("[ERR] nomad.drain: failed to unset drain for: %v", err) + if err := n.raft.NodeDrainComplete(nodeID); err != nil { + n.logger.Printf("[ERR] nomad.drain: failed to unset drain for: %v", err) //FIXME panic(err) } - s.logger.Printf("[INFO] nomad.drain: node %s (%s) completed draining", nodeID, node.Name) + n.logger.Printf("[INFO] nomad.drain: node %s (%s) completed draining", nodeID, node.Name) delete(nodes, nodeID) } } diff --git a/nomad/drain_test.go b/nomad/drainer/drain_test.go similarity index 89% rename from nomad/drain_test.go rename to nomad/drainer/drain_test.go index 9bae27fe38d2..1f38a4c293fc 100644 --- a/nomad/drain_test.go +++ b/nomad/drainer/drain_test.go @@ -1,7 +1,9 @@ -package nomad +package drainer_test import ( "fmt" + "net" + "net/rpc" "sort" "strings" "testing" @@ -10,7 +12,9 @@ import ( msgpackrpc "github.com/hashicorp/net-rpc-msgpackrpc" "github.com/hashicorp/nomad/client" "github.com/hashicorp/nomad/client/config" + "github.com/hashicorp/nomad/helper/pool" "github.com/hashicorp/nomad/helper/testlog" + "github.com/hashicorp/nomad/nomad" "github.com/hashicorp/nomad/nomad/mock" "github.com/hashicorp/nomad/nomad/structs" "github.com/hashicorp/nomad/testutil" @@ -20,11 +24,29 @@ import ( "github.com/stretchr/testify/require" ) +// rpcClient is a test helper method to return a ClientCodec to use to make rpc +// calls to the passed server. 
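The codec returned by this helper is intended to be driven with msgpackrpc.CallWithCodec from the net-rpc-msgpackrpc package already imported above; a minimal illustration, assuming a codec and request built as in the test below:

var resp structs.JobRegisterResponse
err := msgpackrpc.CallWithCodec(codec, "Job.Register", req, &resp)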
+func rpcClient(t *testing.T, conf *nomad.Config) rpc.ClientCodec { + addr := conf.RPCAddr + conn, err := net.DialTimeout("tcp", addr.String(), time.Second) + if err != nil { + t.Fatalf("err: %v", err) + } + // Write the Nomad RPC byte to set the mode + conn.Write([]byte{byte(pool.RpcNomad)}) + return pool.NewClientCodec(conn) +} + // TestNodeDrainer_SimpleDrain asserts that draining when there are two nodes // moves allocs from the draining node to the other node. func TestNodeDrainer_SimpleDrain(t *testing.T) { require := require.New(t) - server := TestServer(t, nil) + + // Capture test servers config + var serverConfig *nomad.Config + server := nomad.TestServer(t, func(c *nomad.Config) { + serverConfig = c + }) defer server.Shutdown() testutil.WaitForLeader(t, server.RPC) @@ -32,7 +54,7 @@ func TestNodeDrainer_SimpleDrain(t *testing.T) { // Setup 2 Nodes: A & B; A has allocs and is draining // Create mock jobs - state := server.fsm.State() + state := server.State() serviceJob := mock.Job() serviceJob.Name = "service-job" @@ -83,12 +105,12 @@ func TestNodeDrainer_SimpleDrain(t *testing.T) { // Start node 1 c1 := client.TestClient(t, func(conf *config.Config) { conf.LogOutput = testlog.NewWriter(t) - conf.Servers = []string{server.config.RPCAddr.String()} + conf.Servers = []string{serverConfig.RPCAddr.String()} }) defer c1.Shutdown() // Start jobs so they all get placed on node 1 - codec := rpcClient(t, server) + codec := rpcClient(t, serverConfig) for _, job := range []*structs.Job{systemJob, serviceJob, batchJob} { req := &structs.JobRegisterRequest{ Job: job.Copy(), @@ -137,7 +159,6 @@ func TestNodeDrainer_SimpleDrain(t *testing.T) { t.Logf("%d alloc %s job %s status %s", i, alloc.ID, alloc.Job.Name, alloc.ClientStatus) } } - server.logger.Println("----------------------------------------------------------------------quitting--------------------------------------------------------") t.Fatalf("failed waiting for all allocs to start: %v", err) }) @@ -155,7 +176,7 @@ func TestNodeDrainer_SimpleDrain(t *testing.T) { // Start node 2 c2 := client.TestClient(t, func(conf *config.Config) { conf.NetworkSpeed = 10000 - conf.Servers = []string{server.config.RPCAddr.String()} + conf.Servers = []string{serverConfig.RPCAddr.String()} }) defer c2.Shutdown() @@ -191,7 +212,6 @@ func TestNodeDrainer_SimpleDrain(t *testing.T) { t.Logf("%d alloc %s job %s status %s prev %s", i, alloc.ID, alloc.Job.Name, alloc.ClientStatus, alloc.PreviousAllocation) } } - server.logger.Println("----------------------------------------------------------------------quitting--------------------------------------------------------") t.Errorf("failed waiting for all allocs to migrate: %v", err) }) diff --git a/nomad/drainer_shims.go b/nomad/drainer_shims.go new file mode 100644 index 000000000000..0ced081f5fd9 --- /dev/null +++ b/nomad/drainer_shims.go @@ -0,0 +1,30 @@ +package nomad + +import "github.com/hashicorp/nomad/nomad/structs" + +// drainerShim implements the drainer.RaftApplier interface required by the +// NodeDrainer. 
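A conventional Go guard that is not part of this patch, but which keeps a shim like the one below from silently drifting away from the interface it is meant to satisfy, is a compile-time assertion next to the type:

var _ drainer.RaftApplier = drainerShim{}

This line fails the build in package nomad if drainerShim ever stops implementing drainer.RaftApplier.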
+type drainerShim struct { + s *Server +} + +func (d drainerShim) NodeDrainComplete(nodeID string) error { + args := &structs.NodeUpdateDrainRequest{ + NodeID: nodeID, + Drain: false, + WriteRequest: structs.WriteRequest{Region: d.s.config.Region}, + } + + _, _, err := d.s.raftApply(structs.NodeUpdateDrainRequestType, args) + return err +} + +func (d drainerShim) AllocUpdateDesiredTransition(allocs map[string]*structs.DesiredTransition, evals []*structs.Evaluation) error { + args := &structs.AllocUpdateDesiredTransitionRequest{ + Allocs: allocs, + Evals: evals, + WriteRequest: structs.WriteRequest{Region: d.s.config.Region}, + } + _, _, err := d.s.raftApply(structs.AllocUpdateDesiredTransitionRequestType, args) + return err +} diff --git a/nomad/leader.go b/nomad/leader.go index b81b65d23232..a395c3a91d5b 100644 --- a/nomad/leader.go +++ b/nomad/leader.go @@ -199,9 +199,10 @@ func (s *Server) establishLeadership(stopCh chan struct{}) error { s.blockedEvals.SetTimetable(s.fsm.TimeTable()) // Enable the deployment watcher, since we are now the leader - if err := s.deploymentWatcher.SetEnabled(true, s.State()); err != nil { - return err - } + s.deploymentWatcher.SetEnabled(true, s.State()) + + // Enable the NodeDrainer + s.nodeDrainer.SetEnabled(true, s.State()) // Restore the eval broker state if err := s.restoreEvals(); err != nil { @@ -267,8 +268,15 @@ func (s *Server) establishLeadership(stopCh chan struct{}) error { go s.replicateACLTokens(stopCh) } - // Start Node Drainer - go s.startNodeDrainer(stopCh) + // Convert stopCh into a Context + ctx, cancel := context.WithCancel(context.Background()) + go func() { + defer cancel() + select { + case <-stopCh: + case <-ctx.Done(): + } + }() // Setup any enterprise systems required. if err := s.establishEnterpriseLeadership(stopCh); err != nil { @@ -676,9 +684,10 @@ func (s *Server) revokeLeadership() error { s.vault.SetActive(false) // Disable the deployment watcher as it is only useful as a leader. - if err := s.deploymentWatcher.SetEnabled(false, nil); err != nil { - return err - } + s.deploymentWatcher.SetEnabled(false, nil) + + // Disable the node drainer + s.nodeDrainer.SetEnabled(false, nil) // Disable any enterprise systems required. if err := s.revokeEnterpriseLeadership(); err != nil { diff --git a/nomad/rpc_test.go b/nomad/rpc_test.go index c876c6adb1df..ec885cc652c8 100644 --- a/nomad/rpc_test.go +++ b/nomad/rpc_test.go @@ -30,7 +30,7 @@ func rpcClient(t *testing.T, s *Server) rpc.ClientCodec { if err != nil { t.Fatalf("err: %v", err) } - // Write the Consul RPC byte to set the mode + // Write the Nomad RPC byte to set the mode conn.Write([]byte{byte(pool.RpcNomad)}) return pool.NewClientCodec(conn) } diff --git a/nomad/server.go b/nomad/server.go index 68789da4a259..f49c62b9cf5a 100644 --- a/nomad/server.go +++ b/nomad/server.go @@ -27,6 +27,7 @@ import ( "github.com/hashicorp/nomad/helper/stats" "github.com/hashicorp/nomad/helper/tlsutil" "github.com/hashicorp/nomad/nomad/deploymentwatcher" + "github.com/hashicorp/nomad/nomad/drainer" "github.com/hashicorp/nomad/nomad/state" "github.com/hashicorp/nomad/nomad/structs" "github.com/hashicorp/nomad/nomad/structs/config" @@ -172,6 +173,9 @@ type Server struct { // make the required calls to continue to transition the deployment. deploymentWatcher *deploymentwatcher.Watcher + // nodeDrainer is used to drain allocations from nodes. 
+ nodeDrainer *drainer.NodeDrainer + // evalBroker is used to manage the in-progress evaluations // that are waiting to be brokered to a sub-scheduler evalBroker *EvalBroker @@ -355,6 +359,9 @@ func NewServer(config *Config, consulCatalog consul.CatalogAPI, logger *log.Logg return nil, fmt.Errorf("failed to create deployment watcher: %v", err) } + // Setup the node drainer. + s.setupNodeDrainer() + // Setup the enterprise state if err := s.setupEnterprise(config); err != nil { return nil, err @@ -880,6 +887,15 @@ func (s *Server) setupDeploymentWatcher() error { return nil } +// setupNodeDrainer creates a node drainer which will be enabled when a server +// becomes a leader. +func (s *Server) setupNodeDrainer() { + // create a shim around raft requests + shim := drainerShim{s} + s.nodeDrainer = drainer.NewNodeDrainer(s.logger, shim) + go s.nodeDrainer.Run() +} + // setupVaultClient is used to set up the Vault API client. func (s *Server) setupVaultClient() error { v, err := NewVaultClient(s.config.VaultConfig, s.logger, s.purgeVaultAccessors) From 9de890899ad4241c0ab3f68ec095366abfbdae6a Mon Sep 17 00:00:00 2001 From: Michael Schurter Date: Tue, 27 Feb 2018 13:48:13 -0800 Subject: [PATCH 24/79] drainer: drainer should shutdown with server --- nomad/drainer/drain.go | 44 ++++++++++++++++++++++++++++++++++-------- nomad/server.go | 2 +- 2 files changed, 37 insertions(+), 9 deletions(-) diff --git a/nomad/drainer/drain.go b/nomad/drainer/drain.go index e0c386f4056e..b795e0eea173 100644 --- a/nomad/drainer/drain.go +++ b/nomad/drainer/drain.go @@ -78,37 +78,64 @@ type RaftApplier interface { NodeDrainComplete(nodeID string) error } +// nodeDrainerState is used to communicate the state set by +// NodeDrainer.SetEnabled to the concurrently executing Run loop. type nodeDrainerState struct { enabled bool state *state.StateStore } +// NodeDrainer migrates allocations off of draining nodes. SetEnabled(true) +// should be called when a server establishes leadership and SetEnabled(false) +// called when leadership is lost. type NodeDrainer struct { enabledCh chan nodeDrainerState raft RaftApplier + shutdownCh <-chan struct{} + logger *log.Logger } -func NewNodeDrainer(logger *log.Logger, raft RaftApplier) *NodeDrainer { +// NewNodeDrainer creates a new NodeDrainer which will exit when shutdownCh is +// closed. A RaftApplier shim must be supplied to allow NodeDrainer access to +// the raft messages it sends. +func NewNodeDrainer(logger *log.Logger, shutdownCh <-chan struct{}, raft RaftApplier) *NodeDrainer { return &NodeDrainer{ - enabledCh: make(chan nodeDrainerState), - raft: raft, - logger: logger, + enabledCh: make(chan nodeDrainerState), + raft: raft, + shutdownCh: shutdownCh, + logger: logger, } } +// SetEnabled will start or stop the node draining goroutine depending on the +// enabled boolean. SetEnabled is meant to be called concurrently with Run. func (n *NodeDrainer) SetEnabled(enabled bool, state *state.StateStore) { - n.enabledCh <- nodeDrainerState{enabled, state} + select { + case n.enabledCh <- nodeDrainerState{enabled, state}: + case <-n.shutdownCh: + } } -//FIXME never exits +// Run monitors the shutdown chan as well as SetEnabled calls and starts/stops +// the node draining goroutine appropriately. As it blocks it should be called +// in a goroutine. 
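Outside of a full server, the enable/disable lifecycle that Run implements can be exercised against a stub RaftApplier. The following self-contained sketch is illustrative only: the no-op applier and the shutdown choreography are assumptions, not part of the patch, and a real server supplies the drainerShim and its FSM state store instead.

package main

import (
	"log"
	"os"

	"github.com/hashicorp/nomad/nomad/drainer"
	"github.com/hashicorp/nomad/nomad/structs"
)

// noopRaft stands in for the drainerShim a real server provides; it satisfies
// drainer.RaftApplier but applies nothing.
type noopRaft struct{}

func (noopRaft) AllocUpdateDesiredTransition(allocs map[string]*structs.DesiredTransition, evals []*structs.Evaluation) error {
	return nil
}

func (noopRaft) NodeDrainComplete(nodeID string) error { return nil }

func main() {
	logger := log.New(os.Stderr, "", log.LstdFlags)
	shutdownCh := make(chan struct{})

	nd := drainer.NewNodeDrainer(logger, shutdownCh, noopRaft{})
	go nd.Run()

	// A leader would call nd.SetEnabled(true, s.fsm.State()) after winning an
	// election and nd.SetEnabled(false, nil) after losing one; this disable
	// call is a no-op because the drainer was never enabled.
	nd.SetEnabled(false, nil)

	// Closing shutdownCh causes Run to cancel any running drain loop and return.
	close(shutdownCh)
}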
func (n *NodeDrainer) Run() { running := false + var s nodeDrainerState var ctx context.Context cancel := func() {} - for s := range n.enabledCh { + for { + select { + case s = <-n.enabledCh: + case <-n.shutdownCh: + // Stop drainer and exit + cancel() + return + } + switch { case s.enabled && running: // Already running @@ -129,7 +156,8 @@ func (n *NodeDrainer) Run() { } } -// nodeDrainer should be called in establishLeadership by the leader. +// nodeDrainer is the core node draining main loop and should be started in a +// goroutine when a server establishes leadership. func (n *NodeDrainer) nodeDrainer(ctx context.Context, state *state.StateStore) { nodes, nodesIndex, drainingJobs, allocsIndex := initDrainer(n.logger, state) diff --git a/nomad/server.go b/nomad/server.go index f49c62b9cf5a..a9984ac34afb 100644 --- a/nomad/server.go +++ b/nomad/server.go @@ -892,7 +892,7 @@ func (s *Server) setupDeploymentWatcher() error { func (s *Server) setupNodeDrainer() { // create a shim around raft requests shim := drainerShim{s} - s.nodeDrainer = drainer.NewNodeDrainer(s.logger, shim) + s.nodeDrainer = drainer.NewNodeDrainer(s.logger, s.shutdownCh, shim) go s.nodeDrainer.Run() } From 57c03359409578154fcf6385f7faf3c445b158f3 Mon Sep 17 00:00:00 2001 From: Michael Schurter Date: Tue, 27 Feb 2018 13:51:37 -0800 Subject: [PATCH 25/79] Remove unused context --- nomad/leader.go | 10 ---------- 1 file changed, 10 deletions(-) diff --git a/nomad/leader.go b/nomad/leader.go index a395c3a91d5b..f65a22477727 100644 --- a/nomad/leader.go +++ b/nomad/leader.go @@ -268,16 +268,6 @@ func (s *Server) establishLeadership(stopCh chan struct{}) error { go s.replicateACLTokens(stopCh) } - // Convert stopCh into a Context - ctx, cancel := context.WithCancel(context.Background()) - go func() { - defer cancel() - select { - case <-stopCh: - case <-ctx.Done(): - } - }() - // Setup any enterprise systems required. 
if err := s.establishEnterpriseLeadership(stopCh); err != nil { return err From f2de735cdc341a10643512207740fa4891e236d4 Mon Sep 17 00:00:00 2001 From: Michael Schurter Date: Tue, 27 Feb 2018 14:08:30 -0800 Subject: [PATCH 26/79] Restart every time SetEnabled(true) is called --- nomad/drainer/drain.go | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/nomad/drainer/drain.go b/nomad/drainer/drain.go index b795e0eea173..5175f609f55d 100644 --- a/nomad/drainer/drain.go +++ b/nomad/drainer/drain.go @@ -125,8 +125,7 @@ func (n *NodeDrainer) SetEnabled(enabled bool, state *state.StateStore) { func (n *NodeDrainer) Run() { running := false var s nodeDrainerState - var ctx context.Context - cancel := func() {} + ctx, cancel := context.WithCancel(context.Background()) for { select { case s = <-n.enabledCh: @@ -138,15 +137,19 @@ func (n *NodeDrainer) Run() { switch { case s.enabled && running: - // Already running - continue + // Already running, must restart to ensure the latest StateStore is used + cancel() + ctx, cancel = context.WithCancel(context.Background()) + go n.nodeDrainer(ctx, s.state) + case !s.enabled && !running: - // Already stopped - continue + // Already stopped; nothing to do + case !s.enabled && running: // Stop running node drainer cancel() running = false + case s.enabled && !running: // Start running node drainer ctx, cancel = context.WithCancel(context.Background()) From 678fbe1755a0bb0d9a54a798768a20686fd7cf92 Mon Sep 17 00:00:00 2001 From: Michael Schurter Date: Tue, 27 Feb 2018 14:50:17 -0800 Subject: [PATCH 27/79] drainer: factor job & node watchers out of drainer.go --- nomad/drainer/drain.go | 240 ----------------------------------- nomad/drainer/jobwatcher.go | 140 ++++++++++++++++++++ nomad/drainer/nodewatcher.go | 121 ++++++++++++++++++ 3 files changed, 261 insertions(+), 240 deletions(-) create mode 100644 nomad/drainer/jobwatcher.go create mode 100644 nomad/drainer/nodewatcher.go diff --git a/nomad/drainer/drain.go b/nomad/drainer/drain.go index 5175f609f55d..5f35bca6c0b4 100644 --- a/nomad/drainer/drain.go +++ b/nomad/drainer/drain.go @@ -4,10 +4,8 @@ import ( "context" "log" "strings" - "sync" "time" - memdb "github.com/hashicorp/go-memdb" "github.com/hashicorp/nomad/helper" "github.com/hashicorp/nomad/helper/uuid" "github.com/hashicorp/nomad/nomad/state" @@ -460,244 +458,6 @@ func (n *NodeDrainer) nodeDrainer(ctx context.Context, state *state.StateStore) } } -// nodeWatcher watches for nodes to start or stop draining -type nodeWatcher struct { - index uint64 - nodes map[string]*structs.Node - nodesCh chan map[string]*structs.Node - state *state.StateStore - logger *log.Logger -} - -func newNodeWatcher(logger *log.Logger, nodes map[string]*structs.Node, index uint64, state *state.StateStore) *nodeWatcher { - return &nodeWatcher{ - nodes: nodes, - nodesCh: make(chan map[string]*structs.Node), - index: index, - state: state, - logger: logger, - } -} - -func (n *nodeWatcher) run(ctx context.Context) { - // Trigger an initial drain pass if there are already nodes draining - //FIXME this is unneccessary if a node has reached a deadline - n.logger.Printf("[TRACE] nomad.drain: initial draining nodes: %d", len(n.nodes)) - if len(n.nodes) > 0 { - n.nodesCh <- n.nodes - } - - for { - //FIXME it seems possible for this to return a nil error and a 0 index, what to do in that case? 
- resp, index, err := n.state.BlockingQuery(n.queryNodeDrain, n.index, ctx) - if err != nil { - if err == context.Canceled { - n.logger.Printf("[TRACE] nomad.drain: draining node watcher shutting down") - return - } - n.logger.Printf("[ERR] nomad.drain: error blocking on node updates at index %d: %v", n.index, err) - return - } - - // update index for next run - n.index = index - - changed := false - newNodes := resp.([]*structs.Node) - n.logger.Printf("[TRACE] nomad.drain: %d nodes to consider", len(newNodes)) //FIXME remove - for _, newNode := range newNodes { - if existingNode, ok := n.nodes[newNode.ID]; ok { - // Node was draining, see if it has changed - if !newNode.Drain { - // Node stopped draining - delete(n.nodes, newNode.ID) - changed = true - } else if !newNode.DrainStrategy.DeadlineTime().Equal(existingNode.DrainStrategy.DeadlineTime()) { - // Update deadline - n.nodes[newNode.ID] = newNode - changed = true - } - } else { - // Node was not draining - if newNode.Drain { - // Node started draining - n.nodes[newNode.ID] = newNode - changed = true - } - } - } - - // Send a copy of the draining nodes if there were changes - if !changed { - continue - } - - nodesCopy := make(map[string]*structs.Node, len(n.nodes)) - for k, v := range n.nodes { - nodesCopy[k] = v - } - - select { - case n.nodesCh <- nodesCopy: - case <-ctx.Done(): - return - } - } -} - -func (n *nodeWatcher) queryNodeDrain(ws memdb.WatchSet, state *state.StateStore) (interface{}, uint64, error) { - iter, err := state.Nodes(ws) - if err != nil { - return nil, 0, err - } - - index, err := state.Index("nodes") - if err != nil { - return nil, 0, err - } - - resp := make([]*structs.Node, 0, 8) - - for { - raw := iter.Next() - if raw == nil { - break - } - - node := raw.(*structs.Node) - resp = append(resp, node) - } - - return resp, index, nil -} - -type jobWatcher struct { - // allocsIndex to start watching from - allocsIndex uint64 - - // job -> node.ID - jobs map[jobKey]string - jobsMu sync.Mutex - - jobsCh chan map[jobKey]struct{} - - state *state.StateStore - - logger *log.Logger -} - -func newJobWatcher(logger *log.Logger, jobs map[jobKey]string, allocsIndex uint64, state *state.StateStore) *jobWatcher { - return &jobWatcher{ - allocsIndex: allocsIndex, - logger: logger, - jobs: jobs, - jobsCh: make(chan map[jobKey]struct{}), - state: state, - } -} - -func (j *jobWatcher) watch(k jobKey, nodeID string) { - j.logger.Printf("[TRACE] nomad.drain: watching job %s on draining node %s", k.jobid, nodeID[:6]) - j.jobsMu.Lock() - j.jobs[k] = nodeID - j.jobsMu.Unlock() -} - -func (j *jobWatcher) nodeDone(nodeID string) { - j.jobsMu.Lock() - defer j.jobsMu.Unlock() - for k, v := range j.jobs { - if v == nodeID { - j.logger.Printf("[TRACE] nomad.drain: UNwatching job %s on done draining node %s", k.jobid, nodeID[:6]) - delete(j.jobs, k) - } - } -} - -func (j *jobWatcher) WaitCh() <-chan map[jobKey]struct{} { - return j.jobsCh -} - -func (j *jobWatcher) run(ctx context.Context) { - var resp interface{} - var err error - - for { - //FIXME have watchAllocs create a closure and give it a copy of j.jobs to remove locking? - //FIXME it seems possible for this to return a nil error and a 0 index, what to do in that case? 
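		// The first FIXME above could be addressed roughly like this
		// (illustrative sketch, not part of this patch): snapshot the
		// watched jobs under the lock once per iteration and hand the copy
		// to the query, so watchAllocs never needs j.jobsMu itself:
		//
		//	j.jobsMu.Lock()
		//	watched := make(map[jobKey]string, len(j.jobs))
		//	for k, v := range j.jobs {
		//		watched[k] = v
		//	}
		//	j.jobsMu.Unlock()
		//	// watchAllocsFor is a hypothetical closure builder over `watched`
		//	resp, newIndex, err = j.state.BlockingQuery(j.watchAllocsFor(watched), j.allocsIndex, ctx)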
- var newIndex uint64 - resp, newIndex, err = j.state.BlockingQuery(j.watchAllocs, j.allocsIndex, ctx) - if err != nil { - if err == context.Canceled { - j.logger.Printf("[TRACE] nomad.drain: job watcher shutting down") - return - } - j.logger.Printf("[ERR] nomad.drain: error blocking on alloc updates: %v", err) - return - } - - j.logger.Printf("[TRACE] nomad.drain: job watcher old index: %d new index: %d", j.allocsIndex, newIndex) - j.allocsIndex = newIndex - - changedJobs := resp.(map[jobKey]struct{}) - if len(changedJobs) > 0 { - select { - case j.jobsCh <- changedJobs: - case <-ctx.Done(): - return - } - } - } -} - -func (j *jobWatcher) watchAllocs(ws memdb.WatchSet, state *state.StateStore) (interface{}, uint64, error) { - iter, err := state.Allocs(ws) - if err != nil { - return nil, 0, err - } - - index, err := state.Index("allocs") - if err != nil { - return nil, 0, err - } - - skipped := 0 - - // job ids - resp := map[jobKey]struct{}{} - - for { - raw := iter.Next() - if raw == nil { - break - } - - alloc := raw.(*structs.Allocation) - - j.jobsMu.Lock() - _, ok := j.jobs[jobKey{alloc.Namespace, alloc.JobID}] - j.jobsMu.Unlock() - - if !ok { - // alloc is not part of a draining job - skipped++ - continue - } - - // don't wake drain loop if alloc hasn't updated its health - if alloc.DeploymentStatus.IsHealthy() || alloc.DeploymentStatus.IsUnhealthy() { - j.logger.Printf("[TRACE] nomad.drain: job watcher found alloc %s - deployment status: %t", alloc.ID[:6], *alloc.DeploymentStatus.Healthy) - resp[jobKey{alloc.Namespace, alloc.JobID}] = struct{}{} - } else { - j.logger.Printf("[TRACE] nomad.drain: job watcher ignoring alloc %s - no deployment status", alloc.ID[:6]) - } - } - - j.logger.Printf("[TRACE] nomad.drain: job watcher ignoring %d allocs - not part of draining job at index %d", skipped, index) - - return resp, index, nil -} - // initDrainer initializes the node drainer state and returns a list of // draining nodes as well as allocs that are draining that should be watched // for a replacement. diff --git a/nomad/drainer/jobwatcher.go b/nomad/drainer/jobwatcher.go new file mode 100644 index 000000000000..95a1be5d157e --- /dev/null +++ b/nomad/drainer/jobwatcher.go @@ -0,0 +1,140 @@ +package drainer + +import ( + "context" + "log" + "sync" + + memdb "github.com/hashicorp/go-memdb" + "github.com/hashicorp/nomad/nomad/state" + "github.com/hashicorp/nomad/nomad/structs" +) + +// jobWatcher watches allocation changes for jobs with at least one allocation +// on a draining node. 
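// (Lifecycle, as used by the drainer loop in this patch: the drainer seeds a
// watcher with newJobWatcher and starts run() in a goroutine, registers
// draining jobs via watch(), drops a node's jobs via nodeDone() once that
// node finishes draining, and receives the set of changed jobs on WaitCh().)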
+type jobWatcher struct { + // allocsIndex to start watching from + allocsIndex uint64 + + // job -> node.ID + jobs map[jobKey]string + jobsMu sync.Mutex + + jobsCh chan map[jobKey]struct{} + + state *state.StateStore + + logger *log.Logger +} + +func newJobWatcher(logger *log.Logger, jobs map[jobKey]string, allocsIndex uint64, state *state.StateStore) *jobWatcher { + return &jobWatcher{ + allocsIndex: allocsIndex, + logger: logger, + jobs: jobs, + jobsCh: make(chan map[jobKey]struct{}), + state: state, + } +} + +func (j *jobWatcher) watch(k jobKey, nodeID string) { + j.logger.Printf("[TRACE] nomad.drain: watching job %s on draining node %s", k.jobid, nodeID[:6]) + j.jobsMu.Lock() + j.jobs[k] = nodeID + j.jobsMu.Unlock() +} + +func (j *jobWatcher) nodeDone(nodeID string) { + j.jobsMu.Lock() + defer j.jobsMu.Unlock() + for k, v := range j.jobs { + if v == nodeID { + j.logger.Printf("[TRACE] nomad.drain: UNwatching job %s on done draining node %s", k.jobid, nodeID[:6]) + delete(j.jobs, k) + } + } +} + +func (j *jobWatcher) WaitCh() <-chan map[jobKey]struct{} { + return j.jobsCh +} + +func (j *jobWatcher) run(ctx context.Context) { + var resp interface{} + var err error + + for { + //FIXME have watchAllocs create a closure and give it a copy of j.jobs to remove locking? + //FIXME it seems possible for this to return a nil error and a 0 index, what to do in that case? + var newIndex uint64 + resp, newIndex, err = j.state.BlockingQuery(j.watchAllocs, j.allocsIndex, ctx) + if err != nil { + if err == context.Canceled { + j.logger.Printf("[TRACE] nomad.drain: job watcher shutting down") + return + } + j.logger.Printf("[ERR] nomad.drain: error blocking on alloc updates: %v", err) + return + } + + j.logger.Printf("[TRACE] nomad.drain: job watcher old index: %d new index: %d", j.allocsIndex, newIndex) + j.allocsIndex = newIndex + + changedJobs := resp.(map[jobKey]struct{}) + if len(changedJobs) > 0 { + select { + case j.jobsCh <- changedJobs: + case <-ctx.Done(): + return + } + } + } +} + +func (j *jobWatcher) watchAllocs(ws memdb.WatchSet, state *state.StateStore) (interface{}, uint64, error) { + iter, err := state.Allocs(ws) + if err != nil { + return nil, 0, err + } + + index, err := state.Index("allocs") + if err != nil { + return nil, 0, err + } + + skipped := 0 + + // job ids + resp := map[jobKey]struct{}{} + + for { + raw := iter.Next() + if raw == nil { + break + } + + alloc := raw.(*structs.Allocation) + + j.jobsMu.Lock() + _, ok := j.jobs[jobKey{alloc.Namespace, alloc.JobID}] + j.jobsMu.Unlock() + + if !ok { + // alloc is not part of a draining job + skipped++ + continue + } + + // don't wake drain loop if alloc hasn't updated its health + if alloc.DeploymentStatus.IsHealthy() || alloc.DeploymentStatus.IsUnhealthy() { + j.logger.Printf("[TRACE] nomad.drain: job watcher found alloc %s - deployment status: %t", alloc.ID[:6], *alloc.DeploymentStatus.Healthy) + resp[jobKey{alloc.Namespace, alloc.JobID}] = struct{}{} + } else { + j.logger.Printf("[TRACE] nomad.drain: job watcher ignoring alloc %s - no deployment status", alloc.ID[:6]) + } + } + + j.logger.Printf("[TRACE] nomad.drain: job watcher ignoring %d allocs - not part of draining job at index %d", skipped, index) + + return resp, index, nil +} diff --git a/nomad/drainer/nodewatcher.go b/nomad/drainer/nodewatcher.go new file mode 100644 index 000000000000..eb54e4995842 --- /dev/null +++ b/nomad/drainer/nodewatcher.go @@ -0,0 +1,121 @@ +package drainer + +import ( + "context" + "log" + + memdb "github.com/hashicorp/go-memdb" + 
"github.com/hashicorp/nomad/nomad/state" + "github.com/hashicorp/nomad/nomad/structs" +) + +// nodeWatcher watches for nodes to start or stop draining +type nodeWatcher struct { + index uint64 + nodes map[string]*structs.Node + nodesCh chan map[string]*structs.Node + state *state.StateStore + logger *log.Logger +} + +func newNodeWatcher(logger *log.Logger, nodes map[string]*structs.Node, index uint64, state *state.StateStore) *nodeWatcher { + return &nodeWatcher{ + nodes: nodes, + nodesCh: make(chan map[string]*structs.Node), + index: index, + state: state, + logger: logger, + } +} + +func (n *nodeWatcher) run(ctx context.Context) { + // Trigger an initial drain pass if there are already nodes draining + //FIXME this is unneccessary if a node has reached a deadline + n.logger.Printf("[TRACE] nomad.drain: initial draining nodes: %d", len(n.nodes)) + if len(n.nodes) > 0 { + n.nodesCh <- n.nodes + } + + for { + //FIXME it seems possible for this to return a nil error and a 0 index, what to do in that case? + resp, index, err := n.state.BlockingQuery(n.queryNodeDrain, n.index, ctx) + if err != nil { + if err == context.Canceled { + n.logger.Printf("[TRACE] nomad.drain: draining node watcher shutting down") + return + } + n.logger.Printf("[ERR] nomad.drain: error blocking on node updates at index %d: %v", n.index, err) + return + } + + // update index for next run + n.index = index + + changed := false + newNodes := resp.([]*structs.Node) + n.logger.Printf("[TRACE] nomad.drain: %d nodes to consider", len(newNodes)) //FIXME remove + for _, newNode := range newNodes { + if existingNode, ok := n.nodes[newNode.ID]; ok { + // Node was draining, see if it has changed + if !newNode.Drain { + // Node stopped draining + delete(n.nodes, newNode.ID) + changed = true + } else if !newNode.DrainStrategy.DeadlineTime().Equal(existingNode.DrainStrategy.DeadlineTime()) { + // Update deadline + n.nodes[newNode.ID] = newNode + changed = true + } + } else { + // Node was not draining + if newNode.Drain { + // Node started draining + n.nodes[newNode.ID] = newNode + changed = true + } + } + } + + // Send a copy of the draining nodes if there were changes + if !changed { + continue + } + + nodesCopy := make(map[string]*structs.Node, len(n.nodes)) + for k, v := range n.nodes { + nodesCopy[k] = v + } + + select { + case n.nodesCh <- nodesCopy: + case <-ctx.Done(): + return + } + } +} + +func (n *nodeWatcher) queryNodeDrain(ws memdb.WatchSet, state *state.StateStore) (interface{}, uint64, error) { + iter, err := state.Nodes(ws) + if err != nil { + return nil, 0, err + } + + index, err := state.Index("nodes") + if err != nil { + return nil, 0, err + } + + resp := make([]*structs.Node, 0, 8) + + for { + raw := iter.Next() + if raw == nil { + break + } + + node := raw.(*structs.Node) + resp = append(resp, node) + } + + return resp, index, nil +} From 3b25f784bec188264c51381a04083b0361742c7e Mon Sep 17 00:00:00 2001 From: Michael Schurter Date: Tue, 27 Feb 2018 15:18:32 -0800 Subject: [PATCH 28/79] drainer: convert fsm errors to go errors --- nomad/drainer_shims.go | 21 ++++++++++++++++++--- 1 file changed, 18 insertions(+), 3 deletions(-) diff --git a/nomad/drainer_shims.go b/nomad/drainer_shims.go index 0ced081f5fd9..09a1a8f6635c 100644 --- a/nomad/drainer_shims.go +++ b/nomad/drainer_shims.go @@ -15,8 +15,8 @@ func (d drainerShim) NodeDrainComplete(nodeID string) error { WriteRequest: structs.WriteRequest{Region: d.s.config.Region}, } - _, _, err := d.s.raftApply(structs.NodeUpdateDrainRequestType, args) - return err + 
resp, _, err := d.s.raftApply(structs.NodeUpdateDrainRequestType, args) + return d.convertApplyErrors(resp, err) } func (d drainerShim) AllocUpdateDesiredTransition(allocs map[string]*structs.DesiredTransition, evals []*structs.Evaluation) error { @@ -25,6 +25,21 @@ func (d drainerShim) AllocUpdateDesiredTransition(allocs map[string]*structs.Des Evals: evals, WriteRequest: structs.WriteRequest{Region: d.s.config.Region}, } - _, _, err := d.s.raftApply(structs.AllocUpdateDesiredTransitionRequestType, args) + resp, _, err := d.s.raftApply(structs.AllocUpdateDesiredTransitionRequestType, args) + return d.convertApplyErrors(resp, err) +} + +// convertApplyErrors parses the results of a raftApply and returns the index at +// which it was applied and any error that occurred. Raft Apply returns two +// separate errors, Raft library errors and user returned errors from the FSM. +// This helper, joins the errors by inspecting the applyResponse for an error. +// +// Similar to deployment watcher's convertApplyErrors +func (d drainerShim) convertApplyErrors(applyResp interface{}, err error) error { + if applyResp != nil { + if fsmErr, ok := applyResp.(error); ok && fsmErr != nil { + return fsmErr + } + } return err } From 3fe3c6eff70e3cc7e1df121803bae39a11b66030 Mon Sep 17 00:00:00 2001 From: Alex Dadgar Date: Wed, 28 Feb 2018 16:25:56 -0800 Subject: [PATCH 29/79] Improve DeadlineTime helper --- api/nodes.go | 3 ++ client/testing.go | 4 ++ nomad/drainer/drain.go | 84 +++++++++++++++++++----------------- nomad/drainer/drain_test.go | 8 ++++ nomad/drainer/nodewatcher.go | 6 +-- nomad/node_endpoint.go | 5 +++ nomad/structs/structs.go | 51 +++++++++++++++++----- 7 files changed, 108 insertions(+), 53 deletions(-) diff --git a/api/nodes.go b/api/nodes.go index 37adb8fc34e5..a505d9ae369f 100644 --- a/api/nodes.go +++ b/api/nodes.go @@ -176,6 +176,9 @@ type Node struct { type DrainStrategy struct { // DrainSpec is the user declared drain specification DrainSpec + + // DeadlineTime is the deadline time for the drain. + DeadlineTime time.Time } // DrainSpec describes a Node's drain behavior. diff --git a/client/testing.go b/client/testing.go index a86728365abe..4043da298738 100644 --- a/client/testing.go +++ b/client/testing.go @@ -21,6 +21,10 @@ func TestClient(t testing.T, cb func(c *config.Config)) *Client { }, } + // Loosen GC threshold + conf.GCDiskUsageThreshold = 98.0 + conf.GCInodeUsageThreshold = 98.0 + // Tighten the fingerprinter timeouts if conf.Options == nil { conf.Options = make(map[string]string) diff --git a/nomad/drainer/drain.go b/nomad/drainer/drain.go index 5f35bca6c0b4..450c4261f3f6 100644 --- a/nomad/drainer/drain.go +++ b/nomad/drainer/drain.go @@ -157,24 +157,41 @@ func (n *NodeDrainer) Run() { } } +// getNextDeadline is a helper that takes a set of draining nodes and returns the +// next deadline. It also returns a boolean if there is a deadline. +func getNextDeadline(nodes map[string]*structs.Node) (time.Time, bool) { + var nextDeadline time.Time + found := false + for _, node := range nodes { + inf, d := node.DrainStrategy.DeadlineTime() + if !inf && (nextDeadline.IsZero() || d.Before(nextDeadline)) { + nextDeadline = d + found = true + } + } + + return nextDeadline, found +} + // nodeDrainer is the core node draining main loop and should be started in a // goroutine when a server establishes leadership. 
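// (Shape of the loop below, for orientation: it seeds its view of the world
// with initDrainer, starts a nodeWatcher and a jobWatcher in goroutines, and
// then selects on draining-node changes, job changes, the drain-deadline
// timer, and ctx cancellation, recomputing the next deadline with
// getNextDeadline whenever the set of draining nodes changes.)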
func (n *NodeDrainer) nodeDrainer(ctx context.Context, state *state.StateStore) { nodes, nodesIndex, drainingJobs, allocsIndex := initDrainer(n.logger, state) // Wait for a node's drain deadline to expire - var nextDeadline time.Time - for _, node := range nodes { - if nextDeadline.IsZero() { - nextDeadline = node.DrainStrategy.DeadlineTime() - continue - } - if deadline := node.DrainStrategy.DeadlineTime(); deadline.Before(nextDeadline) { - nextDeadline = deadline + nextDeadline, ok := getNextDeadline(nodes) + deadlineTimer := time.NewTimer(time.Until(nextDeadline)) + stopDeadlineTimer := func() { + if !deadlineTimer.Stop() { + select { + case <-deadlineTimer.C: + default: + } } - } - deadlineTimer := time.NewTimer(time.Until(nextDeadline)) + if !ok { + stopDeadlineTimer() + } // Watch for nodes to start or stop draining nodeWatcher := newNodeWatcher(n.logger, nodes, nodesIndex, state) @@ -197,33 +214,14 @@ func (n *NodeDrainer) nodeDrainer(ctx context.Context, state *state.StateStore) // update draining nodes n.logger.Printf("[TRACE] nomad.drain: running due to node change (%d nodes draining)", len(nodes)) - // update deadline timer - changed := false - for _, n := range nodes { - if nextDeadline.IsZero() { - nextDeadline = n.DrainStrategy.DeadlineTime() - changed = true - continue - } - - if deadline := n.DrainStrategy.DeadlineTime(); deadline.Before(nextDeadline) { - nextDeadline = deadline - changed = true - } - } - - // if changed reset the timer - if changed { + d, ok := getNextDeadline(nodes) + if ok && !nextDeadline.Equal(d) { + nextDeadline = d n.logger.Printf("[TRACE] nomad.drain: new node deadline: %s", nextDeadline) - if !deadlineTimer.Stop() { - // timer may have been recv'd in a - // previous loop, so don't block - select { - case <-deadlineTimer.C: - default: - } - } + stopDeadlineTimer() deadlineTimer.Reset(time.Until(nextDeadline)) + } else if !ok { + stopDeadlineTimer() } case jobs := <-jobWatcher.WaitCh(): @@ -275,7 +273,8 @@ func (n *NodeDrainer) nodeDrainer(ctx context.Context, state *state.StateStore) // track number of allocs left on this node to be drained allocsLeft := false - deadlineReached := node.DrainStrategy.DeadlineTime().Before(now) + inf, deadline := node.DrainStrategy.DeadlineTime() + deadlineReached := !inf && deadline.Before(now) for _, alloc := range allocs { jobkey := jobKey{alloc.Namespace, alloc.JobID} @@ -307,8 +306,13 @@ func (n *NodeDrainer) nodeDrainer(ctx context.Context, state *state.StateStore) // Don't bother collecting system/batch jobs for nodes that haven't hit their deadline if job.Type != structs.JobTypeService && !deadlineReached { - n.logger.Printf("[TRACE] nomad.drain: not draining %s job %s because deadline isn't for %s", - job.Type, job.Name, node.DrainStrategy.DeadlineTime().Sub(now)) + if inf, d := node.DrainStrategy.DeadlineTime(); inf { + n.logger.Printf("[TRACE] nomad.drain: not draining %s job %s because node has an infinite deadline", + job.Type, job.Name) + } else { + n.logger.Printf("[TRACE] nomad.drain: not draining %s job %s because deadline isn't for %s", + job.Type, job.Name, d.Sub(now)) + } skipJob[jobkey] = struct{}{} continue } @@ -370,7 +374,7 @@ func (n *NodeDrainer) nodeDrainer(ctx context.Context, state *state.StateStore) tgKey := makeTaskGroupKey(alloc) - if node.DrainStrategy.DeadlineTime().Before(now) { + if inf, d := node.DrainStrategy.DeadlineTime(); !inf && d.Before(now) { n.logger.Printf("[TRACE] nomad.drain: draining job %s alloc %s from node %s due to node's drain deadline", drainingJob.job.Name, 
alloc.ID[:6], alloc.NodeID[:6]) // Alloc's Node has reached its deadline stoplist.add(drainingJob.job, alloc) @@ -494,7 +498,7 @@ func initDrainer(logger *log.Logger, state *state.StateStore) (map[string]*struc nodes[node.ID] = node // No point in tracking draining allocs as the deadline has been reached - if node.DrainStrategy.DeadlineTime().Before(now) { + if inf, d := node.DrainStrategy.DeadlineTime(); !inf && d.Before(now) { continue } diff --git a/nomad/drainer/drain_test.go b/nomad/drainer/drain_test.go index 1f38a4c293fc..dd25becccf48 100644 --- a/nomad/drainer/drain_test.go +++ b/nomad/drainer/drain_test.go @@ -59,6 +59,7 @@ func TestNodeDrainer_SimpleDrain(t *testing.T) { serviceJob := mock.Job() serviceJob.Name = "service-job" serviceJob.Type = structs.JobTypeService + serviceJob.Constraints = nil serviceJob.TaskGroups[0].Migrate = &structs.MigrateStrategy{ MaxParallel: 1, HealthCheck: structs.MigrateStrategyHealthStates, @@ -76,6 +77,7 @@ func TestNodeDrainer_SimpleDrain(t *testing.T) { systemJob := mock.SystemJob() systemJob.Name = "system-job" systemJob.Type = structs.JobTypeSystem + systemJob.Constraints = nil //FIXME hack until system job reschedule policy validation is fixed systemJob.TaskGroups[0].ReschedulePolicy = &structs.ReschedulePolicy{Attempts: 1, Interval: time.Minute} systemJob.TaskGroups[0].Tasks[0].Driver = "mock_driver" @@ -90,6 +92,7 @@ func TestNodeDrainer_SimpleDrain(t *testing.T) { batchJob := mock.Job() batchJob.Name = "batch-job" batchJob.Type = structs.JobTypeBatch + batchJob.Constraints = nil batchJob.TaskGroups[0].Name = "batch-group" batchJob.TaskGroups[0].Migrate = nil batchJob.TaskGroups[0].Tasks[0].Name = "batch-task" @@ -159,6 +162,11 @@ func TestNodeDrainer_SimpleDrain(t *testing.T) { t.Logf("%d alloc %s job %s status %s", i, alloc.ID, alloc.Job.Name, alloc.ClientStatus) } } + if resp, err := rpc.EvalList(); err == nil { + for _, eval := range resp.Evaluations { + t.Logf("% #v\n", pretty.Formatter(eval)) + } + } t.Fatalf("failed waiting for all allocs to start: %v", err) }) diff --git a/nomad/drainer/nodewatcher.go b/nomad/drainer/nodewatcher.go index eb54e4995842..5f419ea2ca91 100644 --- a/nomad/drainer/nodewatcher.go +++ b/nomad/drainer/nodewatcher.go @@ -57,18 +57,18 @@ func (n *nodeWatcher) run(ctx context.Context) { for _, newNode := range newNodes { if existingNode, ok := n.nodes[newNode.ID]; ok { // Node was draining, see if it has changed - if !newNode.Drain { + if newNode.DrainStrategy == nil { // Node stopped draining delete(n.nodes, newNode.ID) changed = true - } else if !newNode.DrainStrategy.DeadlineTime().Equal(existingNode.DrainStrategy.DeadlineTime()) { + } else if !newNode.DrainStrategy.Equal(existingNode.DrainStrategy) { // Update deadline n.nodes[newNode.ID] = newNode changed = true } } else { // Node was not draining - if newNode.Drain { + if newNode.DrainStrategy != nil { // Node started draining n.nodes[newNode.ID] = newNode changed = true diff --git a/nomad/node_endpoint.go b/nomad/node_endpoint.go index 5cf5aa587d7e..6cfe62ae7e5c 100644 --- a/nomad/node_endpoint.go +++ b/nomad/node_endpoint.go @@ -443,6 +443,11 @@ func (n *Node) UpdateDrain(args *structs.NodeUpdateDrainRequest, } } + // Mark the deadline time + if args.DrainStrategy != nil && args.DrainStrategy.Deadline.Nanoseconds() > 0 { + args.DrainStrategy.ForceDeadline = time.Now().Add(args.DrainStrategy.Deadline) + } + // Commit this update via Raft _, index, err := n.srv.raftApply(structs.NodeUpdateDrainRequestType, args) if err != nil { diff --git 
a/nomad/structs/structs.go b/nomad/structs/structs.go index e1d9b077752d..018b96c422ec 100644 --- a/nomad/structs/structs.go +++ b/nomad/structs/structs.go @@ -1214,9 +1214,9 @@ type DrainStrategy struct { // DrainSpec is the user declared drain specification DrainSpec - // StartTime as nanoseconds since Unix epoch indicating when a drain - // began for deadline calcuations. - StartTime int64 + // ForceDeadline is the deadline time for the drain after which drains will + // be forced + ForceDeadline time.Time } func (d *DrainStrategy) Copy() *DrainStrategy { @@ -1229,16 +1229,47 @@ func (d *DrainStrategy) Copy() *DrainStrategy { return nd } -// DeadlineTime returns the Time this drain's deadline will be reached or the -// zero value for Time if DrainStrategy is nil or Duration is <= 0. -func (d *DrainStrategy) DeadlineTime() time.Time { +// DeadlineTime returns a boolean whether the drain strategy allows an infinite +// duration or otherwise the deadline time. The force drain is captured by the +// deadline time being in the past. +func (d *DrainStrategy) DeadlineTime() (infinite bool, deadline time.Time) { + // Treat the nil case as a force drain so during an upgrade where a node may + // not have a drain strategy but has Drain set to true, it is treated as a + // force to mimick old behavior. if d == nil { - return time.Time{} + return false, time.Time{} } - if d.Deadline <= 0 { - return time.Time{} + + ns := d.Deadline.Nanoseconds() + switch { + case ns < 0: // Force + return false, time.Time{} + case ns == 0: // Infinite + return true, time.Time{} + default: + return false, d.ForceDeadline + } +} + +func (d *DrainStrategy) Equal(o *DrainStrategy) bool { + if d == nil && o == nil { + return true + } else if o != nil && d == nil { + return false + } else if d != nil && o == nil { + return false } - return time.Unix(0, d.StartTime).Add(d.Deadline) + + // Compare values + if d.ForceDeadline != o.ForceDeadline { + return false + } else if d.Deadline != o.Deadline { + return false + } else if d.IgnoreSystemJobs != o.IgnoreSystemJobs { + return false + } + + return true } // Node is a representation of a schedulable client node From 3ca9cdfadc5988d134fc16a52c933c6c7e3ef67a Mon Sep 17 00:00:00 2001 From: Michael Schurter Date: Tue, 27 Feb 2018 15:51:09 -0800 Subject: [PATCH 30/79] client: don't monitor health of non-service jobs Also fix system job draining; won't work without deadline fixes --- client/alloc_runner_health_watcher.go | 15 +- nomad/drainer/drain.go | 121 +++++++---- nomad/drainer/drain_test.go | 282 +++++++++++++++++++++++--- testutil/rpcapi/rcpapi.go | 18 ++ 4 files changed, 357 insertions(+), 79 deletions(-) diff --git a/client/alloc_runner_health_watcher.go b/client/alloc_runner_health_watcher.go index db9164740319..bdb7eaa82261 100644 --- a/client/alloc_runner_health_watcher.go +++ b/client/alloc_runner_health_watcher.go @@ -31,25 +31,24 @@ func (r *AllocRunner) watchHealth(ctx context.Context) { // See if we should watch the allocs health alloc := r.Alloc() - if alloc.Job.Type == structs.JobTypeSystem || alloc.DeploymentStatus.IsHealthy() || alloc.DeploymentStatus.IsUnhealthy() { - // Neither deployments nor migrations apply to system jobs and - // we don't need to track allocations which already have a - // status + if alloc.Job.Type != structs.JobTypeService { + // No need to watch non-service jos return } - isDeploy := alloc.DeploymentID != "" - - if isDeploy && alloc.Job.Type != structs.JobTypeService { - // Deployments don't track non-Service jobs + if 
alloc.DeploymentStatus.IsHealthy() || alloc.DeploymentStatus.IsUnhealthy() { + // No need to watch health as it's already set return } + isDeploy := alloc.DeploymentID != "" + tg := alloc.Job.LookupTaskGroup(alloc.TaskGroup) if tg == nil { r.logger.Printf("[ERR] client.alloc_watcher: failed to lookup allocation's task group. Exiting watcher") return } + if isDeploy && (tg.Update == nil || tg.Update.HealthCheck == structs.UpdateStrategyHealthCheck_Manual) { return } diff --git a/nomad/drainer/drain.go b/nomad/drainer/drain.go index 450c4261f3f6..e52a735aaf13 100644 --- a/nomad/drainer/drain.go +++ b/nomad/drainer/drain.go @@ -18,13 +18,13 @@ type jobKey struct { jobid string } -// drainingJob contains the Job and allocations for that job meant to be used +// runningJob contains the Job and allocations for that job meant to be used // when collecting all allocations for a job with at least one allocation on a // draining node. // -// This allows the MaxParallel calculation to take the entire job's allocation -// state into account. FIXME is that even useful? -type drainingJob struct { +// In order to drain an allocation we must also emit an evaluation for its job, +// so this struct bundles allocations with their job. +type runningJob struct { job *structs.Job allocs []*structs.Allocation } @@ -247,17 +247,16 @@ func (n *NodeDrainer) nodeDrainer(ctx context.Context, state *state.StateStore) // job key -> {job, allocs} // Collect all allocs for all jobs with at least one - // alloc on a draining node. + // non-terminal alloc on a draining node. // Invariants: - // - No system jobs - // - No batch jobs unless their node's deadline is reached + // - Only service jobs // - No entries with 0 allocs //TODO could this be a helper method on prevAllocWatcher - drainable := map[jobKey]*drainingJob{} + drainableSvcs := map[jobKey]*runningJob{} - // track jobs we've looked up before and know we shouldn't - // consider for draining eg system jobs - skipJob := map[jobKey]struct{}{} + // drainNow are allocs for batch or system jobs that should be + // drained due to a node deadline being reached + drainNow := map[jobKey]*runningJob{} // track number of "up" allocs per task group (not terminal and // have a deployment status) @@ -271,22 +270,21 @@ func (n *NodeDrainer) nodeDrainer(ctx context.Context, state *state.StateStore) panic(err) } + // drainableSys are allocs for system jobs that should be + // drained if there are no other allocs left + drainableSys := map[jobKey]*runningJob{} + // track number of allocs left on this node to be drained allocsLeft := false inf, deadline := node.DrainStrategy.DeadlineTime() deadlineReached := !inf && deadline.Before(now) for _, alloc := range allocs { - jobkey := jobKey{alloc.Namespace, alloc.JobID} - - if _, ok := drainable[jobkey]; ok { - // already found + // Don't need to consider drained allocs + if alloc.TerminalStatus() { continue } - if _, ok := skipJob[jobkey]; ok { - // already looked up and skipped - continue - } + jobkey := jobKey{alloc.Namespace, alloc.JobID} // job does not found yet job, err := snapshot.JobByID(nil, alloc.Namespace, alloc.JobID) @@ -295,28 +293,49 @@ func (n *NodeDrainer) nodeDrainer(ctx context.Context, state *state.StateStore) panic(err) } - // If alloc isn't yet terminal this node has - // allocs left to be drained - if !alloc.TerminalStatus() { - if !allocsLeft { - n.logger.Printf("[TRACE] nomad.drain: node %s has allocs left to drain", nodeID[:6]) - allocsLeft = true + // IgnoreSystemJobs if specified in the node's DrainStrategy + 
if node.DrainStrategy.IgnoreSystemJobs && job.Type == structs.JobTypeSystem { + continue + } + + // When the node deadline is reached all batch + // and service jobs will be drained + if deadlineReached && job.Type != structs.JobTypeService { + n.logger.Printf("[TRACE] nomad.drain: draining alloc %s due to node %s reaching drain deadline", alloc.ID, node.ID) + if j, ok := drainNow[jobkey]; ok { + j.allocs = append(j.allocs, alloc) + } else { + // First alloc for this job, create entry + drainNow[jobkey] = &runningJob{ + job: job, + allocs: []*structs.Allocation{alloc}, + } } + continue } - // Don't bother collecting system/batch jobs for nodes that haven't hit their deadline - if job.Type != structs.JobTypeService && !deadlineReached { - if inf, d := node.DrainStrategy.DeadlineTime(); inf { - n.logger.Printf("[TRACE] nomad.drain: not draining %s job %s because node has an infinite deadline", - job.Type, job.Name) + // If deadline hasn't been reached, system jobs + // may still be drained if there are no other + // allocs left + if !deadlineReached && job.Type == structs.JobTypeSystem { + n.logger.Printf("[TRACE] nomad.drain: system alloc %s will be drained if no other allocs on node %s", alloc.ID, node.ID) + if j, ok := drainableSys[jobkey]; ok { + j.allocs = append(j.allocs, alloc) } else { - n.logger.Printf("[TRACE] nomad.drain: not draining %s job %s because deadline isn't for %s", - job.Type, job.Name, d.Sub(now)) + // First alloc for this job, create entry + drainableSys[jobkey] = &runningJob{ + job: job, + allocs: []*structs.Allocation{alloc}, + } } - skipJob[jobkey] = struct{}{} continue } + // This alloc is still running on a draining + // node, so treat the node as having allocs + // remaining + allocsLeft = true + jobAllocs, err := snapshot.AllocsByJob(nil, alloc.Namespace, alloc.JobID, true) if err != nil { //FIXME @@ -328,14 +347,15 @@ func (n *NodeDrainer) nodeDrainer(ctx context.Context, state *state.StateStore) num := 0 for _, a := range jobAllocs { if !a.TerminalStatus() && a.DeploymentStatus != nil { + // Not terminal and health updated, count it as up! upPerTG[makeTaskGroupKey(a)]++ num++ } } - n.logger.Printf("[TRACE] nomad.drain: job %s has %d task groups running", job.Name, num) + n.logger.Printf("[TRACE] nomad.drain: job %s has %d allocs running", job.Name, num) } - drainable[jobkey] = &drainingJob{ + drainableSvcs[jobkey] = &runningJob{ job: job, allocs: jobAllocs, } @@ -348,6 +368,17 @@ func (n *NodeDrainer) nodeDrainer(ctx context.Context, state *state.StateStore) n.logger.Printf("[TRACE] nomad.drain: node %s has no more allocs left to drain or has reached deadline", nodeID) jobWatcher.nodeDone(nodeID) doneNodes[nodeID] = node + + // Add all system jobs on this node to the drainNow slice + for k, sysj := range drainableSys { + if j, ok := drainNow[k]; ok { + // Job already has at least one alloc draining, append this one + j.allocs = append(j.allocs, sysj.allocs...) 
+ } else { + // First draining alloc for this job, add the entry + drainNow[k] = sysj + } + } } } @@ -358,8 +389,15 @@ func (n *NodeDrainer) nodeDrainer(ctx context.Context, state *state.StateStore) jobBatch: make(map[jobKey]*structs.Job), } + // Immediately drain all allocs in drainNow + for _, drainingJob := range drainNow { + for _, a := range drainingJob.allocs { + stoplist.add(drainingJob.job, a) + } + } + // build drain list considering deadline & max_parallel - for _, drainingJob := range drainable { + for _, drainingJob := range drainableSvcs { for _, alloc := range drainingJob.allocs { // Already draining/dead allocs don't need to be drained if alloc.TerminalStatus() { @@ -383,13 +421,6 @@ func (n *NodeDrainer) nodeDrainer(ctx context.Context, state *state.StateStore) continue } - // Batch/System jobs are only stopped when the - // node deadline is reached which has already - // been done. - if drainingJob.job.Type != structs.JobTypeService { - continue - } - // Stop allocs with count=1, max_parallel==0, or draining 0 { n.logger.Printf("[DEBUG] nomad.drain: stopping %d alloc(s) for %d job(s)", len(stoplist.allocBatch), len(stoplist.jobBatch)) + for id, _ := range stoplist.allocBatch { + n.logger.Printf("[TRACE] nomad.drain: migrating alloc %s", id[:6]) + } + // Reevaluate affected jobs evals := make([]*structs.Evaluation, 0, len(stoplist.jobBatch)) for _, job := range stoplist.jobBatch { diff --git a/nomad/drainer/drain_test.go b/nomad/drainer/drain_test.go index dd25becccf48..8361a56593ca 100644 --- a/nomad/drainer/drain_test.go +++ b/nomad/drainer/drain_test.go @@ -9,7 +9,6 @@ import ( "testing" "time" - msgpackrpc "github.com/hashicorp/net-rpc-msgpackrpc" "github.com/hashicorp/nomad/client" "github.com/hashicorp/nomad/client/config" "github.com/hashicorp/nomad/helper/pool" @@ -40,6 +39,7 @@ func rpcClient(t *testing.T, conf *nomad.Config) rpc.ClientCodec { // TestNodeDrainer_SimpleDrain asserts that draining when there are two nodes // moves allocs from the draining node to the other node. 
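// (Outline of the scenario below: register service, system, and batch jobs so
// they land on a single client, start draining that node with a negative
// deadline, bring up a second client, then assert the allocs migrate, the
// drained allocs stop, and the node's drain flag is cleared when done.)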
func TestNodeDrainer_SimpleDrain(t *testing.T) { + assert := assert.New(t) require := require.New(t) // Capture test servers config @@ -78,8 +78,6 @@ func TestNodeDrainer_SimpleDrain(t *testing.T) { systemJob.Name = "system-job" systemJob.Type = structs.JobTypeSystem systemJob.Constraints = nil - //FIXME hack until system job reschedule policy validation is fixed - systemJob.TaskGroups[0].ReschedulePolicy = &structs.ReschedulePolicy{Attempts: 1, Interval: time.Minute} systemJob.TaskGroups[0].Tasks[0].Driver = "mock_driver" systemJob.TaskGroups[0].Tasks[0].Config = map[string]interface{}{ "run_for": "10m", @@ -111,28 +109,20 @@ func TestNodeDrainer_SimpleDrain(t *testing.T) { conf.Servers = []string{serverConfig.RPCAddr.String()} }) defer c1.Shutdown() + node1ID := c1.NodeID() // Start jobs so they all get placed on node 1 codec := rpcClient(t, serverConfig) + rpc := rpcapi.NewRPC(codec) for _, job := range []*structs.Job{systemJob, serviceJob, batchJob} { - req := &structs.JobRegisterRequest{ - Job: job.Copy(), - WriteRequest: structs.WriteRequest{ - Region: "global", - Namespace: job.Namespace, - }, - } - - // Fetch the response - var resp structs.JobRegisterResponse - require.Nil(msgpackrpc.CallWithCodec(codec, "Job.Register", req, &resp)) + resp, err := rpc.JobRegister(job) + require.Nil(err) require.NotZero(resp.Index) } // Wait for jobs to start on c1 - rpc := rpcapi.NewRPC(codec) testutil.WaitForResult(func() (bool, error) { - resp, err := rpc.NodeGetAllocs(c1.NodeID()) + resp, err := rpc.NodeGetAllocs(node1ID) if err != nil { return false, err } @@ -157,7 +147,7 @@ func TestNodeDrainer_SimpleDrain(t *testing.T) { } return true, nil }, func(err error) { - if resp, err := rpc.NodeGetAllocs(c1.NodeID()); err == nil { + if resp, err := rpc.NodeGetAllocs(node1ID); err == nil { for i, alloc := range resp.Allocs { t.Logf("%d alloc %s job %s status %s", i, alloc.ID, alloc.Job.Name, alloc.ClientStatus) } @@ -170,27 +160,28 @@ func TestNodeDrainer_SimpleDrain(t *testing.T) { t.Fatalf("failed waiting for all allocs to start: %v", err) }) - // Start draining node 1 - //FIXME update drain rpc to skip fsm manipulation and use api + // Start draining node 1 with no deadline strategy := &structs.DrainStrategy{ DrainSpec: structs.DrainSpec{ Deadline: -1 * time.Second, }, } - node, err := state.NodeByID(nil, c1.NodeID()) + node1Resp, err := rpc.NodeGet(node1ID) require.Nil(err) - require.Nil(state.UpdateNodeDrain(node.ModifyIndex+1, node.ID, strategy)) + node1 := node1Resp.Node + require.Nil(state.UpdateNodeDrain(node1.ModifyIndex+1, node1ID, strategy)) // Start node 2 c2 := client.TestClient(t, func(conf *config.Config) { - conf.NetworkSpeed = 10000 + conf.LogOutput = testlog.NewWriter(t) conf.Servers = []string{serverConfig.RPCAddr.String()} }) defer c2.Shutdown() + node2ID := c2.NodeID() // Wait for services to be migrated testutil.WaitForResult(func() (bool, error) { - resp, err := rpc.NodeGetAllocs(c2.NodeID()) + resp, err := rpc.NodeGetAllocs(node2ID) if err != nil { return false, err } @@ -215,7 +206,7 @@ func TestNodeDrainer_SimpleDrain(t *testing.T) { } return true, nil }, func(err error) { - if resp, err := rpc.NodeGetAllocs(c2.NodeID()); err == nil { + if resp, err := rpc.NodeGetAllocs(node2ID); err == nil { for i, alloc := range resp.Allocs { t.Logf("%d alloc %s job %s status %s prev %s", i, alloc.ID, alloc.Job.Name, alloc.ClientStatus, alloc.PreviousAllocation) } @@ -223,12 +214,247 @@ func TestNodeDrainer_SimpleDrain(t *testing.T) { t.Errorf("failed waiting for all allocs to migrate: 
%v", err) }) - node1, err := rpc.NodeGet(c1.NodeID()) + // Wait for drained services to be dead + testutil.WaitForResult(func() (bool, error) { + resp, err := rpc.NodeGetAllocs(c1.NodeID()) + if err != nil { + return false, err + } + + running := make([]string, 0, len(resp.Allocs)) + for _, alloc := range resp.Allocs { + if alloc.ClientStatus == structs.AllocClientStatusRunning { + running = append(running, alloc.ID[:6]) + } + } + + if len(running) > 0 { + return false, fmt.Errorf("%d alloc(s) on draining node %s still running: %s", len(running), c1.NodeID()[:6], running) + } + return true, nil + }, func(err error) { + t.Errorf("failed waiting for all draining allocs to stop: %v", err) + }) + + node1Resp, err = rpc.NodeGet(node1ID) + require.Nil(err) + node1 = node1Resp.Node + assert.False(node1.Drain) + assert.Nil(node1.DrainStrategy) + assert.Equal(structs.NodeSchedulingIneligible, node1.SchedulingEligibility) + + jobs, err := rpc.JobList() + require.Nil(err) + t.Logf("--> %d jobs", len(jobs.Jobs)) + for _, job := range jobs.Jobs { + t.Logf("job: %s status: %s %s", job.Name, job.Status, job.StatusDescription) + } + + allocs, err := rpc.AllocAll() + require.Nil(err) + + sort.Slice(allocs, func(i, j int) bool { + r := strings.Compare(allocs[i].Job.Name, allocs[j].Job.Name) + switch { + case r < 0: + return true + case r == 0: + return allocs[i].ModifyIndex < allocs[j].ModifyIndex + case r > 0: + return false + } + panic("unreachable") + }) + + t.Logf("--> %d allocs", len(allocs)) + for _, alloc := range allocs { + t.Logf("job: %s node: %s alloc: %s desired_status: %s desired_transition: %s actual: %s replaces: %s", + alloc.Job.Name, alloc.NodeID[:6], alloc.ID[:6], alloc.DesiredStatus, pretty.Sprint(alloc.DesiredTransition.Migrate), alloc.ClientStatus, alloc.PreviousAllocation) + } +} + +// TestNodeDrainer_SystemDrain asserts system jobs are drained +func TestNodeDrainer_SystemDrain(t *testing.T) { assert := assert.New(t) + require := require.New(t) + + // Capture test servers config + var serverConfig *nomad.Config + server := nomad.TestServer(t, func(c *nomad.Config) { + serverConfig = c + }) + defer server.Shutdown() + + testutil.WaitForLeader(t, server.RPC) + + // Setup 2 Nodes: A & B; A has allocs and is draining + + // Create mock jobs + state := server.State() + + serviceJob := mock.Job() + serviceJob.Name = "service-job" + serviceJob.Type = structs.JobTypeService + serviceJob.TaskGroups[0].Count = 2 + serviceJob.TaskGroups[0].Migrate = &structs.MigrateStrategy{ + MaxParallel: 1, + HealthCheck: structs.MigrateStrategyHealthStates, + MinHealthyTime: time.Millisecond, + HealthyDeadline: 2 * time.Second, + } + serviceJob.TaskGroups[0].Tasks[0].Driver = "mock_driver" + serviceJob.TaskGroups[0].Tasks[0].Resources = structs.MinResources() + serviceJob.TaskGroups[0].Tasks[0].Config = map[string]interface{}{ + "run_for": "10m", + "kill_after": "1ms", + } + serviceJob.TaskGroups[0].Tasks[0].Services = nil + + systemJob := mock.SystemJob() + systemJob.Name = "system-job" + systemJob.Type = structs.JobTypeSystem + systemJob.TaskGroups[0].Tasks[0].Driver = "mock_driver" + systemJob.TaskGroups[0].Tasks[0].Config = map[string]interface{}{ + "run_for": "10m", + "kill_after": "1ms", + } + systemJob.TaskGroups[0].Tasks[0].Resources = structs.MinResources() + systemJob.TaskGroups[0].Tasks[0].Services = nil + + // Start node 1 + c1 := client.TestClient(t, func(conf *config.Config) { + conf.LogOutput = testlog.NewWriter(t) + conf.Servers = []string{serverConfig.RPCAddr.String()} + }) + defer 
c1.Shutdown() + node1ID := c1.NodeID() + + // Start jobs so they all get placed on node 1 + codec := rpcClient(t, serverConfig) + rpc := rpcapi.NewRPC(codec) + for _, job := range []*structs.Job{systemJob, serviceJob} { + resp, err := rpc.JobRegister(job) + require.Nil(err) + require.NotZero(resp.Index) + } + + // Wait for jobs to start on c1 + testutil.WaitForResult(func() (bool, error) { + resp, err := rpc.NodeGetAllocs(c1.NodeID()) + if err != nil { + return false, err + } + + system, service := 0, 0 + for _, alloc := range resp.Allocs { + if alloc.ClientStatus != structs.AllocClientStatusRunning { + return false, fmt.Errorf("alloc %s for job %s not running: %s", alloc.ID, alloc.Job.Name, alloc.ClientStatus) + } + switch alloc.JobID { + case serviceJob.ID: + service++ + case systemJob.ID: + system++ + default: + return false, fmt.Errorf("unknown job: %s", alloc.Job.Name) + } + } + // 1 system + 2 service = 3 + if system+service != 3 { + return false, fmt.Errorf("wrong number of allocs: system %d/1, service %d/2", system, service) + } + return true, nil + }, func(err error) { + if resp, err := rpc.NodeGetAllocs(c1.NodeID()); err == nil { + for i, alloc := range resp.Allocs { + t.Logf("%d alloc %s job %s status %s", i, alloc.ID, alloc.Job.Name, alloc.ClientStatus) + } + } + t.Fatalf("failed waiting for all allocs to start: %v", err) + }) + + // Start draining node 1 + strategy := &structs.DrainStrategy{ + DrainSpec: structs.DrainSpec{ + Deadline: 1 * time.Hour, + }, + } + node1Resp, err := rpc.NodeGet(node1ID) + require.Nil(err) + node1 := node1Resp.Node + require.Nil(state.UpdateNodeDrain(node1.ModifyIndex+1, node1ID, strategy)) + + // Start node 2 + c2 := client.TestClient(t, func(conf *config.Config) { + conf.LogOutput = testlog.NewWriter(t) + conf.Servers = []string{serverConfig.RPCAddr.String()} + }) + defer c2.Shutdown() + node2ID := c2.NodeID() + + // Wait for services to be migrated + testutil.WaitForResult(func() (bool, error) { + resp, err := rpc.NodeGetAllocs(node2ID) + if err != nil { + return false, err + } + + system, service := 0, 0 + for _, alloc := range resp.Allocs { + if alloc.ClientStatus != structs.AllocClientStatusRunning { + return false, fmt.Errorf("alloc %s for job %s not running: %s", alloc.ID, alloc.Job.Name, alloc.ClientStatus) + } + switch alloc.JobID { + case serviceJob.ID: + service++ + case systemJob.ID: + system++ + default: + return false, fmt.Errorf("unknown job: %s", alloc.Job.Name) + } + } + // 1 system + 2 service = 3 + if system+service != 3 { + return false, fmt.Errorf("wrong number of allocs: system %d/1, service %d/2", system, service) + } + return true, nil + }, func(err error) { + if resp, err := rpc.NodeGetAllocs(node2ID); err == nil { + for i, alloc := range resp.Allocs { + t.Logf("%d alloc %s job %s status %s prev %s", i, alloc.ID, alloc.Job.Name, alloc.ClientStatus, alloc.PreviousAllocation) + } + } + t.Errorf("failed waiting for all allocs to migrate: %v", err) + }) + + // Wait for drained services to be dead + testutil.WaitForResult(func() (bool, error) { + resp, err := rpc.NodeGetAllocs(node1ID) + if err != nil { + return false, err + } + + running := make([]string, 0, len(resp.Allocs)) + for _, alloc := range resp.Allocs { + if alloc.ClientStatus == structs.AllocClientStatusRunning { + running = append(running, alloc.ID[:6]) + } + } + + if len(running) > 0 { + return false, fmt.Errorf("%d alloc(s) on draining node %s still running: %s", len(running), node1ID[:6], running) + } + return true, nil + }, func(err error) { + t.Errorf("failed 
waiting for all draining allocs to stop: %v", err) + }) + + node1Resp, err = rpc.NodeGet(node1ID) require.Nil(err) - assert.False(node1.Node.Drain) - assert.Nil(node1.Node.DrainStrategy) - assert.Equal(structs.NodeSchedulingIneligible, node1.Node.SchedulingEligibility) + node1 = node1Resp.Node + assert.False(node1.Drain) + assert.Nil(node1.DrainStrategy) + assert.Equal(structs.NodeSchedulingIneligible, node1.SchedulingEligibility) jobs, err := rpc.JobList() require.Nil(err) diff --git a/testutil/rpcapi/rcpapi.go b/testutil/rpcapi/rcpapi.go index 795123fdabcc..1eafabccbdb3 100644 --- a/testutil/rpcapi/rcpapi.go +++ b/testutil/rpcapi/rcpapi.go @@ -103,6 +103,24 @@ func (r *RPC) JobList() (*structs.JobListResponse, error) { return &resp, nil } +// Job.Register RPC +func (r *RPC) JobRegister(j *structs.Job) (*structs.JobRegisterResponse, error) { + req := &structs.JobRegisterRequest{ + Job: j.Copy(), + WriteRequest: structs.WriteRequest{ + Region: r.Region, + Namespace: j.Namespace, + }, + } + + // Fetch the response + var resp structs.JobRegisterResponse + if err := msgpackrpc.CallWithCodec(r.codec, "Job.Register", req, &resp); err != nil { + return nil, err + } + return &resp, nil +} + // Node.List RPC func (r *RPC) NodeList() (*structs.NodeListResponse, error) { get := &structs.NodeListRequest{ From 1f73cd5d4264e86427ff58bea5c7140419f0a633 Mon Sep 17 00:00:00 2001 From: Michael Schurter Date: Wed, 28 Feb 2018 16:42:29 -0800 Subject: [PATCH 31/79] drainer: refactor newStopAllocs, applyMigrations --- nomad/drainer/drain.go | 93 +++++++++++++++++++++---------------- nomad/drainer/drain_test.go | 4 ++ 2 files changed, 56 insertions(+), 41 deletions(-) diff --git a/nomad/drainer/drain.go b/nomad/drainer/drain.go index e52a735aaf13..4f2b73556cff 100644 --- a/nomad/drainer/drain.go +++ b/nomad/drainer/drain.go @@ -51,7 +51,8 @@ func makeTaskGroupKey(a *structs.Allocation) string { return strings.Join([]string{a.Namespace, a.JobID, a.TaskGroup}, "-") } -// stopAllocs tracks allocs to drain by a unique TG key +// stopAllocs tracks allocs to drain by a unique TG key along with their jobs +// as we need to emit evaluations for each allocations job type stopAllocs struct { allocBatch map[string]*structs.DesiredTransition @@ -59,6 +60,25 @@ type stopAllocs struct { jobBatch map[jobKey]*structs.Job } +// newStopAllocs creates a list of allocs to migrate from an initial list of +// running jobs+allocs that need immediate draining. +func newStopAllocs(initial map[jobKey]*runningJob) *stopAllocs { + s := &stopAllocs{ + allocBatch: make(map[string]*structs.DesiredTransition), + jobBatch: make(map[jobKey]*structs.Job), + } + + // Add initial allocs + for _, drainingJob := range initial { + for _, a := range drainingJob.allocs { + s.add(drainingJob.job, a) + } + } + return s +} + +// add an allocation to be migrated. Its job must also be specified in order to +// emit an evaluation. func (s *stopAllocs) add(j *structs.Job, a *structs.Allocation) { // Add the desired migration transition to the batch s.allocBatch[a.ID] = &structs.DesiredTransition{ @@ -203,11 +223,6 @@ func (n *NodeDrainer) nodeDrainer(ctx context.Context, state *state.StateStore) go jobWatcher.run(ctx) for { - //TODO this method of async node updates means we could make - //migration decisions on out of date information. 
the worst - //possible outcome of this is that an allocation could be - //stopped on a node that recently had its drain cancelled which - //doesn't seem like that bad of a pathological case n.logger.Printf("[TRACE] nomad.drain: LOOP next deadline: %s (%s)", nextDeadline, time.Until(nextDeadline)) select { case nodes = <-nodeWatcher.nodesCh: @@ -383,18 +398,9 @@ func (n *NodeDrainer) nodeDrainer(ctx context.Context, state *state.StateStore) } // stoplist are the allocations to migrate and their jobs to emit - // evaluations for - stoplist := &stopAllocs{ - allocBatch: make(map[string]*structs.DesiredTransition), - jobBatch: make(map[jobKey]*structs.Job), - } - - // Immediately drain all allocs in drainNow - for _, drainingJob := range drainNow { - for _, a := range drainingJob.allocs { - stoplist.add(drainingJob.job, a) - } - } + // evaluations for. Initialized with allocations that should be + // immediately drained regardless of MaxParallel + stoplist := newStopAllocs(drainNow) // build drain list considering deadline & max_parallel for _, drainingJob := range drainableSvcs { @@ -456,29 +462,7 @@ func (n *NodeDrainer) nodeDrainer(ctx context.Context, state *state.StateStore) } if len(stoplist.allocBatch) > 0 { - n.logger.Printf("[DEBUG] nomad.drain: stopping %d alloc(s) for %d job(s)", len(stoplist.allocBatch), len(stoplist.jobBatch)) - - for id, _ := range stoplist.allocBatch { - n.logger.Printf("[TRACE] nomad.drain: migrating alloc %s", id[:6]) - } - - // Reevaluate affected jobs - evals := make([]*structs.Evaluation, 0, len(stoplist.jobBatch)) - for _, job := range stoplist.jobBatch { - evals = append(evals, &structs.Evaluation{ - ID: uuid.Generate(), - Namespace: job.Namespace, - Priority: job.Priority, - Type: job.Type, - TriggeredBy: structs.EvalTriggerNodeDrain, - JobID: job.ID, - JobModifyIndex: job.ModifyIndex, - Status: structs.EvalStatusPending, - }) - } - - // Commit this update via Raft - if err := n.raft.AllocUpdateDesiredTransition(stoplist.allocBatch, evals); err != nil { + if err := n.applyMigrations(stoplist); err != nil { //FIXME panic(err) } @@ -497,6 +481,33 @@ func (n *NodeDrainer) nodeDrainer(ctx context.Context, state *state.StateStore) } } +// applyMigrations applies the specified allocation migrations along with their +// evaluations to raft. +func (n *NodeDrainer) applyMigrations(stoplist *stopAllocs) error { + n.logger.Printf("[DEBUG] nomad.drain: stopping %d alloc(s) for %d job(s)", len(stoplist.allocBatch), len(stoplist.jobBatch)) + + for id, _ := range stoplist.allocBatch { + n.logger.Printf("[TRACE] nomad.drain: migrating alloc %s", id[:6]) + } + // Reevaluate affected jobs + evals := make([]*structs.Evaluation, 0, len(stoplist.jobBatch)) + for _, job := range stoplist.jobBatch { + evals = append(evals, &structs.Evaluation{ + ID: uuid.Generate(), + Namespace: job.Namespace, + Priority: job.Priority, + Type: job.Type, + TriggeredBy: structs.EvalTriggerNodeDrain, + JobID: job.ID, + JobModifyIndex: job.ModifyIndex, + Status: structs.EvalStatusPending, + }) + } + + // Commit this update via Raft + return n.raft.AllocUpdateDesiredTransition(stoplist.allocBatch, evals) +} + // initDrainer initializes the node drainer state and returns a list of // draining nodes as well as allocs that are draining that should be watched // for a replacement. 
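A minimal sketch of how the newStopAllocs/add helpers above could be unit
tested, assuming the drainer package's existing mock fixtures; the test name
and assertions are illustrative, not part of this patch:

func TestNewStopAllocs_Sketch(t *testing.T) {
    job := mock.Job()
    alloc := mock.Alloc()
    alloc.Namespace = job.Namespace
    alloc.JobID = job.ID

    initial := map[jobKey]*runningJob{
        {job.Namespace, job.ID}: {job: job, allocs: []*structs.Allocation{alloc}},
    }

    stoplist := newStopAllocs(initial)

    // Every initial alloc should be marked with a migrate transition...
    transition, ok := stoplist.allocBatch[alloc.ID]
    if !ok || transition.Migrate == nil || !*transition.Migrate {
        t.Fatalf("expected alloc %s to be marked for migration", alloc.ID)
    }

    // ...and its job batched so an evaluation is emitted for it.
    if _, ok := stoplist.jobBatch[jobKey{job.Namespace, job.ID}]; !ok {
        t.Fatalf("expected job %s to be batched for evaluation", job.ID)
    }
}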
diff --git a/nomad/drainer/drain_test.go b/nomad/drainer/drain_test.go index 8361a56593ca..f92f2503e14f 100644 --- a/nomad/drainer/drain_test.go +++ b/nomad/drainer/drain_test.go @@ -271,6 +271,8 @@ func TestNodeDrainer_SimpleDrain(t *testing.T) { t.Logf("job: %s node: %s alloc: %s desired_status: %s desired_transition: %s actual: %s replaces: %s", alloc.Job.Name, alloc.NodeID[:6], alloc.ID[:6], alloc.DesiredStatus, pretty.Sprint(alloc.DesiredTransition.Migrate), alloc.ClientStatus, alloc.PreviousAllocation) } + + t.Logf("==> PASS") } // TestNodeDrainer_SystemDrain asserts system jobs are drained @@ -484,4 +486,6 @@ func TestNodeDrainer_SystemDrain(t *testing.T) { t.Logf("job: %s node: %s alloc: %s desired_status: %s desired_transition: %s actual: %s replaces: %s", alloc.Job.Name, alloc.NodeID[:6], alloc.ID[:6], alloc.DesiredStatus, pretty.Sprint(alloc.DesiredTransition.Migrate), alloc.ClientStatus, alloc.PreviousAllocation) } + + t.Logf("==> PASS") } From 478209807e10fcb2c6592e62d36bce2334aeb74b Mon Sep 17 00:00:00 2001 From: Michael Schurter Date: Wed, 28 Feb 2018 20:59:41 -0800 Subject: [PATCH 32/79] refactor main drainloop into 2 more methods --- nomad/drainer/drain.go | 405 +++++++++++++++++++++-------------------- 1 file changed, 212 insertions(+), 193 deletions(-) diff --git a/nomad/drainer/drain.go b/nomad/drainer/drain.go index 4f2b73556cff..8db56ac7dacc 100644 --- a/nomad/drainer/drain.go +++ b/nomad/drainer/drain.go @@ -29,21 +29,28 @@ type runningJob struct { allocs []*structs.Allocation } -// drainingAlloc contains a conservative deadline an alloc has to be healthy by -// before it should stopped being watched and replaced. -type drainingAlloc struct { - // LastModified+MigrateStrategy.HealthyDeadline - deadline time.Time - - // Task Group key - tgKey string -} - -func newDrainingAlloc(a *structs.Allocation, deadline time.Time) drainingAlloc { - return drainingAlloc{ - deadline: deadline, - tgKey: makeTaskGroupKey(a), - } +// collectResult is the state collected by scanning for drain eligible allocs +type collectResult struct { + // drainableSvcs contains all service jobs and allocs that are + // potentially drainable meaning they have at least one allocation on a + // draining node. + drainableSvcs map[jobKey]*runningJob + + // drainNow contains all batch and system jobs that should be + // immediately drained due to a deadline or in the case of system jobs: + // all other allocs on the node have completed draining. + drainNow map[jobKey]*runningJob + + // upPerTG is a count of running allocs per task group for the + // migration mark phase to use when considering how many allocs can be + // migrated for a given group. + upPerTG map[string]int + + // doneNodes need no coordinating to finish their drain. Either all + // allocs have drained, the node is being force drained, or the drain + // deadline was hit. Any remaining allocs will be migrated via + // drainNow. + doneNodes map[string]*structs.Node } // makeTaskGroupKey returns a unique key for an allocation's task group @@ -107,10 +114,15 @@ type nodeDrainerState struct { // should be called when a server establishes leadership and SetEnabled(false) // called when leadership is lost. 
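// (Illustrative wiring, assumed rather than shown in this patch: on gaining
// leadership call s.nodeDrainer.SetEnabled(true, s.State()); on losing it,
// SetEnabled(false, nil). Run itself is started once in setupNodeDrainer and
// exits only when the server's shutdownCh closes.)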
type NodeDrainer struct { + // enabledCh is used by SetEnabled to signal Run when to start/stop the + // nodeDrainer goroutine enabledCh chan nodeDrainerState + // raft is a shim around the raft messages necessary for draining raft RaftApplier + // shutdownCh is closed when the Server is shutting down the + // NodeDrainer should permanently exit shutdownCh <-chan struct{} logger *log.Logger @@ -249,9 +261,6 @@ func (n *NodeDrainer) nodeDrainer(ctx context.Context, state *state.StateStore) return } - // Tracks nodes that are done draining - doneNodes := map[string]*structs.Node{} - // Capture state (statestore and time) to do consistent comparisons snapshot, err := state.Snapshot() if err != nil { @@ -260,223 +269,233 @@ func (n *NodeDrainer) nodeDrainer(ctx context.Context, state *state.StateStore) } now := time.Now() - // job key -> {job, allocs} - // Collect all allocs for all jobs with at least one - // non-terminal alloc on a draining node. - // Invariants: - // - Only service jobs - // - No entries with 0 allocs - //TODO could this be a helper method on prevAllocWatcher - drainableSvcs := map[jobKey]*runningJob{} + // Collect all drainable jobs + result, err := n.collectDrainable(nodes, snapshot, jobWatcher, now) + if err != nil { + //FIXME + panic(err) + } - // drainNow are allocs for batch or system jobs that should be - // drained due to a node deadline being reached - drainNow := map[jobKey]*runningJob{} + // stoplist are the allocations to migrate and their jobs to emit + // evaluations for. Initialized with allocations that should be + // immediately drained regardless of MaxParallel + stoplist := newStopAllocs(result.drainNow) - // track number of "up" allocs per task group (not terminal and - // have a deployment status) - upPerTG := map[string]int{} + // build drain list considering deadline & max_parallel + n.markMigrations(stoplist, result.upPerTG, result.drainableSvcs, nodes, now) - // Collect all drainable jobs - for nodeID, node := range nodes { - allocs, err := snapshot.AllocsByNode(nil, nodeID) - if err != nil { + if len(stoplist.allocBatch) > 0 { + if err := n.applyMigrations(stoplist); err != nil { //FIXME panic(err) } + } - // drainableSys are allocs for system jobs that should be - // drained if there are no other allocs left - drainableSys := map[jobKey]*runningJob{} - - // track number of allocs left on this node to be drained - allocsLeft := false - inf, deadline := node.DrainStrategy.DeadlineTime() - deadlineReached := !inf && deadline.Before(now) - for _, alloc := range allocs { - // Don't need to consider drained allocs - if alloc.TerminalStatus() { - continue - } + // Unset drain for nodes done draining + for nodeID, node := range result.doneNodes { + if err := n.raft.NodeDrainComplete(nodeID); err != nil { + n.logger.Printf("[ERR] nomad.drain: failed to unset drain for: %v", err) + //FIXME + panic(err) + } + n.logger.Printf("[INFO] nomad.drain: node %s (%s) completed draining", nodeID, node.Name) + delete(nodes, nodeID) + } + } +} - jobkey := jobKey{alloc.Namespace, alloc.JobID} +// collectDrainable scans all nodes and allocs on draining nodes and builds a +// structure of eligible allocs to drain. 
+func (n *NodeDrainer) collectDrainable(nodes map[string]*structs.Node, state *state.StateSnapshot, + jobWatcher *jobWatcher, now time.Time) (*collectResult, error) { - // job does not found yet - job, err := snapshot.JobByID(nil, alloc.Namespace, alloc.JobID) - if err != nil { - //FIXME - panic(err) - } + svcs := map[jobKey]*runningJob{} + drainNow := map[jobKey]*runningJob{} + upPerTG := map[string]int{} + doneNodes := map[string]*structs.Node{} - // IgnoreSystemJobs if specified in the node's DrainStrategy - if node.DrainStrategy.IgnoreSystemJobs && job.Type == structs.JobTypeSystem { - continue - } + for nodeID, node := range nodes { + allocs, err := state.AllocsByNode(nil, nodeID) + if err != nil { + return nil, err + } - // When the node deadline is reached all batch - // and service jobs will be drained - if deadlineReached && job.Type != structs.JobTypeService { - n.logger.Printf("[TRACE] nomad.drain: draining alloc %s due to node %s reaching drain deadline", alloc.ID, node.ID) - if j, ok := drainNow[jobkey]; ok { - j.allocs = append(j.allocs, alloc) - } else { - // First alloc for this job, create entry - drainNow[jobkey] = &runningJob{ - job: job, - allocs: []*structs.Allocation{alloc}, - } - } - continue - } + // drainableSys are allocs for system jobs that should be + // drained if there are no other allocs left + drainableSys := map[jobKey]*runningJob{} - // If deadline hasn't been reached, system jobs - // may still be drained if there are no other - // allocs left - if !deadlineReached && job.Type == structs.JobTypeSystem { - n.logger.Printf("[TRACE] nomad.drain: system alloc %s will be drained if no other allocs on node %s", alloc.ID, node.ID) - if j, ok := drainableSys[jobkey]; ok { - j.allocs = append(j.allocs, alloc) - } else { - // First alloc for this job, create entry - drainableSys[jobkey] = &runningJob{ - job: job, - allocs: []*structs.Allocation{alloc}, - } - } - continue - } + // track number of allocs left on this node to be drained + allocsLeft := false + inf, deadline := node.DrainStrategy.DeadlineTime() + deadlineReached := !inf && deadline.Before(now) + for _, alloc := range allocs { + // Don't need to consider drained allocs + if alloc.TerminalStatus() { + continue + } - // This alloc is still running on a draining - // node, so treat the node as having allocs - // remaining - allocsLeft = true + jobkey := jobKey{alloc.Namespace, alloc.JobID} - jobAllocs, err := snapshot.AllocsByJob(nil, alloc.Namespace, alloc.JobID, true) - if err != nil { - //FIXME - panic(err) - } + // job does not found yet + job, err := state.JobByID(nil, alloc.Namespace, alloc.JobID) + if err != nil { + return nil, err + } - // Count the number of down (terminal or nil deployment status) per task group - if job.Type == structs.JobTypeService { - num := 0 - for _, a := range jobAllocs { - if !a.TerminalStatus() && a.DeploymentStatus != nil { - // Not terminal and health updated, count it as up! 
- upPerTG[makeTaskGroupKey(a)]++ - num++ - } + // IgnoreSystemJobs if specified in the node's DrainStrategy + if node.DrainStrategy.IgnoreSystemJobs && job.Type == structs.JobTypeSystem { + continue + } + + // When the node deadline is reached all batch + // and service jobs will be drained + if deadlineReached && job.Type != structs.JobTypeService { + n.logger.Printf("[TRACE] nomad.drain: draining alloc %s due to node %s reaching drain deadline", alloc.ID, node.ID) + if j, ok := drainNow[jobkey]; ok { + j.allocs = append(j.allocs, alloc) + } else { + // First alloc for this job, create entry + drainNow[jobkey] = &runningJob{ + job: job, + allocs: []*structs.Allocation{alloc}, } - n.logger.Printf("[TRACE] nomad.drain: job %s has %d allocs running", job.Name, num) } + continue + } - drainableSvcs[jobkey] = &runningJob{ - job: job, - allocs: jobAllocs, + // If deadline hasn't been reached, system jobs + // may still be drained if there are no other + // allocs left + if !deadlineReached && job.Type == structs.JobTypeSystem { + n.logger.Printf("[TRACE] nomad.drain: system alloc %s will be drained if no other allocs on node %s", alloc.ID, node.ID) + if j, ok := drainableSys[jobkey]; ok { + j.allocs = append(j.allocs, alloc) + } else { + // First alloc for this job, create entry + drainableSys[jobkey] = &runningJob{ + job: job, + allocs: []*structs.Allocation{alloc}, + } } + continue + } + + // This alloc is still running on a draining + // node, so treat the node as having allocs + // remaining + allocsLeft = true - jobWatcher.watch(jobkey, nodeID) + jobAllocs, err := state.AllocsByJob(nil, alloc.Namespace, alloc.JobID, true) + if err != nil { + return nil, err } - // if node has no allocs or has hit its deadline, it's done draining! - if !allocsLeft || deadlineReached { - n.logger.Printf("[TRACE] nomad.drain: node %s has no more allocs left to drain or has reached deadline", nodeID) - jobWatcher.nodeDone(nodeID) - doneNodes[nodeID] = node - - // Add all system jobs on this node to the drainNow slice - for k, sysj := range drainableSys { - if j, ok := drainNow[k]; ok { - // Job already has at least one alloc draining, append this one - j.allocs = append(j.allocs, sysj.allocs...) - } else { - // First draining alloc for this job, add the entry - drainNow[k] = sysj + // Count the number of down (terminal or nil deployment status) per task group + if job.Type == structs.JobTypeService { + num := 0 + for _, a := range jobAllocs { + if !a.TerminalStatus() && a.DeploymentStatus != nil { + // Not terminal and health updated, count it as up! + upPerTG[makeTaskGroupKey(a)]++ + num++ } } + n.logger.Printf("[TRACE] nomad.drain: job %s has %d allocs running", job.Name, num) } - } - // stoplist are the allocations to migrate and their jobs to emit - // evaluations for. Initialized with allocations that should be - // immediately drained regardless of MaxParallel - stoplist := newStopAllocs(drainNow) + svcs[jobkey] = &runningJob{ + job: job, + allocs: jobAllocs, + } - // build drain list considering deadline & max_parallel - for _, drainingJob := range drainableSvcs { - for _, alloc := range drainingJob.allocs { - // Already draining/dead allocs don't need to be drained - if alloc.TerminalStatus() { - continue - } + jobWatcher.watch(jobkey, nodeID) + } - node, ok := nodes[alloc.NodeID] - if !ok { - // Alloc's node is not draining so not elligible for draining! - continue + // if node has no allocs or has hit its deadline, it's done draining! 
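+		// Done nodes are recorded in doneNodes so the caller can unset their
+		// drain flag, and any pending system allocs are promoted to drainNow.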
+ if !allocsLeft || deadlineReached { + n.logger.Printf("[TRACE] nomad.drain: node %s has no more allocs left to drain or has reached deadline", nodeID) + jobWatcher.nodeDone(nodeID) + doneNodes[nodeID] = node + + // Add all system jobs on this node to the drainNow slice + for k, sysj := range drainableSys { + if j, ok := drainNow[k]; ok { + // Job already has at least one alloc draining, append this one + j.allocs = append(j.allocs, sysj.allocs...) + } else { + // First draining alloc for this job, add the entry + drainNow[k] = sysj } + } + } + } - tgKey := makeTaskGroupKey(alloc) - - if inf, d := node.DrainStrategy.DeadlineTime(); !inf && d.Before(now) { - n.logger.Printf("[TRACE] nomad.drain: draining job %s alloc %s from node %s due to node's drain deadline", drainingJob.job.Name, alloc.ID[:6], alloc.NodeID[:6]) - // Alloc's Node has reached its deadline - stoplist.add(drainingJob.job, alloc) - upPerTG[tgKey]-- + result := &collectResult{ + drainableSvcs: svcs, + drainNow: drainNow, + upPerTG: upPerTG, + doneNodes: doneNodes, + } + return result, nil +} - continue - } +// markMigrations marks services to be drained for migration in the stoplist. +func (n *NodeDrainer) markMigrations(stoplist *stopAllocs, upPerTG map[string]int, drainable map[jobKey]*runningJob, nodes map[string]*structs.Node, now time.Time) { + for _, drainingJob := range drainable { + for _, alloc := range drainingJob.allocs { + // Already draining/dead allocs don't need to be drained + if alloc.TerminalStatus() { + continue + } - // Stop allocs with count=1, max_parallel==0, or draining 0 { - if err := n.applyMigrations(stoplist); err != nil { - //FIXME - panic(err) + // No migrate strategy or a max parallel of 0 mean force draining + if tg.Migrate == nil || tg.Migrate.MaxParallel == 0 { + n.logger.Printf("[TRACE] nomad.drain: draining job %s alloc %s from node %s due to force drain", drainingJob.job.Name, alloc.ID[:6], alloc.NodeID[:6]) + stoplist.add(drainingJob.job, alloc) + continue } - } - // Unset drain for nodes done draining - for nodeID, node := range doneNodes { - if err := n.raft.NodeDrainComplete(nodeID); err != nil { - n.logger.Printf("[ERR] nomad.drain: failed to unset drain for: %v", err) - //FIXME - panic(err) + n.logger.Printf("[TRACE] nomad.drain: considering job %s alloc %s count %d maxp %d up %d", + drainingJob.job.Name, alloc.ID[:6], tg.Count, tg.Migrate.MaxParallel, upPerTG[tgKey]) + + // Count - MaxParalell = minimum number of allocations that must be "up" + minUp := (tg.Count - tg.Migrate.MaxParallel) + + // If minimum is < the current number up it is safe to stop one. 
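+			// For example, count=3 with max_parallel=1 gives minUp=2: while three
+			// allocs are up, one may be drained, leaving two running.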
+ if minUp < upPerTG[tgKey] { + n.logger.Printf("[TRACE] nomad.drain: draining job %s alloc %s from node %s due to max parallel", drainingJob.job.Name, alloc.ID[:6], alloc.NodeID[:6]) + // More migrations are allowed, add to stoplist + stoplist.add(drainingJob.job, alloc) + upPerTG[tgKey]-- } - n.logger.Printf("[INFO] nomad.drain: node %s (%s) completed draining", nodeID, node.Name) - delete(nodes, nodeID) } } } From 7f989499ffb4bfc5eca31531ade7c385cfc77a56 Mon Sep 17 00:00:00 2001 From: Alex Dadgar Date: Thu, 1 Mar 2018 11:21:32 -0800 Subject: [PATCH 33/79] Correct defaulting --- api/jobs.go | 1 + api/tasks.go | 44 +++++++- api/tasks_test.go | 152 ++++++++++++++++++++++++++ command/agent/job_endpoint.go | 9 ++ command/agent/job_endpoint_test.go | 12 ++ jobspec/parse.go | 19 +++- jobspec/parse_test.go | 38 +++++++ jobspec/test-fixtures/migrate-job.hcl | 28 +++++ 8 files changed, 297 insertions(+), 6 deletions(-) create mode 100644 jobspec/test-fixtures/migrate-job.hcl diff --git a/api/jobs.go b/api/jobs.go index 9e3227af49e8..5fcecf403871 100644 --- a/api/jobs.go +++ b/api/jobs.go @@ -559,6 +559,7 @@ type Job struct { ParameterizedJob *ParameterizedJobConfig Payload []byte Reschedule *ReschedulePolicy + Migrate *MigrateStrategy Meta map[string]string VaultToken *string `mapstructure:"vault_token"` Status *string diff --git a/api/tasks.go b/api/tasks.go index f7d3d9fb0737..47b502d57558 100644 --- a/api/tasks.go +++ b/api/tasks.go @@ -321,6 +321,30 @@ func (m *MigrateStrategy) Canonicalize() { } } +func (m *MigrateStrategy) Merge(o *MigrateStrategy) { + if o.MaxParallel != nil { + m.MaxParallel = o.MaxParallel + } + if o.HealthCheck != nil { + m.HealthCheck = o.HealthCheck + } + if o.MinHealthyTime != nil { + m.MinHealthyTime = o.MinHealthyTime + } + if o.HealthyDeadline != nil { + m.HealthyDeadline = o.HealthyDeadline + } +} + +func (m *MigrateStrategy) Copy() *MigrateStrategy { + if m == nil { + return nil + } + nm := new(MigrateStrategy) + *nm = *m + return nm +} + // TaskGroup is the unit of scheduling. 
type TaskGroup struct { Name *string @@ -415,7 +439,25 @@ func (g *TaskGroup) Canonicalize(job *Job) { } g.ReschedulePolicy = defaultReschedulePolicy - g.Migrate.Canonicalize() + // Merge the migrate strategy from the job + if jm, tm := job.Migrate != nil, g.Migrate != nil; jm && tm { + jobMigrate := job.Migrate.Copy() + jobMigrate.Merge(g.Migrate) + g.Migrate = jobMigrate + } else if jm { + jobMigrate := job.Migrate.Copy() + g.Migrate = jobMigrate + } + + // Merge with default reschedule policy + if *job.Type == "service" { + defaultMigrateStrategy := &MigrateStrategy{} + defaultMigrateStrategy.Canonicalize() + if g.Migrate != nil { + defaultMigrateStrategy.Merge(g.Migrate) + } + g.Migrate = defaultMigrateStrategy + } var defaultRestartPolicy *RestartPolicy switch *job.Type { diff --git a/api/tasks_test.go b/api/tasks_test.go index 3280507ad591..d72acc179bf6 100644 --- a/api/tasks_test.go +++ b/api/tasks_test.go @@ -430,6 +430,158 @@ func TestTaskGroup_Canonicalize_ReschedulePolicy(t *testing.T) { } } +// Verifies that migrate strategy is merged correctly +func TestTaskGroup_Canonicalize_MigrateStrategy(t *testing.T) { + type testCase struct { + desc string + jobType string + jobMigrate *MigrateStrategy + taskMigrate *MigrateStrategy + expected *MigrateStrategy + } + + testCases := []testCase{ + { + desc: "Default batch", + jobType: "batch", + jobMigrate: nil, + taskMigrate: nil, + expected: nil, + }, + { + desc: "Default service", + jobType: "service", + jobMigrate: nil, + taskMigrate: nil, + expected: &MigrateStrategy{ + MaxParallel: helper.IntToPtr(1), + HealthCheck: helper.StringToPtr("checks"), + MinHealthyTime: helper.TimeToPtr(10 * time.Second), + HealthyDeadline: helper.TimeToPtr(5 * time.Minute), + }, + }, + { + desc: "Empty job migrate strategy", + jobType: "service", + jobMigrate: &MigrateStrategy{ + MaxParallel: helper.IntToPtr(0), + HealthCheck: helper.StringToPtr(""), + MinHealthyTime: helper.TimeToPtr(0), + HealthyDeadline: helper.TimeToPtr(0), + }, + taskMigrate: nil, + expected: &MigrateStrategy{ + MaxParallel: helper.IntToPtr(0), + HealthCheck: helper.StringToPtr(""), + MinHealthyTime: helper.TimeToPtr(0), + HealthyDeadline: helper.TimeToPtr(0), + }, + }, + { + desc: "Inherit from job", + jobType: "service", + jobMigrate: &MigrateStrategy{ + MaxParallel: helper.IntToPtr(3), + HealthCheck: helper.StringToPtr("checks"), + MinHealthyTime: helper.TimeToPtr(2), + HealthyDeadline: helper.TimeToPtr(2), + }, + taskMigrate: nil, + expected: &MigrateStrategy{ + MaxParallel: helper.IntToPtr(3), + HealthCheck: helper.StringToPtr("checks"), + MinHealthyTime: helper.TimeToPtr(2), + HealthyDeadline: helper.TimeToPtr(2), + }, + }, + { + desc: "Set in task", + jobType: "service", + jobMigrate: nil, + taskMigrate: &MigrateStrategy{ + MaxParallel: helper.IntToPtr(3), + HealthCheck: helper.StringToPtr("checks"), + MinHealthyTime: helper.TimeToPtr(2), + HealthyDeadline: helper.TimeToPtr(2), + }, + expected: &MigrateStrategy{ + MaxParallel: helper.IntToPtr(3), + HealthCheck: helper.StringToPtr("checks"), + MinHealthyTime: helper.TimeToPtr(2), + HealthyDeadline: helper.TimeToPtr(2), + }, + }, + { + desc: "Merge from job", + jobType: "service", + jobMigrate: &MigrateStrategy{ + MaxParallel: helper.IntToPtr(11), + }, + taskMigrate: &MigrateStrategy{ + HealthCheck: helper.StringToPtr("checks"), + MinHealthyTime: helper.TimeToPtr(2), + HealthyDeadline: helper.TimeToPtr(2), + }, + expected: &MigrateStrategy{ + MaxParallel: helper.IntToPtr(11), + HealthCheck: helper.StringToPtr("checks"), + 
MinHealthyTime: helper.TimeToPtr(2), + HealthyDeadline: helper.TimeToPtr(2), + }, + }, + { + desc: "Override from group", + jobType: "service", + jobMigrate: &MigrateStrategy{ + MaxParallel: helper.IntToPtr(11), + }, + taskMigrate: &MigrateStrategy{ + MaxParallel: helper.IntToPtr(5), + HealthCheck: helper.StringToPtr("checks"), + MinHealthyTime: helper.TimeToPtr(2), + HealthyDeadline: helper.TimeToPtr(2), + }, + expected: &MigrateStrategy{ + MaxParallel: helper.IntToPtr(5), + HealthCheck: helper.StringToPtr("checks"), + MinHealthyTime: helper.TimeToPtr(2), + HealthyDeadline: helper.TimeToPtr(2), + }, + }, + { + desc: "Parallel from job, defaulting", + jobType: "service", + jobMigrate: &MigrateStrategy{ + MaxParallel: helper.IntToPtr(5), + }, + taskMigrate: nil, + expected: &MigrateStrategy{ + MaxParallel: helper.IntToPtr(5), + HealthCheck: helper.StringToPtr("checks"), + MinHealthyTime: helper.TimeToPtr(10 * time.Second), + HealthyDeadline: helper.TimeToPtr(5 * time.Minute), + }, + }, + } + + for _, tc := range testCases { + t.Run(tc.desc, func(t *testing.T) { + job := &Job{ + ID: helper.StringToPtr("test"), + Migrate: tc.jobMigrate, + Type: helper.StringToPtr(tc.jobType), + } + job.Canonicalize() + tg := &TaskGroup{ + Name: helper.StringToPtr("foo"), + Migrate: tc.taskMigrate, + } + tg.Canonicalize(job) + assert.Equal(t, tc.expected, tg.Migrate) + }) + } +} + // TestService_CheckRestart asserts Service.CheckRestart settings are properly // inherited by Checks. func TestService_CheckRestart(t *testing.T) { diff --git a/command/agent/job_endpoint.go b/command/agent/job_endpoint.go index 840fb1feeda9..ce1605728740 100644 --- a/command/agent/job_endpoint.go +++ b/command/agent/job_endpoint.go @@ -649,6 +649,15 @@ func ApiTgToStructsTG(taskGroup *api.TaskGroup, tg *structs.TaskGroup) { } } + if taskGroup.Migrate != nil { + tg.Migrate = &structs.MigrateStrategy{ + MaxParallel: *taskGroup.Migrate.MaxParallel, + HealthCheck: *taskGroup.Migrate.HealthCheck, + MinHealthyTime: *taskGroup.Migrate.MinHealthyTime, + HealthyDeadline: *taskGroup.Migrate.HealthyDeadline, + } + } + tg.EphemeralDisk = &structs.EphemeralDisk{ Sticky: *taskGroup.EphemeralDisk.Sticky, SizeMB: *taskGroup.EphemeralDisk.SizeMB, diff --git a/command/agent/job_endpoint_test.go b/command/agent/job_endpoint_test.go index f59acaaf2eef..57b5d1869d24 100644 --- a/command/agent/job_endpoint_test.go +++ b/command/agent/job_endpoint_test.go @@ -1179,6 +1179,12 @@ func TestJobs_ApiJobToStructsJob(t *testing.T) { Unlimited: helper.BoolToPtr(true), MaxDelay: helper.TimeToPtr(20 * time.Minute), }, + Migrate: &api.MigrateStrategy{ + MaxParallel: helper.IntToPtr(12), + HealthCheck: helper.StringToPtr("task_events"), + MinHealthyTime: helper.TimeToPtr(12 * time.Hour), + HealthyDeadline: helper.TimeToPtr(12 * time.Hour), + }, EphemeralDisk: &api.EphemeralDisk{ SizeMB: helper.IntToPtr(100), Sticky: helper.BoolToPtr(true), @@ -1395,6 +1401,12 @@ func TestJobs_ApiJobToStructsJob(t *testing.T) { Unlimited: true, MaxDelay: 20 * time.Minute, }, + Migrate: &structs.MigrateStrategy{ + MaxParallel: 12, + HealthCheck: "task_events", + MinHealthyTime: 12 * time.Hour, + HealthyDeadline: 12 * time.Hour, + }, EphemeralDisk: &structs.EphemeralDisk{ SizeMB: 100, Sticky: true, diff --git a/jobspec/parse.go b/jobspec/parse.go index 4bfebc9099aa..e56161cd4c40 100644 --- a/jobspec/parse.go +++ b/jobspec/parse.go @@ -104,11 +104,12 @@ func parseJob(result *api.Job, list *ast.ObjectList) error { } delete(m, "constraint") delete(m, "meta") - delete(m, "update") - 
delete(m, "periodic") - delete(m, "vault") + delete(m, "migrate") delete(m, "parameterized") + delete(m, "periodic") delete(m, "reschedule") + delete(m, "update") + delete(m, "vault") // Set the ID and name to the object key result.ID = helper.StringToPtr(obj.Keys[0].Token.Value().(string)) @@ -132,19 +133,20 @@ func parseJob(result *api.Job, list *ast.ObjectList) error { "all_at_once", "constraint", "datacenters", - "parameterized", "group", "id", "meta", + "migrate", "name", "namespace", + "parameterized", "periodic", "priority", "region", + "reschedule", "task", "type", "update", - "reschedule", "vault", "vault_token", } @@ -187,6 +189,13 @@ func parseJob(result *api.Job, list *ast.ObjectList) error { } } + // If we have a migration strategy, then parse that + if o := listVal.Filter("migrate"); len(o.Items) > 0 { + if err := parseMigrate(&result.Migrate, o); err != nil { + return multierror.Prefix(err, "migrate ->") + } + } + // Parse out meta fields. These are in HCL as a list so we need // to iterate over them and merge them. if metaO := listVal.Filter("meta"); len(metaO.Items) > 0 { diff --git a/jobspec/parse_test.go b/jobspec/parse_test.go index c3989a68ca94..1275cd51c90f 100644 --- a/jobspec/parse_test.go +++ b/jobspec/parse_test.go @@ -741,6 +741,44 @@ func TestParse(t *testing.T) { }, false, }, + { + "migrate-job.hcl", + &api.Job{ + ID: helper.StringToPtr("foo"), + Name: helper.StringToPtr("foo"), + Type: helper.StringToPtr("batch"), + Datacenters: []string{"dc1"}, + Migrate: &api.MigrateStrategy{ + MaxParallel: helper.IntToPtr(2), + HealthCheck: helper.StringToPtr("task_states"), + MinHealthyTime: helper.TimeToPtr(11 * time.Second), + HealthyDeadline: helper.TimeToPtr(11 * time.Minute), + }, + TaskGroups: []*api.TaskGroup{ + { + Name: helper.StringToPtr("bar"), + Count: helper.IntToPtr(3), + Migrate: &api.MigrateStrategy{ + MaxParallel: helper.IntToPtr(3), + HealthCheck: helper.StringToPtr("checks"), + MinHealthyTime: helper.TimeToPtr(1 * time.Second), + HealthyDeadline: helper.TimeToPtr(1 * time.Minute), + }, + Tasks: []*api.Task{ + { + Name: "bar", + Driver: "raw_exec", + Config: map[string]interface{}{ + "command": "bash", + "args": []interface{}{"-c", "echo hi"}, + }, + }, + }, + }, + }, + }, + false, + }, } for _, tc := range cases { diff --git a/jobspec/test-fixtures/migrate-job.hcl b/jobspec/test-fixtures/migrate-job.hcl new file mode 100644 index 000000000000..5ec05e6b5141 --- /dev/null +++ b/jobspec/test-fixtures/migrate-job.hcl @@ -0,0 +1,28 @@ +job "foo" { + datacenters = ["dc1"] + type = "batch" + migrate { + max_parallel = 2 + health_check = "task_states" + min_healthy_time = "11s" + healthy_deadline = "11m" + } + + group "bar" { + count = 3 + task "bar" { + driver = "raw_exec" + config { + command = "bash" + args = ["-c", "echo hi"] + } + } + + migrate { + max_parallel = 3 + health_check = "checks" + min_healthy_time = "1s" + healthy_deadline = "1m" + } + } +} From a027016b87dab56fd004424f3a757674feb763ab Mon Sep 17 00:00:00 2001 From: Alex Dadgar Date: Thu, 1 Mar 2018 11:27:36 -0800 Subject: [PATCH 34/79] Fix file names --- nomad/drainer/jobwatcher.go | 140 ----------------------------------- nomad/drainer/nodewatcher.go | 121 ------------------------------ 2 files changed, 261 deletions(-) delete mode 100644 nomad/drainer/jobwatcher.go delete mode 100644 nomad/drainer/nodewatcher.go diff --git a/nomad/drainer/jobwatcher.go b/nomad/drainer/jobwatcher.go deleted file mode 100644 index 95a1be5d157e..000000000000 --- a/nomad/drainer/jobwatcher.go +++ /dev/null @@ 
-1,140 +0,0 @@ -package drainer - -import ( - "context" - "log" - "sync" - - memdb "github.com/hashicorp/go-memdb" - "github.com/hashicorp/nomad/nomad/state" - "github.com/hashicorp/nomad/nomad/structs" -) - -// jobWatcher watches allocation changes for jobs with at least one allocation -// on a draining node. -type jobWatcher struct { - // allocsIndex to start watching from - allocsIndex uint64 - - // job -> node.ID - jobs map[jobKey]string - jobsMu sync.Mutex - - jobsCh chan map[jobKey]struct{} - - state *state.StateStore - - logger *log.Logger -} - -func newJobWatcher(logger *log.Logger, jobs map[jobKey]string, allocsIndex uint64, state *state.StateStore) *jobWatcher { - return &jobWatcher{ - allocsIndex: allocsIndex, - logger: logger, - jobs: jobs, - jobsCh: make(chan map[jobKey]struct{}), - state: state, - } -} - -func (j *jobWatcher) watch(k jobKey, nodeID string) { - j.logger.Printf("[TRACE] nomad.drain: watching job %s on draining node %s", k.jobid, nodeID[:6]) - j.jobsMu.Lock() - j.jobs[k] = nodeID - j.jobsMu.Unlock() -} - -func (j *jobWatcher) nodeDone(nodeID string) { - j.jobsMu.Lock() - defer j.jobsMu.Unlock() - for k, v := range j.jobs { - if v == nodeID { - j.logger.Printf("[TRACE] nomad.drain: UNwatching job %s on done draining node %s", k.jobid, nodeID[:6]) - delete(j.jobs, k) - } - } -} - -func (j *jobWatcher) WaitCh() <-chan map[jobKey]struct{} { - return j.jobsCh -} - -func (j *jobWatcher) run(ctx context.Context) { - var resp interface{} - var err error - - for { - //FIXME have watchAllocs create a closure and give it a copy of j.jobs to remove locking? - //FIXME it seems possible for this to return a nil error and a 0 index, what to do in that case? - var newIndex uint64 - resp, newIndex, err = j.state.BlockingQuery(j.watchAllocs, j.allocsIndex, ctx) - if err != nil { - if err == context.Canceled { - j.logger.Printf("[TRACE] nomad.drain: job watcher shutting down") - return - } - j.logger.Printf("[ERR] nomad.drain: error blocking on alloc updates: %v", err) - return - } - - j.logger.Printf("[TRACE] nomad.drain: job watcher old index: %d new index: %d", j.allocsIndex, newIndex) - j.allocsIndex = newIndex - - changedJobs := resp.(map[jobKey]struct{}) - if len(changedJobs) > 0 { - select { - case j.jobsCh <- changedJobs: - case <-ctx.Done(): - return - } - } - } -} - -func (j *jobWatcher) watchAllocs(ws memdb.WatchSet, state *state.StateStore) (interface{}, uint64, error) { - iter, err := state.Allocs(ws) - if err != nil { - return nil, 0, err - } - - index, err := state.Index("allocs") - if err != nil { - return nil, 0, err - } - - skipped := 0 - - // job ids - resp := map[jobKey]struct{}{} - - for { - raw := iter.Next() - if raw == nil { - break - } - - alloc := raw.(*structs.Allocation) - - j.jobsMu.Lock() - _, ok := j.jobs[jobKey{alloc.Namespace, alloc.JobID}] - j.jobsMu.Unlock() - - if !ok { - // alloc is not part of a draining job - skipped++ - continue - } - - // don't wake drain loop if alloc hasn't updated its health - if alloc.DeploymentStatus.IsHealthy() || alloc.DeploymentStatus.IsUnhealthy() { - j.logger.Printf("[TRACE] nomad.drain: job watcher found alloc %s - deployment status: %t", alloc.ID[:6], *alloc.DeploymentStatus.Healthy) - resp[jobKey{alloc.Namespace, alloc.JobID}] = struct{}{} - } else { - j.logger.Printf("[TRACE] nomad.drain: job watcher ignoring alloc %s - no deployment status", alloc.ID[:6]) - } - } - - j.logger.Printf("[TRACE] nomad.drain: job watcher ignoring %d allocs - not part of draining job at index %d", skipped, index) - - return resp, 
index, nil -} diff --git a/nomad/drainer/nodewatcher.go b/nomad/drainer/nodewatcher.go deleted file mode 100644 index 5f419ea2ca91..000000000000 --- a/nomad/drainer/nodewatcher.go +++ /dev/null @@ -1,121 +0,0 @@ -package drainer - -import ( - "context" - "log" - - memdb "github.com/hashicorp/go-memdb" - "github.com/hashicorp/nomad/nomad/state" - "github.com/hashicorp/nomad/nomad/structs" -) - -// nodeWatcher watches for nodes to start or stop draining -type nodeWatcher struct { - index uint64 - nodes map[string]*structs.Node - nodesCh chan map[string]*structs.Node - state *state.StateStore - logger *log.Logger -} - -func newNodeWatcher(logger *log.Logger, nodes map[string]*structs.Node, index uint64, state *state.StateStore) *nodeWatcher { - return &nodeWatcher{ - nodes: nodes, - nodesCh: make(chan map[string]*structs.Node), - index: index, - state: state, - logger: logger, - } -} - -func (n *nodeWatcher) run(ctx context.Context) { - // Trigger an initial drain pass if there are already nodes draining - //FIXME this is unneccessary if a node has reached a deadline - n.logger.Printf("[TRACE] nomad.drain: initial draining nodes: %d", len(n.nodes)) - if len(n.nodes) > 0 { - n.nodesCh <- n.nodes - } - - for { - //FIXME it seems possible for this to return a nil error and a 0 index, what to do in that case? - resp, index, err := n.state.BlockingQuery(n.queryNodeDrain, n.index, ctx) - if err != nil { - if err == context.Canceled { - n.logger.Printf("[TRACE] nomad.drain: draining node watcher shutting down") - return - } - n.logger.Printf("[ERR] nomad.drain: error blocking on node updates at index %d: %v", n.index, err) - return - } - - // update index for next run - n.index = index - - changed := false - newNodes := resp.([]*structs.Node) - n.logger.Printf("[TRACE] nomad.drain: %d nodes to consider", len(newNodes)) //FIXME remove - for _, newNode := range newNodes { - if existingNode, ok := n.nodes[newNode.ID]; ok { - // Node was draining, see if it has changed - if newNode.DrainStrategy == nil { - // Node stopped draining - delete(n.nodes, newNode.ID) - changed = true - } else if !newNode.DrainStrategy.Equal(existingNode.DrainStrategy) { - // Update deadline - n.nodes[newNode.ID] = newNode - changed = true - } - } else { - // Node was not draining - if newNode.DrainStrategy != nil { - // Node started draining - n.nodes[newNode.ID] = newNode - changed = true - } - } - } - - // Send a copy of the draining nodes if there were changes - if !changed { - continue - } - - nodesCopy := make(map[string]*structs.Node, len(n.nodes)) - for k, v := range n.nodes { - nodesCopy[k] = v - } - - select { - case n.nodesCh <- nodesCopy: - case <-ctx.Done(): - return - } - } -} - -func (n *nodeWatcher) queryNodeDrain(ws memdb.WatchSet, state *state.StateStore) (interface{}, uint64, error) { - iter, err := state.Nodes(ws) - if err != nil { - return nil, 0, err - } - - index, err := state.Index("nodes") - if err != nil { - return nil, 0, err - } - - resp := make([]*structs.Node, 0, 8) - - for { - raw := iter.Next() - if raw == nil { - break - } - - node := raw.(*structs.Node) - resp = append(resp, node) - } - - return resp, index, nil -} From c00c02df6258296427e0ba78df06e1763b24fba2 Mon Sep 17 00:00:00 2001 From: Alex Dadgar Date: Thu, 1 Mar 2018 13:36:26 -0800 Subject: [PATCH 35/79] System test runs on mac --- nomad/drainer/drain_test.go | 4 +- nomad/drainer/job_watcher.go | 140 ++++++++++++++++++++++++++++++++++ nomad/drainer/node_watcher.go | 121 +++++++++++++++++++++++++++++ 3 files changed, 263 insertions(+), 2 
deletions(-) create mode 100644 nomad/drainer/job_watcher.go create mode 100644 nomad/drainer/node_watcher.go diff --git a/nomad/drainer/drain_test.go b/nomad/drainer/drain_test.go index f92f2503e14f..993a65fcd0ed 100644 --- a/nomad/drainer/drain_test.go +++ b/nomad/drainer/drain_test.go @@ -297,6 +297,7 @@ func TestNodeDrainer_SystemDrain(t *testing.T) { serviceJob := mock.Job() serviceJob.Name = "service-job" serviceJob.Type = structs.JobTypeService + serviceJob.Constraints = nil serviceJob.TaskGroups[0].Count = 2 serviceJob.TaskGroups[0].Migrate = &structs.MigrateStrategy{ MaxParallel: 1, @@ -315,6 +316,7 @@ func TestNodeDrainer_SystemDrain(t *testing.T) { systemJob := mock.SystemJob() systemJob.Name = "system-job" systemJob.Type = structs.JobTypeSystem + systemJob.Constraints = nil systemJob.TaskGroups[0].Tasks[0].Driver = "mock_driver" systemJob.TaskGroups[0].Tasks[0].Config = map[string]interface{}{ "run_for": "10m", @@ -486,6 +488,4 @@ func TestNodeDrainer_SystemDrain(t *testing.T) { t.Logf("job: %s node: %s alloc: %s desired_status: %s desired_transition: %s actual: %s replaces: %s", alloc.Job.Name, alloc.NodeID[:6], alloc.ID[:6], alloc.DesiredStatus, pretty.Sprint(alloc.DesiredTransition.Migrate), alloc.ClientStatus, alloc.PreviousAllocation) } - - t.Logf("==> PASS") } diff --git a/nomad/drainer/job_watcher.go b/nomad/drainer/job_watcher.go new file mode 100644 index 000000000000..95a1be5d157e --- /dev/null +++ b/nomad/drainer/job_watcher.go @@ -0,0 +1,140 @@ +package drainer + +import ( + "context" + "log" + "sync" + + memdb "github.com/hashicorp/go-memdb" + "github.com/hashicorp/nomad/nomad/state" + "github.com/hashicorp/nomad/nomad/structs" +) + +// jobWatcher watches allocation changes for jobs with at least one allocation +// on a draining node. +type jobWatcher struct { + // allocsIndex to start watching from + allocsIndex uint64 + + // job -> node.ID + jobs map[jobKey]string + jobsMu sync.Mutex + + jobsCh chan map[jobKey]struct{} + + state *state.StateStore + + logger *log.Logger +} + +func newJobWatcher(logger *log.Logger, jobs map[jobKey]string, allocsIndex uint64, state *state.StateStore) *jobWatcher { + return &jobWatcher{ + allocsIndex: allocsIndex, + logger: logger, + jobs: jobs, + jobsCh: make(chan map[jobKey]struct{}), + state: state, + } +} + +func (j *jobWatcher) watch(k jobKey, nodeID string) { + j.logger.Printf("[TRACE] nomad.drain: watching job %s on draining node %s", k.jobid, nodeID[:6]) + j.jobsMu.Lock() + j.jobs[k] = nodeID + j.jobsMu.Unlock() +} + +func (j *jobWatcher) nodeDone(nodeID string) { + j.jobsMu.Lock() + defer j.jobsMu.Unlock() + for k, v := range j.jobs { + if v == nodeID { + j.logger.Printf("[TRACE] nomad.drain: UNwatching job %s on done draining node %s", k.jobid, nodeID[:6]) + delete(j.jobs, k) + } + } +} + +func (j *jobWatcher) WaitCh() <-chan map[jobKey]struct{} { + return j.jobsCh +} + +func (j *jobWatcher) run(ctx context.Context) { + var resp interface{} + var err error + + for { + //FIXME have watchAllocs create a closure and give it a copy of j.jobs to remove locking? + //FIXME it seems possible for this to return a nil error and a 0 index, what to do in that case? 
+ var newIndex uint64 + resp, newIndex, err = j.state.BlockingQuery(j.watchAllocs, j.allocsIndex, ctx) + if err != nil { + if err == context.Canceled { + j.logger.Printf("[TRACE] nomad.drain: job watcher shutting down") + return + } + j.logger.Printf("[ERR] nomad.drain: error blocking on alloc updates: %v", err) + return + } + + j.logger.Printf("[TRACE] nomad.drain: job watcher old index: %d new index: %d", j.allocsIndex, newIndex) + j.allocsIndex = newIndex + + changedJobs := resp.(map[jobKey]struct{}) + if len(changedJobs) > 0 { + select { + case j.jobsCh <- changedJobs: + case <-ctx.Done(): + return + } + } + } +} + +func (j *jobWatcher) watchAllocs(ws memdb.WatchSet, state *state.StateStore) (interface{}, uint64, error) { + iter, err := state.Allocs(ws) + if err != nil { + return nil, 0, err + } + + index, err := state.Index("allocs") + if err != nil { + return nil, 0, err + } + + skipped := 0 + + // job ids + resp := map[jobKey]struct{}{} + + for { + raw := iter.Next() + if raw == nil { + break + } + + alloc := raw.(*structs.Allocation) + + j.jobsMu.Lock() + _, ok := j.jobs[jobKey{alloc.Namespace, alloc.JobID}] + j.jobsMu.Unlock() + + if !ok { + // alloc is not part of a draining job + skipped++ + continue + } + + // don't wake drain loop if alloc hasn't updated its health + if alloc.DeploymentStatus.IsHealthy() || alloc.DeploymentStatus.IsUnhealthy() { + j.logger.Printf("[TRACE] nomad.drain: job watcher found alloc %s - deployment status: %t", alloc.ID[:6], *alloc.DeploymentStatus.Healthy) + resp[jobKey{alloc.Namespace, alloc.JobID}] = struct{}{} + } else { + j.logger.Printf("[TRACE] nomad.drain: job watcher ignoring alloc %s - no deployment status", alloc.ID[:6]) + } + } + + j.logger.Printf("[TRACE] nomad.drain: job watcher ignoring %d allocs - not part of draining job at index %d", skipped, index) + + return resp, index, nil +} diff --git a/nomad/drainer/node_watcher.go b/nomad/drainer/node_watcher.go new file mode 100644 index 000000000000..5f419ea2ca91 --- /dev/null +++ b/nomad/drainer/node_watcher.go @@ -0,0 +1,121 @@ +package drainer + +import ( + "context" + "log" + + memdb "github.com/hashicorp/go-memdb" + "github.com/hashicorp/nomad/nomad/state" + "github.com/hashicorp/nomad/nomad/structs" +) + +// nodeWatcher watches for nodes to start or stop draining +type nodeWatcher struct { + index uint64 + nodes map[string]*structs.Node + nodesCh chan map[string]*structs.Node + state *state.StateStore + logger *log.Logger +} + +func newNodeWatcher(logger *log.Logger, nodes map[string]*structs.Node, index uint64, state *state.StateStore) *nodeWatcher { + return &nodeWatcher{ + nodes: nodes, + nodesCh: make(chan map[string]*structs.Node), + index: index, + state: state, + logger: logger, + } +} + +func (n *nodeWatcher) run(ctx context.Context) { + // Trigger an initial drain pass if there are already nodes draining + //FIXME this is unneccessary if a node has reached a deadline + n.logger.Printf("[TRACE] nomad.drain: initial draining nodes: %d", len(n.nodes)) + if len(n.nodes) > 0 { + n.nodesCh <- n.nodes + } + + for { + //FIXME it seems possible for this to return a nil error and a 0 index, what to do in that case? 
+ resp, index, err := n.state.BlockingQuery(n.queryNodeDrain, n.index, ctx) + if err != nil { + if err == context.Canceled { + n.logger.Printf("[TRACE] nomad.drain: draining node watcher shutting down") + return + } + n.logger.Printf("[ERR] nomad.drain: error blocking on node updates at index %d: %v", n.index, err) + return + } + + // update index for next run + n.index = index + + changed := false + newNodes := resp.([]*structs.Node) + n.logger.Printf("[TRACE] nomad.drain: %d nodes to consider", len(newNodes)) //FIXME remove + for _, newNode := range newNodes { + if existingNode, ok := n.nodes[newNode.ID]; ok { + // Node was draining, see if it has changed + if newNode.DrainStrategy == nil { + // Node stopped draining + delete(n.nodes, newNode.ID) + changed = true + } else if !newNode.DrainStrategy.Equal(existingNode.DrainStrategy) { + // Update deadline + n.nodes[newNode.ID] = newNode + changed = true + } + } else { + // Node was not draining + if newNode.DrainStrategy != nil { + // Node started draining + n.nodes[newNode.ID] = newNode + changed = true + } + } + } + + // Send a copy of the draining nodes if there were changes + if !changed { + continue + } + + nodesCopy := make(map[string]*structs.Node, len(n.nodes)) + for k, v := range n.nodes { + nodesCopy[k] = v + } + + select { + case n.nodesCh <- nodesCopy: + case <-ctx.Done(): + return + } + } +} + +func (n *nodeWatcher) queryNodeDrain(ws memdb.WatchSet, state *state.StateStore) (interface{}, uint64, error) { + iter, err := state.Nodes(ws) + if err != nil { + return nil, 0, err + } + + index, err := state.Index("nodes") + if err != nil { + return nil, 0, err + } + + resp := make([]*structs.Node, 0, 8) + + for { + raw := iter.Next() + if raw == nil { + break + } + + node := raw.(*structs.Node) + resp = append(resp, node) + } + + return resp, index, nil +} From 6026af2a8a3e04e1ddc4930f849109f58aaa73fd Mon Sep 17 00:00:00 2001 From: Alex Dadgar Date: Thu, 1 Mar 2018 16:37:19 -0800 Subject: [PATCH 36/79] Initial design --- nomad/drainerv2/drain_heap.go | 20 ++++ nomad/drainerv2/drain_interfaces.go | 1 + nomad/drainerv2/drainer.go | 167 ++++++++++++++++++++++++++++ nomad/drainerv2/draining_node.go | 65 +++++++++++ nomad/drainerv2/watch_jobs.go | 8 ++ nomad/drainerv2/watch_nodes.go | 7 ++ 6 files changed, 268 insertions(+) create mode 100644 nomad/drainerv2/drain_heap.go create mode 100644 nomad/drainerv2/drain_interfaces.go create mode 100644 nomad/drainerv2/drainer.go create mode 100644 nomad/drainerv2/draining_node.go create mode 100644 nomad/drainerv2/watch_jobs.go create mode 100644 nomad/drainerv2/watch_nodes.go diff --git a/nomad/drainerv2/drain_heap.go b/nomad/drainerv2/drain_heap.go new file mode 100644 index 000000000000..899b8dd16b7f --- /dev/null +++ b/nomad/drainerv2/drain_heap.go @@ -0,0 +1,20 @@ +package drainerv2 + +import ( + "time" + + "github.com/hashicorp/nomad/nomad/structs" +) + +type DrainDeadlineNotifier interface { + NextBatch() <-chan []*structs.Node + Remove(nodeID string) + Watch(nodeID string, deadline time.Time) +} + +type deadlineHeap struct { +} + +func (d *deadlineHeap) NextBatch() <-chan []structs.Node { return nil } +func (d *deadlineHeap) Remove(nodeID string) {} +func (d *deadlineHeap) Watch(nodeID string, deadline time.Time) {} diff --git a/nomad/drainerv2/drain_interfaces.go b/nomad/drainerv2/drain_interfaces.go new file mode 100644 index 000000000000..008537619830 --- /dev/null +++ b/nomad/drainerv2/drain_interfaces.go @@ -0,0 +1 @@ +package drainerv2 diff --git a/nomad/drainerv2/drainer.go 
b/nomad/drainerv2/drainer.go new file mode 100644 index 000000000000..a7156dc91d9c --- /dev/null +++ b/nomad/drainerv2/drainer.go @@ -0,0 +1,167 @@ +package drainerv2 + +import ( + "context" + "log" + "sync" + + "github.com/hashicorp/nomad/nomad/state" + "github.com/hashicorp/nomad/nomad/structs" + "golang.org/x/time/rate" +) + +const ( + // LimitStateQueriesPerSecond is the number of state queries allowed per + // second + LimitStateQueriesPerSecond = 100.0 +) + +// RaftApplier contains methods for applying the raft requests required by the +// NodeDrainer. +type RaftApplier interface { + AllocUpdateDesiredTransition(allocs map[string]*structs.DesiredTransition, evals []*structs.Evaluation) error + NodeDrainComplete(nodeID string) error +} + +type AllocDrainer interface { + drain(allocs []*structs.Allocation) +} + +type DrainingJobWatcherFactory func(context.Context, *rate.Limiter, AllocDrainer) DrainingJobWatcher +type DrainingNodeWatcherFactory func(context.Context, *rate.Limiter, AllocDrainer) DrainingNodeWatcher +type DrainDeadlineNotifierFactory func(context.Context) DrainDeadlineNotifier + +type NodeDrainerConfig struct { + Logger *log.Logger + Raft RaftApplier + JobFactory DrainingJobWatcherFactory + NodeFactory DrainingNodeWatcherFactory + DrainDeadlineFactory DrainDeadlineNotifierFactory + StateQueriesPerSecond float64 +} + +type NodeDrainer struct { + enabled bool + logger *log.Logger + + // nodes is the set of draining nodes + nodes map[string]*drainingNode + + // doneNodeCh is used to signal that a node is done draining + doneNodeCh chan string + + nodeWatcher DrainingNodeWatcher + nodeFactory DrainingNodeWatcherFactory + + jobWatcher DrainingJobWatcher + jobFactory DrainingJobWatcherFactory + + deadlineNotifier DrainDeadlineNotifier + deadlineNotifierFactory DrainDeadlineNotifierFactory + + // state is the state that is watched for state changes. + state *state.StateStore + + // queryLimiter is used to limit the rate of blocking queries + queryLimiter *rate.Limiter + + // raft is a shim around the raft messages necessary for draining + raft RaftApplier + + // ctx and exitFn are used to cancel the watcher + ctx context.Context + exitFn context.CancelFunc + + l sync.RWMutex +} + +func NewNodeDrainer(c *NodeDrainerConfig) *NodeDrainer { + return &NodeDrainer{ + raft: c.Raft, + logger: c.Logger, + jobFactory: c.JobFactory, + nodeFactory: c.NodeFactory, + deadlineNotifierFactory: c.DrainDeadlineFactory, + queryLimiter: rate.NewLimiter(rate.Limit(c.StateQueriesPerSecond), 100), + } +} + +// SetEnabled will start or stop the node draining goroutine depending on the +// enabled boolean. 
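+// It is expected to be called with true when a server establishes leadership
+// and with false when leadership is lost.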
+func (n *NodeDrainer) SetEnabled(enabled bool, state *state.StateStore) { + n.l.Lock() + defer n.l.Unlock() + + wasEnabled := n.enabled + n.enabled = enabled + + if state != nil { + n.state = state + } + + // Flush the state to create the necessary objects + n.flush() + + // If we are starting now, launch the watch daemon + if enabled && !wasEnabled { + n.run(n.ctx) + } +} + +// flush is used to clear the state of the watcher +func (n *NodeDrainer) flush() { + // Kill everything associated with the watcher + if n.exitFn != nil { + n.exitFn() + } + + n.ctx, n.exitFn = context.WithCancel(context.Background()) + n.jobWatcher = n.jobFactory(n.ctx, n.queryLimiter, n) + n.nodeWatcher = n.nodeFactory(n.ctx, n.queryLimiter, n) + n.deadlineNotifier = n.deadlineNotifierFactory(n.ctx) + n.nodes = make(map[string]*drainingNode, 32) + n.doneNodeCh = make(chan string, 4) +} + +func (n *NodeDrainer) run(ctx context.Context) { + for { + select { + case <-n.ctx.Done(): + return + case nodes := <-n.deadlineNotifier.NextBatch(): + n.handleDeadlinedNodes(nodes) + case nodes := <-n.nodeWatcher.Transistioning(): + n.handleNodeDrainTransistion(nodes) + case allocs := <-n.jobWatcher.Drain(): + n.handleJobAllocDrain(allocs) + case node := <-n.doneNodeCh: + n.handleDoneNode(node) + } + } +} + +func (n *NodeDrainer) handleDeadlinedNodes(nodes []*structs.Node) { + // TODO +} + +func (n *NodeDrainer) handleNodeDrainTransistion(nodes []*structs.Node) { + // TODO +} + +func (n *NodeDrainer) handleJobAllocDrain(allocs []*structs.Allocation) { + // TODO + + // TODO Call check on the appropriate nodes when the final allocs + // transistion to stop so we have a place to determine with the node + // is done and the final drain of system allocs + // TODO This probably requires changing the interface such that it + // returns replaced allocs as well. 
+} + +func (n *NodeDrainer) handleDoneNode(nodeID string) { + // TODO +} + +func (n *NodeDrainer) drain(allocs []*structs.Allocation) { + // TODO +} diff --git a/nomad/drainerv2/draining_node.go b/nomad/drainerv2/draining_node.go new file mode 100644 index 000000000000..3150be1fd52d --- /dev/null +++ b/nomad/drainerv2/draining_node.go @@ -0,0 +1,65 @@ +package drainerv2 + +import ( + "sync" + "time" + + "github.com/hashicorp/nomad/nomad/state" + "github.com/hashicorp/nomad/nomad/structs" +) + +// TODO make this an interface and then I can optimize the infinite case by +// using a singleton object + +type drainCoordinator interface { + done(nodeID string) +} + +func (n *NodeDrainer) nodeDone(nodeID string) { + select { + case <-n.ctx.Done(): + case n.doneNodeCh <- nodeID: + } +} + +type drainingNode struct { + coordinator drainCoordinator + state *state.StateStore + node *structs.Node + l sync.RWMutex +} + +func NewDrainingNode(node *structs.Node, state *state.StateStore, coordinator drainCoordinator) *drainingNode { + return &drainingNode{ + coordinator: coordinator, + state: state, + node: node, + } +} + +func (n *drainingNode) Update(node *structs.Node) { + n.l.Lock() + defer n.l.Unlock() + n.node = node +} + +// DeadlineTime returns if the node has a deadline and if so what it is +func (n *drainingNode) DeadlineTime() (bool, time.Time) { + n.l.RLock() + defer n.l.RUnlock() + + // Should never happen + if n.node == nil || n.node.DrainStrategy == nil { + return false, time.Time{} + } + + return n.node.DrainStrategy.DeadlineTime() +} + +// DeadlineAllocs returns the set of allocations that should be drained given a +// node is at its deadline +func (n *drainingNode) DeadlineAllocs() ([]*structs.Allocation, error) { + n.l.RLock() + defer n.l.RUnlock() + return nil, nil +} diff --git a/nomad/drainerv2/watch_jobs.go b/nomad/drainerv2/watch_jobs.go new file mode 100644 index 000000000000..836cea6856e6 --- /dev/null +++ b/nomad/drainerv2/watch_jobs.go @@ -0,0 +1,8 @@ +package drainerv2 + +import "github.com/hashicorp/nomad/nomad/structs" + +type DrainingJobWatcher interface { + RegisterJob(jobID, namespace string) + Drain() <-chan []*structs.Allocation +} diff --git a/nomad/drainerv2/watch_nodes.go b/nomad/drainerv2/watch_nodes.go new file mode 100644 index 000000000000..623c2edb234f --- /dev/null +++ b/nomad/drainerv2/watch_nodes.go @@ -0,0 +1,7 @@ +package drainerv2 + +import "github.com/hashicorp/nomad/nomad/structs" + +type DrainingNodeWatcher interface { + Transistioning() <-chan []*structs.Node +} From e566fcdf5f7443b5931db7ea4f0109ce8f974360 Mon Sep 17 00:00:00 2001 From: Alex Dadgar Date: Fri, 2 Mar 2018 15:19:55 -0800 Subject: [PATCH 37/79] drain heap --- nomad/drainerv2/drain_heap.go | 165 +++++++++++++++++++++++++++-- nomad/drainerv2/drain_heap_test.go | 149 ++++++++++++++++++++++++++ nomad/drainerv2/drainer.go | 2 +- 3 files changed, 309 insertions(+), 7 deletions(-) create mode 100644 nomad/drainerv2/drain_heap_test.go diff --git a/nomad/drainerv2/drain_heap.go b/nomad/drainerv2/drain_heap.go index 899b8dd16b7f..b661447e2b12 100644 --- a/nomad/drainerv2/drain_heap.go +++ b/nomad/drainerv2/drain_heap.go @@ -1,20 +1,173 @@ package drainerv2 import ( + "context" + "sync" "time" - - "github.com/hashicorp/nomad/nomad/structs" ) +// DrainDeadlineNotifier allows batch notification of nodes that have reached +// their drain deadline. 
type DrainDeadlineNotifier interface { - NextBatch() <-chan []*structs.Node + // NextBatch returns the next batch of nodes that have reached their + // deadline. + NextBatch() <-chan []string + + // Remove removes the given node from being tracked for a deadline. Remove(nodeID string) + + // Watch marks the given node for being watched for its deadline. Watch(nodeID string, deadline time.Time) } +// TODO Make any of what I just wrote true :) Initially it is just a simple +// implementation. + +// deadlineHeap implements the DrainDeadlineNotifier and is backed by a min-heap +// to efficiently determine the next deadlining node. It also supports +// coalescing several deadlines into a single emission. type deadlineHeap struct { + ctx context.Context + coalesceWindow time.Duration + batch chan []string + nodes map[string]time.Time + trigger chan string + l sync.RWMutex +} + +// NewDeadlineHeap returns a new deadline heap that coalesces for the given +// duration and will stop watching when the passed context is cancelled. +func NewDeadlineHeap(ctx context.Context, coalesceWindow time.Duration) *deadlineHeap { + d := &deadlineHeap{ + ctx: ctx, + coalesceWindow: coalesceWindow, + batch: make(chan []string, 4), + nodes: make(map[string]time.Time, 64), + trigger: make(chan string, 4), + } + + go d.watch() + return d +} + +func (d *deadlineHeap) watch() { + timer := time.NewTimer(0 * time.Millisecond) + if !timer.Stop() { + select { + case <-timer.C: + default: + } + } + + var nextDeadline time.Time + defer timer.Stop() + + for { + select { + case <-d.ctx.Done(): + return + case <-timer.C: + if nextDeadline.IsZero() { + continue + } + + d.l.Lock() + var batch []string + for nodeID, nodeDeadline := range d.nodes { + if !nodeDeadline.After(nextDeadline) { + batch = append(batch, nodeID) + } + } + + // If there is nothing exit early + if len(batch) == 0 { + d.l.Unlock() + goto CALC + } + + // Send the batch + select { + case d.batch <- batch: + case <-d.ctx.Done(): + d.l.Unlock() + return + } + + // Clean up the nodes + for _, nodeID := range batch { + delete(d.nodes, nodeID) + } + d.l.Unlock() + case <-d.trigger: + } + + CALC: + deadline, ok := d.calculateNextDeadline() + if !ok { + continue + } + + if !deadline.Equal(nextDeadline) { + timer.Reset(deadline.Sub(time.Now())) + nextDeadline = deadline + } + } +} + +// calculateNextDeadline returns the next deadline in which to scan for +// deadlined nodes. It applies the coalesce window. +func (d *deadlineHeap) calculateNextDeadline() (time.Time, bool) { + d.l.Lock() + defer d.l.Unlock() + + if len(d.nodes) == 0 { + return time.Time{}, false + } + + // Calculate the new timer value + var deadline time.Time + for _, v := range d.nodes { + if deadline.IsZero() || v.Before(deadline) { + deadline = v + } + } + + var maxWithinWindow time.Time + coalescedDeadline := deadline.Add(d.coalesceWindow) + for _, nodeDeadline := range d.nodes { + if nodeDeadline.Before(coalescedDeadline) { + if maxWithinWindow.IsZero() || nodeDeadline.After(maxWithinWindow) { + maxWithinWindow = nodeDeadline + } + } + } + + return maxWithinWindow, true } -func (d *deadlineHeap) NextBatch() <-chan []structs.Node { return nil } -func (d *deadlineHeap) Remove(nodeID string) {} -func (d *deadlineHeap) Watch(nodeID string, deadline time.Time) {} +// NextBatch returns the next batch of nodes to be drained. 
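+// Nodes whose deadlines fall within the coalesce window are emitted together
+// as a single batch.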
+func (d *deadlineHeap) NextBatch() <-chan []string { + return d.batch +} + +func (d *deadlineHeap) Remove(nodeID string) { + d.l.Lock() + defer d.l.Unlock() + delete(d.nodes, nodeID) + + select { + case d.trigger <- nodeID: + default: + } +} + +func (d *deadlineHeap) Watch(nodeID string, deadline time.Time) { + d.l.Lock() + defer d.l.Unlock() + d.nodes[nodeID] = deadline + + select { + case d.trigger <- nodeID: + default: + } +} diff --git a/nomad/drainerv2/drain_heap_test.go b/nomad/drainerv2/drain_heap_test.go new file mode 100644 index 000000000000..a47a98ff7473 --- /dev/null +++ b/nomad/drainerv2/drain_heap_test.go @@ -0,0 +1,149 @@ +package drainerv2 + +import ( + "context" + "testing" + "time" + + "github.com/stretchr/testify/require" +) + +func TestDeadlineHeap_Interface(t *testing.T) { + t.Parallel() + require := require.New(t) + h := NewDeadlineHeap(context.Background(), 1*time.Second) + require.Implements((*DrainDeadlineNotifier)(nil), h) +} + +func TestDeadlineHeap_WatchAndGet(t *testing.T) { + t.Parallel() + require := require.New(t) + h := NewDeadlineHeap(context.Background(), 1*time.Second) + + now := time.Now() + nodeID := "1" + wait := 10 * time.Millisecond + deadline := now.Add(wait) + h.Watch(nodeID, deadline) + + var batch []string + select { + case batch = <-h.NextBatch(): + case <-time.After(2 * wait): + t.Fatal("timeout") + } + + require.Len(batch, 1) + require.Equal(nodeID, batch[0]) +} + +func TestDeadlineHeap_WatchThenUpdateAndGet(t *testing.T) { + t.Parallel() + require := require.New(t) + h := NewDeadlineHeap(context.Background(), 1*time.Second) + + now := time.Now() + nodeID := "1" + wait := 10 * time.Millisecond + deadline := now.Add(wait) + + // Initially watch way in the future + h.Watch(nodeID, now.Add(24*time.Hour)) + + // Rewatch + h.Watch(nodeID, deadline) + + var batch []string + select { + case batch = <-h.NextBatch(): + case <-time.After(2 * wait): + t.Fatal("timeout") + } + + require.Len(batch, 1) + require.Equal(nodeID, batch[0]) +} + +func TestDeadlineHeap_MultiwatchAndDelete(t *testing.T) { + t.Parallel() + require := require.New(t) + h := NewDeadlineHeap(context.Background(), 1*time.Second) + + now := time.Now() + wait := 50 * time.Millisecond + deadline := now.Add(wait) + + nodeID1 := "1" + nodeID2 := "2" + h.Watch(nodeID1, deadline) + h.Watch(nodeID2, deadline) + + time.Sleep(1 * time.Millisecond) + h.Remove(nodeID2) + + var batch []string + select { + case batch = <-h.NextBatch(): + case <-time.After(2 * wait): + t.Fatal("timeout") + } + + require.Len(batch, 1) + require.Equal(nodeID1, batch[0]) +} + +func TestDeadlineHeap_WatchCoalesce(t *testing.T) { + t.Parallel() + require := require.New(t) + h := NewDeadlineHeap(context.Background(), 250*time.Millisecond) + + now := time.Now() + + group1 := map[string]time.Time{ + "1": now.Add(5 * time.Millisecond), + "2": now.Add(10 * time.Millisecond), + "3": now.Add(20 * time.Millisecond), + "4": now.Add(100 * time.Millisecond), + } + + group2 := map[string]time.Time{ + "10": now.Add(355 * time.Millisecond), + "11": now.Add(360 * time.Millisecond), + } + + for _, g := range []map[string]time.Time{group1, group2} { + for n, d := range g { + h.Watch(n, d) + } + } + + var batch []string + select { + case batch = <-h.NextBatch(): + case <-time.After(1 * time.Second): + t.Fatal("timeout") + } + + require.Len(batch, len(group1)) + for nodeID := range group1 { + require.Contains(batch, nodeID) + } + batch = nil + + select { + case batch = <-h.NextBatch(): + case <-time.After(2 * time.Second): + 
t.Fatal("timeout") + } + + require.Len(batch, len(group2)) + for nodeID := range group2 { + require.Contains(batch, nodeID) + } + + select { + case <-h.NextBatch(): + t.Fatal("unexpected batch") + case <-time.After(100 * time.Millisecond): + } +} diff --git a/nomad/drainerv2/drainer.go b/nomad/drainerv2/drainer.go index a7156dc91d9c..6e9b4b73b570 100644 --- a/nomad/drainerv2/drainer.go +++ b/nomad/drainerv2/drainer.go @@ -140,7 +140,7 @@ func (n *NodeDrainer) run(ctx context.Context) { } } -func (n *NodeDrainer) handleDeadlinedNodes(nodes []*structs.Node) { +func (n *NodeDrainer) handleDeadlinedNodes(nodes []string) { // TODO } From da368105e6317564ad5d251780ff26c57130f656 Mon Sep 17 00:00:00 2001 From: Alex Dadgar Date: Fri, 2 Mar 2018 17:15:38 -0800 Subject: [PATCH 38/79] node watcher --- nomad/drainerv2/drain_interfaces.go | 1 - nomad/drainerv2/drain_testing.go | 46 ++++++++ nomad/drainerv2/drainer.go | 27 +++-- nomad/drainerv2/draining_node.go | 8 +- nomad/drainerv2/watch_nodes.go | 177 +++++++++++++++++++++++++++- nomad/drainerv2/watch_nodes_test.go | 153 ++++++++++++++++++++++++ 6 files changed, 397 insertions(+), 15 deletions(-) delete mode 100644 nomad/drainerv2/drain_interfaces.go create mode 100644 nomad/drainerv2/drain_testing.go create mode 100644 nomad/drainerv2/watch_nodes_test.go diff --git a/nomad/drainerv2/drain_interfaces.go b/nomad/drainerv2/drain_interfaces.go deleted file mode 100644 index 008537619830..000000000000 --- a/nomad/drainerv2/drain_interfaces.go +++ /dev/null @@ -1 +0,0 @@ -package drainerv2 diff --git a/nomad/drainerv2/drain_testing.go b/nomad/drainerv2/drain_testing.go new file mode 100644 index 000000000000..af143894bd93 --- /dev/null +++ b/nomad/drainerv2/drain_testing.go @@ -0,0 +1,46 @@ +package drainerv2 + +import ( + "sync" + + "github.com/hashicorp/nomad/nomad/structs" +) + +type MockNodeTrackerEvent struct { + NodeUpdate *structs.Node + NodeRemove string +} + +type MockNodeTracker struct { + Nodes map[string]*structs.Node + Events []*MockNodeTrackerEvent + sync.Mutex +} + +func NewMockNodeTracker() *MockNodeTracker { + return &MockNodeTracker{ + Nodes: make(map[string]*structs.Node), + Events: make([]*MockNodeTrackerEvent, 0, 16), + } +} + +func (m *MockNodeTracker) Tracking(nodeID string) (*structs.Node, bool) { + m.Lock() + defer m.Unlock() + n, ok := m.Nodes[nodeID] + return n, ok +} + +func (m *MockNodeTracker) Remove(nodeID string) { + m.Lock() + defer m.Unlock() + delete(m.Nodes, nodeID) + m.Events = append(m.Events, &MockNodeTrackerEvent{NodeRemove: nodeID}) +} + +func (m *MockNodeTracker) Update(node *structs.Node) { + m.Lock() + defer m.Unlock() + m.Nodes[node.ID] = node + m.Events = append(m.Events, &MockNodeTrackerEvent{NodeUpdate: node}) +} diff --git a/nomad/drainerv2/drainer.go b/nomad/drainerv2/drainer.go index 6e9b4b73b570..18b07eff5606 100644 --- a/nomad/drainerv2/drainer.go +++ b/nomad/drainerv2/drainer.go @@ -4,12 +4,19 @@ import ( "context" "log" "sync" + "time" "github.com/hashicorp/nomad/nomad/state" "github.com/hashicorp/nomad/nomad/structs" "golang.org/x/time/rate" ) +var ( + // stateReadErrorDelay is the delay to apply before retrying reading state + // when there is an error + stateReadErrorDelay = 1 * time.Second +) + const ( // LimitStateQueriesPerSecond is the number of state queries allowed per // second @@ -27,8 +34,14 @@ type AllocDrainer interface { drain(allocs []*structs.Allocation) } -type DrainingJobWatcherFactory func(context.Context, *rate.Limiter, AllocDrainer) DrainingJobWatcher -type 
DrainingNodeWatcherFactory func(context.Context, *rate.Limiter, AllocDrainer) DrainingNodeWatcher +type NodeTracker interface { + Tracking(nodeID string) (*structs.Node, bool) + Remove(nodeID string) + Update(node *structs.Node) +} + +type DrainingJobWatcherFactory func(context.Context, *rate.Limiter, *state.StateStore, *log.Logger, AllocDrainer) DrainingJobWatcher +type DrainingNodeWatcherFactory func(context.Context, *rate.Limiter, *state.StateStore, *log.Logger, NodeTracker) DrainingNodeWatcher type DrainDeadlineNotifierFactory func(context.Context) DrainDeadlineNotifier type NodeDrainerConfig struct { @@ -116,8 +129,8 @@ func (n *NodeDrainer) flush() { } n.ctx, n.exitFn = context.WithCancel(context.Background()) - n.jobWatcher = n.jobFactory(n.ctx, n.queryLimiter, n) - n.nodeWatcher = n.nodeFactory(n.ctx, n.queryLimiter, n) + n.jobWatcher = n.jobFactory(n.ctx, n.queryLimiter, n.state, n.logger, n) + n.nodeWatcher = n.nodeFactory(n.ctx, n.queryLimiter, n.state, n.logger, n) n.deadlineNotifier = n.deadlineNotifierFactory(n.ctx) n.nodes = make(map[string]*drainingNode, 32) n.doneNodeCh = make(chan string, 4) @@ -130,8 +143,6 @@ func (n *NodeDrainer) run(ctx context.Context) { return case nodes := <-n.deadlineNotifier.NextBatch(): n.handleDeadlinedNodes(nodes) - case nodes := <-n.nodeWatcher.Transistioning(): - n.handleNodeDrainTransistion(nodes) case allocs := <-n.jobWatcher.Drain(): n.handleJobAllocDrain(allocs) case node := <-n.doneNodeCh: @@ -144,10 +155,6 @@ func (n *NodeDrainer) handleDeadlinedNodes(nodes []string) { // TODO } -func (n *NodeDrainer) handleNodeDrainTransistion(nodes []*structs.Node) { - // TODO -} - func (n *NodeDrainer) handleJobAllocDrain(allocs []*structs.Allocation) { // TODO diff --git a/nomad/drainerv2/draining_node.go b/nomad/drainerv2/draining_node.go index 3150be1fd52d..32233573b3e3 100644 --- a/nomad/drainerv2/draining_node.go +++ b/nomad/drainerv2/draining_node.go @@ -12,7 +12,7 @@ import ( // using a singleton object type drainCoordinator interface { - done(nodeID string) + nodeDone(nodeID string) } func (n *NodeDrainer) nodeDone(nodeID string) { @@ -37,6 +37,12 @@ func NewDrainingNode(node *structs.Node, state *state.StateStore, coordinator dr } } +func (n *drainingNode) GetNode() *structs.Node { + n.l.Lock() + defer n.l.Unlock() + return n.node +} + func (n *drainingNode) Update(node *structs.Node) { n.l.Lock() defer n.l.Unlock() diff --git a/nomad/drainerv2/watch_nodes.go b/nomad/drainerv2/watch_nodes.go index 623c2edb234f..ddf5f2b9a8f9 100644 --- a/nomad/drainerv2/watch_nodes.go +++ b/nomad/drainerv2/watch_nodes.go @@ -1,7 +1,178 @@ package drainerv2 -import "github.com/hashicorp/nomad/nomad/structs" +import ( + "context" + "log" + "time" -type DrainingNodeWatcher interface { - Transistioning() <-chan []*structs.Node + memdb "github.com/hashicorp/go-memdb" + "github.com/hashicorp/nomad/nomad/state" + "github.com/hashicorp/nomad/nomad/structs" + "golang.org/x/time/rate" +) + +// DrainingNodeWatcher is the interface for watching for draining nodes. +type DrainingNodeWatcher interface{} + +// Tracking returns the whether the node is being tracked and if so the copy of +// the node object that is tracked. 
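+// Tracking is part of the NodeTracker interface and is consumed by the node
+// drain watcher to decide whether a node should be added, updated or removed.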
+func (n *NodeDrainer) Tracking(nodeID string) (*structs.Node, bool) { + n.l.RLock() + defer n.l.RUnlock() + + draining, ok := n.nodes[nodeID] + if !ok { + return nil, false + } + + return draining.GetNode(), true +} + +// Remove removes the given node from being tracked +func (n *NodeDrainer) Remove(nodeID string) { + n.l.Lock() + defer n.l.Unlock() + delete(n.nodes, nodeID) +} + +// Update updates the node, either updating the tracked version or starting to +// track the node. +func (n *NodeDrainer) Update(node *structs.Node) { + n.l.Lock() + defer n.l.Unlock() + + if node == nil { + return + } + + draining, ok := n.nodes[node.ID] + if !ok { + n.nodes[node.ID] = NewDrainingNode(node, n.state, n) + return + } + + draining.Update(node) +} + +// nodeDrainWatcher is used to watch nodes that are entering, leaving or +// changing their drain strategy. +type nodeDrainWatcher struct { + ctx context.Context + logger *log.Logger + + // state is the state that is watched for state changes. + state *state.StateStore + + // limiter is used to limit the rate of blocking queries + limiter *rate.Limiter + + // tracker is the object that is tracking the nodes and provides us with the + // needed callbacks + tracker NodeTracker +} + +// NewNodeDrainWatcher returns a new node drain watcher. +func NewNodeDrainWatcher(ctx context.Context, limiter *rate.Limiter, state *state.StateStore, logger *log.Logger, tracker NodeTracker) *nodeDrainWatcher { + w := &nodeDrainWatcher{ + ctx: ctx, + limiter: limiter, + logger: logger, + tracker: tracker, + state: state, + } + + go w.watch() + return w +} + +// watch is the long lived watching routine that detects node changes. +func (w *nodeDrainWatcher) watch() { + nindex := uint64(1) + for { + w.logger.Printf("[TRACE] nomad.drain.node_watcher: getting nodes at index %d", nindex) + nodes, index, err := w.getNodes(nindex) + if err != nil { + if err == context.Canceled { + w.logger.Printf("[TRACE] nomad.drain.node_watcher: shutting down") + return + } + + w.logger.Printf("[ERR] nomad.drain.node_watcher: error watching node updates at index %d: %v", nindex, err) + select { + case <-w.ctx.Done(): + w.logger.Printf("[TRACE] nomad.drain.node_watcher: shutting down") + return + case <-time.After(stateReadErrorDelay): + continue + } + } + + // update index for next run + nindex = index + + for _, node := range nodes { + newDraining := node.DrainStrategy != nil + currentNode, tracked := w.tracker.Tracking(node.ID) + + switch { + // If the node is tracked but not draining, untrack + case tracked && !newDraining: + w.logger.Printf("[TRACE] nomad.drain.node_watcher: tracked node %q is no longer draining", node.ID) + w.tracker.Remove(node.ID) + + // If the node is not being tracked but is draining, track + case !tracked && newDraining: + w.logger.Printf("[TRACE] nomad.drain.node_watcher: untracked node %q is draining", node.ID) + w.tracker.Update(node) + + // If the node is being tracked but has changed, update: + case tracked && newDraining && !currentNode.DrainStrategy.Equal(node.DrainStrategy): + w.logger.Printf("[TRACE] nomad.drain.node_watcher: tracked node %q has updated drain", node.ID) + w.tracker.Update(node) + default: + w.logger.Printf("[TRACE] nomad.drain.node_watcher: node %q at index %v: tracked %v, draining %v", node.ID, node.ModifyIndex, tracked, newDraining) + } + } + } +} + +// getNodes returns all nodes blocking until the nodes are after the given index. 
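+// It waits on the shared rate limiter before issuing the blocking query and
+// returns the index at which the result was read so the caller can use it as
+// the minimum index for the next query.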
+func (w *nodeDrainWatcher) getNodes(minIndex uint64) ([]*structs.Node, uint64, error) { + if err := w.limiter.Wait(w.ctx); err != nil { + return nil, 0, err + } + + resp, index, err := w.state.BlockingQuery(w.getNodesImpl, minIndex, w.ctx) + if err != nil { + return nil, 0, err + } + + return resp.([]*structs.Node), index, nil +} + +// getNodesImpl is used to get nodes from the state store, returning the set of +// nodes and the given index. +func (w *nodeDrainWatcher) getNodesImpl(ws memdb.WatchSet, state *state.StateStore) (interface{}, uint64, error) { + iter, err := state.Nodes(ws) + if err != nil { + return nil, 0, err + } + + index, err := state.Index("nodes") + if err != nil { + return nil, 0, err + } + + resp := make([]*structs.Node, 0, 64) + for { + raw := iter.Next() + if raw == nil { + break + } + + node := raw.(*structs.Node) + resp = append(resp, node) + } + + return resp, index, nil } diff --git a/nomad/drainerv2/watch_nodes_test.go b/nomad/drainerv2/watch_nodes_test.go new file mode 100644 index 000000000000..8b3a63e1c250 --- /dev/null +++ b/nomad/drainerv2/watch_nodes_test.go @@ -0,0 +1,153 @@ +package drainerv2 + +import ( + "context" + "testing" + "time" + + "github.com/hashicorp/nomad/helper/testlog" + "github.com/hashicorp/nomad/nomad/mock" + "github.com/hashicorp/nomad/nomad/state" + "github.com/hashicorp/nomad/nomad/structs" + "github.com/hashicorp/nomad/testutil" + "github.com/stretchr/testify/require" + "golang.org/x/time/rate" +) + +func testNodeDrainWatcher(t *testing.T) (*nodeDrainWatcher, *state.StateStore, *MockNodeTracker) { + t.Helper() + + sconfig := &state.StateStoreConfig{ + LogOutput: testlog.NewWriter(t), + Region: "global", + } + state, err := state.NewStateStore(sconfig) + if err != nil { + t.Fatalf("failed to create state store: %v", err) + } + + limiter := rate.NewLimiter(100.0, 100) + logger := testlog.Logger(t) + m := NewMockNodeTracker() + w := NewNodeDrainWatcher(context.Background(), limiter, state, logger, m) + return w, state, m +} + +func TestNodeDrainWatcher_Interface(t *testing.T) { + t.Parallel() + require := require.New(t) + w, _, _ := testNodeDrainWatcher(t) + require.Implements((*DrainingNodeWatcher)(nil), w) +} + +func TestNodeDrainWatcher_AddDraining(t *testing.T) { + t.Parallel() + require := require.New(t) + _, state, m := testNodeDrainWatcher(t) + + // Create two nodes, one draining and one not draining + n1, n2 := mock.Node(), mock.Node() + n2.DrainStrategy = &structs.DrainStrategy{ + DrainSpec: structs.DrainSpec{ + Deadline: time.Hour, + }, + ForceDeadline: time.Now().Add(time.Hour), + } + + require.Nil(state.UpsertNode(100, n1)) + require.Nil(state.UpsertNode(101, n2)) + + testutil.WaitForResult(func() (bool, error) { + return len(m.Events) == 1, nil + }, func(err error) { + t.Fatal("No node drain events") + }) + + _, ok1 := m.Tracking(n1.ID) + out2, ok2 := m.Tracking(n2.ID) + require.False(ok1) + require.True(ok2) + require.Equal(n2, out2) + +} + +func TestNodeDrainWatcher_Remove(t *testing.T) { + t.Parallel() + require := require.New(t) + _, state, m := testNodeDrainWatcher(t) + + // Create a draining node + n := mock.Node() + n.DrainStrategy = &structs.DrainStrategy{ + DrainSpec: structs.DrainSpec{ + Deadline: time.Hour, + }, + ForceDeadline: time.Now().Add(time.Hour), + } + + // Wait for it to be tracked + require.Nil(state.UpsertNode(100, n)) + testutil.WaitForResult(func() (bool, error) { + return len(m.Events) == 1, nil + }, func(err error) { + t.Fatal("No node drain events") + }) + + out, ok := m.Tracking(n.ID) + 
require.True(ok) + require.Equal(n, out) + + // Change the node to be not draining and wait for it to be untracked + require.Nil(state.UpdateNodeDrain(101, n.ID, nil)) + testutil.WaitForResult(func() (bool, error) { + return len(m.Events) == 2, nil + }, func(err error) { + t.Fatal("No new node drain events") + }) + + _, ok = m.Tracking(n.ID) + require.False(ok) +} + +func TestNodeDrainWatcher_Update(t *testing.T) { + t.Parallel() + require := require.New(t) + _, state, m := testNodeDrainWatcher(t) + + // Create a draining node + n := mock.Node() + n.DrainStrategy = &structs.DrainStrategy{ + DrainSpec: structs.DrainSpec{ + Deadline: time.Hour, + }, + ForceDeadline: time.Now().Add(time.Hour), + } + + // Wait for it to be tracked + require.Nil(state.UpsertNode(100, n)) + testutil.WaitForResult(func() (bool, error) { + return len(m.Events) == 1, nil + }, func(err error) { + t.Fatal("No node drain events") + }) + + out, ok := m.Tracking(n.ID) + require.True(ok) + require.Equal(n, out) + + // Change the node to have a new spec + s2 := n.DrainStrategy.Copy() + s2.Deadline += time.Hour + require.Nil(state.UpdateNodeDrain(101, n.ID, s2)) + + // Wait for it to be updated + testutil.WaitForResult(func() (bool, error) { + return len(m.Events) == 2, nil + }, func(err error) { + t.Fatal("No new node drain events") + }) + + out, ok = m.Tracking(n.ID) + require.True(ok) + require.Equal(out.DrainStrategy, s2) +} From d45532d038a4f887be2ee870ca3b55cc5f8bf9a2 Mon Sep 17 00:00:00 2001 From: Alex Dadgar Date: Fri, 2 Mar 2018 17:24:48 -0800 Subject: [PATCH 39/79] Node's being untracked or having updated deadlines, updates the deadliner --- nomad/drainerv2/watch_nodes.go | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/nomad/drainerv2/watch_nodes.go b/nomad/drainerv2/watch_nodes.go index ddf5f2b9a8f9..7b0bd8573389 100644 --- a/nomad/drainerv2/watch_nodes.go +++ b/nomad/drainerv2/watch_nodes.go @@ -32,7 +32,11 @@ func (n *NodeDrainer) Tracking(nodeID string) (*structs.Node, bool) { func (n *NodeDrainer) Remove(nodeID string) { n.l.Lock() defer n.l.Unlock() + + // TODO test the notifier is updated + // Remove it from being tracked and remove it from the dealiner delete(n.nodes, nodeID) + n.deadlineNotifier.Remove(nodeID) } // Update updates the node, either updating the tracked version or starting to @@ -51,7 +55,21 @@ func (n *NodeDrainer) Update(node *structs.Node) { return } + // Update it and update the dealiner draining.Update(node) + + // TODO test the notifier is updated + if inf, deadline := node.DrainStrategy.DeadlineTime(); !inf { + n.deadlineNotifier.Watch(node.ID, deadline) + } else { + // TODO think about handling any race that may occur. I believe it is + // totally fine as long as the handlers are locked. 
+ + // There is an infinite deadline so it shouldn't be tracked for + // deadlining + n.deadlineNotifier.Remove(node.ID) + } + } // nodeDrainWatcher is used to watch nodes that are entering, leaving or From 0e51b2065745c94651c74231229bd2bcf376ec58 Mon Sep 17 00:00:00 2001 From: Alex Dadgar Date: Tue, 6 Mar 2018 10:12:17 -0800 Subject: [PATCH 40/79] job watcher --- nomad/drainerv2/drain_testing.go | 5 +- nomad/drainerv2/drainer.go | 7 +- nomad/drainerv2/watch_jobs.go | 411 +++++++++++++++++++++++++++- nomad/drainerv2/watch_jobs_test.go | 372 +++++++++++++++++++++++++ nomad/drainerv2/watch_nodes.go | 43 +-- nomad/drainerv2/watch_nodes_test.go | 69 +++-- nomad/state/testing.go | 5 +- nomad/structs/structs.go | 20 ++ 8 files changed, 888 insertions(+), 44 deletions(-) create mode 100644 nomad/drainerv2/watch_jobs_test.go diff --git a/nomad/drainerv2/drain_testing.go b/nomad/drainerv2/drain_testing.go index af143894bd93..60d710e4a593 100644 --- a/nomad/drainerv2/drain_testing.go +++ b/nomad/drainerv2/drain_testing.go @@ -24,11 +24,10 @@ func NewMockNodeTracker() *MockNodeTracker { } } -func (m *MockNodeTracker) Tracking(nodeID string) (*structs.Node, bool) { +func (m *MockNodeTracker) TrackedNodes() map[string]*structs.Node { m.Lock() defer m.Unlock() - n, ok := m.Nodes[nodeID] - return n, ok + return m.Nodes } func (m *MockNodeTracker) Remove(nodeID string) { diff --git a/nomad/drainerv2/drainer.go b/nomad/drainerv2/drainer.go index 18b07eff5606..d78019b8499e 100644 --- a/nomad/drainerv2/drainer.go +++ b/nomad/drainerv2/drainer.go @@ -35,12 +35,12 @@ type AllocDrainer interface { } type NodeTracker interface { - Tracking(nodeID string) (*structs.Node, bool) + TrackedNodes() map[string]*structs.Node Remove(nodeID string) Update(node *structs.Node) } -type DrainingJobWatcherFactory func(context.Context, *rate.Limiter, *state.StateStore, *log.Logger, AllocDrainer) DrainingJobWatcher +type DrainingJobWatcherFactory func(context.Context, *rate.Limiter, *state.StateStore, *log.Logger) DrainingJobWatcher type DrainingNodeWatcherFactory func(context.Context, *rate.Limiter, *state.StateStore, *log.Logger, NodeTracker) DrainingNodeWatcher type DrainDeadlineNotifierFactory func(context.Context) DrainDeadlineNotifier @@ -129,7 +129,7 @@ func (n *NodeDrainer) flush() { } n.ctx, n.exitFn = context.WithCancel(context.Background()) - n.jobWatcher = n.jobFactory(n.ctx, n.queryLimiter, n.state, n.logger, n) + n.jobWatcher = n.jobFactory(n.ctx, n.queryLimiter, n.state, n.logger) n.nodeWatcher = n.nodeFactory(n.ctx, n.queryLimiter, n.state, n.logger, n) n.deadlineNotifier = n.deadlineNotifierFactory(n.ctx) n.nodes = make(map[string]*drainingNode, 32) @@ -146,6 +146,7 @@ func (n *NodeDrainer) run(ctx context.Context) { case allocs := <-n.jobWatcher.Drain(): n.handleJobAllocDrain(allocs) case node := <-n.doneNodeCh: + // TODO probably remove this as a channel n.handleDoneNode(node) } } diff --git a/nomad/drainerv2/watch_jobs.go b/nomad/drainerv2/watch_jobs.go index 836cea6856e6..a2e6ef45ef50 100644 --- a/nomad/drainerv2/watch_jobs.go +++ b/nomad/drainerv2/watch_jobs.go @@ -1,8 +1,417 @@ package drainerv2 -import "github.com/hashicorp/nomad/nomad/structs" +import ( + "context" + "fmt" + "log" + "sync" + "time" + memdb "github.com/hashicorp/go-memdb" + "github.com/hashicorp/nomad/helper" + "github.com/hashicorp/nomad/nomad/state" + "github.com/hashicorp/nomad/nomad/structs" + "golang.org/x/time/rate" +) + +// DrainingJobWatcher is the interface for watching a job drain type DrainingJobWatcher interface { + // 
RegisterJob is used to start watching a draining job RegisterJob(jobID, namespace string) + + // TODO This should probably be a drain future such that we can block the + // next loop till the raft apply happens such that we don't emit the same + // drain many times. We would get the applied index back and block till + // then. + // Drain is used to emit allocations that should be drained. Drain() <-chan []*structs.Allocation + + // Migrated is allocations for draining jobs that have transistioned to + // stop. There is no guarantee that duplicates won't be published. + Migrated() <-chan []*structs.Allocation +} + +// drainingJobWatcher is used to watch draining jobs and emit events when +// draining allocations have replacements +type drainingJobWatcher struct { + ctx context.Context + logger *log.Logger + + // state is the state that is watched for state changes. + state *state.StateStore + + // limiter is used to limit the rate of blocking queries + limiter *rate.Limiter + + // jobs is the set of tracked jobs. + jobs map[structs.JobNs]struct{} + + // queryCtx is used to cancel a blocking query. + queryCtx context.Context + queryCancel context.CancelFunc + + // drainCh and migratedCh are used to emit allocations + drainCh chan []*structs.Allocation + migratedCh chan []*structs.Allocation + + l sync.RWMutex +} + +// NewDrainingJobWatcher returns a new job watcher. The caller is expected to +// cancel the context to clean up the drainer. +func NewDrainingJobWatcher(ctx context.Context, limiter *rate.Limiter, state *state.StateStore, logger *log.Logger) *drainingJobWatcher { + + // Create a context that can cancel the blocking query so that when a new + // job gets registered it is handled. + queryCtx, queryCancel := context.WithCancel(ctx) + + w := &drainingJobWatcher{ + ctx: ctx, + queryCtx: queryCtx, + queryCancel: queryCancel, + limiter: limiter, + logger: logger, + state: state, + jobs: make(map[structs.JobNs]struct{}, 64), + drainCh: make(chan []*structs.Allocation, 8), + migratedCh: make(chan []*structs.Allocation, 8), + } + + go w.watch() + return w +} + +// RegisterJob marks the given job as draining and adds it to being watched. +func (w *drainingJobWatcher) RegisterJob(jobID, namespace string) { + w.l.Lock() + defer w.l.Unlock() + + jns := structs.JobNs{ + ID: jobID, + Namespace: namespace, + } + if _, ok := w.jobs[jns]; ok { + return + } + + // Add the job and cancel the context + w.jobs[jns] = struct{}{} + w.queryCancel() + + // Create a new query context + w.queryCtx, w.queryCancel = context.WithCancel(w.ctx) +} + +// Drain returns the channel that emits allocations to drain. +func (w *drainingJobWatcher) Drain() <-chan []*structs.Allocation { + return w.drainCh +} + +// Migrated returns the channel that emits allocations for draining jobs that +// have been migrated. +func (w *drainingJobWatcher) Migrated() <-chan []*structs.Allocation { + return w.migratedCh +} + +// deregisterJob removes the job from being watched. +func (w *drainingJobWatcher) deregisterJob(jobID, namespace string) { + w.l.Lock() + defer w.l.Unlock() + jns := structs.JobNs{ + ID: jobID, + Namespace: namespace, + } + delete(w.jobs, jns) + w.logger.Printf("[TRACE] nomad.drain.job_watcher: deregistering job %v", jns) +} + +// watch is the long lived watching routine that detects job drain changes. 
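+// Each pass blocks until the allocations of a registered job change, snapshots
+// the state store, computes the drain and migrated sets for every
+// still-registered service job via handleJob, and emits the results on the
+// drain and migrated channels.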
+func (w *drainingJobWatcher) watch() { + jindex := uint64(1) + for { + w.logger.Printf("[TRACE] nomad.drain.job_watcher: getting job allocs at index %d", jindex) + jobAllocs, index, err := w.getJobAllocs(w.getQueryCtx(), jindex) + if err != nil { + if err == context.Canceled { + // Determine if it is a cancel or a shutdown + select { + case <-w.ctx.Done(): + w.logger.Printf("[TRACE] nomad.drain.job_watcher: shutting down") + return + default: + // The query context was cancelled + continue + } + } + + w.logger.Printf("[ERR] nomad.drain.job_watcher: error watching job allocs updates at index %d: %v", jindex, err) + select { + case <-w.ctx.Done(): + w.logger.Printf("[TRACE] nomad.drain.job_watcher: shutting down") + return + case <-time.After(stateReadErrorDelay): + continue + } + } + + // update index for next run + lastHandled := jindex + jindex = index + + // Snapshot the state store + snap, err := w.state.Snapshot() + if err != nil { + w.logger.Printf("[WARN] nomad.drain.job_watcher: failed to snapshot statestore: %v", err) + continue + } + + currentJobs := w.drainingJobs() + var allDrain, allMigrated []*structs.Allocation + for job, allocs := range jobAllocs { + // Check if the job is still registered + if _, ok := currentJobs[job]; !ok { + continue + } + + w.logger.Printf("[TRACE] nomad.drain.job_watcher: handling job %v", job) + + // Lookup the job + job, err := w.state.JobByID(nil, job.Namespace, job.ID) + if err != nil { + w.logger.Printf("[WARN] nomad.drain.job_watcher: failed to lookup job %v: %v", job, err) + continue + } + + // Ignore all non-service jobs + if job.Type != structs.JobTypeService { + w.deregisterJob(job.ID, job.Namespace) + continue + } + + result, err := handleJob(snap, job, allocs, lastHandled) + if err != nil { + w.logger.Printf("[ERR] nomad.drain.job_watcher: handling drain for job %v failed: %v", job, err) + continue + } + + allDrain = append(allDrain, result.drain...) + allMigrated = append(allMigrated, result.migrated...) + + // Stop tracking this job + if result.done { + w.deregisterJob(job.ID, job.Namespace) + } + } + + if allDrain != nil { + select { + case w.drainCh <- allDrain: + case <-w.ctx.Done(): + w.logger.Printf("[TRACE] nomad.drain.job_watcher: shutting down") + return + } + } + + if allMigrated != nil { + select { + case w.migratedCh <- allMigrated: + case <-w.ctx.Done(): + w.logger.Printf("[TRACE] nomad.drain.job_watcher: shutting down") + return + } + } + } +} + +// jobResult is the set of actions to take for a draining job given its current +// state. +type jobResult struct { + // drain is the set of allocations to emit for draining. + drain []*structs.Allocation + + // migrated is the set of allocations to emit as migrated + migrated []*structs.Allocation + + // done marks whether the job has been fully drained. + done bool +} + +// newJobResult returns an initialized jobResult +func newJobResult() *jobResult { + return &jobResult{ + done: true, + } +} + +// handleJob takes the state of a draining job and returns the desired actions. 
+func handleJob(snap *state.StateSnapshot, job *structs.Job, allocs []*structs.Allocation, lastHandledIndex uint64) (*jobResult, error) { + r := newJobResult() + taskGroups := make(map[string]*structs.TaskGroup, len(job.TaskGroups)) + for _, tg := range job.TaskGroups { + if tg.Migrate != nil { + // TODO handle the upgrade path + // Only capture the groups that have a migrate strategy + taskGroups[tg.Name] = tg + } + } + + // Sort the allocations by TG + tgAllocs := make(map[string][]*structs.Allocation, len(taskGroups)) + for _, alloc := range allocs { + if _, ok := taskGroups[alloc.TaskGroup]; !ok { + continue + } + + tgAllocs[alloc.TaskGroup] = append(tgAllocs[alloc.TaskGroup], alloc) + } + + for name, tg := range taskGroups { + allocs := tgAllocs[name] + if err := handleTaskGroup(snap, tg, allocs, lastHandledIndex, r); err != nil { + return nil, fmt.Errorf("drain for task group %q failed: %v", name, err) + } + } + + return r, nil +} + +// handleTaskGroup takes the state of a draining task group and computes the desired actions. +func handleTaskGroup(snap *state.StateSnapshot, tg *structs.TaskGroup, + allocs []*structs.Allocation, lastHandledIndex uint64, result *jobResult) error { + + // Determine how many allocations can be drained + drainingNodes := make(map[string]bool, 4) + healthy := 0 + remainingDrainingAlloc := false + var drainable []*structs.Allocation + + for _, alloc := range allocs { + // Check if the alloc is on a draining node. + onDrainingNode, ok := drainingNodes[alloc.NodeID] + if !ok { + // Look up the node + node, err := snap.NodeByID(nil, alloc.NodeID) + if err != nil { + return err + } + + onDrainingNode = node.DrainStrategy != nil + drainingNodes[node.ID] = onDrainingNode + } + + // Check if the alloc should be considered migrated. A migrated + // allocation is one that is terminal, is on a draining + // allocation, and has only happened since our last handled index to + // avoid emitting many duplicate migrate events. + if alloc.TerminalStatus() && + onDrainingNode && + alloc.ModifyIndex > lastHandledIndex { + result.migrated = append(result.migrated, alloc) + continue + } + + // If the alloc is running and has its deployment status set, it is + // considered healthy from a migration standpoint. + if !alloc.TerminalStatus() && + alloc.DeploymentStatus != nil && + alloc.DeploymentStatus.Healthy != nil { + healthy++ + } + + // An alloc can't be considered for migration if: + // - It isn't on a draining node + // - It is already terminal + // - It has already been marked for draining + if !onDrainingNode || alloc.TerminalStatus() || alloc.DesiredTransition.ShouldMigrate() { + continue + } + + // This alloc is drainable, so capture it and the fact that the job + // isn't done draining yet. + remainingDrainingAlloc = true + drainable = append(drainable, alloc) + } + + // Update the done status + if remainingDrainingAlloc { + result.done = false + } + + // Determine how many we can drain + thresholdCount := tg.Count - tg.Migrate.MaxParallel + numToDrain := healthy - thresholdCount + numToDrain = helper.IntMin(len(drainable), numToDrain) + if numToDrain <= 0 { + return nil + } + + result.drain = append(result.drain, drainable[0:numToDrain]...) 
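+	// As a worked example of the arithmetic above (mirroring the
+	// maxParallel=2, drainingNodeAllocs=9 case in the tests): with a group
+	// count of 10, max_parallel of 2 and 9 healthy allocations, thresholdCount
+	// is 8 and numToDrain is min(len(drainable), 9-8) = 1, so at most one
+	// additional allocation is marked for draining on this pass.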
+ return nil +} + +// getJobAllocs returns all allocations for draining jobs +func (w *drainingJobWatcher) getJobAllocs(ctx context.Context, minIndex uint64) (map[structs.JobNs][]*structs.Allocation, uint64, error) { + if err := w.limiter.Wait(ctx); err != nil { + return nil, 0, err + } + + resp, index, err := w.state.BlockingQuery(w.getJobAllocsImpl, minIndex, ctx) + if err != nil { + return nil, 0, err + } + + return resp.(map[structs.JobNs][]*structs.Allocation), index, nil +} + +// getJobAllocsImpl returns a map of draining jobs to their allocations. +func (w *drainingJobWatcher) getJobAllocsImpl(ws memdb.WatchSet, state *state.StateStore) (interface{}, uint64, error) { + index, err := state.Index("allocs") + if err != nil { + return nil, 0, err + } + + // Capture the draining jobs. + draining := w.drainingJobs() + l := len(draining) + if l == 0 { + return nil, index, nil + } + + // Capture the allocs for each draining job. + resp := make(map[structs.JobNs][]*structs.Allocation, l) + for jns := range draining { + allocs, err := state.AllocsByJob(ws, jns.Namespace, jns.ID, false) + if err != nil { + return nil, index, err + } + + resp[jns] = allocs + } + + return resp, index, nil +} + +// drainingJobs captures the set of draining jobs. +func (w *drainingJobWatcher) drainingJobs() map[structs.JobNs]struct{} { + w.l.RLock() + defer w.l.RUnlock() + + l := len(w.jobs) + if l == 0 { + return nil + } + + draining := make(map[structs.JobNs]struct{}, l) + for k := range w.jobs { + draining[k] = struct{}{} + } + + return draining +} + +// getQueryCtx is a helper for getting the query context. +func (w *drainingJobWatcher) getQueryCtx() context.Context { + w.l.RLock() + defer w.l.RUnlock() + return w.queryCtx } diff --git a/nomad/drainerv2/watch_jobs_test.go b/nomad/drainerv2/watch_jobs_test.go new file mode 100644 index 000000000000..6d9b1846ec5c --- /dev/null +++ b/nomad/drainerv2/watch_jobs_test.go @@ -0,0 +1,372 @@ +package drainerv2 + +import ( + "context" + "fmt" + "testing" + "time" + + "github.com/hashicorp/nomad/helper" + "github.com/hashicorp/nomad/helper/testlog" + "github.com/hashicorp/nomad/nomad/mock" + "github.com/hashicorp/nomad/nomad/state" + "github.com/hashicorp/nomad/nomad/structs" + "github.com/stretchr/testify/require" + "golang.org/x/time/rate" +) + +func testDrainingJobWatcher(t *testing.T) (*drainingJobWatcher, *state.StateStore) { + t.Helper() + + state := state.TestStateStore(t) + limiter := rate.NewLimiter(100.0, 100) + logger := testlog.Logger(t) + w := NewDrainingJobWatcher(context.Background(), limiter, state, logger) + return w, state +} + +func TestDrainingJobWatcher_Interface(t *testing.T) { + t.Parallel() + require := require.New(t) + w, _ := testDrainingJobWatcher(t) + require.Implements((*DrainingJobWatcher)(nil), w) +} + +// DrainingJobWatcher tests: +// TODO Test that several jobs allocation changes get batched +// TODO Test that jobs are deregistered when they have no more to migrate +// TODO Test that the watcher gets triggered on alloc changes +// TODO Test that the watcher cancels its query when a new job is registered + +func TestHandleTaskGroup_AllDone(t *testing.T) { + t.Parallel() + require := require.New(t) + + // Create a non-draining node + state := state.TestStateStore(t) + n := mock.Node() + require.Nil(state.UpsertNode(100, n)) + + job := mock.Job() + require.Nil(state.UpsertJob(101, job)) + + // Create 10 running allocs on the healthy node + var allocs []*structs.Allocation + for i := 0; i < 10; i++ { + a := mock.Alloc() + a.Job = job + 
a.TaskGroup = job.TaskGroups[0].Name + a.NodeID = n.ID + a.DeploymentStatus = &structs.AllocDeploymentStatus{ + Healthy: helper.BoolToPtr(false), + } + allocs = append(allocs, a) + } + require.Nil(state.UpsertAllocs(102, allocs)) + + snap, err := state.Snapshot() + require.Nil(err) + + res := &jobResult{} + require.Nil(handleTaskGroup(snap, job.TaskGroups[0], allocs, 101, res)) + require.Empty(res.drain) + require.Empty(res.migrated) + require.True(res.done) +} + +func TestHandleTaskGroup_AllOnDrainingNodes(t *testing.T) { + t.Parallel() + require := require.New(t) + + // The loop value sets the max parallel for the drain strategy + for i := 1; i < 8; i++ { + // Create a draining node + state := state.TestStateStore(t) + n := mock.Node() + n.DrainStrategy = &structs.DrainStrategy{ + DrainSpec: structs.DrainSpec{ + Deadline: 5 * time.Minute, + }, + ForceDeadline: time.Now().Add(1 * time.Minute), + } + require.Nil(state.UpsertNode(100, n)) + + job := mock.Job() + job.TaskGroups[0].Migrate.MaxParallel = i + require.Nil(state.UpsertJob(101, job)) + + // Create 10 running allocs on the draining node + var allocs []*structs.Allocation + for i := 0; i < 10; i++ { + a := mock.Alloc() + a.Job = job + a.TaskGroup = job.TaskGroups[0].Name + a.NodeID = n.ID + a.DeploymentStatus = &structs.AllocDeploymentStatus{ + Healthy: helper.BoolToPtr(false), + } + allocs = append(allocs, a) + } + require.Nil(state.UpsertAllocs(102, allocs)) + + snap, err := state.Snapshot() + require.Nil(err) + + res := &jobResult{} + require.Nil(handleTaskGroup(snap, job.TaskGroups[0], allocs, 101, res)) + require.Len(res.drain, i) + require.Empty(res.migrated) + require.False(res.done) + } +} + +func TestHandleTaskGroup_MixedHealth(t *testing.T) { + cases := []struct { + maxParallel int + drainingNodeAllocs int + healthSet int + healthUnset int + expectedDrain int + expectedMigrated int + expectedDone bool + }{ + { + maxParallel: 2, + drainingNodeAllocs: 10, + healthSet: 0, + healthUnset: 0, + expectedDrain: 2, + expectedMigrated: 0, + expectedDone: false, + }, + { + maxParallel: 2, + drainingNodeAllocs: 9, + healthSet: 0, + healthUnset: 0, + expectedDrain: 1, + expectedMigrated: 1, + expectedDone: false, + }, + { + maxParallel: 5, + drainingNodeAllocs: 9, + healthSet: 0, + healthUnset: 0, + expectedDrain: 4, + expectedMigrated: 1, + expectedDone: false, + }, + { + maxParallel: 2, + drainingNodeAllocs: 5, + healthSet: 2, + healthUnset: 0, + expectedDrain: 0, + expectedMigrated: 5, + expectedDone: false, + }, + { + maxParallel: 2, + drainingNodeAllocs: 5, + healthSet: 3, + healthUnset: 0, + expectedDrain: 0, + expectedMigrated: 5, + expectedDone: false, + }, + { + maxParallel: 2, + drainingNodeAllocs: 5, + healthSet: 4, + healthUnset: 0, + expectedDrain: 1, + expectedMigrated: 5, + expectedDone: false, + }, + { + maxParallel: 2, + drainingNodeAllocs: 5, + healthSet: 4, + healthUnset: 1, + expectedDrain: 1, + expectedMigrated: 5, + expectedDone: false, + }, + { + maxParallel: 1, + drainingNodeAllocs: 5, + healthSet: 4, + healthUnset: 1, + expectedDrain: 0, + expectedMigrated: 5, + expectedDone: false, + }, + { + maxParallel: 3, + drainingNodeAllocs: 5, + healthSet: 3, + healthUnset: 0, + expectedDrain: 1, + expectedMigrated: 5, + expectedDone: false, + }, + { + maxParallel: 3, + drainingNodeAllocs: 0, + healthSet: 10, + healthUnset: 0, + expectedDrain: 0, + expectedMigrated: 10, + expectedDone: true, + }, + { + // Is the case where deadline is hit and all 10 are just marked + // stopped. We should detect the job as done. 
+ maxParallel: 3, + drainingNodeAllocs: 0, + healthSet: 0, + healthUnset: 0, + expectedDrain: 0, + expectedMigrated: 10, + expectedDone: true, + }, + } + + for cnum, c := range cases { + t.Run(fmt.Sprintf("%d", cnum), func(t *testing.T) { + require := require.New(t) + + // Create a draining node + state := state.TestStateStore(t) + + drainingNode := mock.Node() + drainingNode.DrainStrategy = &structs.DrainStrategy{ + DrainSpec: structs.DrainSpec{ + Deadline: 5 * time.Minute, + }, + ForceDeadline: time.Now().Add(1 * time.Minute), + } + require.Nil(state.UpsertNode(100, drainingNode)) + + healthyNode := mock.Node() + require.Nil(state.UpsertNode(101, healthyNode)) + + job := mock.Job() + job.TaskGroups[0].Migrate.MaxParallel = c.maxParallel + require.Nil(state.UpsertJob(101, job)) + + // Create running allocs on the draining node with health set + var allocs []*structs.Allocation + for i := 0; i < c.drainingNodeAllocs; i++ { + a := mock.Alloc() + a.Job = job + a.TaskGroup = job.TaskGroups[0].Name + a.NodeID = drainingNode.ID + a.DeploymentStatus = &structs.AllocDeploymentStatus{ + Healthy: helper.BoolToPtr(false), + } + allocs = append(allocs, a) + } + + // Create stopped allocs on the draining node + for i := 10 - c.drainingNodeAllocs; i > 0; i-- { + a := mock.Alloc() + a.Job = job + a.TaskGroup = job.TaskGroups[0].Name + a.NodeID = drainingNode.ID + a.DeploymentStatus = &structs.AllocDeploymentStatus{ + Healthy: helper.BoolToPtr(false), + } + a.DesiredStatus = structs.AllocDesiredStatusStop + allocs = append(allocs, a) + } + + // Create allocs on the healthy node with health set + for i := 0; i < c.healthSet; i++ { + a := mock.Alloc() + a.Job = job + a.TaskGroup = job.TaskGroups[0].Name + a.NodeID = healthyNode.ID + a.DeploymentStatus = &structs.AllocDeploymentStatus{ + Healthy: helper.BoolToPtr(false), + } + allocs = append(allocs, a) + } + + // Create allocs on the healthy node with health not set + for i := 0; i < c.healthUnset; i++ { + a := mock.Alloc() + a.Job = job + a.TaskGroup = job.TaskGroups[0].Name + a.NodeID = healthyNode.ID + allocs = append(allocs, a) + } + require.Nil(state.UpsertAllocs(103, allocs)) + + snap, err := state.Snapshot() + require.Nil(err) + + res := &jobResult{} + require.Nil(handleTaskGroup(snap, job.TaskGroups[0], allocs, 101, res)) + require.Len(res.drain, c.expectedDrain) + require.Len(res.migrated, c.expectedMigrated) + require.Equal(c.expectedDone, res.done) + }) + } +} + +func TestHandleTaskGroup_Migrations(t *testing.T) { + t.Parallel() + require := require.New(t) + + // Create a draining node + state := state.TestStateStore(t) + n := mock.Node() + n.DrainStrategy = &structs.DrainStrategy{ + DrainSpec: structs.DrainSpec{ + Deadline: 5 * time.Minute, + }, + ForceDeadline: time.Now().Add(1 * time.Minute), + } + require.Nil(state.UpsertNode(100, n)) + + job := mock.Job() + require.Nil(state.UpsertJob(101, job)) + + // Create 10 done allocs + var allocs []*structs.Allocation + for i := 0; i < 10; i++ { + a := mock.Alloc() + a.Job = job + a.TaskGroup = job.TaskGroups[0].Name + a.NodeID = n.ID + a.DeploymentStatus = &structs.AllocDeploymentStatus{ + Healthy: helper.BoolToPtr(false), + } + + if i%2 == 0 { + a.DesiredStatus = structs.AllocDesiredStatusStop + } else { + a.ClientStatus = structs.AllocClientStatusFailed + } + allocs = append(allocs, a) + } + require.Nil(state.UpsertAllocs(102, allocs)) + + snap, err := state.Snapshot() + require.Nil(err) + + // Handle before and after indexes + res := &jobResult{} + require.Nil(handleTaskGroup(snap, 
job.TaskGroups[0], allocs, 101, res)) + require.Empty(res.drain) + require.Len(res.migrated, 10) + require.True(res.done) + + res = &jobResult{} + require.Nil(handleTaskGroup(snap, job.TaskGroups[0], allocs, 103, res)) + require.Empty(res.drain) + require.Empty(res.migrated) + require.True(res.done) +} diff --git a/nomad/drainerv2/watch_nodes.go b/nomad/drainerv2/watch_nodes.go index 7b0bd8573389..568678f747d6 100644 --- a/nomad/drainerv2/watch_nodes.go +++ b/nomad/drainerv2/watch_nodes.go @@ -14,18 +14,17 @@ import ( // DrainingNodeWatcher is the interface for watching for draining nodes. type DrainingNodeWatcher interface{} -// Tracking returns the whether the node is being tracked and if so the copy of -// the node object that is tracked. -func (n *NodeDrainer) Tracking(nodeID string) (*structs.Node, bool) { +// TrackedNodes returns the set of tracked nodes +func (n *NodeDrainer) TrackedNodes() map[string]*structs.Node { n.l.RLock() defer n.l.RUnlock() - draining, ok := n.nodes[nodeID] - if !ok { - return nil, false + t := make(map[string]*structs.Node, len(n.nodes)) + for n, d := range n.nodes { + t[n] = d.GetNode() } - return draining.GetNode(), true + return t } // Remove removes the given node from being tracked @@ -128,34 +127,42 @@ func (w *nodeDrainWatcher) watch() { // update index for next run nindex = index - for _, node := range nodes { + tracked := w.tracker.TrackedNodes() + for nodeID, node := range nodes { newDraining := node.DrainStrategy != nil - currentNode, tracked := w.tracker.Tracking(node.ID) + currentNode, tracked := tracked[nodeID] switch { // If the node is tracked but not draining, untrack case tracked && !newDraining: - w.logger.Printf("[TRACE] nomad.drain.node_watcher: tracked node %q is no longer draining", node.ID) - w.tracker.Remove(node.ID) + w.logger.Printf("[TRACE] nomad.drain.node_watcher: tracked node %q is no longer draining", nodeID) + w.tracker.Remove(nodeID) // If the node is not being tracked but is draining, track case !tracked && newDraining: - w.logger.Printf("[TRACE] nomad.drain.node_watcher: untracked node %q is draining", node.ID) + w.logger.Printf("[TRACE] nomad.drain.node_watcher: untracked node %q is draining", nodeID) w.tracker.Update(node) // If the node is being tracked but has changed, update: case tracked && newDraining && !currentNode.DrainStrategy.Equal(node.DrainStrategy): - w.logger.Printf("[TRACE] nomad.drain.node_watcher: tracked node %q has updated drain", node.ID) + w.logger.Printf("[TRACE] nomad.drain.node_watcher: tracked node %q has updated drain", nodeID) w.tracker.Update(node) default: - w.logger.Printf("[TRACE] nomad.drain.node_watcher: node %q at index %v: tracked %v, draining %v", node.ID, node.ModifyIndex, tracked, newDraining) + w.logger.Printf("[TRACE] nomad.drain.node_watcher: node %q at index %v: tracked %v, draining %v", nodeID, node.ModifyIndex, tracked, newDraining) + } + } + + for nodeID := range tracked { + if _, ok := nodes[nodeID]; !ok { + w.logger.Printf("[TRACE] nomad.drain.node_watcher: tracked node %q is no longer exists", nodeID) + w.tracker.Remove(nodeID) } } } } // getNodes returns all nodes blocking until the nodes are after the given index. 
-func (w *nodeDrainWatcher) getNodes(minIndex uint64) ([]*structs.Node, uint64, error) { +func (w *nodeDrainWatcher) getNodes(minIndex uint64) (map[string]*structs.Node, uint64, error) { if err := w.limiter.Wait(w.ctx); err != nil { return nil, 0, err } @@ -165,7 +172,7 @@ func (w *nodeDrainWatcher) getNodes(minIndex uint64) ([]*structs.Node, uint64, e return nil, 0, err } - return resp.([]*structs.Node), index, nil + return resp.(map[string]*structs.Node), index, nil } // getNodesImpl is used to get nodes from the state store, returning the set of @@ -181,7 +188,7 @@ func (w *nodeDrainWatcher) getNodesImpl(ws memdb.WatchSet, state *state.StateSto return nil, 0, err } - resp := make([]*structs.Node, 0, 64) + resp := make(map[string]*structs.Node, 64) for { raw := iter.Next() if raw == nil { @@ -189,7 +196,7 @@ func (w *nodeDrainWatcher) getNodesImpl(ws memdb.WatchSet, state *state.StateSto } node := raw.(*structs.Node) - resp = append(resp, node) + resp[node.ID] = node } return resp, index, nil diff --git a/nomad/drainerv2/watch_nodes_test.go b/nomad/drainerv2/watch_nodes_test.go index 8b3a63e1c250..dab304c32c9c 100644 --- a/nomad/drainerv2/watch_nodes_test.go +++ b/nomad/drainerv2/watch_nodes_test.go @@ -63,11 +63,10 @@ func TestNodeDrainWatcher_AddDraining(t *testing.T) { t.Fatal("No node drain events") }) - _, ok1 := m.Tracking(n1.ID) - out2, ok2 := m.Tracking(n2.ID) - require.False(ok1) - require.True(ok2) - require.Equal(n2, out2) + tracked := m.TrackedNodes() + require.NotContains(tracked, n1.ID) + require.Contains(tracked, n2.ID) + require.Equal(n2, tracked[n2.ID]) } @@ -93,9 +92,9 @@ func TestNodeDrainWatcher_Remove(t *testing.T) { t.Fatal("No node drain events") }) - out, ok := m.Tracking(n.ID) - require.True(ok) - require.Equal(n, out) + tracked := m.TrackedNodes() + require.Contains(tracked, n.ID) + require.Equal(n, tracked[n.ID]) // Change the node to be not draining and wait for it to be untracked require.Nil(state.UpdateNodeDrain(101, n.ID, nil)) @@ -105,8 +104,46 @@ func TestNodeDrainWatcher_Remove(t *testing.T) { t.Fatal("No new node drain events") }) - _, ok = m.Tracking(n.ID) - require.False(ok) + tracked = m.TrackedNodes() + require.NotContains(tracked, n.ID) +} + +func TestNodeDrainWatcher_Remove_Nonexistent(t *testing.T) { + t.Parallel() + require := require.New(t) + _, state, m := testNodeDrainWatcher(t) + + // Create a draining node + n := mock.Node() + n.DrainStrategy = &structs.DrainStrategy{ + DrainSpec: structs.DrainSpec{ + Deadline: time.Hour, + }, + ForceDeadline: time.Now().Add(time.Hour), + } + + // Wait for it to be tracked + require.Nil(state.UpsertNode(100, n)) + testutil.WaitForResult(func() (bool, error) { + return len(m.Events) == 1, nil + }, func(err error) { + t.Fatal("No node drain events") + }) + + tracked := m.TrackedNodes() + require.Contains(tracked, n.ID) + require.Equal(n, tracked[n.ID]) + + // Delete the node + require.Nil(state.DeleteNode(101, n.ID)) + testutil.WaitForResult(func() (bool, error) { + return len(m.Events) == 2, nil + }, func(err error) { + t.Fatal("No new node drain events") + }) + + tracked = m.TrackedNodes() + require.NotContains(tracked, n.ID) } func TestNodeDrainWatcher_Update(t *testing.T) { @@ -131,9 +168,9 @@ func TestNodeDrainWatcher_Update(t *testing.T) { t.Fatal("No node drain events") }) - out, ok := m.Tracking(n.ID) - require.True(ok) - require.Equal(n, out) + tracked := m.TrackedNodes() + require.Contains(tracked, n.ID) + require.Equal(n, tracked[n.ID]) // Change the node to have a new spec s2 := 
n.DrainStrategy.Copy() @@ -147,7 +184,7 @@ func TestNodeDrainWatcher_Update(t *testing.T) { t.Fatal("No new node drain events") }) - out, ok = m.Tracking(n.ID) - require.True(ok) - require.Equal(out.DrainStrategy, s2) + tracked = m.TrackedNodes() + require.Contains(tracked, n.ID) + require.Equal(s2, tracked[n.ID].DrainStrategy) } diff --git a/nomad/state/testing.go b/nomad/state/testing.go index 69509714d179..ee7dce1d6c7f 100644 --- a/nomad/state/testing.go +++ b/nomad/state/testing.go @@ -1,14 +1,13 @@ package state import ( - "os" - + "github.com/hashicorp/nomad/helper/testlog" "github.com/mitchellh/go-testing-interface" ) func TestStateStore(t testing.T) *StateStore { config := &StateStoreConfig{ - LogOutput: os.Stderr, + LogOutput: testlog.NewWriter(t), Region: "global", } state, err := NewStateStore(config) diff --git a/nomad/structs/structs.go b/nomad/structs/structs.go index 018b96c422ec..29e794cbf2ba 100644 --- a/nomad/structs/structs.go +++ b/nomad/structs/structs.go @@ -1771,6 +1771,26 @@ func (n *NetworkResource) PortLabels() map[string]int { return labelValues } +// JobNs is a Job.ID and Namespace tuple +type JobNs struct { + ID, Namespace string +} + +func NewJobNs(namespace, id string) *JobNs { + return &JobNs{ + ID: id, + Namespace: namespace, + } +} + +func (j *JobNs) String() string { + if j == nil { + return "" + } + + return fmt.Sprintf("", j.Namespace, j.ID) +} + const ( // JobTypeNomad is reserved for internal system tasks and is // always handled by the CoreScheduler. From cec2c5a72652581cd726a1791c6ace1ef238a4be Mon Sep 17 00:00:00 2001 From: Alex Dadgar Date: Tue, 6 Mar 2018 14:37:37 -0800 Subject: [PATCH 41/79] Drainer --- nomad/drainer_shims.go | 20 ++- nomad/drainerv2/drainer.go | 207 +++++++++++++++++++++++++++---- nomad/drainerv2/draining_node.go | 99 +++++++++++---- nomad/drainerv2/watch_jobs.go | 48 +++++-- nomad/drainerv2/watch_nodes.go | 5 +- nomad/node_endpoint.go | 38 +----- nomad/node_endpoint_test.go | 30 +---- nomad/server.go | 18 ++- nomad/structs/structs.go | 42 +++++++ nomad/structs/structs_test.go | 28 +++++ 10 files changed, 390 insertions(+), 145 deletions(-) diff --git a/nomad/drainer_shims.go b/nomad/drainer_shims.go index 09a1a8f6635c..1c7ffb1a9b76 100644 --- a/nomad/drainer_shims.go +++ b/nomad/drainer_shims.go @@ -8,38 +8,36 @@ type drainerShim struct { s *Server } -func (d drainerShim) NodeDrainComplete(nodeID string) error { +func (d drainerShim) NodeDrainComplete(nodeID string) (uint64, error) { args := &structs.NodeUpdateDrainRequest{ NodeID: nodeID, Drain: false, WriteRequest: structs.WriteRequest{Region: d.s.config.Region}, } - resp, _, err := d.s.raftApply(structs.NodeUpdateDrainRequestType, args) - return d.convertApplyErrors(resp, err) + resp, index, err := d.s.raftApply(structs.NodeUpdateDrainRequestType, args) + return d.convertApplyErrors(resp, index, err) } -func (d drainerShim) AllocUpdateDesiredTransition(allocs map[string]*structs.DesiredTransition, evals []*structs.Evaluation) error { +func (d drainerShim) AllocUpdateDesiredTransition(allocs map[string]*structs.DesiredTransition, evals []*structs.Evaluation) (uint64, error) { args := &structs.AllocUpdateDesiredTransitionRequest{ Allocs: allocs, Evals: evals, WriteRequest: structs.WriteRequest{Region: d.s.config.Region}, } - resp, _, err := d.s.raftApply(structs.AllocUpdateDesiredTransitionRequestType, args) - return d.convertApplyErrors(resp, err) + resp, index, err := d.s.raftApply(structs.AllocUpdateDesiredTransitionRequestType, args) + return d.convertApplyErrors(resp, 
index, err) } // convertApplyErrors parses the results of a raftApply and returns the index at // which it was applied and any error that occurred. Raft Apply returns two // separate errors, Raft library errors and user returned errors from the FSM. // This helper, joins the errors by inspecting the applyResponse for an error. -// -// Similar to deployment watcher's convertApplyErrors -func (d drainerShim) convertApplyErrors(applyResp interface{}, err error) error { +func (d drainerShim) convertApplyErrors(applyResp interface{}, index uint64, err error) (uint64, error) { if applyResp != nil { if fsmErr, ok := applyResp.(error); ok && fsmErr != nil { - return fsmErr + return index, fsmErr } } - return err + return index, err } diff --git a/nomad/drainerv2/drainer.go b/nomad/drainerv2/drainer.go index d78019b8499e..f3553da1fcc3 100644 --- a/nomad/drainerv2/drainer.go +++ b/nomad/drainerv2/drainer.go @@ -6,6 +6,8 @@ import ( "sync" "time" + "github.com/hashicorp/nomad/helper" + "github.com/hashicorp/nomad/helper/uuid" "github.com/hashicorp/nomad/nomad/state" "github.com/hashicorp/nomad/nomad/structs" "golang.org/x/time/rate" @@ -21,17 +23,20 @@ const ( // LimitStateQueriesPerSecond is the number of state queries allowed per // second LimitStateQueriesPerSecond = 100.0 + + // BatchUpdateInterval is how long we wait to batch updates + BatchUpdateInterval = 1 * time.Second + + // NodeDeadlineCoalesceWindow is the duration in which deadlining nodes will + // be coalesced together + NodeDeadlineCoalesceWindow = 5 * time.Second ) // RaftApplier contains methods for applying the raft requests required by the // NodeDrainer. type RaftApplier interface { - AllocUpdateDesiredTransition(allocs map[string]*structs.DesiredTransition, evals []*structs.Evaluation) error - NodeDrainComplete(nodeID string) error -} - -type AllocDrainer interface { - drain(allocs []*structs.Allocation) + AllocUpdateDesiredTransition(allocs map[string]*structs.DesiredTransition, evals []*structs.Evaluation) (uint64, error) + NodeDrainComplete(nodeID string) (uint64, error) } type NodeTracker interface { @@ -44,6 +49,38 @@ type DrainingJobWatcherFactory func(context.Context, *rate.Limiter, *state.State type DrainingNodeWatcherFactory func(context.Context, *rate.Limiter, *state.StateStore, *log.Logger, NodeTracker) DrainingNodeWatcher type DrainDeadlineNotifierFactory func(context.Context) DrainDeadlineNotifier +func GetDrainingJobWatcher(ctx context.Context, limiter *rate.Limiter, state *state.StateStore, logger *log.Logger) DrainingJobWatcher { + return NewDrainingJobWatcher(ctx, limiter, state, logger) +} + +func GetDeadlineNotifier(ctx context.Context) DrainDeadlineNotifier { + return NewDeadlineHeap(ctx, NodeDeadlineCoalesceWindow) +} + +func GetNodeWatcherFactory() DrainingNodeWatcherFactory { + return func(ctx context.Context, limiter *rate.Limiter, state *state.StateStore, logger *log.Logger, tracker NodeTracker) DrainingNodeWatcher { + return NewNodeDrainWatcher(ctx, limiter, state, logger, tracker) + } +} + +type allocMigrateBatcher struct { + // updates holds pending client status updates for allocations + updates []*structs.Allocation + + // updateFuture is used to wait for the pending batch update + // to complete. This may be nil if no batch is pending. + updateFuture *structs.BatchFuture + + // updateTimer is the timer that will trigger the next batch + // update, and may be nil if there is no batch pending. 
+ updateTimer *time.Timer + + batchWindow time.Duration + + // synchronizes access to the updates list, the future and the timer. + sync.Mutex +} + type NodeDrainerConfig struct { Logger *log.Logger Raft RaftApplier @@ -51,8 +88,10 @@ type NodeDrainerConfig struct { NodeFactory DrainingNodeWatcherFactory DrainDeadlineFactory DrainDeadlineNotifierFactory StateQueriesPerSecond float64 + BatchUpdateInterval time.Duration } +// TODO Add stats type NodeDrainer struct { enabled bool logger *log.Logger @@ -60,9 +99,6 @@ type NodeDrainer struct { // nodes is the set of draining nodes nodes map[string]*drainingNode - // doneNodeCh is used to signal that a node is done draining - doneNodeCh chan string - nodeWatcher DrainingNodeWatcher nodeFactory DrainingNodeWatcherFactory @@ -81,6 +117,9 @@ type NodeDrainer struct { // raft is a shim around the raft messages necessary for draining raft RaftApplier + // batcher is used to batch alloc migrations. + batcher allocMigrateBatcher + // ctx and exitFn are used to cancel the watcher ctx context.Context exitFn context.CancelFunc @@ -96,6 +135,9 @@ func NewNodeDrainer(c *NodeDrainerConfig) *NodeDrainer { nodeFactory: c.NodeFactory, deadlineNotifierFactory: c.DrainDeadlineFactory, queryLimiter: rate.NewLimiter(rate.Limit(c.StateQueriesPerSecond), 100), + batcher: allocMigrateBatcher{ + batchWindow: c.BatchUpdateInterval, + }, } } @@ -133,7 +175,6 @@ func (n *NodeDrainer) flush() { n.nodeWatcher = n.nodeFactory(n.ctx, n.queryLimiter, n.state, n.logger, n) n.deadlineNotifier = n.deadlineNotifierFactory(n.ctx) n.nodes = make(map[string]*drainingNode, 32) - n.doneNodeCh = make(chan string, 4) } func (n *NodeDrainer) run(ctx context.Context) { @@ -143,33 +184,145 @@ func (n *NodeDrainer) run(ctx context.Context) { return case nodes := <-n.deadlineNotifier.NextBatch(): n.handleDeadlinedNodes(nodes) - case allocs := <-n.jobWatcher.Drain(): - n.handleJobAllocDrain(allocs) - case node := <-n.doneNodeCh: - // TODO probably remove this as a channel - n.handleDoneNode(node) + case req := <-n.jobWatcher.Drain(): + n.handleJobAllocDrain(req) + case allocs := <-n.jobWatcher.Migrated(): + n.handleMigratedAllocs(allocs) } } } func (n *NodeDrainer) handleDeadlinedNodes(nodes []string) { - // TODO + // Retrieve the set of allocations that will be force stopped. + n.l.RLock() + var forceStop []*structs.Allocation + for _, node := range nodes { + draining, ok := n.nodes[node] + if !ok { + n.logger.Printf("[DEBUG] nomad.node_drainer: skipping untracked deadlined node %q", node) + continue + } + + allocs, err := draining.DeadlineAllocs() + if err != nil { + n.logger.Printf("[ERR] nomad.node_drainer: failed to retrive allocs on deadlined node %q: %v", node, err) + continue + } + + forceStop = append(forceStop, allocs...) 
+ } + n.l.RUnlock() + n.batchDrainAllocs(forceStop) +} + +func (n *NodeDrainer) handleJobAllocDrain(req *DrainRequest) { + // This should be syncronous + index, err := n.batchDrainAllocs(req.Allocs) + req.Resp.Respond(index, err) } -func (n *NodeDrainer) handleJobAllocDrain(allocs []*structs.Allocation) { - // TODO +func (n *NodeDrainer) handleMigratedAllocs(allocs []*structs.Allocation) { + // Determine the set of nodes that were effected + nodes := make(map[string]struct{}) + for _, alloc := range allocs { + nodes[alloc.NodeID] = struct{}{} + } + + // For each node, check if it is now done + n.l.RLock() + var done []string + for node := range nodes { + draining, ok := n.nodes[node] + if !ok { + continue + } + + isDone, err := draining.IsDone() + if err != nil { + n.logger.Printf("[ERR] nomad.drain: checking if node %q is done draining: %v", node, err) + continue + } + + if !isDone { + continue + } - // TODO Call check on the appropriate nodes when the final allocs - // transistion to stop so we have a place to determine with the node - // is done and the final drain of system allocs - // TODO This probably requires changing the interface such that it - // returns replaced allocs as well. + done = append(done, node) + } + n.l.RUnlock() + + // TODO This should probably be a single Raft transaction + for _, doneNode := range done { + index, err := n.raft.NodeDrainComplete(doneNode) + if err != nil { + n.logger.Printf("[ERR] nomad.drain: failed to unset drain for node %q: %v", doneNode, err) + } else { + n.logger.Printf("[INFO] nomad.drain: node %q completed draining at index %d", doneNode, index) + } + } } -func (n *NodeDrainer) handleDoneNode(nodeID string) { - // TODO +func (n *NodeDrainer) batchDrainAllocs(allocs []*structs.Allocation) (uint64, error) { + // Add this to the batch + n.batcher.Lock() + n.batcher.updates = append(n.batcher.updates, allocs...) + + // Start a new batch if none + future := n.batcher.updateFuture + if future == nil { + future = structs.NewBatchFuture() + n.batcher.updateFuture = future + n.batcher.updateTimer = time.AfterFunc(n.batcher.batchWindow, func() { + // Get the pending updates + n.batcher.Lock() + updates := n.batcher.updates + future := n.batcher.updateFuture + n.batcher.updates = nil + n.batcher.updateFuture = nil + n.batcher.updateTimer = nil + n.batcher.Unlock() + + // Perform the batch update + n.drainAllocs(future, updates) + }) + } + n.batcher.Unlock() + + // Wait for the future + if err := future.Wait(); err != nil { + return 0, err + } + + return future.Index(), nil } -func (n *NodeDrainer) drain(allocs []*structs.Allocation) { - // TODO +func (n *NodeDrainer) drainAllocs(future *structs.BatchFuture, allocs []*structs.Allocation) { + // TODO This should shard to limit the size of the transaction. 
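+	//
+	// The update below marks each allocation with DesiredTransition.Migrate
+	// set to true and creates one node-drain evaluation per affected job,
+	// committing both in a single Raft apply and responding to the batch
+	// future with the resulting index.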
+ + // Compute the effected jobs and make the transistion map + jobs := make(map[string]*structs.Allocation, 4) + transistions := make(map[string]*structs.DesiredTransition, len(allocs)) + for _, alloc := range allocs { + transistions[alloc.ID] = &structs.DesiredTransition{ + Migrate: helper.BoolToPtr(true), + } + jobs[alloc.JobID] = alloc + } + + evals := make([]*structs.Evaluation, 0, len(jobs)) + for job, alloc := range jobs { + evals = append(evals, &structs.Evaluation{ + ID: uuid.Generate(), + Namespace: alloc.Namespace, + Priority: alloc.Job.Priority, + Type: alloc.Job.Type, + TriggeredBy: structs.EvalTriggerNodeDrain, + JobID: job, + Status: structs.EvalStatusPending, + }) + } + + // Commit this update via Raft + index, err := n.raft.AllocUpdateDesiredTransition(transistions, evals) + future.Respond(index, err) } diff --git a/nomad/drainerv2/draining_node.go b/nomad/drainerv2/draining_node.go index 32233573b3e3..01ce49123d65 100644 --- a/nomad/drainerv2/draining_node.go +++ b/nomad/drainerv2/draining_node.go @@ -1,6 +1,7 @@ package drainerv2 import ( + "fmt" "sync" "time" @@ -8,32 +9,16 @@ import ( "github.com/hashicorp/nomad/nomad/structs" ) -// TODO make this an interface and then I can optimize the infinite case by -// using a singleton object - -type drainCoordinator interface { - nodeDone(nodeID string) -} - -func (n *NodeDrainer) nodeDone(nodeID string) { - select { - case <-n.ctx.Done(): - case n.doneNodeCh <- nodeID: - } -} - type drainingNode struct { - coordinator drainCoordinator - state *state.StateStore - node *structs.Node - l sync.RWMutex + state *state.StateStore + node *structs.Node + l sync.RWMutex } -func NewDrainingNode(node *structs.Node, state *state.StateStore, coordinator drainCoordinator) *drainingNode { +func NewDrainingNode(node *structs.Node, state *state.StateStore) *drainingNode { return &drainingNode{ - coordinator: coordinator, - state: state, - node: node, + state: state, + node: node, } } @@ -62,10 +47,78 @@ func (n *drainingNode) DeadlineTime() (bool, time.Time) { return n.node.DrainStrategy.DeadlineTime() } +// IsDone returns if the node is done draining +func (n *drainingNode) IsDone() (bool, error) { + n.l.RLock() + defer n.l.RUnlock() + + // Should never happen + if n.node == nil || n.node.DrainStrategy == nil { + return false, fmt.Errorf("node doesn't have a drain strategy set") + } + + // Grab the relevant drain info + ignoreSystem := n.node.DrainStrategy.IgnoreSystemJobs + + // Retrieve the allocs on the node + allocs, err := n.state.AllocsByNode(nil, n.node.ID) + if err != nil { + return false, err + } + + for _, alloc := range allocs { + // Skip system if configured to + if alloc.Job.Type == structs.JobTypeSystem && ignoreSystem { + continue + } + + // If there is a non-terminal we aren't done + if !alloc.TerminalStatus() { + return false, nil + } + } + + return true, nil +} + // DeadlineAllocs returns the set of allocations that should be drained given a // node is at its deadline func (n *drainingNode) DeadlineAllocs() ([]*structs.Allocation, error) { n.l.RLock() defer n.l.RUnlock() - return nil, nil + + // Should never happen + if n.node == nil || n.node.DrainStrategy == nil { + return nil, fmt.Errorf("node doesn't have a drain strategy set") + } + + // Grab the relevant drain info + inf, _ := n.node.DrainStrategy.DeadlineTime() + if inf { + return nil, nil + } + ignoreSystem := n.node.DrainStrategy.IgnoreSystemJobs + + // Retrieve the allocs on the node + allocs, err := n.state.AllocsByNode(nil, n.node.ID) + if err != nil { + return 
nil, err + } + + var drain []*structs.Allocation + for _, alloc := range allocs { + // Nothing to do on a terminal allocation + if alloc.TerminalStatus() { + continue + } + + // Skip system if configured to + if alloc.Job.Type == structs.JobTypeSystem && ignoreSystem { + continue + } + + drain = append(drain, alloc) + } + + return drain, nil } diff --git a/nomad/drainerv2/watch_jobs.go b/nomad/drainerv2/watch_jobs.go index a2e6ef45ef50..b4442cd02469 100644 --- a/nomad/drainerv2/watch_jobs.go +++ b/nomad/drainerv2/watch_jobs.go @@ -14,17 +14,25 @@ import ( "golang.org/x/time/rate" ) +type DrainRequest struct { + Allocs []*structs.Allocation + Resp *structs.BatchFuture +} + +func NewDrainRequest(allocs []*structs.Allocation) *DrainRequest { + return &DrainRequest{ + Allocs: allocs, + Resp: structs.NewBatchFuture(), + } +} + // DrainingJobWatcher is the interface for watching a job drain type DrainingJobWatcher interface { // RegisterJob is used to start watching a draining job RegisterJob(jobID, namespace string) - // TODO This should probably be a drain future such that we can block the - // next loop till the raft apply happens such that we don't emit the same - // drain many times. We would get the applied index back and block till - // then. // Drain is used to emit allocations that should be drained. - Drain() <-chan []*structs.Allocation + Drain() <-chan *DrainRequest // Migrated is allocations for draining jobs that have transistioned to // stop. There is no guarantee that duplicates won't be published. @@ -51,7 +59,7 @@ type drainingJobWatcher struct { queryCancel context.CancelFunc // drainCh and migratedCh are used to emit allocations - drainCh chan []*structs.Allocation + drainCh chan *DrainRequest migratedCh chan []*structs.Allocation l sync.RWMutex @@ -73,7 +81,7 @@ func NewDrainingJobWatcher(ctx context.Context, limiter *rate.Limiter, state *st logger: logger, state: state, jobs: make(map[structs.JobNs]struct{}, 64), - drainCh: make(chan []*structs.Allocation, 8), + drainCh: make(chan *DrainRequest, 8), migratedCh: make(chan []*structs.Allocation, 8), } @@ -103,7 +111,7 @@ func (w *drainingJobWatcher) RegisterJob(jobID, namespace string) { } // Drain returns the channel that emits allocations to drain. 
-func (w *drainingJobWatcher) Drain() <-chan []*structs.Allocation { +func (w *drainingJobWatcher) Drain() <-chan *DrainRequest { return w.drainCh } @@ -203,16 +211,34 @@ func (w *drainingJobWatcher) watch() { } } - if allDrain != nil { + if len(allDrain) != 0 { + // Create the request + req := NewDrainRequest(allDrain) + select { - case w.drainCh <- allDrain: + case w.drainCh <- req: case <-w.ctx.Done(): w.logger.Printf("[TRACE] nomad.drain.job_watcher: shutting down") return } + + // Wait for the request to be commited + select { + case <-req.Resp.WaitCh(): + case <-w.ctx.Done(): + w.logger.Printf("[TRACE] nomad.drain.job_watcher: shutting down") + return + } + + // See if it successfully committed + if err := req.Resp.Error(); err != nil { + w.logger.Printf("[ERR] nomad.drain.job_watcher: failed to transistion allocations: %v", err) + } + + // TODO Probably want to wait till the new index } - if allMigrated != nil { + if len(allMigrated) != 0 { select { case w.migratedCh <- allMigrated: case <-w.ctx.Done(): diff --git a/nomad/drainerv2/watch_nodes.go b/nomad/drainerv2/watch_nodes.go index 568678f747d6..dd1686e566b3 100644 --- a/nomad/drainerv2/watch_nodes.go +++ b/nomad/drainerv2/watch_nodes.go @@ -50,7 +50,7 @@ func (n *NodeDrainer) Update(node *structs.Node) { draining, ok := n.nodes[node.ID] if !ok { - n.nodes[node.ID] = NewDrainingNode(node, n.state, n) + n.nodes[node.ID] = NewDrainingNode(node, n.state) return } @@ -61,9 +61,6 @@ func (n *NodeDrainer) Update(node *structs.Node) { if inf, deadline := node.DrainStrategy.DeadlineTime(); !inf { n.deadlineNotifier.Watch(node.ID, deadline) } else { - // TODO think about handling any race that may occur. I believe it is - // totally fine as long as the handlers are locked. - // There is an infinite deadline so it shouldn't be tracked for // deadlining n.deadlineNotifier.Remove(node.ID) diff --git a/nomad/node_endpoint.go b/nomad/node_endpoint.go index 6cfe62ae7e5c..e8726a2f4125 100644 --- a/nomad/node_endpoint.go +++ b/nomad/node_endpoint.go @@ -41,7 +41,7 @@ type Node struct { // updateFuture is used to wait for the pending batch update // to complete. This may be nil if no batch is pending. - updateFuture *batchFuture + updateFuture *structs.BatchFuture // updateTimer is the timer that will trigger the next batch // update, and may be nil if there is no batch pending. 
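A minimal sketch of the consumer side of this DrainRequest/BatchFuture handshake, mirroring what handleJobAllocDrain does in the drainer; the applyDrain helper here is hypothetical and stands in for the batched Raft update:

func consumeDrainRequests(ctx context.Context, watcher DrainingJobWatcher,
	applyDrain func([]*structs.Allocation) (uint64, error)) {
	for {
		select {
		case req := <-watcher.Drain():
			// Apply the desired transitions, then unblock the job watcher,
			// which is parked on req.Resp.WaitCh() until Respond is called.
			index, err := applyDrain(req.Allocs)
			req.Resp.Respond(index, err)
		case <-ctx.Done():
			return
		}
	}
}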
@@ -933,7 +933,7 @@ func (n *Node) UpdateAlloc(args *structs.AllocUpdateRequest, reply *structs.Gene // Start a new batch if none future := n.updateFuture if future == nil { - future = NewBatchFuture() + future = structs.NewBatchFuture() n.updateFuture = future n.updateTimer = time.AfterFunc(batchUpdateInterval, func() { // Get the pending updates @@ -962,7 +962,7 @@ func (n *Node) UpdateAlloc(args *structs.AllocUpdateRequest, reply *structs.Gene } // batchUpdate is used to update all the allocations -func (n *Node) batchUpdate(future *batchFuture, updates []*structs.Allocation, evals []*structs.Evaluation) { +func (n *Node) batchUpdate(future *structs.BatchFuture, updates []*structs.Allocation, evals []*structs.Evaluation) { // Prepare the batch update batch := &structs.AllocUpdateRequest{ Alloc: updates, @@ -1166,38 +1166,6 @@ func (n *Node) createNodeEvals(nodeID string, nodeIndex uint64) ([]string, uint6 return evalIDs, evalIndex, nil } -// batchFuture is used to wait on a batch update to complete -type batchFuture struct { - doneCh chan struct{} - err error - index uint64 -} - -// NewBatchFuture creates a new batch future -func NewBatchFuture() *batchFuture { - return &batchFuture{ - doneCh: make(chan struct{}), - } -} - -// Wait is used to block for the future to complete and returns the error -func (b *batchFuture) Wait() error { - <-b.doneCh - return b.err -} - -// Index is used to return the index of the batch, only after Wait() -func (b *batchFuture) Index() uint64 { - return b.index -} - -// Respond is used to unblock the future -func (b *batchFuture) Respond(index uint64, err error) { - b.index = index - b.err = err - close(b.doneCh) -} - // DeriveVaultToken is used by the clients to request wrapped Vault tokens for // tasks func (n *Node) DeriveVaultToken(args *structs.DeriveVaultTokenRequest, diff --git a/nomad/node_endpoint_test.go b/nomad/node_endpoint_test.go index 87c418d0d8a9..3d98a942f52b 100644 --- a/nomad/node_endpoint_test.go +++ b/nomad/node_endpoint_test.go @@ -1975,7 +1975,7 @@ func TestClientEndpoint_BatchUpdate(t *testing.T) { clientAlloc.ClientStatus = structs.AllocClientStatusFailed // Call to do the batch update - bf := NewBatchFuture() + bf := structs.NewBatchFuture() endpoint := s1.staticEndpoints.Node endpoint.batchUpdate(bf, []*structs.Allocation{clientAlloc}, nil) if err := bf.Wait(); err != nil { @@ -2541,34 +2541,6 @@ func TestClientEndpoint_ListNodes_Blocking(t *testing.T) { } } -func TestBatchFuture(t *testing.T) { - t.Parallel() - bf := NewBatchFuture() - - // Async respond to the future - expect := fmt.Errorf("testing") - go func() { - time.Sleep(10 * time.Millisecond) - bf.Respond(1000, expect) - }() - - // Block for the result - start := time.Now() - err := bf.Wait() - diff := time.Since(start) - if diff < 5*time.Millisecond { - t.Fatalf("too fast") - } - - // Check the results - if err != expect { - t.Fatalf("bad: %s", err) - } - if bf.Index() != 1000 { - t.Fatalf("bad: %d", bf.Index()) - } -} - func TestClientEndpoint_DeriveVaultToken_Bad(t *testing.T) { t.Parallel() s1 := TestServer(t, nil) diff --git a/nomad/server.go b/nomad/server.go index a9984ac34afb..afe7ee9871ca 100644 --- a/nomad/server.go +++ b/nomad/server.go @@ -27,7 +27,7 @@ import ( "github.com/hashicorp/nomad/helper/stats" "github.com/hashicorp/nomad/helper/tlsutil" "github.com/hashicorp/nomad/nomad/deploymentwatcher" - "github.com/hashicorp/nomad/nomad/drainer" + "github.com/hashicorp/nomad/nomad/drainerv2" "github.com/hashicorp/nomad/nomad/state" 
"github.com/hashicorp/nomad/nomad/structs" "github.com/hashicorp/nomad/nomad/structs/config" @@ -174,7 +174,7 @@ type Server struct { deploymentWatcher *deploymentwatcher.Watcher // nodeDrainer is used to drain allocations from nodes. - nodeDrainer *drainer.NodeDrainer + nodeDrainer *drainerv2.NodeDrainer // evalBroker is used to manage the in-progress evaluations // that are waiting to be brokered to a sub-scheduler @@ -890,10 +890,18 @@ func (s *Server) setupDeploymentWatcher() error { // setupNodeDrainer creates a node drainer which will be enabled when a server // becomes a leader. func (s *Server) setupNodeDrainer() { - // create a shim around raft requests + // Create a shim around Raft requests shim := drainerShim{s} - s.nodeDrainer = drainer.NewNodeDrainer(s.logger, s.shutdownCh, shim) - go s.nodeDrainer.Run() + c := &drainerv2.NodeDrainerConfig{ + Logger: s.logger, + Raft: shim, + JobFactory: drainerv2.GetDrainingJobWatcher, + NodeFactory: drainerv2.GetNodeWatcherFactory(), + DrainDeadlineFactory: drainerv2.GetDeadlineNotifier, + StateQueriesPerSecond: drainerv2.LimitStateQueriesPerSecond, + BatchUpdateInterval: drainerv2.BatchUpdateInterval, + } + s.nodeDrainer = drainerv2.NewNodeDrainer(c) } // setupVaultClient is used to set up the Vault API client. diff --git a/nomad/structs/structs.go b/nomad/structs/structs.go index 29e794cbf2ba..8750e7c0829c 100644 --- a/nomad/structs/structs.go +++ b/nomad/structs/structs.go @@ -7015,3 +7015,45 @@ type ACLTokenUpsertResponse struct { Tokens []*ACLToken WriteMeta } + +// BatchFuture is used to wait on a batch update to complete +type BatchFuture struct { + doneCh chan struct{} + err error + index uint64 +} + +// NewBatchFuture creates a new batch future +func NewBatchFuture() *BatchFuture { + return &BatchFuture{ + doneCh: make(chan struct{}), + } +} + +// Wait is used to block for the future to complete and returns the error +func (b *BatchFuture) Wait() error { + <-b.doneCh + return b.err +} + +// WaitCh is used to block for the future to complete +func (b *BatchFuture) WaitCh() <-chan struct{} { + return b.doneCh +} + +// Error is used to return the error of the batch, only after Wait() +func (b *BatchFuture) Error() error { + return b.err +} + +// Index is used to return the index of the batch, only after Wait() +func (b *BatchFuture) Index() uint64 { + return b.index +} + +// Respond is used to unblock the future +func (b *BatchFuture) Respond(index uint64, err error) { + b.index = index + b.err = err + close(b.doneCh) +} diff --git a/nomad/structs/structs_test.go b/nomad/structs/structs_test.go index f3cbe3d055e0..9df3d9e4f706 100644 --- a/nomad/structs/structs_test.go +++ b/nomad/structs/structs_test.go @@ -3597,3 +3597,31 @@ func TestNetworkResourcesEquals(t *testing.T) { require.Equal(testCase.expected, first.Equals(second), testCase.errorMsg) } } + +func TestBatchFuture(t *testing.T) { + t.Parallel() + bf := NewBatchFuture() + + // Async respond to the future + expect := fmt.Errorf("testing") + go func() { + time.Sleep(10 * time.Millisecond) + bf.Respond(1000, expect) + }() + + // Block for the result + start := time.Now() + err := bf.Wait() + diff := time.Since(start) + if diff < 5*time.Millisecond { + t.Fatalf("too fast") + } + + // Check the results + if err != expect { + t.Fatalf("bad: %s", err) + } + if bf.Index() != 1000 { + t.Fatalf("bad: %d", bf.Index()) + } +} From c0354223c233e1150eb323c28e560bcc0d6cd3aa Mon Sep 17 00:00:00 2001 From: Alex Dadgar Date: Wed, 7 Mar 2018 14:57:35 -0800 Subject: [PATCH 42/79] integration 
test and basic fixes --- nomad/drainer_int_test.go | 188 +++++++++++++++++++++++++++++++ nomad/drainerv2/drainer.go | 2 +- nomad/drainerv2/draining_node.go | 29 +++++ nomad/drainerv2/watch_jobs.go | 77 ++++++++----- nomad/drainerv2/watch_nodes.go | 26 ++++- nomad/structs/structs.go | 10 +- nomad/worker.go | 2 +- 7 files changed, 294 insertions(+), 40 deletions(-) create mode 100644 nomad/drainer_int_test.go diff --git a/nomad/drainer_int_test.go b/nomad/drainer_int_test.go new file mode 100644 index 000000000000..0c0372d16925 --- /dev/null +++ b/nomad/drainer_int_test.go @@ -0,0 +1,188 @@ +package nomad + +import ( + "context" + "fmt" + "log" + "net/rpc" + "testing" + "time" + + memdb "github.com/hashicorp/go-memdb" + msgpackrpc "github.com/hashicorp/net-rpc-msgpackrpc" + "github.com/hashicorp/nomad/helper" + "github.com/hashicorp/nomad/nomad/mock" + "github.com/hashicorp/nomad/nomad/state" + "github.com/hashicorp/nomad/nomad/structs" + "github.com/hashicorp/nomad/testutil" + "github.com/stretchr/testify/require" +) + +func allocPromoter(t *testing.T, ctx context.Context, + state *state.StateStore, codec rpc.ClientCodec, nodeID string, + logger *log.Logger) { + t.Helper() + + nindex := uint64(1) + for { + allocs, index, err := getNodeAllocs(ctx, state, nodeID, nindex) + if err != nil { + if err == context.Canceled { + return + } + + t.Fatalf("failed to get node allocs: %v", err) + } + nindex = index + + // For each alloc that doesn't have its deployment status set, set it + var updates []*structs.Allocation + for _, alloc := range allocs { + if alloc.DeploymentStatus != nil && alloc.DeploymentStatus.Healthy != nil { + continue + } + + newAlloc := alloc.Copy() + newAlloc.DeploymentStatus = &structs.AllocDeploymentStatus{ + Healthy: helper.BoolToPtr(true), + } + updates = append(updates, newAlloc) + logger.Printf("Marked deployment health for alloc %q", alloc.ID) + } + + if len(updates) == 0 { + continue + } + + // Send the update + req := &structs.AllocUpdateRequest{ + Alloc: updates, + WriteRequest: structs.WriteRequest{Region: "global"}, + } + var resp structs.NodeAllocsResponse + require.Nil(t, msgpackrpc.CallWithCodec(codec, "Node.UpdateAlloc", req, &resp)) + } +} + +func getNodeAllocs(ctx context.Context, state *state.StateStore, nodeID string, index uint64) ([]*structs.Allocation, uint64, error) { + resp, index, err := state.BlockingQuery(getNodeAllocsImpl(nodeID), index, ctx) + if err != nil { + return nil, 0, err + } + if err := ctx.Err(); err != nil { + return nil, 0, err + } + + return resp.([]*structs.Allocation), index, nil +} + +func getNodeAllocsImpl(nodeID string) func(ws memdb.WatchSet, state *state.StateStore) (interface{}, uint64, error) { + return func(ws memdb.WatchSet, state *state.StateStore) (interface{}, uint64, error) { + // Capture all the allocations + allocs, err := state.AllocsByNode(ws, nodeID) + if err != nil { + return nil, 0, err + } + + // Use the last index that affected the jobs table + index, err := state.Index("allocs") + if err != nil { + return nil, index, err + } + + return allocs, index, nil + } +} + +func TestDrainer_Simple_ServiceOnly(t *testing.T) { + t.Parallel() + require := require.New(t) + s1 := TestServer(t, nil) + defer s1.Shutdown() + codec := rpcClient(t, s1) + testutil.WaitForLeader(t, s1.RPC) + + // Create two nodes + n1, n2 := mock.Node(), mock.Node() + nodeReg := &structs.NodeRegisterRequest{ + Node: n1, + WriteRequest: structs.WriteRequest{Region: "global"}, + } + var nodeResp structs.NodeUpdateResponse + 
require.Nil(msgpackrpc.CallWithCodec(codec, "Node.Register", nodeReg, &nodeResp)) + + // Create a job that runs on just one + job := mock.Job() + job.TaskGroups[0].Count = 2 + req := &structs.JobRegisterRequest{ + Job: job, + WriteRequest: structs.WriteRequest{ + Region: "global", + Namespace: job.Namespace, + }, + } + + // Fetch the response + var resp structs.JobRegisterResponse + require.Nil(msgpackrpc.CallWithCodec(codec, "Job.Register", req, &resp)) + require.NotZero(resp.Index) + + // Wait for the two allocations to be placed + state := s1.State() + testutil.WaitForResult(func() (bool, error) { + allocs, err := state.AllocsByJob(nil, job.Namespace, job.ID, false) + if err != nil { + return false, err + } + return len(allocs) == 2, fmt.Errorf("got %d allocs", len(allocs)) + }, func(err error) { + t.Fatalf("err: %v", err) + }) + + // Create the second node + nodeReg = &structs.NodeRegisterRequest{ + Node: n2, + WriteRequest: structs.WriteRequest{Region: "global"}, + } + require.Nil(msgpackrpc.CallWithCodec(codec, "Node.Register", nodeReg, &nodeResp)) + + // Drain the first node + drainReq := &structs.NodeUpdateDrainRequest{ + NodeID: n1.ID, + DrainStrategy: &structs.DrainStrategy{ + DrainSpec: structs.DrainSpec{ + Deadline: 10 * time.Minute, + }, + }, + WriteRequest: structs.WriteRequest{Region: "global"}, + } + var drainResp structs.NodeDrainUpdateResponse + require.Nil(msgpackrpc.CallWithCodec(codec, "Node.UpdateDrain", drainReq, &drainResp)) + + // Wait for the allocs to be replaced + ctx, cancel := context.WithCancel(context.Background()) + defer cancel() + go allocPromoter(t, ctx, state, codec, n1.ID, s1.logger) + go allocPromoter(t, ctx, state, codec, n2.ID, s1.logger) + + testutil.WaitForResult(func() (bool, error) { + allocs, err := state.AllocsByNode(nil, n2.ID) + if err != nil { + return false, err + } + return len(allocs) == 2, fmt.Errorf("got %d allocs", len(allocs)) + }, func(err error) { + t.Fatalf("err: %v", err) + }) + + // Check that the node drain is removed + testutil.WaitForResult(func() (bool, error) { + node, err := state.NodeByID(nil, n1.ID) + if err != nil { + return false, err + } + return node.DrainStrategy == nil, fmt.Errorf("has drain strategy still set") + }, func(err error) { + t.Fatalf("err: %v", err) + }) +} diff --git a/nomad/drainerv2/drainer.go b/nomad/drainerv2/drainer.go index f3553da1fcc3..a44ea1e8f876 100644 --- a/nomad/drainerv2/drainer.go +++ b/nomad/drainerv2/drainer.go @@ -159,7 +159,7 @@ func (n *NodeDrainer) SetEnabled(enabled bool, state *state.StateStore) { // If we are starting now, launch the watch daemon if enabled && !wasEnabled { - n.run(n.ctx) + go n.run(n.ctx) } } diff --git a/nomad/drainerv2/draining_node.go b/nomad/drainerv2/draining_node.go index 01ce49123d65..93b3e5fb31ca 100644 --- a/nomad/drainerv2/draining_node.go +++ b/nomad/drainerv2/draining_node.go @@ -122,3 +122,32 @@ func (n *drainingNode) DeadlineAllocs() ([]*structs.Allocation, error) { return drain, nil } + +// RunningServices returns the set of jobs on the node +func (n *drainingNode) RunningServices() ([]structs.JobNs, error) { + n.l.RLock() + defer n.l.RUnlock() + + // Retrieve the allocs on the node + allocs, err := n.state.AllocsByNode(nil, n.node.ID) + if err != nil { + return nil, err + } + + jobIDs := make(map[structs.JobNs]struct{}) + var jobs []structs.JobNs + for _, alloc := range allocs { + if alloc.TerminalStatus() || alloc.Job.Type != structs.JobTypeService { + continue + } + + jns := structs.NewJobNs(alloc.Namespace, alloc.JobID) + if _, ok := 
jobIDs[jns]; ok { + continue + } + jobIDs[jns] = struct{}{} + jobs = append(jobs, jns) + } + + return jobs, nil +} diff --git a/nomad/drainerv2/watch_jobs.go b/nomad/drainerv2/watch_jobs.go index b4442cd02469..b0235e6ccda2 100644 --- a/nomad/drainerv2/watch_jobs.go +++ b/nomad/drainerv2/watch_jobs.go @@ -29,7 +29,7 @@ func NewDrainRequest(allocs []*structs.Allocation) *DrainRequest { // DrainingJobWatcher is the interface for watching a job drain type DrainingJobWatcher interface { // RegisterJob is used to start watching a draining job - RegisterJob(jobID, namespace string) + RegisterJob(job structs.JobNs) // Drain is used to emit allocations that should be drained. Drain() <-chan *DrainRequest @@ -90,20 +90,17 @@ func NewDrainingJobWatcher(ctx context.Context, limiter *rate.Limiter, state *st } // RegisterJob marks the given job as draining and adds it to being watched. -func (w *drainingJobWatcher) RegisterJob(jobID, namespace string) { +func (w *drainingJobWatcher) RegisterJob(job structs.JobNs) { w.l.Lock() defer w.l.Unlock() - jns := structs.JobNs{ - ID: jobID, - Namespace: namespace, - } - if _, ok := w.jobs[jns]; ok { + if _, ok := w.jobs[job]; ok { return } // Add the job and cancel the context - w.jobs[jns] = struct{}{} + w.logger.Printf("[TRACE] nomad.drain.job_watcher: registering job %v", job) + w.jobs[job] = struct{}{} w.queryCancel() // Create a new query context @@ -135,10 +132,11 @@ func (w *drainingJobWatcher) deregisterJob(jobID, namespace string) { // watch is the long lived watching routine that detects job drain changes. func (w *drainingJobWatcher) watch() { - jindex := uint64(1) + waitIndex := uint64(1) for { - w.logger.Printf("[TRACE] nomad.drain.job_watcher: getting job allocs at index %d", jindex) - jobAllocs, index, err := w.getJobAllocs(w.getQueryCtx(), jindex) + w.logger.Printf("[TRACE] nomad.drain.job_watcher: getting job allocs at index %d", waitIndex) + jobAllocs, index, err := w.getJobAllocs(w.getQueryCtx(), waitIndex) + w.logger.Printf("[TRACE] nomad.drain.job_watcher: got job allocs %d at index %d: %v", len(jobAllocs), waitIndex, err) if err != nil { if err == context.Canceled { // Determine if it is a cancel or a shutdown @@ -152,7 +150,7 @@ func (w *drainingJobWatcher) watch() { } } - w.logger.Printf("[ERR] nomad.drain.job_watcher: error watching job allocs updates at index %d: %v", jindex, err) + w.logger.Printf("[ERR] nomad.drain.job_watcher: error watching job allocs updates at index %d: %v", waitIndex, err) select { case <-w.ctx.Done(): w.logger.Printf("[TRACE] nomad.drain.job_watcher: shutting down") @@ -163,8 +161,8 @@ func (w *drainingJobWatcher) watch() { } // update index for next run - lastHandled := jindex - jindex = index + lastHandled := waitIndex + waitIndex = index // Snapshot the state store snap, err := w.state.Snapshot() @@ -175,18 +173,19 @@ func (w *drainingJobWatcher) watch() { currentJobs := w.drainingJobs() var allDrain, allMigrated []*structs.Allocation - for job, allocs := range jobAllocs { + for jns, allocs := range jobAllocs { // Check if the job is still registered - if _, ok := currentJobs[job]; !ok { + if _, ok := currentJobs[jns]; !ok { + w.logger.Printf("[TRACE] nomad.drain.job_watcher: skipping job %v as it is no longer registered for draining", jns) continue } - w.logger.Printf("[TRACE] nomad.drain.job_watcher: handling job %v", job) + w.logger.Printf("[TRACE] nomad.drain.job_watcher: handling job %v", jns) // Lookup the job - job, err := w.state.JobByID(nil, job.Namespace, job.ID) + job, err := w.state.JobByID(nil, 
jns.Namespace, jns.ID) if err != nil { - w.logger.Printf("[WARN] nomad.drain.job_watcher: failed to lookup job %v: %v", job, err) + w.logger.Printf("[WARN] nomad.drain.job_watcher: failed to lookup job %v: %v", jns, err) continue } @@ -198,10 +197,12 @@ func (w *drainingJobWatcher) watch() { result, err := handleJob(snap, job, allocs, lastHandled) if err != nil { - w.logger.Printf("[ERR] nomad.drain.job_watcher: handling drain for job %v failed: %v", job, err) + w.logger.Printf("[ERR] nomad.drain.job_watcher: handling drain for job %v failed: %v", jns, err) continue } + w.logger.Printf("[TRACE] nomad.drain.job_watcher: result for job %v: %v", jns, result) + allDrain = append(allDrain, result.drain...) allMigrated = append(allMigrated, result.migrated...) @@ -214,6 +215,7 @@ func (w *drainingJobWatcher) watch() { if len(allDrain) != 0 { // Create the request req := NewDrainRequest(allDrain) + w.logger.Printf("[TRACE] nomad.drain.job_watcher: sending drain request for %d allocs", len(allDrain)) select { case w.drainCh <- req: @@ -235,10 +237,14 @@ func (w *drainingJobWatcher) watch() { w.logger.Printf("[ERR] nomad.drain.job_watcher: failed to transistion allocations: %v", err) } - // TODO Probably want to wait till the new index + // Wait until the new index + if index := req.Resp.Index(); index > waitIndex { + waitIndex = index + } } if len(allMigrated) != 0 { + w.logger.Printf("[TRACE] nomad.drain.job_watcher: sending migrated for %d allocs", len(allMigrated)) select { case w.migratedCh <- allMigrated: case <-w.ctx.Done(): @@ -269,6 +275,10 @@ func newJobResult() *jobResult { } } +func (r *jobResult) String() string { + return fmt.Sprintf("Drain %d ; Migrate %d ; Done %v", len(r.drain), len(r.migrated), r.done) +} + // handleJob takes the state of a draining job and returns the desired actions. func handleJob(snap *state.StateSnapshot, job *structs.Job, allocs []*structs.Allocation, lastHandledIndex uint64) (*jobResult, error) { r := newJobResult() @@ -312,6 +322,8 @@ func handleTaskGroup(snap *state.StateSnapshot, tg *structs.TaskGroup, var drainable []*structs.Allocation for _, alloc := range allocs { + fmt.Printf("--- Looking at alloc %q\n", alloc.ID) + // Check if the alloc is on a draining node. onDrainingNode, ok := drainingNodes[alloc.NodeID] if !ok { @@ -333,6 +345,7 @@ func handleTaskGroup(snap *state.StateSnapshot, tg *structs.TaskGroup, onDrainingNode && alloc.ModifyIndex > lastHandledIndex { result.migrated = append(result.migrated, alloc) + fmt.Printf("------- Alloc %q marked as migrated\n", alloc.ID) continue } @@ -341,25 +354,33 @@ func handleTaskGroup(snap *state.StateSnapshot, tg *structs.TaskGroup, if !alloc.TerminalStatus() && alloc.DeploymentStatus != nil && alloc.DeploymentStatus.Healthy != nil { + fmt.Printf("------- Alloc %q considered as healthy\n", alloc.ID) healthy++ } // An alloc can't be considered for migration if: // - It isn't on a draining node // - It is already terminal - // - It has already been marked for draining - if !onDrainingNode || alloc.TerminalStatus() || alloc.DesiredTransition.ShouldMigrate() { + if !onDrainingNode || alloc.TerminalStatus() { + fmt.Printf("------- Alloc %q not drainable\n", alloc.ID) continue } - // This alloc is drainable, so capture it and the fact that the job - // isn't done draining yet. + // Capture the fact that there is an allocation that is still draining + // for this job. 
remainingDrainingAlloc = true - drainable = append(drainable, alloc) + + // If we haven't marked this allocation for migration already, capture + // it as eligible for draining. + if !alloc.DesiredTransition.ShouldMigrate() { + drainable = append(drainable, alloc) + fmt.Printf("------- Alloc %q drainable\n", alloc.ID) + } } // Update the done status if remainingDrainingAlloc { + fmt.Printf("------- Job has remaining allocs to drain\n") result.done = false } @@ -368,6 +389,7 @@ func handleTaskGroup(snap *state.StateSnapshot, tg *structs.TaskGroup, numToDrain := healthy - thresholdCount numToDrain = helper.IntMin(len(drainable), numToDrain) if numToDrain <= 0 { + fmt.Printf("------- Not draining any allocs\n") return nil } @@ -385,6 +407,9 @@ func (w *drainingJobWatcher) getJobAllocs(ctx context.Context, minIndex uint64) if err != nil { return nil, 0, err } + if resp == nil { + return nil, index, nil + } return resp.(map[structs.JobNs][]*structs.Allocation), index, nil } diff --git a/nomad/drainerv2/watch_nodes.go b/nomad/drainerv2/watch_nodes.go index dd1686e566b3..9b6b32b3a0d4 100644 --- a/nomad/drainerv2/watch_nodes.go +++ b/nomad/drainerv2/watch_nodes.go @@ -50,13 +50,13 @@ func (n *NodeDrainer) Update(node *structs.Node) { draining, ok := n.nodes[node.ID] if !ok { - n.nodes[node.ID] = NewDrainingNode(node, n.state) - return + draining = NewDrainingNode(node, n.state) + n.nodes[node.ID] = draining + } else { + // Update it + draining.Update(node) } - // Update it and update the dealiner - draining.Update(node) - // TODO test the notifier is updated if inf, deadline := node.DrainStrategy.DeadlineTime(); !inf { n.deadlineNotifier.Watch(node.ID, deadline) @@ -66,6 +66,21 @@ func (n *NodeDrainer) Update(node *structs.Node) { n.deadlineNotifier.Remove(node.ID) } + // TODO Test this + // Register interest in the draining jobs. 
+ jobs, err := draining.RunningServices() + if err != nil { + n.logger.Printf("[ERR] nomad.drain: error retrieving services on node %q: %v", node.ID, err) + return + } + n.logger.Printf("[TRACE] nomad.drain: node %q has %d services on it", node.ID, len(jobs)) + for _, job := range jobs { + n.jobWatcher.RegisterJob(job) + } + + // TODO we need to check if the node is done such that if an operator drains + // a node with nothing on it we unset drain + } // nodeDrainWatcher is used to watch nodes that are entering, leaving or @@ -105,6 +120,7 @@ func (w *nodeDrainWatcher) watch() { for { w.logger.Printf("[TRACE] nomad.drain.node_watcher: getting nodes at index %d", nindex) nodes, index, err := w.getNodes(nindex) + w.logger.Printf("[TRACE] nomad.drain.node_watcher: got nodes %d at index %d: %v", len(nodes), nindex, err) if err != nil { if err == context.Canceled { w.logger.Printf("[TRACE] nomad.drain.node_watcher: shutting down") diff --git a/nomad/structs/structs.go b/nomad/structs/structs.go index 8750e7c0829c..f85a4bf48c3b 100644 --- a/nomad/structs/structs.go +++ b/nomad/structs/structs.go @@ -1776,18 +1776,14 @@ type JobNs struct { ID, Namespace string } -func NewJobNs(namespace, id string) *JobNs { - return &JobNs{ +func NewJobNs(namespace, id string) JobNs { + return JobNs{ ID: id, Namespace: namespace, } } -func (j *JobNs) String() string { - if j == nil { - return "" - } - +func (j JobNs) String() string { return fmt.Sprintf("", j.Namespace, j.ID) } diff --git a/nomad/worker.go b/nomad/worker.go index 209d0b2938f8..6908188fbaf2 100644 --- a/nomad/worker.go +++ b/nomad/worker.go @@ -327,7 +327,7 @@ SUBMIT: } return nil, nil, err } else { - w.logger.Printf("[DEBUG] worker: submitted plan for evaluation %s", plan.EvalID) + w.logger.Printf("[DEBUG] worker: submitted plan at index %d for evaluation %s", resp.Index, plan.EvalID) w.backoffReset() } From fb40e8babe259632a3dbb0956266141d8b9ec8d4 Mon Sep 17 00:00:00 2001 From: Alex Dadgar Date: Wed, 7 Mar 2018 15:16:45 -0800 Subject: [PATCH 43/79] handle empty node case --- nomad/drainer_int_test.go | 43 ++++++++++++++++++++++++++++++++++ nomad/drainerv2/watch_nodes.go | 17 ++++++++++++-- 2 files changed, 58 insertions(+), 2 deletions(-) diff --git a/nomad/drainer_int_test.go b/nomad/drainer_int_test.go index 0c0372d16925..39422a5a0ddb 100644 --- a/nomad/drainer_int_test.go +++ b/nomad/drainer_int_test.go @@ -186,3 +186,46 @@ func TestDrainer_Simple_ServiceOnly(t *testing.T) { t.Fatalf("err: %v", err) }) } + +func TestDrainer_DrainEmptyNode(t *testing.T) { + t.Parallel() + require := require.New(t) + s1 := TestServer(t, nil) + defer s1.Shutdown() + codec := rpcClient(t, s1) + testutil.WaitForLeader(t, s1.RPC) + + // Create a node + n1 := mock.Node() + nodeReg := &structs.NodeRegisterRequest{ + Node: n1, + WriteRequest: structs.WriteRequest{Region: "global"}, + } + var nodeResp structs.NodeUpdateResponse + require.Nil(msgpackrpc.CallWithCodec(codec, "Node.Register", nodeReg, &nodeResp)) + + // Drain the node + drainReq := &structs.NodeUpdateDrainRequest{ + NodeID: n1.ID, + DrainStrategy: &structs.DrainStrategy{ + DrainSpec: structs.DrainSpec{ + Deadline: 10 * time.Minute, + }, + }, + WriteRequest: structs.WriteRequest{Region: "global"}, + } + var drainResp structs.NodeDrainUpdateResponse + require.Nil(msgpackrpc.CallWithCodec(codec, "Node.UpdateDrain", drainReq, &drainResp)) + + // Check that the node drain is removed + state := s1.State() + testutil.WaitForResult(func() (bool, error) { + node, err := state.NodeByID(nil, n1.ID) + if err != nil { 
+ return false, err + } + return node.DrainStrategy == nil, fmt.Errorf("has drain strategy still set") + }, func(err error) { + t.Fatalf("err: %v", err) + }) +} diff --git a/nomad/drainerv2/watch_nodes.go b/nomad/drainerv2/watch_nodes.go index 9b6b32b3a0d4..289767d344e7 100644 --- a/nomad/drainerv2/watch_nodes.go +++ b/nomad/drainerv2/watch_nodes.go @@ -78,9 +78,22 @@ func (n *NodeDrainer) Update(node *structs.Node) { n.jobWatcher.RegisterJob(job) } - // TODO we need to check if the node is done such that if an operator drains - // a node with nothing on it we unset drain + // Check if the node is done such that if an operator drains a node with + // nothing on it we unset drain + done, err := draining.IsDone() + if err != nil { + n.logger.Printf("[ERR] nomad.drain: failed to check if node %q is done draining: %v", node.ID, err) + return + } + if done { + index, err := n.raft.NodeDrainComplete(node.ID) + if err != nil { + n.logger.Printf("[ERR] nomad.drain: failed to unset drain for node %q: %v", node.ID, err) + } else { + n.logger.Printf("[INFO] nomad.drain: node %q completed draining at index %d", node.ID, index) + } + } } // nodeDrainWatcher is used to watch nodes that are entering, leaving or From 4b4e234516aef11a7bda444e3b001fe285224ea8 Mon Sep 17 00:00:00 2001 From: Alex Dadgar Date: Wed, 7 Mar 2018 15:42:17 -0800 Subject: [PATCH 44/79] Comments --- nomad/drainerv2/drainer.go | 67 +++++++++++++++++++++++++++----- nomad/drainerv2/draining_node.go | 1 + nomad/drainerv2/watch_jobs.go | 1 + nomad/drainerv2/watch_nodes.go | 4 ++ 4 files changed, 64 insertions(+), 9 deletions(-) diff --git a/nomad/drainerv2/drainer.go b/nomad/drainerv2/drainer.go index a44ea1e8f876..787f65bfd6aa 100644 --- a/nomad/drainerv2/drainer.go +++ b/nomad/drainerv2/drainer.go @@ -39,30 +39,48 @@ type RaftApplier interface { NodeDrainComplete(nodeID string) (uint64, error) } +// NodeTracker is the interface to notify an object that is tracking draining +// nodes of changes type NodeTracker interface { + // TrackedNodes returns all the nodes that are currently tracked as + // draining. TrackedNodes() map[string]*structs.Node + + // Remove removes a node from the draining set. Remove(nodeID string) + + // Update either updates the specification of a draining node or tracks the + // node as draining. Update(node *structs.Node) } +// DrainingJobWatcherFactory returns a new DrainingJobWatcher type DrainingJobWatcherFactory func(context.Context, *rate.Limiter, *state.StateStore, *log.Logger) DrainingJobWatcher + +// DrainingNodeWatcherFactory returns a new DrainingNodeWatcher type DrainingNodeWatcherFactory func(context.Context, *rate.Limiter, *state.StateStore, *log.Logger, NodeTracker) DrainingNodeWatcher + +// DrainDeadlineNotifierFactory returns a new DrainDeadlineNotifier type DrainDeadlineNotifierFactory func(context.Context) DrainDeadlineNotifier +// GetDrainingJobWatcher returns a draining job watcher func GetDrainingJobWatcher(ctx context.Context, limiter *rate.Limiter, state *state.StateStore, logger *log.Logger) DrainingJobWatcher { return NewDrainingJobWatcher(ctx, limiter, state, logger) } +// GetDeadlineNotifier returns a node deadline notifier with default coalescing. 
func GetDeadlineNotifier(ctx context.Context) DrainDeadlineNotifier { return NewDeadlineHeap(ctx, NodeDeadlineCoalesceWindow) } +// GetNodeWatcherFactory returns a DrainingNodeWatcherFactory func GetNodeWatcherFactory() DrainingNodeWatcherFactory { return func(ctx context.Context, limiter *rate.Limiter, state *state.StateStore, logger *log.Logger, tracker NodeTracker) DrainingNodeWatcher { return NewNodeDrainWatcher(ctx, limiter, state, logger, tracker) } } +// allocMigrateBatcher is used to batch allocation updates. type allocMigrateBatcher struct { // updates holds pending client status updates for allocations updates []*structs.Allocation @@ -81,17 +99,24 @@ type allocMigrateBatcher struct { sync.Mutex } +// NodeDrainerConfig is used to configure a new node drainer. type NodeDrainerConfig struct { - Logger *log.Logger - Raft RaftApplier - JobFactory DrainingJobWatcherFactory - NodeFactory DrainingNodeWatcherFactory - DrainDeadlineFactory DrainDeadlineNotifierFactory + Logger *log.Logger + Raft RaftApplier + JobFactory DrainingJobWatcherFactory + NodeFactory DrainingNodeWatcherFactory + DrainDeadlineFactory DrainDeadlineNotifierFactory + + // StateQueriesPerSecond configures the query limit against the state store + // that is allowed by the node drainer. StateQueriesPerSecond float64 - BatchUpdateInterval time.Duration + + // BatchUpdateInterval is the interval in which allocation updates are + // batched. + BatchUpdateInterval time.Duration } -// TODO Add stats +// TODO(alex) Add stats type NodeDrainer struct { enabled bool logger *log.Logger @@ -99,12 +124,16 @@ type NodeDrainer struct { // nodes is the set of draining nodes nodes map[string]*drainingNode + // nodeWatcher watches for nodes to transistion in and out of drain state. nodeWatcher DrainingNodeWatcher nodeFactory DrainingNodeWatcherFactory + // jobWatcher watches draining jobs and emits desired drains and notifies + // when migrations take place. jobWatcher DrainingJobWatcher jobFactory DrainingJobWatcherFactory + // deadlineNotifier notifies when nodes reach their drain deadline. deadlineNotifier DrainDeadlineNotifier deadlineNotifierFactory DrainDeadlineNotifierFactory @@ -127,6 +156,10 @@ type NodeDrainer struct { l sync.RWMutex } +// NewNodeDrainer returns a new new node drainer. The node drainer is +// responsible for marking allocations on draining nodes with a desired +// migration transistion, updating the drain strategy on nodes when they are +// complete and creating evaluations for the system to react to these changes. func NewNodeDrainer(c *NodeDrainerConfig) *NodeDrainer { return &NodeDrainer{ raft: c.Raft, @@ -177,6 +210,8 @@ func (n *NodeDrainer) flush() { n.nodes = make(map[string]*drainingNode, 32) } +// run is a long lived event handler that receives changes from the relevant +// watchers and takes action based on them. func (n *NodeDrainer) run(ctx context.Context) { for { select { @@ -192,6 +227,9 @@ func (n *NodeDrainer) run(ctx context.Context) { } } +// handleDeadlinedNodes handles a set of nodes reaching their drain deadline. +// The handler detects the remaining allocations on the nodes and immediately +// marks them for migration. func (n *NodeDrainer) handleDeadlinedNodes(nodes []string) { // Retrieve the set of allocations that will be force stopped. n.l.RLock() @@ -215,12 +253,18 @@ func (n *NodeDrainer) handleDeadlinedNodes(nodes []string) { n.batchDrainAllocs(forceStop) } +// handleJobAllocDrain handles marking a set of allocations as having a desired +// transistion to drain. 
The handler blocks till the changes to the allocation +// have occured. func (n *NodeDrainer) handleJobAllocDrain(req *DrainRequest) { // This should be syncronous index, err := n.batchDrainAllocs(req.Allocs) req.Resp.Respond(index, err) } +// handleMigratedAllocs checks to see if any nodes can be considered done +// draining based on the set of allocations that have migrated because of an +// ongoing drain for a job. func (n *NodeDrainer) handleMigratedAllocs(allocs []*structs.Allocation) { // Determine the set of nodes that were effected nodes := make(map[string]struct{}) @@ -251,7 +295,7 @@ func (n *NodeDrainer) handleMigratedAllocs(allocs []*structs.Allocation) { } n.l.RUnlock() - // TODO This should probably be a single Raft transaction + // TODO(alex) This should probably be a single Raft transaction for _, doneNode := range done { index, err := n.raft.NodeDrainComplete(doneNode) if err != nil { @@ -262,6 +306,8 @@ func (n *NodeDrainer) handleMigratedAllocs(allocs []*structs.Allocation) { } } +// batchDrainAllocs is used to batch the draining of allocations. It will block +// until the batch is complete. func (n *NodeDrainer) batchDrainAllocs(allocs []*structs.Allocation) (uint64, error) { // Add this to the batch n.batcher.Lock() @@ -296,8 +342,11 @@ func (n *NodeDrainer) batchDrainAllocs(allocs []*structs.Allocation) (uint64, er return future.Index(), nil } +// drainAllocs is a non batch, marking of the desired transistion to migrate for +// the set of allocations. It will also create the necessary evaluations for the +// affected jobs. func (n *NodeDrainer) drainAllocs(future *structs.BatchFuture, allocs []*structs.Allocation) { - // TODO This should shard to limit the size of the transaction. + // TODO(alex) This should shard to limit the size of the transaction. // Compute the effected jobs and make the transistion map jobs := make(map[string]*structs.Allocation, 4) diff --git a/nomad/drainerv2/draining_node.go b/nomad/drainerv2/draining_node.go index 93b3e5fb31ca..0f13a1b74a77 100644 --- a/nomad/drainerv2/draining_node.go +++ b/nomad/drainerv2/draining_node.go @@ -81,6 +81,7 @@ func (n *drainingNode) IsDone() (bool, error) { return true, nil } +// TODO test that we return the right thing given the strategies // DeadlineAllocs returns the set of allocations that should be drained given a // node is at its deadline func (n *drainingNode) DeadlineAllocs() ([]*structs.Allocation, error) { diff --git a/nomad/drainerv2/watch_jobs.go b/nomad/drainerv2/watch_jobs.go index b0235e6ccda2..0644f347f0f8 100644 --- a/nomad/drainerv2/watch_jobs.go +++ b/nomad/drainerv2/watch_jobs.go @@ -322,6 +322,7 @@ func handleTaskGroup(snap *state.StateSnapshot, tg *structs.TaskGroup, var drainable []*structs.Allocation for _, alloc := range allocs { + // TODO Remove at the end/when no more bugs fmt.Printf("--- Looking at alloc %q\n", alloc.ID) // Check if the alloc is on a draining node. 
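A rough sketch of the per-pass drain budget computed at the end of handleTaskGroup above, assuming thresholdCount is derived as the group count minus the migrate stanza's max_parallel (that assignment falls outside these hunks); with Count=4, MaxParallel=1 and four healthy allocations, only one allocation is released per pass:

func drainBudget(count, maxParallel, healthy, drainable int) int {
	// Keep at least (count - maxParallel) healthy allocations at all times.
	thresholdCount := count - maxParallel
	numToDrain := healthy - thresholdCount
	if drainable < numToDrain {
		numToDrain = drainable
	}
	if numToDrain <= 0 {
		return 0
	}
	return numToDrain
}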
diff --git a/nomad/drainerv2/watch_nodes.go b/nomad/drainerv2/watch_nodes.go index 289767d344e7..34cc7a9c97d3 100644 --- a/nomad/drainerv2/watch_nodes.go +++ b/nomad/drainerv2/watch_nodes.go @@ -78,6 +78,8 @@ func (n *NodeDrainer) Update(node *structs.Node) { n.jobWatcher.RegisterJob(job) } + // TODO Test at this layer as well that a node drain on a node without + // allocs immediately gets unmarked as draining // Check if the node is done such that if an operator drains a node with // nothing on it we unset drain done, err := draining.IsDone() @@ -176,6 +178,8 @@ func (w *nodeDrainWatcher) watch() { default: w.logger.Printf("[TRACE] nomad.drain.node_watcher: node %q at index %v: tracked %v, draining %v", nodeID, node.ModifyIndex, tracked, newDraining) } + + // TODO(schmichael) handle the case of a lost node } for nodeID := range tracked { From bd701979b8f9ee4b7d5c24abc5dd8b7f165c7495 Mon Sep 17 00:00:00 2001 From: Alex Dadgar Date: Wed, 7 Mar 2018 16:51:57 -0800 Subject: [PATCH 45/79] spelling fixes --- nomad/drainerv2/drainer.go | 19 +++++++++---------- nomad/drainerv2/watch_jobs.go | 6 +++--- 2 files changed, 12 insertions(+), 13 deletions(-) diff --git a/nomad/drainerv2/drainer.go b/nomad/drainerv2/drainer.go index 787f65bfd6aa..3ab57a7c5c22 100644 --- a/nomad/drainerv2/drainer.go +++ b/nomad/drainerv2/drainer.go @@ -124,7 +124,7 @@ type NodeDrainer struct { // nodes is the set of draining nodes nodes map[string]*drainingNode - // nodeWatcher watches for nodes to transistion in and out of drain state. + // nodeWatcher watches for nodes to transition in and out of drain state. nodeWatcher DrainingNodeWatcher nodeFactory DrainingNodeWatcherFactory @@ -158,7 +158,7 @@ type NodeDrainer struct { // NewNodeDrainer returns a new new node drainer. The node drainer is // responsible for marking allocations on draining nodes with a desired -// migration transistion, updating the drain strategy on nodes when they are +// migration transition, updating the drain strategy on nodes when they are // complete and creating evaluations for the system to react to these changes. func NewNodeDrainer(c *NodeDrainerConfig) *NodeDrainer { return &NodeDrainer{ @@ -254,10 +254,9 @@ func (n *NodeDrainer) handleDeadlinedNodes(nodes []string) { } // handleJobAllocDrain handles marking a set of allocations as having a desired -// transistion to drain. The handler blocks till the changes to the allocation -// have occured. +// transition to drain. The handler blocks till the changes to the allocation +// have occurred. func (n *NodeDrainer) handleJobAllocDrain(req *DrainRequest) { - // This should be syncronous index, err := n.batchDrainAllocs(req.Allocs) req.Resp.Respond(index, err) } @@ -342,17 +341,17 @@ func (n *NodeDrainer) batchDrainAllocs(allocs []*structs.Allocation) (uint64, er return future.Index(), nil } -// drainAllocs is a non batch, marking of the desired transistion to migrate for +// drainAllocs is a non batch, marking of the desired transition to migrate for // the set of allocations. It will also create the necessary evaluations for the // affected jobs. func (n *NodeDrainer) drainAllocs(future *structs.BatchFuture, allocs []*structs.Allocation) { // TODO(alex) This should shard to limit the size of the transaction. 
- // Compute the effected jobs and make the transistion map + // Compute the effected jobs and make the transition map jobs := make(map[string]*structs.Allocation, 4) - transistions := make(map[string]*structs.DesiredTransition, len(allocs)) + transitions := make(map[string]*structs.DesiredTransition, len(allocs)) for _, alloc := range allocs { - transistions[alloc.ID] = &structs.DesiredTransition{ + transitions[alloc.ID] = &structs.DesiredTransition{ Migrate: helper.BoolToPtr(true), } jobs[alloc.JobID] = alloc @@ -372,6 +371,6 @@ func (n *NodeDrainer) drainAllocs(future *structs.BatchFuture, allocs []*structs } // Commit this update via Raft - index, err := n.raft.AllocUpdateDesiredTransition(transistions, evals) + index, err := n.raft.AllocUpdateDesiredTransition(transitions, evals) future.Respond(index, err) } diff --git a/nomad/drainerv2/watch_jobs.go b/nomad/drainerv2/watch_jobs.go index 0644f347f0f8..0f5b04872869 100644 --- a/nomad/drainerv2/watch_jobs.go +++ b/nomad/drainerv2/watch_jobs.go @@ -34,7 +34,7 @@ type DrainingJobWatcher interface { // Drain is used to emit allocations that should be drained. Drain() <-chan *DrainRequest - // Migrated is allocations for draining jobs that have transistioned to + // Migrated is allocations for draining jobs that have transitioned to // stop. There is no guarantee that duplicates won't be published. Migrated() <-chan []*structs.Allocation } @@ -224,7 +224,7 @@ func (w *drainingJobWatcher) watch() { return } - // Wait for the request to be commited + // Wait for the request to be committed select { case <-req.Resp.WaitCh(): case <-w.ctx.Done(): @@ -234,7 +234,7 @@ func (w *drainingJobWatcher) watch() { // See if it successfully committed if err := req.Resp.Error(); err != nil { - w.logger.Printf("[ERR] nomad.drain.job_watcher: failed to transistion allocations: %v", err) + w.logger.Printf("[ERR] nomad.drain.job_watcher: failed to transition allocations: %v", err) } // Wait until the new index From 5b36af986005422dac47b39537a832b27602c360 Mon Sep 17 00:00:00 2001 From: Alex Dadgar Date: Thu, 8 Mar 2018 13:25:09 -0800 Subject: [PATCH 46/79] code review --- nomad/drainerv2/drain_heap.go | 36 +++++++++++++++-------------------- nomad/drainerv2/drainer.go | 29 ++++++++++++++-------------- nomad/drainerv2/watch_jobs.go | 4 ++-- 3 files changed, 32 insertions(+), 37 deletions(-) diff --git a/nomad/drainerv2/drain_heap.go b/nomad/drainerv2/drain_heap.go index b661447e2b12..efde8a92d380 100644 --- a/nomad/drainerv2/drain_heap.go +++ b/nomad/drainerv2/drain_heap.go @@ -31,8 +31,8 @@ type deadlineHeap struct { coalesceWindow time.Duration batch chan []string nodes map[string]time.Time - trigger chan string - l sync.RWMutex + trigger chan struct{} + mu sync.Mutex } // NewDeadlineHeap returns a new deadline heap that coalesces for the given @@ -41,9 +41,9 @@ func NewDeadlineHeap(ctx context.Context, coalesceWindow time.Duration) *deadlin d := &deadlineHeap{ ctx: ctx, coalesceWindow: coalesceWindow, - batch: make(chan []string, 4), + batch: make(chan []string), nodes: make(map[string]time.Time, 64), - trigger: make(chan string, 4), + trigger: make(chan struct{}), } go d.watch() @@ -71,17 +71,18 @@ func (d *deadlineHeap) watch() { continue } - d.l.Lock() + d.mu.Lock() var batch []string for nodeID, nodeDeadline := range d.nodes { if !nodeDeadline.After(nextDeadline) { batch = append(batch, nodeID) + delete(d.nodes, nodeID) } } + d.mu.Unlock() // If there is nothing exit early if len(batch) == 0 { - d.l.Unlock() goto CALC } @@ -89,15 +90,8 @@ func (d 
*deadlineHeap) watch() { select { case d.batch <- batch: case <-d.ctx.Done(): - d.l.Unlock() return } - - // Clean up the nodes - for _, nodeID := range batch { - delete(d.nodes, nodeID) - } - d.l.Unlock() case <-d.trigger: } @@ -117,8 +111,8 @@ func (d *deadlineHeap) watch() { // calculateNextDeadline returns the next deadline in which to scan for // deadlined nodes. It applies the coalesce window. func (d *deadlineHeap) calculateNextDeadline() (time.Time, bool) { - d.l.Lock() - defer d.l.Unlock() + d.mu.Lock() + defer d.mu.Unlock() if len(d.nodes) == 0 { return time.Time{}, false @@ -151,23 +145,23 @@ func (d *deadlineHeap) NextBatch() <-chan []string { } func (d *deadlineHeap) Remove(nodeID string) { - d.l.Lock() - defer d.l.Unlock() + d.mu.Lock() + defer d.mu.Unlock() delete(d.nodes, nodeID) select { - case d.trigger <- nodeID: + case d.trigger <- struct{}{}: default: } } func (d *deadlineHeap) Watch(nodeID string, deadline time.Time) { - d.l.Lock() - defer d.l.Unlock() + d.mu.Lock() + defer d.mu.Unlock() d.nodes[nodeID] = deadline select { - case d.trigger <- nodeID: + case d.trigger <- struct{}{}: default: } } diff --git a/nomad/drainerv2/drainer.go b/nomad/drainerv2/drainer.go index 3ab57a7c5c22..b5842559d0b5 100644 --- a/nomad/drainerv2/drainer.go +++ b/nomad/drainerv2/drainer.go @@ -116,7 +116,8 @@ type NodeDrainerConfig struct { BatchUpdateInterval time.Duration } -// TODO(alex) Add stats +// NodeDrainer is used to orchestrate migrating allocations off of draining +// nodes. type NodeDrainer struct { enabled bool logger *log.Logger @@ -180,29 +181,29 @@ func (n *NodeDrainer) SetEnabled(enabled bool, state *state.StateStore) { n.l.Lock() defer n.l.Unlock() - wasEnabled := n.enabled + // If we are starting now or have a new state, init state and start the + // run loop n.enabled = enabled - - if state != nil { - n.state = state - } - - // Flush the state to create the necessary objects - n.flush() - - // If we are starting now, launch the watch daemon - if enabled && !wasEnabled { + if enabled { + n.flush(state) go n.run(n.ctx) + } else if !enabled && n.exitFn != nil { + n.exitFn() } } // flush is used to clear the state of the watcher -func (n *NodeDrainer) flush() { - // Kill everything associated with the watcher +func (n *NodeDrainer) flush(state *state.StateStore) { + // Cancel anything that may be running. if n.exitFn != nil { n.exitFn() } + // Store the new state + if state != nil { + n.state = state + } + n.ctx, n.exitFn = context.WithCancel(context.Background()) n.jobWatcher = n.jobFactory(n.ctx, n.queryLimiter, n.state, n.logger) n.nodeWatcher = n.nodeFactory(n.ctx, n.queryLimiter, n.state, n.logger, n) diff --git a/nomad/drainerv2/watch_jobs.go b/nomad/drainerv2/watch_jobs.go index 0f5b04872869..3a28f647ceff 100644 --- a/nomad/drainerv2/watch_jobs.go +++ b/nomad/drainerv2/watch_jobs.go @@ -81,8 +81,8 @@ func NewDrainingJobWatcher(ctx context.Context, limiter *rate.Limiter, state *st logger: logger, state: state, jobs: make(map[structs.JobNs]struct{}, 64), - drainCh: make(chan *DrainRequest, 8), - migratedCh: make(chan []*structs.Allocation, 8), + drainCh: make(chan *DrainRequest), + migratedCh: make(chan []*structs.Allocation), } go w.watch() From d15371405fb356a0fc244c4cf150d1b97d46e52b Mon Sep 17 00:00:00 2001 From: Alex Dadgar Date: Thu, 8 Mar 2018 11:06:30 -0800 Subject: [PATCH 47/79] Toggle Drain allows resetting eligibility This PR allows marking a node as eligible for scheduling while toggling drain. 
By default the `nomad node drain -disable` command will mark the node as eligible, but the new `-keep-ineligible` flag maintains the node's scheduling ineligibility.
---
 api/nodes.go                    | 14 +++++++----
 api/nodes_test.go               |  7 ++++--
 command/agent/node_endpoint.go  |  3 ++-
 command/node_drain.go           | 27 ++++++++++++++--------
 nomad/fsm.go                    |  2 +-
 nomad/node_endpoint_test.go     |  2 +-
 nomad/state/state_store.go      |  5 +++-
 nomad/state/state_store_test.go | 41 +++++++++++++++++++++++++++++++--
 nomad/structs/structs.go        |  3 +++
 9 files changed, 82 insertions(+), 22 deletions(-)

diff --git a/api/nodes.go b/api/nodes.go
index a505d9ae369f..9261528544f8 100644
--- a/api/nodes.go
+++ b/api/nodes.go
@@ -51,13 +51,19 @@ type NodeUpdateDrainRequest struct {
 	// DrainSpec is the drain specification to set for the node. A nil DrainSpec
 	// will disable draining.
 	DrainSpec *DrainSpec
+
+	// MarkEligible marks the node as eligible if removing the drain strategy.
+	MarkEligible bool
 }
 
-// UpdateDrain is used to update the drain strategy for a given node.
-func (n *Nodes) UpdateDrain(nodeID string, spec *DrainSpec, q *WriteOptions) (*WriteMeta, error) {
+// UpdateDrain is used to update the drain strategy for a given node. If
+// markEligible is true and the drain is being removed, the node will be marked
+// as eligible for scheduling.
+func (n *Nodes) UpdateDrain(nodeID string, spec *DrainSpec, markEligible bool, q *WriteOptions) (*WriteMeta, error) {
 	req := &NodeUpdateDrainRequest{
-		NodeID:    nodeID,
-		DrainSpec: spec,
+		NodeID:       nodeID,
+		DrainSpec:    spec,
+		MarkEligible: markEligible,
 	}
 
 	wm, err := n.client.write("/v1/node/"+nodeID+"/drain", req, nil, q)
diff --git a/api/nodes_test.go b/api/nodes_test.go
index 22d61c4011af..d2b02b82c243 100644
--- a/api/nodes_test.go
+++ b/api/nodes_test.go
@@ -177,7 +177,7 @@ func TestNodes_ToggleDrain(t *testing.T) {
 	spec := &DrainSpec{
 		Deadline: 10 * time.Second,
 	}
-	wm, err := nodes.UpdateDrain(nodeID, spec, nil)
+	wm, err := nodes.UpdateDrain(nodeID, spec, false, nil)
 	if err != nil {
 		t.Fatalf("err: %s", err)
 	}
@@ -193,7 +193,7 @@ func TestNodes_ToggleDrain(t *testing.T) {
 	}
 
 	// Toggle off again
-	wm, err = nodes.UpdateDrain(nodeID, nil, nil)
+	wm, err = nodes.UpdateDrain(nodeID, nil, true, nil)
 	if err != nil {
 		t.Fatalf("err: %s", err)
 	}
@@ -210,6 +210,9 @@ func TestNodes_ToggleDrain(t *testing.T) {
 	if out.DrainStrategy != nil {
 		t.Fatalf("drain strategy should be unset")
 	}
+	if out.SchedulingEligibility != structs.NodeSchedulingEligible {
+		t.Fatalf("should be eligible")
+	}
 }
 
 func TestNodes_ToggleEligibility(t *testing.T) {
diff --git a/command/agent/node_endpoint.go b/command/agent/node_endpoint.go
index a86df751c1ab..bad4fc445b4d 100644
--- a/command/agent/node_endpoint.go
+++ b/command/agent/node_endpoint.go
@@ -132,7 +132,8 @@ func (s *HTTPServer) nodeToggleDrain(resp http.ResponseWriter, req *http.Request
 	}
 
 	args := structs.NodeUpdateDrainRequest{
-		NodeID: nodeID,
+		NodeID:       nodeID,
+		MarkEligible: drainRequest.MarkEligible,
 	}
 	if drainRequest.DrainSpec != nil {
 		args.DrainStrategy = &structs.DrainStrategy{
diff --git a/command/node_drain.go b/command/node_drain.go
index f6475c7bedd4..9d8326d472a0 100644
--- a/command/node_drain.go
+++ b/command/node_drain.go
@@ -56,6 +56,11 @@ Node Drain Options:
     Ignore system allows the drain to complete without stopping system job
     allocations. By default system jobs are stopped last.
 
+  -keep-ineligible
+    Keep ineligible will maintain the node's scheduling ineligibility even if
+    the drain is being disabled. 
This is useful when an existing drain is being + cancelled but additional scheduling on the node is not desired. + -self Set the drain status of the local node. @@ -72,14 +77,15 @@ func (c *NodeDrainCommand) Synopsis() string { func (c *NodeDrainCommand) AutocompleteFlags() complete.Flags { return mergeAutocompleteFlags(c.Meta.AutocompleteFlags(FlagSetClient), complete.Flags{ - "-disable": complete.PredictNothing, - "-enable": complete.PredictNothing, - "-deadline": complete.PredictAnything, - "-force": complete.PredictNothing, - "-no-deadline": complete.PredictNothing, - "-ignore-system": complete.PredictNothing, - "-self": complete.PredictNothing, - "-yes": complete.PredictNothing, + "-disable": complete.PredictNothing, + "-enable": complete.PredictNothing, + "-deadline": complete.PredictAnything, + "-force": complete.PredictNothing, + "-no-deadline": complete.PredictNothing, + "-ignore-system": complete.PredictNothing, + "-keep-ineligible": complete.PredictNothing, + "-self": complete.PredictNothing, + "-yes": complete.PredictNothing, }) } @@ -100,7 +106,7 @@ func (c *NodeDrainCommand) AutocompleteArgs() complete.Predictor { func (c *NodeDrainCommand) Run(args []string) int { var enable, disable, force, - noDeadline, ignoreSystem, self, autoYes bool + noDeadline, ignoreSystem, keepIneligible, self, autoYes bool var deadline string flags := c.Meta.FlagSet("node-drain", FlagSetClient) @@ -111,6 +117,7 @@ func (c *NodeDrainCommand) Run(args []string) int { flags.BoolVar(&force, "force", false, "Force immediate drain") flags.BoolVar(&noDeadline, "no-deadline", false, "Drain node with no deadline") flags.BoolVar(&ignoreSystem, "ignore-system", false, "Do not drain system job allocations from the node") + flags.BoolVar(&keepIneligible, "keep-ineligible", false, "Do not update the nodes scheduling eligibility") flags.BoolVar(&self, "self", false, "") flags.BoolVar(&autoYes, "yes", false, "Automatic yes to prompts.") @@ -252,7 +259,7 @@ func (c *NodeDrainCommand) Run(args []string) int { } // Toggle node draining - if _, err := client.Nodes().UpdateDrain(node.ID, spec, nil); err != nil { + if _, err := client.Nodes().UpdateDrain(node.ID, spec, !keepIneligible, nil); err != nil { c.Ui.Error(fmt.Sprintf("Error updating drain specification: %s", err)) return 1 } diff --git a/nomad/fsm.go b/nomad/fsm.go index b377f09b3fef..bc52f256e343 100644 --- a/nomad/fsm.go +++ b/nomad/fsm.go @@ -330,7 +330,7 @@ func (n *nomadFSM) applyDrainUpdate(buf []byte, index uint64) interface{} { panic(fmt.Errorf("failed to decode request: %v", err)) } - if err := n.state.UpdateNodeDrain(index, req.NodeID, req.DrainStrategy); err != nil { + if err := n.state.UpdateNodeDrain(index, req.NodeID, req.DrainStrategy, req.MarkEligible); err != nil { n.logger.Printf("[ERR] nomad.fsm: UpdateNodeDrain failed: %v", err) return err } diff --git a/nomad/node_endpoint_test.go b/nomad/node_endpoint_test.go index 3d98a942f52b..0a18f937cb17 100644 --- a/nomad/node_endpoint_test.go +++ b/nomad/node_endpoint_test.go @@ -2470,7 +2470,7 @@ func TestClientEndpoint_ListNodes_Blocking(t *testing.T) { Deadline: 10 * time.Second, }, } - if err := state.UpdateNodeDrain(3, node.ID, s); err != nil { + if err := state.UpdateNodeDrain(3, node.ID, s, false); err != nil { t.Fatalf("err: %v", err) } }) diff --git a/nomad/state/state_store.go b/nomad/state/state_store.go index ef6a51754167..5f4564001135 100644 --- a/nomad/state/state_store.go +++ b/nomad/state/state_store.go @@ -618,7 +618,8 @@ func (s *StateStore) UpdateNodeStatus(index uint64, nodeID, 
status string) error } // UpdateNodeDrain is used to update the drain of a node -func (s *StateStore) UpdateNodeDrain(index uint64, nodeID string, drain *structs.DrainStrategy) error { +func (s *StateStore) UpdateNodeDrain(index uint64, nodeID string, + drain *structs.DrainStrategy, markEligible bool) error { txn := s.db.Txn(true) defer txn.Abort() @@ -641,6 +642,8 @@ func (s *StateStore) UpdateNodeDrain(index uint64, nodeID string, drain *structs copyNode.DrainStrategy = drain if drain != nil { copyNode.SchedulingEligibility = structs.NodeSchedulingIneligible + } else if markEligible { + copyNode.SchedulingEligibility = structs.NodeSchedulingEligible } copyNode.ModifyIndex = index diff --git a/nomad/state/state_store_test.go b/nomad/state/state_store_test.go index 1bf1467deda5..7eeb4672e212 100644 --- a/nomad/state/state_store_test.go +++ b/nomad/state/state_store_test.go @@ -716,7 +716,7 @@ func TestStateStore_UpdateNodeDrain_Node(t *testing.T) { }, } - require.Nil(state.UpdateNodeDrain(1001, node.ID, expectedDrain)) + require.Nil(state.UpdateNodeDrain(1001, node.ID, expectedDrain, false)) require.True(watchFired(ws)) ws = memdb.NewWatchSet() @@ -822,6 +822,43 @@ func TestStateStore_NodeEvents_RetentionWindow(t *testing.T) { require.Equal(uint64(20), out.Events[len(out.Events)-1].CreateIndex) } +func TestStateStore_UpdateNodeDrain_ResetEligiblity(t *testing.T) { + require := require.New(t) + state := testStateStore(t) + node := mock.Node() + require.Nil(state.UpsertNode(1000, node)) + + // Create a watchset so we can test that update node drain fires the watch + ws := memdb.NewWatchSet() + _, err := state.NodeByID(ws, node.ID) + require.Nil(err) + + drain := &structs.DrainStrategy{ + DrainSpec: structs.DrainSpec{ + Deadline: -1 * time.Second, + }, + } + + require.Nil(state.UpdateNodeDrain(1001, node.ID, drain, false)) + require.True(watchFired(ws)) + + // Remove the drain + require.Nil(state.UpdateNodeDrain(1002, node.ID, nil, true)) + + ws = memdb.NewWatchSet() + out, err := state.NodeByID(ws, node.ID) + require.Nil(err) + require.False(out.Drain) + require.Nil(out.DrainStrategy) + require.Equal(out.SchedulingEligibility, structs.NodeSchedulingEligible) + require.EqualValues(1002, out.ModifyIndex) + + index, err := state.Index("nodes") + require.Nil(err) + require.EqualValues(1002, index) + require.False(watchFired(ws)) +} + func TestStateStore_UpdateNodeEligibility(t *testing.T) { require := require.New(t) state := testStateStore(t) @@ -860,7 +897,7 @@ func TestStateStore_UpdateNodeEligibility(t *testing.T) { Deadline: -1 * time.Second, }, } - require.Nil(state.UpdateNodeDrain(1002, node.ID, expectedDrain)) + require.Nil(state.UpdateNodeDrain(1002, node.ID, expectedDrain, false)) // Try to set the node to eligible err = state.UpdateNodeEligibility(1003, node.ID, structs.NodeSchedulingEligible) diff --git a/nomad/structs/structs.go b/nomad/structs/structs.go index f85a4bf48c3b..04c073946be5 100644 --- a/nomad/structs/structs.go +++ b/nomad/structs/structs.go @@ -308,6 +308,9 @@ type NodeUpdateDrainRequest struct { NodeID string Drain bool // TODO Deprecate DrainStrategy *DrainStrategy + + // MarkEligible marks the node as eligible if removing the drain strategy. 
+ MarkEligible bool WriteRequest } From efb6601129d61b11df63ea779f22606c98ad94f6 Mon Sep 17 00:00:00 2001 From: Michael Schurter Date: Thu, 8 Mar 2018 15:08:23 -0800 Subject: [PATCH 48/79] Switch to drainerv2 impl --- nomad/drainer/drain.go | 590 ------------------ nomad/{drainerv2 => drainer}/drain_heap.go | 2 +- .../{drainerv2 => drainer}/drain_heap_test.go | 2 +- nomad/drainer/drain_test.go | 491 --------------- nomad/{drainerv2 => drainer}/drain_testing.go | 2 +- nomad/{drainerv2 => drainer}/drainer.go | 2 +- nomad/{drainerv2 => drainer}/draining_node.go | 2 +- nomad/drainer/job_watcher.go | 140 ----- nomad/drainer/node_watcher.go | 121 ---- nomad/{drainerv2 => drainer}/watch_jobs.go | 2 +- .../{drainerv2 => drainer}/watch_jobs_test.go | 2 +- nomad/{drainerv2 => drainer}/watch_nodes.go | 2 +- .../watch_nodes_test.go | 6 +- nomad/server.go | 18 +- 14 files changed, 20 insertions(+), 1362 deletions(-) delete mode 100644 nomad/drainer/drain.go rename nomad/{drainerv2 => drainer}/drain_heap.go (99%) rename nomad/{drainerv2 => drainer}/drain_heap_test.go (99%) delete mode 100644 nomad/drainer/drain_test.go rename nomad/{drainerv2 => drainer}/drain_testing.go (98%) rename nomad/{drainerv2 => drainer}/drainer.go (99%) rename nomad/{drainerv2 => drainer}/draining_node.go (99%) delete mode 100644 nomad/drainer/job_watcher.go delete mode 100644 nomad/drainer/node_watcher.go rename nomad/{drainerv2 => drainer}/watch_jobs.go (99%) rename nomad/{drainerv2 => drainer}/watch_jobs_test.go (99%) rename nomad/{drainerv2 => drainer}/watch_nodes.go (99%) rename nomad/{drainerv2 => drainer}/watch_nodes_test.go (97%) diff --git a/nomad/drainer/drain.go b/nomad/drainer/drain.go deleted file mode 100644 index 8db56ac7dacc..000000000000 --- a/nomad/drainer/drain.go +++ /dev/null @@ -1,590 +0,0 @@ -package drainer - -import ( - "context" - "log" - "strings" - "time" - - "github.com/hashicorp/nomad/helper" - "github.com/hashicorp/nomad/helper/uuid" - "github.com/hashicorp/nomad/nomad/state" - "github.com/hashicorp/nomad/nomad/structs" -) - -// jobKey is a tuple of namespace+jobid for use as a map key by job -type jobKey struct { - ns string - jobid string -} - -// runningJob contains the Job and allocations for that job meant to be used -// when collecting all allocations for a job with at least one allocation on a -// draining node. -// -// In order to drain an allocation we must also emit an evaluation for its job, -// so this struct bundles allocations with their job. -type runningJob struct { - job *structs.Job - allocs []*structs.Allocation -} - -// collectResult is the state collected by scanning for drain eligible allocs -type collectResult struct { - // drainableSvcs contains all service jobs and allocs that are - // potentially drainable meaning they have at least one allocation on a - // draining node. - drainableSvcs map[jobKey]*runningJob - - // drainNow contains all batch and system jobs that should be - // immediately drained due to a deadline or in the case of system jobs: - // all other allocs on the node have completed draining. - drainNow map[jobKey]*runningJob - - // upPerTG is a count of running allocs per task group for the - // migration mark phase to use when considering how many allocs can be - // migrated for a given group. - upPerTG map[string]int - - // doneNodes need no coordinating to finish their drain. Either all - // allocs have drained, the node is being force drained, or the drain - // deadline was hit. Any remaining allocs will be migrated via - // drainNow. 
- doneNodes map[string]*structs.Node -} - -// makeTaskGroupKey returns a unique key for an allocation's task group -func makeTaskGroupKey(a *structs.Allocation) string { - return strings.Join([]string{a.Namespace, a.JobID, a.TaskGroup}, "-") -} - -// stopAllocs tracks allocs to drain by a unique TG key along with their jobs -// as we need to emit evaluations for each allocations job -type stopAllocs struct { - allocBatch map[string]*structs.DesiredTransition - - // namespace+jobid -> Job - jobBatch map[jobKey]*structs.Job -} - -// newStopAllocs creates a list of allocs to migrate from an initial list of -// running jobs+allocs that need immediate draining. -func newStopAllocs(initial map[jobKey]*runningJob) *stopAllocs { - s := &stopAllocs{ - allocBatch: make(map[string]*structs.DesiredTransition), - jobBatch: make(map[jobKey]*structs.Job), - } - - // Add initial allocs - for _, drainingJob := range initial { - for _, a := range drainingJob.allocs { - s.add(drainingJob.job, a) - } - } - return s -} - -// add an allocation to be migrated. Its job must also be specified in order to -// emit an evaluation. -func (s *stopAllocs) add(j *structs.Job, a *structs.Allocation) { - // Add the desired migration transition to the batch - s.allocBatch[a.ID] = &structs.DesiredTransition{ - Migrate: helper.BoolToPtr(true), - } - - // Add job to the job batch - s.jobBatch[jobKey{a.Namespace, a.JobID}] = j -} - -// RaftApplier contains methods for applying the raft requests required by the -// NodeDrainer. -type RaftApplier interface { - AllocUpdateDesiredTransition(allocs map[string]*structs.DesiredTransition, evals []*structs.Evaluation) error - NodeDrainComplete(nodeID string) error -} - -// nodeDrainerState is used to communicate the state set by -// NodeDrainer.SetEnabled to the concurrently executing Run loop. -type nodeDrainerState struct { - enabled bool - state *state.StateStore -} - -// NodeDrainer migrates allocations off of draining nodes. SetEnabled(true) -// should be called when a server establishes leadership and SetEnabled(false) -// called when leadership is lost. -type NodeDrainer struct { - // enabledCh is used by SetEnabled to signal Run when to start/stop the - // nodeDrainer goroutine - enabledCh chan nodeDrainerState - - // raft is a shim around the raft messages necessary for draining - raft RaftApplier - - // shutdownCh is closed when the Server is shutting down the - // NodeDrainer should permanently exit - shutdownCh <-chan struct{} - - logger *log.Logger -} - -// NewNodeDrainer creates a new NodeDrainer which will exit when shutdownCh is -// closed. A RaftApplier shim must be supplied to allow NodeDrainer access to -// the raft messages it sends. -func NewNodeDrainer(logger *log.Logger, shutdownCh <-chan struct{}, raft RaftApplier) *NodeDrainer { - return &NodeDrainer{ - enabledCh: make(chan nodeDrainerState), - raft: raft, - shutdownCh: shutdownCh, - logger: logger, - } -} - -// SetEnabled will start or stop the node draining goroutine depending on the -// enabled boolean. SetEnabled is meant to be called concurrently with Run. -func (n *NodeDrainer) SetEnabled(enabled bool, state *state.StateStore) { - select { - case n.enabledCh <- nodeDrainerState{enabled, state}: - case <-n.shutdownCh: - } -} - -// Run monitors the shutdown chan as well as SetEnabled calls and starts/stops -// the node draining goroutine appropriately. As it blocks it should be called -// in a goroutine. 
-func (n *NodeDrainer) Run() { - running := false - var s nodeDrainerState - ctx, cancel := context.WithCancel(context.Background()) - for { - select { - case s = <-n.enabledCh: - case <-n.shutdownCh: - // Stop drainer and exit - cancel() - return - } - - switch { - case s.enabled && running: - // Already running, must restart to ensure the latest StateStore is used - cancel() - ctx, cancel = context.WithCancel(context.Background()) - go n.nodeDrainer(ctx, s.state) - - case !s.enabled && !running: - // Already stopped; nothing to do - - case !s.enabled && running: - // Stop running node drainer - cancel() - running = false - - case s.enabled && !running: - // Start running node drainer - ctx, cancel = context.WithCancel(context.Background()) - go n.nodeDrainer(ctx, s.state) - running = true - } - } -} - -// getNextDeadline is a helper that takes a set of draining nodes and returns the -// next deadline. It also returns a boolean if there is a deadline. -func getNextDeadline(nodes map[string]*structs.Node) (time.Time, bool) { - var nextDeadline time.Time - found := false - for _, node := range nodes { - inf, d := node.DrainStrategy.DeadlineTime() - if !inf && (nextDeadline.IsZero() || d.Before(nextDeadline)) { - nextDeadline = d - found = true - } - } - - return nextDeadline, found -} - -// nodeDrainer is the core node draining main loop and should be started in a -// goroutine when a server establishes leadership. -func (n *NodeDrainer) nodeDrainer(ctx context.Context, state *state.StateStore) { - nodes, nodesIndex, drainingJobs, allocsIndex := initDrainer(n.logger, state) - - // Wait for a node's drain deadline to expire - nextDeadline, ok := getNextDeadline(nodes) - deadlineTimer := time.NewTimer(time.Until(nextDeadline)) - stopDeadlineTimer := func() { - if !deadlineTimer.Stop() { - select { - case <-deadlineTimer.C: - default: - } - } - } - if !ok { - stopDeadlineTimer() - } - - // Watch for nodes to start or stop draining - nodeWatcher := newNodeWatcher(n.logger, nodes, nodesIndex, state) - go nodeWatcher.run(ctx) - - // Watch for drained allocations to be replaced - // Watch for changes in allocs for jobs with allocs on draining nodes - jobWatcher := newJobWatcher(n.logger, drainingJobs, allocsIndex, state) - go jobWatcher.run(ctx) - - for { - n.logger.Printf("[TRACE] nomad.drain: LOOP next deadline: %s (%s)", nextDeadline, time.Until(nextDeadline)) - select { - case nodes = <-nodeWatcher.nodesCh: - // update draining nodes - n.logger.Printf("[TRACE] nomad.drain: running due to node change (%d nodes draining)", len(nodes)) - - d, ok := getNextDeadline(nodes) - if ok && !nextDeadline.Equal(d) { - nextDeadline = d - n.logger.Printf("[TRACE] nomad.drain: new node deadline: %s", nextDeadline) - stopDeadlineTimer() - deadlineTimer.Reset(time.Until(nextDeadline)) - } else if !ok { - stopDeadlineTimer() - } - - case jobs := <-jobWatcher.WaitCh(): - n.logger.Printf("[TRACE] nomad.drain: running due to alloc change (%d jobs updated)", len(jobs)) - case when := <-deadlineTimer.C: - // deadline for a node was reached - n.logger.Printf("[TRACE] nomad.drain: running due to deadline reached (at %s)", when) - case <-ctx.Done(): - // exit - return - } - - // Capture state (statestore and time) to do consistent comparisons - snapshot, err := state.Snapshot() - if err != nil { - //FIXME - panic(err) - } - now := time.Now() - - // Collect all drainable jobs - result, err := n.collectDrainable(nodes, snapshot, jobWatcher, now) - if err != nil { - //FIXME - panic(err) - } - - // stoplist are the 
allocations to migrate and their jobs to emit - // evaluations for. Initialized with allocations that should be - // immediately drained regardless of MaxParallel - stoplist := newStopAllocs(result.drainNow) - - // build drain list considering deadline & max_parallel - n.markMigrations(stoplist, result.upPerTG, result.drainableSvcs, nodes, now) - - if len(stoplist.allocBatch) > 0 { - if err := n.applyMigrations(stoplist); err != nil { - //FIXME - panic(err) - } - } - - // Unset drain for nodes done draining - for nodeID, node := range result.doneNodes { - if err := n.raft.NodeDrainComplete(nodeID); err != nil { - n.logger.Printf("[ERR] nomad.drain: failed to unset drain for: %v", err) - //FIXME - panic(err) - } - n.logger.Printf("[INFO] nomad.drain: node %s (%s) completed draining", nodeID, node.Name) - delete(nodes, nodeID) - } - } -} - -// collectDrainable scans all nodes and allocs on draining nodes and builds a -// structure of eligible allocs to drain. -func (n *NodeDrainer) collectDrainable(nodes map[string]*structs.Node, state *state.StateSnapshot, - jobWatcher *jobWatcher, now time.Time) (*collectResult, error) { - - svcs := map[jobKey]*runningJob{} - drainNow := map[jobKey]*runningJob{} - upPerTG := map[string]int{} - doneNodes := map[string]*structs.Node{} - - for nodeID, node := range nodes { - allocs, err := state.AllocsByNode(nil, nodeID) - if err != nil { - return nil, err - } - - // drainableSys are allocs for system jobs that should be - // drained if there are no other allocs left - drainableSys := map[jobKey]*runningJob{} - - // track number of allocs left on this node to be drained - allocsLeft := false - inf, deadline := node.DrainStrategy.DeadlineTime() - deadlineReached := !inf && deadline.Before(now) - for _, alloc := range allocs { - // Don't need to consider drained allocs - if alloc.TerminalStatus() { - continue - } - - jobkey := jobKey{alloc.Namespace, alloc.JobID} - - // job does not found yet - job, err := state.JobByID(nil, alloc.Namespace, alloc.JobID) - if err != nil { - return nil, err - } - - // IgnoreSystemJobs if specified in the node's DrainStrategy - if node.DrainStrategy.IgnoreSystemJobs && job.Type == structs.JobTypeSystem { - continue - } - - // When the node deadline is reached all batch - // and service jobs will be drained - if deadlineReached && job.Type != structs.JobTypeService { - n.logger.Printf("[TRACE] nomad.drain: draining alloc %s due to node %s reaching drain deadline", alloc.ID, node.ID) - if j, ok := drainNow[jobkey]; ok { - j.allocs = append(j.allocs, alloc) - } else { - // First alloc for this job, create entry - drainNow[jobkey] = &runningJob{ - job: job, - allocs: []*structs.Allocation{alloc}, - } - } - continue - } - - // If deadline hasn't been reached, system jobs - // may still be drained if there are no other - // allocs left - if !deadlineReached && job.Type == structs.JobTypeSystem { - n.logger.Printf("[TRACE] nomad.drain: system alloc %s will be drained if no other allocs on node %s", alloc.ID, node.ID) - if j, ok := drainableSys[jobkey]; ok { - j.allocs = append(j.allocs, alloc) - } else { - // First alloc for this job, create entry - drainableSys[jobkey] = &runningJob{ - job: job, - allocs: []*structs.Allocation{alloc}, - } - } - continue - } - - // This alloc is still running on a draining - // node, so treat the node as having allocs - // remaining - allocsLeft = true - - jobAllocs, err := state.AllocsByJob(nil, alloc.Namespace, alloc.JobID, true) - if err != nil { - return nil, err - } - - // Count the number of down 
(terminal or nil deployment status) per task group - if job.Type == structs.JobTypeService { - num := 0 - for _, a := range jobAllocs { - if !a.TerminalStatus() && a.DeploymentStatus != nil { - // Not terminal and health updated, count it as up! - upPerTG[makeTaskGroupKey(a)]++ - num++ - } - } - n.logger.Printf("[TRACE] nomad.drain: job %s has %d allocs running", job.Name, num) - } - - svcs[jobkey] = &runningJob{ - job: job, - allocs: jobAllocs, - } - - jobWatcher.watch(jobkey, nodeID) - } - - // if node has no allocs or has hit its deadline, it's done draining! - if !allocsLeft || deadlineReached { - n.logger.Printf("[TRACE] nomad.drain: node %s has no more allocs left to drain or has reached deadline", nodeID) - jobWatcher.nodeDone(nodeID) - doneNodes[nodeID] = node - - // Add all system jobs on this node to the drainNow slice - for k, sysj := range drainableSys { - if j, ok := drainNow[k]; ok { - // Job already has at least one alloc draining, append this one - j.allocs = append(j.allocs, sysj.allocs...) - } else { - // First draining alloc for this job, add the entry - drainNow[k] = sysj - } - } - } - } - - result := &collectResult{ - drainableSvcs: svcs, - drainNow: drainNow, - upPerTG: upPerTG, - doneNodes: doneNodes, - } - return result, nil -} - -// markMigrations marks services to be drained for migration in the stoplist. -func (n *NodeDrainer) markMigrations(stoplist *stopAllocs, upPerTG map[string]int, drainable map[jobKey]*runningJob, nodes map[string]*structs.Node, now time.Time) { - for _, drainingJob := range drainable { - for _, alloc := range drainingJob.allocs { - // Already draining/dead allocs don't need to be drained - if alloc.TerminalStatus() { - continue - } - - node, ok := nodes[alloc.NodeID] - if !ok { - // Alloc's node is not draining so not elligible for draining! 
- continue - } - - tgKey := makeTaskGroupKey(alloc) - - if inf, d := node.DrainStrategy.DeadlineTime(); !inf && d.Before(now) { - n.logger.Printf("[TRACE] nomad.drain: draining job %s alloc %s from node %s due to node's drain deadline", drainingJob.job.Name, alloc.ID[:6], alloc.NodeID[:6]) - // Alloc's Node has reached its deadline - stoplist.add(drainingJob.job, alloc) - upPerTG[tgKey]-- - - continue - } - - // Stop allocs with count=1, max_parallel==0, or draining node.ID - jobs := map[jobKey]string{} - - for { - raw := iter.Next() - if raw == nil { - break - } - - // Filter on datacenter and status - node := raw.(*structs.Node) - if !node.Drain { - continue - } - - // Track draining node - nodes[node.ID] = node - - // No point in tracking draining allocs as the deadline has been reached - if inf, d := node.DrainStrategy.DeadlineTime(); !inf && d.Before(now) { - continue - } - - allocs, err := snapshot.AllocsByNode(nil, node.ID) - if err != nil { - logger.Printf("[ERR] nomad.drain: error iterating allocs for node %q: %v", node.ID, err) - panic(err) //FIXME - } - - for _, alloc := range allocs { - jobs[jobKey{alloc.Namespace, alloc.JobID}] = node.ID - } - } - - nodesIndex, _ := snapshot.Index("nodes") - if nodesIndex == 0 { - nodesIndex = 1 - } - allocsIndex, _ := snapshot.Index("allocs") - if allocsIndex == 0 { - allocsIndex = 1 - } - return nodes, nodesIndex, jobs, allocsIndex -} diff --git a/nomad/drainerv2/drain_heap.go b/nomad/drainer/drain_heap.go similarity index 99% rename from nomad/drainerv2/drain_heap.go rename to nomad/drainer/drain_heap.go index efde8a92d380..1a6c23f13cf9 100644 --- a/nomad/drainerv2/drain_heap.go +++ b/nomad/drainer/drain_heap.go @@ -1,4 +1,4 @@ -package drainerv2 +package drainer import ( "context" diff --git a/nomad/drainerv2/drain_heap_test.go b/nomad/drainer/drain_heap_test.go similarity index 99% rename from nomad/drainerv2/drain_heap_test.go rename to nomad/drainer/drain_heap_test.go index a47a98ff7473..147ad9192eff 100644 --- a/nomad/drainerv2/drain_heap_test.go +++ b/nomad/drainer/drain_heap_test.go @@ -1,4 +1,4 @@ -package drainerv2 +package drainer import ( "context" diff --git a/nomad/drainer/drain_test.go b/nomad/drainer/drain_test.go deleted file mode 100644 index 993a65fcd0ed..000000000000 --- a/nomad/drainer/drain_test.go +++ /dev/null @@ -1,491 +0,0 @@ -package drainer_test - -import ( - "fmt" - "net" - "net/rpc" - "sort" - "strings" - "testing" - "time" - - "github.com/hashicorp/nomad/client" - "github.com/hashicorp/nomad/client/config" - "github.com/hashicorp/nomad/helper/pool" - "github.com/hashicorp/nomad/helper/testlog" - "github.com/hashicorp/nomad/nomad" - "github.com/hashicorp/nomad/nomad/mock" - "github.com/hashicorp/nomad/nomad/structs" - "github.com/hashicorp/nomad/testutil" - "github.com/hashicorp/nomad/testutil/rpcapi" - "github.com/kr/pretty" - "github.com/stretchr/testify/assert" - "github.com/stretchr/testify/require" -) - -// rpcClient is a test helper method to return a ClientCodec to use to make rpc -// calls to the passed server. -func rpcClient(t *testing.T, conf *nomad.Config) rpc.ClientCodec { - addr := conf.RPCAddr - conn, err := net.DialTimeout("tcp", addr.String(), time.Second) - if err != nil { - t.Fatalf("err: %v", err) - } - // Write the Nomad RPC byte to set the mode - conn.Write([]byte{byte(pool.RpcNomad)}) - return pool.NewClientCodec(conn) -} - -// TestNodeDrainer_SimpleDrain asserts that draining when there are two nodes -// moves allocs from the draining node to the other node. 
-func TestNodeDrainer_SimpleDrain(t *testing.T) { - assert := assert.New(t) - require := require.New(t) - - // Capture test servers config - var serverConfig *nomad.Config - server := nomad.TestServer(t, func(c *nomad.Config) { - serverConfig = c - }) - defer server.Shutdown() - - testutil.WaitForLeader(t, server.RPC) - - // Setup 2 Nodes: A & B; A has allocs and is draining - - // Create mock jobs - state := server.State() - - serviceJob := mock.Job() - serviceJob.Name = "service-job" - serviceJob.Type = structs.JobTypeService - serviceJob.Constraints = nil - serviceJob.TaskGroups[0].Migrate = &structs.MigrateStrategy{ - MaxParallel: 1, - HealthCheck: structs.MigrateStrategyHealthStates, - MinHealthyTime: time.Millisecond, - HealthyDeadline: 2 * time.Second, - } - serviceJob.TaskGroups[0].Tasks[0].Driver = "mock_driver" - serviceJob.TaskGroups[0].Tasks[0].Resources = structs.MinResources() - serviceJob.TaskGroups[0].Tasks[0].Config = map[string]interface{}{ - "run_for": "10m", - "kill_after": "1ms", - } - serviceJob.TaskGroups[0].Tasks[0].Services = nil - - systemJob := mock.SystemJob() - systemJob.Name = "system-job" - systemJob.Type = structs.JobTypeSystem - systemJob.Constraints = nil - systemJob.TaskGroups[0].Tasks[0].Driver = "mock_driver" - systemJob.TaskGroups[0].Tasks[0].Config = map[string]interface{}{ - "run_for": "10m", - "kill_after": "1ms", - } - systemJob.TaskGroups[0].Tasks[0].Resources = structs.MinResources() - systemJob.TaskGroups[0].Tasks[0].Services = nil - - // Batch job will run until the node's drain deadline is reached - batchJob := mock.Job() - batchJob.Name = "batch-job" - batchJob.Type = structs.JobTypeBatch - batchJob.Constraints = nil - batchJob.TaskGroups[0].Name = "batch-group" - batchJob.TaskGroups[0].Migrate = nil - batchJob.TaskGroups[0].Tasks[0].Name = "batch-task" - batchJob.TaskGroups[0].Tasks[0].Driver = "mock_driver" - batchJob.TaskGroups[0].Tasks[0].Config = map[string]interface{}{ - "run_for": "10m", - "kill_after": "1ms", - "exit_code": 13, // set nonzero exit code to cause rescheduling - } - batchJob.TaskGroups[0].Tasks[0].Resources = structs.MinResources() - batchJob.TaskGroups[0].Tasks[0].Services = nil - - // Start node 1 - c1 := client.TestClient(t, func(conf *config.Config) { - conf.LogOutput = testlog.NewWriter(t) - conf.Servers = []string{serverConfig.RPCAddr.String()} - }) - defer c1.Shutdown() - node1ID := c1.NodeID() - - // Start jobs so they all get placed on node 1 - codec := rpcClient(t, serverConfig) - rpc := rpcapi.NewRPC(codec) - for _, job := range []*structs.Job{systemJob, serviceJob, batchJob} { - resp, err := rpc.JobRegister(job) - require.Nil(err) - require.NotZero(resp.Index) - } - - // Wait for jobs to start on c1 - testutil.WaitForResult(func() (bool, error) { - resp, err := rpc.NodeGetAllocs(node1ID) - if err != nil { - return false, err - } - - system, batch, service := 0, 0, 0 - for _, alloc := range resp.Allocs { - if alloc.ClientStatus != structs.AllocClientStatusRunning { - return false, fmt.Errorf("alloc %s for job %s not running: %s", alloc.ID, alloc.Job.Name, alloc.ClientStatus) - } - switch alloc.JobID { - case batchJob.ID: - batch++ - case serviceJob.ID: - service++ - case systemJob.ID: - system++ - } - } - // 1 system + 10 batch + 10 service = 21 - if system+batch+service != 21 { - return false, fmt.Errorf("wrong number of allocs: system %d/1, batch %d/10, service %d/10", system, batch, service) - } - return true, nil - }, func(err error) { - if resp, err := rpc.NodeGetAllocs(node1ID); err == nil { - for i, 
alloc := range resp.Allocs { - t.Logf("%d alloc %s job %s status %s", i, alloc.ID, alloc.Job.Name, alloc.ClientStatus) - } - } - if resp, err := rpc.EvalList(); err == nil { - for _, eval := range resp.Evaluations { - t.Logf("% #v\n", pretty.Formatter(eval)) - } - } - t.Fatalf("failed waiting for all allocs to start: %v", err) - }) - - // Start draining node 1 with no deadline - strategy := &structs.DrainStrategy{ - DrainSpec: structs.DrainSpec{ - Deadline: -1 * time.Second, - }, - } - node1Resp, err := rpc.NodeGet(node1ID) - require.Nil(err) - node1 := node1Resp.Node - require.Nil(state.UpdateNodeDrain(node1.ModifyIndex+1, node1ID, strategy)) - - // Start node 2 - c2 := client.TestClient(t, func(conf *config.Config) { - conf.LogOutput = testlog.NewWriter(t) - conf.Servers = []string{serverConfig.RPCAddr.String()} - }) - defer c2.Shutdown() - node2ID := c2.NodeID() - - // Wait for services to be migrated - testutil.WaitForResult(func() (bool, error) { - resp, err := rpc.NodeGetAllocs(node2ID) - if err != nil { - return false, err - } - - system, batch, service := 0, 0, 0 - for _, alloc := range resp.Allocs { - if alloc.ClientStatus != structs.AllocClientStatusRunning { - return false, fmt.Errorf("alloc %s for job %s not running: %s", alloc.ID, alloc.Job.Name, alloc.ClientStatus) - } - switch alloc.JobID { - case batchJob.ID: - batch++ - case serviceJob.ID: - service++ - case systemJob.ID: - system++ - } - } - // 1 system + 10 batch + 10 service = 21 - if system+batch+service != 21 { - return false, fmt.Errorf("wrong number of allocs: system %d/1, batch %d/10, service %d/10", system, batch, service) - } - return true, nil - }, func(err error) { - if resp, err := rpc.NodeGetAllocs(node2ID); err == nil { - for i, alloc := range resp.Allocs { - t.Logf("%d alloc %s job %s status %s prev %s", i, alloc.ID, alloc.Job.Name, alloc.ClientStatus, alloc.PreviousAllocation) - } - } - t.Errorf("failed waiting for all allocs to migrate: %v", err) - }) - - // Wait for drained services to be dead - testutil.WaitForResult(func() (bool, error) { - resp, err := rpc.NodeGetAllocs(c1.NodeID()) - if err != nil { - return false, err - } - - running := make([]string, 0, len(resp.Allocs)) - for _, alloc := range resp.Allocs { - if alloc.ClientStatus == structs.AllocClientStatusRunning { - running = append(running, alloc.ID[:6]) - } - } - - if len(running) > 0 { - return false, fmt.Errorf("%d alloc(s) on draining node %s still running: %s", len(running), c1.NodeID()[:6], running) - } - return true, nil - }, func(err error) { - t.Errorf("failed waiting for all draining allocs to stop: %v", err) - }) - - node1Resp, err = rpc.NodeGet(node1ID) - require.Nil(err) - node1 = node1Resp.Node - assert.False(node1.Drain) - assert.Nil(node1.DrainStrategy) - assert.Equal(structs.NodeSchedulingIneligible, node1.SchedulingEligibility) - - jobs, err := rpc.JobList() - require.Nil(err) - t.Logf("--> %d jobs", len(jobs.Jobs)) - for _, job := range jobs.Jobs { - t.Logf("job: %s status: %s %s", job.Name, job.Status, job.StatusDescription) - } - - allocs, err := rpc.AllocAll() - require.Nil(err) - - sort.Slice(allocs, func(i, j int) bool { - r := strings.Compare(allocs[i].Job.Name, allocs[j].Job.Name) - switch { - case r < 0: - return true - case r == 0: - return allocs[i].ModifyIndex < allocs[j].ModifyIndex - case r > 0: - return false - } - panic("unreachable") - }) - - t.Logf("--> %d allocs", len(allocs)) - for _, alloc := range allocs { - t.Logf("job: %s node: %s alloc: %s desired_status: %s desired_transition: %s actual: %s 
replaces: %s", - alloc.Job.Name, alloc.NodeID[:6], alloc.ID[:6], alloc.DesiredStatus, pretty.Sprint(alloc.DesiredTransition.Migrate), alloc.ClientStatus, alloc.PreviousAllocation) - } - - t.Logf("==> PASS") -} - -// TestNodeDrainer_SystemDrain asserts system jobs are drained -func TestNodeDrainer_SystemDrain(t *testing.T) { - assert := assert.New(t) - require := require.New(t) - - // Capture test servers config - var serverConfig *nomad.Config - server := nomad.TestServer(t, func(c *nomad.Config) { - serverConfig = c - }) - defer server.Shutdown() - - testutil.WaitForLeader(t, server.RPC) - - // Setup 2 Nodes: A & B; A has allocs and is draining - - // Create mock jobs - state := server.State() - - serviceJob := mock.Job() - serviceJob.Name = "service-job" - serviceJob.Type = structs.JobTypeService - serviceJob.Constraints = nil - serviceJob.TaskGroups[0].Count = 2 - serviceJob.TaskGroups[0].Migrate = &structs.MigrateStrategy{ - MaxParallel: 1, - HealthCheck: structs.MigrateStrategyHealthStates, - MinHealthyTime: time.Millisecond, - HealthyDeadline: 2 * time.Second, - } - serviceJob.TaskGroups[0].Tasks[0].Driver = "mock_driver" - serviceJob.TaskGroups[0].Tasks[0].Resources = structs.MinResources() - serviceJob.TaskGroups[0].Tasks[0].Config = map[string]interface{}{ - "run_for": "10m", - "kill_after": "1ms", - } - serviceJob.TaskGroups[0].Tasks[0].Services = nil - - systemJob := mock.SystemJob() - systemJob.Name = "system-job" - systemJob.Type = structs.JobTypeSystem - systemJob.Constraints = nil - systemJob.TaskGroups[0].Tasks[0].Driver = "mock_driver" - systemJob.TaskGroups[0].Tasks[0].Config = map[string]interface{}{ - "run_for": "10m", - "kill_after": "1ms", - } - systemJob.TaskGroups[0].Tasks[0].Resources = structs.MinResources() - systemJob.TaskGroups[0].Tasks[0].Services = nil - - // Start node 1 - c1 := client.TestClient(t, func(conf *config.Config) { - conf.LogOutput = testlog.NewWriter(t) - conf.Servers = []string{serverConfig.RPCAddr.String()} - }) - defer c1.Shutdown() - node1ID := c1.NodeID() - - // Start jobs so they all get placed on node 1 - codec := rpcClient(t, serverConfig) - rpc := rpcapi.NewRPC(codec) - for _, job := range []*structs.Job{systemJob, serviceJob} { - resp, err := rpc.JobRegister(job) - require.Nil(err) - require.NotZero(resp.Index) - } - - // Wait for jobs to start on c1 - testutil.WaitForResult(func() (bool, error) { - resp, err := rpc.NodeGetAllocs(c1.NodeID()) - if err != nil { - return false, err - } - - system, service := 0, 0 - for _, alloc := range resp.Allocs { - if alloc.ClientStatus != structs.AllocClientStatusRunning { - return false, fmt.Errorf("alloc %s for job %s not running: %s", alloc.ID, alloc.Job.Name, alloc.ClientStatus) - } - switch alloc.JobID { - case serviceJob.ID: - service++ - case systemJob.ID: - system++ - default: - return false, fmt.Errorf("unknown job: %s", alloc.Job.Name) - } - } - // 1 system + 2 service = 3 - if system+service != 3 { - return false, fmt.Errorf("wrong number of allocs: system %d/1, service %d/2", system, service) - } - return true, nil - }, func(err error) { - if resp, err := rpc.NodeGetAllocs(c1.NodeID()); err == nil { - for i, alloc := range resp.Allocs { - t.Logf("%d alloc %s job %s status %s", i, alloc.ID, alloc.Job.Name, alloc.ClientStatus) - } - } - t.Fatalf("failed waiting for all allocs to start: %v", err) - }) - - // Start draining node 1 - strategy := &structs.DrainStrategy{ - DrainSpec: structs.DrainSpec{ - Deadline: 1 * time.Hour, - }, - } - node1Resp, err := rpc.NodeGet(node1ID) - 
require.Nil(err) - node1 := node1Resp.Node - require.Nil(state.UpdateNodeDrain(node1.ModifyIndex+1, node1ID, strategy)) - - // Start node 2 - c2 := client.TestClient(t, func(conf *config.Config) { - conf.LogOutput = testlog.NewWriter(t) - conf.Servers = []string{serverConfig.RPCAddr.String()} - }) - defer c2.Shutdown() - node2ID := c2.NodeID() - - // Wait for services to be migrated - testutil.WaitForResult(func() (bool, error) { - resp, err := rpc.NodeGetAllocs(node2ID) - if err != nil { - return false, err - } - - system, service := 0, 0 - for _, alloc := range resp.Allocs { - if alloc.ClientStatus != structs.AllocClientStatusRunning { - return false, fmt.Errorf("alloc %s for job %s not running: %s", alloc.ID, alloc.Job.Name, alloc.ClientStatus) - } - switch alloc.JobID { - case serviceJob.ID: - service++ - case systemJob.ID: - system++ - default: - return false, fmt.Errorf("unknown job: %s", alloc.Job.Name) - } - } - // 1 system + 2 service = 3 - if system+service != 3 { - return false, fmt.Errorf("wrong number of allocs: system %d/1, service %d/2", system, service) - } - return true, nil - }, func(err error) { - if resp, err := rpc.NodeGetAllocs(node2ID); err == nil { - for i, alloc := range resp.Allocs { - t.Logf("%d alloc %s job %s status %s prev %s", i, alloc.ID, alloc.Job.Name, alloc.ClientStatus, alloc.PreviousAllocation) - } - } - t.Errorf("failed waiting for all allocs to migrate: %v", err) - }) - - // Wait for drained services to be dead - testutil.WaitForResult(func() (bool, error) { - resp, err := rpc.NodeGetAllocs(node1ID) - if err != nil { - return false, err - } - - running := make([]string, 0, len(resp.Allocs)) - for _, alloc := range resp.Allocs { - if alloc.ClientStatus == structs.AllocClientStatusRunning { - running = append(running, alloc.ID[:6]) - } - } - - if len(running) > 0 { - return false, fmt.Errorf("%d alloc(s) on draining node %s still running: %s", len(running), node1ID[:6], running) - } - return true, nil - }, func(err error) { - t.Errorf("failed waiting for all draining allocs to stop: %v", err) - }) - - node1Resp, err = rpc.NodeGet(node1ID) - require.Nil(err) - node1 = node1Resp.Node - assert.False(node1.Drain) - assert.Nil(node1.DrainStrategy) - assert.Equal(structs.NodeSchedulingIneligible, node1.SchedulingEligibility) - - jobs, err := rpc.JobList() - require.Nil(err) - t.Logf("--> %d jobs", len(jobs.Jobs)) - for _, job := range jobs.Jobs { - t.Logf("job: %s status: %s %s", job.Name, job.Status, job.StatusDescription) - } - - allocs, err := rpc.AllocAll() - require.Nil(err) - - sort.Slice(allocs, func(i, j int) bool { - r := strings.Compare(allocs[i].Job.Name, allocs[j].Job.Name) - switch { - case r < 0: - return true - case r == 0: - return allocs[i].ModifyIndex < allocs[j].ModifyIndex - case r > 0: - return false - } - panic("unreachable") - }) - - t.Logf("--> %d allocs", len(allocs)) - for _, alloc := range allocs { - t.Logf("job: %s node: %s alloc: %s desired_status: %s desired_transition: %s actual: %s replaces: %s", - alloc.Job.Name, alloc.NodeID[:6], alloc.ID[:6], alloc.DesiredStatus, pretty.Sprint(alloc.DesiredTransition.Migrate), alloc.ClientStatus, alloc.PreviousAllocation) - } -} diff --git a/nomad/drainerv2/drain_testing.go b/nomad/drainer/drain_testing.go similarity index 98% rename from nomad/drainerv2/drain_testing.go rename to nomad/drainer/drain_testing.go index 60d710e4a593..5af351fe819f 100644 --- a/nomad/drainerv2/drain_testing.go +++ b/nomad/drainer/drain_testing.go @@ -1,4 +1,4 @@ -package drainerv2 +package drainer import ( "sync" 
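For reference, the removed v1 loop above rested on an "earliest deadline wins" timer idiom: scan the draining nodes for the soonest finite deadline and re-arm a single timer against it, draining the channel on Stop so a stale tick is never consumed. A minimal, runnable sketch of that idiom follows; the node struct, the nextDeadline helper, and the use of a zero time to mean "no deadline" are simplifications for illustration only, not the real structs.Node or DrainStrategy API.

package main

import (
	"fmt"
	"time"
)

// node stands in for structs.Node; only the drain deadline matters here.
// A zero deadline models a drain with no deadline.
type node struct {
	id       string
	deadline time.Time
}

// nextDeadline returns the earliest finite deadline across the draining
// nodes and whether any such deadline exists.
func nextDeadline(nodes map[string]*node) (time.Time, bool) {
	var next time.Time
	found := false
	for _, n := range nodes {
		if n.deadline.IsZero() {
			continue // no deadline for this node
		}
		if !found || n.deadline.Before(next) {
			next = n.deadline
			found = true
		}
	}
	return next, found
}

func main() {
	nodes := map[string]*node{
		"n1": {id: "n1", deadline: time.Now().Add(2 * time.Second)},
		"n2": {id: "n2"}, // draining without a deadline
	}

	// One timer tracks the earliest deadline; it is stopped and drained
	// before every Reset, as in the removed nodeDrainer loop.
	deadlineTimer := time.NewTimer(time.Hour)
	stopTimer := func() {
		if !deadlineTimer.Stop() {
			select {
			case <-deadlineTimer.C:
			default:
			}
		}
	}
	stopTimer()

	if next, ok := nextDeadline(nodes); ok {
		deadlineTimer.Reset(time.Until(next))
	}

	<-deadlineTimer.C
	fmt.Println("drain deadline reached; remaining allocs would be force-migrated")
}

In the v2 implementation this responsibility moves out of the main loop and into the dedicated deadline notifier wired up via DrainDeadlineFactory in the server.go change at the end of this patch.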
diff --git a/nomad/drainerv2/drainer.go b/nomad/drainer/drainer.go similarity index 99% rename from nomad/drainerv2/drainer.go rename to nomad/drainer/drainer.go index b5842559d0b5..2b6a328070d0 100644 --- a/nomad/drainerv2/drainer.go +++ b/nomad/drainer/drainer.go @@ -1,4 +1,4 @@ -package drainerv2 +package drainer import ( "context" diff --git a/nomad/drainerv2/draining_node.go b/nomad/drainer/draining_node.go similarity index 99% rename from nomad/drainerv2/draining_node.go rename to nomad/drainer/draining_node.go index 0f13a1b74a77..078399f049f9 100644 --- a/nomad/drainerv2/draining_node.go +++ b/nomad/drainer/draining_node.go @@ -1,4 +1,4 @@ -package drainerv2 +package drainer import ( "fmt" diff --git a/nomad/drainer/job_watcher.go b/nomad/drainer/job_watcher.go deleted file mode 100644 index 95a1be5d157e..000000000000 --- a/nomad/drainer/job_watcher.go +++ /dev/null @@ -1,140 +0,0 @@ -package drainer - -import ( - "context" - "log" - "sync" - - memdb "github.com/hashicorp/go-memdb" - "github.com/hashicorp/nomad/nomad/state" - "github.com/hashicorp/nomad/nomad/structs" -) - -// jobWatcher watches allocation changes for jobs with at least one allocation -// on a draining node. -type jobWatcher struct { - // allocsIndex to start watching from - allocsIndex uint64 - - // job -> node.ID - jobs map[jobKey]string - jobsMu sync.Mutex - - jobsCh chan map[jobKey]struct{} - - state *state.StateStore - - logger *log.Logger -} - -func newJobWatcher(logger *log.Logger, jobs map[jobKey]string, allocsIndex uint64, state *state.StateStore) *jobWatcher { - return &jobWatcher{ - allocsIndex: allocsIndex, - logger: logger, - jobs: jobs, - jobsCh: make(chan map[jobKey]struct{}), - state: state, - } -} - -func (j *jobWatcher) watch(k jobKey, nodeID string) { - j.logger.Printf("[TRACE] nomad.drain: watching job %s on draining node %s", k.jobid, nodeID[:6]) - j.jobsMu.Lock() - j.jobs[k] = nodeID - j.jobsMu.Unlock() -} - -func (j *jobWatcher) nodeDone(nodeID string) { - j.jobsMu.Lock() - defer j.jobsMu.Unlock() - for k, v := range j.jobs { - if v == nodeID { - j.logger.Printf("[TRACE] nomad.drain: UNwatching job %s on done draining node %s", k.jobid, nodeID[:6]) - delete(j.jobs, k) - } - } -} - -func (j *jobWatcher) WaitCh() <-chan map[jobKey]struct{} { - return j.jobsCh -} - -func (j *jobWatcher) run(ctx context.Context) { - var resp interface{} - var err error - - for { - //FIXME have watchAllocs create a closure and give it a copy of j.jobs to remove locking? - //FIXME it seems possible for this to return a nil error and a 0 index, what to do in that case? 
- var newIndex uint64 - resp, newIndex, err = j.state.BlockingQuery(j.watchAllocs, j.allocsIndex, ctx) - if err != nil { - if err == context.Canceled { - j.logger.Printf("[TRACE] nomad.drain: job watcher shutting down") - return - } - j.logger.Printf("[ERR] nomad.drain: error blocking on alloc updates: %v", err) - return - } - - j.logger.Printf("[TRACE] nomad.drain: job watcher old index: %d new index: %d", j.allocsIndex, newIndex) - j.allocsIndex = newIndex - - changedJobs := resp.(map[jobKey]struct{}) - if len(changedJobs) > 0 { - select { - case j.jobsCh <- changedJobs: - case <-ctx.Done(): - return - } - } - } -} - -func (j *jobWatcher) watchAllocs(ws memdb.WatchSet, state *state.StateStore) (interface{}, uint64, error) { - iter, err := state.Allocs(ws) - if err != nil { - return nil, 0, err - } - - index, err := state.Index("allocs") - if err != nil { - return nil, 0, err - } - - skipped := 0 - - // job ids - resp := map[jobKey]struct{}{} - - for { - raw := iter.Next() - if raw == nil { - break - } - - alloc := raw.(*structs.Allocation) - - j.jobsMu.Lock() - _, ok := j.jobs[jobKey{alloc.Namespace, alloc.JobID}] - j.jobsMu.Unlock() - - if !ok { - // alloc is not part of a draining job - skipped++ - continue - } - - // don't wake drain loop if alloc hasn't updated its health - if alloc.DeploymentStatus.IsHealthy() || alloc.DeploymentStatus.IsUnhealthy() { - j.logger.Printf("[TRACE] nomad.drain: job watcher found alloc %s - deployment status: %t", alloc.ID[:6], *alloc.DeploymentStatus.Healthy) - resp[jobKey{alloc.Namespace, alloc.JobID}] = struct{}{} - } else { - j.logger.Printf("[TRACE] nomad.drain: job watcher ignoring alloc %s - no deployment status", alloc.ID[:6]) - } - } - - j.logger.Printf("[TRACE] nomad.drain: job watcher ignoring %d allocs - not part of draining job at index %d", skipped, index) - - return resp, index, nil -} diff --git a/nomad/drainer/node_watcher.go b/nomad/drainer/node_watcher.go deleted file mode 100644 index 5f419ea2ca91..000000000000 --- a/nomad/drainer/node_watcher.go +++ /dev/null @@ -1,121 +0,0 @@ -package drainer - -import ( - "context" - "log" - - memdb "github.com/hashicorp/go-memdb" - "github.com/hashicorp/nomad/nomad/state" - "github.com/hashicorp/nomad/nomad/structs" -) - -// nodeWatcher watches for nodes to start or stop draining -type nodeWatcher struct { - index uint64 - nodes map[string]*structs.Node - nodesCh chan map[string]*structs.Node - state *state.StateStore - logger *log.Logger -} - -func newNodeWatcher(logger *log.Logger, nodes map[string]*structs.Node, index uint64, state *state.StateStore) *nodeWatcher { - return &nodeWatcher{ - nodes: nodes, - nodesCh: make(chan map[string]*structs.Node), - index: index, - state: state, - logger: logger, - } -} - -func (n *nodeWatcher) run(ctx context.Context) { - // Trigger an initial drain pass if there are already nodes draining - //FIXME this is unneccessary if a node has reached a deadline - n.logger.Printf("[TRACE] nomad.drain: initial draining nodes: %d", len(n.nodes)) - if len(n.nodes) > 0 { - n.nodesCh <- n.nodes - } - - for { - //FIXME it seems possible for this to return a nil error and a 0 index, what to do in that case? 
- resp, index, err := n.state.BlockingQuery(n.queryNodeDrain, n.index, ctx) - if err != nil { - if err == context.Canceled { - n.logger.Printf("[TRACE] nomad.drain: draining node watcher shutting down") - return - } - n.logger.Printf("[ERR] nomad.drain: error blocking on node updates at index %d: %v", n.index, err) - return - } - - // update index for next run - n.index = index - - changed := false - newNodes := resp.([]*structs.Node) - n.logger.Printf("[TRACE] nomad.drain: %d nodes to consider", len(newNodes)) //FIXME remove - for _, newNode := range newNodes { - if existingNode, ok := n.nodes[newNode.ID]; ok { - // Node was draining, see if it has changed - if newNode.DrainStrategy == nil { - // Node stopped draining - delete(n.nodes, newNode.ID) - changed = true - } else if !newNode.DrainStrategy.Equal(existingNode.DrainStrategy) { - // Update deadline - n.nodes[newNode.ID] = newNode - changed = true - } - } else { - // Node was not draining - if newNode.DrainStrategy != nil { - // Node started draining - n.nodes[newNode.ID] = newNode - changed = true - } - } - } - - // Send a copy of the draining nodes if there were changes - if !changed { - continue - } - - nodesCopy := make(map[string]*structs.Node, len(n.nodes)) - for k, v := range n.nodes { - nodesCopy[k] = v - } - - select { - case n.nodesCh <- nodesCopy: - case <-ctx.Done(): - return - } - } -} - -func (n *nodeWatcher) queryNodeDrain(ws memdb.WatchSet, state *state.StateStore) (interface{}, uint64, error) { - iter, err := state.Nodes(ws) - if err != nil { - return nil, 0, err - } - - index, err := state.Index("nodes") - if err != nil { - return nil, 0, err - } - - resp := make([]*structs.Node, 0, 8) - - for { - raw := iter.Next() - if raw == nil { - break - } - - node := raw.(*structs.Node) - resp = append(resp, node) - } - - return resp, index, nil -} diff --git a/nomad/drainerv2/watch_jobs.go b/nomad/drainer/watch_jobs.go similarity index 99% rename from nomad/drainerv2/watch_jobs.go rename to nomad/drainer/watch_jobs.go index 3a28f647ceff..714bac2b7e53 100644 --- a/nomad/drainerv2/watch_jobs.go +++ b/nomad/drainer/watch_jobs.go @@ -1,4 +1,4 @@ -package drainerv2 +package drainer import ( "context" diff --git a/nomad/drainerv2/watch_jobs_test.go b/nomad/drainer/watch_jobs_test.go similarity index 99% rename from nomad/drainerv2/watch_jobs_test.go rename to nomad/drainer/watch_jobs_test.go index 6d9b1846ec5c..3db5ea0ac4b8 100644 --- a/nomad/drainerv2/watch_jobs_test.go +++ b/nomad/drainer/watch_jobs_test.go @@ -1,4 +1,4 @@ -package drainerv2 +package drainer import ( "context" diff --git a/nomad/drainerv2/watch_nodes.go b/nomad/drainer/watch_nodes.go similarity index 99% rename from nomad/drainerv2/watch_nodes.go rename to nomad/drainer/watch_nodes.go index 34cc7a9c97d3..738f496fda78 100644 --- a/nomad/drainerv2/watch_nodes.go +++ b/nomad/drainer/watch_nodes.go @@ -1,4 +1,4 @@ -package drainerv2 +package drainer import ( "context" diff --git a/nomad/drainerv2/watch_nodes_test.go b/nomad/drainer/watch_nodes_test.go similarity index 97% rename from nomad/drainerv2/watch_nodes_test.go rename to nomad/drainer/watch_nodes_test.go index dab304c32c9c..476c7a39bb50 100644 --- a/nomad/drainerv2/watch_nodes_test.go +++ b/nomad/drainer/watch_nodes_test.go @@ -1,4 +1,4 @@ -package drainerv2 +package drainer import ( "context" @@ -97,7 +97,7 @@ func TestNodeDrainWatcher_Remove(t *testing.T) { require.Equal(n, tracked[n.ID]) // Change the node to be not draining and wait for it to be untracked - require.Nil(state.UpdateNodeDrain(101, n.ID, 
nil)) + require.Nil(state.UpdateNodeDrain(101, n.ID, nil, false)) testutil.WaitForResult(func() (bool, error) { return len(m.Events) == 2, nil }, func(err error) { @@ -175,7 +175,7 @@ func TestNodeDrainWatcher_Update(t *testing.T) { // Change the node to have a new spec s2 := n.DrainStrategy.Copy() s2.Deadline += time.Hour - require.Nil(state.UpdateNodeDrain(101, n.ID, s2)) + require.Nil(state.UpdateNodeDrain(101, n.ID, s2, false)) // Wait for it to be updated testutil.WaitForResult(func() (bool, error) { diff --git a/nomad/server.go b/nomad/server.go index afe7ee9871ca..b69e0a022571 100644 --- a/nomad/server.go +++ b/nomad/server.go @@ -27,7 +27,7 @@ import ( "github.com/hashicorp/nomad/helper/stats" "github.com/hashicorp/nomad/helper/tlsutil" "github.com/hashicorp/nomad/nomad/deploymentwatcher" - "github.com/hashicorp/nomad/nomad/drainerv2" + "github.com/hashicorp/nomad/nomad/drainer" "github.com/hashicorp/nomad/nomad/state" "github.com/hashicorp/nomad/nomad/structs" "github.com/hashicorp/nomad/nomad/structs/config" @@ -174,7 +174,7 @@ type Server struct { deploymentWatcher *deploymentwatcher.Watcher // nodeDrainer is used to drain allocations from nodes. - nodeDrainer *drainerv2.NodeDrainer + nodeDrainer *drainer.NodeDrainer // evalBroker is used to manage the in-progress evaluations // that are waiting to be brokered to a sub-scheduler @@ -892,16 +892,16 @@ func (s *Server) setupDeploymentWatcher() error { func (s *Server) setupNodeDrainer() { // Create a shim around Raft requests shim := drainerShim{s} - c := &drainerv2.NodeDrainerConfig{ + c := &drainer.NodeDrainerConfig{ Logger: s.logger, Raft: shim, - JobFactory: drainerv2.GetDrainingJobWatcher, - NodeFactory: drainerv2.GetNodeWatcherFactory(), - DrainDeadlineFactory: drainerv2.GetDeadlineNotifier, - StateQueriesPerSecond: drainerv2.LimitStateQueriesPerSecond, - BatchUpdateInterval: drainerv2.BatchUpdateInterval, + JobFactory: drainer.GetDrainingJobWatcher, + NodeFactory: drainer.GetNodeWatcherFactory(), + DrainDeadlineFactory: drainer.GetDeadlineNotifier, + StateQueriesPerSecond: drainer.LimitStateQueriesPerSecond, + BatchUpdateInterval: drainer.BatchUpdateInterval, } - s.nodeDrainer = drainerv2.NewNodeDrainer(c) + s.nodeDrainer = drainer.NewNodeDrainer(c) } // setupVaultClient is used to set up the Vault API client. From 45e7e885585e4c894e0e86c988751bce49243edf Mon Sep 17 00:00:00 2001 From: Alex Dadgar Date: Mon, 12 Mar 2018 13:44:33 -0700 Subject: [PATCH 49/79] Fix deadline handling --- api/nodes.go | 5 ++- nomad/drainer_int_test.go | 88 +++++++++++++++++++++++++++++++++++++ scheduler/generic_sched.go | 26 +++-------- scheduler/reconcile.go | 26 +---------- scheduler/reconcile_test.go | 86 +++--------------------------------- 5 files changed, 103 insertions(+), 128 deletions(-) diff --git a/api/nodes.go b/api/nodes.go index 9261528544f8..d625629fb5e3 100644 --- a/api/nodes.go +++ b/api/nodes.go @@ -183,8 +183,9 @@ type DrainStrategy struct { // DrainSpec is the user declared drain specification DrainSpec - // DeadlineTime is the deadline time for the drain. - DeadlineTime time.Time + // ForceDeadline is the deadline time for the drain after which drains will + // be forced + ForceDeadline time.Time } // DrainSpec describes a Node's drain behavior. 
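The ForceDeadline rename above captures the deadline semantics the rest of this patch relies on: the user supplies a relative DrainSpec.Deadline, while ForceDeadline records the absolute wall-clock time after which any remaining allocations are force-migrated; presumably the server derives the latter from the former when the drain is applied. A small, self-contained sketch of those semantics follows; the local DrainSpec and DrainStrategy types are trimmed mirrors of the api package structs, the forced helper is purely illustrative, and the real code also distinguishes cases such as -force and -no-deadline drains that this sketch ignores.

package main

import (
	"fmt"
	"time"
)

// Trimmed mirrors of the api package types touched by this change.
type DrainSpec struct {
	Deadline         time.Duration
	IgnoreSystemJobs bool
}

type DrainStrategy struct {
	DrainSpec

	// ForceDeadline is the wall-clock time after which remaining
	// allocations are force-migrated.
	ForceDeadline time.Time
}

// forced reports whether the drain deadline has passed; a zero
// ForceDeadline is treated as "no deadline" in this sketch.
func (d *DrainStrategy) forced(now time.Time) bool {
	if d == nil || d.ForceDeadline.IsZero() {
		return false
	}
	return now.After(d.ForceDeadline)
}

func main() {
	// Deriving ForceDeadline from the requested duration at the time the
	// drain is applied (an assumption of this sketch, done inline here;
	// the real server computes it when accepting the drain request).
	spec := DrainSpec{Deadline: 10 * time.Second}
	strategy := &DrainStrategy{
		DrainSpec:     spec,
		ForceDeadline: time.Now().Add(spec.Deadline),
	}

	fmt.Println("forced now?        ", strategy.forced(time.Now()))
	fmt.Println("forced in a minute?", strategy.forced(time.Now().Add(time.Minute)))
}

With the deadline owned end to end by the drainer, the scheduler changes later in this commit drop the stagger-limited migration path: the reconciler marks every migrating allocation at once, and pacing is left to the drainer and the task group's migrate max_parallel rather than to follow-up evaluations.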
diff --git a/nomad/drainer_int_test.go b/nomad/drainer_int_test.go index 39422a5a0ddb..8e03a2ef5ff3 100644 --- a/nomad/drainer_int_test.go +++ b/nomad/drainer_int_test.go @@ -187,6 +187,94 @@ func TestDrainer_Simple_ServiceOnly(t *testing.T) { }) } +func TestDrainer_Simple_ServiceOnly_Deadline(t *testing.T) { + t.Parallel() + require := require.New(t) + s1 := TestServer(t, nil) + defer s1.Shutdown() + codec := rpcClient(t, s1) + testutil.WaitForLeader(t, s1.RPC) + + // Create a node + n1 := mock.Node() + nodeReg := &structs.NodeRegisterRequest{ + Node: n1, + WriteRequest: structs.WriteRequest{Region: "global"}, + } + var nodeResp structs.NodeUpdateResponse + require.Nil(msgpackrpc.CallWithCodec(codec, "Node.Register", nodeReg, &nodeResp)) + + // Create a job that runs on just one + job := mock.Job() + job.Update = *structs.DefaultUpdateStrategy + job.Update.Stagger = 30 * time.Second + job.TaskGroups[0].Count = 2 + req := &structs.JobRegisterRequest{ + Job: job, + WriteRequest: structs.WriteRequest{ + Region: "global", + Namespace: job.Namespace, + }, + } + + // Fetch the response + var resp structs.JobRegisterResponse + require.Nil(msgpackrpc.CallWithCodec(codec, "Job.Register", req, &resp)) + require.NotZero(resp.Index) + + // Wait for the two allocations to be placed + state := s1.State() + testutil.WaitForResult(func() (bool, error) { + allocs, err := state.AllocsByJob(nil, job.Namespace, job.ID, false) + if err != nil { + return false, err + } + return len(allocs) == 2, fmt.Errorf("got %d allocs", len(allocs)) + }, func(err error) { + t.Fatalf("err: %v", err) + }) + + // Drain the node + drainReq := &structs.NodeUpdateDrainRequest{ + NodeID: n1.ID, + DrainStrategy: &structs.DrainStrategy{ + DrainSpec: structs.DrainSpec{ + Deadline: 1 * time.Second, + }, + }, + WriteRequest: structs.WriteRequest{Region: "global"}, + } + var drainResp structs.NodeDrainUpdateResponse + require.Nil(msgpackrpc.CallWithCodec(codec, "Node.UpdateDrain", drainReq, &drainResp)) + + // Wait for the allocs to be stopped + testutil.WaitForResult(func() (bool, error) { + allocs, err := state.AllocsByNode(nil, n1.ID) + if err != nil { + return false, err + } + for _, alloc := range allocs { + if alloc.DesiredStatus != structs.AllocDesiredStatusStop { + return false, fmt.Errorf("got desired status %v", alloc.DesiredStatus) + } + } + return true, nil + }, func(err error) { + t.Fatalf("err: %v", err) + }) + + // Check that the node drain is removed + testutil.WaitForResult(func() (bool, error) { + node, err := state.NodeByID(nil, n1.ID) + if err != nil { + return false, err + } + return node.DrainStrategy == nil, fmt.Errorf("has drain strategy still set") + }, func(err error) { + t.Fatalf("err: %v", err) + }) +} + func TestDrainer_DrainEmptyNode(t *testing.T) { t.Parallel() require := require.New(t) diff --git a/scheduler/generic_sched.go b/scheduler/generic_sched.go index 32758359b8c4..6b812740ce9b 100644 --- a/scheduler/generic_sched.go +++ b/scheduler/generic_sched.go @@ -76,10 +76,7 @@ type GenericScheduler struct { ctx *EvalContext stack *GenericStack - // Deprecated, was used in pre Nomad 0.7 rolling update stanza and in node draining prior to Nomad 0.8 - followupEvalWait time.Duration - nextEval *structs.Evaluation - followUpEvals []*structs.Evaluation + followUpEvals []*structs.Evaluation deployment *structs.Deployment @@ -125,7 +122,7 @@ func (s *GenericScheduler) Process(eval *structs.Evaluation) error { default: desc := fmt.Sprintf("scheduler cannot handle '%s' evaluation reason", eval.TriggeredBy) - return 
setStatus(s.logger, s.planner, s.eval, s.nextEval, s.blocked, + return setStatus(s.logger, s.planner, s.eval, nil, s.blocked, s.failedTGAllocs, structs.EvalStatusFailed, desc, s.queuedAllocs, s.deployment.GetID()) } @@ -144,7 +141,7 @@ func (s *GenericScheduler) Process(eval *structs.Evaluation) error { if err := s.createBlockedEval(true); err != nil { mErr.Errors = append(mErr.Errors, err) } - if err := setStatus(s.logger, s.planner, s.eval, s.nextEval, s.blocked, + if err := setStatus(s.logger, s.planner, s.eval, nil, s.blocked, s.failedTGAllocs, statusErr.EvalStatus, err.Error(), s.queuedAllocs, s.deployment.GetID()); err != nil { mErr.Errors = append(mErr.Errors, err) @@ -166,7 +163,7 @@ func (s *GenericScheduler) Process(eval *structs.Evaluation) error { } // Update the status to complete - return setStatus(s.logger, s.planner, s.eval, s.nextEval, s.blocked, + return setStatus(s.logger, s.planner, s.eval, nil, s.blocked, s.failedTGAllocs, structs.EvalStatusComplete, "", s.queuedAllocs, s.deployment.GetID()) } @@ -259,16 +256,6 @@ func (s *GenericScheduler) process() (bool, error) { return true, nil } - // If we need a followup eval and we haven't created one, do so. - if s.followupEvalWait != 0 && s.nextEval == nil { - s.nextEval = s.eval.NextRollingEval(s.followupEvalWait) - if err := s.planner.CreateEval(s.nextEval); err != nil { - s.logger.Printf("[ERR] sched: %#v failed to make next eval for rolling migration: %v", s.eval, err) - return false, err - } - s.logger.Printf("[DEBUG] sched: %#v: rolling migration limit reached, next eval '%s' created", s.eval, s.nextEval.ID) - } - // Create follow up evals for any delayed reschedule eligible allocations if len(s.followUpEvals) > 0 { for _, eval := range s.followUpEvals { @@ -353,16 +340,13 @@ func (s *GenericScheduler) computeJobAllocs() error { s.plan.Deployment = results.deployment s.plan.DeploymentUpdates = results.deploymentUpdates - // Store the the follow up eval wait duration. If set this will trigger a - // follow up eval to handle node draining. - s.followupEvalWait = results.followupEvalWait - // Store all the follow up evaluations from rescheduled allocations if len(results.desiredFollowupEvals) > 0 { for _, evals := range results.desiredFollowupEvals { s.followUpEvals = append(s.followUpEvals, evals...) } } + // Update the stored deployment if results.deployment != nil { s.deployment = results.deployment diff --git a/scheduler/reconcile.go b/scheduler/reconcile.go index cdc375510750..a4e1d1c06d3f 100644 --- a/scheduler/reconcile.go +++ b/scheduler/reconcile.go @@ -99,11 +99,6 @@ type reconcileResults struct { // task group. desiredTGUpdates map[string]*structs.DesiredUpdates - // followupEvalWait is set if there should be a followup eval run after the - // given duration - // Deprecated, the delay strategy that sets this is not available after nomad 0.7.0 - followupEvalWait time.Duration - // desiredFollowupEvals is the map of follow up evaluations to create per task group // This is used to create a delayed evaluation for rescheduling failed allocations. 
desiredFollowupEvals map[string][]*structs.Evaluation @@ -131,9 +126,6 @@ func (r *reconcileResults) GoString() string { base += fmt.Sprintf("\nDeployment Update for ID %q: Status %q; Description %q", u.DeploymentID, u.Status, u.StatusDescription) } - if r.followupEvalWait != 0 { - base += fmt.Sprintf("\nFollowup Eval in %v", r.followupEvalWait) - } for tg, u := range r.desiredTGUpdates { base += fmt.Sprintf("\nDesired Changes for %q: %#v", tg, u) } @@ -461,16 +453,12 @@ func (a *allocReconciler) computeGroup(group string, all allocSet) bool { // Calculate the allowed number of changes and set the desired changes // accordingly. - min := helper.IntMin(len(migrate), limit) if !a.deploymentFailed && !a.deploymentPaused { - desiredChanges.Migrate += uint64(min) - desiredChanges.Ignore += uint64(len(migrate) - min) + desiredChanges.Migrate += uint64(len(migrate)) } else { desiredChanges.Stop += uint64(len(migrate)) } - followup := false - migrated := 0 for _, alloc := range migrate.nameOrder() { // If the deployment is failed or paused, don't replace it, just mark as stop. if a.deploymentFailed || a.deploymentPaused { @@ -481,12 +469,6 @@ func (a *allocReconciler) computeGroup(group string, all allocSet) bool { continue } - if migrated >= limit { - followup = true - break - } - - migrated++ a.result.stop = append(a.result.stop, allocStopResult{ alloc: alloc, statusDescription: allocMigrating, @@ -499,12 +481,6 @@ func (a *allocReconciler) computeGroup(group string, all allocSet) bool { }) } - // TODO Deprecate - // We need to create a followup evaluation. - if followup && strategy != nil && a.result.followupEvalWait < strategy.Stagger { - a.result.followupEvalWait = strategy.Stagger - } - // Create a new deployment if necessary if !existingDeployment && strategy != nil && dstate.DesiredTotal != 0 { // A previous group may have made the deployment already diff --git a/scheduler/reconcile_test.go b/scheduler/reconcile_test.go index a00471fba603..604347fa5bd9 100644 --- a/scheduler/reconcile_test.go +++ b/scheduler/reconcile_test.go @@ -75,7 +75,6 @@ Update stanza Tests: √ Failed deployment cancels non-promoted task groups √ Failed deployment and updated job works √ Finished deployment gets marked as complete -√ The stagger is correctly calculated when it is applied across multiple task groups. √ Change job change while scaling up √ Update the job when all allocations from the previous job haven't been placed yet. √ Paused or failed deployment doesn't do any rescheduling of failed allocs @@ -306,7 +305,6 @@ type resultExpectation struct { inplace int stop int desiredTGUpdates map[string]*structs.DesiredUpdates - followupEvalWait time.Duration } func assertResults(t *testing.T, r *reconcileResults, exp *resultExpectation) { @@ -342,9 +340,6 @@ func assertResults(t *testing.T, r *reconcileResults, exp *resultExpectation) { if l := len(r.desiredTGUpdates); l != len(exp.desiredTGUpdates) { t.Fatalf("Expected %d task group desired tg updates annotations; got %d", len(exp.desiredTGUpdates), l) } - if r.followupEvalWait != exp.followupEvalWait { - t.Fatalf("Unexpected followup eval wait time. 
Got %v; want %v", r.followupEvalWait, exp.followupEvalWait) - } // Check the desired updates happened for group, desired := range exp.desiredTGUpdates { @@ -3043,24 +3038,23 @@ func TestReconciler_TaintedNode_RollingUpgrade(t *testing.T) { assertResults(t, r, &resultExpectation{ createDeployment: nil, deploymentUpdates: nil, - place: 2, + place: 3, destructive: 2, - stop: 2, - followupEvalWait: 31 * time.Second, + stop: 3, desiredTGUpdates: map[string]*structs.DesiredUpdates{ job.TaskGroups[0].Name: { Place: 1, // Place the lost Stop: 1, // Stop the lost - Migrate: 1, // Migrate the tainted + Migrate: 2, // Migrate the tainted DestructiveUpdate: 2, - Ignore: 6, + Ignore: 5, }, }, }) assertNamesHaveIndexes(t, intRange(8, 9), destructiveResultsToNames(r.destructiveUpdate)) - assertNamesHaveIndexes(t, intRange(0, 1), placeResultsToNames(r.place)) - assertNamesHaveIndexes(t, intRange(0, 1), stopResultsToNames(r.stop)) + assertNamesHaveIndexes(t, intRange(0, 2), placeResultsToNames(r.place)) + assertNamesHaveIndexes(t, intRange(0, 2), stopResultsToNames(r.stop)) } // Tests the reconciler handles a failed deployment and only replaces lost @@ -3132,7 +3126,6 @@ func TestReconciler_FailedDeployment_PlacementLost(t *testing.T) { place: 1, // Only replace the lost node inplace: 0, stop: 2, - followupEvalWait: 0, // Since the deployment is failed, there should be no followup desiredTGUpdates: map[string]*structs.DesiredUpdates{ job.TaskGroups[0].Name: { Place: 1, @@ -3413,73 +3406,6 @@ func TestReconciler_MarkDeploymentComplete(t *testing.T) { }) } -// Tests the reconciler picks the maximum of the staggers when multiple task -// groups are under going node drains. -func TestReconciler_TaintedNode_MultiGroups(t *testing.T) { - // Create a job with two task groups - job := mock.Job() - job.TaskGroups[0].Update = noCanaryUpdate - job.TaskGroups = append(job.TaskGroups, job.TaskGroups[0].Copy()) - job.TaskGroups[1].Name = "two" - job.TaskGroups[1].Update.Stagger = 100 * time.Second - - // Create the allocations - var allocs []*structs.Allocation - for j := 0; j < 2; j++ { - for i := 0; i < 10; i++ { - alloc := mock.Alloc() - alloc.Job = job - alloc.JobID = job.ID - alloc.NodeID = uuid.Generate() - alloc.Name = structs.AllocName(job.ID, job.TaskGroups[j].Name, uint(i)) - alloc.TaskGroup = job.TaskGroups[j].Name - allocs = append(allocs, alloc) - } - } - - // Build a map of tainted nodes - tainted := make(map[string]*structs.Node, 15) - for i := 0; i < 15; i++ { - n := mock.Node() - n.ID = allocs[i].NodeID - allocs[i].DesiredTransition.Migrate = helper.BoolToPtr(true) - n.Drain = true - tainted[n.ID] = n - } - - reconciler := NewAllocReconciler(testLogger(), allocUpdateFnIgnore, false, job.ID, job, nil, allocs, tainted) - r := reconciler.Compute() - - // Assert the correct results - assertResults(t, r, &resultExpectation{ - createDeployment: nil, - deploymentUpdates: nil, - place: 8, - inplace: 0, - stop: 8, - followupEvalWait: 100 * time.Second, - desiredTGUpdates: map[string]*structs.DesiredUpdates{ - job.TaskGroups[0].Name: { - Place: 0, - Stop: 0, - Migrate: 4, - DestructiveUpdate: 0, - Ignore: 6, - }, - job.TaskGroups[1].Name: { - Place: 0, - Stop: 0, - Migrate: 4, - DestructiveUpdate: 0, - Ignore: 6, - }, - }, - }) - - assertNamesHaveIndexes(t, intRange(0, 3, 0, 3), placeResultsToNames(r.place)) - assertNamesHaveIndexes(t, intRange(0, 3, 0, 3), stopResultsToNames(r.stop)) -} - // Tests the reconciler handles changing a job such that a deployment is created // while doing a scale up but as the second 
eval. func TestReconciler_JobChange_ScaleUp_SecondEval(t *testing.T) { From ad2f211712e76949c90916ed7ac3a754b0c78d84 Mon Sep 17 00:00:00 2001 From: Alex Dadgar Date: Fri, 9 Mar 2018 14:15:21 -0800 Subject: [PATCH 50/79] Batch drain update --- nomad/drainer/drainer.go | 13 +++------ nomad/drainer/watch_nodes.go | 2 +- nomad/drainer_shims.go | 14 ++++++---- nomad/fsm.go | 16 +++++++++++ nomad/fsm_test.go | 41 ++++++++++++++++++++++++++++ nomad/state/state_store.go | 23 +++++++++++++++- nomad/state/state_store_test.go | 47 +++++++++++++++++++++++++++++++++ nomad/structs/structs.go | 18 +++++++++++++ 8 files changed, 158 insertions(+), 16 deletions(-) diff --git a/nomad/drainer/drainer.go b/nomad/drainer/drainer.go index 2b6a328070d0..c8d9abaa5562 100644 --- a/nomad/drainer/drainer.go +++ b/nomad/drainer/drainer.go @@ -36,7 +36,7 @@ const ( // NodeDrainer. type RaftApplier interface { AllocUpdateDesiredTransition(allocs map[string]*structs.DesiredTransition, evals []*structs.Evaluation) (uint64, error) - NodeDrainComplete(nodeID string) (uint64, error) + NodesDrainComplete(nodes []string) (uint64, error) } // NodeTracker is the interface to notify an object that is tracking draining @@ -295,14 +295,9 @@ func (n *NodeDrainer) handleMigratedAllocs(allocs []*structs.Allocation) { } n.l.RUnlock() - // TODO(alex) This should probably be a single Raft transaction - for _, doneNode := range done { - index, err := n.raft.NodeDrainComplete(doneNode) - if err != nil { - n.logger.Printf("[ERR] nomad.drain: failed to unset drain for node %q: %v", doneNode, err) - } else { - n.logger.Printf("[INFO] nomad.drain: node %q completed draining at index %d", doneNode, index) - } + // TODO(alex) Shard + if _, err := n.raft.NodesDrainComplete(done); err != nil { + n.logger.Printf("[ERR] nomad.drain: failed to unset drain for nodes: %v", err) } } diff --git a/nomad/drainer/watch_nodes.go b/nomad/drainer/watch_nodes.go index 738f496fda78..ed99fb6938c5 100644 --- a/nomad/drainer/watch_nodes.go +++ b/nomad/drainer/watch_nodes.go @@ -89,7 +89,7 @@ func (n *NodeDrainer) Update(node *structs.Node) { } if done { - index, err := n.raft.NodeDrainComplete(node.ID) + index, err := n.raft.NodesDrainComplete([]string{node.ID}) if err != nil { n.logger.Printf("[ERR] nomad.drain: failed to unset drain for node %q: %v", node.ID, err) } else { diff --git a/nomad/drainer_shims.go b/nomad/drainer_shims.go index 1c7ffb1a9b76..0eb8c43a27b3 100644 --- a/nomad/drainer_shims.go +++ b/nomad/drainer_shims.go @@ -8,14 +8,18 @@ type drainerShim struct { s *Server } -func (d drainerShim) NodeDrainComplete(nodeID string) (uint64, error) { - args := &structs.NodeUpdateDrainRequest{ - NodeID: nodeID, - Drain: false, +func (d drainerShim) NodesDrainComplete(nodes []string) (uint64, error) { + args := &structs.BatchNodeUpdateDrainRequest{ + Updates: make(map[string]*structs.DrainUpdate, len(nodes)), WriteRequest: structs.WriteRequest{Region: d.s.config.Region}, } - resp, index, err := d.s.raftApply(structs.NodeUpdateDrainRequestType, args) + update := &structs.DrainUpdate{} + for _, node := range nodes { + args.Updates[node] = update + } + + resp, index, err := d.s.raftApply(structs.BatchNodeUpdateDrainRequestType, args) return d.convertApplyErrors(resp, index, err) } diff --git a/nomad/fsm.go b/nomad/fsm.go index bc52f256e343..afe726eede39 100644 --- a/nomad/fsm.go +++ b/nomad/fsm.go @@ -244,6 +244,8 @@ func (n *nomadFSM) Apply(log *raft.Log) interface{} { return n.applyAllocUpdateDesiredTransition(buf[1:], log.Index) case 
structs.NodeUpdateEligibilityRequestType: return n.applyNodeEligibilityUpdate(buf[1:], log.Index) + case structs.BatchNodeUpdateDrainRequestType: + return n.applyBatchDrainUpdate(buf[1:], log.Index) } // Check enterprise only message types. @@ -337,6 +339,20 @@ func (n *nomadFSM) applyDrainUpdate(buf []byte, index uint64) interface{} { return nil } +func (n *nomadFSM) applyBatchDrainUpdate(buf []byte, index uint64) interface{} { + defer metrics.MeasureSince([]string{"nomad", "fsm", "batch_node_drain_update"}, time.Now()) + var req structs.BatchNodeUpdateDrainRequest + if err := structs.Decode(buf, &req); err != nil { + panic(fmt.Errorf("failed to decode request: %v", err)) + } + + if err := n.state.BatchUpdateNodeDrain(index, req.Updates); err != nil { + n.logger.Printf("[ERR] nomad.fsm: BatchUpdateNodeDrain failed: %v", err) + return err + } + return nil +} + func (n *nomadFSM) applyNodeEligibilityUpdate(buf []byte, index uint64) interface{} { defer metrics.MeasureSince([]string{"nomad", "fsm", "node_eligibility_update"}, time.Now()) var req structs.NodeUpdateEligibilityRequest diff --git a/nomad/fsm_test.go b/nomad/fsm_test.go index 9f8ed205a77e..6d4aaf968fb6 100644 --- a/nomad/fsm_test.go +++ b/nomad/fsm_test.go @@ -278,6 +278,47 @@ func TestFSM_UpdateNodeStatus(t *testing.T) { }) } +func TestFSM_BatchUpdateNodeDrain(t *testing.T) { + t.Parallel() + require := require.New(t) + fsm := testFSM(t) + + node := mock.Node() + req := structs.NodeRegisterRequest{ + Node: node, + } + buf, err := structs.Encode(structs.NodeRegisterRequestType, req) + require.Nil(err) + + resp := fsm.Apply(makeLog(buf)) + require.Nil(resp) + + strategy := &structs.DrainStrategy{ + DrainSpec: structs.DrainSpec{ + Deadline: 10 * time.Second, + }, + } + req2 := structs.BatchNodeUpdateDrainRequest{ + Updates: map[string]*structs.DrainUpdate{ + node.ID: &structs.DrainUpdate{ + DrainStrategy: strategy, + }, + }, + } + buf, err = structs.Encode(structs.BatchNodeUpdateDrainRequestType, req2) + require.Nil(err) + + resp = fsm.Apply(makeLog(buf)) + require.Nil(resp) + + // Verify we are NOT registered + ws := memdb.NewWatchSet() + node, err = fsm.State().NodeByID(ws, req.Node.ID) + require.Nil(err) + require.True(node.Drain) + require.Equal(node.DrainStrategy, strategy) +} + func TestFSM_UpdateNodeDrain(t *testing.T) { t.Parallel() require := require.New(t) diff --git a/nomad/state/state_store.go b/nomad/state/state_store.go index 5f4564001135..6e4f3978db65 100644 --- a/nomad/state/state_store.go +++ b/nomad/state/state_store.go @@ -617,12 +617,34 @@ func (s *StateStore) UpdateNodeStatus(index uint64, nodeID, status string) error return nil } +// BatchUpdateNodeDrain is used to update the drain of a node set of nodes +func (s *StateStore) BatchUpdateNodeDrain(index uint64, updates map[string]*structs.DrainUpdate) error { + txn := s.db.Txn(true) + defer txn.Abort() + for node, update := range updates { + if err := s.updateNodeDrainImpl(txn, index, node, update.DrainStrategy, update.MarkEligible); err != nil { + return err + } + } + txn.Commit() + return nil +} + // UpdateNodeDrain is used to update the drain of a node func (s *StateStore) UpdateNodeDrain(index uint64, nodeID string, drain *structs.DrainStrategy, markEligible bool) error { txn := s.db.Txn(true) defer txn.Abort() + if err := s.updateNodeDrainImpl(txn, index, nodeID, drain, markEligible); err != nil { + return err + } + txn.Commit() + return nil +} + +func (s *StateStore) updateNodeDrainImpl(txn *memdb.Txn, index uint64, nodeID string, + drain 
*structs.DrainStrategy, markEligible bool) error { // Lookup the node existing, err := txn.First("nodes", "id", nodeID) @@ -656,7 +678,6 @@ func (s *StateStore) UpdateNodeDrain(index uint64, nodeID string, return fmt.Errorf("index update failed: %v", err) } - txn.Commit() return nil } diff --git a/nomad/state/state_store_test.go b/nomad/state/state_store_test.go index 7eeb4672e212..20ebbe88fcd2 100644 --- a/nomad/state/state_store_test.go +++ b/nomad/state/state_store_test.go @@ -698,6 +698,53 @@ func TestStateStore_UpdateNodeStatus_Node(t *testing.T) { } } +func TestStateStore_BatchUpdateNodeDrain(t *testing.T) { + require := require.New(t) + state := testStateStore(t) + + n1, n2 := mock.Node(), mock.Node() + require.Nil(state.UpsertNode(1000, n1)) + require.Nil(state.UpsertNode(1001, n2)) + + // Create a watchset so we can test that update node drain fires the watch + ws := memdb.NewWatchSet() + _, err := state.NodeByID(ws, n1.ID) + require.Nil(err) + + expectedDrain := &structs.DrainStrategy{ + DrainSpec: structs.DrainSpec{ + Deadline: -1 * time.Second, + }, + } + + update := map[string]*structs.DrainUpdate{ + n1.ID: &structs.DrainUpdate{ + DrainStrategy: expectedDrain, + }, + n2.ID: &structs.DrainUpdate{ + DrainStrategy: expectedDrain, + }, + } + + require.Nil(state.BatchUpdateNodeDrain(1002, update)) + require.True(watchFired(ws)) + + ws = memdb.NewWatchSet() + for _, id := range []string{n1.ID, n2.ID} { + out, err := state.NodeByID(ws, id) + require.Nil(err) + require.True(out.Drain) + require.NotNil(out.DrainStrategy) + require.Equal(out.DrainStrategy, expectedDrain) + require.EqualValues(1002, out.ModifyIndex) + } + + index, err := state.Index("nodes") + require.Nil(err) + require.EqualValues(1002, index) + require.False(watchFired(ws)) +} + func TestStateStore_UpdateNodeDrain_Node(t *testing.T) { require := require.New(t) state := testStateStore(t) diff --git a/nomad/structs/structs.go b/nomad/structs/structs.go index 04c073946be5..72f2c0a31948 100644 --- a/nomad/structs/structs.go +++ b/nomad/structs/structs.go @@ -80,6 +80,7 @@ const ( JobBatchDeregisterRequestType AllocUpdateDesiredTransitionRequestType NodeUpdateEligibilityRequestType + BatchNodeUpdateDrainRequestType ) const ( @@ -314,6 +315,23 @@ type NodeUpdateDrainRequest struct { WriteRequest } +// BatchNodeUpdateDrainRequest is used for updating the drain strategy for a +// batch of nodes +type BatchNodeUpdateDrainRequest struct { + // Updates is a mapping of nodes to their updated drain strategy + Updates map[string]*DrainUpdate + WriteRequest +} + +// DrainUpdate is used to update the drain of a node +type DrainUpdate struct { + // DrainStrategy is the new strategy for the node + DrainStrategy *DrainStrategy + + // MarkEligible marks the node as eligible if removing the drain strategy. 
+ MarkEligible bool +} + // NodeUpdateEligibilityRequest is used for updating the scheduling eligibility type NodeUpdateEligibilityRequest struct { NodeID string From 5324e56e1c0604ebb597e249a71114e50d71c988 Mon Sep 17 00:00:00 2001 From: Alex Dadgar Date: Fri, 9 Mar 2018 16:10:38 -0800 Subject: [PATCH 51/79] sharding --- nomad/drainer/drainer.go | 27 ++++++--- nomad/drainer/drainer_util.go | 93 ++++++++++++++++++++++++++++++ nomad/drainer/drainer_util_test.go | 54 +++++++++++++++++ 3 files changed, 165 insertions(+), 9 deletions(-) create mode 100644 nomad/drainer/drainer_util.go create mode 100644 nomad/drainer/drainer_util_test.go diff --git a/nomad/drainer/drainer.go b/nomad/drainer/drainer.go index c8d9abaa5562..98c52479a865 100644 --- a/nomad/drainer/drainer.go +++ b/nomad/drainer/drainer.go @@ -295,9 +295,12 @@ func (n *NodeDrainer) handleMigratedAllocs(allocs []*structs.Allocation) { } n.l.RUnlock() - // TODO(alex) Shard - if _, err := n.raft.NodesDrainComplete(done); err != nil { - n.logger.Printf("[ERR] nomad.drain: failed to unset drain for nodes: %v", err) + // Submit the node transistions in a sharded form to ensure a reasonable + // Raft transaction size. + for _, nodes := range partitionIds(done) { + if _, err := n.raft.NodesDrainComplete(nodes); err != nil { + n.logger.Printf("[ERR] nomad.drain: failed to unset drain for nodes: %v", err) + } } } @@ -341,13 +344,11 @@ func (n *NodeDrainer) batchDrainAllocs(allocs []*structs.Allocation) (uint64, er // the set of allocations. It will also create the necessary evaluations for the // affected jobs. func (n *NodeDrainer) drainAllocs(future *structs.BatchFuture, allocs []*structs.Allocation) { - // TODO(alex) This should shard to limit the size of the transaction. - // Compute the effected jobs and make the transition map jobs := make(map[string]*structs.Allocation, 4) - transitions := make(map[string]*structs.DesiredTransition, len(allocs)) + transistions := make(map[string]*structs.DesiredTransition, len(allocs)) for _, alloc := range allocs { - transitions[alloc.ID] = &structs.DesiredTransition{ + transistions[alloc.ID] = &structs.DesiredTransition{ Migrate: helper.BoolToPtr(true), } jobs[alloc.JobID] = alloc @@ -367,6 +368,14 @@ func (n *NodeDrainer) drainAllocs(future *structs.BatchFuture, allocs []*structs } // Commit this update via Raft - index, err := n.raft.AllocUpdateDesiredTransition(transitions, evals) - future.Respond(index, err) + var finalIndex uint64 + for _, u := range partitionAllocDrain(transistions, evals) { + index, err := n.raft.AllocUpdateDesiredTransition(u.Transistions, u.Evals) + if err != nil { + future.Respond(index, err) + } + finalIndex = index + } + + future.Respond(finalIndex, nil) } diff --git a/nomad/drainer/drainer_util.go b/nomad/drainer/drainer_util.go new file mode 100644 index 000000000000..09d026235aa0 --- /dev/null +++ b/nomad/drainer/drainer_util.go @@ -0,0 +1,93 @@ +package drainer + +import ( + "github.com/hashicorp/nomad/nomad/structs" +) + +var ( + // maxIdsPerTxn is the maximum number of IDs that can be included in a + // single Raft transaction. This is to ensure that the Raft message does not + // become too large. + maxIdsPerTxn = (1024 * 256) / 36 // 0.25 MB of ids. +) + +// partitionIds takes a set of IDs and returns a partitioned view of them such +// that no batch would result in an overly large raft transaction. 
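Editor's note, not part of the patch: a minimal sketch of the sizing arithmetic behind maxIdsPerTxn above, assuming IDs are 36-byte UUID strings as the "0.25 MB of ids" comment implies. The node count used here is hypothetical; partitionIds itself follows immediately below.

package main

import "fmt"

func main() {
	maxIdsPerTxn := (1024 * 256) / 36 // ~0.25 MB of 36-byte UUID strings, 7281 IDs per Raft txn
	doneNodes := 20000                // hypothetical number of nodes finishing their drain at once
	batches := (doneNodes + maxIdsPerTxn - 1) / maxIdsPerTxn
	fmt.Println(batches) // 3, so NodesDrainComplete would be applied as three Raft transactions
}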
+func partitionIds(ids []string) [][]string { + index := 0 + total := len(ids) + var partitions [][]string + for remaining := total - index; remaining > 0; remaining = total - index { + if remaining < maxIdsPerTxn { + partitions = append(partitions, ids[index:]) + break + } else { + partitions = append(partitions, ids[index:index+maxIdsPerTxn]) + index += maxIdsPerTxn + } + } + + return partitions +} + +// transistionTuple is used to group desired transistions and evals +type transistionTuple struct { + Transistions map[string]*structs.DesiredTransition + Evals []*structs.Evaluation +} + +// partitionAllocDrain returns a list of alloc transistions and evals to apply +// in a single raft transaction.This is necessary to ensure that the Raft +// transaction does not become too large. +func partitionAllocDrain(transistions map[string]*structs.DesiredTransition, + evals []*structs.Evaluation) []*transistionTuple { + + // Determine a stable ordering of the transistioning allocs + allocs := make([]string, 0, len(transistions)) + for id := range transistions { + allocs = append(allocs, id) + } + + var requests []*transistionTuple + submittedEvals, submittedTrans := 0, 0 + for submittedEvals != len(evals) || submittedTrans != len(transistions) { + req := &transistionTuple{ + Transistions: make(map[string]*structs.DesiredTransition), + } + requests = append(requests, req) + available := maxIdsPerTxn + + // Add the allocs first + if remaining := len(allocs) - submittedTrans; remaining > 0 { + if remaining <= available { + for _, id := range allocs[submittedTrans:] { + req.Transistions[id] = transistions[id] + } + available -= remaining + submittedTrans += remaining + } else { + for _, id := range allocs[submittedTrans : submittedTrans+available] { + req.Transistions[id] = transistions[id] + } + submittedTrans += available + + // Exhausted space so skip adding evals + continue + } + + } + + // Add the evals + if remaining := len(evals) - submittedEvals; remaining > 0 { + if remaining <= available { + req.Evals = evals[submittedEvals:] + submittedEvals += remaining + } else { + req.Evals = evals[submittedEvals : submittedEvals+available] + submittedEvals += available + } + } + } + + return requests +} diff --git a/nomad/drainer/drainer_util_test.go b/nomad/drainer/drainer_util_test.go new file mode 100644 index 000000000000..ee2f4a79f508 --- /dev/null +++ b/nomad/drainer/drainer_util_test.go @@ -0,0 +1,54 @@ +package drainer + +import ( + "testing" + + "github.com/hashicorp/nomad/nomad/structs" + "github.com/stretchr/testify/require" +) + +func TestDrainer_PartitionAllocDrain(t *testing.T) { + // Set the max ids per reap to something lower. + old := maxIdsPerTxn + defer func() { maxIdsPerTxn = old }() + maxIdsPerTxn = 2 + + require := require.New(t) + transistions := map[string]*structs.DesiredTransition{"a": nil, "b": nil, "c": nil} + evals := []*structs.Evaluation{nil, nil, nil} + requests := partitionAllocDrain(transistions, evals) + require.Len(requests, 3) + + first := requests[0] + require.Len(first.Transistions, 2) + require.Len(first.Evals, 0) + + second := requests[1] + require.Len(second.Transistions, 1) + require.Len(second.Evals, 1) + + third := requests[2] + require.Len(third.Transistions, 0) + require.Len(third.Evals, 2) +} + +func TestDrainer_PartitionIds(t *testing.T) { + require := require.New(t) + + // Set the max ids per reap to something lower. 
+ old := maxIdsPerTxn + defer func() { maxIdsPerTxn = old }() + maxIdsPerTxn = 2 + + ids := []string{"1", "2", "3", "4", "5"} + requests := partitionIds(ids) + require.Len(requests, 3) + require.Len(requests[0], 2) + require.Len(requests[1], 2) + require.Len(requests[2], 1) + require.Equal(requests[0][0], ids[0]) + require.Equal(requests[0][1], ids[1]) + require.Equal(requests[1][0], ids[2]) + require.Equal(requests[1][1], ids[3]) + require.Equal(requests[2][0], ids[4]) +} From 270699bab27bf73ec7cb2181d0432a429f5a0fa7 Mon Sep 17 00:00:00 2001 From: Alex Dadgar Date: Wed, 14 Mar 2018 16:38:19 -0700 Subject: [PATCH 52/79] fix comment --- nomad/fsm_test.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nomad/fsm_test.go b/nomad/fsm_test.go index 6d4aaf968fb6..1a7b08a4bd60 100644 --- a/nomad/fsm_test.go +++ b/nomad/fsm_test.go @@ -311,7 +311,7 @@ func TestFSM_BatchUpdateNodeDrain(t *testing.T) { resp = fsm.Apply(makeLog(buf)) require.Nil(resp) - // Verify we are NOT registered + // Verify drain is set ws := memdb.NewWatchSet() node, err = fsm.State().NodeByID(ws, req.Node.ID) require.Nil(err) From fb6c821526ee7555fae795db0dc2cae9286f8a79 Mon Sep 17 00:00:00 2001 From: Michael Schurter Date: Tue, 6 Mar 2018 13:12:36 -0800 Subject: [PATCH 53/79] Fix node eligibility test --- command/node_eligibility_test.go | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/command/node_eligibility_test.go b/command/node_eligibility_test.go index 3129fe86a19b..6fbb3c91d8ff 100644 --- a/command/node_eligibility_test.go +++ b/command/node_eligibility_test.go @@ -37,8 +37,9 @@ func TestNodeEligibilityCommand_Fails(t *testing.T) { if code := cmd.Run([]string{"-address=nope", "-enable", "12345678-abcd-efab-cdef-123456789abc"}); code != 1 { t.Fatalf("expected exit code 1, got: %d", code) } - if out := ui.ErrorWriter.String(); !strings.Contains(out, "Error toggling") { - t.Fatalf("expected failed toggle error, got: %s", out) + expected := "Error updating scheduling eligibility" + if out := ui.ErrorWriter.String(); !strings.Contains(out, expected) { + t.Fatalf("expected %q, got: %s", expected, out) } ui.ErrorWriter.Reset() From 6347baec55774aa011201e5c22356636be6822f2 Mon Sep 17 00:00:00 2001 From: Michael Schurter Date: Tue, 6 Mar 2018 16:23:21 -0800 Subject: [PATCH 54/79] Add DesiredTransition.ShouldMigrate to api pkg --- api/allocations.go | 5 +++++ api/allocations_test.go | 7 +++++++ 2 files changed, 12 insertions(+) diff --git a/api/allocations.go b/api/allocations.go index c3759806741f..fc035ebb16ce 100644 --- a/api/allocations.go +++ b/api/allocations.go @@ -215,3 +215,8 @@ type DesiredTransition struct { // migrated to another node. Migrate *bool } + +// ShouldMigrate returns whether the transition object dictates a migration. 
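Editor's note, not part of the patch: a small sketch of how this helper ties into the rest of the series. The drainer marks allocations with DesiredTransition{Migrate: true}, and consumers such as the CLI monitor added in the next commit branch on ShouldMigrate. Only identifiers already used elsewhere in this series appear here.

package main

import (
	"fmt"

	"github.com/hashicorp/nomad/api"
	"github.com/hashicorp/nomad/helper"
)

func main() {
	t := api.DesiredTransition{Migrate: helper.BoolToPtr(true)}
	fmt.Println(t.ShouldMigrate()) // true: this alloc has been asked to migrate off its node
}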
+func (d DesiredTransition) ShouldMigrate() bool { + return d.Migrate != nil && *d.Migrate +} diff --git a/api/allocations_test.go b/api/allocations_test.go index dd5ae333bd1a..5eb5508bb69f 100644 --- a/api/allocations_test.go +++ b/api/allocations_test.go @@ -239,3 +239,10 @@ func TestAllocations_RescheduleInfo(t *testing.T) { } } + +func TestAllocations_ShouldMigrate(t *testing.T) { + t.Parallel() + require.True(t, DesiredTransition{Migrate: helper.BoolToPtr(true)}.ShouldMigrate()) + require.False(t, DesiredTransition{}.ShouldMigrate()) + require.False(t, DesiredTransition{Migrate: helper.BoolToPtr(false)}.ShouldMigrate()) +} From 11d0eae5eddd58b5d1437dd50bdadcf6f15e73f3 Mon Sep 17 00:00:00 2001 From: Michael Schurter Date: Tue, 6 Mar 2018 14:16:20 -0800 Subject: [PATCH 55/79] Monitor node drains until completion in CLI allow -detach like other commands --- command/node_drain.go | 134 ++++++++++++++++++++++++++++- command/node_drain_test.go | 171 +++++++++++++++++++++++++++++++++++++ 2 files changed, 303 insertions(+), 2 deletions(-) diff --git a/command/node_drain.go b/command/node_drain.go index 9d8326d472a0..b4a2ebad7369 100644 --- a/command/node_drain.go +++ b/command/node_drain.go @@ -45,6 +45,9 @@ Node Drain Options: Remaining allocations after the deadline are forced removed from the node. If unspecified, a default deadline of one hour is applied. + -detach + Return immediately instead of entering monitor mode. + -force Force remove allocations off the node immediately. @@ -80,6 +83,7 @@ func (c *NodeDrainCommand) AutocompleteFlags() complete.Flags { "-disable": complete.PredictNothing, "-enable": complete.PredictNothing, "-deadline": complete.PredictAnything, + "-detach": complete.PredictNothing, "-force": complete.PredictNothing, "-no-deadline": complete.PredictNothing, "-ignore-system": complete.PredictNothing, @@ -105,7 +109,7 @@ func (c *NodeDrainCommand) AutocompleteArgs() complete.Predictor { } func (c *NodeDrainCommand) Run(args []string) int { - var enable, disable, force, + var enable, disable, detach, force, noDeadline, ignoreSystem, keepIneligible, self, autoYes bool var deadline string @@ -114,6 +118,7 @@ func (c *NodeDrainCommand) Run(args []string) int { flags.BoolVar(&enable, "enable", false, "Enable drain mode") flags.BoolVar(&disable, "disable", false, "Disable drain mode") flags.StringVar(&deadline, "deadline", "", "Deadline after which allocations are force stopped") + flags.BoolVar(&detach, "detach", false, "") flags.BoolVar(&force, "force", false, "Force immediate drain") flags.BoolVar(&noDeadline, "no-deadline", false, "Drain node with no deadline") flags.BoolVar(&ignoreSystem, "ignore-system", false, "Do not drain system job allocations from the node") @@ -259,11 +264,136 @@ func (c *NodeDrainCommand) Run(args []string) int { } // Toggle node draining - if _, err := client.Nodes().UpdateDrain(node.ID, spec, !keepIneligible, nil); err != nil { + meta, err := client.Nodes().UpdateDrain(node.ID, spec, !keepIneligible, nil) + if err != nil { c.Ui.Error(fmt.Sprintf("Error updating drain specification: %s", err)) return 1 } c.Ui.Output(fmt.Sprintf("Node %q drain strategy set", node.ID)) + + if enable && !detach { + if err := monitorDrain(c.Ui.Output, client.Nodes(), node.ID, meta.LastIndex); err != nil { + c.Ui.Error(fmt.Sprintf("Error monitoring drain: %v", err)) + return 1 + } + + c.Ui.Output(fmt.Sprintf("Node %q drain complete", nodeID)) + } + return 0 } + +// monitorDrain monitors the node being drained and exits when the node has +// finished draining. 
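Editor's note, not part of the patch: monitorDrain below leans on Nomad's blocking-query idiom. Here is a minimal standalone sketch of that loop, using only calls that appear in this change (Nodes().Info with api.QueryOptions); the wrapper name waitForDrain is hypothetical.

package main

import "github.com/hashicorp/nomad/api"

// waitForDrain is a hedged sketch: pass the last index seen as WaitIndex so
// Info blocks server-side until the node changes, then advance the index
// from the returned QueryMeta.
func waitForDrain(nodeClient *api.Nodes, nodeID string, index uint64) error {
	for {
		q := api.QueryOptions{AllowStale: true, WaitIndex: index}
		node, meta, err := nodeClient.Info(nodeID, &q)
		if err != nil {
			return err
		}
		if node.DrainStrategy == nil {
			return nil // drain strategy cleared, the node has finished draining
		}
		index = meta.LastIndex // next call blocks until something newer is written
	}
}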
+func monitorDrain(output func(string), nodeClient *api.Nodes, nodeID string, index uint64) error { + doneCh := make(chan struct{}) + defer close(doneCh) + + // Errors from either goroutine are sent here + errCh := make(chan error, 1) + + // Monitor node changes and close chan when drain is complete + nodeCh := make(chan struct{}) + go func() { + for { + q := api.QueryOptions{ + AllowStale: true, + WaitIndex: index, + } + node, meta, err := nodeClient.Info(nodeID, &q) + if err != nil { + select { + case errCh <- err: + case <-doneCh: + } + return + } + + if node.DrainStrategy == nil { + close(nodeCh) + return + } + + // Drain still ongoing + index = meta.LastIndex + } + }() + + // Monitor alloc changes + allocCh := make(chan string, 1) + go func() { + allocs, meta, err := nodeClient.Allocations(nodeID, nil) + if err != nil { + select { + case errCh <- err: + case <-doneCh: + } + return + } + + initial := make(map[string]*api.Allocation, len(allocs)) + for _, a := range allocs { + initial[a.ID] = a + } + + for { + q := api.QueryOptions{ + AllowStale: true, + WaitIndex: meta.LastIndex, + } + + allocs, meta, err = nodeClient.Allocations(nodeID, &q) + if err != nil { + select { + case errCh <- err: + case <-doneCh: + } + return + } + + for _, a := range allocs { + // Get previous version of alloc + orig, ok := initial[a.ID] + + // Update local alloc state + initial[a.ID] = a + + msg := "" + switch { + case !ok: + // Should only be possible if response + // from initial Allocations call was + // stale. No need to output + + case orig.ClientStatus != a.ClientStatus: + // Alloc status has changed; output + msg = fmt.Sprintf("status %s -> %s", orig.ClientStatus, a.ClientStatus) + + case !orig.DesiredTransition.ShouldMigrate() && a.DesiredTransition.ShouldMigrate(): + // Alloc marked for migration + msg = "draining" + } + + if msg != "" { + select { + case allocCh <- fmt.Sprintf("Alloc %q %s", a.ID, msg): + case <-doneCh: + return + } + } + } + } + }() + + for { + select { + case err := <-errCh: + return err + case <-nodeCh: + return nil + case msg := <-allocCh: + output(msg) + } + } +} diff --git a/command/node_drain_test.go b/command/node_drain_test.go index 20f63d95f571..1207047454a3 100644 --- a/command/node_drain_test.go +++ b/command/node_drain_test.go @@ -4,11 +4,16 @@ import ( "fmt" "strings" "testing" + "time" + "github.com/hashicorp/nomad/api" + "github.com/hashicorp/nomad/command/agent" + "github.com/hashicorp/nomad/helper" "github.com/hashicorp/nomad/testutil" "github.com/mitchellh/cli" "github.com/posener/complete" "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" ) func TestNodeDrainCommand_Implements(t *testing.T) { @@ -16,6 +21,172 @@ func TestNodeDrainCommand_Implements(t *testing.T) { var _ cli.Command = &NodeDrainCommand{} } +func TestNodeDrainCommand_Detach(t *testing.T) { + t.Parallel() + require := require.New(t) + server, client, url := testServer(t, true, func(c *agent.Config) { + c.NodeName = "drain_detach_node" + }) + defer server.Shutdown() + + // Wait for a node to appear + var nodeID string + testutil.WaitForResult(func() (bool, error) { + nodes, _, err := client.Nodes().List(nil) + if err != nil { + return false, err + } + if len(nodes) == 0 { + return false, fmt.Errorf("missing node") + } + nodeID = nodes[0].ID + return true, nil + }, func(err error) { + t.Fatalf("err: %s", err) + }) + + // Register a job to create an alloc to drain that will block draining + job := &api.Job{ + ID: helper.StringToPtr("mock_service"), + Name: 
helper.StringToPtr("mock_service"), + Datacenters: []string{"dc1"}, + TaskGroups: []*api.TaskGroup{ + { + Name: helper.StringToPtr("mock_group"), + Tasks: []*api.Task{ + { + Name: "mock_task", + Driver: "mock_driver", + Config: map[string]interface{}{ + "run_for": "10m", + "exit_after": "10m", + }, + }, + }, + }, + }, + } + + _, _, err := client.Jobs().Register(job, nil) + require.Nil(err) + + testutil.WaitForResult(func() (bool, error) { + allocs, _, err := client.Nodes().Allocations(nodeID, nil) + if err != nil { + return false, err + } + return len(allocs) > 0, fmt.Errorf("no allocs") + }, func(err error) { + t.Fatalf("err: %v", err) + }) + + ui := new(cli.MockUi) + cmd := &NodeDrainCommand{Meta: Meta{Ui: ui}} + if code := cmd.Run([]string{"-address=" + url, "-self", "-enable", "-detach"}); code != 0 { + t.Fatalf("expected exit 0, got: %d", code) + } + + out := ui.OutputWriter.String() + expected := "drain strategy set" + require.Contains(out, expected) + + node, _, err := client.Nodes().Info(nodeID, nil) + require.Nil(err) + require.NotNil(node.DrainStrategy) +} + +func TestNodeDrainCommand_Monitor(t *testing.T) { + t.Parallel() + require := require.New(t) + server, client, url := testServer(t, true, func(c *agent.Config) { + c.NodeName = "drain_monitor_node" + }) + defer server.Shutdown() + + // Wait for a node to appear + var nodeID string + testutil.WaitForResult(func() (bool, error) { + nodes, _, err := client.Nodes().List(nil) + if err != nil { + return false, err + } + if len(nodes) == 0 { + return false, fmt.Errorf("missing node") + } + nodeID = nodes[0].ID + return true, nil + }, func(err error) { + t.Fatalf("err: %s", err) + }) + + // Register a job to create an alloc to drain + count := 3 + job := &api.Job{ + ID: helper.StringToPtr("mock_service"), + Name: helper.StringToPtr("mock_service"), + Datacenters: []string{"dc1"}, + TaskGroups: []*api.TaskGroup{ + { + Name: helper.StringToPtr("mock_group"), + Count: &count, + Migrate: &api.MigrateStrategy{ + MaxParallel: helper.IntToPtr(1), + HealthCheck: helper.StringToPtr("task_states"), + MinHealthyTime: helper.TimeToPtr(10 * time.Millisecond), + HealthyDeadline: helper.TimeToPtr(5 * time.Minute), + }, + Tasks: []*api.Task{ + { + Name: "mock_task", + Driver: "mock_driver", + Config: map[string]interface{}{ + "run_for": "10m", + }, + }, + }, + }, + }, + } + + _, _, err := client.Jobs().Register(job, nil) + require.Nil(err) + + var allocs []*api.Allocation + testutil.WaitForResult(func() (bool, error) { + allocs, _, err = client.Nodes().Allocations(nodeID, nil) + if err != nil { + return false, err + } + if len(allocs) != count { + return false, fmt.Errorf("number of allocs %d != count (%d)", len(allocs), count) + } + for _, a := range allocs { + if a.ClientStatus != "running" { + return false, fmt.Errorf("alloc %q still not running: %s", a.ID, a.ClientStatus) + } + } + return true, nil + }, func(err error) { + t.Fatalf("err: %v", err) + }) + + ui := new(cli.MockUi) + cmd := &NodeDrainCommand{Meta: Meta{Ui: ui}} + args := []string{"-address=" + url, "-self", "-enable", "-deadline", "1s"} + t.Logf("Running: %v", args) + if code := cmd.Run(args); code != 0 { + t.Fatalf("expected exit 0, got: %d", code) + } + + out := ui.OutputWriter.String() + t.Logf("Output:\n%s", out) + + require.Contains(out, "drain complete") + for _, a := range allocs { + require.Contains(out, fmt.Sprintf("Alloc %q draining", a.ID)) + } +} + func TestNodeDrainCommand_Fails(t *testing.T) { t.Parallel() srv, _, url := testServer(t, false, nil) From 
e669e8213a7f41335c179158b710aba80c26f469 Mon Sep 17 00:00:00 2001 From: Michael Schurter Date: Fri, 16 Mar 2018 10:43:28 -0700 Subject: [PATCH 56/79] Improve drain log messages Also delay "node complete" after the node has been marked complete to capture a few more alloc events. There are other ways to implement this that could trade off correctness for responsiveness as technically a node is considered drained when all of its allocs have been marked to stop and not when they've actually stopped (which may not happen for a long time). --- command/node_drain.go | 39 ++++++++++++++++++++++++++++++++++---- command/node_drain_test.go | 1 + 2 files changed, 36 insertions(+), 4 deletions(-) diff --git a/command/node_drain.go b/command/node_drain.go index b4a2ebad7369..9f170c76e082 100644 --- a/command/node_drain.go +++ b/command/node_drain.go @@ -7,6 +7,7 @@ import ( "github.com/hashicorp/nomad/api" "github.com/hashicorp/nomad/api/contexts" + "github.com/hashicorp/nomad/nomad/structs" "github.com/posener/complete" ) @@ -359,6 +360,8 @@ func monitorDrain(output func(string), nodeClient *api.Nodes, nodeID string, ind // Update local alloc state initial[a.ID] = a + migrating := a.DesiredTransition.ShouldMigrate() + msg := "" switch { case !ok: @@ -370,9 +373,15 @@ func monitorDrain(output func(string), nodeClient *api.Nodes, nodeID string, ind // Alloc status has changed; output msg = fmt.Sprintf("status %s -> %s", orig.ClientStatus, a.ClientStatus) - case !orig.DesiredTransition.ShouldMigrate() && a.DesiredTransition.ShouldMigrate(): - // Alloc marked for migration + case migrating && !orig.DesiredTransition.ShouldMigrate(): + // Alloc was marked for migration + msg = "marked for migration" + case migrating && (orig.DesiredStatus != a.DesiredStatus) && a.DesiredStatus == structs.AllocDesiredStatusStop: + // Alloc has already been marked for migration and is now being stopped msg = "draining" + case a.NextAllocation != "" && orig.NextAllocation == "": + // Alloc has been replaced by another allocation + msg = fmt.Sprintf("replaced by allocation %q", a.NextAllocation) } if msg != "" { @@ -386,14 +395,36 @@ func monitorDrain(output func(string), nodeClient *api.Nodes, nodeID string, ind } }() - for { + done := false + for !done { select { case err := <-errCh: return err case <-nodeCh: - return nil + done = true + case msg := <-allocCh: + output(msg) + } + } + + // Loop on alloc messages for a bit longer as we may have gotten the + // "node done" first (since the watchers run concurrently the events + // may be received out of order) + deadline := 250 * time.Millisecond + timer := time.NewTimer(deadline) + for { + select { + case err := <-errCh: + return err case msg := <-allocCh: output(msg) + if !timer.Stop() { + <-timer.C + } + timer.Reset(deadline) + case <-timer.C: + // No events within deadline, exit + return nil } } } diff --git a/command/node_drain_test.go b/command/node_drain_test.go index 1207047454a3..01c8b12532bd 100644 --- a/command/node_drain_test.go +++ b/command/node_drain_test.go @@ -183,6 +183,7 @@ func TestNodeDrainCommand_Monitor(t *testing.T) { require.Contains(out, "drain complete") for _, a := range allocs { + require.Contains(out, fmt.Sprintf("Alloc %q marked for migration", a.ID)) require.Contains(out, fmt.Sprintf("Alloc %q draining", a.ID)) } } From 0a1f1d2c561a92366c0b3645b8b21392f06d26d7 Mon Sep 17 00:00:00 2001 From: Michael Schurter Date: Thu, 8 Mar 2018 16:08:21 -0800 Subject: [PATCH 57/79] Fix deadline heap triggering Chan must be buffered to avoid skipping triggering 
altogether Also made timing in a test a bit more lenient --- nomad/drainer/drain_heap.go | 34 +++++++++++++++----------------- nomad/drainer/drain_heap_test.go | 4 ++-- 2 files changed, 18 insertions(+), 20 deletions(-) diff --git a/nomad/drainer/drain_heap.go b/nomad/drainer/drain_heap.go index 1a6c23f13cf9..1642b0fdb330 100644 --- a/nomad/drainer/drain_heap.go +++ b/nomad/drainer/drain_heap.go @@ -43,7 +43,7 @@ func NewDeadlineHeap(ctx context.Context, coalesceWindow time.Duration) *deadlin coalesceWindow: coalesceWindow, batch: make(chan []string), nodes: make(map[string]time.Time, 64), - trigger: make(chan struct{}), + trigger: make(chan struct{}, 1), } go d.watch() @@ -51,12 +51,11 @@ func NewDeadlineHeap(ctx context.Context, coalesceWindow time.Duration) *deadlin } func (d *deadlineHeap) watch() { - timer := time.NewTimer(0 * time.Millisecond) - if !timer.Stop() { - select { - case <-timer.C: - default: - } + timer := time.NewTimer(0) + timer.Stop() + select { + case <-timer.C: + default: } var nextDeadline time.Time @@ -71,8 +70,9 @@ func (d *deadlineHeap) watch() { continue } - d.mu.Lock() var batch []string + + d.mu.Lock() for nodeID, nodeDeadline := range d.nodes { if !nodeDeadline.After(nextDeadline) { batch = append(batch, nodeID) @@ -81,21 +81,19 @@ func (d *deadlineHeap) watch() { } d.mu.Unlock() - // If there is nothing exit early - if len(batch) == 0 { - goto CALC + if len(batch) > 0 { + // Send the batch + select { + case d.batch <- batch: + case <-d.ctx.Done(): + return + } } - // Send the batch - select { - case d.batch <- batch: - case <-d.ctx.Done(): - return - } case <-d.trigger: } - CALC: + // Calculate the next deadline deadline, ok := d.calculateNextDeadline() if !ok { continue diff --git a/nomad/drainer/drain_heap_test.go b/nomad/drainer/drain_heap_test.go index 147ad9192eff..02108e1dfa0e 100644 --- a/nomad/drainer/drain_heap_test.go +++ b/nomad/drainer/drain_heap_test.go @@ -95,7 +95,7 @@ func TestDeadlineHeap_MultiwatchAndDelete(t *testing.T) { func TestDeadlineHeap_WatchCoalesce(t *testing.T) { t.Parallel() require := require.New(t) - h := NewDeadlineHeap(context.Background(), 250*time.Millisecond) + h := NewDeadlineHeap(context.Background(), 100*time.Millisecond) now := time.Now() @@ -107,7 +107,7 @@ func TestDeadlineHeap_WatchCoalesce(t *testing.T) { } group2 := map[string]time.Time{ - "10": now.Add(355 * time.Millisecond), + "10": now.Add(350 * time.Millisecond), "11": now.Add(360 * time.Millisecond), } From 8217ebf11e2b7585bef61b142550fa9bd3dfcb0e Mon Sep 17 00:00:00 2001 From: Michael Schurter Date: Fri, 9 Mar 2018 16:25:46 -0800 Subject: [PATCH 58/79] drainer: RegisterJob -> RegisterJobs Test job watcher --- nomad/drainer/drainer.go | 1 - nomad/drainer/watch_jobs.go | 38 +- nomad/drainer/watch_jobs_test.go | 714 +++++++++++++++++++++---------- nomad/drainer/watch_nodes.go | 4 +- nomad/structs/structs.go | 4 +- 5 files changed, 505 insertions(+), 256 deletions(-) diff --git a/nomad/drainer/drainer.go b/nomad/drainer/drainer.go index 98c52479a865..46dcad696d4c 100644 --- a/nomad/drainer/drainer.go +++ b/nomad/drainer/drainer.go @@ -332,7 +332,6 @@ func (n *NodeDrainer) batchDrainAllocs(allocs []*structs.Allocation) (uint64, er } n.batcher.Unlock() - // Wait for the future if err := future.Wait(); err != nil { return 0, err } diff --git a/nomad/drainer/watch_jobs.go b/nomad/drainer/watch_jobs.go index 714bac2b7e53..61a615646019 100644 --- a/nomad/drainer/watch_jobs.go +++ b/nomad/drainer/watch_jobs.go @@ -29,7 +29,7 @@ func NewDrainRequest(allocs 
[]*structs.Allocation) *DrainRequest { // DrainingJobWatcher is the interface for watching a job drain type DrainingJobWatcher interface { // RegisterJob is used to start watching a draining job - RegisterJob(job structs.JobNs) + RegisterJobs(job []structs.JobNs) // Drain is used to emit allocations that should be drained. Drain() <-chan *DrainRequest @@ -90,21 +90,28 @@ func NewDrainingJobWatcher(ctx context.Context, limiter *rate.Limiter, state *st } // RegisterJob marks the given job as draining and adds it to being watched. -func (w *drainingJobWatcher) RegisterJob(job structs.JobNs) { +func (w *drainingJobWatcher) RegisterJobs(jobs []structs.JobNs) { w.l.Lock() defer w.l.Unlock() - if _, ok := w.jobs[job]; ok { - return + updated := false + for _, jns := range jobs { + if _, ok := w.jobs[jns]; ok { + continue + } + + // Add the job and cancel the context + w.logger.Printf("[TRACE] nomad.drain.job_watcher: registering job %v", jns) + w.jobs[jns] = struct{}{} + updated = true } - // Add the job and cancel the context - w.logger.Printf("[TRACE] nomad.drain.job_watcher: registering job %v", job) - w.jobs[job] = struct{}{} - w.queryCancel() + if updated { + w.queryCancel() - // Create a new query context - w.queryCtx, w.queryCancel = context.WithCancel(w.ctx) + // Create a new query context + w.queryCtx, w.queryCancel = context.WithCancel(w.ctx) + } } // Drain returns the channel that emits allocations to drain. @@ -160,7 +167,6 @@ func (w *drainingJobWatcher) watch() { } } - // update index for next run lastHandled := waitIndex waitIndex = index @@ -184,7 +190,7 @@ func (w *drainingJobWatcher) watch() { // Lookup the job job, err := w.state.JobByID(nil, jns.Namespace, jns.ID) - if err != nil { + if err != nil || job == nil { w.logger.Printf("[WARN] nomad.drain.job_watcher: failed to lookup job %v: %v", jns, err) continue } @@ -268,7 +274,8 @@ type jobResult struct { done bool } -// newJobResult returns an initialized jobResult +// newJobResult returns a jobResult with done=true. It is the responsibility of +// callers to set done=false when a remaining drainable alloc is found. func newJobResult() *jobResult { return &jobResult{ done: true, @@ -390,10 +397,13 @@ func handleTaskGroup(snap *state.StateSnapshot, tg *structs.TaskGroup, numToDrain := healthy - thresholdCount numToDrain = helper.IntMin(len(drainable), numToDrain) if numToDrain <= 0 { - fmt.Printf("------- Not draining any allocs\n") + fmt.Printf("------- Not draining any allocs: drainable:%d healthy:%d thresholdCount:%d\n", + len(drainable), healthy, thresholdCount) return nil } + fmt.Printf("------- DRAINing allocs: n: %d drainable:%d healthy:%d thresholdCount:%d\n", + numToDrain, len(drainable), healthy, thresholdCount) result.drain = append(result.drain, drainable[0:numToDrain]...) 
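// Editor's note, not part of the patch: a worked example of the calculation
// above, assuming thresholdCount is the task group count minus
// migrate.max_parallel (consistent with the DrainJobs test below, which uses
// count=8 and max_parallel=3):
//
//	healthy        = 8           // every alloc healthy, all on the draining node
//	thresholdCount = 8 - 3 = 5   // healthy allocs that must keep running
//	numToDrain     = 8 - 5 = 3   // then capped by min(len(drainable), 3) = 3
//
// so each pass drains at most max_parallel allocs and waits for replacements
// to become healthy before draining more.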
return nil } diff --git a/nomad/drainer/watch_jobs_test.go b/nomad/drainer/watch_jobs_test.go index 3db5ea0ac4b8..078e5316ec32 100644 --- a/nomad/drainer/watch_jobs_test.go +++ b/nomad/drainer/watch_jobs_test.go @@ -2,7 +2,6 @@ package drainer import ( "context" - "fmt" "testing" "time" @@ -11,309 +10,552 @@ import ( "github.com/hashicorp/nomad/nomad/mock" "github.com/hashicorp/nomad/nomad/state" "github.com/hashicorp/nomad/nomad/structs" + "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" "golang.org/x/time/rate" ) -func testDrainingJobWatcher(t *testing.T) (*drainingJobWatcher, *state.StateStore) { +func testNodes(t *testing.T, state *state.StateStore) (drainingNode, runningNode *structs.Node) { + n1 := mock.Node() + n1.Name = "draining" + n1.DrainStrategy = &structs.DrainStrategy{ + DrainSpec: structs.DrainSpec{ + Deadline: time.Minute, + }, + ForceDeadline: time.Now().Add(time.Minute), + } + require.Nil(t, state.UpsertNode(100, n1)) + + // Create a non-draining node + n2 := mock.Node() + n2.Name = "running" + require.Nil(t, state.UpsertNode(101, n2)) + return n1, n2 +} + +func testDrainingJobWatcher(t *testing.T, state *state.StateStore) (*drainingJobWatcher, context.CancelFunc) { t.Helper() - state := state.TestStateStore(t) limiter := rate.NewLimiter(100.0, 100) logger := testlog.Logger(t) - w := NewDrainingJobWatcher(context.Background(), limiter, state, logger) - return w, state + ctx, cancel := context.WithCancel(context.Background()) + w := NewDrainingJobWatcher(ctx, limiter, state, logger) + return w, cancel } +// TestDrainingJobWatcher_Interface is a compile-time assertion that we +// implement the intended interface. func TestDrainingJobWatcher_Interface(t *testing.T) { - t.Parallel() - require := require.New(t) - w, _ := testDrainingJobWatcher(t) - require.Implements((*DrainingJobWatcher)(nil), w) + w, cancel := testDrainingJobWatcher(t, state.TestStateStore(t)) + cancel() + var _ DrainingJobWatcher = w } -// DrainingJobWatcher tests: -// TODO Test that several jobs allocation changes get batched -// TODO Test that jobs are deregistered when they have no more to migrate -// TODO Test that the watcher gets triggered on alloc changes -// TODO Test that the watcher cancels its query when a new job is registered - -func TestHandleTaskGroup_AllDone(t *testing.T) { +// TestDrainingJobWatcher_DrainJobs asserts DrainingJobWatcher batches +// allocation changes from multiple jobs. 
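Editor's note, not part of the patch: a compressed sketch of the producer/consumer contract this test exercises, using only identifiers introduced in this series (RegisterJobs, Drain, Migrated, DrainRequest). jobWatcher, job, and index are assumed to be in scope as they are in the test; the loop is illustrative rather than a complete driver.

// Hedged sketch of how a consumer services the watcher.
jobWatcher.RegisterJobs([]structs.JobNs{structs.NewJobNs(job.Namespace, job.ID)})
for {
	select {
	case req := <-jobWatcher.Drain():
		// Persist DesiredTransition{Migrate: true} for req.Allocs via Raft,
		// then acknowledge with the index that write committed at.
		req.Resp.Respond(index, nil)
	case migrated := <-jobWatcher.Migrated():
		// Allocations that have left the draining node; once a job has no
		// drainable allocs remaining the watcher stops tracking it.
		_ = migrated
	}
}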
+func TestDrainingJobWatcher_DrainJobs(t *testing.T) { t.Parallel() require := require.New(t) - // Create a non-draining node state := state.TestStateStore(t) - n := mock.Node() - require.Nil(state.UpsertNode(100, n)) + jobWatcher, cancelWatcher := testDrainingJobWatcher(t, state) + defer cancelWatcher() + drainingNode, runningNode := testNodes(t, state) - job := mock.Job() - require.Nil(state.UpsertJob(101, job)) + var index uint64 = 101 + count := 8 - // Create 10 running allocs on the healthy node - var allocs []*structs.Allocation - for i := 0; i < 10; i++ { + newAlloc := func(node *structs.Node, job *structs.Job) *structs.Allocation { a := mock.Alloc() + a.JobID = job.ID a.Job = job a.TaskGroup = job.TaskGroups[0].Name - a.NodeID = n.ID - a.DeploymentStatus = &structs.AllocDeploymentStatus{ - Healthy: helper.BoolToPtr(false), + a.NodeID = node.ID + return a + } + + // 2 jobs with count 10, max parallel 3 + jnss := make([]structs.JobNs, 2) + jobs := make([]*structs.Job, 2) + for i := 0; i < 2; i++ { + job := mock.Job() + jobs[i] = job + jnss[i] = structs.NewJobNs(job.Namespace, job.ID) + job.TaskGroups[0].Migrate.MaxParallel = 3 + job.TaskGroups[0].Count = count + require.Nil(state.UpsertJob(index, job)) + index++ + + var allocs []*structs.Allocation + for i := 0; i < count; i++ { + a := newAlloc(drainingNode, job) + a.DeploymentStatus = &structs.AllocDeploymentStatus{ + Healthy: helper.BoolToPtr(true), + } + allocs = append(allocs, a) } - allocs = append(allocs, a) + + require.Nil(state.UpsertAllocs(index, allocs)) + index++ + } - require.Nil(state.UpsertAllocs(102, allocs)) - snap, err := state.Snapshot() - require.Nil(err) + // Only register jobs with watcher after creating all data models as + // once the watcher starts we need to track the index carefully for + // updating the batch future + jobWatcher.RegisterJobs(jnss) + + // assertOps asserts how many allocs should be drained and migrated. + // The drains and migrations - if any - are returned. 
+ assertOps := func(drained, migrated int) (drains *DrainRequest, migrations []*structs.Allocation) { + t.Helper() + var drainsChecked, migrationsChecked bool + for { + select { + case drains = <-jobWatcher.Drain(): + ids := make([]string, len(drains.Allocs)) + for i, a := range drains.Allocs { + ids[i] = a.JobID[:6] + ":" + a.ID[:6] + } + t.Logf("draining %d allocs: %v", len(ids), ids) + require.False(drainsChecked, "drains already received") + drainsChecked = true + require.Lenf(drains.Allocs, drained, + "expected %d drains but found %d", drained, len(drains.Allocs)) + case migrations = <-jobWatcher.Migrated(): + ids := make([]string, len(migrations)) + for i, a := range migrations { + ids[i] = a.JobID[:6] + ":" + a.ID[:6] + } + t.Logf("migrating %d allocs: %v", len(ids), ids) + require.False(migrationsChecked, "migrations already received") + migrationsChecked = true + require.Lenf(migrations, migrated, + "expected %d migrations but found %d", migrated, len(migrations)) + case <-time.After(10 * time.Millisecond): + if !drainsChecked && drained > 0 { + t.Fatalf("expected %d drains but none happened", drained) + } + if !migrationsChecked && migrated > 0 { + t.Fatalf("expected %d migrations but none happened", migrated) + } + return drains, migrations + } + } + } - res := &jobResult{} - require.Nil(handleTaskGroup(snap, job.TaskGroups[0], allocs, 101, res)) - require.Empty(res.drain) - require.Empty(res.migrated) - require.True(res.done) -} + // Expect a first batch of MaxParallel allocs from each job + drains, _ := assertOps(6, 0) -func TestHandleTaskGroup_AllOnDrainingNodes(t *testing.T) { - t.Parallel() - require := require.New(t) + // Fake migrating the drained allocs by starting new ones and stopping + // the old ones + drainedAllocs := make([]*structs.Allocation, len(drains.Allocs)) + for i, a := range drains.Allocs { + a.DesiredTransition.Migrate = helper.BoolToPtr(true) - // The loop value sets the max parallel for the drain strategy - for i := 1; i < 8; i++ { - // Create a draining node - state := state.TestStateStore(t) - n := mock.Node() - n.DrainStrategy = &structs.DrainStrategy{ - DrainSpec: structs.DrainSpec{ - Deadline: 5 * time.Minute, - }, - ForceDeadline: time.Now().Add(1 * time.Minute), + // create a copy so we can reuse this slice + drainedAllocs[i] = a.Copy() + } + require.Nil(state.UpsertAllocs(index, drainedAllocs)) + drains.Resp.Respond(index, nil) + index++ + + // Just setting ShouldMigrate should not cause any further drains + assertOps(0, 0) + + // Proceed our fake migration along by creating new allocs and stopping + // old ones + replacements := make([]*structs.Allocation, len(drainedAllocs)) + updates := make([]*structs.Allocation, 0, len(drainedAllocs)*2) + for i, a := range drainedAllocs { + // Stop drained allocs + a.DesiredTransition.Migrate = nil + a.DesiredStatus = structs.AllocDesiredStatusStop + + // Create a replacement + replacement := mock.Alloc() + replacement.JobID = a.Job.ID + replacement.Job = a.Job + replacement.TaskGroup = a.TaskGroup + replacement.NodeID = runningNode.ID + // start in pending state with no health status + + updates = append(updates, a, replacement) + replacements[i] = replacement.Copy() + } + require.Nil(state.UpsertAllocs(index, updates)) + index++ + + // The drained allocs stopping cause migrations but no new drains + // because the replacements have not started + assertOps(0, 6) + + // Finally kickoff further drain activity by "starting" replacements + for _, a := range replacements { + a.ClientStatus = 
structs.AllocClientStatusRunning + a.DeploymentStatus = &structs.AllocDeploymentStatus{ + Healthy: helper.BoolToPtr(true), } - require.Nil(state.UpsertNode(100, n)) + } + require.Nil(state.UpsertAllocs(index, replacements)) + index++ - job := mock.Job() - job.TaskGroups[0].Migrate.MaxParallel = i - require.Nil(state.UpsertJob(101, job)) + require.NotEmpty(jobWatcher.drainingJobs()) - // Create 10 running allocs on the draining node - var allocs []*structs.Allocation - for i := 0; i < 10; i++ { - a := mock.Alloc() - a.Job = job - a.TaskGroup = job.TaskGroups[0].Name - a.NodeID = n.ID - a.DeploymentStatus = &structs.AllocDeploymentStatus{ - Healthy: helper.BoolToPtr(false), - } - allocs = append(allocs, a) + // 6 new drains + drains, _ = assertOps(6, 0) + + // Fake migrations once more to finish the drain + drainedAllocs = make([]*structs.Allocation, len(drains.Allocs)) + for i, a := range drains.Allocs { + a.DesiredTransition.Migrate = helper.BoolToPtr(true) + + // create a copy so we can reuse this slice + drainedAllocs[i] = a.Copy() + } + require.Nil(state.UpsertAllocs(index, drainedAllocs)) + drains.Resp.Respond(index, nil) + index++ + + assertOps(0, 0) + + replacements = make([]*structs.Allocation, len(drainedAllocs)) + updates = make([]*structs.Allocation, 0, len(drainedAllocs)*2) + for i, a := range drainedAllocs { + a.DesiredTransition.Migrate = nil + a.DesiredStatus = structs.AllocDesiredStatusStop + + replacement := newAlloc(runningNode, a.Job) + updates = append(updates, a, replacement) + replacements[i] = replacement.Copy() + } + require.Nil(state.UpsertAllocs(index, updates)) + index++ + + assertOps(0, 6) + + for _, a := range replacements { + a.ClientStatus = structs.AllocClientStatusRunning + a.DeploymentStatus = &structs.AllocDeploymentStatus{ + Healthy: helper.BoolToPtr(true), } - require.Nil(state.UpsertAllocs(102, allocs)) + } + require.Nil(state.UpsertAllocs(index, replacements)) + index++ + + require.NotEmpty(jobWatcher.drainingJobs()) - snap, err := state.Snapshot() - require.Nil(err) + // Final 4 new drains + drains, _ = assertOps(4, 0) - res := &jobResult{} - require.Nil(handleTaskGroup(snap, job.TaskGroups[0], allocs, 101, res)) - require.Len(res.drain, i) - require.Empty(res.migrated) - require.False(res.done) + // Fake migrations once more to finish the drain + drainedAllocs = make([]*structs.Allocation, len(drains.Allocs)) + for i, a := range drains.Allocs { + a.DesiredTransition.Migrate = helper.BoolToPtr(true) + + // create a copy so we can reuse this slice + drainedAllocs[i] = a.Copy() } + require.Nil(state.UpsertAllocs(index, drainedAllocs)) + drains.Resp.Respond(index, nil) + index++ + + assertOps(0, 0) + + replacements = make([]*structs.Allocation, len(drainedAllocs)) + updates = make([]*structs.Allocation, 0, len(drainedAllocs)*2) + for i, a := range drainedAllocs { + a.DesiredTransition.Migrate = nil + a.DesiredStatus = structs.AllocDesiredStatusStop + + replacement := newAlloc(runningNode, a.Job) + updates = append(updates, a, replacement) + replacements[i] = replacement.Copy() + } + require.Nil(state.UpsertAllocs(index, updates)) + index++ + + assertOps(0, 4) + + for _, a := range replacements { + a.ClientStatus = structs.AllocClientStatusRunning + a.DeploymentStatus = &structs.AllocDeploymentStatus{ + Healthy: helper.BoolToPtr(true), + } + } + require.Nil(state.UpsertAllocs(index, replacements)) + index++ + + // No jobs should be left! 
+ require.Empty(jobWatcher.drainingJobs()) +} + +// DrainingJobWatcher tests: +// TODO Test that the watcher cancels its query when a new job is registered + +// handleTaskGroupTestCase is the test case struct for TestHandleTaskGroup +// +// Two nodes will be initialized: one draining and one running. +type handleTaskGroupTestCase struct { + // Name of test + Name string + + // Expectations + ExpectedDrained int + ExpectedMigrated int + ExpectedDone bool + + // Count overrides the default count of 10 if set + Count int + + // MaxParallel overrides the default max_parallel of 1 if set + MaxParallel int + + // AddAlloc will be called 10 times to create test allocs + // + // Allocs default to be healthy on the draining node + AddAlloc func(i int, a *structs.Allocation, drainingID, runningID string) } -func TestHandleTaskGroup_MixedHealth(t *testing.T) { - cases := []struct { - maxParallel int - drainingNodeAllocs int - healthSet int - healthUnset int - expectedDrain int - expectedMigrated int - expectedDone bool - }{ +func TestHandeTaskGroup_Table(t *testing.T) { + cases := []handleTaskGroupTestCase{ { - maxParallel: 2, - drainingNodeAllocs: 10, - healthSet: 0, - healthUnset: 0, - expectedDrain: 2, - expectedMigrated: 0, - expectedDone: false, + // All allocs on draining node + Name: "AllDraining", + ExpectedDrained: 1, + ExpectedMigrated: 0, + ExpectedDone: false, }, { - maxParallel: 2, - drainingNodeAllocs: 9, - healthSet: 0, - healthUnset: 0, - expectedDrain: 1, - expectedMigrated: 1, - expectedDone: false, + // All allocs on non-draining node + Name: "AllNonDraining", + ExpectedDrained: 0, + ExpectedMigrated: 0, + ExpectedDone: true, + AddAlloc: func(i int, a *structs.Allocation, drainingID, runningID string) { + a.NodeID = runningID + }, }, { - maxParallel: 5, - drainingNodeAllocs: 9, - healthSet: 0, - healthUnset: 0, - expectedDrain: 4, - expectedMigrated: 1, - expectedDone: false, + // Some allocs on non-draining node but not healthy + Name: "SomeNonDrainingUnhealthy", + ExpectedDrained: 0, + ExpectedMigrated: 0, + ExpectedDone: false, + AddAlloc: func(i int, a *structs.Allocation, drainingID, runningID string) { + if i%2 == 0 { + a.NodeID = runningID + a.DeploymentStatus = nil + } + }, }, { - maxParallel: 2, - drainingNodeAllocs: 5, - healthSet: 2, - healthUnset: 0, - expectedDrain: 0, - expectedMigrated: 5, - expectedDone: false, + // One draining, other allocs on non-draining node and healthy + Name: "OneDraining", + ExpectedDrained: 1, + ExpectedMigrated: 0, + ExpectedDone: false, + AddAlloc: func(i int, a *structs.Allocation, drainingID, runningID string) { + if i != 0 { + a.NodeID = runningID + } + }, }, { - maxParallel: 2, - drainingNodeAllocs: 5, - healthSet: 3, - healthUnset: 0, - expectedDrain: 0, - expectedMigrated: 5, - expectedDone: false, + // One already draining, other allocs on non-draining node and healthy + Name: "OneAlreadyDraining", + ExpectedDrained: 0, + ExpectedMigrated: 0, + ExpectedDone: false, + AddAlloc: func(i int, a *structs.Allocation, drainingID, runningID string) { + if i == 0 { + a.DesiredTransition.Migrate = helper.BoolToPtr(true) + return + } + a.NodeID = runningID + }, }, { - maxParallel: 2, - drainingNodeAllocs: 5, - healthSet: 4, - healthUnset: 0, - expectedDrain: 1, - expectedMigrated: 5, - expectedDone: false, + // One already drained, other allocs on non-draining node and healthy + Name: "OneAlreadyDrained", + ExpectedDrained: 0, + ExpectedMigrated: 1, + ExpectedDone: true, + AddAlloc: func(i int, a *structs.Allocation, drainingID, runningID 
string) { + if i == 0 { + a.DesiredStatus = structs.AllocDesiredStatusStop + return + } + a.NodeID = runningID + }, }, { - maxParallel: 2, - drainingNodeAllocs: 5, - healthSet: 4, - healthUnset: 1, - expectedDrain: 1, - expectedMigrated: 5, - expectedDone: false, + // All allocs are terminl, nothing to be drained + Name: "AllMigrating", + ExpectedDrained: 0, + ExpectedMigrated: 10, + ExpectedDone: true, + AddAlloc: func(i int, a *structs.Allocation, drainingID, runningID string) { + a.DesiredStatus = structs.AllocDesiredStatusStop + }, }, { - maxParallel: 1, - drainingNodeAllocs: 5, - healthSet: 4, - healthUnset: 1, - expectedDrain: 0, - expectedMigrated: 5, - expectedDone: false, + // All allocs may be drained at once + Name: "AllAtOnce", + ExpectedDrained: 10, + ExpectedMigrated: 0, + ExpectedDone: false, + MaxParallel: 10, }, { - maxParallel: 3, - drainingNodeAllocs: 5, - healthSet: 3, - healthUnset: 0, - expectedDrain: 1, - expectedMigrated: 5, - expectedDone: false, + // Drain 2 + Name: "Drain2", + ExpectedDrained: 2, + ExpectedMigrated: 0, + ExpectedDone: false, + MaxParallel: 2, }, { - maxParallel: 3, - drainingNodeAllocs: 0, - healthSet: 10, - healthUnset: 0, - expectedDrain: 0, - expectedMigrated: 10, - expectedDone: true, + // One on new node, one drained, and one draining + ExpectedDrained: 1, + ExpectedMigrated: 1, + MaxParallel: 2, + AddAlloc: func(i int, a *structs.Allocation, drainingID, runningID string) { + switch i { + case 0: + // One alloc on running node + a.NodeID = runningID + case 1: + // One alloc already migrated + a.DesiredStatus = structs.AllocDesiredStatusStop + } + }, }, { - // Is the case where deadline is hit and all 10 are just marked - // stopped. We should detect the job as done. - maxParallel: 3, - drainingNodeAllocs: 0, - healthSet: 0, - healthUnset: 0, - expectedDrain: 0, - expectedMigrated: 10, - expectedDone: true, + // 8 on new node, one drained, and one draining + ExpectedDrained: 1, + ExpectedMigrated: 1, + MaxParallel: 2, + AddAlloc: func(i int, a *structs.Allocation, drainingID, runningID string) { + switch i { + case 0, 1, 2, 3, 4, 5, 6, 7: + a.NodeID = runningID + case 8: + a.DesiredStatus = structs.AllocDesiredStatusStop + } + }, + }, + { + // 5 on new node, two drained, and three draining + ExpectedDrained: 3, + ExpectedMigrated: 2, + MaxParallel: 5, + AddAlloc: func(i int, a *structs.Allocation, drainingID, runningID string) { + switch i { + case 0, 1, 2, 3, 4: + a.NodeID = runningID + case 8, 9: + a.DesiredStatus = structs.AllocDesiredStatusStop + } + }, + }, + { + // Not all on new node have health set + Name: "PendingHealth", + ExpectedDrained: 1, + ExpectedMigrated: 1, + MaxParallel: 3, + AddAlloc: func(i int, a *structs.Allocation, drainingID, runningID string) { + switch i { + case 0: + // Deployment status UNset for 1 on new node + a.NodeID = runningID + a.DeploymentStatus = nil + case 1, 2, 3, 4: + // Deployment status set for 4 on new node + a.NodeID = runningID + case 9: + a.DesiredStatus = structs.AllocDesiredStatusStop + } + }, + }, + { + // 5 max parallel - 1 migrating - 2 with unset health = 2 drainable + Name: "PendingHealthHigherMax", + ExpectedDrained: 2, + ExpectedMigrated: 1, + MaxParallel: 5, + AddAlloc: func(i int, a *structs.Allocation, drainingID, runningID string) { + switch i { + case 0, 1: + // Deployment status UNset for 2 on new node + a.NodeID = runningID + a.DeploymentStatus = nil + case 2, 3, 4: + // Deployment status set for 3 on new node + a.NodeID = runningID + case 9: + a.DesiredStatus = 
structs.AllocDesiredStatusStop + } + }, }, } - for cnum, c := range cases { - t.Run(fmt.Sprintf("%d", cnum), func(t *testing.T) { - require := require.New(t) + for _, testCase := range cases { + t.Run(testCase.Name, func(t *testing.T) { + testHandleTaskGroup(t, testCase) + }) + } +} - // Create a draining node - state := state.TestStateStore(t) +func testHandleTaskGroup(t *testing.T, tc handleTaskGroupTestCase) { + t.Parallel() + require := require.New(t) + assert := assert.New(t) - drainingNode := mock.Node() - drainingNode.DrainStrategy = &structs.DrainStrategy{ - DrainSpec: structs.DrainSpec{ - Deadline: 5 * time.Minute, - }, - ForceDeadline: time.Now().Add(1 * time.Minute), - } - require.Nil(state.UpsertNode(100, drainingNode)) - - healthyNode := mock.Node() - require.Nil(state.UpsertNode(101, healthyNode)) - - job := mock.Job() - job.TaskGroups[0].Migrate.MaxParallel = c.maxParallel - require.Nil(state.UpsertJob(101, job)) - - // Create running allocs on the draining node with health set - var allocs []*structs.Allocation - for i := 0; i < c.drainingNodeAllocs; i++ { - a := mock.Alloc() - a.Job = job - a.TaskGroup = job.TaskGroups[0].Name - a.NodeID = drainingNode.ID - a.DeploymentStatus = &structs.AllocDeploymentStatus{ - Healthy: helper.BoolToPtr(false), - } - allocs = append(allocs, a) - } + // Create nodes + state := state.TestStateStore(t) + drainingNode, runningNode := testNodes(t, state) - // Create stopped allocs on the draining node - for i := 10 - c.drainingNodeAllocs; i > 0; i-- { - a := mock.Alloc() - a.Job = job - a.TaskGroup = job.TaskGroups[0].Name - a.NodeID = drainingNode.ID - a.DeploymentStatus = &structs.AllocDeploymentStatus{ - Healthy: helper.BoolToPtr(false), - } - a.DesiredStatus = structs.AllocDesiredStatusStop - allocs = append(allocs, a) - } + job := mock.Job() + job.TaskGroups[0].Count = 10 + if tc.Count > 0 { + job.TaskGroups[0].Count = tc.Count + } + if tc.MaxParallel > 0 { + job.TaskGroups[0].Migrate.MaxParallel = tc.MaxParallel + } + require.Nil(state.UpsertJob(102, job)) - // Create allocs on the healthy node with health set - for i := 0; i < c.healthSet; i++ { - a := mock.Alloc() - a.Job = job - a.TaskGroup = job.TaskGroups[0].Name - a.NodeID = healthyNode.ID - a.DeploymentStatus = &structs.AllocDeploymentStatus{ - Healthy: helper.BoolToPtr(false), - } - allocs = append(allocs, a) - } + var allocs []*structs.Allocation + for i := 0; i < 10; i++ { + a := mock.Alloc() + a.JobID = job.ID + a.Job = job + a.TaskGroup = job.TaskGroups[0].Name - // Create allocs on the healthy node with health not set - for i := 0; i < c.healthUnset; i++ { - a := mock.Alloc() - a.Job = job - a.TaskGroup = job.TaskGroups[0].Name - a.NodeID = healthyNode.ID - allocs = append(allocs, a) - } - require.Nil(state.UpsertAllocs(103, allocs)) + // Default to being healthy on the draining node + a.NodeID = drainingNode.ID + a.DeploymentStatus = &structs.AllocDeploymentStatus{ + Healthy: helper.BoolToPtr(true), + } + if tc.AddAlloc != nil { + tc.AddAlloc(i, a, drainingNode.ID, runningNode.ID) + } + allocs = append(allocs, a) + } - snap, err := state.Snapshot() - require.Nil(err) + require.Nil(state.UpsertAllocs(103, allocs)) + snap, err := state.Snapshot() + require.Nil(err) - res := &jobResult{} - require.Nil(handleTaskGroup(snap, job.TaskGroups[0], allocs, 101, res)) - require.Len(res.drain, c.expectedDrain) - require.Len(res.migrated, c.expectedMigrated) - require.Equal(c.expectedDone, res.done) - }) - } + res := newJobResult() + require.Nil(handleTaskGroup(snap, job.TaskGroups[0], 
allocs, 102, res)) + assert.Lenf(res.drain, tc.ExpectedDrained, "Drain expected %d but found: %d", + tc.ExpectedDrained, len(res.drain)) + assert.Lenf(res.migrated, tc.ExpectedMigrated, "Migrate expected %d but found: %d", + tc.ExpectedMigrated, len(res.migrated)) + assert.Equal(tc.ExpectedDone, res.done) } func TestHandleTaskGroup_Migrations(t *testing.T) { diff --git a/nomad/drainer/watch_nodes.go b/nomad/drainer/watch_nodes.go index ed99fb6938c5..97c6cf8b24ce 100644 --- a/nomad/drainer/watch_nodes.go +++ b/nomad/drainer/watch_nodes.go @@ -74,9 +74,7 @@ func (n *NodeDrainer) Update(node *structs.Node) { return } n.logger.Printf("[TRACE] nomad.drain: node %q has %d services on it", node.ID, len(jobs)) - for _, job := range jobs { - n.jobWatcher.RegisterJob(job) - } + n.jobWatcher.RegisterJobs(jobs) // TODO Test at this layer as well that a node drain on a node without // allocs immediately gets unmarked as draining diff --git a/nomad/structs/structs.go b/nomad/structs/structs.go index 72f2c0a31948..fa16284abf1d 100644 --- a/nomad/structs/structs.go +++ b/nomad/structs/structs.go @@ -1794,13 +1794,13 @@ func (n *NetworkResource) PortLabels() map[string]int { // JobNs is a Job.ID and Namespace tuple type JobNs struct { - ID, Namespace string + Namespace, ID string } func NewJobNs(namespace, id string) JobNs { return JobNs{ - ID: id, Namespace: namespace, + ID: id, } } From 74dc8fd46076dbc6e351b6fb472a481289e1da11 Mon Sep 17 00:00:00 2001 From: Michael Schurter Date: Mon, 19 Mar 2018 10:12:12 -0700 Subject: [PATCH 59/79] JobNs -> NamespacedID Also drop the New func as it's easy to swap the order of arguments since they're both strings. --- nomad/drainer/draining_node.go | 8 ++++---- nomad/drainer/watch_jobs.go | 20 ++++++++++---------- nomad/drainer/watch_jobs_test.go | 4 ++-- nomad/structs/structs.go | 20 ++++---------------- 4 files changed, 20 insertions(+), 32 deletions(-) diff --git a/nomad/drainer/draining_node.go b/nomad/drainer/draining_node.go index 078399f049f9..af5c094b8089 100644 --- a/nomad/drainer/draining_node.go +++ b/nomad/drainer/draining_node.go @@ -125,7 +125,7 @@ func (n *drainingNode) DeadlineAllocs() ([]*structs.Allocation, error) { } // RunningServices returns the set of jobs on the node -func (n *drainingNode) RunningServices() ([]structs.JobNs, error) { +func (n *drainingNode) RunningServices() ([]structs.NamespacedID, error) { n.l.RLock() defer n.l.RUnlock() @@ -135,14 +135,14 @@ func (n *drainingNode) RunningServices() ([]structs.JobNs, error) { return nil, err } - jobIDs := make(map[structs.JobNs]struct{}) - var jobs []structs.JobNs + jobIDs := make(map[structs.NamespacedID]struct{}) + var jobs []structs.NamespacedID for _, alloc := range allocs { if alloc.TerminalStatus() || alloc.Job.Type != structs.JobTypeService { continue } - jns := structs.NewJobNs(alloc.Namespace, alloc.JobID) + jns := structs.NamespacedID{Namespace: alloc.Namespace, ID: alloc.JobID} if _, ok := jobIDs[jns]; ok { continue } diff --git a/nomad/drainer/watch_jobs.go b/nomad/drainer/watch_jobs.go index 61a615646019..181871b204db 100644 --- a/nomad/drainer/watch_jobs.go +++ b/nomad/drainer/watch_jobs.go @@ -29,7 +29,7 @@ func NewDrainRequest(allocs []*structs.Allocation) *DrainRequest { // DrainingJobWatcher is the interface for watching a job drain type DrainingJobWatcher interface { // RegisterJob is used to start watching a draining job - RegisterJobs(job []structs.JobNs) + RegisterJobs(job []structs.NamespacedID) // Drain is used to emit allocations that should be drained. 
Drain() <-chan *DrainRequest @@ -52,7 +52,7 @@ type drainingJobWatcher struct { limiter *rate.Limiter // jobs is the set of tracked jobs. - jobs map[structs.JobNs]struct{} + jobs map[structs.NamespacedID]struct{} // queryCtx is used to cancel a blocking query. queryCtx context.Context @@ -80,7 +80,7 @@ func NewDrainingJobWatcher(ctx context.Context, limiter *rate.Limiter, state *st limiter: limiter, logger: logger, state: state, - jobs: make(map[structs.JobNs]struct{}, 64), + jobs: make(map[structs.NamespacedID]struct{}, 64), drainCh: make(chan *DrainRequest), migratedCh: make(chan []*structs.Allocation), } @@ -90,7 +90,7 @@ func NewDrainingJobWatcher(ctx context.Context, limiter *rate.Limiter, state *st } // RegisterJob marks the given job as draining and adds it to being watched. -func (w *drainingJobWatcher) RegisterJobs(jobs []structs.JobNs) { +func (w *drainingJobWatcher) RegisterJobs(jobs []structs.NamespacedID) { w.l.Lock() defer w.l.Unlock() @@ -129,7 +129,7 @@ func (w *drainingJobWatcher) Migrated() <-chan []*structs.Allocation { func (w *drainingJobWatcher) deregisterJob(jobID, namespace string) { w.l.Lock() defer w.l.Unlock() - jns := structs.JobNs{ + jns := structs.NamespacedID{ ID: jobID, Namespace: namespace, } @@ -409,7 +409,7 @@ func handleTaskGroup(snap *state.StateSnapshot, tg *structs.TaskGroup, } // getJobAllocs returns all allocations for draining jobs -func (w *drainingJobWatcher) getJobAllocs(ctx context.Context, minIndex uint64) (map[structs.JobNs][]*structs.Allocation, uint64, error) { +func (w *drainingJobWatcher) getJobAllocs(ctx context.Context, minIndex uint64) (map[structs.NamespacedID][]*structs.Allocation, uint64, error) { if err := w.limiter.Wait(ctx); err != nil { return nil, 0, err } @@ -422,7 +422,7 @@ func (w *drainingJobWatcher) getJobAllocs(ctx context.Context, minIndex uint64) return nil, index, nil } - return resp.(map[structs.JobNs][]*structs.Allocation), index, nil + return resp.(map[structs.NamespacedID][]*structs.Allocation), index, nil } // getJobAllocsImpl returns a map of draining jobs to their allocations. @@ -440,7 +440,7 @@ func (w *drainingJobWatcher) getJobAllocsImpl(ws memdb.WatchSet, state *state.St } // Capture the allocs for each draining job. - resp := make(map[structs.JobNs][]*structs.Allocation, l) + resp := make(map[structs.NamespacedID][]*structs.Allocation, l) for jns := range draining { allocs, err := state.AllocsByJob(ws, jns.Namespace, jns.ID, false) if err != nil { @@ -454,7 +454,7 @@ func (w *drainingJobWatcher) getJobAllocsImpl(ws memdb.WatchSet, state *state.St } // drainingJobs captures the set of draining jobs. 
-func (w *drainingJobWatcher) drainingJobs() map[structs.JobNs]struct{} { +func (w *drainingJobWatcher) drainingJobs() map[structs.NamespacedID]struct{} { w.l.RLock() defer w.l.RUnlock() @@ -463,7 +463,7 @@ func (w *drainingJobWatcher) drainingJobs() map[structs.JobNs]struct{} { return nil } - draining := make(map[structs.JobNs]struct{}, l) + draining := make(map[structs.NamespacedID]struct{}, l) for k := range w.jobs { draining[k] = struct{}{} } diff --git a/nomad/drainer/watch_jobs_test.go b/nomad/drainer/watch_jobs_test.go index 078e5316ec32..399ee46a16ec 100644 --- a/nomad/drainer/watch_jobs_test.go +++ b/nomad/drainer/watch_jobs_test.go @@ -75,12 +75,12 @@ func TestDrainingJobWatcher_DrainJobs(t *testing.T) { } // 2 jobs with count 10, max parallel 3 - jnss := make([]structs.JobNs, 2) + jnss := make([]structs.NamespacedID, 2) jobs := make([]*structs.Job, 2) for i := 0; i < 2; i++ { job := mock.Job() jobs[i] = job - jnss[i] = structs.NewJobNs(job.Namespace, job.ID) + jnss[i] = structs.NamespacedID{Namespace: job.Namespace, ID: job.ID} job.TaskGroups[0].Migrate.MaxParallel = 3 job.TaskGroups[0].Count = count require.Nil(state.UpsertJob(index, job)) diff --git a/nomad/structs/structs.go b/nomad/structs/structs.go index fa16284abf1d..307cce7faf0c 100644 --- a/nomad/structs/structs.go +++ b/nomad/structs/structs.go @@ -156,6 +156,10 @@ type NamespacedID struct { Namespace string } +func (n NamespacedID) String() string { + return fmt.Sprintf("<ns: %q, id: %q>", n.Namespace, n.ID) +} + // RPCInfo is used to describe common information about query type RPCInfo interface { RequestRegion() string @@ -1792,22 +1796,6 @@ func (n *NetworkResource) PortLabels() map[string]int { return labelValues } -// JobNs is a Job.ID and Namespace tuple -type JobNs struct { - Namespace, ID string -} - -func NewJobNs(namespace, id string) JobNs { - return JobNs{ - Namespace: namespace, - ID: id, - } -} - -func (j JobNs) String() string { - return fmt.Sprintf("<ns: %q, id: %q>", j.Namespace, j.ID) -} - const ( // JobTypeNomad is reserved for internal system tasks and is // always handled by the CoreScheduler. 
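Reviewer note on PATCH 59: the JobNs -> NamespacedID rename is worth a brief illustration. Because NamespacedID is a comparable struct of two strings, it can key a Go map directly, which is how the job watcher tracks the set of draining jobs; dropping the positional constructor forces call sites to name both fields, so namespace and ID cannot be silently swapped. The standalone sketch below is not part of the patch series and is illustrative only: the namespace and job ID are made up, and the String format is a reconstruction of the helper added above.

package main

import "fmt"

// NamespacedID mirrors the struct in the patch above: a comparable
// Namespace/ID tuple that can be used directly as a map key.
type NamespacedID struct {
	ID        string
	Namespace string
}

// String mirrors the formatting helper added in PATCH 59
// (format string reconstructed; treat as illustrative).
func (n NamespacedID) String() string {
	return fmt.Sprintf("<ns: %q, id: %q>", n.Namespace, n.ID)
}

func main() {
	// Track draining jobs the way the watcher does: a set keyed by the
	// namespaced ID, so duplicate registrations collapse naturally.
	draining := make(map[NamespacedID]struct{})

	// Hypothetical namespace and job ID, for illustration only.
	jns := NamespacedID{Namespace: "default", ID: "example-job"}
	draining[jns] = struct{}{}

	if _, ok := draining[jns]; ok {
		fmt.Println("tracking", jns)
	}
}

Printing the value goes through the String method, so log lines read as a namespaced tuple rather than a raw struct dump.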
From 8ef7863bed8271769d11c09fefb01f6b4d48cb95 Mon Sep 17 00:00:00 2001 From: Michael Schurter Date: Mon, 19 Mar 2018 10:18:20 -0700 Subject: [PATCH 60/79] Deregister garbage collected jobs --- nomad/drainer/watch_jobs.go | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/nomad/drainer/watch_jobs.go b/nomad/drainer/watch_jobs.go index 181871b204db..388bf9a7f7b4 100644 --- a/nomad/drainer/watch_jobs.go +++ b/nomad/drainer/watch_jobs.go @@ -189,12 +189,19 @@ func (w *drainingJobWatcher) watch() { w.logger.Printf("[TRACE] nomad.drain.job_watcher: handling job %v", jns) // Lookup the job - job, err := w.state.JobByID(nil, jns.Namespace, jns.ID) - if err != nil || job == nil { + job, err := snap.JobByID(nil, jns.Namespace, jns.ID) + if err != nil { w.logger.Printf("[WARN] nomad.drain.job_watcher: failed to lookup job %v: %v", jns, err) continue } + // Ignore purged jobs + if job == nil { + w.logger.Printf("[TRACE] nomad.drain.job_watcher: ignoring garbage collected job %q", jns) + w.deregisterJob(jns.ID, jns.Namespace) + continue + } + // Ignore all non-service jobs if job.Type != structs.JobTypeService { w.deregisterJob(job.ID, job.Namespace) From e003b0534b46db55097144f7f565f2f21b9356a0 Mon Sep 17 00:00:00 2001 From: Michael Schurter Date: Mon, 19 Mar 2018 10:23:45 -0700 Subject: [PATCH 61/79] Remove debug prints --- nomad/drainer/watch_jobs.go | 12 ------------ 1 file changed, 12 deletions(-) diff --git a/nomad/drainer/watch_jobs.go b/nomad/drainer/watch_jobs.go index 388bf9a7f7b4..93232aeb40e9 100644 --- a/nomad/drainer/watch_jobs.go +++ b/nomad/drainer/watch_jobs.go @@ -336,9 +336,6 @@ func handleTaskGroup(snap *state.StateSnapshot, tg *structs.TaskGroup, var drainable []*structs.Allocation for _, alloc := range allocs { - // TODO Remove at the end/when no more bugs - fmt.Printf("--- Looking at alloc %q\n", alloc.ID) - // Check if the alloc is on a draining node. onDrainingNode, ok := drainingNodes[alloc.NodeID] if !ok { @@ -360,7 +357,6 @@ func handleTaskGroup(snap *state.StateSnapshot, tg *structs.TaskGroup, onDrainingNode && alloc.ModifyIndex > lastHandledIndex { result.migrated = append(result.migrated, alloc) - fmt.Printf("------- Alloc %q marked as migrated\n", alloc.ID) continue } @@ -369,7 +365,6 @@ func handleTaskGroup(snap *state.StateSnapshot, tg *structs.TaskGroup, if !alloc.TerminalStatus() && alloc.DeploymentStatus != nil && alloc.DeploymentStatus.Healthy != nil { - fmt.Printf("------- Alloc %q considered as healthy\n", alloc.ID) healthy++ } @@ -377,7 +372,6 @@ func handleTaskGroup(snap *state.StateSnapshot, tg *structs.TaskGroup, // - It isn't on a draining node // - It is already terminal if !onDrainingNode || alloc.TerminalStatus() { - fmt.Printf("------- Alloc %q not drainable\n", alloc.ID) continue } @@ -389,13 +383,11 @@ func handleTaskGroup(snap *state.StateSnapshot, tg *structs.TaskGroup, // it as eligible for draining. 
if !alloc.DesiredTransition.ShouldMigrate() { drainable = append(drainable, alloc) - fmt.Printf("------- Alloc %q drainable\n", alloc.ID) } } // Update the done status if remainingDrainingAlloc { - fmt.Printf("------- Job has remaining allocs to drain\n") result.done = false } @@ -404,13 +396,9 @@ func handleTaskGroup(snap *state.StateSnapshot, tg *structs.TaskGroup, numToDrain := healthy - thresholdCount numToDrain = helper.IntMin(len(drainable), numToDrain) if numToDrain <= 0 { - fmt.Printf("------- Not draining any allocs: drainable:%d healthy:%d thresholdCount:%d\n", - len(drainable), healthy, thresholdCount) return nil } - fmt.Printf("------- DRAINing allocs: n: %d drainable:%d healthy:%d thresholdCount:%d\n", - numToDrain, len(drainable), healthy, thresholdCount) result.drain = append(result.drain, drainable[0:numToDrain]...) return nil } From 08c9116f2d563729ec212c125c6fdc1d0de023b8 Mon Sep 17 00:00:00 2001 From: Michael Schurter Date: Mon, 19 Mar 2018 10:36:31 -0700 Subject: [PATCH 62/79] Refactor assertOps into a helper func --- nomad/drainer/watch_jobs_test.go | 101 ++++++++++++++++--------------- 1 file changed, 53 insertions(+), 48 deletions(-) diff --git a/nomad/drainer/watch_jobs_test.go b/nomad/drainer/watch_jobs_test.go index 399ee46a16ec..3a73938d713b 100644 --- a/nomad/drainer/watch_jobs_test.go +++ b/nomad/drainer/watch_jobs_test.go @@ -51,6 +51,50 @@ func TestDrainingJobWatcher_Interface(t *testing.T) { var _ DrainingJobWatcher = w } +// asertJobWatcherOps asserts a certain number of allocs are drained and/or +// migrated by the job watcher. +func assertJobWatcherOps(t *testing.T, jw DrainingJobWatcher, drained, migrated int) ( + *DrainRequest, []*structs.Allocation) { + t.Helper() + var ( + drains *DrainRequest + migrations []*structs.Allocation + drainsChecked, migrationsChecked bool + ) + for { + select { + case drains = <-jw.Drain(): + ids := make([]string, len(drains.Allocs)) + for i, a := range drains.Allocs { + ids[i] = a.JobID[:6] + ":" + a.ID[:6] + } + t.Logf("draining %d allocs: %v", len(ids), ids) + require.False(t, drainsChecked, "drains already received") + drainsChecked = true + require.Lenf(t, drains.Allocs, drained, + "expected %d drains but found %d", drained, len(drains.Allocs)) + case migrations = <-jw.Migrated(): + ids := make([]string, len(migrations)) + for i, a := range migrations { + ids[i] = a.JobID[:6] + ":" + a.ID[:6] + } + t.Logf("migrating %d allocs: %v", len(ids), ids) + require.False(t, migrationsChecked, "migrations already received") + migrationsChecked = true + require.Lenf(t, migrations, migrated, + "expected %d migrations but found %d", migrated, len(migrations)) + case <-time.After(10 * time.Millisecond): + if !drainsChecked && drained > 0 { + t.Fatalf("expected %d drains but none happened", drained) + } + if !migrationsChecked && migrated > 0 { + t.Fatalf("expected %d migrations but none happened", migrated) + } + return drains, migrations + } + } +} + // TestDrainingJobWatcher_DrainJobs asserts DrainingJobWatcher batches // allocation changes from multiple jobs. func TestDrainingJobWatcher_DrainJobs(t *testing.T) { @@ -105,47 +149,8 @@ func TestDrainingJobWatcher_DrainJobs(t *testing.T) { // updating the batch future jobWatcher.RegisterJobs(jnss) - // assertOps asserts how many allocs should be drained and migrated. - // The drains and migrations - if any - are returned. 
- assertOps := func(drained, migrated int) (drains *DrainRequest, migrations []*structs.Allocation) { - t.Helper() - var drainsChecked, migrationsChecked bool - for { - select { - case drains = <-jobWatcher.Drain(): - ids := make([]string, len(drains.Allocs)) - for i, a := range drains.Allocs { - ids[i] = a.JobID[:6] + ":" + a.ID[:6] - } - t.Logf("draining %d allocs: %v", len(ids), ids) - require.False(drainsChecked, "drains already received") - drainsChecked = true - require.Lenf(drains.Allocs, drained, - "expected %d drains but found %d", drained, len(drains.Allocs)) - case migrations = <-jobWatcher.Migrated(): - ids := make([]string, len(migrations)) - for i, a := range migrations { - ids[i] = a.JobID[:6] + ":" + a.ID[:6] - } - t.Logf("migrating %d allocs: %v", len(ids), ids) - require.False(migrationsChecked, "migrations already received") - migrationsChecked = true - require.Lenf(migrations, migrated, - "expected %d migrations but found %d", migrated, len(migrations)) - case <-time.After(10 * time.Millisecond): - if !drainsChecked && drained > 0 { - t.Fatalf("expected %d drains but none happened", drained) - } - if !migrationsChecked && migrated > 0 { - t.Fatalf("expected %d migrations but none happened", migrated) - } - return drains, migrations - } - } - } - // Expect a first batch of MaxParallel allocs from each job - drains, _ := assertOps(6, 0) + drains, _ := assertJobWatcherOps(t, jobWatcher, 6, 0) // Fake migrating the drained allocs by starting new ones and stopping // the old ones @@ -161,7 +166,7 @@ func TestDrainingJobWatcher_DrainJobs(t *testing.T) { index++ // Just setting ShouldMigrate should not cause any further drains - assertOps(0, 0) + assertJobWatcherOps(t, jobWatcher, 0, 0) // Proceed our fake migration along by creating new allocs and stopping // old ones @@ -188,7 +193,7 @@ func TestDrainingJobWatcher_DrainJobs(t *testing.T) { // The drained allocs stopping cause migrations but no new drains // because the replacements have not started - assertOps(0, 6) + assertJobWatcherOps(t, jobWatcher, 0, 6) // Finally kickoff further drain activity by "starting" replacements for _, a := range replacements { @@ -203,7 +208,7 @@ func TestDrainingJobWatcher_DrainJobs(t *testing.T) { require.NotEmpty(jobWatcher.drainingJobs()) // 6 new drains - drains, _ = assertOps(6, 0) + drains, _ = assertJobWatcherOps(t, jobWatcher, 6, 0) // Fake migrations once more to finish the drain drainedAllocs = make([]*structs.Allocation, len(drains.Allocs)) @@ -217,7 +222,7 @@ func TestDrainingJobWatcher_DrainJobs(t *testing.T) { drains.Resp.Respond(index, nil) index++ - assertOps(0, 0) + assertJobWatcherOps(t, jobWatcher, 0, 0) replacements = make([]*structs.Allocation, len(drainedAllocs)) updates = make([]*structs.Allocation, 0, len(drainedAllocs)*2) @@ -232,7 +237,7 @@ func TestDrainingJobWatcher_DrainJobs(t *testing.T) { require.Nil(state.UpsertAllocs(index, updates)) index++ - assertOps(0, 6) + assertJobWatcherOps(t, jobWatcher, 0, 6) for _, a := range replacements { a.ClientStatus = structs.AllocClientStatusRunning @@ -246,7 +251,7 @@ func TestDrainingJobWatcher_DrainJobs(t *testing.T) { require.NotEmpty(jobWatcher.drainingJobs()) // Final 4 new drains - drains, _ = assertOps(4, 0) + drains, _ = assertJobWatcherOps(t, jobWatcher, 4, 0) // Fake migrations once more to finish the drain drainedAllocs = make([]*structs.Allocation, len(drains.Allocs)) @@ -260,7 +265,7 @@ func TestDrainingJobWatcher_DrainJobs(t *testing.T) { drains.Resp.Respond(index, nil) index++ - assertOps(0, 0) + 
assertJobWatcherOps(t, jobWatcher, 0, 0) replacements = make([]*structs.Allocation, len(drainedAllocs)) updates = make([]*structs.Allocation, 0, len(drainedAllocs)*2) @@ -275,7 +280,7 @@ func TestDrainingJobWatcher_DrainJobs(t *testing.T) { require.Nil(state.UpsertAllocs(index, updates)) index++ - assertOps(0, 4) + assertJobWatcherOps(t, jobWatcher, 0, 4) for _, a := range replacements { a.ClientStatus = structs.AllocClientStatusRunning From 98935b82e04829759db8e8aa20913b22e19bb4f8 Mon Sep 17 00:00:00 2001 From: Alex Dadgar Date: Mon, 19 Mar 2018 15:19:57 -0700 Subject: [PATCH 63/79] fix race in drain integration tests --- nomad/drainer_int_test.go | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/nomad/drainer_int_test.go b/nomad/drainer_int_test.go index 8e03a2ef5ff3..f71363a0d03b 100644 --- a/nomad/drainer_int_test.go +++ b/nomad/drainer_int_test.go @@ -60,7 +60,13 @@ func allocPromoter(t *testing.T, ctx context.Context, WriteRequest: structs.WriteRequest{Region: "global"}, } var resp structs.NodeAllocsResponse - require.Nil(t, msgpackrpc.CallWithCodec(codec, "Node.UpdateAlloc", req, &resp)) + if err := msgpackrpc.CallWithCodec(codec, "Node.UpdateAlloc", req, &resp); err != nil { + if ctx.Err() == context.Canceled { + return + } else { + require.Nil(t, err) + } + } } } From 4efbc349a408fe280ded52a775f321e971db72c2 Mon Sep 17 00:00:00 2001 From: Michael Schurter Date: Tue, 20 Mar 2018 10:54:55 -0700 Subject: [PATCH 64/79] rpcapi: remove; unused --- testutil/rpcapi/rcpapi.go | 160 -------------------------------------- 1 file changed, 160 deletions(-) delete mode 100644 testutil/rpcapi/rcpapi.go diff --git a/testutil/rpcapi/rcpapi.go b/testutil/rpcapi/rcpapi.go deleted file mode 100644 index 1eafabccbdb3..000000000000 --- a/testutil/rpcapi/rcpapi.go +++ /dev/null @@ -1,160 +0,0 @@ -package rpcapi - -import ( - "net/rpc" - - msgpackrpc "github.com/hashicorp/net-rpc-msgpackrpc" - "github.com/hashicorp/nomad/nomad/structs" -) - -type RPC struct { - Region string - Namespace string - codec rpc.ClientCodec -} - -func NewRPC(codec rpc.ClientCodec) *RPC { - return &RPC{ - Region: "global", - Namespace: structs.DefaultNamespace, - codec: codec, - } -} - -// AllocAll calls Alloc.List + Alloc.GetAllocs to return all allocs. 
-func (r *RPC) AllocAll() ([]*structs.Allocation, error) { - listResp, err := r.AllocList() - if err != nil { - return nil, err - } - - ids := make([]string, 0, len(listResp.Allocations)) - for _, a := range listResp.Allocations { - ids = append(ids, a.ID) - } - - allocsResp, err := r.AllocGetAllocs(ids) - if err != nil { - return nil, err - } - return allocsResp.Allocs, nil -} - -// Alloc.List RPC -func (r *RPC) AllocList() (*structs.AllocListResponse, error) { - get := &structs.AllocListRequest{ - QueryOptions: structs.QueryOptions{ - Region: r.Region, - Namespace: r.Namespace, - }, - } - - var resp structs.AllocListResponse - if err := msgpackrpc.CallWithCodec(r.codec, "Alloc.List", get, &resp); err != nil { - return nil, err - } - return &resp, nil -} - -// Alloc.GetAllocs RPC -func (r *RPC) AllocGetAllocs(ids []string) (*structs.AllocsGetResponse, error) { - get := &structs.AllocsGetRequest{ - AllocIDs: ids, - QueryOptions: structs.QueryOptions{ - Region: r.Region, - Namespace: r.Namespace, - }, - } - var resp structs.AllocsGetResponse - if err := msgpackrpc.CallWithCodec(r.codec, "Alloc.GetAllocs", get, &resp); err != nil { - return nil, err - } - return &resp, nil -} - -// Eval.List RPC -func (r *RPC) EvalList() (*structs.EvalListResponse, error) { - get := &structs.EvalListRequest{ - QueryOptions: structs.QueryOptions{ - Region: r.Region, - Namespace: r.Namespace, - }, - } - var resp structs.EvalListResponse - if err := msgpackrpc.CallWithCodec(r.codec, "Eval.List", get, &resp); err != nil { - return nil, err - } - return &resp, nil -} - -// Job.List RPC -func (r *RPC) JobList() (*structs.JobListResponse, error) { - get := &structs.JobListRequest{ - QueryOptions: structs.QueryOptions{ - Region: r.Region, - Namespace: r.Namespace, - }, - } - - var resp structs.JobListResponse - if err := msgpackrpc.CallWithCodec(r.codec, "Job.List", get, &resp); err != nil { - return nil, err - } - return &resp, nil -} - -// Job.Register RPC -func (r *RPC) JobRegister(j *structs.Job) (*structs.JobRegisterResponse, error) { - req := &structs.JobRegisterRequest{ - Job: j.Copy(), - WriteRequest: structs.WriteRequest{ - Region: r.Region, - Namespace: j.Namespace, - }, - } - - // Fetch the response - var resp structs.JobRegisterResponse - if err := msgpackrpc.CallWithCodec(r.codec, "Job.Register", req, &resp); err != nil { - return nil, err - } - return &resp, nil -} - -// Node.List RPC -func (r *RPC) NodeList() (*structs.NodeListResponse, error) { - get := &structs.NodeListRequest{ - QueryOptions: structs.QueryOptions{Region: r.Region}, - } - var resp structs.NodeListResponse - if err := msgpackrpc.CallWithCodec(r.codec, "Node.List", get, &resp); err != nil { - return nil, err - } - return &resp, nil -} - -// Node.GetAllocs RPC -func (r *RPC) NodeGetAllocs(nodeID string) (*structs.NodeAllocsResponse, error) { - get := &structs.NodeSpecificRequest{ - NodeID: nodeID, - QueryOptions: structs.QueryOptions{Region: r.Region}, - } - var resp structs.NodeAllocsResponse - if err := msgpackrpc.CallWithCodec(r.codec, "Node.GetAllocs", get, &resp); err != nil { - return nil, err - } - return &resp, nil -} - -// Node.GetNode RPC -func (r *RPC) NodeGet(nodeID string) (*structs.SingleNodeResponse, error) { - get := &structs.NodeSpecificRequest{ - NodeID: nodeID, - QueryOptions: structs.QueryOptions{Region: r.Region}, - } - var resp structs.SingleNodeResponse - if err := msgpackrpc.CallWithCodec(r.codec, "Node.GetNode", get, &resp); err != nil { - return nil, err - } - return &resp, nil -} From 
aab1fb76721a3f6ed11fe72b655d351e4f76f518 Mon Sep 17 00:00:00 2001 From: Michael Schurter Date: Tue, 20 Mar 2018 11:31:55 -0700 Subject: [PATCH 65/79] Fix linting errors --- command/agent/node_endpoint_test.go | 2 +- nomad/drainer/watch_jobs_test.go | 1 - nomad/fsm_test.go | 2 +- nomad/state/state_store_test.go | 4 ++-- scheduler/reconcile.go | 1 - scheduler/reconcile_util_test.go | 6 +++--- 6 files changed, 7 insertions(+), 9 deletions(-) diff --git a/command/agent/node_endpoint_test.go b/command/agent/node_endpoint_test.go index 19ff6e64cc1e..6b3d96c44ed4 100644 --- a/command/agent/node_endpoint_test.go +++ b/command/agent/node_endpoint_test.go @@ -292,7 +292,7 @@ func TestHTTP_NodeDrain(t *testing.T) { respW = httptest.NewRecorder() // Make the request - obj, err = s.Server.NodeSpecificRequest(respW, req) + _, err = s.Server.NodeSpecificRequest(respW, req) require.Nil(err) out, err = state.NodeByID(nil, node.ID) diff --git a/nomad/drainer/watch_jobs_test.go b/nomad/drainer/watch_jobs_test.go index 3a73938d713b..32d97d1040ac 100644 --- a/nomad/drainer/watch_jobs_test.go +++ b/nomad/drainer/watch_jobs_test.go @@ -289,7 +289,6 @@ func TestDrainingJobWatcher_DrainJobs(t *testing.T) { } } require.Nil(state.UpsertAllocs(index, replacements)) - index++ // No jobs should be left! require.Empty(jobWatcher.drainingJobs()) diff --git a/nomad/fsm_test.go b/nomad/fsm_test.go index 1a7b08a4bd60..ed8cf2df5944 100644 --- a/nomad/fsm_test.go +++ b/nomad/fsm_test.go @@ -300,7 +300,7 @@ func TestFSM_BatchUpdateNodeDrain(t *testing.T) { } req2 := structs.BatchNodeUpdateDrainRequest{ Updates: map[string]*structs.DrainUpdate{ - node.ID: &structs.DrainUpdate{ + node.ID: { DrainStrategy: strategy, }, }, diff --git a/nomad/state/state_store_test.go b/nomad/state/state_store_test.go index 20ebbe88fcd2..9f13dd10bf36 100644 --- a/nomad/state/state_store_test.go +++ b/nomad/state/state_store_test.go @@ -718,10 +718,10 @@ func TestStateStore_BatchUpdateNodeDrain(t *testing.T) { } update := map[string]*structs.DrainUpdate{ - n1.ID: &structs.DrainUpdate{ + n1.ID: { DrainStrategy: expectedDrain, }, - n2.ID: &structs.DrainUpdate{ + n2.ID: { DrainStrategy: expectedDrain, }, } diff --git a/scheduler/reconcile.go b/scheduler/reconcile.go index a4e1d1c06d3f..b7b936defdca 100644 --- a/scheduler/reconcile.go +++ b/scheduler/reconcile.go @@ -436,7 +436,6 @@ func (a *allocReconciler) computeGroup(group string, all allocSet) bool { if deploymentPlaceReady { // Do all destructive updates min := helper.IntMin(len(destructive), limit) - limit -= min desiredChanges.DestructiveUpdate += uint64(min) desiredChanges.Ignore += uint64(len(destructive) - min) for _, alloc := range destructive.nameOrder()[:min] { diff --git a/scheduler/reconcile_util_test.go b/scheduler/reconcile_util_test.go index 6d85dfb811ed..6905b26fbbd9 100644 --- a/scheduler/reconcile_util_test.go +++ b/scheduler/reconcile_util_test.go @@ -36,16 +36,16 @@ func TestAllocSet_filterByTainted(t *testing.T) { require := require.New(t) nodes := map[string]*structs.Node{ - "draining": &structs.Node{ + "draining": { ID: "draining", Drain: true, }, - "lost": &structs.Node{ + "lost": { ID: "lost", Status: structs.NodeStatusDown, }, "nil": nil, - "normal": &structs.Node{ + "normal": { ID: "normal", Status: structs.NodeStatusReady, }, From 2bb18741b0fb997304b740f59bd609ab4c00154a Mon Sep 17 00:00:00 2001 From: Michael Schurter Date: Tue, 20 Mar 2018 12:11:08 -0700 Subject: [PATCH 66/79] api: fix tests to expect default migrate strategy --- api/jobs_test.go | 5 +++++ 
api/nodes_test.go | 21 ++++++--------------- 2 files changed, 11 insertions(+), 15 deletions(-) diff --git a/api/jobs_test.go b/api/jobs_test.go index edf045a3cde1..194470494b84 100644 --- a/api/jobs_test.go +++ b/api/jobs_test.go @@ -141,6 +141,7 @@ func TestJobs_Canonicalize(t *testing.T) { MaxDelay: helper.TimeToPtr(1 * time.Hour), Unlimited: helper.BoolToPtr(true), }, + Migrate: DefaultMigrateStrategy(), Tasks: []*Task{ { KillTimeout: helper.TimeToPtr(5 * time.Second), @@ -211,6 +212,7 @@ func TestJobs_Canonicalize(t *testing.T) { MaxDelay: helper.TimeToPtr(1 * time.Hour), Unlimited: helper.BoolToPtr(true), }, + Migrate: DefaultMigrateStrategy(), Tasks: []*Task{ { Name: "task1", @@ -363,6 +365,7 @@ func TestJobs_Canonicalize(t *testing.T) { AutoRevert: helper.BoolToPtr(false), Canary: helper.IntToPtr(0), }, + Migrate: DefaultMigrateStrategy(), Tasks: []*Task{ { Name: "redis", @@ -576,6 +579,7 @@ func TestJobs_Canonicalize(t *testing.T) { AutoRevert: helper.BoolToPtr(true), Canary: helper.IntToPtr(1), }, + Migrate: DefaultMigrateStrategy(), Tasks: []*Task{ { Name: "task1", @@ -616,6 +620,7 @@ func TestJobs_Canonicalize(t *testing.T) { AutoRevert: helper.BoolToPtr(false), Canary: helper.IntToPtr(0), }, + Migrate: DefaultMigrateStrategy(), Tasks: []*Task{ { Name: "task1", diff --git a/api/nodes_test.go b/api/nodes_test.go index d2b02b82c243..4945b3f99c76 100644 --- a/api/nodes_test.go +++ b/api/nodes_test.go @@ -142,6 +142,7 @@ func TestNodes_Info(t *testing.T) { func TestNodes_ToggleDrain(t *testing.T) { t.Parallel() + require := require.New(t) c, s := makeClient(t, nil, func(c *testutil.TestServerConfig) { c.DevMode = true }) @@ -166,9 +167,7 @@ func TestNodes_ToggleDrain(t *testing.T) { // Check for drain mode out, _, err := nodes.Info(nodeID, nil) - if err != nil { - t.Fatalf("err: %s", err) - } + require.Nil(err) if out.Drain { t.Fatalf("drain mode should be off") } @@ -178,32 +177,24 @@ func TestNodes_ToggleDrain(t *testing.T) { Deadline: 10 * time.Second, } wm, err := nodes.UpdateDrain(nodeID, spec, false, nil) - if err != nil { - t.Fatalf("err: %s", err) - } + require.Nil(err) assertWriteMeta(t, wm) // Check again out, _, err = nodes.Info(nodeID, nil) - if err != nil { - t.Fatalf("err: %s", err) - } + require.Nil(err) if out.SchedulingEligibility != structs.NodeSchedulingIneligible { t.Fatalf("bad eligibility: %v vs %v", out.SchedulingEligibility, structs.NodeSchedulingIneligible) } // Toggle off again wm, err = nodes.UpdateDrain(nodeID, nil, true, nil) - if err != nil { - t.Fatalf("err: %s", err) - } + require.Nil(err) assertWriteMeta(t, wm) // Check again out, _, err = nodes.Info(nodeID, nil) - if err != nil { - t.Fatalf("err: %s", err) - } + require.Nil(err) if out.Drain { t.Fatalf("drain mode should be off") } From 1537061ebc4594abec57d7c7f036b649ef422ea3 Mon Sep 17 00:00:00 2001 From: Michael Schurter Date: Tue, 20 Mar 2018 14:15:35 -0700 Subject: [PATCH 67/79] alloc_runner: watch health for deployed batch jobs --- client/alloc_runner_health_watcher.go | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/client/alloc_runner_health_watcher.go b/client/alloc_runner_health_watcher.go index bdb7eaa82261..93d2553324b9 100644 --- a/client/alloc_runner_health_watcher.go +++ b/client/alloc_runner_health_watcher.go @@ -31,18 +31,25 @@ func (r *AllocRunner) watchHealth(ctx context.Context) { // See if we should watch the allocs health alloc := r.Alloc() - if alloc.Job.Type != structs.JobTypeService { - // No need to watch non-service jos + if 
alloc.DeploymentStatus.IsHealthy() || alloc.DeploymentStatus.IsUnhealthy() { + // No need to watch health as it's already set return } - if alloc.DeploymentStatus.IsHealthy() || alloc.DeploymentStatus.IsUnhealthy() { - // No need to watch health as it's already set + // Neither deployments nor migrations care about system jobs so never + // watch their health + if alloc.Job.Type == structs.JobTypeSystem { return } isDeploy := alloc.DeploymentID != "" + // Migrations don't consider the health of batch jobs so only watch + // batch health during deployments + if !isDeploy && alloc.Job.Type == structs.JobTypeBatch { + return + } + tg := alloc.Job.LookupTaskGroup(alloc.TaskGroup) if tg == nil { r.logger.Printf("[ERR] client.alloc_watcher: failed to lookup allocation's task group. Exiting watcher") From 9b88749ced575a061afbcd4932d19469f5466194 Mon Sep 17 00:00:00 2001 From: Michael Schurter Date: Tue, 20 Mar 2018 14:49:17 -0700 Subject: [PATCH 68/79] mock: add BatchJob() helper --- nomad/mock/mock.go | 66 ++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 66 insertions(+) diff --git a/nomad/mock/mock.go b/nomad/mock/mock.go index aef9c475f011..fc12adbb1618 100644 --- a/nomad/mock/mock.go +++ b/nomad/mock/mock.go @@ -169,6 +169,72 @@ func Job() *structs.Job { return job } +func BatchJob() *structs.Job { + job := &structs.Job{ + Region: "global", + ID: uuid.Generate(), + Name: "batch-job", + Namespace: structs.DefaultNamespace, + Type: structs.JobTypeBatch, + Priority: 50, + AllAtOnce: false, + Datacenters: []string{"dc1"}, + TaskGroups: []*structs.TaskGroup{ + { + Name: "worker", + Count: 10, + EphemeralDisk: &structs.EphemeralDisk{ + SizeMB: 150, + }, + RestartPolicy: &structs.RestartPolicy{ + Attempts: 3, + Interval: 10 * time.Minute, + Delay: 1 * time.Minute, + Mode: structs.RestartPolicyModeDelay, + }, + ReschedulePolicy: &structs.ReschedulePolicy{ + Attempts: 2, + Interval: 10 * time.Minute, + Delay: 5 * time.Second, + DelayFunction: "linear", + }, + Tasks: []*structs.Task{ + { + Name: "worker", + Driver: "mock_driver", + Config: map[string]interface{}{ + "run_for": "500ms", + }, + Env: map[string]string{ + "FOO": "bar", + }, + LogConfig: structs.DefaultLogConfig(), + Resources: &structs.Resources{ + CPU: 100, + MemoryMB: 100, + Networks: []*structs.NetworkResource{ + { + MBits: 50, + }, + }, + }, + Meta: map[string]string{ + "foo": "bar", + }, + }, + }, + }, + }, + Status: structs.JobStatusPending, + Version: 0, + CreateIndex: 43, + ModifyIndex: 99, + JobModifyIndex: 99, + } + job.Canonicalize() + return job +} + func SystemJob() *structs.Job { job := &structs.Job{ Region: "global", From 17161ec5f9c9f8230ada7b92e80c3e57a29bfa06 Mon Sep 17 00:00:00 2001 From: Michael Schurter Date: Tue, 20 Mar 2018 14:49:29 -0700 Subject: [PATCH 69/79] tests: use mock.BatchJob to fix tests --- nomad/job_endpoint_test.go | 30 ++++++++++-------------------- 1 file changed, 10 insertions(+), 20 deletions(-) diff --git a/nomad/job_endpoint_test.go b/nomad/job_endpoint_test.go index 7d0d0e770831..9182cc872754 100644 --- a/nomad/job_endpoint_test.go +++ b/nomad/job_endpoint_test.go @@ -421,8 +421,7 @@ func TestJobEndpoint_Register_ParameterizedJob(t *testing.T) { testutil.WaitForLeader(t, s1.RPC) // Create the register request for a parameterized job. 
- job := mock.Job() - job.Type = structs.JobTypeBatch + job := mock.BatchJob() job.ParameterizedJob = &structs.ParameterizedJobConfig{} req := &structs.JobRegisterRequest{ Job: job, @@ -1423,8 +1422,7 @@ func TestJobEndpoint_Evaluate_ParameterizedJob(t *testing.T) { testutil.WaitForLeader(t, s1.RPC) // Create the register request - job := mock.Job() - job.Type = structs.JobTypeBatch + job := mock.BatchJob() job.ParameterizedJob = &structs.ParameterizedJobConfig{} req := &structs.JobRegisterRequest{ Job: job, @@ -1751,8 +1749,7 @@ func TestJobEndpoint_Deregister_ParameterizedJob(t *testing.T) { testutil.WaitForLeader(t, s1.RPC) // Create the register request - job := mock.Job() - job.Type = structs.JobTypeBatch + job := mock.BatchJob() job.ParameterizedJob = &structs.ParameterizedJobConfig{} reg := &structs.JobRegisterRequest{ Job: job, @@ -3958,8 +3955,7 @@ func TestJobEndpoint_Dispatch_ACL(t *testing.T) { state := s1.fsm.State() // Create a parameterized job - job := mock.Job() - job.Type = structs.JobTypeBatch + job := mock.BatchJob() job.ParameterizedJob = &structs.ParameterizedJobConfig{} err := state.UpsertJob(400, job) require.Nil(err) @@ -4027,34 +4023,29 @@ func TestJobEndpoint_Dispatch(t *testing.T) { t.Parallel() // No requirements - d1 := mock.Job() - d1.Type = structs.JobTypeBatch + d1 := mock.BatchJob() d1.ParameterizedJob = &structs.ParameterizedJobConfig{} // Require input data - d2 := mock.Job() - d2.Type = structs.JobTypeBatch + d2 := mock.BatchJob() d2.ParameterizedJob = &structs.ParameterizedJobConfig{ Payload: structs.DispatchPayloadRequired, } // Disallow input data - d3 := mock.Job() - d3.Type = structs.JobTypeBatch + d3 := mock.BatchJob() d3.ParameterizedJob = &structs.ParameterizedJobConfig{ Payload: structs.DispatchPayloadForbidden, } // Require meta - d4 := mock.Job() - d4.Type = structs.JobTypeBatch + d4 := mock.BatchJob() d4.ParameterizedJob = &structs.ParameterizedJobConfig{ MetaRequired: []string{"foo", "bar"}, } // Optional meta - d5 := mock.Job() - d5.Type = structs.JobTypeBatch + d5 := mock.BatchJob() d5.ParameterizedJob = &structs.ParameterizedJobConfig{ MetaOptional: []string{"foo", "bar"}, } @@ -4063,8 +4054,7 @@ func TestJobEndpoint_Dispatch(t *testing.T) { d6 := mock.PeriodicJob() d6.ParameterizedJob = &structs.ParameterizedJobConfig{} - d7 := mock.Job() - d7.Type = structs.JobTypeBatch + d7 := mock.BatchJob() d7.ParameterizedJob = &structs.ParameterizedJobConfig{} d7.Stop = true From 80885623c1df75160a7b1c3a6e7283708da0a96a Mon Sep 17 00:00:00 2001 From: Michael Schurter Date: Tue, 20 Mar 2018 15:19:47 -0700 Subject: [PATCH 70/79] test: don't call t.Fatal from within a goroutine --- command/agent/fs_endpoint_test.go | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/command/agent/fs_endpoint_test.go b/command/agent/fs_endpoint_test.go index f59bbd953b0f..9be39497ce9d 100644 --- a/command/agent/fs_endpoint_test.go +++ b/command/agent/fs_endpoint_test.go @@ -437,11 +437,10 @@ func TestHTTP_FS_Logs_Follow(t *testing.T) { req, err := http.NewRequest("GET", path, p) require.Nil(err) respW := httptest.NewRecorder() - doneCh := make(chan struct{}) + errCh := make(chan error) go func() { - _, err = s.Server.Logs(respW, req) - require.Nil(err) - close(doneCh) + _, err := s.Server.Logs(respW, req) + errCh <- err }() out := "" @@ -458,8 +457,8 @@ func TestHTTP_FS_Logs_Follow(t *testing.T) { }) select { - case <-doneCh: - t.Fatal("shouldn't close") + case err := <-errCh: + t.Fatalf("shouldn't exit: %v", err) case <-time.After(1 * 
time.Second): } From 50a94d73c9ae27d9c2a5efe19ebbf47f93d5c710 Mon Sep 17 00:00:00 2001 From: Michael Schurter Date: Tue, 20 Mar 2018 15:28:18 -0700 Subject: [PATCH 71/79] test: try to prevent flakiness on travis --- client/alloc_runner_test.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/client/alloc_runner_test.go b/client/alloc_runner_test.go index 0ade0ba39dba..b2927f86eb9f 100644 --- a/client/alloc_runner_test.go +++ b/client/alloc_runner_test.go @@ -168,7 +168,7 @@ func TestAllocRunner_DeploymentHealth_Unhealthy_Deadline(t *testing.T) { // Make the task block task := ar.alloc.Job.TaskGroups[0].Tasks[0] task.Driver = "mock_driver" - task.Config["start_block_for"] = "2s" + task.Config["start_block_for"] = "4s" task.Config["run_for"] = "10s" // Make the alloc be part of a deployment From b8b1922b9ba8a6cf4d99d456560d2e4f589f6d75 Mon Sep 17 00:00:00 2001 From: Michael Schurter Date: Tue, 20 Mar 2018 15:28:38 -0700 Subject: [PATCH 72/79] test: fix by using mock.BatchJob --- command/agent/job_endpoint_test.go | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/command/agent/job_endpoint_test.go b/command/agent/job_endpoint_test.go index 57b5d1869d24..3e730a96957d 100644 --- a/command/agent/job_endpoint_test.go +++ b/command/agent/job_endpoint_test.go @@ -942,8 +942,7 @@ func TestHTTP_JobDispatch(t *testing.T) { t.Parallel() httpTest(t, nil, func(s *TestAgent) { // Create the parameterized job - job := mock.Job() - job.Type = "batch" + job := mock.BatchJob() job.ParameterizedJob = &structs.ParameterizedJobConfig{} args := structs.JobRegisterRequest{ From e8673b14ef5e2a64be46f68d547c5109387248f9 Mon Sep 17 00:00:00 2001 From: Michael Schurter Date: Tue, 20 Mar 2018 15:42:04 -0700 Subject: [PATCH 73/79] test: disable drain during fsm test drainer was unsetting drain before fsm could read written value --- nomad/node_endpoint_test.go | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/nomad/node_endpoint_test.go b/nomad/node_endpoint_test.go index 0a18f937cb17..26888d830541 100644 --- a/nomad/node_endpoint_test.go +++ b/nomad/node_endpoint_test.go @@ -753,6 +753,9 @@ func TestClientEndpoint_UpdateDrain(t *testing.T) { codec := rpcClient(t, s1) testutil.WaitForLeader(t, s1.RPC) + // Disable drainer to prevent drain from completing during test + s1.nodeDrainer.SetEnabled(false, nil) + // Create the register request node := mock.Node() reg := &structs.NodeRegisterRequest{ @@ -764,6 +767,7 @@ func TestClientEndpoint_UpdateDrain(t *testing.T) { var resp structs.NodeUpdateResponse require.Nil(msgpackrpc.CallWithCodec(codec, "Node.Register", reg, &resp)) + beforeUpdate := time.Now() strategy := &structs.DrainStrategy{ DrainSpec: structs.DrainSpec{ Deadline: 10 * time.Second, @@ -786,7 +790,11 @@ func TestClientEndpoint_UpdateDrain(t *testing.T) { out, err := state.NodeByID(ws, node.ID) require.Nil(err) require.True(out.Drain) - require.Equal(strategy, out.DrainStrategy) + require.Equal(strategy.Deadline, out.DrainStrategy.Deadline) + // before+deadline should be before the forced deadline + require.True(beforeUpdate.Add(strategy.Deadline).Before(out.DrainStrategy.ForceDeadline)) + // now+deadline should be after the forced deadline + require.True(time.Now().Add(strategy.Deadline).After(out.DrainStrategy.ForceDeadline)) } func TestClientEndpoint_UpdateDrain_ACL(t *testing.T) { From 636693830fda3f4b08a87f079748475eb6c85d35 Mon Sep 17 00:00:00 2001 From: Michael Schurter Date: Tue, 20 Mar 2018 15:51:58 -0700 Subject: [PATCH 74/79] test: disable 
node drainer during tests Node drainer would throw off the index checks --- nomad/node_endpoint_test.go | 39 ++++++++++++++++++++++++------------- 1 file changed, 26 insertions(+), 13 deletions(-) diff --git a/nomad/node_endpoint_test.go b/nomad/node_endpoint_test.go index 26888d830541..4c278766cbc7 100644 --- a/nomad/node_endpoint_test.go +++ b/nomad/node_endpoint_test.go @@ -2439,15 +2439,18 @@ func TestClientEndpoint_ListNodes_Blocking(t *testing.T) { codec := rpcClient(t, s1) testutil.WaitForLeader(t, s1.RPC) + // Disable drainer to prevent drain from completing during test + s1.nodeDrainer.SetEnabled(false, nil) + // Create the node node := mock.Node() // Node upsert triggers watches - time.AfterFunc(100*time.Millisecond, func() { - if err := state.UpsertNode(2, node); err != nil { - t.Fatalf("err: %v", err) - } + errCh := make(chan error, 1) + timer := time.AfterFunc(100*time.Millisecond, func() { + errCh <- state.UpsertNode(2, node) }) + defer timer.Stop() req := &structs.NodeListRequest{ QueryOptions: structs.QueryOptions{ @@ -2461,6 +2464,10 @@ func TestClientEndpoint_ListNodes_Blocking(t *testing.T) { t.Fatalf("err: %v", err) } + if err := <-errCh; err != nil { + t.Fatalf("error from timer: %v", err) + } + if elapsed := time.Since(start); elapsed < 100*time.Millisecond { t.Fatalf("should block (returned in %s) %#v", elapsed, resp) } @@ -2478,9 +2485,7 @@ func TestClientEndpoint_ListNodes_Blocking(t *testing.T) { Deadline: 10 * time.Second, }, } - if err := state.UpdateNodeDrain(3, node.ID, s, false); err != nil { - t.Fatalf("err: %v", err) - } + errCh <- state.UpdateNodeDrain(3, node.ID, s, false) }) req.MinQueryIndex = 2 @@ -2490,6 +2495,10 @@ func TestClientEndpoint_ListNodes_Blocking(t *testing.T) { t.Fatalf("err: %v", err) } + if err := <-errCh; err != nil { + t.Fatalf("error from timer: %v", err) + } + if elapsed := time.Since(start); elapsed < 100*time.Millisecond { t.Fatalf("should block (returned in %s) %#v", elapsed, resp2) } @@ -2502,9 +2511,7 @@ func TestClientEndpoint_ListNodes_Blocking(t *testing.T) { // Node status update triggers watches time.AfterFunc(100*time.Millisecond, func() { - if err := state.UpdateNodeStatus(40, node.ID, structs.NodeStatusDown); err != nil { - t.Fatalf("err: %v", err) - } + errCh <- state.UpdateNodeStatus(40, node.ID, structs.NodeStatusDown) }) req.MinQueryIndex = 38 @@ -2514,6 +2521,10 @@ func TestClientEndpoint_ListNodes_Blocking(t *testing.T) { t.Fatalf("err: %v", err) } + if err := <-errCh; err != nil { + t.Fatalf("error from timer: %v", err) + } + if elapsed := time.Since(start); elapsed < 100*time.Millisecond { t.Fatalf("should block (returned in %s) %#v", elapsed, resp3) } @@ -2526,9 +2537,7 @@ func TestClientEndpoint_ListNodes_Blocking(t *testing.T) { // Node delete triggers watches. 
time.AfterFunc(100*time.Millisecond, func() { - if err := state.DeleteNode(50, node.ID); err != nil { - t.Fatalf("err: %v", err) - } + errCh <- state.DeleteNode(50, node.ID) }) req.MinQueryIndex = 45 @@ -2538,6 +2547,10 @@ func TestClientEndpoint_ListNodes_Blocking(t *testing.T) { t.Fatalf("err: %v", err) } + if err := <-errCh; err != nil { + t.Fatalf("error from timer: %v", err) + } + if elapsed := time.Since(start); elapsed < 100*time.Millisecond { t.Fatalf("should block (returned in %s) %#v", elapsed, resp4) } From ec09ea61be811290afa83126b399bb5dd4f34ead Mon Sep 17 00:00:00 2001 From: Michael Schurter Date: Tue, 20 Mar 2018 16:27:24 -0700 Subject: [PATCH 75/79] test: must initialize jobResults with new func --- nomad/drainer/watch_jobs_test.go | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/nomad/drainer/watch_jobs_test.go b/nomad/drainer/watch_jobs_test.go index 32d97d1040ac..be90ed13d42f 100644 --- a/nomad/drainer/watch_jobs_test.go +++ b/nomad/drainer/watch_jobs_test.go @@ -604,13 +604,13 @@ func TestHandleTaskGroup_Migrations(t *testing.T) { require.Nil(err) // Handle before and after indexes - res := &jobResult{} + res := newJobResult() require.Nil(handleTaskGroup(snap, job.TaskGroups[0], allocs, 101, res)) require.Empty(res.drain) require.Len(res.migrated, 10) require.True(res.done) - res = &jobResult{} + res = newJobResult() require.Nil(handleTaskGroup(snap, job.TaskGroups[0], allocs, 103, res)) require.Empty(res.drain) require.Empty(res.migrated) From b58a22c2e9cadbabe061904e7c1e1c00fd88db46 Mon Sep 17 00:00:00 2001 From: Michael Schurter Date: Tue, 20 Mar 2018 17:25:28 -0700 Subject: [PATCH 76/79] remove spurious TODOs and FIXMEs --- client/alloc_runner_health_watcher.go | 13 +++++-------- nomad/drainer/drain_heap.go | 3 --- nomad/structs/structs.go | 1 - 3 files changed, 5 insertions(+), 12 deletions(-) diff --git a/client/alloc_runner_health_watcher.go b/client/alloc_runner_health_watcher.go index 93d2553324b9..b57f9c46e94f 100644 --- a/client/alloc_runner_health_watcher.go +++ b/client/alloc_runner_health_watcher.go @@ -196,12 +196,11 @@ func newAllocHealthTracker(parentCtx context.Context, logger *log.Logger, alloc minHealthyTime time.Duration, useChecks bool) *allocHealthTracker { a := &allocHealthTracker{ - logger: logger, - healthy: make(chan bool, 1), - allocStopped: make(chan struct{}), - alloc: alloc, - tg: alloc.Job.LookupTaskGroup(alloc.TaskGroup), - //FIXME should i wrap all these parameters up in a struct? + logger: logger, + healthy: make(chan bool, 1), + allocStopped: make(chan struct{}), + alloc: alloc, + tg: alloc.Job.LookupTaskGroup(alloc.TaskGroup), minHealthyTime: minHealthyTime, useChecks: useChecks, allocUpdates: allocUpdates, @@ -260,7 +259,6 @@ func (a *allocHealthTracker) TaskEvents() map[string]string { // Go through are task information and build the event map for task, state := range a.taskHealth { - //FIXME skip this for migrations? 
useChecks := a.tg.Update.HealthCheck == structs.UpdateStrategyHealthCheck_Checks if e, ok := state.event(deadline, a.tg.Update.MinHealthyTime, useChecks); ok { events[task] = e @@ -542,7 +540,6 @@ func (t *taskHealthState) event(deadline time.Time, minHealthyTime time.Duration } // We are running so check if we have been running long enough - //FIXME need minHealthyTime here if t.state.StartedAt.Add(minHealthyTime).After(deadline) { return fmt.Sprintf("Task not running for min_healthy_time of %v by deadline", minHealthyTime), true } diff --git a/nomad/drainer/drain_heap.go b/nomad/drainer/drain_heap.go index 1642b0fdb330..2d0a1506e052 100644 --- a/nomad/drainer/drain_heap.go +++ b/nomad/drainer/drain_heap.go @@ -20,9 +20,6 @@ type DrainDeadlineNotifier interface { Watch(nodeID string, deadline time.Time) } -// TODO Make any of what I just wrote true :) Initially it is just a simple -// implementation. - // deadlineHeap implements the DrainDeadlineNotifier and is backed by a min-heap // to efficiently determine the next deadlining node. It also supports // coalescing several deadlines into a single emission. diff --git a/nomad/structs/structs.go b/nomad/structs/structs.go index 307cce7faf0c..b96eef738825 100644 --- a/nomad/structs/structs.go +++ b/nomad/structs/structs.go @@ -1193,7 +1193,6 @@ const ( // ShouldDrainNode checks if a given node status should trigger an // evaluation. Some states don't require any further action. -//TODO(schmichael) Update for drainv2?! func ShouldDrainNode(status string) bool { switch status { case NodeStatusInit, NodeStatusReady: From 07fe87918ad8a3f8a79b87ee60fee07804cfae77 Mon Sep 17 00:00:00 2001 From: Michael Schurter Date: Wed, 21 Mar 2018 10:13:26 -0700 Subject: [PATCH 77/79] test: index no longer guaranteed on job list Also switch to require and add t.Helper to appropriate funcs. 
--- api/jobs_test.go | 27 ++++++++++----------------- api/util_test.go | 2 ++ 2 files changed, 12 insertions(+), 17 deletions(-) diff --git a/api/jobs_test.go b/api/jobs_test.go index 194470494b84..f14bd7b49fd8 100644 --- a/api/jobs_test.go +++ b/api/jobs_test.go @@ -12,41 +12,34 @@ import ( "github.com/hashicorp/nomad/testutil" "github.com/kr/pretty" "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" ) func TestJobs_Register(t *testing.T) { t.Parallel() + require := require.New(t) + c, s := makeClient(t, nil, nil) defer s.Stop() jobs := c.Jobs() // Listing jobs before registering returns nothing resp, qm, err := jobs.List(nil) - if err != nil { - t.Fatalf("err: %s", err) - } + require.Nil(err) assertQueryMeta(t, qm) - if n := len(resp); n != 0 { - t.Fatalf("expected 0 jobs, got: %d", n) - } + require.Emptyf(resp, "expected 0 jobs, got: %d", len(resp)) // Create a job and attempt to register it job := testJob() resp2, wm, err := jobs.Register(job, nil) - if err != nil { - t.Fatalf("err: %s", err) - } - if resp2 == nil || resp2.EvalID == "" { - t.Fatalf("missing eval id") - } + require.Nil(err) + require.NotNil(resp2) + require.NotEmpty(resp2.EvalID) assertWriteMeta(t, wm) // Query the jobs back out again - resp, qm, err = jobs.List(nil) - if err != nil { - t.Fatalf("err: %s", err) - } - assertQueryMeta(t, qm) + resp, _, err = jobs.List(nil) + require.Nil(err) // Check that we got the expected response if len(resp) != 1 || resp[0].ID != *job.ID { diff --git a/api/util_test.go b/api/util_test.go index 9aceee0bfdad..c6f99018e4ce 100644 --- a/api/util_test.go +++ b/api/util_test.go @@ -7,6 +7,7 @@ import ( ) func assertQueryMeta(t *testing.T, qm *QueryMeta) { + t.Helper() if qm.LastIndex == 0 { t.Fatalf("bad index: %d", qm.LastIndex) } @@ -16,6 +17,7 @@ func assertQueryMeta(t *testing.T, qm *QueryMeta) { } func assertWriteMeta(t *testing.T, wm *WriteMeta) { + t.Helper() if wm.LastIndex == 0 { t.Fatalf("bad index: %d", wm.LastIndex) } From 3496bcf76670b1fdae3ce543841af0cecc9fa354 Mon Sep 17 00:00:00 2001 From: Michael Schurter Date: Wed, 21 Mar 2018 10:41:06 -0700 Subject: [PATCH 78/79] docs: improve DrainRequest.MarkEligible comment --- api/nodes.go | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/api/nodes.go b/api/nodes.go index d625629fb5e3..76e25594f775 100644 --- a/api/nodes.go +++ b/api/nodes.go @@ -52,7 +52,8 @@ type NodeUpdateDrainRequest struct { // will disable draining. DrainSpec *DrainSpec - // MarkEligible marks the node as eligible if removing the drain strategy. + // MarkEligible marks the node as eligible for scheduling if removing + // the drain strategy. MarkEligible bool } From e10883ca2bc38ad49a1f008ee59dbf040a7986e4 Mon Sep 17 00:00:00 2001 From: Michael Schurter Date: Wed, 21 Mar 2018 10:44:17 -0700 Subject: [PATCH 79/79] eligbile -> eligible --- nomad/structs/structs.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nomad/structs/structs.go b/nomad/structs/structs.go index b96eef738825..5f455e58d3e9 100644 --- a/nomad/structs/structs.go +++ b/nomad/structs/structs.go @@ -1218,7 +1218,7 @@ const ( // NodeSchedulingEligible and Ineligible marks the node as eligible or not, // respectively, for receiving allocations. This is orthoginal to the node // status being ready. - NodeSchedulingEligible = "eligbile" + NodeSchedulingEligible = "eligible" NodeSchedulingIneligible = "ineligible" )
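Closing note on the drain API these patches exercise: a node drain is enabled by submitting a DrainSpec with a deadline and disabled by submitting a nil spec, with MarkEligible controlling whether the node becomes eligible for scheduling again (see PATCH 78). The rough client sketch below mirrors the UpdateDrain and Info calls made in TestNodes_ToggleDrain; it is illustrative only, the client setup and node ID are placeholders not taken from the patches, and error handling is reduced to log.Fatal.

package main

import (
	"fmt"
	"log"
	"time"

	"github.com/hashicorp/nomad/api"
)

func main() {
	// Connect to a local agent; DefaultConfig reads NOMAD_ADDR and falls
	// back to http://127.0.0.1:4646.
	client, err := api.NewClient(api.DefaultConfig())
	if err != nil {
		log.Fatal(err)
	}
	nodes := client.Nodes()

	// Placeholder: substitute a real node ID, e.g. from nodes.List.
	nodeID := "REPLACE-WITH-NODE-ID"

	// Enable draining with a 10s deadline, as in TestNodes_ToggleDrain.
	spec := &api.DrainSpec{Deadline: 10 * time.Second}
	if _, err := nodes.UpdateDrain(nodeID, spec, false, nil); err != nil {
		log.Fatal(err)
	}

	// Disable draining and mark the node eligible for scheduling again.
	if _, err := nodes.UpdateDrain(nodeID, nil, true, nil); err != nil {
		log.Fatal(err)
	}

	info, _, err := nodes.Info(nodeID, nil)
	if err != nil {
		log.Fatal(err)
	}
	fmt.Println("drain:", info.Drain, "eligibility:", info.SchedulingEligibility)
}

Passing a nil spec with MarkEligible set to false should cancel the drain but leave the node ineligible for new placements, which is the distinction the MarkEligible comment in PATCH 78 documents.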