Merge pull request #1188 from hashicorp/f-no-failed-allocs
Failed Allocation Metrics stored in Evaluation
dadgar committed May 25, 2016
2 parents 3590f8a + ed94128 commit 6dc9b7e
Showing 11 changed files with 258 additions and 167 deletions.
2 changes: 2 additions & 0 deletions api/evaluations.go
@@ -67,6 +67,8 @@ type Evaluation struct {
Wait time.Duration
NextEval string
PreviousEval string
BlockedEval string
FailedTGAllocs map[string]*AllocationMetric
CreateIndex uint64
ModifyIndex uint64
}
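
The two new fields expose placement failures on the evaluation itself rather than as synthetic allocations. Below is a minimal sketch of reading them through the Go api client; the client setup and the evaluation ID are assumed for illustration, while the field names come from this change.

package main

import (
	"fmt"
	"log"

	"github.com/hashicorp/nomad/api"
)

func main() {
	client, err := api.NewClient(api.DefaultConfig())
	if err != nil {
		log.Fatal(err)
	}

	// evalID is assumed to be known, e.g. returned from a job registration.
	evalID := "..."
	eval, _, err := client.Evaluations().Info(evalID, nil)
	if err != nil {
		log.Fatal(err)
	}

	// Each entry describes a task group whose allocations could not be placed.
	for tg, metrics := range eval.FailedTGAllocs {
		fmt.Printf("task group %q failed to place %d allocation(s)\n",
			tg, metrics.CoalescedFailures+1)
	}

	// A non-empty BlockedEval means a follow-up eval is waiting for capacity.
	if eval.BlockedEval != "" {
		fmt.Printf("blocked eval %q will retry placement\n", eval.BlockedEval)
	}
}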
59 changes: 45 additions & 14 deletions command/monitor.go
@@ -147,10 +147,14 @@ func (m *monitor) update(update *evalState) {
} else {
switch {
case existing.client != alloc.client:
description := ""
if alloc.clientDesc != "" {
description = fmt.Sprintf(" (%s)", alloc.clientDesc)
}
// Allocation status has changed
m.ui.Output(fmt.Sprintf(
"Allocation %q status changed: %q -> %q (%s)",
limit(alloc.id, m.length), existing.client, alloc.client, alloc.clientDesc))
"Allocation %q status changed: %q -> %q%s",
limit(alloc.id, m.length), existing.client, alloc.client, description))
}
}
}
@@ -288,9 +292,31 @@ func (m *monitor) monitor(evalID string, allowPrefix bool) int {
m.update(state)

switch eval.Status {
case structs.EvalStatusComplete, structs.EvalStatusFailed:
m.ui.Info(fmt.Sprintf("Evaluation %q finished with status %q",
limit(eval.ID, m.length), eval.Status))
case structs.EvalStatusComplete, structs.EvalStatusFailed, structs.EvalStatusCancelled:
if len(eval.FailedTGAllocs) == 0 {
m.ui.Info(fmt.Sprintf("Evaluation %q finished with status %q",
limit(eval.ID, m.length), eval.Status))
} else {
// There were failures making the allocations
schedFailure = true
m.ui.Info(fmt.Sprintf("Evaluation %q finished with status %q but failed to place all allocations:",
limit(eval.ID, m.length), eval.Status))

// Print the failures per task group
for tg, metrics := range eval.FailedTGAllocs {
noun := "allocation"
if metrics.CoalescedFailures > 0 {
noun += "s"
}
m.ui.Output(fmt.Sprintf("Task Group %q (failed to place %d %s):", tg, metrics.CoalescedFailures+1, noun))
dumpAllocMetrics(m.ui, metrics, false)
}

if eval.BlockedEval != "" {
m.ui.Output(fmt.Sprintf("Evaluation %q waiting for additional capacity to place remainder",
limit(eval.BlockedEval, m.length)))
}
}
default:
// Wait for the next update
time.Sleep(updateWait)
@@ -332,41 +358,46 @@ func dumpAllocStatus(ui cli.Ui, alloc *api.Allocation, length int) {
ui.Output(fmt.Sprintf("Allocation %q status %q (%d/%d nodes filtered)",
limit(alloc.ID, length), alloc.ClientStatus,
alloc.Metrics.NodesFiltered, alloc.Metrics.NodesEvaluated))
dumpAllocMetrics(ui, alloc.Metrics, true)
}

func dumpAllocMetrics(ui cli.Ui, metrics *api.AllocationMetric, scores bool) {
// Print a helpful message if we have an eligibility problem
if alloc.Metrics.NodesEvaluated == 0 {
if metrics.NodesEvaluated == 0 {
ui.Output(" * No nodes were eligible for evaluation")
}

// Print a helpful message if the user has asked for a DC that has no
// available nodes.
for dc, available := range alloc.Metrics.NodesAvailable {
for dc, available := range metrics.NodesAvailable {
if available == 0 {
ui.Output(fmt.Sprintf(" * No nodes are available in datacenter %q", dc))
}
}

// Print filter info
for class, num := range alloc.Metrics.ClassFiltered {
for class, num := range metrics.ClassFiltered {
ui.Output(fmt.Sprintf(" * Class %q filtered %d nodes", class, num))
}
for cs, num := range alloc.Metrics.ConstraintFiltered {
for cs, num := range metrics.ConstraintFiltered {
ui.Output(fmt.Sprintf(" * Constraint %q filtered %d nodes", cs, num))
}

// Print exhaustion info
if ne := alloc.Metrics.NodesExhausted; ne > 0 {
if ne := metrics.NodesExhausted; ne > 0 {
ui.Output(fmt.Sprintf(" * Resources exhausted on %d nodes", ne))
}
for class, num := range alloc.Metrics.ClassExhausted {
for class, num := range metrics.ClassExhausted {
ui.Output(fmt.Sprintf(" * Class %q exhausted on %d nodes", class, num))
}
for dim, num := range alloc.Metrics.DimensionExhausted {
for dim, num := range metrics.DimensionExhausted {
ui.Output(fmt.Sprintf(" * Dimension %q exhausted on %d nodes", dim, num))
}

// Print scores
for name, score := range alloc.Metrics.Scores {
ui.Output(fmt.Sprintf(" * Score %q = %f", name, score))
if scores {
for name, score := range metrics.Scores {
ui.Output(fmt.Sprintf(" * Score %q = %f", name, score))
}
}
}
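
The dump helpers are split so the evaluation-failure path can reuse the metric formatting without per-node scores. A small test-style sketch follows, assuming it sits in the command package next to monitor.go; the metric values are hypothetical, and the field names match api.AllocationMetric as used above.

package command

import (
	"os"
	"testing"

	"github.com/hashicorp/nomad/api"
	"github.com/mitchellh/cli"
)

func TestDumpAllocMetrics_sketch(t *testing.T) {
	ui := &cli.BasicUi{Writer: os.Stdout, ErrorWriter: os.Stderr}

	// Hypothetical metrics for illustration only.
	metrics := &api.AllocationMetric{
		NodesEvaluated:     3,
		NodesAvailable:     map[string]int{"dc1": 0},
		ConstraintFiltered: map[string]int{"${attr.kernel.name} = linux": 2},
		NodesExhausted:     1,
		Scores:             map[string]float64{"node-1.binpack": 4.5},
	}

	// scores=false mirrors the evaluation-failure output; dumpAllocStatus
	// still passes true so concrete allocations show their scores.
	dumpAllocMetrics(ui, metrics, false)
}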
3 changes: 0 additions & 3 deletions nomad/plan_apply.go
@@ -124,7 +124,6 @@ func (s *Server) applyPlan(job *structs.Job, result *structs.PlanResult, snap *s
// are multiple updates per node
minUpdates := len(result.NodeUpdate)
minUpdates += len(result.NodeAllocation)
minUpdates += len(result.FailedAllocs)

// Setup the update request
req := structs.AllocUpdateRequest{
@@ -137,7 +136,6 @@ func (s *Server) applyPlan(job *structs.Job, result *structs.PlanResult, snap *s
for _, allocList := range result.NodeAllocation {
req.Alloc = append(req.Alloc, allocList...)
}
req.Alloc = append(req.Alloc, result.FailedAllocs...)

// Set the time the alloc was applied for the first time. This can be used
// to approximate the scheduling time.
@@ -200,7 +198,6 @@ func evaluatePlan(pool *EvaluatePool, snap *state.StateSnapshot, plan *structs.P
result := &structs.PlanResult{
NodeUpdate: make(map[string][]*structs.Allocation),
NodeAllocation: make(map[string][]*structs.Allocation),
FailedAllocs: plan.FailedAllocs,
}

// Collect all the nodeIDs
17 changes: 2 additions & 15 deletions nomad/plan_apply_test.go
@@ -51,12 +51,10 @@ func TestPlanApply_applyPlan(t *testing.T) {

// Register alloc
alloc := mock.Alloc()
allocFail := mock.Alloc()
plan := &structs.PlanResult{
NodeAllocation: map[string][]*structs.Allocation{
node.ID: []*structs.Allocation{alloc},
},
FailedAllocs: []*structs.Allocation{allocFail},
}

// Snapshot the state
@@ -94,15 +92,6 @@ func TestPlanApply_applyPlan(t *testing.T) {
t.Fatalf("missing alloc")
}

// Lookup the allocation
out, err = s1.fsm.State().AllocByID(allocFail.ID)
if err != nil {
t.Fatalf("err: %v", err)
}
if out == nil {
t.Fatalf("missing alloc")
}

// Evict alloc, Register alloc2
allocEvict := new(structs.Allocation)
*allocEvict = *alloc
@@ -178,12 +167,10 @@ func TestPlanApply_EvalPlan_Simple(t *testing.T) {
snap, _ := state.Snapshot()

alloc := mock.Alloc()
allocFail := mock.Alloc()
plan := &structs.Plan{
NodeAllocation: map[string][]*structs.Allocation{
node.ID: []*structs.Allocation{alloc},
},
FailedAllocs: []*structs.Allocation{allocFail},
}

pool := NewEvaluatePool(workerPoolSize, workerPoolBufferSize)
@@ -196,8 +183,8 @@ func TestPlanApply_EvalPlan_Simple(t *testing.T) {
if result == nil {
t.Fatalf("missing result")
}
if !reflect.DeepEqual(result.FailedAllocs, plan.FailedAllocs) {
t.Fatalf("missing failed allocs")
if !reflect.DeepEqual(result.NodeAllocation, plan.NodeAllocation) {
t.Fatalf("incorrect node allocations")
}
}

51 changes: 33 additions & 18 deletions nomad/structs/structs.go
@@ -2649,6 +2649,16 @@ type Evaluation struct {
// This is used to support rolling upgrades, where we need a chain of evaluations.
PreviousEval string

// BlockedEval is the evaluation ID for a created blocked eval. A
// blocked eval will be created if all allocations could not be placed due
// to constraints or lacking resources.
BlockedEval string

// FailedTGAllocs are task groups which have allocations that could not be
// made, but the metrics are persisted so that the user can use the feedback
// to determine the cause.
FailedTGAllocs map[string]*AllocMetric

// ClassEligibility tracks computed node classes that have been explicitly
// marked as eligible or ineligible.
ClassEligibility map[string]bool
@@ -2687,6 +2697,25 @@ func (e *Evaluation) Copy() *Evaluation {
}
ne := new(Evaluation)
*ne = *e

// Copy ClassEligibility
if e.ClassEligibility != nil {
classes := make(map[string]bool, len(e.ClassEligibility))
for class, elig := range e.ClassEligibility {
classes[class] = elig
}
ne.ClassEligibility = classes
}

// Copy FailedTGAllocs
if e.FailedTGAllocs != nil {
failedTGs := make(map[string]*AllocMetric, len(e.FailedTGAllocs))
for tg, metric := range e.FailedTGAllocs {
failedTGs[tg] = metric.Copy()
}
ne.FailedTGAllocs = failedTGs
}

return ne
}
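
Since the evaluation now carries per-task-group metrics, Copy deep-copies both maps so copies do not share mutable state with the original. A sketch of a unit test, assuming it lives in the structs package:

package structs

import "testing"

func TestEvaluation_Copy_FailedTGAllocs(t *testing.T) {
	e := &Evaluation{
		FailedTGAllocs: map[string]*AllocMetric{
			"web": {CoalescedFailures: 2},
		},
	}

	c := e.Copy()

	// Mutating the copy's metrics must not leak back into the original.
	c.FailedTGAllocs["web"].CoalescedFailures = 9
	if e.FailedTGAllocs["web"].CoalescedFailures != 2 {
		t.Fatalf("copy shares AllocMetric with the original")
	}
}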

@@ -2747,10 +2776,10 @@ func (e *Evaluation) NextRollingEval(wait time.Duration) *Evaluation {
}
}

// BlockedEval creates a blocked evaluation to followup this eval to place any
// CreateBlockedEval creates a blocked evaluation to followup this eval to place any
// failed allocations. It takes the classes marked explicitly eligible or
// ineligible and whether the job has escaped computed node classes.
func (e *Evaluation) BlockedEval(classEligibility map[string]bool, escaped bool) *Evaluation {
func (e *Evaluation) CreateBlockedEval(classEligibility map[string]bool, escaped bool) *Evaluation {
return &Evaluation{
ID: GenerateUUID(),
Priority: e.Priority,
@@ -2801,11 +2830,6 @@ type Plan struct {
// The evicts must be considered prior to the allocations.
NodeAllocation map[string][]*Allocation

// FailedAllocs are allocations that could not be made,
// but are persisted so that the user can use the feedback
// to determine the cause.
FailedAllocs []*Allocation

// Annotations contains annotations by the scheduler to be used by operators
// to understand the decisions made by the scheduler.
Annotations *PlanAnnotations
@@ -2853,13 +2877,9 @@ func (p *Plan) AppendAlloc(alloc *Allocation) {
p.NodeAllocation[node] = append(existing, alloc)
}

func (p *Plan) AppendFailed(alloc *Allocation) {
p.FailedAllocs = append(p.FailedAllocs, alloc)
}

// IsNoOp checks if this plan would do nothing
func (p *Plan) IsNoOp() bool {
return len(p.NodeUpdate) == 0 && len(p.NodeAllocation) == 0 && len(p.FailedAllocs) == 0
return len(p.NodeUpdate) == 0 && len(p.NodeAllocation) == 0
}

// PlanResult is the result of a plan submitted to the leader.
@@ -2870,11 +2890,6 @@ type PlanResult struct {
// NodeAllocation contains all the allocations that were committed.
NodeAllocation map[string][]*Allocation

// FailedAllocs are allocations that could not be made,
// but are persisted so that the user can use the feedback
// to determine the cause.
FailedAllocs []*Allocation

// RefreshIndex is the index the worker should refresh state up to.
// This allows all evictions and allocations to be materialized.
// If any allocations were rejected due to stale data (node state,
@@ -2888,7 +2903,7 @@ type PlanResult struct {

// IsNoOp checks if this plan result would do nothing
func (p *PlanResult) IsNoOp() bool {
return len(p.NodeUpdate) == 0 && len(p.NodeAllocation) == 0 && len(p.FailedAllocs) == 0
return len(p.NodeUpdate) == 0 && len(p.NodeAllocation) == 0
}

// FullCommit is used to check if all the allocations in a plan
