Merge pull request #1188 from hashicorp/f-no-failed-allocs
Failed Allocation Metrics stored in Evaluation
dadgar committed May 25, 2016
2 parents 3590f8a + ed94128 commit 6dc9b7e
Showing 11 changed files with 258 additions and 167 deletions.
2 changes: 2 additions & 0 deletions api/evaluations.go
@@ -67,6 +67,8 @@ type Evaluation struct {
Wait time.Duration
NextEval string
PreviousEval string
BlockedEval string
FailedTGAllocs map[string]*AllocationMetric
CreateIndex uint64
ModifyIndex uint64
}
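
The two new fields expose placement failures on the evaluation itself rather than as synthetic allocations. Below is a minimal sketch of reading them through the Go api client; the client setup and the evaluation ID are assumed for illustration, while the field names come from this change.

package main

import (
	"fmt"
	"log"

	"github.com/hashicorp/nomad/api"
)

func main() {
	client, err := api.NewClient(api.DefaultConfig())
	if err != nil {
		log.Fatal(err)
	}

	// evalID is assumed to be known, e.g. returned from a job registration.
	evalID := "..."
	eval, _, err := client.Evaluations().Info(evalID, nil)
	if err != nil {
		log.Fatal(err)
	}

	// Each entry describes a task group whose allocations could not be placed.
	for tg, metrics := range eval.FailedTGAllocs {
		fmt.Printf("task group %q failed to place %d allocation(s)\n",
			tg, metrics.CoalescedFailures+1)
	}

	// A non-empty BlockedEval means a follow-up eval is waiting for capacity.
	if eval.BlockedEval != "" {
		fmt.Printf("blocked eval %q will retry placement\n", eval.BlockedEval)
	}
}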
59 changes: 45 additions & 14 deletions command/monitor.go
@@ -147,10 +147,14 @@ func (m *monitor) update(update *evalState) {
} else {
switch {
case existing.client != alloc.client:
description := ""
if alloc.clientDesc != "" {
description = fmt.Sprintf(" (%s)", alloc.clientDesc)
}
// Allocation status has changed
m.ui.Output(fmt.Sprintf(
"Allocation %q status changed: %q -> %q (%s)",
limit(alloc.id, m.length), existing.client, alloc.client, alloc.clientDesc))
"Allocation %q status changed: %q -> %q%s",
limit(alloc.id, m.length), existing.client, alloc.client, description))
}
}
}
@@ -288,9 +292,31 @@ func (m *monitor) monitor(evalID string, allowPrefix bool) int {
m.update(state)

switch eval.Status {
case structs.EvalStatusComplete, structs.EvalStatusFailed:
m.ui.Info(fmt.Sprintf("Evaluation %q finished with status %q",
limit(eval.ID, m.length), eval.Status))
case structs.EvalStatusComplete, structs.EvalStatusFailed, structs.EvalStatusCancelled:
if len(eval.FailedTGAllocs) == 0 {
m.ui.Info(fmt.Sprintf("Evaluation %q finished with status %q",
limit(eval.ID, m.length), eval.Status))
} else {
// There were failures making the allocations
schedFailure = true
m.ui.Info(fmt.Sprintf("Evaluation %q finished with status %q but failed to place all allocations:",
limit(eval.ID, m.length), eval.Status))

// Print the failures per task group
for tg, metrics := range eval.FailedTGAllocs {
noun := "allocation"
if metrics.CoalescedFailures > 0 {
noun += "s"
}
m.ui.Output(fmt.Sprintf("Task Group %q (failed to place %d %s):", tg, metrics.CoalescedFailures+1, noun))
dumpAllocMetrics(m.ui, metrics, false)
}

if eval.BlockedEval != "" {
m.ui.Output(fmt.Sprintf("Evaluation %q waiting for additional capacity to place remainder",
limit(eval.BlockedEval, m.length)))
}
}
default:
// Wait for the next update
time.Sleep(updateWait)
@@ -332,41 +358,46 @@ func dumpAllocStatus(ui cli.Ui, alloc *api.Allocation, length int) {
ui.Output(fmt.Sprintf("Allocation %q status %q (%d/%d nodes filtered)",
limit(alloc.ID, length), alloc.ClientStatus,
alloc.Metrics.NodesFiltered, alloc.Metrics.NodesEvaluated))
dumpAllocMetrics(ui, alloc.Metrics, true)
}

func dumpAllocMetrics(ui cli.Ui, metrics *api.AllocationMetric, scores bool) {
// Print a helpful message if we have an eligibility problem
if alloc.Metrics.NodesEvaluated == 0 {
if metrics.NodesEvaluated == 0 {
ui.Output(" * No nodes were eligible for evaluation")
}

// Print a helpful message if the user has asked for a DC that has no
// available nodes.
for dc, available := range alloc.Metrics.NodesAvailable {
for dc, available := range metrics.NodesAvailable {
if available == 0 {
ui.Output(fmt.Sprintf(" * No nodes are available in datacenter %q", dc))
}
}

// Print filter info
for class, num := range alloc.Metrics.ClassFiltered {
for class, num := range metrics.ClassFiltered {
ui.Output(fmt.Sprintf(" * Class %q filtered %d nodes", class, num))
}
for cs, num := range alloc.Metrics.ConstraintFiltered {
for cs, num := range metrics.ConstraintFiltered {
ui.Output(fmt.Sprintf(" * Constraint %q filtered %d nodes", cs, num))
}

// Print exhaustion info
if ne := alloc.Metrics.NodesExhausted; ne > 0 {
if ne := metrics.NodesExhausted; ne > 0 {
ui.Output(fmt.Sprintf(" * Resources exhausted on %d nodes", ne))
}
for class, num := range alloc.Metrics.ClassExhausted {
for class, num := range metrics.ClassExhausted {
ui.Output(fmt.Sprintf(" * Class %q exhausted on %d nodes", class, num))
}
for dim, num := range alloc.Metrics.DimensionExhausted {
for dim, num := range metrics.DimensionExhausted {
ui.Output(fmt.Sprintf(" * Dimension %q exhausted on %d nodes", dim, num))
}

// Print scores
for name, score := range alloc.Metrics.Scores {
ui.Output(fmt.Sprintf(" * Score %q = %f", name, score))
if scores {
for name, score := range metrics.Scores {
ui.Output(fmt.Sprintf(" * Score %q = %f", name, score))
}
}
}
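
The dump helpers are split so the evaluation-failure path can reuse the metric formatting without per-node scores. A small test-style sketch follows, assuming it sits in the command package next to monitor.go; the metric values are hypothetical, and the field names match api.AllocationMetric as used above.

package command

import (
	"os"
	"testing"

	"github.com/hashicorp/nomad/api"
	"github.com/mitchellh/cli"
)

func TestDumpAllocMetrics_sketch(t *testing.T) {
	ui := &cli.BasicUi{Writer: os.Stdout, ErrorWriter: os.Stderr}

	// Hypothetical metrics for illustration only.
	metrics := &api.AllocationMetric{
		NodesEvaluated:     3,
		NodesAvailable:     map[string]int{"dc1": 0},
		ConstraintFiltered: map[string]int{"${attr.kernel.name} = linux": 2},
		NodesExhausted:     1,
		Scores:             map[string]float64{"node-1.binpack": 4.5},
	}

	// scores=false mirrors the evaluation-failure output; dumpAllocStatus
	// still passes true so concrete allocations show their scores.
	dumpAllocMetrics(ui, metrics, false)
}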
3 changes: 0 additions & 3 deletions nomad/plan_apply.go
@@ -124,7 +124,6 @@ func (s *Server) applyPlan(job *structs.Job, result *structs.PlanResult, snap *s
// are multiple updates per node
minUpdates := len(result.NodeUpdate)
minUpdates += len(result.NodeAllocation)
minUpdates += len(result.FailedAllocs)

// Setup the update request
req := structs.AllocUpdateRequest{
@@ -137,7 +136,6 @@ func (s *Server) applyPlan(job *structs.Job, result *structs.PlanResult, snap *s
for _, allocList := range result.NodeAllocation {
req.Alloc = append(req.Alloc, allocList...)
}
req.Alloc = append(req.Alloc, result.FailedAllocs...)

// Set the time the alloc was applied for the first time. This can be used
// to approximate the scheduling time.
@@ -200,7 +198,6 @@ func evaluatePlan(pool *EvaluatePool, snap *state.StateSnapshot, plan *structs.P
result := &structs.PlanResult{
NodeUpdate: make(map[string][]*structs.Allocation),
NodeAllocation: make(map[string][]*structs.Allocation),
FailedAllocs: plan.FailedAllocs,
}

// Collect all the nodeIDs
17 changes: 2 additions & 15 deletions nomad/plan_apply_test.go
@@ -51,12 +51,10 @@ func TestPlanApply_applyPlan(t *testing.T) {

// Register alloc
alloc := mock.Alloc()
allocFail := mock.Alloc()
plan := &structs.PlanResult{
NodeAllocation: map[string][]*structs.Allocation{
node.ID: []*structs.Allocation{alloc},
},
FailedAllocs: []*structs.Allocation{allocFail},
}

// Snapshot the state
@@ -94,15 +92,6 @@ func TestPlanApply_applyPlan(t *testing.T) {
t.Fatalf("missing alloc")
}

// Lookup the allocation
out, err = s1.fsm.State().AllocByID(allocFail.ID)
if err != nil {
t.Fatalf("err: %v", err)
}
if out == nil {
t.Fatalf("missing alloc")
}

// Evict alloc, Register alloc2
allocEvict := new(structs.Allocation)
*allocEvict = *alloc
@@ -178,12 +167,10 @@ func TestPlanApply_EvalPlan_Simple(t *testing.T) {
snap, _ := state.Snapshot()

alloc := mock.Alloc()
allocFail := mock.Alloc()
plan := &structs.Plan{
NodeAllocation: map[string][]*structs.Allocation{
node.ID: []*structs.Allocation{alloc},
},
FailedAllocs: []*structs.Allocation{allocFail},
}

pool := NewEvaluatePool(workerPoolSize, workerPoolBufferSize)
@@ -196,8 +183,8 @@ func TestPlanApply_EvalPlan_Simple(t *testing.T) {
if result == nil {
t.Fatalf("missing result")
}
if !reflect.DeepEqual(result.FailedAllocs, plan.FailedAllocs) {
t.Fatalf("missing failed allocs")
if !reflect.DeepEqual(result.NodeAllocation, plan.NodeAllocation) {
t.Fatalf("incorrect node allocations")
}
}

51 changes: 33 additions & 18 deletions nomad/structs/structs.go
@@ -2649,6 +2649,16 @@ type Evaluation struct {
// This is used to support rolling upgrades, where we need a chain of evaluations.
PreviousEval string

// BlockedEval is the evaluation ID for a created blocked eval. A
// blocked eval will be created if all allocations could not be placed due
// to constraints or lacking resources.
BlockedEval string

// FailedTGAllocs are task groups which have allocations that could not be
// made, but the metrics are persisted so that the user can use the feedback
// to determine the cause.
FailedTGAllocs map[string]*AllocMetric

// ClassEligibility tracks computed node classes that have been explicitly
// marked as eligible or ineligible.
ClassEligibility map[string]bool
@@ -2687,6 +2697,25 @@ func (e *Evaluation) Copy() *Evaluation {
}
ne := new(Evaluation)
*ne = *e

// Copy ClassEligibility
if e.ClassEligibility != nil {
classes := make(map[string]bool, len(e.ClassEligibility))
for class, elig := range e.ClassEligibility {
classes[class] = elig
}
ne.ClassEligibility = classes
}

// Copy FailedTGAllocs
if e.FailedTGAllocs != nil {
failedTGs := make(map[string]*AllocMetric, len(e.FailedTGAllocs))
for tg, metric := range e.FailedTGAllocs {
failedTGs[tg] = metric.Copy()
}
ne.FailedTGAllocs = failedTGs
}

return ne
}
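
Since the evaluation now carries per-task-group metrics, Copy deep-copies both maps so copies do not share mutable state with the original. A sketch of a unit test, assuming it lives in the structs package:

package structs

import "testing"

func TestEvaluation_Copy_FailedTGAllocs(t *testing.T) {
	e := &Evaluation{
		FailedTGAllocs: map[string]*AllocMetric{
			"web": {CoalescedFailures: 2},
		},
	}

	c := e.Copy()

	// Mutating the copy's metrics must not leak back into the original.
	c.FailedTGAllocs["web"].CoalescedFailures = 9
	if e.FailedTGAllocs["web"].CoalescedFailures != 2 {
		t.Fatalf("copy shares AllocMetric with the original")
	}
}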

@@ -2747,10 +2776,10 @@ func (e *Evaluation) NextRollingEval(wait time.Duration) *Evaluation {
}
}

// BlockedEval creates a blocked evaluation to followup this eval to place any
// CreateBlockedEval creates a blocked evaluation to followup this eval to place any
// failed allocations. It takes the classes marked explicitly eligible or
// ineligible and whether the job has escaped computed node classes.
func (e *Evaluation) BlockedEval(classEligibility map[string]bool, escaped bool) *Evaluation {
func (e *Evaluation) CreateBlockedEval(classEligibility map[string]bool, escaped bool) *Evaluation {
return &Evaluation{
ID: GenerateUUID(),
Priority: e.Priority,
@@ -2801,11 +2830,6 @@ type Plan struct {
// The evicts must be considered prior to the allocations.
NodeAllocation map[string][]*Allocation

// FailedAllocs are allocations that could not be made,
// but are persisted so that the user can use the feedback
// to determine the cause.
FailedAllocs []*Allocation

// Annotations contains annotations by the scheduler to be used by operators
// to understand the decisions made by the scheduler.
Annotations *PlanAnnotations
@@ -2853,13 +2877,9 @@ func (p *Plan) AppendAlloc(alloc *Allocation) {
p.NodeAllocation[node] = append(existing, alloc)
}

func (p *Plan) AppendFailed(alloc *Allocation) {
p.FailedAllocs = append(p.FailedAllocs, alloc)
}

// IsNoOp checks if this plan would do nothing
func (p *Plan) IsNoOp() bool {
return len(p.NodeUpdate) == 0 && len(p.NodeAllocation) == 0 && len(p.FailedAllocs) == 0
return len(p.NodeUpdate) == 0 && len(p.NodeAllocation) == 0
}

// PlanResult is the result of a plan submitted to the leader.
@@ -2870,11 +2890,6 @@ type PlanResult struct {
// NodeAllocation contains all the allocations that were committed.
NodeAllocation map[string][]*Allocation

// FailedAllocs are allocations that could not be made,
// but are persisted so that the user can use the feedback
// to determine the cause.
FailedAllocs []*Allocation

// RefreshIndex is the index the worker should refresh state up to.
// This allows all evictions and allocations to be materialized.
// If any allocations were rejected due to stale data (node state,
@@ -2888,7 +2903,7 @@ type PlanResult struct {

// IsNoOp checks if this plan result would do nothing
func (p *PlanResult) IsNoOp() bool {
return len(p.NodeUpdate) == 0 && len(p.NodeAllocation) == 0 && len(p.FailedAllocs) == 0
return len(p.NodeUpdate) == 0 && len(p.NodeAllocation) == 0
}

// FullCommit is used to check if all the allocations in a plan
