Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Failed Allocation Metrics stored in Evaluation #1188

Merged
merged 6 commits into from
May 25, 2016
Merged
Show file tree
Hide file tree
Changes from 4 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
32 changes: 17 additions & 15 deletions api/evaluations.go
Original file line number Diff line number Diff line change
Expand Up @@ -54,21 +54,23 @@ func (e *Evaluations) Allocations(evalID string, q *QueryOptions) ([]*Allocation

// Evaluation is used to serialize an evaluation.
type Evaluation struct {
ID string
Priority int
Type string
TriggeredBy string
JobID string
JobModifyIndex uint64
NodeID string
NodeModifyIndex uint64
Status string
StatusDescription string
Wait time.Duration
NextEval string
PreviousEval string
CreateIndex uint64
ModifyIndex uint64
ID string
Priority int
Type string
TriggeredBy string
JobID string
JobModifyIndex uint64
NodeID string
NodeModifyIndex uint64
Status string
StatusDescription string
Wait time.Duration
NextEval string
PreviousEval string
SpawnedBlockedEval string
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Renamed to BlockedEval

FailedTGAllocs map[string]*AllocationMetric
CreateIndex uint64
ModifyIndex uint64
}

// EvalIndexSort is a wrapper to sort evaluations by CreateIndex.
Expand Down
50 changes: 38 additions & 12 deletions command/monitor.go
Original file line number Diff line number Diff line change
Expand Up @@ -288,9 +288,30 @@ func (m *monitor) monitor(evalID string, allowPrefix bool) int {
m.update(state)

switch eval.Status {
case structs.EvalStatusComplete, structs.EvalStatusFailed:
m.ui.Info(fmt.Sprintf("Evaluation %q finished with status %q",
limit(eval.ID, m.length), eval.Status))
case structs.EvalStatusComplete, structs.EvalStatusFailed, structs.EvalStatusCancelled:
if len(eval.FailedTGAllocs) == 0 {
m.ui.Info(fmt.Sprintf("Evaluation %q finished with status %q",
limit(eval.ID, m.length), eval.Status))
} else {
// There were failures making the allocations
m.ui.Info(fmt.Sprintf("Evaluation %q finished with status %q but failed to place all allocations:",
limit(eval.ID, m.length), eval.Status))

// Print the failures per task group
for tg, metrics := range eval.FailedTGAllocs {
noun := "allocation"
if metrics.CoalescedFailures > 0 {
noun += "s"
}
m.ui.Output(fmt.Sprintf("Task Group %q (failed to place %d %s):", tg, metrics.CoalescedFailures+1, noun))
dumpAllocMetrics(m.ui, metrics, false)
}

if eval.SpawnedBlockedEval != "" {
m.ui.Output(fmt.Sprintf("Spawned follow up blocked evaluation %q to place remainder",
limit(eval.SpawnedBlockedEval, m.length)))
}
}
default:
// Wait for the next update
time.Sleep(updateWait)
Expand Down Expand Up @@ -332,41 +353,46 @@ func dumpAllocStatus(ui cli.Ui, alloc *api.Allocation, length int) {
ui.Output(fmt.Sprintf("Allocation %q status %q (%d/%d nodes filtered)",
limit(alloc.ID, length), alloc.ClientStatus,
alloc.Metrics.NodesFiltered, alloc.Metrics.NodesEvaluated))
dumpAllocMetrics(ui, alloc.Metrics, true)
}

func dumpAllocMetrics(ui cli.Ui, metrics *api.AllocationMetric, scores bool) {
// Print a helpful message if we have an eligibility problem
if alloc.Metrics.NodesEvaluated == 0 {
if metrics.NodesEvaluated == 0 {
ui.Output(" * No nodes were eligible for evaluation")
}

// Print a helpful message if the user has asked for a DC that has no
// available nodes.
for dc, available := range alloc.Metrics.NodesAvailable {
for dc, available := range metrics.NodesAvailable {
if available == 0 {
ui.Output(fmt.Sprintf(" * No nodes are available in datacenter %q", dc))
}
}

// Print filter info
for class, num := range alloc.Metrics.ClassFiltered {
for class, num := range metrics.ClassFiltered {
ui.Output(fmt.Sprintf(" * Class %q filtered %d nodes", class, num))
}
for cs, num := range alloc.Metrics.ConstraintFiltered {
for cs, num := range metrics.ConstraintFiltered {
ui.Output(fmt.Sprintf(" * Constraint %q filtered %d nodes", cs, num))
}

// Print exhaustion info
if ne := alloc.Metrics.NodesExhausted; ne > 0 {
if ne := metrics.NodesExhausted; ne > 0 {
ui.Output(fmt.Sprintf(" * Resources exhausted on %d nodes", ne))
}
for class, num := range alloc.Metrics.ClassExhausted {
for class, num := range metrics.ClassExhausted {
ui.Output(fmt.Sprintf(" * Class %q exhausted on %d nodes", class, num))
}
for dim, num := range alloc.Metrics.DimensionExhausted {
for dim, num := range metrics.DimensionExhausted {
ui.Output(fmt.Sprintf(" * Dimension %q exhausted on %d nodes", dim, num))
}

// Print scores
for name, score := range alloc.Metrics.Scores {
ui.Output(fmt.Sprintf(" * Score %q = %f", name, score))
if scores {
for name, score := range metrics.Scores {
ui.Output(fmt.Sprintf(" * Score %q = %f", name, score))
}
}
}
3 changes: 0 additions & 3 deletions nomad/plan_apply.go
Original file line number Diff line number Diff line change
Expand Up @@ -124,7 +124,6 @@ func (s *Server) applyPlan(job *structs.Job, result *structs.PlanResult, snap *s
// are multiple updates per node
minUpdates := len(result.NodeUpdate)
minUpdates += len(result.NodeAllocation)
minUpdates += len(result.FailedAllocs)

// Setup the update request
req := structs.AllocUpdateRequest{
Expand All @@ -137,7 +136,6 @@ func (s *Server) applyPlan(job *structs.Job, result *structs.PlanResult, snap *s
for _, allocList := range result.NodeAllocation {
req.Alloc = append(req.Alloc, allocList...)
}
req.Alloc = append(req.Alloc, result.FailedAllocs...)

// Set the time the alloc was applied for the first time. This can be used
// to approximate the scheduling time.
Expand Down Expand Up @@ -200,7 +198,6 @@ func evaluatePlan(pool *EvaluatePool, snap *state.StateSnapshot, plan *structs.P
result := &structs.PlanResult{
NodeUpdate: make(map[string][]*structs.Allocation),
NodeAllocation: make(map[string][]*structs.Allocation),
FailedAllocs: plan.FailedAllocs,
}

// Collect all the nodeIDs
Expand Down
17 changes: 2 additions & 15 deletions nomad/plan_apply_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -51,12 +51,10 @@ func TestPlanApply_applyPlan(t *testing.T) {

// Register alloc
alloc := mock.Alloc()
allocFail := mock.Alloc()
plan := &structs.PlanResult{
NodeAllocation: map[string][]*structs.Allocation{
node.ID: []*structs.Allocation{alloc},
},
FailedAllocs: []*structs.Allocation{allocFail},
}

// Snapshot the state
Expand Down Expand Up @@ -94,15 +92,6 @@ func TestPlanApply_applyPlan(t *testing.T) {
t.Fatalf("missing alloc")
}

// Lookup the allocation
out, err = s1.fsm.State().AllocByID(allocFail.ID)
if err != nil {
t.Fatalf("err: %v", err)
}
if out == nil {
t.Fatalf("missing alloc")
}

// Evict alloc, Register alloc2
allocEvict := new(structs.Allocation)
*allocEvict = *alloc
Expand Down Expand Up @@ -178,12 +167,10 @@ func TestPlanApply_EvalPlan_Simple(t *testing.T) {
snap, _ := state.Snapshot()

alloc := mock.Alloc()
allocFail := mock.Alloc()
plan := &structs.Plan{
NodeAllocation: map[string][]*structs.Allocation{
node.ID: []*structs.Allocation{alloc},
},
FailedAllocs: []*structs.Allocation{allocFail},
}

pool := NewEvaluatePool(workerPoolSize, workerPoolBufferSize)
Expand All @@ -196,8 +183,8 @@ func TestPlanApply_EvalPlan_Simple(t *testing.T) {
if result == nil {
t.Fatalf("missing result")
}
if !reflect.DeepEqual(result.FailedAllocs, plan.FailedAllocs) {
t.Fatalf("missing failed allocs")
if !reflect.DeepEqual(result.NodeAllocation, plan.NodeAllocation) {
t.Fatalf("incorrect node allocations")
}
}

Expand Down
47 changes: 31 additions & 16 deletions nomad/structs/structs.go
Original file line number Diff line number Diff line change
Expand Up @@ -2617,6 +2617,16 @@ type Evaluation struct {
// This is used to support rolling upgrades, where we need a chain of evaluations.
PreviousEval string

// SpawnedBlockedEval is the evaluation ID for a created blocked eval. A
// blocked eval will be created if not all allocations could be placed due
// to constraints or lacking resources.
SpawnedBlockedEval string

// FailedTGAllocs maps task group names to the metrics of allocations that
// could not be made. The metrics are persisted so that the user can use the
// feedback to determine the cause.
FailedTGAllocs map[string]*AllocMetric

// ClassEligibility tracks computed node classes that have been explicitly
// marked as eligible or ineligible.
ClassEligibility map[string]bool
Expand Down Expand Up @@ -2655,6 +2665,25 @@ func (e *Evaluation) Copy() *Evaluation {
}
ne := new(Evaluation)
*ne = *e

// Copy ClassEligibility
if e.ClassEligibility != nil {
classes := make(map[string]bool, len(e.ClassEligibility))
for class, elig := range e.ClassEligibility {
classes[class] = elig
}
ne.ClassEligibility = classes
}

// Copy FailedTGAllocs
if e.FailedTGAllocs != nil {
failedTGs := make(map[string]*AllocMetric, len(e.FailedTGAllocs))
for tg, metric := range e.FailedTGAllocs {
failedTGs[tg] = metric.Copy()
}
ne.FailedTGAllocs = failedTGs
}

return ne
}

Expand Down Expand Up @@ -2769,11 +2798,6 @@ type Plan struct {
// The evicts must be considered prior to the allocations.
NodeAllocation map[string][]*Allocation

// FailedAllocs are allocations that could not be made,
// but are persisted so that the user can use the feedback
// to determine the cause.
FailedAllocs []*Allocation

// Annotations contains annotations by the scheduler to be used by operators
// to understand the decisions made by the scheduler.
Annotations *PlanAnnotations
Expand Down Expand Up @@ -2821,13 +2845,9 @@ func (p *Plan) AppendAlloc(alloc *Allocation) {
p.NodeAllocation[node] = append(existing, alloc)
}

func (p *Plan) AppendFailed(alloc *Allocation) {
p.FailedAllocs = append(p.FailedAllocs, alloc)
}

// IsNoOp checks if this plan would do nothing
func (p *Plan) IsNoOp() bool {
return len(p.NodeUpdate) == 0 && len(p.NodeAllocation) == 0 && len(p.FailedAllocs) == 0
return len(p.NodeUpdate) == 0 && len(p.NodeAllocation) == 0
}

// PlanResult is the result of a plan submitted to the leader.
Expand All @@ -2838,11 +2858,6 @@ type PlanResult struct {
// NodeAllocation contains all the allocations that were committed.
NodeAllocation map[string][]*Allocation

// FailedAllocs are allocations that could not be made,
// but are persisted so that the user can use the feedback
// to determine the cause.
FailedAllocs []*Allocation

// RefreshIndex is the index the worker should refresh state up to.
// This allows all evictions and allocations to be materialized.
// If any allocations were rejected due to stale data (node state,
Expand All @@ -2856,7 +2871,7 @@ type PlanResult struct {

// IsNoOp checks if this plan result would do nothing
func (p *PlanResult) IsNoOp() bool {
return len(p.NodeUpdate) == 0 && len(p.NodeAllocation) == 0 && len(p.FailedAllocs) == 0
return len(p.NodeUpdate) == 0 && len(p.NodeAllocation) == 0
}

// FullCommit is used to check if all the allocations in a plan
Expand Down
Loading