Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

core: fix blocked eval math #13104

Merged
merged 4 commits into from
May 24, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions .changelog/13104.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
```release-note:bug
core: Fixed a bug where blocked eval resources were incorrectly computed
```
8 changes: 4 additions & 4 deletions nomad/blocked_evals.go
Original file line number Diff line number Diff line change
Expand Up @@ -248,8 +248,8 @@ func (b *BlockedEvals) processBlockJobDuplicate(eval *structs.Evaluation) (newCa
if ok {
if latestEvalIndex(existingW.eval) <= latestEvalIndex(eval) {
delete(b.captured, existingID)
b.stats.Unblock(eval)
dup = existingW.eval
b.stats.Unblock(dup)
} else {
dup = eval
newCancelled = true
Expand Down Expand Up @@ -727,10 +727,10 @@ func (b *BlockedEvals) EmitStats(period time.Duration, stopCh <-chan struct{}) {
metrics.SetGaugeWithLabels([]string{"nomad", "blocked_evals", "job", "memory"}, float32(v.MemoryMB), labels)
}

for k, v := range stats.BlockedResources.ByNodeInfo {
for k, v := range stats.BlockedResources.ByNode {
labels := []metrics.Label{
{Name: "datacenter", Value: k.Datacenter},
{Name: "node_class", Value: k.NodeClass},
{Name: "datacenter", Value: k.dc},
{Name: "node_class", Value: k.class},
}
metrics.SetGaugeWithLabels([]string{"nomad", "blocked_evals", "cpu"}, float32(v.CPU), labels)
metrics.SetGaugeWithLabels([]string{"nomad", "blocked_evals", "memory"}, float32(v.MemoryMB), labels)
Expand Down
71 changes: 35 additions & 36 deletions nomad/blocked_evals_stats.go
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,13 @@ type BlockedStats struct {

// BlockedResources stores the amount of resources requested by blocked
// evaluations.
BlockedResources BlockedResourcesStats
BlockedResources *BlockedResourcesStats
}

// node stores information related to nodes.
type node struct {
dc string
class string
}

// NewBlockedStats returns a new BlockedStats.
Expand Down Expand Up @@ -59,16 +65,16 @@ func (b *BlockedStats) prune(cutoff time.Time) {
}
}

for k, v := range b.BlockedResources.ByNodeInfo {
for k, v := range b.BlockedResources.ByNode {
if shouldPrune(v) {
delete(b.BlockedResources.ByNodeInfo, k)
delete(b.BlockedResources.ByNode, k)
}
}
}

// generateResourceStats returns a summary of the resources requested by the
// input evaluation.
func generateResourceStats(eval *structs.Evaluation) BlockedResourcesStats {
func generateResourceStats(eval *structs.Evaluation) *BlockedResourcesStats {
dcs := make(map[string]struct{})
classes := make(map[string]struct{})

Expand All @@ -80,102 +86,95 @@ func generateResourceStats(eval *structs.Evaluation) BlockedResourcesStats {
for dc := range allocMetrics.NodesAvailable {
dcs[dc] = struct{}{}
}

for class := range allocMetrics.ClassExhausted {
classes[class] = struct{}{}
}

for _, r := range allocMetrics.ResourcesExhausted {
resources.CPU += r.CPU
resources.MemoryMB += r.MemoryMB
}
}

byJob := make(map[structs.NamespacedID]BlockedResourcesSummary)
byJob[structs.NewNamespacedID(eval.JobID, eval.Namespace)] = resources
nsID := structs.NewNamespacedID(eval.JobID, eval.Namespace)
byJob[nsID] = resources

byNodeInfo := make(map[NodeInfo]BlockedResourcesSummary)
byNodeInfo := make(map[node]BlockedResourcesSummary)
for dc := range dcs {
for class := range classes {
k := NodeInfo{dc, class}
k := node{dc: dc, class: class}
byNodeInfo[k] = resources
}
}

return BlockedResourcesStats{
ByJob: byJob,
ByNodeInfo: byNodeInfo,
return &BlockedResourcesStats{
ByJob: byJob,
ByNode: byNodeInfo,
}
}

// BlockedResourcesStats stores resources requested by block evaluations
// split into different dimensions.
// BlockedResourcesStats stores resources requested by blocked evaluations,
// tracked both by job and by node.
type BlockedResourcesStats struct {
ByJob map[structs.NamespacedID]BlockedResourcesSummary
ByNodeInfo map[NodeInfo]BlockedResourcesSummary
ByJob map[structs.NamespacedID]BlockedResourcesSummary
ByNode map[node]BlockedResourcesSummary
}

// NewBlockedResourcesStats returns a new BlockedResourcesStats.
func NewBlockedResourcesStats() BlockedResourcesStats {
return BlockedResourcesStats{
ByJob: make(map[structs.NamespacedID]BlockedResourcesSummary),
ByNodeInfo: make(map[NodeInfo]BlockedResourcesSummary),
func NewBlockedResourcesStats() *BlockedResourcesStats {
return &BlockedResourcesStats{
ByJob: make(map[structs.NamespacedID]BlockedResourcesSummary),
ByNode: make(map[node]BlockedResourcesSummary),
}
}

// Copy returns a deep copy of the blocked resource stats.
func (b BlockedResourcesStats) Copy() BlockedResourcesStats {
func (b *BlockedResourcesStats) Copy() *BlockedResourcesStats {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Would it be worth adding a new nil check on these methods since they are pointer receivers now?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I considered this but ended up leaving it out; this object should never be nil. If it is I don't think there's any reasonable behavior other than to panic (although I guess we could do so with a better error message).

result := NewBlockedResourcesStats()

for k, v := range b.ByJob {
result.ByJob[k] = v
result.ByJob[k] = v // value copy
}

for k, v := range b.ByNodeInfo {
result.ByNodeInfo[k] = v
for k, v := range b.ByNode {
result.ByNode[k] = v // value copy
}

return result
}

// Add returns a new BlockedResourcesStats with the values set to the current
// resource values plus the input.
func (b BlockedResourcesStats) Add(a BlockedResourcesStats) BlockedResourcesStats {
func (b *BlockedResourcesStats) Add(a *BlockedResourcesStats) *BlockedResourcesStats {
result := b.Copy()

for k, v := range a.ByJob {
result.ByJob[k] = b.ByJob[k].Add(v)
}

for k, v := range a.ByNodeInfo {
result.ByNodeInfo[k] = b.ByNodeInfo[k].Add(v)
for k, v := range a.ByNode {
result.ByNode[k] = b.ByNode[k].Add(v)
}

return result
}

// Subtract returns a new BlockedResourcesStats with the values set to the
// current resource values minus the input.
func (b BlockedResourcesStats) Subtract(a BlockedResourcesStats) BlockedResourcesStats {
func (b *BlockedResourcesStats) Subtract(a *BlockedResourcesStats) *BlockedResourcesStats {
result := b.Copy()

for k, v := range a.ByJob {
result.ByJob[k] = b.ByJob[k].Subtract(v)
}

for k, v := range a.ByNodeInfo {
result.ByNodeInfo[k] = b.ByNodeInfo[k].Subtract(v)
for k, v := range a.ByNode {
result.ByNode[k] = b.ByNode[k].Subtract(v)
}

return result
}

// NodeInfo stores information related to nodes.
type NodeInfo struct {
Datacenter string
NodeClass string
}

// BlockedResourcesSummary stores resource values for blocked evals.
type BlockedResourcesSummary struct {
Timestamp time.Time
Expand Down
Loading