Skip to content

Commit

Permalink
disconnected clients: Observability plumbing (#12141)
Browse files Browse the repository at this point in the history
* Add disconnect/reconnect counts to log output and emit reschedule metrics

* TaskGroupSummary: Add Unknown, update StateStore logic, add to metrics
  • Loading branch information
DerekStrickland authored and DerekStrickland committed Apr 5, 2022
1 parent b3fb943 commit 5b5c853
Show file tree
Hide file tree
Showing 6 changed files with 24 additions and 3 deletions.
2 changes: 2 additions & 0 deletions nomad/leader.go
Original file line number Diff line number Diff line change
Expand Up @@ -1033,6 +1033,8 @@ func (s *Server) iterateJobSummaryMetrics(summary *structs.JobSummary) {
float32(tgSummary.Starting), labels)
metrics.SetGaugeWithLabels([]string{"nomad", "job_summary", "lost"},
float32(tgSummary.Lost), labels)
metrics.SetGaugeWithLabels([]string{"nomad", "job_summary", "unknown"},
float32(tgSummary.Unknown), labels)
}
}

Expand Down
8 changes: 8 additions & 0 deletions nomad/state/state_store.go
Original file line number Diff line number Diff line change
Expand Up @@ -4733,6 +4733,8 @@ func (s *StateStore) ReconcileJobSummaries(index uint64) error {
tg.Failed += 1
case structs.AllocClientStatusLost:
tg.Lost += 1
case structs.AllocClientStatusUnknown:
tg.Unknown += 1
case structs.AllocClientStatusComplete:
tg.Complete += 1
case structs.AllocClientStatusRunning:
Expand Down Expand Up @@ -5290,6 +5292,8 @@ func (s *StateStore) updateSummaryWithAlloc(index uint64, alloc *structs.Allocat
tgSummary.Complete += 1
case structs.AllocClientStatusLost:
tgSummary.Lost += 1
case structs.AllocClientStatusUnknown:
tgSummary.Unknown += 1
}

// Decrementing the count of the bin of the last state
Expand All @@ -5306,6 +5310,10 @@ func (s *StateStore) updateSummaryWithAlloc(index uint64, alloc *structs.Allocat
if tgSummary.Lost > 0 {
tgSummary.Lost -= 1
}
case structs.AllocClientStatusUnknown:
if tgSummary.Unknown > 0 {
tgSummary.Unknown -= 1
}
case structs.AllocClientStatusFailed, structs.AllocClientStatusComplete:
default:
s.logger.Error("invalid old client status for allocation",
Expand Down
9 changes: 8 additions & 1 deletion nomad/state/state_store_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -5793,7 +5793,13 @@ func TestStateStore_ReconcileJobSummary(t *testing.T) {
alloc11 := alloc10.Copy()
alloc11.ClientStatus = structs.AllocClientStatusLost

state.UpsertAllocs(structs.MsgTypeTestSetup, 130, []*structs.Allocation{alloc4, alloc6, alloc8, alloc10})
alloc12 := mock.Alloc()
alloc12.JobID = alloc.JobID
alloc12.Job = alloc.Job
alloc12.TaskGroup = "db"
alloc12.ClientStatus = structs.AllocClientStatusUnknown

state.UpsertAllocs(structs.MsgTypeTestSetup, 130, []*structs.Allocation{alloc4, alloc6, alloc8, alloc10, alloc12})

state.UpdateAllocsFromClient(structs.MsgTypeTestSetup, 150, []*structs.Allocation{alloc5, alloc7, alloc9, alloc11})

Expand All @@ -5817,6 +5823,7 @@ func TestStateStore_ReconcileJobSummary(t *testing.T) {
Failed: 1,
Complete: 1,
Lost: 1,
Unknown: 1,
},
},
CreateIndex: 100,
Expand Down
1 change: 1 addition & 0 deletions nomad/structs/structs.go
Original file line number Diff line number Diff line change
Expand Up @@ -4791,6 +4791,7 @@ type TaskGroupSummary struct {
Running int
Starting int
Lost int
Unknown int
}

const (
Expand Down
6 changes: 4 additions & 2 deletions scheduler/reconcile.go
Original file line number Diff line number Diff line change
Expand Up @@ -145,8 +145,8 @@ type delayedRescheduleInfo struct {
}

func (r *reconcileResults) GoString() string {
base := fmt.Sprintf("Total changes: (place %d) (destructive %d) (inplace %d) (stop %d)",
len(r.place), len(r.destructiveUpdate), len(r.inplaceUpdate), len(r.stop))
base := fmt.Sprintf("Total changes: (place %d) (destructive %d) (inplace %d) (stop %d) (disconnect %d) (reconnect %d)",
len(r.place), len(r.destructiveUpdate), len(r.inplaceUpdate), len(r.stop), len(r.disconnectUpdates), len(r.reconnectUpdates))

if r.deployment != nil {
base += fmt.Sprintf("\nCreated Deployment: %q", r.deployment.ID)
Expand Down Expand Up @@ -1267,6 +1267,8 @@ func (a *allocReconciler) createTimeoutLaterEvals(disconnecting allocSet, tgName
allocIDToFollowupEvalID[timeoutInfo.allocID] = eval.ID
}

emitRescheduleInfo(timeoutInfo.alloc, eval)

// Create updates that will be applied to the allocs to mark the FollowupEvalID
// and the unknown ClientStatus.
updatedAlloc := timeoutInfo.alloc.Copy()
Expand Down
1 change: 1 addition & 0 deletions website/content/docs/operations/metrics-reference.mdx
Original file line number Diff line number Diff line change
Expand Up @@ -211,6 +211,7 @@ Job summary metrics are emitted by the Nomad leader server.
| `nomad.nomad.job_summary.complete` | Number of complete allocations for a job | Integer | Gauge | host, job, namespace, task_group |
| `nomad.nomad.job_summary.failed` | Number of failed allocations for a job | Integer | Gauge | host, job, namespace, task_group |
| `nomad.nomad.job_summary.lost` | Number of lost allocations for a job | Integer | Gauge | host, job, namespace, task_group |
| `nomad.nomad.job_summary.unknown` | Number of allocations for a job with an unknown client status | Integer | Gauge | host, job, namespace, task_group |
| `nomad.nomad.job_summary.queued` | Number of queued allocations for a job | Integer | Gauge | host, job, namespace, task_group |
| `nomad.nomad.job_summary.running` | Number of running allocations for a job | Integer | Gauge | host, job, namespace, task_group |
| `nomad.nomad.job_summary.starting` | Number of starting allocations for a job | Integer | Gauge | host, job, namespace, task_group |
Expand Down

0 comments on commit 5b5c853

Please sign in to comment.