Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

nomad: add job status metrics #6003

Merged
merged 1 commit into from
Aug 7, 2019
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
59 changes: 59 additions & 0 deletions nomad/leader.go
Original file line number Diff line number Diff line change
Expand Up @@ -254,6 +254,9 @@ func (s *Server) establishLeadership(stopCh chan struct{}) error {
// Periodically publish job summary metrics
go s.publishJobSummaryMetrics(stopCh)

// Periodically publish job status metrics
go s.publishJobStatusMetrics(stopCh)

// Setup the heartbeat timers. This is done both when starting up or when
// a leader fail over happens. Since the timers are maintained by the leader
// node, effectively this means all the timers are renewed at the time of failover.
Expand Down Expand Up @@ -700,6 +703,62 @@ func (s *Server) iterateJobSummaryMetrics(summary *structs.JobSummary) {
}
}

// publishJobStatusMetrics periodically emits gauges describing how many jobs
// are in each status. It loops until a value is received on stopCh (or the
// channel is closed). The first collection happens right away because the
// timer is created with a zero duration; later collections are spaced by
// s.config.StatsCollectionInterval.
func (s *Server) publishJobStatusMetrics(stopCh chan struct{}) {
	timer := time.NewTimer(0)
	defer timer.Stop()

	for {
		// Block until it is either time to collect or we are told to stop.
		select {
		case <-stopCh:
			return
		case <-timer.C:
		}

		// Re-arm first so that an error below still leaves the next
		// collection scheduled.
		timer.Reset(s.config.StatsCollectionInterval)

		snap, err := s.State().Snapshot()
		if err != nil {
			s.logger.Error("failed to get state", "error", err)
			continue
		}

		jobIter, err := snap.Jobs(memdb.NewWatchSet())
		if err != nil {
			s.logger.Error("failed to get job statuses", "error", err)
			continue
		}

		s.iterateJobStatusMetrics(&jobIter)
	}
}

func (s *Server) iterateJobStatusMetrics(jobs *memdb.ResultIterator) {
var pending int64 // Sum of all jobs in 'pending' state
var running int64 // Sum of all jobs in 'running' state
var dead int64 // Sum of all jobs in 'dead' state
langmartin marked this conversation as resolved.
Show resolved Hide resolved

for {
raw := (*jobs).Next()
if raw == nil {
break
}

job := raw.(*structs.Job)

switch job.Status {
case structs.JobStatusPending:
pending++
case structs.JobStatusRunning:
running++
case structs.JobStatusDead:
dead++
}
}

metrics.SetGauge([]string{"nomad", "job_status", "pending"}, float32(pending))
metrics.SetGauge([]string{"nomad", "job_status", "running"}, float32(running))
metrics.SetGauge([]string{"nomad", "job_status", "dead"}, float32(dead))
}

// revokeLeadership is invoked once we step down as leader.
// This is used to cleanup any state that may be specific to a leader.
func (s *Server) revokeLeadership() error {
Expand Down
35 changes: 33 additions & 2 deletions website/source/docs/telemetry/metrics.html.md
Original file line number Diff line number Diff line change
Expand Up @@ -705,9 +705,9 @@ detailed above) but any new metrics will only be available in the new format.
</tr>
</table>

## Job Metrics
## Job Summary Metrics

Job metrics are emitted by the Nomad leader server.
Job summary metrics are emitted by the Nomad leader server.

<table class="table table-bordered table-striped">
<tr>
Expand Down Expand Up @@ -761,6 +761,37 @@ Job metrics are emitted by the Nomad leader server.
</tr>
</table>

## Job Status Metrics

Job status metrics are emitted by the Nomad leader server.

<table class="table table-bordered table-striped">
<tr>
<th>Metric</th>
<th>Description</th>
<th>Unit</th>
<th>Type</th>
</tr>
<tr>
<td>`nomad.job_status.pending`</td>
<td>Number of pending jobs</td>
<td>Integer</td>
<td>Gauge</td>
</tr>
<tr>
<td>`nomad.job_status.running`</td>
<td>Number of running jobs</td>
<td>Integer</td>
<td>Gauge</td>
</tr>
<tr>
<td>`nomad.job_status.dead`</td>
<td>Number of dead jobs</td>
<td>Integer</td>
<td>Gauge</td>
</tr>
</table>

## Metric Types

<table class="table table-bordered table-striped">
Expand Down