Merge pull request #6003 from pete-woods/add-job-status-metrics
nomad: add job status metrics
schmichael committed Aug 7, 2019
2 parents d0abe89 + d07ea34 commit d45a322
Showing 2 changed files with 92 additions and 2 deletions.
59 changes: 59 additions & 0 deletions nomad/leader.go
@@ -254,6 +254,9 @@ func (s *Server) establishLeadership(stopCh chan struct{}) error {
	// Periodically publish job summary metrics
	go s.publishJobSummaryMetrics(stopCh)

	// Periodically publish job status metrics
	go s.publishJobStatusMetrics(stopCh)

	// Setup the heartbeat timers. This is done both when starting up or when
	// a leader fail over happens. Since the timers are maintained by the leader
	// node, effectively this means all the timers are renewed at the time of failover.
@@ -700,6 +703,62 @@ func (s *Server) iterateJobSummaryMetrics(summary *structs.JobSummary) {
	}
}

// publishJobStatusMetrics publishes the job statuses as metrics
func (s *Server) publishJobStatusMetrics(stopCh chan struct{}) {
	timer := time.NewTimer(0)
	defer timer.Stop()

	for {
		select {
		case <-stopCh:
			return
		case <-timer.C:
			timer.Reset(s.config.StatsCollectionInterval)
			state, err := s.State().Snapshot()
			if err != nil {
				s.logger.Error("failed to get state", "error", err)
				continue
			}
			ws := memdb.NewWatchSet()
			iter, err := state.Jobs(ws)
			if err != nil {
				s.logger.Error("failed to get job statuses", "error", err)
				continue
			}

			s.iterateJobStatusMetrics(&iter)
		}
	}
}

func (s *Server) iterateJobStatusMetrics(jobs *memdb.ResultIterator) {
	var pending int64 // Sum of all jobs in 'pending' state
	var running int64 // Sum of all jobs in 'running' state
	var dead int64    // Sum of all jobs in 'dead' state

	for {
		raw := (*jobs).Next()
		if raw == nil {
			break
		}

		job := raw.(*structs.Job)

		switch job.Status {
		case structs.JobStatusPending:
			pending++
		case structs.JobStatusRunning:
			running++
		case structs.JobStatusDead:
			dead++
		}
	}

	metrics.SetGauge([]string{"nomad", "job_status", "pending"}, float32(pending))
	metrics.SetGauge([]string{"nomad", "job_status", "running"}, float32(running))
	metrics.SetGauge([]string{"nomad", "job_status", "dead"}, float32(dead))
}

// revokeLeadership is invoked once we step down as leader.
// This is used to cleanup any state that may be specific to a leader.
func (s *Server) revokeLeadership() error {
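For reference, a minimal test-style sketch (not part of this commit) of how `iterateJobStatusMetrics` could be exercised in isolation: it feeds a hand-rolled iterator that satisfies `memdb.ResultIterator` and installs an in-memory go-metrics sink so the three gauges can be observed. The `fakeJobIterator` type and the test itself are hypothetical, and the in-memory sink layout varies by go-metrics version.

```go
package nomad

import (
	"testing"
	"time"

	metrics "github.com/armon/go-metrics"
	memdb "github.com/hashicorp/go-memdb"

	"github.com/hashicorp/nomad/nomad/structs"
)

// fakeJobIterator is a hypothetical stand-in for the iterator returned by
// state.Jobs(ws): it yields a fixed slice of jobs and then nil.
type fakeJobIterator struct {
	jobs []*structs.Job
	idx  int
}

func (it *fakeJobIterator) WatchCh() <-chan struct{} { return nil }

func (it *fakeJobIterator) Next() interface{} {
	if it.idx >= len(it.jobs) {
		return nil
	}
	job := it.jobs[it.idx]
	it.idx++
	return job
}

func TestIterateJobStatusMetrics_sketch(t *testing.T) {
	// Route the global metrics.SetGauge calls into an in-memory sink so the
	// emitted gauges can be inspected.
	sink := metrics.NewInmemSink(10*time.Second, time.Minute)
	cfg := metrics.DefaultConfig("")
	cfg.EnableHostname = false // keep gauge keys exactly as passed in
	if _, err := metrics.NewGlobal(cfg, sink); err != nil {
		t.Fatalf("failed to init metrics: %v", err)
	}

	var iter memdb.ResultIterator = &fakeJobIterator{
		jobs: []*structs.Job{
			{Status: structs.JobStatusPending},
			{Status: structs.JobStatusRunning},
			{Status: structs.JobStatusRunning},
			{Status: structs.JobStatusDead},
		},
	}

	// iterateJobStatusMetrics only reads the iterator and writes to the
	// global sink, so a zero-value Server is enough here.
	s := &Server{}
	s.iterateJobStatusMetrics(&iter)

	// Expect pending=1, running=2, dead=1. Listing the gauge keys is
	// version-agnostic, since the gauge value type differs across
	// go-metrics releases.
	for _, interval := range sink.Data() {
		for name := range interval.Gauges {
			t.Logf("gauge emitted: %s", name)
		}
	}
}
```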
35 changes: 33 additions & 2 deletions website/source/docs/telemetry/metrics.html.md
@@ -705,9 +705,9 @@ detailed above) but any new metrics will only be available in the new format.
</tr>
</table>

## Job Metrics
## Job Summary Metrics

Job metrics are emitted by the Nomad leader server.
Job summary metrics are emitted by the Nomad leader server.

<table class="table table-bordered table-striped">
<tr>
@@ -761,6 +761,37 @@ Job metrics are emitted by the Nomad leader server.
</tr>
</table>

## Job Status Metrics

Job status metrics are emitted by the Nomad leader server.

<table class="table table-bordered table-striped">
<tr>
<th>Metric</th>
<th>Description</th>
<th>Unit</th>
<th>Type</th>
</tr>
<tr>
<td>`nomad.job_status.pending`</td>
<td>Number of pending jobs</td>
<td>Integer</td>
<td>Gauge</td>
</tr>
<tr>
<td>`nomad.job_status.running`</td>
<td>Number of running jobs</td>
<td>Integer</td>
<td>Gauge</td>
</tr>
<tr>
<td>`nomad.job_status.dead`</td>
<td>Number of dead jobs</td>
<td>Integer</td>
<td>Gauge</td>
</tr>
</table>

## Metric Types

<table class="table table-bordered table-striped">
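Once a leader is publishing these gauges, they can be read back from an agent's metrics endpoint. Below is a small hypothetical sketch (not part of this commit) that queries `/v1/metrics` on a local agent and prints the `nomad.job_status.*` gauges; the agent address and the JSON field names are assumptions based on go-metrics' default JSON output.

```go
package main

import (
	"encoding/json"
	"fmt"
	"net/http"
	"strings"
)

// metricsSummary models only the slice of the /v1/metrics payload this
// sketch needs: gauges with a name and a value (field names assumed).
type metricsSummary struct {
	Gauges []struct {
		Name  string
		Value float32
	}
}

func main() {
	// Assumes a Nomad agent listening on the default HTTP port locally.
	resp, err := http.Get("http://127.0.0.1:4646/v1/metrics")
	if err != nil {
		panic(err)
	}
	defer resp.Body.Close()

	var summary metricsSummary
	if err := json.NewDecoder(resp.Body).Decode(&summary); err != nil {
		panic(err)
	}

	// Print just the gauges added by this change. They are emitted by the
	// leader, so a non-leader server or a client agent may not report them.
	for _, g := range summary.Gauges {
		if strings.HasPrefix(g.Name, "nomad.job_status.") {
			fmt.Printf("%s = %v\n", g.Name, g.Value)
		}
	}
}
```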
