From d07ea34b0cf6009b17e38234a8e617de7df013a5 Mon Sep 17 00:00:00 2001
From: Pete Woods
Date: Wed, 24 Jul 2019 14:17:33 +0100
Subject: [PATCH] Add job status metrics

This avoids having to write services that repeatedly hit the jobs API.
---
 nomad/leader.go                               | 59 +++++++++++++++++++
 website/source/docs/telemetry/metrics.html.md | 35 ++++++++++-
 2 files changed, 92 insertions(+), 2 deletions(-)

diff --git a/nomad/leader.go b/nomad/leader.go
index 60ef4e7ee406..145a02367a7d 100644
--- a/nomad/leader.go
+++ b/nomad/leader.go
@@ -254,6 +254,9 @@ func (s *Server) establishLeadership(stopCh chan struct{}) error {
     // Periodically publish job summary metrics
     go s.publishJobSummaryMetrics(stopCh)
 
+    // Periodically publish job status metrics
+    go s.publishJobStatusMetrics(stopCh)
+
     // Setup the heartbeat timers. This is done both when starting up or when
     // a leader fail over happens. Since the timers are maintained by the leader
     // node, effectively this means all the timers are renewed at the time of failover.
@@ -700,6 +703,62 @@ func (s *Server) iterateJobSummaryMetrics(summary *structs.JobSummary) {
     }
 }
 
+// publishJobStatusMetrics publishes the job statuses as metrics
+func (s *Server) publishJobStatusMetrics(stopCh chan struct{}) {
+    timer := time.NewTimer(0)
+    defer timer.Stop()
+
+    for {
+        select {
+        case <-stopCh:
+            return
+        case <-timer.C:
+            timer.Reset(s.config.StatsCollectionInterval)
+            state, err := s.State().Snapshot()
+            if err != nil {
+                s.logger.Error("failed to get state", "error", err)
+                continue
+            }
+            ws := memdb.NewWatchSet()
+            iter, err := state.Jobs(ws)
+            if err != nil {
+                s.logger.Error("failed to get job statuses", "error", err)
+                continue
+            }
+
+            s.iterateJobStatusMetrics(&iter)
+        }
+    }
+}
+
+func (s *Server) iterateJobStatusMetrics(jobs *memdb.ResultIterator) {
+    var pending int64 // Sum of all jobs in 'pending' state
+    var running int64 // Sum of all jobs in 'running' state
+    var dead int64    // Sum of all jobs in 'dead' state
+
+    for {
+        raw := (*jobs).Next()
+        if raw == nil {
+            break
+        }
+
+        job := raw.(*structs.Job)
+
+        switch job.Status {
+        case structs.JobStatusPending:
+            pending++
+        case structs.JobStatusRunning:
+            running++
+        case structs.JobStatusDead:
+            dead++
+        }
+    }
+
+    metrics.SetGauge([]string{"nomad", "job_status", "pending"}, float32(pending))
+    metrics.SetGauge([]string{"nomad", "job_status", "running"}, float32(running))
+    metrics.SetGauge([]string{"nomad", "job_status", "dead"}, float32(dead))
+}
+
 // revokeLeadership is invoked once we step down as leader.
 // This is used to cleanup any state that may be specific to a leader.
 func (s *Server) revokeLeadership() error {
diff --git a/website/source/docs/telemetry/metrics.html.md b/website/source/docs/telemetry/metrics.html.md
index a99ad844d197..93d560156353 100644
--- a/website/source/docs/telemetry/metrics.html.md
+++ b/website/source/docs/telemetry/metrics.html.md
@@ -705,9 +705,9 @@ detailed above) but any new metrics will only be available in the new format.
 
-## Job Metrics
+## Job Summary Metrics
 
-Job metrics are emitted by the Nomad leader server.
+Job summary metrics are emitted by the Nomad leader server.
 
@@ -761,6 +761,37 @@ Job metrics are emitted by the Nomad leader server.
 </table>
 
+## Job Status Metrics
+
+Job status metrics are emitted by the Nomad leader server.
+
+<table class="table table-bordered table-striped">
+  <thead>
+    <tr>
+      <th>Metric</th>
+      <th>Description</th>
+      <th>Unit</th>
+      <th>Type</th>
+    </tr>
+  </thead>
+  <tbody>
+    <tr>
+      <td>`nomad.job_status.pending`</td>
+      <td>Number of pending jobs</td>
+      <td>Integer</td>
+      <td>Gauge</td>
+    </tr>
+    <tr>
+      <td>`nomad.job_status.running`</td>
+      <td>Number of running jobs</td>
+      <td>Integer</td>
+      <td>Gauge</td>
+    </tr>
+    <tr>
+      <td>`nomad.job_status.dead`</td>
+      <td>Number of dead jobs</td>
+      <td>Integer</td>
+      <td>Gauge</td>
+    </tr>
+  </tbody>
+</table>
+
 ## Metric Types
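To eyeball the new gauges after applying the patch, the sketch below (illustrative only, not part of the patch) polls a Nomad agent's `/v1/metrics` endpoint and prints any `job_status` gauges it finds. It assumes a server agent on the default HTTP address that currently holds leadership, since the gauges are published by the leader, and it assumes the endpoint returns the usual go-metrics in-memory summary (gauges with `Name` and `Value` fields). It matches on the `job_status` substring because the emitted names may carry the agent's configured metrics prefix in front of the documented `nomad.job_status.*` names.

```go
// Quick sanity check for the new job_status gauges. Illustrative sketch only;
// the endpoint address and JSON shape are assumptions described above.
package main

import (
	"encoding/json"
	"fmt"
	"log"
	"net/http"
	"strings"
)

// metricsSummary mirrors just the subset of the metrics payload used here.
type metricsSummary struct {
	Gauges []struct {
		Name  string  `json:"Name"`
		Value float32 `json:"Value"`
	} `json:"Gauges"`
}

func main() {
	// Assumed default agent address; adjust for your cluster.
	resp, err := http.Get("http://127.0.0.1:4646/v1/metrics")
	if err != nil {
		log.Fatalf("failed to query metrics endpoint: %v", err)
	}
	defer resp.Body.Close()

	var summary metricsSummary
	if err := json.NewDecoder(resp.Body).Decode(&summary); err != nil {
		log.Fatalf("failed to decode metrics payload: %v", err)
	}

	// Print only the job status gauges added by this patch.
	for _, g := range summary.Gauges {
		if strings.Contains(g.Name, "job_status") {
			fmt.Printf("%s = %v\n", g.Name, g.Value)
		}
	}
}
```

Run it against the leading server while a few jobs are registered; the pending, running, and dead gauges should track the counts reported by the jobs API.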