diff --git a/docs/job-metrics.md b/docs/job-metrics.md
index 66377dff17..d0ec58ce60 100644
--- a/docs/job-metrics.md
+++ b/docs/job-metrics.md
@@ -10,7 +10,7 @@
 | kube_job_spec_active_deadline_seconds | Gauge | `job_name`=<job-name> <br> `namespace`=<job-namespace> | STABLE |
 | kube_job_status_active | Gauge | `job_name`=<job-name> <br> `namespace`=<job-namespace> | STABLE |
 | kube_job_status_succeeded | Gauge | `job_name`=<job-name> <br> `namespace`=<job-namespace> | STABLE |
-| kube_job_status_failed | Gauge | `job_name`=<job-name> <br> `namespace`=<job-namespace> | STABLE |
+| kube_job_status_failed | Gauge | `job_name`=<job-name> <br> `namespace`=<job-namespace> <br> `reason`=<failure reason> | STABLE |
 | kube_job_status_start_time | Gauge | `job_name`=<job-name> <br> `namespace`=<job-namespace> | STABLE |
 | kube_job_status_completion_time | Gauge | `job_name`=<job-name> <br> `namespace`=<job-namespace> | STABLE |
 | kube_job_complete | Gauge | `job_name`=<job-name> <br> `namespace`=<job-namespace> <br> `condition`=<true\|false\|unknown> | STABLE |
diff --git a/internal/store/job.go b/internal/store/job.go
index 03b463fae4..2807dc6beb 100644
--- a/internal/store/job.go
+++ b/internal/store/job.go
@@ -35,6 +35,7 @@ var (
 	descJobLabelsName          = "kube_job_labels"
 	descJobLabelsHelp          = "Kubernetes labels converted to Prometheus labels."
 	descJobLabelsDefaultLabels = []string{"namespace", "job_name"}
+	jobFailureReasons          = []string{"BackoffLimitExceeded", "DeadlineExceeded", "Evicted"}
 
 	jobMetricFamilies = []generator.FamilyGenerator{
 		*generator.NewFamilyGenerator(
@@ -163,16 +164,48 @@
 		),
 		*generator.NewFamilyGenerator(
 			"kube_job_status_failed",
-			"The number of pods which reached Phase Failed.",
+			"The number of pods which reached Phase Failed and the reason for failure.",
 			metric.Gauge,
 			"",
 			wrapJobFunc(func(j *v1batch.Job) *metric.Family {
-				return &metric.Family{
-					Metrics: []*metric.Metric{
-						{
-							Value: float64(j.Status.Failed),
+				var ms []*metric.Metric
+
+				if float64(j.Status.Failed) == 0 {
+					return &metric.Family{
+						Metrics: []*metric.Metric{
+							{
+								Value: float64(j.Status.Failed),
+							},
 						},
-					},
+					}
+				}
+
+				for _, condition := range j.Status.Conditions {
+					if condition.Type == v1batch.JobFailed {
+						reasonKnown := false
+						for _, reason := range jobFailureReasons {
+							reasonKnown = reasonKnown || failureReason(&condition, reason)
+
+							// for known reasons
+							ms = append(ms, &metric.Metric{
+								LabelKeys:   []string{"reason"},
+								LabelValues: []string{reason},
+								Value:       boolFloat64(failureReason(&condition, reason)),
+							})
+						}
+						// for unknown reasons
+						if !reasonKnown {
+							ms = append(ms, &metric.Metric{
+								LabelKeys:   []string{"reason"},
+								LabelValues: []string{""},
+								Value:       float64(j.Status.Failed),
+							})
+						}
+					}
+				}
+
+				return &metric.Family{
+					Metrics: ms,
 				}
 			}),
 		),
@@ -350,3 +383,10 @@ func createJobListWatch(kubeClient clientset.Interface, ns string) cache.ListerW
 		},
 	}
 }
+
+func failureReason(jc *v1batch.JobCondition, reason string) bool {
+	if jc == nil {
+		return false
+	}
+	return jc.Reason == reason
+}
diff --git a/internal/store/job_test.go b/internal/store/job_test.go
index 6328df60dd..da2dca1c91 100644
--- a/internal/store/job_test.go
+++ b/internal/store/job_test.go
@@ -70,13 +70,13 @@ func TestJobStore(t *testing.T) {
 		# TYPE kube_job_status_active gauge
 		# HELP kube_job_status_completion_time CompletionTime represents time when the job was completed.
 		# TYPE kube_job_status_completion_time gauge
-		# HELP kube_job_status_failed The number of pods which reached Phase Failed.
+		# HELP kube_job_status_failed The number of pods which reached Phase Failed and the reason for failure.
 		# TYPE kube_job_status_failed gauge
 		# HELP kube_job_status_start_time StartTime represents time when the job was acknowledged by the Job Manager.
 		# TYPE kube_job_status_start_time gauge
 		# HELP kube_job_status_succeeded The number of pods which reached Phase Succeeded.
-		# TYPE kube_job_status_succeeded gauge
-	`
+		# TYPE kube_job_status_succeeded gauge`
+
 	cases := []generateMetricsTestCase{
 		{
 			Obj: &v1batch.Job{
@@ -183,7 +183,7 @@
 					CompletionTime: &metav1.Time{Time: FailedJob1CompletionTime},
 					StartTime:      &metav1.Time{Time: FailedJob1StartTime},
 					Conditions: []v1batch.JobCondition{
-						{Type: v1batch.JobFailed, Status: v1.ConditionTrue},
+						{Type: v1batch.JobFailed, Status: v1.ConditionTrue, Reason: "BackoffLimitExceeded"},
 					},
 				},
 				Spec: v1batch.JobSpec{
@@ -204,7 +204,9 @@
 				kube_job_spec_parallelism{job_name="FailedJob1",namespace="ns1"} 1
 				kube_job_status_active{job_name="FailedJob1",namespace="ns1"} 0
 				kube_job_status_completion_time{job_name="FailedJob1",namespace="ns1"} 1.495810807e+09
-				kube_job_status_failed{job_name="FailedJob1",namespace="ns1"} 1
+				kube_job_status_failed{job_name="FailedJob1",namespace="ns1",reason="BackoffLimitExceeded"} 1
+				kube_job_status_failed{job_name="FailedJob1",namespace="ns1",reason="DeadlineExceeded"} 0
+				kube_job_status_failed{job_name="FailedJob1",namespace="ns1",reason="Evicted"} 0
 				kube_job_status_start_time{job_name="FailedJob1",namespace="ns1"} 1.495807207e+09
 				kube_job_status_succeeded{job_name="FailedJob1",namespace="ns1"} 0
 			`,
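For reviewers, the effect of the new expansion is easiest to see outside the store framework. Below is a minimal, self-contained sketch (not part of this diff) that mirrors the same per-reason logic with plain structs; the `series` type and `expand` helper are hypothetical stand-ins for `metric.Metric` and the generator closure above.

```go
// Standalone sketch mirroring the new kube_job_status_failed expansion.
// The series type and expand function are illustrative stand-ins only.
package main

import "fmt"

var jobFailureReasons = []string{"BackoffLimitExceeded", "DeadlineExceeded", "Evicted"}

// series stands in for metric.Metric: one sample with an optional reason label.
type series struct {
	reason string
	value  float64
}

// expand mirrors the generator logic: zero failed pods yields a single
// unlabeled zero sample; otherwise one sample per known reason (1 if it
// matches the JobFailed condition's reason, else 0), plus a reason=""
// sample carrying the failed count when the reason is not in the list.
func expand(failed float64, failedConditionReason string) []series {
	if failed == 0 {
		return []series{{value: 0}}
	}
	var out []series
	known := false
	for _, reason := range jobFailureReasons {
		match := failedConditionReason == reason
		known = known || match
		v := 0.0
		if match {
			v = 1
		}
		out = append(out, series{reason: reason, value: v})
	}
	if !known {
		out = append(out, series{reason: "", value: failed})
	}
	return out
}

func main() {
	// A Job whose JobFailed condition carries a known reason...
	fmt.Println(expand(1, "DeadlineExceeded"))
	// ...versus one whose reason is outside jobFailureReasons.
	fmt.Println(expand(2, "SomeCustomReason"))
}
```

Running it prints one sample per known reason, with a reason="" fallback carrying the failed count when the condition's reason is not in `jobFailureReasons`, matching the series layout asserted in the updated test expectations.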