diff --git a/docs/job-metrics.md b/docs/job-metrics.md
index 66377dff17..d0ec58ce60 100644
--- a/docs/job-metrics.md
+++ b/docs/job-metrics.md
@@ -10,7 +10,7 @@
| kube_job_spec_active_deadline_seconds | Gauge | `job_name`=<job-name>
`namespace`=<job-namespace> | STABLE |
| kube_job_status_active | Gauge | `job_name`=<job-name>
`namespace`=<job-namespace> | STABLE |
| kube_job_status_succeeded | Gauge | `job_name`=<job-name>
`namespace`=<job-namespace> | STABLE |
-| kube_job_status_failed | Gauge | `job_name`=<job-name>
`namespace`=<job-namespace> | STABLE |
+| kube_job_status_failed | Gauge | `job_name`=<job-name>
`namespace`=<job-namespace>
`reason`=<failure reason> | STABLE |
| kube_job_status_start_time | Gauge | `job_name`=<job-name>
`namespace`=<job-namespace> | STABLE |
| kube_job_status_completion_time | Gauge | `job_name`=<job-name>
`namespace`=<job-namespace> | STABLE |
| kube_job_complete | Gauge | `job_name`=<job-name>
`namespace`=<job-namespace>
`condition`=<true\|false\|unknown> | STABLE |
diff --git a/internal/store/job.go b/internal/store/job.go
index 03b463fae4..2807dc6beb 100644
--- a/internal/store/job.go
+++ b/internal/store/job.go
@@ -35,6 +35,7 @@ var (
descJobLabelsName = "kube_job_labels"
descJobLabelsHelp = "Kubernetes labels converted to Prometheus labels."
descJobLabelsDefaultLabels = []string{"namespace", "job_name"}
+ jobFailureReasons = []string{"BackoffLimitExceeded", "DeadlineExceeded", "Evicted"}
jobMetricFamilies = []generator.FamilyGenerator{
*generator.NewFamilyGenerator(
@@ -163,16 +164,48 @@ var (
),
*generator.NewFamilyGenerator(
"kube_job_status_failed",
- "The number of pods which reached Phase Failed.",
+ "The number of pods which reached Phase Failed and the reason for failure.",
metric.Gauge,
"",
wrapJobFunc(func(j *v1batch.Job) *metric.Family {
- return &metric.Family{
- Metrics: []*metric.Metric{
- {
- Value: float64(j.Status.Failed),
+ var ms []*metric.Metric
+
+ if float64(j.Status.Failed) == 0 {
+ return &metric.Family{
+ Metrics: []*metric.Metric{
+ {
+ Value: float64(j.Status.Failed),
+ },
},
- },
+ }
+ }
+
+ for _, condition := range j.Status.Conditions {
+ if condition.Type == v1batch.JobFailed {
+ reasonKnown := false
+ for _, reason := range jobFailureReasons {
+ reasonKnown = reasonKnown || failureReason(&condition, reason)
+
+ // for known reasons
+ ms = append(ms, &metric.Metric{
+ LabelKeys: []string{"reason"},
+ LabelValues: []string{reason},
+ Value: boolFloat64(failureReason(&condition, reason)),
+ })
+ }
+ // for unknown reasons
+ if !reasonKnown {
+ ms = append(ms, &metric.Metric{
+ LabelKeys: []string{"reason"},
+ LabelValues: []string{""},
+ Value: float64(j.Status.Failed),
+ })
+ }
+ }
+ }
+
+ return &metric.Family{
+ Metrics: ms,
}
}),
),
@@ -350,3 +383,10 @@ func createJobListWatch(kubeClient clientset.Interface, ns string) cache.ListerW
},
}
}
+
+func failureReason(jc *v1batch.JobCondition, reason string) bool {
+ if jc == nil {
+ return false
+ }
+ return jc.Reason == reason
+}
diff --git a/internal/store/job_test.go b/internal/store/job_test.go
index 6328df60dd..da2dca1c91 100644
--- a/internal/store/job_test.go
+++ b/internal/store/job_test.go
@@ -70,13 +70,13 @@ func TestJobStore(t *testing.T) {
# TYPE kube_job_status_active gauge
# HELP kube_job_status_completion_time CompletionTime represents time when the job was completed.
# TYPE kube_job_status_completion_time gauge
- # HELP kube_job_status_failed The number of pods which reached Phase Failed.
+ # HELP kube_job_status_failed The number of pods which reached Phase Failed and the reason for failure.
# TYPE kube_job_status_failed gauge
# HELP kube_job_status_start_time StartTime represents time when the job was acknowledged by the Job Manager.
# TYPE kube_job_status_start_time gauge
# HELP kube_job_status_succeeded The number of pods which reached Phase Succeeded.
- # TYPE kube_job_status_succeeded gauge
- `
+ # TYPE kube_job_status_succeeded gauge`
+
cases := []generateMetricsTestCase{
{
Obj: &v1batch.Job{
@@ -183,7 +183,7 @@ func TestJobStore(t *testing.T) {
CompletionTime: &metav1.Time{Time: FailedJob1CompletionTime},
StartTime: &metav1.Time{Time: FailedJob1StartTime},
Conditions: []v1batch.JobCondition{
- {Type: v1batch.JobFailed, Status: v1.ConditionTrue},
+ {Type: v1batch.JobFailed, Status: v1.ConditionTrue, Reason: "BackoffLimitExceeded"},
},
},
Spec: v1batch.JobSpec{
@@ -204,7 +204,9 @@ func TestJobStore(t *testing.T) {
kube_job_spec_parallelism{job_name="FailedJob1",namespace="ns1"} 1
kube_job_status_active{job_name="FailedJob1",namespace="ns1"} 0
kube_job_status_completion_time{job_name="FailedJob1",namespace="ns1"} 1.495810807e+09
- kube_job_status_failed{job_name="FailedJob1",namespace="ns1"} 1
+ kube_job_status_failed{job_name="FailedJob1",namespace="ns1",reason="BackoffLimitExceeded"} 1
kube_job_status_failed{job_name="FailedJob1",namespace="ns1",reason="DeadlineExceeded"} 0
+ kube_job_status_failed{job_name="FailedJob1",namespace="ns1",reason="Evicted"} 0
kube_job_status_start_time{job_name="FailedJob1",namespace="ns1"} 1.495807207e+09
kube_job_status_succeeded{job_name="FailedJob1",namespace="ns1"} 0
`,