diff --git a/pkg/collector/corechecks/cluster/ksm/kubernetes_state_transformers.go b/pkg/collector/corechecks/cluster/ksm/kubernetes_state_transformers.go index bfaa00e4dd8c4..608b3ad600b4d 100644 --- a/pkg/collector/corechecks/cluster/ksm/kubernetes_state_transformers.go +++ b/pkg/collector/corechecks/cluster/ksm/kubernetes_state_transformers.go @@ -421,10 +421,26 @@ func trimJobTag(tag string) (string, bool) { return trimmed, tag != trimmed } +var jobFailureReasons = map[string]struct{}{ + "backofflimitexceeded": {}, + "deadlineexceeded": {}, +} + +func validJobFailureReason(reason string) bool { + _, ok := jobFailureReasons[strings.ToLower(reason)] + return ok +} + // validateJob detects active jobs and adds the `kube_cronjob` tag func validateJob(val float64, tags []string) ([]string, bool) { kubeCronjob := "" - for _, tag := range tags { + for i, tag := range tags { + if strings.HasPrefix(tag, "reason:") { + if v := strings.TrimPrefix(tag, "reason:"); !validJobFailureReason(v) { + tags = append(tags[:i], tags[i+1:]...) + continue + } + } split := strings.Split(tag, ":") if len(split) == 2 && split[0] == "kube_job" || split[0] == "job" || split[0] == "job_name" { // Trim the timestamp suffix to avoid high cardinality @@ -482,10 +498,6 @@ func jobStatusFailedTransformer(s sender.Sender, name string, metric ksmstore.DD return } - if reasonTagIndex != -1 { - tags = append(tags[:reasonTagIndex], tags[reasonTagIndex+1:]...) 
- } - jobMetric(s, metric, ksmMetricPrefix+"job.failed", hostname, tags) } diff --git a/pkg/collector/corechecks/cluster/ksm/kubernetes_state_transformers_test.go b/pkg/collector/corechecks/cluster/ksm/kubernetes_state_transformers_test.go index 39251c412419b..d9df9dc7eadf1 100644 --- a/pkg/collector/corechecks/cluster/ksm/kubernetes_state_transformers_test.go +++ b/pkg/collector/corechecks/cluster/ksm/kubernetes_state_transformers_test.go @@ -544,7 +544,99 @@ func Test_jobStatusFailedTransformer(t *testing.T) { }, }, { - name: "irrelevant reason", + name: "BackoffLimitExceeded and value 0", + args: args{ + name: "kube_job_status_failed", + metric: ksmstore.DDMetric{ + Val: 0, + Labels: map[string]string{ + "job": "foo-1509998340", + "namespace": "default", + "reason": "BackoffLimitExceeded", + }, + }, + tags: []string{"job:foo-1509998340", "namespace:default", "reason:backofflimitexceeded"}, + }, + expected: nil, + }, + { + name: "BackoffLimitExceeded and value 1", + args: args{ + name: "kube_job_status_failed", + metric: ksmstore.DDMetric{ + Val: 1, + Labels: map[string]string{ + "job": "foo-1509998340", + "namespace": "default", + "reason": "BackoffLimitExceeded", + }, + }, + tags: []string{"job:foo-1509998340", "namespace:default", "reason:backofflimitexceeded"}, + }, + expected: &metricsExpected{ + name: "kubernetes_state.job.failed", + val: 1, + tags: []string{"kube_cronjob:foo", "namespace:default", "reason:backofflimitexceeded"}, + }, + }, + { + name: "DeadlineExceeded and value 0", + args: args{ + name: "kube_job_status_failed", + metric: ksmstore.DDMetric{ + Val: 0, + Labels: map[string]string{ + "job": "foo-1509998340", + "namespace": "default", + "reason": "DeadlineExceeded", + }, + }, + tags: []string{"job:foo-1509998340", "namespace:default", "reason:deadlineexceeded"}, + }, + expected: nil, + }, + { + name: "DeadlineExceeded and value 1.0", + args: args{ + name: "kube_job_status_failed", + metric: ksmstore.DDMetric{ + Val: 1, + Labels: 
map[string]string{ + "job": "foo-1509998340", + "namespace": "default", + "reason": "DeadlineExceeded", + }, + }, + tags: []string{"job:foo-1509998340", "namespace:default", "reason:deadlineexceeded"}, + }, + expected: &metricsExpected{ + name: "kubernetes_state.job.failed", + val: 1, + tags: []string{"kube_cronjob:foo", "namespace:default", "reason:deadlineexceeded"}, + }, + }, + { + name: "DeadlineExceeded and value 1.0", + args: args{ + name: "kube_job_status_failed", + metric: ksmstore.DDMetric{ + Val: 1, + Labels: map[string]string{ + "job": "foo-1509998340", + "namespace": "default", + "reason": "DeadlineExceeded", + }, + }, + tags: []string{"job:foo-1509998340", "namespace:default", "reason:deadlineexceeded"}, + }, + expected: &metricsExpected{ + name: "kubernetes_state.job.failed", + val: 1, + tags: []string{"kube_cronjob:foo", "namespace:default", "reason:deadlineexceeded"}, + }, + }, + { + name: "Evicted and 0", args: args{ name: "kube_job_status_failed", metric: ksmstore.DDMetric{ @@ -559,6 +651,26 @@ func Test_jobStatusFailedTransformer(t *testing.T) { }, expected: nil, }, + { + name: "Evicted and 1", + args: args{ + name: "kube_job_status_failed", + metric: ksmstore.DDMetric{ + Val: 1, + Labels: map[string]string{ + "job": "foo-1509998340", + "namespace": "default", + "reason": "Evicted", + }, + }, + tags: []string{"job:foo-1509998340", "namespace:default", "reason:Evicted"}, + }, + expected: &metricsExpected{ + name: "kubernetes_state.job.failed", + val: 1, + tags: []string{"kube_cronjob:foo", "namespace:default"}, + }, + }, { name: "inactive", args: args{ @@ -1562,6 +1674,34 @@ func Test_validateJob(t *testing.T) { want: []string{"foo:bar", "job_name:foo"}, want1: true, }, + { + name: "reason:BackoffLimitExceeded", + val: 1.0, + tags: []string{"foo:bar", "job_name:foo-1600167000", "kube_job:foo-1600167000", "reason:BackoffLimitExceeded"}, + want: []string{"foo:bar", "job_name:foo-1600167000", "kube_job:foo-1600167000", "kube_cronjob:foo",
"reason:BackoffLimitExceeded"}, + want1: true, + }, + { + name: "reason:DeadLineExceeded", + val: 1.0, + tags: []string{"foo:bar", "job_name:foo-1600167000", "reason:DeadLineExceeded", "kube_job:foo-1600167000"}, + want: []string{"foo:bar", "job_name:foo-1600167000", "kube_job:foo-1600167000", "kube_cronjob:foo", "reason:DeadLineExceeded"}, + want1: true, + }, + { + name: "empty reason tag", + val: 1.0, + tags: []string{"reason:", "foo:bar", "job_name:foo-1600167000", "kube_job:foo-1600167000"}, + want: []string{"foo:bar", "job_name:foo-1600167000", "kube_job:foo-1600167000", "kube_cronjob:foo"}, + want1: true, + }, + { + name: "invalid reason", + val: 1.0, + tags: []string{"foo:bar", "reason:error", "job_name:foo-1600167000", "kube_job:foo-1600167000"}, + want: []string{"foo:bar", "job_name:foo-1600167000", "kube_job:foo-1600167000", "kube_cronjob:foo"}, + want1: true, + }, { name: "invalid", val: 0.0, diff --git a/releasenotes-dca/notes/add-reason-tags-to-kube_job_status_failed-755bbfbb67d7e4c6.yaml b/releasenotes-dca/notes/add-reason-tags-to-kube_job_status_failed-755bbfbb67d7e4c6.yaml new file mode 100644 index 0000000000000..1e55bbe2b8a8d --- /dev/null +++ b/releasenotes-dca/notes/add-reason-tags-to-kube_job_status_failed-755bbfbb67d7e4c6.yaml @@ -0,0 +1,12 @@ +# Each section from every release note are combined when the +# CHANGELOG.rst is rendered. So the text needs to be worded so that +# it does not depend on any information only available in another +# section. This may mean repeating some details, but each section +# must be readable independently of the other. +# +# Each section note must be formatted as reStructuredText. +--- +enhancements: + - | + Add a ``reason`` tag (``backofflimitexceeded`` or ``deadlineexceeded``) to the + ``kubernetes_state.job.failed`` metric to help users understand why a job failed.