From 1be85ef633106eafe6b55fafd5780ba25f151bb7 Mon Sep 17 00:00:00 2001 From: keisku Date: Thu, 25 Apr 2024 11:13:03 +0900 Subject: [PATCH 1/8] add reason tags to KSM and kubelet metrics --- .../ksm/kubernetes_state_transformers.go | 10 ++-- .../ksm/kubernetes_state_transformers_test.go | 56 ++++++++++++++++--- .../kubelet/provider/pod/provider.go | 8 ++- 3 files changed, 62 insertions(+), 12 deletions(-) diff --git a/pkg/collector/corechecks/cluster/ksm/kubernetes_state_transformers.go b/pkg/collector/corechecks/cluster/ksm/kubernetes_state_transformers.go index bfaa00e4dd8c4..6b48ede738652 100644 --- a/pkg/collector/corechecks/cluster/ksm/kubernetes_state_transformers.go +++ b/pkg/collector/corechecks/cluster/ksm/kubernetes_state_transformers.go @@ -273,9 +273,11 @@ func containerWaitingReasonTransformer(s sender.Sender, name string, metric ksms } var allowedTerminatedReasons = map[string]struct{}{ - "oomkilled": {}, - "containercannotrun": {}, - "error": {}, + "oomkilled": {}, + "containercannotrun": {}, + "error": {}, + "deadlineexceeded": {}, + "backofflimitexceeded": {}, } // containerTerminatedReasonTransformer validates the container waiting reasons for metric kube_pod_container_status_terminated_reason @@ -426,7 +428,7 @@ func validateJob(val float64, tags []string) ([]string, bool) { kubeCronjob := "" for _, tag := range tags { split := strings.Split(tag, ":") - if len(split) == 2 && split[0] == "kube_job" || split[0] == "job" || split[0] == "job_name" { + if len(split) == 2 && split[0] == "kube_job" || split[0] == "job" || split[0] == "job_name" || split[0] == "reason" { // Trim the timestamp suffix to avoid high cardinality if name, trimmed := trimJobTag(split[1]); trimmed { // The trimmed job name corresponds to the parent cronjob name diff --git a/pkg/collector/corechecks/cluster/ksm/kubernetes_state_transformers_test.go b/pkg/collector/corechecks/cluster/ksm/kubernetes_state_transformers_test.go index 39251c412419b..c12a313b8397e 100644 --- a/pkg/collector/corechecks/cluster/ksm/kubernetes_state_transformers_test.go +++ b/pkg/collector/corechecks/cluster/ksm/kubernetes_state_transformers_test.go @@ -343,17 +343,17 @@ func Test_jobFailedTransformer(t *testing.T) { "condition": "true", }, }, - tags: []string{"job_name:foo-1509998340", "namespace:default", "condition:true"}, + tags: []string{"job_name:foo-1509998340", "namespace:default", "condition:true", "reason:backofflimitexceeded"}, }, expectedServiceCheck: &serviceCheck{ name: "kubernetes_state.job.complete", status: servicecheck.ServiceCheckCritical, - tags: []string{"kube_cronjob:foo", "namespace:default"}, + tags: []string{"kube_cronjob:foo", "namespace:default", "reason:backofflimitexceeded"}, }, expectedMetric: &metricsExpected{ name: "kubernetes_state.job.completion.failed", val: 1, - tags: []string{"kube_cronjob:foo", "namespace:default"}, + tags: []string{"kube_cronjob:foo", "namespace:default", "reason:backofflimitexceeded"}, }, }, { @@ -368,17 +368,17 @@ func Test_jobFailedTransformer(t *testing.T) { "condition": "true", }, }, - tags: []string{"job:foo-1509998340", "namespace:default", "condition:true"}, + tags: []string{"job:foo-1509998340", "namespace:default", "condition:true", "reason:deadlineexceeded"}, }, expectedServiceCheck: &serviceCheck{ name: "kubernetes_state.job.complete", status: servicecheck.ServiceCheckCritical, - tags: []string{"kube_cronjob:foo", "namespace:default"}, + tags: []string{"kube_cronjob:foo", "namespace:default", "reason:deadlineexceeded"}, }, expectedMetric: &metricsExpected{ name: "kubernetes_state.job.completion.failed", val: 1, - tags: []string{"kube_cronjob:foo", "namespace:default"}, + tags: []string{"kube_cronjob:foo", "namespace:default", "reason:deadlineexceeded"}, }, }, { @@ -393,7 +393,7 @@ func Test_jobFailedTransformer(t *testing.T) { "condition": "true", }, }, - tags: []string{"job_name:foo-1509998340", "namespace:default", "condition:true"}, + tags: []string{"job_name:foo-1509998340", "namespace:default", "condition:true", "reason:backofflimitexceeded"}, }, expectedServiceCheck: nil, expectedMetric: nil, @@ -1009,6 +1009,48 @@ func Test_containerTerminatedReasonTransformer(t *testing.T) { tags: []string{"container:foo", "pod:bar", "namespace:default", "reason:Error"}, }, }, + { + name: "BackoffLimitExceeded", + args: args{ + name: "kube_pod_container_status_terminated_reason", + metric: ksmstore.DDMetric{ + Val: 1, + Labels: map[string]string{ + "container": "foo", + "pod": "bar", + "namespace": "default", + "reason": "BackoffLimitExceeded", + }, + }, + tags: []string{"container:foo", "pod:bar", "namespace:default", "reason:BackoffLimitExceeded"}, + }, + expected: &metricsExpected{ + name: "kubernetes_state.container.status_report.count.terminated", + val: 1, + tags: []string{"container:foo", "pod:bar", "namespace:default", "reason:BackoffLimitExceeded"}, + }, + }, + { + name: "DeadlineExceeded", + args: args{ + name: "kube_pod_container_status_terminated_reason", + metric: ksmstore.DDMetric{ + Val: 1, + Labels: map[string]string{ + "container": "foo", + "pod": "bar", + "namespace": "default", + "reason": "DeadlineExceeded", + }, + }, + tags: []string{"container:foo", "pod:bar", "namespace:default", "reason:DeadlineExceeded"}, + }, + expected: &metricsExpected{ + name: "kubernetes_state.container.status_report.count.terminated", + val: 1, + tags: []string{"container:foo", "pod:bar", "namespace:default", "reason:DeadlineExceeded"}, + }, + }, } for _, tt := range tests { s := mocksender.NewMockSender("ksm") diff --git a/pkg/collector/corechecks/containers/kubelet/provider/pod/provider.go b/pkg/collector/corechecks/containers/kubelet/provider/pod/provider.go index 1aeaf2ca8a235..8773a9d2ddefa 100644 --- a/pkg/collector/corechecks/containers/kubelet/provider/pod/provider.go +++ b/pkg/collector/corechecks/containers/kubelet/provider/pod/provider.go @@ -37,7 +37,13 @@ var includeContainerStateReason = map[string][]string{ "invalidimagename", "createcontainerconfigerror", }, - "terminated": {"oomkilled", "containercannotrun", "error"}, + "terminated": { + "oomkilled", + "containercannotrun", + "error", + "deadlineexceeded", + "backofflimitexceeded", + }, } const kubeNamespaceTag = "kube_namespace" From 13f3195f4a1baffb035a746c1567df018f0dabee Mon Sep 17 00:00:00 2001 From: keisku Date: Sat, 27 Apr 2024 00:14:35 +0000 Subject: [PATCH 2/8] add releasenote --- .../add-reason-tags-to-ksm-26b5404f117c96e5.yaml | 13 +++++++++++++ ...on-tags-to-kubelet-metrics-c3c8800ae018992b.yaml | 12 ++++++++++++ 2 files changed, 25 insertions(+) create mode 100644 releasenotes-dca/notes/add-reason-tags-to-ksm-26b5404f117c96e5.yaml create mode 100644 releasenotes/notes/add-reason-tags-to-kubelet-metrics-c3c8800ae018992b.yaml diff --git a/releasenotes-dca/notes/add-reason-tags-to-ksm-26b5404f117c96e5.yaml b/releasenotes-dca/notes/add-reason-tags-to-ksm-26b5404f117c96e5.yaml new file mode 100644 index 0000000000000..b84a70cd2221b --- /dev/null +++ b/releasenotes-dca/notes/add-reason-tags-to-ksm-26b5404f117c96e5.yaml @@ -0,0 +1,13 @@ +# Each section from every release note are combined when the +# CHANGELOG.rst is rendered. So the text needs to be worded so that +# it does not depend on any information only available in another +# section. This may mean repeating some details, but each section +# must be readable independently of the other. +# +# Each section note must be formatted as reStructuredText. +--- +enhancements: + - | + Add ``reason:deadlineexceeded`` and ``reason:backofflimitexceeded`` + to ``kubernetes_state.container.status_report.count.terminated``. + Add ``reason`` tags to ``kubernetes_state.job.*``. diff --git a/releasenotes/notes/add-reason-tags-to-kubelet-metrics-c3c8800ae018992b.yaml b/releasenotes/notes/add-reason-tags-to-kubelet-metrics-c3c8800ae018992b.yaml new file mode 100644 index 0000000000000..54ad0d13732aa --- /dev/null +++ b/releasenotes/notes/add-reason-tags-to-kubelet-metrics-c3c8800ae018992b.yaml @@ -0,0 +1,12 @@ +# Each section from every release note are combined when the +# CHANGELOG.rst is rendered. So the text needs to be worded so that +# it does not depend on any information only available in another +# section. This may mean repeating some details, but each section +# must be readable independently of the other. +# +# Each section note must be formatted as reStructuredText. +--- +enhancements: + - | + Add ``reason:deadlineexceeded`` and ``reason:backofflimitexceeded`` to + ``kubernetes.containers.{state,last_status}.terminated``. From b014cdec2dd602c4e0c039b1ed4e413f479a673a Mon Sep 17 00:00:00 2001 From: keisku Date: Wed, 1 May 2024 06:20:18 +0000 Subject: [PATCH 3/8] Revert "add releasenote" This reverts commit 13f3195f4a1baffb035a746c1567df018f0dabee. --- .../add-reason-tags-to-ksm-26b5404f117c96e5.yaml | 13 ------------- ...on-tags-to-kubelet-metrics-c3c8800ae018992b.yaml | 12 ------------ 2 files changed, 25 deletions(-) delete mode 100644 releasenotes-dca/notes/add-reason-tags-to-ksm-26b5404f117c96e5.yaml delete mode 100644 releasenotes/notes/add-reason-tags-to-kubelet-metrics-c3c8800ae018992b.yaml diff --git a/releasenotes-dca/notes/add-reason-tags-to-ksm-26b5404f117c96e5.yaml b/releasenotes-dca/notes/add-reason-tags-to-ksm-26b5404f117c96e5.yaml deleted file mode 100644 index b84a70cd2221b..0000000000000 --- a/releasenotes-dca/notes/add-reason-tags-to-ksm-26b5404f117c96e5.yaml +++ /dev/null @@ -1,13 +0,0 @@ -# Each section from every release note are combined when the -# CHANGELOG.rst is rendered. So the text needs to be worded so that -# it does not depend on any information only available in another -# section. This may mean repeating some details, but each section -# must be readable independently of the other. -# -# Each section note must be formatted as reStructuredText. ---- -enhancements: - - | - Add ``reason:deadlineexceeded`` and ``reason:backofflimitexceeded`` - to ``kubernetes_state.container.status_report.count.terminated``. - Add ``reason`` tags to ``kubernetes_state.job.*``. diff --git a/releasenotes/notes/add-reason-tags-to-kubelet-metrics-c3c8800ae018992b.yaml b/releasenotes/notes/add-reason-tags-to-kubelet-metrics-c3c8800ae018992b.yaml deleted file mode 100644 index 54ad0d13732aa..0000000000000 --- a/releasenotes/notes/add-reason-tags-to-kubelet-metrics-c3c8800ae018992b.yaml +++ /dev/null @@ -1,12 +0,0 @@ -# Each section from every release note are combined when the -# CHANGELOG.rst is rendered. So the text needs to be worded so that -# it does not depend on any information only available in another -# section. This may mean repeating some details, but each section -# must be readable independently of the other. -# -# Each section note must be formatted as reStructuredText. ---- -enhancements: - - | - Add ``reason:deadlineexceeded`` and ``reason:backofflimitexceeded`` to - ``kubernetes.containers.{state,last_status}.terminated``. From 6a7ae4a1980a4b1e7fba2c97c3aa8502fbd1b384 Mon Sep 17 00:00:00 2001 From: keisku Date: Wed, 1 May 2024 06:20:30 +0000 Subject: [PATCH 4/8] Revert "add reason tags to KSM and kubelet metrics" This reverts commit 1be85ef633106eafe6b55fafd5780ba25f151bb7. --- .../ksm/kubernetes_state_transformers.go | 10 ++-- .../ksm/kubernetes_state_transformers_test.go | 56 +++---------------- .../kubelet/provider/pod/provider.go | 8 +-- 3 files changed, 12 insertions(+), 62 deletions(-) diff --git a/pkg/collector/corechecks/cluster/ksm/kubernetes_state_transformers.go b/pkg/collector/corechecks/cluster/ksm/kubernetes_state_transformers.go index 6b48ede738652..bfaa00e4dd8c4 100644 --- a/pkg/collector/corechecks/cluster/ksm/kubernetes_state_transformers.go +++ b/pkg/collector/corechecks/cluster/ksm/kubernetes_state_transformers.go @@ -273,11 +273,9 @@ func containerWaitingReasonTransformer(s sender.Sender, name string, metric ksms } var allowedTerminatedReasons = map[string]struct{}{ - "oomkilled": {}, - "containercannotrun": {}, - "error": {}, - "deadlineexceeded": {}, - "backofflimitexceeded": {}, + "oomkilled": {}, + "containercannotrun": {}, + "error": {}, } // containerTerminatedReasonTransformer validates the container waiting reasons for metric kube_pod_container_status_terminated_reason @@ -428,7 +426,7 @@ func validateJob(val float64, tags []string) ([]string, bool) { kubeCronjob := "" for _, tag := range tags { split := strings.Split(tag, ":") - if len(split) == 2 && split[0] == "kube_job" || split[0] == "job" || split[0] == "job_name" || split[0] == "reason" { + if len(split) == 2 && split[0] == "kube_job" || split[0] == "job" || split[0] == "job_name" { // Trim the timestamp suffix to avoid high cardinality if name, trimmed := trimJobTag(split[1]); trimmed { // The trimmed job name corresponds to the parent cronjob name diff --git a/pkg/collector/corechecks/cluster/ksm/kubernetes_state_transformers_test.go b/pkg/collector/corechecks/cluster/ksm/kubernetes_state_transformers_test.go index c12a313b8397e..39251c412419b 100644 --- a/pkg/collector/corechecks/cluster/ksm/kubernetes_state_transformers_test.go +++ b/pkg/collector/corechecks/cluster/ksm/kubernetes_state_transformers_test.go @@ -343,17 +343,17 @@ func Test_jobFailedTransformer(t *testing.T) { "condition": "true", }, }, - tags: []string{"job_name:foo-1509998340", "namespace:default", "condition:true", "reason:backofflimitexceeded"}, + tags: []string{"job_name:foo-1509998340", "namespace:default", "condition:true"}, }, expectedServiceCheck: &serviceCheck{ name: "kubernetes_state.job.complete", status: servicecheck.ServiceCheckCritical, - tags: []string{"kube_cronjob:foo", "namespace:default", "reason:backofflimitexceeded"}, + tags: []string{"kube_cronjob:foo", "namespace:default"}, }, expectedMetric: &metricsExpected{ name: "kubernetes_state.job.completion.failed", val: 1, - tags: []string{"kube_cronjob:foo", "namespace:default", "reason:backofflimitexceeded"}, + tags: []string{"kube_cronjob:foo", "namespace:default"}, }, }, { @@ -368,17 +368,17 @@ func Test_jobFailedTransformer(t *testing.T) { "condition": "true", }, }, - tags: []string{"job:foo-1509998340", "namespace:default", "condition:true", "reason:deadlineexceeded"}, + tags: []string{"job:foo-1509998340", "namespace:default", "condition:true"}, }, expectedServiceCheck: &serviceCheck{ name: "kubernetes_state.job.complete", status: servicecheck.ServiceCheckCritical, - tags: []string{"kube_cronjob:foo", "namespace:default", "reason:deadlineexceeded"}, + tags: []string{"kube_cronjob:foo", "namespace:default"}, }, expectedMetric: &metricsExpected{ name: "kubernetes_state.job.completion.failed", val: 1, - tags: []string{"kube_cronjob:foo", "namespace:default", "reason:deadlineexceeded"}, + tags: []string{"kube_cronjob:foo", "namespace:default"}, }, }, { @@ -393,7 +393,7 @@ func Test_jobFailedTransformer(t *testing.T) { "condition": "true", }, }, - tags: []string{"job_name:foo-1509998340", "namespace:default", "condition:true", "reason:backofflimitexceeded"}, + tags: []string{"job_name:foo-1509998340", "namespace:default", "condition:true"}, }, expectedServiceCheck: nil, expectedMetric: nil, @@ -1009,48 +1009,6 @@ func Test_containerTerminatedReasonTransformer(t *testing.T) { tags: []string{"container:foo", "pod:bar", "namespace:default", "reason:Error"}, }, }, - { - name: "BackoffLimitExceeded", - args: args{ - name: "kube_pod_container_status_terminated_reason", - metric: ksmstore.DDMetric{ - Val: 1, - Labels: map[string]string{ - "container": "foo", - "pod": "bar", - "namespace": "default", - "reason": "BackoffLimitExceeded", - }, - }, - tags: []string{"container:foo", "pod:bar", "namespace:default", "reason:BackoffLimitExceeded"}, - }, - expected: &metricsExpected{ - name: "kubernetes_state.container.status_report.count.terminated", - val: 1, - tags: []string{"container:foo", "pod:bar", "namespace:default", "reason:BackoffLimitExceeded"}, - }, - }, - { - name: "DeadlineExceeded", - args: args{ - name: "kube_pod_container_status_terminated_reason", - metric: ksmstore.DDMetric{ - Val: 1, - Labels: map[string]string{ - "container": "foo", - "pod": "bar", - "namespace": "default", - "reason": "DeadlineExceeded", - }, - }, - tags: []string{"container:foo", "pod:bar", "namespace:default", "reason:DeadlineExceeded"}, - }, - expected: &metricsExpected{ - name: "kubernetes_state.container.status_report.count.terminated", - val: 1, - tags: []string{"container:foo", "pod:bar", "namespace:default", "reason:DeadlineExceeded"}, - }, - }, } for _, tt := range tests { s := mocksender.NewMockSender("ksm") diff --git a/pkg/collector/corechecks/containers/kubelet/provider/pod/provider.go b/pkg/collector/corechecks/containers/kubelet/provider/pod/provider.go index 8773a9d2ddefa..1aeaf2ca8a235 100644 --- a/pkg/collector/corechecks/containers/kubelet/provider/pod/provider.go +++ b/pkg/collector/corechecks/containers/kubelet/provider/pod/provider.go @@ -37,13 +37,7 @@ var includeContainerStateReason = map[string][]string{ "invalidimagename", "createcontainerconfigerror", }, - "terminated": { - "oomkilled", - "containercannotrun", - "error", - "deadlineexceeded", - "backofflimitexceeded", - }, + "terminated": {"oomkilled", "containercannotrun", "error"}, } const kubeNamespaceTag = "kube_namespace" From 313399ce629453a1f61530b368c47996c08b9be0 Mon Sep 17 00:00:00 2001 From: keisku Date: Wed, 1 May 2024 07:36:22 +0000 Subject: [PATCH 5/8] add reason tag to kubernetes_state.job.failed --- .../ksm/kubernetes_state_transformers.go | 24 +++- .../ksm/kubernetes_state_transformers_test.go | 135 +++++++++++++++++- ...be_job_status_failed-755bbfbb67d7e4c6.yaml | 12 ++ 3 files changed, 164 insertions(+), 7 deletions(-) create mode 100644 releasenotes-dca/notes/add-reason-tags-to-kube_job_status_failed-755bbfbb67d7e4c6.yaml diff --git a/pkg/collector/corechecks/cluster/ksm/kubernetes_state_transformers.go b/pkg/collector/corechecks/cluster/ksm/kubernetes_state_transformers.go index bfaa00e4dd8c4..31d5c44dba61a 100644 --- a/pkg/collector/corechecks/cluster/ksm/kubernetes_state_transformers.go +++ b/pkg/collector/corechecks/cluster/ksm/kubernetes_state_transformers.go @@ -421,12 +421,25 @@ func trimJobTag(tag string) (string, bool) { return trimmed, tag != trimmed } +var allowedJobReasons = map[string]struct{}{ + "backofflimitexceeded": {}, + "deadlineexceeded": {}, +} + +func validJobReason(reason string) bool { + _, ok := allowedJobReasons[reason] + return ok +} + // validateJob detects active jobs and adds the `kube_cronjob` tag func validateJob(val float64, tags []string) ([]string, bool) { kubeCronjob := "" - for _, tag := range tags { + for i, tag := range tags { split := strings.Split(tag, ":") - if len(split) == 2 && split[0] == "kube_job" || split[0] == "job" || split[0] == "job_name" { + if len(split) != 2 { + continue + } + if split[0] == "kube_job" || split[0] == "job" || split[0] == "job_name" { // Trim the timestamp suffix to avoid high cardinality if name, trimmed := trimJobTag(split[1]); trimmed { // The trimmed job name corresponds to the parent cronjob name @@ -434,6 +447,9 @@ func validateJob(val float64, tags []string) ([]string, bool) { kubeCronjob = name } } + if split[0] == "reason" && !validJobReason(split[1]) { + tags = append(tags[:i], tags[i+1:]...) + } } if kubeCronjob != "" { @@ -482,10 +498,6 @@ func jobStatusFailedTransformer(s sender.Sender, name string, metric ksmstore.DD return } - if reasonTagIndex != -1 { - tags = append(tags[:reasonTagIndex], tags[reasonTagIndex+1:]...) - } - jobMetric(s, metric, ksmMetricPrefix+"job.failed", hostname, tags) } diff --git a/pkg/collector/corechecks/cluster/ksm/kubernetes_state_transformers_test.go b/pkg/collector/corechecks/cluster/ksm/kubernetes_state_transformers_test.go index 39251c412419b..091ab7fc08b23 100644 --- a/pkg/collector/corechecks/cluster/ksm/kubernetes_state_transformers_test.go +++ b/pkg/collector/corechecks/cluster/ksm/kubernetes_state_transformers_test.go @@ -544,7 +544,99 @@ func Test_jobStatusFailedTransformer(t *testing.T) { }, }, { - name: "irrelevant reason", + name: "BackoffLimitExceeded and value 0", + args: args{ + name: "kube_job_status_failed", + metric: ksmstore.DDMetric{ + Val: 0, + Labels: map[string]string{ + "job": "foo-1509998340", + "namespace": "default", + "reason": "BackoffLimitExceeded", + }, + }, + tags: []string{"job:foo-1509998340", "namespace:default", "reason:backofflimitexceeded"}, + }, + expected: nil, + }, + { + name: "BackoffLimitExceeded and value 1", + args: args{ + name: "kube_job_status_failed", + metric: ksmstore.DDMetric{ + Val: 1, + Labels: map[string]string{ + "job": "foo-1509998340", + "namespace": "default", + "reason": "BackoffLimitExceeded", + }, + }, + tags: []string{"job:foo-1509998340", "namespace:default", "reason:backofflimitexceeded"}, + }, + expected: &metricsExpected{ + name: "kubernetes_state.job.failed", + val: 1, + tags: []string{"kube_cronjob:foo", "namespace:default", "reason:backofflimitexceeded"}, + }, + }, + { + name: "DeadlineExceeded and value 0", + args: args{ + name: "kube_job_status_failed", + metric: ksmstore.DDMetric{ + Val: 0, + Labels: map[string]string{ + "job": "foo-1509998340", + "namespace": "default", + "reason": "DeadlineExceeded", + }, + }, + tags: []string{"job:foo-1509998340", "namespace:default", "reason:deadlineexceeded"}, + }, + expected: nil, + }, + { + name: "DeadlineExceeded and value 1.0", + args: args{ + name: "kube_job_status_failed", + metric: ksmstore.DDMetric{ + Val: 1, + Labels: map[string]string{ + "job": "foo-1509998340", + "namespace": "default", + "reason": "DeadlineExceeded", + }, + }, + tags: []string{"job:foo-1509998340", "namespace:default", "reason:deadlineexceeded"}, + }, + expected: &metricsExpected{ + name: "kubernetes_state.job.failed", + val: 1, + tags: []string{"kube_cronjob:foo", "namespace:default", "reason:deadlineexceeded"}, + }, + }, + { + name: "DeadlineExceeded and value 1.0", + args: args{ + name: "kube_job_status_failed", + metric: ksmstore.DDMetric{ + Val: 1, + Labels: map[string]string{ + "job": "foo-1509998340", + "namespace": "default", + "reason": "DeadlineExceeded", + }, + }, + tags: []string{"job:foo-1509998340", "namespace:default", "reason:deadlineexceeded"}, + }, + expected: &metricsExpected{ + name: "kubernetes_state.job.failed", + val: 1, + tags: []string{"kube_cronjob:foo", "namespace:default", "reason:deadlineexceeded"}, + }, + }, + { + name: "Evicted and 0", args: args{ name: "kube_job_status_failed", metric: ksmstore.DDMetric{ @@ -559,6 +651,26 @@ func Test_jobStatusFailedTransformer(t *testing.T) { }, expected: nil, }, + { + name: "Evicted and 1", + args: args{ + name: "kube_job_status_failed", + metric: ksmstore.DDMetric{ + Val: 1, + Labels: map[string]string{ + "job": "foo-1509998340", + "namespace": "default", + "reason": "Evicted", + }, + }, + tags: []string{"job:foo-1509998340", "namespace:default", "reason:Evicted"}, + }, + expected: &metricsExpected{ + name: "kubernetes_state.job.failed", + val: 1, + tags: []string{"kube_cronjob:foo", "namespace:default"}, + }, + }, { name: "inactive", args: args{ @@ -1562,6 +1674,27 @@ func Test_validateJob(t *testing.T) { want: []string{"foo:bar", "job_name:foo"}, want1: true, }, + { + name: "reason:backofflimitexceeded", + val: 1.0, + tags: []string{"foo:bar", "job_name:foo-1600167000", "kube_job:foo-1600167000", "reason:backofflimitexceeded"}, + want: []string{"foo:bar", "job_name:foo-1600167000", "kube_job:foo-1600167000", "kube_cronjob:foo", "reason:backofflimitexceeded"}, + want1: true, + }, + { + name: "reason:deadlineexceeded", + val: 1.0, + tags: []string{"foo:bar", "job_name:foo-1600167000", "reason:deadlineexceeded", "kube_job:foo-1600167000"}, + want: []string{"foo:bar", "job_name:foo-1600167000", "kube_job:foo-1600167000", "kube_cronjob:foo", "reason:deadlineexceeded"}, + want1: true, + }, + { + name: "invalid reason", + val: 1.0, + tags: []string{"foo:bar", "reason:error", "job_name:foo-1600167000", "kube_job:foo-1600167000"}, + want: []string{"foo:bar", "job_name:foo-1600167000", "kube_job:foo-1600167000", "kube_cronjob:foo"}, + want1: true, + }, { name: "invalid", val: 0.0, diff --git a/releasenotes-dca/notes/add-reason-tags-to-kube_job_status_failed-755bbfbb67d7e4c6.yaml b/releasenotes-dca/notes/add-reason-tags-to-kube_job_status_failed-755bbfbb67d7e4c6.yaml new file mode 100644 index 0000000000000..1e55bbe2b8a8d --- /dev/null +++ b/releasenotes-dca/notes/add-reason-tags-to-kube_job_status_failed-755bbfbb67d7e4c6.yaml @@ -0,0 +1,12 @@ +# Each section from every release note are combined when the +# CHANGELOG.rst is rendered. So the text needs to be worded so that +# it does not depend on any information only available in another +# section. This may mean repeating some details, but each section +# must be readable independently of the other. +# +# Each section note must be formatted as reStructuredText. +--- +enhancements: + - | + Add ``reason:backofflimitexceeded,deadlineexceeded`` to the + ``kubernetes_state.job.failed`` metric to help users understand why a job failed. From 2805b99a9884bac586b03e8c9a704ab0b351ba88 Mon Sep 17 00:00:00 2001 From: keisku Date: Wed, 1 May 2024 10:59:46 +0000 Subject: [PATCH 6/8] make validJobReason case-insensive --- .../cluster/ksm/kubernetes_state_transformers.go | 2 +- .../ksm/kubernetes_state_transformers_test.go | 12 ++++++------ 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/pkg/collector/corechecks/cluster/ksm/kubernetes_state_transformers.go b/pkg/collector/corechecks/cluster/ksm/kubernetes_state_transformers.go index 31d5c44dba61a..2fd769e78d0e9 100644 --- a/pkg/collector/corechecks/cluster/ksm/kubernetes_state_transformers.go +++ b/pkg/collector/corechecks/cluster/ksm/kubernetes_state_transformers.go @@ -427,7 +427,7 @@ var allowedJobReasons = map[string]struct{}{ } func validJobReason(reason string) bool { - _, ok := allowedJobReasons[reason] + _, ok := allowedJobReasons[strings.ToLower(reason)] return ok } diff --git a/pkg/collector/corechecks/cluster/ksm/kubernetes_state_transformers_test.go b/pkg/collector/corechecks/cluster/ksm/kubernetes_state_transformers_test.go index 091ab7fc08b23..41f377960197e 100644 --- a/pkg/collector/corechecks/cluster/ksm/kubernetes_state_transformers_test.go +++ b/pkg/collector/corechecks/cluster/ksm/kubernetes_state_transformers_test.go @@ -1675,17 +1675,17 @@ func Test_validateJob(t *testing.T) { want1: true, }, { - name: "reason:backofflimitexceeded", + name: "reason:BackoffLimitExceeded", val: 1.0, - tags: []string{"foo:bar", "job_name:foo-1600167000", "kube_job:foo-1600167000", "reason:backofflimitexceeded"}, - want: []string{"foo:bar", "job_name:foo-1600167000", "kube_job:foo-1600167000", "kube_cronjob:foo", "reason:backofflimitexceeded"}, + tags: []string{"foo:bar", "job_name:foo-1600167000", "kube_job:foo-1600167000", "reason:BackoffLimitExceeded"}, + want: []string{"foo:bar", "job_name:foo-1600167000", "kube_job:foo-1600167000", "kube_cronjob:foo", "reason:BackoffLimitExceeded"}, want1: true, }, { - name: "reason:deadlineexceeded", + name: "reason:DeadLineExceeded", val: 1.0, - tags: []string{"foo:bar", "job_name:foo-1600167000", "reason:deadlineexceeded", "kube_job:foo-1600167000"}, - want: []string{"foo:bar", "job_name:foo-1600167000", "kube_job:foo-1600167000", "kube_cronjob:foo", "reason:deadlineexceeded"}, + tags: []string{"foo:bar", "job_name:foo-1600167000", "reason:DeadLineExceeded", "kube_job:foo-1600167000"}, + want: []string{"foo:bar", "job_name:foo-1600167000", "kube_job:foo-1600167000", "kube_cronjob:foo", "reason:DeadLineExceeded"}, want1: true, }, { From a56a5ed892196e03b248fbfa7438a8be18926193 Mon Sep 17 00:00:00 2001 From: keisku Date: Wed, 1 May 2024 12:07:49 +0000 Subject: [PATCH 7/8] remove empty reason tag as well --- .../ksm/kubernetes_state_transformers.go | 18 +++++++++--------- .../ksm/kubernetes_state_transformers_test.go | 7 +++++++ 2 files changed, 16 insertions(+), 9 deletions(-) diff --git a/pkg/collector/corechecks/cluster/ksm/kubernetes_state_transformers.go b/pkg/collector/corechecks/cluster/ksm/kubernetes_state_transformers.go index 2fd769e78d0e9..6ee238422d4f0 100644 --- a/pkg/collector/corechecks/cluster/ksm/kubernetes_state_transformers.go +++ b/pkg/collector/corechecks/cluster/ksm/kubernetes_state_transformers.go @@ -421,13 +421,13 @@ func trimJobTag(tag string) (string, bool) { return trimmed, tag != trimmed } -var allowedJobReasons = map[string]struct{}{ +var jobFailureReasons = map[string]struct{}{ "backofflimitexceeded": {}, "deadlineexceeded": {}, } func validJobReason(reason string) bool { - _, ok := allowedJobReasons[strings.ToLower(reason)] + _, ok := jobFailureReasons[strings.ToLower(reason)] return ok } @@ -435,11 +435,14 @@ func validJobReason(reason string) bool { func validateJob(val float64, tags []string) ([]string, bool) { kubeCronjob := "" for i, tag := range tags { - split := strings.Split(tag, ":") - if len(split) != 2 { - continue + if strings.HasPrefix(tag, "reason:") { + if v := strings.TrimPrefix(tag, "reason:"); !validJobReason(v) { + tags = append(tags[:i], tags[i+1:]...) + continue + } } - if split[0] == "kube_job" || split[0] == "job" || split[0] == "job_name" { + split := strings.Split(tag, ":") + if len(split) == 2 && split[0] == "kube_job" || split[0] == "job" || split[0] == "job_name" { // Trim the timestamp suffix to avoid high cardinality if name, trimmed := trimJobTag(split[1]); trimmed { // The trimmed job name corresponds to the parent cronjob name @@ -447,9 +450,6 @@ func validateJob(val float64, tags []string) ([]string, bool) { kubeCronjob = name } } - if split[0] == "reason" && !validJobReason(split[1]) { - tags = append(tags[:i], tags[i+1:]...) - } } if kubeCronjob != "" { diff --git a/pkg/collector/corechecks/cluster/ksm/kubernetes_state_transformers_test.go b/pkg/collector/corechecks/cluster/ksm/kubernetes_state_transformers_test.go index 41f377960197e..d9df9dc7eadf1 100644 --- a/pkg/collector/corechecks/cluster/ksm/kubernetes_state_transformers_test.go +++ b/pkg/collector/corechecks/cluster/ksm/kubernetes_state_transformers_test.go @@ -1688,6 +1688,13 @@ func Test_validateJob(t *testing.T) { want: []string{"foo:bar", "job_name:foo-1600167000", "kube_job:foo-1600167000", "kube_cronjob:foo", "reason:DeadLineExceeded"}, want1: true, }, + { + name: "empty reason tag", + val: 1.0, + tags: []string{"reason:", "foo:bar", "job_name:foo-1600167000", "kube_job:foo-1600167000"}, + want: []string{"foo:bar", "job_name:foo-1600167000", "kube_job:foo-1600167000", "kube_cronjob:foo"}, + want1: true, + }, { name: "invalid reason", val: 1.0, From 1d21ebfc2abc63cca83f2b37b496739ba80465ae Mon Sep 17 00:00:00 2001 From: keisku Date: Wed, 1 May 2024 12:48:53 +0000 Subject: [PATCH 8/8] rename to validJobFailureReason --- .../corechecks/cluster/ksm/kubernetes_state_transformers.go | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pkg/collector/corechecks/cluster/ksm/kubernetes_state_transformers.go b/pkg/collector/corechecks/cluster/ksm/kubernetes_state_transformers.go index 6ee238422d4f0..608b3ad600b4d 100644 --- a/pkg/collector/corechecks/cluster/ksm/kubernetes_state_transformers.go +++ b/pkg/collector/corechecks/cluster/ksm/kubernetes_state_transformers.go @@ -426,7 +426,7 @@ var jobFailureReasons = map[string]struct{}{ "deadlineexceeded": {}, } -func validJobReason(reason string) bool { +func validJobFailureReason(reason string) bool { _, ok := jobFailureReasons[strings.ToLower(reason)] return ok } @@ -436,7 +436,7 @@ func validateJob(val float64, tags []string) ([]string, bool) { kubeCronjob := "" for i, tag := range tags { if strings.HasPrefix(tag, "reason:") { - if v := strings.TrimPrefix(tag, "reason:"); !validJobReason(v) { + if v := strings.TrimPrefix(tag, "reason:"); !validJobFailureReason(v) { tags = append(tags[:i], tags[i+1:]...) continue }