Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add reason tag to kubernetes_state.job.failed #25103

Merged
merged 9 commits into from
May 9, 2024
Merged
Original file line number Diff line number Diff line change
Expand Up @@ -273,9 +273,11 @@ func containerWaitingReasonTransformer(s sender.Sender, name string, metric ksms
}

var allowedTerminatedReasons = map[string]struct{}{
"oomkilled": {},
"containercannotrun": {},
"error": {},
"oomkilled": {},
"containercannotrun": {},
"error": {},
"deadlineexceeded": {},
"backofflimitexceeded": {},
}

// containerTerminatedReasonTransformer validates the container terminated reasons for metric kube_pod_container_status_terminated_reason
Expand Down Expand Up @@ -426,7 +428,7 @@ func validateJob(val float64, tags []string) ([]string, bool) {
kubeCronjob := ""
for _, tag := range tags {
split := strings.Split(tag, ":")
if len(split) == 2 && split[0] == "kube_job" || split[0] == "job" || split[0] == "job_name" {
if len(split) == 2 && split[0] == "kube_job" || split[0] == "job" || split[0] == "job_name" || split[0] == "reason" {
// Trim the timestamp suffix to avoid high cardinality
if name, trimmed := trimJobTag(split[1]); trimmed {
keisku marked this conversation as resolved.
Show resolved Hide resolved
// The trimmed job name corresponds to the parent cronjob name
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -343,17 +343,17 @@ func Test_jobFailedTransformer(t *testing.T) {
"condition": "true",
},
},
tags: []string{"job_name:foo-1509998340", "namespace:default", "condition:true"},
tags: []string{"job_name:foo-1509998340", "namespace:default", "condition:true", "reason:backofflimitexceeded"},
},
expectedServiceCheck: &serviceCheck{
name: "kubernetes_state.job.complete",
status: servicecheck.ServiceCheckCritical,
tags: []string{"kube_cronjob:foo", "namespace:default"},
tags: []string{"kube_cronjob:foo", "namespace:default", "reason:backofflimitexceeded"},
},
expectedMetric: &metricsExpected{
name: "kubernetes_state.job.completion.failed",
val: 1,
tags: []string{"kube_cronjob:foo", "namespace:default"},
tags: []string{"kube_cronjob:foo", "namespace:default", "reason:backofflimitexceeded"},
},
},
{
Expand All @@ -368,17 +368,17 @@ func Test_jobFailedTransformer(t *testing.T) {
"condition": "true",
},
},
tags: []string{"job:foo-1509998340", "namespace:default", "condition:true"},
tags: []string{"job:foo-1509998340", "namespace:default", "condition:true", "reason:deadlineexceeded"},
},
expectedServiceCheck: &serviceCheck{
name: "kubernetes_state.job.complete",
status: servicecheck.ServiceCheckCritical,
tags: []string{"kube_cronjob:foo", "namespace:default"},
tags: []string{"kube_cronjob:foo", "namespace:default", "reason:deadlineexceeded"},
},
expectedMetric: &metricsExpected{
name: "kubernetes_state.job.completion.failed",
val: 1,
tags: []string{"kube_cronjob:foo", "namespace:default"},
tags: []string{"kube_cronjob:foo", "namespace:default", "reason:deadlineexceeded"},
},
},
{
Expand All @@ -393,7 +393,7 @@ func Test_jobFailedTransformer(t *testing.T) {
"condition": "true",
},
},
tags: []string{"job_name:foo-1509998340", "namespace:default", "condition:true"},
tags: []string{"job_name:foo-1509998340", "namespace:default", "condition:true", "reason:backofflimitexceeded"},
},
expectedServiceCheck: nil,
expectedMetric: nil,
Expand Down Expand Up @@ -1009,6 +1009,48 @@ func Test_containerTerminatedReasonTransformer(t *testing.T) {
tags: []string{"container:foo", "pod:bar", "namespace:default", "reason:Error"},
},
},
{
name: "BackoffLimitExceeded",
args: args{
name: "kube_pod_container_status_terminated_reason",
metric: ksmstore.DDMetric{
Val: 1,
Labels: map[string]string{
"container": "foo",
"pod": "bar",
"namespace": "default",
"reason": "BackoffLimitExceeded",
},
},
tags: []string{"container:foo", "pod:bar", "namespace:default", "reason:BackoffLimitExceeded"},
},
expected: &metricsExpected{
name: "kubernetes_state.container.status_report.count.terminated",
val: 1,
tags: []string{"container:foo", "pod:bar", "namespace:default", "reason:BackoffLimitExceeded"},
},
},
{
name: "DeadlineExceeded",
args: args{
name: "kube_pod_container_status_terminated_reason",
metric: ksmstore.DDMetric{
Val: 1,
Labels: map[string]string{
"container": "foo",
"pod": "bar",
"namespace": "default",
"reason": "DeadlineExceeded",
},
},
tags: []string{"container:foo", "pod:bar", "namespace:default", "reason:DeadlineExceeded"},
},
expected: &metricsExpected{
name: "kubernetes_state.container.status_report.count.terminated",
val: 1,
tags: []string{"container:foo", "pod:bar", "namespace:default", "reason:DeadlineExceeded"},
},
},
}
for _, tt := range tests {
s := mocksender.NewMockSender("ksm")
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,13 @@ var includeContainerStateReason = map[string][]string{
"invalidimagename",
"createcontainerconfigerror",
},
"terminated": {"oomkilled", "containercannotrun", "error"},
"terminated": {
"oomkilled",
"containercannotrun",
"error",
"deadlineexceeded",
"backofflimitexceeded",
},
}

const kubeNamespaceTag = "kube_namespace"
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
# Each section from every release note is combined when the
# CHANGELOG.rst is rendered. So the text needs to be worded so that
# it does not depend on any information only available in another
# section. This may mean repeating some details, but each section
# must be readable independently of the other.
#
# Each section note must be formatted as reStructuredText.
---
enhancements:
- |
Add ``reason:deadlineexceeded`` and ``reason:backofflimitexceeded``
to ``kubernetes_state.container.status_report.count.terminated``.
Add ``reason`` tags to ``kubernetes_state.job.*``.
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
# Each section from every release note is combined when the
# CHANGELOG.rst is rendered. So the text needs to be worded so that
# it does not depend on any information only available in another
# section. This may mean repeating some details, but each section
# must be readable independently of the other.
#
# Each section note must be formatted as reStructuredText.
---
enhancements:
- |
Add ``reason:deadlineexceeded`` and ``reason:backofflimitexceeded`` to
``kubernetes.containers.{state,last_status}.terminated``.
Loading