Skip to content

Commit

Permalink
Add reason tag to kubernetes_state.job.failed (#25103)
Browse files Browse the repository at this point in the history
* add reason tags to KSM and kubelet metrics

* add releasenote

* Revert "add releasenote"

This reverts commit 13f3195.

* Revert "add reason tags to KSM and kubelet metrics"

This reverts commit 1be85ef.

* add reason tag to kubernetes_state.job.failed

* make validJobReason case-insensive

* remove empty reason tag as well

* rename to validJobFailureReason
  • Loading branch information
keisku committed May 9, 2024
1 parent 93ed8f7 commit 4faaee8
Show file tree
Hide file tree
Showing 3 changed files with 170 additions and 6 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -421,10 +421,26 @@ func trimJobTag(tag string) (string, bool) {
return trimmed, tag != trimmed
}

var jobFailureReasons = map[string]struct{}{
"backofflimitexceeded": {},
"deadlineexceeded": {},
}

func validJobFailureReason(reason string) bool {
_, ok := jobFailureReasons[strings.ToLower(reason)]
return ok
}

// validateJob detects active jobs and adds the `kube_cronjob` tag
func validateJob(val float64, tags []string) ([]string, bool) {
kubeCronjob := ""
for _, tag := range tags {
for i, tag := range tags {
if strings.HasPrefix(tag, "reason:") {
if v := strings.TrimPrefix(tag, "reason:"); !validJobFailureReason(v) {
tags = append(tags[:i], tags[i+1:]...)
continue
}
}
split := strings.Split(tag, ":")
if len(split) == 2 && split[0] == "kube_job" || split[0] == "job" || split[0] == "job_name" {
// Trim the timestamp suffix to avoid high cardinality
Expand Down Expand Up @@ -482,10 +498,6 @@ func jobStatusFailedTransformer(s sender.Sender, name string, metric ksmstore.DD
return
}

if reasonTagIndex != -1 {
tags = append(tags[:reasonTagIndex], tags[reasonTagIndex+1:]...)
}

jobMetric(s, metric, ksmMetricPrefix+"job.failed", hostname, tags)
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -544,7 +544,99 @@ func Test_jobStatusFailedTransformer(t *testing.T) {
},
},
{
name: "irrelevant reason",
name: "BackoffLimitExceeded and value 0",
args: args{
name: "kube_job_status_failed",
metric: ksmstore.DDMetric{
Val: 0,
Labels: map[string]string{
"job": "foo-1509998340",
"namespace": "default",
"reason": "BackoffLimitExceeded",
},
},
tags: []string{"job:foo-1509998340", "namespace:default", "reason:backofflimitexceeded"},
},
expected: nil,
},
{
name: "BackoffLimitExceeded and value 1",
args: args{
name: "kube_job_status_failed",
metric: ksmstore.DDMetric{
Val: 1,
Labels: map[string]string{
"job": "foo-1509998340",
"namespace": "default",
"reason": "BackoffLimitExceeded",
},
},
tags: []string{"job:foo-1509998340", "namespace:default", "reason:backofflimitexceeded"},
},
expected: &metricsExpected{
name: "kubernetes_state.job.failed",
val: 1,
tags: []string{"kube_cronjob:foo", "namespace:default", "reason:backofflimitexceeded"},
},
},
{
name: "DeadlineExceeded and value 0",
args: args{
name: "kube_job_status_failed",
metric: ksmstore.DDMetric{
Val: 0,
Labels: map[string]string{
"job": "foo-1509998340",
"namespace": "default",
"reason": "DeadlineExceeded",
},
},
tags: []string{"job:foo-1509998340", "namespace:default", "reason:deadlineexceeded"},
},
expected: nil,
},
{
name: "DeadlineExceeded and value 1.0",
args: args{
name: "kube_job_status_failed",
metric: ksmstore.DDMetric{
Val: 1,
Labels: map[string]string{
"job": "foo-1509998340",
"namespace": "default",
"reason": "DeadlineExceeded",
},
},
tags: []string{"job:foo-1509998340", "namespace:default", "reason:deadlineexceeded"},
},
expected: &metricsExpected{
name: "kubernetes_state.job.failed",
val: 1,
tags: []string{"kube_cronjob:foo", "namespace:default", "reason:deadlineexceeded"},
},
},
{
name: "DeadlineExceeded and value 1.0",
args: args{
name: "kube_job_status_failed",
metric: ksmstore.DDMetric{
Val: 1,
Labels: map[string]string{
"job": "foo-1509998340",
"namespace": "default",
"reason": "DeadlineExceeded",
},
},
tags: []string{"job:foo-1509998340", "namespace:default", "reason:deadlineexceeded"},
},
expected: &metricsExpected{
name: "kubernetes_state.job.failed",
val: 1,
tags: []string{"kube_cronjob:foo", "namespace:default", "reason:deadlineexceeded"},
},
},
{
name: "Evicted and 0",
args: args{
name: "kube_job_status_failed",
metric: ksmstore.DDMetric{
Expand All @@ -559,6 +651,26 @@ func Test_jobStatusFailedTransformer(t *testing.T) {
},
expected: nil,
},
{
name: "Evicted and 1",
args: args{
name: "kube_job_status_failed",
metric: ksmstore.DDMetric{
Val: 1,
Labels: map[string]string{
"job": "foo-1509998340",
"namespace": "default",
"reason": "Evicted",
},
},
tags: []string{"job:foo-1509998340", "namespace:default", "reason:Evicted"},
},
expected: &metricsExpected{
name: "kubernetes_state.job.failed",
val: 1,
tags: []string{"kube_cronjob:foo", "namespace:default"},
},
},
{
name: "inactive",
args: args{
Expand Down Expand Up @@ -1562,6 +1674,34 @@ func Test_validateJob(t *testing.T) {
want: []string{"foo:bar", "job_name:foo"},
want1: true,
},
{
name: "reason:BackoffLimitExceeded",
val: 1.0,
tags: []string{"foo:bar", "job_name:foo-1600167000", "kube_job:foo-1600167000", "reason:BackoffLimitExceeded"},
want: []string{"foo:bar", "job_name:foo-1600167000", "kube_job:foo-1600167000", "kube_cronjob:foo", "reason:BackoffLimitExceeded"},
want1: true,
},
{
name: "reason:DeadLineExceeded",
val: 1.0,
tags: []string{"foo:bar", "job_name:foo-1600167000", "reason:DeadLineExceeded", "kube_job:foo-1600167000"},
want: []string{"foo:bar", "job_name:foo-1600167000", "kube_job:foo-1600167000", "kube_cronjob:foo", "reason:DeadLineExceeded"},
want1: true,
},
{
name: "empty reason tag",
val: 1.0,
tags: []string{"reason:", "foo:bar", "job_name:foo-1600167000", "kube_job:foo-1600167000"},
want: []string{"foo:bar", "job_name:foo-1600167000", "kube_job:foo-1600167000", "kube_cronjob:foo"},
want1: true,
},
{
name: "invalid reason",
val: 1.0,
tags: []string{"foo:bar", "reason:error", "job_name:foo-1600167000", "kube_job:foo-1600167000"},
want: []string{"foo:bar", "job_name:foo-1600167000", "kube_job:foo-1600167000", "kube_cronjob:foo"},
want1: true,
},
{
name: "invalid",
val: 0.0,
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
# Each section from every release note are combined when the
# CHANGELOG.rst is rendered. So the text needs to be worded so that
# it does not depend on any information only available in another
# section. This may mean repeating some details, but each section
# must be readable independently of the other.
#
# Each section note must be formatted as reStructuredText.
---
enhancements:
- |
Add ``reason:backofflimitexceeded,deadlineexceeded`` to the
``kubernetes_state.job.failed`` metric to help users understand why a job failed.

0 comments on commit 4faaee8

Please sign in to comment.