Skip to content

Commit

Permalink
Extend the Job API for BackoffLimitPerIndex
Browse files Browse the repository at this point in the history
  • Loading branch information
mimowo committed Jul 18, 2023
1 parent b2a9c06 commit fcbfdc1
Show file tree
Hide file tree
Showing 31 changed files with 1,359 additions and 133 deletions.
16 changes: 15 additions & 1 deletion api/openapi-spec/swagger.json

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

16 changes: 15 additions & 1 deletion api/openapi-spec/v3/apis__batch__v1_openapi.json
Original file line number Diff line number Diff line change
Expand Up @@ -332,6 +332,11 @@
"format": "int32",
"type": "integer"
},
"backoffLimitPerIndex": {
"description": "Specifies the limit for the number of retries within an index before marking this index as failed. When enabled the number of failures per index is kept in the pod's batch.kubernetes.io/job-index-failure-count annotation. It can only be set when Job's completionMode=Indexed, and the Pod's restart policy is Never. The field is immutable. This field is alpha-level. It can be used when the `JobBackoffLimitPerIndex` feature gate is enabled (disabled by default).",
"format": "int32",
"type": "integer"
},
"completionMode": {
"description": "completionMode specifies how Pod completions are tracked. It can be `NonIndexed` (default) or `Indexed`.\n\n`NonIndexed` means that the Job is considered complete when there have been .spec.completions successfully completed Pods. Each Pod completion is homologous to each other.\n\n`Indexed` means that the Pods of a Job get an associated completion index from 0 to (.spec.completions - 1), available in the annotation batch.kubernetes.io/job-completion-index. The Job is considered complete when there is one successfully completed Pod for each index. When value is `Indexed`, .spec.completions must be specified and `.spec.parallelism` must be less than or equal to 10^5. In addition, The Pod name takes the form `$(job-name)-$(index)-$(random-string)`, the Pod hostname takes the form `$(job-name)-$(index)`.\n\nMore completion modes can be added in the future. If the Job controller observes a mode that it doesn't recognize, which is possible during upgrades due to version skew, the controller skips updates for the Job.",
"type": "string"
Expand All @@ -345,6 +350,11 @@
"description": "manualSelector controls generation of pod labels and pod selectors. Leave `manualSelector` unset unless you are certain what you are doing. When false or unset, the system pick labels unique to this job and appends those labels to the pod template. When true, the user is responsible for picking unique labels and specifying the selector. Failure to pick a unique label may cause this and other jobs to not function correctly. However, You may see `manualSelector=true` in jobs that were created with the old `extensions/v1beta1` API. More info: https://kubernetes.io/docs/concepts/workloads/controllers/jobs-run-to-completion/#specifying-your-own-pod-selector",
"type": "boolean"
},
"maxFailedIndexes": {
"description": "Specifies the maximal number of failed indexes before marking the Job as failed, when backoffLimitPerIndex is set. Once the number of failed indexes exceeds this number the entire Job is marked as Failed and its execution is terminated. When left as null the job continues execution of all of its indexes and is marked with the `Complete` Job condition. It can only be specified when backoffLimitPerIndex is set. It can be null or up to completions. It is required and must be less than or equal to 10^4 when is completions greater than 10^5. This field is alpha-level. It can be used when the `JobBackoffLimitPerIndex` feature gate is enabled (disabled by default).",
"format": "int32",
"type": "integer"
},
"parallelism": {
"description": "Specifies the maximum desired number of pods the job should run at any given time. The actual number of pods running in steady state will be less than this number when ((.spec.completions - .status.successful) < .spec.parallelism), i.e. when the work left to do is less than max parallelism. More info: https://kubernetes.io/docs/concepts/workloads/controllers/jobs-run-to-completion/",
"format": "int32",
Expand Down Expand Up @@ -430,6 +440,10 @@
"format": "int32",
"type": "integer"
},
"failedIndexes": {
"description": "FailedIndexes holds the failed indexes when backoffLimitPerIndex=true. The indexes are represented in the text format analogous as for the `completedIndexes` field, ie. they are kept as decimal integers separated by commas. The numbers are listed in increasing order. Three or more consecutive numbers are compressed and represented by the first and last element of the series, separated by a hyphen. For example, if the failed indexes are 1, 3, 4, 5 and 7, they are represented as \"1,3-5,7\". This field is alpha-level. It can be used when the `JobBackoffLimitPerIndex` feature gate is enabled (disabled by default).",
"type": "string"
},
"ready": {
"description": "The number of pods which have a Ready condition.\n\nThis field is beta-level. The job controller populates the field when the feature gate JobReadyPods is enabled (enabled by default).",
"format": "int32",
Expand Down Expand Up @@ -559,7 +573,7 @@
"properties": {
"action": {
"default": "",
"description": "Specifies the action taken on a pod failure when the requirements are satisfied. Possible values are:\n\n- FailJob: indicates that the pod's job is marked as Failed and all\n running pods are terminated.\n- Ignore: indicates that the counter towards the .backoffLimit is not\n incremented and a replacement pod is created.\n- Count: indicates that the pod is handled in the default way - the\n counter towards the .backoffLimit is incremented.\nAdditional values are considered to be added in the future. Clients should react to an unknown action by skipping the rule.",
"description": "Specifies the action taken on a pod failure when the requirements are satisfied. Possible values are:\n\n- FailJob: indicates that the pod's job is marked as Failed and all\n running pods are terminated.\n- FailIndex: indicates that the pod's index is marked as Failed and will\n not be restarted.\n This value is alpha-level. It can be used when the\n `JobBackoffLimitPerIndex` feature gate is enabled (disabled by default).\n- Ignore: indicates that the counter towards the .backoffLimit is not\n incremented and a replacement pod is created.\n- Count: indicates that the pod is handled in the default way - the\n counter towards the .backoffLimit is incremented.\nAdditional values are considered to be added in the future. Clients should react to an unknown action by skipping the rule.",
"type": "string"
},
"onExitCodes": {
Expand Down
7 changes: 7 additions & 0 deletions pkg/apis/batch/fuzzer/fuzzer.go
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,8 @@ limitations under the License.
package fuzzer

import (
"math"

fuzz "github.com/google/gofuzz"
runtimeserializer "k8s.io/apimachinery/pkg/runtime/serializer"
"k8s.io/kubernetes/pkg/apis/batch"
Expand Down Expand Up @@ -51,6 +53,11 @@ var Funcs = func(codecs runtimeserializer.CodecFactory) []interface{} {
mode := batch.NonIndexedCompletion
if c.RandBool() {
mode = batch.IndexedCompletion
j.BackoffLimitPerIndex = pointer.Int32(c.Rand.Int31())
j.MaxFailedIndexes = pointer.Int32(c.Rand.Int31())
}
if c.RandBool() {
j.BackoffLimit = pointer.Int32(math.MaxInt32)
}
j.CompletionMode = &mode
// We're fuzzing the internal JobSpec type, not the v1 type, so we don't
Expand Down
50 changes: 50 additions & 0 deletions pkg/apis/batch/types.go
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,9 @@ const (
JobNameLabel = labelPrefix + LegacyJobNameLabel
// Controller UID is used for selectors and labels for jobs
ControllerUidLabel = labelPrefix + LegacyControllerUidLabel
// Annotation indicating the number of failures for the index corresponding
// to the pod.
JobIndexFailureCountAnnotation = labelPrefix + "job-index-failure-count"
)

// +k8s:deepcopy-gen:interfaces=k8s.io/apimachinery/pkg/runtime.Object
Expand Down Expand Up @@ -119,6 +122,12 @@ const (
// pod's job as Failed and terminate all running pods.
PodFailurePolicyActionFailJob PodFailurePolicyAction = "FailJob"

// This is an action which might be taken on a pod failure - mark the
// Job's index as failed to avoid restarts within this index. This action
// can only be used when backoffLimitPerIndex is set.
// This value is alpha-level.
PodFailurePolicyActionFailIndex PodFailurePolicyAction = "FailIndex"

// This is an action which might be taken on a pod failure - the counter towards
// .backoffLimit, represented by the job's .status.failed field, is not
// incremented and a replacement pod is created.
Expand Down Expand Up @@ -195,6 +204,10 @@ type PodFailurePolicyRule struct {
//
// - FailJob: indicates that the pod's job is marked as Failed and all
// running pods are terminated.
// - FailIndex: indicates that the pod's index is marked as Failed and will
// not be restarted.
// This value is alpha-level. It can be used when the
// `JobBackoffLimitPerIndex` feature gate is enabled (disabled by default).
// - Ignore: indicates that the counter towards the .backoffLimit is not
// incremented and a replacement pod is created.
// - Count: indicates that the pod is handled in the default way - the
Expand Down Expand Up @@ -269,6 +282,30 @@ type JobSpec struct {
// +optional
BackoffLimit *int32

// Specifies the limit for the number of retries within an
// index before marking this index as failed. When enabled the number of
// failures per index is kept in the pod's
// batch.kubernetes.io/job-index-failure-count annotation. It can only
// be set when Job's completionMode=Indexed, and the Pod's restart
// policy is Never. The field is immutable.
// This field is alpha-level. It can be used when the `JobBackoffLimitPerIndex`
// feature gate is enabled (disabled by default).
// +optional
BackoffLimitPerIndex *int32

// Specifies the maximal number of failed indexes before marking the Job as
// failed, when backoffLimitPerIndex is set. Once the number of failed
// indexes exceeds this number the entire Job is marked as Failed and its
// execution is terminated. When left as null the job continues execution of
// all of its indexes and is marked with the `Complete` Job condition.
// It can only be specified when backoffLimitPerIndex is set.
// It can be null or up to completions. It is required and must be
// less than or equal to 10^4 when is completions greater than 10^5.
// This field is alpha-level. It can be used when the `JobBackoffLimitPerIndex`
// feature gate is enabled (disabled by default).
// +optional
MaxFailedIndexes *int32

// TODO enabled it when https://github.com/kubernetes/kubernetes/issues/28486 has been fixed
// Optional number of failed pods to retain.
// +optional
Expand Down Expand Up @@ -397,6 +434,19 @@ type JobStatus struct {
// +optional
CompletedIndexes string

// FailedIndexes holds the failed indexes when backoffLimitPerIndex=true.
// The indexes are represented in the text format analogous as for the
// `completedIndexes` field, ie. they are kept as decimal integers
// separated by commas. The numbers are listed in increasing order. Three or
// more consecutive numbers are compressed and represented by the first and
// last element of the series, separated by a hyphen.
// For example, if the failed indexes are 1, 3, 4, 5 and 7, they are
// represented as "1,3-5,7".
// This field is alpha-level. It can be used when the `JobBackoffLimitPerIndex`
// feature gate is enabled (disabled by default).
// +optional
FailedIndexes *string

// uncountedTerminatedPods holds the UIDs of Pods that have terminated but
// the job controller hasn't yet accounted for in the status counters.
//
Expand Down
8 changes: 7 additions & 1 deletion pkg/apis/batch/v1/defaults.go
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,8 @@ limitations under the License.
package v1

import (
"math"

batchv1 "k8s.io/api/batch/v1"
corev1 "k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/runtime"
Expand All @@ -38,7 +40,11 @@ func SetDefaults_Job(obj *batchv1.Job) {
obj.Spec.Parallelism = utilpointer.Int32(1)
}
if obj.Spec.BackoffLimit == nil {
obj.Spec.BackoffLimit = utilpointer.Int32(6)
if obj.Spec.BackoffLimitPerIndex != nil {
obj.Spec.BackoffLimit = utilpointer.Int32(math.MaxInt32)
} else {
obj.Spec.BackoffLimit = utilpointer.Int32(6)
}
}
labels := obj.Spec.Template.Labels
if labels != nil && len(obj.Labels) == 0 {
Expand Down
Loading

0 comments on commit fcbfdc1

Please sign in to comment.