Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add limit to the retry period for re-queuing workloads #2264

Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions apis/config/v1beta1/configuration_types.go
Original file line number Diff line number Diff line change
Expand Up @@ -259,6 +259,12 @@ type RequeuingStrategy struct {
// Defaults to 60.
// +optional
BackoffBaseSeconds *int32 `json:"backoffBaseSeconds,omitempty"`

// BackoffMaxSeconds defines the maximum length of the retry period for re-queuing.
IrvingMg marked this conversation as resolved.
Show resolved Hide resolved
//
// Defaults to 3600.
// +optional
BackoffMaxSeconds *int32 `json:"backoffMaxSeconds,omitempty"`
}

type RequeuingTimestamp string
Expand Down
4 changes: 4 additions & 0 deletions apis/config/v1beta1/defaults.go
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,7 @@ const (
DefaultMultiKueueOrigin = "multikueue"
DefaultMultiKueueWorkerLostTimeout = 15 * time.Minute
DefaultRequeuingBackoffBaseSeconds = 60
DefaultRequeuingBackoffMaxSeconds = 3600
)

func getOperatorNamespace() string {
Expand Down Expand Up @@ -133,6 +134,9 @@ func SetDefaults_Configuration(cfg *Configuration) {
if cfg.WaitForPodsReady.RequeuingStrategy.BackoffBaseSeconds == nil {
cfg.WaitForPodsReady.RequeuingStrategy.BackoffBaseSeconds = ptr.To[int32](DefaultRequeuingBackoffBaseSeconds)
}
if cfg.WaitForPodsReady.RequeuingStrategy.BackoffMaxSeconds == nil {
cfg.WaitForPodsReady.RequeuingStrategy.BackoffMaxSeconds = ptr.To[int32](DefaultRequeuingBackoffMaxSeconds)
}
}
if cfg.Integrations == nil {
cfg.Integrations = &Integrations{}
Expand Down
4 changes: 4 additions & 0 deletions apis/config/v1beta1/defaults_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -366,6 +366,7 @@ func TestSetDefaults_Configuration(t *testing.T) {
RequeuingStrategy: &RequeuingStrategy{
Timestamp: ptr.To(EvictionTimestamp),
BackoffBaseSeconds: ptr.To[int32](DefaultRequeuingBackoffBaseSeconds),
BackoffMaxSeconds: ptr.To[int32](DefaultRequeuingBackoffMaxSeconds),
},
},
Namespace: ptr.To(DefaultNamespace),
Expand Down Expand Up @@ -396,6 +397,7 @@ func TestSetDefaults_Configuration(t *testing.T) {
RequeuingStrategy: &RequeuingStrategy{
Timestamp: ptr.To(EvictionTimestamp),
BackoffBaseSeconds: ptr.To[int32](DefaultRequeuingBackoffBaseSeconds),
BackoffMaxSeconds: ptr.To[int32](DefaultRequeuingBackoffMaxSeconds),
},
},
Namespace: ptr.To(DefaultNamespace),
Expand All @@ -417,6 +419,7 @@ func TestSetDefaults_Configuration(t *testing.T) {
RequeuingStrategy: &RequeuingStrategy{
Timestamp: ptr.To(CreationTimestamp),
BackoffBaseSeconds: ptr.To[int32](63),
BackoffMaxSeconds: ptr.To[int32](1800),
},
},
InternalCertManagement: &InternalCertManagement{
Expand All @@ -431,6 +434,7 @@ func TestSetDefaults_Configuration(t *testing.T) {
RequeuingStrategy: &RequeuingStrategy{
Timestamp: ptr.To(CreationTimestamp),
BackoffBaseSeconds: ptr.To[int32](63),
BackoffMaxSeconds: ptr.To[int32](1800),
},
},
Namespace: ptr.To(DefaultNamespace),
Expand Down
5 changes: 5 additions & 0 deletions apis/config/v1beta1/zz_generated.deepcopy.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 2 additions & 0 deletions pkg/config/config_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -158,6 +158,7 @@ waitForPodsReady:
timestamp: Creation
backoffLimitCount: 10
backoffBaseSeconds: 30
backoffMaxSeconds: 1800
`), os.FileMode(0600)); err != nil {
t.Fatal(err)
}
Expand Down Expand Up @@ -556,6 +557,7 @@ multiKueue:
Timestamp: ptr.To(configapi.CreationTimestamp),
BackoffLimitCount: ptr.To[int32](10),
BackoffBaseSeconds: ptr.To[int32](30),
BackoffMaxSeconds: ptr.To[int32](1800),
},
},
ClientConnection: defaultClientConnection,
Expand Down
4 changes: 4 additions & 0 deletions pkg/config/validation.go
Original file line number Diff line number Diff line change
Expand Up @@ -130,6 +130,10 @@ func validateWaitForPodsReady(c *configapi.Configuration) field.ErrorList {
allErrs = append(allErrs, field.Invalid(requeuingStrategyPath.Child("backoffBaseSeconds"),
*strategy.BackoffBaseSeconds, constants.IsNegativeErrorMsg))
}
if strategy.BackoffMaxSeconds != nil && *strategy.BackoffMaxSeconds < 0 {
IrvingMg marked this conversation as resolved.
Show resolved Hide resolved
allErrs = append(allErrs, field.Invalid(requeuingStrategyPath.Child("backoffMaxSeconds"),
*strategy.BackoffMaxSeconds, constants.IsNegativeErrorMsg))
}
}
return allErrs
}
Expand Down
18 changes: 18 additions & 0 deletions pkg/config/validation_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -354,6 +354,7 @@ func TestValidate(t *testing.T) {
Timestamp: ptr.To(configapi.CreationTimestamp),
BackoffLimitCount: ptr.To[int32](10),
BackoffBaseSeconds: ptr.To[int32](30),
BackoffMaxSeconds: ptr.To[int32](1800),
},
},
},
Expand Down Expand Up @@ -392,6 +393,23 @@ func TestValidate(t *testing.T) {
},
},
},
"negative waitForPodsReady.requeuingStrategy.backoffMaxSeconds": {
cfg: &configapi.Configuration{
Integrations: defaultIntegrations,
WaitForPodsReady: &configapi.WaitForPodsReady{
Enable: true,
RequeuingStrategy: &configapi.RequeuingStrategy{
BackoffMaxSeconds: ptr.To[int32](-1),
},
},
},
wantErr: field.ErrorList{
&field.Error{
Type: field.ErrorTypeInvalid,
Field: "waitForPodsReady.requeuingStrategy.backoffMaxSeconds",
},
},
},
"negative multiKueue.gcInterval": {
cfg: &configapi.Configuration{
Integrations: defaultIntegrations,
Expand Down
1 change: 1 addition & 0 deletions pkg/controller/core/core.go
Original file line number Diff line number Diff line change
Expand Up @@ -85,6 +85,7 @@ func waitForPodsReady(cfg *configapi.WaitForPodsReady) *waitForPodsReadyConfig {
if cfg.RequeuingStrategy != nil {
result.requeuingBackoffBaseSeconds = *cfg.RequeuingStrategy.BackoffBaseSeconds
result.requeuingBackoffLimitCount = cfg.RequeuingStrategy.BackoffLimitCount
result.requeuingBackoffMaxSeconds = *cfg.RequeuingStrategy.BackoffMaxSeconds
result.requeuingBackoffJitter = 0.0001
}
return &result
Expand Down
13 changes: 13 additions & 0 deletions pkg/controller/core/workload_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -74,6 +74,7 @@ type waitForPodsReadyConfig struct {
timeout time.Duration
requeuingBackoffLimitCount *int32
requeuingBackoffBaseSeconds int32
requeuingBackoffMaxSeconds int32
requeuingBackoffJitter float64
}

Expand Down Expand Up @@ -470,6 +471,18 @@ func (r *WorkloadReconciler) triggerDeactivationOrBackoffRequeue(ctx context.Con
for backoff.Steps > 0 {
waitDuration = backoff.Step()
}

maxWaitDuration := time.Duration(r.waitForPodsReady.requeuingBackoffMaxSeconds) * time.Second
if waitDuration > maxWaitDuration {
wl.Spec.Active = ptr.To(false)
if err := r.client.Update(ctx, wl); err != nil {
return false, err
}
r.recorder.Eventf(wl, corev1.EventTypeNormal, kueue.WorkloadEvictedByDeactivation,
"Deactivated Workload %q by reached re-queue backoffMaxSeconds", klog.KObj(wl))
return true, nil
}
IrvingMg marked this conversation as resolved.
Show resolved Hide resolved

wl.Status.RequeueState.RequeueAt = ptr.To(metav1.NewTime(r.clock.Now().Add(waitDuration)))
wl.Status.RequeueState.Count = &requeuingCount
return false, nil
Expand Down
44 changes: 44 additions & 0 deletions pkg/controller/core/workload_controller_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -511,6 +511,7 @@ func TestReconcile(t *testing.T) {
requeuingBackoffLimitCount: ptr.To[int32](100),
requeuingBackoffBaseSeconds: 10,
requeuingBackoffJitter: 0,
requeuingBackoffMaxSeconds: 3600,
}),
},
workload: utiltesting.MakeWorkload("wl", "ns").
Expand Down Expand Up @@ -590,6 +591,49 @@ func TestReconcile(t *testing.T) {
Message: "Deactivated Workload \"ns/wl\" by reached re-queue backoffLimitCount",
}},
},
"deactivate workload when reaching backoffMaxSeconds": {
reconcilerOpts: []Option{
WithWaitForPodsReady(&waitForPodsReadyConfig{
timeout: 3 * time.Second,
requeuingBackoffLimitCount: ptr.To[int32](100),
requeuingBackoffBaseSeconds: 10,
requeuingBackoffJitter: 0,
requeuingBackoffMaxSeconds: 7200,
}),
},
workload: utiltesting.MakeWorkload("wl", "ns").
ReserveQuota(utiltesting.MakeAdmission("q1").Obj()).
AdmissionCheck(kueue.AdmissionCheckState{
Name: "check",
State: kueue.CheckStateReady,
}).
Condition(metav1.Condition{ // Override LastTransitionTime
Type: kueue.WorkloadAdmitted,
Status: metav1.ConditionTrue,
LastTransitionTime: metav1.NewTime(testStartTime.Add(-5 * time.Minute)),
Reason: "ByTest",
Message: "Admitted by ClusterQueue q1",
}).
Admitted(true).
RequeueState(ptr.To[int32](10), ptr.To(metav1.NewTime(testStartTime.Add(1*time.Second).Truncate(time.Second)))).
Obj(),
wantWorkload: utiltesting.MakeWorkload("wl", "ns").
Active(false).
ReserveQuota(utiltesting.MakeAdmission("q1").Obj()).
Admitted(true).
AdmissionCheck(kueue.AdmissionCheckState{
Name: "check",
State: kueue.CheckStateReady,
}).
RequeueState(ptr.To[int32](10), ptr.To(metav1.NewTime(testStartTime.Add(1*time.Second).Truncate(time.Second)))).
Obj(),
wantEvents: []utiltesting.EventRecord{{
Key: types.NamespacedName{Name: "wl", Namespace: "ns"},
EventType: v1.EventTypeNormal,
Reason: kueue.WorkloadEvictedByDeactivation,
Message: "Deactivated Workload \"ns/wl\" by reached re-queue backoffMaxSeconds",
}},
},
"should set the WorkloadRequeued condition to true on re-activated": {
workload: utiltesting.MakeWorkload("wl", "ns").
Active(true).
Expand Down
8 changes: 8 additions & 0 deletions site/content/en/docs/reference/kueue-config.v1beta1.md
Original file line number Diff line number Diff line change
Expand Up @@ -733,6 +733,14 @@ re-queuing an evicted workload.</p>
<p>Defaults to 60.</p>
</td>
</tr>
<tr><td><code>backoffMaxSeconds</code><br/>
<code>int32</code>
</td>
<td>
<p>BackoffMaxSeconds defines the maximum length of the retry period for re-queuing.</p>
<p>Defaults to 3600.</p>
</td>
</tr>
</tbody>
</table>

Expand Down