Skip to content
This repository has been archived by the owner on Oct 9, 2023. It is now read-only.

Commit

Permalink
Finish simplify retry behavior PR (#623)
Browse files Browse the repository at this point in the history
* Cleanup retry behavior

Signed-off-by: Dennis Keck <26092524+fellhorn@users.noreply.github.com>

* Fix interruptible retry threshold for odl behavior

Signed-off-by: Dennis Keck <26092524+fellhorn@users.noreply.github.com>

* Add tests for BuildNodeExecutionContext

Signed-off-by: Dennis Keck <26092524+fellhorn@users.noreply.github.com>

* Fix IsElgibileForRetries Tests

Signed-off-by: Dennis Keck <26092524+fellhorn@users.noreply.github.com>

---------

Signed-off-by: Dennis Keck <26092524+fellhorn@users.noreply.github.com>
  • Loading branch information
fellhorn authored Sep 29, 2023
1 parent 6a73b29 commit a9b52af
Show file tree
Hide file tree
Showing 4 changed files with 33 additions and 25 deletions.
17 changes: 6 additions & 11 deletions pkg/controller/nodes/executor.go
Original file line number Diff line number Diff line change
Expand Up @@ -796,12 +796,7 @@ func isTimeoutExpired(queuedAt *metav1.Time, timeout time.Duration) bool {

func (c *nodeExecutor) isEligibleForRetry(nCtx interfaces.NodeExecutionContext, nodeStatus v1alpha1.ExecutableNodeStatus, err *core.ExecutionError) (currentAttempt, maxAttempts uint32, isEligible bool) {
if config.GetConfig().NodeConfig.IgnoreRetryCause {
currentAttempt = nodeStatus.GetAttempts() + 1 + nodeStatus.GetSystemFailures()
maxAttempts = uint32(config.GetConfig().NodeConfig.DefaultMaxAttempts)
if nCtx.Node().GetRetryStrategy() != nil && nCtx.Node().GetRetryStrategy().MinAttempts != nil && *nCtx.Node().GetRetryStrategy().MinAttempts != 0 {
maxAttempts = uint32(*nCtx.Node().GetRetryStrategy().MinAttempts)
}
isEligible = currentAttempt < maxAttempts
currentAttempt = nodeStatus.GetAttempts() + 1
} else {
if err.Kind == core.ExecutionError_SYSTEM {
currentAttempt = nodeStatus.GetSystemFailures()
Expand All @@ -811,12 +806,12 @@ func (c *nodeExecutor) isEligibleForRetry(nCtx interfaces.NodeExecutionContext,
}

currentAttempt = (nodeStatus.GetAttempts() + 1) - nodeStatus.GetSystemFailures()
maxAttempts = uint32(config.GetConfig().NodeConfig.DefaultMaxAttempts)
if nCtx.Node().GetRetryStrategy() != nil && nCtx.Node().GetRetryStrategy().MinAttempts != nil && *nCtx.Node().GetRetryStrategy().MinAttempts != 0 {
maxAttempts = uint32(*nCtx.Node().GetRetryStrategy().MinAttempts)
}
isEligible = currentAttempt < maxAttempts
}
maxAttempts = uint32(config.GetConfig().NodeConfig.DefaultMaxAttempts)
if nCtx.Node().GetRetryStrategy() != nil && nCtx.Node().GetRetryStrategy().MinAttempts != nil && *nCtx.Node().GetRetryStrategy().MinAttempts != 1 {
maxAttempts = uint32(*nCtx.Node().GetRetryStrategy().MinAttempts)
}
isEligible = currentAttempt < maxAttempts
return
}

Expand Down
4 changes: 2 additions & 2 deletions pkg/controller/nodes/executor_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -2780,11 +2780,11 @@ func TestNodeExecutor_IsEligibleForRetry(t *testing.T) {
{"EligibleUserRetries", false, 0, 0, 2, 0, core.ExecutionError_USER, true},
{"IneligibleUserRetries", false, 1, 0, 2, 0, core.ExecutionError_USER, false},
{"EligibleSystemRetries", false, 0, 0, 1, 1, core.ExecutionError_SYSTEM, true},
{"IneligibleSystemRetries", false, 0, 1, 1, 1, core.ExecutionError_SYSTEM, false},
{"IneligibleSystemRetries", false, 1, 1, 1, 1, core.ExecutionError_SYSTEM, false},
{"IgnoreCauseEligibleUserRetries", true, 0, 0, 2, 0, core.ExecutionError_USER, true},
{"IgnoreCauseIneligibleUserRetries", true, 1, 0, 2, 0, core.ExecutionError_USER, false},
{"IgnoreCauseEligibleSystemRetries", true, 0, 0, 2, 0, core.ExecutionError_SYSTEM, true},
{"IgnoreCauseIneligibleSystemRetries", true, 0, 1, 2, 0, core.ExecutionError_SYSTEM, false},
{"IgnoreCauseIneligibleSystemRetries", true, 1, 1, 2, 0, core.ExecutionError_SYSTEM, false},
}

for _, test := range tests {
Expand Down
22 changes: 14 additions & 8 deletions pkg/controller/nodes/node_exec_context.go
Original file line number Diff line number Diff line change
Expand Up @@ -255,6 +255,14 @@ func newNodeExecContext(_ context.Context, store *storage.DataStore, execContext
}
}

func isAboveInterruptibleFailureThreshold(numFailures uint32, maxAttempts uint32, interruptibleThreshold int32) bool {
if interruptibleThreshold > 0 {
return numFailures >= uint32(interruptibleThreshold)
} else {
return numFailures >= maxAttempts-uint32(-interruptibleThreshold)
}
}

func (c *nodeExecutor) BuildNodeExecutionContext(ctx context.Context, executionContext executors.ExecutionContext,
nl executors.NodeLookup, currentNodeID v1alpha1.NodeID) (interfaces.NodeExecutionContext, error) {
n, ok := nl.GetNode(currentNodeID)
Expand Down Expand Up @@ -289,21 +297,19 @@ func (c *nodeExecutor) BuildNodeExecutionContext(ctx context.Context, executionC
if config.GetConfig().NodeConfig.IgnoreRetryCause {
// For the unified retry behavior we execute the last interruptibleFailureThreshold attempts on a non
// interruptible machine
currentAttempt := int32(s.GetAttempts() + s.GetSystemFailures())
maxAttempts := config.GetConfig().NodeConfig.DefaultMaxAttempts
if n.GetRetryStrategy() != nil && n.GetRetryStrategy().MinAttempts != nil && *n.GetRetryStrategy().MinAttempts != 0 {
maxAttempts = int32(*n.GetRetryStrategy().MinAttempts)
maxAttempts := uint32(config.GetConfig().NodeConfig.DefaultMaxAttempts)
if n.GetRetryStrategy() != nil && n.GetRetryStrategy().MinAttempts != nil && *n.GetRetryStrategy().MinAttempts != 1 {
maxAttempts = uint32(*n.GetRetryStrategy().MinAttempts)
}

if interruptible && ((c.interruptibleFailureThreshold > 0 && currentAttempt >= c.interruptibleFailureThreshold) ||
(c.interruptibleFailureThreshold <= 0 && currentAttempt >= maxAttempts+c.interruptibleFailureThreshold)) {
// For interruptible nodes run at least one attempt on an interruptible machine (thus s.GetAttempts() > 0) even if there won't be any retries
if interruptible && s.GetAttempts() > 0 && isAboveInterruptibleFailureThreshold(s.GetAttempts(), maxAttempts, c.interruptibleFailureThreshold) {
interruptible = false
c.metrics.InterruptedThresholdHit.Inc(ctx)
}
} else {
// Else a node is not considered interruptible if the system failures have exceeded the configured threshold
if interruptible && ((c.interruptibleFailureThreshold > 0 && int32(s.GetSystemFailures()) >= c.interruptibleFailureThreshold) ||
(c.interruptibleFailureThreshold <= 0 && int32(s.GetSystemFailures()) > int32(c.maxNodeRetriesForSystemFailures)+c.interruptibleFailureThreshold)) {
if interruptible && isAboveInterruptibleFailureThreshold(s.GetSystemFailures(), c.maxNodeRetriesForSystemFailures+1, c.interruptibleFailureThreshold) {
interruptible = false
c.metrics.InterruptedThresholdHit.Inc(ctx)
}
Expand Down
15 changes: 11 additions & 4 deletions pkg/controller/nodes/node_exec_context_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -392,10 +392,17 @@ func Test_NodeContext_IsInterruptible(t *testing.T) {
{"Interruptible", false, 0, 0, 2, 1, 1, true},
{"NonInterruptible", false, 0, 1, 2, 1, 1, false},
{"InterruptibleNegativeThreshold", false, 0, 0, 2, 1, -1, true},
{"NonInterruptibleNegativeThreshold", false, 0, 1, 2, 1, -1, false},
{"IgnoreCauseInterruptible", true, 0, 0, 2, 1, 1, true},
{"IgnoreCauseNonInterruptibleSystem", true, 0, 1, 2, 1, 1, false},
{"IgnoreCauseNonInterruptibleUser", true, 1, 0, 2, 1, 1, false},
{"InterruptibleNegativeThreshold2", false, 3, 3, 5, 4, -1, true},
{"NonInterruptibleNegativeThreshold", false, 1, 1, 2, 1, -1, false},
// maxSystemFailures should be ignored if ignoreRetryCause is true
{"IgnoreCauseInterruptible", true, 0, 0, 2, 999, 1, true},
{"IgnoreCauseInterruptibleFirstTry", true, 0, 0, 1, 999, -1, true}, // First try should always be interruptible if interruptible is set
{"IgnoreCauseInterruptibleNegativeThreshold", true, 0, 0, 2, 999, -1, true},
{"IgnoreCauseInterruptibleNegativeThreshold2", true, 2, 1, 4, 999, -1, true},
{"IgnoreCauseNonInterruptibleSystem", true, 1, 1, 2, 999, 1, false},
{"IgnoreCauseNonInterruptibleUser", true, 1, 0, 2, 999, 1, false},
{"IgnoreCauseNonInterruptibleSystemNegativeThreshold", true, 3, 3, 4, 0, -1, false},
{"IgnoreCauseNonInterruptibleUserNegativeThreshold", true, 3, 0, 4, 0, -1, false},
}

for _, tt := range tests {
Expand Down

0 comments on commit a9b52af

Please sign in to comment.