Skip to content

Commit

Permalink
fix: refactor retries (akuity#3060)
Browse files Browse the repository at this point in the history
Signed-off-by: Kent Rancourt <kent.rancourt@gmail.com>
  • Loading branch information
krancour authored Dec 5, 2024
1 parent 2ee439c commit 58588b8
Show file tree
Hide file tree
Showing 17 changed files with 1,790 additions and 545 deletions.
974 changes: 720 additions & 254 deletions api/v1alpha1/generated.pb.go

Large diffs are not rendered by default.

71 changes: 61 additions & 10 deletions api/v1alpha1/generated.proto

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

91 changes: 75 additions & 16 deletions api/v1alpha1/promotion_types.go
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
package v1alpha1

import (
"time"

apiextensionsv1 "k8s.io/apiextensions-apiserver/pkg/apis/apiextensions/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"sigs.k8s.io/yaml"
Expand Down Expand Up @@ -111,23 +113,58 @@ type PromotionVariable struct {

// PromotionStepRetry describes the retry policy for a PromotionStep.
type PromotionStepRetry struct {
// Attempts is the number of times the step can be attempted before the
// PromotionStep is marked as failed.
// Timeout is the soft maximum interval in which a step that returns a Running
// status (which typically indicates it's waiting for something to happen)
// may be retried.
//
// The maximum is a soft one because the check for whether the interval has
// elapsed occurs AFTER the step has run. This effectively means a step may
// run ONCE beyond the close of the interval.
//
// If this field is set to nil, the effective default will be a step-specific
// one. If no step-specific default exists (i.e. is also nil), the effective
// default will be the system-wide default of 0.
//
// A value of 0 will cause the step to be retried indefinitely unless the
// ErrorThreshold is reached.
Timeout *metav1.Duration `json:"timeout,omitempty" protobuf:"bytes,1,opt,name=timeout"`
// ErrorThreshold is the number of consecutive times the step must fail (for
// any reason) before retries are abandoned and the entire Promotion is marked
// as failed.
//
// If this field is set to 1, the step will not be retried. If this
// field is set to -1, the step will be retried indefinitely.
// If this field is set to 0, the effective default will be a step-specific
// one. If no step-specific default exists (i.e. is also 0), the effective
// default will be the system-wide default of 1.
//
// The default of this field depends on the step being executed. Refer to
// the documentation for the specific step for more information.
Attempts int64 `json:"attempts,omitempty" protobuf:"varint,1,opt,name=attempts"`
// A value of 1 will cause the Promotion to be marked as failed after just
// a single failure; i.e. no retries will be attempted.
//
// There is no option to specify an infinite number of retries using a value
// such as -1.
//
// In a future release, Kargo is likely to become capable of distinguishing
// between recoverable and non-recoverable step failures. At that time, it is
// planned that unrecoverable failures will not be subject to this threshold
// and will immediately cause the Promotion to be marked as failed without
// further condition.
ErrorThreshold uint32 `json:"errorThreshold,omitempty" protobuf:"varint,2,opt,name=errorThreshold"`
}

// GetAttempts returns the Attempts field with the given fallback value.
func (r *PromotionStepRetry) GetAttempts(fallback int64) int64 {
if r == nil || r.Attempts == 0 {
// GetTimeout returns the Timeout field with the given fallback value.
func (r *PromotionStepRetry) GetTimeout(fallback *time.Duration) *time.Duration {
if r == nil || r.Timeout == nil {
return fallback
}
return r.Attempts
return &r.Timeout.Duration
}

// GetErrorThreshold returns the ErrorThreshold field with the given fallback
// value.
func (r *PromotionStepRetry) GetErrorThreshold(fallback uint32) uint32 {
if r == nil || r.ErrorThreshold == 0 {
return fallback
}
return r.ErrorThreshold
}

// PromotionStep describes a directive to be executed as part of a Promotion.
Expand Down Expand Up @@ -177,9 +214,9 @@ type PromotionStatus struct {
// permits steps that have already run successfully to be skipped on
// subsequent reconciliations attempts.
CurrentStep int64 `json:"currentStep,omitempty" protobuf:"varint,9,opt,name=currentStep"`
// CurrentStepAttempt is the number of times the current step has been
// attempted.
CurrentStepAttempt int64 `json:"currentStepAttempt,omitempty" protobuf:"varint,11,opt,name=currentStepAttempt"`
// StepExecutionMetadata tracks metadata pertaining to the execution
// of individual promotion steps.
StepExecutionMetadata StepExecutionMetadataList `json:"stepExecutionMetadata,omitempty" protobuf:"bytes,11,rep,name=stepExecutionMetadata"`
// State stores the state of the promotion process between reconciliation
// attempts.
State *apiextensionsv1.JSON `json:"state,omitempty" protobuf:"bytes,10,opt,name=state"`
Expand Down Expand Up @@ -224,8 +261,8 @@ func (s *HealthCheckStep) GetConfig() map[string]any {
}

// WithPhase returns a copy of PromotionStatus with the given phase
func (p *PromotionStatus) WithPhase(phase PromotionPhase) *PromotionStatus {
status := p.DeepCopy()
func (s *PromotionStatus) WithPhase(phase PromotionPhase) *PromotionStatus {
status := s.DeepCopy()
status.Phase = phase
return status
}
Expand All @@ -238,3 +275,25 @@ type PromotionList struct {
metav1.ListMeta `json:"metadata,omitempty" protobuf:"bytes,1,opt,name=metadata"`
Items []Promotion `json:"items" protobuf:"bytes,2,rep,name=items"`
}

// StepExecutionMetadataList is a list of StepExecutionMetadata.
type StepExecutionMetadataList []StepExecutionMetadata

// StepExecutionMetadata tracks metadata pertaining to the execution of
// a promotion step.
type StepExecutionMetadata struct {
// Alias is the alias of the step.
Alias string `json:"alias,omitempty" protobuf:"bytes,1,opt,name=alias"`
// StartedAt is the time at which the first attempt to execute the step
// began.
StartedAt *metav1.Time `json:"startedAt,omitempty" protobuf:"bytes,2,opt,name=startedAt"`
// FinishedAt is the time at which the final attempt to execute the step
// completed.
FinishedAt *metav1.Time `json:"finishedAt,omitempty" protobuf:"bytes,3,opt,name=finishedAt"`
// ErrorCount tracks consecutive failed attempts to execute the step.
ErrorCount uint32 `json:"errorCount,omitempty" protobuf:"varint,4,opt,name=errorCount"`
// Status is the high-level outcome of the step.
Status PromotionPhase `json:"status,omitempty" protobuf:"bytes,5,opt,name=status"`
// Message is a display message about the step, including any errors.
Message string `json:"message,omitempty" protobuf:"bytes,6,opt,name=message"`
}
55 changes: 46 additions & 9 deletions api/v1alpha1/promotion_types_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -2,16 +2,53 @@ package v1alpha1

import (
"testing"
"time"

"github.com/stretchr/testify/require"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/utils/ptr"
)

func TestPromotionRetry_GetAttempts(t *testing.T) {
func TestPromotionRetry_GetTimeout(t *testing.T) {
tests := []struct {
name string
retry *PromotionStepRetry
fallback int64
want int64
fallback *time.Duration
want *time.Duration
}{
{
name: "retry is nil",
retry: nil,
fallback: ptr.To(time.Hour),
want: ptr.To(time.Hour),
},
{
name: "timeout is not set",
retry: &PromotionStepRetry{},
fallback: ptr.To(time.Hour),
want: ptr.To(time.Hour),
},
{
name: "timeout is set",
retry: &PromotionStepRetry{
Timeout: &metav1.Duration{
Duration: 3 * time.Hour,
},
},
want: ptr.To(3 * time.Hour),
},
}
for _, tt := range tests {
require.Equal(t, tt.want, tt.retry.GetTimeout(tt.fallback))
}
}

func TestPromotionRetry_GetErrorThreshold(t *testing.T) {
tests := []struct {
name string
retry *PromotionStepRetry
fallback uint32
want uint32
}{
{
name: "retry is nil",
Expand All @@ -20,20 +57,20 @@ func TestPromotionRetry_GetAttempts(t *testing.T) {
want: 1,
},
{
name: "attempts is not set",
name: "threshold is not set",
retry: &PromotionStepRetry{},
fallback: -1,
want: -1,
fallback: 1,
want: 1,
},
{
name: "attempts is set",
name: "threshold is set",
retry: &PromotionStepRetry{
Attempts: 3,
ErrorThreshold: 3,
},
want: 3,
},
}
for _, tt := range tests {
require.Equal(t, tt.want, tt.retry.GetAttempts(tt.fallback))
require.Equal(t, tt.want, tt.retry.GetErrorThreshold(tt.fallback))
}
}
58 changes: 57 additions & 1 deletion api/v1alpha1/zz_generated.deepcopy.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Loading

0 comments on commit 58588b8

Please sign in to comment.