Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix: refactor retries #3060

Merged
merged 6 commits into from
Dec 5, 2024
Merged
Show file tree
Hide file tree
Changes from 5 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
974 changes: 720 additions & 254 deletions api/v1alpha1/generated.pb.go

Large diffs are not rendered by default.

71 changes: 61 additions & 10 deletions api/v1alpha1/generated.proto

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

91 changes: 75 additions & 16 deletions api/v1alpha1/promotion_types.go
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
package v1alpha1

import (
"time"

apiextensionsv1 "k8s.io/apiextensions-apiserver/pkg/apis/apiextensions/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"sigs.k8s.io/yaml"
Expand Down Expand Up @@ -111,23 +113,58 @@

// PromotionStepRetry describes the retry policy for a PromotionStep.
type PromotionStepRetry struct {
// Attempts is the number of times the step can be attempted before the
// PromotionStep is marked as failed.
// Timeout is the soft maximum interval in which a step that returns a Running
// status (which typically indicates it's waiting for something to happen)
// may be retried.
//
// The maximum is a soft one because the check for whether the interval has
// elapsed occurs AFTER the step has run. This effectively means a step may
// run ONCE beyond the close of the interval.
//
// If this field is set to nil, the effective default will be a step-specific
// one. If no step-specific default exists (i.e. is also nil), the effective
// default will be the system-wide default of 0.
//
// A value of 0 will cause the step to be retried indefinitely unless the
// ErrorThreshold is reached.
Timeout *metav1.Duration `json:"timeout,omitempty" protobuf:"bytes,1,opt,name=timeout"`
// ErrorThreshold is the number of consecutive times the step must fail (for
// any reason) before retries are abandoned and the entire Promotion is marked
// as failed.
//
// If this field is set to 1, the step will not be retried. If this
// field is set to -1, the step will be retried indefinitely.
// If this field is set to 0, the effective default will be a step-specific
// one. If no step-specific default exists (i.e. is also 0), the effective
// default will be the system-wide default of 1.
//
// The default of this field depends on the step being executed. Refer to
// the documentation for the specific step for more information.
Attempts int64 `json:"attempts,omitempty" protobuf:"varint,1,opt,name=attempts"`
// A value of 1 will cause the Promotion to be marked as failed after just
// a single failure; i.e. no retries will be attempted.
//
// There is no option to specify an infinite number of retries using a value
// such as -1.
//
// In a future release, Kargo is likely to become capable of distinguishing
// between recoverable and non-recoverable step failures. At that time, it is
// planned that unrecoverable failures will not be subject to this threshold
// and will immediately cause the Promotion to be marked as failed without
// further condition.
ErrorThreshold uint32 `json:"errorThreshold,omitempty" protobuf:"varint,2,opt,name=errorThreshold"`
}

// GetAttempts returns the Attempts field with the given fallback value.
func (r *PromotionStepRetry) GetAttempts(fallback int64) int64 {
if r == nil || r.Attempts == 0 {
// GetTimeout returns the Timeout field with the given fallback value.
func (r *PromotionStepRetry) GetTimeout(fallback *time.Duration) *time.Duration {
if r == nil || r.Timeout == nil {
return fallback
}
return r.Attempts
return &r.Timeout.Duration
}

// GetErrorThreshold returns the ErrorThreshold field with the given fallback
// value.
func (r *PromotionStepRetry) GetErrorThreshold(fallback uint32) uint32 {
if r == nil || r.ErrorThreshold == 0 {
return fallback
}
return r.ErrorThreshold
}

// PromotionStep describes a directive to be executed as part of a Promotion.
Expand Down Expand Up @@ -177,9 +214,9 @@
// permits steps that have already run successfully to be skipped on
// subsequent reconciliations attempts.
CurrentStep int64 `json:"currentStep,omitempty" protobuf:"varint,9,opt,name=currentStep"`
// CurrentStepAttempt is the number of times the current step has been
// attempted.
CurrentStepAttempt int64 `json:"currentStepAttempt,omitempty" protobuf:"varint,11,opt,name=currentStepAttempt"`
// StepExecutionMetadata tracks metadata pertaining to the execution
// of individual promotion steps.
StepExecutionMetadata StepExecutionMetadataList `json:"stepExecutionMetadata,omitempty" protobuf:"bytes,11,rep,name=stepExecutionMetadata"`
// State stores the state of the promotion process between reconciliation
// attempts.
State *apiextensionsv1.JSON `json:"state,omitempty" protobuf:"bytes,10,opt,name=state"`
Expand Down Expand Up @@ -224,8 +261,8 @@
}

// WithPhase returns a copy of PromotionStatus with the given phase
func (p *PromotionStatus) WithPhase(phase PromotionPhase) *PromotionStatus {
status := p.DeepCopy()
func (s *PromotionStatus) WithPhase(phase PromotionPhase) *PromotionStatus {
status := s.DeepCopy()

Check warning on line 265 in api/v1alpha1/promotion_types.go

View check run for this annotation

Codecov / codecov/patch

api/v1alpha1/promotion_types.go#L264-L265

Added lines #L264 - L265 were not covered by tests
status.Phase = phase
return status
}
Expand All @@ -238,3 +275,25 @@
metav1.ListMeta `json:"metadata,omitempty" protobuf:"bytes,1,opt,name=metadata"`
Items []Promotion `json:"items" protobuf:"bytes,2,rep,name=items"`
}

// StepExecutionMetadataList is a list of StepExecutionMetadata.
type StepExecutionMetadataList []StepExecutionMetadata

// StepExecutionMetadata tracks metadata pertaining to the execution of
// a promotion step.
type StepExecutionMetadata struct {
// Alias is the alias of the step.
Alias string `json:"alias,omitempty" protobuf:"bytes,1,opt,name=alias"`
// StartedAt is the time at which the first attempt to execute the step
// began.
StartedAt *metav1.Time `json:"startedAt,omitempty" protobuf:"bytes,2,opt,name=startedAt"`
// FinishedAt is the time at which the final attempt to execute the step
// completed.
FinishedAt *metav1.Time `json:"finishedAt,omitempty" protobuf:"bytes,3,opt,name=finishedAt"`
// ErrorCount tracks consecutive failed attempts to execute the step.
ErrorCount uint32 `json:"errorCount,omitempty" protobuf:"varint,4,opt,name=errorCount"`
// Status is the high-level outcome of the step.
Status PromotionPhase `json:"status,omitempty" protobuf:"bytes,5,opt,name=status"`
// Message is a display message about the step, including any errors.
Message string `json:"message,omitempty" protobuf:"bytes,6,opt,name=message"`
}
55 changes: 46 additions & 9 deletions api/v1alpha1/promotion_types_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -2,16 +2,53 @@ package v1alpha1

import (
"testing"
"time"

"github.com/stretchr/testify/require"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/utils/ptr"
)

func TestPromotionRetry_GetAttempts(t *testing.T) {
func TestPromotionRetry_GetTimeout(t *testing.T) {
tests := []struct {
name string
retry *PromotionStepRetry
fallback int64
want int64
fallback *time.Duration
want *time.Duration
}{
{
name: "retry is nil",
retry: nil,
fallback: ptr.To(time.Hour),
want: ptr.To(time.Hour),
},
{
name: "threshold is not set",
krancour marked this conversation as resolved.
Show resolved Hide resolved
retry: &PromotionStepRetry{},
fallback: ptr.To(time.Hour),
want: ptr.To(time.Hour),
},
{
name: "threshold is set",
krancour marked this conversation as resolved.
Show resolved Hide resolved
retry: &PromotionStepRetry{
Timeout: &metav1.Duration{
Duration: 3 * time.Hour,
},
},
want: ptr.To(3 * time.Hour),
},
}
for _, tt := range tests {
require.Equal(t, tt.want, tt.retry.GetTimeout(tt.fallback))
}
}

func TestPromotionRetry_GetErrorThreshold(t *testing.T) {
tests := []struct {
name string
retry *PromotionStepRetry
fallback uint32
want uint32
}{
{
name: "retry is nil",
Expand All @@ -20,20 +57,20 @@ func TestPromotionRetry_GetAttempts(t *testing.T) {
want: 1,
},
{
name: "attempts is not set",
name: "threshold is not set",
retry: &PromotionStepRetry{},
fallback: -1,
want: -1,
fallback: 1,
want: 1,
},
{
name: "attempts is set",
name: "threshold is set",
retry: &PromotionStepRetry{
Attempts: 3,
ErrorThreshold: 3,
},
want: 3,
},
}
for _, tt := range tests {
require.Equal(t, tt.want, tt.retry.GetAttempts(tt.fallback))
require.Equal(t, tt.want, tt.retry.GetErrorThreshold(tt.fallback))
}
}
58 changes: 57 additions & 1 deletion api/v1alpha1/zz_generated.deepcopy.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Loading
Loading