Skip to content

Commit

Permalink
feat: Sync timeouts for applications (argoproj#6055) (argoproj#20816)
Browse files Browse the repository at this point in the history
* feat: Sync timeouts for applications (argoproj#6055)

Helps with argoproj#6055

Introduces a controller-level configuration for terminating sync after timeout.

Signed-off-by: Andrii Korotkov <andrii.korotkov@verkada.com>

* Fix env variable name

Signed-off-by: Andrii Korotkov <andrii.korotkov@verkada.com>

---------

Signed-off-by: Andrii Korotkov <andrii.korotkov@verkada.com>
  • Loading branch information
andrii-korotkov-verkada authored and GuySaar8 committed Dec 12, 2024
1 parent ef0360b commit fd8fa77
Show file tree
Hide file tree
Showing 12 changed files with 112 additions and 0 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,7 @@ func NewCommand() *cobra.Command {
selfHealBackoffTimeoutSeconds int
selfHealBackoffFactor int
selfHealBackoffCapSeconds int
syncTimeout int
statusProcessors int
operationProcessors int
glogLevel int
Expand Down Expand Up @@ -189,6 +190,7 @@ func NewCommand() *cobra.Command {
time.Duration(appResyncJitter)*time.Second,
time.Duration(selfHealTimeoutSeconds)*time.Second,
selfHealBackoff,
time.Duration(syncTimeout)*time.Second,
time.Duration(repoErrorGracePeriod)*time.Second,
metricsPort,
metricsCacheExpiration,
Expand Down Expand Up @@ -256,6 +258,7 @@ func NewCommand() *cobra.Command {
command.Flags().IntVar(&selfHealBackoffTimeoutSeconds, "self-heal-backoff-timeout-seconds", env.ParseNumFromEnv("ARGOCD_APPLICATION_CONTROLLER_SELF_HEAL_BACKOFF_TIMEOUT_SECONDS", 2, 0, math.MaxInt32), "Specifies initial timeout of exponential backoff between self heal attempts")
command.Flags().IntVar(&selfHealBackoffFactor, "self-heal-backoff-factor", env.ParseNumFromEnv("ARGOCD_APPLICATION_CONTROLLER_SELF_HEAL_BACKOFF_FACTOR", 3, 0, math.MaxInt32), "Specifies factor of exponential timeout between application self heal attempts")
command.Flags().IntVar(&selfHealBackoffCapSeconds, "self-heal-backoff-cap-seconds", env.ParseNumFromEnv("ARGOCD_APPLICATION_CONTROLLER_SELF_HEAL_BACKOFF_CAP_SECONDS", 300, 0, math.MaxInt32), "Specifies max timeout of exponential backoff between application self heal attempts")
command.Flags().IntVar(&syncTimeout, "sync-timeout", env.ParseNumFromEnv("ARGOCD_APPLICATION_CONTROLLER_SYNC_TIMEOUT", 0, 0, math.MaxInt32), "Specifies the timeout after which a sync would be terminated. 0 means no timeout (default 0).")
command.Flags().Int64Var(&kubectlParallelismLimit, "kubectl-parallelism-limit", env.ParseInt64FromEnv("ARGOCD_APPLICATION_CONTROLLER_KUBECTL_PARALLELISM_LIMIT", 20, 0, math.MaxInt64), "Number of allowed concurrent kubectl fork/execs. Any value less than 1 means no limit.")
command.Flags().BoolVar(&repoServerPlaintext, "repo-server-plaintext", env.ParseBoolFromEnv("ARGOCD_APPLICATION_CONTROLLER_REPO_SERVER_PLAINTEXT", false), "Disable TLS on connections to repo server")
command.Flags().BoolVar(&repoServerStrictTLS, "repo-server-strict-tls", env.ParseBoolFromEnv("ARGOCD_APPLICATION_CONTROLLER_REPO_SERVER_STRICT_TLS", false), "Whether to use strict validation of the TLS cert presented by the repo server")
Expand Down
12 changes: 12 additions & 0 deletions controller/appcontroller.go
Original file line number Diff line number Diff line change
Expand Up @@ -131,6 +131,7 @@ type ApplicationController struct {
statusRefreshJitter time.Duration
selfHealTimeout time.Duration
selfHealBackOff *wait.Backoff
syncTimeout time.Duration
db db.ArgoDB
settingsMgr *settings_util.SettingsManager
refreshRequestedApps map[string]CompareWith
Expand Down Expand Up @@ -161,6 +162,7 @@ func NewApplicationController(
appResyncJitter time.Duration,
selfHealTimeout time.Duration,
selfHealBackoff *wait.Backoff,
syncTimeout time.Duration,
repoErrorGracePeriod time.Duration,
metricsPort int,
metricsCacheExpiration time.Duration,
Expand Down Expand Up @@ -202,6 +204,7 @@ func NewApplicationController(
settingsMgr: settingsMgr,
selfHealTimeout: selfHealTimeout,
selfHealBackOff: selfHealBackoff,
syncTimeout: syncTimeout,
clusterSharding: clusterSharding,
projByNameCache: sync.Map{},
applicationNamespaces: applicationNamespaces,
Expand Down Expand Up @@ -1373,12 +1376,21 @@ func (ctrl *ApplicationController) processRequestedAppOperation(app *appv1.Appli
// Get rid of sync results and null out previous operation completion time
state.SyncResult = nil
}
} else if ctrl.syncTimeout != time.Duration(0) && time.Now().After(state.StartedAt.Add(ctrl.syncTimeout)) && !terminating {
state.Phase = synccommon.OperationTerminating
state.Message = "operation is terminating due to timeout"
ctrl.setOperationState(app, state)
logCtx.Infof("Terminating in-progress operation due to timeout. Started at: %v, timeout: %v", state.StartedAt, ctrl.syncTimeout)
} else {
logCtx.Infof("Resuming in-progress operation. phase: %s, message: %s", state.Phase, state.Message)
}
} else {
state = &appv1.OperationState{Phase: synccommon.OperationRunning, Operation: *app.Operation, StartedAt: metav1.Now()}
ctrl.setOperationState(app, state)
if ctrl.syncTimeout != time.Duration(0) {
// Schedule a check during which the timeout would be checked.
ctrl.appOperationQueue.AddAfter(ctrl.toAppKey(app.QualifiedName()), ctrl.syncTimeout)
}
logCtx.Infof("Initialized new operation: %v", *app.Operation)
}
ts.AddCheckpoint("initial_operation_stage_ms")
Expand Down
52 changes: 52 additions & 0 deletions controller/appcontroller_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -167,6 +167,7 @@ func newFakeControllerWithResync(data *fakeData, appResyncPeriod time.Duration,
time.Second,
time.Minute,
nil,
0,
time.Second*10,
common.DefaultPortArgoCDMetrics,
data.metricsCacheExpiration,
Expand Down Expand Up @@ -2481,3 +2482,54 @@ func TestSelfHealExponentialBackoff(t *testing.T) {
})
}
}

func TestSyncTimeout(t *testing.T) {
testCases := []struct {
delta time.Duration
expectedPhase synccommon.OperationPhase
expectedMessage string
}{{
delta: 2 * time.Minute,
expectedPhase: synccommon.OperationFailed,
expectedMessage: "Operation terminated",
}, {
delta: 30 * time.Second,
expectedPhase: synccommon.OperationSucceeded,
expectedMessage: "successfully synced (no more tasks)",
}}
for i := range testCases {
tc := testCases[i]
t.Run(fmt.Sprintf("test case %d", i), func(t *testing.T) {
app := newFakeApp()
app.Spec.Project = "default"
app.Operation = &v1alpha1.Operation{
Sync: &v1alpha1.SyncOperation{
Revision: "HEAD",
},
}
ctrl := newFakeController(&fakeData{
apps: []runtime.Object{app, &defaultProj},
manifestResponses: []*apiclient.ManifestResponse{{
Manifests: []string{},
}},
}, nil)

ctrl.syncTimeout = time.Minute
app.Status.OperationState = &v1alpha1.OperationState{
Operation: v1alpha1.Operation{
Sync: &v1alpha1.SyncOperation{
Revision: "HEAD",
},
},
Phase: synccommon.OperationRunning,
StartedAt: metav1.NewTime(time.Now().Add(-tc.delta)),
}
ctrl.processRequestedAppOperation(app)

app, err := ctrl.applicationClientset.ArgoprojV1alpha1().Applications(app.ObjectMeta.Namespace).Get(context.Background(), app.ObjectMeta.Name, metav1.GetOptions{})
require.NoError(t, err)
require.Equal(t, tc.expectedPhase, app.Status.OperationState.Phase)
require.Equal(t, tc.expectedMessage, app.Status.OperationState.Message)
})
}
}
2 changes: 2 additions & 0 deletions docs/operator-manual/argocd-cmd-params-cm.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,8 @@ data:
controller.self.heal.timeout.seconds: "2"
controller.self.heal.backoff.factor: "3"
controller.self.heal.backoff.cap.seconds: "300"
# Specifies a sync timeout for applications. "0" means no timeout (default "0")
controller.sync.timeout.seconds: "0"

# Cache expiration for app state (default 1h0m0s)
controller.app.state.cache.expiration: "1h0m0s"
Expand Down

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Original file line number Diff line number Diff line change
Expand Up @@ -115,6 +115,12 @@ spec:
name: argocd-cmd-params-cm
key: controller.self.heal.backoff.cap.seconds
optional: true
- name: ARGOCD_APPLICATION_CONTROLLER_SYNC_TIMEOUT
valueFrom:
configMapKeyRef:
name: argocd-cmd-params-cm
key: controller.sync.timeout.seconds
optional: true
- name: ARGOCD_APPLICATION_CONTROLLER_REPO_SERVER_PLAINTEXT
valueFrom:
configMapKeyRef:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -118,6 +118,12 @@ spec:
name: argocd-cmd-params-cm
key: controller.self.heal.backoff.cap.seconds
optional: true
- name: ARGOCD_APPLICATION_CONTROLLER_SYNC_TIMEOUT
valueFrom:
configMapKeyRef:
name: argocd-cmd-params-cm
key: controller.sync.timeout.seconds
optional: true
- name: ARGOCD_APPLICATION_CONTROLLER_REPO_SERVER_PLAINTEXT
valueFrom:
configMapKeyRef:
Expand Down
6 changes: 6 additions & 0 deletions manifests/core-install.yaml

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

6 changes: 6 additions & 0 deletions manifests/ha/install.yaml

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

6 changes: 6 additions & 0 deletions manifests/ha/namespace-install.yaml

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

6 changes: 6 additions & 0 deletions manifests/install.yaml

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

6 changes: 6 additions & 0 deletions manifests/namespace-install.yaml

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

0 comments on commit fd8fa77

Please sign in to comment.