Skip to content

Commit

Permalink
fix: Avoid controller crashes when running large number of workflows (argoproj#9691)
Browse files Browse the repository at this point in the history

Signed-off-by: Yuan Tang <terrytangyuan@gmail.com>
Signed-off-by: juchao <juchao@coscene.io>
  • Loading branch information
terrytangyuan authored and juchaosong committed Nov 3, 2022
1 parent 03b0be1 commit d184742
Show file tree
Hide file tree
Showing 4 changed files with 7 additions and 2 deletions.
1 change: 1 addition & 0 deletions .spelling
Original file line number Diff line number Diff line change
Expand Up @@ -148,6 +148,7 @@ k8s-jobs
kube
kubelet
kubernetes
liveness
localhost
memoization
memoized
Expand Down
2 changes: 2 additions & 0 deletions docs/environment-variables.md
Original file line number Diff line number Diff line change
Expand Up @@ -33,12 +33,14 @@ most users. Environment variables may be removed at any time.
| `GZIP_IMPLEMENTATION` | `string` | `PGZip` | The implementation of compression/decompression. Currently only "`PGZip`" and "`GZip`" are supported. |
| `INFORMER_WRITE_BACK` | `bool` | `true` | Whether to write back to informer instead of catching up. |
| `HEALTHZ_AGE` | `time.Duration` | `5m` | How old an un-reconciled workflow must be to report unhealthy. |
| `HEALTHZ_LIST_LIMIT` | `int` | `200` | The maximum number of responses to return for a list call on workflows for the liveness check. |
| `INDEX_WORKFLOW_SEMAPHORE_KEYS` | `bool` | `true` | Whether or not to index semaphores. |
| `LEADER_ELECTION_IDENTITY` | `string` | Controller's `metadata.name` | The ID used for workflow controllers to elect a leader. |
| `LEADER_ELECTION_DISABLE` | `bool` | `false` | Whether leader election should be disabled. |
| `LEADER_ELECTION_LEASE_DURATION` | `time.Duration` | `15s` | The duration that non-leader candidates will wait to force acquire leadership. |
| `LEADER_ELECTION_RENEW_DEADLINE` | `time.Duration` | `10s` | The duration that the acting master will retry refreshing leadership before giving up. |
| `LEADER_ELECTION_RETRY_PERIOD` | `time.Duration` | `5s` | The duration that the leader election clients should wait between tries of actions. |
| `LIST_LIMIT` | `int` | `200` | The maximum number of responses to return for a list call on workflows for the workflow informer. |
| `MAX_OPERATION_TIME` | `time.Duration` | `30s` | The maximum time a workflow operation is allowed to run for before re-queuing the workflow onto the work queue. |
| `OFFLOAD_NODE_STATUS_TTL` | `time.Duration` | `5m` | The TTL to delete the offloaded node status. Currently only used for testing. |
| `POD_NAMES` | `string` | `v2` | Whether to have pod names contain the template name (v2) or be the node id (v1) - should be set the same for Argo Server. |
Expand Down
1 change: 1 addition & 0 deletions workflow/controller/controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -803,6 +803,7 @@ func (wfc *WorkflowController) tweakListOptions(options *metav1.ListOptions) {
labelSelector := labels.NewSelector().
Add(util.InstanceIDRequirement(wfc.Config.InstanceID))
options.LabelSelector = labelSelector.String()
options.Limit = int64(env.LookupEnvIntOr("LIST_LIMIT", 200))
}

func getWfPriority(obj interface{}) (int32, time.Time) {
Expand Down
5 changes: 3 additions & 2 deletions workflow/controller/healthz.go
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,8 @@ import (
)

var (
age = env.LookupEnvDurationOr("HEALTHZ_AGE", 5*time.Minute)
age = env.LookupEnvDurationOr("HEALTHZ_AGE", 5*time.Minute)
limit = int64(env.LookupEnvIntOr("HEALTHZ_LIST_LIMIT", 200))
)

// https://kubernetes.io/docs/tasks/configure-pod-container/configure-liveness-readiness-startup-probes/#define-a-liveness-http-request
Expand All @@ -30,7 +31,7 @@ func (wfc *WorkflowController) Healthz(w http.ResponseWriter, r *http.Request) {
labelSelector := "!" + common.LabelKeyPhase + "," + instanceIDSelector
err := func() error {
// avoid problems with informers by directly querying the API
list, err := wfc.wfclientset.ArgoprojV1alpha1().Workflows(wfc.managedNamespace).List(ctx, metav1.ListOptions{LabelSelector: labelSelector})
list, err := wfc.wfclientset.ArgoprojV1alpha1().Workflows(wfc.managedNamespace).List(ctx, metav1.ListOptions{LabelSelector: labelSelector, Limit: limit})
if err != nil {
return err
}
Expand Down

0 comments on commit d184742

Please sign in to comment.