diff --git a/cmd/workflow-controller/main.go b/cmd/workflow-controller/main.go index abca29f23068..593837bee616 100644 --- a/cmd/workflow-controller/main.go +++ b/cmd/workflow-controller/main.go @@ -99,6 +99,8 @@ func NewRootCommand() *cobra.Command { go wfController.Run(ctx, workflowWorkers, workflowTTLWorkers, podWorkers, podCleanupWorkers) + http.HandleFunc("/healthz", wfController.Healthz) + go func() { log.Println(http.ListenAndServe("localhost:6060", nil)) }() diff --git a/docs/environment-variables.md b/docs/environment-variables.md index 2e5bfb64eeaa..b7b93927079a 100644 --- a/docs/environment-variables.md +++ b/docs/environment-variables.md @@ -15,6 +15,7 @@ Note that these environment variables may be removed at any time. | `CRON_SYNC_PERIOD` | `time.Duration` | How ofen to sync cron workflows. Default `10s` | | `DEFAULT_REQUEUE_TIME` | `time.Duration` | The requeue time for the rate limiter of the workflow queue. | | `GZIP_IMPLEMENTATION` | `string` | The implementation of compression/decompression. Currently only "PGZip" and "GZip" are supported. Defaults to "PGZip". | +| `HEALTHZ_AGE` | `time.Duration` | How old a un-reconciled workflow is to report unhealthy. Defaults to `5m`. | | `INDEX_WORKFLOW_SEMAPHORE_KEYS` | `bool` | Whether or not to index semaphores. Defaults to `true`. | | `LEADER_ELECTION_IDENTITY` | `string` | The ID used for workflow controllers to elect a leader. | | `LEADER_ELECTION_DISABLE` | `bool` | Whether leader election should be disabled. | diff --git a/manifests/base/workflow-controller/workflow-controller-deployment.yaml b/manifests/base/workflow-controller/workflow-controller-deployment.yaml index 91872200f7c2..7ff06d2ee5b1 100644 --- a/manifests/base/workflow-controller/workflow-controller-deployment.yaml +++ b/manifests/base/workflow-controller/workflow-controller-deployment.yaml @@ -35,17 +35,15 @@ spec: ports: - name: metrics containerPort: 9090 - # Periodically check we are listening on the metrics port - # causing a restart if it is not OK. - # This takes advantage of the fact that if the metrics service has died, - # then the controller has died. - # In testing, it appears to take 60-90s from failure to restart. + - containerPort: 6060 livenessProbe: httpGet: - port: metrics - path: /metrics - initialDelaySeconds: 30 - periodSeconds: 30 + port: 6060 + path: /healthz + # Require three failures to tolerate transient errors. + failureThreshold: 3 + initialDelaySeconds: 90 + periodSeconds: 60 securityContext: runAsNonRoot: true nodeSelector: diff --git a/manifests/install.yaml b/manifests/install.yaml index a1d478658f47..b8e4ef0e8bb7 100644 --- a/manifests/install.yaml +++ b/manifests/install.yaml @@ -607,15 +607,17 @@ spec: fieldPath: metadata.name image: argoproj/workflow-controller:v3.0.4 livenessProbe: + failureThreshold: 3 httpGet: - path: /metrics - port: metrics - initialDelaySeconds: 30 - periodSeconds: 30 + path: /healthz + port: 6060 + initialDelaySeconds: 90 + periodSeconds: 60 name: workflow-controller ports: - containerPort: 9090 name: metrics + - containerPort: 6060 securityContext: capabilities: drop: diff --git a/manifests/namespace-install.yaml b/manifests/namespace-install.yaml index cb5bc755b4d0..0dc3400ebbdf 100644 --- a/manifests/namespace-install.yaml +++ b/manifests/namespace-install.yaml @@ -502,15 +502,17 @@ spec: fieldPath: metadata.name image: argoproj/workflow-controller:v3.0.4 livenessProbe: + failureThreshold: 3 httpGet: - path: /metrics - port: metrics - initialDelaySeconds: 30 - periodSeconds: 30 + path: /healthz + port: 6060 + initialDelaySeconds: 90 + periodSeconds: 60 name: workflow-controller ports: - containerPort: 9090 name: metrics + - containerPort: 6060 securityContext: capabilities: drop: diff --git a/manifests/quick-start-minimal.yaml b/manifests/quick-start-minimal.yaml index 923859fa8516..f80a12391fb0 100644 --- a/manifests/quick-start-minimal.yaml +++ b/manifests/quick-start-minimal.yaml @@ -856,15 +856,17 @@ spec: fieldPath: metadata.name image: argoproj/workflow-controller:v3.0.4 livenessProbe: + failureThreshold: 3 httpGet: - path: /metrics - port: metrics - initialDelaySeconds: 30 - periodSeconds: 30 + path: /healthz + port: 6060 + initialDelaySeconds: 90 + periodSeconds: 60 name: workflow-controller ports: - containerPort: 9090 name: metrics + - containerPort: 6060 securityContext: capabilities: drop: diff --git a/manifests/quick-start-mysql.yaml b/manifests/quick-start-mysql.yaml index 1f96c99759e7..2362b7253d71 100644 --- a/manifests/quick-start-mysql.yaml +++ b/manifests/quick-start-mysql.yaml @@ -945,15 +945,17 @@ spec: fieldPath: metadata.name image: argoproj/workflow-controller:v3.0.4 livenessProbe: + failureThreshold: 3 httpGet: - path: /metrics - port: metrics - initialDelaySeconds: 30 - periodSeconds: 30 + path: /healthz + port: 6060 + initialDelaySeconds: 90 + periodSeconds: 60 name: workflow-controller ports: - containerPort: 9090 name: metrics + - containerPort: 6060 securityContext: capabilities: drop: diff --git a/manifests/quick-start-postgres.yaml b/manifests/quick-start-postgres.yaml index b6abffc8d1c9..6dcb7a3c20fc 100644 --- a/manifests/quick-start-postgres.yaml +++ b/manifests/quick-start-postgres.yaml @@ -937,15 +937,17 @@ spec: fieldPath: metadata.name image: argoproj/workflow-controller:v3.0.4 livenessProbe: + failureThreshold: 3 httpGet: - path: /metrics - port: metrics - initialDelaySeconds: 30 - periodSeconds: 30 + path: /healthz + port: 6060 + initialDelaySeconds: 90 + periodSeconds: 60 name: workflow-controller ports: - containerPort: 9090 name: metrics + - containerPort: 6060 securityContext: capabilities: drop: diff --git a/workflow/controller/healthz.go b/workflow/controller/healthz.go new file mode 100644 index 000000000000..9e9bded526cd --- /dev/null +++ b/workflow/controller/healthz.go @@ -0,0 +1,57 @@ +package controller + +import ( + "fmt" + "net/http" + "time" + + log "github.com/sirupsen/logrus" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + + "github.com/argoproj/argo-workflows/v3/util/env" + "github.com/argoproj/argo-workflows/v3/workflow/common" +) + +var ( + age = env.LookupEnvDurationOr("HEALTHZ_AGE", 5*time.Minute) +) + +// https://kubernetes.io/docs/tasks/configure-pod-container/configure-liveness-readiness-startup-probes/#define-a-liveness-http-request +// If we are in a state where there are any workflows that have not been reconciled in the last 2m, we've gone wrong. +func (wfc *WorkflowController) Healthz(w http.ResponseWriter, r *http.Request) { + ctx := r.Context() + instanceID := wfc.Config.InstanceID + instanceIDSelector := func() string { + if instanceID != "" { + return common.LabelKeyControllerInstanceID + "=" + instanceID + } + return "!" + common.LabelKeyControllerInstanceID + }() + labelSelector := "!" + common.LabelKeyPhase + "," + instanceIDSelector + err := func() error { + // avoid problems with informers, but directly querying the API + list, err := wfc.wfclientset.ArgoprojV1alpha1().Workflows(wfc.managedNamespace).List(ctx, metav1.ListOptions{LabelSelector: labelSelector}) + if err != nil { + return err + } + for _, wf := range list.Items { + if time.Since(wf.GetCreationTimestamp().Time) > age { + return fmt.Errorf("workflow never reconciled: %s", wf.Name) + } + } + return nil + }() + log.WithField("err", err). + WithField("managedNamespace", wfc.managedNamespace). + WithField("instanceID", instanceID). + WithField("labelSelector", labelSelector). + WithField("age", age). + Info("healthz") + if err != nil { + w.WriteHeader(500) + _, _ = w.Write([]byte(err.Error())) + } else { + w.WriteHeader(200) + _, _ = w.Write([]byte("ok")) + } +}