Skip to content

Commit

Permalink
feat(controller): Add liveness probe (#5875)
Browse files Browse the repository at this point in the history
Signed-off-by: Alex Collins <alex_collins@intuit.com>
  • Loading branch information
alexec committed May 24, 2021
1 parent e64607e commit f893ea6
Show file tree
Hide file tree
Showing 9 changed files with 97 additions and 29 deletions.
2 changes: 2 additions & 0 deletions cmd/workflow-controller/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -99,6 +99,8 @@ func NewRootCommand() *cobra.Command {

go wfController.Run(ctx, workflowWorkers, workflowTTLWorkers, podWorkers, podCleanupWorkers)

http.HandleFunc("/healthz", wfController.Healthz)

go func() {
log.Println(http.ListenAndServe("localhost:6060", nil))
}()
Expand Down
1 change: 1 addition & 0 deletions docs/environment-variables.md
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ Note that these environment variables may be removed at any time.
| `CRON_SYNC_PERIOD` | `time.Duration` | How often to sync cron workflows. Default `10s` |
| `DEFAULT_REQUEUE_TIME` | `time.Duration` | The requeue time for the rate limiter of the workflow queue. |
| `GZIP_IMPLEMENTATION` | `string` | The implementation of compression/decompression. Currently only "PGZip" and "GZip" are supported. Defaults to "PGZip". |
| `HEALTHZ_AGE` | `time.Duration` | How old an un-reconciled workflow must be for the controller to report unhealthy. Defaults to `5m`. |
| `INDEX_WORKFLOW_SEMAPHORE_KEYS` | `bool` | Whether or not to index semaphores. Defaults to `true`. |
| `LEADER_ELECTION_IDENTITY` | `string` | The ID used for workflow controllers to elect a leader. |
| `LEADER_ELECTION_DISABLE` | `bool` | Whether leader election should be disabled. |
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -35,17 +35,15 @@ spec:
ports:
- name: metrics
containerPort: 9090
# Periodically check we are listening on the metrics port
# causing a restart if it is not OK.
# This takes advantage of the fact that if the metrics service has died,
# then the controller has died.
# In testing, it appears to take 60-90s from failure to restart.
- containerPort: 6060
livenessProbe:
httpGet:
port: metrics
path: /metrics
initialDelaySeconds: 30
periodSeconds: 30
port: 6060
path: /healthz
# Require three failures to tolerate transient errors.
failureThreshold: 3
initialDelaySeconds: 90
periodSeconds: 60
securityContext:
runAsNonRoot: true
nodeSelector:
Expand Down
10 changes: 6 additions & 4 deletions manifests/install.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -607,15 +607,17 @@ spec:
fieldPath: metadata.name
image: argoproj/workflow-controller:v3.0.4
livenessProbe:
failureThreshold: 3
httpGet:
path: /metrics
port: metrics
initialDelaySeconds: 30
periodSeconds: 30
path: /healthz
port: 6060
initialDelaySeconds: 90
periodSeconds: 60
name: workflow-controller
ports:
- containerPort: 9090
name: metrics
- containerPort: 6060
securityContext:
capabilities:
drop:
Expand Down
10 changes: 6 additions & 4 deletions manifests/namespace-install.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -502,15 +502,17 @@ spec:
fieldPath: metadata.name
image: argoproj/workflow-controller:v3.0.4
livenessProbe:
failureThreshold: 3
httpGet:
path: /metrics
port: metrics
initialDelaySeconds: 30
periodSeconds: 30
path: /healthz
port: 6060
initialDelaySeconds: 90
periodSeconds: 60
name: workflow-controller
ports:
- containerPort: 9090
name: metrics
- containerPort: 6060
securityContext:
capabilities:
drop:
Expand Down
10 changes: 6 additions & 4 deletions manifests/quick-start-minimal.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -856,15 +856,17 @@ spec:
fieldPath: metadata.name
image: argoproj/workflow-controller:v3.0.4
livenessProbe:
failureThreshold: 3
httpGet:
path: /metrics
port: metrics
initialDelaySeconds: 30
periodSeconds: 30
path: /healthz
port: 6060
initialDelaySeconds: 90
periodSeconds: 60
name: workflow-controller
ports:
- containerPort: 9090
name: metrics
- containerPort: 6060
securityContext:
capabilities:
drop:
Expand Down
10 changes: 6 additions & 4 deletions manifests/quick-start-mysql.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -945,15 +945,17 @@ spec:
fieldPath: metadata.name
image: argoproj/workflow-controller:v3.0.4
livenessProbe:
failureThreshold: 3
httpGet:
path: /metrics
port: metrics
initialDelaySeconds: 30
periodSeconds: 30
path: /healthz
port: 6060
initialDelaySeconds: 90
periodSeconds: 60
name: workflow-controller
ports:
- containerPort: 9090
name: metrics
- containerPort: 6060
securityContext:
capabilities:
drop:
Expand Down
10 changes: 6 additions & 4 deletions manifests/quick-start-postgres.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -937,15 +937,17 @@ spec:
fieldPath: metadata.name
image: argoproj/workflow-controller:v3.0.4
livenessProbe:
failureThreshold: 3
httpGet:
path: /metrics
port: metrics
initialDelaySeconds: 30
periodSeconds: 30
path: /healthz
port: 6060
initialDelaySeconds: 90
periodSeconds: 60
name: workflow-controller
ports:
- containerPort: 9090
name: metrics
- containerPort: 6060
securityContext:
capabilities:
drop:
Expand Down
57 changes: 57 additions & 0 deletions workflow/controller/healthz.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@
package controller

import (
"fmt"
"net/http"
"time"

log "github.com/sirupsen/logrus"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"

"github.com/argoproj/argo-workflows/v3/util/env"
"github.com/argoproj/argo-workflows/v3/workflow/common"
)

// age is the creation-time threshold used by Healthz: a workflow older than
// this that still matches the health-check selector makes the controller
// report unhealthy. It is read from the HEALTHZ_AGE environment variable and
// falls back to five minutes.
var age = env.LookupEnvDurationOr("HEALTHZ_AGE", 5*time.Minute)

// Healthz is an HTTP handler intended to back a Kubernetes liveness probe.
// https://kubernetes.io/docs/tasks/configure-pod-container/configure-liveness-readiness-startup-probes/#define-a-liveness-http-request
//
// It lists, straight from the API, the workflows in the managed namespace that
// match this controller's instance ID and carry no phase label (treated here
// as "never reconciled"). If any such workflow was created longer ago than
// age (HEALTHZ_AGE, default 5m), or if the list call itself fails, the handler
// responds with status 500 and the error text; otherwise it responds with
// status 200 and the body "ok". Every invocation is logged at info level.
func (wfc *WorkflowController) Healthz(w http.ResponseWriter, r *http.Request) {
	ctx := r.Context()
	instanceID := wfc.Config.InstanceID

	// With an instance ID configured, only consider workflows labelled with
	// it; without one, only consider workflows that have no instance ID label.
	instanceIDSelector := "!" + common.LabelKeyControllerInstanceID
	if instanceID != "" {
		instanceIDSelector = common.LabelKeyControllerInstanceID + "=" + instanceID
	}
	labelSelector := "!" + common.LabelKeyPhase + "," + instanceIDSelector

	err := func() error {
		// Avoid problems with informers by directly querying the API.
		list, err := wfc.wfclientset.ArgoprojV1alpha1().Workflows(wfc.managedNamespace).List(ctx, metav1.ListOptions{LabelSelector: labelSelector})
		if err != nil {
			return err
		}
		// Index rather than range by value so each Workflow is not copied.
		for i := range list.Items {
			wf := &list.Items[i]
			if time.Since(wf.GetCreationTimestamp().Time) > age {
				return fmt.Errorf("workflow never reconciled: %s", wf.Name)
			}
		}
		return nil
	}()

	log.WithField("err", err).
		WithField("managedNamespace", wfc.managedNamespace).
		WithField("instanceID", instanceID).
		WithField("labelSelector", labelSelector).
		WithField("age", age).
		Info("healthz")

	if err != nil {
		w.WriteHeader(http.StatusInternalServerError)
		// Write errors are deliberately ignored: the status code has already
		// been sent and nothing more can be done if the client has gone away.
		_, _ = w.Write([]byte(err.Error()))
		return
	}
	w.WriteHeader(http.StatusOK)
	_, _ = w.Write([]byte("ok"))
}

0 comments on commit f893ea6

Please sign in to comment.