From 22c4f84b0864b0c038309ada2edb2336060b6dfd Mon Sep 17 00:00:00 2001 From: Steve Kriss Date: Tue, 10 Jan 2023 09:53:47 -0700 Subject: [PATCH] remove shutdown-manager liveness probe (#4967) The probe can currently cause problems when it fails by causing the shutdown-manager container to be restarted by itself, which then results in the envoy container getting stuck in a "DRAINING" state indefinitely. Not having the probe is less bad overall because envoy pods are less likely to get stuck in "DRAINING", and the worst case without it is that shutdown-manager is truly unresponsive during a pod termination, in which case the envoy container will simply terminate without first draining active connections. Updates #4851. Signed-off-by: Steve Kriss Signed-off-by: yy --- changelogs/unreleased/4967-skriss-minor.md | 9 +++++++++ examples/contour/03-envoy.yaml | 6 ------ examples/deployment/03-envoy-deployment.yaml | 6 ------ examples/render/contour-deployment.yaml | 6 ------ examples/render/contour-gateway.yaml | 6 ------ examples/render/contour.yaml | 6 ------ internal/provisioner/equality/equality_test.go | 7 ------- .../provisioner/objects/dataplane/dataplane.go | 14 -------------- site/content/docs/main/redeploy-envoy.md | 11 +---------- 9 files changed, 10 insertions(+), 61 deletions(-) create mode 100644 changelogs/unreleased/4967-skriss-minor.md diff --git a/changelogs/unreleased/4967-skriss-minor.md b/changelogs/unreleased/4967-skriss-minor.md new file mode 100644 index 00000000000..aad78e52dfc --- /dev/null +++ b/changelogs/unreleased/4967-skriss-minor.md @@ -0,0 +1,9 @@ +## shutdown-manager sidecar container liveness probe removed + +The liveness probe has been removed from the Envoy pods' shutdown-manager sidecar container. +This change is to mitigate a problem where when the liveness probe fails, the shutdown-manager container is restarted by itself. +This ultimately has the unintended effect of causing the envoy container to be stuck indefinitely in a "DRAINING" state and not serving traffic. + +Overall, not having the liveness probe on the shutdown-manager container is less bad because envoy pods are less likely to get stuck in "DRAINING" indefinitely. +In the worst case, during termination of an Envoy pod (due to upgrade, scaling, etc.), shutdown-manager is truly unresponsive, in which case the envoy container will simply terminate without first draining active connections. +If appropriate (i.e. during an upgrade), a new Envoy pod will then be created and re-added to the set of ready Envoys to load balance traffic to. diff --git a/examples/contour/03-envoy.yaml b/examples/contour/03-envoy.yaml index a8a0793cd1e..a75f1841b82 100644 --- a/examples/contour/03-envoy.yaml +++ b/examples/contour/03-envoy.yaml @@ -38,12 +38,6 @@ spec: - /bin/contour - envoy - shutdown - livenessProbe: - httpGet: - path: /healthz - port: 8090 - initialDelaySeconds: 3 - periodSeconds: 10 name: shutdown-manager volumeMounts: - name: envoy-admin diff --git a/examples/deployment/03-envoy-deployment.yaml b/examples/deployment/03-envoy-deployment.yaml index 6bab83a64e5..3e404e41dbf 100644 --- a/examples/deployment/03-envoy-deployment.yaml +++ b/examples/deployment/03-envoy-deployment.yaml @@ -51,12 +51,6 @@ spec: - /bin/contour - envoy - shutdown - livenessProbe: - httpGet: - path: /healthz - port: 8090 - initialDelaySeconds: 3 - periodSeconds: 10 name: shutdown-manager volumeMounts: - name: envoy-admin diff --git a/examples/render/contour-deployment.yaml b/examples/render/contour-deployment.yaml index f687ec393ae..785e20d0138 100644 --- a/examples/render/contour-deployment.yaml +++ b/examples/render/contour-deployment.yaml @@ -7362,12 +7362,6 @@ spec: - /bin/contour - envoy - shutdown - livenessProbe: - httpGet: - path: /healthz - port: 8090 - initialDelaySeconds: 3 - periodSeconds: 10 name: shutdown-manager volumeMounts: - name: envoy-admin diff --git a/examples/render/contour-gateway.yaml b/examples/render/contour-gateway.yaml index d0b6eea6abc..34a2ead620e 100644 --- a/examples/render/contour-gateway.yaml +++ b/examples/render/contour-gateway.yaml @@ -7355,12 +7355,6 @@ spec: - /bin/contour - envoy - shutdown - livenessProbe: - httpGet: - path: /healthz - port: 8090 - initialDelaySeconds: 3 - periodSeconds: 10 name: shutdown-manager volumeMounts: - name: envoy-admin diff --git a/examples/render/contour.yaml b/examples/render/contour.yaml index 8ea48ace4c3..edf4895f909 100644 --- a/examples/render/contour.yaml +++ b/examples/render/contour.yaml @@ -7349,12 +7349,6 @@ spec: - /bin/contour - envoy - shutdown - livenessProbe: - httpGet: - path: /healthz - port: 8090 - initialDelaySeconds: 3 - periodSeconds: 10 name: shutdown-manager volumeMounts: - name: envoy-admin diff --git a/internal/provisioner/equality/equality_test.go b/internal/provisioner/equality/equality_test.go index 7e3b5530c80..ca757fb1daf 100644 --- a/internal/provisioner/equality/equality_test.go +++ b/internal/provisioner/equality/equality_test.go @@ -105,13 +105,6 @@ func TestDaemonSetConfigChanged(t *testing.T) { description: "if probe values are set to default values", mutate: func(ds *appsv1.DaemonSet) { for i, c := range ds.Spec.Template.Spec.Containers { - if c.Name == dataplane.ShutdownContainerName { - ds.Spec.Template.Spec.Containers[i].LivenessProbe.ProbeHandler.HTTPGet.Scheme = "HTTP" - ds.Spec.Template.Spec.Containers[i].LivenessProbe.TimeoutSeconds = int32(1) - ds.Spec.Template.Spec.Containers[i].LivenessProbe.PeriodSeconds = int32(10) - ds.Spec.Template.Spec.Containers[i].LivenessProbe.SuccessThreshold = int32(1) - ds.Spec.Template.Spec.Containers[i].LivenessProbe.FailureThreshold = int32(3) - } if c.Name == dataplane.EnvoyContainerName { ds.Spec.Template.Spec.Containers[i].ReadinessProbe.TimeoutSeconds = int32(1) // ReadinessProbe InitialDelaySeconds and PeriodSeconds are not set as defaults, diff --git a/internal/provisioner/objects/dataplane/dataplane.go b/internal/provisioner/objects/dataplane/dataplane.go index 0e18de82666..89209abbe6f 100644 --- a/internal/provisioner/objects/dataplane/dataplane.go +++ b/internal/provisioner/objects/dataplane/dataplane.go @@ -148,20 +148,6 @@ func desiredContainers(contour *model.Contour, contourImage, envoyImage string) "envoy", "shutdown-manager", }, - LivenessProbe: &corev1.Probe{ - FailureThreshold: int32(3), - ProbeHandler: corev1.ProbeHandler{ - HTTPGet: &corev1.HTTPGetAction{ - Scheme: corev1.URISchemeHTTP, - Path: "/healthz", - Port: intstr.IntOrString{IntVal: int32(8090)}, - }, - }, - InitialDelaySeconds: int32(3), - PeriodSeconds: int32(10), - SuccessThreshold: int32(1), - TimeoutSeconds: int32(1), - }, Lifecycle: &corev1.Lifecycle{ PreStop: &corev1.LifecycleHandler{ Exec: &corev1.ExecAction{ diff --git a/site/content/docs/main/redeploy-envoy.md b/site/content/docs/main/redeploy-envoy.md index 508b0ba33ac..2456b53d2bf 100644 --- a/site/content/docs/main/redeploy-envoy.md +++ b/site/content/docs/main/redeploy-envoy.md @@ -13,10 +13,7 @@ When implementing this roll out, the following steps should be taken: Contour implements an `envoy` sub-command named `shutdown-manager` whose job is to manage a single Envoy instances lifecycle for Kubernetes. The `shutdown-manager` runs as a new container alongside the Envoy container in the same pod. -It exposes two HTTP endpoints which are used for `livenessProbe` as well as to handle the Kubernetes `preStop` event hook. - -- **livenessProbe**: This is used to validate the shutdown manager is still running properly. If requests to `/healthz` fail, the container will be restarted. -- **preStop**: This is used to keep the Envoy container running while waiting for connections to drain. The `/shutdown` endpoint blocks until the connections are drained. +It uses a Kubernetes `preStop` event hook to keep the Envoy container running while waiting for connections to drain. The `/shutdown` endpoint blocks until the connections are drained. ```yaml - name: shutdown-manager @@ -34,12 +31,6 @@ It exposes two HTTP endpoints which are used for `livenessProbe` as well as to h - /bin/contour - envoy - shutdown - livenessProbe: - httpGet: - path: /healthz - port: 8090 - initialDelaySeconds: 3 - periodSeconds: 10 ``` The Envoy container also has some configuration to implement the shutdown manager.