From cbde1c6f4de60ab347b003933117875404f5d8b9 Mon Sep 17 00:00:00 2001 From: Adrian Ludwin Date: Wed, 23 Mar 2022 23:52:05 -0400 Subject: [PATCH] Fix probes when installing on a fresh cluster See issue #158. If the Secret with the certs doesn't exist, starting the webhook server will fail and HNC will exit. Before we configured the probes, we carefully didn't start the webhook server until after the certs were ready, but as a side effect of creating the probe functions, we accidentally started the webhook server before the certs were generated. This worked fine on a cluster that already had the certs (like the one I was testing on) but failed for everyone else (oops). The fix is simply to use a non-default checker that knows whether the certs have been generated, and avoids accidentally starting the webhook server if the certs don't exist yet. Tested: manually on a cluster without the Secret. Without this change, I can see the error from the webhook server complaining that the certs don't exist; with this change, I can see that that in HNC's first invocation, we get some "healthz check failed" messages, but the cert controller runs, generates the certs, and restarts HNC. The second invocation works just fine. --- cmd/manager/main.go | 30 ++++++++++++++++++++++++------ 1 file changed, 24 insertions(+), 6 deletions(-) diff --git a/cmd/manager/main.go b/cmd/manager/main.go index f4a6e7cb5..67d4a7de0 100644 --- a/cmd/manager/main.go +++ b/cmd/manager/main.go @@ -16,7 +16,9 @@ limitations under the License. package main import ( + "errors" "flag" + "net/http" "os" "strings" @@ -90,7 +92,6 @@ func main() { metricsCleanupFn := enableMetrics() defer metricsCleanupFn() mgr := createManager() - setupChecks(mgr) // Make sure certs are generated and valid if webhooks are enabled and internal certs are used. setupLog.Info("Starting certificate generation") @@ -100,9 +101,12 @@ func main() { os.Exit(1) } + setupProbeEndpoints(mgr, certsReady) + // The call to mgr.Start will never return, but the certs won't be ready until the manager starts - // and we can't set up the webhooks without them. So start a goroutine which will wait until the - // certs are ready, and then create the rest of the HNC controllers. + // and we can't set up the webhooks without them (the webhook server runnable will try to read the + // certs, and if those certs don't exist, the entire process will exit). So start a goroutine + // which will wait until the certs are ready, and then create the rest of the HNC controllers. go startControllers(mgr, certsReady) setupLog.Info("Starting manager") @@ -234,15 +238,29 @@ func createManager() ctrl.Manager { return mgr } -func setupChecks(mgr ctrl.Manager) { - if err := mgr.AddHealthzCheck("healthz", mgr.GetWebhookServer().StartedChecker()); err != nil { +// setupProbeEndpoints registers the health endpoints +func setupProbeEndpoints(mgr ctrl.Manager, certsReady chan struct{}) { + // We can't use the default checker directly, since the checker assumes that the webhook server + // has been started, and it will error out (and crash HNC) if the certs don't exist yet. + // Therefore, this thin wrapper checks whether the certs are ready, and if so, bypasses the + // controller-manager checker. + checker := func(req *http.Request) error { + select { + case <-certsReady: + return mgr.GetWebhookServer().StartedChecker()(req) + default: + return errors.New("HNC internal certs are not yet ready") + } + } + if err := mgr.AddHealthzCheck("healthz", checker); err != nil { setupLog.Error(err, "unable to set up health check") os.Exit(1) } - if err := mgr.AddReadyzCheck("readyz", mgr.GetWebhookServer().StartedChecker()); err != nil { + if err := mgr.AddReadyzCheck("readyz", checker); err != nil { setupLog.Error(err, "unable to set up ready check") os.Exit(1) } + setupLog.Info("Probe endpoints are configured on healthz and readyz") } func startControllers(mgr ctrl.Manager, certsReady chan struct{}) {