Skip to content

Commit

Permalink
Add liveness and startup/readiness probes
Browse files Browse the repository at this point in the history
This adds liveness and startup/readiness probes to the HNC Deployment, using functionality recently added to controller-runtime. The default probe configuration (timeouts, thresholds, etc.) will probably need adjusting to work without issues for a typical HNC deployment, and it should be tested in a live cluster, which I do not have available.

Testing: All tests ran successfully on my local workstation when using the simple ping probe. However, after switching to the deeper webhook probe, the container never becomes ready. This seems to be an issue related to cert-rotator, which I am not familiar with at all (we use cert-manager). At my request, Adrian Ludwin took a look at the issue, and the webhook probe appears to work for him.
  • Loading branch information
erikgb committed Mar 6, 2022
1 parent 281b24f commit db2831b
Show file tree
Hide file tree
Showing 2 changed files with 32 additions and 5 deletions.
25 changes: 20 additions & 5 deletions cmd/manager/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,7 @@ var (
)

var (
probeAddr string
metricsAddr string
enableStackdriver bool
maxReconciles int
Expand Down Expand Up @@ -89,6 +90,7 @@ func main() {
metricsCleanupFn := enableMetrics()
defer metricsCleanupFn()
mgr := createManager()
setupChecks(mgr)

// Make sure certs are generated and valid if webhooks are enabled and internal certs are used.
setupLog.Info("Starting certificate generation")
Expand All @@ -112,6 +114,7 @@ func main() {

func parseFlags() {
setupLog.Info("Parsing flags")
flag.StringVar(&probeAddr, "health-probe-bind-address", ":8081", "The address the probe endpoint binds to.")
flag.StringVar(&metricsAddr, "metrics-addr", ":8080", "The address the metric endpoint binds to.")
flag.BoolVar(&enableStackdriver, "enable-stackdriver", true, "If true, export metrics to stackdriver")
flag.BoolVar(&enableLeaderElection, "enable-leader-election", false,
Expand Down Expand Up @@ -217,11 +220,12 @@ func createManager() ctrl.Manager {
// it turns out to be harmful.
cfg.Burst = int(cfg.QPS * 1.5)
mgr, err := ctrl.NewManager(cfg, ctrl.Options{
Scheme: scheme,
MetricsBindAddress: metricsAddr,
LeaderElection: enableLeaderElection,
LeaderElectionID: leaderElectionId,
Port: webhookServerPort,
Scheme: scheme,
MetricsBindAddress: metricsAddr,
HealthProbeBindAddress: probeAddr,
LeaderElection: enableLeaderElection,
LeaderElectionID: leaderElectionId,
Port: webhookServerPort,
})
if err != nil {
setupLog.Error(err, "unable to create manager")
Expand All @@ -230,6 +234,17 @@ func createManager() ctrl.Manager {
return mgr
}

// setupChecks registers the "healthz" and "readyz" probe checks on mgr.
// Both checks are backed by the webhook server's StartedChecker. If either
// registration fails, the error is logged and the process exits with status 1.
func setupChecks(mgr ctrl.Manager) {
	// Registrations are attempted in order; the first failure terminates the process.
	registrations := []struct {
		register func() error
		failure  string
	}{
		{
			register: func() error {
				return mgr.AddHealthzCheck("healthz", mgr.GetWebhookServer().StartedChecker())
			},
			failure: "unable to set up health check",
		},
		{
			register: func() error {
				return mgr.AddReadyzCheck("readyz", mgr.GetWebhookServer().StartedChecker())
			},
			failure: "unable to set up ready check",
		},
	}

	for _, reg := range registrations {
		err := reg.register()
		if err == nil {
			continue
		}
		setupLog.Error(err, reg.failure)
		os.Exit(1)
	}
}

func startControllers(mgr ctrl.Manager, certsReady chan struct{}) {
// The controllers won't work until the webhooks are operating, and those won't work until the
// certs are all in place.
Expand Down
12 changes: 12 additions & 0 deletions config/manager/manager.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,18 @@ spec:
- "--excluded-namespace=kube-node-lease"
image: controller:latest
name: manager
livenessProbe:
httpGet:
path: /healthz
port: 8081
failureThreshold: 1
periodSeconds: 10
startupProbe:
httpGet:
path: /readyz
port: 8081
failureThreshold: 100
periodSeconds: 5
resources:
limits:
cpu: 100m
Expand Down

0 comments on commit db2831b

Please sign in to comment.