Skip to content

Commit

Permalink
add healthz subpathes for all controllers
Browse files Browse the repository at this point in the history
  • Loading branch information
haouc committed Apr 21, 2023
1 parent 83e4e5f commit f59d39f
Show file tree
Hide file tree
Showing 26 changed files with 607 additions and 109 deletions.
12 changes: 10 additions & 2 deletions controllers/apps/deployment_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -16,15 +16,17 @@ package apps
import (
"context"

"github.com/aws/amazon-vpc-resource-controller-k8s/controllers/core"
controllers "github.com/aws/amazon-vpc-resource-controller-k8s/controllers/core"
"github.com/aws/amazon-vpc-resource-controller-k8s/pkg/condition"
"github.com/aws/amazon-vpc-resource-controller-k8s/pkg/config"
rcHealthz "github.com/aws/amazon-vpc-resource-controller-k8s/pkg/healthz"
"github.com/aws/amazon-vpc-resource-controller-k8s/pkg/k8s"
"github.com/aws/amazon-vpc-resource-controller-k8s/pkg/node/manager"

"github.com/go-logr/logr"
appV1 "k8s.io/api/apps/v1"
ctrl "sigs.k8s.io/controller-runtime"
"sigs.k8s.io/controller-runtime/pkg/healthz"
)

type DeploymentReconciler struct {
Expand Down Expand Up @@ -63,7 +65,13 @@ func (r *DeploymentReconciler) Reconcile(ctx context.Context, req ctrl.Request)
return ctrl.Result{}, nil
}

func (r *DeploymentReconciler) SetupWithManager(mgr ctrl.Manager) error {
func (r *DeploymentReconciler) SetupWithManager(mgr ctrl.Manager, healthzHandler *rcHealthz.HealthzHandler) error {
// add health check on subpath for deployment controller
// TODO: this is a simple controller and unlikely hit blocking issue but we can revisit this after subpaths are released for a while
healthzHandler.AddControllersHealthCheckers(
map[string]healthz.Checker{"health-deploy-controller": rcHealthz.SimplePing("deployment controller", r.Log)},
)

return ctrl.NewControllerManagedBy(mgr).
For(&appV1.Deployment{}).
Complete(r)
Expand Down
16 changes: 15 additions & 1 deletion controllers/core/configmap_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ import (

"github.com/aws/amazon-vpc-resource-controller-k8s/pkg/condition"
"github.com/aws/amazon-vpc-resource-controller-k8s/pkg/config"
rcHealthz "github.com/aws/amazon-vpc-resource-controller-k8s/pkg/healthz"
"github.com/aws/amazon-vpc-resource-controller-k8s/pkg/k8s"
"github.com/aws/amazon-vpc-resource-controller-k8s/pkg/node/manager"
"github.com/go-logr/logr"
Expand All @@ -27,6 +28,7 @@ import (
"k8s.io/apimachinery/pkg/runtime"
ctrl "sigs.k8s.io/controller-runtime"
"sigs.k8s.io/controller-runtime/pkg/client"
"sigs.k8s.io/controller-runtime/pkg/healthz"
)

// ConfigMapReconciler reconciles a ConfigMap object
Expand All @@ -38,6 +40,7 @@ type ConfigMapReconciler struct {
K8sAPI k8s.K8sWrapper
Condition condition.Conditions
curWinIPAMEnabledCond bool
Context context.Context
}

//+kubebuilder:rbac:groups=core,resources=configmaps,namespace=kube-system,resourceNames=amazon-vpc-cni,verbs=get;list;watch
Expand Down Expand Up @@ -84,7 +87,12 @@ func (r *ConfigMapReconciler) Reconcile(ctx context.Context, req ctrl.Request) (
}

// SetupWithManager sets up the controller with the Manager.
func (r *ConfigMapReconciler) SetupWithManager(mgr ctrl.Manager) error {
func (r *ConfigMapReconciler) SetupWithManager(mgr ctrl.Manager, healthzHandler *rcHealthz.HealthzHandler) error {
// add health check on subpath for CM controller
healthzHandler.AddControllersHealthCheckers(
map[string]healthz.Checker{"health-cm-controller": r.check()},
)

return ctrl.NewControllerManagedBy(mgr).
For(&corev1.ConfigMap{}).
Complete(r)
Expand All @@ -110,3 +118,9 @@ func UpdateNodesOnConfigMapChanges(k8sAPI k8s.K8sWrapper, nodeManager manager.Ma
}
return nil
}

func (r *ConfigMapReconciler) check() healthz.Checker {
r.Log.Info("ConfigMap controller's healthz subpath was added")
// We can revisit this to use PingWithTimeout() instead if we have concerns on this controller.
return rcHealthz.SimplePing("configmap controller", r.Log)
}
54 changes: 44 additions & 10 deletions controllers/core/node_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -16,17 +16,22 @@ package controllers
import (
"context"
goErr "errors"
"net/http"

"github.com/aws/amazon-vpc-resource-controller-k8s/pkg/condition"
rcHealthz "github.com/aws/amazon-vpc-resource-controller-k8s/pkg/healthz"
"github.com/aws/amazon-vpc-resource-controller-k8s/pkg/node/manager"
"github.com/google/uuid"

"github.com/go-logr/logr"
corev1 "k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/api/errors"
"k8s.io/apimachinery/pkg/runtime"
"k8s.io/apimachinery/pkg/types"
ctrl "sigs.k8s.io/controller-runtime"
"sigs.k8s.io/controller-runtime/pkg/client"
"sigs.k8s.io/controller-runtime/pkg/controller"
"sigs.k8s.io/controller-runtime/pkg/healthz"
)

// MaxNodeConcurrentReconciles is the number of go routines that can invoke
Expand All @@ -43,7 +48,7 @@ type NodeReconciler struct {
Scheme *runtime.Scheme
Manager manager.Manager
Conditions condition.Conditions
// NodeEventCache *bigcache.BigCache
Context context.Context
}

// +kubebuilder:rbac:groups=core,resources=nodes,verbs=get;list;watch
Expand All @@ -68,6 +73,7 @@ func (r *NodeReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.

if err := r.Client.Get(ctx, req.NamespacedName, node); err != nil {
if errors.IsNotFound(err) {
r.Log.V(1).Info("the requested node couldn't be found by k8s client", "Node", req.NamespacedName)
_, found := r.Manager.GetNode(req.Name)
// if cachedNode != nil && cachedNode.HasInstance() {
// // delete the not found node instance id from node event cache for housekeeping
Expand All @@ -80,7 +86,7 @@ func (r *NodeReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.
logger.Error(err, "failed to delete node from manager")
return ctrl.Result{}, nil
}
logger.Info("deleted the node from manager")
logger.V(1).Info("deleted the node from manager")
}
}
return ctrl.Result{}, client.IgnoreNotFound(err)
Expand All @@ -98,17 +104,45 @@ func (r *NodeReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.
return ctrl.Result{}, err
}

// func (r *NodeReconciler) deleteNodeFromNodeEventCache(nodeId string) {
// if err := r.NodeEventCache.Delete(nodeId); err != nil {
// r.Log.V(1).Info("node controller removing node from node event cache failed", "Error", err)
// } else {
// r.Log.V(1).Info("node controller removed the node from node event cache successfully", "InstanceId", nodeId)
// }
// }
func (r *NodeReconciler) SetupWithManager(mgr ctrl.Manager, healthzHandler *rcHealthz.HealthzHandler) error {
// add health check on subpath for node controller
healthzHandler.AddControllersHealthCheckers(
map[string]healthz.Checker{"health-node-controller": r.Check()},
)

func (r *NodeReconciler) SetupWithManager(mgr ctrl.Manager) error {
return ctrl.NewControllerManagedBy(mgr).
For(&corev1.Node{}).
WithOptions(controller.Options{MaxConcurrentReconciles: MaxNodeConcurrentReconciles}).
Complete(r)
}

func (r *NodeReconciler) Check() healthz.Checker {
r.Log.Info("Node controller's healthz subpath was added")
return func(req *http.Request) error {
// if the reconciler is not ready, using the simple ping to test
// this can test the referenced cached pod datastore
if !r.Conditions.GetPodDataStoreSyncStatus() {
r.Log.V(1).Info("***** node controller healthz enpoint tested Simple Ping *****")
return nil
}

err := rcHealthz.PingWithTimeout(func(c chan<- error) {
// when the reconciler is ready, testing the reconciler with a fake node request
pingRequest := &ctrl.Request{
NamespacedName: types.NamespacedName{
Namespace: corev1.NamespaceDefault,
Name: uuid.New().String(),
},
}

// expecting to 'return ctrl.Result{}, client.IgnoreNotFound(err)'
// IgnoreNotFound returns nil on NotFound errors.
// this can test the pod cached datastore and node cached datastore
_, rErr := r.Reconcile(r.Context, *pingRequest)
r.Log.V(1).Info("***** node controller healthz endpoint tested Reconcile *****")
c <- rErr
}, r.Log)

return err
}
}
44 changes: 41 additions & 3 deletions controllers/core/pod_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -15,20 +15,25 @@ package controllers

import (
"context"
"net/http"
"time"

"github.com/aws/amazon-vpc-resource-controller-k8s/controllers/custom"
"github.com/aws/amazon-vpc-resource-controller-k8s/pkg/condition"
rcHealthz "github.com/aws/amazon-vpc-resource-controller-k8s/pkg/healthz"
"github.com/aws/amazon-vpc-resource-controller-k8s/pkg/k8s"
"github.com/aws/amazon-vpc-resource-controller-k8s/pkg/k8s/pod"
"github.com/aws/amazon-vpc-resource-controller-k8s/pkg/node/manager"
"github.com/aws/amazon-vpc-resource-controller-k8s/pkg/resource"
"github.com/google/uuid"

"github.com/go-logr/logr"
v1 "k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/types"
"k8s.io/client-go/kubernetes"
"k8s.io/client-go/tools/cache"
ctrl "sigs.k8s.io/controller-runtime"
"sigs.k8s.io/controller-runtime/pkg/healthz"
)

// +kubebuilder:rbac:groups="",resources=events,verbs=create;update;patch
Expand Down Expand Up @@ -71,7 +76,7 @@ func (r *PodReconciler) Reconcile(request custom.Request) (ctrl.Result, error) {
return ctrl.Result{}, nil
}
if !exists {
r.Log.Info("pod doesn't exists in the cache anymore",
r.Log.V(1).Info("pod doesn't exists in the cache anymore",
"namespace name", request.NamespacedName.String())
return ctrl.Result{}, nil
}
Expand Down Expand Up @@ -174,9 +179,10 @@ func getAggregateResources(pod *v1.Pod) map[string]int64 {
// list of runnable. After Manager acquire the lease the pod controller runnable
// will be started and the Pod events will be sent to Reconcile function
func (r *PodReconciler) SetupWithManager(ctx context.Context, manager ctrl.Manager,
clientSet *kubernetes.Clientset, pageLimit int, syncPeriod time.Duration) error {
clientSet *kubernetes.Clientset, pageLimit int, syncPeriod time.Duration, healthzHandler *rcHealthz.HealthzHandler) error {
r.Log.Info("The pod controller is using MaxConcurrentReconciles", "Routines", MaxPodConcurrentReconciles)
return custom.NewControllerManagedBy(ctx, manager).

customChecker, err := custom.NewControllerManagedBy(ctx, manager).
WithLogger(r.Log.WithName("custom pod controller")).
UsingDataStore(r.DataStore).
WithClientSet(clientSet).
Expand All @@ -188,4 +194,36 @@ func (r *PodReconciler) SetupWithManager(ctx context.Context, manager ctrl.Manag
ResyncPeriod: syncPeriod,
MaxConcurrentReconciles: MaxPodConcurrentReconciles,
}).UsingConditions(r.Condition).Complete(r)

// add health check on subpath for pod and pod customized controllers
healthzHandler.AddControllersHealthCheckers(
map[string]healthz.Checker{
"health-pod-controller": r.check(),
"health-custom-pod-controller": customChecker,
},
)

return err
}

func (r *PodReconciler) check() healthz.Checker {
r.Log.Info("Pod controller's healthz subpath was added")
// more meaningful ping
return func(req *http.Request) error {
err := rcHealthz.PingWithTimeout(func(c chan<- error) {
pingRequest := &custom.Request{
NamespacedName: types.NamespacedName{
Namespace: v1.NamespaceDefault,
Name: uuid.New().String(),
},
DeletedObject: nil,
}
// calling reconcile will test pod cache
_, rErr := r.Reconcile(*pingRequest)
r.Log.V(1).Info("***** pod controller healthz endpoint tested reconcile *****")
c <- rErr
}, r.Log)

return err
}
}
29 changes: 15 additions & 14 deletions controllers/custom/builder.go
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@ import (
"k8s.io/client-go/kubernetes"
"k8s.io/client-go/tools/cache"
"k8s.io/client-go/util/workqueue"
"sigs.k8s.io/controller-runtime/pkg/healthz"
"sigs.k8s.io/controller-runtime/pkg/manager"
)

Expand Down Expand Up @@ -89,21 +90,21 @@ func NewControllerManagedBy(ctx context.Context, mgr manager.Manager) *Builder {

// Complete adds the controller to manager's Runnable. The Controller
// runnable will start when the manager starts
func (b *Builder) Complete(reconciler Reconciler) error {
func (b *Builder) Complete(reconciler Reconciler) (healthz.Checker, error) {
// Loggr is no longer an interface
// The suggestion is using LogSink to do nil check now
if b.log.GetSink() == nil {
return fmt.Errorf("need to set the logger")
return nil, fmt.Errorf("need to set the logger")
}
if b.converter == nil {
return fmt.Errorf("converter not provided, " +
return nil, fmt.Errorf("converter not provided, " +
"must use high level controller if conversion not required")
}
if b.clientSet == nil {
return fmt.Errorf("need to set kubernetes clienset")
return nil, fmt.Errorf("need to set kubernetes clienset")
}
if b.dataStore == nil {
return fmt.Errorf("need datastore to start the controller")
return nil, fmt.Errorf("need datastore to start the controller")
}

b.SetDefaults()
Expand Down Expand Up @@ -172,17 +173,17 @@ func (b *Builder) Complete(reconciler Reconciler) error {
},
}

controller := &CustomController{
log: b.log,
options: b.options,
config: config,
Do: reconciler,
workQueue: workQueue,
conditions: b.conditions,
}
controller := NewCustomController(
b.log,
b.options,
config,
reconciler,
workQueue,
b.conditions,
)

// Adds the controller to the manager's Runnable
return b.mgr.Add(controller)
return controller.checker, b.mgr.Add(controller)
}

// SetDefaults sets the default options for controller
Expand Down
Loading

0 comments on commit f59d39f

Please sign in to comment.