Skip to content

Commit

Permalink
Feature-gated builtin healthcheck
Browse files Browse the repository at this point in the history
Adds a "builtin" healthcheck function.

It returns true if all resources are healthy with nil error.
It returns false if any of the resources are not healthy, the error contains the GVK + resource name, and the error message of
each unhealthy resource.

The current list of supported resources is:
- Deployments
- StatefulSets
- DaemonSets
- ReplicaSets
- Pods
- APIServices
- CustomResourceDefinitions

If the resource is not supported, it is assumed to be healthy.

Signed-off-by: Joaquim Moreno Prusi <joaquim@redhat.com>
  • Loading branch information
jmprusi authored and ncdc committed Oct 2, 2023
1 parent ee2aa19 commit 6dbf5ed
Show file tree
Hide file tree
Showing 8 changed files with 1,043 additions and 52 deletions.
1 change: 1 addition & 0 deletions go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@ require (
k8s.io/cli-runtime v0.26.1
k8s.io/client-go v0.26.1
k8s.io/component-base v0.26.1
k8s.io/kube-aggregator v0.26.1
k8s.io/utils v0.0.0-20221128185143-99ec85e7a448
sigs.k8s.io/controller-runtime v0.14.4
sigs.k8s.io/yaml v1.3.0
Expand Down
2 changes: 2 additions & 0 deletions go.sum
Original file line number Diff line number Diff line change
Expand Up @@ -1696,6 +1696,8 @@ k8s.io/klog/v2 v2.2.0/go.mod h1:Od+F08eJP+W3HUb4pSrPpgp9DGU4GzlpG/TmITuYh/Y=
k8s.io/klog/v2 v2.4.0/go.mod h1:Od+F08eJP+W3HUb4pSrPpgp9DGU4GzlpG/TmITuYh/Y=
k8s.io/klog/v2 v2.80.1 h1:atnLQ121W371wYYFawwYx1aEY2eUfs4l3J72wtgAwV4=
k8s.io/klog/v2 v2.80.1/go.mod h1:y1WjHnz7Dj687irZUWR/WLkLc5N1YHtjLdmgWjndZn0=
k8s.io/kube-aggregator v0.26.1 h1:TqDWwuaUJpyhWGWw4JrXR8ZAAaHa9qrsXxR41aR3igw=
k8s.io/kube-aggregator v0.26.1/go.mod h1:E6dnKoQ6f4eFl8QQXHxTASZKXBX6+XcjROWl7GRltl4=
k8s.io/kube-openapi v0.0.0-20200805222855-6aeccd4b50c6/go.mod h1:UuqjUnNftUyPE5H64/qeyjQoUZhGpeFDVdxjTeEVN2o=
k8s.io/kube-openapi v0.0.0-20201113171705-d219536bb9fd/go.mod h1:WOJ3KddDSol4tAGcJo0Tvi+dK12EcqSLqcWsryKMpfM=
k8s.io/kube-openapi v0.0.0-20221012153701-172d655c2280 h1:+70TFaan3hfJzs+7VK2o+OGxg8HsuBr/5f6tVAjDu6E=
Expand Down
95 changes: 47 additions & 48 deletions internal/controllers/bundledeployment/bundledeployment.go
Original file line number Diff line number Diff line change
Expand Up @@ -35,9 +35,11 @@ import (
"sigs.k8s.io/controller-runtime/pkg/source"

rukpakv1alpha1 "github.com/operator-framework/rukpak/api/v1alpha1"
"github.com/operator-framework/rukpak/internal/healthchecks"
helmpredicate "github.com/operator-framework/rukpak/internal/helm-operator-plugins/predicate"
"github.com/operator-framework/rukpak/internal/storage"
"github.com/operator-framework/rukpak/internal/util"
"github.com/operator-framework/rukpak/pkg/features"
)

/*
Expand Down Expand Up @@ -263,12 +265,7 @@ func (c *controller) reconcile(ctx context.Context, bd *rukpakv1alpha1.BundleDep
cl, err := c.acg.ActionClientFor(bd)
bd.SetNamespace("")
if err != nil {
meta.SetStatusCondition(&bd.Status.Conditions, metav1.Condition{
Type: rukpakv1alpha1.TypeInstalled,
Status: metav1.ConditionFalse,
Reason: rukpakv1alpha1.ReasonErrorGettingClient,
Message: err.Error(),
})
setInstalledAndHealthyFalse(bd, rukpakv1alpha1.ReasonErrorGettingClient, err.Error())
return ctrl.Result{}, err
}

Expand All @@ -281,12 +278,7 @@ func (c *controller) reconcile(ctx context.Context, bd *rukpakv1alpha1.BundleDep

rel, state, err := c.getReleaseState(cl, bd, chrt, values, post)
if err != nil {
meta.SetStatusCondition(&bd.Status.Conditions, metav1.Condition{
Type: rukpakv1alpha1.TypeInstalled,
Status: metav1.ConditionFalse,
Reason: rukpakv1alpha1.ReasonErrorGettingReleaseState,
Message: err.Error(),
})
setInstalledAndHealthyFalse(bd, rukpakv1alpha1.ReasonErrorGettingReleaseState, err.Error())
return ctrl.Result{}, err
}

Expand All @@ -306,12 +298,7 @@ func (c *controller) reconcile(ctx context.Context, bd *rukpakv1alpha1.BundleDep
if isResourceNotFoundErr(err) {
err = errRequiredResourceNotFound{err}
}
meta.SetStatusCondition(&bd.Status.Conditions, metav1.Condition{
Type: rukpakv1alpha1.TypeInstalled,
Status: metav1.ConditionFalse,
Reason: rukpakv1alpha1.ReasonInstallFailed,
Message: err.Error(),
})
setInstalledAndHealthyFalse(bd, rukpakv1alpha1.ReasonInstallFailed, err.Error())
return ctrl.Result{}, err
}
case stateNeedsUpgrade:
Expand All @@ -326,25 +313,15 @@ func (c *controller) reconcile(ctx context.Context, bd *rukpakv1alpha1.BundleDep
if isResourceNotFoundErr(err) {
err = errRequiredResourceNotFound{err}
}
meta.SetStatusCondition(&bd.Status.Conditions, metav1.Condition{
Type: rukpakv1alpha1.TypeInstalled,
Status: metav1.ConditionFalse,
Reason: rukpakv1alpha1.ReasonUpgradeFailed,
Message: err.Error(),
})
setInstalledAndHealthyFalse(bd, rukpakv1alpha1.ReasonUpgradeFailed, err.Error())
return ctrl.Result{}, err
}
case stateUnchanged:
if err := cl.Reconcile(rel); err != nil {
if isResourceNotFoundErr(err) {
err = errRequiredResourceNotFound{err}
}
meta.SetStatusCondition(&bd.Status.Conditions, metav1.Condition{
Type: rukpakv1alpha1.TypeInstalled,
Status: metav1.ConditionFalse,
Reason: rukpakv1alpha1.ReasonReconcileFailed,
Message: err.Error(),
})
setInstalledAndHealthyFalse(bd, rukpakv1alpha1.ReasonReconcileFailed, err.Error())
return ctrl.Result{}, err
}
default:
Expand All @@ -353,24 +330,14 @@ func (c *controller) reconcile(ctx context.Context, bd *rukpakv1alpha1.BundleDep

relObjects, err := util.ManifestObjects(strings.NewReader(rel.Manifest), fmt.Sprintf("%s-release-manifest", rel.Name))
if err != nil {
meta.SetStatusCondition(&bd.Status.Conditions, metav1.Condition{
Type: rukpakv1alpha1.TypeInstalled,
Status: metav1.ConditionFalse,
Reason: rukpakv1alpha1.ReasonCreateDynamicWatchFailed,
Message: err.Error(),
})
setInstalledAndHealthyFalse(bd, rukpakv1alpha1.ReasonCreateDynamicWatchFailed, err.Error())
return ctrl.Result{}, err
}

for _, obj := range relObjects {
uMap, err := runtime.DefaultUnstructuredConverter.ToUnstructured(obj)
if err != nil {
meta.SetStatusCondition(&bd.Status.Conditions, metav1.Condition{
Type: rukpakv1alpha1.TypeInstalled,
Status: metav1.ConditionFalse,
Reason: rukpakv1alpha1.ReasonCreateDynamicWatchFailed,
Message: err.Error(),
})
setInstalledAndHealthyFalse(bd, rukpakv1alpha1.ReasonCreateDynamicWatchFailed, err.Error())
return ctrl.Result{}, err
}

Expand All @@ -391,12 +358,7 @@ func (c *controller) reconcile(ctx context.Context, bd *rukpakv1alpha1.BundleDep
}
return nil
}(); err != nil {
meta.SetStatusCondition(&bd.Status.Conditions, metav1.Condition{
Type: rukpakv1alpha1.TypeInstalled,
Status: metav1.ConditionFalse,
Reason: rukpakv1alpha1.ReasonCreateDynamicWatchFailed,
Message: err.Error(),
})
setInstalledAndHealthyFalse(bd, rukpakv1alpha1.ReasonCreateDynamicWatchFailed, err.Error())
return ctrl.Result{}, err
}
}
Expand All @@ -408,13 +370,50 @@ func (c *controller) reconcile(ctx context.Context, bd *rukpakv1alpha1.BundleDep
})
bd.Status.ActiveBundle = bundle.GetName()

if features.RukpakFeatureGate.Enabled(features.BundleDeploymentHealth) {
if err = healthchecks.AreObjectsHealthy(ctx, c.cl, relObjects); err != nil {
meta.SetStatusCondition(&bd.Status.Conditions, metav1.Condition{
Type: rukpakv1alpha1.TypeHealthy,
Status: metav1.ConditionFalse,
Reason: rukpakv1alpha1.ReasonUnhealthy,
Message: err.Error(),
})
return ctrl.Result{}, err
}
meta.SetStatusCondition(&bd.Status.Conditions, metav1.Condition{
Type: rukpakv1alpha1.TypeHealthy,
Status: metav1.ConditionTrue,
Reason: rukpakv1alpha1.ReasonHealthy,
Message: "BundleDeployment is healthy",
})
}
if err := c.reconcileOldBundles(ctx, bundle, allBundles); err != nil {
return ctrl.Result{}, fmt.Errorf("failed to delete old bundles: %v", err)
}

return ctrl.Result{}, nil
}

// setInstalledAndHealthyFalse marks the BundleDeployment as not installed,
// recording the given reason and message on the Installed condition.
// When the BundleDeploymentHealth feature gate is enabled, the Healthy
// condition is also set to False with a fixed reason and message that point
// back at the Installed condition.
func setInstalledAndHealthyFalse(bd *rukpakv1alpha1.BundleDeployment, installedConditionReason, installedConditionMessage string) {
	conditions := &bd.Status.Conditions

	meta.SetStatusCondition(conditions, metav1.Condition{
		Type:    rukpakv1alpha1.TypeInstalled,
		Status:  metav1.ConditionFalse,
		Reason:  installedConditionReason,
		Message: installedConditionMessage,
	})

	// The Healthy condition is only managed while the feature gate is on.
	if !features.RukpakFeatureGate.Enabled(features.BundleDeploymentHealth) {
		return
	}

	meta.SetStatusCondition(conditions, metav1.Condition{
		Type:    rukpakv1alpha1.TypeHealthy,
		Status:  metav1.ConditionFalse,
		Reason:  rukpakv1alpha1.ReasonInstallationStatusFalse,
		Message: "Installed condition is false",
	})
}

// reconcileOldBundles is responsible for garbage collecting any Bundles
// that no longer match the desired Bundle template.
func (c *controller) reconcileOldBundles(ctx context.Context, currBundle *rukpakv1alpha1.Bundle, allBundles *rukpakv1alpha1.BundleList) error {
Expand Down
206 changes: 206 additions & 0 deletions internal/healthchecks/builtin.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,206 @@
package healthchecks

import (
"context"
"errors"
"fmt"

appsv1 "k8s.io/api/apps/v1"
corev1 "k8s.io/api/core/v1"
apiextensionsv1 "k8s.io/apiextensions-apiserver/pkg/apis/apiextensions/v1"
"k8s.io/apimachinery/pkg/apis/meta/v1/unstructured"
"k8s.io/apimachinery/pkg/runtime"
"k8s.io/apimachinery/pkg/types"
apiregistrationv1 "k8s.io/kube-aggregator/pkg/apis/apiregistration/v1"
"sigs.k8s.io/controller-runtime/pkg/client"
)

// AreObjectsHealthy checks if the given resources are healthy.
//
// Each object is re-fetched through the supplied client (by GVK and
// namespace/name) and its status is inspected. It returns a nil error if all
// the resources are healthy. If any resource is not healthy, or cannot be
// fetched or decoded, the returned error joins one error per unhealthy
// resource, each containing the GVK + namespace/resourceName and a message
// describing the problem.
//
// The current list of supported resources is:
//   - Deployments
//   - StatefulSets
//   - DaemonSets
//   - ReplicaSets
//   - Pods
//   - APIServices
//   - CustomResourceDefinitions
//
// If the resource is not supported, it is assumed to be healthy.
func AreObjectsHealthy(ctx context.Context, client client.Client, objects []client.Object) error {
	var gvkErrors []error

	for _, object := range objects {
		objectKey := types.NamespacedName{
			Name:      object.GetName(),
			Namespace: object.GetNamespace(),
		}

		// Fetch the object as unstructured so that any GVK can be retrieved;
		// only the kinds handled below are converted to typed objects.
		u := &unstructured.Unstructured{}
		u.SetGroupVersionKind(object.GetObjectKind().GroupVersionKind())
		if err := client.Get(ctx, objectKey, u); err != nil {
			gvkErrors = appendResourceError(gvkErrors, object, err.Error())
			continue
		}

		// NOTE: conversion failures below are reported against u rather than
		// the typed object, because a failed conversion may leave the typed
		// object without its GVK or name, producing a useless error key.
		switch u.GroupVersionKind() {
		case appsv1.SchemeGroupVersion.WithKind("Deployment"):
			// Check if the deployment is available.
			obj := &appsv1.Deployment{}
			if err := runtime.DefaultUnstructuredConverter.FromUnstructured(u.Object, obj); err != nil {
				gvkErrors = appendResourceError(gvkErrors, u, err.Error())
				continue
			}
			conditionExists := false
			for _, condition := range obj.Status.Conditions {
				if condition.Type == appsv1.DeploymentAvailable {
					if condition.Status != corev1.ConditionTrue {
						gvkErrors = appendResourceError(gvkErrors, obj, condition.Message)
					}
					conditionExists = true
					break
				}
			}
			if conditionExists {
				continue
			}
			// A missing Available condition is treated as unhealthy.
			gvkErrors = appendResourceError(gvkErrors, obj, "DeploymentAvailable condition not found")
		case appsv1.SchemeGroupVersion.WithKind("StatefulSet"):
			obj := &appsv1.StatefulSet{}
			if err := runtime.DefaultUnstructuredConverter.FromUnstructured(u.Object, obj); err != nil {
				gvkErrors = appendResourceError(gvkErrors, u, err.Error())
				continue
			}
			// This logic has been adapted from the helm codebase.
			// - https://github.com/helm/helm/blob/e7bb860d9a32e8739c944b8e7b7f7031d752411a/pkg/kube/ready.go#L357-L410

			// If the statefulset is not using the RollingUpdate strategy, we assume it's healthy.
			if obj.Spec.UpdateStrategy.Type != appsv1.RollingUpdateStatefulSetStrategyType {
				continue
			}
			// The status is stale until the controller has observed the
			// latest generation, so report this once and skip the remaining
			// status-based checks for this resource.
			if obj.Status.ObservedGeneration < obj.Generation {
				gvkErrors = appendResourceError(gvkErrors, obj, "StatefulSet is not ready (update has not yet been observed)")
				continue
			}

			// Replicas defaults to 1 and partition to 0 when unset in the spec.
			var partition int
			var replicas = 1
			if obj.Spec.UpdateStrategy.RollingUpdate != nil && obj.Spec.UpdateStrategy.RollingUpdate.Partition != nil {
				partition = int(*obj.Spec.UpdateStrategy.RollingUpdate.Partition)
			}
			if obj.Spec.Replicas != nil {
				replicas = int(*obj.Spec.Replicas)
			}
			// Only replicas with an ordinal at or above the partition are
			// expected to be updated.
			expectedReplicas := replicas - partition

			if int(obj.Status.UpdatedReplicas) < expectedReplicas {
				gvkErrors = appendResourceError(gvkErrors, obj, fmt.Sprintf("StatefulSet is not ready (expected %d replicas, got %d)", expectedReplicas, obj.Status.UpdatedReplicas))
				continue
			}
			if int(obj.Status.ReadyReplicas) != replicas {
				gvkErrors = appendResourceError(gvkErrors, obj, fmt.Sprintf("StatefulSet is not ready (expected %d replicas, got %d)", replicas, obj.Status.ReadyReplicas))
				continue
			}
			// With no partition, the rollout is complete only once the
			// current revision has caught up with the update revision.
			if partition == 0 && obj.Status.CurrentRevision != obj.Status.UpdateRevision {
				gvkErrors = appendResourceError(gvkErrors, obj, fmt.Sprintf("StatefulSet is not ready (expected revision %s, got %s)", obj.Status.UpdateRevision, obj.Status.CurrentRevision))
				continue
			}
		case appsv1.SchemeGroupVersion.WithKind("DaemonSet"):
			// Check if the daemonset is ready.
			obj := &appsv1.DaemonSet{}
			if err := runtime.DefaultUnstructuredConverter.FromUnstructured(u.Object, obj); err != nil {
				gvkErrors = appendResourceError(gvkErrors, u, err.Error())
				continue
			}
			if obj.Status.NumberAvailable != obj.Status.DesiredNumberScheduled {
				gvkErrors = appendResourceError(gvkErrors, obj, "DaemonSet is not ready")
			}
		case appsv1.SchemeGroupVersion.WithKind("ReplicaSet"):
			// Check if the replicaset is ready.
			obj := &appsv1.ReplicaSet{}
			if err := runtime.DefaultUnstructuredConverter.FromUnstructured(u.Object, obj); err != nil {
				gvkErrors = appendResourceError(gvkErrors, u, err.Error())
				continue
			}
			if obj.Status.AvailableReplicas != obj.Status.Replicas {
				gvkErrors = appendResourceError(gvkErrors, obj, "ReplicaSet is not ready")
			}
		case corev1.SchemeGroupVersion.WithKind("Pod"):
			// Check if the pod is running or succeeded.
			obj := &corev1.Pod{}
			if err := runtime.DefaultUnstructuredConverter.FromUnstructured(u.Object, obj); err != nil {
				gvkErrors = appendResourceError(gvkErrors, u, err.Error())
				continue
			}
			if obj.Status.Phase != corev1.PodRunning && obj.Status.Phase != corev1.PodSucceeded {
				gvkErrors = appendResourceError(gvkErrors, obj, "Pod is not Running or Succeeded")
			}
		case apiregistrationv1.SchemeGroupVersion.WithKind("APIService"):
			// Check if the APIService is available.
			obj := &apiregistrationv1.APIService{}
			if err := runtime.DefaultUnstructuredConverter.FromUnstructured(u.Object, obj); err != nil {
				gvkErrors = appendResourceError(gvkErrors, u, err.Error())
				continue
			}
			conditionExists := false
			for _, condition := range obj.Status.Conditions {
				if condition.Type == apiregistrationv1.Available {
					if condition.Status != apiregistrationv1.ConditionTrue {
						gvkErrors = appendResourceError(gvkErrors, obj, condition.Message)
					}
					conditionExists = true
					break
				}
			}
			if conditionExists {
				continue
			}
			// If we are here we didn't find the "Available" condition, so we assume the APIService is non healthy.
			gvkErrors = appendResourceError(gvkErrors, obj, "Available condition not found")
		case apiextensionsv1.SchemeGroupVersion.WithKind("CustomResourceDefinition"):
			// Check if the CRD is established.
			obj := &apiextensionsv1.CustomResourceDefinition{}
			if err := runtime.DefaultUnstructuredConverter.FromUnstructured(u.Object, obj); err != nil {
				gvkErrors = appendResourceError(gvkErrors, u, err.Error())
				continue
			}
			conditionExists := false
			for _, condition := range obj.Status.Conditions {
				if condition.Type == apiextensionsv1.Established {
					if condition.Status != apiextensionsv1.ConditionTrue {
						gvkErrors = appendResourceError(gvkErrors, obj, condition.Message)
					}
					conditionExists = true
					break
				}
			}
			if conditionExists {
				continue
			}
			// A missing Established condition is treated as unhealthy.
			gvkErrors = appendResourceError(gvkErrors, obj, "Established condition not found")
		default:
			// If we don't know how to check the health of the object, we assume it's healthy.
			continue
		}
	}

	// errors.Join returns nil when no errors were collected.
	return errors.Join(gvkErrors...)
}

// toErrKey builds the identifier used to label a resource in health-check
// error messages. The key has the form "(GVK)(namespace/name)" when the
// resource reports a namespace, and "(GVK)(name)" otherwise.
func toErrKey(resource client.Object) string {
	gvk := resource.GetObjectKind().GroupVersionKind().String()

	// Prefix the name with the namespace only when one is set.
	id := resource.GetName()
	if ns := resource.GetNamespace(); ns != "" {
		id = ns + "/" + id
	}

	return fmt.Sprintf("(%s)(%s)", gvk, id)
}

// appendResourceError returns gvkErrors extended with a new error whose text
// is the resource's error key (see toErrKey), followed by ": " and message.
func appendResourceError(gvkErrors []error, resource client.Object, message string) []error {
	description := toErrKey(resource) + ": " + message
	return append(gvkErrors, errors.New(description))
}
Loading

0 comments on commit 6dbf5ed

Please sign in to comment.