diff --git a/pkg/addonmanager/constants/constants.go b/pkg/addonmanager/constants/constants.go index 0aad08816..e2ea4e2bf 100644 --- a/pkg/addonmanager/constants/constants.go +++ b/pkg/addonmanager/constants/constants.go @@ -1,5 +1,7 @@ package constants +import "fmt" + const ( // AddonLabel is the label for addon AddonLabel = "open-cluster-management.io/addon-name" @@ -13,3 +15,8 @@ const ( // PreDeleteHookFinalizer is the finalizer for an addon which has deployed hook objects PreDeleteHookFinalizer = "cluster.open-cluster-management.io/addon-pre-delete" ) + +// DeployWorkName return the name of work for the addon +func DeployWorkName(addonName string) string { + return fmt.Sprintf("addon-%s-deploy", addonName) +} diff --git a/pkg/addonmanager/controllers/addonhealthcheck/controller.go b/pkg/addonmanager/controllers/addonhealthcheck/controller.go index 56af4a699..ca7b23780 100644 --- a/pkg/addonmanager/controllers/addonhealthcheck/controller.go +++ b/pkg/addonmanager/controllers/addonhealthcheck/controller.go @@ -6,23 +6,29 @@ import ( "github.com/openshift/library-go/pkg/controller/factory" "github.com/openshift/library-go/pkg/operator/events" + "k8s.io/apimachinery/pkg/api/equality" "k8s.io/apimachinery/pkg/api/errors" "k8s.io/apimachinery/pkg/api/meta" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/runtime" "k8s.io/client-go/tools/cache" "k8s.io/klog/v2" + "open-cluster-management.io/addon-framework/pkg/addonmanager/constants" "open-cluster-management.io/addon-framework/pkg/agent" addonapiv1alpha1 "open-cluster-management.io/api/addon/v1alpha1" addonv1alpha1client "open-cluster-management.io/api/client/addon/clientset/versioned" addoninformerv1alpha1 "open-cluster-management.io/api/client/addon/informers/externalversions/addon/v1alpha1" addonlisterv1alpha1 "open-cluster-management.io/api/client/addon/listers/addon/v1alpha1" + workinformers "open-cluster-management.io/api/client/work/informers/externalversions/work/v1" + worklister "open-cluster-management.io/api/client/work/listers/work/v1" + workapiv1 "open-cluster-management.io/api/work/v1" ) // addonHealthCheckController reconciles instances of ManagedClusterAddon on the hub. type addonHealthCheckController struct { addonClient addonv1alpha1client.Interface managedClusterAddonLister addonlisterv1alpha1.ManagedClusterAddOnLister + workLister worklister.ManifestWorkLister agentAddons map[string]agent.AgentAddon eventRecorder events.Recorder } @@ -30,14 +36,16 @@ type addonHealthCheckController struct { func NewAddonHealthCheckController( addonClient addonv1alpha1client.Interface, addonInformers addoninformerv1alpha1.ManagedClusterAddOnInformer, + workInformers workinformers.ManifestWorkInformer, agentAddons map[string]agent.AgentAddon, recorder events.Recorder, ) factory.Controller { c := &addonHealthCheckController{ addonClient: addonClient, managedClusterAddonLister: addonInformers.Lister(), + workLister: workInformers.Lister(), agentAddons: agentAddons, - eventRecorder: recorder.WithComponentSuffix(fmt.Sprintf("addon-healthcheck-controller")), + eventRecorder: recorder.WithComponentSuffix("addon-healthcheck-controller"), } return factory.New().WithFilteredEventsInformersQueueKeyFunc( @@ -53,8 +61,34 @@ func NewAddonHealthCheckController( return true }, addonInformers.Informer()). 
+ WithFilteredEventsInformersQueueKeyFunc( + func(obj runtime.Object) string { + accessor, _ := meta.Accessor(obj) + return fmt.Sprintf("%s/%s", accessor.GetNamespace(), accessor.GetLabels()[constants.AddonLabel]) + }, + func(obj interface{}) bool { + accessor, _ := meta.Accessor(obj) + if accessor.GetLabels() == nil { + return false + } + + addonName, ok := accessor.GetLabels()[constants.AddonLabel] + if !ok { + return false + } + + if _, ok := c.agentAddons[addonName]; !ok { + return false + } + if accessor.GetName() != constants.DeployWorkName(addonName) { + return false + } + return true + }, + workInformers.Informer(), + ). WithSync(c.sync). - ToController(fmt.Sprintf("addon-healthcheck-controller"), recorder) + ToController("addon-healthcheck-controller", recorder) } func (c *addonHealthCheckController) sync(ctx context.Context, syncCtx factory.SyncContext) error { @@ -73,44 +107,172 @@ func (c *addonHealthCheckController) sync(ctx context.Context, syncCtx factory.S return err } - for addonName, addon := range c.agentAddons { - if addon.GetAgentAddonOptions().HealthProber == nil { - continue - } - return c.syncAddonHealthChecker(ctx, managedClusterAddon, addonName, clusterName) + agentAddon := c.agentAddons[addonName] + if agentAddon == nil { + return nil } - return nil + return c.syncAddonHealthChecker(ctx, managedClusterAddon, agentAddon) } -func (c *addonHealthCheckController) syncAddonHealthChecker(ctx context.Context, addon *addonapiv1alpha1.ManagedClusterAddOn, addonName, clusterName string) error { +func (c *addonHealthCheckController) syncAddonHealthChecker(ctx context.Context, addon *addonapiv1alpha1.ManagedClusterAddOn, agentAddon agent.AgentAddon) error { // for in-place edit addon = addon.DeepCopy() // reconcile health check mode var expectedHealthCheckMode addonapiv1alpha1.HealthCheckMode - agentAddon := c.agentAddons[addonName] - if agentAddon != nil && agentAddon.GetAgentAddonOptions().HealthProber != nil { - switch c.agentAddons[addonName].GetAgentAddonOptions().HealthProber.Type { - // TODO(yue9944882): implement work api health checker - //case agent.HealthProberTypeWork: - //fallthrough - case agent.HealthProberTypeNone: - expectedHealthCheckMode = addonapiv1alpha1.HealthCheckModeCustomized - case agent.HealthProberTypeLease: - fallthrough - default: - expectedHealthCheckMode = addonapiv1alpha1.HealthCheckModeLease - } + + if agentAddon.GetAgentAddonOptions().HealthProber == nil { + return nil + } + + switch agentAddon.GetAgentAddonOptions().HealthProber.Type { + case agent.HealthProberTypeWork: + fallthrough + case agent.HealthProberTypeNone: + expectedHealthCheckMode = addonapiv1alpha1.HealthCheckModeCustomized + case agent.HealthProberTypeLease: + fallthrough + default: + expectedHealthCheckMode = addonapiv1alpha1.HealthCheckModeLease } + if expectedHealthCheckMode != addon.Status.HealthCheck.Mode { addon.Status.HealthCheck.Mode = expectedHealthCheckMode c.eventRecorder.Eventf("HealthCheckModeUpdated", "Updated health check mode to %s", expectedHealthCheckMode) - _, err := c.addonClient.AddonV1alpha1().ManagedClusterAddOns(clusterName). + _, err := c.addonClient.AddonV1alpha1().ManagedClusterAddOns(addon.Namespace). 
UpdateStatus(ctx, addon, metav1.UpdateOptions{}) if err != nil { return err } } + return c.probeAddonStatus(ctx, addon, agentAddon) +} + +func (c *addonHealthCheckController) probeAddonStatus(ctx context.Context, addon *addonapiv1alpha1.ManagedClusterAddOn, agentAddon agent.AgentAddon) error { + if agentAddon.GetAgentAddonOptions().HealthProber == nil { + return nil + } + + if agentAddon.GetAgentAddonOptions().HealthProber.Type != agent.HealthProberTypeWork { + return nil + } + + addonWork, err := c.workLister.ManifestWorks(addon.Namespace).Get(constants.DeployWorkName(addon.Name)) + if err != nil { + cond := metav1.Condition{ + Type: "Available", + Status: metav1.ConditionUnknown, + Reason: "WorkNotFound", + Message: "Work for addon is not found", + } + return c.updateConditions(ctx, addon, cond) + } + + // Check the overall work available condition at first. + workCond := meta.FindStatusCondition(addonWork.Status.Conditions, workapiv1.WorkAvailable) + switch { + case workCond == nil: + cond := metav1.Condition{ + Type: "Available", + Status: metav1.ConditionUnknown, + Reason: "WorkNotApplied", + Message: "Work is not applied yet", + } + return c.updateConditions(ctx, addon, cond) + case workCond.Status == metav1.ConditionFalse: + cond := metav1.Condition{ + Type: "Available", + Status: metav1.ConditionFalse, + Reason: "WorkApplyFailed", + Message: workCond.Message, + } + return c.updateConditions(ctx, addon, cond) + } + + if agentAddon.GetAgentAddonOptions().HealthProber.WorkProber == nil { + cond := metav1.Condition{ + Type: "Available", + Status: metav1.ConditionTrue, + Reason: "WorkApplied", + Message: "Addon work is applied", + } + return c.updateConditions(ctx, addon, cond) + } + + probeFields := agentAddon.GetAgentAddonOptions().HealthProber.WorkProber.ProbeFields + + for _, field := range probeFields { + result := findResultByIdentifier(field.ResourceIdentifier, addonWork) + // if no results are returned. it is possible that work agent has not returned the feedback value. 
+ // mark condition to unknown + if result == nil { + cond := metav1.Condition{ + Type: "Available", + Status: metav1.ConditionUnknown, + Reason: "NoProbeResult", + Message: "Probe results are not returned", + } + return c.updateConditions(ctx, addon, cond) + } + + err := agentAddon.GetAgentAddonOptions().HealthProber.WorkProber.HealthCheck(field.ResourceIdentifier, *result) + if err != nil { + cond := metav1.Condition{ + Type: "Available", + Status: metav1.ConditionFalse, + Reason: "ProbeUnavailable", + Message: fmt.Sprintf("Probe addon unavailable with err %v", err), + } + return c.updateConditions(ctx, addon, cond) + } + } + + cond := metav1.Condition{ + Type: "Available", + Status: metav1.ConditionTrue, + Reason: "ProbeAvailable", + Message: "Addon is available", + } + return c.updateConditions(ctx, addon, cond) +} + +func (c *addonHealthCheckController) updateConditions(ctx context.Context, addon *addonapiv1alpha1.ManagedClusterAddOn, conds ...metav1.Condition) error { + addonCopy := addon.DeepCopy() + + for _, cond := range conds { + meta.SetStatusCondition(&addonCopy.Status.Conditions, cond) + } + + if equality.Semantic.DeepEqual(addon.Status.Conditions, addonCopy.Status.Conditions) { + return nil + } + + _, err := c.addonClient.AddonV1alpha1().ManagedClusterAddOns(addonCopy.Namespace).UpdateStatus(ctx, addonCopy, metav1.UpdateOptions{}) + return err +} + +func findResultByIdentifier(identifier workapiv1.ResourceIdentifier, work *workapiv1.ManifestWork) *workapiv1.StatusFeedbackResult { + for _, status := range work.Status.ResourceStatus.Manifests { + if identifier.Group != status.ResourceMeta.Group { + continue + } + if identifier.Resource != status.ResourceMeta.Resource { + continue + } + if identifier.Name != status.ResourceMeta.Name { + continue + } + if identifier.Namespace != status.ResourceMeta.Namespace { + continue + } + + if len(status.StatusFeedbacks.Values) == 0 { + return nil + } + + return &status.StatusFeedbacks + } + return nil } diff --git a/pkg/addonmanager/controllers/addonhealthcheck/controller_test.go b/pkg/addonmanager/controllers/addonhealthcheck/controller_test.go index 22be33741..211e837f7 100644 --- a/pkg/addonmanager/controllers/addonhealthcheck/controller_test.go +++ b/pkg/addonmanager/controllers/addonhealthcheck/controller_test.go @@ -2,19 +2,26 @@ package addonhealthcheck import ( "context" + "fmt" "testing" "time" "github.com/openshift/library-go/pkg/operator/events/eventstesting" + "k8s.io/apimachinery/pkg/api/meta" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/runtime" clienttesting "k8s.io/client-go/testing" "k8s.io/client-go/tools/cache" "open-cluster-management.io/addon-framework/pkg/addonmanager/addontesting" + "open-cluster-management.io/addon-framework/pkg/addonmanager/constants" "open-cluster-management.io/addon-framework/pkg/agent" addonapiv1alpha1 "open-cluster-management.io/api/addon/v1alpha1" fakeaddon "open-cluster-management.io/api/client/addon/clientset/versioned/fake" addoninformers "open-cluster-management.io/api/client/addon/informers/externalversions" + fakework "open-cluster-management.io/api/client/work/clientset/versioned/fake" + workinformers "open-cluster-management.io/api/client/work/informers/externalversions" clusterv1 "open-cluster-management.io/api/cluster/v1" + workapiv1 "open-cluster-management.io/api/work/v1" ) type testAgent struct { @@ -134,3 +141,237 @@ func NewAddonWithHealthCheck(name, namespace string, mode addonapiv1alpha1.Healt addon.Status.HealthCheck = 
addonapiv1alpha1.HealthCheck{Mode: mode}
 	return addon
 }
+
+func TestReconcileWithWork(t *testing.T) {
+	addon := NewAddonWithHealthCheck("test", "cluster1", addonapiv1alpha1.HealthCheckModeCustomized)
+
+	fakeAddonClient := fakeaddon.NewSimpleClientset(addon)
+	fakeWorkClient := fakework.NewSimpleClientset()
+
+	addonInformers := addoninformers.NewSharedInformerFactory(fakeAddonClient, 10*time.Minute)
+	workInformers := workinformers.NewSharedInformerFactory(fakeWorkClient, 10*time.Minute)
+
+	addonInformers.Addon().V1alpha1().ManagedClusterAddOns().Informer().GetStore().Add(addon)
+
+	testaddon := &testAgent{
+		name: "test",
+		health: &agent.HealthProber{
+			Type: agent.HealthProberTypeWork,
+		},
+	}
+
+	controller := addonHealthCheckController{
+		addonClient:               fakeAddonClient,
+		managedClusterAddonLister: addonInformers.Addon().V1alpha1().ManagedClusterAddOns().Lister(),
+		workLister:                workInformers.Work().V1().ManifestWorks().Lister(),
+		agentAddons:               map[string]agent.AgentAddon{testaddon.name: testaddon},
+		eventRecorder:             eventstesting.NewTestingEventRecorder(t),
+	}
+
+	key, _ := cache.MetaNamespaceKeyFunc(addon)
+	syncContext := addontesting.NewFakeSyncContext(t, key)
+	err := controller.sync(context.TODO(), syncContext)
+	if err != nil {
+		t.Errorf("expected no error when sync: %v", err)
+	}
+
+	addontesting.AssertActions(t, fakeAddonClient.Actions(), "update")
+	actual := fakeAddonClient.Actions()[0].(clienttesting.UpdateActionImpl).Object
+	addOn := actual.(*addonapiv1alpha1.ManagedClusterAddOn)
+	if meta.IsStatusConditionTrue(addOn.Status.Conditions, "Available") {
+		t.Errorf("addon condition should be unavailable: %v", addOn.Status.Conditions)
+	}
+
+	fakeAddonClient.ClearActions()
+	work := &workapiv1.ManifestWork{
+		ObjectMeta: metav1.ObjectMeta{
+			Namespace: addon.Namespace,
+			Name:      constants.DeployWorkName(addon.Name),
+		},
+	}
+
+	workInformers.Work().V1().ManifestWorks().Informer().GetStore().Add(work)
+
+	err = controller.sync(context.TODO(), syncContext)
+	if err != nil {
+		t.Errorf("expected no error when sync: %v", err)
+	}
+
+	addontesting.AssertActions(t, fakeAddonClient.Actions(), "update")
+	actual = fakeAddonClient.Actions()[0].(clienttesting.UpdateActionImpl).Object
+	addOn = actual.(*addonapiv1alpha1.ManagedClusterAddOn)
+
+	cond := meta.FindStatusCondition(addOn.Status.Conditions, "Available")
+	if cond == nil || cond.Status != metav1.ConditionUnknown {
+		t.Errorf("addon condition should be unknown: %v", addOn.Status.Conditions)
+	}
+
+	work.Status = workapiv1.ManifestWorkStatus{
+		Conditions: []metav1.Condition{
+			{
+				Type:   workapiv1.WorkAvailable,
+				Status: metav1.ConditionTrue,
+			},
+		},
+	}
+
+	fakeAddonClient.ClearActions()
+	workInformers.Work().V1().ManifestWorks().Informer().GetStore().Update(work)
+
+	err = controller.sync(context.TODO(), syncContext)
+	if err != nil {
+		t.Errorf("expected no error when sync: %v", err)
+	}
+
+	addontesting.AssertActions(t, fakeAddonClient.Actions(), "update")
+	actual = fakeAddonClient.Actions()[0].(clienttesting.UpdateActionImpl).Object
+	addOn = actual.(*addonapiv1alpha1.ManagedClusterAddOn)
+
+	cond = meta.FindStatusCondition(addOn.Status.Conditions, "Available")
+	if cond == nil || cond.Status != metav1.ConditionTrue {
+		t.Errorf("addon condition should be available: %v", addOn.Status.Conditions)
+	}
+}
+
+type testProbe struct {
+	checkError error
+}
+
+func (p *testProbe) ProbeFields() []agent.ProbeField {
+	return []agent.ProbeField{
+		{
+			ResourceIdentifier: workapiv1.ResourceIdentifier{
+				Resource:  "tests",
+				Name:      "test",
+				Namespace: "testns",
+			},
+			ProbeRules: []workapiv1.FeedbackRule{
+				{
+					Type: workapiv1.WellKnownStatusType,
+				},
+			},
+		},
+	}
+}
+
+// HealthCheck checks the status of the addon based on the probe result.
+func (p *testProbe) HealthCheck(workapiv1.ResourceIdentifier, workapiv1.StatusFeedbackResult) error {
+	return p.checkError
+}
+
+func TestReconcileWithProbe(t *testing.T) {
+	addon := NewAddonWithHealthCheck("test", "cluster1", addonapiv1alpha1.HealthCheckModeCustomized)
+	work := &workapiv1.ManifestWork{
+		ObjectMeta: metav1.ObjectMeta{
+			Namespace: addon.Namespace,
+			Name:      constants.DeployWorkName(addon.Name),
+		},
+		Status: workapiv1.ManifestWorkStatus{
+			Conditions: []metav1.Condition{
+				{
+					Type:   workapiv1.WorkAvailable,
+					Status: metav1.ConditionTrue,
+				},
+			},
+		},
+	}
+
+	fakeAddonClient := fakeaddon.NewSimpleClientset(addon)
+	fakeWorkClient := fakework.NewSimpleClientset(work)
+
+	addonInformers := addoninformers.NewSharedInformerFactory(fakeAddonClient, 10*time.Minute)
+	workInformers := workinformers.NewSharedInformerFactory(fakeWorkClient, 10*time.Minute)
+
+	addonInformers.Addon().V1alpha1().ManagedClusterAddOns().Informer().GetStore().Add(addon)
+	workInformers.Work().V1().ManifestWorks().Informer().GetStore().Add(work)
+
+	prober := &testProbe{
+		checkError: fmt.Errorf("health check fails"),
+	}
+
+	testaddon := &testAgent{
+		name: "test",
+		health: &agent.HealthProber{
+			Type: agent.HealthProberTypeWork,
+			WorkProber: &agent.WorkHealthProber{
+				ProbeFields: prober.ProbeFields(),
+				HealthCheck: prober.HealthCheck,
+			},
+		},
+	}
+
+	controller := addonHealthCheckController{
+		addonClient:               fakeAddonClient,
+		managedClusterAddonLister: addonInformers.Addon().V1alpha1().ManagedClusterAddOns().Lister(),
+		workLister:                workInformers.Work().V1().ManifestWorks().Lister(),
+		agentAddons:               map[string]agent.AgentAddon{testaddon.name: testaddon},
+		eventRecorder:             eventstesting.NewTestingEventRecorder(t),
+	}
+
+	key, _ := cache.MetaNamespaceKeyFunc(addon)
+	syncContext := addontesting.NewFakeSyncContext(t, key)
+	err := controller.sync(context.TODO(), syncContext)
+	if err != nil {
+		t.Errorf("expected no error when sync: %v", err)
+	}
+
+	// return unknown if no status is found
+	addontesting.AssertActions(t, fakeAddonClient.Actions(), "update")
+	actual := fakeAddonClient.Actions()[0].(clienttesting.UpdateActionImpl).Object
+	addOn := actual.(*addonapiv1alpha1.ManagedClusterAddOn)
+	cond := meta.FindStatusCondition(addOn.Status.Conditions, "Available")
+	if cond == nil || cond.Status != metav1.ConditionUnknown {
+		t.Errorf("addon condition should be unknown: %v", addOn.Status.Conditions)
+	}
+
+	work.Status.ResourceStatus = workapiv1.ManifestResourceStatus{
+		Manifests: []workapiv1.ManifestCondition{
+			{
+				ResourceMeta: workapiv1.ManifestResourceMeta{
+					Resource:  "tests",
+					Name:      "test",
+					Namespace: "testns",
+				},
+				StatusFeedbacks: workapiv1.StatusFeedbackResult{
+					Values: []workapiv1.FeedbackValue{
+						{
+							Name: "noop",
+						},
+					},
+				},
+			},
+		},
+	}
+
+	fakeAddonClient.ClearActions()
+	workInformers.Work().V1().ManifestWorks().Informer().GetStore().Update(work)
+
+	err = controller.sync(context.TODO(), syncContext)
+	if err != nil {
+		t.Errorf("expected no error when sync: %v", err)
+	}
+
+	// return unavailable if check returns an error
+	addontesting.AssertActions(t, fakeAddonClient.Actions(), "update")
+	actual = fakeAddonClient.Actions()[0].(clienttesting.UpdateActionImpl).Object
+	addOn = actual.(*addonapiv1alpha1.ManagedClusterAddOn)
+
+	if
!meta.IsStatusConditionFalse(addOn.Status.Conditions, "Available") { + t.Errorf("addon condition should be unavailable: %v", addOn.Status.Conditions) + } + + prober.checkError = nil + fakeAddonClient.ClearActions() + err = controller.sync(context.TODO(), syncContext) + if err != nil { + t.Errorf("expected no error when sync: %v", err) + } + + // return available if check returns nil + addontesting.AssertActions(t, fakeAddonClient.Actions(), "update") + actual = fakeAddonClient.Actions()[0].(clienttesting.UpdateActionImpl).Object + addOn = actual.(*addonapiv1alpha1.ManagedClusterAddOn) + if !meta.IsStatusConditionTrue(addOn.Status.Conditions, "Available") { + t.Errorf("addon condition should be available: %v", addOn.Status.Conditions) + } +} diff --git a/pkg/addonmanager/controllers/agentdeploy/controller.go b/pkg/addonmanager/controllers/agentdeploy/controller.go index b6a5c65af..84bb60970 100644 --- a/pkg/addonmanager/controllers/agentdeploy/controller.go +++ b/pkg/addonmanager/controllers/agentdeploy/controller.go @@ -92,7 +92,7 @@ func NewAddonDeployController( if _, ok := c.agentAddons[addonName]; !ok { return false } - if accessor.GetName() != deployWorkName(addonName) { + if accessor.GetName() != constants.DeployWorkName(addonName) { return false } return true @@ -120,7 +120,7 @@ func (c *addonDeployController) sync(ctx context.Context, syncCtx factory.SyncCo // Get ManagedCluster managedCluster, err := c.managedClusterLister.Get(clusterName) if errors.IsNotFound(err) { - c.cache.removeCache(deployWorkName(addonName), clusterName) + c.cache.removeCache(constants.DeployWorkName(addonName), clusterName) return nil } if err != nil { @@ -134,7 +134,7 @@ func (c *addonDeployController) sync(ctx context.Context, syncCtx factory.SyncCo managedClusterAddon, err := c.managedClusterAddonLister.ManagedClusterAddOns(clusterName).Get(addonName) if errors.IsNotFound(err) { - c.cache.removeCache(deployWorkName(addonName), clusterName) + c.cache.removeCache(constants.DeployWorkName(addonName), clusterName) return nil } if err != nil { @@ -142,7 +142,7 @@ func (c *addonDeployController) sync(ctx context.Context, syncCtx factory.SyncCo } if !managedClusterAddon.DeletionTimestamp.IsZero() { - c.cache.removeCache(deployWorkName(addonName), clusterName) + c.cache.removeCache(constants.DeployWorkName(addonName), clusterName) return nil } @@ -163,6 +163,8 @@ func (c *addonDeployController) sync(ctx context.Context, syncCtx factory.SyncCo } work.OwnerReferences = []metav1.OwnerReference{*owner} + c.setStatusFeedbackRule(work, agentAddon) + // apply work work, err = applyWork(c.workClient, c.workLister, c.cache, c.eventRecorder, ctx, work) if err != nil { @@ -202,3 +204,28 @@ func (c *addonDeployController) sync(ctx context.Context, syncCtx factory.SyncCo ctx, managedClusterAddonCopy, metav1.UpdateOptions{}) return err } + +func (c *addonDeployController) setStatusFeedbackRule(work *workapiv1.ManifestWork, agentAddon agent.AgentAddon) { + if agentAddon.GetAgentAddonOptions().HealthProber == nil { + return + } + + if agentAddon.GetAgentAddonOptions().HealthProber.Type != agent.HealthProberTypeWork { + return + } + + if agentAddon.GetAgentAddonOptions().HealthProber.WorkProber == nil { + return + } + + probeRules := agentAddon.GetAgentAddonOptions().HealthProber.WorkProber.ProbeFields + + work.Spec.ManifestConfigs = []workapiv1.ManifestConfigOption{} + + for _, rule := range probeRules { + work.Spec.ManifestConfigs = append(work.Spec.ManifestConfigs, workapiv1.ManifestConfigOption{ + ResourceIdentifier: 
rule.ResourceIdentifier,
+			FeedbackRules:      rule.ProbeRules,
+		})
+	}
+}
diff --git a/pkg/addonmanager/controllers/agentdeploy/utils.go b/pkg/addonmanager/controllers/agentdeploy/utils.go
index 6c19ee6f8..4c64dcac1 100644
--- a/pkg/addonmanager/controllers/agentdeploy/utils.go
+++ b/pkg/addonmanager/controllers/agentdeploy/utils.go
@@ -16,10 +16,6 @@ import (
 	workapiv1 "open-cluster-management.io/api/work/v1"
 )
 
-func deployWorkName(addonName string) string {
-	return fmt.Sprintf("addon-%s-deploy", addonName)
-}
-
 func preDeleteHookWorkName(addonName string) string {
 	return fmt.Sprintf("addon-%s-pre-delete", addonName)
 }
@@ -154,7 +150,7 @@ func buildManifestWorkFromObject(
 		}
 	}
 
-	deployManifestWork = newManifestWork(deployWorkName(addonName), addonName, cluster, deployManifests)
+	deployManifestWork = newManifestWork(constants.DeployWorkName(addonName), addonName, cluster, deployManifests)
 	hookManifestWork = newManifestWork(preDeleteHookWorkName(addonName), addonName, cluster, hookManifests)
 	if hookManifestWork != nil {
 		hookManifestWork.Spec.ManifestConfigs = manifestConfigs
diff --git a/pkg/addonmanager/manager.go b/pkg/addonmanager/manager.go
index c0f5eba0f..223098802 100644
--- a/pkg/addonmanager/manager.go
+++ b/pkg/addonmanager/manager.go
@@ -46,10 +46,10 @@ type addonManager struct {
 func (a *addonManager) AddAgent(addon agent.AgentAddon) error {
 	addonOption := addon.GetAgentAddonOptions()
 	if len(addonOption.AddonName) == 0 {
-		return fmt.Errorf("Addon name should be set")
+		return fmt.Errorf("addon name should be set")
 	}
 	if _, ok := a.addonAgents[addonOption.AddonName]; ok {
-		return fmt.Errorf("An agent is added for the addon already")
+		return fmt.Errorf("an agent is added for the addon already")
 	}
 	a.addonAgents[addonOption.AddonName] = addon
 	return nil
@@ -188,6 +188,7 @@ func (a *addonManager) Start(ctx context.Context) error {
 	addonHealthCheckController := addonhealthcheck.NewAddonHealthCheckController(
 		addonClient,
 		addonInformers.Addon().V1alpha1().ManagedClusterAddOns(),
+		workInformers.Work().V1().ManifestWorks(),
 		a.addonAgents,
 		eventRecorder)

diff --git a/pkg/agent/inteface.go b/pkg/agent/inteface.go
index 5d211c6bc..6aade072f 100644
--- a/pkg/agent/inteface.go
+++ b/pkg/agent/inteface.go
@@ -8,6 +8,7 @@ import (
 	"k8s.io/apimachinery/pkg/runtime"
 	addonapiv1alpha1 "open-cluster-management.io/api/addon/v1alpha1"
 	clusterv1 "open-cluster-management.io/api/cluster/v1"
+	workapiv1 "open-cluster-management.io/api/work/v1"
 )
 
 // AgentAddon is a mandatory interface for implementing a custom addon.
@@ -127,6 +128,27 @@ type InstallStrategy struct {
 
 type HealthProber struct {
 	Type HealthProberType
+
+	WorkProber *WorkHealthProber
+}
+
+type AddonHealthCheckFunc func(workapiv1.ResourceIdentifier, workapiv1.StatusFeedbackResult) error
+
+type WorkHealthProber struct {
+	// ProbeFields tells the addon framework which fields to probe
+	ProbeFields []ProbeField
+
+	// HealthCheck checks the status of the addon based on the probe result.
+	HealthCheck AddonHealthCheckFunc
+}
+
+// ProbeField defines the field of a resource to be probed
+type ProbeField struct {
+	// ResourceIdentifier sets which resource should be probed
+	ResourceIdentifier workapiv1.ResourceIdentifier
+
+	// ProbeRules sets the rules to probe the field
+	ProbeRules []workapiv1.FeedbackRule
 }
 
 type HealthProberType string
@@ -140,12 +162,12 @@ const (
 	// Note that the lease object is expected to periodically refresh by a local agent
 	// deployed in the managed cluster implementing lease.LeaseUpdater interface.
HealthProberTypeLease HealthProberType = "Lease" - // TODO(yue9944882): implement work api health checker // HealthProberTypeWork indicates the healthiness of the addon is equal to the overall // dispatching status of the corresponding ManifestWork resource. // It's applicable to those addons that don't have a local agent instance in the managed - // clusters. - //HealthProberTypeWork HealthProberType = "Work" + // clusters. The addon framework will check if the work is Available on the spoke. In addition + // user can define a prober to check more detailed status based on status feedback from work. + HealthProberTypeWork HealthProberType = "Work" ) func KubeClientSignerConfigurations(addonName, agentName string) func(cluster *clusterv1.ManagedCluster) []addonapiv1alpha1.RegistrationConfig { diff --git a/pkg/utils/permission.go b/pkg/utils/permission.go index d8aba04b7..39fc02969 100644 --- a/pkg/utils/permission.go +++ b/pkg/utils/permission.go @@ -11,7 +11,6 @@ import ( "k8s.io/utils/pointer" "open-cluster-management.io/addon-framework/pkg/agent" addonapiv1alpha1 "open-cluster-management.io/api/addon/v1alpha1" - addonv1alpha1 "open-cluster-management.io/api/addon/v1alpha1" clusterv1 "open-cluster-management.io/api/cluster/v1" ) @@ -187,10 +186,10 @@ func (b *unionPermissionBuilder) build() agent.PermissionConfigFunc { } } -func ensureAddonOwnerReference(metadata *metav1.ObjectMeta, addon *addonv1alpha1.ManagedClusterAddOn) { +func ensureAddonOwnerReference(metadata *metav1.ObjectMeta, addon *addonapiv1alpha1.ManagedClusterAddOn) { metadata.OwnerReferences = []metav1.OwnerReference{ { - APIVersion: addonv1alpha1.GroupVersion.String(), + APIVersion: addonapiv1alpha1.GroupVersion.String(), Kind: "ManagedClusterAddOn", Name: addon.Name, BlockOwnerDeletion: pointer.Bool(true), diff --git a/pkg/utils/probe_helper.go b/pkg/utils/probe_helper.go new file mode 100644 index 000000000..7982f1857 --- /dev/null +++ b/pkg/utils/probe_helper.go @@ -0,0 +1,79 @@ +package utils + +import ( + "fmt" + + "k8s.io/apimachinery/pkg/types" + "open-cluster-management.io/addon-framework/pkg/agent" + workapiv1 "open-cluster-management.io/api/work/v1" +) + +// DeploymentProber is to check the addon status based on status +// of the agent deployment status +type DeploymentProber struct { + deployments []types.NamespacedName +} + +func NewDeploymentProber(deployments ...types.NamespacedName) *agent.HealthProber { + probeFields := []agent.ProbeField{} + for _, deploy := range deployments { + probeFields = append(probeFields, agent.ProbeField{ + ResourceIdentifier: workapiv1.ResourceIdentifier{ + Group: "apps", + Resource: "deployments", + Name: deploy.Name, + Namespace: deploy.Namespace, + }, + ProbeRules: []workapiv1.FeedbackRule{ + { + Type: workapiv1.WellKnownStatusType, + }, + }, + }) + } + return &agent.HealthProber{ + Type: agent.HealthProberTypeWork, + WorkProber: &agent.WorkHealthProber{ + ProbeFields: probeFields, + HealthCheck: HealthCheck, + }, + } +} + +func (d *DeploymentProber) ProbeFields() []agent.ProbeField { + probeFields := []agent.ProbeField{} + for _, deploy := range d.deployments { + probeFields = append(probeFields, agent.ProbeField{ + ResourceIdentifier: workapiv1.ResourceIdentifier{ + Group: "apps", + Resource: "deployments", + Name: deploy.Name, + Namespace: deploy.Namespace, + }, + ProbeRules: []workapiv1.FeedbackRule{ + { + Type: workapiv1.WellKnownStatusType, + }, + }, + }) + } + return probeFields +} + +func HealthCheck(identifier workapiv1.ResourceIdentifier, result 
workapiv1.StatusFeedbackResult) error {
+	if len(result.Values) == 0 {
+		return fmt.Errorf("no values are probed for deployment %s/%s", identifier.Namespace, identifier.Name)
+	}
+	for _, value := range result.Values {
+		if value.Name != "ReadyReplicas" {
+			continue
+		}
+
+		if *value.Value.Integer >= 1 {
+			return nil
+		}
+
+		return fmt.Errorf("readyReplicas is %d for deployment %s/%s", *value.Value.Integer, identifier.Namespace, identifier.Name)
+	}
+	return fmt.Errorf("readyReplicas is not probed")
+}
diff --git a/pkg/utils/probe_helper_test.go b/pkg/utils/probe_helper_test.go
new file mode 100644
index 000000000..cd76be3c2
--- /dev/null
+++ b/pkg/utils/probe_helper_test.go
@@ -0,0 +1,103 @@
+package utils
+
+import (
+	"testing"
+
+	"k8s.io/apimachinery/pkg/types"
+	workapiv1 "open-cluster-management.io/api/work/v1"
+)
+
+func int64Ptr(n int64) *int64 {
+	return &n
+}
+
+func TestDeploymentProbe(t *testing.T) {
+	cases := []struct {
+		name        string
+		result      workapiv1.StatusFeedbackResult
+		expectedErr bool
+	}{
+		{
+			name:        "no result",
+			result:      workapiv1.StatusFeedbackResult{},
+			expectedErr: true,
+		},
+		{
+			name: "no matched value",
+			result: workapiv1.StatusFeedbackResult{
+				Values: []workapiv1.FeedbackValue{
+					{
+						Name: "Replicas",
+						Value: workapiv1.FieldValue{
+							Integer: int64Ptr(1),
+						},
+					},
+					{
+						Name: "AvailableReplicas",
+						Value: workapiv1.FieldValue{
+							Integer: int64Ptr(1),
+						},
+					},
+				},
+			},
+			expectedErr: true,
+		},
+		{
+			name: "check failed with 0 replica",
+			result: workapiv1.StatusFeedbackResult{
+				Values: []workapiv1.FeedbackValue{
+					{
+						Name: "Replicas",
+						Value: workapiv1.FieldValue{
+							Integer: int64Ptr(1),
+						},
+					},
+					{
+						Name: "ReadyReplicas",
+						Value: workapiv1.FieldValue{
+							Integer: int64Ptr(0),
+						},
+					},
+				},
+			},
+			expectedErr: true,
+		},
+		{
+			name: "check passed",
+			result: workapiv1.StatusFeedbackResult{
+				Values: []workapiv1.FeedbackValue{
+					{
+						Name: "Replicas",
+						Value: workapiv1.FieldValue{
+							Integer: int64Ptr(1),
+						},
+					},
+					{
+						Name: "ReadyReplicas",
+						Value: workapiv1.FieldValue{
+							Integer: int64Ptr(2),
+						},
+					},
+				},
+			},
+			expectedErr: false,
+		},
+	}
+
+	for _, c := range cases {
+		t.Run(c.name, func(t *testing.T) {
+			prober := NewDeploymentProber(types.NamespacedName{Name: "test", Namespace: "testns"})
+
+			fields := prober.WorkProber.ProbeFields
+
+			err := prober.WorkProber.HealthCheck(fields[0].ResourceIdentifier, c.result)
+			if err != nil && !c.expectedErr {
+				t.Errorf("expected no error but got %v", err)
+			}
+
+			if err == nil && c.expectedErr {
+				t.Error("expected error but got no error")
+			}
+		})
+	}
+}
diff --git a/test/integration/agent_deploy_test.go b/test/integration/agent_deploy_test.go
index a3b01fec1..6622e3ca9 100644
--- a/test/integration/agent_deploy_test.go
+++ b/test/integration/agent_deploy_test.go
@@ -12,9 +12,13 @@ import (
 	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
 	"k8s.io/apimachinery/pkg/apis/meta/v1/unstructured"
 	"k8s.io/apimachinery/pkg/runtime"
+	"k8s.io/apimachinery/pkg/types"
 	"k8s.io/apimachinery/pkg/util/rand"
+	"open-cluster-management.io/addon-framework/pkg/agent"
+	"open-cluster-management.io/addon-framework/pkg/utils"
 	addonapiv1alpha1 "open-cluster-management.io/api/addon/v1alpha1"
 	clusterv1 "open-cluster-management.io/api/cluster/v1"
+	workapiv1 "open-cluster-management.io/api/work/v1"
 )
 
 const (
@@ -94,6 +98,9 @@ var _ = ginkgo.Describe("Agent deploy", func() {
 			err := obj.UnmarshalJSON([]byte(deploymentJson))
 			gomega.Expect(err).ToNot(gomega.HaveOccurred())
 			testAddonImpl.manifests[managedClusterName] =
[]runtime.Object{obj} + testAddonImpl.prober = &agent.HealthProber{ + Type: agent.HealthProberTypeWork, + } addon := &addonapiv1alpha1.ManagedClusterAddOn{ ObjectMeta: metav1.ObjectMeta{ @@ -140,6 +147,153 @@ var _ = ginkgo.Describe("Agent deploy", func() { } return nil }, eventuallyTimeout, eventuallyInterval).ShouldNot(gomega.HaveOccurred()) + + // update work to available so addon becomes available + work, err = hubWorkClient.WorkV1().ManifestWorks(managedClusterName).Get(context.Background(), fmt.Sprintf("addon-%s-deploy", testAddonImpl.name), metav1.GetOptions{}) + gomega.Expect(err).ToNot(gomega.HaveOccurred()) + meta.SetStatusCondition(&work.Status.Conditions, metav1.Condition{Type: workapiv1.WorkAvailable, Status: metav1.ConditionTrue, Reason: "WorkAvailable"}) + _, err = hubWorkClient.WorkV1().ManifestWorks(managedClusterName).UpdateStatus(context.Background(), work, metav1.UpdateOptions{}) + gomega.Expect(err).ToNot(gomega.HaveOccurred()) + + gomega.Eventually(func() error { + addon, err := hubAddonClient.AddonV1alpha1().ManagedClusterAddOns(managedClusterName).Get(context.Background(), testAddonImpl.name, metav1.GetOptions{}) + if err != nil { + return err + } + + if !meta.IsStatusConditionTrue(addon.Status.Conditions, "Available") { + return fmt.Errorf("Unexpected addon available condition, %v", addon.Status.Conditions) + } + return nil + }, eventuallyTimeout, eventuallyInterval).ShouldNot(gomega.HaveOccurred()) + }) + + ginkgo.It("Should deploy agent and get available with prober func", func() { + obj := &unstructured.Unstructured{} + err := obj.UnmarshalJSON([]byte(deploymentJson)) + gomega.Expect(err).ToNot(gomega.HaveOccurred()) + testAddonImpl.manifests[managedClusterName] = []runtime.Object{obj} + testAddonImpl.prober = utils.NewDeploymentProber(types.NamespacedName{Name: "nginx-deployment", Namespace: "default"}) + + addon := &addonapiv1alpha1.ManagedClusterAddOn{ + ObjectMeta: metav1.ObjectMeta{ + Name: testAddonImpl.name, + }, + Spec: addonapiv1alpha1.ManagedClusterAddOnSpec{ + InstallNamespace: "default", + }, + } + _, err = hubAddonClient.AddonV1alpha1().ManagedClusterAddOns(managedClusterName).Create(context.Background(), addon, metav1.CreateOptions{}) + gomega.Expect(err).ToNot(gomega.HaveOccurred()) + + gomega.Eventually(func() error { + work, err := hubWorkClient.WorkV1().ManifestWorks(managedClusterName).Get(context.Background(), fmt.Sprintf("addon-%s-deploy", testAddonImpl.name), metav1.GetOptions{}) + if err != nil { + return err + } + + if len(work.Spec.Workload.Manifests) != 1 { + return fmt.Errorf("Unexpected number of work manifests") + } + + if len(work.Spec.ManifestConfigs) != 1 { + return fmt.Errorf("Unexpected number of work manifests configuration") + } + + if apiequality.Semantic.DeepEqual(work.Spec.Workload.Manifests[0].Raw, []byte(deploymentJson)) { + return fmt.Errorf("expected manifest is no correct, get %v", work.Spec.Workload.Manifests[0].Raw) + } + return nil + }, eventuallyTimeout, eventuallyInterval).ShouldNot(gomega.HaveOccurred()) + + // Update work status to trigger addon status + work, err := hubWorkClient.WorkV1().ManifestWorks(managedClusterName).Get(context.Background(), fmt.Sprintf("addon-%s-deploy", testAddonImpl.name), metav1.GetOptions{}) + gomega.Expect(err).ToNot(gomega.HaveOccurred()) + meta.SetStatusCondition(&work.Status.Conditions, metav1.Condition{Type: workapiv1.WorkAvailable, Status: metav1.ConditionTrue, Reason: "WorkAvailable"}) + + replica := int64(1) + + // update work status to a wrong feedback status + 
work.Status.ResourceStatus = workapiv1.ManifestResourceStatus{ + Manifests: []workapiv1.ManifestCondition{ + { + ResourceMeta: workapiv1.ManifestResourceMeta{ + Ordinal: 0, + Group: "apps", + Resource: "deployments", + Name: "nginx-deployment", + Namespace: "default", + }, + StatusFeedbacks: workapiv1.StatusFeedbackResult{ + Values: []workapiv1.FeedbackValue{ + { + Name: "Replicas", + Value: workapiv1.FieldValue{ + Type: workapiv1.Integer, + Integer: &replica, + }, + }, + }, + }, + }, + }, + } + _, err = hubWorkClient.WorkV1().ManifestWorks(managedClusterName).UpdateStatus(context.Background(), work, metav1.UpdateOptions{}) + gomega.Expect(err).ToNot(gomega.HaveOccurred()) + + gomega.Eventually(func() error { + addon, err := hubAddonClient.AddonV1alpha1().ManagedClusterAddOns(managedClusterName).Get(context.Background(), testAddonImpl.name, metav1.GetOptions{}) + if err != nil { + return err + } + + if !meta.IsStatusConditionFalse(addon.Status.Conditions, "Available") { + return fmt.Errorf("Unexpected addon available condition, %v", addon.Status.Conditions) + } + return nil + }, eventuallyTimeout, eventuallyInterval).ShouldNot(gomega.HaveOccurred()) + + // update to the correct condition + work, err = hubWorkClient.WorkV1().ManifestWorks(managedClusterName).Get(context.Background(), fmt.Sprintf("addon-%s-deploy", testAddonImpl.name), metav1.GetOptions{}) + gomega.Expect(err).ToNot(gomega.HaveOccurred()) + work.Status.ResourceStatus = workapiv1.ManifestResourceStatus{ + Manifests: []workapiv1.ManifestCondition{ + { + ResourceMeta: workapiv1.ManifestResourceMeta{ + Ordinal: 0, + Group: "apps", + Resource: "deployments", + Name: "nginx-deployment", + Namespace: "default", + }, + StatusFeedbacks: workapiv1.StatusFeedbackResult{ + Values: []workapiv1.FeedbackValue{ + { + Name: "ReadyReplicas", + Value: workapiv1.FieldValue{ + Type: workapiv1.Integer, + Integer: &replica, + }, + }, + }, + }, + }, + }, + } + _, err = hubWorkClient.WorkV1().ManifestWorks(managedClusterName).UpdateStatus(context.Background(), work, metav1.UpdateOptions{}) + gomega.Expect(err).ToNot(gomega.HaveOccurred()) + + gomega.Eventually(func() error { + addon, err := hubAddonClient.AddonV1alpha1().ManagedClusterAddOns(managedClusterName).Get(context.Background(), testAddonImpl.name, metav1.GetOptions{}) + if err != nil { + return err + } + + if !meta.IsStatusConditionTrue(addon.Status.Conditions, "Available") { + return fmt.Errorf("Unexpected addon available condition, %v", addon.Status.Conditions) + } + return nil + }, eventuallyTimeout, eventuallyInterval).ShouldNot(gomega.HaveOccurred()) }) }) diff --git a/test/integration/suite_test.go b/test/integration/suite_test.go index 790fdc703..c2762d5b9 100644 --- a/test/integration/suite_test.go +++ b/test/integration/suite_test.go @@ -100,6 +100,7 @@ type testAddon struct { registrations map[string][]addonapiv1alpha1.RegistrationConfig approveCSR bool cert []byte + prober *agent.HealthProber } func (t *testAddon) Manifests(cluster *clusterv1.ManagedCluster, addon *addonapiv1alpha1.ManagedClusterAddOn) ([]runtime.Object, error) { @@ -108,7 +109,8 @@ func (t *testAddon) Manifests(cluster *clusterv1.ManagedCluster, addon *addonapi func (t *testAddon) GetAgentAddonOptions() agent.AgentAddonOptions { option := agent.AgentAddonOptions{ - AddonName: t.name, + AddonName: t.name, + HealthProber: t.prober, } if len(t.registrations) > 0 {
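
A minimal, illustrative sketch of how an addon developer would opt into the work-based health prober introduced by this patch. The agent type exampleAgent, the addon name "hello-world", and the deployment name/namespace passed to NewDeploymentProber are placeholders invented for this example; only agent.AgentAddonOptions, agent.HealthProber, agent.HealthProberTypeWork, and utils.NewDeploymentProber come from the changes above.

// Illustrative usage sketch (not part of the patch).
package example

import (
	"k8s.io/apimachinery/pkg/runtime"
	"k8s.io/apimachinery/pkg/types"

	"open-cluster-management.io/addon-framework/pkg/agent"
	"open-cluster-management.io/addon-framework/pkg/utils"
	addonapiv1alpha1 "open-cluster-management.io/api/addon/v1alpha1"
	clusterv1 "open-cluster-management.io/api/cluster/v1"
)

// exampleAgent is a hypothetical agent implementation; manifest rendering is elided.
type exampleAgent struct{}

func (a *exampleAgent) Manifests(cluster *clusterv1.ManagedCluster, addon *addonapiv1alpha1.ManagedClusterAddOn) ([]runtime.Object, error) {
	// Render the agent manifests (including the "hello-world-agent" deployment) here.
	return nil, nil
}

func (a *exampleAgent) GetAgentAddonOptions() agent.AgentAddonOptions {
	return agent.AgentAddonOptions{
		AddonName: "hello-world",
		// The deployment prober asks the work agent to feed back well-known
		// deployment status fields; the addon is reported Available once
		// ReadyReplicas >= 1. Passing &agent.HealthProber{Type: agent.HealthProberTypeWork}
		// instead would only check the overall Available condition of the deploy work.
		HealthProber: utils.NewDeploymentProber(
			types.NamespacedName{Namespace: "default", Name: "hello-world-agent"},
		),
	}
}

With this wiring, the agent deploy controller copies the probe fields into the deploy ManifestWork's ManifestConfigs (see setStatusFeedbackRule above), and the health check controller evaluates the returned feedback values to set the addon's Available condition.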