From 0cd4ab06f0f85a3d9f01e26e88ae974baef17897 Mon Sep 17 00:00:00 2001 From: janiskemper Date: Fri, 21 Jun 2024 14:35:45 +0200 Subject: [PATCH] :seedling: Reduce reconcilements - Filter HCloudMachine events so that status updates of the resource don't trigger another reconcilement of HCloudMachine - Filter HetznerCluster events so that status updates of the resource don't trigger another reconcilement of HetznerCluster - Filter HetznerCluster events so that some status updates that are not important don't trigger reconcilement of HCloudMachine - Filter Machine events so that status updates of the resource don't trigger another reconcilement of HCloudMachine a - Filter Cluster events so that status updates of the resource don't trigger another reconcilement of HetznerCluster - Increase the time period to requeue if a server is starting - Increase the time period to requeue if a server has not status yet - Increase the time period to requeue if the kubeapi server is not yet reachable before adding it as target to the load balancer - Don't trigger multiple shutdown calls for one server and increase the timeout to requeue after a shutdown is triggered --- controllers/hcloudmachine_controller.go | 125 ++++++++++++++++++- controllers/hcloudmachine_controller_test.go | 4 +- controllers/hetznercluster_controller.go | 110 ++++++++++++++++ pkg/services/hcloud/server/server.go | 19 +-- 4 files changed, 244 insertions(+), 14 deletions(-) diff --git a/controllers/hcloudmachine_controller.go b/controllers/hcloudmachine_controller.go index d3c1716eb..3f5f4fb02 100644 --- a/controllers/hcloudmachine_controller.go +++ b/controllers/hcloudmachine_controller.go @@ -220,14 +220,16 @@ func (r *HCloudMachineReconciler) SetupWithManager(ctx context.Context, mgr ctrl WithOptions(options). For(&infrav1.HCloudMachine{}). WithEventFilter(predicates.ResourceNotPausedAndHasFilterLabel(log, r.WatchFilterValue)). + WithEventFilter(IgnoreInsignificantHCloudMachineStatusUpdates(log)). Watches( &clusterv1.Machine{}, handler.EnqueueRequestsFromMapFunc(util.MachineToInfrastructureMapFunc(infrav1.GroupVersion.WithKind("HCloudMachine"))), + builder.WithPredicates(IgnoreInsignificantMachineStatusUpdates(log)), ). Watches( &infrav1.HetznerCluster{}, handler.EnqueueRequestsFromMapFunc(r.HetznerClusterToHCloudMachines(ctx)), - builder.WithPredicates(IgnoreHetznerClusterConditionUpdates(log)), + builder.WithPredicates(IgnoreInsignificantHetznerClusterUpdates(log)), ). Build(r) if err != nil { @@ -303,8 +305,8 @@ func (r *HCloudMachineReconciler) HetznerClusterToHCloudMachines(_ context.Conte } } -// IgnoreHetznerClusterConditionUpdates is a predicate used for ignoring HetznerCluster condition updates. -func IgnoreHetznerClusterConditionUpdates(logger logr.Logger) predicate.Funcs { +// IgnoreInsignificantHetznerClusterUpdates is a predicate used for ignoring insignificant HetznerCluster.Status updates. +func IgnoreInsignificantHetznerClusterUpdates(logger logr.Logger) predicate.Funcs { return predicate.Funcs{ UpdateFunc: func(e event.UpdateEvent) bool { log := logger.WithValues( @@ -341,6 +343,23 @@ func IgnoreHetznerClusterConditionUpdates(logger logr.Logger) predicate.Funcs { oldCluster.Status.Conditions = nil newCluster.Status.Conditions = nil + if oldCluster.Status.ControlPlaneLoadBalancer != nil { + oldCluster.Status.ControlPlaneLoadBalancer.Target = nil + } + if newCluster.Status.ControlPlaneLoadBalancer != nil { + newCluster.Status.ControlPlaneLoadBalancer.Target = nil + } + + oldCluster.Status.HCloudPlacementGroups = nil + newCluster.Status.HCloudPlacementGroups = nil + + if oldCluster.Status.Network != nil { + oldCluster.Status.Network.AttachedServers = nil + } + if newCluster.Status.Network != nil { + newCluster.Status.Network.AttachedServers = nil + } + if reflect.DeepEqual(oldCluster, newCluster) { // Only insignificant fields changed, no need to reconcile return false @@ -355,3 +374,103 @@ func IgnoreHetznerClusterConditionUpdates(logger logr.Logger) predicate.Funcs { GenericFunc: func(_ event.GenericEvent) bool { return true }, } } + +// IgnoreInsignificantMachineStatusUpdates is a predicate used for ignoring insignificant Machine.Status updates. +func IgnoreInsignificantMachineStatusUpdates(logger logr.Logger) predicate.Funcs { + return predicate.Funcs{ + UpdateFunc: func(e event.UpdateEvent) bool { + log := logger.WithValues( + "predicate", "IgnoreInsignificantMachineStatusUpdates", + "type", "update", + "namespace", e.ObjectNew.GetNamespace(), + "kind", strings.ToLower(e.ObjectNew.GetObjectKind().GroupVersionKind().Kind), + "name", e.ObjectNew.GetName(), + ) + + var oldMachine, newMachine *clusterv1.Machine + var ok bool + // This predicate only looks at Machine objects + if oldMachine, ok = e.ObjectOld.(*clusterv1.Machine); !ok { + return true + } + if newMachine, ok = e.ObjectNew.(*clusterv1.Machine); !ok { + // Something weird happened, and we received two different kinds of objects + return true + } + + // We should not modify the original objects, this causes issues with code that relies on the original object. + oldMachine = oldMachine.DeepCopy() + newMachine = newMachine.DeepCopy() + + oldMachine.ManagedFields = nil + newMachine.ManagedFields = nil + + oldMachine.ResourceVersion = "" + newMachine.ResourceVersion = "" + + oldMachine.Status = clusterv1.MachineStatus{} + newMachine.Status = clusterv1.MachineStatus{} + + if reflect.DeepEqual(oldMachine, newMachine) { + // Only insignificant fields changed, no need to reconcile + return false + } + + log.V(1).Info("Machine -> HCloudMachine event") + return true + }, + CreateFunc: func(_ event.CreateEvent) bool { return true }, + DeleteFunc: func(_ event.DeleteEvent) bool { return true }, + GenericFunc: func(_ event.GenericEvent) bool { return true }, + } +} + +// IgnoreInsignificantHCloudMachineStatusUpdates is a predicate used for ignoring insignificant HCloudMachine.Status updates. +func IgnoreInsignificantHCloudMachineStatusUpdates(logger logr.Logger) predicate.Funcs { + return predicate.Funcs{ + UpdateFunc: func(e event.UpdateEvent) bool { + log := logger.WithValues( + "predicate", "IgnoreInsignificantHCloudMachineStatusUpdates", + "type", "update", + "namespace", e.ObjectNew.GetNamespace(), + "kind", strings.ToLower(e.ObjectNew.GetObjectKind().GroupVersionKind().Kind), + "name", e.ObjectNew.GetName(), + ) + + var oldHCloudMachine, newHCloudMachine *infrav1.HCloudMachine + var ok bool + // This predicate only looks at HCloudMachine objects + if oldHCloudMachine, ok = e.ObjectOld.(*infrav1.HCloudMachine); !ok { + return true + } + if newHCloudMachine, ok = e.ObjectNew.(*infrav1.HCloudMachine); !ok { + // Something weird happened, and we received two different kinds of objects + return true + } + + // We should not modify the original objects, this causes issues with code that relies on the original object. + oldHCloudMachine = oldHCloudMachine.DeepCopy() + newHCloudMachine = newHCloudMachine.DeepCopy() + + oldHCloudMachine.ManagedFields = nil + newHCloudMachine.ManagedFields = nil + + oldHCloudMachine.ResourceVersion = "" + newHCloudMachine.ResourceVersion = "" + + oldHCloudMachine.Status = infrav1.HCloudMachineStatus{} + newHCloudMachine.Status = infrav1.HCloudMachineStatus{} + + if reflect.DeepEqual(oldHCloudMachine, newHCloudMachine) { + // Only insignificant fields changed, no need to reconcile + return false + } + + log.V(1).Info("HCloudMachine event") + return true + }, + CreateFunc: func(_ event.CreateEvent) bool { return true }, + DeleteFunc: func(_ event.DeleteEvent) bool { return true }, + GenericFunc: func(_ event.GenericEvent) bool { return true }, + } +} diff --git a/controllers/hcloudmachine_controller_test.go b/controllers/hcloudmachine_controller_test.go index bbbe5553a..1ebd8fcff 100644 --- a/controllers/hcloudmachine_controller_test.go +++ b/controllers/hcloudmachine_controller_test.go @@ -627,7 +627,7 @@ var _ = Describe("HCloudMachine validation", func() { }) }) -var _ = Describe("IgnoreHetznerClusterConditionUpdates Predicate", func() { +var _ = Describe("IgnoreInsignificantHetznerClusterUpdates Predicate", func() { var ( predicate predicate.Predicate @@ -636,7 +636,7 @@ var _ = Describe("IgnoreHetznerClusterConditionUpdates Predicate", func() { ) BeforeEach(func() { - predicate = IgnoreHetznerClusterConditionUpdates(klog.Background()) + predicate = IgnoreInsignificantHetznerClusterUpdates(klog.Background()) oldCluster = &infrav1.HetznerCluster{ ObjectMeta: metav1.ObjectMeta{Name: "test-predicate", ResourceVersion: "1"}, diff --git a/controllers/hetznercluster_controller.go b/controllers/hetznercluster_controller.go index 6c4000665..be4540c18 100644 --- a/controllers/hetznercluster_controller.go +++ b/controllers/hetznercluster_controller.go @@ -21,11 +21,13 @@ import ( "context" "errors" "fmt" + "reflect" "strconv" "strings" "sync" "time" + "github.com/go-logr/logr" certificatesv1 "k8s.io/api/certificates/v1" corev1 "k8s.io/api/core/v1" apierrors "k8s.io/apimachinery/pkg/api/errors" @@ -46,9 +48,11 @@ import ( "sigs.k8s.io/controller-runtime/pkg/client/apiutil" "sigs.k8s.io/controller-runtime/pkg/controller" "sigs.k8s.io/controller-runtime/pkg/controller/controllerutil" + "sigs.k8s.io/controller-runtime/pkg/event" "sigs.k8s.io/controller-runtime/pkg/handler" "sigs.k8s.io/controller-runtime/pkg/log" metricsserver "sigs.k8s.io/controller-runtime/pkg/metrics/server" + "sigs.k8s.io/controller-runtime/pkg/predicate" "sigs.k8s.io/controller-runtime/pkg/reconcile" "sigs.k8s.io/controller-runtime/pkg/source" @@ -732,6 +736,7 @@ func (r *HetznerClusterReconciler) SetupWithManager(ctx context.Context, mgr ctr For(&infrav1.HetznerCluster{}). WithEventFilter(predicates.ResourceNotPausedAndHasFilterLabel(log, r.WatchFilterValue)). WithEventFilter(predicates.ResourceIsNotExternallyManaged(log)). + WithEventFilter(IgnoreInsignificantHetznerClusterStatusUpdates(log)). Owns(&corev1.Secret{}). Build(r) if err != nil { @@ -777,5 +782,110 @@ func (r *HetznerClusterReconciler) SetupWithManager(ctx context.Context, mgr ctr }, } }), + IgnoreInsignificantClusterStatusUpdates(log), ) } + +// IgnoreInsignificantClusterStatusUpdates is a predicate used for ignoring insignificant HetznerCluster.Status updates. +func IgnoreInsignificantClusterStatusUpdates(logger logr.Logger) predicate.Funcs { + return predicate.Funcs{ + UpdateFunc: func(e event.UpdateEvent) bool { + log := logger.WithValues( + "predicate", "IgnoreInsignificantClusterStatusUpdates", + "type", "update", + "namespace", e.ObjectNew.GetNamespace(), + "kind", strings.ToLower(e.ObjectNew.GetObjectKind().GroupVersionKind().Kind), + "name", e.ObjectNew.GetName(), + ) + + var oldCluster, newCluster *clusterv1.Cluster + var ok bool + // This predicate only looks at Cluster objects + if oldCluster, ok = e.ObjectOld.(*clusterv1.Cluster); !ok { + return true + } + if newCluster, ok = e.ObjectNew.(*clusterv1.Cluster); !ok { + // Something weird happened, and we received two different kinds of objects + return true + } + + // We should not modify the original objects, this causes issues with code that relies on the original object. + oldCluster = oldCluster.DeepCopy() + newCluster = newCluster.DeepCopy() + + // Set fields we do not care about to nil + + oldCluster.ManagedFields = nil + newCluster.ManagedFields = nil + + oldCluster.ResourceVersion = "" + newCluster.ResourceVersion = "" + + oldCluster.Status = clusterv1.ClusterStatus{} + newCluster.Status = clusterv1.ClusterStatus{} + + if reflect.DeepEqual(oldCluster, newCluster) { + // Only insignificant fields changed, no need to reconcile + return false + } + // There is a noteworthy diff, so we should reconcile + log.V(1).Info("Cluster -> HetznerCluster") + return true + }, + CreateFunc: func(_ event.CreateEvent) bool { return true }, + DeleteFunc: func(_ event.DeleteEvent) bool { return true }, + GenericFunc: func(_ event.GenericEvent) bool { return true }, + } +} + +// IgnoreInsignificantHetznerClusterStatusUpdates is a predicate used for ignoring insignificant HetznerHetznerCluster.Status updates. +func IgnoreInsignificantHetznerClusterStatusUpdates(logger logr.Logger) predicate.Funcs { + return predicate.Funcs{ + UpdateFunc: func(e event.UpdateEvent) bool { + log := logger.WithValues( + "predicate", "IgnoreInsignificantHetznerClusterStatusUpdates", + "namespace", e.ObjectNew.GetNamespace(), + "kind", strings.ToLower(e.ObjectNew.GetObjectKind().GroupVersionKind().Kind), + "name", e.ObjectNew.GetName(), + ) + + var oldHetznerCluster, newHetznerCluster *infrav1.HetznerCluster + var ok bool + // This predicate only looks at HetznerCluster objects + if oldHetznerCluster, ok = e.ObjectOld.(*infrav1.HetznerCluster); !ok { + return true + } + if newHetznerCluster, ok = e.ObjectNew.(*infrav1.HetznerCluster); !ok { + // Something weird happened, and we received two different kinds of objects + return true + } + + // We should not modify the original objects, this causes issues with code that relies on the original object. + oldHetznerCluster = oldHetznerCluster.DeepCopy() + newHetznerCluster = newHetznerCluster.DeepCopy() + + // Set fields we do not care about to nil + + oldHetznerCluster.ManagedFields = nil + newHetznerCluster.ManagedFields = nil + + oldHetznerCluster.ResourceVersion = "" + newHetznerCluster.ResourceVersion = "" + + oldHetznerCluster.Status = infrav1.HetznerClusterStatus{} + newHetznerCluster.Status = infrav1.HetznerClusterStatus{} + + if reflect.DeepEqual(oldHetznerCluster, newHetznerCluster) { + // Only insignificant fields changed, no need to reconcile + return false + } + // There is a noteworthy diff, so we should reconcile + log.V(1).Info("HetznerCluster Update") + return true + }, + // We only care about Update events, anything else should be reconciled + CreateFunc: func(_ event.CreateEvent) bool { return true }, + DeleteFunc: func(_ event.DeleteEvent) bool { return true }, + GenericFunc: func(_ event.GenericEvent) bool { return true }, + } +} diff --git a/pkg/services/hcloud/server/server.go b/pkg/services/hcloud/server/server.go index 9b9d35c40..c42fe2861 100644 --- a/pkg/services/hcloud/server/server.go +++ b/pkg/services/hcloud/server/server.go @@ -141,12 +141,12 @@ func (s *Service) Reconcile(ctx context.Context) (res reconcile.Result, err erro clusterv1.ConditionSeverityInfo, "server is starting", ) - return reconcile.Result{RequeueAfter: 10 * time.Second}, nil + return reconcile.Result{RequeueAfter: 1 * time.Minute}, nil case hcloud.ServerStatusRunning: // do nothing default: // some temporary status s.scope.SetReady(false) - return reconcile.Result{RequeueAfter: 2 * time.Second}, nil + return reconcile.Result{RequeueAfter: 10 * time.Second}, nil } // check whether server is attached to the network @@ -296,7 +296,7 @@ func (s *Service) reconcileLoadBalancerAttachment(ctx context.Context, server *h // we attach only nodes with kube-apiserver pod healthy to avoid downtime, skipped for the first node if len(s.scope.HetznerCluster.Status.ControlPlaneLoadBalancer.Target) > 0 && !conditions.IsTrue(s.scope.Machine, controlplanev1.MachineAPIServerPodHealthyCondition) { - return reconcile.Result{RequeueAfter: 5 * time.Second}, nil + return reconcile.Result{RequeueAfter: 30 * time.Second}, nil } opts := hcloud.LoadBalancerAddServerTargetOpts{ @@ -318,8 +318,8 @@ func (s *Service) reconcileLoadBalancerAttachment(ctx context.Context, server *h record.Eventf( s.scope.HetznerCluster, "AddedAsTargetToLoadBalancer", - "Added new server %s with ID %d to the loadbalancer %s with ID %d", - server.Name, server.ID, s.scope.HetznerCluster.Spec.ControlPlaneLoadBalancer.Name, s.scope.HetznerCluster.Status.ControlPlaneLoadBalancer.ID) + "Added new server %s with ID %d to the loadbalancer with ID %d", + server.Name, server.ID, s.scope.HetznerCluster.Status.ControlPlaneLoadBalancer.ID) return reconcile.Result{}, nil } @@ -499,7 +499,7 @@ func (s *Service) getServerImage(ctx context.Context) (*hcloud.Image, error) { infrav1.ServerCreateSucceededCondition, infrav1.ServerTypeNotFoundReason, clusterv1.ConditionSeverityError, - err.Error(), + "failed to get server type - nil type", ) return nil, errServerCreateNotPossible } @@ -566,13 +566,14 @@ func (s *Service) handleServerStatusOff(ctx context.Context, server *hcloud.Serv if serverAvailableCondition != nil && serverAvailableCondition.Status == corev1.ConditionFalse && serverAvailableCondition.Reason == infrav1.ServerOffReason { + s.scope.Info("Trigger power on again") if time.Now().Before(serverAvailableCondition.LastTransitionTime.Time.Add(serverOffTimeout)) { // Not yet timed out, try again to power on if err := s.scope.HCloudClient.PowerOnServer(ctx, server); err != nil { hcloudutil.HandleRateLimitExceeded(s.scope.HCloudMachine, err, "PowerOnServer") if hcloud.IsError(err, hcloud.ErrorCodeLocked) { // if server is locked, we just retry again - return reconcile.Result{Requeue: true}, nil + return reconcile.Result{RequeueAfter: 30 * time.Second}, nil } return reconcile.Result{}, fmt.Errorf("failed to power on server: %w", err) } @@ -587,7 +588,7 @@ func (s *Service) handleServerStatusOff(ctx context.Context, server *hcloud.Serv hcloudutil.HandleRateLimitExceeded(s.scope.HCloudMachine, err, "PowerOnServer") if hcloud.IsError(err, hcloud.ErrorCodeLocked) { // if server is locked, we just retry again - return reconcile.Result{Requeue: true}, nil + return reconcile.Result{RequeueAfter: 30 * time.Second}, nil } return reconcile.Result{}, fmt.Errorf("failed to power on server: %w", err) } @@ -609,7 +610,7 @@ func (s *Service) handleDeleteServerStatusRunning(ctx context.Context, server *h // 1. The server has not yet been tried to shut down and still is marked as "ready". // 2. The server has been tried to shut down without an effect and the timeout is not reached yet. - if s.scope.HasServerAvailableCondition() || (s.scope.HasServerTerminatedCondition() && !s.scope.HasShutdownTimedOut()) { + if s.scope.HasServerAvailableCondition() { if err := s.scope.HCloudClient.ShutdownServer(ctx, server); err != nil { hcloudutil.HandleRateLimitExceeded(s.scope.HCloudMachine, err, "ShutdownServer") return reconcile.Result{}, fmt.Errorf("failed to shutdown server: %w", err)