From 939123cb6960f1a87d216481f439a6f1aff6b5b4 Mon Sep 17 00:00:00 2001
From: "Beata Lach (Skiba)"
Date: Wed, 24 Jul 2024 11:32:16 +0000
Subject: [PATCH] Do not break the loop after removing failed scale up nodes

Clean up cluster state after removing failed scale up nodes, so that the
loop can continue. Most importantly, update the target for the affected
node group, so that the deleted nodes are not considered upcoming.
---
 .../clusterstate/clusterstate_test.go        | 71 +++++++++++++++++++
 cluster-autoscaler/core/static_autoscaler.go | 12 ++--
 .../core/static_autoscaler_test.go           | 29 ++++----
 3 files changed, 92 insertions(+), 20 deletions(-)
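Reviewer note (kept below the diffstat so it is ignored when the patch is applied):
the snippet below is a simplified, illustrative sketch of why a stale node group
target keeps already-deleted instances counted as "upcoming". The real accounting
lives in clusterstate.Recalculate / GetUpcomingNodes; the helper name
upcomingNodes and the exact formula are assumptions made for this note only.

package main

import "fmt"

// upcomingNodes sketches how the registry estimates nodes still expected to
// join a node group: the part of the current target not yet covered by nodes
// it has already observed. This is not the actual implementation.
func upcomingNodes(target, ready, unready, longUnregistered int) int {
	if upcoming := target - ready - unready - longUnregistered; upcoming > 0 {
		return upcoming
	}
	return 0
}

func main() {
	// Failed scale-up from 1 to 4: the three instances never became nodes and
	// were deleted, but against the stale target they still look upcoming...
	fmt.Println(upcomingNodes(4, 1, 0, 0)) // 3
	// ...whereas recalculating against the refreshed target (back to 1) does not.
	fmt.Println(upcomingNodes(1, 1, 0, 0)) // 0
}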
diff --git a/cluster-autoscaler/clusterstate/clusterstate_test.go b/cluster-autoscaler/clusterstate/clusterstate_test.go
index 7b4fb3d6c736..ac11a1cfa5c1 100644
--- a/cluster-autoscaler/clusterstate/clusterstate_test.go
+++ b/cluster-autoscaler/clusterstate/clusterstate_test.go
@@ -176,6 +176,77 @@ func TestHasNodeGroupStartedScaleUp(t *testing.T) {
 	}
 }
 
+// TestRecalculateStateAfterNodeGroupSizeChanged checks that Recalculate updates state correctly after
+// some node group size changed. We verify that acceptable ranges are updated accordingly
+// and that the UpcomingNodes reflect the node group size change (important for recalculating state after
+// deleting scale-up nodes that failed to create).
+func TestRecalculateStateAfterNodeGroupSizeChanged(t *testing.T) {
+	ngName := "ng1"
+	testCases := []struct {
+		name                string
+		acceptableRange     AcceptableRange
+		readiness           Readiness
+		newTarget           int
+		scaleUpRequest      *ScaleUpRequest
+		wantAcceptableRange AcceptableRange
+		wantUpcoming        int
+	}{
+		{
+			name:                "failed scale up by 3 nodes",
+			acceptableRange:     AcceptableRange{MinNodes: 1, CurrentTarget: 4, MaxNodes: 4},
+			readiness:           Readiness{Ready: make([]string, 1)},
+			newTarget:           1,
+			wantAcceptableRange: AcceptableRange{MinNodes: 1, CurrentTarget: 1, MaxNodes: 1},
+			wantUpcoming:        0,
+		}, {
+			name:                "partially failed scale up",
+			acceptableRange:     AcceptableRange{MinNodes: 5, CurrentTarget: 7, MaxNodes: 8},
+			readiness:           Readiness{Ready: make([]string, 5)},
+			newTarget:           6,
+			wantAcceptableRange: AcceptableRange{MinNodes: 5, CurrentTarget: 6, MaxNodes: 6},
+			scaleUpRequest:      &ScaleUpRequest{Increase: 1},
+			wantUpcoming:        1,
+		}, {
+			name:                "scale up ongoing, no change",
+			acceptableRange:     AcceptableRange{MinNodes: 1, CurrentTarget: 4, MaxNodes: 4},
+			readiness:           Readiness{Ready: make([]string, 1)},
+			newTarget:           4,
+			wantAcceptableRange: AcceptableRange{MinNodes: 1, CurrentTarget: 4, MaxNodes: 4},
+			scaleUpRequest:      &ScaleUpRequest{Increase: 3},
+			wantUpcoming:        3,
+		}, {
+			name:                "no scale up, no change",
+			acceptableRange:     AcceptableRange{MinNodes: 4, CurrentTarget: 4, MaxNodes: 4},
+			readiness:           Readiness{Ready: make([]string, 4)},
+			newTarget:           4,
+			wantAcceptableRange: AcceptableRange{MinNodes: 4, CurrentTarget: 4, MaxNodes: 4},
+			wantUpcoming:        0,
+		},
+	}
+	for _, tc := range testCases {
+		t.Run(tc.name, func(t *testing.T) {
+			provider := testprovider.NewTestCloudProvider(nil, nil)
+			provider.AddNodeGroup(ngName, 0, 1000, tc.newTarget)
+
+			fakeLogRecorder, _ := utils.NewStatusMapRecorder(&fake.Clientset{}, "kube-system", kube_record.NewFakeRecorder(5), false, "my-cool-configmap")
+			clusterState := NewClusterStateRegistry(provider, ClusterStateRegistryConfig{}, fakeLogRecorder,
+				newBackoff(), nodegroupconfig.NewDefaultNodeGroupConfigProcessor(config.NodeGroupAutoscalingOptions{}))
+			clusterState.acceptableRanges = map[string]AcceptableRange{ngName: tc.acceptableRange}
+			clusterState.perNodeGroupReadiness = map[string]Readiness{ngName: tc.readiness}
+			if tc.scaleUpRequest != nil {
+				clusterState.scaleUpRequests = map[string]*ScaleUpRequest{ngName: tc.scaleUpRequest}
+			}
+
+			clusterState.Recalculate()
+			assert.Equal(t, tc.wantAcceptableRange, clusterState.acceptableRanges[ngName])
+			upcomingCounts, _ := clusterState.GetUpcomingNodes()
+			if upcoming, found := upcomingCounts[ngName]; found {
+				assert.Equal(t, tc.wantUpcoming, upcoming, "Unexpected upcoming nodes count, want: %d got: %d", tc.wantUpcoming, upcomingCounts[ngName])
+			}
+		})
+	}
+}
+
 func TestOKOneUnreadyNode(t *testing.T) {
 	now := time.Now()
 
diff --git a/cluster-autoscaler/core/static_autoscaler.go b/cluster-autoscaler/core/static_autoscaler.go
index 8003f12490fe..027ec59630b3 100644
--- a/cluster-autoscaler/core/static_autoscaler.go
+++ b/cluster-autoscaler/core/static_autoscaler.go
@@ -448,10 +448,7 @@ func (a *StaticAutoscaler) RunOnce(currentTime time.Time) caerrors.AutoscalerErr
 		return nil
 	}
 
-	if deletedNodes := a.deleteCreatedNodesWithErrors(); deletedNodes {
-		klog.V(0).Infof("Some nodes that failed to create were removed, skipping iteration")
-		return nil
-	}
+	a.deleteCreatedNodesWithErrors()
 
 	// Check if there has been a constant difference between the number of nodes in k8s and
 	// the number of nodes on the cloud provider side.
@@ -821,7 +818,7 @@ func toNodes(unregisteredNodes []clusterstate.UnregisteredNode) []*apiv1.Node {
 	return nodes
 }
 
-func (a *StaticAutoscaler) deleteCreatedNodesWithErrors() bool {
+func (a *StaticAutoscaler) deleteCreatedNodesWithErrors() {
 	// We always schedule deleting of incoming errornous nodes
 	// TODO[lukaszos] Consider adding logic to not retry delete every loop iteration
 	nodeGroups := a.nodeGroupsById()
@@ -848,7 +845,10 @@ func (a *StaticAutoscaler) deleteCreatedNodesWithErrors() bool {
 		}
 	}
 
-	return deletedAny
+	if deletedAny {
+		klog.V(0).Infof("Some nodes that failed to create were removed, recalculating cluster state.")
+		a.clusterStateRegistry.Recalculate()
+	}
 }
 
 // overrideNodesToDeleteForZeroOrMax returns a list of nodes to delete, taking into account that
diff --git a/cluster-autoscaler/core/static_autoscaler_test.go b/cluster-autoscaler/core/static_autoscaler_test.go
index cecf660982b9..b49406577a5d 100644
--- a/cluster-autoscaler/core/static_autoscaler_test.go
+++ b/cluster-autoscaler/core/static_autoscaler_test.go
@@ -1709,8 +1709,11 @@ func TestStaticAutoscalerInstanceCreationErrors(t *testing.T) {
 	clusterState.UpdateNodes([]*apiv1.Node{}, nil, now)
 
 	// delete nodes with create errors
-	removedNodes := autoscaler.deleteCreatedNodesWithErrors()
-	assert.True(t, removedNodes)
+	autoscaler.deleteCreatedNodesWithErrors()
+
+	// nodes should be deleted
+	expectedDeleteCalls := 1
+	nodeGroupA.AssertNumberOfCalls(t, "DeleteNodes", expectedDeleteCalls)
 
 	// check delete was called on correct nodes
 	nodeGroupA.AssertCalled(t, "DeleteNodes", mock.MatchedBy(
@@ -1734,10 +1737,12 @@ func TestStaticAutoscalerInstanceCreationErrors(t *testing.T) {
 	clusterState.UpdateNodes([]*apiv1.Node{}, nil, now)
 
 	// delete nodes with create errors
-	removedNodes = autoscaler.deleteCreatedNodesWithErrors()
-	assert.True(t, removedNodes)
+	autoscaler.deleteCreatedNodesWithErrors()
 
 	// nodes should be deleted again
+	expectedDeleteCalls += 1
+	nodeGroupA.AssertNumberOfCalls(t, "DeleteNodes", expectedDeleteCalls)
+
 	nodeGroupA.AssertCalled(t, "DeleteNodes", mock.MatchedBy(
 		func(nodes []*apiv1.Node) bool {
 			if len(nodes) != 4 {
@@ -1798,11 +1803,10 @@ func TestStaticAutoscalerInstanceCreationErrors(t *testing.T) {
 	clusterState.UpdateNodes([]*apiv1.Node{}, nil, now)
 
 	// delete nodes with create errors
-	removedNodes = autoscaler.deleteCreatedNodesWithErrors()
-	assert.False(t, removedNodes)
+	autoscaler.deleteCreatedNodesWithErrors()
 
-	// we expect no more Delete Nodes
-	nodeGroupA.AssertNumberOfCalls(t, "DeleteNodes", 2)
+	// we expect no more Delete Nodes, don't increase expectedDeleteCalls
+	nodeGroupA.AssertNumberOfCalls(t, "DeleteNodes", expectedDeleteCalls)
 
 	// failed node not included by NodeGroupForNode
 	nodeGroupC := &mockprovider.NodeGroup{}
@@ -1840,8 +1844,7 @@ func TestStaticAutoscalerInstanceCreationErrors(t *testing.T) {
 	clusterState.UpdateNodes([]*apiv1.Node{}, nil, time.Now())
 
 	// No nodes are deleted when failed nodes don't have matching node groups
-	removedNodes = autoscaler.deleteCreatedNodesWithErrors()
-	assert.False(t, removedNodes)
+	autoscaler.deleteCreatedNodesWithErrors()
 	nodeGroupC.AssertNumberOfCalls(t, "DeleteNodes", 0)
 
 	nodeGroupAtomic := &mockprovider.NodeGroup{}
@@ -1896,8 +1899,7 @@ func TestStaticAutoscalerInstanceCreationErrors(t *testing.T) {
 	clusterState.UpdateNodes([]*apiv1.Node{}, nil, now)
 
 	// delete nodes with create errors
-	removedNodes = autoscaler.deleteCreatedNodesWithErrors()
-	assert.True(t, removedNodes)
+	autoscaler.deleteCreatedNodesWithErrors()
 
 	nodeGroupAtomic.AssertCalled(t, "DeleteNodes", mock.MatchedBy(
 		func(nodes []*apiv1.Node) bool {
@@ -1957,8 +1959,7 @@ func TestStaticAutoscalerInstanceCreationErrors(t *testing.T) {
 	clusterState.UpdateNodes([]*apiv1.Node{}, nil, now)
 
 	// delete nodes with create errors
-	removedNodes = autoscaler.deleteCreatedNodesWithErrors()
-	assert.False(t, removedNodes)
+	autoscaler.deleteCreatedNodesWithErrors()
 	nodeGroupError.AssertNumberOfCalls(t, "DeleteNodes", 0)
 }
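
Reviewer note (trailing text, ignored when the patch is applied): the sketch below
is a rough illustration of the acceptable-range expectations encoded in
TestRecalculateStateAfterNodeGroupSizeChanged, namely that after Recalculate the
range follows the refreshed target, widened downwards by any still-pending
scale-up. The type acceptableRange and function recalcRange are invented for this
note and are not the real clusterstate implementation.

package main

import "fmt"

type acceptableRange struct{ minNodes, currentTarget, maxNodes int }

// recalcRange sketches how the acceptable range could be rederived from the
// node group's current target and the increase of a still-pending scale-up.
func recalcRange(target, pendingScaleUpIncrease int) acceptableRange {
	return acceptableRange{
		minNodes:      target - pendingScaleUpIncrease,
		currentTarget: target,
		maxNodes:      target,
	}
}

func main() {
	// "failed scale up by 3 nodes": target dropped back to 1, nothing pending.
	fmt.Println(recalcRange(1, 0)) // {1 1 1}
	// "partially failed scale up": target 6 with one instance still being created.
	fmt.Println(recalcRange(6, 1)) // {5 6 6}
}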