From c19aac12ca5b27ac877eacb398b715e56fce2154 Mon Sep 17 00:00:00 2001 From: Aleksandra Malinowska Date: Thu, 25 Apr 2024 12:06:43 +0200 Subject: [PATCH 1/2] Refactor scale-up to apply resource limits before creating a node group --- .../core/scaleup/orchestrator/orchestrator.go | 41 ++++++++++--------- 1 file changed, 22 insertions(+), 19 deletions(-) diff --git a/cluster-autoscaler/core/scaleup/orchestrator/orchestrator.go b/cluster-autoscaler/core/scaleup/orchestrator/orchestrator.go index 43ba38f76670..e94ed4ccae49 100644 --- a/cluster-autoscaler/core/scaleup/orchestrator/orchestrator.go +++ b/cluster-autoscaler/core/scaleup/orchestrator/orchestrator.go @@ -187,11 +187,31 @@ func (o *ScaleUpOrchestrator) ScaleUp( } klog.V(1).Infof("Estimated %d nodes needed in %s", bestOption.NodeCount, bestOption.NodeGroup.Id()) + // Cap new nodes to supported number of nodes in the cluster. newNodes, aErr := o.GetCappedNewNodeCount(bestOption.NodeCount, len(nodes)+len(upcomingNodes)) if aErr != nil { return status.UpdateScaleUpError(&status.ScaleUpStatus{PodsTriggeredScaleUp: bestOption.Pods}, aErr) } + // Apply upper limits for resources in the cluster. + nodeInfo, found := nodeInfos[bestOption.NodeGroup.Id()] + if !found { + // This should never happen, as we already should have retrieved nodeInfo for any considered nodegroup. + klog.Errorf("No node info for: %s", bestOption.NodeGroup.Id()) + return status.UpdateScaleUpError( + &status.ScaleUpStatus{PodsTriggeredScaleUp: bestOption.Pods}, + errors.NewAutoscalerError( + errors.CloudProviderError, + "No node info for best expansion option!")) + } + newNodes, aErr = o.resourceManager.ApplyLimits(o.autoscalingContext, newNodes, resourcesLeft, nodeInfo, bestOption.NodeGroup) + if aErr != nil { + return status.UpdateScaleUpError( + &status.ScaleUpStatus{PodsTriggeredScaleUp: bestOption.Pods}, + aErr) + } + + // If necessary, create the node group. This is no longer simulation, an empty node group will be created by cloud provider if supported. createNodeGroupResults := make([]nodegroups.CreateNodeGroupResult, 0) if !bestOption.NodeGroup.Exist() { var scaleUpStatus *status.ScaleUpStatus @@ -215,25 +235,7 @@ func (o *ScaleUpOrchestrator) ScaleUp( klog.V(2).Info("No similar node groups found") } - nodeInfo, found := nodeInfos[bestOption.NodeGroup.Id()] - if !found { - // This should never happen, as we already should have retrieved nodeInfo for any considered nodegroup. - klog.Errorf("No node info for: %s", bestOption.NodeGroup.Id()) - return status.UpdateScaleUpError( - &status.ScaleUpStatus{CreateNodeGroupResults: createNodeGroupResults, PodsTriggeredScaleUp: bestOption.Pods}, - errors.NewAutoscalerError( - errors.CloudProviderError, - "No node info for best expansion option!")) - } - - // Apply upper limits for CPU and memory. - newNodes, aErr = o.resourceManager.ApplyLimits(o.autoscalingContext, newNodes, resourcesLeft, nodeInfo, bestOption.NodeGroup) - if aErr != nil { - return status.UpdateScaleUpError( - &status.ScaleUpStatus{CreateNodeGroupResults: createNodeGroupResults, PodsTriggeredScaleUp: bestOption.Pods}, - aErr) - } - + // Balance between similar node groups. targetNodeGroups := []cloudprovider.NodeGroup{bestOption.NodeGroup} for _, ng := range bestOption.SimilarNodeGroups { targetNodeGroups = append(targetNodeGroups, ng) @@ -254,6 +256,7 @@ func (o *ScaleUpOrchestrator) ScaleUp( aErr) } + // Execute scale up. klog.V(1).Infof("Final scale-up plan: %v", scaleUpInfos) aErr, failedNodeGroups := o.scaleUpExecutor.ExecuteScaleUps(scaleUpInfos, nodeInfos, now) if aErr != nil { From a6ce449e5fdfd8f1629480342f00bec4f25c1703 Mon Sep 17 00:00:00 2001 From: Aleksandra Malinowska Date: Wed, 8 May 2024 13:48:14 +0200 Subject: [PATCH 2/2] Add unit test for max cluster limits preventing creation of an autoprovisioned group --- .../scaleup/orchestrator/orchestrator_test.go | 53 ++++++++++++++++++- cluster-autoscaler/core/test/common.go | 2 + 2 files changed, 54 insertions(+), 1 deletion(-) diff --git a/cluster-autoscaler/core/scaleup/orchestrator/orchestrator_test.go b/cluster-autoscaler/core/scaleup/orchestrator/orchestrator_test.go index 3972602a6c50..2c446dcf0f44 100644 --- a/cluster-autoscaler/core/scaleup/orchestrator/orchestrator_test.go +++ b/cluster-autoscaler/core/scaleup/orchestrator/orchestrator_test.go @@ -822,8 +822,48 @@ func TestNoScaleUpMaxCoresLimitHit(t *testing.T) { simpleNoScaleUpTest(t, config, results) } +func TestNoCreateNodeGroupMaxCoresLimitHit(t *testing.T) { + options := defaultOptions + options.MaxCoresTotal = 7 + options.MaxMemoryTotal = 100000 + options.NodeAutoprovisioningEnabled = true + + largeNode := BuildTestNode("n", 8000, 8000) + SetNodeReadyState(largeNode, true, time.Time{}) + largeNodeInfo := schedulerframework.NewNodeInfo() + largeNodeInfo.SetNode(largeNode) + + config := &ScaleUpTestConfig{ + EnableAutoprovisioning: true, + Nodes: []NodeConfig{ + {Name: "n1", Cpu: 2000, Memory: 1000, Gpu: 0, Ready: true, Group: "ng-small"}, + }, + Pods: []PodConfig{}, + ExtraPods: []PodConfig{ + {Name: "large-pod", Cpu: 8000, Memory: 0, Gpu: 0, Node: "", ToleratesGpu: false}, + }, + Options: &options, + NodeTemplateConfigs: map[string]*NodeTemplateConfig{ + "n1-standard-8": { + NodeGroupName: "autoprovisioned-n1-standard-8", + MachineType: "n1-standard-8", + NodeInfo: largeNodeInfo, + }, + }, + } + results := &ScaleTestResults{ + NoScaleUpReason: "Insufficient cpu", + ScaleUpStatus: ScaleUpStatusInfo{ + PodsRemainUnschedulable: []string{"large-pod"}, + }, + } + + simpleNoScaleUpTest(t, config, results) +} + func simpleScaleUpTest(t *testing.T, config *ScaleUpTestConfig, expectedResults *ScaleTestResults) { results := runSimpleScaleUpTest(t, config) + assert.NotNil(t, results.GroupSizeChanges, "Expected scale up event") assert.NotNil(t, results.GroupSizeChanges[0], "Expected scale up event") assert.Equal(t, expectedResults.FinalOption, results.GroupSizeChanges[0]) assert.True(t, results.ScaleUpStatus.WasSuccessful()) @@ -920,14 +960,21 @@ func runSimpleScaleUpTest(t *testing.T, config *ScaleUpTestConfig) *ScaleUpTestR } return nil } + onCreateGroupFunc := func(nodeGroup string) error { + if config.OnCreateGroup != nil { + return config.OnCreateGroup(nodeGroup) + } + return fmt.Errorf("unexpected node group create: OnCreateGroup not defined") + } if len(config.NodeTemplateConfigs) > 0 { machineTypes := []string{} machineTemplates := map[string]*schedulerframework.NodeInfo{} for _, ntc := range config.NodeTemplateConfigs { machineTypes = append(machineTypes, ntc.MachineType) machineTemplates[ntc.NodeGroupName] = ntc.NodeInfo + machineTemplates[ntc.MachineType] = ntc.NodeInfo } - provider = testprovider.NewTestAutoprovisioningCloudProvider(onScaleUpFunc, nil, nil, nil, machineTypes, machineTemplates) + provider = testprovider.NewTestAutoprovisioningCloudProvider(onScaleUpFunc, nil, onCreateGroupFunc, nil, machineTypes, machineTemplates) } else { provider = testprovider.NewTestCloudProvider(onScaleUpFunc, nil) } @@ -975,6 +1022,10 @@ func runSimpleScaleUpTest(t *testing.T, config *ScaleUpTestConfig) *ScaleUpTestR clusterState.UpdateNodes(nodes, nodeInfos, time.Now()) processors := NewTestProcessors(&context) processors.ScaleStateNotifier.Register(clusterState) + if config.EnableAutoprovisioning { + processors.NodeGroupListProcessor = &MockAutoprovisioningNodeGroupListProcessor{T: t} + processors.NodeGroupManager = &MockAutoprovisioningNodeGroupManager{T: t, ExtraGroups: 0} + } orchestrator := New() orchestrator.Initialize(&context, processors, clusterState, newEstimatorBuilder(), taints.TaintConfig{}) expander := NewMockRepotingStrategy(t, config.ExpansionOptionToChoose) diff --git a/cluster-autoscaler/core/test/common.go b/cluster-autoscaler/core/test/common.go index de7529d66bb2..ff9ef91a545c 100644 --- a/cluster-autoscaler/core/test/common.go +++ b/cluster-autoscaler/core/test/common.go @@ -122,9 +122,11 @@ type ScaleUpTestConfig struct { Pods []PodConfig ExtraPods []PodConfig OnScaleUp testcloudprovider.OnScaleUpFunc + OnCreateGroup testcloudprovider.OnNodeGroupCreateFunc ExpansionOptionToChoose *GroupSizeChange Options *config.AutoscalingOptions NodeTemplateConfigs map[string]*NodeTemplateConfig + EnableAutoprovisioning bool } // ScaleUpTestResult represents a node groups scale up result