[cluster-autoscaler-release-1.30] Refactor scale-up to apply resource limits before creating a node group #6931

Merged
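In short, this backport reorders `ScaleUp` so that cluster-wide resource limits (`GetCappedNewNodeCount` followed by `ResourceManager.ApplyLimits`) are applied before an autoprovisioned node group is created, rather than after. The toy sketch below is not the autoscaler's actual code; apart from the option name `MaxCoresTotal` and the general shape of `ApplyLimits`, every name in it is made up for illustration. It only shows why the ordering matters: if the limits reject the scale-up, no node group gets created at all.

```go
package main

import (
	"errors"
	"fmt"
)

// Toy model of the reordered flow: cluster-wide resource limits are applied
// to the requested node count *before* any node group is created, so a
// rejected scale-up never leaves an empty autoprovisioned group behind.

type limits struct {
	maxCores     int64 // cluster-wide core cap (MaxCoresTotal)
	usedCores    int64 // cores already present in the cluster
	coresPerNode int64 // cores each new node would add
}

// applyLimits caps newNodes to what the remaining core budget allows,
// loosely mirroring what ResourceManager.ApplyLimits does for CPU.
func applyLimits(l limits, newNodes int64) (int64, error) {
	budget := (l.maxCores - l.usedCores) / l.coresPerNode
	if budget <= 0 {
		return 0, errors.New("max cores limit reached")
	}
	if newNodes > budget {
		newNodes = budget
	}
	return newNodes, nil
}

func main() {
	// Mirrors the new test scenario: 7-core cap, 2 cores in use, 8-core nodes.
	l := limits{maxCores: 7, usedCores: 2, coresPerNode: 8}

	newNodes, err := applyLimits(l, 1)
	if err != nil {
		// With the new ordering we stop here, before creating a node group.
		fmt.Println("scale-up rejected before node group creation:", err)
		return
	}
	// Only reached when the limits allow the scale-up.
	fmt.Println("creating node group and scaling up by", newNodes, "node(s)")
}
```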
41 changes: 22 additions & 19 deletions cluster-autoscaler/core/scaleup/orchestrator/orchestrator.go
@@ -187,11 +187,31 @@ func (o *ScaleUpOrchestrator) ScaleUp(
}
klog.V(1).Infof("Estimated %d nodes needed in %s", bestOption.NodeCount, bestOption.NodeGroup.Id())

// Cap new nodes to supported number of nodes in the cluster.
newNodes, aErr := o.GetCappedNewNodeCount(bestOption.NodeCount, len(nodes)+len(upcomingNodes))
if aErr != nil {
return status.UpdateScaleUpError(&status.ScaleUpStatus{PodsTriggeredScaleUp: bestOption.Pods}, aErr)
}

// Apply upper limits for resources in the cluster.
nodeInfo, found := nodeInfos[bestOption.NodeGroup.Id()]
if !found {
// This should never happen, as we already should have retrieved nodeInfo for any considered nodegroup.
klog.Errorf("No node info for: %s", bestOption.NodeGroup.Id())
return status.UpdateScaleUpError(
&status.ScaleUpStatus{PodsTriggeredScaleUp: bestOption.Pods},
errors.NewAutoscalerError(
errors.CloudProviderError,
"No node info for best expansion option!"))
}
newNodes, aErr = o.resourceManager.ApplyLimits(o.autoscalingContext, newNodes, resourcesLeft, nodeInfo, bestOption.NodeGroup)
if aErr != nil {
return status.UpdateScaleUpError(
&status.ScaleUpStatus{PodsTriggeredScaleUp: bestOption.Pods},
aErr)
}

// If necessary, create the node group. This is no longer simulation, an empty node group will be created by cloud provider if supported.
createNodeGroupResults := make([]nodegroups.CreateNodeGroupResult, 0)
if !bestOption.NodeGroup.Exist() {
var scaleUpStatus *status.ScaleUpStatus
@@ -215,25 +235,7 @@ func (o *ScaleUpOrchestrator) ScaleUp(
klog.V(2).Info("No similar node groups found")
}

nodeInfo, found := nodeInfos[bestOption.NodeGroup.Id()]
if !found {
// This should never happen, as we already should have retrieved nodeInfo for any considered nodegroup.
klog.Errorf("No node info for: %s", bestOption.NodeGroup.Id())
return status.UpdateScaleUpError(
&status.ScaleUpStatus{CreateNodeGroupResults: createNodeGroupResults, PodsTriggeredScaleUp: bestOption.Pods},
errors.NewAutoscalerError(
errors.CloudProviderError,
"No node info for best expansion option!"))
}

// Apply upper limits for CPU and memory.
newNodes, aErr = o.resourceManager.ApplyLimits(o.autoscalingContext, newNodes, resourcesLeft, nodeInfo, bestOption.NodeGroup)
if aErr != nil {
return status.UpdateScaleUpError(
&status.ScaleUpStatus{CreateNodeGroupResults: createNodeGroupResults, PodsTriggeredScaleUp: bestOption.Pods},
aErr)
}

// Balance between similar node groups.
targetNodeGroups := []cloudprovider.NodeGroup{bestOption.NodeGroup}
for _, ng := range bestOption.SimilarNodeGroups {
targetNodeGroups = append(targetNodeGroups, ng)
@@ -254,6 +256,7 @@ func (o *ScaleUpOrchestrator) ScaleUp(
aErr)
}

// Execute scale up.
klog.V(1).Infof("Final scale-up plan: %v", scaleUpInfos)
aErr, failedNodeGroups := o.scaleUpExecutor.ExecuteScaleUps(scaleUpInfos, nodeInfos, now)
if aErr != nil {
@@ -822,8 +822,48 @@ func TestNoScaleUpMaxCoresLimitHit(t *testing.T) {
simpleNoScaleUpTest(t, config, results)
}

func TestNoCreateNodeGroupMaxCoresLimitHit(t *testing.T) {
options := defaultOptions
options.MaxCoresTotal = 7
options.MaxMemoryTotal = 100000
options.NodeAutoprovisioningEnabled = true

largeNode := BuildTestNode("n", 8000, 8000)
SetNodeReadyState(largeNode, true, time.Time{})
largeNodeInfo := schedulerframework.NewNodeInfo()
largeNodeInfo.SetNode(largeNode)

config := &ScaleUpTestConfig{
EnableAutoprovisioning: true,
Nodes: []NodeConfig{
{Name: "n1", Cpu: 2000, Memory: 1000, Gpu: 0, Ready: true, Group: "ng-small"},
},
Pods: []PodConfig{},
ExtraPods: []PodConfig{
{Name: "large-pod", Cpu: 8000, Memory: 0, Gpu: 0, Node: "", ToleratesGpu: false},
},
Options: &options,
NodeTemplateConfigs: map[string]*NodeTemplateConfig{
"n1-standard-8": {
NodeGroupName: "autoprovisioned-n1-standard-8",
MachineType: "n1-standard-8",
NodeInfo: largeNodeInfo,
},
},
}
results := &ScaleTestResults{
NoScaleUpReason: "Insufficient cpu",
ScaleUpStatus: ScaleUpStatusInfo{
PodsRemainUnschedulable: []string{"large-pod"},
},
}

simpleNoScaleUpTest(t, config, results)
}

func simpleScaleUpTest(t *testing.T, config *ScaleUpTestConfig, expectedResults *ScaleTestResults) {
results := runSimpleScaleUpTest(t, config)
assert.NotNil(t, results.GroupSizeChanges, "Expected scale up event")
assert.NotNil(t, results.GroupSizeChanges[0], "Expected scale up event")
assert.Equal(t, expectedResults.FinalOption, results.GroupSizeChanges[0])
assert.True(t, results.ScaleUpStatus.WasSuccessful())
@@ -920,14 +960,21 @@ func runSimpleScaleUpTest(t *testing.T, config *ScaleUpTestConfig) *ScaleUpTestR
}
return nil
}
onCreateGroupFunc := func(nodeGroup string) error {
if config.OnCreateGroup != nil {
return config.OnCreateGroup(nodeGroup)
}
return fmt.Errorf("unexpected node group create: OnCreateGroup not defined")
}
if len(config.NodeTemplateConfigs) > 0 {
machineTypes := []string{}
machineTemplates := map[string]*schedulerframework.NodeInfo{}
for _, ntc := range config.NodeTemplateConfigs {
machineTypes = append(machineTypes, ntc.MachineType)
machineTemplates[ntc.NodeGroupName] = ntc.NodeInfo
machineTemplates[ntc.MachineType] = ntc.NodeInfo
}
provider = testprovider.NewTestAutoprovisioningCloudProvider(onScaleUpFunc, nil, nil, nil, machineTypes, machineTemplates)
provider = testprovider.NewTestAutoprovisioningCloudProvider(onScaleUpFunc, nil, onCreateGroupFunc, nil, machineTypes, machineTemplates)
} else {
provider = testprovider.NewTestCloudProvider(onScaleUpFunc, nil)
}
@@ -975,6 +1022,10 @@ func runSimpleScaleUpTest(t *testing.T, config *ScaleUpTestConfig) *ScaleUpTestR
clusterState.UpdateNodes(nodes, nodeInfos, time.Now())
processors := NewTestProcessors(&context)
processors.ScaleStateNotifier.Register(clusterState)
if config.EnableAutoprovisioning {
processors.NodeGroupListProcessor = &MockAutoprovisioningNodeGroupListProcessor{T: t}
processors.NodeGroupManager = &MockAutoprovisioningNodeGroupManager{T: t, ExtraGroups: 0}
}
orchestrator := New()
orchestrator.Initialize(&context, processors, clusterState, newEstimatorBuilder(), taints.TaintConfig{})
expander := NewMockRepotingStrategy(t, config.ExpansionOptionToChoose)
2 changes: 2 additions & 0 deletions cluster-autoscaler/core/test/common.go
@@ -122,9 +122,11 @@ type ScaleUpTestConfig struct {
Pods []PodConfig
ExtraPods []PodConfig
OnScaleUp testcloudprovider.OnScaleUpFunc
OnCreateGroup testcloudprovider.OnNodeGroupCreateFunc
ExpansionOptionToChoose *GroupSizeChange
Options *config.AutoscalingOptions
NodeTemplateConfigs map[string]*NodeTemplateConfig
EnableAutoprovisioning bool
}

// ScaleUpTestResult represents a node groups scale up result