Merge pull request #5826 from hbostan/master
Add support for scaling up with ZeroToMaxNodesScaling option
k8s-ci-robot committed Jun 30, 2023
2 parents c887626 + 333a028 commit e6397c6
Showing 11 changed files with 345 additions and 21 deletions.
@@ -443,7 +443,7 @@ func (tng *TestNodeGroup) DeleteNodes(nodes []*apiv1.Node) error {
id := tng.id
tng.targetSize -= len(nodes)
tng.Unlock()
-if tng.opts != nil && tng.opts.AtomicScaling && tng.targetSize != 0 {
+if tng.opts != nil && tng.opts.ZeroOrMaxNodeScaling && tng.targetSize != 0 {
return fmt.Errorf("TestNodeGroup: attempted to partially scale down a node group that should be scaled down atomically")
}
for _, node := range nodes {
4 changes: 2 additions & 2 deletions cluster-autoscaler/config/autoscaling_options.go
@@ -46,8 +46,8 @@ type NodeGroupAutoscalingOptions struct {
ScaleDownUnreadyTime time.Duration
// Maximum time CA waits for node to be provisioned
MaxNodeProvisionTime time.Duration
-// AtomicScaling means that all nodes should be provisioned and brought down all at once instead of one-by-one
-AtomicScaling bool
+// ZeroOrMaxNodeScaling means that a node group should be scaled up to maximum size or down to zero nodes all at once instead of one-by-one.
+ZeroOrMaxNodeScaling bool
}

// GCEOptions contain autoscaling options specific to GCE cloud provider.
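For context, a minimal sketch (not part of this diff) of how the renamed option can be set on a node group's per-group defaults. Only the field names come from the struct above; the duration values are made up for illustration.

```go
package example

import (
	"time"

	"k8s.io/autoscaler/cluster-autoscaler/config"
)

// zeroOrMaxOptions is a hypothetical per-node-group configuration with the
// renamed flag enabled; ZeroOrMaxNodeScaling replaces the old AtomicScaling
// field one-to-one, and the durations here are illustrative, not defaults.
var zeroOrMaxOptions = config.NodeGroupAutoscalingOptions{
	ScaleDownUnreadyTime: 20 * time.Minute,
	MaxNodeProvisionTime: 15 * time.Minute,
	ZeroOrMaxNodeScaling: true, // scale this group 0 -> MaxSize and MaxSize -> 0 only
}
```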
@@ -1103,7 +1103,7 @@ func TestStartDeletionInBatchBasic(t *testing.T) {
func sizedNodeGroup(id string, size int, atomic bool) cloudprovider.NodeGroup {
ng := testprovider.NewTestNodeGroup(id, 10000, 0, size, true, false, "n1-standard-2", nil, nil)
ng.SetOptions(&config.NodeGroupAutoscalingOptions{
-AtomicScaling: atomic,
+ZeroOrMaxNodeScaling: atomic,
})
return ng
}
@@ -79,7 +79,7 @@ func (ds *GroupDeletionScheduler) ScheduleDeletion(nodeInfo *framework.NodeInfo,
return
}

-ds.addToBatcher(nodeInfo, nodeGroup, batchSize, drain, opts.AtomicScaling)
+ds.addToBatcher(nodeInfo, nodeGroup, batchSize, drain, opts.ZeroOrMaxNodeScaling)
}

// prepareNodeForDeletion is a long-running operation, so it needs to avoid locking the AtomicDeletionScheduler object
4 changes: 2 additions & 2 deletions cluster-autoscaler/core/scaledown/budgets/budgets.go
@@ -173,7 +173,7 @@ func (bp *ScaleDownBudgetProcessor) categorize(groups []*NodeGroupView) (individ
klog.Errorf("Failed to get autoscaling options for node group %s: %v", view.Group.Id(), err)
continue
}
-if autoscalingOptions != nil && autoscalingOptions.AtomicScaling {
+if autoscalingOptions != nil && autoscalingOptions.ZeroOrMaxNodeScaling {
atomic = append(atomic, view)
} else {
individual = append(individual, view)
@@ -196,7 +196,7 @@ func (bp *ScaleDownBudgetProcessor) groupByNodeGroup(nodes []*apiv1.Node) (indiv
klog.Errorf("Failed to get autoscaling options for node group %s: %v", nodeGroup.Id(), err)
continue
}
-if autoscalingOptions != nil && autoscalingOptions.AtomicScaling {
+if autoscalingOptions != nil && autoscalingOptions.ZeroOrMaxNodeScaling {
if idx, ok := atomicGroup[nodeGroup]; ok {
atomic[idx].Nodes = append(atomic[idx].Nodes, node)
} else {
2 changes: 1 addition & 1 deletion cluster-autoscaler/core/scaledown/budgets/budgets_test.go
@@ -361,7 +361,7 @@ var transformNodeGroupView = cmp.Transformer("transformNodeGroupView", func(b No
func sizedNodeGroup(id string, size int, atomic bool) cloudprovider.NodeGroup {
ng := testprovider.NewTestNodeGroup(id, 10000, 0, size, true, false, "n1-standard-2", nil, nil)
ng.SetOptions(&config.NodeGroupAutoscalingOptions{
-AtomicScaling: atomic,
+ZeroOrMaxNodeScaling: atomic,
})
return ng
}
2 changes: 1 addition & 1 deletion cluster-autoscaler/core/scaledown/planner/planner_test.go
@@ -795,7 +795,7 @@ func TestNodesToDelete(t *testing.T) {
func sizedNodeGroup(id string, size int, atomic bool) cloudprovider.NodeGroup {
ng := testprovider.NewTestNodeGroup(id, 10000, 0, size, true, false, "n1-standard-2", nil, nil)
ng.SetOptions(&config.NodeGroupAutoscalingOptions{
-AtomicScaling: atomic,
+ZeroOrMaxNodeScaling: atomic,
})
return ng
}
44 changes: 40 additions & 4 deletions cluster-autoscaler/core/scaleup/orchestrator/orchestrator.go
@@ -130,7 +130,7 @@ func (o *ScaleUpOrchestrator) ScaleUp(
now := time.Now()

// Filter out invalid node groups
-validNodeGroups, skippedNodeGroups := o.filterValidScaleUpNodeGroups(nodeGroups, nodeInfos, resourcesLeft, now)
+validNodeGroups, skippedNodeGroups := o.filterValidScaleUpNodeGroups(nodeGroups, nodeInfos, resourcesLeft, len(nodes)+len(upcomingNodes), now)

// Mark skipped node groups as processed.
for nodegroupID := range skippedNodeGroups {
@@ -362,7 +362,7 @@ func (o *ScaleUpOrchestrator) ScaleUpToNodeGroupMinSize(
continue
}

-if skipReason := o.IsNodeGroupResourceExceeded(resourcesLeft, ng, nodeInfo); skipReason != nil {
+if skipReason := o.IsNodeGroupResourceExceeded(resourcesLeft, ng, nodeInfo, 1); skipReason != nil {
klog.Warning("ScaleUpToNodeGroupMinSize: node group resource excceded: %v", skipReason)
continue
}
@@ -418,6 +418,7 @@ func (o *ScaleUpOrchestrator) filterValidScaleUpNodeGroups(
nodeGroups []cloudprovider.NodeGroup,
nodeInfos map[string]*schedulerframework.NodeInfo,
resourcesLeft resource.Limits,
+currentNodeCount int,
now time.Time,
) ([]cloudprovider.NodeGroup, map[string]status.Reasons) {
var validNodeGroups []cloudprovider.NodeGroup
@@ -440,14 +441,27 @@ func (o *ScaleUpOrchestrator) filterValidScaleUpNodeGroups(
skippedNodeGroups[nodeGroup.Id()] = MaxLimitReachedReason
continue
}
+autoscalingOptions, err := nodeGroup.GetOptions(o.autoscalingContext.NodeGroupDefaults)
+if err != nil {
+klog.Errorf("Couldn't get autoscaling options for ng: %v", nodeGroup.Id())
+}
+numNodes := 1
+if autoscalingOptions != nil && autoscalingOptions.ZeroOrMaxNodeScaling {
+numNodes = nodeGroup.MaxSize() - currentTargetSize
+if o.autoscalingContext.MaxNodesTotal != 0 && currentNodeCount+numNodes > o.autoscalingContext.MaxNodesTotal {
+klog.V(4).Infof("Skipping node group %s - atomic scale-up exceeds cluster node count limit", nodeGroup.Id())
+skippedNodeGroups[nodeGroup.Id()] = NewSkippedReasons("atomic scale-up exceeds cluster node count limit")
+continue
+}
+}

nodeInfo, found := nodeInfos[nodeGroup.Id()]
if !found {
klog.Errorf("No node info for: %s", nodeGroup.Id())
skippedNodeGroups[nodeGroup.Id()] = NotReadyReason
continue
}
-if skipReason := o.IsNodeGroupResourceExceeded(resourcesLeft, nodeGroup, nodeInfo); skipReason != nil {
+if skipReason := o.IsNodeGroupResourceExceeded(resourcesLeft, nodeGroup, nodeInfo, numNodes); skipReason != nil {
skippedNodeGroups[nodeGroup.Id()] = skipReason
continue
}
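As an aside, a small self-contained sketch (not taken from this commit) of the sizing rule the block above adds: for a ZeroOrMaxNodeScaling group the scale-up delta is the whole remaining capacity of the group, and the group is skipped outright when that delta would push the cluster past MaxNodesTotal. The helper name and the numbers below are illustrative.

```go
package main

import "fmt"

// atomicScaleUpDelta mirrors the logic added above for ZeroOrMaxNodeScaling
// groups: the candidate delta is MaxSize minus the current target size, and
// the group is skipped when that delta would exceed the cluster-wide cap.
// Hypothetical helper for illustration; not part of the autoscaler API.
func atomicScaleUpDelta(maxSize, currentTargetSize, currentNodeCount, maxNodesTotal int) (int, bool) {
	numNodes := maxSize - currentTargetSize
	if maxNodesTotal != 0 && currentNodeCount+numNodes > maxNodesTotal {
		return 0, false // skipped: "atomic scale-up exceeds cluster node count limit"
	}
	return numNodes, true
}

func main() {
	fmt.Println(atomicScaleUpDelta(10, 0, 95, 100)) // 0 false: 95+10 would exceed a 100-node cap
	fmt.Println(atomicScaleUpDelta(10, 0, 80, 100)) // 10 true: the whole group can be provisioned
}
```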
@@ -476,6 +490,16 @@ func (o *ScaleUpOrchestrator) ComputeExpansionOption(
estimator := o.autoscalingContext.EstimatorBuilder(o.autoscalingContext.PredicateChecker, o.autoscalingContext.ClusterSnapshot)
option.NodeCount, option.Pods = estimator.Estimate(pods, nodeInfo, nodeGroup)
option.SimilarNodeGroups = o.ComputeSimilarNodeGroups(nodeGroup, nodeInfos, schedulablePods, now)

+autoscalingOptions, err := nodeGroup.GetOptions(o.autoscalingContext.NodeGroupDefaults)
+if err != nil {
+klog.Errorf("Failed to get autoscaling options for node group %s: %v", nodeGroup.Id(), err)
+}
+if autoscalingOptions != nil && autoscalingOptions.ZeroOrMaxNodeScaling {
+if option.NodeCount > 0 && option.NodeCount != nodeGroup.MaxSize() {
+option.NodeCount = nodeGroup.MaxSize()
+}
+}
return option
}
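A brief illustration of the hunk above: when the estimator asks for fewer nodes than the group can hold, an atomic group's expansion option is still bumped to the group's maximum. A hedged sketch with made-up numbers; the helper name is hypothetical and the real logic lives in ComputeExpansionOption.

```go
package main

import "fmt"

// roundUpForAtomicGroup shows the adjustment in isolation: a non-zero
// estimate for a ZeroOrMaxNodeScaling group is raised to the group's MaxSize.
func roundUpForAtomicGroup(estimated, maxSize int, zeroOrMax bool) int {
	if zeroOrMax && estimated > 0 && estimated != maxSize {
		return maxSize
	}
	return estimated
}

func main() {
	fmt.Println(roundUpForAtomicGroup(3, 10, true))  // 10: atomic group scales to max
	fmt.Println(roundUpForAtomicGroup(3, 10, false)) // 3: regular group keeps the estimate
}
```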

@@ -551,13 +575,17 @@ func (o *ScaleUpOrchestrator) IsNodeGroupReadyToScaleUp(nodeGroup cloudprovider.
}

// IsNodeGroupResourceExceeded returns nil if node group resource limits are not exceeded, otherwise a reason is provided.
-func (o *ScaleUpOrchestrator) IsNodeGroupResourceExceeded(resourcesLeft resource.Limits, nodeGroup cloudprovider.NodeGroup, nodeInfo *schedulerframework.NodeInfo) status.Reasons {
+func (o *ScaleUpOrchestrator) IsNodeGroupResourceExceeded(resourcesLeft resource.Limits, nodeGroup cloudprovider.NodeGroup, nodeInfo *schedulerframework.NodeInfo, numNodes int) status.Reasons {
resourcesDelta, err := o.resourceManager.DeltaForNode(o.autoscalingContext, nodeInfo, nodeGroup)
if err != nil {
klog.Errorf("Skipping node group %s; error getting node group resources: %v", nodeGroup.Id(), err)
return NotReadyReason
}

+for resource, delta := range resourcesDelta {
+resourcesDelta[resource] = delta * int64(numNodes)
+}

checkResult := resource.CheckDeltaWithinLimits(resourcesLeft, resourcesDelta)
if checkResult.Exceeded {
klog.V(4).Infof("Skipping node group %s; maximal limit exceeded for %v", nodeGroup.Id(), checkResult.ExceededResources)
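For the resource check above, a simplified sketch of why the per-node delta is multiplied by numNodes: an atomic group is accepted or rejected as a whole batch against the remaining limits, rather than node by node. The types below are stand-ins, not the real resource package.

```go
package main

import "fmt"

// exceededResources multiplies a per-node resource delta by the batch size
// and reports which limits would be crossed. Simplified stand-in types; the
// autoscaler uses resource.Limits and CheckDeltaWithinLimits instead.
func exceededResources(perNodeDelta, left map[string]int64, numNodes int) []string {
	var exceeded []string
	for name, delta := range perNodeDelta {
		if limit, ok := left[name]; ok && delta*int64(numNodes) > limit {
			exceeded = append(exceeded, name)
		}
	}
	return exceeded
}

func main() {
	perNode := map[string]int64{"cpu": 4, "memory": 16}
	left := map[string]int64{"cpu": 30, "memory": 200}
	fmt.Println(exceededResources(perNode, left, 1))  // []: a single node fits
	fmt.Println(exceededResources(perNode, left, 10)) // [cpu]: a 10-node atomic batch needs 40 CPUs
}
```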
@@ -601,6 +629,14 @@ func (o *ScaleUpOrchestrator) ComputeSimilarNodeGroups(
return nil
}

+autoscalingOptions, err := nodeGroup.GetOptions(o.autoscalingContext.NodeGroupDefaults)
+if err != nil {
+klog.Errorf("Failed to get autoscaling options for node group %s: %v", nodeGroup.Id(), err)
+}
+if autoscalingOptions != nil && autoscalingOptions.ZeroOrMaxNodeScaling {
+return nil
+}

groupSchedulablePods, found := schedulablePods[nodeGroup.Id()]
if !found || len(groupSchedulablePods) == 0 {
return nil