Skip to content

Commit

Permalink
feat: set IgnoreDaemonSetsUtilization per nodegroup
Browse files Browse the repository at this point in the history
Signed-off-by: vadasambar <surajrbanakar@gmail.com>

fix: test cases failing for actuator and scaledown/eligibility
- abstract default values into `config`
Signed-off-by: vadasambar <surajrbanakar@gmail.com>

refactor: rename global `IgnoreDaemonSetsUtilization` -> `GlobalIgnoreDaemonSetsUtilization` in code
- there is no change in the flag name
- rename `thresholdGetter` -> `configGetter` and tweak it to accomodate `GetIgnoreDaemonSetsUtilization`
Signed-off-by: vadasambar <surajrbanakar@gmail.com>

refactor: reset help text for `ignore-daemonsets-utilization` flag
- because per nodegroup override is supported only for AWS ASG tags as of now
Signed-off-by: vadasambar <surajrbanakar@gmail.com>

docs: add info about overriding `--ignore-daemonsets-utilization` per ASG
- in AWS cloud provider README
Signed-off-by: vadasambar <surajrbanakar@gmail.com>

refactor: use a limiting interface in actuator in place of `NodeGroupConfigProcessor` interface
- to limit the functions that can be used
- since we need it only for `GetIgnoreDaemonSetsUtilization`
Signed-off-by: vadasambar <surajrbanakar@gmail.com>

fix: tests failing for actuator
- rename `staticNodeGroupConfigProcessor` -> `MockNodeGroupConfigGetter`
- move `MockNodeGroupConfigGetter` to test/common so that it can be used in different tests
Signed-off-by: vadasambar <surajrbanakar@gmail.com>

fix: go lint errors for `MockNodeGroupConfigGetter`
Signed-off-by: vadasambar <surajrbanakar@gmail.com>

test: add tests for `IgnoreDaemonSetsUtilization` in cloud provider dir
Signed-off-by: vadasambar <surajrbanakar@gmail.com>

test: update node group config processor tests for `IgnoreDaemonSetsUtilization`
Signed-off-by: vadasambar <surajrbanakar@gmail.com>

test: update eligibility test cases for `IgnoreDaemonSetsUtilization`
Signed-off-by: vadasambar <surajrbanakar@gmail.com>

test: run actuation tests for 2 NGS
- one with `IgnoreDaemonSetsUtilization`: `false`
- one with `IgnoreDaemonSetsUtilization`: `true`
Signed-off-by: vadasambar <surajrbanakar@gmail.com>

test: add tests for `IgnoreDaemonSetsUtilization` in actuator
- add helper to generate multiple ds pods dynamically
- get rid of mock config processor because it is not required
Signed-off-by: vadasambar <surajrbanakar@gmail.com>
  • Loading branch information
vadasambar committed Apr 20, 2023
1 parent 299c963 commit 4400d80
Show file tree
Hide file tree
Showing 16 changed files with 636 additions and 332 deletions.
2 changes: 2 additions & 0 deletions cluster-autoscaler/cloudprovider/aws/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -246,6 +246,8 @@ as string). Currently supported autoscaling options (and example values) are:
(overrides `--scale-down-unneeded-time` value for that specific ASG)
* `k8s.io/cluster-autoscaler/node-template/autoscaling-options/scaledownunreadytime`: `20m0s`
(overrides `--scale-down-unready-time` value for that specific ASG)
* `k8s.io/cluster-autoscaler/node-template/autoscaling-options/ignoredaemonsetsutilization`: `true`
(overrides `--ignore-daemonsets-utilization` value for that specific ASG)

**NOTE:** It is your responsibility to ensure such labels and/or taints are
applied via the node's kubelet configuration at startup. Cluster Autoscaler will not set the node taints for you.
Expand Down
9 changes: 9 additions & 0 deletions cluster-autoscaler/cloudprovider/aws/aws_manager.go
Original file line number Diff line number Diff line change
Expand Up @@ -245,6 +245,15 @@ func (m *AwsManager) GetAsgOptions(asg asg, defaults config.NodeGroupAutoscaling
}
}

if stringOpt, found := options[config.DefaultIgnoreDaemonSetsUtilizationKey]; found {
if opt, err := strconv.ParseBool(stringOpt); err != nil {
klog.Warningf("failed to convert asg %s %s tag to bool: %v",
asg.Name, config.DefaultScaleDownUnreadyTimeKey, err)
} else {
defaults.IgnoreDaemonSetsUtilization = opt
}
}

return &defaults
}

Expand Down
42 changes: 32 additions & 10 deletions cluster-autoscaler/cloudprovider/aws/aws_manager_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -109,6 +109,7 @@ func TestGetAsgOptions(t *testing.T) {
ScaleDownGpuUtilizationThreshold: 0.2,
ScaleDownUnneededTime: time.Second,
ScaleDownUnreadyTime: time.Minute,
IgnoreDaemonSetsUtilization: false,
}

tests := []struct {
Expand All @@ -124,39 +125,60 @@ func TestGetAsgOptions(t *testing.T) {
{
description: "keep defaults on invalid tags values",
tags: map[string]string{
"scaledownutilizationthreshold": "not-a-float",
"scaledownunneededtime": "not-a-duration",
"ScaleDownUnreadyTime": "",
config.DefaultScaleDownUtilizationThresholdKey: "not-a-float",
config.DefaultScaleDownUnneededTimeKey: "not-a-duration",
"ScaleDownUnreadyTime": "",
config.DefaultIgnoreDaemonSetsUtilizationKey: "not-a-bool",
},
expected: &defaultOptions,
},
{
description: "use provided tags and fill missing with defaults",
tags: map[string]string{
"scaledownutilizationthreshold": "0.42",
"scaledownunneededtime": "1h",
config.DefaultScaleDownUtilizationThresholdKey: "0.42",
config.DefaultScaleDownUnneededTimeKey: "1h",
config.DefaultIgnoreDaemonSetsUtilizationKey: "true",
},
expected: &config.NodeGroupAutoscalingOptions{
ScaleDownUtilizationThreshold: 0.42,
ScaleDownGpuUtilizationThreshold: defaultOptions.ScaleDownGpuUtilizationThreshold,
ScaleDownUnneededTime: time.Hour,
ScaleDownUnreadyTime: defaultOptions.ScaleDownUnreadyTime,
IgnoreDaemonSetsUtilization: true,
},
},
{
description: "use provided tags (happy path)",
tags: map[string]string{
config.DefaultScaleDownUtilizationThresholdKey: "0.42",
config.DefaultScaleDownUnneededTimeKey: "1h",
config.DefaultScaleDownGpuUtilizationThresholdKey: "0.7",
config.DefaultScaleDownUnreadyTimeKey: "25m",
config.DefaultIgnoreDaemonSetsUtilizationKey: "true",
},
expected: &config.NodeGroupAutoscalingOptions{
ScaleDownUtilizationThreshold: 0.42,
ScaleDownGpuUtilizationThreshold: 0.7,
ScaleDownUnneededTime: time.Hour,
ScaleDownUnreadyTime: 25 * time.Minute,
IgnoreDaemonSetsUtilization: true,
},
},
{
description: "ignore unknown tags",
tags: map[string]string{
"scaledownutilizationthreshold": "0.6",
"scaledowngpuutilizationthreshold": "0.7",
"scaledownunneededtime": "1m",
"scaledownunreadytime": "1h",
"notyetspecified": "42",
config.DefaultScaleDownUtilizationThresholdKey: "0.6",
config.DefaultScaleDownGpuUtilizationThresholdKey: "0.7",
config.DefaultScaleDownUnneededTimeKey: "1m",
config.DefaultScaleDownUnreadyTimeKey: "1h",
"notyetspecified": "42",
},
expected: &config.NodeGroupAutoscalingOptions{
ScaleDownUtilizationThreshold: 0.6,
ScaleDownGpuUtilizationThreshold: 0.7,
ScaleDownUnneededTime: time.Minute,
ScaleDownUnreadyTime: time.Hour,
IgnoreDaemonSetsUtilization: false,
},
},
}
Expand Down
7 changes: 5 additions & 2 deletions cluster-autoscaler/config/autoscaling_options.go
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,8 @@ type NodeGroupAutoscalingOptions struct {
ScaleDownUnreadyTime time.Duration
// Maximum time CA waits for node to be provisioned
MaxNodeProvisionTime time.Duration
// IgnoreDaemonSetsUtilization sets if daemonsets utilization should be considered during node scale-down
IgnoreDaemonSetsUtilization bool
}

// GCEOptions contain autoscaling options specific to GCE cloud provider.
Expand Down Expand Up @@ -115,8 +117,9 @@ type AutoscalingOptions struct {
GRPCExpanderCert string
// GRPCExpanderURL is the url of the gRPC server when using the gRPC expander
GRPCExpanderURL string
// IgnoreDaemonSetsUtilization is whether CA will ignore DaemonSet pods when calculating resource utilization for scaling down
IgnoreDaemonSetsUtilization bool
// GlobalIgnoreDaemonSetsUtilization is whether CA will ignore DaemonSet pods when calculating resource utilization for scaling down
// for the entire cluster (unless per nodegroup IgnoreDaemonSetsUtilization is set by the user)
GlobalIgnoreDaemonSetsUtilization bool
// IgnoreMirrorPodsUtilization is whether CA will ignore Mirror pods when calculating resource utilization for scaling down
IgnoreMirrorPodsUtilization bool
// MaxGracefulTerminationSec is maximum number of seconds scale down waits for pods to terminate before
Expand Down
12 changes: 12 additions & 0 deletions cluster-autoscaler/config/const.go
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,8 @@ limitations under the License.

package config

import "time"

const (
// DefaultMaxClusterCores is the default maximum number of cores in the cluster.
DefaultMaxClusterCores = 5000 * 64
Expand All @@ -32,4 +34,14 @@ const (
DefaultScaleDownUnreadyTimeKey = "scaledownunreadytime"
// DefaultMaxNodeProvisionTimeKey identifies MaxNodeProvisionTime autoscaling option
DefaultMaxNodeProvisionTimeKey = "maxnodeprovisiontime"
// DefaultIgnoreDaemonSetsUtilizationKey identifies IgnoreDaemonSetsUtilization autoscaling option
DefaultIgnoreDaemonSetsUtilizationKey = "ignoredaemonsetsutilization"
// DefaultScaleDownUnneededTime identifies ScaleDownUnneededTime autoscaling option
DefaultScaleDownUnneededTime = 10 * time.Minute
// DefaultScaleDownUnreadyTime identifies ScaleDownUnreadyTime autoscaling option
DefaultScaleDownUnreadyTime = 20 * time.Minute
// DefaultScaleDownUtilizationThreshold identifies ScaleDownUtilizationThreshold autoscaling option
DefaultScaleDownUtilizationThreshold = 0.5
// DefaultScaleDownGpuUtilizationThreshold identifies ScaleDownGpuUtilizationThreshold autoscaling option
DefaultScaleDownGpuUtilizationThreshold = 0.5
)
20 changes: 18 additions & 2 deletions cluster-autoscaler/core/scaledown/actuation/actuator.go
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@ import (
"k8s.io/autoscaler/cluster-autoscaler/core/scaledown/status"
"k8s.io/autoscaler/cluster-autoscaler/core/utils"
"k8s.io/autoscaler/cluster-autoscaler/metrics"
"k8s.io/autoscaler/cluster-autoscaler/processors"
"k8s.io/autoscaler/cluster-autoscaler/simulator"
"k8s.io/autoscaler/cluster-autoscaler/simulator/clustersnapshot"
"k8s.io/autoscaler/cluster-autoscaler/simulator/utilization"
Expand All @@ -51,10 +52,18 @@ type Actuator struct {
nodeDeletionBatcher *NodeDeletionBatcher
evictor Evictor
deleteOptions simulator.NodeDeleteOptions
configGetter actuatorNodeGroupConfigGetter
}

// actuatorNodeGroupConfigGetter is an interface to limit the functions that can be used
// from NodeGroupConfigProcessor interface
type actuatorNodeGroupConfigGetter interface {
// GetIgnoreDaemonSetsUtilization returns IgnoreDaemonSetsUtilization value that should be used for a given NodeGroup.
GetIgnoreDaemonSetsUtilization(context *context.AutoscalingContext, nodeGroup cloudprovider.NodeGroup) (bool, error)
}

// NewActuator returns a new instance of Actuator.
func NewActuator(ctx *context.AutoscalingContext, csr *clusterstate.ClusterStateRegistry, ndt *deletiontracker.NodeDeletionTracker, deleteOptions simulator.NodeDeleteOptions) *Actuator {
func NewActuator(ctx *context.AutoscalingContext, csr *clusterstate.ClusterStateRegistry, ndt *deletiontracker.NodeDeletionTracker, deleteOptions simulator.NodeDeleteOptions, processors *processors.AutoscalingProcessors) *Actuator {
nbd := NewNodeDeletionBatcher(ctx, csr, ndt, ctx.NodeDeletionBatcherInterval)
return &Actuator{
ctx: ctx,
Expand All @@ -63,6 +72,7 @@ func NewActuator(ctx *context.AutoscalingContext, csr *clusterstate.ClusterState
nodeDeletionBatcher: nbd,
evictor: NewDefaultEvictor(deleteOptions, ndt),
deleteOptions: deleteOptions,
configGetter: processors.NodeGroupConfigProcessor,
}
}

Expand Down Expand Up @@ -306,8 +316,14 @@ func (a *Actuator) scaleDownNodeToReport(node *apiv1.Node, drain bool) (*status.
if err != nil {
return nil, err
}

ignoreDaemonSetsUtilization, err := a.configGetter.GetIgnoreDaemonSetsUtilization(a.ctx, nodeGroup)
if err != nil {
return nil, err
}

gpuConfig := a.ctx.CloudProvider.GetNodeGpuConfig(node)
utilInfo, err := utilization.Calculate(nodeInfo, a.ctx.IgnoreDaemonSetsUtilization, a.ctx.IgnoreMirrorPodsUtilization, gpuConfig, time.Now())
utilInfo, err := utilization.Calculate(nodeInfo, ignoreDaemonSetsUtilization, a.ctx.IgnoreMirrorPodsUtilization, gpuConfig, time.Now())
if err != nil {
return nil, err
}
Expand Down
Loading

0 comments on commit 4400d80

Please sign in to comment.