diff --git a/pkg/scheduler/apis/config/types.go b/pkg/scheduler/apis/config/types.go
index 2545120a2..51d47eab3 100644
--- a/pkg/scheduler/apis/config/types.go
+++ b/pkg/scheduler/apis/config/types.go
@@ -37,6 +37,8 @@ type LoadAwareSchedulingArgs struct {
     // When NodeMetrics expired, the node is considered abnormal.
     // Default is 180 seconds.
     NodeMetricExpirationSeconds *int64
+    // EnableScheduleWhenNodeMetricsExpired indicates whether pods may still be scheduled onto nodes whose NodeMetrics have expired. Default is false.
+    EnableScheduleWhenNodeMetricsExpired *bool
     // ResourceWeights indicates the weights of resources.
     // The weights of CPU and Memory are both 1 by default.
     ResourceWeights map[corev1.ResourceName]int64
diff --git a/pkg/scheduler/apis/config/v1beta3/defaults.go b/pkg/scheduler/apis/config/v1beta3/defaults.go
index fbf0d246a..3d5ad10c5 100644
--- a/pkg/scheduler/apis/config/v1beta3/defaults.go
+++ b/pkg/scheduler/apis/config/v1beta3/defaults.go
@@ -78,6 +78,9 @@ func SetDefaults_LoadAwareSchedulingArgs(obj *LoadAwareSchedulingArgs) {
     if obj.FilterExpiredNodeMetrics == nil {
         obj.FilterExpiredNodeMetrics = pointer.Bool(true)
     }
+    if obj.EnableScheduleWhenNodeMetricsExpired == nil {
+        obj.EnableScheduleWhenNodeMetricsExpired = pointer.Bool(false)
+    }
     if obj.NodeMetricExpirationSeconds == nil {
         obj.NodeMetricExpirationSeconds = pointer.Int64(defaultNodeMetricExpirationSeconds)
    }
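Note for reviewers: taken together, the two hunks above make the new knob strictly opt-in. An unset field is defaulted to false, which, per the Filter change later in this diff, makes an expired NodeMetric reject the node instead of passing it through. A minimal sketch of the defaulting behavior, using trimmed stand-in types rather than the real v1beta3 structs:

```go
package main

import (
	"fmt"

	"k8s.io/utils/pointer"
)

// loadAwareArgs is a trimmed stand-in for v1beta3.LoadAwareSchedulingArgs,
// keeping only the fields this patch touches.
type loadAwareArgs struct {
	FilterExpiredNodeMetrics             *bool
	NodeMetricExpirationSeconds          *int64
	EnableScheduleWhenNodeMetricsExpired *bool
}

// setDefaults mirrors SetDefaults_LoadAwareSchedulingArgs for those fields.
func setDefaults(obj *loadAwareArgs) {
	if obj.FilterExpiredNodeMetrics == nil {
		obj.FilterExpiredNodeMetrics = pointer.Bool(true)
	}
	if obj.EnableScheduleWhenNodeMetricsExpired == nil {
		obj.EnableScheduleWhenNodeMetricsExpired = pointer.Bool(false)
	}
	if obj.NodeMetricExpirationSeconds == nil {
		obj.NodeMetricExpirationSeconds = pointer.Int64(180) // defaultNodeMetricExpirationSeconds
	}
}

func main() {
	var args loadAwareArgs
	setDefaults(&args)
	// Prints "false": unless a profile opts in, nodes whose NodeMetrics
	// have expired are now rejected by Filter rather than passed through.
	fmt.Println(*args.EnableScheduleWhenNodeMetricsExpired)
}
```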
diff --git a/pkg/scheduler/apis/config/v1beta3/types.go b/pkg/scheduler/apis/config/v1beta3/types.go
index b194264fb..513206196 100644
--- a/pkg/scheduler/apis/config/v1beta3/types.go
+++ b/pkg/scheduler/apis/config/v1beta3/types.go
@@ -36,6 +36,8 @@ type LoadAwareSchedulingArgs struct {
     // When NodeMetrics expired, the node is considered abnormal.
     // Default is 180 seconds.
     NodeMetricExpirationSeconds *int64 `json:"nodeMetricExpirationSeconds,omitempty"`
+    // EnableScheduleWhenNodeMetricsExpired indicates whether pods may still be scheduled onto nodes whose NodeMetrics have expired. Default is false.
+    EnableScheduleWhenNodeMetricsExpired *bool `json:"enableScheduleWhenNodeMetricsExpired,omitempty"`
     // ResourceWeights indicates the weights of resources.
     // The weights of CPU and Memory are both 1 by default.
     ResourceWeights map[corev1.ResourceName]int64 `json:"resourceWeights,omitempty"`
diff --git a/pkg/scheduler/apis/config/v1beta3/zz_generated.conversion.go b/pkg/scheduler/apis/config/v1beta3/zz_generated.conversion.go
index 1340a4e2b..3d64d59a3 100644
--- a/pkg/scheduler/apis/config/v1beta3/zz_generated.conversion.go
+++ b/pkg/scheduler/apis/config/v1beta3/zz_generated.conversion.go
@@ -277,6 +277,7 @@ func Convert_config_LoadAwareSchedulingAggregatedArgs_To_v1beta3_LoadAwareSchedu
 func autoConvert_v1beta3_LoadAwareSchedulingArgs_To_config_LoadAwareSchedulingArgs(in *LoadAwareSchedulingArgs, out *config.LoadAwareSchedulingArgs, s conversion.Scope) error {
     out.FilterExpiredNodeMetrics = (*bool)(unsafe.Pointer(in.FilterExpiredNodeMetrics))
     out.NodeMetricExpirationSeconds = (*int64)(unsafe.Pointer(in.NodeMetricExpirationSeconds))
+    out.EnableScheduleWhenNodeMetricsExpired = (*bool)(unsafe.Pointer(in.EnableScheduleWhenNodeMetricsExpired))
     out.ResourceWeights = *(*map[corev1.ResourceName]int64)(unsafe.Pointer(&in.ResourceWeights))
     out.UsageThresholds = *(*map[corev1.ResourceName]int64)(unsafe.Pointer(&in.UsageThresholds))
     out.ProdUsageThresholds = *(*map[corev1.ResourceName]int64)(unsafe.Pointer(&in.ProdUsageThresholds))
@@ -300,6 +301,7 @@ func autoConvert_config_LoadAwareSchedulingArgs_To_v1beta3_LoadAwareSchedulingAr
 func autoConvert_config_LoadAwareSchedulingArgs_To_v1beta3_LoadAwareSchedulingArgs(in *config.LoadAwareSchedulingArgs, out *LoadAwareSchedulingArgs, s conversion.Scope) error {
     out.FilterExpiredNodeMetrics = (*bool)(unsafe.Pointer(in.FilterExpiredNodeMetrics))
     out.NodeMetricExpirationSeconds = (*int64)(unsafe.Pointer(in.NodeMetricExpirationSeconds))
+    out.EnableScheduleWhenNodeMetricsExpired = (*bool)(unsafe.Pointer(in.EnableScheduleWhenNodeMetricsExpired))
     out.ResourceWeights = *(*map[corev1.ResourceName]int64)(unsafe.Pointer(&in.ResourceWeights))
     out.UsageThresholds = *(*map[corev1.ResourceName]int64)(unsafe.Pointer(&in.UsageThresholds))
     out.ProdUsageThresholds = *(*map[corev1.ResourceName]int64)(unsafe.Pointer(&in.ProdUsageThresholds))
diff --git a/pkg/scheduler/apis/config/v1beta3/zz_generated.deepcopy.go b/pkg/scheduler/apis/config/v1beta3/zz_generated.deepcopy.go
index 78abb035c..b19989107 100644
--- a/pkg/scheduler/apis/config/v1beta3/zz_generated.deepcopy.go
+++ b/pkg/scheduler/apis/config/v1beta3/zz_generated.deepcopy.go
@@ -209,6 +209,11 @@ func (in *LoadAwareSchedulingArgs) DeepCopyInto(out *LoadAwareSchedulingArgs) {
         *out = new(int64)
         **out = **in
     }
+    if in.EnableScheduleWhenNodeMetricsExpired != nil {
+        in, out := &in.EnableScheduleWhenNodeMetricsExpired, &out.EnableScheduleWhenNodeMetricsExpired
+        *out = new(bool)
+        **out = **in
+    }
     if in.ResourceWeights != nil {
         in, out := &in.ResourceWeights, &out.ResourceWeights
         *out = make(map[corev1.ResourceName]int64, len(*in))
diff --git a/pkg/scheduler/apis/config/zz_generated.deepcopy.go b/pkg/scheduler/apis/config/zz_generated.deepcopy.go
index d83136de2..97dac7428 100644
--- a/pkg/scheduler/apis/config/zz_generated.deepcopy.go
+++ b/pkg/scheduler/apis/config/zz_generated.deepcopy.go
@@ -163,6 +163,11 @@ func (in *LoadAwareSchedulingArgs) DeepCopyInto(out *LoadAwareSchedulingArgs) {
         *out = new(int64)
         **out = **in
     }
+    if in.EnableScheduleWhenNodeMetricsExpired != nil {
+        in, out := &in.EnableScheduleWhenNodeMetricsExpired, &out.EnableScheduleWhenNodeMetricsExpired
+        *out = new(bool)
+        **out = **in
+    }
     if in.ResourceWeights != nil {
         in, out := &in.ResourceWeights, &out.ResourceWeights
         *out = make(map[v1.ResourceName]int64, len(*in))
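The generated code above follows the usual pattern for optional pointer fields: the conversions reinterpret the pointer in place via unsafe.Pointer, while DeepCopyInto allocates a fresh bool so a copy never aliases its source. A small self-contained illustration of that deepcopy contract; the local type and method are stand-ins, not code from this patch:

```go
package main

import (
	"fmt"

	"k8s.io/utils/pointer"
)

// args is a one-field stand-in for LoadAwareSchedulingArgs.
type args struct {
	EnableScheduleWhenNodeMetricsExpired *bool
}

// deepCopyInto mirrors the generated DeepCopyInto for the new field: the
// pointer is re-allocated, so mutating the copy leaves the original intact.
func (in *args) deepCopyInto(out *args) {
	*out = *in
	if in.EnableScheduleWhenNodeMetricsExpired != nil {
		in, out := &in.EnableScheduleWhenNodeMetricsExpired, &out.EnableScheduleWhenNodeMetricsExpired
		*out = new(bool)
		**out = **in
	}
}

func main() {
	orig := args{EnableScheduleWhenNodeMetricsExpired: pointer.Bool(false)}
	var cp args
	orig.deepCopyInto(&cp)
	*cp.EnableScheduleWhenNodeMetricsExpired = true
	// Prints "false true": the flag in the copy changed, the original did not.
	fmt.Println(*orig.EnableScheduleWhenNodeMetricsExpired, *cp.EnableScheduleWhenNodeMetricsExpired)
}
```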
diff --git a/pkg/scheduler/plugins/loadaware/load_aware.go b/pkg/scheduler/plugins/loadaware/load_aware.go
index 5baa15102..98e784846 100644
--- a/pkg/scheduler/plugins/loadaware/load_aware.go
+++ b/pkg/scheduler/plugins/loadaware/load_aware.go
@@ -42,6 +42,7 @@ import (
 const (
     Name = "LoadAwareScheduling"
 
+    ErrReasonNodeMetricExpired              = "node(s) nodeMetric expired"
     ErrReasonUsageExceedThreshold           = "node(s) %s usage exceed threshold"
     ErrReasonAggregatedUsageExceedThreshold = "node(s) %s aggregated usage exceed threshold"
     ErrReasonFailedEstimatePod
@@ -143,6 +144,9 @@ func (p *Plugin) Filter(ctx context.Context, state *framework.CycleState, pod *c
     if p.args.FilterExpiredNodeMetrics != nil && *p.args.FilterExpiredNodeMetrics &&
         p.args.NodeMetricExpirationSeconds != nil && isNodeMetricExpired(nodeMetric, *p.args.NodeMetricExpirationSeconds) {
+        if p.args.EnableScheduleWhenNodeMetricsExpired != nil && !*p.args.EnableScheduleWhenNodeMetricsExpired {
+            return framework.NewStatus(framework.Unschedulable, ErrReasonNodeMetricExpired)
+        }
         return nil
     }
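The gate added to Filter reduces to a three-way decision. The helper below is a hypothetical restatement for review purposes, folding the NodeMetricExpirationSeconds check into a plain boolean; it is not a function in the plugin:

```go
package main

import "fmt"

// decision restates the expired-NodeMetric gate in Plugin.Filter above.
// "reject" marks the node Unschedulable with ErrReasonNodeMetricExpired,
// "skip" admits the node while bypassing the usage-threshold checks, and
// "check" falls through to the normal load-aware filtering.
func decision(filterExpired, enableWhenExpired *bool, metricExpired bool) string {
	if filterExpired != nil && *filterExpired && metricExpired {
		if enableWhenExpired != nil && !*enableWhenExpired {
			return "reject" // new behavior, and the default after this patch
		}
		return "skip" // pre-patch behavior, now opt-in via the new flag
	}
	return "check"
}

func main() {
	b := func(v bool) *bool { return &v }
	fmt.Println(decision(b(true), b(false), true)) // reject
	fmt.Println(decision(b(true), b(true), true))  // skip
	fmt.Println(decision(b(true), b(true), false)) // check
}
```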
diff --git a/pkg/scheduler/plugins/loadaware/load_aware_test.go b/pkg/scheduler/plugins/loadaware/load_aware_test.go
index 31daad858..c231a77da 100644
--- a/pkg/scheduler/plugins/loadaware/load_aware_test.go
+++ b/pkg/scheduler/plugins/loadaware/load_aware_test.go
@@ -183,7 +183,7 @@ func TestFilterExpiredNodeMetric(t *testing.T) {
                     },
                 },
             },
-            wantStatus: nil,
+            wantStatus: framework.NewStatus(framework.Unschedulable, ErrReasonNodeMetricExpired),
         },
         {
             name: "filter unhealthy nodeMetric with expired updateTime",
@@ -202,13 +202,172 @@
                     },
                 },
             },
+            wantStatus: framework.NewStatus(framework.Unschedulable, ErrReasonNodeMetricExpired),
+        },
+    }
+    for _, tt := range tests {
+        t.Run(tt.name, func(t *testing.T) {
+            var v1beta3args v1beta3.LoadAwareSchedulingArgs
+            v1beta3.SetDefaults_LoadAwareSchedulingArgs(&v1beta3args)
+            var loadAwareSchedulingArgs config.LoadAwareSchedulingArgs
+            err := v1beta3.Convert_v1beta3_LoadAwareSchedulingArgs_To_config_LoadAwareSchedulingArgs(&v1beta3args, &loadAwareSchedulingArgs, nil)
+            assert.NoError(t, err)
+
+            koordClientSet := koordfake.NewSimpleClientset()
+            koordSharedInformerFactory := koordinatorinformers.NewSharedInformerFactory(koordClientSet, 0)
+            extenderFactory, _ := frameworkext.NewFrameworkExtenderFactory(
+                frameworkext.WithKoordinatorClientSet(koordClientSet),
+                frameworkext.WithKoordinatorSharedInformerFactory(koordSharedInformerFactory),
+            )
+            proxyNew := frameworkext.PluginFactoryProxy(extenderFactory, New)
+
+            cs := kubefake.NewSimpleClientset()
+            informerFactory := informers.NewSharedInformerFactory(cs, 0)
+
+            nodes := []*corev1.Node{
+                {
+                    ObjectMeta: metav1.ObjectMeta{
+                        Name: tt.nodeMetric.Name,
+                    },
+                },
+            }
+
+            snapshot := newTestSharedLister(nil, nodes)
+            registeredPlugins := []schedulertesting.RegisterPluginFunc{
+                schedulertesting.RegisterBindPlugin(defaultbinder.Name, defaultbinder.New),
+                schedulertesting.RegisterQueueSortPlugin(queuesort.Name, queuesort.New),
+            }
+            fh, err := schedulertesting.NewFramework(context.TODO(), registeredPlugins, "koord-scheduler",
+                frameworkruntime.WithClientSet(cs),
+                frameworkruntime.WithInformerFactory(informerFactory),
+                frameworkruntime.WithSnapshotSharedLister(snapshot),
+            )
+            assert.Nil(t, err)
+
+            p, err := proxyNew(&loadAwareSchedulingArgs, fh)
+            assert.NotNil(t, p)
+            assert.Nil(t, err)
+
+            _, err = koordClientSet.SloV1alpha1().NodeMetrics().Create(context.TODO(), tt.nodeMetric, metav1.CreateOptions{})
+            assert.NoError(t, err)
+
+            koordSharedInformerFactory.Start(context.TODO().Done())
+            koordSharedInformerFactory.WaitForCacheSync(context.TODO().Done())
+
+            cycleState := framework.NewCycleState()
+
+            nodeInfo, err := snapshot.Get(tt.nodeMetric.Name)
+            assert.NoError(t, err)
+            assert.NotNil(t, nodeInfo)
+
+            status := p.(*Plugin).Filter(context.TODO(), cycleState, &corev1.Pod{}, nodeInfo)
+            assert.True(t, tt.wantStatus.Equal(status), "want status: %s, but got %s", tt.wantStatus.Message(), status.Message())
+        })
+    }
+}
+
+func TestEnableScheduleWhenNodeMetricsExpired(t *testing.T) {
+    tests := []struct {
+        name                                 string
+        nodeMetric                           *slov1alpha1.NodeMetric
+        enableScheduleWhenNodeMetricsExpired *bool
+        wantStatus                           *framework.Status
+    }{
+        {
+            name: "filter healthy nodeMetrics",
+            nodeMetric: &slov1alpha1.NodeMetric{
+                ObjectMeta: metav1.ObjectMeta{
+                    Name: "test-node-1",
+                },
+                Spec: slov1alpha1.NodeMetricSpec{
+                    CollectPolicy: &slov1alpha1.NodeMetricCollectPolicy{
+                        ReportIntervalSeconds: pointer.Int64(60),
+                    },
+                },
+                Status: slov1alpha1.NodeMetricStatus{
+                    UpdateTime: &metav1.Time{
+                        Time: time.Now(),
+                    },
+                },
+            },
             wantStatus: nil,
         },
+        {
+            name: "enable scheduling when nodeMetric with nil updateTime",
+            nodeMetric: &slov1alpha1.NodeMetric{
+                ObjectMeta: metav1.ObjectMeta{
+                    Name: "test-node-1",
+                },
+                Spec: slov1alpha1.NodeMetricSpec{
+                    CollectPolicy: &slov1alpha1.NodeMetricCollectPolicy{
+                        ReportIntervalSeconds: pointer.Int64(60),
+                    },
+                },
+            },
+            enableScheduleWhenNodeMetricsExpired: pointer.Bool(true),
+            wantStatus:                           nil,
+        },
+        {
+            name: "enable scheduling when nodeMetric with expired updateTime",
+            nodeMetric: &slov1alpha1.NodeMetric{
+                ObjectMeta: metav1.ObjectMeta{
+                    Name: "test-node-1",
+                },
+                Spec: slov1alpha1.NodeMetricSpec{
+                    CollectPolicy: &slov1alpha1.NodeMetricCollectPolicy{
+                        ReportIntervalSeconds: pointer.Int64(60),
+                    },
+                },
+                Status: slov1alpha1.NodeMetricStatus{
+                    UpdateTime: &metav1.Time{
+                        Time: time.Now().Add(-180 * time.Second),
+                    },
+                },
+            },
+            enableScheduleWhenNodeMetricsExpired: pointer.Bool(true),
+            wantStatus:                           nil,
+        },
+        {
+            name: "disable scheduling when nodeMetric with nil updateTime",
+            nodeMetric: &slov1alpha1.NodeMetric{
+                ObjectMeta: metav1.ObjectMeta{
+                    Name: "test-node-1",
+                },
+                Spec: slov1alpha1.NodeMetricSpec{
+                    CollectPolicy: &slov1alpha1.NodeMetricCollectPolicy{
+                        ReportIntervalSeconds: pointer.Int64(60),
+                    },
+                },
+            },
+            enableScheduleWhenNodeMetricsExpired: pointer.Bool(false),
+            wantStatus:                           framework.NewStatus(framework.Unschedulable, ErrReasonNodeMetricExpired),
+        },
+        {
+            name: "disable scheduling when nodeMetric with expired updateTime",
+            nodeMetric: &slov1alpha1.NodeMetric{
+                ObjectMeta: metav1.ObjectMeta{
+                    Name: "test-node-1",
+                },
+                Spec: slov1alpha1.NodeMetricSpec{
+                    CollectPolicy: &slov1alpha1.NodeMetricCollectPolicy{
+                        ReportIntervalSeconds: pointer.Int64(60),
+                    },
+                },
+                Status: slov1alpha1.NodeMetricStatus{
+                    UpdateTime: &metav1.Time{
+                        Time: time.Now().Add(-180 * time.Second),
+                    },
+                },
+            },
+            enableScheduleWhenNodeMetricsExpired: pointer.Bool(false),
+            wantStatus:                           framework.NewStatus(framework.Unschedulable, ErrReasonNodeMetricExpired),
+        },
     }
     for _, tt := range tests {
         t.Run(tt.name, func(t *testing.T) {
             var v1beta3args v1beta3.LoadAwareSchedulingArgs
             v1beta3.SetDefaults_LoadAwareSchedulingArgs(&v1beta3args)
+            v1beta3args.EnableScheduleWhenNodeMetricsExpired = tt.enableScheduleWhenNodeMetricsExpired
             var loadAwareSchedulingArgs config.LoadAwareSchedulingArgs
             err := v1beta3.Convert_v1beta3_LoadAwareSchedulingArgs_To_config_LoadAwareSchedulingArgs(&v1beta3args, &loadAwareSchedulingArgs, nil)
             assert.NoError(t, err)
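As the cases above exercise it, "expired" covers both a NodeMetric that has never reported (nil updateTime) and one whose last report is older than NodeMetricExpirationSeconds. A rough paraphrase of what isNodeMetricExpired amounts to, for illustration only; the real implementation lives in load_aware.go:

```go
package main

import (
	"fmt"
	"time"
)

// expired paraphrases the plugin's isNodeMetricExpired check as the tests
// above exercise it: no report yet, or a report older than the window.
func expired(updateTime *time.Time, expirationSeconds int64) bool {
	if updateTime == nil {
		return true // never reported
	}
	return time.Since(*updateTime) >= time.Duration(expirationSeconds)*time.Second
}

func main() {
	stale := time.Now().Add(-180 * time.Second)
	fresh := time.Now()
	fmt.Println(expired(nil, 180))    // true: nil updateTime
	fmt.Println(expired(&stale, 180)) // true: report older than the window
	fmt.Println(expired(&fresh, 180)) // false: healthy NodeMetric
}
```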