diff --git a/pkg/slo-controller/noderesource/noderesource_controller.go b/pkg/slo-controller/noderesource/noderesource_controller.go index da978c2d6..b0d9c002d 100644 --- a/pkg/slo-controller/noderesource/noderesource_controller.go +++ b/pkg/slo-controller/noderesource/noderesource_controller.go @@ -34,7 +34,6 @@ import ( "sigs.k8s.io/controller-runtime/pkg/client" "sigs.k8s.io/controller-runtime/pkg/source" - schedulingv1alpha1 "github.com/koordinator-sh/koordinator/apis/scheduling/v1alpha1" slov1alpha1 "github.com/koordinator-sh/koordinator/apis/slo/v1alpha1" "github.com/koordinator-sh/koordinator/pkg/slo-controller/config" "github.com/koordinator-sh/koordinator/pkg/slo-controller/metrics" @@ -123,7 +122,7 @@ func (r *NodeResourceReconciler) Reconcile(ctx context.Context, req ctrl.Request return ctrl.Result{Requeue: true}, err } - // do other node updates. e.g. update device resources + // do other node updates. if err := r.updateNodeExtensions(node, nodeMetric, podList); err != nil { klog.ErrorS(err, "failed to update node extensions for node", "node", node.Name) return ctrl.Result{Requeue: true}, err @@ -187,8 +186,7 @@ func (r *NodeResourceReconciler) SetupWithManager(mgr ctrl.Manager) error { Named(Name). // avoid conflict with others reconciling `Node` For(&corev1.Node{}). Watches(&source.Kind{Type: &slov1alpha1.NodeMetric{}}, &EnqueueRequestForNodeMetric{syncContext: r.NodeSyncContext}). - Watches(&source.Kind{Type: &corev1.ConfigMap{}}, cfgHandler). - Watches(&source.Kind{Type: &schedulingv1alpha1.Device{}}, &EnqueueRequestForDevice{syncContext: r.GPUSyncContext}) + Watches(&source.Kind{Type: &corev1.ConfigMap{}}, cfgHandler) // setup plugins // allow plugins to mutate controller via the builder diff --git a/pkg/slo-controller/noderesource/plugins/batchresource/plugin.go b/pkg/slo-controller/noderesource/plugins/batchresource/plugin.go index f3ae5882a..68db3167a 100644 --- a/pkg/slo-controller/noderesource/plugins/batchresource/plugin.go +++ b/pkg/slo-controller/noderesource/plugins/batchresource/plugin.go @@ -26,11 +26,11 @@ import ( "k8s.io/apimachinery/pkg/api/errors" "k8s.io/apimachinery/pkg/api/resource" "k8s.io/apimachinery/pkg/types" - "k8s.io/apimachinery/pkg/util/clock" "k8s.io/apimachinery/pkg/util/sets" quotav1 "k8s.io/apiserver/pkg/quota/v1" clientgoscheme "k8s.io/client-go/kubernetes/scheme" "k8s.io/klog/v2" + "k8s.io/utils/clock" ctrlclient "sigs.k8s.io/controller-runtime/pkg/client" "sigs.k8s.io/controller-runtime/pkg/source" @@ -50,7 +50,7 @@ var ( ) var ( - Clock clock.Clock = clock.RealClock{} // for testing + Clock clock.WithTickerAndDelayedExecution = clock.RealClock{} // for testing client ctrlclient.Client nrtHandler *NRTHandler nrtSyncContext *framework.SyncContext diff --git a/pkg/slo-controller/noderesource/plugins/gpudeviceresource/device_event_handler.go b/pkg/slo-controller/noderesource/plugins/gpudeviceresource/device_event_handler.go new file mode 100644 index 000000000..e48075185 --- /dev/null +++ b/pkg/slo-controller/noderesource/plugins/gpudeviceresource/device_event_handler.go @@ -0,0 +1,70 @@ +/* +Copyright 2022 The Koordinator Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package gpudeviceresource + +import ( + "reflect" + + "k8s.io/apimachinery/pkg/types" + "k8s.io/client-go/util/workqueue" + "sigs.k8s.io/controller-runtime/pkg/event" + "sigs.k8s.io/controller-runtime/pkg/handler" + "sigs.k8s.io/controller-runtime/pkg/reconcile" + + schedulingv1alpha1 "github.com/koordinator-sh/koordinator/apis/scheduling/v1alpha1" +) + +var _ handler.EventHandler = &DeviceHandler{} + +type DeviceHandler struct{} + +func (d *DeviceHandler) Create(e event.CreateEvent, q workqueue.RateLimitingInterface) { + device := e.Object.(*schedulingv1alpha1.Device) + q.Add(reconcile.Request{ + NamespacedName: types.NamespacedName{ + Name: device.Name, + }, + }) +} + +func (d *DeviceHandler) Update(e event.UpdateEvent, q workqueue.RateLimitingInterface) { + newDevice := e.ObjectNew.(*schedulingv1alpha1.Device) + oldDevice := e.ObjectOld.(*schedulingv1alpha1.Device) + if reflect.DeepEqual(newDevice.Spec, oldDevice.Spec) { + return + } + q.Add(reconcile.Request{ + NamespacedName: types.NamespacedName{ + Name: newDevice.Name, + }, + }) +} + +func (d *DeviceHandler) Delete(e event.DeleteEvent, q workqueue.RateLimitingInterface) { + device, ok := e.Object.(*schedulingv1alpha1.Device) + if !ok { + return + } + q.Add(reconcile.Request{ + NamespacedName: types.NamespacedName{ + Name: device.Name, + }, + }) +} + +func (d *DeviceHandler) Generic(e event.GenericEvent, q workqueue.RateLimitingInterface) { +} diff --git a/pkg/slo-controller/noderesource/plugins/gpudeviceresource/device_event_handler_test.go b/pkg/slo-controller/noderesource/plugins/gpudeviceresource/device_event_handler_test.go new file mode 100644 index 000000000..d7e344871 --- /dev/null +++ b/pkg/slo-controller/noderesource/plugins/gpudeviceresource/device_event_handler_test.go @@ -0,0 +1,165 @@ +/* +Copyright 2022 The Koordinator Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package gpudeviceresource + +import ( + "testing" + + "github.com/stretchr/testify/assert" + corev1 "k8s.io/api/core/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/client-go/util/workqueue" + "sigs.k8s.io/controller-runtime/pkg/event" + "sigs.k8s.io/controller-runtime/pkg/handler" + "sigs.k8s.io/controller-runtime/pkg/reconcile" + + schedulingv1alpha1 "github.com/koordinator-sh/koordinator/apis/scheduling/v1alpha1" +) + +func Test_EnqueueRequestForNodeMetricMetric(t *testing.T) { + tests := []struct { + name string + fn func(handler handler.EventHandler, q workqueue.RateLimitingInterface) + hasEvent bool + eventName string + }{ + { + name: "create device event", + fn: func(handler handler.EventHandler, q workqueue.RateLimitingInterface) { + handler.Create(event.CreateEvent{ + Object: &schedulingv1alpha1.Device{ + ObjectMeta: metav1.ObjectMeta{ + Name: "node1", + }, + }, + }, q) + }, + hasEvent: true, + eventName: "node1", + }, + { + name: "delete device event", + fn: func(handler handler.EventHandler, q workqueue.RateLimitingInterface) { + handler.Delete(event.DeleteEvent{ + Object: &schedulingv1alpha1.Device{ + ObjectMeta: metav1.ObjectMeta{ + Name: "node1", + }, + }, + }, q) + }, + hasEvent: true, + eventName: "node1", + }, + { + name: "delete event not device", + fn: func(handler handler.EventHandler, q workqueue.RateLimitingInterface) { + handler.Delete(event.DeleteEvent{ + Object: &corev1.Node{ + ObjectMeta: metav1.ObjectMeta{ + Name: "node1", + }, + }, + }, q) + }, + hasEvent: false, + }, + { + name: "generic event ignore", + fn: func(handler handler.EventHandler, q workqueue.RateLimitingInterface) { + handler.Generic(event.GenericEvent{}, q) + }, + hasEvent: false, + }, + { + name: "update device event", + fn: func(handler handler.EventHandler, q workqueue.RateLimitingInterface) { + handler.Update(event.UpdateEvent{ + ObjectOld: &schedulingv1alpha1.Device{ + ObjectMeta: metav1.ObjectMeta{ + Name: "node1", + ResourceVersion: "100", + }, + Spec: schedulingv1alpha1.DeviceSpec{ + Devices: []schedulingv1alpha1.DeviceInfo{ + {}, + }, + }, + }, + ObjectNew: &schedulingv1alpha1.Device{ + ObjectMeta: metav1.ObjectMeta{ + Name: "node1", + ResourceVersion: "101", + }, + Spec: schedulingv1alpha1.DeviceSpec{ + Devices: []schedulingv1alpha1.DeviceInfo{ + {}, + {}, + }, + }, + }, + }, q) + }, + hasEvent: true, + eventName: "node1", + }, + { + name: "update device event ignore", + fn: func(handler handler.EventHandler, q workqueue.RateLimitingInterface) { + handler.Update(event.UpdateEvent{ + ObjectOld: &schedulingv1alpha1.Device{ + ObjectMeta: metav1.ObjectMeta{ + Name: "node1", + ResourceVersion: "100", + }, + Spec: schedulingv1alpha1.DeviceSpec{ + Devices: []schedulingv1alpha1.DeviceInfo{ + {}, + }, + }, + }, + ObjectNew: &schedulingv1alpha1.Device{ + ObjectMeta: metav1.ObjectMeta{ + Name: "node1", + ResourceVersion: "100", + }, + Spec: schedulingv1alpha1.DeviceSpec{ + Devices: []schedulingv1alpha1.DeviceInfo{ + {}, + }, + }, + }, + }, q) + }, + hasEvent: false, + }, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + queue := workqueue.NewRateLimitingQueue(workqueue.DefaultControllerRateLimiter()) + h := &DeviceHandler{} + tt.fn(h, queue) + assert.Equal(t, tt.hasEvent, queue.Len() > 0, "unexpected event") + if tt.hasEvent { + assert.True(t, queue.Len() >= 0, "expected event") + e, _ := queue.Get() + assert.Equal(t, tt.eventName, e.(reconcile.Request).Name) + } + }) + } + +} diff --git a/pkg/slo-controller/noderesource/plugins/gpudeviceresource/plugin.go b/pkg/slo-controller/noderesource/plugins/gpudeviceresource/plugin.go new file mode 100644 index 000000000..68fc9b08e --- /dev/null +++ b/pkg/slo-controller/noderesource/plugins/gpudeviceresource/plugin.go @@ -0,0 +1,223 @@ +/* +Copyright 2022 The Koordinator Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package gpudeviceresource + +import ( + "context" + "fmt" + "sort" + + corev1 "k8s.io/api/core/v1" + "k8s.io/apimachinery/pkg/api/errors" + "k8s.io/apimachinery/pkg/api/resource" + "k8s.io/apimachinery/pkg/types" + "k8s.io/klog/v2" + ctrlclient "sigs.k8s.io/controller-runtime/pkg/client" + "sigs.k8s.io/controller-runtime/pkg/source" + + "github.com/koordinator-sh/koordinator/apis/configuration" + "github.com/koordinator-sh/koordinator/apis/extension" + schedulingv1alpha1 "github.com/koordinator-sh/koordinator/apis/scheduling/v1alpha1" + "github.com/koordinator-sh/koordinator/pkg/slo-controller/noderesource/framework" + "github.com/koordinator-sh/koordinator/pkg/util" +) + +const PluginName = "GPUDeviceResource" + +const ( + ResetResourcesMsg = "reset node gpu resources" + UpdateResourcesMsg = "node gpu resources from device" + UpdateLabelsMsg = "node gpu labels from device" + + NeedSyncForResourceDiffMsg = "gpu resource diff is big than threshold" + NeedSyncForGPUModelMsg = "gpu model changed" + NeedSyncForGPUDriverVersionMsg = "gpu driver version changed" +) + +var ( + // ResourceNames are the known resources reconciled by the plugin from devices. + // TODO: support custom resources. + ResourceNames = []corev1.ResourceName{ + extension.ResourceGPU, + extension.ResourceGPUCore, + extension.ResourceGPUMemory, + extension.ResourceGPUMemoryRatio, + } + + client ctrlclient.Client +) + +type Plugin struct{} + +func (p *Plugin) Name() string { + return PluginName +} + +// +kubebuilder:rbac:groups=core,resources=configmaps,verbs=get;list;watch +// +kubebuilder:rbac:groups=core,resources=nodes,verbs=get;list;watch;patch +// +kubebuilder:rbac:groups=core,resources=nodes/status,verbs=get;update;patch +// +kubebuilder:rbac:groups=core,resources=pods,verbs=get;list;watch +// +kubebuilder:rbac:groups=scheduling.koordinator.sh,resources=devices,verbs=get;list;watch +// +kubebuilder:rbac:groups=topology.node.k8s.io,resources=noderesourcetopologies,verbs=get;list;watch;create;update + +func (p *Plugin) Setup(opt *framework.Option) error { + client = opt.Client + + // schedulingv1alpha1.AddToScheme(opt.Scheme) + opt.Builder = opt.Builder.Watches(&source.Kind{Type: &schedulingv1alpha1.Device{}}, &DeviceHandler{}) + + return nil +} + +func (p *Plugin) NeedSync(strategy *configuration.ColocationStrategy, oldNode, newNode *corev1.Node) (bool, string) { + for _, resourceName := range ResourceNames { + if util.IsResourceDiff(oldNode.Status.Allocatable, newNode.Status.Allocatable, resourceName, + *strategy.ResourceDiffThreshold) { + klog.V(4).Infof("node %s resource %s diff bigger than %v, need sync", + newNode.Name, resourceName, *strategy.ResourceDiffThreshold) + return true, NeedSyncForResourceDiffMsg + } + } + + return false, "" +} + +func (p *Plugin) NeedSyncMeta(_ *configuration.ColocationStrategy, oldNode, newNode *corev1.Node) (bool, string) { + if oldNode.Labels[extension.LabelGPUModel] != newNode.Labels[extension.LabelGPUModel] { + klog.V(4).InfoS("need sync node metadata since gpu model change", "node", newNode.Name, + "old", oldNode.Labels[extension.LabelGPUModel], "new", newNode.Labels[extension.LabelGPUModel]) + return true, NeedSyncForGPUModelMsg + } + if oldNode.Labels[extension.LabelGPUDriverVersion] != newNode.Labels[extension.LabelGPUDriverVersion] { + klog.V(4).InfoS("need sync node metadata since gpu driver version change", "node", newNode.Name, + "old", oldNode.Labels[extension.LabelGPUDriverVersion], "new", newNode.Labels[extension.LabelGPUDriverVersion]) + return true, NeedSyncForGPUDriverVersionMsg + } + + return false, "" +} + +func (p *Plugin) Prepare(_ *configuration.ColocationStrategy, node *corev1.Node, nr *framework.NodeResource) error { + // prepare node resources + for _, resourceName := range ResourceNames { + q := nr.Resources[resourceName] + // TBD: shall we remove the resource when some resource types are missing + if q == nil || nr.Resets[resourceName] { + delete(node.Status.Allocatable, resourceName) + delete(node.Status.Capacity, resourceName) + continue + } + + node.Status.Allocatable[resourceName] = *q + node.Status.Capacity[resourceName] = *q + } + + // prepare node labels + // TBD: shall we reset labels if not exist in the NR + if nr.Labels != nil { + if _, ok := nr.Labels[extension.LabelGPUModel]; ok { + node.Labels[extension.LabelGPUModel] = nr.Labels[extension.LabelGPUModel] + } + if _, ok := nr.Labels[extension.LabelGPUDriverVersion]; ok { + node.Labels[extension.LabelGPUDriverVersion] = nr.Labels[extension.LabelGPUDriverVersion] + } + } + + return nil +} + +func (p *Plugin) Reset(node *corev1.Node, message string) []framework.ResourceItem { + return nil +} + +func (p *Plugin) Calculate(_ *configuration.ColocationStrategy, node *corev1.Node, _ *corev1.PodList, _ *framework.ResourceMetrics) ([]framework.ResourceItem, error) { + if node == nil || node.Status.Allocatable == nil { + return nil, fmt.Errorf("missing essential arguments") + } + + // calculate device resources + device := &schedulingv1alpha1.Device{} + if err := client.Get(context.TODO(), types.NamespacedName{Name: node.Name, Namespace: node.Namespace}, device); err != nil { + if !errors.IsNotFound(err) { + klog.V(4).InfoS("failed to get device for node", "node", node.Name, "err", err) + return nil, fmt.Errorf("failed to get device resources: %w", err) + } + + // device not found, reset gpu resources on node + return p.resetGPUNodeResource() + } + + // TODO: calculate NUMA-level resources against NRT + return p.calculate(node, device) +} + +func (p *Plugin) calculate(node *corev1.Node, device *schedulingv1alpha1.Device) ([]framework.ResourceItem, error) { + if device == nil { + return nil, fmt.Errorf("invalid device") + } + + // calculate gpu resources + gpuResources := make(corev1.ResourceList) + totalKoordGPU := resource.NewQuantity(0, resource.DecimalSI) + for _, d := range device.Spec.Devices { + if d.Type != schedulingv1alpha1.GPU || !d.Health { + continue + } + util.AddResourceList(gpuResources, d.Resources) + totalKoordGPU.Add(d.Resources[extension.ResourceGPUCore]) + } + gpuResources[extension.ResourceGPU] = *totalKoordGPU + var items []framework.ResourceItem + for resourceName := range gpuResources { + q := gpuResources[resourceName] + items = append(items, framework.ResourceItem{ + Name: resourceName, + Quantity: &q, + Message: UpdateResourcesMsg, + }) + } + sort.Slice(items, func(i, j int) bool { return items[i].Name < items[j].Name }) + klog.V(5).InfoS("calculate gpu resources", "node", node.Name, "resources", gpuResources) + + // calculate labels about gpu driver and model + if device.Labels != nil { + items = append(items, framework.ResourceItem{ + Name: PluginName, + Labels: map[string]string{ + extension.LabelGPUModel: device.Labels[extension.LabelGPUModel], + extension.LabelGPUDriverVersion: device.Labels[extension.LabelGPUDriverVersion], + }, + Message: UpdateLabelsMsg, + }) + } + klog.V(5).InfoS("calculate gpu labels", "node", node.Name, "labels", device.Labels) + + return items, nil +} + +func (p *Plugin) resetGPUNodeResource() ([]framework.ResourceItem, error) { + items := make([]framework.ResourceItem, len(ResourceNames)) + // FIXME: shall we reset node resources in devices but not in ResourceNames? + for i := range ResourceNames { + items[i] = framework.ResourceItem{ + Name: ResourceNames[i], + Reset: true, + Message: ResetResourcesMsg, + } + } + return items, nil +} diff --git a/pkg/slo-controller/noderesource/plugins/gpudeviceresource/plugin_test.go b/pkg/slo-controller/noderesource/plugins/gpudeviceresource/plugin_test.go new file mode 100644 index 000000000..917822080 --- /dev/null +++ b/pkg/slo-controller/noderesource/plugins/gpudeviceresource/plugin_test.go @@ -0,0 +1,692 @@ +/* +Copyright 2022 The Koordinator Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package gpudeviceresource + +import ( + "testing" + + topov1alpha1 "github.com/k8stopologyawareschedwg/noderesourcetopology-api/pkg/apis/topology/v1alpha1" + "github.com/stretchr/testify/assert" + corev1 "k8s.io/api/core/v1" + "k8s.io/apimachinery/pkg/api/resource" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/runtime" + clientgoscheme "k8s.io/client-go/kubernetes/scheme" + "k8s.io/utils/pointer" + "sigs.k8s.io/controller-runtime/pkg/builder" + ctrlclient "sigs.k8s.io/controller-runtime/pkg/client" + "sigs.k8s.io/controller-runtime/pkg/client/fake" + + "github.com/koordinator-sh/koordinator/apis/configuration" + "github.com/koordinator-sh/koordinator/apis/extension" + schedulingv1alpha1 "github.com/koordinator-sh/koordinator/apis/scheduling/v1alpha1" + "github.com/koordinator-sh/koordinator/pkg/slo-controller/noderesource/framework" +) + +func TestPlugin(t *testing.T) { + t.Run("test", func(t *testing.T) { + p := &Plugin{} + assert.Equal(t, PluginName, p.Name()) + + testScheme := runtime.NewScheme() + testOpt := &framework.Option{ + Scheme: testScheme, + Client: fake.NewClientBuilder().WithScheme(testScheme).Build(), + Builder: &builder.Builder{}, + } + err := p.Setup(testOpt) + assert.NoError(t, err) + + got := p.Reset(nil, "") + assert.Nil(t, got) + }) +} + +func TestPluginNeedSync(t *testing.T) { + testStrategy := &configuration.ColocationStrategy{ + Enable: pointer.Bool(true), + CPUReclaimThresholdPercent: pointer.Int64(65), + MemoryReclaimThresholdPercent: pointer.Int64(65), + DegradeTimeMinutes: pointer.Int64(15), + UpdateTimeThresholdSeconds: pointer.Int64(300), + ResourceDiffThreshold: pointer.Float64(0.1), + } + testNodeWithoutDevice := &corev1.Node{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-node", + Labels: map[string]string{ + "test-label": "test-value", + }, + }, + Status: corev1.NodeStatus{ + Allocatable: map[corev1.ResourceName]resource.Quantity{ + corev1.ResourceCPU: resource.MustParse("100"), + corev1.ResourceMemory: resource.MustParse("400Gi"), + }, + Capacity: map[corev1.ResourceName]resource.Quantity{ + corev1.ResourceCPU: resource.MustParse("100"), + corev1.ResourceMemory: resource.MustParse("400Gi"), + }, + }, + } + testNodeWithDevice := &corev1.Node{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-node", + Labels: map[string]string{ + "test-label": "test-value", + extension.LabelGPUModel: "A100", + extension.LabelGPUDriverVersion: "480", + }, + }, + Status: corev1.NodeStatus{ + Allocatable: map[corev1.ResourceName]resource.Quantity{ + corev1.ResourceCPU: resource.MustParse("100"), + corev1.ResourceMemory: resource.MustParse("400Gi"), + extension.ResourceGPU: *resource.NewQuantity(200, resource.DecimalSI), + extension.ResourceGPUCore: *resource.NewQuantity(200, resource.DecimalSI), + extension.ResourceGPUMemory: *resource.NewQuantity(18000, resource.DecimalSI), + extension.ResourceGPUMemoryRatio: *resource.NewQuantity(200, resource.DecimalSI), + }, + Capacity: map[corev1.ResourceName]resource.Quantity{ + corev1.ResourceCPU: resource.MustParse("100"), + corev1.ResourceMemory: resource.MustParse("400Gi"), + extension.ResourceGPU: *resource.NewQuantity(200, resource.DecimalSI), + extension.ResourceGPUCore: *resource.NewQuantity(200, resource.DecimalSI), + extension.ResourceGPUMemory: *resource.NewQuantity(18000, resource.DecimalSI), + extension.ResourceGPUMemoryRatio: *resource.NewQuantity(200, resource.DecimalSI), + }, + }, + } + testNodeWithDeviceDriverUpdate := &corev1.Node{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-node", + Labels: map[string]string{ + "test-label": "test-value", + extension.LabelGPUModel: "A100", + extension.LabelGPUDriverVersion: "486", // only driver version change + }, + }, + Status: corev1.NodeStatus{ + Allocatable: map[corev1.ResourceName]resource.Quantity{ + corev1.ResourceCPU: resource.MustParse("100"), + corev1.ResourceMemory: resource.MustParse("400Gi"), + extension.ResourceGPU: *resource.NewQuantity(200, resource.DecimalSI), + extension.ResourceGPUCore: *resource.NewQuantity(200, resource.DecimalSI), + extension.ResourceGPUMemory: *resource.NewQuantity(18000, resource.DecimalSI), + extension.ResourceGPUMemoryRatio: *resource.NewQuantity(200, resource.DecimalSI), + }, + Capacity: map[corev1.ResourceName]resource.Quantity{ + corev1.ResourceCPU: resource.MustParse("100"), + corev1.ResourceMemory: resource.MustParse("400Gi"), + extension.ResourceGPU: *resource.NewQuantity(200, resource.DecimalSI), + extension.ResourceGPUCore: *resource.NewQuantity(200, resource.DecimalSI), + extension.ResourceGPUMemory: *resource.NewQuantity(18000, resource.DecimalSI), + extension.ResourceGPUMemoryRatio: *resource.NewQuantity(200, resource.DecimalSI), + }, + }, + } + testNodeWithDeviceResourceUpdate := &corev1.Node{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-node", + Labels: map[string]string{ + "test-label": "test-value", + extension.LabelGPUModel: "A100", + extension.LabelGPUDriverVersion: "480", + }, + }, + Status: corev1.NodeStatus{ + Allocatable: map[corev1.ResourceName]resource.Quantity{ + corev1.ResourceCPU: resource.MustParse("100"), + corev1.ResourceMemory: resource.MustParse("400Gi"), + extension.ResourceGPU: *resource.NewQuantity(160, resource.DecimalSI), + extension.ResourceGPUCore: *resource.NewQuantity(160, resource.DecimalSI), + extension.ResourceGPUMemory: *resource.NewQuantity(18000, resource.DecimalSI), + extension.ResourceGPUMemoryRatio: *resource.NewQuantity(200, resource.DecimalSI), + }, + Capacity: map[corev1.ResourceName]resource.Quantity{ + corev1.ResourceCPU: resource.MustParse("100"), + corev1.ResourceMemory: resource.MustParse("400Gi"), + extension.ResourceGPU: *resource.NewQuantity(160, resource.DecimalSI), + extension.ResourceGPUCore: *resource.NewQuantity(160, resource.DecimalSI), + extension.ResourceGPUMemory: *resource.NewQuantity(18000, resource.DecimalSI), + extension.ResourceGPUMemoryRatio: *resource.NewQuantity(200, resource.DecimalSI), + }, + }, + } + t.Run("test", func(t *testing.T) { + p := &Plugin{} + + // nothing change, both have no gpu device + got, got1 := p.NeedSync(testStrategy, testNodeWithoutDevice, testNodeWithoutDevice) + assert.False(t, got) + assert.Equal(t, "", got1) + // nothing change, both has gpu devices + got, got1 = p.NeedSync(testStrategy, testNodeWithDevice, testNodeWithDevice) + assert.False(t, got) + assert.Equal(t, "", got1) + // ignore labels change + got, got1 = p.NeedSync(testStrategy, testNodeWithDevice, testNodeWithDeviceDriverUpdate) + assert.False(t, got) + assert.Equal(t, "", got1) + + // add resources + got, got1 = p.NeedSync(testStrategy, testNodeWithoutDevice, testNodeWithDevice) + assert.True(t, got) + assert.Equal(t, NeedSyncForResourceDiffMsg, got1) + // resource update + got, got1 = p.NeedSync(testStrategy, testNodeWithDevice, testNodeWithDeviceResourceUpdate) + assert.True(t, got) + assert.Equal(t, NeedSyncForResourceDiffMsg, got1) + + // delete resources + got, got1 = p.NeedSync(testStrategy, testNodeWithDevice, testNodeWithoutDevice) + assert.True(t, got) + assert.Equal(t, NeedSyncForResourceDiffMsg, got1) + }) +} + +func TestPluginNeedSyncMeta(t *testing.T) { + testNodeWithoutDevice := &corev1.Node{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-node", + Labels: map[string]string{ + "test-label": "test-value", + }, + }, + Status: corev1.NodeStatus{ + Allocatable: map[corev1.ResourceName]resource.Quantity{ + corev1.ResourceCPU: resource.MustParse("100"), + corev1.ResourceMemory: resource.MustParse("400Gi"), + }, + Capacity: map[corev1.ResourceName]resource.Quantity{ + corev1.ResourceCPU: resource.MustParse("100"), + corev1.ResourceMemory: resource.MustParse("400Gi"), + }, + }, + } + testNodeWithDevice := &corev1.Node{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-node", + Labels: map[string]string{ + "test-label": "test-value", + extension.LabelGPUModel: "A100", + extension.LabelGPUDriverVersion: "480", + }, + }, + Status: corev1.NodeStatus{ + Allocatable: map[corev1.ResourceName]resource.Quantity{ + corev1.ResourceCPU: resource.MustParse("100"), + corev1.ResourceMemory: resource.MustParse("400Gi"), + extension.ResourceGPU: *resource.NewQuantity(200, resource.DecimalSI), + extension.ResourceGPUCore: *resource.NewQuantity(200, resource.DecimalSI), + extension.ResourceGPUMemory: *resource.NewQuantity(18000, resource.DecimalSI), + extension.ResourceGPUMemoryRatio: *resource.NewQuantity(200, resource.DecimalSI), + }, + Capacity: map[corev1.ResourceName]resource.Quantity{ + corev1.ResourceCPU: resource.MustParse("100"), + corev1.ResourceMemory: resource.MustParse("400Gi"), + extension.ResourceGPU: *resource.NewQuantity(200, resource.DecimalSI), + extension.ResourceGPUCore: *resource.NewQuantity(200, resource.DecimalSI), + extension.ResourceGPUMemory: *resource.NewQuantity(18000, resource.DecimalSI), + extension.ResourceGPUMemoryRatio: *resource.NewQuantity(200, resource.DecimalSI), + }, + }, + } + testNodeWithDeviceDriverUpdate := &corev1.Node{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-node", + Labels: map[string]string{ + "test-label": "test-value", + extension.LabelGPUModel: "A100", + extension.LabelGPUDriverVersion: "486", // only driver version change + }, + }, + Status: corev1.NodeStatus{ + Allocatable: map[corev1.ResourceName]resource.Quantity{ + corev1.ResourceCPU: resource.MustParse("100"), + corev1.ResourceMemory: resource.MustParse("400Gi"), + extension.ResourceGPU: *resource.NewQuantity(200, resource.DecimalSI), + extension.ResourceGPUCore: *resource.NewQuantity(200, resource.DecimalSI), + extension.ResourceGPUMemory: *resource.NewQuantity(18000, resource.DecimalSI), + extension.ResourceGPUMemoryRatio: *resource.NewQuantity(200, resource.DecimalSI), + }, + Capacity: map[corev1.ResourceName]resource.Quantity{ + corev1.ResourceCPU: resource.MustParse("100"), + corev1.ResourceMemory: resource.MustParse("400Gi"), + extension.ResourceGPU: *resource.NewQuantity(200, resource.DecimalSI), + extension.ResourceGPUCore: *resource.NewQuantity(200, resource.DecimalSI), + extension.ResourceGPUMemory: *resource.NewQuantity(18000, resource.DecimalSI), + extension.ResourceGPUMemoryRatio: *resource.NewQuantity(200, resource.DecimalSI), + }, + }, + } + testNodeWithDeviceResourceUpdate := &corev1.Node{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-node", + Labels: map[string]string{ + "test-label": "test-value", + extension.LabelGPUModel: "A100", + extension.LabelGPUDriverVersion: "480", + }, + }, + Status: corev1.NodeStatus{ + Allocatable: map[corev1.ResourceName]resource.Quantity{ + corev1.ResourceCPU: resource.MustParse("100"), + corev1.ResourceMemory: resource.MustParse("400Gi"), + extension.ResourceGPU: *resource.NewQuantity(160, resource.DecimalSI), + extension.ResourceGPUCore: *resource.NewQuantity(160, resource.DecimalSI), + extension.ResourceGPUMemory: *resource.NewQuantity(18000, resource.DecimalSI), + extension.ResourceGPUMemoryRatio: *resource.NewQuantity(200, resource.DecimalSI), + }, + Capacity: map[corev1.ResourceName]resource.Quantity{ + corev1.ResourceCPU: resource.MustParse("100"), + corev1.ResourceMemory: resource.MustParse("400Gi"), + extension.ResourceGPU: *resource.NewQuantity(160, resource.DecimalSI), + extension.ResourceGPUCore: *resource.NewQuantity(160, resource.DecimalSI), + extension.ResourceGPUMemory: *resource.NewQuantity(18000, resource.DecimalSI), + extension.ResourceGPUMemoryRatio: *resource.NewQuantity(200, resource.DecimalSI), + }, + }, + } + t.Run("test", func(t *testing.T) { + p := &Plugin{} + + // nothing change + got, got1 := p.NeedSyncMeta(nil, testNodeWithoutDevice, testNodeWithoutDevice) + assert.False(t, got) + assert.Equal(t, "", got1) + // nothing change 1 + got, got1 = p.NeedSyncMeta(nil, testNodeWithDevice, testNodeWithDevice) + assert.False(t, got) + assert.Equal(t, "", got1) + // ignore resources change + got, got1 = p.NeedSyncMeta(nil, testNodeWithDevice, testNodeWithDeviceResourceUpdate) + assert.False(t, got) + assert.Equal(t, "", got1) + + // add labels + got, got1 = p.NeedSyncMeta(nil, testNodeWithoutDevice, testNodeWithDevice) + assert.True(t, got) + assert.Equal(t, NeedSyncForGPUModelMsg, got1) + // driver version update + got, got1 = p.NeedSyncMeta(nil, testNodeWithDevice, testNodeWithDeviceDriverUpdate) + assert.True(t, got) + assert.Equal(t, NeedSyncForGPUDriverVersionMsg, got1) + + // remove labels + got, got1 = p.NeedSyncMeta(nil, testNodeWithDevice, testNodeWithoutDevice) + assert.True(t, got) + assert.Equal(t, NeedSyncForGPUModelMsg, got1) + }) +} + +func TestPluginPrepare(t *testing.T) { + testNodeWithoutDevice := &corev1.Node{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-node", + Labels: map[string]string{ + "test-label": "test-value", + }, + }, + Status: corev1.NodeStatus{ + Allocatable: map[corev1.ResourceName]resource.Quantity{ + corev1.ResourceCPU: resource.MustParse("100"), + corev1.ResourceMemory: resource.MustParse("400Gi"), + }, + Capacity: map[corev1.ResourceName]resource.Quantity{ + corev1.ResourceCPU: resource.MustParse("100"), + corev1.ResourceMemory: resource.MustParse("400Gi"), + }, + }, + } + testNodeWithDevice := &corev1.Node{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-node", + Labels: map[string]string{ + "test-label": "test-value", + extension.LabelGPUModel: "A100", + extension.LabelGPUDriverVersion: "480", + }, + }, + Status: corev1.NodeStatus{ + Allocatable: map[corev1.ResourceName]resource.Quantity{ + corev1.ResourceCPU: resource.MustParse("100"), + corev1.ResourceMemory: resource.MustParse("400Gi"), + extension.ResourceGPU: *resource.NewQuantity(200, resource.DecimalSI), + extension.ResourceGPUCore: *resource.NewQuantity(200, resource.DecimalSI), + extension.ResourceGPUMemory: *resource.NewQuantity(18000, resource.DecimalSI), + extension.ResourceGPUMemoryRatio: *resource.NewQuantity(200, resource.DecimalSI), + }, + Capacity: map[corev1.ResourceName]resource.Quantity{ + corev1.ResourceCPU: resource.MustParse("100"), + corev1.ResourceMemory: resource.MustParse("400Gi"), + extension.ResourceGPU: *resource.NewQuantity(200, resource.DecimalSI), + extension.ResourceGPUCore: *resource.NewQuantity(200, resource.DecimalSI), + extension.ResourceGPUMemory: *resource.NewQuantity(18000, resource.DecimalSI), + extension.ResourceGPUMemoryRatio: *resource.NewQuantity(200, resource.DecimalSI), + }, + }, + } + testNodeWithoutDeviceResources := &corev1.Node{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-node", + Labels: map[string]string{ + "test-label": "test-value", + extension.LabelGPUModel: "A100", + extension.LabelGPUDriverVersion: "480", + }, + }, + Status: corev1.NodeStatus{ + Allocatable: map[corev1.ResourceName]resource.Quantity{ + corev1.ResourceCPU: resource.MustParse("100"), + corev1.ResourceMemory: resource.MustParse("400Gi"), + }, + Capacity: map[corev1.ResourceName]resource.Quantity{ + corev1.ResourceCPU: resource.MustParse("100"), + corev1.ResourceMemory: resource.MustParse("400Gi"), + }, + }, + } + type args struct { + node *corev1.Node + nr *framework.NodeResource + } + tests := []struct { + name string + args args + wantErr bool + wantField *corev1.Node + }{ + { + name: "nothing to prepare", + args: args{ + node: testNodeWithoutDevice, + nr: framework.NewNodeResource(), + }, + wantErr: false, + wantField: testNodeWithoutDevice, + }, + { + name: "update resources and labels correctly", + args: args{ + node: testNodeWithoutDevice, + nr: &framework.NodeResource{ + Resources: map[corev1.ResourceName]*resource.Quantity{ + extension.ResourceGPU: resource.NewQuantity(200, resource.DecimalSI), + extension.ResourceGPUCore: resource.NewQuantity(200, resource.DecimalSI), + extension.ResourceGPUMemory: resource.NewQuantity(18000, resource.DecimalSI), + extension.ResourceGPUMemoryRatio: resource.NewQuantity(200, resource.DecimalSI), + }, + ZoneResources: map[string]corev1.ResourceList{}, + Labels: map[string]string{ + extension.LabelGPUModel: "A100", + extension.LabelGPUDriverVersion: "480", + }, + Annotations: map[string]string{ + "ignored-annotation": "ignored-value", + }, + Messages: map[corev1.ResourceName]string{}, + Resets: map[corev1.ResourceName]bool{}, + }, + }, + wantErr: false, + wantField: testNodeWithDevice, + }, + { + name: "reset resources correctly", + args: args{ + node: testNodeWithDevice, + nr: framework.NewNodeResource(), + }, + wantErr: false, + wantField: testNodeWithoutDeviceResources, + }, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + p := &Plugin{} + gotErr := p.Prepare(nil, tt.args.node, tt.args.nr) + assert.Equal(t, tt.wantErr, gotErr != nil, gotErr) + assert.Equal(t, tt.wantField, tt.args.node) + }) + } +} + +func TestPluginCalculate(t *testing.T) { + testScheme := runtime.NewScheme() + err := clientgoscheme.AddToScheme(testScheme) + assert.NoError(t, err) + err = topov1alpha1.AddToScheme(testScheme) + assert.NoError(t, err) + err = schedulingv1alpha1.AddToScheme(testScheme) + assert.NoError(t, err) + testNode := &corev1.Node{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-node", + Labels: map[string]string{ + "test-label": "test-value", + }, + }, + Status: corev1.NodeStatus{ + Allocatable: map[corev1.ResourceName]resource.Quantity{ + corev1.ResourceCPU: resource.MustParse("100"), + corev1.ResourceMemory: resource.MustParse("400Gi"), + }, + Capacity: map[corev1.ResourceName]resource.Quantity{ + corev1.ResourceCPU: resource.MustParse("100"), + corev1.ResourceMemory: resource.MustParse("400Gi"), + }, + }, + } + testDevice := &schedulingv1alpha1.Device{ + ObjectMeta: metav1.ObjectMeta{ + Name: testNode.Name, + Labels: map[string]string{ + extension.LabelGPUModel: "A100", + extension.LabelGPUDriverVersion: "480", + }, + }, + Spec: schedulingv1alpha1.DeviceSpec{ + Devices: []schedulingv1alpha1.DeviceInfo{ + { + UUID: "1", + Minor: pointer.Int32(0), + Health: true, + Type: schedulingv1alpha1.GPU, + Resources: map[corev1.ResourceName]resource.Quantity{ + extension.ResourceGPUCore: *resource.NewQuantity(100, resource.DecimalSI), + extension.ResourceGPUMemory: *resource.NewQuantity(8000, resource.DecimalSI), + extension.ResourceGPUMemoryRatio: *resource.NewQuantity(100, resource.DecimalSI), + }, + }, + { + UUID: "2", + Minor: pointer.Int32(1), + Health: true, + Type: schedulingv1alpha1.GPU, + Resources: map[corev1.ResourceName]resource.Quantity{ + extension.ResourceGPUCore: *resource.NewQuantity(100, resource.DecimalSI), + extension.ResourceGPUMemory: *resource.NewQuantity(10000, resource.DecimalSI), + extension.ResourceGPUMemoryRatio: *resource.NewQuantity(100, resource.DecimalSI), + }, + }, + }, + }, + } + type fields struct { + client ctrlclient.Client + } + type args struct { + node *corev1.Node + } + tests := []struct { + name string + fields fields + args args + want []framework.ResourceItem + wantErr bool + }{ + { + name: "args missing essential fields", + fields: fields{ + client: fake.NewClientBuilder().WithScheme(testScheme).Build(), + }, + args: args{ + node: &corev1.Node{}, + }, + want: nil, + wantErr: true, + }, + { + name: "get device object error", + fields: fields{ + client: fake.NewClientBuilder().WithScheme(runtime.NewScheme()).Build(), + }, + args: args{ + node: testNode, + }, + want: nil, + wantErr: true, + }, + { + name: "calculate device resources correctly", + fields: fields{ + client: fake.NewClientBuilder().WithScheme(testScheme).WithObjects(testNode, testDevice).Build(), + }, + args: args{ + node: testNode, + }, + want: []framework.ResourceItem{ + { + Name: extension.ResourceGPU, + Quantity: resource.NewQuantity(200, resource.DecimalSI), + Message: UpdateResourcesMsg, + }, + { + Name: extension.ResourceGPUCore, + Quantity: resource.NewQuantity(200, resource.DecimalSI), + Message: UpdateResourcesMsg, + }, + { + Name: extension.ResourceGPUMemory, + Quantity: resource.NewScaledQuantity(18, 3), + Message: UpdateResourcesMsg, + }, + { + Name: extension.ResourceGPUMemoryRatio, + Quantity: resource.NewQuantity(200, resource.DecimalSI), + Message: UpdateResourcesMsg, + }, + { + Name: PluginName, + Labels: map[string]string{ + extension.LabelGPUModel: "A100", + extension.LabelGPUDriverVersion: "480", + }, + Message: UpdateLabelsMsg, + }, + }, + wantErr: false, + }, + { + name: "calculate device resources correctly", + fields: fields{ + client: fake.NewClientBuilder().WithScheme(testScheme).WithObjects(testNode, testDevice).Build(), + }, + args: args{ + node: testNode, + }, + want: []framework.ResourceItem{ + { + Name: extension.ResourceGPU, + Quantity: resource.NewQuantity(200, resource.DecimalSI), + Message: UpdateResourcesMsg, + }, + { + Name: extension.ResourceGPUCore, + Quantity: resource.NewQuantity(200, resource.DecimalSI), + Message: UpdateResourcesMsg, + }, + { + Name: extension.ResourceGPUMemory, + Quantity: resource.NewScaledQuantity(18, 3), + Message: UpdateResourcesMsg, + }, + { + Name: extension.ResourceGPUMemoryRatio, + Quantity: resource.NewQuantity(200, resource.DecimalSI), + Message: UpdateResourcesMsg, + }, + { + Name: PluginName, + Labels: map[string]string{ + extension.LabelGPUModel: "A100", + extension.LabelGPUDriverVersion: "480", + }, + Message: UpdateLabelsMsg, + }, + }, + wantErr: false, + }, + { + name: "calculate resetting device resources", + fields: fields{ + client: fake.NewClientBuilder().WithScheme(testScheme).WithObjects(testNode).Build(), + }, + args: args{ + node: testNode, + }, + want: []framework.ResourceItem{ + { + Name: extension.ResourceGPU, + Reset: true, + Message: ResetResourcesMsg, + }, + { + Name: extension.ResourceGPUCore, + Reset: true, + Message: ResetResourcesMsg, + }, + { + Name: extension.ResourceGPUMemory, + Reset: true, + Message: ResetResourcesMsg, + }, + { + Name: extension.ResourceGPUMemoryRatio, + Reset: true, + Message: ResetResourcesMsg, + }, + }, + wantErr: false, + }, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + p := &Plugin{} + client = tt.fields.client + defer testPluginCleanup() + got, gotErr := p.Calculate(nil, tt.args.node, nil, nil) + assert.Equal(t, tt.wantErr, gotErr != nil, gotErr) + assert.Equal(t, tt.want, got) + }) + } +} + +func testPluginCleanup() { + client = nil +} diff --git a/pkg/slo-controller/noderesource/plugins/midresource/plugin.go b/pkg/slo-controller/noderesource/plugins/midresource/plugin.go index 50bbf7490..d554ac916 100644 --- a/pkg/slo-controller/noderesource/plugins/midresource/plugin.go +++ b/pkg/slo-controller/noderesource/plugins/midresource/plugin.go @@ -22,8 +22,8 @@ import ( corev1 "k8s.io/api/core/v1" "k8s.io/apimachinery/pkg/api/resource" - "k8s.io/apimachinery/pkg/util/clock" "k8s.io/klog/v2" + "k8s.io/utils/clock" "github.com/koordinator-sh/koordinator/apis/configuration" "github.com/koordinator-sh/koordinator/apis/extension" @@ -38,7 +38,7 @@ const PluginName = "MidResource" // ResourceNames defines the Mid-tier extended resource names to update. var ResourceNames = []corev1.ResourceName{extension.MidCPU, extension.MidMemory} -var clk clock.Clock = clock.RealClock{} // for testing +var clk clock.WithTickerAndDelayedExecution = clock.RealClock{} // for testing type Plugin struct{} @@ -90,7 +90,7 @@ func (p *Plugin) Calculate(strategy *configuration.ColocationStrategy, node *cor // if the node metric is abnormal, do degraded calculation if p.isDegradeNeeded(strategy, metrics.NodeMetric, node) { - klog.InfoS("node Mid-tier need degradation, reset node resources", "node", node.Name) + klog.V(5).InfoS("node Mid-tier need degradation, reset node resources", "node", node.Name) return p.degradeCalculate(node, "degrade node Mid resource because of abnormal nodeMetric, reason: degradedByMidResource"), nil } diff --git a/pkg/slo-controller/noderesource/plugins_profile.go b/pkg/slo-controller/noderesource/plugins_profile.go index dc287807e..3c252be93 100644 --- a/pkg/slo-controller/noderesource/plugins_profile.go +++ b/pkg/slo-controller/noderesource/plugins_profile.go @@ -20,6 +20,7 @@ import ( "github.com/koordinator-sh/koordinator/pkg/slo-controller/noderesource/framework" "github.com/koordinator-sh/koordinator/pkg/slo-controller/noderesource/plugins/batchresource" "github.com/koordinator-sh/koordinator/pkg/slo-controller/noderesource/plugins/cpunormalization" + "github.com/koordinator-sh/koordinator/pkg/slo-controller/noderesource/plugins/gpudeviceresource" "github.com/koordinator-sh/koordinator/pkg/slo-controller/noderesource/plugins/midresource" "github.com/koordinator-sh/koordinator/pkg/slo-controller/noderesource/plugins/resourceamplification" ) @@ -32,6 +33,7 @@ func init() { addPluginOption(&batchresource.Plugin{}, true) addPluginOption(&cpunormalization.Plugin{}, true) addPluginOption(&resourceamplification.Plugin{}, true) + addPluginOption(&gpudeviceresource.Plugin{}, true) } func addPlugins(filter framework.FilterFn) { @@ -49,6 +51,7 @@ var ( setupPlugins = []framework.SetupPlugin{ &cpunormalization.Plugin{}, &batchresource.Plugin{}, + &gpudeviceresource.Plugin{}, } // NodePreUpdatePlugin implements node resource pre-updating. nodePreUpdatePlugins = []framework.NodePreUpdatePlugin{ @@ -60,16 +63,19 @@ var ( &resourceamplification.Plugin{}, &midresource.Plugin{}, &batchresource.Plugin{}, + &gpudeviceresource.Plugin{}, } // NodeSyncPlugin implements the check of resource updating. nodeStatusCheckPlugins = []framework.NodeStatusCheckPlugin{ &midresource.Plugin{}, &batchresource.Plugin{}, + &gpudeviceresource.Plugin{}, } // nodeMetaCheckPlugins implements the check of node meta updating. nodeMetaCheckPlugins = []framework.NodeMetaCheckPlugin{ &cpunormalization.Plugin{}, &resourceamplification.Plugin{}, + &gpudeviceresource.Plugin{}, } // ResourceCalculatePlugin implements resource counting and overcommitment algorithms. resourceCalculatePlugins = []framework.ResourceCalculatePlugin{ @@ -77,5 +83,6 @@ var ( &resourceamplification.Plugin{}, &midresource.Plugin{}, &batchresource.Plugin{}, + &gpudeviceresource.Plugin{}, } ) diff --git a/pkg/slo-controller/noderesource/resource_calculator.go b/pkg/slo-controller/noderesource/resource_calculator.go index bc29d6ec0..bb50742ff 100644 --- a/pkg/slo-controller/noderesource/resource_calculator.go +++ b/pkg/slo-controller/noderesource/resource_calculator.go @@ -168,15 +168,6 @@ func (r *NodeResourceReconciler) updateNodeMeta(node *corev1.Node, strategy *con // updateNodeExtensions is an extension point for updating node other than node metric resources. func (r *NodeResourceReconciler) updateNodeExtensions(node *corev1.Node, nodeMetric *slov1alpha1.NodeMetric, podList *corev1.PodList) error { - // update device resources - if err := r.updateDeviceResources(node); err != nil { - metrics.RecordNodeResourceReconcileCount(false, "updateDeviceResources") - klog.V(4).InfoS("failed to update device resources for node", "node", node.Name, - "err", err) - return err - } - metrics.RecordNodeResourceReconcileCount(true, "updateDeviceResources") - return nil }