scheduler: support multi-GPU share
Signed-off-by: machao <986292120@qq.com>
AdrianMachao committed Jul 10, 2024
1 parent 448015c commit 7e3031b
Showing 11 changed files with 1,040 additions and 20 deletions.
24 changes: 15 additions & 9 deletions pkg/scheduler/plugins/deviceshare/devicehandler_gpu.go
@@ -51,17 +51,23 @@ func (h *GPUHandler) CalcDesiredRequestsAndCount(node *corev1.Node, pod *corev1.
requests := podRequests
desiredCount := int64(1)

- memoryRatio := podRequests[apiext.ResourceGPUMemoryRatio]
- multiDevices := memoryRatio.Value() > 100 && memoryRatio.Value()%100 == 0
- if multiDevices {
- gpuCore, gpuMem, gpuMemoryRatio := podRequests[apiext.ResourceGPUCore], podRequests[apiext.ResourceGPUMemory], podRequests[apiext.ResourceGPUMemoryRatio]
- desiredCount = gpuMemoryRatio.Value() / 100
- requests = corev1.ResourceList{
- apiext.ResourceGPUCore: *resource.NewQuantity(gpuCore.Value()/desiredCount, resource.DecimalSI),
- apiext.ResourceGPUMemory: *resource.NewQuantity(gpuMem.Value()/desiredCount, resource.BinarySI),
- apiext.ResourceGPUMemoryRatio: *resource.NewQuantity(gpuMemoryRatio.Value()/desiredCount, resource.DecimalSI),
- }
- }
+ gpuShare, ok := podRequests[apiext.ResourceGPUShared]
+ gpuCore, gpuMem, gpuMemoryRatio := podRequests[apiext.ResourceGPUCore], podRequests[apiext.ResourceGPUMemory], podRequests[apiext.ResourceGPUMemoryRatio]
+ // GPU share mode: the pod explicitly declares how many shared GPU instances it wants.
+ if ok && gpuShare.Value() > 0 {
+ desiredCount = gpuShare.Value()
+ } else if gpuMemoryRatio.Value() > 100 && gpuMemoryRatio.Value()%100 == 0 {
+ // Otherwise infer the device count from a memory ratio spanning multiple whole GPUs.
+ desiredCount = gpuMemoryRatio.Value() / 100
+ }
+
+ requests = corev1.ResourceList{
+ apiext.ResourceGPUCore: *resource.NewQuantity(gpuCore.Value()/desiredCount, resource.DecimalSI),
+ apiext.ResourceGPUMemory: *resource.NewQuantity(gpuMem.Value()/desiredCount, resource.BinarySI),
+ apiext.ResourceGPUMemoryRatio: *resource.NewQuantity(gpuMemoryRatio.Value()/desiredCount, resource.DecimalSI),
+ }

return requests, int(desiredCount), nil
}

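Editor's note: to illustrate the branch above, here is a minimal, self-contained sketch of the per-device split, assuming a hypothetical pod that requests 4 shared GPUs and 160Gi of total GPU memory (values invented for illustration; the real handler operates on the apiext resource names).

// Sketch only, not the plugin code. Mirrors the share-mode split above
// under an assumed request of 4 shared GPUs and 160Gi total GPU memory.
package main

import (
	"fmt"

	"k8s.io/apimachinery/pkg/api/resource"
)

func main() {
	gpuShare := resource.MustParse("4")   // hypothetical gpu-shared request
	gpuMem := resource.MustParse("160Gi") // hypothetical gpu-memory request

	// Share mode: one allocation per requested shared GPU.
	desiredCount := gpuShare.Value()
	perDeviceMem := resource.NewQuantity(gpuMem.Value()/desiredCount, resource.BinarySI)

	fmt.Printf("desiredCount=%d perDeviceMemory=%s\n", desiredCount, perDeviceMem.String()) // 4 40Gi
}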
216 changes: 216 additions & 0 deletions pkg/scheduler/plugins/deviceshare/plugin_test.go
@@ -1848,6 +1848,221 @@ func Test_Plugin_Filter(t *testing.T) {
nodeInfo: testNodeInfo,
want: nil,
},
{
name: "pod stuck when use multi gpu",
state: &preFilterState{
skip: false,
podRequests: map[schedulingv1alpha1.DeviceType]corev1.ResourceList{
schedulingv1alpha1.GPU: {
apiext.ResourceGPUShared: resource.MustParse("4"),
apiext.ResourceGPUMemory: resource.MustParse("160G"),
},
},
},
nodeDeviceCache: &nodeDeviceCache{
nodeDeviceInfos: map[string]*nodeDevice{
"test-node": {
allocateSet: map[schedulingv1alpha1.DeviceType]map[types.NamespacedName]deviceResources{},
deviceFree: map[schedulingv1alpha1.DeviceType]deviceResources{
schedulingv1alpha1.GPU: {
0: corev1.ResourceList{
apiext.ResourceGPUCore: resource.MustParse("100"),
apiext.ResourceGPUMemoryRatio: resource.MustParse("100"),
apiext.ResourceGPUMemory: resource.MustParse("80Gi"),
},
1: corev1.ResourceList{
apiext.ResourceGPUCore: resource.MustParse("100"),
apiext.ResourceGPUMemoryRatio: resource.MustParse("100"),
apiext.ResourceGPUMemory: resource.MustParse("80Gi"),
},
2: corev1.ResourceList{
apiext.ResourceGPUCore: resource.MustParse("100"),
apiext.ResourceGPUMemoryRatio: resource.MustParse("100"),
apiext.ResourceGPUMemory: resource.MustParse("80Gi"),
},
3: corev1.ResourceList{
apiext.ResourceGPUCore: resource.MustParse("100"),
apiext.ResourceGPUMemoryRatio: resource.MustParse("100"),
apiext.ResourceGPUMemory: resource.MustParse("80Gi"),
},
4: corev1.ResourceList{
apiext.ResourceGPUCore: resource.MustParse("100"),
apiext.ResourceGPUMemoryRatio: resource.MustParse("100"),
apiext.ResourceGPUMemory: resource.MustParse("80Gi"),
},
5: corev1.ResourceList{
apiext.ResourceGPUCore: resource.MustParse("100"),
apiext.ResourceGPUMemoryRatio: resource.MustParse("100"),
apiext.ResourceGPUMemory: resource.MustParse("80Gi"),
},
6: corev1.ResourceList{
apiext.ResourceGPUCore: resource.MustParse("100"),
apiext.ResourceGPUMemoryRatio: resource.MustParse("100"),
apiext.ResourceGPUMemory: resource.MustParse("80Gi"),
},
7: corev1.ResourceList{
apiext.ResourceGPUCore: resource.MustParse("100"),
apiext.ResourceGPUMemoryRatio: resource.MustParse("100"),
apiext.ResourceGPUMemory: resource.MustParse("80Gi"),
},
8: corev1.ResourceList{
apiext.ResourceGPUCore: resource.MustParse("100"),
apiext.ResourceGPUMemoryRatio: resource.MustParse("100"),
apiext.ResourceGPUMemory: resource.MustParse("80Gi"),
},
},
},
deviceTotal: map[schedulingv1alpha1.DeviceType]deviceResources{
schedulingv1alpha1.GPU: {
0: corev1.ResourceList{
apiext.ResourceGPUCore: resource.MustParse("100"),
apiext.ResourceGPUMemoryRatio: resource.MustParse("100"),
apiext.ResourceGPUMemory: resource.MustParse("80Gi"),
},
1: corev1.ResourceList{
apiext.ResourceGPUCore: resource.MustParse("100"),
apiext.ResourceGPUMemoryRatio: resource.MustParse("100"),
apiext.ResourceGPUMemory: resource.MustParse("80Gi"),
},
2: corev1.ResourceList{
apiext.ResourceGPUCore: resource.MustParse("100"),
apiext.ResourceGPUMemoryRatio: resource.MustParse("100"),
apiext.ResourceGPUMemory: resource.MustParse("80Gi"),
},
3: corev1.ResourceList{
apiext.ResourceGPUCore: resource.MustParse("100"),
apiext.ResourceGPUMemoryRatio: resource.MustParse("100"),
apiext.ResourceGPUMemory: resource.MustParse("80Gi"),
},
4: corev1.ResourceList{
apiext.ResourceGPUCore: resource.MustParse("100"),
apiext.ResourceGPUMemoryRatio: resource.MustParse("100"),
apiext.ResourceGPUMemory: resource.MustParse("80Gi"),
},
5: corev1.ResourceList{
apiext.ResourceGPUCore: resource.MustParse("100"),
apiext.ResourceGPUMemoryRatio: resource.MustParse("100"),
apiext.ResourceGPUMemory: resource.MustParse("80Gi"),
},
6: corev1.ResourceList{
apiext.ResourceGPUCore: resource.MustParse("100"),
apiext.ResourceGPUMemoryRatio: resource.MustParse("100"),
apiext.ResourceGPUMemory: resource.MustParse("80Gi"),
},
7: corev1.ResourceList{
apiext.ResourceGPUCore: resource.MustParse("100"),
apiext.ResourceGPUMemoryRatio: resource.MustParse("100"),
apiext.ResourceGPUMemory: resource.MustParse("80Gi"),
},
8: corev1.ResourceList{
apiext.ResourceGPUCore: resource.MustParse("100"),
apiext.ResourceGPUMemoryRatio: resource.MustParse("100"),
apiext.ResourceGPUMemory: resource.MustParse("80Gi"),
},
},
},
deviceUsed: map[schedulingv1alpha1.DeviceType]deviceResources{},
vfAllocations: map[schedulingv1alpha1.DeviceType]*VFAllocation{},
numaTopology: &NUMATopology{},
deviceInfos: map[schedulingv1alpha1.DeviceType][]*schedulingv1alpha1.DeviceInfo{
schedulingv1alpha1.GPU: {
{
Type: schedulingv1alpha1.GPU,
Health: true,
UUID: "123456-0",
Minor: pointer.Int32(0),
Resources: corev1.ResourceList{
apiext.ResourceGPUCore: resource.MustParse("100"),
apiext.ResourceGPUMemoryRatio: resource.MustParse("100"),
apiext.ResourceGPUMemory: resource.MustParse("80Gi"),
},
},
{
Type: schedulingv1alpha1.GPU,
Health: true,
UUID: "123456-1",
Minor: pointer.Int32(1),
Resources: corev1.ResourceList{
apiext.ResourceGPUCore: resource.MustParse("100"),
apiext.ResourceGPUMemoryRatio: resource.MustParse("100"),
apiext.ResourceGPUMemory: resource.MustParse("80Gi"),
},
},
{
Type: schedulingv1alpha1.GPU,
Health: true,
UUID: "123456-2",
Minor: pointer.Int32(2),
Resources: corev1.ResourceList{
apiext.ResourceGPUCore: resource.MustParse("100"),
apiext.ResourceGPUMemoryRatio: resource.MustParse("100"),
apiext.ResourceGPUMemory: resource.MustParse("80Gi"),
},
},
{
Type: schedulingv1alpha1.GPU,
Health: true,
UUID: "123456-3",
Minor: pointer.Int32(3),
Resources: corev1.ResourceList{
apiext.ResourceGPUCore: resource.MustParse("100"),
apiext.ResourceGPUMemoryRatio: resource.MustParse("100"),
apiext.ResourceGPUMemory: resource.MustParse("80Gi"),
},
},
{
Type: schedulingv1alpha1.GPU,
Health: true,
UUID: "123456-4",
Minor: pointer.Int32(4),
Resources: corev1.ResourceList{
apiext.ResourceGPUCore: resource.MustParse("100"),
apiext.ResourceGPUMemoryRatio: resource.MustParse("100"),
apiext.ResourceGPUMemory: resource.MustParse("80Gi"),
},
},
{
Type: schedulingv1alpha1.GPU,
Health: true,
UUID: "123456-5",
Minor: pointer.Int32(5),
Resources: corev1.ResourceList{
apiext.ResourceGPUCore: resource.MustParse("100"),
apiext.ResourceGPUMemoryRatio: resource.MustParse("100"),
apiext.ResourceGPUMemory: resource.MustParse("80Gi"),
},
},
{
Type: schedulingv1alpha1.GPU,
Health: true,
UUID: "123456-6",
Minor: pointer.Int32(6),
Resources: corev1.ResourceList{
apiext.ResourceGPUCore: resource.MustParse("100"),
apiext.ResourceGPUMemoryRatio: resource.MustParse("100"),
apiext.ResourceGPUMemory: resource.MustParse("80Gi"),
},
},
{
Type: schedulingv1alpha1.GPU,
Health: true,
UUID: "123456-7",
Minor: pointer.Int32(7),
Resources: corev1.ResourceList{
apiext.ResourceGPUCore: resource.MustParse("100"),
apiext.ResourceGPUMemoryRatio: resource.MustParse("100"),
apiext.ResourceGPUMemory: resource.MustParse("80Gi"),
},
},
},
},
},
},
},
nodeInfo: testNodeInfo,
want: nil,
},
}
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
@@ -1910,6 +2125,7 @@ func Test_Plugin_Filter(t *testing.T) {
}
cycleState.Write(reservationRestoreStateKey, restoreState)
}
status := p.Filter(context.TODO(), cycleState, pod, tt.nodeInfo)
assert.Equal(t, tt.want.Code(), status.Code())
assert.True(t, strings.Contains(status.Message(), tt.want.Message()))
51 changes: 42 additions & 9 deletions pkg/scheduler/plugins/deviceshare/utils.go
@@ -38,6 +38,7 @@ const (
NvidiaGPU = 1 << iota
HygonDCU
KoordGPU
GPUShared
GPUCore
GPUMemory
GPUMemoryRatio
@@ -50,6 +51,7 @@ var DeviceResourceNames = map[schedulingv1alpha1.DeviceType][]corev1.ResourceNam
apiext.ResourceNvidiaGPU,
apiext.ResourceHygonDCU,
apiext.ResourceGPU,
apiext.ResourceGPUShared,
apiext.ResourceGPUCore,
apiext.ResourceGPUMemory,
apiext.ResourceGPUMemoryRatio,
@@ -65,20 +67,25 @@ var DeviceResourceFlags = map[corev1.ResourceName]uint{
apiext.ResourceGPUCore: GPUCore,
apiext.ResourceGPUMemory: GPUMemory,
apiext.ResourceGPUMemoryRatio: GPUMemoryRatio,
apiext.ResourceGPUShared: GPUShared,
apiext.ResourceFPGA: FPGA,
apiext.ResourceRDMA: RDMA,
}

var ValidDeviceResourceCombinations = map[uint]bool{
NvidiaGPU: true,
HygonDCU: true,
KoordGPU: true,
GPUMemory: true,
GPUMemoryRatio: true,
GPUCore | GPUMemory: true,
GPUCore | GPUMemoryRatio: true,
+ GPUShared | GPUMemory: true,
+ GPUShared | GPUMemoryRatio: true,
+ GPUShared | GPUCore | GPUMemory: true,
+ GPUShared | GPUCore | GPUMemoryRatio: true,
FPGA: true,
RDMA: true,
}
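Editor's note: the validation itself is a bitmask lookup. Each device resource name contributes one flag bit, the bits of all requested names are OR-ed together, and the result must be a key of ValidDeviceResourceCombinations. A minimal sketch under the flag values implied by the const block above:

// Sketch of the combination check; flag values mirror the iota order above.
package main

import "fmt"

const (
	NvidiaGPU = 1 << iota
	HygonDCU
	KoordGPU
	GPUShared
	GPUCore
	GPUMemory
	GPUMemoryRatio
)

var valid = map[uint]bool{
	GPUShared | GPUMemoryRatio: true, // e.g. gpu-shared plus gpu-memory-ratio
}

func main() {
	// A hypothetical pod requesting gpu-shared and gpu-memory-ratio.
	var combination uint = GPUShared | GPUMemoryRatio
	fmt.Println(valid[combination])           // true
	fmt.Println(valid[combination|NvidiaGPU]) // false: mixing in nvidia.com/gpu is rejected
}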

var DeviceResourceValidators = map[corev1.ResourceName]func(q resource.Quantity) bool{
@@ -118,6 +125,32 @@ var ResourceCombinationsMapper = map[uint]func(podRequest corev1.ResourceList) c
apiext.ResourceGPUMemoryRatio: podRequest[apiext.ResourceGPU],
}
},
GPUShared | GPUMemory: func(podRequest corev1.ResourceList) corev1.ResourceList {
return corev1.ResourceList{
apiext.ResourceGPUShared: podRequest[apiext.ResourceGPUShared],
apiext.ResourceGPUMemory: podRequest[apiext.ResourceGPUMemory],
}
},
GPUShared | GPUMemoryRatio: func(podRequest corev1.ResourceList) corev1.ResourceList {
return corev1.ResourceList{
apiext.ResourceGPUShared: podRequest[apiext.ResourceGPUShared],
apiext.ResourceGPUMemoryRatio: podRequest[apiext.ResourceGPUMemoryRatio],
}
},
GPUShared | GPUCore | GPUMemory: func(podRequest corev1.ResourceList) corev1.ResourceList {
return corev1.ResourceList{
apiext.ResourceGPUShared: podRequest[apiext.ResourceGPUShared],
apiext.ResourceGPUCore: podRequest[apiext.ResourceGPUCore],
apiext.ResourceGPUMemory: podRequest[apiext.ResourceGPUMemory],
}
},
GPUShared | GPUCore | GPUMemoryRatio: func(podRequest corev1.ResourceList) corev1.ResourceList {
return corev1.ResourceList{
apiext.ResourceGPUShared: podRequest[apiext.ResourceGPUShared],
apiext.ResourceGPUCore: podRequest[apiext.ResourceGPUCore],
apiext.ResourceGPUMemoryRatio: podRequest[apiext.ResourceGPUMemoryRatio],
}
},
NvidiaGPU: func(podRequest corev1.ResourceList) corev1.ResourceList {
nvidiaGPU := podRequest[apiext.ResourceNvidiaGPU]
return corev1.ResourceList{
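Editor's note: once a combination is validated, the matching mapper normalizes the raw pod request into the canonical resource list the allocator consumes. A hedged usage sketch for the GPUShared | GPUMemoryRatio entry (after the ResourceGPUMemoryRatio fix above); the resource-name strings below are assumptions mirroring the apiext constants, not verified values:

// Sketch only; the resource-name strings are assumed, not the apiext source.
package main

import (
	"fmt"

	corev1 "k8s.io/api/core/v1"
	"k8s.io/apimachinery/pkg/api/resource"
)

const (
	resourceGPUShared      corev1.ResourceName = "koordinator.sh/gpu-shared"       // assumed name
	resourceGPUMemoryRatio corev1.ResourceName = "koordinator.sh/gpu-memory-ratio" // assumed name
)

func main() {
	podRequest := corev1.ResourceList{
		resourceGPUShared:      resource.MustParse("2"),
		resourceGPUMemoryRatio: resource.MustParse("200"), // two whole GPUs' worth of memory
	}
	// What the GPUShared|GPUMemoryRatio mapper would return.
	normalized := corev1.ResourceList{
		resourceGPUShared:      podRequest[resourceGPUShared],
		resourceGPUMemoryRatio: podRequest[resourceGPUMemoryRatio],
	}
	shared := normalized[resourceGPUShared]
	ratio := normalized[resourceGPUMemoryRatio]
	fmt.Println(shared.Value(), ratio.Value()) // 2 200
}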
@@ -161,14 +161,17 @@ func (p *Plugin) calculate(node *corev1.Node, device *schedulingv1alpha1.Device)
// calculate gpu resources
gpuResources := make(corev1.ResourceList)
totalKoordGPU := resource.NewQuantity(0, resource.DecimalSI)
healthGPUNum := 0
for _, d := range device.Spec.Devices {
if d.Type != schedulingv1alpha1.GPU || !d.Health {
continue
}
healthGPUNum++
util.AddResourceList(gpuResources, d.Resources)
totalKoordGPU.Add(d.Resources[extension.ResourceGPUCore])
}
gpuResources[extension.ResourceGPU] = *totalKoordGPU
gpuResources[extension.ResourceGPUShared] = *resource.NewQuantity(int64(healthGPUNum)*100, resource.DecimalSI) // advertise 100 gpu-shared units per healthy GPU
var items []framework.ResourceItem
// FIXME: shall we add node resources in devices but not in ResourceNames?
for resourceName := range gpuResources {
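Editor's note: a worked example of the aggregation above. Assuming a node with 8 healthy GPUs of 100 gpu-core each (an invented node, for illustration), the summed GPU total comes to 800 and the advertised gpu-shared capacity is 8 × 100 = 800, following the healthGPUNum scaling on the line above:

// Sketch of the node-level totals; the device count and per-device values are assumed.
package main

import (
	"fmt"

	"k8s.io/apimachinery/pkg/api/resource"
)

func main() {
	healthGPUNum := 0
	totalKoordGPU := resource.NewQuantity(0, resource.DecimalSI)
	for i := 0; i < 8; i++ { // 8 hypothetical healthy GPUs
		healthGPUNum++
		totalKoordGPU.Add(*resource.NewQuantity(100, resource.DecimalSI)) // gpu-core per device
	}
	gpuShared := resource.NewQuantity(int64(healthGPUNum)*100, resource.DecimalSI)
	fmt.Println(totalKoordGPU.Value(), gpuShared.Value()) // 800 800
}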
