scheduler: support multi-GPU share

Signed-off-by: machao <986292120@qq.com>
koordinator-sh · Jul 13, 2024 · 81f2016 · 81f2016
1 parent 6861c87
commit 81f2016
Show file tree

Hide file tree

Showing 13 changed files with 1,713 additions and 676 deletions.
diff --git a/apis/configuration/slo_controller_config.go b/apis/configuration/slo_controller_config.go
@@ -251,7 +251,8 @@ type ColocationStrategy struct {
 	// MidMemoryThresholdPercent defines the maximum percentage of the Mid-tier memory resource dividing the node allocatable.
 	// MidMemoryAllocatable <= NodeMemoryAllocatable * MidMemoryThresholdPercent / 100.
 	MidMemoryThresholdPercent *int64 `json:"midMemoryThresholdPercent,omitempty" validate:"omitempty,min=0,max=100"`
-
+	// UseDeviceGPUSharedResource determines whether to use the shared resource reported by the device or the one calculated by the slo-controller noderesource plugin.
+	UseDeviceGPUSharedResource *bool            `json:"useDeviceGPUSharedResource,omitempty"`
 	ColocationStrategyExtender `json:",inline"` // for third-party extension
 }
 

diff --git a/apis/configuration/zz_generated.deepcopy.go b/apis/configuration/zz_generated.deepcopy.go
diff --git a/pkg/scheduler/plugins/deviceshare/devicehandler_gpu.go b/pkg/scheduler/plugins/deviceshare/devicehandler_gpu.go
@@ -51,17 +51,25 @@ func (h *GPUHandler) CalcDesiredRequestsAndCount(node *corev1.Node, pod *corev1.
 	requests := podRequests
 	desiredCount := int64(1)
 
-	memoryRatio := podRequests[apiext.ResourceGPUMemoryRatio]
-	multiDevices := memoryRatio.Value() > 100 && memoryRatio.Value()%100 == 0
-	if multiDevices {
-		gpuCore, gpuMem, gpuMemoryRatio := podRequests[apiext.ResourceGPUCore], podRequests[apiext.ResourceGPUMemory], podRequests[apiext.ResourceGPUMemoryRatio]
-		desiredCount = gpuMemoryRatio.Value() / 100
+	gpuShare, ok := podRequests[apiext.ResourceGPUShared]
+	gpuCore, gpuMem, gpuMemoryRatio := podRequests[apiext.ResourceGPUCore], podRequests[apiext.ResourceGPUMemory], podRequests[apiext.ResourceGPUMemoryRatio]
+	// gpu share mode
+	if ok && gpuShare.Value() > 0 {
+		desiredCount = gpuShare.Value()
+	} else {
+		if gpuMemoryRatio.Value() > 100 && gpuMemoryRatio.Value()%100 == 0 {
+			desiredCount = gpuMemoryRatio.Value() / 100
+		}
+	}
+
+	if desiredCount > 1 {
 		requests = corev1.ResourceList{
 			apiext.ResourceGPUCore:        *resource.NewQuantity(gpuCore.Value()/desiredCount, resource.DecimalSI),
 			apiext.ResourceGPUMemory:      *resource.NewQuantity(gpuMem.Value()/desiredCount, resource.BinarySI),
 			apiext.ResourceGPUMemoryRatio: *resource.NewQuantity(gpuMemoryRatio.Value()/desiredCount, resource.DecimalSI),
 		}
 	}
+
 	return requests, int(desiredCount), nil
 }