Skip to content

Commit

Permalink
scheduler:support multi gpu share
Browse files Browse the repository at this point in the history
Signed-off-by: machao <986292120@qq.com>
  • Loading branch information
AdrianMachao committed Jul 13, 2024
1 parent 6861c87 commit 81f2016
Show file tree
Hide file tree
Showing 13 changed files with 1,713 additions and 676 deletions.
3 changes: 2 additions & 1 deletion apis/configuration/slo_controller_config.go
Original file line number Diff line number Diff line change
Expand Up @@ -251,7 +251,8 @@ type ColocationStrategy struct {
// MidMemoryThresholdPercent defines the maximum percentage of the Mid-tier memory resource dividing the node allocatable.
// MidMemoryAllocatable <= NodeMemoryAllocatable * MidMemoryThresholdPercent / 100.
MidMemoryThresholdPercent *int64 `json:"midMemoryThresholdPercent,omitempty" validate:"omitempty,min=0,max=100"`

// UseDeviceGPUSharedResource determines whether to use the shared resource reported by the device or the one calculated by the slo-controller noderesource plugin.
UseDeviceGPUSharedResource *bool `json:"useDeviceGPUSharedResource,omitempty"`
ColocationStrategyExtender `json:",inline"` // for third-party extension
}

Expand Down
5 changes: 5 additions & 0 deletions apis/configuration/zz_generated.deepcopy.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

18 changes: 13 additions & 5 deletions pkg/scheduler/plugins/deviceshare/devicehandler_gpu.go
Original file line number Diff line number Diff line change
Expand Up @@ -51,17 +51,25 @@ func (h *GPUHandler) CalcDesiredRequestsAndCount(node *corev1.Node, pod *corev1.
requests := podRequests
desiredCount := int64(1)

memoryRatio := podRequests[apiext.ResourceGPUMemoryRatio]
multiDevices := memoryRatio.Value() > 100 && memoryRatio.Value()%100 == 0
if multiDevices {
gpuCore, gpuMem, gpuMemoryRatio := podRequests[apiext.ResourceGPUCore], podRequests[apiext.ResourceGPUMemory], podRequests[apiext.ResourceGPUMemoryRatio]
desiredCount = gpuMemoryRatio.Value() / 100
gpuShare, ok := podRequests[apiext.ResourceGPUShared]
gpuCore, gpuMem, gpuMemoryRatio := podRequests[apiext.ResourceGPUCore], podRequests[apiext.ResourceGPUMemory], podRequests[apiext.ResourceGPUMemoryRatio]
// gpu share mode
if ok && gpuShare.Value() > 0 {
desiredCount = gpuShare.Value()
} else {
if gpuMemoryRatio.Value() > 100 && gpuMemoryRatio.Value()%100 == 0 {
desiredCount = gpuMemoryRatio.Value() / 100
}
}

if desiredCount > 1 {
requests = corev1.ResourceList{
apiext.ResourceGPUCore: *resource.NewQuantity(gpuCore.Value()/desiredCount, resource.DecimalSI),
apiext.ResourceGPUMemory: *resource.NewQuantity(gpuMem.Value()/desiredCount, resource.BinarySI),
apiext.ResourceGPUMemoryRatio: *resource.NewQuantity(gpuMemoryRatio.Value()/desiredCount, resource.DecimalSI),
}
}

return requests, int(desiredCount), nil
}

Expand Down
Loading

0 comments on commit 81f2016

Please sign in to comment.