Merge branch 'master' of github.com:Project-HAMi/HAMi
archlitchi committed Apr 25, 2024
2 parents 4dd84d3 + c30db33 commit e892934
Showing 22 changed files with 1,950 additions and 192 deletions.
3 changes: 3 additions & 0 deletions charts/hami/templates/scheduler/deployment.yaml
@@ -47,6 +47,9 @@ spec:
- {{ . }}
{{- end }}
{{- end }}
- --leader-elect={{ .Values.scheduler.leaderElect }}
- --leader-elect-resource-name={{ .Values.schedulerName }}
- --leader-elect-resource-namespace={{ .Release.Namespace }}
volumeMounts:
- name: scheduler-config
mountPath: /config
4 changes: 2 additions & 2 deletions charts/hami/values.yaml
@@ -2,7 +2,7 @@

nameOverride: ""
fullnameOverride: ""
imagePullSecrets: []
imagePullSecrets: [ ]
version: "v2.3.9"

#Nvidia GPU Parameters
@@ -46,6 +46,7 @@ scheduler:
defaultCores: 0
defaultGPUNum: 1
metricsBindAddress: ":9395"
leaderElect: true
kubeScheduler:
# @param enabled indicate whether to run kube-scheduler container in the scheduler pod, it's true by default.
enabled: true
@@ -57,7 +58,6 @@ scheduler:
- -v=4
extraArgs:
- --policy-config-file=/config/config.json
- --leader-elect=false
- -v=4
extender:
image: "projecthami/hami"
1 change: 1 addition & 0 deletions cmd/device-plugin/nvidia/main.go
@@ -154,6 +154,7 @@ func loadConfig(c *cli.Context, flags []cli.Flag) (*spec.Config, error) {

func start(c *cli.Context, flags []cli.Flag) error {
klog.Info("Starting FS watcher.")
util.NodeName = os.Getenv(util.NodeNameEnvName)
watcher, err := newFSWatcher(kubeletdevicepluginv1beta1.DevicePluginPath)
if err != nil {
return fmt.Errorf("failed to create FS watcher: %v", err)
1 change: 0 additions & 1 deletion cmd/device-plugin/nvidia/vgpucfg.go
@@ -150,6 +150,5 @@ func generateDeviceConfigFromNvidia(cfg *spec.Config, c *cli.Context, flags []cl
}
}
readFromConfigFile()
util.NodeName = os.Getenv(util.NodeNameEnvName)
return devcfg, nil
}
3 changes: 3 additions & 0 deletions cmd/scheduler/main.go
@@ -22,6 +22,7 @@ import (
"github.com/Project-HAMi/HAMi/pkg/device"
"github.com/Project-HAMi/HAMi/pkg/scheduler"
"github.com/Project-HAMi/HAMi/pkg/scheduler/config"
"github.com/Project-HAMi/HAMi/pkg/scheduler/policy"
"github.com/Project-HAMi/HAMi/pkg/scheduler/routes"
"github.com/Project-HAMi/HAMi/pkg/util"
"github.com/Project-HAMi/HAMi/pkg/version"
@@ -57,6 +58,8 @@ func init() {
rootCmd.Flags().Int32Var(&config.DefaultMem, "default-mem", 0, "default gpu device memory to allocate")
rootCmd.Flags().Int32Var(&config.DefaultCores, "default-cores", 0, "default gpu core percentage to allocate")
rootCmd.Flags().Int32Var(&config.DefaultResourceNum, "default-gpu", 1, "default gpu to allocate")
rootCmd.Flags().StringVar(&config.NodeSchedulerPolicy, "node-scheduler-policy", policy.NodeSchedulerPolicyBinpack.String(), "node scheduler policy")
rootCmd.Flags().StringVar(&config.GPUSchedulerPolicy, "gpu-scheduler-policy", policy.GPUSchedulerPolicySpread.String(), "GPU scheduler policy")
rootCmd.Flags().StringVar(&config.MetricsBindAddress, "metrics-bind-address", ":9395", "The TCP address that the scheduler should bind to for serving prometheus metrics(e.g. 127.0.0.1:9395, :9395)")
rootCmd.PersistentFlags().AddGoFlagSet(device.GlobalFlagSet())
rootCmd.AddCommand(version.VersionCmd)
36 changes: 18 additions & 18 deletions cmd/scheduler/metrics.go
@@ -101,49 +101,49 @@ func (cc ClusterManagerCollector) Collect(ch chan<- prometheus.Metric) {
)
nu := sher.InspectAllNodesUsage()
for nodeID, val := range *nu {
for _, devs := range val.Devices {
for _, devs := range val.Devices.DeviceLists {
ch <- prometheus.MustNewConstMetric(
nodevGPUMemoryLimitDesc,
prometheus.GaugeValue,
float64(devs.Totalmem)*float64(1024)*float64(1024),
nodeID, devs.ID, fmt.Sprint(devs.Index),
float64(devs.Device.Totalmem)*float64(1024)*float64(1024),
nodeID, devs.Device.ID, fmt.Sprint(devs.Device.Index),
)
ch <- prometheus.MustNewConstMetric(
nodevGPUCoreLimitDesc,
prometheus.GaugeValue,
float64(devs.Totalcore),
nodeID, devs.ID, fmt.Sprint(devs.Index),
float64(devs.Device.Totalcore),
nodeID, devs.Device.ID, fmt.Sprint(devs.Device.Index),
)
ch <- prometheus.MustNewConstMetric(
nodevGPUMemoryAllocatedDesc,
prometheus.GaugeValue,
float64(devs.Usedmem)*float64(1024)*float64(1024),
nodeID, devs.ID, fmt.Sprint(devs.Index), fmt.Sprint(devs.Usedcores),
float64(devs.Device.Usedmem)*float64(1024)*float64(1024),
nodeID, devs.Device.ID, fmt.Sprint(devs.Device.Index), fmt.Sprint(devs.Device.Usedcores),
)
ch <- prometheus.MustNewConstMetric(
nodevGPUSharedNumDesc,
prometheus.GaugeValue,
float64(devs.Used),
nodeID, devs.ID, fmt.Sprint(devs.Index),
float64(devs.Device.Used),
nodeID, devs.Device.ID, fmt.Sprint(devs.Device.Index),
)

ch <- prometheus.MustNewConstMetric(
nodeGPUCoreAllocatedDesc,
prometheus.GaugeValue,
float64(devs.Usedcores),
nodeID, devs.ID, fmt.Sprint(devs.Index),
float64(devs.Device.Usedcores),
nodeID, devs.Device.ID, fmt.Sprint(devs.Device.Index),
)
ch <- prometheus.MustNewConstMetric(
nodeGPUOverview,
prometheus.GaugeValue,
float64(devs.Usedmem)*float64(1024)*float64(1024),
nodeID, devs.ID, fmt.Sprint(devs.Index), fmt.Sprint(devs.Usedcores), fmt.Sprint(devs.Used), fmt.Sprint(devs.Totalmem), devs.Type,
float64(devs.Device.Usedmem)*float64(1024)*float64(1024),
nodeID, devs.Device.ID, fmt.Sprint(devs.Device.Index), fmt.Sprint(devs.Device.Usedcores), fmt.Sprint(devs.Device.Used), fmt.Sprint(devs.Device.Totalmem), devs.Device.Type,
)
ch <- prometheus.MustNewConstMetric(
nodeGPUMemoryPercentage,
prometheus.GaugeValue,
float64(devs.Usedmem)/float64(devs.Totalmem),
nodeID, devs.ID, fmt.Sprint(devs.Index),
float64(devs.Device.Usedmem)/float64(devs.Device.Totalmem),
nodeID, devs.Device.ID, fmt.Sprint(devs.Device.Index),
)
}
}
@@ -181,10 +181,10 @@ func (cc ClusterManagerCollector) Collect(ch chan<- prometheus.Metric) {
var totaldev int32
found := false
for _, ni := range *nu {
for _, nodedev := range ni.Devices {
for _, nodedev := range ni.Devices.DeviceLists {
//fmt.Println("uuid=", nodedev.ID, ctrdevval.UUID)
if strings.Compare(nodedev.ID, ctrdevval.UUID) == 0 {
totaldev = nodedev.Totalmem
if strings.Compare(nodedev.Device.ID, ctrdevval.UUID) == 0 {
totaldev = nodedev.Device.Totalmem
found = true
break
}
Binary file added docs/develop/imgs/gpu-scheduler-policy-demo.png
Binary file added docs/develop/imgs/node-shceduler-policy-demo.png
Binary file added docs/develop/imgs/scheduler-policy-story.png
167 changes: 167 additions & 0 deletions docs/develop/scheduler-policy.md
@@ -0,0 +1,167 @@
# Scheduler Policy Design

## Summary

Currently, in a cluster with many GPU nodes, there is no `binpack` or `spread` placement for nodes when making scheduling decisions, nor for GPU cards when using vGPU.

## Proposal

We add `node-scheduler-policy` and `gpu-scheduler-policy` options to the scheduler config, so the scheduler can implement node-level `binpack` or `spread` as well as GPU-level `binpack` or `spread`. Users can also override the configured defaults per Pod through the `hami.io/node-scheduler-policy` and `hami.io/gpu-scheduler-policy` annotations.
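
For example, a Pod can override the configured defaults via annotations. The sketch below assumes the policy values serialize to the lowercase strings `binpack` and `spread`, and that the Pod requests the default `nvidia.com/gpu` vGPU resource:

```yaml
apiVersion: v1
kind: Pod
metadata:
  name: gpu-pod
  annotations:
    hami.io/node-scheduler-policy: "spread"   # override the node-level policy
    hami.io/gpu-scheduler-policy: "binpack"   # override the GPU-level policy
spec:
  containers:
    - name: cuda
      image: nvidia/cuda:11.6.2-base-ubuntu20.04
      resources:
        limits:
          nvidia.com/gpu: 1
```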

### User Stories

The following stories assume a GPU cluster with two nodes, shown below.

![scheduler-policy-story.png](./imgs/scheduler-policy-story.png)

#### Story 1

Node binpack: use a single node's GPU cards whenever possible, e.g.:

- cluster resources:
  - node1: 4 GPU devices
  - node2: 4 GPU devices

- request:
  - pod1: uses 1 GPU
  - pod2: uses 1 GPU

- scheduling result:
  - pod1: scheduled to node1
  - pod2: scheduled to node1

#### Story 2

Node spread: use GPU cards from different nodes as much as possible, e.g.:

- cluster resources:
  - node1: 4 GPU devices
  - node2: 4 GPU devices

- request:
  - pod1: uses 1 GPU
  - pod2: uses 1 GPU

- scheduling result:
  - pod1: scheduled to node1
  - pod2: scheduled to node2

#### Story 3

GPU binpack: share the same GPU card as much as possible, e.g.:

- cluster resources:
  - node1: 4 GPU devices: GPU1, GPU2, GPU3, GPU4

- request:
  - pod1: uses 1 GPU, with gpucore 20% and gpumem-percentage 20%
  - pod2: uses 1 GPU, with gpucore 20% and gpumem-percentage 20%

- scheduling result:
  - pod1: scheduled to node1, on device GPU1
  - pod2: scheduled to node1, on device GPU1

#### Story 4

GPU spread: use different GPU cards whenever possible, e.g.:

- cluster resources:
  - node1: 4 GPU devices: GPU1, GPU2, GPU3, GPU4

- request:
  - pod1: uses 1 GPU, with gpucore 20% and gpumem-percentage 20%
  - pod2: uses 1 GPU, with gpucore 20% and gpumem-percentage 20%

- scheduling result:
  - pod1: scheduled to node1, on device GPU1
  - pod2: scheduled to node1, on device GPU2

## Design Details

### Node-scheduler-policy

![node-shceduler-policy-demo.png](./imgs/node-shceduler-policy-demo.png)

#### Binpack

Binpack mainly considers node resource usage: the fuller a node is, the higher its score, and the highest-scoring node is selected.

```
score: ((request + used) / allocatable) * 10
```

1. Binpack scoring information for Node 1 is as follows

```
Node1 score: ((1+3)/4) * 10 = 10
```

2. Binpack scoring information for Node 2 is as follows

```
Node2 score: ((1+2)/4) * 10 = 7.5
```

So, under the `Binpack` policy, `Node1` is selected.

#### Spread

Spread also scores nodes by resource usage, using the same formula, but prefers the least-used node, i.e. the one with the lowest score.

```
score: ((request + used) / allocatable) * 10
```

1. Spread scoring information for Node 1 is as follows
```
Node1 score: ((1+3)/4) * 10 = 10
```

2. Spread scoring information for Node 2 is as follows
```
Node2 score: ((1+2)/4) * 10 = 7.5
```

So, under the `Spread` policy, `Node2` is selected.
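
A minimal Go sketch of node selection under both policies (hypothetical names; the real implementation lives in `pkg/scheduler/policy`): the score formula is shared, binpack picks the highest-scoring node, and spread picks the lowest-scoring one.

```go
package policy

// nodeScore implements the shared formula: ((request + used) / allocatable) * 10.
func nodeScore(request, used, allocatable float64) float64 {
	if allocatable == 0 {
		return 0
	}
	return (request + used) / allocatable * 10
}

// pickNode selects a node by policy: "binpack" prefers the fullest node
// (highest score), "spread" the emptiest (lowest score).
func pickNode(policyName string, scores map[string]float64) string {
	var best string
	var bestScore float64
	first := true
	for node, score := range scores {
		better := score > bestScore // binpack: fuller is better
		if policyName == "spread" {
			better = score < bestScore // spread: emptier is better
		}
		if first || better {
			best, bestScore, first = node, score, false
		}
	}
	return best
}
```

With the scores above, `pickNode` returns `Node1` (score 10) under binpack and `Node2` (score 7.5) under spread.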

### GPU-scheduler-policy

![gpu-scheduler-policy-demo.png](./imgs/gpu-scheduler-policy-demo.png)

#### Binpack

Binpack mainly considers each card's compute and memory usage: the more a card is used, the higher its score, and the highest-scoring card is selected.
```
score: ((request.core + used.core) / allocatable.core + (request.mem + used.mem) / allocatable.mem) * 10
```

1. Binpack scoring information for GPU 1 is as follows
```
GPU1 Score: ((20+10)/100 + (1000+2000)/8000) * 10 = 6.75
```

2. Binpack scoring information for GPU 2 is as follows
```
GPU2 Score: ((20+70)/100 + (1000+6000)/8000) * 10 = 17.75
```

So, under the `Binpack` policy, `GPU2` is selected.

#### Spread

Spread also scores each card's compute and memory usage with the same formula, but prefers the least-used card, i.e. the one with the lowest score.
```
score: ((request.core + used.core) / allocatable.core + (request.mem + used.mem) / allocatable.mem) * 10
```

1. Spread scoring information for GPU 1 is as follows
```
GPU1 Score: ((20+10)/100 + (1000+2000)/8000) * 10 = 6.75
```

2. Spread scoring information for GPU 2 is as follows
```
GPU2 Score: ((20+70)/100 + (1000+6000)/8000) * 10 = 17.75
```

So, under the `Spread` policy, `GPU1` is selected.
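
The same selection rule applies per card. Below is a minimal sketch of the combined core-and-memory score; the `DeviceUsage` fields here are simplified stand-ins for the scheduler's actual bookkeeping.

```go
package policy

// DeviceUsage is a simplified stand-in for the scheduler's per-GPU bookkeeping.
type DeviceUsage struct {
	Usedcores int32 // core percentage already allocated
	Totalcore int32 // core percentage capacity (100 per card)
	Usedmem   int32 // memory already allocated, in MiB
	Totalmem  int32 // memory capacity, in MiB
}

// gpuScore implements the shared formula:
// ((request.core + used.core)/allocatable.core + (request.mem + used.mem)/allocatable.mem) * 10
// Binpack selects the highest-scoring card, spread the lowest-scoring one.
func gpuScore(reqCores, reqMem int32, d DeviceUsage) float64 {
	coreRatio := float64(reqCores+d.Usedcores) / float64(d.Totalcore)
	memRatio := float64(reqMem+d.Usedmem) / float64(d.Totalmem)
	return (coreRatio + memRatio) * 10
}
```

For the example above, `gpuScore(20, 1000, DeviceUsage{10, 100, 2000, 8000})` returns 6.75 (GPU1) and `gpuScore(20, 1000, DeviceUsage{70, 100, 6000, 8000})` returns 17.75 (GPU2), matching the walkthroughs.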
2 changes: 2 additions & 0 deletions go.mod
@@ -43,6 +43,7 @@ require (
github.com/cpuguy83/go-md2man/v2 v2.0.2 // indirect
github.com/davecgh/go-spew v1.1.1 // indirect
github.com/emicklei/go-restful/v3 v3.11.0 // indirect
github.com/evanphx/json-patch v5.6.0+incompatible // indirect
github.com/go-logr/logr v1.4.1 // indirect
github.com/go-openapi/jsonpointer v0.19.6 // indirect
github.com/go-openapi/jsonreference v0.20.2 // indirect
@@ -66,6 +67,7 @@ require (
github.com/onsi/gomega v1.32.0 // indirect
github.com/opencontainers/runc v1.1.7 // indirect
github.com/opencontainers/runtime-tools v0.9.1-0.20221107090550-2e043c6bd626 // indirect
github.com/pkg/errors v0.9.1 // indirect
github.com/pmezard/go-difflib v1.0.0 // indirect
github.com/prometheus/client_model v0.4.0 // indirect
github.com/prometheus/common v0.44.0 // indirect
7 changes: 7 additions & 0 deletions pkg/scheduler/config/config.go
@@ -16,11 +16,18 @@ limitations under the License.

package config

import "github.com/Project-HAMi/HAMi/pkg/scheduler/policy"

var (
HTTPBind string
SchedulerName string
DefaultMem int32
DefaultCores int32
DefaultResourceNum int32
MetricsBindAddress string

// NodeSchedulerPolicy configures the node scheduling policy (`binpack` or `spread`); the default is binpack.
NodeSchedulerPolicy = policy.NodeSchedulerPolicyBinpack.String()
// GPUSchedulerPolicy configures the GPU scheduling policy (`binpack` or `spread`); the default is spread.
GPUSchedulerPolicy = policy.GPUSchedulerPolicySpread.String()
)
6 changes: 3 additions & 3 deletions pkg/scheduler/nodes.go
@@ -21,15 +21,14 @@ import (
"strings"
"sync"

"github.com/Project-HAMi/HAMi/pkg/scheduler/policy"
"github.com/Project-HAMi/HAMi/pkg/util"

"k8s.io/klog/v2"
)

type DeviceUsageList []*util.DeviceUsage

type NodeUsage struct {
Devices DeviceUsageList
Devices policy.DeviceUsageList
}

type nodeManager struct {
@@ -84,6 +83,7 @@ func (m *nodeManager) rmNodeDevice(nodeID string, nodeInfo *util.NodeInfo) {
m.nodes[nodeID].Devices = tmp
if len(m.nodes[nodeID].Devices) == 0 {
delete(m.nodes, nodeID)
return
}
klog.Infoln("Rm Devices res:", m.nodes[nodeID].Devices)
}
(Diff truncated: the remaining changed files are not shown.)
