Merge branch 'master' of github.com:Project-HAMi/HAMi
archlitchi committed Apr 25, 2024
2 parents 4dd84d3 + c30db33 commit e892934
Showing 22 changed files with 1,950 additions and 192 deletions.
3 changes: 3 additions & 0 deletions charts/hami/templates/scheduler/deployment.yaml
@@ -47,6 +47,9 @@ spec:
- {{ . }}
{{- end }}
{{- end }}
- --leader-elect={{ .Values.scheduler.leaderElect }}
- --leader-elect-resource-name={{ .Values.schedulerName }}
- --leader-elect-resource-namespace={{ .Release.Namespace }}
volumeMounts:
- name: scheduler-config
mountPath: /config
4 changes: 2 additions & 2 deletions charts/hami/values.yaml
@@ -2,7 +2,7 @@

nameOverride: ""
fullnameOverride: ""
imagePullSecrets: []
imagePullSecrets: [ ]
version: "v2.3.9"

#Nvidia GPU Parameters
@@ -46,6 +46,7 @@ scheduler:
defaultCores: 0
defaultGPUNum: 1
metricsBindAddress: ":9395"
leaderElect: true
kubeScheduler:
# @param enabled indicate whether to run kube-scheduler container in the scheduler pod, it's true by default.
enabled: true
@@ -57,7 +58,6 @@ scheduler:
- -v=4
extraArgs:
- --policy-config-file=/config/config.json
- --leader-elect=false
- -v=4
extender:
image: "projecthami/hami"
1 change: 1 addition & 0 deletions cmd/device-plugin/nvidia/main.go
@@ -154,6 +154,7 @@ func loadConfig(c *cli.Context, flags []cli.Flag) (*spec.Config, error) {

func start(c *cli.Context, flags []cli.Flag) error {
klog.Info("Starting FS watcher.")
util.NodeName = os.Getenv(util.NodeNameEnvName)
watcher, err := newFSWatcher(kubeletdevicepluginv1beta1.DevicePluginPath)
if err != nil {
return fmt.Errorf("failed to create FS watcher: %v", err)
1 change: 0 additions & 1 deletion cmd/device-plugin/nvidia/vgpucfg.go
@@ -150,6 +150,5 @@ func generateDeviceConfigFromNvidia(cfg *spec.Config, c *cli.Context, flags []cl
}
}
readFromConfigFile()
util.NodeName = os.Getenv(util.NodeNameEnvName)
return devcfg, nil
}
3 changes: 3 additions & 0 deletions cmd/scheduler/main.go
@@ -22,6 +22,7 @@ import (
"github.com/Project-HAMi/HAMi/pkg/device"
"github.com/Project-HAMi/HAMi/pkg/scheduler"
"github.com/Project-HAMi/HAMi/pkg/scheduler/config"
"github.com/Project-HAMi/HAMi/pkg/scheduler/policy"
"github.com/Project-HAMi/HAMi/pkg/scheduler/routes"
"github.com/Project-HAMi/HAMi/pkg/util"
"github.com/Project-HAMi/HAMi/pkg/version"
@@ -57,6 +58,8 @@ func init() {
rootCmd.Flags().Int32Var(&config.DefaultMem, "default-mem", 0, "default gpu device memory to allocate")
rootCmd.Flags().Int32Var(&config.DefaultCores, "default-cores", 0, "default gpu core percentage to allocate")
rootCmd.Flags().Int32Var(&config.DefaultResourceNum, "default-gpu", 1, "default gpu to allocate")
rootCmd.Flags().StringVar(&config.NodeSchedulerPolicy, "node-scheduler-policy", policy.NodeSchedulerPolicyBinpack.String(), "node scheduler policy")
rootCmd.Flags().StringVar(&config.GPUSchedulerPolicy, "gpu-scheduler-policy", policy.GPUSchedulerPolicySpread.String(), "GPU scheduler policy")
rootCmd.Flags().StringVar(&config.MetricsBindAddress, "metrics-bind-address", ":9395", "The TCP address that the scheduler should bind to for serving prometheus metrics(e.g. 127.0.0.1:9395, :9395)")
rootCmd.PersistentFlags().AddGoFlagSet(device.GlobalFlagSet())
rootCmd.AddCommand(version.VersionCmd)
36 changes: 18 additions & 18 deletions cmd/scheduler/metrics.go
@@ -101,49 +101,49 @@ func (cc ClusterManagerCollector) Collect(ch chan<- prometheus.Metric) {
)
nu := sher.InspectAllNodesUsage()
for nodeID, val := range *nu {
for _, devs := range val.Devices {
for _, devs := range val.Devices.DeviceLists {
ch <- prometheus.MustNewConstMetric(
nodevGPUMemoryLimitDesc,
prometheus.GaugeValue,
float64(devs.Totalmem)*float64(1024)*float64(1024),
nodeID, devs.ID, fmt.Sprint(devs.Index),
float64(devs.Device.Totalmem)*float64(1024)*float64(1024),
nodeID, devs.Device.ID, fmt.Sprint(devs.Device.Index),
)
ch <- prometheus.MustNewConstMetric(
nodevGPUCoreLimitDesc,
prometheus.GaugeValue,
float64(devs.Totalcore),
nodeID, devs.ID, fmt.Sprint(devs.Index),
float64(devs.Device.Totalcore),
nodeID, devs.Device.ID, fmt.Sprint(devs.Device.Index),
)
ch <- prometheus.MustNewConstMetric(
nodevGPUMemoryAllocatedDesc,
prometheus.GaugeValue,
float64(devs.Usedmem)*float64(1024)*float64(1024),
nodeID, devs.ID, fmt.Sprint(devs.Index), fmt.Sprint(devs.Usedcores),
float64(devs.Device.Usedmem)*float64(1024)*float64(1024),
nodeID, devs.Device.ID, fmt.Sprint(devs.Device.Index), fmt.Sprint(devs.Device.Usedcores),
)
ch <- prometheus.MustNewConstMetric(
nodevGPUSharedNumDesc,
prometheus.GaugeValue,
float64(devs.Used),
nodeID, devs.ID, fmt.Sprint(devs.Index),
float64(devs.Device.Used),
nodeID, devs.Device.ID, fmt.Sprint(devs.Device.Index),
)

ch <- prometheus.MustNewConstMetric(
nodeGPUCoreAllocatedDesc,
prometheus.GaugeValue,
float64(devs.Usedcores),
nodeID, devs.ID, fmt.Sprint(devs.Index),
float64(devs.Device.Usedcores),
nodeID, devs.Device.ID, fmt.Sprint(devs.Device.Index),
)
ch <- prometheus.MustNewConstMetric(
nodeGPUOverview,
prometheus.GaugeValue,
float64(devs.Usedmem)*float64(1024)*float64(1024),
nodeID, devs.ID, fmt.Sprint(devs.Index), fmt.Sprint(devs.Usedcores), fmt.Sprint(devs.Used), fmt.Sprint(devs.Totalmem), devs.Type,
float64(devs.Device.Usedmem)*float64(1024)*float64(1024),
nodeID, devs.Device.ID, fmt.Sprint(devs.Device.Index), fmt.Sprint(devs.Device.Usedcores), fmt.Sprint(devs.Device.Used), fmt.Sprint(devs.Device.Totalmem), devs.Device.Type,
)
ch <- prometheus.MustNewConstMetric(
nodeGPUMemoryPercentage,
prometheus.GaugeValue,
float64(devs.Usedmem)/float64(devs.Totalmem),
nodeID, devs.ID, fmt.Sprint(devs.Index),
float64(devs.Device.Usedmem)/float64(devs.Device.Totalmem),
nodeID, devs.Device.ID, fmt.Sprint(devs.Device.Index),
)
}
}
@@ -181,10 +181,10 @@ func (cc ClusterManagerCollector) Collect(ch chan<- prometheus.Metric) {
var totaldev int32
found := false
for _, ni := range *nu {
for _, nodedev := range ni.Devices {
for _, nodedev := range ni.Devices.DeviceLists {
//fmt.Println("uuid=", nodedev.ID, ctrdevval.UUID)
if strings.Compare(nodedev.ID, ctrdevval.UUID) == 0 {
totaldev = nodedev.Totalmem
if strings.Compare(nodedev.Device.ID, ctrdevval.UUID) == 0 {
totaldev = nodedev.Device.Totalmem
found = true
break
}
Binary file added docs/develop/imgs/gpu-scheduler-policy-demo.png
Binary file added docs/develop/imgs/node-shceduler-policy-demo.png
Binary file added docs/develop/imgs/scheduler-policy-story.png
167 changes: 167 additions & 0 deletions docs/develop/scheduler-policy.md
@@ -0,0 +1,167 @@
# Scheduler Policy Design

## Summary

Currently, in a cluster with many GPU nodes, there is no `binpack` or `spread` placement for nodes when making scheduling decisions, nor for GPU cards when using vGPU.

## Proposal

We add `node-scheduler-policy` and `gpu-scheduler-policy` options to the scheduler config, so the scheduler can implement node-level `binpack` or `spread` as well as GPU-level `binpack` or `spread`. Users can also override the configured defaults per Pod through the `hami.io/node-scheduler-policy` and `hami.io/gpu-scheduler-policy` annotations.
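
For example, a Pod can override the configured defaults via annotations. The sketch below assumes the policy values serialize to the lowercase strings `binpack` and `spread`, and that the Pod requests the default `nvidia.com/gpu` vGPU resource:

```yaml
apiVersion: v1
kind: Pod
metadata:
  name: gpu-pod
  annotations:
    hami.io/node-scheduler-policy: "spread"   # override the node-level policy
    hami.io/gpu-scheduler-policy: "binpack"   # override the GPU-level policy
spec:
  containers:
    - name: cuda
      image: nvidia/cuda:11.6.2-base-ubuntu20.04
      resources:
        limits:
          nvidia.com/gpu: 1
```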

### User Stories

The following stories assume a GPU cluster with two nodes, shown below.

![scheduler-policy-story.png](./imgs/scheduler-policy-story.png)

#### Story 1

Node binpack: use a single node's GPU cards whenever possible, e.g.:

- cluster resources:
  - node1: 4 GPU devices
  - node2: 4 GPU devices

- request:
  - pod1: uses 1 GPU
  - pod2: uses 1 GPU

- scheduling result:
  - pod1: scheduled to node1
  - pod2: scheduled to node1

#### Story 2

Node spread: use GPU cards from different nodes as much as possible, e.g.:

- cluster resources:
  - node1: 4 GPU devices
  - node2: 4 GPU devices

- request:
  - pod1: uses 1 GPU
  - pod2: uses 1 GPU

- scheduling result:
  - pod1: scheduled to node1
  - pod2: scheduled to node2

#### Story 3

GPU binpack: share the same GPU card as much as possible, e.g.:

- cluster resources:
  - node1: 4 GPU devices: GPU1, GPU2, GPU3, GPU4

- request:
  - pod1: uses 1 GPU, with gpucore 20% and gpumem-percentage 20%
  - pod2: uses 1 GPU, with gpucore 20% and gpumem-percentage 20%

- scheduling result:
  - pod1: scheduled to node1, on device GPU1
  - pod2: scheduled to node1, on device GPU1

#### Story 4

GPU spread: use different GPU cards whenever possible, e.g.:

- cluster resources:
  - node1: 4 GPU devices: GPU1, GPU2, GPU3, GPU4

- request:
  - pod1: uses 1 GPU, with gpucore 20% and gpumem-percentage 20%
  - pod2: uses 1 GPU, with gpucore 20% and gpumem-percentage 20%

- scheduling result:
  - pod1: scheduled to node1, on device GPU1
  - pod2: scheduled to node1, on device GPU2

## Design Details

### Node-scheduler-policy

![node-shceduler-policy-demo.png](./imgs/node-shceduler-policy-demo.png)

#### Binpack

Binpack mainly considers node resource usage: the fuller a node is, the higher its score, and the highest-scoring node is selected.

```
score: ((request + used) / allocatable) * 10
```

1. Binpack scoring information for Node 1 is as follows

```
Node1 score: ((1+3)/4) * 10 = 10
```

2. Binpack scoring information for Node 2 is as follows

```
Node2 score: ((1+2)/4) * 10 = 7.5
```

So, under the `Binpack` policy, `Node1` is selected.

#### Spread

Spread also scores nodes by resource usage, using the same formula, but prefers the least-used node, i.e. the one with the lowest score.

```
score: ((request + used) / allocatable) * 10
```

1. Spread scoring information for Node 1 is as follows
```
Node1 score: ((1+3)/4) * 10 = 10
```

2. Spread scoring information for Node 2 is as follows
```
Node2 score: ((1+2)/4) * 10 = 7.5
```

So, under the `Spread` policy, `Node2` is selected.
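
A minimal Go sketch of node selection under both policies (hypothetical names; the real implementation lives in `pkg/scheduler/policy`): the score formula is shared, binpack picks the highest-scoring node, and spread picks the lowest-scoring one.

```go
package policy

// nodeScore implements the shared formula: ((request + used) / allocatable) * 10.
func nodeScore(request, used, allocatable float64) float64 {
	if allocatable == 0 {
		return 0
	}
	return (request + used) / allocatable * 10
}

// pickNode selects a node by policy: "binpack" prefers the fullest node
// (highest score), "spread" the emptiest (lowest score).
func pickNode(policyName string, scores map[string]float64) string {
	var best string
	var bestScore float64
	first := true
	for node, score := range scores {
		better := score > bestScore // binpack: fuller is better
		if policyName == "spread" {
			better = score < bestScore // spread: emptier is better
		}
		if first || better {
			best, bestScore, first = node, score, false
		}
	}
	return best
}
```

With the scores above, `pickNode` returns `Node1` (score 10) under binpack and `Node2` (score 7.5) under spread.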

### GPU-scheduler-policy

![gpu-scheduler-policy-demo.png](./imgs/gpu-scheduler-policy-demo.png)

#### Binpack

Binpack mainly considers each card's compute and memory usage: the more a card is used, the higher its score, and the highest-scoring card is selected.
```
score: ((request.core + used.core) / allocatable.core + (request.mem + used.mem) / allocatable.mem) * 10
```

1. Binpack scoring information for GPU 1 is as follows
```
GPU1 Score: ((20+10)/100 + (1000+2000)/8000) * 10 = 6.75
```

2. Binpack scoring information for GPU 2 is as follows
```
GPU2 Score: ((20+70)/100 + (1000+6000)/8000) * 10 = 17.75
```

So, under the `Binpack` policy, `GPU2` is selected.

#### Spread

Spread also scores each card's compute and memory usage with the same formula, but prefers the least-used card, i.e. the one with the lowest score.
```
score: ((request.core + used.core) / allocatable.core + (request.mem + used.mem) / allocatable.mem) * 10
```

1. Spread scoring information for GPU 1 is as follows
```
GPU1 Score: ((20+10)/100 + (1000+2000)/8000) * 10 = 6.75
```

2. Spread scoring information for GPU 2 is as follows
```
GPU2 Score: ((20+70)/100 + (1000+6000)/8000) * 10 = 17.75
```

So, under the `Spread` policy, `GPU1` is selected.
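
The same selection rule applies per card. Below is a minimal sketch of the combined core-and-memory score; the `DeviceUsage` fields here are simplified stand-ins for the scheduler's actual bookkeeping.

```go
package policy

// DeviceUsage is a simplified stand-in for the scheduler's per-GPU bookkeeping.
type DeviceUsage struct {
	Usedcores int32 // core percentage already allocated
	Totalcore int32 // core percentage capacity (100 per card)
	Usedmem   int32 // memory already allocated, in MiB
	Totalmem  int32 // memory capacity, in MiB
}

// gpuScore implements the shared formula:
// ((request.core + used.core)/allocatable.core + (request.mem + used.mem)/allocatable.mem) * 10
// Binpack selects the highest-scoring card, spread the lowest-scoring one.
func gpuScore(reqCores, reqMem int32, d DeviceUsage) float64 {
	coreRatio := float64(reqCores+d.Usedcores) / float64(d.Totalcore)
	memRatio := float64(reqMem+d.Usedmem) / float64(d.Totalmem)
	return (coreRatio + memRatio) * 10
}
```

For the example above, `gpuScore(20, 1000, DeviceUsage{10, 100, 2000, 8000})` returns 6.75 (GPU1) and `gpuScore(20, 1000, DeviceUsage{70, 100, 6000, 8000})` returns 17.75 (GPU2), matching the walkthroughs.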
2 changes: 2 additions & 0 deletions go.mod
@@ -43,6 +43,7 @@ require (
github.com/cpuguy83/go-md2man/v2 v2.0.2 // indirect
github.com/davecgh/go-spew v1.1.1 // indirect
github.com/emicklei/go-restful/v3 v3.11.0 // indirect
github.com/evanphx/json-patch v5.6.0+incompatible // indirect
github.com/go-logr/logr v1.4.1 // indirect
github.com/go-openapi/jsonpointer v0.19.6 // indirect
github.com/go-openapi/jsonreference v0.20.2 // indirect
@@ -66,6 +67,7 @@ require (
github.com/onsi/gomega v1.32.0 // indirect
github.com/opencontainers/runc v1.1.7 // indirect
github.com/opencontainers/runtime-tools v0.9.1-0.20221107090550-2e043c6bd626 // indirect
github.com/pkg/errors v0.9.1 // indirect
github.com/pmezard/go-difflib v1.0.0 // indirect
github.com/prometheus/client_model v0.4.0 // indirect
github.com/prometheus/common v0.44.0 // indirect
7 changes: 7 additions & 0 deletions pkg/scheduler/config/config.go
@@ -16,11 +16,18 @@ limitations under the License.

package config

import "github.com/Project-HAMi/HAMi/pkg/scheduler/policy"

var (
HTTPBind string
SchedulerName string
DefaultMem int32
DefaultCores int32
DefaultResourceNum int32
MetricsBindAddress string

// NodeSchedulerPolicy configures the node scheduling policy (`binpack` or `spread`); the default is binpack.
NodeSchedulerPolicy = policy.NodeSchedulerPolicyBinpack.String()
// GPUSchedulerPolicy configures the GPU scheduling policy (`binpack` or `spread`); the default is spread.
GPUSchedulerPolicy = policy.GPUSchedulerPolicySpread.String()
)
6 changes: 3 additions & 3 deletions pkg/scheduler/nodes.go
@@ -21,15 +21,14 @@ import (
"strings"
"sync"

"github.com/Project-HAMi/HAMi/pkg/scheduler/policy"
"github.com/Project-HAMi/HAMi/pkg/util"

"k8s.io/klog/v2"
)

type DeviceUsageList []*util.DeviceUsage

type NodeUsage struct {
Devices DeviceUsageList
Devices policy.DeviceUsageList
}

type nodeManager struct {
@@ -84,6 +83,7 @@ func (m *nodeManager) rmNodeDevice(nodeID string, nodeInfo *util.NodeInfo) {
m.nodes[nodeID].Devices = tmp
if len(m.nodes[nodeID].Devices) == 0 {
delete(m.nodes, nodeID)
return
}
klog.Infoln("Rm Devices res:", m.nodes[nodeID].Devices)
}
(Diff truncated: the remaining changed files are not shown.)
