Skip to content

Commit

Permalink
support config gpu memory factor
Browse files Browse the repository at this point in the history
Signed-off-by: peiniliu <peini.liu@gmail.com>
  • Loading branch information
peiniliu committed Aug 15, 2022
1 parent 820bcae commit b9600c5
Show file tree
Hide file tree
Showing 8 changed files with 29 additions and 7 deletions.
13 changes: 12 additions & 1 deletion doc/config.md
Original file line number Diff line number Diff line change
Expand Up @@ -7,11 +7,13 @@ The volcano device plugin has a number of options that can be configured. These
| Flag | Envvar | Default Value |
|--------------------------|-------------------------|-----------------|
| `--gpu-strategy` | `$GPU_STRATEGY` | `"share"` |
| `--gpu-memory-factor` | `$GPU_MEMORY_FACTOR` | `1` |
| `--config-file` | `$CONFIG_FILE` | `""` |

When deploying volcano-device-plugin.yml, users can specify these parameters by adding args to the container 'volcano-device-plugin'.
For example:
- args: ["--gpu-strategy=number"] makes the device plugin use the gpu-number strategy
- args: ["--gpu-strategy=share","--gpu-memory-factor=10"] makes the device plugin use the gpu-share strategy with a memory factor of 10, i.e. each virtual device block represents 10MB of GPU memory

### As a configuration file
```
Expand All @@ -21,7 +23,7 @@ flags:
```

### Configuration Option Details
**`GPU_STRATEGY`**:
**`GPU_STRATEGY`(string)**:
the desired strategy for exposing GPU devices

`[number | share ] (default 'share')`
Expand All @@ -30,6 +32,15 @@ flags:
on GPU devices in numbers or sharing mode. More information on what
these strategies are and how to use them in Volcano can be found in the Volcano scheduler.

**`GPU_MEMORY_FACTOR`(uint)**:
the desired memory factor for exposing GPU shared memory virtual devices

`(default 1)`

The `GPU_MEMORY_FACTOR` option configures the size, in MB, of each GPU shared-memory
virtual device block that the daemonset exposes. By default each block is 1MB,
but users with large GPU memory can specify a larger factor such as 10 (10MB blocks) or 100 (100MB blocks).

**`CONFIG_FILE`**:
point the plugin at a configuration file instead of relying on command line
flags or environment variables
Expand Down
6 changes: 6 additions & 0 deletions main.go
Original file line number Diff line number Diff line change
Expand Up @@ -80,6 +80,12 @@ func main() {
Usage: "the default strategy is using shared GPU devices while using 'number' meaning using GPUs individually. [number| share]",
EnvVars: []string{"GPU_STRATEGY"},
},
&cli.UintFlag{
Name: "gpu-memory-factor",
Value: 1,
Usage: "the default gpu memory block size is 1MB",
EnvVars: []string{"GPU_MEMORY_FACTOR"},
},
&cli.StringFlag{
Name: "config-file",
Usage: "the path to a config file as an alternative to command line options or environment variables",
Expand Down
1 change: 1 addition & 0 deletions pkg/apis/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,7 @@ func NewConfig(c *cli.Context, flags []cli.Flag) (*Config, error) {
}

log.Println(c.String("gpu-strategy"))
log.Println(c.Uint("gpu-memory-factor"))

configFile := c.String("config-file")
if configFile != "" {
Expand Down
6 changes: 4 additions & 2 deletions pkg/apis/flags.go
Original file line number Diff line number Diff line change
Expand Up @@ -27,11 +27,13 @@ type Flags struct {

// CommandLineFlags holds the list of command line flags used to configure the device plugin and GFD.
type CommandLineFlags struct {
GPUStrategy string `json:"GPUStrategy" yaml:"GPUStrategy"`
GPUStrategy string `json:"GPUStrategy" yaml:"GPUStrategy"`
GPUMemoryFactor uint `json:"GPUMemoryFactor" yaml:"GPUMemoryFactor"`
}

func NewCommandLineFlags(c *cli.Context) *CommandLineFlags {
return &CommandLineFlags{
GPUStrategy: c.String("gpu-strategy"),
GPUStrategy: c.String("gpu-strategy"),
GPUMemoryFactor: c.Uint("gpu-memory-factor"),
}
}
4 changes: 2 additions & 2 deletions pkg/plugin/nvidia/server.go
Original file line number Diff line number Diff line change
Expand Up @@ -96,7 +96,7 @@ func (m *NvidiaDevicePlugin) initialize() {
m.health = make(chan *Device)
m.stop = make(chan struct{})

m.virtualDevices, m.devicesByIndex = GetDevices()
m.virtualDevices, m.devicesByIndex = GetDevices(m.config.Flags.GPUMemoryFactor)
}

func (m *NvidiaDevicePlugin) cleanup() {
Expand Down Expand Up @@ -389,7 +389,7 @@ Allocate:
response := pluginapi.ContainerAllocateResponse{
Envs: map[string]string{
VisibleDevice: strings.Trim(strings.Replace(fmt.Sprint(ids), " ", ",", -1), "[]"),
AllocatedGPUResource: fmt.Sprintf("%d", reqGPU),
AllocatedGPUResource: fmt.Sprintf("%d", reqGPU*int(m.config.Flags.GPUMemoryFactor)),
TotalGPUMemory: fmt.Sprintf("%d", gpuMemory),
},
}
Expand Down
4 changes: 2 additions & 2 deletions pkg/plugin/nvidia/utils.go
Original file line number Diff line number Diff line change
Expand Up @@ -64,7 +64,7 @@ func GetGPUMemory() uint {
}

// GetDevices returns virtual devices and all physical devices by index.
func GetDevices() ([]*pluginapi.Device, map[uint]string) {
func GetDevices(gpuMemoryFactor uint) ([]*pluginapi.Device, map[uint]string) {
n, err := nvml.GetDeviceCount()
check(err)

Expand All @@ -81,7 +81,7 @@ func GetDevices() ([]*pluginapi.Device, map[uint]string) {
if GetGPUMemory() == uint(0) {
SetGPUMemory(uint(*d.Memory))
}
for j := uint(0); j < GetGPUMemory(); j++ {
for j := uint(0); j < GetGPUMemory()/gpuMemoryFactor; j++ {
fakeID := GenerateVirtualDeviceID(id, j)
virtualDevs = append(virtualDevs, &pluginapi.Device{
ID: fakeID,
Expand Down
1 change: 1 addition & 0 deletions volcano-device-plugin-GKE.yml
Original file line number Diff line number Diff line change
Expand Up @@ -92,6 +92,7 @@ spec:
- image: volcanosh/volcano-device-plugin:latest
name: volcano-device-plugin
#args: ["--gpu-strategy=number"]
#args: ["--gpu-strategy=share", "--gpu-memory-factor=1"]
env:
- name: NODE_NAME
valueFrom:
Expand Down
1 change: 1 addition & 0 deletions volcano-device-plugin.yml
Original file line number Diff line number Diff line change
Expand Up @@ -84,6 +84,7 @@ spec:
containers:
- image: volcanosh/volcano-device-plugin:latest
#args: ["--gpu-strategy=number"]
#args: ["--gpu-strategy=share", "--gpu-memory-factor=1"]
name: volcano-device-plugin
env:
- name: NODE_NAME
Expand Down

0 comments on commit b9600c5

Please sign in to comment.