Skip to content

Commit

Permalink
Merge branch 'main' into add-selinux-context
Browse files Browse the repository at this point in the history
  • Loading branch information
elezar committed Jun 27, 2024
2 parents 5adf3f1 + 35ad180 commit b9160fe
Show file tree
Hide file tree
Showing 14 changed files with 240 additions and 15 deletions.
31 changes: 16 additions & 15 deletions docs/gpu-feature-discovery/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -194,21 +194,22 @@ Environment variables override the command line options if they conflict.
This is the list of the labels generated by NVIDIA GPU Feature Discovery and
their meaning:

| Label Name | Value Type | Meaning | Example |
| -------------------------------| ---------- | -------------------------------------------- | -------------- |
| nvidia.com/cuda.driver.major | Integer | Major of the version of NVIDIA driver | 418 |
| nvidia.com/cuda.driver.minor | Integer | Minor of the version of NVIDIA driver | 30 |
| nvidia.com/cuda.driver.rev | Integer | Revision of the version of NVIDIA driver | 40 |
| nvidia.com/cuda.runtime.major | Integer | Major of the version of CUDA | 10 |
| nvidia.com/cuda.runtime.minor | Integer | Minor of the version of CUDA | 1 |
| nvidia.com/gfd.timestamp | Integer | Timestamp of the generated labels (optional) | 1555019244 |
| nvidia.com/gpu.compute.major | Integer | Major of the compute capabilities | 3 |
| nvidia.com/gpu.compute.minor | Integer | Minor of the compute capabilities | 3 |
| nvidia.com/gpu.count | Integer | Number of GPUs | 2 |
| nvidia.com/gpu.family | String | Architecture family of the GPU | kepler |
| nvidia.com/gpu.machine | String | Machine type | DGX-1 |
| nvidia.com/gpu.memory | Integer | Memory of the GPU in Mb | 2048 |
| nvidia.com/gpu.product | String | Model of the GPU | GeForce-GT-710 |
| Label Name | Value Type | Meaning | Example |
| -------------------------------| ---------- |----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| -------------- |
| nvidia.com/cuda.driver.major | Integer | Major of the version of NVIDIA driver | 418 |
| nvidia.com/cuda.driver.minor | Integer | Minor of the version of NVIDIA driver | 30 |
| nvidia.com/cuda.driver.rev | Integer | Revision of the version of NVIDIA driver | 40 |
| nvidia.com/cuda.runtime.major | Integer | Major of the version of CUDA | 10 |
| nvidia.com/cuda.runtime.minor | Integer | Minor of the version of CUDA | 1 |
| nvidia.com/gfd.timestamp | Integer | Timestamp of the generated labels (optional) | 1555019244 |
| nvidia.com/gpu.compute.major | Integer | Major of the compute capabilities | 3 |
| nvidia.com/gpu.compute.minor | Integer | Minor of the compute capabilities | 3 |
| nvidia.com/gpu.count | Integer | Number of GPUs | 2 |
| nvidia.com/gpu.family | String | Architecture family of the GPU | kepler |
| nvidia.com/gpu.machine | String | Machine type | DGX-1 |
| nvidia.com/gpu.memory | Integer | Memory of the GPU in MiB | 2048 |
| nvidia.com/gpu.product | String | Model of the GPU | GeForce-GT-710 |
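| nvidia.com/gpu.mode | String | Mode of the GPUs on the node: `graphics`, `compute`, or `unknown` (the mode could not be determined, or the GPUs are not all in the same mode). Details of the GPU modes can be found in the [gpumodeswitch user guide](https://docs.nvidia.com/grid/13.0/grid-gpumodeswitch-user-guide/index.html#compute-and-graphics-mode) | compute |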

Depending on the MIG strategy used, the following set of labels may also be
available (or override the default values for some of the labels listed above):
Expand Down
61 changes: 61 additions & 0 deletions internal/lm/nvml.go
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,10 @@ import (
"strconv"
"strings"

"k8s.io/klog/v2"

"github.com/NVIDIA/go-nvlib/pkg/nvpci"

spec "github.com/NVIDIA/k8s-device-plugin/api/config/v1"
"github.com/NVIDIA/k8s-device-plugin/internal/resource"
)
Expand Down Expand Up @@ -71,12 +75,18 @@ func NewDeviceLabeler(manager resource.Manager, config *spec.Config) (Labeler, e
return nil, fmt.Errorf("error creating resource labeler: %v", err)
}

gpuModeLabeler, err := newGPUModeLabeler(devices)
if err != nil {
return nil, fmt.Errorf("error creating resource labeler: %v", err)
}

l := Merge(
machineTypeLabeler,
versionLabeler,
migCapabilityLabeler,
sharingLabeler,
resourceLabeler,
gpuModeLabeler,
)

return l, nil
Expand Down Expand Up @@ -193,3 +203,54 @@ func isMPSCapable(manager resource.Manager) (bool, error) {
}
return true, nil
}

// newGPUModeLabeler builds a labeler that publishes a single
// "nvidia.com/gpu.mode" label describing the mode shared by the supplied
// devices. The mode is derived from the PCI classes of the devices; any error
// encountered while reading a PCI class is returned unchanged.
func newGPUModeLabeler(devices []resource.Device) (Labeler, error) {
	pciClasses, err := getDeviceClasses(devices)
	if err != nil {
		return nil, err
	}
	return Labels{"nvidia.com/gpu.mode": getModeForClasses(pciClasses)}, nil
}

// getModeForClasses maps a list of PCI classes to a GPU mode string.
// It returns "graphics" when every class is the VGA controller class,
// "compute" when every class is the 3D controller class, and "unknown" when
// the list is empty, the classes differ from one another, or the common class
// is neither of the two.
func getModeForClasses(classes []uint32) string {
	const unknownMode = "unknown"

	if len(classes) == 0 {
		return unknownMode
	}

	first := classes[0]
	for i := range classes {
		if classes[i] == first {
			continue
		}
		// A single label cannot describe a node with mixed classes.
		klog.Infof("Not all GPU devices belong to the same class %#06x ", classes)
		return unknownMode
	}

	if first == nvpci.PCIVgaControllerClass {
		return "graphics"
	}
	if first == nvpci.PCI3dControllerClass {
		return "compute"
	}
	return unknownMode
}

// getDeviceClasses returns the distinct PCI classes reported by the supplied
// devices, in the order in which each class is first encountered. Returning
// the classes in first-seen order (rather than by ranging over a map) keeps
// the result, and anything logged from it, deterministic across runs.
//
// A nil slice is returned when devices is empty. If any device fails to
// report its PCI class, that error is returned and no classes are returned.
func getDeviceClasses(devices []resource.Device) ([]uint32, error) {
	seenClasses := make(map[uint32]struct{})

	var classes []uint32
	for _, d := range devices {
		class, err := d.GetPCIClass()
		if err != nil {
			return nil, err
		}
		if _, seen := seenClasses[class]; seen {
			continue
		}
		seenClasses[class] = struct{}{}
		classes = append(classes, class)
	}
	return classes, nil
}
86 changes: 86 additions & 0 deletions internal/lm/nvml_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -204,3 +204,89 @@ func TestSharingLabeler(t *testing.T) {
})
}
}

// TestGPUModeLabeler checks the "nvidia.com/gpu.mode" label produced for
// different combinations of device PCI classes: 0x030000 (VGA controller) is
// expected to map to "graphics", 0x030200 (3D controller) to "compute", and
// anything else, a mix of classes, or no devices at all to "unknown".
func TestGPUModeLabeler(t *testing.T) {
	testCases := []struct {
		description    string
		devices        []resource.Device
		expectedError  bool
		expectedLabels map[string]string
	}{
		{
			description: "no devices",
			devices:     nil,
			expectedLabels: map[string]string{
				"nvidia.com/gpu.mode": "unknown",
			},
		},
		{
			description: "single device with graphics PCI class",
			devices: []resource.Device{
				rt.NewDeviceWithPCIClassMock(0x030000),
			},
			expectedLabels: map[string]string{
				"nvidia.com/gpu.mode": "graphics",
			},
		},
		{
			description: "single device with compute PCI class",
			devices: []resource.Device{
				rt.NewDeviceWithPCIClassMock(0x030200),
			},
			expectedLabels: map[string]string{
				"nvidia.com/gpu.mode": "compute",
			},
		},
		{
			description: "single device with switch PCI class",
			devices: []resource.Device{
				rt.NewDeviceWithPCIClassMock(0x068000),
			},
			expectedLabels: map[string]string{
				"nvidia.com/gpu.mode": "unknown",
			},
		},
		{
			description: "multiple devices with the same compute PCI class",
			devices: []resource.Device{
				rt.NewDeviceWithPCIClassMock(0x030200),
				rt.NewDeviceWithPCIClassMock(0x030200),
				rt.NewDeviceWithPCIClassMock(0x030200),
			},
			expectedLabels: map[string]string{
				"nvidia.com/gpu.mode": "compute",
			},
		},
		{
			description: "multiple devices with the same graphics PCI class",
			devices: []resource.Device{
				rt.NewDeviceWithPCIClassMock(0x030000),
				rt.NewDeviceWithPCIClassMock(0x030000),
				rt.NewDeviceWithPCIClassMock(0x030000),
			},
			expectedLabels: map[string]string{
				"nvidia.com/gpu.mode": "graphics",
			},
		},
		{
			description: "multiple devices with a mix of graphics and compute PCI classes",
			devices: []resource.Device{
				rt.NewDeviceWithPCIClassMock(0x030000),
				rt.NewDeviceWithPCIClassMock(0x030200),
				rt.NewDeviceWithPCIClassMock(0x030000),
			},
			expectedLabels: map[string]string{
				"nvidia.com/gpu.mode": "unknown",
			},
		},
	}

	for _, tc := range testCases {
		t.Run(tc.description, func(t *testing.T) {
			// Check the constructor error first: on failure the returned
			// labeler is nil and must not be used.
			gpuModeLabeler, err := newGPUModeLabeler(tc.devices)
			if tc.expectedError {
				require.Error(t, err)
				return
			}
			require.NoError(t, err)

			labels, err := gpuModeLabeler.Labels()
			require.NoError(t, err)

			require.EqualValues(t, tc.expectedLabels, labels)
		})
	}
}
4 changes: 4 additions & 0 deletions internal/resource/cuda-device.go
Original file line number Diff line number Diff line change
Expand Up @@ -96,3 +96,7 @@ func (d *cudaDevice) IsMigCapable() (bool, error) {
func (d *cudaDevice) IsMigEnabled() (bool, error) {
return false, nil
}

// GetPCIClass always reports a PCI class of 0 and a nil error; no PCI lookup
// is performed for this device type.
// NOTE(review): 0 matches neither the VGA nor the 3D controller class, so
// callers deriving a mode from it will presumably see an unrecognized class —
// confirm this is intended.
func (d *cudaDevice) GetPCIClass() (uint32, error) {
return 0, nil
}
37 changes: 37 additions & 0 deletions internal/resource/device_mock.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

13 changes: 13 additions & 0 deletions internal/resource/nvml-device.go
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ import (
"fmt"

"github.com/NVIDIA/go-nvlib/pkg/nvlib/device"
"github.com/NVIDIA/go-nvlib/pkg/nvpci"
"github.com/NVIDIA/go-nvml/pkg/nvml"
)

Expand Down Expand Up @@ -86,3 +87,15 @@ func (d nvmlDevice) GetTotalMemoryMB() (uint64, error) {
}
return info.Total / (1024 * 1024), nil
}

// GetPCIClass returns the PCI class of the GPU backing this NVML device.
//
// The PCI bus ID reported by the device is used to look the GPU up through
// nvpci, and the Class field of the resulting PCI device is returned. An error
// is returned if the bus ID cannot be read, if the lookup fails, or if the
// lookup yields no device.
func (d nvmlDevice) GetPCIClass() (uint32, error) {
	pciBusID, err := d.GetPCIBusID()
	if err != nil {
		return 0, fmt.Errorf("getting PCI bus ID: %w", err)
	}
	nvDevice, err := nvpci.New().GetGPUByPciBusID(pciBusID)
	if err != nil {
		return 0, fmt.Errorf("getting PCI device for bus ID %q: %w", pciBusID, err)
	}
	// The lookup returns a pointer; guard against a nil device being returned
	// without an error so that reading Class cannot panic.
	if nvDevice == nil {
		return 0, fmt.Errorf("no NVIDIA PCI device found for bus ID %q", pciBusID)
	}
	return nvDevice.Class, nil
}
6 changes: 6 additions & 0 deletions internal/resource/nvml-mig-device.go
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ import (
"strings"

"github.com/NVIDIA/go-nvlib/pkg/nvlib/device"
"github.com/NVIDIA/go-nvlib/pkg/nvpci"
"github.com/NVIDIA/go-nvml/pkg/nvml"
)

Expand Down Expand Up @@ -132,3 +133,8 @@ func totalMemory(attr map[string]interface{}) (uint64, error) {
return 0, fmt.Errorf("unsupported attribute type %v", t)
}
}

// GetPCIClass returns nvpci.PCI3dControllerClass with a nil error for every
// MIG device; the device itself is not queried.
func (d nvmlMigDevice) GetPCIClass() (uint32, error) {
// GPU devices that support MIG do not support switching mode between graphics and compute, so they are always in compute mode.
return nvpci.PCI3dControllerClass, nil
}
4 changes: 4 additions & 0 deletions internal/resource/sysfs-device.go
Original file line number Diff line number Diff line change
Expand Up @@ -64,3 +64,7 @@ func (d vfioDevice) IsMigEnabled() (bool, error) {
func (d vfioDevice) IsMigCapable() (bool, error) {
return false, nil
}

// GetPCIClass returns the Class field of the underlying nvidiaPCIDevice.
// The returned error is always nil.
func (d vfioDevice) GetPCIClass() (uint32, error) {
return d.nvidiaPCIDevice.Class, nil
}
8 changes: 8 additions & 0 deletions internal/resource/testing/resource-testing.go
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,14 @@ func NewDeviceMock(migEnabled bool) *DeviceMock {
IsMigEnabledFunc: func() (bool, error) { return migEnabled, nil },
IsMigCapableFunc: func() (bool, error) { return migEnabled, nil },
GetMigDevicesFunc: func() ([]resource.Device, error) { return nil, nil },
GetPCIClassFunc: func() (uint32, error) { return 0, nil },
}}
return &d
}

// NewDeviceWithPCIClassMock creates a device mock whose GetPCIClass call
// reports the supplied PCI class together with a nil error. No other mock
// functions are configured.
func NewDeviceWithPCIClassMock(pciClass uint32) *DeviceMock {
	getPCIClass := func() (uint32, error) {
		return pciClass, nil
	}
	return &DeviceMock{resource.DeviceMock{GetPCIClassFunc: getPCIClass}}
}
Expand Down
1 change: 1 addition & 0 deletions internal/resource/types.go
Original file line number Diff line number Diff line change
Expand Up @@ -39,4 +39,5 @@ type Device interface {
GetTotalMemoryMB() (uint64, error)
GetDeviceHandleFromMigDeviceHandle() (Device, error)
GetCudaComputeCapability() (int, int, error)
GetPCIClass() (uint32, error)
}
1 change: 1 addition & 0 deletions tests/expected-output-mig-mixed.txt
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ nvidia\.com\/gpu\.sharing-strategy=[none|mps|time-slicing]
nvidia\.com\/gpu\.product=[A-Za-z_-]+
nvidia\.com\/gpu\.memory=[0-9]+
nvidia\.com\/gpu\.family=[a-z]+
nvidia\.com\/gpu\.mode=[unknown|compute|graphics]
nvidia\.com\/mig\.capable=[true|false]
nvidia\.com\/gpu\.compute\.major=[0-9]+
nvidia\.com\/gpu\.compute\.minor=[0-9]+
Expand Down
1 change: 1 addition & 0 deletions tests/expected-output-mig-none.txt
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ nvidia\.com\/gpu\.sharing-strategy=[none|mps|time-slicing]
nvidia\.com\/gpu\.product=[A-Za-z_-]+
nvidia\.com\/gpu\.memory=[0-9]+
nvidia\.com\/gpu\.family=[a-z]+
nvidia\.com\/gpu\.mode=[unknown|compute|graphics]
nvidia\.com\/mig\.capable=[true|false]
nvidia\.com\/gpu\.compute\.major=[0-9]+
nvidia\.com\/gpu\.compute\.minor=[0-9]+
Expand Down
1 change: 1 addition & 0 deletions tests/expected-output-mig-single.txt
Original file line number Diff line number Diff line change
Expand Up @@ -30,4 +30,5 @@ nvidia\.com\/gpu\.engines\.jpeg=[0-9]+
nvidia\.com\/gpu\.engines\.ofa=[0-9]+
nvidia\.com\/gpu\.slices\.gi=[0-9]+
nvidia\.com\/gpu\.slices\.ci=[0-9]+
nvidia\.com\/gpu\.mode=[compute]
nvidia\.com\/mps\.capable=[true|false]
1 change: 1 addition & 0 deletions tests/expected-output.txt
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ nvidia\.com\/gpu\.sharing-strategy=[none|mps|time-slicing]
nvidia\.com\/gpu\.product=[A-Za-z_-]+
nvidia\.com\/gpu\.memory=[0-9]+
nvidia\.com\/gpu\.family=[a-z]+
nvidia\.com\/gpu\.mode=[unknown|compute|graphics]
nvidia\.com\/mig\.capable=[true|false]
nvidia\.com\/gpu\.compute\.major=[0-9]+
nvidia\.com\/gpu\.compute\.minor=[0-9]+
Expand Down

0 comments on commit b9160fe

Please sign in to comment.