diff --git a/.go-version b/.go-version index 2a0ba77..8405d1c 100644 --- a/.go-version +++ b/.go-version @@ -1 +1,2 @@ -1.22.4 +1.22.6 + diff --git a/README.md b/README.md index fa8efe6..8f6df5c 100644 --- a/README.md +++ b/README.md @@ -14,6 +14,9 @@ RPC. GPUs can be excluded from fingerprinting by setting the `ignored_gpu_ids` field (see below). Plugin sends statistics for fingerprinted devices every `stats_period` period. +The plugin detects whether the GPU has [`Multi-Instance GPU (MIG)`](https://www.nvidia.com/en-us/technologies/multi-instance-gpu/) enabled. +When enabled all instances will be fingerprinted as individual GPUs that can be addressed accordingly. + ## Config The plugin is configured in the Nomad client's diff --git a/device_test.go b/device_test.go index 749a101..ea38dae 100644 --- a/device_test.go +++ b/device_test.go @@ -6,7 +6,7 @@ package nvidia import ( "testing" - hclog "github.com/hashicorp/go-hclog" + "github.com/hashicorp/go-hclog" "github.com/hashicorp/nomad-device-nvidia/nvml" "github.com/hashicorp/nomad/plugins/device" "github.com/shoenig/test/must" diff --git a/nvml/client.go b/nvml/client.go index fb19113..ed23cc2 100644 --- a/nvml/client.go +++ b/nvml/client.go @@ -4,7 +4,9 @@ package nvml import ( + "cmp" "fmt" + "slices" ) // DeviceData represents common fields for Nvidia device @@ -95,28 +97,32 @@ func (c *nvmlClient) GetFingerprintData() (*FingerprintData, error) { */ // Assumed that this method is called with receiver retrieved from - // NewNvmlClient - // because this method handles initialization of NVML library + // NewNvmlClient because this method handles initialization of NVML library driverVersion, err := c.driver.SystemDriverVersion() if err != nil { return nil, fmt.Errorf("nvidia nvml SystemDriverVersion() error: %v\n", err) } - numDevices, err := c.driver.DeviceCount() + deviceUUIDs, err := c.driver.ListDeviceUUIDs() if err != nil { - return nil, fmt.Errorf("nvidia nvml DeviceCount() error: %v\n", err) + return nil, 
fmt.Errorf("nvidia nvml ListDeviceUUIDs() error: %v\n", err) } - allNvidiaGPUResources := make([]*FingerprintDeviceData, numDevices) + allNvidiaGPUResources := make([]*FingerprintDeviceData, 0, len(deviceUUIDs)) - for i := 0; i < int(numDevices); i++ { - deviceInfo, err := c.driver.DeviceInfoByIndex(uint(i)) + for uuid, mode := range deviceUUIDs { + // do not care about physical parents of MIGs + if mode == parent { + continue + } + + deviceInfo, err := c.driver.DeviceInfoByUUID(uuid) if err != nil { - return nil, fmt.Errorf("nvidia nvml DeviceInfoByIndex() error: %v\n", err) + return nil, fmt.Errorf("nvidia nvml DeviceInfoByUUID() error: %v\n", err) } - allNvidiaGPUResources[i] = &FingerprintDeviceData{ + allNvidiaGPUResources = append(allNvidiaGPUResources, &FingerprintDeviceData{ DeviceData: &DeviceData{ DeviceName: deviceInfo.Name, UUID: deviceInfo.UUID, @@ -130,8 +136,13 @@ func (c *nvmlClient) GetFingerprintData() (*FingerprintData, error) { DisplayState: deviceInfo.DisplayState, PersistenceMode: deviceInfo.PersistenceMode, PCIBusID: deviceInfo.PCIBusID, - } + }) + + slices.SortFunc(allNvidiaGPUResources, func(a, b *FingerprintDeviceData) int { + return cmp.Compare(a.DeviceData.UUID, b.DeviceData.UUID) + }) } + return &FingerprintData{ Devices: allNvidiaGPUResources, DriverVersion: driverVersion, @@ -156,23 +167,32 @@ func (c *nvmlClient) GetStatsData() ([]*StatsData, error) { */ // Assumed that this method is called with receiver retrieved from - // NewNvmlClient - // because this method handles initialization of NVML library + // NewNvmlClient because this method handles initialization of NVML library - numDevices, err := c.driver.DeviceCount() + deviceUUIDs, err := c.driver.ListDeviceUUIDs() if err != nil { - return nil, fmt.Errorf("nvidia nvml DeviceCount() error: %v\n", err) + return nil, fmt.Errorf("nvidia nvml ListDeviceUUIDs() error: %v\n", err) } - allNvidiaGPUStats := make([]*StatsData, numDevices) + allNvidiaGPUStats := make([]*StatsData, 0, 
len(deviceUUIDs)) - for i := 0; i < int(numDevices); i++ { - deviceInfo, deviceStatus, err := c.driver.DeviceInfoAndStatusByIndex(uint(i)) + for uuid, mode := range deviceUUIDs { + + // A30/A100 MIG devices have no stats. + // + // https://docs.nvidia.com/datacenter/tesla/mig-user-guide/#telemetry + // + // Is this fixed on H100 or later? Maybe? + if mode == mig || mode == parent { + continue + } + + deviceInfo, deviceStatus, err := c.driver.DeviceInfoAndStatusByUUID(uuid) if err != nil { - return nil, fmt.Errorf("nvidia nvml DeviceInfoAndStatusByIndex() error: %v\n", err) + return nil, fmt.Errorf("nvidia nvml DeviceInfoAndStatusByUUID() error: %v\n", err) } - allNvidiaGPUStats[i] = &StatsData{ + allNvidiaGPUStats = append(allNvidiaGPUStats, &StatsData{ DeviceData: &DeviceData{ DeviceName: deviceInfo.Name, UUID: deviceInfo.UUID, @@ -191,7 +211,11 @@ func (c *nvmlClient) GetStatsData() ([]*StatsData, error) { ECCErrorsL1Cache: deviceStatus.ECCErrorsL1Cache, ECCErrorsL2Cache: deviceStatus.ECCErrorsL2Cache, ECCErrorsDevice: deviceStatus.ECCErrorsDevice, - } + }) + + slices.SortFunc(allNvidiaGPUStats, func(a, b *StatsData) int { + return cmp.Compare(a.DeviceData.UUID, b.DeviceData.UUID) + }) } return allNvidiaGPUStats, nil } diff --git a/nvml/client_test.go b/nvml/client_test.go index 6958332..57417d5 100644 --- a/nvml/client_test.go +++ b/nvml/client_test.go @@ -11,14 +11,17 @@ import ( "github.com/shoenig/test/must" ) +var _ NvmlDriver = (*MockNVMLDriver)(nil) + type MockNVMLDriver struct { - systemDriverCallSuccessful bool - deviceCountCallSuccessful bool - deviceInfoByIndexCallSuccessful bool - deviceInfoAndStatusByIndexCallSuccessful bool - driverVersion string - devices []*DeviceInfo - deviceStatus []*DeviceStatus + systemDriverCallSuccessful bool + listDeviceUUIDsSuccessful bool + deviceInfoByUUIDCallSuccessful bool + deviceInfoAndStatusByUUIDCallSuccessful bool + driverVersion string + devices []*DeviceInfo + deviceStatus []*DeviceStatus + modes []mode } func 
(m *MockNVMLDriver) Initialize() error { @@ -36,31 +39,46 @@ func (m *MockNVMLDriver) SystemDriverVersion() (string, error) { return m.driverVersion, nil } -func (m *MockNVMLDriver) DeviceCount() (uint, error) { - if !m.deviceCountCallSuccessful { - return 0, errors.New("failed to get device length") +func (m *MockNVMLDriver) ListDeviceUUIDs() (map[string]mode, error) { + if !m.listDeviceUUIDsSuccessful { + return nil, errors.New("failed to get device length") + } + + allNvidiaGPUUUIDs := make(map[string]mode) + + for i, device := range m.devices { + allNvidiaGPUUUIDs[device.UUID] = m.modes[i] } - return uint(len(m.devices)), nil + + return allNvidiaGPUUUIDs, nil } -func (m *MockNVMLDriver) DeviceInfoByIndex(index uint) (*DeviceInfo, error) { - if index >= uint(len(m.devices)) { - return nil, errors.New("index is out of range") +func (m *MockNVMLDriver) DeviceInfoByUUID(uuid string) (*DeviceInfo, error) { + if !m.deviceInfoByUUIDCallSuccessful { + return nil, errors.New("failed to get device info by UUID") } - if !m.deviceInfoByIndexCallSuccessful { - return nil, errors.New("failed to get device info by index") + + for _, device := range m.devices { + if uuid == device.UUID { + return device, nil + } } - return m.devices[index], nil + + return nil, errors.New("failed to get device handle") } -func (m *MockNVMLDriver) DeviceInfoAndStatusByIndex(index uint) (*DeviceInfo, *DeviceStatus, error) { - if index >= uint(len(m.devices)) || index >= uint(len(m.deviceStatus)) { - return nil, nil, errors.New("index is out of range") - } - if !m.deviceInfoAndStatusByIndexCallSuccessful { +func (m *MockNVMLDriver) DeviceInfoAndStatusByUUID(uuid string) (*DeviceInfo, *DeviceStatus, error) { + if !m.deviceInfoAndStatusByUUIDCallSuccessful { return nil, nil, errors.New("failed to get device info and status by index") } - return m.devices[index], m.deviceStatus[index], nil + + for i, device := range m.devices { + if uuid == device.UUID { + return device, m.deviceStatus[i], nil + } + 
} + + return nil, nil, errors.New("failed to get device handle") } func TestGetFingerprintDataFromNVML(t *testing.T) { @@ -75,9 +93,9 @@ func TestGetFingerprintDataFromNVML(t *testing.T) { ExpectedError: true, ExpectedResult: nil, DriverConfiguration: &MockNVMLDriver{ - systemDriverCallSuccessful: false, - deviceCountCallSuccessful: true, - deviceInfoByIndexCallSuccessful: true, + systemDriverCallSuccessful: false, + listDeviceUUIDsSuccessful: true, + deviceInfoByUUIDCallSuccessful: true, }, }, { @@ -85,19 +103,20 @@ func TestGetFingerprintDataFromNVML(t *testing.T) { ExpectedError: true, ExpectedResult: nil, DriverConfiguration: &MockNVMLDriver{ - systemDriverCallSuccessful: true, - deviceCountCallSuccessful: false, - deviceInfoByIndexCallSuccessful: true, + systemDriverCallSuccessful: true, + listDeviceUUIDsSuccessful: false, + deviceInfoByUUIDCallSuccessful: true, }, }, { - Name: "fail on deviceInfoByIndexCall", + Name: "fail on deviceInfoByUUIDCall", ExpectedError: true, ExpectedResult: nil, DriverConfiguration: &MockNVMLDriver{ - systemDriverCallSuccessful: true, - deviceCountCallSuccessful: true, - deviceInfoByIndexCallSuccessful: false, + systemDriverCallSuccessful: true, + listDeviceUUIDsSuccessful: true, + deviceInfoByUUIDCallSuccessful: false, + modes: []mode{normal, normal}, devices: []*DeviceInfo{ { UUID: "UUID1", @@ -161,10 +180,11 @@ func TestGetFingerprintDataFromNVML(t *testing.T) { }, }, DriverConfiguration: &MockNVMLDriver{ - systemDriverCallSuccessful: true, - deviceCountCallSuccessful: true, - deviceInfoByIndexCallSuccessful: true, - driverVersion: "driverVersion", + systemDriverCallSuccessful: true, + listDeviceUUIDsSuccessful: true, + deviceInfoByUUIDCallSuccessful: true, + driverVersion: "driverVersion", + modes: []mode{normal, normal}, devices: []*DeviceInfo{ { UUID: "UUID1", @@ -194,16 +214,134 @@ func TestGetFingerprintDataFromNVML(t *testing.T) { }, }, }, + { + Name: "successful migs", + ExpectedError: false, + ExpectedResult: 
&FingerprintData{ + DriverVersion: "driverVersion", + Devices: []*FingerprintDeviceData{ + { + DeviceData: &DeviceData{ + DeviceName: pointer.Of("ModelName"), + UUID: "UUID1", + MemoryMiB: pointer.Of(uint64(16)), + PowerW: pointer.Of(uint(100)), + BAR1MiB: pointer.Of(uint64(100)), + }, + PCIBusID: "busId1", + PCIBandwidthMBPerS: pointer.Of(uint(100)), + CoresClockMHz: pointer.Of(uint(100)), + MemoryClockMHz: pointer.Of(uint(100)), + DisplayState: "Enabled", + PersistenceMode: "Enabled", + }, + { + DeviceData: &DeviceData{ + DeviceName: pointer.Of("ModelName"), + UUID: "UUID2", + MemoryMiB: pointer.Of(uint64(8)), + PowerW: pointer.Of(uint(200)), + BAR1MiB: pointer.Of(uint64(200)), + }, + PCIBusID: "busId2", + PCIBandwidthMBPerS: pointer.Of(uint(200)), + CoresClockMHz: pointer.Of(uint(200)), + MemoryClockMHz: pointer.Of(uint(200)), + DisplayState: "Enabled", + PersistenceMode: "Enabled", + }, + { + DeviceData: &DeviceData{ + DeviceName: pointer.Of("ModelName"), + UUID: "UUID4", + MemoryMiB: pointer.Of(uint64(8)), + PowerW: pointer.Of(uint(200)), + BAR1MiB: pointer.Of(uint64(200)), + }, + PCIBusID: "busId3", + PCIBandwidthMBPerS: pointer.Of(uint(200)), + CoresClockMHz: pointer.Of(uint(200)), + MemoryClockMHz: pointer.Of(uint(200)), + DisplayState: "Enabled", + PersistenceMode: "Enabled", + }, + }, + }, + DriverConfiguration: &MockNVMLDriver{ + systemDriverCallSuccessful: true, + listDeviceUUIDsSuccessful: true, + deviceInfoByUUIDCallSuccessful: true, + driverVersion: "driverVersion", + modes: []mode{normal, normal, parent, mig}, + devices: []*DeviceInfo{ + { + UUID: "UUID1", + Name: pointer.Of("ModelName"), + MemoryMiB: pointer.Of(uint64(16)), + PCIBusID: "busId1", + PowerW: pointer.Of(uint(100)), + BAR1MiB: pointer.Of(uint64(100)), + PCIBandwidthMBPerS: pointer.Of(uint(100)), + CoresClockMHz: pointer.Of(uint(100)), + MemoryClockMHz: pointer.Of(uint(100)), + DisplayState: "Enabled", + PersistenceMode: "Enabled", + }, + { + UUID: "UUID2", + Name: 
pointer.Of("ModelName"), + MemoryMiB: pointer.Of(uint64(8)), + PCIBusID: "busId2", + PowerW: pointer.Of(uint(200)), + BAR1MiB: pointer.Of(uint64(200)), + PCIBandwidthMBPerS: pointer.Of(uint(200)), + CoresClockMHz: pointer.Of(uint(200)), + MemoryClockMHz: pointer.Of(uint(200)), + DisplayState: "Enabled", + PersistenceMode: "Enabled", + }, + { + UUID: "UUID3", + Name: pointer.Of("ModelName"), + MemoryMiB: pointer.Of(uint64(8)), + PCIBusID: "busId3", + PowerW: pointer.Of(uint(200)), + BAR1MiB: pointer.Of(uint64(200)), + PCIBandwidthMBPerS: pointer.Of(uint(200)), + CoresClockMHz: pointer.Of(uint(200)), + MemoryClockMHz: pointer.Of(uint(200)), + DisplayState: "Enabled", + PersistenceMode: "Enabled", + }, + { + UUID: "UUID4", + Name: pointer.Of("ModelName"), + MemoryMiB: pointer.Of(uint64(8)), + PCIBusID: "busId3", + PowerW: pointer.Of(uint(200)), + BAR1MiB: pointer.Of(uint64(200)), + PCIBandwidthMBPerS: pointer.Of(uint(200)), + CoresClockMHz: pointer.Of(uint(200)), + MemoryClockMHz: pointer.Of(uint(200)), + DisplayState: "Enabled", + PersistenceMode: "Enabled", + }, + }, + }, + }, } { - cli := nvmlClient{driver: testCase.DriverConfiguration} - fingerprintData, err := cli.GetFingerprintData() - if testCase.ExpectedError { - must.Error(t, err) - } - if !testCase.ExpectedError && err != nil { - must.NoError(t, err) - } - must.Eq(t, testCase.ExpectedResult, fingerprintData) + + t.Run(testCase.Name, func(t *testing.T) { + cli := nvmlClient{driver: testCase.DriverConfiguration} + fingerprintData, err := cli.GetFingerprintData() + if testCase.ExpectedError { + must.Error(t, err) + } + if !testCase.ExpectedError && err != nil { + must.NoError(t, err) + } + must.Eq(t, testCase.ExpectedResult, fingerprintData) + }) } } @@ -215,24 +353,25 @@ func TestGetStatsDataFromNVML(t *testing.T) { ExpectedResult []*StatsData }{ { - Name: "fail on deviceCountCallSuccessful", + Name: "fail on listDeviceUUIDsCallSuccessful", ExpectedError: true, ExpectedResult: nil, DriverConfiguration: 
&MockNVMLDriver{ - systemDriverCallSuccessful: true, - deviceCountCallSuccessful: false, - deviceInfoByIndexCallSuccessful: true, - deviceInfoAndStatusByIndexCallSuccessful: true, + systemDriverCallSuccessful: true, + listDeviceUUIDsSuccessful: false, + deviceInfoByUUIDCallSuccessful: true, + deviceInfoAndStatusByUUIDCallSuccessful: true, }, }, { - Name: "fail on DeviceInfoAndStatusByIndex call", + Name: "fail on DeviceInfoAndStatusByUUID call", ExpectedError: true, ExpectedResult: nil, DriverConfiguration: &MockNVMLDriver{ - systemDriverCallSuccessful: true, - deviceCountCallSuccessful: true, - deviceInfoAndStatusByIndexCallSuccessful: false, + systemDriverCallSuccessful: true, + listDeviceUUIDsSuccessful: true, + deviceInfoAndStatusByUUIDCallSuccessful: false, + modes: []mode{normal, normal}, devices: []*DeviceInfo{ { UUID: "UUID1", @@ -332,9 +471,10 @@ func TestGetStatsDataFromNVML(t *testing.T) { }, }, DriverConfiguration: &MockNVMLDriver{ - deviceCountCallSuccessful: true, - deviceInfoByIndexCallSuccessful: true, - deviceInfoAndStatusByIndexCallSuccessful: true, + listDeviceUUIDsSuccessful: true, + deviceInfoByUUIDCallSuccessful: true, + deviceInfoAndStatusByUUIDCallSuccessful: true, + modes: []mode{normal, normal}, devices: []*DeviceInfo{ { UUID: "UUID1", @@ -388,6 +528,133 @@ func TestGetStatsDataFromNVML(t *testing.T) { }, }, }, + { + Name: "successful migs", + // stats not available on migs + ExpectedError: false, + ExpectedResult: []*StatsData{ + { + DeviceData: &DeviceData{ + DeviceName: pointer.Of("ModelName"), + UUID: "UUID1", + MemoryMiB: pointer.Of(uint64(16)), + PowerW: pointer.Of(uint(100)), + BAR1MiB: pointer.Of(uint64(100)), + }, + TemperatureC: pointer.Of(uint(1)), + GPUUtilization: pointer.Of(uint(1)), + MemoryUtilization: pointer.Of(uint(1)), + EncoderUtilization: pointer.Of(uint(1)), + DecoderUtilization: pointer.Of(uint(1)), + UsedMemoryMiB: pointer.Of(uint64(1)), + ECCErrorsL1Cache: pointer.Of(uint64(1)), + ECCErrorsL2Cache: 
pointer.Of(uint64(1)), + ECCErrorsDevice: pointer.Of(uint64(1)), + PowerUsageW: pointer.Of(uint(1)), + BAR1UsedMiB: pointer.Of(uint64(1)), + }, + { + DeviceData: &DeviceData{ + DeviceName: pointer.Of("ModelName"), + UUID: "UUID2", + MemoryMiB: pointer.Of(uint64(8)), + PowerW: pointer.Of(uint(200)), + BAR1MiB: pointer.Of(uint64(200)), + }, + TemperatureC: pointer.Of(uint(2)), + GPUUtilization: pointer.Of(uint(2)), + MemoryUtilization: pointer.Of(uint(2)), + EncoderUtilization: pointer.Of(uint(2)), + DecoderUtilization: pointer.Of(uint(2)), + UsedMemoryMiB: pointer.Of(uint64(2)), + ECCErrorsL1Cache: pointer.Of(uint64(2)), + ECCErrorsL2Cache: pointer.Of(uint64(2)), + ECCErrorsDevice: pointer.Of(uint64(2)), + PowerUsageW: pointer.Of(uint(2)), + BAR1UsedMiB: pointer.Of(uint64(2)), + }, + }, + DriverConfiguration: &MockNVMLDriver{ + listDeviceUUIDsSuccessful: true, + deviceInfoByUUIDCallSuccessful: true, + deviceInfoAndStatusByUUIDCallSuccessful: true, + modes: []mode{normal, normal, parent, mig}, + devices: []*DeviceInfo{ + { + UUID: "UUID1", + Name: pointer.Of("ModelName"), + MemoryMiB: pointer.Of(uint64(16)), + PCIBusID: "busId1", + PowerW: pointer.Of(uint(100)), + BAR1MiB: pointer.Of(uint64(100)), + PCIBandwidthMBPerS: pointer.Of(uint(100)), + CoresClockMHz: pointer.Of(uint(100)), + MemoryClockMHz: pointer.Of(uint(100)), + }, + { + UUID: "UUID2", + Name: pointer.Of("ModelName"), + MemoryMiB: pointer.Of(uint64(8)), + PCIBusID: "busId2", + PowerW: pointer.Of(uint(200)), + BAR1MiB: pointer.Of(uint64(200)), + PCIBandwidthMBPerS: pointer.Of(uint(200)), + CoresClockMHz: pointer.Of(uint(200)), + MemoryClockMHz: pointer.Of(uint(200)), + }, + { // parent, no stats + UUID: "UUID3", + Name: pointer.Of("ModelName"), + MemoryMiB: pointer.Of(uint64(8)), + PCIBusID: "busId3", + PowerW: pointer.Of(uint(200)), + BAR1MiB: pointer.Of(uint64(200)), + PCIBandwidthMBPerS: pointer.Of(uint(200)), + CoresClockMHz: pointer.Of(uint(200)), + MemoryClockMHz: pointer.Of(uint(200)), + }, + { // 
mig, no stats + UUID: "UUID4", + Name: pointer.Of("ModelName"), + MemoryMiB: pointer.Of(uint64(8)), + PCIBusID: "busId3", + PowerW: pointer.Of(uint(200)), + BAR1MiB: pointer.Of(uint64(200)), + PCIBandwidthMBPerS: pointer.Of(uint(200)), + CoresClockMHz: pointer.Of(uint(200)), + MemoryClockMHz: pointer.Of(uint(200)), + }, + }, + deviceStatus: []*DeviceStatus{ + { + TemperatureC: pointer.Of(uint(1)), + GPUUtilization: pointer.Of(uint(1)), + MemoryUtilization: pointer.Of(uint(1)), + EncoderUtilization: pointer.Of(uint(1)), + DecoderUtilization: pointer.Of(uint(1)), + UsedMemoryMiB: pointer.Of(uint64(1)), + ECCErrorsL1Cache: pointer.Of(uint64(1)), + ECCErrorsL2Cache: pointer.Of(uint64(1)), + ECCErrorsDevice: pointer.Of(uint64(1)), + PowerUsageW: pointer.Of(uint(1)), + BAR1UsedMiB: pointer.Of(uint64(1)), + }, + { + TemperatureC: pointer.Of(uint(2)), + GPUUtilization: pointer.Of(uint(2)), + MemoryUtilization: pointer.Of(uint(2)), + EncoderUtilization: pointer.Of(uint(2)), + DecoderUtilization: pointer.Of(uint(2)), + UsedMemoryMiB: pointer.Of(uint64(2)), + ECCErrorsL1Cache: pointer.Of(uint64(2)), + ECCErrorsL2Cache: pointer.Of(uint64(2)), + ECCErrorsDevice: pointer.Of(uint64(2)), + PowerUsageW: pointer.Of(uint(2)), + BAR1UsedMiB: pointer.Of(uint64(2)), + }, + }, + }, + }, } { cli := nvmlClient{driver: testCase.DriverConfiguration} statsData, err := cli.GetStatsData() diff --git a/nvml/driver_default.go b/nvml/driver_default.go index 97cd4a5..26c50ab 100644 --- a/nvml/driver_default.go +++ b/nvml/driver_default.go @@ -20,17 +20,17 @@ func (n *nvmlDriver) SystemDriverVersion() (string, error) { return "", UnavailableLib } -// DeviceCount reports number of available GPU devices -func (n *nvmlDriver) DeviceCount() (uint, error) { - return 0, UnavailableLib +// ListDeviceUUIDs reports the UUIDs of available GPU devices +func (n *nvmlDriver) ListDeviceUUIDs() ([]string, error) { + return nil, UnavailableLib } -// DeviceInfoByIndex returns DeviceInfo for index GPU in system device 
list -func (n *nvmlDriver) DeviceInfoByIndex(index uint) (*DeviceInfo, error) { +// DeviceInfoByUUID returns DeviceInfo for the GPU matching the given UUID +func (n *nvmlDriver) DeviceInfoByUUID(uuid string) (*DeviceInfo, error) { return nil, UnavailableLib } -// DeviceInfoByIndex returns DeviceInfo and DeviceStatus for index GPU in system device list -func (n *nvmlDriver) DeviceInfoAndStatusByIndex(index uint) (*DeviceInfo, *DeviceStatus, error) { +// DeviceInfoAndStatusByUUID returns DeviceInfo and DeviceStatus for the GPU matching the given UUID +func (n *nvmlDriver) DeviceInfoAndStatusByUUID(uuid string) (*DeviceInfo, *DeviceStatus, error) { return nil, nil, UnavailableLib } diff --git a/nvml/driver_linux.go b/nvml/driver_linux.go index 2530824..0006deb 100644 --- a/nvml/driver_linux.go +++ b/nvml/driver_linux.go @@ -40,25 +40,80 @@ func (n *nvmlDriver) SystemDriverVersion() (string, error) { return version, nil } -// DeviceCount reports number of available GPU devices -func (n *nvmlDriver) DeviceCount() (uint, error) { +// ListDeviceUUIDs lists all compute device UUIDs in the system. +// Includes all instances, including normal GPUs, MIGs, and their physical parents. +// Each UUID is associated with a mode indicating which type it is. +func (n *nvmlDriver) ListDeviceUUIDs() (map[string]mode, error) { count, code := nvml.DeviceGetCount() if code != nvml.SUCCESS { - return 0, decode("failed to get device count", code) + return nil, decode("failed to get device count", code) } - return uint(count), nil -} -// DeviceInfoByIndex returns DeviceInfo for index GPU in system device list. 
-func (n *nvmlDriver) DeviceInfoByIndex(index uint) (*DeviceInfo, error) { - device, code := nvml.DeviceGetHandleByIndex(int(index)) - if code != nvml.SUCCESS { - return nil, decode("failed to get device info", code) + uuids := make(map[string]mode) + + for i := 0; i < int(count); i++ { + device, code := nvml.DeviceGetHandleByIndex(int(i)) + if code != nvml.SUCCESS { + return nil, decode(fmt.Sprintf("failed to get device handle %d/%d", i, count), code) + } + + // Get the device MIG mode, and if MIG is not enabled + // or the device doesn't support MIG at all (indicated + // by error code ERROR_NOT_SUPPORTED), then add the + // device UUID to the list and continue. + migMode, _, code := nvml.DeviceGetMigMode(device) + if code == nvml.ERROR_NOT_SUPPORTED || migMode == nvml.DEVICE_MIG_DISABLE { + uuid, code := nvml.DeviceGetUUID(device) + if code != nvml.SUCCESS { + return nil, decode("failed to get device %d uuid", code) + } + + uuids[uuid] = normal + continue + } + if code != nvml.SUCCESS { + return nil, decode("failed to get device MIG mode", code) + } + + migCount, code := nvml.DeviceGetMaxMigDeviceCount(device) + if code != nvml.SUCCESS { + return nil, decode("failed to get device MIG device count", code) + } + + uuid, code := nvml.DeviceGetUUID(device) + if code == nvml.SUCCESS { + uuids[uuid] = parent + } + + for j := 0; j < int(migCount); j++ { + migDevice, code := nvml.DeviceGetMigDeviceHandleByIndex(device, int(j)) + if code == nvml.ERROR_NOT_FOUND || code == nvml.ERROR_INVALID_ARGUMENT { + continue + } + if code != nvml.SUCCESS { + return nil, decode("failed to get device MIG device handle", code) + } + + uuid, code := nvml.DeviceGetUUID(migDevice) + if code != nvml.SUCCESS { + return nil, decode(fmt.Sprintf("failed to get mig device uuid %d", j), code) + } + uuids[uuid] = mig + } } - uuid, code := nvml.DeviceGetUUID(device) + return uuids, nil +} + +func bytesToMegabytes(size uint64) uint64 { + return size / (1 << 20) +} + +// DeviceInfoByUUID returns 
DeviceInfo for the given GPU's UUID. +func (n *nvmlDriver) DeviceInfoByUUID(uuid string) (*DeviceInfo, error) { + device, code := nvml.DeviceGetHandleByUUID(uuid) if code != nvml.SUCCESS { - return nil, decode("failed to get device uuid", code) + return nil, decode("failed to get device handle", code) } name, code := nvml.Device.GetName(device) @@ -70,11 +125,26 @@ func (n *nvmlDriver) DeviceInfoByIndex(index uint) (*DeviceInfo, error) { if code != nvml.SUCCESS { return nil, decode("failed to get device memory info", code) } - memoryTotal := memory.Total / (1 << 20) + memoryTotal := bytesToMegabytes(memory.Total) + + parentDevice, code := nvml.DeviceGetDeviceHandleFromMigDeviceHandle(device) + if code == nvml.ERROR_NOT_FOUND || code == nvml.ERROR_INVALID_ARGUMENT { + // Device is not a MIG device, so nothing to do. + } else if code != nvml.SUCCESS { + return nil, decode("failed to get device parent device handle", code) + } else { + // Device is a MIG device, so get the auxiliary properties (such as PCIe + // bandwidth) from the parent device. 
+ device = parentDevice + } power, code := nvml.DeviceGetPowerUsage(device) if code != nvml.SUCCESS { - return nil, decode("failed to get device power info", code) + if code == nvml.ERROR_NOT_SUPPORTED { + power = 0 + } else { + return nil, decode("failed to get device power info", code) + } } powerU := uint(power) / 1000 @@ -82,7 +152,7 @@ func (n *nvmlDriver) DeviceInfoByIndex(index uint) (*DeviceInfo, error) { if code != nvml.SUCCESS { return nil, decode("failed to get device bar 1 memory info", code) } - bar1total := bar1.Bar1Total / (1 << 20) + bar1total := bytesToMegabytes(bar1.Bar1Total) pci, code := nvml.Device.GetPciInfo(device) if code != nvml.SUCCESS { @@ -91,12 +161,20 @@ func (n *nvmlDriver) DeviceInfoByIndex(index uint) (*DeviceInfo, error) { linkWidth, code := nvml.DeviceGetMaxPcieLinkWidth(device) if code != nvml.SUCCESS { - return nil, decode("failed to get pcie link width", code) + if code == nvml.ERROR_NOT_SUPPORTED { + linkWidth = 0 + } else { + return nil, decode("failed to get pcie link width", code) + } } linkGeneration, code := nvml.DeviceGetMaxPcieLinkGeneration(device) if code != nvml.SUCCESS { - return nil, decode("failed to get pcie link generation", code) + if code == nvml.ERROR_NOT_SUPPORTED { + linkGeneration = 0 + } else { + return nil, decode("failed to get pcie link generation", code) + } } // https://en.wikipedia.org/wiki/PCI_Express @@ -159,70 +237,106 @@ func buildID(id [32]int8) string { return string(b) } -// DeviceInfoAndStatusByIndex returns DeviceInfo and DeviceStatus for index GPU in system device list. -func (n *nvmlDriver) DeviceInfoAndStatusByIndex(index uint) (*DeviceInfo, *DeviceStatus, error) { - di, err := n.DeviceInfoByIndex(index) +// DeviceInfoAndStatusByUUID returns DeviceInfo and DeviceStatus for the GPU matching the given UUID. 
+func (n *nvmlDriver) DeviceInfoAndStatusByUUID(uuid string) (*DeviceInfo, *DeviceStatus, error) { + di, err := n.DeviceInfoByUUID(uuid) if err != nil { return nil, nil, err } - device, code := nvml.DeviceGetHandleByIndex(int(index)) + device, code := nvml.DeviceGetHandleByUUID(uuid) if code != nvml.SUCCESS { return nil, nil, decode("failed to get device info", code) } - temp, code := nvml.DeviceGetTemperature(device, nvml.TEMPERATURE_GPU) - if code != nvml.SUCCESS { - return nil, nil, decode("failed to get device temperature", code) - } - tempU := uint(temp) - - utz, code := nvml.DeviceGetUtilizationRates(device) - if code != nvml.SUCCESS { - return nil, nil, decode("failed to get device utilization", code) - } - utzGPU := uint(utz.Gpu) - utzMem := uint(utz.Memory) - - utzEnc, _, code := nvml.DeviceGetEncoderUtilization(device) + mem, code := nvml.DeviceGetMemoryInfo(device) if code != nvml.SUCCESS { - return nil, nil, decode("failed to get device encoder utilization", code) + return nil, nil, decode("failed to get device memory utilization", code) } - utzEncU := uint(utzEnc) + memUsedU := bytesToMegabytes(mem.Used) - utzDec, _, code := nvml.Device.GetDecoderUtilization(device) + bar, code := nvml.DeviceGetBAR1MemoryInfo(device) if code != nvml.SUCCESS { - return nil, nil, decode("failed to get device decoder utilization", code) + return nil, nil, decode("failed to get device bar1 memory info", code) } - utzDecU := uint(utzDec) - - mem, code := nvml.DeviceGetMemoryInfo(device) - if code != nvml.SUCCESS { - return nil, nil, decode("failed to get device memory utilization", code) + barUsed := bytesToMegabytes(bar.Bar1Used) + + isMig := false + _, code = nvml.DeviceGetDeviceHandleFromMigDeviceHandle(device) + if code == nvml.ERROR_NOT_FOUND || code == nvml.ERROR_INVALID_ARGUMENT { + // Device is not a MIG device. 
+ } else if code != nvml.SUCCESS { + return nil, nil, decode("failed to get device parent device handle", code) + } else { + isMig = true } - memUsedU := mem.Used / (1 << 20) - power, code := nvml.DeviceGetPowerUsage(device) - if code != nvml.SUCCESS { - return nil, nil, decode("failed to get device power usage", code) + // MIG devices don't have temperature, power usage or utilization properties + // so just nil them out. + utzGPU, utzMem, utzEncU, utzDecU := uint(0), uint(0), uint(0), uint(0) + powerU, tempU := uint(0), uint(0) + if !isMig { + utz, code := nvml.DeviceGetUtilizationRates(device) + if code != nvml.SUCCESS { + return nil, nil, decode("failed to get device utilization", code) + } + utzGPU = uint(utz.Gpu) + utzMem = uint(utz.Memory) + + utzEnc, _, code := nvml.DeviceGetEncoderUtilization(device) + if code != nvml.SUCCESS { + return nil, nil, decode("failed to get device encoder utilization", code) + } + utzEncU = uint(utzEnc) + + utzDec, _, code := nvml.Device.GetDecoderUtilization(device) + if code != nvml.SUCCESS { + return nil, nil, decode("failed to get device decoder utilization", code) + } + utzDecU = uint(utzDec) + + temp, code := nvml.DeviceGetTemperature(device, nvml.TEMPERATURE_GPU) + if code != nvml.SUCCESS { + if code == nvml.ERROR_NOT_SUPPORTED { + temp = 0 + } else { + return nil, nil, decode("failed to get device temperature", code) + } + } + tempU = uint(temp) + + power, code := nvml.DeviceGetPowerUsage(device) + if code != nvml.SUCCESS { + if code == nvml.ERROR_NOT_SUPPORTED { + power = 0 + } else { + return nil, nil, decode("failed to get device power usage", code) + } + } + powerU = uint(power) } - powerU := uint(power) - bar, code := nvml.DeviceGetBAR1MemoryInfo(device) + ecc, code := nvml.DeviceGetDetailedEccErrors(device, nvml.MEMORY_ERROR_TYPE_CORRECTED, nvml.VOLATILE_ECC) if code != nvml.SUCCESS { - return nil, nil, decode("failed to get device bar1 memory info", code) + if code == nvml.ERROR_NOT_SUPPORTED { + ecc = 
nvml.EccErrorCounts{} + } else { + return nil, nil, decode("failed to get device ecc error counts", code) + } } - barUsed := bar.Bar1Used / (1 << 20) - // note: ecc memory error stats removed; couldn't figure out the API return di, &DeviceStatus{ - TemperatureC: &tempU, - GPUUtilization: &utzGPU, - MemoryUtilization: &utzMem, - EncoderUtilization: &utzEncU, - DecoderUtilization: &utzDecU, - UsedMemoryMiB: &memUsedU, - PowerUsageW: &powerU, - BAR1UsedMiB: &barUsed, + TemperatureC: &tempU, + GPUUtilization: &utzGPU, + MemoryUtilization: &utzMem, + EncoderUtilization: &utzEncU, + DecoderUtilization: &utzDecU, + UsedMemoryMiB: &memUsedU, + PowerUsageW: &powerU, + BAR1UsedMiB: &barUsed, + ECCErrorsDevice: &ecc.DeviceMemory, + ECCErrorsL1Cache: &ecc.L1Cache, + ECCErrorsL2Cache: &ecc.L2Cache, + ECCErrorsRegisterFile: &ecc.RegisterFile, }, nil } diff --git a/nvml/shared.go b/nvml/shared.go index e1ef54c..17596a2 100644 --- a/nvml/shared.go +++ b/nvml/shared.go @@ -10,6 +10,14 @@ var ( UnavailableLib = errors.New("could not load NVML library") ) +type mode int + +const ( + normal mode = iota + parent + mig +) + // nvmlDriver implements NvmlDriver // Users are required to call Initialize method before using any other methods type nvmlDriver struct{} @@ -19,14 +27,14 @@ type NvmlDriver interface { Initialize() error Shutdown() error SystemDriverVersion() (string, error) - DeviceCount() (uint, error) - DeviceInfoByIndex(uint) (*DeviceInfo, error) - DeviceInfoAndStatusByIndex(uint) (*DeviceInfo, *DeviceStatus, error) + ListDeviceUUIDs() (map[string]mode, error) + DeviceInfoByUUID(string) (*DeviceInfo, error) + DeviceInfoAndStatusByUUID(string) (*DeviceInfo, *DeviceStatus, error) } // DeviceInfo represents nvml device data -// this struct is returned by NvmlDriver DeviceInfoByIndex and -// DeviceInfoAndStatusByIndex methods +// this struct is returned by NvmlDriver DeviceInfoByUUID and +// DeviceInfoAndStatusByUUID methods type DeviceInfo struct { // The following fields are 
guaranteed to be retrieved from nvml UUID string @@ -46,19 +54,20 @@ type DeviceInfo struct { } // DeviceStatus represents nvml device status -// this struct is returned by NvmlDriver DeviceInfoAndStatusByIndex method +// this struct is returned by NvmlDriver DeviceInfoAndStatusByUUID method type DeviceStatus struct { // The following fields can be nil after call to nvml, because nvml was // not able to retrieve this fields for specific nvidia card - PowerUsageW *uint - TemperatureC *uint - GPUUtilization *uint // % - MemoryUtilization *uint // % - EncoderUtilization *uint // % - DecoderUtilization *uint // % - BAR1UsedMiB *uint64 - UsedMemoryMiB *uint64 - ECCErrorsL1Cache *uint64 - ECCErrorsL2Cache *uint64 - ECCErrorsDevice *uint64 + PowerUsageW *uint + TemperatureC *uint + GPUUtilization *uint // % + MemoryUtilization *uint // % + EncoderUtilization *uint // % + DecoderUtilization *uint // % + BAR1UsedMiB *uint64 + UsedMemoryMiB *uint64 + ECCErrorsL1Cache *uint64 + ECCErrorsL2Cache *uint64 + ECCErrorsDevice *uint64 + ECCErrorsRegisterFile *uint64 }