diff --git a/.circleci/config.yml b/.circleci/config.yml index 0a4cec135dce..0fcf418656d6 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -620,13 +620,9 @@ workflows: test_module: "api" filters: *backend_test_branches_filter enable_race_testing: true - - test-container: - name: "test-devices" - test_packages: "./devices/..." - filters: *backend_test_branches_filter - test-machine: name: "test-other" - exclude_packages: "./api|./client|./drivers/docker|./drivers/exec|./drivers/shared/executor|./nomad|./devices|./e2e" + exclude_packages: "./api|./client|./drivers/docker|./drivers/exec|./drivers/shared/executor|./nomad|./e2e" filters: *backend_test_branches_filter - test-machine: name: "test-docker" diff --git a/devices/gpu/nvidia/README.md b/devices/gpu/nvidia/README.md deleted file mode 100644 index 1035c7c89402..000000000000 --- a/devices/gpu/nvidia/README.md +++ /dev/null @@ -1,21 +0,0 @@ -This package provides an implementation of nvidia device plugin - -# Behavior - -Nvidia device plugin uses NVML bindings to get data regarding available nvidia devices and will expose them via Fingerprint RPC. GPUs can be excluded from fingerprinting by setting the `ignored_gpu_ids` field. Plugin sends statistics for fingerprinted devices every `stats_period` period. - -# Config - -The configuration should be passed via an HCL file that begins with a top level `config` stanza: - -``` -config { - ignored_gpu_ids = ["uuid1", "uuid2"] - fingerprint_period = "5s" -} -``` - -The valid configuration options are: - -* `ignored_gpu_ids` (`list(string)`: `[]`): list of GPU UUIDs strings that should not be exposed to nomad -* `fingerprint_period` (`string`: `"1m"`): interval to repeat the fingerprint process to identify possible changes. 
diff --git a/devices/gpu/nvidia/cmd/main.go b/devices/gpu/nvidia/cmd/main.go deleted file mode 100644 index 5c0bea6c4d86..000000000000 --- a/devices/gpu/nvidia/cmd/main.go +++ /dev/null @@ -1,20 +0,0 @@ -package main - -import ( - "context" - - log "github.com/hashicorp/go-hclog" - - "github.com/hashicorp/nomad/devices/gpu/nvidia" - "github.com/hashicorp/nomad/plugins" -) - -func main() { - // Serve the plugin - plugins.ServeCtx(factory) -} - -// factory returns a new instance of the Nvidia GPU plugin -func factory(ctx context.Context, log log.Logger) interface{} { - return nvidia.NewNvidiaDevice(ctx, log) -} diff --git a/devices/gpu/nvidia/device.go b/devices/gpu/nvidia/device.go deleted file mode 100644 index 67680dc2a0ee..000000000000 --- a/devices/gpu/nvidia/device.go +++ /dev/null @@ -1,228 +0,0 @@ -package nvidia - -import ( - "context" - "fmt" - "strings" - "sync" - "time" - - log "github.com/hashicorp/go-hclog" - "github.com/hashicorp/nomad/devices/gpu/nvidia/nvml" - "github.com/hashicorp/nomad/helper/pluginutils/loader" - "github.com/hashicorp/nomad/plugins/base" - "github.com/hashicorp/nomad/plugins/device" - "github.com/hashicorp/nomad/plugins/shared/hclspec" -) - -const ( - // pluginName is the name of the plugin - pluginName = "nvidia-gpu" - - // vendor is the vendor providing the devices - vendor = "nvidia" - - // deviceType is the type of device being returned - deviceType = device.DeviceTypeGPU - - // notAvailable value is returned to nomad server in case some properties were - // undetected by nvml driver - notAvailable = "N/A" - - // Nvidia-container-runtime environment variable names - NvidiaVisibleDevices = "NVIDIA_VISIBLE_DEVICES" -) - -var ( - // PluginID is the nvidia plugin metadata registered in the plugin - // catalog. - PluginID = loader.PluginID{ - Name: pluginName, - PluginType: base.PluginTypeDevice, - } - - // PluginConfig is the nvidia factory function registered in the - // plugin catalog. 
- PluginConfig = &loader.InternalPluginConfig{ - Factory: func(ctx context.Context, l log.Logger) interface{} { return NewNvidiaDevice(ctx, l) }, - } - - // pluginInfo describes the plugin - pluginInfo = &base.PluginInfoResponse{ - Type: base.PluginTypeDevice, - PluginApiVersions: []string{device.ApiVersion010}, - PluginVersion: "0.1.0", - Name: pluginName, - } - - // configSpec is the specification of the plugin's configuration - configSpec = hclspec.NewObject(map[string]*hclspec.Spec{ - "enabled": hclspec.NewDefault( - hclspec.NewAttr("enabled", "bool", false), - hclspec.NewLiteral("true"), - ), - "ignored_gpu_ids": hclspec.NewDefault( - hclspec.NewAttr("ignored_gpu_ids", "list(string)", false), - hclspec.NewLiteral("[]"), - ), - "fingerprint_period": hclspec.NewDefault( - hclspec.NewAttr("fingerprint_period", "string", false), - hclspec.NewLiteral("\"1m\""), - ), - }) -) - -// Config contains configuration information for the plugin. -type Config struct { - Enabled bool `codec:"enabled"` - IgnoredGPUIDs []string `codec:"ignored_gpu_ids"` - FingerprintPeriod string `codec:"fingerprint_period"` -} - -// NvidiaDevice contains all plugin specific data -type NvidiaDevice struct { - // enabled indicates whether the plugin should be enabled - enabled bool - - // nvmlClient is used to get data from nvidia - nvmlClient nvml.NvmlClient - - // initErr holds an error retrieved during - // nvmlClient initialization - initErr error - - // ignoredGPUIDs is a set of UUIDs that would not be exposed to nomad - ignoredGPUIDs map[string]struct{} - - // fingerprintPeriod is how often we should call nvml to get list of devices - fingerprintPeriod time.Duration - - // devices is the set of detected eligible devices - devices map[string]struct{} - deviceLock sync.RWMutex - - logger log.Logger -} - -// NewNvidiaDevice returns a new nvidia device plugin. 
-func NewNvidiaDevice(_ context.Context, log log.Logger) *NvidiaDevice { - nvmlClient, err := nvml.NewNvmlClient() - logger := log.Named(pluginName) - if err != nil && err.Error() != nvml.UnavailableLib.Error() { - logger.Error("unable to initialize Nvidia driver", "reason", err) - } - return &NvidiaDevice{ - logger: logger, - devices: make(map[string]struct{}), - ignoredGPUIDs: make(map[string]struct{}), - nvmlClient: nvmlClient, - initErr: err, - } -} - -// PluginInfo returns information describing the plugin. -func (d *NvidiaDevice) PluginInfo() (*base.PluginInfoResponse, error) { - return pluginInfo, nil -} - -// ConfigSchema returns the plugins configuration schema. -func (d *NvidiaDevice) ConfigSchema() (*hclspec.Spec, error) { - return configSpec, nil -} - -// SetConfig is used to set the configuration of the plugin. -func (d *NvidiaDevice) SetConfig(cfg *base.Config) error { - var config Config - if len(cfg.PluginConfig) != 0 { - if err := base.MsgPackDecode(cfg.PluginConfig, &config); err != nil { - return err - } - } - - d.enabled = config.Enabled - - for _, ignoredGPUId := range config.IgnoredGPUIDs { - d.ignoredGPUIDs[ignoredGPUId] = struct{}{} - } - - period, err := time.ParseDuration(config.FingerprintPeriod) - if err != nil { - return fmt.Errorf("failed to parse fingerprint period %q: %v", config.FingerprintPeriod, err) - } - d.fingerprintPeriod = period - - return nil -} - -// Fingerprint streams detected devices. If device changes are detected or the -// devices health changes, messages will be emitted. 
-func (d *NvidiaDevice) Fingerprint(ctx context.Context) (<-chan *device.FingerprintResponse, error) { - if !d.enabled { - return nil, device.ErrPluginDisabled - } - - outCh := make(chan *device.FingerprintResponse) - go d.fingerprint(ctx, outCh) - return outCh, nil -} - -type reservationError struct { - notExistingIDs []string -} - -func (e *reservationError) Error() string { - return fmt.Sprintf("unknown device IDs: %s", strings.Join(e.notExistingIDs, ",")) -} - -// Reserve returns information on how to mount given devices. -// Assumption is made that nomad server is responsible for correctness of -// GPU allocations, handling tricky cases such as double-allocation of single GPU -func (d *NvidiaDevice) Reserve(deviceIDs []string) (*device.ContainerReservation, error) { - if len(deviceIDs) == 0 { - return &device.ContainerReservation{}, nil - } - if !d.enabled { - return nil, device.ErrPluginDisabled - } - - // Due to the asynchronous nature of NvidiaPlugin, there is a possibility - // of race condition - // - // Timeline: - // 1 - fingerprint reports that GPU with id "1" is present - // 2 - the following events happen at the same time: - // a) server decides to allocate GPU with id "1" - // b) fingerprint check reports that GPU with id "1" is no more present - // - // The latest and always valid version of fingerprinted ids are stored in - // d.devices map. To avoid this race condition an error is returned if - // any of provided deviceIDs is not found in d.devices map - d.deviceLock.RLock() - var notExistingIDs []string - for _, id := range deviceIDs { - if _, deviceIDExists := d.devices[id]; !deviceIDExists { - notExistingIDs = append(notExistingIDs, id) - } - } - d.deviceLock.RUnlock() - if len(notExistingIDs) != 0 { - return nil, &reservationError{notExistingIDs} - } - - return &device.ContainerReservation{ - Envs: map[string]string{ - NvidiaVisibleDevices: strings.Join(deviceIDs, ","), - }, - }, nil -} - -// Stats streams statistics for the detected devices. 
-func (d *NvidiaDevice) Stats(ctx context.Context, interval time.Duration) (<-chan *device.StatsResponse, error) { - if !d.enabled { - return nil, device.ErrPluginDisabled - } - - outCh := make(chan *device.StatsResponse) - go d.stats(ctx, outCh, interval) - return outCh, nil -} diff --git a/devices/gpu/nvidia/device_test.go b/devices/gpu/nvidia/device_test.go deleted file mode 100644 index a5ec354e2432..000000000000 --- a/devices/gpu/nvidia/device_test.go +++ /dev/null @@ -1,140 +0,0 @@ -package nvidia - -import ( - "testing" - - hclog "github.com/hashicorp/go-hclog" - "github.com/hashicorp/nomad/devices/gpu/nvidia/nvml" - "github.com/hashicorp/nomad/plugins/device" - "github.com/stretchr/testify/require" -) - -type MockNvmlClient struct { - FingerprintError error - FingerprintResponseReturned *nvml.FingerprintData - - StatsError error - StatsResponseReturned []*nvml.StatsData -} - -func (c *MockNvmlClient) GetFingerprintData() (*nvml.FingerprintData, error) { - return c.FingerprintResponseReturned, c.FingerprintError -} - -func (c *MockNvmlClient) GetStatsData() ([]*nvml.StatsData, error) { - return c.StatsResponseReturned, c.StatsError -} - -func TestReserve(t *testing.T) { - cases := []struct { - Name string - ExpectedReservation *device.ContainerReservation - ExpectedError error - Device *NvidiaDevice - RequestedIDs []string - }{ - { - Name: "All RequestedIDs are not managed by Device", - ExpectedReservation: nil, - ExpectedError: &reservationError{[]string{ - "UUID1", - "UUID2", - "UUID3", - }}, - RequestedIDs: []string{ - "UUID1", - "UUID2", - "UUID3", - }, - Device: &NvidiaDevice{ - logger: hclog.NewNullLogger(), - enabled: true, - }, - }, - { - Name: "Some RequestedIDs are not managed by Device", - ExpectedReservation: nil, - ExpectedError: &reservationError{[]string{ - "UUID1", - "UUID2", - }}, - RequestedIDs: []string{ - "UUID1", - "UUID2", - "UUID3", - }, - Device: &NvidiaDevice{ - devices: map[string]struct{}{ - "UUID3": {}, - }, - logger: 
hclog.NewNullLogger(), - enabled: true, - }, - }, - { - Name: "All RequestedIDs are managed by Device", - ExpectedReservation: &device.ContainerReservation{ - Envs: map[string]string{ - NvidiaVisibleDevices: "UUID1,UUID2,UUID3", - }, - }, - ExpectedError: nil, - RequestedIDs: []string{ - "UUID1", - "UUID2", - "UUID3", - }, - Device: &NvidiaDevice{ - devices: map[string]struct{}{ - "UUID1": {}, - "UUID2": {}, - "UUID3": {}, - }, - logger: hclog.NewNullLogger(), - enabled: true, - }, - }, - { - Name: "No IDs requested", - ExpectedReservation: &device.ContainerReservation{}, - ExpectedError: nil, - RequestedIDs: nil, - Device: &NvidiaDevice{ - devices: map[string]struct{}{ - "UUID1": {}, - "UUID2": {}, - "UUID3": {}, - }, - logger: hclog.NewNullLogger(), - enabled: true, - }, - }, - { - Name: "Device is disabled", - ExpectedReservation: nil, - ExpectedError: device.ErrPluginDisabled, - RequestedIDs: []string{ - "UUID1", - "UUID2", - "UUID3", - }, - Device: &NvidiaDevice{ - devices: map[string]struct{}{ - "UUID1": {}, - "UUID2": {}, - "UUID3": {}, - }, - logger: hclog.NewNullLogger(), - enabled: false, - }, - }, - } - - for _, c := range cases { - t.Run(c.Name, func(t *testing.T) { - actualReservation, actualError := c.Device.Reserve(c.RequestedIDs) - require.Equal(t, c.ExpectedReservation, actualReservation) - require.Equal(t, c.ExpectedError, actualError) - }) - } -} diff --git a/devices/gpu/nvidia/fingerprint.go b/devices/gpu/nvidia/fingerprint.go deleted file mode 100644 index 45bb34fa3355..000000000000 --- a/devices/gpu/nvidia/fingerprint.go +++ /dev/null @@ -1,229 +0,0 @@ -package nvidia - -import ( - "context" - "time" - - "github.com/hashicorp/nomad/devices/gpu/nvidia/nvml" - "github.com/hashicorp/nomad/helper" - "github.com/hashicorp/nomad/plugins/device" - "github.com/hashicorp/nomad/plugins/shared/structs" -) - -const ( - // Attribute names and units for reporting Fingerprint output - MemoryAttr = "memory" - PowerAttr = "power" - BAR1Attr = "bar1" - 
DriverVersionAttr = "driver_version" - CoresClockAttr = "cores_clock" - MemoryClockAttr = "memory_clock" - PCIBandwidthAttr = "pci_bandwidth" - DisplayStateAttr = "display_state" - PersistenceModeAttr = "persistence_mode" -) - -// fingerprint is the long running goroutine that detects hardware -func (d *NvidiaDevice) fingerprint(ctx context.Context, devices chan<- *device.FingerprintResponse) { - defer close(devices) - - if d.initErr != nil { - if d.initErr.Error() != nvml.UnavailableLib.Error() { - d.logger.Error("exiting fingerprinting due to problems with NVML loading", "error", d.initErr) - devices <- device.NewFingerprintError(d.initErr) - } - - // Just close the channel to let server know that there are no working - // Nvidia GPU units - return - } - - // Create a timer that will fire immediately for the first detection - ticker := time.NewTimer(0) - - for { - select { - case <-ctx.Done(): - return - case <-ticker.C: - ticker.Reset(d.fingerprintPeriod) - } - d.writeFingerprintToChannel(devices) - } -} - -// writeFingerprintToChannel makes nvml call and writes response to channel -func (d *NvidiaDevice) writeFingerprintToChannel(devices chan<- *device.FingerprintResponse) { - fingerprintData, err := d.nvmlClient.GetFingerprintData() - if err != nil { - d.logger.Error("failed to get fingerprint nvidia devices", "error", err) - devices <- device.NewFingerprintError(err) - return - } - - // ignore devices from fingerprint output - fingerprintDevices := ignoreFingerprintedDevices(fingerprintData.Devices, d.ignoredGPUIDs) - // check if any device health was updated or any device was added to host - if !d.fingerprintChanged(fingerprintDevices) { - return - } - - commonAttributes := map[string]*structs.Attribute{ - DriverVersionAttr: { - String: helper.StringToPtr(fingerprintData.DriverVersion), - }, - } - - // Group all FingerprintDevices by DeviceName attribute - deviceListByDeviceName := make(map[string][]*nvml.FingerprintDeviceData) - for _, device := range 
fingerprintDevices { - deviceName := device.DeviceName - if deviceName == nil { - // nvml driver was not able to detect device name. This kind - // of devices are placed to single group with 'notAvailable' name - notAvailableCopy := notAvailable - deviceName = ¬AvailableCopy - } - - deviceListByDeviceName[*deviceName] = append(deviceListByDeviceName[*deviceName], device) - } - - // Build Fingerprint response with computed groups and send it over the channel - deviceGroups := make([]*device.DeviceGroup, 0, len(deviceListByDeviceName)) - for groupName, devices := range deviceListByDeviceName { - deviceGroups = append(deviceGroups, deviceGroupFromFingerprintData(groupName, devices, commonAttributes)) - } - devices <- device.NewFingerprint(deviceGroups...) -} - -// ignoreFingerprintedDevices excludes ignored devices from fingerprint output -func ignoreFingerprintedDevices(deviceData []*nvml.FingerprintDeviceData, ignoredGPUIDs map[string]struct{}) []*nvml.FingerprintDeviceData { - var result []*nvml.FingerprintDeviceData - for _, fingerprintDevice := range deviceData { - if _, ignored := ignoredGPUIDs[fingerprintDevice.UUID]; !ignored { - result = append(result, fingerprintDevice) - } - } - return result -} - -// fingerprintChanged checks if there are any previously unseen nvidia devices located -// or any of fingerprinted nvidia devices disappeared since the last fingerprint run. 
-// Also, this func updates device map on NvidiaDevice with the latest data -func (d *NvidiaDevice) fingerprintChanged(allDevices []*nvml.FingerprintDeviceData) bool { - d.deviceLock.Lock() - defer d.deviceLock.Unlock() - - changeDetected := false - // check if every device in allDevices is in d.devices - for _, device := range allDevices { - if _, ok := d.devices[device.UUID]; !ok { - changeDetected = true - } - } - - // check if every device in d.devices is in allDevices - fingerprintDeviceMap := make(map[string]struct{}) - for _, device := range allDevices { - fingerprintDeviceMap[device.UUID] = struct{}{} - } - for id := range d.devices { - if _, ok := fingerprintDeviceMap[id]; !ok { - changeDetected = true - } - } - - d.devices = fingerprintDeviceMap - return changeDetected -} - -// deviceGroupFromFingerprintData composes deviceGroup from FingerprintDeviceData slice -func deviceGroupFromFingerprintData(groupName string, deviceList []*nvml.FingerprintDeviceData, commonAttributes map[string]*structs.Attribute) *device.DeviceGroup { - // deviceGroup without devices makes no sense -> return nil when no devices are provided - if len(deviceList) == 0 { - return nil - } - - devices := make([]*device.Device, len(deviceList)) - for index, dev := range deviceList { - devices[index] = &device.Device{ - ID: dev.UUID, - // all fingerprinted devices are "healthy" for now - // to get real health data -> dcgm bindings should be used - Healthy: true, - HwLocality: &device.DeviceLocality{ - PciBusID: dev.PCIBusID, - }, - } - } - - deviceGroup := &device.DeviceGroup{ - Vendor: vendor, - Type: deviceType, - Name: groupName, - Devices: devices, - // Assumption made that devices with the same DeviceName have the same - // attributes like amount of memory, power, bar1memory etc - Attributes: attributesFromFingerprintDeviceData(deviceList[0]), - } - - // Extend attribute map with common attributes - for attributeKey, attributeValue := range commonAttributes { - 
deviceGroup.Attributes[attributeKey] = attributeValue - } - - return deviceGroup -} - -// attributesFromFingerprintDeviceData converts nvml.FingerprintDeviceData -// struct to device.DeviceGroup.Attributes format (map[string]string) -// this function performs all nil checks for FingerprintDeviceData pointers -func attributesFromFingerprintDeviceData(d *nvml.FingerprintDeviceData) map[string]*structs.Attribute { - attrs := map[string]*structs.Attribute{ - DisplayStateAttr: { - String: helper.StringToPtr(d.DisplayState), - }, - PersistenceModeAttr: { - String: helper.StringToPtr(d.PersistenceMode), - }, - } - - if d.MemoryMiB != nil { - attrs[MemoryAttr] = &structs.Attribute{ - Int: helper.Int64ToPtr(int64(*d.MemoryMiB)), - Unit: structs.UnitMiB, - } - } - if d.PowerW != nil { - attrs[PowerAttr] = &structs.Attribute{ - Int: helper.Int64ToPtr(int64(*d.PowerW)), - Unit: structs.UnitW, - } - } - if d.BAR1MiB != nil { - attrs[BAR1Attr] = &structs.Attribute{ - Int: helper.Int64ToPtr(int64(*d.BAR1MiB)), - Unit: structs.UnitMiB, - } - } - if d.CoresClockMHz != nil { - attrs[CoresClockAttr] = &structs.Attribute{ - Int: helper.Int64ToPtr(int64(*d.CoresClockMHz)), - Unit: structs.UnitMHz, - } - } - if d.MemoryClockMHz != nil { - attrs[MemoryClockAttr] = &structs.Attribute{ - Int: helper.Int64ToPtr(int64(*d.MemoryClockMHz)), - Unit: structs.UnitMHz, - } - } - if d.PCIBandwidthMBPerS != nil { - attrs[PCIBandwidthAttr] = &structs.Attribute{ - Int: helper.Int64ToPtr(int64(*d.PCIBandwidthMBPerS)), - Unit: structs.UnitMBPerS, - } - } - - return attrs -} diff --git a/devices/gpu/nvidia/fingerprint_test.go b/devices/gpu/nvidia/fingerprint_test.go deleted file mode 100644 index c85b5c8c90a3..000000000000 --- a/devices/gpu/nvidia/fingerprint_test.go +++ /dev/null @@ -1,1361 +0,0 @@ -package nvidia - -import ( - "context" - "errors" - "sort" - "testing" - - hclog "github.com/hashicorp/go-hclog" - "github.com/hashicorp/nomad/devices/gpu/nvidia/nvml" - "github.com/hashicorp/nomad/helper" - 
"github.com/hashicorp/nomad/plugins/device" - "github.com/hashicorp/nomad/plugins/shared/structs" - "github.com/stretchr/testify/require" -) - -func TestIgnoreFingerprintedDevices(t *testing.T) { - for _, testCase := range []struct { - Name string - DeviceData []*nvml.FingerprintDeviceData - IgnoredGPUIds map[string]struct{} - ExpectedResult []*nvml.FingerprintDeviceData - }{ - { - Name: "Odd ignored", - DeviceData: []*nvml.FingerprintDeviceData{ - { - DeviceData: &nvml.DeviceData{ - DeviceName: helper.StringToPtr("DeviceName1"), - UUID: "UUID1", - MemoryMiB: helper.Uint64ToPtr(1000), - }, - }, - { - DeviceData: &nvml.DeviceData{ - DeviceName: helper.StringToPtr("DeviceName2"), - UUID: "UUID2", - MemoryMiB: helper.Uint64ToPtr(1000), - }, - }, - { - DeviceData: &nvml.DeviceData{ - DeviceName: helper.StringToPtr("DeviceName3"), - UUID: "UUID3", - MemoryMiB: helper.Uint64ToPtr(1000), - }, - }, - }, - IgnoredGPUIds: map[string]struct{}{ - "UUID2": {}, - }, - ExpectedResult: []*nvml.FingerprintDeviceData{ - { - DeviceData: &nvml.DeviceData{ - DeviceName: helper.StringToPtr("DeviceName1"), - UUID: "UUID1", - MemoryMiB: helper.Uint64ToPtr(1000), - }, - }, - { - DeviceData: &nvml.DeviceData{ - DeviceName: helper.StringToPtr("DeviceName3"), - UUID: "UUID3", - MemoryMiB: helper.Uint64ToPtr(1000), - }, - }, - }, - }, - { - Name: "Even ignored", - DeviceData: []*nvml.FingerprintDeviceData{ - { - DeviceData: &nvml.DeviceData{ - DeviceName: helper.StringToPtr("DeviceName1"), - UUID: "UUID1", - MemoryMiB: helper.Uint64ToPtr(1000), - }, - }, - { - DeviceData: &nvml.DeviceData{ - DeviceName: helper.StringToPtr("DeviceName2"), - UUID: "UUID2", - MemoryMiB: helper.Uint64ToPtr(1000), - }, - }, - { - DeviceData: &nvml.DeviceData{ - DeviceName: helper.StringToPtr("DeviceName3"), - UUID: "UUID3", - MemoryMiB: helper.Uint64ToPtr(1000), - }, - }, - }, - IgnoredGPUIds: map[string]struct{}{ - "UUID1": {}, - "UUID3": {}, - }, - ExpectedResult: []*nvml.FingerprintDeviceData{ - { - DeviceData: 
&nvml.DeviceData{ - DeviceName: helper.StringToPtr("DeviceName2"), - UUID: "UUID2", - MemoryMiB: helper.Uint64ToPtr(1000), - }, - }, - }, - }, - { - Name: "All ignored", - DeviceData: []*nvml.FingerprintDeviceData{ - { - DeviceData: &nvml.DeviceData{ - DeviceName: helper.StringToPtr("DeviceName1"), - UUID: "UUID1", - MemoryMiB: helper.Uint64ToPtr(1000), - }, - }, - { - DeviceData: &nvml.DeviceData{ - DeviceName: helper.StringToPtr("DeviceName2"), - UUID: "UUID2", - MemoryMiB: helper.Uint64ToPtr(1000), - }, - }, - { - DeviceData: &nvml.DeviceData{ - DeviceName: helper.StringToPtr("DeviceName3"), - UUID: "UUID3", - MemoryMiB: helper.Uint64ToPtr(1000), - }, - }, - }, - IgnoredGPUIds: map[string]struct{}{ - "UUID1": {}, - "UUID2": {}, - "UUID3": {}, - }, - ExpectedResult: nil, - }, - { - Name: "No ignored", - DeviceData: []*nvml.FingerprintDeviceData{ - { - DeviceData: &nvml.DeviceData{ - DeviceName: helper.StringToPtr("DeviceName1"), - UUID: "UUID1", - MemoryMiB: helper.Uint64ToPtr(1000), - }, - }, - { - DeviceData: &nvml.DeviceData{ - DeviceName: helper.StringToPtr("DeviceName2"), - UUID: "UUID2", - MemoryMiB: helper.Uint64ToPtr(1000), - }, - }, - { - DeviceData: &nvml.DeviceData{ - DeviceName: helper.StringToPtr("DeviceName3"), - UUID: "UUID3", - MemoryMiB: helper.Uint64ToPtr(1000), - }, - }, - }, - IgnoredGPUIds: map[string]struct{}{}, - ExpectedResult: []*nvml.FingerprintDeviceData{ - { - DeviceData: &nvml.DeviceData{ - DeviceName: helper.StringToPtr("DeviceName1"), - UUID: "UUID1", - MemoryMiB: helper.Uint64ToPtr(1000), - }, - }, - { - DeviceData: &nvml.DeviceData{ - DeviceName: helper.StringToPtr("DeviceName2"), - UUID: "UUID2", - MemoryMiB: helper.Uint64ToPtr(1000), - }, - }, - { - DeviceData: &nvml.DeviceData{ - DeviceName: helper.StringToPtr("DeviceName3"), - UUID: "UUID3", - MemoryMiB: helper.Uint64ToPtr(1000), - }, - }, - }, - }, - { - Name: "No DeviceData provided", - DeviceData: nil, - IgnoredGPUIds: map[string]struct{}{ - "UUID1": {}, - "UUID2": {}, - 
"UUID3": {}, - }, - ExpectedResult: nil, - }, - } { - t.Run(testCase.Name, func(t *testing.T) { - actualResult := ignoreFingerprintedDevices(testCase.DeviceData, testCase.IgnoredGPUIds) - require.New(t).Equal(testCase.ExpectedResult, actualResult) - }) - } -} - -func TestCheckFingerprintUpdates(t *testing.T) { - for _, testCase := range []struct { - Name string - Device *NvidiaDevice - AllDevices []*nvml.FingerprintDeviceData - DeviceMapAfterMethodCall map[string]struct{} - ExpectedResult bool - }{ - { - Name: "No updates", - Device: &NvidiaDevice{devices: map[string]struct{}{ - "1": {}, - "2": {}, - "3": {}, - }}, - AllDevices: []*nvml.FingerprintDeviceData{ - { - DeviceData: &nvml.DeviceData{ - UUID: "1", - }, - }, - { - DeviceData: &nvml.DeviceData{ - UUID: "2", - }, - }, - { - DeviceData: &nvml.DeviceData{ - UUID: "3", - }, - }, - }, - ExpectedResult: false, - DeviceMapAfterMethodCall: map[string]struct{}{ - "1": {}, - "2": {}, - "3": {}, - }, - }, - { - Name: "New Device Appeared", - Device: &NvidiaDevice{devices: map[string]struct{}{ - "1": {}, - "2": {}, - "3": {}, - }}, - AllDevices: []*nvml.FingerprintDeviceData{ - { - DeviceData: &nvml.DeviceData{ - UUID: "1", - }, - }, - { - DeviceData: &nvml.DeviceData{ - UUID: "2", - }, - }, - { - DeviceData: &nvml.DeviceData{ - UUID: "3", - }, - }, - { - DeviceData: &nvml.DeviceData{ - UUID: "I am new", - }, - }, - }, - ExpectedResult: true, - DeviceMapAfterMethodCall: map[string]struct{}{ - "1": {}, - "2": {}, - "3": {}, - "I am new": {}, - }, - }, - { - Name: "Device disappeared", - Device: &NvidiaDevice{devices: map[string]struct{}{ - "1": {}, - "2": {}, - "3": {}, - }}, - AllDevices: []*nvml.FingerprintDeviceData{ - { - DeviceData: &nvml.DeviceData{ - UUID: "1", - }, - }, - { - DeviceData: &nvml.DeviceData{ - UUID: "2", - }, - }, - }, - ExpectedResult: true, - DeviceMapAfterMethodCall: map[string]struct{}{ - "1": {}, - "2": {}, - }, - }, - { - Name: "No devices in NvidiaDevice map", - Device: &NvidiaDevice{}, - 
AllDevices: []*nvml.FingerprintDeviceData{ - { - DeviceData: &nvml.DeviceData{ - UUID: "1", - }, - }, - { - DeviceData: &nvml.DeviceData{ - UUID: "2", - }, - }, - { - DeviceData: &nvml.DeviceData{ - UUID: "3", - }, - }, - }, - ExpectedResult: true, - DeviceMapAfterMethodCall: map[string]struct{}{ - "1": {}, - "2": {}, - "3": {}, - }, - }, - { - Name: "No devices detected", - Device: &NvidiaDevice{devices: map[string]struct{}{ - "1": {}, - "2": {}, - "3": {}, - }}, - AllDevices: nil, - ExpectedResult: true, - DeviceMapAfterMethodCall: map[string]struct{}{}, - }, - } { - t.Run(testCase.Name, func(t *testing.T) { - actualResult := testCase.Device.fingerprintChanged(testCase.AllDevices) - req := require.New(t) - // check that function returns valid "updated / not updated" state - req.Equal(testCase.ExpectedResult, actualResult) - // check that function propely updates devices map - req.Equal(testCase.Device.devices, testCase.DeviceMapAfterMethodCall) - }) - } -} - -func TestAttributesFromFingerprintDeviceData(t *testing.T) { - for _, testCase := range []struct { - Name string - FingerprintDeviceData *nvml.FingerprintDeviceData - ExpectedResult map[string]*structs.Attribute - }{ - { - Name: "All attributes are not nil", - FingerprintDeviceData: &nvml.FingerprintDeviceData{ - DeviceData: &nvml.DeviceData{ - UUID: "1", - DeviceName: helper.StringToPtr("Type1"), - MemoryMiB: helper.Uint64ToPtr(256), - PowerW: helper.UintToPtr(2), - BAR1MiB: helper.Uint64ToPtr(256), - }, - PCIBusID: "pciBusID1", - PCIBandwidthMBPerS: helper.UintToPtr(1), - CoresClockMHz: helper.UintToPtr(1), - MemoryClockMHz: helper.UintToPtr(1), - DisplayState: "Enabled", - PersistenceMode: "Enabled", - }, - ExpectedResult: map[string]*structs.Attribute{ - MemoryAttr: { - Int: helper.Int64ToPtr(256), - Unit: structs.UnitMiB, - }, - PowerAttr: { - Int: helper.Int64ToPtr(2), - Unit: structs.UnitW, - }, - BAR1Attr: { - Int: helper.Int64ToPtr(256), - Unit: structs.UnitMiB, - }, - PCIBandwidthAttr: { - Int: 
helper.Int64ToPtr(1), - Unit: structs.UnitMBPerS, - }, - CoresClockAttr: { - Int: helper.Int64ToPtr(1), - Unit: structs.UnitMHz, - }, - MemoryClockAttr: { - Int: helper.Int64ToPtr(1), - Unit: structs.UnitMHz, - }, - DisplayStateAttr: { - String: helper.StringToPtr("Enabled"), - }, - PersistenceModeAttr: { - String: helper.StringToPtr("Enabled"), - }, - }, - }, - { - Name: "nil values are omitted", - FingerprintDeviceData: &nvml.FingerprintDeviceData{ - DeviceData: &nvml.DeviceData{ - UUID: "1", - DeviceName: helper.StringToPtr("Type1"), - MemoryMiB: nil, - PowerW: helper.UintToPtr(2), - BAR1MiB: helper.Uint64ToPtr(256), - }, - PCIBusID: "pciBusID1", - DisplayState: "Enabled", - PersistenceMode: "Enabled", - }, - ExpectedResult: map[string]*structs.Attribute{ - PowerAttr: { - Int: helper.Int64ToPtr(2), - Unit: structs.UnitW, - }, - BAR1Attr: { - Int: helper.Int64ToPtr(256), - Unit: structs.UnitMiB, - }, - DisplayStateAttr: { - String: helper.StringToPtr("Enabled"), - }, - PersistenceModeAttr: { - String: helper.StringToPtr("Enabled"), - }, - }, - }, - } { - t.Run(testCase.Name, func(t *testing.T) { - actualResult := attributesFromFingerprintDeviceData(testCase.FingerprintDeviceData) - require.Equal(t, testCase.ExpectedResult, actualResult) - }) - } -} - -func TestDeviceGroupFromFingerprintData(t *testing.T) { - for _, testCase := range []struct { - Name string - GroupName string - Devices []*nvml.FingerprintDeviceData - CommonAttributes map[string]*structs.Attribute - ExpectedResult *device.DeviceGroup - }{ - { - Name: "Devices are provided", - GroupName: "Type1", - Devices: []*nvml.FingerprintDeviceData{ - { - DeviceData: &nvml.DeviceData{ - UUID: "1", - DeviceName: helper.StringToPtr("Type1"), - MemoryMiB: helper.Uint64ToPtr(100), - PowerW: helper.UintToPtr(2), - BAR1MiB: helper.Uint64ToPtr(256), - }, - PCIBusID: "pciBusID1", - PCIBandwidthMBPerS: helper.UintToPtr(1), - CoresClockMHz: helper.UintToPtr(1), - MemoryClockMHz: helper.UintToPtr(1), - DisplayState: 
"Enabled", - PersistenceMode: "Enabled", - }, - { - DeviceData: &nvml.DeviceData{ - UUID: "2", - DeviceName: helper.StringToPtr("Type1"), - MemoryMiB: helper.Uint64ToPtr(100), - PowerW: helper.UintToPtr(2), - BAR1MiB: helper.Uint64ToPtr(256), - }, - PCIBusID: "pciBusID2", - PCIBandwidthMBPerS: helper.UintToPtr(1), - CoresClockMHz: helper.UintToPtr(1), - MemoryClockMHz: helper.UintToPtr(1), - DisplayState: "Enabled", - PersistenceMode: "Enabled", - }, - }, - ExpectedResult: &device.DeviceGroup{ - Vendor: vendor, - Type: deviceType, - Name: "Type1", - Devices: []*device.Device{ - { - ID: "1", - Healthy: true, - HwLocality: &device.DeviceLocality{ - PciBusID: "pciBusID1", - }, - }, - { - ID: "2", - Healthy: true, - HwLocality: &device.DeviceLocality{ - PciBusID: "pciBusID2", - }, - }, - }, - Attributes: map[string]*structs.Attribute{ - MemoryAttr: { - Int: helper.Int64ToPtr(100), - Unit: structs.UnitMiB, - }, - PowerAttr: { - Int: helper.Int64ToPtr(2), - Unit: structs.UnitW, - }, - BAR1Attr: { - Int: helper.Int64ToPtr(256), - Unit: structs.UnitMiB, - }, - PCIBandwidthAttr: { - Int: helper.Int64ToPtr(1), - Unit: structs.UnitMBPerS, - }, - CoresClockAttr: { - Int: helper.Int64ToPtr(1), - Unit: structs.UnitMHz, - }, - MemoryClockAttr: { - Int: helper.Int64ToPtr(1), - Unit: structs.UnitMHz, - }, - DisplayStateAttr: { - String: helper.StringToPtr("Enabled"), - }, - PersistenceModeAttr: { - String: helper.StringToPtr("Enabled"), - }, - }, - }, - }, - { - Name: "Devices and common attributes are provided", - GroupName: "Type1", - Devices: []*nvml.FingerprintDeviceData{ - { - DeviceData: &nvml.DeviceData{ - UUID: "1", - DeviceName: helper.StringToPtr("Type1"), - MemoryMiB: helper.Uint64ToPtr(100), - PowerW: helper.UintToPtr(2), - BAR1MiB: helper.Uint64ToPtr(256), - }, - PCIBusID: "pciBusID1", - PCIBandwidthMBPerS: helper.UintToPtr(1), - CoresClockMHz: helper.UintToPtr(1), - MemoryClockMHz: helper.UintToPtr(1), - DisplayState: "Enabled", - PersistenceMode: "Enabled", - }, - { 
- DeviceData: &nvml.DeviceData{ - UUID: "2", - DeviceName: helper.StringToPtr("Type1"), - MemoryMiB: helper.Uint64ToPtr(100), - PowerW: helper.UintToPtr(2), - BAR1MiB: helper.Uint64ToPtr(256), - }, - PCIBusID: "pciBusID2", - PCIBandwidthMBPerS: helper.UintToPtr(1), - CoresClockMHz: helper.UintToPtr(1), - MemoryClockMHz: helper.UintToPtr(1), - DisplayState: "Enabled", - PersistenceMode: "Enabled", - }, - }, - CommonAttributes: map[string]*structs.Attribute{ - DriverVersionAttr: { - String: helper.StringToPtr("1"), - }, - }, - ExpectedResult: &device.DeviceGroup{ - Vendor: vendor, - Type: deviceType, - Name: "Type1", - Devices: []*device.Device{ - { - ID: "1", - Healthy: true, - HwLocality: &device.DeviceLocality{ - PciBusID: "pciBusID1", - }, - }, - { - ID: "2", - Healthy: true, - HwLocality: &device.DeviceLocality{ - PciBusID: "pciBusID2", - }, - }, - }, - Attributes: map[string]*structs.Attribute{ - MemoryAttr: { - Int: helper.Int64ToPtr(100), - Unit: structs.UnitMiB, - }, - PowerAttr: { - Int: helper.Int64ToPtr(2), - Unit: structs.UnitW, - }, - BAR1Attr: { - Int: helper.Int64ToPtr(256), - Unit: structs.UnitMiB, - }, - PCIBandwidthAttr: { - Int: helper.Int64ToPtr(1), - Unit: structs.UnitMBPerS, - }, - CoresClockAttr: { - Int: helper.Int64ToPtr(1), - Unit: structs.UnitMHz, - }, - MemoryClockAttr: { - Int: helper.Int64ToPtr(1), - Unit: structs.UnitMHz, - }, - DisplayStateAttr: { - String: helper.StringToPtr("Enabled"), - }, - PersistenceModeAttr: { - String: helper.StringToPtr("Enabled"), - }, - DriverVersionAttr: { - String: helper.StringToPtr("1"), - }, - }, - }, - }, - { - Name: "Devices are not provided", - GroupName: "Type1", - CommonAttributes: map[string]*structs.Attribute{ - DriverVersionAttr: { - String: helper.StringToPtr("1"), - }, - }, - Devices: nil, - ExpectedResult: nil, - }, - } { - t.Run(testCase.Name, func(t *testing.T) { - actualResult := deviceGroupFromFingerprintData(testCase.GroupName, testCase.Devices, testCase.CommonAttributes) - 
require.New(t).Equal(testCase.ExpectedResult, actualResult) - }) - } -} - -func TestWriteFingerprintToChannel(t *testing.T) { - for _, testCase := range []struct { - Name string - Device *NvidiaDevice - ExpectedWriteToChannel *device.FingerprintResponse - }{ - { - Name: "Check that FingerprintError is handled properly", - Device: &NvidiaDevice{ - nvmlClient: &MockNvmlClient{ - FingerprintError: errors.New(""), - }, - logger: hclog.NewNullLogger(), - }, - ExpectedWriteToChannel: &device.FingerprintResponse{ - Error: errors.New(""), - }, - }, - { - Name: "Check ignore devices works correctly", - Device: &NvidiaDevice{ - nvmlClient: &MockNvmlClient{ - FingerprintResponseReturned: &nvml.FingerprintData{ - DriverVersion: "1", - Devices: []*nvml.FingerprintDeviceData{ - { - DeviceData: &nvml.DeviceData{ - UUID: "1", - DeviceName: helper.StringToPtr("Name"), - MemoryMiB: helper.Uint64ToPtr(10), - PowerW: helper.UintToPtr(100), - BAR1MiB: helper.Uint64ToPtr(256), - }, - PCIBusID: "pciBusID1", - PCIBandwidthMBPerS: helper.UintToPtr(1), - CoresClockMHz: helper.UintToPtr(1), - MemoryClockMHz: helper.UintToPtr(1), - DisplayState: "Enabled", - PersistenceMode: "Enabled", - }, - { - DeviceData: &nvml.DeviceData{ - UUID: "2", - DeviceName: helper.StringToPtr("Name"), - MemoryMiB: helper.Uint64ToPtr(10), - PowerW: helper.UintToPtr(100), - BAR1MiB: helper.Uint64ToPtr(256), - }, - PCIBusID: "pciBusID2", - PCIBandwidthMBPerS: helper.UintToPtr(1), - CoresClockMHz: helper.UintToPtr(1), - MemoryClockMHz: helper.UintToPtr(1), - DisplayState: "Enabled", - PersistenceMode: "Enabled", - }, - }, - }, - }, - ignoredGPUIDs: map[string]struct{}{ - "1": {}, - }, - logger: hclog.NewNullLogger(), - }, - ExpectedWriteToChannel: &device.FingerprintResponse{ - Devices: []*device.DeviceGroup{ - { - Vendor: vendor, - Type: deviceType, - Name: "Name", - Devices: []*device.Device{ - { - ID: "2", - Healthy: true, - HwLocality: &device.DeviceLocality{ - PciBusID: "pciBusID2", - }, - }, - }, - Attributes: 
map[string]*structs.Attribute{ - MemoryAttr: { - Int: helper.Int64ToPtr(10), - Unit: structs.UnitMiB, - }, - PowerAttr: { - Int: helper.Int64ToPtr(100), - Unit: structs.UnitW, - }, - BAR1Attr: { - Int: helper.Int64ToPtr(256), - Unit: structs.UnitMiB, - }, - PCIBandwidthAttr: { - Int: helper.Int64ToPtr(1), - Unit: structs.UnitMBPerS, - }, - CoresClockAttr: { - Int: helper.Int64ToPtr(1), - Unit: structs.UnitMHz, - }, - MemoryClockAttr: { - Int: helper.Int64ToPtr(1), - Unit: structs.UnitMHz, - }, - DisplayStateAttr: { - String: helper.StringToPtr("Enabled"), - }, - PersistenceModeAttr: { - String: helper.StringToPtr("Enabled"), - }, - DriverVersionAttr: { - String: helper.StringToPtr("1"), - }, - }, - }, - }, - }, - }, - { - Name: "Check devices are split to multiple device groups 1", - Device: &NvidiaDevice{ - nvmlClient: &MockNvmlClient{ - FingerprintResponseReturned: &nvml.FingerprintData{ - DriverVersion: "1", - Devices: []*nvml.FingerprintDeviceData{ - { - DeviceData: &nvml.DeviceData{ - UUID: "1", - DeviceName: helper.StringToPtr("Name1"), - MemoryMiB: helper.Uint64ToPtr(10), - PowerW: helper.UintToPtr(100), - BAR1MiB: helper.Uint64ToPtr(256), - }, - PCIBusID: "pciBusID1", - PCIBandwidthMBPerS: helper.UintToPtr(1), - CoresClockMHz: helper.UintToPtr(1), - MemoryClockMHz: helper.UintToPtr(1), - DisplayState: "Enabled", - PersistenceMode: "Enabled", - }, - { - DeviceData: &nvml.DeviceData{ - UUID: "2", - DeviceName: helper.StringToPtr("Name2"), - MemoryMiB: helper.Uint64ToPtr(11), - PowerW: helper.UintToPtr(100), - BAR1MiB: helper.Uint64ToPtr(256), - }, - PCIBusID: "pciBusID2", - PCIBandwidthMBPerS: helper.UintToPtr(1), - CoresClockMHz: helper.UintToPtr(1), - MemoryClockMHz: helper.UintToPtr(1), - DisplayState: "Enabled", - PersistenceMode: "Enabled", - }, - { - DeviceData: &nvml.DeviceData{ - UUID: "3", - DeviceName: helper.StringToPtr("Name3"), - MemoryMiB: helper.Uint64ToPtr(12), - PowerW: helper.UintToPtr(100), - BAR1MiB: helper.Uint64ToPtr(256), - }, - 
PCIBusID: "pciBusID3", - PCIBandwidthMBPerS: helper.UintToPtr(1), - CoresClockMHz: helper.UintToPtr(1), - MemoryClockMHz: helper.UintToPtr(1), - DisplayState: "Enabled", - PersistenceMode: "Enabled", - }, - }, - }, - }, - logger: hclog.NewNullLogger(), - }, - ExpectedWriteToChannel: &device.FingerprintResponse{ - Devices: []*device.DeviceGroup{ - { - Vendor: vendor, - Type: deviceType, - Name: "Name1", - Devices: []*device.Device{ - { - ID: "1", - Healthy: true, - HwLocality: &device.DeviceLocality{ - PciBusID: "pciBusID1", - }, - }, - }, - Attributes: map[string]*structs.Attribute{ - MemoryAttr: { - Int: helper.Int64ToPtr(10), - Unit: structs.UnitMiB, - }, - PowerAttr: { - Int: helper.Int64ToPtr(100), - Unit: structs.UnitW, - }, - BAR1Attr: { - Int: helper.Int64ToPtr(256), - Unit: structs.UnitMiB, - }, - PCIBandwidthAttr: { - Int: helper.Int64ToPtr(1), - Unit: structs.UnitMBPerS, - }, - CoresClockAttr: { - Int: helper.Int64ToPtr(1), - Unit: structs.UnitMHz, - }, - MemoryClockAttr: { - Int: helper.Int64ToPtr(1), - Unit: structs.UnitMHz, - }, - DisplayStateAttr: { - String: helper.StringToPtr("Enabled"), - }, - PersistenceModeAttr: { - String: helper.StringToPtr("Enabled"), - }, - DriverVersionAttr: { - String: helper.StringToPtr("1"), - }, - }, - }, - { - Vendor: vendor, - Type: deviceType, - Name: "Name2", - Devices: []*device.Device{ - { - ID: "2", - Healthy: true, - HwLocality: &device.DeviceLocality{ - PciBusID: "pciBusID2", - }, - }, - }, - Attributes: map[string]*structs.Attribute{ - MemoryAttr: { - Int: helper.Int64ToPtr(11), - Unit: structs.UnitMiB, - }, - PowerAttr: { - Int: helper.Int64ToPtr(100), - Unit: structs.UnitW, - }, - BAR1Attr: { - Int: helper.Int64ToPtr(256), - Unit: structs.UnitMiB, - }, - PCIBandwidthAttr: { - Int: helper.Int64ToPtr(1), - Unit: structs.UnitMBPerS, - }, - CoresClockAttr: { - Int: helper.Int64ToPtr(1), - Unit: structs.UnitMHz, - }, - MemoryClockAttr: { - Int: helper.Int64ToPtr(1), - Unit: structs.UnitMHz, - }, - 
DisplayStateAttr: { - String: helper.StringToPtr("Enabled"), - }, - PersistenceModeAttr: { - String: helper.StringToPtr("Enabled"), - }, - DriverVersionAttr: { - String: helper.StringToPtr("1"), - }, - }, - }, - { - Vendor: vendor, - Type: deviceType, - Name: "Name3", - Devices: []*device.Device{ - { - ID: "3", - Healthy: true, - HwLocality: &device.DeviceLocality{ - PciBusID: "pciBusID3", - }, - }, - }, - Attributes: map[string]*structs.Attribute{ - MemoryAttr: { - Int: helper.Int64ToPtr(12), - Unit: structs.UnitMiB, - }, - PowerAttr: { - Int: helper.Int64ToPtr(100), - Unit: structs.UnitW, - }, - BAR1Attr: { - Int: helper.Int64ToPtr(256), - Unit: structs.UnitMiB, - }, - PCIBandwidthAttr: { - Int: helper.Int64ToPtr(1), - Unit: structs.UnitMBPerS, - }, - CoresClockAttr: { - Int: helper.Int64ToPtr(1), - Unit: structs.UnitMHz, - }, - MemoryClockAttr: { - Int: helper.Int64ToPtr(1), - Unit: structs.UnitMHz, - }, - DisplayStateAttr: { - String: helper.StringToPtr("Enabled"), - }, - PersistenceModeAttr: { - String: helper.StringToPtr("Enabled"), - }, - DriverVersionAttr: { - String: helper.StringToPtr("1"), - }, - }, - }, - }, - }, - }, - { - Name: "Check devices are split to multiple device groups 2", - Device: &NvidiaDevice{ - nvmlClient: &MockNvmlClient{ - FingerprintResponseReturned: &nvml.FingerprintData{ - DriverVersion: "1", - Devices: []*nvml.FingerprintDeviceData{ - { - DeviceData: &nvml.DeviceData{ - UUID: "1", - DeviceName: helper.StringToPtr("Name1"), - MemoryMiB: helper.Uint64ToPtr(10), - PowerW: helper.UintToPtr(100), - BAR1MiB: helper.Uint64ToPtr(256), - }, - PCIBusID: "pciBusID1", - PCIBandwidthMBPerS: helper.UintToPtr(1), - CoresClockMHz: helper.UintToPtr(1), - MemoryClockMHz: helper.UintToPtr(1), - DisplayState: "Enabled", - PersistenceMode: "Enabled", - }, - { - DeviceData: &nvml.DeviceData{ - UUID: "2", - DeviceName: helper.StringToPtr("Name2"), - MemoryMiB: helper.Uint64ToPtr(11), - PowerW: helper.UintToPtr(100), - BAR1MiB: helper.Uint64ToPtr(256), - 
}, - PCIBusID: "pciBusID2", - PCIBandwidthMBPerS: helper.UintToPtr(1), - CoresClockMHz: helper.UintToPtr(1), - MemoryClockMHz: helper.UintToPtr(1), - DisplayState: "Enabled", - PersistenceMode: "Enabled", - }, - { - DeviceData: &nvml.DeviceData{ - UUID: "3", - DeviceName: helper.StringToPtr("Name2"), - MemoryMiB: helper.Uint64ToPtr(12), - PowerW: helper.UintToPtr(100), - BAR1MiB: helper.Uint64ToPtr(256), - }, - PCIBusID: "pciBusID3", - PCIBandwidthMBPerS: helper.UintToPtr(1), - CoresClockMHz: helper.UintToPtr(1), - MemoryClockMHz: helper.UintToPtr(1), - DisplayState: "Enabled", - PersistenceMode: "Enabled", - }, - }, - }, - }, - logger: hclog.NewNullLogger(), - }, - ExpectedWriteToChannel: &device.FingerprintResponse{ - Devices: []*device.DeviceGroup{ - { - Vendor: vendor, - Type: deviceType, - Name: "Name1", - Devices: []*device.Device{ - { - ID: "1", - Healthy: true, - HwLocality: &device.DeviceLocality{ - PciBusID: "pciBusID1", - }, - }, - }, - Attributes: map[string]*structs.Attribute{ - MemoryAttr: { - Int: helper.Int64ToPtr(10), - Unit: structs.UnitMiB, - }, - PowerAttr: { - Int: helper.Int64ToPtr(100), - Unit: structs.UnitW, - }, - BAR1Attr: { - Int: helper.Int64ToPtr(256), - Unit: structs.UnitMiB, - }, - PCIBandwidthAttr: { - Int: helper.Int64ToPtr(1), - Unit: structs.UnitMBPerS, - }, - CoresClockAttr: { - Int: helper.Int64ToPtr(1), - Unit: structs.UnitMHz, - }, - MemoryClockAttr: { - Int: helper.Int64ToPtr(1), - Unit: structs.UnitMHz, - }, - DisplayStateAttr: { - String: helper.StringToPtr("Enabled"), - }, - PersistenceModeAttr: { - String: helper.StringToPtr("Enabled"), - }, - DriverVersionAttr: { - String: helper.StringToPtr("1"), - }, - }, - }, - { - Vendor: vendor, - Type: deviceType, - Name: "Name2", - Devices: []*device.Device{ - { - ID: "2", - Healthy: true, - HwLocality: &device.DeviceLocality{ - PciBusID: "pciBusID2", - }, - }, - { - ID: "3", - Healthy: true, - HwLocality: &device.DeviceLocality{ - PciBusID: "pciBusID3", - }, - }, - }, - 
Attributes: map[string]*structs.Attribute{ - MemoryAttr: { - Int: helper.Int64ToPtr(11), - Unit: structs.UnitMiB, - }, - PowerAttr: { - Int: helper.Int64ToPtr(100), - Unit: structs.UnitW, - }, - BAR1Attr: { - Int: helper.Int64ToPtr(256), - Unit: structs.UnitMiB, - }, - PCIBandwidthAttr: { - Int: helper.Int64ToPtr(1), - Unit: structs.UnitMBPerS, - }, - CoresClockAttr: { - Int: helper.Int64ToPtr(1), - Unit: structs.UnitMHz, - }, - MemoryClockAttr: { - Int: helper.Int64ToPtr(1), - Unit: structs.UnitMHz, - }, - DisplayStateAttr: { - String: helper.StringToPtr("Enabled"), - }, - PersistenceModeAttr: { - String: helper.StringToPtr("Enabled"), - }, - DriverVersionAttr: { - String: helper.StringToPtr("1"), - }, - }, - }, - }, - }, - }, - } { - t.Run(testCase.Name, func(t *testing.T) { - channel := make(chan *device.FingerprintResponse, 1) - testCase.Device.writeFingerprintToChannel(channel) - actualResult := <-channel - // writeFingerprintToChannel iterates over map keys - // and insterts results to an array, so order of elements in output array - // may be different - // actualResult, expectedResult arrays has to be sorted firsted - sort.Slice(actualResult.Devices, func(i, j int) bool { - return actualResult.Devices[i].Name < actualResult.Devices[j].Name - }) - sort.Slice(testCase.ExpectedWriteToChannel.Devices, func(i, j int) bool { - return testCase.ExpectedWriteToChannel.Devices[i].Name < testCase.ExpectedWriteToChannel.Devices[j].Name - }) - require.Equal(t, testCase.ExpectedWriteToChannel, actualResult) - }) - } -} - -// Test if nonworking driver returns empty fingerprint data -func TestFingerprint(t *testing.T) { - for _, testCase := range []struct { - Name string - Device *NvidiaDevice - ExpectedWriteToChannel *device.FingerprintResponse - }{ - { - Name: "Check that working driver returns valid fingeprint data", - Device: &NvidiaDevice{ - initErr: nil, - nvmlClient: &MockNvmlClient{ - FingerprintResponseReturned: &nvml.FingerprintData{ - DriverVersion: "1", - 
Devices: []*nvml.FingerprintDeviceData{ - { - DeviceData: &nvml.DeviceData{ - UUID: "1", - DeviceName: helper.StringToPtr("Name1"), - MemoryMiB: helper.Uint64ToPtr(10), - PowerW: helper.UintToPtr(100), - BAR1MiB: helper.Uint64ToPtr(256), - }, - PCIBusID: "pciBusID1", - PCIBandwidthMBPerS: helper.UintToPtr(1), - CoresClockMHz: helper.UintToPtr(1), - MemoryClockMHz: helper.UintToPtr(1), - DisplayState: "Enabled", - PersistenceMode: "Enabled", - }, - { - DeviceData: &nvml.DeviceData{ - UUID: "2", - DeviceName: helper.StringToPtr("Name1"), - MemoryMiB: helper.Uint64ToPtr(10), - PowerW: helper.UintToPtr(100), - BAR1MiB: helper.Uint64ToPtr(256), - }, - PCIBusID: "pciBusID2", - PCIBandwidthMBPerS: helper.UintToPtr(1), - CoresClockMHz: helper.UintToPtr(1), - MemoryClockMHz: helper.UintToPtr(1), - DisplayState: "Enabled", - PersistenceMode: "Enabled", - }, - { - DeviceData: &nvml.DeviceData{ - UUID: "3", - DeviceName: helper.StringToPtr("Name1"), - MemoryMiB: helper.Uint64ToPtr(10), - PowerW: helper.UintToPtr(100), - BAR1MiB: helper.Uint64ToPtr(256), - }, - PCIBusID: "pciBusID3", - PCIBandwidthMBPerS: helper.UintToPtr(1), - CoresClockMHz: helper.UintToPtr(1), - MemoryClockMHz: helper.UintToPtr(1), - DisplayState: "Enabled", - PersistenceMode: "Enabled", - }, - }, - }, - }, - logger: hclog.NewNullLogger(), - }, - ExpectedWriteToChannel: &device.FingerprintResponse{ - Devices: []*device.DeviceGroup{ - { - Vendor: vendor, - Type: deviceType, - Name: "Name1", - Devices: []*device.Device{ - { - ID: "1", - Healthy: true, - HwLocality: &device.DeviceLocality{ - PciBusID: "pciBusID1", - }, - }, - { - ID: "2", - Healthy: true, - HwLocality: &device.DeviceLocality{ - PciBusID: "pciBusID2", - }, - }, - { - ID: "3", - Healthy: true, - HwLocality: &device.DeviceLocality{ - PciBusID: "pciBusID3", - }, - }, - }, - Attributes: map[string]*structs.Attribute{ - MemoryAttr: { - Int: helper.Int64ToPtr(10), - Unit: structs.UnitMiB, - }, - PowerAttr: { - Int: helper.Int64ToPtr(100), - Unit: 
structs.UnitW, - }, - BAR1Attr: { - Int: helper.Int64ToPtr(256), - Unit: structs.UnitMiB, - }, - PCIBandwidthAttr: { - Int: helper.Int64ToPtr(1), - Unit: structs.UnitMBPerS, - }, - CoresClockAttr: { - Int: helper.Int64ToPtr(1), - Unit: structs.UnitMHz, - }, - MemoryClockAttr: { - Int: helper.Int64ToPtr(1), - Unit: structs.UnitMHz, - }, - DisplayStateAttr: { - String: helper.StringToPtr("Enabled"), - }, - PersistenceModeAttr: { - String: helper.StringToPtr("Enabled"), - }, - DriverVersionAttr: { - String: helper.StringToPtr("1"), - }, - }, - }, - }, - }, - }, - { - Name: "Check that not working driver returns error fingeprint data", - Device: &NvidiaDevice{ - initErr: errors.New("foo"), - nvmlClient: &MockNvmlClient{ - FingerprintResponseReturned: &nvml.FingerprintData{ - DriverVersion: "1", - Devices: []*nvml.FingerprintDeviceData{ - { - DeviceData: &nvml.DeviceData{ - UUID: "1", - DeviceName: helper.StringToPtr("Name1"), - MemoryMiB: helper.Uint64ToPtr(10), - }, - }, - { - DeviceData: &nvml.DeviceData{ - UUID: "2", - DeviceName: helper.StringToPtr("Name1"), - MemoryMiB: helper.Uint64ToPtr(10), - }, - }, - { - DeviceData: &nvml.DeviceData{ - UUID: "3", - DeviceName: helper.StringToPtr("Name1"), - MemoryMiB: helper.Uint64ToPtr(10), - }, - }, - }, - }, - }, - logger: hclog.NewNullLogger(), - }, - ExpectedWriteToChannel: &device.FingerprintResponse{ - Error: errors.New("foo"), - }, - }, - } { - t.Run(testCase.Name, func(t *testing.T) { - outCh := make(chan *device.FingerprintResponse) - ctx, cancel := context.WithCancel(context.Background()) - go testCase.Device.fingerprint(ctx, outCh) - result := <-outCh - cancel() - require.New(t).Equal(result, testCase.ExpectedWriteToChannel) - }) - } -} diff --git a/devices/gpu/nvidia/nvml/client.go b/devices/gpu/nvidia/nvml/client.go deleted file mode 100644 index d18dcbe1a9f6..000000000000 --- a/devices/gpu/nvidia/nvml/client.go +++ /dev/null @@ -1,194 +0,0 @@ -package nvml - -import ( - "fmt" -) - -// DeviceData represents 
common fields for Nvidia device -type DeviceData struct { - UUID string - DeviceName *string - MemoryMiB *uint64 - PowerW *uint - BAR1MiB *uint64 -} - -// FingerprintDeviceData is a superset of DeviceData -// it describes device specific fields returned from -// nvml queries during fingerprinting call -type FingerprintDeviceData struct { - *DeviceData - PCIBandwidthMBPerS *uint - CoresClockMHz *uint - MemoryClockMHz *uint - DisplayState string - PersistenceMode string - PCIBusID string -} - -// FingerprintData represets attributes of driver/devices -type FingerprintData struct { - Devices []*FingerprintDeviceData - DriverVersion string -} - -// StatsData is a superset of DeviceData -// it represents statistics data returned for every Nvidia device -type StatsData struct { - *DeviceData - PowerUsageW *uint - GPUUtilization *uint - MemoryUtilization *uint - EncoderUtilization *uint - DecoderUtilization *uint - TemperatureC *uint - UsedMemoryMiB *uint64 - BAR1UsedMiB *uint64 - ECCErrorsL1Cache *uint64 - ECCErrorsL2Cache *uint64 - ECCErrorsDevice *uint64 -} - -// NvmlClient describes how users would use nvml library -type NvmlClient interface { - GetFingerprintData() (*FingerprintData, error) - GetStatsData() ([]*StatsData, error) -} - -// nvmlClient implements NvmlClient -// Users of this lib are expected to use this struct via NewNvmlClient func -type nvmlClient struct { - driver NvmlDriver -} - -// NewNvmlClient function creates new nvmlClient with real -// NvmlDriver implementation. 
Also, this func initializes NvmlDriver -func NewNvmlClient() (*nvmlClient, error) { - driver := &nvmlDriver{} - err := driver.Initialize() - if err != nil { - return nil, err - } - return &nvmlClient{ - driver: driver, - }, nil -} - -// GetFingerprintData returns FingerprintData for available Nvidia devices -func (c *nvmlClient) GetFingerprintData() (*FingerprintData, error) { - /* - nvml fields to be fingerprinted # nvml_library_call - 1 - Driver Version # nvmlSystemGetDriverVersion - 2 - Product Name # nvmlDeviceGetName - 3 - GPU UUID # nvmlDeviceGetUUID - 4 - Total Memory # nvmlDeviceGetMemoryInfo - 5 - Power # nvmlDeviceGetPowerManagementLimit - 6 - PCIBusID # nvmlDeviceGetPciInfo - 7 - BAR1 Memory # nvmlDeviceGetBAR1MemoryInfo( - 8 - PCI Bandwidth - 9 - Memory, Cores Clock # nvmlDeviceGetMaxClockInfo - 10 - Display Mode # nvmlDeviceGetDisplayMode - 11 - Persistence Mode # nvmlDeviceGetPersistenceMode - */ - - // Assumed that this method is called with receiver retrieved from - // NewNvmlClient - // because this method handles initialization of NVML library - - driverVersion, err := c.driver.SystemDriverVersion() - if err != nil { - return nil, fmt.Errorf("nvidia nvml SystemDriverVersion() error: %v\n", err) - } - - numDevices, err := c.driver.DeviceCount() - if err != nil { - return nil, fmt.Errorf("nvidia nvml DeviceCount() error: %v\n", err) - } - - allNvidiaGPUResources := make([]*FingerprintDeviceData, numDevices) - - for i := 0; i < int(numDevices); i++ { - deviceInfo, err := c.driver.DeviceInfoByIndex(uint(i)) - if err != nil { - return nil, fmt.Errorf("nvidia nvml DeviceInfoByIndex() error: %v\n", err) - } - - allNvidiaGPUResources[i] = &FingerprintDeviceData{ - DeviceData: &DeviceData{ - DeviceName: deviceInfo.Name, - UUID: deviceInfo.UUID, - MemoryMiB: deviceInfo.MemoryMiB, - PowerW: deviceInfo.PowerW, - BAR1MiB: deviceInfo.BAR1MiB, - }, - PCIBandwidthMBPerS: deviceInfo.PCIBandwidthMBPerS, - CoresClockMHz: deviceInfo.CoresClockMHz, - MemoryClockMHz: 
deviceInfo.MemoryClockMHz, - DisplayState: deviceInfo.DisplayState, - PersistenceMode: deviceInfo.PersistenceMode, - PCIBusID: deviceInfo.PCIBusID, - } - } - return &FingerprintData{ - Devices: allNvidiaGPUResources, - DriverVersion: driverVersion, - }, nil -} - -// GetStatsData returns statistics data for all devices on this machine -func (c *nvmlClient) GetStatsData() ([]*StatsData, error) { - /* - nvml fields to be reported to stats api # nvml_library_call - 1 - Used Memory # nvmlDeviceGetMemoryInfo - 2 - Utilization of GPU # nvmlDeviceGetUtilizationRates - 3 - Utilization of Memory # nvmlDeviceGetUtilizationRates - 4 - Utilization of Decoder # nvmlDeviceGetDecoderUtilization - 5 - Utilization of Encoder # nvmlDeviceGetEncoderUtilization - 6 - Current GPU Temperature # nvmlDeviceGetTemperature - 7 - Power Draw # nvmlDeviceGetPowerUsage - 8 - BAR1 Used memory # nvmlDeviceGetBAR1MemoryInfo - 9 - ECC Errors on requesting L1Cache # nvmlDeviceGetMemoryErrorCounter - 10 - ECC Errors on requesting L2Cache # nvmlDeviceGetMemoryErrorCounter - 11 - ECC Errors on requesting Device memory # nvmlDeviceGetMemoryErrorCounter - */ - - // Assumed that this method is called with receiver retrieved from - // NewNvmlClient - // because this method handles initialization of NVML library - - numDevices, err := c.driver.DeviceCount() - if err != nil { - return nil, fmt.Errorf("nvidia nvml DeviceCount() error: %v\n", err) - } - - allNvidiaGPUStats := make([]*StatsData, numDevices) - - for i := 0; i < int(numDevices); i++ { - deviceInfo, deviceStatus, err := c.driver.DeviceInfoAndStatusByIndex(uint(i)) - if err != nil { - return nil, fmt.Errorf("nvidia nvml DeviceInfoAndStatusByIndex() error: %v\n", err) - } - - allNvidiaGPUStats[i] = &StatsData{ - DeviceData: &DeviceData{ - DeviceName: deviceInfo.Name, - UUID: deviceInfo.UUID, - MemoryMiB: deviceInfo.MemoryMiB, - PowerW: deviceInfo.PowerW, - BAR1MiB: deviceInfo.BAR1MiB, - }, - PowerUsageW: deviceStatus.PowerUsageW, - GPUUtilization: 
deviceStatus.GPUUtilization, - MemoryUtilization: deviceStatus.MemoryUtilization, - EncoderUtilization: deviceStatus.EncoderUtilization, - DecoderUtilization: deviceStatus.DecoderUtilization, - TemperatureC: deviceStatus.TemperatureC, - UsedMemoryMiB: deviceStatus.UsedMemoryMiB, - BAR1UsedMiB: deviceStatus.BAR1UsedMiB, - ECCErrorsL1Cache: deviceStatus.ECCErrorsL1Cache, - ECCErrorsL2Cache: deviceStatus.ECCErrorsL2Cache, - ECCErrorsDevice: deviceStatus.ECCErrorsDevice, - } - } - return allNvidiaGPUStats, nil -} diff --git a/devices/gpu/nvidia/nvml/client_test.go b/devices/gpu/nvidia/nvml/client_test.go deleted file mode 100644 index 23731f7b052e..000000000000 --- a/devices/gpu/nvidia/nvml/client_test.go +++ /dev/null @@ -1,399 +0,0 @@ -package nvml - -import ( - "errors" - "testing" - - "github.com/hashicorp/nomad/helper" - "github.com/stretchr/testify/require" -) - -type MockNVMLDriver struct { - systemDriverCallSuccessful bool - deviceCountCallSuccessful bool - deviceInfoByIndexCallSuccessful bool - deviceInfoAndStatusByIndexCallSuccessful bool - driverVersion string - devices []*DeviceInfo - deviceStatus []*DeviceStatus -} - -func (m *MockNVMLDriver) Initialize() error { - return nil -} - -func (m *MockNVMLDriver) Shutdown() error { - return nil -} - -func (m *MockNVMLDriver) SystemDriverVersion() (string, error) { - if !m.systemDriverCallSuccessful { - return "", errors.New("failed to get system driver") - } - return m.driverVersion, nil -} - -func (m *MockNVMLDriver) DeviceCount() (uint, error) { - if !m.deviceCountCallSuccessful { - return 0, errors.New("failed to get device length") - } - return uint(len(m.devices)), nil -} - -func (m *MockNVMLDriver) DeviceInfoByIndex(index uint) (*DeviceInfo, error) { - if index >= uint(len(m.devices)) { - return nil, errors.New("index is out of range") - } - if !m.deviceInfoByIndexCallSuccessful { - return nil, errors.New("failed to get device info by index") - } - return m.devices[index], nil -} - -func (m *MockNVMLDriver) 
DeviceInfoAndStatusByIndex(index uint) (*DeviceInfo, *DeviceStatus, error) { - if index >= uint(len(m.devices)) || index >= uint(len(m.deviceStatus)) { - return nil, nil, errors.New("index is out of range") - } - if !m.deviceInfoAndStatusByIndexCallSuccessful { - return nil, nil, errors.New("failed to get device info and status by index") - } - return m.devices[index], m.deviceStatus[index], nil -} - -func TestGetFingerprintDataFromNVML(t *testing.T) { - for _, testCase := range []struct { - Name string - DriverConfiguration *MockNVMLDriver - ExpectedError bool - ExpectedResult *FingerprintData - }{ - { - Name: "fail on systemDriverCallSuccessful", - ExpectedError: true, - ExpectedResult: nil, - DriverConfiguration: &MockNVMLDriver{ - systemDriverCallSuccessful: false, - deviceCountCallSuccessful: true, - deviceInfoByIndexCallSuccessful: true, - }, - }, - { - Name: "fail on deviceCountCallSuccessful", - ExpectedError: true, - ExpectedResult: nil, - DriverConfiguration: &MockNVMLDriver{ - systemDriverCallSuccessful: true, - deviceCountCallSuccessful: false, - deviceInfoByIndexCallSuccessful: true, - }, - }, - { - Name: "fail on deviceInfoByIndexCall", - ExpectedError: true, - ExpectedResult: nil, - DriverConfiguration: &MockNVMLDriver{ - systemDriverCallSuccessful: true, - deviceCountCallSuccessful: true, - deviceInfoByIndexCallSuccessful: false, - devices: []*DeviceInfo{ - { - UUID: "UUID1", - Name: helper.StringToPtr("ModelName1"), - MemoryMiB: helper.Uint64ToPtr(16), - PCIBusID: "busId", - PowerW: helper.UintToPtr(100), - BAR1MiB: helper.Uint64ToPtr(100), - PCIBandwidthMBPerS: helper.UintToPtr(100), - CoresClockMHz: helper.UintToPtr(100), - MemoryClockMHz: helper.UintToPtr(100), - }, { - UUID: "UUID2", - Name: helper.StringToPtr("ModelName2"), - MemoryMiB: helper.Uint64ToPtr(8), - PCIBusID: "busId", - PowerW: helper.UintToPtr(100), - BAR1MiB: helper.Uint64ToPtr(100), - PCIBandwidthMBPerS: helper.UintToPtr(100), - CoresClockMHz: helper.UintToPtr(100), - 
MemoryClockMHz: helper.UintToPtr(100), - }, - }, - }, - }, - { - Name: "successful outcome", - ExpectedError: false, - ExpectedResult: &FingerprintData{ - DriverVersion: "driverVersion", - Devices: []*FingerprintDeviceData{ - { - DeviceData: &DeviceData{ - DeviceName: helper.StringToPtr("ModelName1"), - UUID: "UUID1", - MemoryMiB: helper.Uint64ToPtr(16), - PowerW: helper.UintToPtr(100), - BAR1MiB: helper.Uint64ToPtr(100), - }, - PCIBusID: "busId1", - PCIBandwidthMBPerS: helper.UintToPtr(100), - CoresClockMHz: helper.UintToPtr(100), - MemoryClockMHz: helper.UintToPtr(100), - DisplayState: "Enabled", - PersistenceMode: "Enabled", - }, { - DeviceData: &DeviceData{ - DeviceName: helper.StringToPtr("ModelName2"), - UUID: "UUID2", - MemoryMiB: helper.Uint64ToPtr(8), - PowerW: helper.UintToPtr(200), - BAR1MiB: helper.Uint64ToPtr(200), - }, - PCIBusID: "busId2", - PCIBandwidthMBPerS: helper.UintToPtr(200), - CoresClockMHz: helper.UintToPtr(200), - MemoryClockMHz: helper.UintToPtr(200), - DisplayState: "Enabled", - PersistenceMode: "Enabled", - }, - }, - }, - DriverConfiguration: &MockNVMLDriver{ - systemDriverCallSuccessful: true, - deviceCountCallSuccessful: true, - deviceInfoByIndexCallSuccessful: true, - driverVersion: "driverVersion", - devices: []*DeviceInfo{ - { - UUID: "UUID1", - Name: helper.StringToPtr("ModelName1"), - MemoryMiB: helper.Uint64ToPtr(16), - PCIBusID: "busId1", - PowerW: helper.UintToPtr(100), - BAR1MiB: helper.Uint64ToPtr(100), - PCIBandwidthMBPerS: helper.UintToPtr(100), - CoresClockMHz: helper.UintToPtr(100), - MemoryClockMHz: helper.UintToPtr(100), - DisplayState: "Enabled", - PersistenceMode: "Enabled", - }, { - UUID: "UUID2", - Name: helper.StringToPtr("ModelName2"), - MemoryMiB: helper.Uint64ToPtr(8), - PCIBusID: "busId2", - PowerW: helper.UintToPtr(200), - BAR1MiB: helper.Uint64ToPtr(200), - PCIBandwidthMBPerS: helper.UintToPtr(200), - CoresClockMHz: helper.UintToPtr(200), - MemoryClockMHz: helper.UintToPtr(200), - DisplayState: "Enabled", - 
PersistenceMode: "Enabled", - }, - }, - }, - }, - } { - cli := nvmlClient{driver: testCase.DriverConfiguration} - fingerprintData, err := cli.GetFingerprintData() - if testCase.ExpectedError && err == nil { - t.Errorf("case '%s' : expected Error, but didn't get one", testCase.Name) - } - if !testCase.ExpectedError && err != nil { - t.Errorf("case '%s' : unexpected Error '%v'", testCase.Name, err) - } - require.New(t).Equal(testCase.ExpectedResult, fingerprintData) - } -} - -func TestGetStatsDataFromNVML(t *testing.T) { - for _, testCase := range []struct { - Name string - DriverConfiguration *MockNVMLDriver - ExpectedError bool - ExpectedResult []*StatsData - }{ - { - Name: "fail on deviceCountCallSuccessful", - ExpectedError: true, - ExpectedResult: nil, - DriverConfiguration: &MockNVMLDriver{ - systemDriverCallSuccessful: true, - deviceCountCallSuccessful: false, - deviceInfoByIndexCallSuccessful: true, - deviceInfoAndStatusByIndexCallSuccessful: true, - }, - }, - { - Name: "fail on DeviceInfoAndStatusByIndex call", - ExpectedError: true, - ExpectedResult: nil, - DriverConfiguration: &MockNVMLDriver{ - systemDriverCallSuccessful: true, - deviceCountCallSuccessful: true, - deviceInfoAndStatusByIndexCallSuccessful: false, - devices: []*DeviceInfo{ - { - UUID: "UUID1", - Name: helper.StringToPtr("ModelName1"), - MemoryMiB: helper.Uint64ToPtr(16), - PCIBusID: "busId1", - PowerW: helper.UintToPtr(100), - BAR1MiB: helper.Uint64ToPtr(100), - PCIBandwidthMBPerS: helper.UintToPtr(100), - CoresClockMHz: helper.UintToPtr(100), - MemoryClockMHz: helper.UintToPtr(100), - }, { - UUID: "UUID2", - Name: helper.StringToPtr("ModelName2"), - MemoryMiB: helper.Uint64ToPtr(8), - PCIBusID: "busId2", - PowerW: helper.UintToPtr(200), - BAR1MiB: helper.Uint64ToPtr(200), - PCIBandwidthMBPerS: helper.UintToPtr(200), - CoresClockMHz: helper.UintToPtr(200), - MemoryClockMHz: helper.UintToPtr(200), - }, - }, - deviceStatus: []*DeviceStatus{ - { - TemperatureC: helper.UintToPtr(1), - 
GPUUtilization: helper.UintToPtr(1), - MemoryUtilization: helper.UintToPtr(1), - EncoderUtilization: helper.UintToPtr(1), - DecoderUtilization: helper.UintToPtr(1), - UsedMemoryMiB: helper.Uint64ToPtr(1), - ECCErrorsL1Cache: helper.Uint64ToPtr(1), - ECCErrorsL2Cache: helper.Uint64ToPtr(1), - ECCErrorsDevice: helper.Uint64ToPtr(1), - PowerUsageW: helper.UintToPtr(1), - BAR1UsedMiB: helper.Uint64ToPtr(1), - }, - { - TemperatureC: helper.UintToPtr(2), - GPUUtilization: helper.UintToPtr(2), - MemoryUtilization: helper.UintToPtr(2), - EncoderUtilization: helper.UintToPtr(2), - DecoderUtilization: helper.UintToPtr(2), - UsedMemoryMiB: helper.Uint64ToPtr(2), - ECCErrorsL1Cache: helper.Uint64ToPtr(2), - ECCErrorsL2Cache: helper.Uint64ToPtr(2), - ECCErrorsDevice: helper.Uint64ToPtr(2), - PowerUsageW: helper.UintToPtr(2), - BAR1UsedMiB: helper.Uint64ToPtr(2), - }, - }, - }, - }, - { - Name: "successful outcome", - ExpectedError: false, - ExpectedResult: []*StatsData{ - { - DeviceData: &DeviceData{ - DeviceName: helper.StringToPtr("ModelName1"), - UUID: "UUID1", - MemoryMiB: helper.Uint64ToPtr(16), - PowerW: helper.UintToPtr(100), - BAR1MiB: helper.Uint64ToPtr(100), - }, - TemperatureC: helper.UintToPtr(1), - GPUUtilization: helper.UintToPtr(1), - MemoryUtilization: helper.UintToPtr(1), - EncoderUtilization: helper.UintToPtr(1), - DecoderUtilization: helper.UintToPtr(1), - UsedMemoryMiB: helper.Uint64ToPtr(1), - ECCErrorsL1Cache: helper.Uint64ToPtr(1), - ECCErrorsL2Cache: helper.Uint64ToPtr(1), - ECCErrorsDevice: helper.Uint64ToPtr(1), - PowerUsageW: helper.UintToPtr(1), - BAR1UsedMiB: helper.Uint64ToPtr(1), - }, - { - DeviceData: &DeviceData{ - DeviceName: helper.StringToPtr("ModelName2"), - UUID: "UUID2", - MemoryMiB: helper.Uint64ToPtr(8), - PowerW: helper.UintToPtr(200), - BAR1MiB: helper.Uint64ToPtr(200), - }, - TemperatureC: helper.UintToPtr(2), - GPUUtilization: helper.UintToPtr(2), - MemoryUtilization: helper.UintToPtr(2), - EncoderUtilization: helper.UintToPtr(2), - 
DecoderUtilization: helper.UintToPtr(2), - UsedMemoryMiB: helper.Uint64ToPtr(2), - ECCErrorsL1Cache: helper.Uint64ToPtr(2), - ECCErrorsL2Cache: helper.Uint64ToPtr(2), - ECCErrorsDevice: helper.Uint64ToPtr(2), - PowerUsageW: helper.UintToPtr(2), - BAR1UsedMiB: helper.Uint64ToPtr(2), - }, - }, - DriverConfiguration: &MockNVMLDriver{ - deviceCountCallSuccessful: true, - deviceInfoByIndexCallSuccessful: true, - deviceInfoAndStatusByIndexCallSuccessful: true, - devices: []*DeviceInfo{ - { - UUID: "UUID1", - Name: helper.StringToPtr("ModelName1"), - MemoryMiB: helper.Uint64ToPtr(16), - PCIBusID: "busId1", - PowerW: helper.UintToPtr(100), - BAR1MiB: helper.Uint64ToPtr(100), - PCIBandwidthMBPerS: helper.UintToPtr(100), - CoresClockMHz: helper.UintToPtr(100), - MemoryClockMHz: helper.UintToPtr(100), - }, { - UUID: "UUID2", - Name: helper.StringToPtr("ModelName2"), - MemoryMiB: helper.Uint64ToPtr(8), - PCIBusID: "busId2", - PowerW: helper.UintToPtr(200), - BAR1MiB: helper.Uint64ToPtr(200), - PCIBandwidthMBPerS: helper.UintToPtr(200), - CoresClockMHz: helper.UintToPtr(200), - MemoryClockMHz: helper.UintToPtr(200), - }, - }, - deviceStatus: []*DeviceStatus{ - { - TemperatureC: helper.UintToPtr(1), - GPUUtilization: helper.UintToPtr(1), - MemoryUtilization: helper.UintToPtr(1), - EncoderUtilization: helper.UintToPtr(1), - DecoderUtilization: helper.UintToPtr(1), - UsedMemoryMiB: helper.Uint64ToPtr(1), - ECCErrorsL1Cache: helper.Uint64ToPtr(1), - ECCErrorsL2Cache: helper.Uint64ToPtr(1), - ECCErrorsDevice: helper.Uint64ToPtr(1), - PowerUsageW: helper.UintToPtr(1), - BAR1UsedMiB: helper.Uint64ToPtr(1), - }, - { - TemperatureC: helper.UintToPtr(2), - GPUUtilization: helper.UintToPtr(2), - MemoryUtilization: helper.UintToPtr(2), - EncoderUtilization: helper.UintToPtr(2), - DecoderUtilization: helper.UintToPtr(2), - UsedMemoryMiB: helper.Uint64ToPtr(2), - ECCErrorsL1Cache: helper.Uint64ToPtr(2), - ECCErrorsL2Cache: helper.Uint64ToPtr(2), - ECCErrorsDevice: helper.Uint64ToPtr(2), - 
PowerUsageW: helper.UintToPtr(2), - BAR1UsedMiB: helper.Uint64ToPtr(2), - }, - }, - }, - }, - } { - cli := nvmlClient{driver: testCase.DriverConfiguration} - statsData, err := cli.GetStatsData() - if testCase.ExpectedError && err == nil { - t.Errorf("case '%s' : expected Error, but didn't get one", testCase.Name) - } - if !testCase.ExpectedError && err != nil { - t.Errorf("case '%s' : unexpected Error '%v'", testCase.Name, err) - } - require.New(t).Equal(testCase.ExpectedResult, statsData) - } -} diff --git a/devices/gpu/nvidia/nvml/driver_default.go b/devices/gpu/nvidia/nvml/driver_default.go deleted file mode 100644 index e67efa22eeaf..000000000000 --- a/devices/gpu/nvidia/nvml/driver_default.go +++ /dev/null @@ -1,33 +0,0 @@ -// +build !linux - -package nvml - -// Initialize nvml library by locating nvml shared object file and calling ldopen -func (n *nvmlDriver) Initialize() error { - return UnavailableLib -} - -// Shutdown stops any further interaction with nvml -func (n *nvmlDriver) Shutdown() error { - return UnavailableLib -} - -// SystemDriverVersion returns installed driver version -func (n *nvmlDriver) SystemDriverVersion() (string, error) { - return "", UnavailableLib -} - -// DeviceCount reports number of available GPU devices -func (n *nvmlDriver) DeviceCount() (uint, error) { - return 0, UnavailableLib -} - -// DeviceInfoByIndex returns DeviceInfo for index GPU in system device list -func (n *nvmlDriver) DeviceInfoByIndex(index uint) (*DeviceInfo, error) { - return nil, UnavailableLib -} - -// DeviceInfoByIndex returns DeviceInfo and DeviceStatus for index GPU in system device list -func (n *nvmlDriver) DeviceInfoAndStatusByIndex(index uint) (*DeviceInfo, *DeviceStatus, error) { - return nil, nil, UnavailableLib -} diff --git a/devices/gpu/nvidia/nvml/driver_linux.go b/devices/gpu/nvidia/nvml/driver_linux.go deleted file mode 100644 index bdd777561bcf..000000000000 --- a/devices/gpu/nvidia/nvml/driver_linux.go +++ /dev/null @@ -1,85 +0,0 @@ -package 
nvml - -import ( - "github.com/NVIDIA/gpu-monitoring-tools/bindings/go/nvml" -) - -// Initialize nvml library by locating nvml shared object file and calling ldopen -func (n *nvmlDriver) Initialize() error { - return nvml.Init() -} - -// Shutdown stops any further interaction with nvml -func (n *nvmlDriver) Shutdown() error { - return nvml.Shutdown() -} - -// SystemDriverVersion returns installed driver version -func (n *nvmlDriver) SystemDriverVersion() (string, error) { - return nvml.GetDriverVersion() -} - -// DeviceCount reports number of available GPU devices -func (n *nvmlDriver) DeviceCount() (uint, error) { - return nvml.GetDeviceCount() -} - -// DeviceInfoByIndex returns DeviceInfo for index GPU in system device list -func (n *nvmlDriver) DeviceInfoByIndex(index uint) (*DeviceInfo, error) { - device, err := nvml.NewDevice(index) - if err != nil { - return nil, err - } - deviceMode, err := device.GetDeviceMode() - if err != nil { - return nil, err - } - return &DeviceInfo{ - UUID: device.UUID, - Name: device.Model, - MemoryMiB: device.Memory, - PowerW: device.Power, - BAR1MiB: device.PCI.BAR1, - PCIBandwidthMBPerS: device.PCI.Bandwidth, - PCIBusID: device.PCI.BusID, - CoresClockMHz: device.Clocks.Cores, - MemoryClockMHz: device.Clocks.Memory, - DisplayState: deviceMode.DisplayInfo.Mode.String(), - PersistenceMode: deviceMode.Persistence.String(), - }, nil -} - -// DeviceInfoByIndex returns DeviceInfo and DeviceStatus for index GPU in system device list -func (n *nvmlDriver) DeviceInfoAndStatusByIndex(index uint) (*DeviceInfo, *DeviceStatus, error) { - device, err := nvml.NewDevice(index) - if err != nil { - return nil, nil, err - } - status, err := device.Status() - if err != nil { - return nil, nil, err - } - return &DeviceInfo{ - UUID: device.UUID, - Name: device.Model, - MemoryMiB: device.Memory, - PowerW: device.Power, - BAR1MiB: device.PCI.BAR1, - PCIBandwidthMBPerS: device.PCI.Bandwidth, - PCIBusID: device.PCI.BusID, - CoresClockMHz: 
device.Clocks.Cores, - MemoryClockMHz: device.Clocks.Memory, - }, &DeviceStatus{ - TemperatureC: status.Temperature, - GPUUtilization: status.Utilization.GPU, - MemoryUtilization: status.Utilization.Memory, - EncoderUtilization: status.Utilization.Encoder, - DecoderUtilization: status.Utilization.Decoder, - UsedMemoryMiB: status.Memory.Global.Used, - ECCErrorsL1Cache: status.Memory.ECCErrors.L1Cache, - ECCErrorsL2Cache: status.Memory.ECCErrors.L2Cache, - ECCErrorsDevice: status.Memory.ECCErrors.Device, - PowerUsageW: status.Power, - BAR1UsedMiB: status.PCI.BAR1Used, - }, nil -} diff --git a/devices/gpu/nvidia/nvml/shared.go b/devices/gpu/nvidia/nvml/shared.go deleted file mode 100644 index a0bb04d22234..000000000000 --- a/devices/gpu/nvidia/nvml/shared.go +++ /dev/null @@ -1,61 +0,0 @@ -package nvml - -import "errors" - -var ( - // UnavailableLib is returned when the nvml library could not be loaded. - UnavailableLib = errors.New("could not load NVML library") -) - -// nvmlDriver implements NvmlDriver -// Users are required to call Initialize method before using any other methods -type nvmlDriver struct{} - -// NvmlDriver represents set of methods to query nvml library -type NvmlDriver interface { - Initialize() error - Shutdown() error - SystemDriverVersion() (string, error) - DeviceCount() (uint, error) - DeviceInfoByIndex(uint) (*DeviceInfo, error) - DeviceInfoAndStatusByIndex(uint) (*DeviceInfo, *DeviceStatus, error) -} - -// DeviceInfo represents nvml device data -// this struct is returned by NvmlDriver DeviceInfoByIndex and -// DeviceInfoAndStatusByIndex methods -type DeviceInfo struct { - // The following fields are guaranteed to be retrieved from nvml - UUID string - PCIBusID string - DisplayState string - PersistenceMode string - - // The following fields can be nil after call to nvml, because nvml was - // not able to retrieve this fields for specific nvidia card - Name *string - MemoryMiB *uint64 - PowerW *uint - BAR1MiB *uint64 - PCIBandwidthMBPerS 
*uint - CoresClockMHz *uint - MemoryClockMHz *uint -} - -// DeviceStatus represents nvml device status -// this struct is returned by NvmlDriver DeviceInfoAndStatusByIndex method -type DeviceStatus struct { - // The following fields can be nil after call to nvml, because nvml was - // not able to retrieve this fields for specific nvidia card - PowerUsageW *uint - TemperatureC *uint - GPUUtilization *uint // % - MemoryUtilization *uint // % - EncoderUtilization *uint // % - DecoderUtilization *uint // % - BAR1UsedMiB *uint64 - UsedMemoryMiB *uint64 - ECCErrorsL1Cache *uint64 - ECCErrorsL2Cache *uint64 - ECCErrorsDevice *uint64 -} diff --git a/devices/gpu/nvidia/stats.go b/devices/gpu/nvidia/stats.go deleted file mode 100644 index c6c447757916..000000000000 --- a/devices/gpu/nvidia/stats.go +++ /dev/null @@ -1,325 +0,0 @@ -package nvidia - -import ( - "context" - "time" - - "github.com/hashicorp/nomad/devices/gpu/nvidia/nvml" - "github.com/hashicorp/nomad/helper" - "github.com/hashicorp/nomad/plugins/device" - "github.com/hashicorp/nomad/plugins/shared/structs" -) - -const ( - // Attribute names for reporting stats output - PowerUsageAttr = "Power usage" - PowerUsageUnit = "W" - PowerUsageDesc = "Power usage for this GPU in watts and " + - "its associated circuitry (e.g. memory) / Maximum GPU Power" - GPUUtilizationAttr = "GPU utilization" - GPUUtilizationUnit = "%" - GPUUtilizationDesc = "Percent of time over the past sample period " + - "during which one or more kernels were executing on the GPU." 
- MemoryUtilizationAttr = "Memory utilization" - MemoryUtilizationUnit = "%" - MemoryUtilizationDesc = "Percentage of bandwidth used during the past sample period" - EncoderUtilizationAttr = "Encoder utilization" - EncoderUtilizationUnit = "%" - EncoderUtilizationDesc = "Percent of time over the past sample period " + - "during which GPU Encoder was used" - DecoderUtilizationAttr = "Decoder utilization" - DecoderUtilizationUnit = "%" - DecoderUtilizationDesc = "Percent of time over the past sample period " + - "during which GPU Decoder was used" - TemperatureAttr = "Temperature" - TemperatureUnit = "C" // Celsius degrees - TemperatureDesc = "Temperature of the Unit" - MemoryStateAttr = "Memory state" - MemoryStateUnit = "MiB" // Mebibytes - MemoryStateDesc = "UsedMemory / TotalMemory" - BAR1StateAttr = "BAR1 buffer state" - BAR1StateUnit = "MiB" // Mebibytes - BAR1StateDesc = "UsedBAR1 / TotalBAR1" - ECCErrorsL1CacheAttr = "ECC L1 errors" - ECCErrorsL1CacheUnit = "#" // number of errors - ECCErrorsL1CacheDesc = "Requested L1Cache error counter for the device" - ECCErrorsL2CacheAttr = "ECC L2 errors" - ECCErrorsL2CacheUnit = "#" // number of errors - ECCErrorsL2CacheDesc = "Requested L2Cache error counter for the device" - ECCErrorsDeviceAttr = "ECC memory errors" - ECCErrorsDeviceUnit = "#" // number of errors - ECCErrorsDeviceDesc = "Requested memory error counter for the device" -) - -// stats is the long running goroutine that streams device statistics -func (d *NvidiaDevice) stats(ctx context.Context, stats chan<- *device.StatsResponse, interval time.Duration) { - defer close(stats) - - if d.initErr != nil { - if d.initErr.Error() != nvml.UnavailableLib.Error() { - d.logger.Error("exiting stats due to problems with NVML loading", "error", d.initErr) - stats <- device.NewStatsError(d.initErr) - } - - return - } - - // Create a timer that will fire immediately for the first detection - ticker := time.NewTimer(0) - - for { - select { - case <-ctx.Done(): - return 
- case <-ticker.C: - ticker.Reset(interval) - } - - d.writeStatsToChannel(stats, time.Now()) - } -} - -// filterStatsByID accepts list of StatsData and set of IDs -// this function would return entries from StatsData with IDs found in the set -func filterStatsByID(stats []*nvml.StatsData, ids map[string]struct{}) []*nvml.StatsData { - var filteredStats []*nvml.StatsData - for _, statsItem := range stats { - if _, ok := ids[statsItem.UUID]; ok { - filteredStats = append(filteredStats, statsItem) - } - } - return filteredStats -} - -// writeStatsToChannel collects StatsData from NVML backend, groups StatsData -// by DeviceName attribute, populates DeviceGroupStats structure for every group -// and sends data over provided channel -func (d *NvidiaDevice) writeStatsToChannel(stats chan<- *device.StatsResponse, timestamp time.Time) { - statsData, err := d.nvmlClient.GetStatsData() - if err != nil { - d.logger.Error("failed to get nvidia stats", "error", err) - stats <- &device.StatsResponse{ - Error: err, - } - return - } - - // filter only stats from devices that are stored in NvidiaDevice struct - d.deviceLock.RLock() - statsData = filterStatsByID(statsData, d.devices) - d.deviceLock.RUnlock() - - // group stats by DeviceName struct field - statsListByDeviceName := make(map[string][]*nvml.StatsData) - for _, statsItem := range statsData { - deviceName := statsItem.DeviceName - if deviceName == nil { - // nvml driver was not able to detect device name. 
This kind - // of devices are placed to single group with 'notAvailable' name - notAvailableCopy := notAvailable - deviceName = &notAvailableCopy - } - - statsListByDeviceName[*deviceName] = append(statsListByDeviceName[*deviceName], statsItem) - } - - // place data device.DeviceGroupStats struct for every group of stats - deviceGroupsStats := make([]*device.DeviceGroupStats, 0, len(statsListByDeviceName)) - for groupName, groupStats := range statsListByDeviceName { - deviceGroupsStats = append(deviceGroupsStats, statsForGroup(groupName, groupStats, timestamp)) - } - - stats <- &device.StatsResponse{ - Groups: deviceGroupsStats, - } -} - -func newNotAvailableDeviceStats(unit, desc string) *structs.StatValue { - return &structs.StatValue{Unit: unit, Desc: desc, StringVal: helper.StringToPtr(notAvailable)} -} - -// statsForGroup is a helper function that populates device.DeviceGroupStats -// for given groupName with groupStats list -func statsForGroup(groupName string, groupStats []*nvml.StatsData, timestamp time.Time) *device.DeviceGroupStats { - instanceStats := make(map[string]*device.DeviceStats) - for _, statsItem := range groupStats { - instanceStats[statsItem.UUID] = statsForItem(statsItem, timestamp) - } - - return &device.DeviceGroupStats{ - Vendor: vendor, - Type: deviceType, - Name: groupName, - InstanceStats: instanceStats, - } -} - -// statsForItem is a helper function that populates device.DeviceStats for given -// nvml.StatsData -func statsForItem(statsItem *nvml.StatsData, timestamp time.Time) *device.DeviceStats { - // nvml.StatsData holds pointers to values that can be nil - // In case they are nil return stats with 'notAvailable' constant - var ( - powerUsageStat *structs.StatValue - GPUUtilizationStat *structs.StatValue - memoryUtilizationStat *structs.StatValue - encoderUtilizationStat *structs.StatValue - decoderUtilizationStat *structs.StatValue - temperatureStat *structs.StatValue - memoryStateStat *structs.StatValue - BAR1StateStat
*structs.StatValue - ECCErrorsL1CacheStat *structs.StatValue - ECCErrorsL2CacheStat *structs.StatValue - ECCErrorsDeviceStat *structs.StatValue - ) - - if statsItem.PowerUsageW == nil || statsItem.PowerW == nil { - powerUsageStat = newNotAvailableDeviceStats(PowerUsageUnit, PowerUsageDesc) - } else { - powerUsageStat = &structs.StatValue{ - Unit: PowerUsageUnit, - Desc: PowerUsageDesc, - IntNumeratorVal: helper.Int64ToPtr(int64(*statsItem.PowerUsageW)), - IntDenominatorVal: uintToInt64Ptr(statsItem.PowerW), - } - } - - if statsItem.GPUUtilization == nil { - GPUUtilizationStat = newNotAvailableDeviceStats(GPUUtilizationUnit, GPUUtilizationDesc) - } else { - GPUUtilizationStat = &structs.StatValue{ - Unit: GPUUtilizationUnit, - Desc: GPUUtilizationDesc, - IntNumeratorVal: uintToInt64Ptr(statsItem.GPUUtilization), - } - } - - if statsItem.MemoryUtilization == nil { - memoryUtilizationStat = newNotAvailableDeviceStats(MemoryUtilizationUnit, MemoryUtilizationDesc) - } else { - memoryUtilizationStat = &structs.StatValue{ - Unit: MemoryUtilizationUnit, - Desc: MemoryUtilizationDesc, - IntNumeratorVal: uintToInt64Ptr(statsItem.MemoryUtilization), - } - } - - if statsItem.EncoderUtilization == nil { - encoderUtilizationStat = newNotAvailableDeviceStats(EncoderUtilizationUnit, EncoderUtilizationDesc) - } else { - encoderUtilizationStat = &structs.StatValue{ - Unit: EncoderUtilizationUnit, - Desc: EncoderUtilizationDesc, - IntNumeratorVal: uintToInt64Ptr(statsItem.EncoderUtilization), - } - } - - if statsItem.DecoderUtilization == nil { - decoderUtilizationStat = newNotAvailableDeviceStats(DecoderUtilizationUnit, DecoderUtilizationDesc) - } else { - decoderUtilizationStat = &structs.StatValue{ - Unit: DecoderUtilizationUnit, - Desc: DecoderUtilizationDesc, - IntNumeratorVal: uintToInt64Ptr(statsItem.DecoderUtilization), - } - } - - if statsItem.TemperatureC == nil { - temperatureStat = newNotAvailableDeviceStats(TemperatureUnit, TemperatureDesc) - } else { - temperatureStat = 
&structs.StatValue{ - Unit: TemperatureUnit, - Desc: TemperatureDesc, - IntNumeratorVal: uintToInt64Ptr(statsItem.TemperatureC), - } - } - - if statsItem.UsedMemoryMiB == nil || statsItem.MemoryMiB == nil { - memoryStateStat = newNotAvailableDeviceStats(MemoryStateUnit, MemoryStateDesc) - } else { - memoryStateStat = &structs.StatValue{ - Unit: MemoryStateUnit, - Desc: MemoryStateDesc, - IntNumeratorVal: uint64ToInt64Ptr(statsItem.UsedMemoryMiB), - IntDenominatorVal: uint64ToInt64Ptr(statsItem.MemoryMiB), - } - } - - if statsItem.BAR1UsedMiB == nil || statsItem.BAR1MiB == nil { - BAR1StateStat = newNotAvailableDeviceStats(BAR1StateUnit, BAR1StateDesc) - } else { - BAR1StateStat = &structs.StatValue{ - Unit: BAR1StateUnit, - Desc: BAR1StateDesc, - IntNumeratorVal: uint64ToInt64Ptr(statsItem.BAR1UsedMiB), - IntDenominatorVal: uint64ToInt64Ptr(statsItem.BAR1MiB), - } - } - - if statsItem.ECCErrorsL1Cache == nil { - ECCErrorsL1CacheStat = newNotAvailableDeviceStats(ECCErrorsL1CacheUnit, ECCErrorsL1CacheDesc) - } else { - ECCErrorsL1CacheStat = &structs.StatValue{ - Unit: ECCErrorsL1CacheUnit, - Desc: ECCErrorsL1CacheDesc, - IntNumeratorVal: uint64ToInt64Ptr(statsItem.ECCErrorsL1Cache), - } - } - - if statsItem.ECCErrorsL2Cache == nil { - ECCErrorsL2CacheStat = newNotAvailableDeviceStats(ECCErrorsL2CacheUnit, ECCErrorsL2CacheDesc) - } else { - ECCErrorsL2CacheStat = &structs.StatValue{ - Unit: ECCErrorsL2CacheUnit, - Desc: ECCErrorsL2CacheDesc, - IntNumeratorVal: uint64ToInt64Ptr(statsItem.ECCErrorsL2Cache), - } - } - - if statsItem.ECCErrorsDevice == nil { - ECCErrorsDeviceStat = newNotAvailableDeviceStats(ECCErrorsDeviceUnit, ECCErrorsDeviceDesc) - } else { - ECCErrorsDeviceStat = &structs.StatValue{ - Unit: ECCErrorsDeviceUnit, - Desc: ECCErrorsDeviceDesc, - IntNumeratorVal: uint64ToInt64Ptr(statsItem.ECCErrorsDevice), - } - } - return &device.DeviceStats{ - Summary: memoryStateStat, - Stats: &structs.StatObject{ - Attributes: map[string]*structs.StatValue{ - 
PowerUsageAttr: powerUsageStat, - GPUUtilizationAttr: GPUUtilizationStat, - MemoryUtilizationAttr: memoryUtilizationStat, - EncoderUtilizationAttr: encoderUtilizationStat, - DecoderUtilizationAttr: decoderUtilizationStat, - TemperatureAttr: temperatureStat, - MemoryStateAttr: memoryStateStat, - BAR1StateAttr: BAR1StateStat, - ECCErrorsL1CacheAttr: ECCErrorsL1CacheStat, - ECCErrorsL2CacheAttr: ECCErrorsL2CacheStat, - ECCErrorsDeviceAttr: ECCErrorsDeviceStat, - }, - }, - Timestamp: timestamp, - } -} - -func uintToInt64Ptr(u *uint) *int64 { - if u == nil { - return nil - } - - v := int64(*u) - return &v -} - -func uint64ToInt64Ptr(u *uint64) *int64 { - if u == nil { - return nil - } - - v := int64(*u) - return &v -} diff --git a/devices/gpu/nvidia/stats_test.go b/devices/gpu/nvidia/stats_test.go deleted file mode 100644 index f6221e0f4801..000000000000 --- a/devices/gpu/nvidia/stats_test.go +++ /dev/null @@ -1,3041 +0,0 @@ -package nvidia - -import ( - "errors" - "sort" - "testing" - "time" - - hclog "github.com/hashicorp/go-hclog" - "github.com/hashicorp/nomad/devices/gpu/nvidia/nvml" - "github.com/hashicorp/nomad/helper" - "github.com/hashicorp/nomad/plugins/device" - "github.com/hashicorp/nomad/plugins/shared/structs" - "github.com/stretchr/testify/require" -) - -func TestFilterStatsByID(t *testing.T) { - for _, testCase := range []struct { - Name string - ProvidedStats []*nvml.StatsData - ProvidedIDs map[string]struct{} - ExpectedResult []*nvml.StatsData - }{ - { - Name: "All ids are in the map", - ProvidedStats: []*nvml.StatsData{ - { - DeviceData: &nvml.DeviceData{ - UUID: "UUID1", - DeviceName: helper.StringToPtr("DeviceName1"), - MemoryMiB: helper.Uint64ToPtr(1), - PowerW: helper.UintToPtr(2), - BAR1MiB: helper.Uint64ToPtr(256), - }, - PowerUsageW: helper.UintToPtr(1), - GPUUtilization: helper.UintToPtr(1), - MemoryUtilization: helper.UintToPtr(1), - EncoderUtilization: helper.UintToPtr(1), - DecoderUtilization: helper.UintToPtr(1), - TemperatureC: 
helper.UintToPtr(1), - UsedMemoryMiB: helper.Uint64ToPtr(1), - ECCErrorsL1Cache: helper.Uint64ToPtr(100), - ECCErrorsL2Cache: helper.Uint64ToPtr(100), - ECCErrorsDevice: helper.Uint64ToPtr(100), - }, - { - DeviceData: &nvml.DeviceData{ - UUID: "UUID2", - DeviceName: helper.StringToPtr("DeviceName1"), - MemoryMiB: helper.Uint64ToPtr(1), - PowerW: helper.UintToPtr(2), - BAR1MiB: helper.Uint64ToPtr(256), - }, - PowerUsageW: helper.UintToPtr(1), - GPUUtilization: helper.UintToPtr(1), - MemoryUtilization: helper.UintToPtr(1), - EncoderUtilization: helper.UintToPtr(1), - DecoderUtilization: helper.UintToPtr(1), - TemperatureC: helper.UintToPtr(1), - UsedMemoryMiB: helper.Uint64ToPtr(1), - ECCErrorsL1Cache: helper.Uint64ToPtr(100), - ECCErrorsL2Cache: helper.Uint64ToPtr(100), - ECCErrorsDevice: helper.Uint64ToPtr(100), - }, - { - DeviceData: &nvml.DeviceData{ - UUID: "UUID3", - DeviceName: helper.StringToPtr("DeviceName1"), - MemoryMiB: helper.Uint64ToPtr(1), - PowerW: helper.UintToPtr(2), - BAR1MiB: helper.Uint64ToPtr(256), - }, - PowerUsageW: helper.UintToPtr(1), - GPUUtilization: helper.UintToPtr(1), - MemoryUtilization: helper.UintToPtr(1), - EncoderUtilization: helper.UintToPtr(1), - DecoderUtilization: helper.UintToPtr(1), - TemperatureC: helper.UintToPtr(1), - UsedMemoryMiB: helper.Uint64ToPtr(1), - ECCErrorsL1Cache: helper.Uint64ToPtr(100), - ECCErrorsL2Cache: helper.Uint64ToPtr(100), - ECCErrorsDevice: helper.Uint64ToPtr(100), - }, - }, - ProvidedIDs: map[string]struct{}{ - "UUID1": {}, - "UUID2": {}, - "UUID3": {}, - }, - ExpectedResult: []*nvml.StatsData{ - { - DeviceData: &nvml.DeviceData{ - UUID: "UUID1", - DeviceName: helper.StringToPtr("DeviceName1"), - MemoryMiB: helper.Uint64ToPtr(1), - PowerW: helper.UintToPtr(2), - BAR1MiB: helper.Uint64ToPtr(256), - }, - PowerUsageW: helper.UintToPtr(1), - GPUUtilization: helper.UintToPtr(1), - MemoryUtilization: helper.UintToPtr(1), - EncoderUtilization: helper.UintToPtr(1), - DecoderUtilization: helper.UintToPtr(1), 
- TemperatureC: helper.UintToPtr(1), - UsedMemoryMiB: helper.Uint64ToPtr(1), - ECCErrorsL1Cache: helper.Uint64ToPtr(100), - ECCErrorsL2Cache: helper.Uint64ToPtr(100), - ECCErrorsDevice: helper.Uint64ToPtr(100), - }, - { - DeviceData: &nvml.DeviceData{ - UUID: "UUID2", - DeviceName: helper.StringToPtr("DeviceName1"), - MemoryMiB: helper.Uint64ToPtr(1), - PowerW: helper.UintToPtr(2), - BAR1MiB: helper.Uint64ToPtr(256), - }, - PowerUsageW: helper.UintToPtr(1), - GPUUtilization: helper.UintToPtr(1), - MemoryUtilization: helper.UintToPtr(1), - EncoderUtilization: helper.UintToPtr(1), - DecoderUtilization: helper.UintToPtr(1), - TemperatureC: helper.UintToPtr(1), - UsedMemoryMiB: helper.Uint64ToPtr(1), - ECCErrorsL1Cache: helper.Uint64ToPtr(100), - ECCErrorsL2Cache: helper.Uint64ToPtr(100), - ECCErrorsDevice: helper.Uint64ToPtr(100), - }, - { - DeviceData: &nvml.DeviceData{ - UUID: "UUID3", - DeviceName: helper.StringToPtr("DeviceName1"), - MemoryMiB: helper.Uint64ToPtr(1), - PowerW: helper.UintToPtr(2), - BAR1MiB: helper.Uint64ToPtr(256), - }, - PowerUsageW: helper.UintToPtr(1), - GPUUtilization: helper.UintToPtr(1), - MemoryUtilization: helper.UintToPtr(1), - EncoderUtilization: helper.UintToPtr(1), - DecoderUtilization: helper.UintToPtr(1), - TemperatureC: helper.UintToPtr(1), - UsedMemoryMiB: helper.Uint64ToPtr(1), - ECCErrorsL1Cache: helper.Uint64ToPtr(100), - ECCErrorsL2Cache: helper.Uint64ToPtr(100), - ECCErrorsDevice: helper.Uint64ToPtr(100), - }, - }, - }, - { - Name: "Odd are not provided in the map", - ProvidedStats: []*nvml.StatsData{ - { - DeviceData: &nvml.DeviceData{ - UUID: "UUID1", - DeviceName: helper.StringToPtr("DeviceName1"), - MemoryMiB: helper.Uint64ToPtr(1), - PowerW: helper.UintToPtr(2), - BAR1MiB: helper.Uint64ToPtr(256), - }, - PowerUsageW: helper.UintToPtr(1), - GPUUtilization: helper.UintToPtr(1), - MemoryUtilization: helper.UintToPtr(1), - EncoderUtilization: helper.UintToPtr(1), - DecoderUtilization: helper.UintToPtr(1), - TemperatureC: 
helper.UintToPtr(1), - UsedMemoryMiB: helper.Uint64ToPtr(1), - ECCErrorsL1Cache: helper.Uint64ToPtr(100), - ECCErrorsL2Cache: helper.Uint64ToPtr(100), - ECCErrorsDevice: helper.Uint64ToPtr(100), - }, - { - DeviceData: &nvml.DeviceData{ - UUID: "UUID2", - DeviceName: helper.StringToPtr("DeviceName1"), - MemoryMiB: helper.Uint64ToPtr(1), - PowerW: helper.UintToPtr(2), - BAR1MiB: helper.Uint64ToPtr(256), - }, - PowerUsageW: helper.UintToPtr(1), - GPUUtilization: helper.UintToPtr(1), - MemoryUtilization: helper.UintToPtr(1), - EncoderUtilization: helper.UintToPtr(1), - DecoderUtilization: helper.UintToPtr(1), - TemperatureC: helper.UintToPtr(1), - UsedMemoryMiB: helper.Uint64ToPtr(1), - ECCErrorsL1Cache: helper.Uint64ToPtr(100), - ECCErrorsL2Cache: helper.Uint64ToPtr(100), - ECCErrorsDevice: helper.Uint64ToPtr(100), - }, - { - DeviceData: &nvml.DeviceData{ - UUID: "UUID3", - DeviceName: helper.StringToPtr("DeviceName1"), - MemoryMiB: helper.Uint64ToPtr(1), - PowerW: helper.UintToPtr(2), - BAR1MiB: helper.Uint64ToPtr(256), - }, - PowerUsageW: helper.UintToPtr(1), - GPUUtilization: helper.UintToPtr(1), - MemoryUtilization: helper.UintToPtr(1), - EncoderUtilization: helper.UintToPtr(1), - DecoderUtilization: helper.UintToPtr(1), - TemperatureC: helper.UintToPtr(1), - UsedMemoryMiB: helper.Uint64ToPtr(1), - ECCErrorsL1Cache: helper.Uint64ToPtr(100), - ECCErrorsL2Cache: helper.Uint64ToPtr(100), - ECCErrorsDevice: helper.Uint64ToPtr(100), - }, - }, - ProvidedIDs: map[string]struct{}{ - "UUID2": {}, - }, - ExpectedResult: []*nvml.StatsData{ - { - DeviceData: &nvml.DeviceData{ - UUID: "UUID2", - DeviceName: helper.StringToPtr("DeviceName1"), - MemoryMiB: helper.Uint64ToPtr(1), - PowerW: helper.UintToPtr(2), - BAR1MiB: helper.Uint64ToPtr(256), - }, - PowerUsageW: helper.UintToPtr(1), - GPUUtilization: helper.UintToPtr(1), - MemoryUtilization: helper.UintToPtr(1), - EncoderUtilization: helper.UintToPtr(1), - DecoderUtilization: helper.UintToPtr(1), - TemperatureC: 
helper.UintToPtr(1), - UsedMemoryMiB: helper.Uint64ToPtr(1), - ECCErrorsL1Cache: helper.Uint64ToPtr(100), - ECCErrorsL2Cache: helper.Uint64ToPtr(100), - ECCErrorsDevice: helper.Uint64ToPtr(100), - }, - }, - }, - { - Name: "Even are not provided in the map", - ProvidedStats: []*nvml.StatsData{ - { - DeviceData: &nvml.DeviceData{ - UUID: "UUID1", - DeviceName: helper.StringToPtr("DeviceName1"), - MemoryMiB: helper.Uint64ToPtr(1), - PowerW: helper.UintToPtr(2), - BAR1MiB: helper.Uint64ToPtr(256), - }, - PowerUsageW: helper.UintToPtr(1), - GPUUtilization: helper.UintToPtr(1), - MemoryUtilization: helper.UintToPtr(1), - EncoderUtilization: helper.UintToPtr(1), - DecoderUtilization: helper.UintToPtr(1), - TemperatureC: helper.UintToPtr(1), - UsedMemoryMiB: helper.Uint64ToPtr(1), - ECCErrorsL1Cache: helper.Uint64ToPtr(100), - ECCErrorsL2Cache: helper.Uint64ToPtr(100), - ECCErrorsDevice: helper.Uint64ToPtr(100), - }, - { - DeviceData: &nvml.DeviceData{ - UUID: "UUID2", - DeviceName: helper.StringToPtr("DeviceName1"), - MemoryMiB: helper.Uint64ToPtr(1), - PowerW: helper.UintToPtr(2), - BAR1MiB: helper.Uint64ToPtr(256), - }, - PowerUsageW: helper.UintToPtr(1), - GPUUtilization: helper.UintToPtr(1), - MemoryUtilization: helper.UintToPtr(1), - EncoderUtilization: helper.UintToPtr(1), - DecoderUtilization: helper.UintToPtr(1), - TemperatureC: helper.UintToPtr(1), - UsedMemoryMiB: helper.Uint64ToPtr(1), - ECCErrorsL1Cache: helper.Uint64ToPtr(100), - ECCErrorsL2Cache: helper.Uint64ToPtr(100), - ECCErrorsDevice: helper.Uint64ToPtr(100), - }, - { - DeviceData: &nvml.DeviceData{ - UUID: "UUID3", - DeviceName: helper.StringToPtr("DeviceName1"), - MemoryMiB: helper.Uint64ToPtr(1), - PowerW: helper.UintToPtr(2), - BAR1MiB: helper.Uint64ToPtr(256), - }, - PowerUsageW: helper.UintToPtr(1), - GPUUtilization: helper.UintToPtr(1), - MemoryUtilization: helper.UintToPtr(1), - EncoderUtilization: helper.UintToPtr(1), - DecoderUtilization: helper.UintToPtr(1), - TemperatureC: 
helper.UintToPtr(1), - UsedMemoryMiB: helper.Uint64ToPtr(1), - ECCErrorsL1Cache: helper.Uint64ToPtr(100), - ECCErrorsL2Cache: helper.Uint64ToPtr(100), - ECCErrorsDevice: helper.Uint64ToPtr(100), - }, - }, - ProvidedIDs: map[string]struct{}{ - "UUID1": {}, - "UUID3": {}, - }, - ExpectedResult: []*nvml.StatsData{ - { - DeviceData: &nvml.DeviceData{ - UUID: "UUID1", - DeviceName: helper.StringToPtr("DeviceName1"), - MemoryMiB: helper.Uint64ToPtr(1), - PowerW: helper.UintToPtr(2), - BAR1MiB: helper.Uint64ToPtr(256), - }, - PowerUsageW: helper.UintToPtr(1), - GPUUtilization: helper.UintToPtr(1), - MemoryUtilization: helper.UintToPtr(1), - EncoderUtilization: helper.UintToPtr(1), - DecoderUtilization: helper.UintToPtr(1), - TemperatureC: helper.UintToPtr(1), - UsedMemoryMiB: helper.Uint64ToPtr(1), - ECCErrorsL1Cache: helper.Uint64ToPtr(100), - ECCErrorsL2Cache: helper.Uint64ToPtr(100), - ECCErrorsDevice: helper.Uint64ToPtr(100), - }, - { - DeviceData: &nvml.DeviceData{ - UUID: "UUID3", - DeviceName: helper.StringToPtr("DeviceName1"), - MemoryMiB: helper.Uint64ToPtr(1), - PowerW: helper.UintToPtr(2), - BAR1MiB: helper.Uint64ToPtr(256), - }, - PowerUsageW: helper.UintToPtr(1), - GPUUtilization: helper.UintToPtr(1), - MemoryUtilization: helper.UintToPtr(1), - EncoderUtilization: helper.UintToPtr(1), - DecoderUtilization: helper.UintToPtr(1), - TemperatureC: helper.UintToPtr(1), - UsedMemoryMiB: helper.Uint64ToPtr(1), - ECCErrorsL1Cache: helper.Uint64ToPtr(100), - ECCErrorsL2Cache: helper.Uint64ToPtr(100), - ECCErrorsDevice: helper.Uint64ToPtr(100), - }, - }, - }, - { - Name: "No Stats were provided", - ProvidedIDs: map[string]struct{}{ - "UUID1": {}, - "UUID2": {}, - "UUID3": {}, - }, - }, - { - Name: "No Ids were provided", - ProvidedStats: []*nvml.StatsData{ - { - DeviceData: &nvml.DeviceData{ - UUID: "UUID1", - DeviceName: helper.StringToPtr("DeviceName1"), - MemoryMiB: helper.Uint64ToPtr(1), - PowerW: helper.UintToPtr(2), - BAR1MiB: helper.Uint64ToPtr(256), - }, - 
PowerUsageW: helper.UintToPtr(1), - GPUUtilization: helper.UintToPtr(1), - MemoryUtilization: helper.UintToPtr(1), - EncoderUtilization: helper.UintToPtr(1), - DecoderUtilization: helper.UintToPtr(1), - TemperatureC: helper.UintToPtr(1), - UsedMemoryMiB: helper.Uint64ToPtr(1), - ECCErrorsL1Cache: helper.Uint64ToPtr(100), - ECCErrorsL2Cache: helper.Uint64ToPtr(100), - ECCErrorsDevice: helper.Uint64ToPtr(100), - }, - { - DeviceData: &nvml.DeviceData{ - UUID: "UUID2", - DeviceName: helper.StringToPtr("DeviceName1"), - MemoryMiB: helper.Uint64ToPtr(1), - PowerW: helper.UintToPtr(2), - BAR1MiB: helper.Uint64ToPtr(256), - }, - PowerUsageW: helper.UintToPtr(1), - GPUUtilization: helper.UintToPtr(1), - MemoryUtilization: helper.UintToPtr(1), - EncoderUtilization: helper.UintToPtr(1), - DecoderUtilization: helper.UintToPtr(1), - TemperatureC: helper.UintToPtr(1), - UsedMemoryMiB: helper.Uint64ToPtr(1), - ECCErrorsL1Cache: helper.Uint64ToPtr(100), - ECCErrorsL2Cache: helper.Uint64ToPtr(100), - ECCErrorsDevice: helper.Uint64ToPtr(100), - }, - { - DeviceData: &nvml.DeviceData{ - UUID: "UUID3", - DeviceName: helper.StringToPtr("DeviceName1"), - MemoryMiB: helper.Uint64ToPtr(1), - PowerW: helper.UintToPtr(2), - BAR1MiB: helper.Uint64ToPtr(256), - }, - PowerUsageW: helper.UintToPtr(1), - GPUUtilization: helper.UintToPtr(1), - MemoryUtilization: helper.UintToPtr(1), - EncoderUtilization: helper.UintToPtr(1), - DecoderUtilization: helper.UintToPtr(1), - TemperatureC: helper.UintToPtr(1), - UsedMemoryMiB: helper.Uint64ToPtr(1), - ECCErrorsL1Cache: helper.Uint64ToPtr(100), - ECCErrorsL2Cache: helper.Uint64ToPtr(100), - ECCErrorsDevice: helper.Uint64ToPtr(100), - }, - }, - }, - } { - actualResult := filterStatsByID(testCase.ProvidedStats, testCase.ProvidedIDs) - require.New(t).Equal(testCase.ExpectedResult, actualResult) - } -} - -func TestStatsForItem(t *testing.T) { - for _, testCase := range []struct { - Name string - Timestamp time.Time - ItemStat *nvml.StatsData - ExpectedResult 
*device.DeviceStats - }{ - { - Name: "All fields in ItemStat are not nil", - Timestamp: time.Date(1974, time.May, 19, 1, 2, 3, 4, time.UTC), - ItemStat: &nvml.StatsData{ - DeviceData: &nvml.DeviceData{ - UUID: "UUID1", - DeviceName: helper.StringToPtr("DeviceName1"), - MemoryMiB: helper.Uint64ToPtr(1), - PowerW: helper.UintToPtr(1), - BAR1MiB: helper.Uint64ToPtr(256), - }, - PowerUsageW: helper.UintToPtr(1), - GPUUtilization: helper.UintToPtr(1), - MemoryUtilization: helper.UintToPtr(1), - EncoderUtilization: helper.UintToPtr(1), - DecoderUtilization: helper.UintToPtr(1), - TemperatureC: helper.UintToPtr(1), - UsedMemoryMiB: helper.Uint64ToPtr(1), - BAR1UsedMiB: helper.Uint64ToPtr(1), - ECCErrorsL1Cache: helper.Uint64ToPtr(100), - ECCErrorsL2Cache: helper.Uint64ToPtr(100), - ECCErrorsDevice: helper.Uint64ToPtr(100), - }, - ExpectedResult: &device.DeviceStats{ - Summary: &structs.StatValue{ - Unit: MemoryStateUnit, - Desc: MemoryStateDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - IntDenominatorVal: helper.Int64ToPtr(1), - }, - Stats: &structs.StatObject{ - Attributes: map[string]*structs.StatValue{ - PowerUsageAttr: { - Unit: PowerUsageUnit, - Desc: PowerUsageDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - IntDenominatorVal: helper.Int64ToPtr(1), - }, - GPUUtilizationAttr: { - Unit: GPUUtilizationUnit, - Desc: GPUUtilizationDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - }, - MemoryUtilizationAttr: { - Unit: MemoryUtilizationUnit, - Desc: MemoryUtilizationDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - }, - EncoderUtilizationAttr: { - Unit: EncoderUtilizationUnit, - Desc: EncoderUtilizationDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - }, - DecoderUtilizationAttr: { - Unit: DecoderUtilizationUnit, - Desc: DecoderUtilizationDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - }, - TemperatureAttr: { - Unit: TemperatureUnit, - Desc: TemperatureDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - }, - MemoryStateAttr: { - Unit: MemoryStateUnit, - Desc: MemoryStateDesc, 
- IntNumeratorVal: helper.Int64ToPtr(1), - IntDenominatorVal: helper.Int64ToPtr(1), - }, - BAR1StateAttr: { - Unit: BAR1StateUnit, - Desc: BAR1StateDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - IntDenominatorVal: helper.Int64ToPtr(256), - }, - ECCErrorsL1CacheAttr: { - Unit: ECCErrorsL1CacheUnit, - Desc: ECCErrorsL1CacheDesc, - IntNumeratorVal: helper.Int64ToPtr(100), - }, - ECCErrorsL2CacheAttr: { - Unit: ECCErrorsL2CacheUnit, - Desc: ECCErrorsL2CacheDesc, - IntNumeratorVal: helper.Int64ToPtr(100), - }, - ECCErrorsDeviceAttr: { - Unit: ECCErrorsDeviceUnit, - Desc: ECCErrorsDeviceDesc, - IntNumeratorVal: helper.Int64ToPtr(100), - }, - }, - }, - Timestamp: time.Date(1974, time.May, 19, 1, 2, 3, 4, time.UTC), - }, - }, - { - Name: "Power usage is nil", - Timestamp: time.Date(1974, time.May, 19, 1, 2, 3, 4, time.UTC), - ItemStat: &nvml.StatsData{ - DeviceData: &nvml.DeviceData{ - UUID: "UUID1", - DeviceName: helper.StringToPtr("DeviceName1"), - MemoryMiB: helper.Uint64ToPtr(1), - PowerW: helper.UintToPtr(1), - BAR1MiB: helper.Uint64ToPtr(256), - }, - PowerUsageW: nil, - GPUUtilization: helper.UintToPtr(1), - MemoryUtilization: helper.UintToPtr(1), - EncoderUtilization: helper.UintToPtr(1), - DecoderUtilization: helper.UintToPtr(1), - TemperatureC: helper.UintToPtr(1), - UsedMemoryMiB: helper.Uint64ToPtr(1), - BAR1UsedMiB: helper.Uint64ToPtr(1), - ECCErrorsL1Cache: helper.Uint64ToPtr(100), - ECCErrorsL2Cache: helper.Uint64ToPtr(100), - ECCErrorsDevice: helper.Uint64ToPtr(100), - }, - ExpectedResult: &device.DeviceStats{ - Summary: &structs.StatValue{ - Unit: MemoryStateUnit, - Desc: MemoryStateDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - IntDenominatorVal: helper.Int64ToPtr(1), - }, - Stats: &structs.StatObject{ - Attributes: map[string]*structs.StatValue{ - PowerUsageAttr: { - Unit: PowerUsageUnit, - Desc: PowerUsageDesc, - StringVal: helper.StringToPtr(notAvailable), - }, - GPUUtilizationAttr: { - Unit: GPUUtilizationUnit, - Desc: GPUUtilizationDesc, - 
IntNumeratorVal: helper.Int64ToPtr(1), - }, - MemoryUtilizationAttr: { - Unit: MemoryUtilizationUnit, - Desc: MemoryUtilizationDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - }, - EncoderUtilizationAttr: { - Unit: EncoderUtilizationUnit, - Desc: EncoderUtilizationDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - }, - DecoderUtilizationAttr: { - Unit: DecoderUtilizationUnit, - Desc: DecoderUtilizationDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - }, - TemperatureAttr: { - Unit: TemperatureUnit, - Desc: TemperatureDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - }, - MemoryStateAttr: { - Unit: MemoryStateUnit, - Desc: MemoryStateDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - IntDenominatorVal: helper.Int64ToPtr(1), - }, - BAR1StateAttr: { - Unit: BAR1StateUnit, - Desc: BAR1StateDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - IntDenominatorVal: helper.Int64ToPtr(256), - }, - ECCErrorsL1CacheAttr: { - Unit: ECCErrorsL1CacheUnit, - Desc: ECCErrorsL1CacheDesc, - IntNumeratorVal: helper.Int64ToPtr(100), - }, - ECCErrorsL2CacheAttr: { - Unit: ECCErrorsL2CacheUnit, - Desc: ECCErrorsL2CacheDesc, - IntNumeratorVal: helper.Int64ToPtr(100), - }, - ECCErrorsDeviceAttr: { - Unit: ECCErrorsDeviceUnit, - Desc: ECCErrorsDeviceDesc, - IntNumeratorVal: helper.Int64ToPtr(100), - }, - }, - }, - Timestamp: time.Date(1974, time.May, 19, 1, 2, 3, 4, time.UTC), - }, - }, - { - Name: "PowerW is nil", - Timestamp: time.Date(1974, time.May, 19, 1, 2, 3, 4, time.UTC), - ItemStat: &nvml.StatsData{ - DeviceData: &nvml.DeviceData{ - UUID: "UUID1", - DeviceName: helper.StringToPtr("DeviceName1"), - MemoryMiB: helper.Uint64ToPtr(1), - PowerW: nil, - BAR1MiB: helper.Uint64ToPtr(256), - }, - PowerUsageW: helper.UintToPtr(1), - GPUUtilization: helper.UintToPtr(1), - MemoryUtilization: helper.UintToPtr(1), - EncoderUtilization: helper.UintToPtr(1), - DecoderUtilization: helper.UintToPtr(1), - TemperatureC: helper.UintToPtr(1), - UsedMemoryMiB: helper.Uint64ToPtr(1), - BAR1UsedMiB: 
helper.Uint64ToPtr(1), - ECCErrorsL1Cache: helper.Uint64ToPtr(100), - ECCErrorsL2Cache: helper.Uint64ToPtr(100), - ECCErrorsDevice: helper.Uint64ToPtr(100), - }, - ExpectedResult: &device.DeviceStats{ - Summary: &structs.StatValue{ - Unit: MemoryStateUnit, - Desc: MemoryStateDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - IntDenominatorVal: helper.Int64ToPtr(1), - }, - Stats: &structs.StatObject{ - Attributes: map[string]*structs.StatValue{ - PowerUsageAttr: { - Unit: PowerUsageUnit, - Desc: PowerUsageDesc, - StringVal: helper.StringToPtr(notAvailable), - }, - GPUUtilizationAttr: { - Unit: GPUUtilizationUnit, - Desc: GPUUtilizationDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - }, - MemoryUtilizationAttr: { - Unit: MemoryUtilizationUnit, - Desc: MemoryUtilizationDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - }, - EncoderUtilizationAttr: { - Unit: EncoderUtilizationUnit, - Desc: EncoderUtilizationDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - }, - DecoderUtilizationAttr: { - Unit: DecoderUtilizationUnit, - Desc: DecoderUtilizationDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - }, - TemperatureAttr: { - Unit: TemperatureUnit, - Desc: TemperatureDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - }, - MemoryStateAttr: { - Unit: MemoryStateUnit, - Desc: MemoryStateDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - IntDenominatorVal: helper.Int64ToPtr(1), - }, - BAR1StateAttr: { - Unit: BAR1StateUnit, - Desc: BAR1StateDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - IntDenominatorVal: helper.Int64ToPtr(256), - }, - ECCErrorsL1CacheAttr: { - Unit: ECCErrorsL1CacheUnit, - Desc: ECCErrorsL1CacheDesc, - IntNumeratorVal: helper.Int64ToPtr(100), - }, - ECCErrorsL2CacheAttr: { - Unit: ECCErrorsL2CacheUnit, - Desc: ECCErrorsL2CacheDesc, - IntNumeratorVal: helper.Int64ToPtr(100), - }, - ECCErrorsDeviceAttr: { - Unit: ECCErrorsDeviceUnit, - Desc: ECCErrorsDeviceDesc, - IntNumeratorVal: helper.Int64ToPtr(100), - }, - }, - }, - Timestamp: time.Date(1974, time.May, 19, 1, 2, 3, 4, 
time.UTC), - }, - }, - { - Name: "GPUUtilization is nil", - Timestamp: time.Date(1974, time.May, 19, 1, 2, 3, 4, time.UTC), - ItemStat: &nvml.StatsData{ - DeviceData: &nvml.DeviceData{ - UUID: "UUID1", - DeviceName: helper.StringToPtr("DeviceName1"), - MemoryMiB: helper.Uint64ToPtr(1), - PowerW: helper.UintToPtr(1), - BAR1MiB: helper.Uint64ToPtr(256), - }, - PowerUsageW: helper.UintToPtr(1), - GPUUtilization: nil, - MemoryUtilization: helper.UintToPtr(1), - EncoderUtilization: helper.UintToPtr(1), - DecoderUtilization: helper.UintToPtr(1), - TemperatureC: helper.UintToPtr(1), - UsedMemoryMiB: helper.Uint64ToPtr(1), - BAR1UsedMiB: helper.Uint64ToPtr(1), - ECCErrorsL1Cache: helper.Uint64ToPtr(100), - ECCErrorsL2Cache: helper.Uint64ToPtr(100), - ECCErrorsDevice: helper.Uint64ToPtr(100), - }, - ExpectedResult: &device.DeviceStats{ - Summary: &structs.StatValue{ - Unit: MemoryStateUnit, - Desc: MemoryStateDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - IntDenominatorVal: helper.Int64ToPtr(1), - }, - Stats: &structs.StatObject{ - Attributes: map[string]*structs.StatValue{ - PowerUsageAttr: { - Unit: PowerUsageUnit, - Desc: PowerUsageDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - IntDenominatorVal: helper.Int64ToPtr(1), - }, - GPUUtilizationAttr: { - Unit: GPUUtilizationUnit, - Desc: GPUUtilizationDesc, - StringVal: helper.StringToPtr(notAvailable), - }, - MemoryUtilizationAttr: { - Unit: MemoryUtilizationUnit, - Desc: MemoryUtilizationDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - }, - EncoderUtilizationAttr: { - Unit: EncoderUtilizationUnit, - Desc: EncoderUtilizationDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - }, - DecoderUtilizationAttr: { - Unit: DecoderUtilizationUnit, - Desc: DecoderUtilizationDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - }, - TemperatureAttr: { - Unit: TemperatureUnit, - Desc: TemperatureDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - }, - MemoryStateAttr: { - Unit: MemoryStateUnit, - Desc: MemoryStateDesc, - IntNumeratorVal: 
helper.Int64ToPtr(1), - IntDenominatorVal: helper.Int64ToPtr(1), - }, - BAR1StateAttr: { - Unit: BAR1StateUnit, - Desc: BAR1StateDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - IntDenominatorVal: helper.Int64ToPtr(256), - }, - ECCErrorsL1CacheAttr: { - Unit: ECCErrorsL1CacheUnit, - Desc: ECCErrorsL1CacheDesc, - IntNumeratorVal: helper.Int64ToPtr(100), - }, - ECCErrorsL2CacheAttr: { - Unit: ECCErrorsL2CacheUnit, - Desc: ECCErrorsL2CacheDesc, - IntNumeratorVal: helper.Int64ToPtr(100), - }, - ECCErrorsDeviceAttr: { - Unit: ECCErrorsDeviceUnit, - Desc: ECCErrorsDeviceDesc, - IntNumeratorVal: helper.Int64ToPtr(100), - }, - }, - }, - Timestamp: time.Date(1974, time.May, 19, 1, 2, 3, 4, time.UTC), - }, - }, - { - Name: "MemoryUtilization is nil", - Timestamp: time.Date(1974, time.May, 19, 1, 2, 3, 4, time.UTC), - ItemStat: &nvml.StatsData{ - DeviceData: &nvml.DeviceData{ - UUID: "UUID1", - DeviceName: helper.StringToPtr("DeviceName1"), - MemoryMiB: helper.Uint64ToPtr(1), - PowerW: helper.UintToPtr(1), - BAR1MiB: helper.Uint64ToPtr(256), - }, - PowerUsageW: helper.UintToPtr(1), - GPUUtilization: helper.UintToPtr(1), - MemoryUtilization: nil, - EncoderUtilization: helper.UintToPtr(1), - DecoderUtilization: helper.UintToPtr(1), - TemperatureC: helper.UintToPtr(1), - UsedMemoryMiB: helper.Uint64ToPtr(1), - BAR1UsedMiB: helper.Uint64ToPtr(1), - ECCErrorsL1Cache: helper.Uint64ToPtr(100), - ECCErrorsL2Cache: helper.Uint64ToPtr(100), - ECCErrorsDevice: helper.Uint64ToPtr(100), - }, - ExpectedResult: &device.DeviceStats{ - Summary: &structs.StatValue{ - Unit: MemoryStateUnit, - Desc: MemoryStateDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - IntDenominatorVal: helper.Int64ToPtr(1), - }, - Stats: &structs.StatObject{ - Attributes: map[string]*structs.StatValue{ - PowerUsageAttr: { - Unit: PowerUsageUnit, - Desc: PowerUsageDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - IntDenominatorVal: helper.Int64ToPtr(1), - }, - GPUUtilizationAttr: { - Unit: GPUUtilizationUnit, - Desc: 
GPUUtilizationDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - }, - MemoryUtilizationAttr: { - Unit: MemoryUtilizationUnit, - Desc: MemoryUtilizationDesc, - StringVal: helper.StringToPtr(notAvailable), - }, - EncoderUtilizationAttr: { - Unit: EncoderUtilizationUnit, - Desc: EncoderUtilizationDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - }, - DecoderUtilizationAttr: { - Unit: DecoderUtilizationUnit, - Desc: DecoderUtilizationDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - }, - TemperatureAttr: { - Unit: TemperatureUnit, - Desc: TemperatureDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - }, - MemoryStateAttr: { - Unit: MemoryStateUnit, - Desc: MemoryStateDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - IntDenominatorVal: helper.Int64ToPtr(1), - }, - BAR1StateAttr: { - Unit: BAR1StateUnit, - Desc: BAR1StateDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - IntDenominatorVal: helper.Int64ToPtr(256), - }, - ECCErrorsL1CacheAttr: { - Unit: ECCErrorsL1CacheUnit, - Desc: ECCErrorsL1CacheDesc, - IntNumeratorVal: helper.Int64ToPtr(100), - }, - ECCErrorsL2CacheAttr: { - Unit: ECCErrorsL2CacheUnit, - Desc: ECCErrorsL2CacheDesc, - IntNumeratorVal: helper.Int64ToPtr(100), - }, - ECCErrorsDeviceAttr: { - Unit: ECCErrorsDeviceUnit, - Desc: ECCErrorsDeviceDesc, - IntNumeratorVal: helper.Int64ToPtr(100), - }, - }, - }, - Timestamp: time.Date(1974, time.May, 19, 1, 2, 3, 4, time.UTC), - }, - }, - { - Name: "EncoderUtilization is nil", - Timestamp: time.Date(1974, time.May, 19, 1, 2, 3, 4, time.UTC), - ItemStat: &nvml.StatsData{ - DeviceData: &nvml.DeviceData{ - UUID: "UUID1", - DeviceName: helper.StringToPtr("DeviceName1"), - MemoryMiB: helper.Uint64ToPtr(1), - PowerW: helper.UintToPtr(1), - BAR1MiB: helper.Uint64ToPtr(256), - }, - PowerUsageW: helper.UintToPtr(1), - GPUUtilization: helper.UintToPtr(1), - MemoryUtilization: helper.UintToPtr(1), - EncoderUtilization: nil, - DecoderUtilization: helper.UintToPtr(1), - TemperatureC: helper.UintToPtr(1), - UsedMemoryMiB: 
helper.Uint64ToPtr(1), - BAR1UsedMiB: helper.Uint64ToPtr(1), - ECCErrorsL1Cache: helper.Uint64ToPtr(100), - ECCErrorsL2Cache: helper.Uint64ToPtr(100), - ECCErrorsDevice: helper.Uint64ToPtr(100), - }, - ExpectedResult: &device.DeviceStats{ - Summary: &structs.StatValue{ - Unit: MemoryStateUnit, - Desc: MemoryStateDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - IntDenominatorVal: helper.Int64ToPtr(1), - }, - Stats: &structs.StatObject{ - Attributes: map[string]*structs.StatValue{ - PowerUsageAttr: { - Unit: PowerUsageUnit, - Desc: PowerUsageDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - IntDenominatorVal: helper.Int64ToPtr(1), - }, - GPUUtilizationAttr: { - Unit: GPUUtilizationUnit, - Desc: GPUUtilizationDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - }, - MemoryUtilizationAttr: { - Unit: MemoryUtilizationUnit, - Desc: MemoryUtilizationDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - }, - EncoderUtilizationAttr: { - Unit: EncoderUtilizationUnit, - Desc: EncoderUtilizationDesc, - StringVal: helper.StringToPtr(notAvailable), - }, - DecoderUtilizationAttr: { - Unit: DecoderUtilizationUnit, - Desc: DecoderUtilizationDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - }, - TemperatureAttr: { - Unit: TemperatureUnit, - Desc: TemperatureDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - }, - MemoryStateAttr: { - Unit: MemoryStateUnit, - Desc: MemoryStateDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - IntDenominatorVal: helper.Int64ToPtr(1), - }, - BAR1StateAttr: { - Unit: BAR1StateUnit, - Desc: BAR1StateDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - IntDenominatorVal: helper.Int64ToPtr(256), - }, - ECCErrorsL1CacheAttr: { - Unit: ECCErrorsL1CacheUnit, - Desc: ECCErrorsL1CacheDesc, - IntNumeratorVal: helper.Int64ToPtr(100), - }, - ECCErrorsL2CacheAttr: { - Unit: ECCErrorsL2CacheUnit, - Desc: ECCErrorsL2CacheDesc, - IntNumeratorVal: helper.Int64ToPtr(100), - }, - ECCErrorsDeviceAttr: { - Unit: ECCErrorsDeviceUnit, - Desc: ECCErrorsDeviceDesc, - IntNumeratorVal: 
helper.Int64ToPtr(100), - }, - }, - }, - Timestamp: time.Date(1974, time.May, 19, 1, 2, 3, 4, time.UTC), - }, - }, - { - Name: "DecoderUtilization is nil", - Timestamp: time.Date(1974, time.May, 19, 1, 2, 3, 4, time.UTC), - ItemStat: &nvml.StatsData{ - DeviceData: &nvml.DeviceData{ - UUID: "UUID1", - DeviceName: helper.StringToPtr("DeviceName1"), - MemoryMiB: helper.Uint64ToPtr(1), - PowerW: helper.UintToPtr(1), - BAR1MiB: helper.Uint64ToPtr(256), - }, - PowerUsageW: helper.UintToPtr(1), - GPUUtilization: helper.UintToPtr(1), - MemoryUtilization: helper.UintToPtr(1), - EncoderUtilization: helper.UintToPtr(1), - DecoderUtilization: nil, - TemperatureC: helper.UintToPtr(1), - UsedMemoryMiB: helper.Uint64ToPtr(1), - BAR1UsedMiB: helper.Uint64ToPtr(1), - ECCErrorsL1Cache: helper.Uint64ToPtr(100), - ECCErrorsL2Cache: helper.Uint64ToPtr(100), - ECCErrorsDevice: helper.Uint64ToPtr(100), - }, - ExpectedResult: &device.DeviceStats{ - Summary: &structs.StatValue{ - Unit: MemoryStateUnit, - Desc: MemoryStateDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - IntDenominatorVal: helper.Int64ToPtr(1), - }, - Stats: &structs.StatObject{ - Attributes: map[string]*structs.StatValue{ - PowerUsageAttr: { - Unit: PowerUsageUnit, - Desc: PowerUsageDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - IntDenominatorVal: helper.Int64ToPtr(1), - }, - GPUUtilizationAttr: { - Unit: GPUUtilizationUnit, - Desc: GPUUtilizationDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - }, - MemoryUtilizationAttr: { - Unit: MemoryUtilizationUnit, - Desc: MemoryUtilizationDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - }, - EncoderUtilizationAttr: { - Unit: EncoderUtilizationUnit, - Desc: EncoderUtilizationDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - }, - DecoderUtilizationAttr: { - Unit: DecoderUtilizationUnit, - Desc: DecoderUtilizationDesc, - StringVal: helper.StringToPtr(notAvailable), - }, - TemperatureAttr: { - Unit: TemperatureUnit, - Desc: TemperatureDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - }, 
- MemoryStateAttr: { - Unit: MemoryStateUnit, - Desc: MemoryStateDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - IntDenominatorVal: helper.Int64ToPtr(1), - }, - BAR1StateAttr: { - Unit: BAR1StateUnit, - Desc: BAR1StateDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - IntDenominatorVal: helper.Int64ToPtr(256), - }, - ECCErrorsL1CacheAttr: { - Unit: ECCErrorsL1CacheUnit, - Desc: ECCErrorsL1CacheDesc, - IntNumeratorVal: helper.Int64ToPtr(100), - }, - ECCErrorsL2CacheAttr: { - Unit: ECCErrorsL2CacheUnit, - Desc: ECCErrorsL2CacheDesc, - IntNumeratorVal: helper.Int64ToPtr(100), - }, - ECCErrorsDeviceAttr: { - Unit: ECCErrorsDeviceUnit, - Desc: ECCErrorsDeviceDesc, - IntNumeratorVal: helper.Int64ToPtr(100), - }, - }, - }, - Timestamp: time.Date(1974, time.May, 19, 1, 2, 3, 4, time.UTC), - }, - }, - { - Name: "Temperature is nil", - Timestamp: time.Date(1974, time.May, 19, 1, 2, 3, 4, time.UTC), - ItemStat: &nvml.StatsData{ - DeviceData: &nvml.DeviceData{ - UUID: "UUID1", - DeviceName: helper.StringToPtr("DeviceName1"), - MemoryMiB: helper.Uint64ToPtr(1), - PowerW: helper.UintToPtr(1), - BAR1MiB: helper.Uint64ToPtr(256), - }, - PowerUsageW: helper.UintToPtr(1), - GPUUtilization: helper.UintToPtr(1), - MemoryUtilization: helper.UintToPtr(1), - EncoderUtilization: helper.UintToPtr(1), - DecoderUtilization: helper.UintToPtr(1), - TemperatureC: nil, - UsedMemoryMiB: helper.Uint64ToPtr(1), - BAR1UsedMiB: helper.Uint64ToPtr(1), - ECCErrorsL1Cache: helper.Uint64ToPtr(100), - ECCErrorsL2Cache: helper.Uint64ToPtr(100), - ECCErrorsDevice: helper.Uint64ToPtr(100), - }, - ExpectedResult: &device.DeviceStats{ - Summary: &structs.StatValue{ - Unit: MemoryStateUnit, - Desc: MemoryStateDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - IntDenominatorVal: helper.Int64ToPtr(1), - }, - Stats: &structs.StatObject{ - Attributes: map[string]*structs.StatValue{ - PowerUsageAttr: { - Unit: PowerUsageUnit, - Desc: PowerUsageDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - IntDenominatorVal: 
helper.Int64ToPtr(1), - }, - GPUUtilizationAttr: { - Unit: GPUUtilizationUnit, - Desc: GPUUtilizationDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - }, - MemoryUtilizationAttr: { - Unit: MemoryUtilizationUnit, - Desc: MemoryUtilizationDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - }, - EncoderUtilizationAttr: { - Unit: EncoderUtilizationUnit, - Desc: EncoderUtilizationDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - }, - DecoderUtilizationAttr: { - Unit: DecoderUtilizationUnit, - Desc: DecoderUtilizationDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - }, - TemperatureAttr: { - Unit: TemperatureUnit, - Desc: TemperatureDesc, - StringVal: helper.StringToPtr(notAvailable), - }, - MemoryStateAttr: { - Unit: MemoryStateUnit, - Desc: MemoryStateDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - IntDenominatorVal: helper.Int64ToPtr(1), - }, - BAR1StateAttr: { - Unit: BAR1StateUnit, - Desc: BAR1StateDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - IntDenominatorVal: helper.Int64ToPtr(256), - }, - ECCErrorsL1CacheAttr: { - Unit: ECCErrorsL1CacheUnit, - Desc: ECCErrorsL1CacheDesc, - IntNumeratorVal: helper.Int64ToPtr(100), - }, - ECCErrorsL2CacheAttr: { - Unit: ECCErrorsL2CacheUnit, - Desc: ECCErrorsL2CacheDesc, - IntNumeratorVal: helper.Int64ToPtr(100), - }, - ECCErrorsDeviceAttr: { - Unit: ECCErrorsDeviceUnit, - Desc: ECCErrorsDeviceDesc, - IntNumeratorVal: helper.Int64ToPtr(100), - }, - }, - }, - Timestamp: time.Date(1974, time.May, 19, 1, 2, 3, 4, time.UTC), - }, - }, - { - Name: "UsedMemoryMiB is nil", - Timestamp: time.Date(1974, time.May, 19, 1, 2, 3, 4, time.UTC), - ItemStat: &nvml.StatsData{ - DeviceData: &nvml.DeviceData{ - UUID: "UUID1", - DeviceName: helper.StringToPtr("DeviceName1"), - MemoryMiB: helper.Uint64ToPtr(1), - PowerW: helper.UintToPtr(1), - BAR1MiB: helper.Uint64ToPtr(256), - }, - PowerUsageW: helper.UintToPtr(1), - GPUUtilization: helper.UintToPtr(1), - MemoryUtilization: helper.UintToPtr(1), - EncoderUtilization: helper.UintToPtr(1), - 
DecoderUtilization: helper.UintToPtr(1), - TemperatureC: helper.UintToPtr(1), - UsedMemoryMiB: nil, - BAR1UsedMiB: helper.Uint64ToPtr(1), - ECCErrorsL1Cache: helper.Uint64ToPtr(100), - ECCErrorsL2Cache: helper.Uint64ToPtr(100), - ECCErrorsDevice: helper.Uint64ToPtr(100), - }, - ExpectedResult: &device.DeviceStats{ - Summary: &structs.StatValue{ - Unit: MemoryStateUnit, - Desc: MemoryStateDesc, - StringVal: helper.StringToPtr(notAvailable), - }, - Stats: &structs.StatObject{ - Attributes: map[string]*structs.StatValue{ - PowerUsageAttr: { - Unit: PowerUsageUnit, - Desc: PowerUsageDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - IntDenominatorVal: helper.Int64ToPtr(1), - }, - GPUUtilizationAttr: { - Unit: GPUUtilizationUnit, - Desc: GPUUtilizationDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - }, - MemoryUtilizationAttr: { - Unit: MemoryUtilizationUnit, - Desc: MemoryUtilizationDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - }, - EncoderUtilizationAttr: { - Unit: EncoderUtilizationUnit, - Desc: EncoderUtilizationDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - }, - DecoderUtilizationAttr: { - Unit: DecoderUtilizationUnit, - Desc: DecoderUtilizationDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - }, - TemperatureAttr: { - Unit: TemperatureUnit, - Desc: TemperatureDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - }, - MemoryStateAttr: { - Unit: MemoryStateUnit, - Desc: MemoryStateDesc, - StringVal: helper.StringToPtr(notAvailable), - }, - BAR1StateAttr: { - Unit: BAR1StateUnit, - Desc: BAR1StateDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - IntDenominatorVal: helper.Int64ToPtr(256), - }, - ECCErrorsL1CacheAttr: { - Unit: ECCErrorsL1CacheUnit, - Desc: ECCErrorsL1CacheDesc, - IntNumeratorVal: helper.Int64ToPtr(100), - }, - ECCErrorsL2CacheAttr: { - Unit: ECCErrorsL2CacheUnit, - Desc: ECCErrorsL2CacheDesc, - IntNumeratorVal: helper.Int64ToPtr(100), - }, - ECCErrorsDeviceAttr: { - Unit: ECCErrorsDeviceUnit, - Desc: ECCErrorsDeviceDesc, - IntNumeratorVal: 
helper.Int64ToPtr(100), - }, - }, - }, - Timestamp: time.Date(1974, time.May, 19, 1, 2, 3, 4, time.UTC), - }, - }, - { - Name: "MemoryMiB is nil", - Timestamp: time.Date(1974, time.May, 19, 1, 2, 3, 4, time.UTC), - ItemStat: &nvml.StatsData{ - DeviceData: &nvml.DeviceData{ - UUID: "UUID1", - DeviceName: helper.StringToPtr("DeviceName1"), - MemoryMiB: nil, - PowerW: helper.UintToPtr(1), - BAR1MiB: helper.Uint64ToPtr(256), - }, - PowerUsageW: helper.UintToPtr(1), - GPUUtilization: helper.UintToPtr(1), - MemoryUtilization: helper.UintToPtr(1), - EncoderUtilization: helper.UintToPtr(1), - DecoderUtilization: helper.UintToPtr(1), - TemperatureC: helper.UintToPtr(1), - UsedMemoryMiB: helper.Uint64ToPtr(1), - BAR1UsedMiB: helper.Uint64ToPtr(1), - ECCErrorsL1Cache: helper.Uint64ToPtr(100), - ECCErrorsL2Cache: helper.Uint64ToPtr(100), - ECCErrorsDevice: helper.Uint64ToPtr(100), - }, - ExpectedResult: &device.DeviceStats{ - Summary: &structs.StatValue{ - Unit: MemoryStateUnit, - Desc: MemoryStateDesc, - StringVal: helper.StringToPtr(notAvailable), - }, - Stats: &structs.StatObject{ - Attributes: map[string]*structs.StatValue{ - PowerUsageAttr: { - Unit: PowerUsageUnit, - Desc: PowerUsageDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - IntDenominatorVal: helper.Int64ToPtr(1), - }, - GPUUtilizationAttr: { - Unit: GPUUtilizationUnit, - Desc: GPUUtilizationDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - }, - MemoryUtilizationAttr: { - Unit: MemoryUtilizationUnit, - Desc: MemoryUtilizationDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - }, - EncoderUtilizationAttr: { - Unit: EncoderUtilizationUnit, - Desc: EncoderUtilizationDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - }, - DecoderUtilizationAttr: { - Unit: DecoderUtilizationUnit, - Desc: DecoderUtilizationDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - }, - TemperatureAttr: { - Unit: TemperatureUnit, - Desc: TemperatureDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - }, - MemoryStateAttr: { - Unit: MemoryStateUnit, - Desc: 
MemoryStateDesc, - StringVal: helper.StringToPtr(notAvailable), - }, - BAR1StateAttr: { - Unit: BAR1StateUnit, - Desc: BAR1StateDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - IntDenominatorVal: helper.Int64ToPtr(256), - }, - ECCErrorsL1CacheAttr: { - Unit: ECCErrorsL1CacheUnit, - Desc: ECCErrorsL1CacheDesc, - IntNumeratorVal: helper.Int64ToPtr(100), - }, - ECCErrorsL2CacheAttr: { - Unit: ECCErrorsL2CacheUnit, - Desc: ECCErrorsL2CacheDesc, - IntNumeratorVal: helper.Int64ToPtr(100), - }, - ECCErrorsDeviceAttr: { - Unit: ECCErrorsDeviceUnit, - Desc: ECCErrorsDeviceDesc, - IntNumeratorVal: helper.Int64ToPtr(100), - }, - }, - }, - Timestamp: time.Date(1974, time.May, 19, 1, 2, 3, 4, time.UTC), - }, - }, - { - Name: "BAR1UsedMiB is nil", - Timestamp: time.Date(1974, time.May, 19, 1, 2, 3, 4, time.UTC), - ItemStat: &nvml.StatsData{ - DeviceData: &nvml.DeviceData{ - UUID: "UUID1", - DeviceName: helper.StringToPtr("DeviceName1"), - MemoryMiB: helper.Uint64ToPtr(1), - PowerW: helper.UintToPtr(1), - BAR1MiB: helper.Uint64ToPtr(256), - }, - PowerUsageW: helper.UintToPtr(1), - GPUUtilization: helper.UintToPtr(1), - MemoryUtilization: helper.UintToPtr(1), - EncoderUtilization: helper.UintToPtr(1), - DecoderUtilization: helper.UintToPtr(1), - TemperatureC: helper.UintToPtr(1), - UsedMemoryMiB: helper.Uint64ToPtr(1), - BAR1UsedMiB: nil, - ECCErrorsL1Cache: helper.Uint64ToPtr(100), - ECCErrorsL2Cache: helper.Uint64ToPtr(100), - ECCErrorsDevice: helper.Uint64ToPtr(100), - }, - ExpectedResult: &device.DeviceStats{ - Summary: &structs.StatValue{ - Unit: MemoryStateUnit, - Desc: MemoryStateDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - IntDenominatorVal: helper.Int64ToPtr(1), - }, - Stats: &structs.StatObject{ - Attributes: map[string]*structs.StatValue{ - PowerUsageAttr: { - Unit: PowerUsageUnit, - Desc: PowerUsageDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - IntDenominatorVal: helper.Int64ToPtr(1), - }, - GPUUtilizationAttr: { - Unit: GPUUtilizationUnit, - Desc: 
GPUUtilizationDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - }, - MemoryUtilizationAttr: { - Unit: MemoryUtilizationUnit, - Desc: MemoryUtilizationDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - }, - EncoderUtilizationAttr: { - Unit: EncoderUtilizationUnit, - Desc: EncoderUtilizationDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - }, - DecoderUtilizationAttr: { - Unit: DecoderUtilizationUnit, - Desc: DecoderUtilizationDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - }, - TemperatureAttr: { - Unit: TemperatureUnit, - Desc: TemperatureDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - }, - MemoryStateAttr: { - Unit: MemoryStateUnit, - Desc: MemoryStateDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - IntDenominatorVal: helper.Int64ToPtr(1), - }, - BAR1StateAttr: { - Unit: BAR1StateUnit, - Desc: BAR1StateDesc, - StringVal: helper.StringToPtr(notAvailable), - }, - ECCErrorsL1CacheAttr: { - Unit: ECCErrorsL1CacheUnit, - Desc: ECCErrorsL1CacheDesc, - IntNumeratorVal: helper.Int64ToPtr(100), - }, - ECCErrorsL2CacheAttr: { - Unit: ECCErrorsL2CacheUnit, - Desc: ECCErrorsL2CacheDesc, - IntNumeratorVal: helper.Int64ToPtr(100), - }, - ECCErrorsDeviceAttr: { - Unit: ECCErrorsDeviceUnit, - Desc: ECCErrorsDeviceDesc, - IntNumeratorVal: helper.Int64ToPtr(100), - }, - }, - }, - Timestamp: time.Date(1974, time.May, 19, 1, 2, 3, 4, time.UTC), - }, - }, - { - Name: "BAR1MiB is nil", - Timestamp: time.Date(1974, time.May, 19, 1, 2, 3, 4, time.UTC), - ItemStat: &nvml.StatsData{ - DeviceData: &nvml.DeviceData{ - UUID: "UUID1", - DeviceName: helper.StringToPtr("DeviceName1"), - MemoryMiB: helper.Uint64ToPtr(1), - PowerW: helper.UintToPtr(1), - BAR1MiB: nil, - }, - PowerUsageW: helper.UintToPtr(1), - GPUUtilization: helper.UintToPtr(1), - MemoryUtilization: helper.UintToPtr(1), - EncoderUtilization: helper.UintToPtr(1), - DecoderUtilization: helper.UintToPtr(1), - TemperatureC: helper.UintToPtr(1), - UsedMemoryMiB: helper.Uint64ToPtr(1), - BAR1UsedMiB: helper.Uint64ToPtr(1), - 
ECCErrorsL1Cache: helper.Uint64ToPtr(100), - ECCErrorsL2Cache: helper.Uint64ToPtr(100), - ECCErrorsDevice: helper.Uint64ToPtr(100), - }, - ExpectedResult: &device.DeviceStats{ - Summary: &structs.StatValue{ - Unit: MemoryStateUnit, - Desc: MemoryStateDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - IntDenominatorVal: helper.Int64ToPtr(1), - }, - Stats: &structs.StatObject{ - Attributes: map[string]*structs.StatValue{ - PowerUsageAttr: { - Unit: PowerUsageUnit, - Desc: PowerUsageDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - IntDenominatorVal: helper.Int64ToPtr(1), - }, - GPUUtilizationAttr: { - Unit: GPUUtilizationUnit, - Desc: GPUUtilizationDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - }, - MemoryUtilizationAttr: { - Unit: MemoryUtilizationUnit, - Desc: MemoryUtilizationDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - }, - EncoderUtilizationAttr: { - Unit: EncoderUtilizationUnit, - Desc: EncoderUtilizationDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - }, - DecoderUtilizationAttr: { - Unit: DecoderUtilizationUnit, - Desc: DecoderUtilizationDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - }, - TemperatureAttr: { - Unit: TemperatureUnit, - Desc: TemperatureDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - }, - MemoryStateAttr: { - Unit: MemoryStateUnit, - Desc: MemoryStateDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - IntDenominatorVal: helper.Int64ToPtr(1), - }, - BAR1StateAttr: { - Unit: BAR1StateUnit, - Desc: BAR1StateDesc, - StringVal: helper.StringToPtr(notAvailable), - }, - ECCErrorsL1CacheAttr: { - Unit: ECCErrorsL1CacheUnit, - Desc: ECCErrorsL1CacheDesc, - IntNumeratorVal: helper.Int64ToPtr(100), - }, - ECCErrorsL2CacheAttr: { - Unit: ECCErrorsL2CacheUnit, - Desc: ECCErrorsL2CacheDesc, - IntNumeratorVal: helper.Int64ToPtr(100), - }, - ECCErrorsDeviceAttr: { - Unit: ECCErrorsDeviceUnit, - Desc: ECCErrorsDeviceDesc, - IntNumeratorVal: helper.Int64ToPtr(100), - }, - }, - }, - Timestamp: time.Date(1974, time.May, 19, 1, 2, 3, 4, time.UTC), - }, - }, - { - 
Name: "ECCErrorsL1Cache is nil", - Timestamp: time.Date(1974, time.May, 19, 1, 2, 3, 4, time.UTC), - ItemStat: &nvml.StatsData{ - DeviceData: &nvml.DeviceData{ - UUID: "UUID1", - DeviceName: helper.StringToPtr("DeviceName1"), - MemoryMiB: helper.Uint64ToPtr(1), - PowerW: helper.UintToPtr(1), - BAR1MiB: helper.Uint64ToPtr(256), - }, - PowerUsageW: helper.UintToPtr(1), - GPUUtilization: helper.UintToPtr(1), - MemoryUtilization: helper.UintToPtr(1), - EncoderUtilization: helper.UintToPtr(1), - DecoderUtilization: helper.UintToPtr(1), - TemperatureC: helper.UintToPtr(1), - UsedMemoryMiB: helper.Uint64ToPtr(1), - BAR1UsedMiB: helper.Uint64ToPtr(1), - ECCErrorsL1Cache: nil, - ECCErrorsL2Cache: helper.Uint64ToPtr(100), - ECCErrorsDevice: helper.Uint64ToPtr(100), - }, - ExpectedResult: &device.DeviceStats{ - Summary: &structs.StatValue{ - Unit: MemoryStateUnit, - Desc: MemoryStateDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - IntDenominatorVal: helper.Int64ToPtr(1), - }, - Stats: &structs.StatObject{ - Attributes: map[string]*structs.StatValue{ - PowerUsageAttr: { - Unit: PowerUsageUnit, - Desc: PowerUsageDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - IntDenominatorVal: helper.Int64ToPtr(1), - }, - GPUUtilizationAttr: { - Unit: GPUUtilizationUnit, - Desc: GPUUtilizationDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - }, - MemoryUtilizationAttr: { - Unit: MemoryUtilizationUnit, - Desc: MemoryUtilizationDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - }, - EncoderUtilizationAttr: { - Unit: EncoderUtilizationUnit, - Desc: EncoderUtilizationDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - }, - DecoderUtilizationAttr: { - Unit: DecoderUtilizationUnit, - Desc: DecoderUtilizationDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - }, - TemperatureAttr: { - Unit: TemperatureUnit, - Desc: TemperatureDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - }, - MemoryStateAttr: { - Unit: MemoryStateUnit, - Desc: MemoryStateDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - IntDenominatorVal: 
helper.Int64ToPtr(1), - }, - BAR1StateAttr: { - Unit: BAR1StateUnit, - Desc: BAR1StateDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - IntDenominatorVal: helper.Int64ToPtr(256), - }, - ECCErrorsL1CacheAttr: { - Unit: ECCErrorsL1CacheUnit, - Desc: ECCErrorsL1CacheDesc, - StringVal: helper.StringToPtr(notAvailable), - }, - ECCErrorsL2CacheAttr: { - Unit: ECCErrorsL2CacheUnit, - Desc: ECCErrorsL2CacheDesc, - IntNumeratorVal: helper.Int64ToPtr(100), - }, - ECCErrorsDeviceAttr: { - Unit: ECCErrorsDeviceUnit, - Desc: ECCErrorsDeviceDesc, - IntNumeratorVal: helper.Int64ToPtr(100), - }, - }, - }, - Timestamp: time.Date(1974, time.May, 19, 1, 2, 3, 4, time.UTC), - }, - }, - { - Name: "ECCErrorsL2Cache is nil", - Timestamp: time.Date(1974, time.May, 19, 1, 2, 3, 4, time.UTC), - ItemStat: &nvml.StatsData{ - DeviceData: &nvml.DeviceData{ - UUID: "UUID1", - DeviceName: helper.StringToPtr("DeviceName1"), - MemoryMiB: helper.Uint64ToPtr(1), - PowerW: helper.UintToPtr(1), - BAR1MiB: helper.Uint64ToPtr(256), - }, - PowerUsageW: helper.UintToPtr(1), - GPUUtilization: helper.UintToPtr(1), - MemoryUtilization: helper.UintToPtr(1), - EncoderUtilization: helper.UintToPtr(1), - DecoderUtilization: helper.UintToPtr(1), - TemperatureC: helper.UintToPtr(1), - UsedMemoryMiB: helper.Uint64ToPtr(1), - BAR1UsedMiB: helper.Uint64ToPtr(1), - ECCErrorsL1Cache: helper.Uint64ToPtr(100), - ECCErrorsL2Cache: nil, - ECCErrorsDevice: helper.Uint64ToPtr(100), - }, - ExpectedResult: &device.DeviceStats{ - Summary: &structs.StatValue{ - Unit: MemoryStateUnit, - Desc: MemoryStateDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - IntDenominatorVal: helper.Int64ToPtr(1), - }, - Stats: &structs.StatObject{ - Attributes: map[string]*structs.StatValue{ - PowerUsageAttr: { - Unit: PowerUsageUnit, - Desc: PowerUsageDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - IntDenominatorVal: helper.Int64ToPtr(1), - }, - GPUUtilizationAttr: { - Unit: GPUUtilizationUnit, - Desc: GPUUtilizationDesc, - IntNumeratorVal: 
helper.Int64ToPtr(1), - }, - MemoryUtilizationAttr: { - Unit: MemoryUtilizationUnit, - Desc: MemoryUtilizationDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - }, - EncoderUtilizationAttr: { - Unit: EncoderUtilizationUnit, - Desc: EncoderUtilizationDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - }, - DecoderUtilizationAttr: { - Unit: DecoderUtilizationUnit, - Desc: DecoderUtilizationDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - }, - TemperatureAttr: { - Unit: TemperatureUnit, - Desc: TemperatureDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - }, - MemoryStateAttr: { - Unit: MemoryStateUnit, - Desc: MemoryStateDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - IntDenominatorVal: helper.Int64ToPtr(1), - }, - BAR1StateAttr: { - Unit: BAR1StateUnit, - Desc: BAR1StateDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - IntDenominatorVal: helper.Int64ToPtr(256), - }, - ECCErrorsL1CacheAttr: { - Unit: ECCErrorsL1CacheUnit, - Desc: ECCErrorsL1CacheDesc, - IntNumeratorVal: helper.Int64ToPtr(100), - }, - ECCErrorsL2CacheAttr: { - Unit: ECCErrorsL2CacheUnit, - Desc: ECCErrorsL2CacheDesc, - StringVal: helper.StringToPtr(notAvailable), - }, - ECCErrorsDeviceAttr: { - Unit: ECCErrorsDeviceUnit, - Desc: ECCErrorsDeviceDesc, - IntNumeratorVal: helper.Int64ToPtr(100), - }, - }, - }, - Timestamp: time.Date(1974, time.May, 19, 1, 2, 3, 4, time.UTC), - }, - }, - { - Name: "ECCErrorsDevice is nil", - Timestamp: time.Date(1974, time.May, 19, 1, 2, 3, 4, time.UTC), - ItemStat: &nvml.StatsData{ - DeviceData: &nvml.DeviceData{ - UUID: "UUID1", - DeviceName: helper.StringToPtr("DeviceName1"), - MemoryMiB: helper.Uint64ToPtr(1), - PowerW: helper.UintToPtr(1), - BAR1MiB: helper.Uint64ToPtr(256), - }, - PowerUsageW: helper.UintToPtr(1), - GPUUtilization: helper.UintToPtr(1), - MemoryUtilization: helper.UintToPtr(1), - EncoderUtilization: helper.UintToPtr(1), - DecoderUtilization: helper.UintToPtr(1), - TemperatureC: helper.UintToPtr(1), - UsedMemoryMiB: helper.Uint64ToPtr(1), - BAR1UsedMiB: 
helper.Uint64ToPtr(1), - ECCErrorsL1Cache: helper.Uint64ToPtr(100), - ECCErrorsL2Cache: helper.Uint64ToPtr(100), - ECCErrorsDevice: nil, - }, - ExpectedResult: &device.DeviceStats{ - Summary: &structs.StatValue{ - Unit: MemoryStateUnit, - Desc: MemoryStateDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - IntDenominatorVal: helper.Int64ToPtr(1), - }, - Stats: &structs.StatObject{ - Attributes: map[string]*structs.StatValue{ - PowerUsageAttr: { - Unit: PowerUsageUnit, - Desc: PowerUsageDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - IntDenominatorVal: helper.Int64ToPtr(1), - }, - GPUUtilizationAttr: { - Unit: GPUUtilizationUnit, - Desc: GPUUtilizationDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - }, - MemoryUtilizationAttr: { - Unit: MemoryUtilizationUnit, - Desc: MemoryUtilizationDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - }, - EncoderUtilizationAttr: { - Unit: EncoderUtilizationUnit, - Desc: EncoderUtilizationDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - }, - DecoderUtilizationAttr: { - Unit: DecoderUtilizationUnit, - Desc: DecoderUtilizationDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - }, - TemperatureAttr: { - Unit: TemperatureUnit, - Desc: TemperatureDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - }, - MemoryStateAttr: { - Unit: MemoryStateUnit, - Desc: MemoryStateDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - IntDenominatorVal: helper.Int64ToPtr(1), - }, - BAR1StateAttr: { - Unit: BAR1StateUnit, - Desc: BAR1StateDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - IntDenominatorVal: helper.Int64ToPtr(256), - }, - ECCErrorsL1CacheAttr: { - Unit: ECCErrorsL1CacheUnit, - Desc: ECCErrorsL1CacheDesc, - IntNumeratorVal: helper.Int64ToPtr(100), - }, - ECCErrorsL2CacheAttr: { - Unit: ECCErrorsL2CacheUnit, - Desc: ECCErrorsL2CacheDesc, - IntNumeratorVal: helper.Int64ToPtr(100), - }, - ECCErrorsDeviceAttr: { - Unit: ECCErrorsDeviceUnit, - Desc: ECCErrorsDeviceDesc, - StringVal: helper.StringToPtr(notAvailable), - }, - }, - }, - Timestamp: time.Date(1974, 
time.May, 19, 1, 2, 3, 4, time.UTC), - }, - }, - } { - actualResult := statsForItem(testCase.ItemStat, testCase.Timestamp) - require.New(t).Equal(testCase.ExpectedResult, actualResult) - } -} - -func TestStatsForGroup(t *testing.T) { - for _, testCase := range []struct { - Name string - Timestamp time.Time - GroupStats []*nvml.StatsData - GroupName string - ExpectedResult *device.DeviceGroupStats - }{ - { - Name: "make sure that all data is transformed correctly", - Timestamp: time.Date(1974, time.May, 19, 1, 2, 3, 4, time.UTC), - GroupName: "DeviceName1", - GroupStats: []*nvml.StatsData{ - { - DeviceData: &nvml.DeviceData{ - UUID: "UUID1", - DeviceName: helper.StringToPtr("DeviceName1"), - MemoryMiB: helper.Uint64ToPtr(1), - PowerW: helper.UintToPtr(1), - BAR1MiB: helper.Uint64ToPtr(256), - }, - PowerUsageW: helper.UintToPtr(1), - GPUUtilization: helper.UintToPtr(1), - MemoryUtilization: helper.UintToPtr(1), - EncoderUtilization: helper.UintToPtr(1), - DecoderUtilization: helper.UintToPtr(1), - TemperatureC: helper.UintToPtr(1), - UsedMemoryMiB: helper.Uint64ToPtr(1), - BAR1UsedMiB: helper.Uint64ToPtr(1), - ECCErrorsL1Cache: helper.Uint64ToPtr(100), - ECCErrorsL2Cache: helper.Uint64ToPtr(100), - ECCErrorsDevice: helper.Uint64ToPtr(100), - }, - { - DeviceData: &nvml.DeviceData{ - UUID: "UUID2", - DeviceName: helper.StringToPtr("DeviceName2"), - MemoryMiB: helper.Uint64ToPtr(2), - PowerW: helper.UintToPtr(2), - BAR1MiB: helper.Uint64ToPtr(256), - }, - PowerUsageW: helper.UintToPtr(2), - GPUUtilization: helper.UintToPtr(2), - MemoryUtilization: helper.UintToPtr(2), - EncoderUtilization: helper.UintToPtr(2), - DecoderUtilization: helper.UintToPtr(2), - TemperatureC: helper.UintToPtr(2), - UsedMemoryMiB: helper.Uint64ToPtr(2), - BAR1UsedMiB: helper.Uint64ToPtr(2), - ECCErrorsL1Cache: helper.Uint64ToPtr(200), - ECCErrorsL2Cache: helper.Uint64ToPtr(200), - ECCErrorsDevice: helper.Uint64ToPtr(200), - }, - { - DeviceData: &nvml.DeviceData{ - UUID: "UUID3", - DeviceName: 
helper.StringToPtr("DeviceName3"), - MemoryMiB: helper.Uint64ToPtr(3), - PowerW: helper.UintToPtr(3), - BAR1MiB: helper.Uint64ToPtr(256), - }, - PowerUsageW: helper.UintToPtr(3), - GPUUtilization: helper.UintToPtr(3), - MemoryUtilization: helper.UintToPtr(3), - EncoderUtilization: helper.UintToPtr(3), - DecoderUtilization: helper.UintToPtr(3), - TemperatureC: helper.UintToPtr(3), - UsedMemoryMiB: helper.Uint64ToPtr(3), - BAR1UsedMiB: helper.Uint64ToPtr(3), - ECCErrorsL1Cache: helper.Uint64ToPtr(300), - ECCErrorsL2Cache: helper.Uint64ToPtr(300), - ECCErrorsDevice: helper.Uint64ToPtr(300), - }, - }, - ExpectedResult: &device.DeviceGroupStats{ - Vendor: vendor, - Type: deviceType, - Name: "DeviceName1", - InstanceStats: map[string]*device.DeviceStats{ - "UUID1": { - Summary: &structs.StatValue{ - Unit: MemoryStateUnit, - Desc: MemoryStateDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - IntDenominatorVal: helper.Int64ToPtr(1), - }, - Stats: &structs.StatObject{ - Attributes: map[string]*structs.StatValue{ - PowerUsageAttr: { - Unit: PowerUsageUnit, - Desc: PowerUsageDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - IntDenominatorVal: helper.Int64ToPtr(1), - }, - GPUUtilizationAttr: { - Unit: GPUUtilizationUnit, - Desc: GPUUtilizationDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - }, - MemoryUtilizationAttr: { - Unit: MemoryUtilizationUnit, - Desc: MemoryUtilizationDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - }, - EncoderUtilizationAttr: { - Unit: EncoderUtilizationUnit, - Desc: EncoderUtilizationDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - }, - DecoderUtilizationAttr: { - Unit: DecoderUtilizationUnit, - Desc: DecoderUtilizationDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - }, - TemperatureAttr: { - Unit: TemperatureUnit, - Desc: TemperatureDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - }, - MemoryStateAttr: { - Unit: MemoryStateUnit, - Desc: MemoryStateDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - IntDenominatorVal: helper.Int64ToPtr(1), - }, - 
BAR1StateAttr: { - Unit: BAR1StateUnit, - Desc: BAR1StateDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - IntDenominatorVal: helper.Int64ToPtr(256), - }, - ECCErrorsL1CacheAttr: { - Unit: ECCErrorsL1CacheUnit, - Desc: ECCErrorsL1CacheDesc, - IntNumeratorVal: helper.Int64ToPtr(100), - }, - ECCErrorsL2CacheAttr: { - Unit: ECCErrorsL2CacheUnit, - Desc: ECCErrorsL2CacheDesc, - IntNumeratorVal: helper.Int64ToPtr(100), - }, - ECCErrorsDeviceAttr: { - Unit: ECCErrorsDeviceUnit, - Desc: ECCErrorsDeviceDesc, - IntNumeratorVal: helper.Int64ToPtr(100), - }, - }, - }, - Timestamp: time.Date(1974, time.May, 19, 1, 2, 3, 4, time.UTC), - }, - "UUID2": { - Summary: &structs.StatValue{ - Unit: MemoryStateUnit, - Desc: MemoryStateDesc, - IntNumeratorVal: helper.Int64ToPtr(2), - IntDenominatorVal: helper.Int64ToPtr(2), - }, - Stats: &structs.StatObject{ - Attributes: map[string]*structs.StatValue{ - PowerUsageAttr: { - Unit: PowerUsageUnit, - Desc: PowerUsageDesc, - IntNumeratorVal: helper.Int64ToPtr(2), - IntDenominatorVal: helper.Int64ToPtr(2), - }, - GPUUtilizationAttr: { - Unit: GPUUtilizationUnit, - Desc: GPUUtilizationDesc, - IntNumeratorVal: helper.Int64ToPtr(2), - }, - MemoryUtilizationAttr: { - Unit: MemoryUtilizationUnit, - Desc: MemoryUtilizationDesc, - IntNumeratorVal: helper.Int64ToPtr(2), - }, - EncoderUtilizationAttr: { - Unit: EncoderUtilizationUnit, - Desc: EncoderUtilizationDesc, - IntNumeratorVal: helper.Int64ToPtr(2), - }, - DecoderUtilizationAttr: { - Unit: DecoderUtilizationUnit, - Desc: DecoderUtilizationDesc, - IntNumeratorVal: helper.Int64ToPtr(2), - }, - TemperatureAttr: { - Unit: TemperatureUnit, - Desc: TemperatureDesc, - IntNumeratorVal: helper.Int64ToPtr(2), - }, - MemoryStateAttr: { - Unit: MemoryStateUnit, - Desc: MemoryStateDesc, - IntNumeratorVal: helper.Int64ToPtr(2), - IntDenominatorVal: helper.Int64ToPtr(2), - }, - BAR1StateAttr: { - Unit: BAR1StateUnit, - Desc: BAR1StateDesc, - IntNumeratorVal: helper.Int64ToPtr(2), - IntDenominatorVal: 
helper.Int64ToPtr(256), - }, - ECCErrorsL1CacheAttr: { - Unit: ECCErrorsL1CacheUnit, - Desc: ECCErrorsL1CacheDesc, - IntNumeratorVal: helper.Int64ToPtr(200), - }, - ECCErrorsL2CacheAttr: { - Unit: ECCErrorsL2CacheUnit, - Desc: ECCErrorsL2CacheDesc, - IntNumeratorVal: helper.Int64ToPtr(200), - }, - ECCErrorsDeviceAttr: { - Unit: ECCErrorsDeviceUnit, - Desc: ECCErrorsDeviceDesc, - IntNumeratorVal: helper.Int64ToPtr(200), - }, - }, - }, - Timestamp: time.Date(1974, time.May, 19, 1, 2, 3, 4, time.UTC), - }, - "UUID3": { - Summary: &structs.StatValue{ - Unit: MemoryStateUnit, - Desc: MemoryStateDesc, - IntNumeratorVal: helper.Int64ToPtr(3), - IntDenominatorVal: helper.Int64ToPtr(3), - }, - Stats: &structs.StatObject{ - Attributes: map[string]*structs.StatValue{ - PowerUsageAttr: { - Unit: PowerUsageUnit, - Desc: PowerUsageDesc, - IntNumeratorVal: helper.Int64ToPtr(3), - IntDenominatorVal: helper.Int64ToPtr(3), - }, - GPUUtilizationAttr: { - Unit: GPUUtilizationUnit, - Desc: GPUUtilizationDesc, - IntNumeratorVal: helper.Int64ToPtr(3), - }, - MemoryUtilizationAttr: { - Unit: MemoryUtilizationUnit, - Desc: MemoryUtilizationDesc, - IntNumeratorVal: helper.Int64ToPtr(3), - }, - EncoderUtilizationAttr: { - Unit: EncoderUtilizationUnit, - Desc: EncoderUtilizationDesc, - IntNumeratorVal: helper.Int64ToPtr(3), - }, - DecoderUtilizationAttr: { - Unit: DecoderUtilizationUnit, - Desc: DecoderUtilizationDesc, - IntNumeratorVal: helper.Int64ToPtr(3), - }, - TemperatureAttr: { - Unit: TemperatureUnit, - Desc: TemperatureDesc, - IntNumeratorVal: helper.Int64ToPtr(3), - }, - MemoryStateAttr: { - Unit: MemoryStateUnit, - Desc: MemoryStateDesc, - IntNumeratorVal: helper.Int64ToPtr(3), - IntDenominatorVal: helper.Int64ToPtr(3), - }, - BAR1StateAttr: { - Unit: BAR1StateUnit, - Desc: BAR1StateDesc, - IntNumeratorVal: helper.Int64ToPtr(3), - IntDenominatorVal: helper.Int64ToPtr(256), - }, - ECCErrorsL1CacheAttr: { - Unit: ECCErrorsL1CacheUnit, - Desc: ECCErrorsL1CacheDesc, - IntNumeratorVal: 
helper.Int64ToPtr(300), - }, - ECCErrorsL2CacheAttr: { - Unit: ECCErrorsL2CacheUnit, - Desc: ECCErrorsL2CacheDesc, - IntNumeratorVal: helper.Int64ToPtr(300), - }, - ECCErrorsDeviceAttr: { - Unit: ECCErrorsDeviceUnit, - Desc: ECCErrorsDeviceDesc, - IntNumeratorVal: helper.Int64ToPtr(300), - }, - }, - }, - Timestamp: time.Date(1974, time.May, 19, 1, 2, 3, 4, time.UTC), - }, - }, - }, - }, - } { - actualResult := statsForGroup(testCase.GroupName, testCase.GroupStats, testCase.Timestamp) - require.New(t).Equal(testCase.ExpectedResult, actualResult) - } -} - -func TestWriteStatsToChannel(t *testing.T) { - for _, testCase := range []struct { - Name string - ExpectedWriteToChannel *device.StatsResponse - Timestamp time.Time - Device *NvidiaDevice - }{ - { - Name: "NVML wrapper returns error", - Timestamp: time.Date(1974, time.May, 19, 1, 2, 3, 4, time.UTC), - ExpectedWriteToChannel: &device.StatsResponse{ - Error: errors.New(""), - }, - Device: &NvidiaDevice{ - nvmlClient: &MockNvmlClient{ - StatsError: errors.New(""), - }, - logger: hclog.NewNullLogger(), - }, - }, - { - Name: "Check that stats with multiple DeviceNames are assigned to different groups", - Timestamp: time.Date(1974, time.May, 19, 1, 2, 3, 4, time.UTC), - Device: &NvidiaDevice{ - devices: map[string]struct{}{ - "UUID1": {}, - "UUID2": {}, - "UUID3": {}, - }, - nvmlClient: &MockNvmlClient{ - StatsResponseReturned: []*nvml.StatsData{ - { - DeviceData: &nvml.DeviceData{ - UUID: "UUID1", - DeviceName: helper.StringToPtr("DeviceName1"), - MemoryMiB: helper.Uint64ToPtr(1), - PowerW: helper.UintToPtr(1), - BAR1MiB: helper.Uint64ToPtr(256), - }, - PowerUsageW: helper.UintToPtr(1), - GPUUtilization: helper.UintToPtr(1), - MemoryUtilization: helper.UintToPtr(1), - EncoderUtilization: helper.UintToPtr(1), - DecoderUtilization: helper.UintToPtr(1), - TemperatureC: helper.UintToPtr(1), - UsedMemoryMiB: helper.Uint64ToPtr(1), - BAR1UsedMiB: helper.Uint64ToPtr(1), - ECCErrorsL1Cache: helper.Uint64ToPtr(100), - 
ECCErrorsL2Cache: helper.Uint64ToPtr(100), - ECCErrorsDevice: helper.Uint64ToPtr(100), - }, - { - DeviceData: &nvml.DeviceData{ - UUID: "UUID2", - DeviceName: helper.StringToPtr("DeviceName2"), - MemoryMiB: helper.Uint64ToPtr(2), - PowerW: helper.UintToPtr(2), - BAR1MiB: helper.Uint64ToPtr(256), - }, - PowerUsageW: helper.UintToPtr(2), - GPUUtilization: helper.UintToPtr(2), - MemoryUtilization: helper.UintToPtr(2), - EncoderUtilization: helper.UintToPtr(2), - DecoderUtilization: helper.UintToPtr(2), - TemperatureC: helper.UintToPtr(2), - UsedMemoryMiB: helper.Uint64ToPtr(2), - BAR1UsedMiB: helper.Uint64ToPtr(2), - ECCErrorsL1Cache: helper.Uint64ToPtr(200), - ECCErrorsL2Cache: helper.Uint64ToPtr(200), - ECCErrorsDevice: helper.Uint64ToPtr(200), - }, - { - DeviceData: &nvml.DeviceData{ - UUID: "UUID3", - DeviceName: helper.StringToPtr("DeviceName3"), - MemoryMiB: helper.Uint64ToPtr(3), - PowerW: helper.UintToPtr(3), - BAR1MiB: helper.Uint64ToPtr(256), - }, - PowerUsageW: helper.UintToPtr(3), - GPUUtilization: helper.UintToPtr(3), - MemoryUtilization: helper.UintToPtr(3), - EncoderUtilization: helper.UintToPtr(3), - DecoderUtilization: helper.UintToPtr(3), - TemperatureC: helper.UintToPtr(3), - UsedMemoryMiB: helper.Uint64ToPtr(3), - BAR1UsedMiB: helper.Uint64ToPtr(3), - ECCErrorsL1Cache: helper.Uint64ToPtr(300), - ECCErrorsL2Cache: helper.Uint64ToPtr(300), - ECCErrorsDevice: helper.Uint64ToPtr(300), - }, - }, - }, - logger: hclog.NewNullLogger(), - }, - ExpectedWriteToChannel: &device.StatsResponse{ - Groups: []*device.DeviceGroupStats{ - { - Vendor: vendor, - Type: deviceType, - Name: "DeviceName1", - InstanceStats: map[string]*device.DeviceStats{ - "UUID1": { - Summary: &structs.StatValue{ - Unit: MemoryStateUnit, - Desc: MemoryStateDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - IntDenominatorVal: helper.Int64ToPtr(1), - }, - Stats: &structs.StatObject{ - Attributes: map[string]*structs.StatValue{ - PowerUsageAttr: { - Unit: PowerUsageUnit, - Desc: 
PowerUsageDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - IntDenominatorVal: helper.Int64ToPtr(1), - }, - GPUUtilizationAttr: { - Unit: GPUUtilizationUnit, - Desc: GPUUtilizationDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - }, - MemoryUtilizationAttr: { - Unit: MemoryUtilizationUnit, - Desc: MemoryUtilizationDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - }, - EncoderUtilizationAttr: { - Unit: EncoderUtilizationUnit, - Desc: EncoderUtilizationDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - }, - DecoderUtilizationAttr: { - Unit: DecoderUtilizationUnit, - Desc: DecoderUtilizationDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - }, - TemperatureAttr: { - Unit: TemperatureUnit, - Desc: TemperatureDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - }, - MemoryStateAttr: { - Unit: MemoryStateUnit, - Desc: MemoryStateDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - IntDenominatorVal: helper.Int64ToPtr(1), - }, - BAR1StateAttr: { - Unit: BAR1StateUnit, - Desc: BAR1StateDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - IntDenominatorVal: helper.Int64ToPtr(256), - }, - ECCErrorsL1CacheAttr: { - Unit: ECCErrorsL1CacheUnit, - Desc: ECCErrorsL1CacheDesc, - IntNumeratorVal: helper.Int64ToPtr(100), - }, - ECCErrorsL2CacheAttr: { - Unit: ECCErrorsL2CacheUnit, - Desc: ECCErrorsL2CacheDesc, - IntNumeratorVal: helper.Int64ToPtr(100), - }, - ECCErrorsDeviceAttr: { - Unit: ECCErrorsDeviceUnit, - Desc: ECCErrorsDeviceDesc, - IntNumeratorVal: helper.Int64ToPtr(100), - }, - }, - }, - Timestamp: time.Date(1974, time.May, 19, 1, 2, 3, 4, time.UTC), - }, - }, - }, - { - Vendor: vendor, - Type: deviceType, - Name: "DeviceName2", - InstanceStats: map[string]*device.DeviceStats{ - "UUID2": { - Summary: &structs.StatValue{ - Unit: MemoryStateUnit, - Desc: MemoryStateDesc, - IntNumeratorVal: helper.Int64ToPtr(2), - IntDenominatorVal: helper.Int64ToPtr(2), - }, - Stats: &structs.StatObject{ - Attributes: map[string]*structs.StatValue{ - PowerUsageAttr: { - Unit: PowerUsageUnit, - Desc: 
PowerUsageDesc, - IntNumeratorVal: helper.Int64ToPtr(2), - IntDenominatorVal: helper.Int64ToPtr(2), - }, - GPUUtilizationAttr: { - Unit: GPUUtilizationUnit, - Desc: GPUUtilizationDesc, - IntNumeratorVal: helper.Int64ToPtr(2), - }, - MemoryUtilizationAttr: { - Unit: MemoryUtilizationUnit, - Desc: MemoryUtilizationDesc, - IntNumeratorVal: helper.Int64ToPtr(2), - }, - EncoderUtilizationAttr: { - Unit: EncoderUtilizationUnit, - Desc: EncoderUtilizationDesc, - IntNumeratorVal: helper.Int64ToPtr(2), - }, - DecoderUtilizationAttr: { - Unit: DecoderUtilizationUnit, - Desc: DecoderUtilizationDesc, - IntNumeratorVal: helper.Int64ToPtr(2), - }, - TemperatureAttr: { - Unit: TemperatureUnit, - Desc: TemperatureDesc, - IntNumeratorVal: helper.Int64ToPtr(2), - }, - MemoryStateAttr: { - Unit: MemoryStateUnit, - Desc: MemoryStateDesc, - IntNumeratorVal: helper.Int64ToPtr(2), - IntDenominatorVal: helper.Int64ToPtr(2), - }, - BAR1StateAttr: { - Unit: BAR1StateUnit, - Desc: BAR1StateDesc, - IntNumeratorVal: helper.Int64ToPtr(2), - IntDenominatorVal: helper.Int64ToPtr(256), - }, - ECCErrorsL1CacheAttr: { - Unit: ECCErrorsL1CacheUnit, - Desc: ECCErrorsL1CacheDesc, - IntNumeratorVal: helper.Int64ToPtr(200), - }, - ECCErrorsL2CacheAttr: { - Unit: ECCErrorsL2CacheUnit, - Desc: ECCErrorsL2CacheDesc, - IntNumeratorVal: helper.Int64ToPtr(200), - }, - ECCErrorsDeviceAttr: { - Unit: ECCErrorsDeviceUnit, - Desc: ECCErrorsDeviceDesc, - IntNumeratorVal: helper.Int64ToPtr(200), - }, - }, - }, - Timestamp: time.Date(1974, time.May, 19, 1, 2, 3, 4, time.UTC), - }, - }, - }, - { - Vendor: vendor, - Type: deviceType, - Name: "DeviceName3", - InstanceStats: map[string]*device.DeviceStats{ - "UUID3": { - Summary: &structs.StatValue{ - Unit: MemoryStateUnit, - Desc: MemoryStateDesc, - IntNumeratorVal: helper.Int64ToPtr(3), - IntDenominatorVal: helper.Int64ToPtr(3), - }, - Stats: &structs.StatObject{ - Attributes: map[string]*structs.StatValue{ - PowerUsageAttr: { - Unit: PowerUsageUnit, - Desc: 
PowerUsageDesc, - IntNumeratorVal: helper.Int64ToPtr(3), - IntDenominatorVal: helper.Int64ToPtr(3), - }, - GPUUtilizationAttr: { - Unit: GPUUtilizationUnit, - Desc: GPUUtilizationDesc, - IntNumeratorVal: helper.Int64ToPtr(3), - }, - MemoryUtilizationAttr: { - Unit: MemoryUtilizationUnit, - Desc: MemoryUtilizationDesc, - IntNumeratorVal: helper.Int64ToPtr(3), - }, - EncoderUtilizationAttr: { - Unit: EncoderUtilizationUnit, - Desc: EncoderUtilizationDesc, - IntNumeratorVal: helper.Int64ToPtr(3), - }, - DecoderUtilizationAttr: { - Unit: DecoderUtilizationUnit, - Desc: DecoderUtilizationDesc, - IntNumeratorVal: helper.Int64ToPtr(3), - }, - TemperatureAttr: { - Unit: TemperatureUnit, - Desc: TemperatureDesc, - IntNumeratorVal: helper.Int64ToPtr(3), - }, - MemoryStateAttr: { - Unit: MemoryStateUnit, - Desc: MemoryStateDesc, - IntNumeratorVal: helper.Int64ToPtr(3), - IntDenominatorVal: helper.Int64ToPtr(3), - }, - BAR1StateAttr: { - Unit: BAR1StateUnit, - Desc: BAR1StateDesc, - IntNumeratorVal: helper.Int64ToPtr(3), - IntDenominatorVal: helper.Int64ToPtr(256), - }, - ECCErrorsL1CacheAttr: { - Unit: ECCErrorsL1CacheUnit, - Desc: ECCErrorsL1CacheDesc, - IntNumeratorVal: helper.Int64ToPtr(300), - }, - ECCErrorsL2CacheAttr: { - Unit: ECCErrorsL2CacheUnit, - Desc: ECCErrorsL2CacheDesc, - IntNumeratorVal: helper.Int64ToPtr(300), - }, - ECCErrorsDeviceAttr: { - Unit: ECCErrorsDeviceUnit, - Desc: ECCErrorsDeviceDesc, - IntNumeratorVal: helper.Int64ToPtr(300), - }, - }, - }, - Timestamp: time.Date(1974, time.May, 19, 1, 2, 3, 4, time.UTC), - }, - }, - }, - }, - }, - }, - { - Name: "Check that stats with multiple DeviceNames are assigned to different groups 2", - Timestamp: time.Date(1974, time.May, 19, 1, 2, 3, 4, time.UTC), - Device: &NvidiaDevice{ - devices: map[string]struct{}{ - "UUID1": {}, - "UUID2": {}, - "UUID3": {}, - }, - nvmlClient: &MockNvmlClient{ - StatsResponseReturned: []*nvml.StatsData{ - { - DeviceData: &nvml.DeviceData{ - UUID: "UUID1", - DeviceName: 
helper.StringToPtr("DeviceName1"), - MemoryMiB: helper.Uint64ToPtr(1), - PowerW: helper.UintToPtr(1), - BAR1MiB: helper.Uint64ToPtr(256), - }, - PowerUsageW: helper.UintToPtr(1), - GPUUtilization: helper.UintToPtr(1), - MemoryUtilization: helper.UintToPtr(1), - EncoderUtilization: helper.UintToPtr(1), - DecoderUtilization: helper.UintToPtr(1), - TemperatureC: helper.UintToPtr(1), - UsedMemoryMiB: helper.Uint64ToPtr(1), - BAR1UsedMiB: helper.Uint64ToPtr(1), - ECCErrorsL1Cache: helper.Uint64ToPtr(100), - ECCErrorsL2Cache: helper.Uint64ToPtr(100), - ECCErrorsDevice: helper.Uint64ToPtr(100), - }, - { - DeviceData: &nvml.DeviceData{ - UUID: "UUID2", - DeviceName: helper.StringToPtr("DeviceName2"), - MemoryMiB: helper.Uint64ToPtr(2), - PowerW: helper.UintToPtr(2), - BAR1MiB: helper.Uint64ToPtr(256), - }, - PowerUsageW: helper.UintToPtr(2), - GPUUtilization: helper.UintToPtr(2), - MemoryUtilization: helper.UintToPtr(2), - EncoderUtilization: helper.UintToPtr(2), - DecoderUtilization: helper.UintToPtr(2), - TemperatureC: helper.UintToPtr(2), - UsedMemoryMiB: helper.Uint64ToPtr(2), - BAR1UsedMiB: helper.Uint64ToPtr(2), - ECCErrorsL1Cache: helper.Uint64ToPtr(200), - ECCErrorsL2Cache: helper.Uint64ToPtr(200), - ECCErrorsDevice: helper.Uint64ToPtr(200), - }, - { - DeviceData: &nvml.DeviceData{ - UUID: "UUID3", - DeviceName: helper.StringToPtr("DeviceName2"), - MemoryMiB: helper.Uint64ToPtr(3), - PowerW: helper.UintToPtr(3), - BAR1MiB: helper.Uint64ToPtr(256), - }, - PowerUsageW: helper.UintToPtr(3), - GPUUtilization: helper.UintToPtr(3), - MemoryUtilization: helper.UintToPtr(3), - EncoderUtilization: helper.UintToPtr(3), - DecoderUtilization: helper.UintToPtr(3), - TemperatureC: helper.UintToPtr(3), - UsedMemoryMiB: helper.Uint64ToPtr(3), - BAR1UsedMiB: helper.Uint64ToPtr(3), - ECCErrorsL1Cache: helper.Uint64ToPtr(300), - ECCErrorsL2Cache: helper.Uint64ToPtr(300), - ECCErrorsDevice: helper.Uint64ToPtr(300), - }, - }, - }, - logger: hclog.NewNullLogger(), - }, - 
ExpectedWriteToChannel: &device.StatsResponse{ - Groups: []*device.DeviceGroupStats{ - { - Vendor: vendor, - Type: deviceType, - Name: "DeviceName1", - InstanceStats: map[string]*device.DeviceStats{ - "UUID1": { - Summary: &structs.StatValue{ - Unit: MemoryStateUnit, - Desc: MemoryStateDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - IntDenominatorVal: helper.Int64ToPtr(1), - }, - Stats: &structs.StatObject{ - Attributes: map[string]*structs.StatValue{ - PowerUsageAttr: { - Unit: PowerUsageUnit, - Desc: PowerUsageDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - IntDenominatorVal: helper.Int64ToPtr(1), - }, - GPUUtilizationAttr: { - Unit: GPUUtilizationUnit, - Desc: GPUUtilizationDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - }, - MemoryUtilizationAttr: { - Unit: MemoryUtilizationUnit, - Desc: MemoryUtilizationDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - }, - EncoderUtilizationAttr: { - Unit: EncoderUtilizationUnit, - Desc: EncoderUtilizationDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - }, - DecoderUtilizationAttr: { - Unit: DecoderUtilizationUnit, - Desc: DecoderUtilizationDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - }, - TemperatureAttr: { - Unit: TemperatureUnit, - Desc: TemperatureDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - }, - MemoryStateAttr: { - Unit: MemoryStateUnit, - Desc: MemoryStateDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - IntDenominatorVal: helper.Int64ToPtr(1), - }, - BAR1StateAttr: { - Unit: BAR1StateUnit, - Desc: BAR1StateDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - IntDenominatorVal: helper.Int64ToPtr(256), - }, - ECCErrorsL1CacheAttr: { - Unit: ECCErrorsL1CacheUnit, - Desc: ECCErrorsL1CacheDesc, - IntNumeratorVal: helper.Int64ToPtr(100), - }, - ECCErrorsL2CacheAttr: { - Unit: ECCErrorsL2CacheUnit, - Desc: ECCErrorsL2CacheDesc, - IntNumeratorVal: helper.Int64ToPtr(100), - }, - ECCErrorsDeviceAttr: { - Unit: ECCErrorsDeviceUnit, - Desc: ECCErrorsDeviceDesc, - IntNumeratorVal: helper.Int64ToPtr(100), - }, - }, - }, - 
Timestamp: time.Date(1974, time.May, 19, 1, 2, 3, 4, time.UTC), - }, - }, - }, - { - Vendor: vendor, - Type: deviceType, - Name: "DeviceName2", - InstanceStats: map[string]*device.DeviceStats{ - "UUID3": { - Summary: &structs.StatValue{ - Unit: MemoryStateUnit, - Desc: MemoryStateDesc, - IntNumeratorVal: helper.Int64ToPtr(3), - IntDenominatorVal: helper.Int64ToPtr(3), - }, - Stats: &structs.StatObject{ - Attributes: map[string]*structs.StatValue{ - PowerUsageAttr: { - Unit: PowerUsageUnit, - Desc: PowerUsageDesc, - IntNumeratorVal: helper.Int64ToPtr(3), - IntDenominatorVal: helper.Int64ToPtr(3), - }, - GPUUtilizationAttr: { - Unit: GPUUtilizationUnit, - Desc: GPUUtilizationDesc, - IntNumeratorVal: helper.Int64ToPtr(3), - }, - MemoryUtilizationAttr: { - Unit: MemoryUtilizationUnit, - Desc: MemoryUtilizationDesc, - IntNumeratorVal: helper.Int64ToPtr(3), - }, - EncoderUtilizationAttr: { - Unit: EncoderUtilizationUnit, - Desc: EncoderUtilizationDesc, - IntNumeratorVal: helper.Int64ToPtr(3), - }, - DecoderUtilizationAttr: { - Unit: DecoderUtilizationUnit, - Desc: DecoderUtilizationDesc, - IntNumeratorVal: helper.Int64ToPtr(3), - }, - TemperatureAttr: { - Unit: TemperatureUnit, - Desc: TemperatureDesc, - IntNumeratorVal: helper.Int64ToPtr(3), - }, - MemoryStateAttr: { - Unit: MemoryStateUnit, - Desc: MemoryStateDesc, - IntNumeratorVal: helper.Int64ToPtr(3), - IntDenominatorVal: helper.Int64ToPtr(3), - }, - BAR1StateAttr: { - Unit: BAR1StateUnit, - Desc: BAR1StateDesc, - IntNumeratorVal: helper.Int64ToPtr(3), - IntDenominatorVal: helper.Int64ToPtr(256), - }, - ECCErrorsL1CacheAttr: { - Unit: ECCErrorsL1CacheUnit, - Desc: ECCErrorsL1CacheDesc, - IntNumeratorVal: helper.Int64ToPtr(300), - }, - ECCErrorsL2CacheAttr: { - Unit: ECCErrorsL2CacheUnit, - Desc: ECCErrorsL2CacheDesc, - IntNumeratorVal: helper.Int64ToPtr(300), - }, - ECCErrorsDeviceAttr: { - Unit: ECCErrorsDeviceUnit, - Desc: ECCErrorsDeviceDesc, - IntNumeratorVal: helper.Int64ToPtr(300), - }, - }, - }, - Timestamp: 
time.Date(1974, time.May, 19, 1, 2, 3, 4, time.UTC), - }, - "UUID2": { - Summary: &structs.StatValue{ - Unit: MemoryStateUnit, - Desc: MemoryStateDesc, - IntNumeratorVal: helper.Int64ToPtr(2), - IntDenominatorVal: helper.Int64ToPtr(2), - }, - Stats: &structs.StatObject{ - Attributes: map[string]*structs.StatValue{ - PowerUsageAttr: { - Unit: PowerUsageUnit, - Desc: PowerUsageDesc, - IntNumeratorVal: helper.Int64ToPtr(2), - IntDenominatorVal: helper.Int64ToPtr(2), - }, - GPUUtilizationAttr: { - Unit: GPUUtilizationUnit, - Desc: GPUUtilizationDesc, - IntNumeratorVal: helper.Int64ToPtr(2), - }, - MemoryUtilizationAttr: { - Unit: MemoryUtilizationUnit, - Desc: MemoryUtilizationDesc, - IntNumeratorVal: helper.Int64ToPtr(2), - }, - EncoderUtilizationAttr: { - Unit: EncoderUtilizationUnit, - Desc: EncoderUtilizationDesc, - IntNumeratorVal: helper.Int64ToPtr(2), - }, - DecoderUtilizationAttr: { - Unit: DecoderUtilizationUnit, - Desc: DecoderUtilizationDesc, - IntNumeratorVal: helper.Int64ToPtr(2), - }, - TemperatureAttr: { - Unit: TemperatureUnit, - Desc: TemperatureDesc, - IntNumeratorVal: helper.Int64ToPtr(2), - }, - MemoryStateAttr: { - Unit: MemoryStateUnit, - Desc: MemoryStateDesc, - IntNumeratorVal: helper.Int64ToPtr(2), - IntDenominatorVal: helper.Int64ToPtr(2), - }, - BAR1StateAttr: { - Unit: BAR1StateUnit, - Desc: BAR1StateDesc, - IntNumeratorVal: helper.Int64ToPtr(2), - IntDenominatorVal: helper.Int64ToPtr(256), - }, - ECCErrorsL1CacheAttr: { - Unit: ECCErrorsL1CacheUnit, - Desc: ECCErrorsL1CacheDesc, - IntNumeratorVal: helper.Int64ToPtr(200), - }, - ECCErrorsL2CacheAttr: { - Unit: ECCErrorsL2CacheUnit, - Desc: ECCErrorsL2CacheDesc, - IntNumeratorVal: helper.Int64ToPtr(200), - }, - ECCErrorsDeviceAttr: { - Unit: ECCErrorsDeviceUnit, - Desc: ECCErrorsDeviceDesc, - IntNumeratorVal: helper.Int64ToPtr(200), - }, - }, - }, - Timestamp: time.Date(1974, time.May, 19, 1, 2, 3, 4, time.UTC), - }, - }, - }, - }, - }, - }, - { - Name: "Check that only devices from 
NvidiaDevice.device map stats are reported", - Timestamp: time.Date(1974, time.May, 19, 1, 2, 3, 4, time.UTC), - Device: &NvidiaDevice{ - devices: map[string]struct{}{ - "UUID1": {}, - "UUID2": {}, - }, - nvmlClient: &MockNvmlClient{ - StatsResponseReturned: []*nvml.StatsData{ - { - DeviceData: &nvml.DeviceData{ - UUID: "UUID1", - DeviceName: helper.StringToPtr("DeviceName1"), - MemoryMiB: helper.Uint64ToPtr(1), - PowerW: helper.UintToPtr(1), - BAR1MiB: helper.Uint64ToPtr(256), - }, - PowerUsageW: helper.UintToPtr(1), - GPUUtilization: helper.UintToPtr(1), - MemoryUtilization: helper.UintToPtr(1), - EncoderUtilization: helper.UintToPtr(1), - DecoderUtilization: helper.UintToPtr(1), - TemperatureC: helper.UintToPtr(1), - UsedMemoryMiB: helper.Uint64ToPtr(1), - BAR1UsedMiB: helper.Uint64ToPtr(1), - ECCErrorsL1Cache: helper.Uint64ToPtr(100), - ECCErrorsL2Cache: helper.Uint64ToPtr(100), - ECCErrorsDevice: helper.Uint64ToPtr(100), - }, - { - DeviceData: &nvml.DeviceData{ - UUID: "UUID2", - DeviceName: helper.StringToPtr("DeviceName2"), - MemoryMiB: helper.Uint64ToPtr(2), - PowerW: helper.UintToPtr(2), - BAR1MiB: helper.Uint64ToPtr(256), - }, - PowerUsageW: helper.UintToPtr(2), - GPUUtilization: helper.UintToPtr(2), - MemoryUtilization: helper.UintToPtr(2), - EncoderUtilization: helper.UintToPtr(2), - DecoderUtilization: helper.UintToPtr(2), - TemperatureC: helper.UintToPtr(2), - UsedMemoryMiB: helper.Uint64ToPtr(2), - BAR1UsedMiB: helper.Uint64ToPtr(2), - ECCErrorsL1Cache: helper.Uint64ToPtr(200), - ECCErrorsL2Cache: helper.Uint64ToPtr(200), - ECCErrorsDevice: helper.Uint64ToPtr(200), - }, - { - DeviceData: &nvml.DeviceData{ - UUID: "UUID3", - DeviceName: helper.StringToPtr("DeviceName3"), - MemoryMiB: helper.Uint64ToPtr(3), - PowerW: helper.UintToPtr(3), - BAR1MiB: helper.Uint64ToPtr(256), - }, - PowerUsageW: helper.UintToPtr(3), - GPUUtilization: helper.UintToPtr(3), - MemoryUtilization: helper.UintToPtr(3), - EncoderUtilization: helper.UintToPtr(3), - 
DecoderUtilization: helper.UintToPtr(3), - TemperatureC: helper.UintToPtr(3), - UsedMemoryMiB: helper.Uint64ToPtr(3), - BAR1UsedMiB: helper.Uint64ToPtr(3), - ECCErrorsL1Cache: helper.Uint64ToPtr(300), - ECCErrorsL2Cache: helper.Uint64ToPtr(300), - ECCErrorsDevice: helper.Uint64ToPtr(300), - }, - }, - }, - logger: hclog.NewNullLogger(), - }, - ExpectedWriteToChannel: &device.StatsResponse{ - Groups: []*device.DeviceGroupStats{ - { - Vendor: vendor, - Type: deviceType, - Name: "DeviceName1", - InstanceStats: map[string]*device.DeviceStats{ - "UUID1": { - Summary: &structs.StatValue{ - Unit: MemoryStateUnit, - Desc: MemoryStateDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - IntDenominatorVal: helper.Int64ToPtr(1), - }, - Stats: &structs.StatObject{ - Attributes: map[string]*structs.StatValue{ - PowerUsageAttr: { - Unit: PowerUsageUnit, - Desc: PowerUsageDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - IntDenominatorVal: helper.Int64ToPtr(1), - }, - GPUUtilizationAttr: { - Unit: GPUUtilizationUnit, - Desc: GPUUtilizationDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - }, - MemoryUtilizationAttr: { - Unit: MemoryUtilizationUnit, - Desc: MemoryUtilizationDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - }, - EncoderUtilizationAttr: { - Unit: EncoderUtilizationUnit, - Desc: EncoderUtilizationDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - }, - DecoderUtilizationAttr: { - Unit: DecoderUtilizationUnit, - Desc: DecoderUtilizationDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - }, - TemperatureAttr: { - Unit: TemperatureUnit, - Desc: TemperatureDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - }, - MemoryStateAttr: { - Unit: MemoryStateUnit, - Desc: MemoryStateDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - IntDenominatorVal: helper.Int64ToPtr(1), - }, - BAR1StateAttr: { - Unit: BAR1StateUnit, - Desc: BAR1StateDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - IntDenominatorVal: helper.Int64ToPtr(256), - }, - ECCErrorsL1CacheAttr: { - Unit: ECCErrorsL1CacheUnit, - Desc: 
ECCErrorsL1CacheDesc, - IntNumeratorVal: helper.Int64ToPtr(100), - }, - ECCErrorsL2CacheAttr: { - Unit: ECCErrorsL2CacheUnit, - Desc: ECCErrorsL2CacheDesc, - IntNumeratorVal: helper.Int64ToPtr(100), - }, - ECCErrorsDeviceAttr: { - Unit: ECCErrorsDeviceUnit, - Desc: ECCErrorsDeviceDesc, - IntNumeratorVal: helper.Int64ToPtr(100), - }, - }, - }, - Timestamp: time.Date(1974, time.May, 19, 1, 2, 3, 4, time.UTC), - }, - }, - }, - { - Vendor: vendor, - Type: deviceType, - Name: "DeviceName2", - InstanceStats: map[string]*device.DeviceStats{ - "UUID2": { - Summary: &structs.StatValue{ - Unit: MemoryStateUnit, - Desc: MemoryStateDesc, - IntNumeratorVal: helper.Int64ToPtr(2), - IntDenominatorVal: helper.Int64ToPtr(2), - }, - Stats: &structs.StatObject{ - Attributes: map[string]*structs.StatValue{ - PowerUsageAttr: { - Unit: PowerUsageUnit, - Desc: PowerUsageDesc, - IntNumeratorVal: helper.Int64ToPtr(2), - IntDenominatorVal: helper.Int64ToPtr(2), - }, - GPUUtilizationAttr: { - Unit: GPUUtilizationUnit, - Desc: GPUUtilizationDesc, - IntNumeratorVal: helper.Int64ToPtr(2), - }, - MemoryUtilizationAttr: { - Unit: MemoryUtilizationUnit, - Desc: MemoryUtilizationDesc, - IntNumeratorVal: helper.Int64ToPtr(2), - }, - EncoderUtilizationAttr: { - Unit: EncoderUtilizationUnit, - Desc: EncoderUtilizationDesc, - IntNumeratorVal: helper.Int64ToPtr(2), - }, - DecoderUtilizationAttr: { - Unit: DecoderUtilizationUnit, - Desc: DecoderUtilizationDesc, - IntNumeratorVal: helper.Int64ToPtr(2), - }, - TemperatureAttr: { - Unit: TemperatureUnit, - Desc: TemperatureDesc, - IntNumeratorVal: helper.Int64ToPtr(2), - }, - MemoryStateAttr: { - Unit: MemoryStateUnit, - Desc: MemoryStateDesc, - IntNumeratorVal: helper.Int64ToPtr(2), - IntDenominatorVal: helper.Int64ToPtr(2), - }, - BAR1StateAttr: { - Unit: BAR1StateUnit, - Desc: BAR1StateDesc, - IntNumeratorVal: helper.Int64ToPtr(2), - IntDenominatorVal: helper.Int64ToPtr(256), - }, - ECCErrorsL1CacheAttr: { - Unit: ECCErrorsL1CacheUnit, - Desc: 
ECCErrorsL1CacheDesc, - IntNumeratorVal: helper.Int64ToPtr(200), - }, - ECCErrorsL2CacheAttr: { - Unit: ECCErrorsL2CacheUnit, - Desc: ECCErrorsL2CacheDesc, - IntNumeratorVal: helper.Int64ToPtr(200), - }, - ECCErrorsDeviceAttr: { - Unit: ECCErrorsDeviceUnit, - Desc: ECCErrorsDeviceDesc, - IntNumeratorVal: helper.Int64ToPtr(200), - }, - }, - }, - Timestamp: time.Date(1974, time.May, 19, 1, 2, 3, 4, time.UTC), - }, - }, - }, - }, - }, - }, - } { - channel := make(chan *device.StatsResponse, 1) - testCase.Device.writeStatsToChannel(channel, testCase.Timestamp) - actualResult := <-channel - // writeStatsToChannel iterates over map keys - // and insterts results to an array, so order of elements in output array - // may be different - // actualResult, expectedWriteToChannel arrays has to be sorted firsted - sort.Slice(actualResult.Groups, func(i, j int) bool { - return actualResult.Groups[i].Name < actualResult.Groups[j].Name - }) - sort.Slice(testCase.ExpectedWriteToChannel.Groups, func(i, j int) bool { - return testCase.ExpectedWriteToChannel.Groups[i].Name < testCase.ExpectedWriteToChannel.Groups[j].Name - }) - require.New(t).Equal(testCase.ExpectedWriteToChannel, actualResult) - } -} diff --git a/go.mod b/go.mod index db72e0ab3e9c..911b72a7663c 100644 --- a/go.mod +++ b/go.mod @@ -23,7 +23,6 @@ require ( github.com/LK4D4/joincontext v0.0.0-20171026170139-1724345da6d5 github.com/Microsoft/go-winio v0.4.15-0.20200113171025-3fe6c5262873 github.com/Microsoft/hcsshim v0.8.8-0.20200312192636-fd0797d766b1 // indirect - github.com/NVIDIA/gpu-monitoring-tools v0.0.0-20180829222009-86f2a9fac6c5 github.com/NYTimes/gziphandler v1.0.1 github.com/armon/circbuf v0.0.0-20150827004946-bbbad097214e github.com/armon/go-metrics v0.3.4 diff --git a/go.sum b/go.sum index d8f6cfda76a7..6dfe3b5a41c6 100644 --- a/go.sum +++ b/go.sum @@ -67,8 +67,6 @@ github.com/LK4D4/joincontext v0.0.0-20171026170139-1724345da6d5/go.mod h1:nxQPcN github.com/Microsoft/hcsshim v0.8.7/go.mod 
h1:OHd7sQqRFrYd3RmSgbgji+ctCwkbq2wbEYNSzOYtcBQ= github.com/Microsoft/hcsshim v0.8.8-0.20200312192636-fd0797d766b1 h1:2T9t72RkTRjAcuFc+4vaGWnRx/anVngE1/VGN/HFEVk= github.com/Microsoft/hcsshim v0.8.8-0.20200312192636-fd0797d766b1/go.mod h1:LVvUcNYEzt59fFVTuiPEgM6dgF70yMGdy/Qc/UmCbuU= -github.com/NVIDIA/gpu-monitoring-tools v0.0.0-20180829222009-86f2a9fac6c5 h1:WLyvLAM0QfjAarRzRTG9EgT5McqGWNZMvqqSUSoyUUY= -github.com/NVIDIA/gpu-monitoring-tools v0.0.0-20180829222009-86f2a9fac6c5/go.mod h1:nMOvShGpWaf0bXwXmeu4k+O4uziuaEI8pWzIj3BUrOA= github.com/NYTimes/gziphandler v1.0.0 h1:OswZCvpiFsNRCbeapdJxDuikAqVXTgV7XAht8S9olZo= github.com/NYTimes/gziphandler v1.0.0/go.mod h1:3wb06e3pkSAbeQ52E9H9iFoQsEEwGN64994WTCIhntQ= github.com/PuerkitoBio/purell v1.0.0/go.mod h1:c11w/QuzBsJSee3cPx9rAFu61PvFxuPbtSwDGJws/X0= diff --git a/helper/pluginutils/catalog/register_nvidia_linux.go b/helper/pluginutils/catalog/register_nvidia_linux.go deleted file mode 100644 index a50cbe833a75..000000000000 --- a/helper/pluginutils/catalog/register_nvidia_linux.go +++ /dev/null @@ -1,14 +0,0 @@ -// +build !nonvidia - -package catalog - -import ( - "github.com/hashicorp/nomad/devices/gpu/nvidia" -) - -// This file is where all builtin plugins should be registered in the catalog. -// Plugins with build restrictions should be placed in the appropriate -// register_XXX.go file. -func init() { - Register(nvidia.PluginID, nvidia.PluginConfig) -} diff --git a/vendor/github.com/NVIDIA/gpu-monitoring-tools/LICENSE b/vendor/github.com/NVIDIA/gpu-monitoring-tools/LICENSE deleted file mode 100644 index 2a718d63da7f..000000000000 --- a/vendor/github.com/NVIDIA/gpu-monitoring-tools/LICENSE +++ /dev/null @@ -1,29 +0,0 @@ -BSD 3-Clause License - -Copyright (c) 2018, NVIDIA Corporation -All rights reserved. 
- -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are met: - -* Redistributions of source code must retain the above copyright notice, this - list of conditions and the following disclaimer. - -* Redistributions in binary form must reproduce the above copyright notice, - this list of conditions and the following disclaimer in the documentation - and/or other materials provided with the distribution. - -* Neither the name of the copyright holder nor the names of its - contributors may be used to endorse or promote products derived from - this software without specific prior written permission. - -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE -FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. diff --git a/vendor/github.com/NVIDIA/gpu-monitoring-tools/bindings/go/nvml/bindings.go b/vendor/github.com/NVIDIA/gpu-monitoring-tools/bindings/go/nvml/bindings.go deleted file mode 100644 index 4bba898342f3..000000000000 --- a/vendor/github.com/NVIDIA/gpu-monitoring-tools/bindings/go/nvml/bindings.go +++ /dev/null @@ -1,634 +0,0 @@ -// Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved. 
- -package nvml - -// #cgo LDFLAGS: -ldl -Wl,--unresolved-symbols=ignore-in-object-files -// #include "nvml_dl.h" -import "C" - -import ( - "errors" - "fmt" - "io/ioutil" - "os" - "sort" - "strconv" - "strings" -) - -const ( - szDriver = C.NVML_SYSTEM_DRIVER_VERSION_BUFFER_SIZE - szName = C.NVML_DEVICE_NAME_BUFFER_SIZE - szUUID = C.NVML_DEVICE_UUID_BUFFER_SIZE - szProcs = 32 - szProcName = 64 - - XidCriticalError = C.nvmlEventTypeXidCriticalError -) - -type handle struct{ dev C.nvmlDevice_t } -type EventSet struct{ set C.nvmlEventSet_t } -type Event struct { - UUID *string - Etype uint64 - Edata uint64 -} - -func uintPtr(c C.uint) *uint { - i := uint(c) - return &i -} - -func uint64Ptr(c C.ulonglong) *uint64 { - i := uint64(c) - return &i -} - -func stringPtr(c *C.char) *string { - s := C.GoString(c) - return &s -} - -func errorString(ret C.nvmlReturn_t) error { - if ret == C.NVML_SUCCESS { - return nil - } - err := C.GoString(C.nvmlErrorString(ret)) - return fmt.Errorf("nvml: %v", err) -} - -func init_() error { - r := C.nvmlInit_dl() - if r == C.NVML_ERROR_LIBRARY_NOT_FOUND { - return errors.New("could not load NVML library") - } - return errorString(r) -} - -func NewEventSet() EventSet { - var set C.nvmlEventSet_t - C.nvmlEventSetCreate(&set) - - return EventSet{set} -} - -func RegisterEvent(es EventSet, event int) error { - n, err := deviceGetCount() - if err != nil { - return err - } - - var i uint - for i = 0; i < n; i++ { - h, err := deviceGetHandleByIndex(i) - if err != nil { - return err - } - - r := C.nvmlDeviceRegisterEvents(h.dev, C.ulonglong(event), es.set) - if r != C.NVML_SUCCESS { - return errorString(r) - } - } - - return nil -} - -func RegisterEventForDevice(es EventSet, event int, uuid string) error { - n, err := deviceGetCount() - if err != nil { - return err - } - - var i uint - for i = 0; i < n; i++ { - h, err := deviceGetHandleByIndex(i) - if err != nil { - return err - } - - duuid, err := h.deviceGetUUID() - if err != nil { - return err - } 
- - if *duuid != uuid { - continue - } - - r := C.nvmlDeviceRegisterEvents(h.dev, C.ulonglong(event), es.set) - if r != C.NVML_SUCCESS { - return errorString(r) - } - - return nil - } - - return fmt.Errorf("nvml: device not found") -} - -func DeleteEventSet(es EventSet) { - C.nvmlEventSetFree(es.set) -} - -func WaitForEvent(es EventSet, timeout uint) (Event, error) { - var data C.nvmlEventData_t - - r := C.nvmlEventSetWait(es.set, &data, C.uint(timeout)) - uuid, _ := handle{data.device}.deviceGetUUID() - - return Event{ - UUID: uuid, - Etype: uint64(data.eventType), - Edata: uint64(data.eventData), - }, - errorString(r) -} - -func shutdown() error { - return errorString(C.nvmlShutdown_dl()) -} - -func systemGetDriverVersion() (string, error) { - var driver [szDriver]C.char - - r := C.nvmlSystemGetDriverVersion(&driver[0], szDriver) - return C.GoString(&driver[0]), errorString(r) -} - -func systemGetProcessName(pid uint) (string, error) { - var proc [szProcName]C.char - - r := C.nvmlSystemGetProcessName(C.uint(pid), &proc[0], szProcName) - return C.GoString(&proc[0]), errorString(r) -} - -func deviceGetCount() (uint, error) { - var n C.uint - - r := C.nvmlDeviceGetCount(&n) - return uint(n), errorString(r) -} - -func deviceGetHandleByIndex(idx uint) (handle, error) { - var dev C.nvmlDevice_t - - r := C.nvmlDeviceGetHandleByIndex(C.uint(idx), &dev) - return handle{dev}, errorString(r) -} - -func deviceGetTopologyCommonAncestor(h1, h2 handle) (*uint, error) { - var level C.nvmlGpuTopologyLevel_t - - r := C.nvmlDeviceGetTopologyCommonAncestor_dl(h1.dev, h2.dev, &level) - if r == C.NVML_ERROR_FUNCTION_NOT_FOUND || r == C.NVML_ERROR_NOT_SUPPORTED { - return nil, nil - } - return uintPtr(C.uint(level)), errorString(r) -} - -func (h handle) deviceGetName() (*string, error) { - var name [szName]C.char - - r := C.nvmlDeviceGetName(h.dev, &name[0], szName) - if r == C.NVML_ERROR_NOT_SUPPORTED { - return nil, nil - } - return stringPtr(&name[0]), errorString(r) -} - -func (h 
handle) deviceGetUUID() (*string, error) { - var uuid [szUUID]C.char - - r := C.nvmlDeviceGetUUID(h.dev, &uuid[0], szUUID) - if r == C.NVML_ERROR_NOT_SUPPORTED { - return nil, nil - } - return stringPtr(&uuid[0]), errorString(r) -} - -func (h handle) deviceGetPciInfo() (*string, error) { - var pci C.nvmlPciInfo_t - - r := C.nvmlDeviceGetPciInfo(h.dev, &pci) - if r == C.NVML_ERROR_NOT_SUPPORTED { - return nil, nil - } - return stringPtr(&pci.busId[0]), errorString(r) -} - -func (h handle) deviceGetMinorNumber() (*uint, error) { - var minor C.uint - - r := C.nvmlDeviceGetMinorNumber(h.dev, &minor) - if r == C.NVML_ERROR_NOT_SUPPORTED { - return nil, nil - } - return uintPtr(minor), errorString(r) -} - -func (h handle) deviceGetBAR1MemoryInfo() (*uint64, *uint64, error) { - var bar1 C.nvmlBAR1Memory_t - - r := C.nvmlDeviceGetBAR1MemoryInfo(h.dev, &bar1) - if r == C.NVML_ERROR_NOT_SUPPORTED { - return nil, nil, nil - } - return uint64Ptr(bar1.bar1Total), uint64Ptr(bar1.bar1Used), errorString(r) -} - -func (h handle) deviceGetPowerManagementLimit() (*uint, error) { - var power C.uint - - r := C.nvmlDeviceGetPowerManagementLimit(h.dev, &power) - if r == C.NVML_ERROR_NOT_SUPPORTED { - return nil, nil - } - return uintPtr(power), errorString(r) -} - -func (h handle) deviceGetMaxClockInfo() (*uint, *uint, error) { - var sm, mem C.uint - - r := C.nvmlDeviceGetMaxClockInfo(h.dev, C.NVML_CLOCK_SM, &sm) - if r == C.NVML_ERROR_NOT_SUPPORTED { - return nil, nil, nil - } - if r == C.NVML_SUCCESS { - r = C.nvmlDeviceGetMaxClockInfo(h.dev, C.NVML_CLOCK_MEM, &mem) - } - return uintPtr(sm), uintPtr(mem), errorString(r) -} - -func (h handle) deviceGetMaxPcieLinkGeneration() (*uint, error) { - var link C.uint - - r := C.nvmlDeviceGetMaxPcieLinkGeneration(h.dev, &link) - if r == C.NVML_ERROR_NOT_SUPPORTED { - return nil, nil - } - return uintPtr(link), errorString(r) -} - -func (h handle) deviceGetMaxPcieLinkWidth() (*uint, error) { - var width C.uint - - r := 
C.nvmlDeviceGetMaxPcieLinkWidth(h.dev, &width) - if r == C.NVML_ERROR_NOT_SUPPORTED { - return nil, nil - } - return uintPtr(width), errorString(r) -} - -func (h handle) deviceGetPowerUsage() (*uint, error) { - var power C.uint - - r := C.nvmlDeviceGetPowerUsage(h.dev, &power) - if r == C.NVML_ERROR_NOT_SUPPORTED { - return nil, nil - } - return uintPtr(power), errorString(r) -} - -func (h handle) deviceGetTemperature() (*uint, error) { - var temp C.uint - - r := C.nvmlDeviceGetTemperature(h.dev, C.NVML_TEMPERATURE_GPU, &temp) - if r == C.NVML_ERROR_NOT_SUPPORTED { - return nil, nil - } - return uintPtr(temp), errorString(r) -} - -func (h handle) deviceGetUtilizationRates() (*uint, *uint, error) { - var usage C.nvmlUtilization_t - - r := C.nvmlDeviceGetUtilizationRates(h.dev, &usage) - if r == C.NVML_ERROR_NOT_SUPPORTED { - return nil, nil, nil - } - return uintPtr(usage.gpu), uintPtr(usage.memory), errorString(r) -} - -func (h handle) deviceGetEncoderUtilization() (*uint, error) { - var usage, sampling C.uint - - r := C.nvmlDeviceGetEncoderUtilization(h.dev, &usage, &sampling) - if r == C.NVML_ERROR_NOT_SUPPORTED { - return nil, nil - } - return uintPtr(usage), errorString(r) -} - -func (h handle) deviceGetDecoderUtilization() (*uint, error) { - var usage, sampling C.uint - - r := C.nvmlDeviceGetDecoderUtilization(h.dev, &usage, &sampling) - if r == C.NVML_ERROR_NOT_SUPPORTED { - return nil, nil - } - return uintPtr(usage), errorString(r) -} - -func (h handle) deviceGetMemoryInfo() (totalMem *uint64, devMem DeviceMemory, err error) { - var mem C.nvmlMemory_t - - r := C.nvmlDeviceGetMemoryInfo(h.dev, &mem) - if r == C.NVML_ERROR_NOT_SUPPORTED { - return - } - - err = errorString(r) - if r != C.NVML_SUCCESS { - return - } - - totalMem = uint64Ptr(mem.total) - if totalMem != nil { - *totalMem /= 1024 * 1024 // MiB - } - - devMem = DeviceMemory{ - Used: uint64Ptr(mem.used), - Free: uint64Ptr(mem.free), - } - - if devMem.Used != nil { - *devMem.Used /= 1024 * 1024 // 
MiB - } - - if devMem.Free != nil { - *devMem.Free /= 1024 * 1024 // MiB - } - return -} - -func (h handle) deviceGetClockInfo() (*uint, *uint, error) { - var sm, mem C.uint - - r := C.nvmlDeviceGetClockInfo(h.dev, C.NVML_CLOCK_SM, &sm) - if r == C.NVML_ERROR_NOT_SUPPORTED { - return nil, nil, nil - } - if r == C.NVML_SUCCESS { - r = C.nvmlDeviceGetClockInfo(h.dev, C.NVML_CLOCK_MEM, &mem) - } - return uintPtr(sm), uintPtr(mem), errorString(r) -} - -func (h handle) deviceGetMemoryErrorCounter() (*uint64, *uint64, *uint64, error) { - var l1, l2, mem C.ulonglong - - r := C.nvmlDeviceGetMemoryErrorCounter(h.dev, C.NVML_MEMORY_ERROR_TYPE_UNCORRECTED, - C.NVML_VOLATILE_ECC, C.NVML_MEMORY_LOCATION_L1_CACHE, &l1) - if r == C.NVML_ERROR_NOT_SUPPORTED { - return nil, nil, nil, nil - } - if r == C.NVML_SUCCESS { - r = C.nvmlDeviceGetMemoryErrorCounter(h.dev, C.NVML_MEMORY_ERROR_TYPE_UNCORRECTED, - C.NVML_VOLATILE_ECC, C.NVML_MEMORY_LOCATION_L2_CACHE, &l2) - } - if r == C.NVML_SUCCESS { - r = C.nvmlDeviceGetMemoryErrorCounter(h.dev, C.NVML_MEMORY_ERROR_TYPE_UNCORRECTED, - C.NVML_VOLATILE_ECC, C.NVML_MEMORY_LOCATION_DEVICE_MEMORY, &mem) - } - return uint64Ptr(l1), uint64Ptr(l2), uint64Ptr(mem), errorString(r) -} - -func (h handle) deviceGetPcieThroughput() (*uint, *uint, error) { - var rx, tx C.uint - - r := C.nvmlDeviceGetPcieThroughput(h.dev, C.NVML_PCIE_UTIL_RX_BYTES, &rx) - if r == C.NVML_ERROR_NOT_SUPPORTED { - return nil, nil, nil - } - if r == C.NVML_SUCCESS { - r = C.nvmlDeviceGetPcieThroughput(h.dev, C.NVML_PCIE_UTIL_TX_BYTES, &tx) - } - return uintPtr(rx), uintPtr(tx), errorString(r) -} - -func (h handle) deviceGetComputeRunningProcesses() ([]uint, []uint64, error) { - var procs [szProcs]C.nvmlProcessInfo_t - var count = C.uint(szProcs) - - r := C.nvmlDeviceGetComputeRunningProcesses(h.dev, &count, &procs[0]) - if r == C.NVML_ERROR_NOT_SUPPORTED { - return nil, nil, nil - } - n := int(count) - pids := make([]uint, n) - mems := make([]uint64, n) - for i := 0; i < n; 
i++ { - pids[i] = uint(procs[i].pid) - mems[i] = uint64(procs[i].usedGpuMemory) - } - return pids, mems, errorString(r) -} - -func (h handle) deviceGetGraphicsRunningProcesses() ([]uint, []uint64, error) { - var procs [szProcs]C.nvmlProcessInfo_t - var count = C.uint(szProcs) - - r := C.nvmlDeviceGetGraphicsRunningProcesses(h.dev, &count, &procs[0]) - if r == C.NVML_ERROR_NOT_SUPPORTED { - return nil, nil, nil - } - n := int(count) - pids := make([]uint, n) - mems := make([]uint64, n) - for i := 0; i < n; i++ { - pids[i] = uint(procs[i].pid) - mems[i] = uint64(procs[i].usedGpuMemory) - } - return pids, mems, errorString(r) -} - -func (h handle) deviceGetAllRunningProcesses() ([]ProcessInfo, error) { - cPids, cpMems, err := h.deviceGetComputeRunningProcesses() - if err != nil { - return nil, err - } - - gPids, gpMems, err := h.deviceGetGraphicsRunningProcesses() - if err != nil { - return nil, err - } - - allPids := make(map[uint]ProcessInfo) - - for i, pid := range cPids { - name, err := processName(pid) - if err != nil { - return nil, err - } - allPids[pid] = ProcessInfo{ - PID: pid, - Name: name, - MemoryUsed: cpMems[i] / (1024 * 1024), // MiB - Type: Compute, - } - - } - - for i, pid := range gPids { - pInfo, exists := allPids[pid] - if exists { - pInfo.Type = ComputeAndGraphics - allPids[pid] = pInfo - } else { - name, err := processName(pid) - if err != nil { - return nil, err - } - allPids[pid] = ProcessInfo{ - PID: pid, - Name: name, - MemoryUsed: gpMems[i] / (1024 * 1024), // MiB - Type: Graphics, - } - } - } - - var processInfo []ProcessInfo - for _, v := range allPids { - processInfo = append(processInfo, v) - } - sort.Slice(processInfo, func(i, j int) bool { - return processInfo[i].PID < processInfo[j].PID - }) - - return processInfo, nil -} - -func (h handle) getClocksThrottleReasons() (reason ThrottleReason, err error) { - var clocksThrottleReasons C.ulonglong - - r := C.nvmlDeviceGetCurrentClocksThrottleReasons(h.dev, &clocksThrottleReasons) - - if r 
== C.NVML_ERROR_NOT_SUPPORTED { - return ThrottleReasonUnknown, nil - } - - if r != C.NVML_SUCCESS { - return ThrottleReasonUnknown, errorString(r) - } - - switch clocksThrottleReasons { - case C.nvmlClocksThrottleReasonGpuIdle: - reason = ThrottleReasonGpuIdle - case C.nvmlClocksThrottleReasonApplicationsClocksSetting: - reason = ThrottleReasonApplicationsClocksSetting - case C.nvmlClocksThrottleReasonSwPowerCap: - reason = ThrottleReasonSwPowerCap - case C.nvmlClocksThrottleReasonHwSlowdown: - reason = ThrottleReasonHwSlowdown - case C.nvmlClocksThrottleReasonSyncBoost: - reason = ThrottleReasonSyncBoost - case C.nvmlClocksThrottleReasonSwThermalSlowdown: - reason = ThrottleReasonSwThermalSlowdown - case C.nvmlClocksThrottleReasonHwThermalSlowdown: - reason = ThrottleReasonHwThermalSlowdown - case C.nvmlClocksThrottleReasonHwPowerBrakeSlowdown: - reason = ThrottleReasonHwPowerBrakeSlowdown - case C.nvmlClocksThrottleReasonDisplayClockSetting: - reason = ThrottleReasonDisplayClockSetting - case C.nvmlClocksThrottleReasonNone: - reason = ThrottleReasonNone - } - return -} - -func (h handle) getPerformanceState() (PerfState, error) { - var pstate C.nvmlPstates_t - - r := C.nvmlDeviceGetPerformanceState(h.dev, &pstate) - - if r == C.NVML_ERROR_NOT_SUPPORTED { - return PerfStateUnknown, nil - } - - if r != C.NVML_SUCCESS { - return PerfStateUnknown, errorString(r) - } - return PerfState(pstate), nil -} - -func processName(pid uint) (string, error) { - f := `/proc/` + strconv.FormatUint(uint64(pid), 10) + `/comm` - d, err := ioutil.ReadFile(f) - - if err != nil { - // TOCTOU: process terminated - if os.IsNotExist(err) { - return "", nil - } - return "", err - } - return strings.TrimSuffix(string(d), "\n"), err -} - -func (h handle) getAccountingInfo() (accountingInfo Accounting, err error) { - var mode C.nvmlEnableState_t - var buffer C.uint - - r := C.nvmlDeviceGetAccountingMode(h.dev, &mode) - if r == C.NVML_ERROR_NOT_SUPPORTED { - return - } - - if r != 
C.NVML_SUCCESS { - return accountingInfo, errorString(r) - } - - r = C.nvmlDeviceGetAccountingBufferSize(h.dev, &buffer) - if r == C.NVML_ERROR_NOT_SUPPORTED { - return - } - - if r != C.NVML_SUCCESS { - return accountingInfo, errorString(r) - } - - accountingInfo = Accounting{ - Mode: ModeState(mode), - BufferSize: uintPtr(buffer), - } - return -} - -func (h handle) getDisplayInfo() (display Display, err error) { - var mode, isActive C.nvmlEnableState_t - - r := C.nvmlDeviceGetDisplayActive(h.dev, &mode) - if r == C.NVML_ERROR_NOT_SUPPORTED { - return - } - - if r != C.NVML_SUCCESS { - return display, errorString(r) - } - - r = C.nvmlDeviceGetDisplayMode(h.dev, &isActive) - if r == C.NVML_ERROR_NOT_SUPPORTED { - return - } - if r != C.NVML_SUCCESS { - return display, errorString(r) - } - display = Display{ - Mode: ModeState(mode), - Active: ModeState(isActive), - } - return -} - -func (h handle) getPeristenceMode() (state ModeState, err error) { - var mode C.nvmlEnableState_t - - r := C.nvmlDeviceGetPersistenceMode(h.dev, &mode) - if r == C.NVML_ERROR_NOT_SUPPORTED { - return - } - return ModeState(mode), errorString(r) -} diff --git a/vendor/github.com/NVIDIA/gpu-monitoring-tools/bindings/go/nvml/nvml.go b/vendor/github.com/NVIDIA/gpu-monitoring-tools/bindings/go/nvml/nvml.go deleted file mode 100644 index f6ec9e8fae39..000000000000 --- a/vendor/github.com/NVIDIA/gpu-monitoring-tools/bindings/go/nvml/nvml.go +++ /dev/null @@ -1,533 +0,0 @@ -// Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved. 
- -package nvml - -// #include "nvml_dl.h" -import "C" - -import ( - "bytes" - "errors" - "fmt" - "io/ioutil" - "strconv" - "strings" -) - -var ( - ErrCPUAffinity = errors.New("failed to retrieve CPU affinity") - ErrUnsupportedP2PLink = errors.New("unsupported P2P link type") - ErrUnsupportedGPU = errors.New("unsupported GPU device") -) - -type ModeState uint - -const ( - Enabled ModeState = iota - Disabled -) - -func (m ModeState) String() string { - switch m { - case Enabled: - return "Enabled" - case Disabled: - return "Disabled" - } - return "N/A" -} - -type Display struct { - Mode ModeState - Active ModeState -} - -type Accounting struct { - Mode ModeState - BufferSize *uint -} - -type DeviceMode struct { - DisplayInfo Display - Persistence ModeState - AccountingInfo Accounting -} - -type ThrottleReason uint - -const ( - ThrottleReasonGpuIdle ThrottleReason = iota - ThrottleReasonApplicationsClocksSetting - ThrottleReasonSwPowerCap - ThrottleReasonHwSlowdown - ThrottleReasonSyncBoost - ThrottleReasonSwThermalSlowdown - ThrottleReasonHwThermalSlowdown - ThrottleReasonHwPowerBrakeSlowdown - ThrottleReasonDisplayClockSetting - ThrottleReasonNone - ThrottleReasonUnknown -) - -func (r ThrottleReason) String() string { - switch r { - case ThrottleReasonGpuIdle: - return "Gpu Idle" - case ThrottleReasonApplicationsClocksSetting: - return "Applications Clocks Setting" - case ThrottleReasonSwPowerCap: - return "SW Power Cap" - case ThrottleReasonHwSlowdown: - return "HW Slowdown" - case ThrottleReasonSyncBoost: - return "Sync Boost" - case ThrottleReasonSwThermalSlowdown: - return "SW Thermal Slowdown" - case ThrottleReasonHwThermalSlowdown: - return "HW Thermal Slowdown" - case ThrottleReasonHwPowerBrakeSlowdown: - return "HW Power Brake Slowdown" - case ThrottleReasonDisplayClockSetting: - return "Display Clock Setting" - case ThrottleReasonNone: - return "No clocks throttling" - } - return "N/A" -} - -type PerfState uint - -const ( - PerfStateMax = 0 - PerfStateMin 
= 15 - PerfStateUnknown = 32 -) - -func (p PerfState) String() string { - if p >= PerfStateMax && p <= PerfStateMin { - return fmt.Sprintf("P%d", p) - } - return "Unknown" -} - -type ProcessType uint - -const ( - Compute ProcessType = iota - Graphics - ComputeAndGraphics -) - -func (t ProcessType) String() string { - typ := "C+G" - if t == Compute { - typ = "C" - } else if t == Graphics { - typ = "G" - } - return typ -} - -type P2PLinkType uint - -const ( - P2PLinkUnknown P2PLinkType = iota - P2PLinkCrossCPU - P2PLinkSameCPU - P2PLinkHostBridge - P2PLinkMultiSwitch - P2PLinkSingleSwitch - P2PLinkSameBoard -) - -type P2PLink struct { - BusID string - Link P2PLinkType -} - -func (t P2PLinkType) String() string { - switch t { - case P2PLinkCrossCPU: - return "Cross CPU socket" - case P2PLinkSameCPU: - return "Same CPU socket" - case P2PLinkHostBridge: - return "Host PCI bridge" - case P2PLinkMultiSwitch: - return "Multiple PCI switches" - case P2PLinkSingleSwitch: - return "Single PCI switch" - case P2PLinkSameBoard: - return "Same board" - case P2PLinkUnknown: - } - return "N/A" -} - -type ClockInfo struct { - Cores *uint - Memory *uint -} - -type PCIInfo struct { - BusID string - BAR1 *uint64 - Bandwidth *uint -} - -type Device struct { - handle - - UUID string - Path string - Model *string - Power *uint - Memory *uint64 - CPUAffinity *uint - PCI PCIInfo - Clocks ClockInfo - Topology []P2PLink -} - -type UtilizationInfo struct { - GPU *uint - Memory *uint - Encoder *uint - Decoder *uint -} - -type PCIThroughputInfo struct { - RX *uint - TX *uint -} - -type PCIStatusInfo struct { - BAR1Used *uint64 - Throughput PCIThroughputInfo -} - -type ECCErrorsInfo struct { - L1Cache *uint64 - L2Cache *uint64 - Device *uint64 -} - -type DeviceMemory struct { - Used *uint64 - Free *uint64 -} - -type MemoryInfo struct { - Global DeviceMemory - ECCErrors ECCErrorsInfo -} - -type ProcessInfo struct { - PID uint - Name string - MemoryUsed uint64 - Type ProcessType -} - -type 
DeviceStatus struct { - Power *uint - Temperature *uint - Utilization UtilizationInfo - Memory MemoryInfo - Clocks ClockInfo - PCI PCIStatusInfo - Processes []ProcessInfo - Throttle ThrottleReason - Performance PerfState -} - -func assert(err error) { - if err != nil { - panic(err) - } -} - -func Init() error { - return init_() -} - -func Shutdown() error { - return shutdown() -} - -func GetDeviceCount() (uint, error) { - return deviceGetCount() -} - -func GetDriverVersion() (string, error) { - return systemGetDriverVersion() -} - -func numaNode(busid string) (uint, error) { - // discard leading zeros of busid - b, err := ioutil.ReadFile(fmt.Sprintf("/sys/bus/pci/devices/%s/numa_node", strings.ToLower(busid[4:]))) - if err != nil { - // XXX report node 0 if NUMA support isn't enabled - return 0, nil - } - node, err := strconv.ParseInt(string(bytes.TrimSpace(b)), 10, 8) - if err != nil { - return 0, fmt.Errorf("%v: %v", ErrCPUAffinity, err) - } - if node < 0 { - node = 0 // XXX report node 0 instead of NUMA_NO_NODE - } - return uint(node), nil -} - -func pciBandwidth(gen, width *uint) *uint { - m := map[uint]uint{ - 1: 250, // MB/s - 2: 500, - 3: 985, - 4: 1969, - } - if gen == nil || width == nil { - return nil - } - bw := m[*gen] * *width - return &bw -} - -func NewDevice(idx uint) (device *Device, err error) { - defer func() { - if r := recover(); r != nil { - err = r.(error) - } - }() - - h, err := deviceGetHandleByIndex(idx) - assert(err) - model, err := h.deviceGetName() - assert(err) - uuid, err := h.deviceGetUUID() - assert(err) - minor, err := h.deviceGetMinorNumber() - assert(err) - power, err := h.deviceGetPowerManagementLimit() - assert(err) - totalMem, _, err := h.deviceGetMemoryInfo() - assert(err) - busid, err := h.deviceGetPciInfo() - assert(err) - bar1, _, err := h.deviceGetBAR1MemoryInfo() - assert(err) - pcig, err := h.deviceGetMaxPcieLinkGeneration() - assert(err) - pciw, err := h.deviceGetMaxPcieLinkWidth() - assert(err) - ccore, cmem, err := 
h.deviceGetMaxClockInfo() - assert(err) - - if minor == nil || busid == nil || uuid == nil { - return nil, ErrUnsupportedGPU - } - path := fmt.Sprintf("/dev/nvidia%d", *minor) - node, err := numaNode(*busid) - assert(err) - - device = &Device{ - handle: h, - UUID: *uuid, - Path: path, - Model: model, - Power: power, - Memory: totalMem, - CPUAffinity: &node, - PCI: PCIInfo{ - BusID: *busid, - BAR1: bar1, - Bandwidth: pciBandwidth(pcig, pciw), // MB/s - }, - Clocks: ClockInfo{ - Cores: ccore, // MHz - Memory: cmem, // MHz - }, - } - if power != nil { - *device.Power /= 1000 // W - } - if bar1 != nil { - *device.PCI.BAR1 /= 1024 * 1024 // MiB - } - return -} - -func NewDeviceLite(idx uint) (device *Device, err error) { - defer func() { - if r := recover(); r != nil { - err = r.(error) - } - }() - - h, err := deviceGetHandleByIndex(idx) - assert(err) - uuid, err := h.deviceGetUUID() - assert(err) - minor, err := h.deviceGetMinorNumber() - assert(err) - busid, err := h.deviceGetPciInfo() - assert(err) - - if minor == nil || busid == nil || uuid == nil { - return nil, ErrUnsupportedGPU - } - path := fmt.Sprintf("/dev/nvidia%d", *minor) - - device = &Device{ - handle: h, - UUID: *uuid, - Path: path, - PCI: PCIInfo{ - BusID: *busid, - }, - } - return -} - -func (d *Device) Status() (status *DeviceStatus, err error) { - defer func() { - if r := recover(); r != nil { - err = r.(error) - } - }() - - power, err := d.deviceGetPowerUsage() - assert(err) - temp, err := d.deviceGetTemperature() - assert(err) - ugpu, umem, err := d.deviceGetUtilizationRates() - assert(err) - uenc, err := d.deviceGetEncoderUtilization() - assert(err) - udec, err := d.deviceGetDecoderUtilization() - assert(err) - _, devMem, err := d.deviceGetMemoryInfo() - assert(err) - ccore, cmem, err := d.deviceGetClockInfo() - assert(err) - _, bar1, err := d.deviceGetBAR1MemoryInfo() - assert(err) - el1, el2, emem, err := d.deviceGetMemoryErrorCounter() - assert(err) - pcirx, pcitx, err := 
d.deviceGetPcieThroughput() - assert(err) - throttle, err := d.getClocksThrottleReasons() - assert(err) - perfState, err := d.getPerformanceState() - assert(err) - processInfo, err := d.deviceGetAllRunningProcesses() - assert(err) - - status = &DeviceStatus{ - Power: power, - Temperature: temp, // °C - Utilization: UtilizationInfo{ - GPU: ugpu, // % - Memory: umem, // % - Encoder: uenc, // % - Decoder: udec, // % - }, - Memory: MemoryInfo{ - Global: devMem, - ECCErrors: ECCErrorsInfo{ - L1Cache: el1, - L2Cache: el2, - Device: emem, - }, - }, - Clocks: ClockInfo{ - Cores: ccore, // MHz - Memory: cmem, // MHz - }, - PCI: PCIStatusInfo{ - BAR1Used: bar1, - Throughput: PCIThroughputInfo{ - RX: pcirx, - TX: pcitx, - }, - }, - Throttle: throttle, - Performance: perfState, - Processes: processInfo, - } - if power != nil { - *status.Power /= 1000 // W - } - if bar1 != nil { - *status.PCI.BAR1Used /= 1024 * 1024 // MiB - } - if pcirx != nil { - *status.PCI.Throughput.RX /= 1000 // MB/s - } - if pcitx != nil { - *status.PCI.Throughput.TX /= 1000 // MB/s - } - return -} - -func GetP2PLink(dev1, dev2 *Device) (link P2PLinkType, err error) { - level, err := deviceGetTopologyCommonAncestor(dev1.handle, dev2.handle) - if err != nil || level == nil { - return P2PLinkUnknown, err - } - - switch *level { - case C.NVML_TOPOLOGY_INTERNAL: - link = P2PLinkSameBoard - case C.NVML_TOPOLOGY_SINGLE: - link = P2PLinkSingleSwitch - case C.NVML_TOPOLOGY_MULTIPLE: - link = P2PLinkMultiSwitch - case C.NVML_TOPOLOGY_HOSTBRIDGE: - link = P2PLinkHostBridge - case C.NVML_TOPOLOGY_CPU: - link = P2PLinkSameCPU - case C.NVML_TOPOLOGY_SYSTEM: - link = P2PLinkCrossCPU - default: - err = ErrUnsupportedP2PLink - } - return -} - -func (d *Device) GetComputeRunningProcesses() ([]uint, []uint64, error) { - return d.handle.deviceGetComputeRunningProcesses() -} - -func (d *Device) GetGraphicsRunningProcesses() ([]uint, []uint64, error) { - return d.handle.deviceGetGraphicsRunningProcesses() -} - -func (d 
*Device) GetAllRunningProcesses() ([]ProcessInfo, error) { - return d.handle.deviceGetAllRunningProcesses() -} - -func (d *Device) GetDeviceMode() (mode *DeviceMode, err error) { - defer func() { - if r := recover(); r != nil { - err = r.(error) - } - }() - - display, err := d.getDisplayInfo() - assert(err) - - p, err := d.getPeristenceMode() - assert(err) - - accounting, err := d.getAccountingInfo() - assert(err) - - mode = &DeviceMode{ - DisplayInfo: display, - Persistence: p, - AccountingInfo: accounting, - } - return -} diff --git a/vendor/github.com/NVIDIA/gpu-monitoring-tools/bindings/go/nvml/nvml.h b/vendor/github.com/NVIDIA/gpu-monitoring-tools/bindings/go/nvml/nvml.h deleted file mode 100644 index 60185dac239d..000000000000 --- a/vendor/github.com/NVIDIA/gpu-monitoring-tools/bindings/go/nvml/nvml.h +++ /dev/null @@ -1,5871 +0,0 @@ -/* - * Copyright 1993-2017 NVIDIA Corporation. All rights reserved. - * - * NOTICE TO USER: - * - * This source code is subject to NVIDIA ownership rights under U.S. and - * international Copyright laws. Users and possessors of this source code - * are hereby granted a nonexclusive, royalty-free license to use this code - * in individual and commercial software. - * - * NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE - * CODE FOR ANY PURPOSE. IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR - * IMPLIED WARRANTY OF ANY KIND. NVIDIA DISCLAIMS ALL WARRANTIES WITH - * REGARD TO THIS SOURCE CODE, INCLUDING ALL IMPLIED WARRANTIES OF - * MERCHANTABILITY, NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE. - * IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY SPECIAL, INDIRECT, INCIDENTAL, - * OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS - * OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE - * OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE - * OR PERFORMANCE OF THIS SOURCE CODE. - * - * U.S. Government End Users. 
This source code is a "commercial item" as - * that term is defined at 48 C.F.R. 2.101 (OCT 1995), consisting of - * "commercial computer software" and "commercial computer software - * documentation" as such terms are used in 48 C.F.R. 12.212 (SEPT 1995) - * and is provided to the U.S. Government only as a commercial end item. - * Consistent with 48 C.F.R.12.212 and 48 C.F.R. 227.7202-1 through - * 227.7202-4 (JUNE 1995), all U.S. Government End Users acquire the - * source code with only those rights set forth herein. - * - * Any use of this source code in individual and commercial software must - * include, in the user documentation and internal comments to the code, - * the above Disclaimer and U.S. Government End Users Notice. - */ - -/* -NVML API Reference - -The NVIDIA Management Library (NVML) is a C-based programmatic interface for monitoring and -managing various states within NVIDIA Tesla &tm; GPUs. It is intended to be a platform for building -3rd party applications, and is also the underlying library for the NVIDIA-supported nvidia-smi -tool. NVML is thread-safe so it is safe to make simultaneous NVML calls from multiple threads. - -API Documentation - -Supported platforms: -- Windows: Windows Server 2008 R2 64bit, Windows Server 2012 R2 64bit, Windows 7 64bit, Windows 8 64bit, Windows 10 64bit -- Linux: 32-bit and 64-bit -- Hypervisors: Windows Server 2008R2/2012 Hyper-V 64bit, Citrix XenServer 6.2 SP1+, VMware ESX 5.1/5.5 - -Supported products: -- Full Support - - All Tesla products, starting with the Fermi architecture - - All Quadro products, starting with the Fermi architecture - - All GRID products, starting with the Kepler architecture - - Selected GeForce Titan products -- Limited Support - - All Geforce products, starting with the Fermi architecture - -The NVML library can be found at \%ProgramW6432\%\\"NVIDIA Corporation"\\NVSMI\\ on Windows. It is -not be added to the system path by default. 
To dynamically link to NVML, add this path to the PATH -environmental variable. To dynamically load NVML, call LoadLibrary with this path. - -On Linux the NVML library will be found on the standard library path. For 64 bit Linux, both the 32 bit -and 64 bit NVML libraries will be installed. - -Online documentation for this library is available at http://docs.nvidia.com/deploy/nvml-api/index.html -*/ - -#ifndef __nvml_nvml_h__ -#define __nvml_nvml_h__ - -#ifdef __cplusplus -extern "C" { -#endif - -/* - * On Windows, set up methods for DLL export - * define NVML_STATIC_IMPORT when using nvml_loader library - */ -#if defined _WINDOWS - #if !defined NVML_STATIC_IMPORT - #if defined NVML_LIB_EXPORT - #define DECLDIR __declspec(dllexport) - #else - #define DECLDIR __declspec(dllimport) - #endif - #else - #define DECLDIR - #endif -#else - #define DECLDIR -#endif - -/** - * NVML API versioning support - */ -#define NVML_API_VERSION 9 -#define NVML_API_VERSION_STR "9" -#define nvmlInit nvmlInit_v2 -#define nvmlDeviceGetPciInfo nvmlDeviceGetPciInfo_v3 -#define nvmlDeviceGetCount nvmlDeviceGetCount_v2 -#define nvmlDeviceGetHandleByIndex nvmlDeviceGetHandleByIndex_v2 -#define nvmlDeviceGetHandleByPciBusId nvmlDeviceGetHandleByPciBusId_v2 -#define nvmlDeviceGetNvLinkRemotePciInfo nvmlDeviceGetNvLinkRemotePciInfo_v2 -#define nvmlDeviceRemoveGpu nvmlDeviceRemoveGpu_v2 - -/***************************************************************************************************/ -/** @defgroup nvmlDeviceStructs Device Structs - * @{ - */ -/***************************************************************************************************/ - -/** - * Special constant that some fields take when they are not available. - * Used when only part of the struct is not available. - * - * Each structure explicitly states when to check for this value. 
- */ -#define NVML_VALUE_NOT_AVAILABLE (-1) - -typedef struct nvmlDevice_st* nvmlDevice_t; - -/** - * Buffer size guaranteed to be large enough for pci bus id - */ -#define NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE 32 - -/** - * Buffer size guaranteed to be large enough for pci bus id for ::busIdLegacy - */ -#define NVML_DEVICE_PCI_BUS_ID_BUFFER_V2_SIZE 16 - -/** - * PCI information about a GPU device. - */ -typedef struct nvmlPciInfo_st -{ - char busIdLegacy[NVML_DEVICE_PCI_BUS_ID_BUFFER_V2_SIZE]; //!< The legacy tuple domain:bus:device.function PCI identifier (& NULL terminator) - unsigned int domain; //!< The PCI domain on which the device's bus resides, 0 to 0xffffffff - unsigned int bus; //!< The bus on which the device resides, 0 to 0xff - unsigned int device; //!< The device's id on the bus, 0 to 31 - unsigned int pciDeviceId; //!< The combined 16-bit device id and 16-bit vendor id - - // Added in NVML 2.285 API - unsigned int pciSubSystemId; //!< The 32-bit Sub System Device ID - - char busId[NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE]; //!< The tuple domain:bus:device.function PCI identifier (& NULL terminator) -} nvmlPciInfo_t; - -/** - * Detailed ECC error counts for a device. - * - * @deprecated Different GPU families can have different memory error counters - * See \ref nvmlDeviceGetMemoryErrorCounter - */ -typedef struct nvmlEccErrorCounts_st -{ - unsigned long long l1Cache; //!< L1 cache errors - unsigned long long l2Cache; //!< L2 cache errors - unsigned long long deviceMemory; //!< Device memory errors - unsigned long long registerFile; //!< Register file errors -} nvmlEccErrorCounts_t; - -/** - * Utilization information for a device. - * Each sample period may be between 1 second and 1/6 second, depending on the product being queried. 
- */ -typedef struct nvmlUtilization_st -{ - unsigned int gpu; //!< Percent of time over the past sample period during which one or more kernels was executing on the GPU - unsigned int memory; //!< Percent of time over the past sample period during which global (device) memory was being read or written -} nvmlUtilization_t; - -/** - * Memory allocation information for a device. - */ -typedef struct nvmlMemory_st -{ - unsigned long long total; //!< Total installed FB memory (in bytes) - unsigned long long free; //!< Unallocated FB memory (in bytes) - unsigned long long used; //!< Allocated FB memory (in bytes). Note that the driver/GPU always sets aside a small amount of memory for bookkeeping -} nvmlMemory_t; - -/** - * BAR1 Memory allocation Information for a device - */ -typedef struct nvmlBAR1Memory_st -{ - unsigned long long bar1Total; //!< Total BAR1 Memory (in bytes) - unsigned long long bar1Free; //!< Unallocated BAR1 Memory (in bytes) - unsigned long long bar1Used; //!< Allocated Used Memory (in bytes) -}nvmlBAR1Memory_t; - -/** - * Information about running compute processes on the GPU - */ -typedef struct nvmlProcessInfo_st -{ - unsigned int pid; //!< Process ID - unsigned long long usedGpuMemory; //!< Amount of used GPU memory in bytes. - //! Under WDDM, \ref NVML_VALUE_NOT_AVAILABLE is always reported - //! 
because Windows KMD manages all the memory and not the NVIDIA driver -} nvmlProcessInfo_t; - -/** - * Enum to represent type of bridge chip - */ -typedef enum nvmlBridgeChipType_enum -{ - NVML_BRIDGE_CHIP_PLX = 0, - NVML_BRIDGE_CHIP_BRO4 = 1 -}nvmlBridgeChipType_t; - -/** - * Maximum number of NvLink links supported - */ -#define NVML_NVLINK_MAX_LINKS 6 - -/** - * Enum to represent the NvLink utilization counter packet units - */ -typedef enum nvmlNvLinkUtilizationCountUnits_enum -{ - NVML_NVLINK_COUNTER_UNIT_CYCLES = 0, // count by cycles - NVML_NVLINK_COUNTER_UNIT_PACKETS = 1, // count by packets - NVML_NVLINK_COUNTER_UNIT_BYTES = 2, // count by bytes - - // this must be last - NVML_NVLINK_COUNTER_UNIT_COUNT -} nvmlNvLinkUtilizationCountUnits_t; - -/** - * Enum to represent the NvLink utilization counter packet types to count - * ** this is ONLY applicable with the units as packets or bytes - * ** as specified in \a nvmlNvLinkUtilizationCountUnits_t - * ** all packet filter descriptions are target GPU centric - * ** these can be "OR'd" together - */ -typedef enum nvmlNvLinkUtilizationCountPktTypes_enum -{ - NVML_NVLINK_COUNTER_PKTFILTER_NOP = 0x1, // no operation packets - NVML_NVLINK_COUNTER_PKTFILTER_READ = 0x2, // read packets - NVML_NVLINK_COUNTER_PKTFILTER_WRITE = 0x4, // write packets - NVML_NVLINK_COUNTER_PKTFILTER_RATOM = 0x8, // reduction atomic requests - NVML_NVLINK_COUNTER_PKTFILTER_NRATOM = 0x10, // non-reduction atomic requests - NVML_NVLINK_COUNTER_PKTFILTER_FLUSH = 0x20, // flush requests - NVML_NVLINK_COUNTER_PKTFILTER_RESPDATA = 0x40, // responses with data - NVML_NVLINK_COUNTER_PKTFILTER_RESPNODATA = 0x80, // responses without data - NVML_NVLINK_COUNTER_PKTFILTER_ALL = 0xFF // all packets -} nvmlNvLinkUtilizationCountPktTypes_t; - -/** - * Struct to define the NVLINK counter controls - */ -typedef struct nvmlNvLinkUtilizationControl_st -{ - nvmlNvLinkUtilizationCountUnits_t units; - nvmlNvLinkUtilizationCountPktTypes_t pktfilter; -} 
nvmlNvLinkUtilizationControl_t; - -/** - * Enum to represent NvLink queryable capabilities - */ -typedef enum nvmlNvLinkCapability_enum -{ - NVML_NVLINK_CAP_P2P_SUPPORTED = 0, // P2P over NVLink is supported - NVML_NVLINK_CAP_SYSMEM_ACCESS = 1, // Access to system memory is supported - NVML_NVLINK_CAP_P2P_ATOMICS = 2, // P2P atomics are supported - NVML_NVLINK_CAP_SYSMEM_ATOMICS= 3, // System memory atomics are supported - NVML_NVLINK_CAP_SLI_BRIDGE = 4, // SLI is supported over this link - NVML_NVLINK_CAP_VALID = 5, // Link is supported on this device - // should be last - NVML_NVLINK_CAP_COUNT -} nvmlNvLinkCapability_t; - -/** - * Enum to represent NvLink queryable error counters - */ -typedef enum nvmlNvLinkErrorCounter_enum -{ - NVML_NVLINK_ERROR_DL_REPLAY = 0, // Data link transmit replay error counter - NVML_NVLINK_ERROR_DL_RECOVERY = 1, // Data link transmit recovery error counter - NVML_NVLINK_ERROR_DL_CRC_FLIT = 2, // Data link receive flow control digit CRC error counter - NVML_NVLINK_ERROR_DL_CRC_DATA = 3, // Data link receive data CRC error counter - - // this must be last - NVML_NVLINK_ERROR_COUNT -} nvmlNvLinkErrorCounter_t; - -/** - * Represents level relationships within a system between two GPUs - * The enums are spaced to allow for future relationships - */ -typedef enum nvmlGpuLevel_enum -{ - NVML_TOPOLOGY_INTERNAL = 0, // e.g. 
Tesla K80 - NVML_TOPOLOGY_SINGLE = 10, // all devices that only need traverse a single PCIe switch - NVML_TOPOLOGY_MULTIPLE = 20, // all devices that need not traverse a host bridge - NVML_TOPOLOGY_HOSTBRIDGE = 30, // all devices that are connected to the same host bridge - NVML_TOPOLOGY_NODE = 40, // all devices that are connected to the same NUMA node but possibly multiple host bridges - NVML_TOPOLOGY_SYSTEM = 50, // all devices in the system - - // there is purposefully no COUNT here because of the need for spacing above -} nvmlGpuTopologyLevel_t; - -/* Compatibility for CPU->NODE renaming */ -#define NVML_TOPOLOGY_CPU NVML_TOPOLOGY_NODE - -/* P2P Capability Index Status*/ -typedef enum nvmlGpuP2PStatus_enum -{ - NVML_P2P_STATUS_OK = 0, - NVML_P2P_STATUS_CHIPSET_NOT_SUPPORED, - NVML_P2P_STATUS_GPU_NOT_SUPPORTED, - NVML_P2P_STATUS_IOH_TOPOLOGY_NOT_SUPPORTED, - NVML_P2P_STATUS_DISABLED_BY_REGKEY, - NVML_P2P_STATUS_NOT_SUPPORTED, - NVML_P2P_STATUS_UNKNOWN - -} nvmlGpuP2PStatus_t; - -/* P2P Capability Index*/ -typedef enum nvmlGpuP2PCapsIndex_enum -{ - NVML_P2P_CAPS_INDEX_READ = 0, - NVML_P2P_CAPS_INDEX_WRITE, - NVML_P2P_CAPS_INDEX_NVLINK, - NVML_P2P_CAPS_INDEX_ATOMICS, - NVML_P2P_CAPS_INDEX_PROP, - NVML_P2P_CAPS_INDEX_UNKNOWN -}nvmlGpuP2PCapsIndex_t; - -/** - * Maximum limit on Physical Bridges per Board - */ -#define NVML_MAX_PHYSICAL_BRIDGE (128) - -/** - * Information about the Bridge Chip Firmware - */ -typedef struct nvmlBridgeChipInfo_st -{ - nvmlBridgeChipType_t type; //!< Type of Bridge Chip - unsigned int fwVersion; //!< Firmware Version. 0=Version is unavailable -}nvmlBridgeChipInfo_t; - -/** - * This structure stores the complete Hierarchy of the Bridge Chip within the board. The immediate - * bridge is stored at index 0 of bridgeInfoList, parent to immediate bridge is at index 1 and so forth. 
- */ -typedef struct nvmlBridgeChipHierarchy_st -{ - unsigned char bridgeCount; //!< Number of Bridge Chips on the Board - nvmlBridgeChipInfo_t bridgeChipInfo[NVML_MAX_PHYSICAL_BRIDGE]; //!< Hierarchy of Bridge Chips on the board -}nvmlBridgeChipHierarchy_t; - -/** - * Represents Type of Sampling Event - */ -typedef enum nvmlSamplingType_enum -{ - NVML_TOTAL_POWER_SAMPLES = 0, //!< To represent total power drawn by GPU - NVML_GPU_UTILIZATION_SAMPLES = 1, //!< To represent percent of time during which one or more kernels was executing on the GPU - NVML_MEMORY_UTILIZATION_SAMPLES = 2, //!< To represent percent of time during which global (device) memory was being read or written - NVML_ENC_UTILIZATION_SAMPLES = 3, //!< To represent percent of time during which NVENC remains busy - NVML_DEC_UTILIZATION_SAMPLES = 4, //!< To represent percent of time during which NVDEC remains busy - NVML_PROCESSOR_CLK_SAMPLES = 5, //!< To represent processor clock samples - NVML_MEMORY_CLK_SAMPLES = 6, //!< To represent memory clock samples - - // Keep this last - NVML_SAMPLINGTYPE_COUNT -}nvmlSamplingType_t; - -/** - * Represents the queryable PCIe utilization counters - */ -typedef enum nvmlPcieUtilCounter_enum -{ - NVML_PCIE_UTIL_TX_BYTES = 0, // 1KB granularity - NVML_PCIE_UTIL_RX_BYTES = 1, // 1KB granularity - - // Keep this last - NVML_PCIE_UTIL_COUNT -} nvmlPcieUtilCounter_t; - -/** - * Represents the type for sample value returned - */ -typedef enum nvmlValueType_enum -{ - NVML_VALUE_TYPE_DOUBLE = 0, - NVML_VALUE_TYPE_UNSIGNED_INT = 1, - NVML_VALUE_TYPE_UNSIGNED_LONG = 2, - NVML_VALUE_TYPE_UNSIGNED_LONG_LONG = 3, - NVML_VALUE_TYPE_SIGNED_LONG_LONG = 4, - - // Keep this last - NVML_VALUE_TYPE_COUNT -}nvmlValueType_t; - - -/** - * Union to represent different types of Value - */ -typedef union nvmlValue_st -{ - double dVal; //!< If the value is double - unsigned int uiVal; //!< If the value is unsigned int - unsigned long ulVal; //!< If the value is unsigned long - unsigned long 
long ullVal; //!< If the value is unsigned long long - signed long long sllVal; //!< If the value is signed long long -}nvmlValue_t; - -/** - * Information for Sample - */ -typedef struct nvmlSample_st -{ - unsigned long long timeStamp; //!< CPU Timestamp in microseconds - nvmlValue_t sampleValue; //!< Sample Value -}nvmlSample_t; - -/** - * Represents type of perf policy for which violation times can be queried - */ -typedef enum nvmlPerfPolicyType_enum -{ - NVML_PERF_POLICY_POWER = 0, //!< How long did power violations cause the GPU to be below application clocks - NVML_PERF_POLICY_THERMAL = 1, //!< How long did thermal violations cause the GPU to be below application clocks - NVML_PERF_POLICY_SYNC_BOOST = 2, //!< How long did sync boost cause the GPU to be below application clocks - NVML_PERF_POLICY_BOARD_LIMIT = 3, //!< How long did the board limit cause the GPU to be below application clocks - NVML_PERF_POLICY_LOW_UTILIZATION = 4, //!< How long did low utilization cause the GPU to be below application clocks - NVML_PERF_POLICY_RELIABILITY = 5, //!< How long did the board reliability limit cause the GPU to be below application clocks - - NVML_PERF_POLICY_TOTAL_APP_CLOCKS = 10, //!< Total time the GPU was held below application clocks by any limiter (0 - 5 above) - NVML_PERF_POLICY_TOTAL_BASE_CLOCKS = 11, //!< Total time the GPU was held below base clocks - - // Keep this last - NVML_PERF_POLICY_COUNT -}nvmlPerfPolicyType_t; - -/** - * Struct to hold perf policy violation status data - */ -typedef struct nvmlViolationTime_st -{ - unsigned long long referenceTime; //!< referenceTime represents CPU timestamp in microseconds - unsigned long long violationTime; //!< violationTime in Nanoseconds -}nvmlViolationTime_t; - -/** @} */ - -/***************************************************************************************************/ -/** @defgroup nvmlDeviceEnumvs Device Enums - * @{ - */ 
-/***************************************************************************************************/ - -/** - * Generic enable/disable enum. - */ -typedef enum nvmlEnableState_enum -{ - NVML_FEATURE_DISABLED = 0, //!< Feature disabled - NVML_FEATURE_ENABLED = 1 //!< Feature enabled -} nvmlEnableState_t; - -//! Generic flag used to specify the default behavior of some functions. See description of particular functions for details. -#define nvmlFlagDefault 0x00 -//! Generic flag used to force some behavior. See description of particular functions for details. -#define nvmlFlagForce 0x01 - -/** - * * The Brand of the GPU - * */ -typedef enum nvmlBrandType_enum -{ - NVML_BRAND_UNKNOWN = 0, - NVML_BRAND_QUADRO = 1, - NVML_BRAND_TESLA = 2, - NVML_BRAND_NVS = 3, - NVML_BRAND_GRID = 4, - NVML_BRAND_GEFORCE = 5, - NVML_BRAND_TITAN = 6, - - // Keep this last - NVML_BRAND_COUNT -} nvmlBrandType_t; - -/** - * Temperature thresholds. - */ -typedef enum nvmlTemperatureThresholds_enum -{ - NVML_TEMPERATURE_THRESHOLD_SHUTDOWN = 0, // Temperature at which the GPU will shut down - // for HW protection - NVML_TEMPERATURE_THRESHOLD_SLOWDOWN = 1, // Temperature at which the GPU will begin HW slowdown - NVML_TEMPERATURE_THRESHOLD_MEM_MAX = 2, // Memory Temperature at which the GPU will begin SW slowdown - NVML_TEMPERATURE_THRESHOLD_GPU_MAX = 3, // GPU Temperature at which the GPU can be throttled below base clock - // Keep this last - NVML_TEMPERATURE_THRESHOLD_COUNT -} nvmlTemperatureThresholds_t; - -/** - * Temperature sensors. - */ -typedef enum nvmlTemperatureSensors_enum -{ - NVML_TEMPERATURE_GPU = 0, //!< Temperature sensor for the GPU die - - // Keep this last - NVML_TEMPERATURE_COUNT -} nvmlTemperatureSensors_t; - -/** - * Compute mode. - * - * NVML_COMPUTEMODE_EXCLUSIVE_PROCESS was added in CUDA 4.0. - * Earlier CUDA versions supported a single exclusive mode, - * which is equivalent to NVML_COMPUTEMODE_EXCLUSIVE_THREAD in CUDA 4.0 and beyond. 
- */ -typedef enum nvmlComputeMode_enum -{ - NVML_COMPUTEMODE_DEFAULT = 0, //!< Default compute mode -- multiple contexts per device - NVML_COMPUTEMODE_EXCLUSIVE_THREAD = 1, //!< Support Removed - NVML_COMPUTEMODE_PROHIBITED = 2, //!< Compute-prohibited mode -- no contexts per device - NVML_COMPUTEMODE_EXCLUSIVE_PROCESS = 3, //!< Compute-exclusive-process mode -- only one context per device, usable from multiple threads at a time - - // Keep this last - NVML_COMPUTEMODE_COUNT -} nvmlComputeMode_t; - -/** - * ECC bit types. - * - * @deprecated See \ref nvmlMemoryErrorType_t for a more flexible type - */ -#define nvmlEccBitType_t nvmlMemoryErrorType_t - -/** - * Single bit ECC errors - * - * @deprecated Mapped to \ref NVML_MEMORY_ERROR_TYPE_CORRECTED - */ -#define NVML_SINGLE_BIT_ECC NVML_MEMORY_ERROR_TYPE_CORRECTED - -/** - * Double bit ECC errors - * - * @deprecated Mapped to \ref NVML_MEMORY_ERROR_TYPE_UNCORRECTED - */ -#define NVML_DOUBLE_BIT_ECC NVML_MEMORY_ERROR_TYPE_UNCORRECTED - -/** - * Memory error types - */ -typedef enum nvmlMemoryErrorType_enum -{ - /** - * A memory error that was corrected - * - * For ECC errors, these are single bit errors - * For Texture memory, these are errors fixed by resend - */ - NVML_MEMORY_ERROR_TYPE_CORRECTED = 0, - /** - * A memory error that was not corrected - * - * For ECC errors, these are double bit errors - * For Texture memory, these are errors where the resend fails - */ - NVML_MEMORY_ERROR_TYPE_UNCORRECTED = 1, - - - // Keep this last - NVML_MEMORY_ERROR_TYPE_COUNT //!< Count of memory error types - -} nvmlMemoryErrorType_t; - -/** - * ECC counter types. - * - * Note: Volatile counts are reset each time the driver loads. On Windows this is once per boot. On Linux this can be more frequent. - * On Linux the driver unloads when no active clients exist. If persistence mode is enabled or there is always a driver - * client active (e.g. X11), then Linux also sees per-boot behavior. 
If not, volatile counts are reset each time a compute app - * is run. - */ -typedef enum nvmlEccCounterType_enum -{ - NVML_VOLATILE_ECC = 0, //!< Volatile counts are reset each time the driver loads. - NVML_AGGREGATE_ECC = 1, //!< Aggregate counts persist across reboots (i.e. for the lifetime of the device) - - // Keep this last - NVML_ECC_COUNTER_TYPE_COUNT //!< Count of memory counter types -} nvmlEccCounterType_t; - -/** - * Clock types. - * - * All speeds are in Mhz. - */ -typedef enum nvmlClockType_enum -{ - NVML_CLOCK_GRAPHICS = 0, //!< Graphics clock domain - NVML_CLOCK_SM = 1, //!< SM clock domain - NVML_CLOCK_MEM = 2, //!< Memory clock domain - NVML_CLOCK_VIDEO = 3, //!< Video encoder/decoder clock domain - - // Keep this last - NVML_CLOCK_COUNT //usedGpuMemory is not supported - - - unsigned long long time; //!< Amount of time in ms during which the compute context was active. The time is reported as 0 if - //!< the process is not terminated - - unsigned long long startTime; //!< CPU Timestamp in usec representing start time for the process - - unsigned int isRunning; //!< Flag to represent if the process is running (1 for running, 0 for terminated) - - unsigned int reserved[5]; //!< Reserved for future use -} nvmlAccountingStats_t; - -/** @} */ - -/***************************************************************************************************/ -/** @defgroup nvmlVgpuConstants Vgpu Constants - * @{ - */ -/***************************************************************************************************/ - -/** - * Buffer size guaranteed to be large enough for \ref nvmlVgpuTypeGetLicense - */ -#define NVML_GRID_LICENSE_BUFFER_SIZE 128 - -#define NVML_VGPU_NAME_BUFFER_SIZE 64 - -#define NVML_GRID_LICENSE_FEATURE_MAX_COUNT 3 - -/*! - * Macros for pGPU's virtualization capabilities bitfield. 
- */ -#define NVML_VGPU_PGPU_VIRTUALIZATION_CAP_MIGRATION 0:0 -#define NVML_VGPU_PGPU_VIRTUALIZATION_CAP_MIGRATION_NO 0x0 -#define NVML_VGPU_PGPU_VIRTUALIZATION_CAP_MIGRATION_YES 0x1 - -/** @} */ - -/***************************************************************************************************/ -/** @defgroup nvmlVgpuEnum Vgpu Enum - * @{ - */ -/***************************************************************************************************/ - -/*! - * Types of VM identifiers - */ -typedef enum nvmlVgpuVmIdType { - NVML_VGPU_VM_ID_DOMAIN_ID = 0, //!< VM ID represents DOMAIN ID - NVML_VGPU_VM_ID_UUID = 1, //!< VM ID represents UUID -} nvmlVgpuVmIdType_t; - -// vGPU GUEST info state. -typedef enum nvmlVgpuGuestInfoState_enum -{ - NVML_VGPU_INSTANCE_GUEST_INFO_STATE_UNINITIALIZED = 0, //= 0 and < \a unitCount - * @param unit Reference in which to return the unit handle - * - * @return - * - \ref NVML_SUCCESS if \a unit has been set - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a index is invalid or \a unit is NULL - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - */ -nvmlReturn_t DECLDIR nvmlUnitGetHandleByIndex(unsigned int index, nvmlUnit_t *unit); - -/** - * Retrieves the static information associated with a unit. - * - * For S-class products. - * - * See \ref nvmlUnitInfo_t for details on available unit info. - * - * @param unit The identifier of the target unit - * @param info Reference in which to return the unit information - * - * @return - * - \ref NVML_SUCCESS if \a info has been populated - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a unit is invalid or \a info is NULL - */ -nvmlReturn_t DECLDIR nvmlUnitGetUnitInfo(nvmlUnit_t unit, nvmlUnitInfo_t *info); - -/** - * Retrieves the LED state associated with this unit. - * - * For S-class products. 
- * - * See \ref nvmlLedState_t for details on allowed states. - * - * @param unit The identifier of the target unit - * @param state Reference in which to return the current LED state - * - * @return - * - \ref NVML_SUCCESS if \a state has been set - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a unit is invalid or \a state is NULL - * - \ref NVML_ERROR_NOT_SUPPORTED if this is not an S-class product - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - * - * @see nvmlUnitSetLedState() - */ -nvmlReturn_t DECLDIR nvmlUnitGetLedState(nvmlUnit_t unit, nvmlLedState_t *state); - -/** - * Retrieves the PSU stats for the unit. - * - * For S-class products. - * - * See \ref nvmlPSUInfo_t for details on available PSU info. - * - * @param unit The identifier of the target unit - * @param psu Reference in which to return the PSU information - * - * @return - * - \ref NVML_SUCCESS if \a psu has been populated - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a unit is invalid or \a psu is NULL - * - \ref NVML_ERROR_NOT_SUPPORTED if this is not an S-class product - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - */ -nvmlReturn_t DECLDIR nvmlUnitGetPsuInfo(nvmlUnit_t unit, nvmlPSUInfo_t *psu); - -/** - * Retrieves the temperature readings for the unit, in degrees C. - * - * For S-class products. - * - * Depending on the product, readings may be available for intake (type=0), - * exhaust (type=1) and board (type=2). 
- * - * @param unit The identifier of the target unit - * @param type The type of reading to take - * @param temp Reference in which to return the intake temperature - * - * @return - * - \ref NVML_SUCCESS if \a temp has been populated - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a unit or \a type is invalid or \a temp is NULL - * - \ref NVML_ERROR_NOT_SUPPORTED if this is not an S-class product - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - */ -nvmlReturn_t DECLDIR nvmlUnitGetTemperature(nvmlUnit_t unit, unsigned int type, unsigned int *temp); - -/** - * Retrieves the fan speed readings for the unit. - * - * For S-class products. - * - * See \ref nvmlUnitFanSpeeds_t for details on available fan speed info. - * - * @param unit The identifier of the target unit - * @param fanSpeeds Reference in which to return the fan speed information - * - * @return - * - \ref NVML_SUCCESS if \a fanSpeeds has been populated - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a unit is invalid or \a fanSpeeds is NULL - * - \ref NVML_ERROR_NOT_SUPPORTED if this is not an S-class product - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - */ -nvmlReturn_t DECLDIR nvmlUnitGetFanSpeedInfo(nvmlUnit_t unit, nvmlUnitFanSpeeds_t *fanSpeeds); - -/** - * Retrieves the set of GPU devices that are attached to the specified unit. - * - * For S-class products. - * - * The \a deviceCount argument is expected to be set to the size of the input \a devices array. 
- * - * @param unit The identifier of the target unit - * @param deviceCount Reference in which to provide the \a devices array size, and - * to return the number of attached GPU devices - * @param devices Reference in which to return the references to the attached GPU devices - * - * @return - * - \ref NVML_SUCCESS if \a deviceCount and \a devices have been populated - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INSUFFICIENT_SIZE if \a deviceCount indicates that the \a devices array is too small - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a unit is invalid, either of \a deviceCount or \a devices is NULL - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - */ -nvmlReturn_t DECLDIR nvmlUnitGetDevices(nvmlUnit_t unit, unsigned int *deviceCount, nvmlDevice_t *devices); - -/** - * Retrieves the IDs and firmware versions for any Host Interface Cards (HICs) in the system. - * - * For S-class products. - * - * The \a hwbcCount argument is expected to be set to the size of the input \a hwbcEntries array. - * The HIC must be connected to an S-class system for it to be reported by this function. 
- * - * @param hwbcCount Size of hwbcEntries array - * @param hwbcEntries Array holding information about hwbc - * - * @return - * - \ref NVML_SUCCESS if \a hwbcCount and \a hwbcEntries have been populated - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if either \a hwbcCount or \a hwbcEntries is NULL - * - \ref NVML_ERROR_INSUFFICIENT_SIZE if \a hwbcCount indicates that the \a hwbcEntries array is too small - */ -nvmlReturn_t DECLDIR nvmlSystemGetHicVersion(unsigned int *hwbcCount, nvmlHwbcEntry_t *hwbcEntries); -/** @} */ - -/***************************************************************************************************/ -/** @defgroup nvmlDeviceQueries Device Queries - * This chapter describes the queries that NVML can perform against each device. - * In each case the device is identified with an nvmlDevice_t handle. This handle is obtained by - * calling one of \ref nvmlDeviceGetHandleByIndex(), \ref nvmlDeviceGetHandleBySerial(), - * \ref nvmlDeviceGetHandleByPciBusId(), or \ref nvmlDeviceGetHandleByUUID(). - * @{ - */ -/***************************************************************************************************/ - - /** - * Retrieves the number of compute devices in the system. A compute device is a single GPU. - * - * For all products. - * - * Note: New nvmlDeviceGetCount_v2 (default in NVML 5.319) returns count of all devices in the system - * even if nvmlDeviceGetHandleByIndex_v2 returns NVML_ERROR_NO_PERMISSION for such device. - * Update your code to handle this error, or use NVML 4.304 or older nvml header file. - * For backward binary compatibility reasons _v1 version of the API is still present in the shared - * library. - * Old _v1 version of nvmlDeviceGetCount doesn't count devices that NVML has no permission to talk to. 
- * - * @param deviceCount Reference in which to return the number of accessible devices - * - * @return - * - \ref NVML_SUCCESS if \a deviceCount has been set - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a deviceCount is NULL - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - */ -nvmlReturn_t DECLDIR nvmlDeviceGetCount(unsigned int *deviceCount); - -/** - * Acquire the handle for a particular device, based on its index. - * - * For all products. - * - * Valid indices are derived from the \a accessibleDevices count returned by - * \ref nvmlDeviceGetCount(). For example, if \a accessibleDevices is 2 the valid indices - * are 0 and 1, corresponding to GPU 0 and GPU 1. - * - * The order in which NVML enumerates devices has no guarantees of consistency between reboots. For that reason it - * is recommended that devices be looked up by their PCI ids or UUID. See - * \ref nvmlDeviceGetHandleByUUID() and \ref nvmlDeviceGetHandleByPciBusId(). - * - * Note: The NVML index may not correlate with other APIs, such as the CUDA device index. - * - * Starting from NVML 5, this API causes NVML to initialize the target GPU - * NVML may initialize additional GPUs if: - * - The target GPU is an SLI slave - * - * Note: New nvmlDeviceGetCount_v2 (default in NVML 5.319) returns count of all devices in the system - * even if nvmlDeviceGetHandleByIndex_v2 returns NVML_ERROR_NO_PERMISSION for such device. - * Update your code to handle this error, or use NVML 4.304 or older nvml header file. - * For backward binary compatibility reasons _v1 version of the API is still present in the shared - * library. - * Old _v1 version of nvmlDeviceGetCount doesn't count devices that NVML has no permission to talk to. - * - * This means that nvmlDeviceGetHandleByIndex_v2 and _v1 can return different devices for the same index. 
- * If you don't touch macros that map old (_v1) versions to _v2 versions at the top of the file you don't - * need to worry about that. - * - * @param index The index of the target GPU, >= 0 and < \a accessibleDevices - * @param device Reference in which to return the device handle - * - * @return - * - \ref NVML_SUCCESS if \a device has been set - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a index is invalid or \a device is NULL - * - \ref NVML_ERROR_INSUFFICIENT_POWER if any attached devices have improperly attached external power cables - * - \ref NVML_ERROR_NO_PERMISSION if the user doesn't have permission to talk to this device - * - \ref NVML_ERROR_IRQ_ISSUE if NVIDIA kernel detected an interrupt issue with the attached GPUs - * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - * - * @see nvmlDeviceGetIndex - * @see nvmlDeviceGetCount - */ -nvmlReturn_t DECLDIR nvmlDeviceGetHandleByIndex(unsigned int index, nvmlDevice_t *device); - -/** - * Acquire the handle for a particular device, based on its board serial number. - * - * For Fermi &tm; or newer fully supported devices. - * - * This number corresponds to the value printed directly on the board, and to the value returned by - * \ref nvmlDeviceGetSerial(). - * - * @deprecated Since more than one GPU can exist on a single board this function is deprecated in favor - * of \ref nvmlDeviceGetHandleByUUID. - * For dual GPU boards this function will return NVML_ERROR_INVALID_ARGUMENT. 
- * - * Starting from NVML 5, this API causes NVML to initialize the target GPU - * NVML may initialize additional GPUs as it searches for the target GPU - * - * @param serial The board serial number of the target GPU - * @param device Reference in which to return the device handle - * - * @return - * - \ref NVML_SUCCESS if \a device has been set - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a serial is invalid, \a device is NULL or more than one - * device has the same serial (dual GPU boards) - * - \ref NVML_ERROR_NOT_FOUND if \a serial does not match a valid device on the system - * - \ref NVML_ERROR_INSUFFICIENT_POWER if any attached devices have improperly attached external power cables - * - \ref NVML_ERROR_IRQ_ISSUE if NVIDIA kernel detected an interrupt issue with the attached GPUs - * - \ref NVML_ERROR_GPU_IS_LOST if any GPU has fallen off the bus or is otherwise inaccessible - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - * - * @see nvmlDeviceGetSerial - * @see nvmlDeviceGetHandleByUUID - */ -nvmlReturn_t DECLDIR nvmlDeviceGetHandleBySerial(const char *serial, nvmlDevice_t *device); - -/** - * Acquire the handle for a particular device, based on its globally unique immutable UUID associated with each device. - * - * For all products. 
- * - * @param uuid The UUID of the target GPU - * @param device Reference in which to return the device handle - * - * Starting from NVML 5, this API causes NVML to initialize the target GPU - * NVML may initialize additional GPUs as it searches for the target GPU - * - * @return - * - \ref NVML_SUCCESS if \a device has been set - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a uuid is invalid or \a device is null - * - \ref NVML_ERROR_NOT_FOUND if \a uuid does not match a valid device on the system - * - \ref NVML_ERROR_INSUFFICIENT_POWER if any attached devices have improperly attached external power cables - * - \ref NVML_ERROR_IRQ_ISSUE if NVIDIA kernel detected an interrupt issue with the attached GPUs - * - \ref NVML_ERROR_GPU_IS_LOST if any GPU has fallen off the bus or is otherwise inaccessible - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - * - * @see nvmlDeviceGetUUID - */ -nvmlReturn_t DECLDIR nvmlDeviceGetHandleByUUID(const char *uuid, nvmlDevice_t *device); - -/** - * Acquire the handle for a particular device, based on its PCI bus id. - * - * For all products. - * - * This value corresponds to the nvmlPciInfo_t::busId returned by \ref nvmlDeviceGetPciInfo(). - * - * Starting from NVML 5, this API causes NVML to initialize the target GPU - * NVML may initialize additional GPUs if: - * - The target GPU is an SLI slave - * - * \note NVML 4.304 and older version of nvmlDeviceGetHandleByPciBusId"_v1" returns NVML_ERROR_NOT_FOUND - * instead of NVML_ERROR_NO_PERMISSION. 
- * - * @param pciBusId The PCI bus id of the target GPU - * @param device Reference in which to return the device handle - * - * @return - * - \ref NVML_SUCCESS if \a device has been set - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a pciBusId is invalid or \a device is NULL - * - \ref NVML_ERROR_NOT_FOUND if \a pciBusId does not match a valid device on the system - * - \ref NVML_ERROR_INSUFFICIENT_POWER if the attached device has improperly attached external power cables - * - \ref NVML_ERROR_NO_PERMISSION if the user doesn't have permission to talk to this device - * - \ref NVML_ERROR_IRQ_ISSUE if NVIDIA kernel detected an interrupt issue with the attached GPUs - * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - */ -nvmlReturn_t DECLDIR nvmlDeviceGetHandleByPciBusId(const char *pciBusId, nvmlDevice_t *device); - -/** - * Retrieves the name of this device. - * - * For all products. - * - * The name is an alphanumeric string that denotes a particular product, e.g. Tesla &tm; C2070. It will not - * exceed 64 characters in length (including the NULL terminator). See \ref - * nvmlConstants::NVML_DEVICE_NAME_BUFFER_SIZE. 
- * - * @param device The identifier of the target device - * @param name Reference in which to return the product name - * @param length The maximum allowed length of the string returned in \a name - * - * @return - * - \ref NVML_SUCCESS if \a name has been set - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid, or \a name is NULL - * - \ref NVML_ERROR_INSUFFICIENT_SIZE if \a length is too small - * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - */ -nvmlReturn_t DECLDIR nvmlDeviceGetName(nvmlDevice_t device, char *name, unsigned int length); - -/** - * Retrieves the brand of this device. - * - * For all products. - * - * The type is a member of \ref nvmlBrandType_t defined above. - * - * @param device The identifier of the target device - * @param type Reference in which to return the product brand type - * - * @return - * - \ref NVML_SUCCESS if \a type has been set - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid, or \a type is NULL - * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - */ -nvmlReturn_t DECLDIR nvmlDeviceGetBrand(nvmlDevice_t device, nvmlBrandType_t *type); - -/** - * Retrieves the NVML index of this device. - * - * For all products. - * - * Valid indices are derived from the \a accessibleDevices count returned by - * \ref nvmlDeviceGetCount(). For example, if \a accessibleDevices is 2 the valid indices - * are 0 and 1, corresponding to GPU 0 and GPU 1. - * - * The order in which NVML enumerates devices has no guarantees of consistency between reboots. 
For that reason it - * is recommended that devices be looked up by their PCI ids or GPU UUID. See - * \ref nvmlDeviceGetHandleByPciBusId() and \ref nvmlDeviceGetHandleByUUID(). - * - * Note: The NVML index may not correlate with other APIs, such as the CUDA device index. - * - * @param device The identifier of the target device - * @param index Reference in which to return the NVML index of the device - * - * @return - * - \ref NVML_SUCCESS if \a index has been set - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid, or \a index is NULL - * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - * - * @see nvmlDeviceGetHandleByIndex() - * @see nvmlDeviceGetCount() - */ -nvmlReturn_t DECLDIR nvmlDeviceGetIndex(nvmlDevice_t device, unsigned int *index); - -/** - * Retrieves the globally unique board serial number associated with this device's board. - * - * For all products with an inforom. - * - * The serial number is an alphanumeric string that will not exceed 30 characters (including the NULL terminator). - * This number matches the serial number tag that is physically attached to the board. See \ref - * nvmlConstants::NVML_DEVICE_SERIAL_BUFFER_SIZE. 
- * - * @param device The identifier of the target device - * @param serial Reference in which to return the board/module serial number - * @param length The maximum allowed length of the string returned in \a serial - * - * @return - * - \ref NVML_SUCCESS if \a serial has been set - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid, or \a serial is NULL - * - \ref NVML_ERROR_INSUFFICIENT_SIZE if \a length is too small - * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature - * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - */ -nvmlReturn_t DECLDIR nvmlDeviceGetSerial(nvmlDevice_t device, char *serial, unsigned int length); - -/** - * Retrieves an array of unsigned ints (sized to cpuSetSize) of bitmasks with the ideal CPU affinity for the device - * For example, if processors 0, 1, 32, and 33 are ideal for the device and cpuSetSize == 2, - * result[0] = 0x3, result[1] = 0x3 - * - * For Kepler &tm; or newer fully supported devices. - * Supported on Linux only. 
- * - * @param device The identifier of the target device - * @param cpuSetSize The size of the cpuSet array that is safe to access - * @param cpuSet Array reference in which to return a bitmask of CPUs, 64 CPUs per - * unsigned long on 64-bit machines, 32 on 32-bit machines - * - * @return - * - \ref NVML_SUCCESS if \a cpuSet has been filled - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid, cpuSetSize == 0, or cpuSet is NULL - * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature - * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - */ -nvmlReturn_t DECLDIR nvmlDeviceGetCpuAffinity(nvmlDevice_t device, unsigned int cpuSetSize, unsigned long *cpuSet); - -/** - * Sets the ideal affinity for the calling thread and device using the guidelines - * given in nvmlDeviceGetCpuAffinity(). Note, this is a change as of version 8.0. - * Older versions set the affinity for a calling process and all children. - * Currently supports up to 64 processors. - * - * For Kepler &tm; or newer fully supported devices. - * Supported on Linux only. - * - * @param device The identifier of the target device - * - * @return - * - \ref NVML_SUCCESS if the calling process has been successfully bound - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid - * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature - * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - */ -nvmlReturn_t DECLDIR nvmlDeviceSetCpuAffinity(nvmlDevice_t device); - -/** - * Clear all affinity bindings for the calling thread. 
Note, this is a change as of version - * 8.0 as older versions cleared the affinity for a calling process and all children. - * - * For Kepler &tm; or newer fully supported devices. - * Supported on Linux only. - * - * @param device The identifier of the target device - * - * @return - * - \ref NVML_SUCCESS if the calling process has been successfully unbound - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - */ -nvmlReturn_t DECLDIR nvmlDeviceClearCpuAffinity(nvmlDevice_t device); - -/** - * Retrieve the common ancestor for two devices - * For all products. - * Supported on Linux only. - * - * @param device1 The identifier of the first device - * @param device2 The identifier of the second device - * @param pathInfo A \ref nvmlGpuTopologyLevel_t that gives the path type - * - * @return - * - \ref NVML_SUCCESS if \a pathInfo has been set - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device1, or \a device2 is invalid, or \a pathInfo is NULL - * - \ref NVML_ERROR_NOT_SUPPORTED if the device or OS does not support this feature - * - \ref NVML_ERROR_UNKNOWN an error has occurred in underlying topology discovery - */ -nvmlReturn_t DECLDIR nvmlDeviceGetTopologyCommonAncestor(nvmlDevice_t device1, nvmlDevice_t device2, nvmlGpuTopologyLevel_t *pathInfo); - -/** - * Retrieve the set of GPUs that are nearest to a given device at a specific interconnectivity level - * For all products. - * Supported on Linux only. - * - * @param device The identifier of the first device - * @param level The \ref nvmlGpuTopologyLevel_t level to search for other GPUs - * @param count When zero, is set to the number of matching GPUs such that \a deviceArray - * can be malloc'd. When non-zero, \a deviceArray will be filled with \a count - * number of device handles. 
- * @param deviceArray An array of device handles for GPUs found at \a level - * - * @return - * - \ref NVML_SUCCESS if \a deviceArray or \a count (if initially zero) has been set - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device, \a level, or \a count is invalid, or \a deviceArray is NULL with a non-zero \a count - * - \ref NVML_ERROR_NOT_SUPPORTED if the device or OS does not support this feature - * - \ref NVML_ERROR_UNKNOWN an error has occurred in underlying topology discovery - */ -nvmlReturn_t DECLDIR nvmlDeviceGetTopologyNearestGpus(nvmlDevice_t device, nvmlGpuTopologyLevel_t level, unsigned int *count, nvmlDevice_t *deviceArray); - -/** - * Retrieve the set of GPUs that have a CPU affinity with the given CPU number - * For all products. - * Supported on Linux only. - * - * @param cpuNumber The CPU number - * @param count When zero, is set to the number of matching GPUs such that \a deviceArray - * can be malloc'd. When non-zero, \a deviceArray will be filled with \a count - * number of device handles. 
- * @param deviceArray An array of device handles for GPUs found with affinity to \a cpuNumber - * - * @return - * - \ref NVML_SUCCESS if \a deviceArray or \a count (if initially zero) has been set - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a cpuNumber, or \a count is invalid, or \a deviceArray is NULL with a non-zero \a count - * - \ref NVML_ERROR_NOT_SUPPORTED if the device or OS does not support this feature - * - \ref NVML_ERROR_UNKNOWN an error has occurred in underlying topology discovery - */ -nvmlReturn_t DECLDIR nvmlSystemGetTopologyGpuSet(unsigned int cpuNumber, unsigned int *count, nvmlDevice_t *deviceArray); - -/** - * Retrieve the status for a given p2p capability index between a given pair of GPU - * - * @param device1 The first device - * @param device2 The second device - * @param p2pIndex p2p Capability Index being looked for between \a device1 and \a device2 - * @param p2pStatus Reference in which to return the status of the \a p2pIndex - * between \a device1 and \a device2 - * @return - * - \ref NVML_SUCCESS if \a p2pStatus has been populated - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device1 or \a device2 or \a p2pIndex is invalid or \a p2pStatus is NULL - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - */ -nvmlReturn_t DECLDIR nvmlDeviceGetP2PStatus(nvmlDevice_t device1, nvmlDevice_t device2, nvmlGpuP2PCapsIndex_t p2pIndex,nvmlGpuP2PStatus_t *p2pStatus); - -/** - * Retrieves the globally unique immutable UUID associated with this device, as a 5 part hexadecimal string, - * that augments the immutable, board serial identifier. - * - * For all products. - * - * The UUID is a globally unique identifier. It is the only available identifier for pre-Fermi-architecture products. - * It does NOT correspond to any identifier printed on the board. It will not exceed 80 characters in length - * (including the NULL terminator). See \ref nvmlConstants::NVML_DEVICE_UUID_BUFFER_SIZE. 
- * - * @param device The identifier of the target device - * @param uuid Reference in which to return the GPU UUID - * @param length The maximum allowed length of the string returned in \a uuid - * - * @return - * - \ref NVML_SUCCESS if \a uuid has been set - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid, or \a uuid is NULL - * - \ref NVML_ERROR_INSUFFICIENT_SIZE if \a length is too small - * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature - * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - */ -nvmlReturn_t DECLDIR nvmlDeviceGetUUID(nvmlDevice_t device, char *uuid, unsigned int length); - -/** - * Retrieves minor number for the device. The minor number for the device is such that the Nvidia device node file for - * each GPU will have the form /dev/nvidia[minor number]. - * - * For all products. - * Supported only for Linux - * - * @param device The identifier of the target device - * @param minorNumber Reference in which to return the minor number for the device - * @return - * - \ref NVML_SUCCESS if the minor number is successfully retrieved - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a minorNumber is NULL - * - \ref NVML_ERROR_NOT_SUPPORTED if this query is not supported by the device - * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - */ -nvmlReturn_t DECLDIR nvmlDeviceGetMinorNumber(nvmlDevice_t device, unsigned int *minorNumber); - -/** - * Retrieves the device board part number which is programmed into the board's InfoROM - * - * For all products. 
- * - * @param device Identifier of the target device - * @param partNumber Reference to the buffer to return - * @param length Length of the buffer reference - * - * @return - * - \ref NVML_SUCCESS if \a partNumber has been set - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_NOT_SUPPORTED if the needed VBIOS fields have not been filled - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a partNumber is NULL - * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - */ -nvmlReturn_t DECLDIR nvmlDeviceGetBoardPartNumber(nvmlDevice_t device, char* partNumber, unsigned int length); - -/** - * Retrieves the version information for the device's infoROM object. - * - * For all products with an inforom. - * - * Fermi and higher parts have non-volatile on-board memory for persisting device info, such as aggregate - * ECC counts. The version of the data structures in this memory may change from time to time. It will not - * exceed 16 characters in length (including the NULL terminator). - * See \ref nvmlConstants::NVML_DEVICE_INFOROM_VERSION_BUFFER_SIZE. - * - * See \ref nvmlInforomObject_t for details on the available infoROM objects. 
- * - * @param device The identifier of the target device - * @param object The target infoROM object - * @param version Reference in which to return the infoROM version - * @param length The maximum allowed length of the string returned in \a version - * - * @return - * - \ref NVML_SUCCESS if \a version has been set - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a version is NULL - * - \ref NVML_ERROR_INSUFFICIENT_SIZE if \a length is too small - * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not have an infoROM - * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - * - * @see nvmlDeviceGetInforomImageVersion - */ -nvmlReturn_t DECLDIR nvmlDeviceGetInforomVersion(nvmlDevice_t device, nvmlInforomObject_t object, char *version, unsigned int length); - -/** - * Retrieves the global infoROM image version - * - * For all products with an inforom. - * - * Image version just like VBIOS version uniquely describes the exact version of the infoROM flashed on the board - * in contrast to infoROM object version which is only an indicator of supported features. - * Version string will not exceed 16 characters in length (including the NULL terminator). - * See \ref nvmlConstants::NVML_DEVICE_INFOROM_VERSION_BUFFER_SIZE. 
- * - * @param device The identifier of the target device - * @param version Reference in which to return the infoROM image version - * @param length The maximum allowed length of the string returned in \a version - * - * @return - * - \ref NVML_SUCCESS if \a version has been set - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a version is NULL - * - \ref NVML_ERROR_INSUFFICIENT_SIZE if \a length is too small - * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not have an infoROM - * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - * - * @see nvmlDeviceGetInforomVersion - */ -nvmlReturn_t DECLDIR nvmlDeviceGetInforomImageVersion(nvmlDevice_t device, char *version, unsigned int length); - -/** - * Retrieves the checksum of the configuration stored in the device's infoROM. - * - * For all products with an inforom. - * - * Can be used to make sure that two GPUs have the exact same configuration. - * Current checksum takes into account configuration stored in PWR and ECC infoROM objects. - * Checksum can change between driver releases or when user changes configuration (e.g. 
disable/enable ECC) - * - * @param device The identifier of the target device - * @param checksum Reference in which to return the infoROM configuration checksum - * - * @return - * - \ref NVML_SUCCESS if \a checksum has been set - * - \ref NVML_ERROR_CORRUPTED_INFOROM if the device's checksum couldn't be retrieved due to infoROM corruption - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a checksum is NULL - * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature - * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - */ -nvmlReturn_t DECLDIR nvmlDeviceGetInforomConfigurationChecksum(nvmlDevice_t device, unsigned int *checksum); - -/** - * Reads the infoROM from the flash and verifies the checksums. - * - * For all products with an inforom. - * - * @param device The identifier of the target device - * - * @return - * - \ref NVML_SUCCESS if infoROM is not corrupted - * - \ref NVML_ERROR_CORRUPTED_INFOROM if the device's infoROM is corrupted - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature - * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - */ -nvmlReturn_t DECLDIR nvmlDeviceValidateInforom(nvmlDevice_t device); - -/** - * Retrieves the display mode for the device. - * - * For all products. - * - * This method indicates whether a physical display (e.g. monitor) is currently connected to - * any of the device's connectors. - * - * See \ref nvmlEnableState_t for details on allowed modes. 
- * - * @param device The identifier of the target device - * @param display Reference in which to return the display mode - * - * @return - * - \ref NVML_SUCCESS if \a display has been set - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a display is NULL - * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature - * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - */ -nvmlReturn_t DECLDIR nvmlDeviceGetDisplayMode(nvmlDevice_t device, nvmlEnableState_t *display); - -/** - * Retrieves the display active state for the device. - * - * For all products. - * - * This method indicates whether a display is initialized on the device. - * For example whether X Server is attached to this device and has allocated memory for the screen. - * - * Display can be active even when no monitor is physically attached. - * - * See \ref nvmlEnableState_t for details on allowed modes. - * - * @param device The identifier of the target device - * @param isActive Reference in which to return the display active state - * - * @return - * - \ref NVML_SUCCESS if \a isActive has been set - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a isActive is NULL - * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature - * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - */ -nvmlReturn_t DECLDIR nvmlDeviceGetDisplayActive(nvmlDevice_t device, nvmlEnableState_t *isActive); - -/** - * Retrieves the persistence mode associated with this device. - * - * For all products. - * For Linux only. 
- * - * When driver persistence mode is enabled the driver software state is not torn down when the last - * client disconnects. By default this feature is disabled. - * - * See \ref nvmlEnableState_t for details on allowed modes. - * - * @param device The identifier of the target device - * @param mode Reference in which to return the current driver persistence mode - * - * @return - * - \ref NVML_SUCCESS if \a mode has been set - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a mode is NULL - * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature - * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - * - * @see nvmlDeviceSetPersistenceMode() - */ -nvmlReturn_t DECLDIR nvmlDeviceGetPersistenceMode(nvmlDevice_t device, nvmlEnableState_t *mode); - -/** - * Retrieves the PCI attributes of this device. - * - * For all products. - * - * See \ref nvmlPciInfo_t for details on the available PCI info. - * - * @param device The identifier of the target device - * @param pci Reference in which to return the PCI info - * - * @return - * - \ref NVML_SUCCESS if \a pci has been populated - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a pci is NULL - * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - */ -nvmlReturn_t DECLDIR nvmlDeviceGetPciInfo(nvmlDevice_t device, nvmlPciInfo_t *pci); - -/** - * Retrieves the maximum PCIe link generation possible with this device and system - * - * I.E. for a generation 2 PCIe device attached to a generation 1 PCIe bus the max link generation this function will - * report is generation 1. 
- * - * For Fermi &tm; or newer fully supported devices. - * - * @param device The identifier of the target device - * @param maxLinkGen Reference in which to return the max PCIe link generation - * - * @return - * - \ref NVML_SUCCESS if \a maxLinkGen has been populated - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a maxLinkGen is null - * - \ref NVML_ERROR_NOT_SUPPORTED if PCIe link information is not available - * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - */ -nvmlReturn_t DECLDIR nvmlDeviceGetMaxPcieLinkGeneration(nvmlDevice_t device, unsigned int *maxLinkGen); - -/** - * Retrieves the maximum PCIe link width possible with this device and system - * - * I.E. for a device with a 16x PCIe bus width attached to a 8x PCIe system bus this function will report - * a max link width of 8. - * - * For Fermi &tm; or newer fully supported devices. - * - * @param device The identifier of the target device - * @param maxLinkWidth Reference in which to return the max PCIe link width - * - * @return - * - \ref NVML_SUCCESS if \a maxLinkWidth has been populated - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a maxLinkWidth is null - * - \ref NVML_ERROR_NOT_SUPPORTED if PCIe link information is not available - * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - */ -nvmlReturn_t DECLDIR nvmlDeviceGetMaxPcieLinkWidth(nvmlDevice_t device, unsigned int *maxLinkWidth); - -/** - * Retrieves the current PCIe link generation - * - * For Fermi &tm; or newer fully supported devices.
- * - * @param device The identifier of the target device - * @param currLinkGen Reference in which to return the current PCIe link generation - * - * @return - * - \ref NVML_SUCCESS if \a currLinkGen has been populated - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a currLinkGen is null - * - \ref NVML_ERROR_NOT_SUPPORTED if PCIe link information is not available - * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - */ -nvmlReturn_t DECLDIR nvmlDeviceGetCurrPcieLinkGeneration(nvmlDevice_t device, unsigned int *currLinkGen); - -/** - * Retrieves the current PCIe link width - * - * For Fermi &tm; or newer fully supported devices. - * - * @param device The identifier of the target device - * @param currLinkWidth Reference in which to return the current PCIe link width - * - * @return - * - \ref NVML_SUCCESS if \a currLinkWidth has been populated - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a currLinkWidth is null - * - \ref NVML_ERROR_NOT_SUPPORTED if PCIe link information is not available - * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - */ -nvmlReturn_t DECLDIR nvmlDeviceGetCurrPcieLinkWidth(nvmlDevice_t device, unsigned int *currLinkWidth); - -/** - * Retrieve PCIe utilization information. - * This function is querying a byte counter over a 20ms interval and thus is the - * PCIe throughput over that interval. - * - * For Maxwell &tm; or newer fully supported devices. - * - * This method is not supported in virtual machines running virtual GPU (vGPU).
- * - * @param device The identifier of the target device - * @param counter The specific counter that should be queried \ref nvmlPcieUtilCounter_t - * @param value Reference in which to return throughput in KB/s - * - * @return - * - \ref NVML_SUCCESS if \a value has been set - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device or \a counter is invalid, or \a value is NULL - * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature - * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - */ -nvmlReturn_t DECLDIR nvmlDeviceGetPcieThroughput(nvmlDevice_t device, nvmlPcieUtilCounter_t counter, unsigned int *value); - -/** - * Retrieve the PCIe replay counter. - * - * For Kepler &tm; or newer fully supported devices. - * - * @param device The identifier of the target device - * @param value Reference in which to return the counter's value - * - * @return - * - \ref NVML_SUCCESS if \a value has been set - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a value is NULL - * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature - * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - */ -nvmlReturn_t DECLDIR nvmlDeviceGetPcieReplayCounter(nvmlDevice_t device, unsigned int *value); - -/** - * Retrieves the current clock speeds for the device. - * - * For Fermi &tm; or newer fully supported devices. - * - * See \ref nvmlClockType_t for details on available clock information.
- * - * @param device The identifier of the target device - * @param type Identify which clock domain to query - * @param clock Reference in which to return the clock speed in MHz - * - * @return - * - \ref NVML_SUCCESS if \a clock has been set - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a clock is NULL - * - \ref NVML_ERROR_NOT_SUPPORTED if the device cannot report the specified clock - * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - */ -nvmlReturn_t DECLDIR nvmlDeviceGetClockInfo(nvmlDevice_t device, nvmlClockType_t type, unsigned int *clock); - -/** - * Retrieves the maximum clock speeds for the device. - * - * For Fermi &tm; or newer fully supported devices. - * - * See \ref nvmlClockType_t for details on available clock information. - * - * \note On GPUs from Fermi family current P0 clocks (reported by \ref nvmlDeviceGetClockInfo) can differ from max clocks - * by few MHz. 
- * - * @param device The identifier of the target device - * @param type Identify which clock domain to query - * @param clock Reference in which to return the clock speed in MHz - * - * @return - * - \ref NVML_SUCCESS if \a clock has been set - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a clock is NULL - * - \ref NVML_ERROR_NOT_SUPPORTED if the device cannot report the specified clock - * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - */ -nvmlReturn_t DECLDIR nvmlDeviceGetMaxClockInfo(nvmlDevice_t device, nvmlClockType_t type, unsigned int *clock); - -/** - * Retrieves the current setting of a clock that applications will use unless an overspec situation occurs. - * Can be changed using \ref nvmlDeviceSetApplicationsClocks. - * - * For Kepler &tm; or newer fully supported devices. - * - * @param device The identifier of the target device - * @param clockType Identify which clock domain to query - * @param clockMHz Reference in which to return the clock in MHz - * - * @return - * - \ref NVML_SUCCESS if \a clockMHz has been set - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a clockMHz is NULL or \a clockType is invalid - * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature - * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - */ -nvmlReturn_t DECLDIR nvmlDeviceGetApplicationsClock(nvmlDevice_t device, nvmlClockType_t clockType, unsigned int *clockMHz); - -/** - * Retrieves the default applications clock that GPU boots with or - * defaults to after \ref nvmlDeviceResetApplicationsClocks call. 
- * - * For Kepler &tm; or newer fully supported devices. - * - * @param device The identifier of the target device - * @param clockType Identify which clock domain to query - * @param clockMHz Reference in which to return the default clock in MHz - * - * @return - * - \ref NVML_SUCCESS if \a clockMHz has been set - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a clockMHz is NULL or \a clockType is invalid - * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature - * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - * - * \see nvmlDeviceGetApplicationsClock - */ -nvmlReturn_t DECLDIR nvmlDeviceGetDefaultApplicationsClock(nvmlDevice_t device, nvmlClockType_t clockType, unsigned int *clockMHz); - -/** - * Resets the application clock to the default value - * - * This is the applications clock that will be used after system reboot or driver reload. - * Default value is constant, but the current value can be changed using \ref nvmlDeviceSetApplicationsClocks. - * - * On Pascal and newer hardware, if clocks were previously locked with \ref nvmlDeviceSetApplicationsClocks, - * this call will unlock clocks. This returns clocks to their default behavior of automatically boosting above - * base clocks as thermal limits allow. - * - * @see nvmlDeviceGetApplicationsClock - * @see nvmlDeviceSetApplicationsClocks - * - * For Fermi &tm; or newer non-GeForce fully supported devices and Maxwell or newer GeForce devices.
- * - * @param device The identifier of the target device - * - * @return - * - \ref NVML_SUCCESS if new settings were successfully set - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid - * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature - * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - */ -nvmlReturn_t DECLDIR nvmlDeviceResetApplicationsClocks(nvmlDevice_t device); - -/** - * Retrieves the clock speed for the clock specified by the clock type and clock ID. - * - * For Kepler &tm; or newer fully supported devices. - * - * @param device The identifier of the target device - * @param clockType Identify which clock domain to query - * @param clockId Identify which clock in the domain to query - * @param clockMHz Reference in which to return the clock in MHz - * - * @return - * - \ref NVML_SUCCESS if \a clockMHz has been set - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a clockMHz is NULL or \a clockType is invalid - * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature - * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - */ -nvmlReturn_t DECLDIR nvmlDeviceGetClock(nvmlDevice_t device, nvmlClockType_t clockType, nvmlClockId_t clockId, unsigned int *clockMHz); - -/** - * Retrieves the customer defined maximum boost clock speed specified by the given clock type. - * - * For Pascal &tm; or newer fully supported devices. 
- * - * @param device The identifier of the target device - * @param clockType Identify which clock domain to query - * @param clockMHz Reference in which to return the clock in MHz - * - * @return - * - \ref NVML_SUCCESS if \a clockMHz has been set - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a clockMHz is NULL or \a clockType is invalid - * - \ref NVML_ERROR_NOT_SUPPORTED if the device or the \a clockType on this device does not support this feature - * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - */ -nvmlReturn_t DECLDIR nvmlDeviceGetMaxCustomerBoostClock(nvmlDevice_t device, nvmlClockType_t clockType, unsigned int *clockMHz); - -/** - * Retrieves the list of possible memory clocks that can be used as an argument for \ref nvmlDeviceSetApplicationsClocks. - * - * For Kepler &tm; or newer fully supported devices. 
- * - * @param device The identifier of the target device - * @param count Reference in which to provide the \a clocksMHz array size, and - * to return the number of elements - * @param clocksMHz Reference in which to return the clock in MHz - * - * @return - * - \ref NVML_SUCCESS if \a count and \a clocksMHz have been populated - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a count is NULL - * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature - * - \ref NVML_ERROR_INSUFFICIENT_SIZE if \a count is too small (\a count is set to the number of - * required elements) - * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - * - * @see nvmlDeviceSetApplicationsClocks - * @see nvmlDeviceGetSupportedGraphicsClocks - */ -nvmlReturn_t DECLDIR nvmlDeviceGetSupportedMemoryClocks(nvmlDevice_t device, unsigned int *count, unsigned int *clocksMHz); - -/** - * Retrieves the list of possible graphics clocks that can be used as an argument for \ref nvmlDeviceSetApplicationsClocks. - * - * For Kepler &tm; or newer fully supported devices. 
- * - * @param device The identifier of the target device - * @param memoryClockMHz Memory clock for which to return possible graphics clocks - * @param count Reference in which to provide the \a clocksMHz array size, and - * to return the number of elements - * @param clocksMHz Reference in which to return the clocks in MHz - * - * @return - * - \ref NVML_SUCCESS if \a count and \a clocksMHz have been populated - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_NOT_FOUND if the specified \a memoryClockMHz is not a supported frequency - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a count is NULL - * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature - * - \ref NVML_ERROR_INSUFFICIENT_SIZE if \a count is too small - * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - * - * @see nvmlDeviceSetApplicationsClocks - * @see nvmlDeviceGetSupportedMemoryClocks - */ -nvmlReturn_t DECLDIR nvmlDeviceGetSupportedGraphicsClocks(nvmlDevice_t device, unsigned int memoryClockMHz, unsigned int *count, unsigned int *clocksMHz); - -/** - * Retrieve the current state of Auto Boosted clocks on a device and store it in \a isEnabled - * - * For Kepler &tm; or newer fully supported devices. - * - * Auto Boosted clocks are enabled by default on some hardware, allowing the GPU to run at higher clock rates - * to maximize performance as thermal limits allow. - * - * On Pascal and newer hardware, Auto Boosted clocks are controlled through application clocks. - * Use \ref nvmlDeviceSetApplicationsClocks and \ref nvmlDeviceResetApplicationsClocks to control Auto Boost - * behavior.
- * - * @param device The identifier of the target device - * @param isEnabled Where to store the current state of Auto Boosted clocks of the target device - * @param defaultIsEnabled Where to store the default Auto Boosted clocks behavior of the target device that the device will - * revert to when no applications are using the GPU - * - * @return - * - \ref NVML_SUCCESS If \a isEnabled has been set with the Auto Boosted clocks state of \a device - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a isEnabled is NULL - * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support Auto Boosted clocks - * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - * - */ -nvmlReturn_t DECLDIR nvmlDeviceGetAutoBoostedClocksEnabled(nvmlDevice_t device, nvmlEnableState_t *isEnabled, nvmlEnableState_t *defaultIsEnabled); - -/** - * Try to set the current state of Auto Boosted clocks on a device. - * - * For Kepler &tm; or newer fully supported devices. - * - * Auto Boosted clocks are enabled by default on some hardware, allowing the GPU to run at higher clock rates - * to maximize performance as thermal limits allow. Auto Boosted clocks should be disabled if fixed clock - * rates are desired. - * - * Non-root users may use this API by default but can be restricted by root from using this API by calling - * \ref nvmlDeviceSetAPIRestriction with apiType=NVML_RESTRICTED_API_SET_AUTO_BOOSTED_CLOCKS. - * Note: Persistence Mode is required to modify current Auto Boost settings, therefore, it must be enabled. - * - * On Pascal and newer hardware, Auto Boosted clocks are controlled through application clocks. - * Use \ref nvmlDeviceSetApplicationsClocks and \ref nvmlDeviceResetApplicationsClocks to control Auto Boost - * behavior.
- * - * @param device The identifier of the target device - * @param enabled What state to try to set Auto Boosted clocks of the target device to - * - * @return - * - \ref NVML_SUCCESS If the Auto Boosted clocks were successfully set to the state specified by \a enabled - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid - * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support Auto Boosted clocks - * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - * - */ -nvmlReturn_t DECLDIR nvmlDeviceSetAutoBoostedClocksEnabled(nvmlDevice_t device, nvmlEnableState_t enabled); - -/** - * Try to set the default state of Auto Boosted clocks on a device. This is the default state that Auto Boosted clocks will - * return to when no compute running processes (e.g. CUDA application which have an active context) are running - * - * For Kepler &tm; or newer non-GeForce fully supported devices and Maxwell or newer GeForce devices. - * Requires root/admin permissions. - * - * Auto Boosted clocks are enabled by default on some hardware, allowing the GPU to run at higher clock rates - * to maximize performance as thermal limits allow. Auto Boosted clocks should be disabled if fixed clock - * rates are desired. - * - * On Pascal and newer hardware, Auto Boosted clocks are controlled through application clocks. - * Use \ref nvmlDeviceSetApplicationsClocks and \ref nvmlDeviceResetApplicationsClocks to control Auto Boost - * behavior. - * - * @param device The identifier of the target device - * @param enabled What state to try to set default Auto Boosted clocks of the target device to - * @param flags Flags that change the default behavior. Currently Unused. 
- * - * @return - * - \ref NVML_SUCCESS If the Auto Boosted clock's default state was successfully set to the state specified by \a enabled - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_NO_PERMISSION If the calling user does not have permission to change Auto Boosted clock's default state. - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid - * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support Auto Boosted clocks - * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - * - */ -nvmlReturn_t DECLDIR nvmlDeviceSetDefaultAutoBoostedClocksEnabled(nvmlDevice_t device, nvmlEnableState_t enabled, unsigned int flags); - - -/** - * Retrieves the intended operating speed of the device's fan. - * - * Note: The reported speed is the intended fan speed. If the fan is physically blocked and unable to spin, the - * output will not match the actual fan speed. - * - * For all discrete products with dedicated fans. - * - * The fan speed is expressed as a percent of the maximum, i.e. full speed is 100%. - * - * @param device The identifier of the target device - * @param speed Reference in which to return the fan speed percentage - * - * @return - * - \ref NVML_SUCCESS if \a speed has been set - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a speed is NULL - * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not have a fan - * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - */ -nvmlReturn_t DECLDIR nvmlDeviceGetFanSpeed(nvmlDevice_t device, unsigned int *speed); - -/** - * Retrieves the current temperature readings for the device, in degrees C. - * - * For all products. 
- * - * See \ref nvmlTemperatureSensors_t for details on available temperature sensors. - * - * @param device The identifier of the target device - * @param sensorType Flag that indicates which sensor reading to retrieve - * @param temp Reference in which to return the temperature reading - * - * @return - * - \ref NVML_SUCCESS if \a temp has been set - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid, \a sensorType is invalid or \a temp is NULL - * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not have the specified sensor - * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - */ -nvmlReturn_t DECLDIR nvmlDeviceGetTemperature(nvmlDevice_t device, nvmlTemperatureSensors_t sensorType, unsigned int *temp); - -/** - * Retrieves the temperature threshold for the GPU with the specified threshold type in degrees C. - * - * For Kepler &tm; or newer fully supported devices. - * - * See \ref nvmlTemperatureThresholds_t for details on available temperature thresholds. 
- * - * @param device The identifier of the target device - * @param thresholdType The type of threshold value queried - * @param temp Reference in which to return the temperature reading - * @return - * - \ref NVML_SUCCESS if \a temp has been set - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid, \a thresholdType is invalid or \a temp is NULL - * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not have a temperature sensor or is unsupported - * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - */ -nvmlReturn_t DECLDIR nvmlDeviceGetTemperatureThreshold(nvmlDevice_t device, nvmlTemperatureThresholds_t thresholdType, unsigned int *temp); - -/** - * Retrieves the current performance state for the device. - * - * For Fermi &tm; or newer fully supported devices. - * - * See \ref nvmlPstates_t for details on allowed performance states. - * - * @param device The identifier of the target device - * @param pState Reference in which to return the performance state reading - * - * @return - * - \ref NVML_SUCCESS if \a pState has been set - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a pState is NULL - * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature - * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - */ -nvmlReturn_t DECLDIR nvmlDeviceGetPerformanceState(nvmlDevice_t device, nvmlPstates_t *pState); - -/** - * Retrieves current clocks throttling reasons. - * - * For all fully supported products. - * - * \note More than one bit can be enabled at the same time. Multiple reasons can be affecting clocks at once. 
- * - * @param device The identifier of the target device - * @param clocksThrottleReasons Reference in which to return bitmask of active clocks throttle - * reasons - * - * @return - * - \ref NVML_SUCCESS if \a clocksThrottleReasons has been set - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a clocksThrottleReasons is NULL - * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature - * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - * - * @see nvmlClocksThrottleReasons - * @see nvmlDeviceGetSupportedClocksThrottleReasons - */ -nvmlReturn_t DECLDIR nvmlDeviceGetCurrentClocksThrottleReasons(nvmlDevice_t device, unsigned long long *clocksThrottleReasons); - -/** - * Retrieves bitmask of supported clocks throttle reasons that can be returned by - * \ref nvmlDeviceGetCurrentClocksThrottleReasons - * - * For all fully supported products. - * - * This method is not supported in virtual machines running virtual GPU (vGPU). 
- * - * @param device The identifier of the target device - * @param supportedClocksThrottleReasons Reference in which to return bitmask of supported - * clocks throttle reasons - * - * @return - * - \ref NVML_SUCCESS if \a supportedClocksThrottleReasons has been set - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a supportedClocksThrottleReasons is NULL - * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - * - * @see nvmlClocksThrottleReasons - * @see nvmlDeviceGetCurrentClocksThrottleReasons - */ -nvmlReturn_t DECLDIR nvmlDeviceGetSupportedClocksThrottleReasons(nvmlDevice_t device, unsigned long long *supportedClocksThrottleReasons); - -/** - * Deprecated: Use \ref nvmlDeviceGetPerformanceState. This function exposes an incorrect generalization. - * - * Retrieve the current performance state for the device. - * - * For Fermi &tm; or newer fully supported devices. - * - * See \ref nvmlPstates_t for details on allowed performance states. - * - * @param device The identifier of the target device - * @param pState Reference in which to return the performance state reading - * - * @return - * - \ref NVML_SUCCESS if \a pState has been set - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a pState is NULL - * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature - * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - */ -nvmlReturn_t DECLDIR nvmlDeviceGetPowerState(nvmlDevice_t device, nvmlPstates_t *pState); - -/** - * This API has been deprecated. - * - * Retrieves the power management mode associated with this device. 
- * - * For products from the Fermi family. - * - Requires \a NVML_INFOROM_POWER version 3.0 or higher. - * - * For products from the Kepler or newer families. - * - Does not require \a NVML_INFOROM_POWER object. - * - * This flag indicates whether any power management algorithm is currently active on the device. An - * enabled state does not necessarily mean the device is being actively throttled -- only that - * the driver will do so if the appropriate conditions are met. - * - * See \ref nvmlEnableState_t for details on allowed modes. - * - * @param device The identifier of the target device - * @param mode Reference in which to return the current power management mode - * - * @return - * - \ref NVML_SUCCESS if \a mode has been set - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a mode is NULL - * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature - * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - */ -nvmlReturn_t DECLDIR nvmlDeviceGetPowerManagementMode(nvmlDevice_t device, nvmlEnableState_t *mode); - -/** - * Retrieves the power management limit associated with this device. - * - * For Fermi &tm; or newer fully supported devices. - * - * The power limit defines the upper boundary for the card's power draw. If - * the card's total power draw reaches this limit the power management algorithm kicks in. - * - * This reading is only available if power management mode is supported. - * See \ref nvmlDeviceGetPowerManagementMode.
- * - * @param device The identifier of the target device - * @param limit Reference in which to return the power management limit in milliwatts - * - * @return - * - \ref NVML_SUCCESS if \a limit has been set - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a limit is NULL - * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature - * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - */ -nvmlReturn_t DECLDIR nvmlDeviceGetPowerManagementLimit(nvmlDevice_t device, unsigned int *limit); - -/** - * Retrieves information about possible values of power management limits on this device. - * - * For Kepler &tm; or newer fully supported devices. - * - * @param device The identifier of the target device - * @param minLimit Reference in which to return the minimum power management limit in milliwatts - * @param maxLimit Reference in which to return the maximum power management limit in milliwatts - * - * @return - * - \ref NVML_SUCCESS if \a minLimit and \a maxLimit have been set - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a minLimit or \a maxLimit is NULL - * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature - * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - * - * @see nvmlDeviceSetPowerManagementLimit - */ -nvmlReturn_t DECLDIR nvmlDeviceGetPowerManagementLimitConstraints(nvmlDevice_t device, unsigned int *minLimit, unsigned int *maxLimit); - -/** - * Retrieves default power management limit on this device, in milliwatts. 
- * Default power management limit is a power management limit that the device boots with. - * - * For Kepler &tm; or newer fully supported devices. - * - * @param device The identifier of the target device - * @param defaultLimit Reference in which to return the default power management limit in milliwatts - * - * @return - * - \ref NVML_SUCCESS if \a defaultLimit has been set - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a defaultLimit is NULL - * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature - * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - */ -nvmlReturn_t DECLDIR nvmlDeviceGetPowerManagementDefaultLimit(nvmlDevice_t device, unsigned int *defaultLimit); - -/** - * Retrieves power usage for this GPU in milliwatts and its associated circuitry (e.g. memory) - * - * For Fermi &tm; or newer fully supported devices. - * - * On Fermi and Kepler GPUs the reading is accurate to within +/- 5% of current power draw. - * - * It is only available if power management mode is supported. See \ref nvmlDeviceGetPowerManagementMode. 
- * - * @param device The identifier of the target device - * @param power Reference in which to return the power usage information - * - * @return - * - \ref NVML_SUCCESS if \a power has been populated - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a power is NULL - * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support power readings - * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - */ -nvmlReturn_t DECLDIR nvmlDeviceGetPowerUsage(nvmlDevice_t device, unsigned int *power); - -/** - * Retrieves total energy consumption for this GPU in millijoules (mJ) since the driver was last reloaded - * - * For newer than Pascal &tm; fully supported devices. - * - * @param device The identifier of the target device - * @param energy Reference in which to return the energy consumption information - * - * @return - * - \ref NVML_SUCCESS if \a energy has been populated - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a energy is NULL - * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support energy readings - * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - */ -nvmlReturn_t DECLDIR nvmlDeviceGetTotalEnergyConsumption(nvmlDevice_t device, unsigned long long *energy); - -/** - * Get the effective power limit that the driver enforces after taking into account all limiters - * - * Note: This can be different from the \ref nvmlDeviceGetPowerManagementLimit if other limits are set elsewhere - * This includes the out of band power limit interface - * - * For Kepler &tm; or newer fully supported devices. 
- * - * @param device The device to communicate with - * @param limit Reference in which to return the power management limit in milliwatts - * - * @return - * - \ref NVML_SUCCESS if \a limit has been set - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a limit is NULL - * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature - * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - */ -nvmlReturn_t DECLDIR nvmlDeviceGetEnforcedPowerLimit(nvmlDevice_t device, unsigned int *limit); - -/** - * Retrieves the current GOM and pending GOM (the one that GPU will switch to after reboot). - * - * For GK110 M-class and X-class Tesla &tm; products from the Kepler family. - * Modes \ref NVML_GOM_LOW_DP and \ref NVML_GOM_ALL_ON are supported on fully supported GeForce products. - * Not supported on Quadro ® and Tesla &tm; C-class products. 
- * - * @param device The identifier of the target device - * @param current Reference in which to return the current GOM - * @param pending Reference in which to return the pending GOM - * - * @return - * - \ref NVML_SUCCESS if \a current and \a pending have been populated - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a current or \a pending is NULL - * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature - * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - * - * @see nvmlGpuOperationMode_t - * @see nvmlDeviceSetGpuOperationMode - */ -nvmlReturn_t DECLDIR nvmlDeviceGetGpuOperationMode(nvmlDevice_t device, nvmlGpuOperationMode_t *current, nvmlGpuOperationMode_t *pending); - -/** - * Retrieves the amount of used, free and total memory available on the device, in bytes. - * - * For all products. - * - * Enabling ECC reduces the amount of total available memory, due to the extra required parity bits. - * Under WDDM most device memory is allocated and managed on startup by Windows. - * - * Under Linux and Windows TCC, the reported amount of used memory is equal to the sum of memory allocated - * by all active channels on the device. - * - * See \ref nvmlMemory_t for details on available memory info. 
- * - * @param device The identifier of the target device - * @param memory Reference in which to return the memory information - * - * @return - * - \ref NVML_SUCCESS if \a memory has been populated - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a memory is NULL - * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - */ -nvmlReturn_t DECLDIR nvmlDeviceGetMemoryInfo(nvmlDevice_t device, nvmlMemory_t *memory); - -/** - * Retrieves the current compute mode for the device. - * - * For all products. - * - * See \ref nvmlComputeMode_t for details on allowed compute modes. - * - * @param device The identifier of the target device - * @param mode Reference in which to return the current compute mode - * - * @return - * - \ref NVML_SUCCESS if \a mode has been set - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a mode is NULL - * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature - * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - * - * @see nvmlDeviceSetComputeMode() - */ -nvmlReturn_t DECLDIR nvmlDeviceGetComputeMode(nvmlDevice_t device, nvmlComputeMode_t *mode); - -/** - * Retrieves the CUDA compute capability of the device. - * - * For all products. - * - * Returns the major and minor compute capability version numbers of the - * device. The major and minor versions are equivalent to the - * CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR and - * CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR attributes that would be - * returned by CUDA's cuDeviceGetAttribute(). 
- * - * @param device The identifier of the target device - * @param major Reference in which to return the major CUDA compute capability - * @param minor Reference in which to return the minor CUDA compute capability - * - * @return - * - \ref NVML_SUCCESS if \a major and \a minor have been set - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a major or \a minor are NULL - * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - */ -nvmlReturn_t DECLDIR nvmlDeviceGetCudaComputeCapability(nvmlDevice_t device, int *major, int *minor); - -/** - * Retrieves the current and pending ECC modes for the device. - * - * For Fermi &tm; or newer fully supported devices. - * Only applicable to devices with ECC. - * Requires \a NVML_INFOROM_ECC version 1.0 or higher. - * - * Changing ECC modes requires a reboot. The "pending" ECC mode refers to the target mode following - * the next reboot. - * - * See \ref nvmlEnableState_t for details on allowed modes. 
- * - * @param device The identifier of the target device - * @param current Reference in which to return the current ECC mode - * @param pending Reference in which to return the pending ECC mode - * - * @return - * - \ref NVML_SUCCESS if \a current and \a pending have been set - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or either \a current or \a pending is NULL - * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature - * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - * - * @see nvmlDeviceSetEccMode() - */ -nvmlReturn_t DECLDIR nvmlDeviceGetEccMode(nvmlDevice_t device, nvmlEnableState_t *current, nvmlEnableState_t *pending); - -/** - * Retrieves the device boardId from 0-N. - * Devices with the same boardId indicate GPUs connected to the same PLX. Use in conjunction with - * \ref nvmlDeviceGetMultiGpuBoard() to decide if they are on the same board as well. - * The boardId returned is a unique ID for the current configuration. Uniqueness and ordering across - * reboots and system configurations is not guaranteed (i.e. if a Tesla K40c returns 0x100 and - * the two GPUs on a Tesla K10 in the same system returns 0x200 it is not guaranteed they will - * always return those values but they will always be different from each other). - * - * - * For Fermi &tm; or newer fully supported devices. 
- * - * @param device The identifier of the target device - * @param boardId Reference in which to return the device's board ID - * - * @return - * - \ref NVML_SUCCESS if \a boardId has been set - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a boardId is NULL - * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature - * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - */ -nvmlReturn_t DECLDIR nvmlDeviceGetBoardId(nvmlDevice_t device, unsigned int *boardId); - -/** - * Retrieves whether the device is on a Multi-GPU Board - * Devices that are on multi-GPU boards will set \a multiGpuBool to a non-zero value. - * - * For Fermi &tm; or newer fully supported devices. - * - * @param device The identifier of the target device - * @param multiGpuBool Reference in which to return a zero or non-zero value - * to indicate whether the device is on a multi GPU board - * - * @return - * - \ref NVML_SUCCESS if \a multiGpuBool has been set - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a multiGpuBool is NULL - * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature - * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - */ -nvmlReturn_t DECLDIR nvmlDeviceGetMultiGpuBoard(nvmlDevice_t device, unsigned int *multiGpuBool); - -/** - * Retrieves the total ECC error counts for the device. - * - * For Fermi &tm; or newer fully supported devices. - * Only applicable to devices with ECC. - * Requires \a NVML_INFOROM_ECC version 1.0 or higher. - * Requires ECC Mode to be enabled. 
- * - * The total error count is the sum of errors across each of the separate memory systems, i.e. the total set of - * errors across the entire device. - * - * See \ref nvmlMemoryErrorType_t for a description of available error types.\n - * See \ref nvmlEccCounterType_t for a description of available counter types. - * - * @param device The identifier of the target device - * @param errorType Flag that specifies the type of the errors. - * @param counterType Flag that specifies the counter-type of the errors. - * @param eccCounts Reference in which to return the specified ECC errors - * - * @return - * - \ref NVML_SUCCESS if \a eccCounts has been set - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device, \a errorType or \a counterType is invalid, or \a eccCounts is NULL - * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature - * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - * - * @see nvmlDeviceClearEccErrorCounts() - */ -nvmlReturn_t DECLDIR nvmlDeviceGetTotalEccErrors(nvmlDevice_t device, nvmlMemoryErrorType_t errorType, nvmlEccCounterType_t counterType, unsigned long long *eccCounts); - -/** - * Retrieves the detailed ECC error counts for the device. - * - * @deprecated This API supports only a fixed set of ECC error locations - * On different GPU architectures different locations are supported - * See \ref nvmlDeviceGetMemoryErrorCounter - * - * For Fermi &tm; or newer fully supported devices. - * Only applicable to devices with ECC. - * Requires \a NVML_INFOROM_ECC version 2.0 or higher to report aggregate location-based ECC counts. - * Requires \a NVML_INFOROM_ECC version 1.0 or higher to report all other ECC counts. - * Requires ECC Mode to be enabled. 
- * - * Detailed errors provide separate ECC counts for specific parts of the memory system. - * - * Reports zero for unsupported ECC error counters when a subset of ECC error counters are supported. - * - * See \ref nvmlMemoryErrorType_t for a description of available bit types.\n - * See \ref nvmlEccCounterType_t for a description of available counter types.\n - * See \ref nvmlEccErrorCounts_t for a description of provided detailed ECC counts. - * - * @param device The identifier of the target device - * @param errorType Flag that specifies the type of the errors. - * @param counterType Flag that specifies the counter-type of the errors. - * @param eccCounts Reference in which to return the specified ECC errors - * - * @return - * - \ref NVML_SUCCESS if \a eccCounts has been populated - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device, \a errorType or \a counterType is invalid, or \a eccCounts is NULL - * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature - * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - * - * @see nvmlDeviceClearEccErrorCounts() - */ -nvmlReturn_t DECLDIR nvmlDeviceGetDetailedEccErrors(nvmlDevice_t device, nvmlMemoryErrorType_t errorType, nvmlEccCounterType_t counterType, nvmlEccErrorCounts_t *eccCounts); - -/** - * Retrieves the requested memory error counter for the device. - * - * For Fermi &tm; or newer fully supported devices. - * Requires \a NVML_INFOROM_ECC version 2.0 or higher to report aggregate location-based memory error counts. - * Requires \a NVML_INFOROM_ECC version 1.0 or higher to report all other memory error counts. - * - * Only applicable to devices with ECC. - * - * Requires ECC Mode to be enabled. 
- * - * See \ref nvmlMemoryErrorType_t for a description of available memory error types.\n - * See \ref nvmlEccCounterType_t for a description of available counter types.\n - * See \ref nvmlMemoryLocation_t for a description of available counter locations.\n - * - * @param device The identifier of the target device - * @param errorType Flag that specifies the type of error. - * @param counterType Flag that specifies the counter-type of the errors. - * @param locationType Specifies the location of the counter. - * @param count Reference in which to return the ECC counter - * - * @return - * - \ref NVML_SUCCESS if \a count has been populated - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device, \a errorType, \a counterType or \a locationType is - * invalid, or \a count is NULL - * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support ECC error reporting in the specified memory - * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - */ -nvmlReturn_t DECLDIR nvmlDeviceGetMemoryErrorCounter(nvmlDevice_t device, nvmlMemoryErrorType_t errorType, - nvmlEccCounterType_t counterType, - nvmlMemoryLocation_t locationType, unsigned long long *count); - -/** - * Retrieves the current utilization rates for the device's major subsystems. - * - * For Fermi &tm; or newer fully supported devices. - * - * See \ref nvmlUtilization_t for details on available utilization rates. - * - * \note During driver initialization when ECC is enabled one can see high GPU and Memory Utilization readings. - * This is caused by ECC Memory Scrubbing mechanism that is performed during driver initialization. 
- * - * @param device The identifier of the target device - * @param utilization Reference in which to return the utilization information - * - * @return - * - \ref NVML_SUCCESS if \a utilization has been populated - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a utilization is NULL - * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature - * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - */ -nvmlReturn_t DECLDIR nvmlDeviceGetUtilizationRates(nvmlDevice_t device, nvmlUtilization_t *utilization); - -/** - * Retrieves the current utilization and sampling size in microseconds for the Encoder - * - * For Kepler &tm; or newer fully supported devices. - * - * @param device The identifier of the target device - * @param utilization Reference to an unsigned int for encoder utilization info - * @param samplingPeriodUs Reference to an unsigned int for the sampling period in US - * - * @return - * - \ref NVML_SUCCESS if \a utilization has been populated - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid, \a utilization is NULL, or \a samplingPeriodUs is NULL - * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature - * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - */ -nvmlReturn_t DECLDIR nvmlDeviceGetEncoderUtilization(nvmlDevice_t device, unsigned int *utilization, unsigned int *samplingPeriodUs); - -/** - * Retrieves the current capacity of the device's encoder, as a percentage of maximum encoder capacity with valid values in the range 0-100. - * - * For Maxwell &tm; or newer fully supported devices. 
- * - * @param device The identifier of the target device - * @param encoderQueryType Type of encoder to query - * @param encoderCapacity Reference to an unsigned int for the encoder capacity - * - * @return - * - \ref NVML_SUCCESS if \a encoderCapacity is fetched - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a encoderCapacity is NULL, or \a device or \a encoderQueryType - * are invalid - * - \ref NVML_ERROR_NOT_SUPPORTED if device does not support the encoder specified in \a encoderQueryType - * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - */ -nvmlReturn_t DECLDIR nvmlDeviceGetEncoderCapacity (nvmlDevice_t device, nvmlEncoderType_t encoderQueryType, unsigned int *encoderCapacity); - -/** - * Retrieves the current encoder statistics for a given device. - * - * For Maxwell &tm; or newer fully supported devices. 
- * - * @param device The identifier of the target device - * @param sessionCount Reference to an unsigned int for count of active encoder sessions - * @param averageFps Reference to an unsigned int for trailing average FPS of all active sessions - * @param averageLatency Reference to an unsigned int for encode latency in microseconds - * - * @return - * - \ref NVML_SUCCESS if \a sessionCount, \a averageFps and \a averageLatency is fetched - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a sessionCount, or \a device or \a averageFps, - * or \a averageLatency is NULL - * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - */ -nvmlReturn_t DECLDIR nvmlDeviceGetEncoderStats (nvmlDevice_t device, unsigned int *sessionCount, - unsigned int *averageFps, unsigned int *averageLatency); - -/** - * Retrieves information about active encoder sessions on a target device. - * - * An array of active encoder sessions is returned in the caller-supplied buffer pointed at by \a sessionInfos. The - * array element count is passed in \a sessionCount, and \a sessionCount is used to return the number of sessions - * written to the buffer. - * - * If the supplied buffer is not large enough to accommodate the active session array, the function returns - * NVML_ERROR_INSUFFICIENT_SIZE, with the element count of nvmlEncoderSessionInfo_t array required in \a sessionCount. - * To query the number of active encoder sessions, call this function with *sessionCount = 0. The code will return - * NVML_SUCCESS with number of active encoder sessions updated in *sessionCount. - * - * For Maxwell &tm; or newer fully supported devices. - * - * @param device The identifier of the target device - * @param sessionCount Reference to caller supplied array size, and returns the number of sessions. 
- * @param sessionInfos Reference in which to return the session information - * - * @return - * - \ref NVML_SUCCESS if \a sessionInfos is fetched - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INSUFFICIENT_SIZE if \a sessionCount is too small, array element count is returned in \a sessionCount - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a sessionCount is NULL. - * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - */ -nvmlReturn_t DECLDIR nvmlDeviceGetEncoderSessions(nvmlDevice_t device, unsigned int *sessionCount, nvmlEncoderSessionInfo_t *sessionInfos); - -/** - * Retrieves the current utilization and sampling size in microseconds for the Decoder - * - * For Kepler &tm; or newer fully supported devices. - * - * @param device The identifier of the target device - * @param utilization Reference to an unsigned int for decoder utilization info - * @param samplingPeriodUs Reference to an unsigned int for the sampling period in US - * - * @return - * - \ref NVML_SUCCESS if \a utilization has been populated - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid, \a utilization is NULL, or \a samplingPeriodUs is NULL - * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature - * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - */ -nvmlReturn_t DECLDIR nvmlDeviceGetDecoderUtilization(nvmlDevice_t device, unsigned int *utilization, unsigned int *samplingPeriodUs); - -/** - * Retrieves the current and pending driver model for the device. - * - * For Fermi &tm; or newer fully supported devices. - * For windows only. 
- * - * On Windows platforms the device driver can run in either WDDM or WDM (TCC) mode. If a display is attached - * to the device it must run in WDDM mode. TCC mode is preferred if a display is not attached. - * - * See \ref nvmlDriverModel_t for details on available driver models. - * - * @param device The identifier of the target device - * @param current Reference in which to return the current driver model - * @param pending Reference in which to return the pending driver model - * - * @return - * - \ref NVML_SUCCESS if either \a current and/or \a pending have been set - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or both \a current and \a pending are NULL - * - \ref NVML_ERROR_NOT_SUPPORTED if the platform is not windows - * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - * - * @see nvmlDeviceSetDriverModel() - */ -nvmlReturn_t DECLDIR nvmlDeviceGetDriverModel(nvmlDevice_t device, nvmlDriverModel_t *current, nvmlDriverModel_t *pending); - -/** - * Get VBIOS version of the device. - * - * For all products. - * - * The VBIOS version may change from time to time. It will not exceed 32 characters in length - * (including the NULL terminator). See \ref nvmlConstants::NVML_DEVICE_VBIOS_VERSION_BUFFER_SIZE. 
- * - * @param device The identifier of the target device - * @param version Reference to which to return the VBIOS version - * @param length The maximum allowed length of the string returned in \a version - * - * @return - * - \ref NVML_SUCCESS if \a version has been set - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid, or \a version is NULL - * - \ref NVML_ERROR_INSUFFICIENT_SIZE if \a length is too small - * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - */ -nvmlReturn_t DECLDIR nvmlDeviceGetVbiosVersion(nvmlDevice_t device, char *version, unsigned int length); - -/** - * Get Bridge Chip Information for all the bridge chips on the board. - * - * For all fully supported products. - * Only applicable to multi-GPU products. - * - * @param device The identifier of the target device - * @param bridgeHierarchy Reference to the returned bridge chip Hierarchy - * - * @return - * - \ref NVML_SUCCESS if bridge chip exists - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid, or \a bridgeHierarchy is NULL - * - \ref NVML_ERROR_NOT_SUPPORTED if bridge chip not supported on the device - * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - * - */ -nvmlReturn_t DECLDIR nvmlDeviceGetBridgeChipInfo(nvmlDevice_t device, nvmlBridgeChipHierarchy_t *bridgeHierarchy); - -/** - * Get information about processes with a compute context on a device - * - * For Fermi &tm; or newer fully supported devices. - * - * This function returns information only about compute running processes (e.g. CUDA application which have - * active context). Any graphics applications (e.g. 
using OpenGL, DirectX) won't be listed by this function. - * - * To query the current number of running compute processes, call this function with *infoCount = 0. The - * return code will be NVML_ERROR_INSUFFICIENT_SIZE, or NVML_SUCCESS if none are running. For this call - * \a infos is allowed to be NULL. - * - * The usedGpuMemory field returned is all of the memory used by the application. - * - * Keep in mind that information returned by this call is dynamic and the number of elements might change in - * time. Allocate more space for \a infos table in case new compute processes are spawned. - * - * @param device The identifier of the target device - * @param infoCount Reference in which to provide the \a infos array size, and - * to return the number of returned elements - * @param infos Reference in which to return the process information - * - * @return - * - \ref NVML_SUCCESS if \a infoCount and \a infos have been populated - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INSUFFICIENT_SIZE if \a infoCount indicates that the \a infos array is too small - * \a infoCount will contain minimal amount of space necessary for - * the call to complete - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid, either of \a infoCount or \a infos is NULL - * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - * - * @see \ref nvmlSystemGetProcessName - */ -nvmlReturn_t DECLDIR nvmlDeviceGetComputeRunningProcesses(nvmlDevice_t device, unsigned int *infoCount, nvmlProcessInfo_t *infos); - -/** - * Get information about processes with a graphics context on a device - * - * For Kepler &tm; or newer fully supported devices. - * - * This function returns information only about graphics based processes - * (eg. 
applications using OpenGL, DirectX) - * - * To query the current number of running graphics processes, call this function with *infoCount = 0. The - * return code will be NVML_ERROR_INSUFFICIENT_SIZE, or NVML_SUCCESS if none are running. For this call - * \a infos is allowed to be NULL. - * - * The usedGpuMemory field returned is all of the memory used by the application. - * - * Keep in mind that information returned by this call is dynamic and the number of elements might change in - * time. Allocate more space for \a infos table in case new graphics processes are spawned. - * - * @param device The identifier of the target device - * @param infoCount Reference in which to provide the \a infos array size, and - * to return the number of returned elements - * @param infos Reference in which to return the process information - * - * @return - * - \ref NVML_SUCCESS if \a infoCount and \a infos have been populated - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INSUFFICIENT_SIZE if \a infoCount indicates that the \a infos array is too small - * \a infoCount will contain minimal amount of space necessary for - * the call to complete - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid, either of \a infoCount or \a infos is NULL - * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - * - * @see \ref nvmlSystemGetProcessName - */ -nvmlReturn_t DECLDIR nvmlDeviceGetGraphicsRunningProcesses(nvmlDevice_t device, unsigned int *infoCount, nvmlProcessInfo_t *infos); - -/** - * Check if the GPU devices are on the same physical board. - * - * For all fully supported products. - * - * @param device1 The first GPU device - * @param device2 The second GPU device - * @param onSameBoard Reference in which to return the status. - * Non-zero indicates that the GPUs are on the same board. 
- * - * @return - * - \ref NVML_SUCCESS if \a onSameBoard has been set - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device1 or \a device2 are invalid or \a onSameBoard is NULL - * - \ref NVML_ERROR_NOT_SUPPORTED if this check is not supported by the device - * - \ref NVML_ERROR_GPU_IS_LOST if either GPU has fallen off the bus or is otherwise inaccessible - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - */ -nvmlReturn_t DECLDIR nvmlDeviceOnSameBoard(nvmlDevice_t device1, nvmlDevice_t device2, int *onSameBoard); - -/** - * Retrieves the root/admin permissions on the target API. See \a nvmlRestrictedAPI_t for the list of supported APIs. - * If an API is restricted only root users can call that API. See \a nvmlDeviceSetAPIRestriction to change current permissions. - * - * For all fully supported products. - * - * @param device The identifier of the target device - * @param apiType Target API type for this operation - * @param isRestricted Reference in which to return the current restriction - * NVML_FEATURE_ENABLED indicates that the API is root-only - * NVML_FEATURE_DISABLED indicates that the API is accessible to all users - * - * @return - * - \ref NVML_SUCCESS if \a isRestricted has been set - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid, \a apiType incorrect or \a isRestricted is NULL - * - \ref NVML_ERROR_NOT_SUPPORTED if this query is not supported by the device or the device does not support - * the feature that is being queried (E.G. 
Enabling/disabling Auto Boosted clocks is - * not supported by the device) - * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - * - * @see nvmlRestrictedAPI_t - */ -nvmlReturn_t DECLDIR nvmlDeviceGetAPIRestriction(nvmlDevice_t device, nvmlRestrictedAPI_t apiType, nvmlEnableState_t *isRestricted); - -/** - * Gets recent samples for the GPU. - * - * For Kepler &tm; or newer fully supported devices. - * - * Based on type, this method can be used to fetch the power, utilization or clock samples maintained in the buffer by - * the driver. - * - * Power, Utilization and Clock samples are returned as type "unsigned int" for the union nvmlValue_t. - * - * To get the size of samples that user needs to allocate, the method is invoked with samples set to NULL. - * The returned sampleCount will provide the number of samples that can be queried. The user needs to - * allocate the buffer with size as sampleCount * sizeof(nvmlSample_t). - * - * lastSeenTimeStamp represents CPU timestamp in microseconds. Set it to 0 to fetch all the samples maintained by the - * underlying buffer. Set lastSeenTimeStamp to one of the timeStamps retrieved from the data of the previous query - * to get more recent samples. - * - * This method fetches the number of entries which can be accommodated in the provided samples array, and the - * reference sampleCount is updated to indicate how many samples were actually retrieved. The advantage of using this - * method for samples in contrast to polling via existing methods is to get higher frequency data at lower polling cost. - * - * @param device The identifier for the target device - * @param type Type of sampling event - * @param lastSeenTimeStamp Return only samples with timestamp greater than lastSeenTimeStamp. 
- * @param sampleValType Output parameter to represent the type of sample value as described in nvmlValueType_t - * @param sampleCount Reference to provide the number of elements which can be queried in samples array - * @param samples Reference in which samples are returned - - * @return - * - \ref NVML_SUCCESS if samples are successfully retrieved - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid, \a sampleCount is NULL or - * reference to \a sampleCount is 0 for non null \a samples - * - \ref NVML_ERROR_NOT_SUPPORTED if this query is not supported by the device - * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible - * - \ref NVML_ERROR_NOT_FOUND if sample entries are not found - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - */ -nvmlReturn_t DECLDIR nvmlDeviceGetSamples(nvmlDevice_t device, nvmlSamplingType_t type, unsigned long long lastSeenTimeStamp, - nvmlValueType_t *sampleValType, unsigned int *sampleCount, nvmlSample_t *samples); - -/** - * Gets Total, Available and Used size of BAR1 memory. - * - * BAR1 is used to map the FB (device memory) so that it can be directly accessed by the CPU or by 3rd party - * devices (peer-to-peer on the PCIE bus). - * - * For Kepler &tm; or newer fully supported devices. - * - * @param device The identifier of the target device - * @param bar1Memory Reference in which BAR1 memory - * information is returned. 
- * - * @return - * - \ref NVML_SUCCESS if BAR1 memory is successfully retrieved - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid, \a bar1Memory is NULL - * - \ref NVML_ERROR_NOT_SUPPORTED if this query is not supported by the device - * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - * - */ -nvmlReturn_t DECLDIR nvmlDeviceGetBAR1MemoryInfo(nvmlDevice_t device, nvmlBAR1Memory_t *bar1Memory); - - -/** - * Gets the duration of time during which the device was throttled (lower than requested clocks) due to power - * or thermal constraints. - * - * The method is important to users who are trying to understand if their GPUs throttle at any point during their applications. The - * difference in violation times at two different reference times gives an indication of a GPU throttling event. - * - * Violation for thermal capping is not supported at this time. - * - * For Kepler &tm; or newer fully supported devices. 
- * - * @param device The identifier of the target device - * @param perfPolicyType Represents Performance policy which can trigger GPU throttling - * @param violTime Reference to which violation time related information is returned - * - * - * @return - * - \ref NVML_SUCCESS if violation time is successfully retrieved - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid, \a perfPolicyType is invalid, or \a violTime is NULL - * - \ref NVML_ERROR_NOT_SUPPORTED if this query is not supported by the device - * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible - * - */ -nvmlReturn_t DECLDIR nvmlDeviceGetViolationStatus(nvmlDevice_t device, nvmlPerfPolicyType_t perfPolicyType, nvmlViolationTime_t *violTime); - -/** - * @} - */ - -/** @addtogroup nvmlAccountingStats - * @{ - */ - -/** - * Queries the state of per process accounting mode. - * - * For Kepler &tm; or newer fully supported devices. - * - * See \ref nvmlDeviceGetAccountingStats for more details. - * See \ref nvmlDeviceSetAccountingMode - * - * @param device The identifier of the target device - * @param mode Reference in which to return the current accounting mode - * - * @return - * - \ref NVML_SUCCESS if the mode has been successfully retrieved - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a mode are NULL - * - \ref NVML_ERROR_NOT_SUPPORTED if the device doesn't support this feature - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - */ -nvmlReturn_t DECLDIR nvmlDeviceGetAccountingMode(nvmlDevice_t device, nvmlEnableState_t *mode); - -/** - * Queries process's accounting stats. - * - * For Kepler &tm; or newer fully supported devices. - * - * Accounting stats capture GPU utilization and other statistics across the lifetime of a process. 
- * Accounting stats can be queried during life time of the process and after its termination. - * The time field in \ref nvmlAccountingStats_t is reported as 0 during the lifetime of the process and - * updated to actual running time after its termination. - * Accounting stats are kept in a circular buffer, newly created processes overwrite information about old - * processes. - * - * See \ref nvmlAccountingStats_t for description of each returned metric. - * List of processes that can be queried can be retrieved from \ref nvmlDeviceGetAccountingPids. - * - * @note Accounting Mode needs to be on. See \ref nvmlDeviceGetAccountingMode. - * @note Only compute and graphics applications stats can be queried. Monitoring applications stats can't be - * queried since they don't contribute to GPU utilization. - * @note In case of pid collision stats of only the latest process (that terminated last) will be reported - * - * @warning On Kepler devices per process statistics are accurate only if there's one process running on a GPU. - * - * @param device The identifier of the target device - * @param pid Process Id of the target process to query stats for - * @param stats Reference in which to return the process's accounting stats - * - * @return - * - \ref NVML_SUCCESS if stats have been successfully retrieved - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a stats are NULL - * - \ref NVML_ERROR_NOT_FOUND if process stats were not found - * - \ref NVML_ERROR_NOT_SUPPORTED if the device doesn't support this feature or accounting mode is disabled - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - * - * @see nvmlDeviceGetAccountingBufferSize - */ -nvmlReturn_t DECLDIR nvmlDeviceGetAccountingStats(nvmlDevice_t device, unsigned int pid, nvmlAccountingStats_t *stats); - -/** - * Queries list of processes that can be queried for accounting stats. 
The list of processes returned - * can be in running or terminated state. - * - * For Kepler &tm; or newer fully supported devices. - * - * To just query the number of processes ready to be queried, call this function with *count = 0 and - * pids=NULL. The return code will be NVML_ERROR_INSUFFICIENT_SIZE, or NVML_SUCCESS if list is empty. - * - * For more details see \ref nvmlDeviceGetAccountingStats. - * - * @note In case of PID collision some processes might not be accessible before the circular buffer is full. - * - * @param device The identifier of the target device - * @param count Reference in which to provide the \a pids array size, and - * to return the number of elements ready to be queried - * @param pids Reference in which to return list of process ids - * - * @return - * - \ref NVML_SUCCESS if pids were successfully retrieved - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a count is NULL - * - \ref NVML_ERROR_NOT_SUPPORTED if the device doesn't support this feature or accounting mode is disabled - * - \ref NVML_ERROR_INSUFFICIENT_SIZE if \a count is too small (\a count is set to - * expected value) - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - * - * @see nvmlDeviceGetAccountingBufferSize - */ -nvmlReturn_t DECLDIR nvmlDeviceGetAccountingPids(nvmlDevice_t device, unsigned int *count, unsigned int *pids); - -/** - * Returns the number of processes that the circular buffer with accounting pids can hold. - * - * For Kepler &tm; or newer fully supported devices. - * - * This is the maximum number of processes that accounting information will be stored for before information - * about oldest processes will get overwritten by information about new processes. 
- * - * @param device The identifier of the target device - * @param bufferSize Reference in which to provide the size (in number of elements) - * of the circular buffer for accounting stats. - * - * @return - * - \ref NVML_SUCCESS if buffer size was successfully retrieved - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a bufferSize is NULL - * - \ref NVML_ERROR_NOT_SUPPORTED if the device doesn't support this feature or accounting mode is disabled - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - * - * @see nvmlDeviceGetAccountingStats - * @see nvmlDeviceGetAccountingPids - */ -nvmlReturn_t DECLDIR nvmlDeviceGetAccountingBufferSize(nvmlDevice_t device, unsigned int *bufferSize); - -/** @} */ - -/** @addtogroup nvmlDeviceQueries - * @{ - */ - -/** - * Returns the list of retired pages by source, including pages that are pending retirement - * The address information provided from this API is the hardware address of the page that was retired. Note - * that this does not match the virtual address used in CUDA, but will match the address information in XID 63 - * - * For Kepler &tm; or newer fully supported devices. - * - * @param device The identifier of the target device - * @param cause Filter page addresses by cause of retirement - * @param pageCount Reference in which to provide the \a addresses buffer size, and - * to return the number of retired pages that match \a cause - * Set to 0 to query the size without allocating an \a addresses buffer - * @param addresses Buffer to write the page addresses into - * - * @return - * - \ref NVML_SUCCESS if \a pageCount was populated and \a addresses was filled - * - \ref NVML_ERROR_INSUFFICIENT_SIZE if \a pageCount indicates the buffer is not large enough to store all the - * matching page addresses. \a pageCount is set to the needed size. 
- * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid, \a pageCount is NULL, \a cause is invalid, or - * \a addresses is NULL - * - \ref NVML_ERROR_NOT_SUPPORTED if the device doesn't support this feature - * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - */ -nvmlReturn_t DECLDIR nvmlDeviceGetRetiredPages(nvmlDevice_t device, nvmlPageRetirementCause_t cause, - unsigned int *pageCount, unsigned long long *addresses); - -/** - * Check if any pages are pending retirement and need a reboot to fully retire. - * - * For Kepler &tm; or newer fully supported devices. - * - * @param device The identifier of the target device - * @param isPending Reference in which to return the pending status - * - * @return - * - \ref NVML_SUCCESS if \a isPending was populated - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a isPending is NULL - * - \ref NVML_ERROR_NOT_SUPPORTED if the device doesn't support this feature - * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - */ -nvmlReturn_t DECLDIR nvmlDeviceGetRetiredPagesPendingStatus(nvmlDevice_t device, nvmlEnableState_t *isPending); - -/** @} */ - -/***************************************************************************************************/ -/** @defgroup nvmlUnitCommands Unit Commands - * This chapter describes NVML operations that change the state of the unit. For S-class products. - * Each of these requires root/admin access. Non-admin users will see an NVML_ERROR_NO_PERMISSION - * error code when invoking any of these methods. 
- * @{ - */ -/***************************************************************************************************/ - -/** - * Set the LED state for the unit. The LED can be either green (0) or amber (1). - * - * For S-class products. - * Requires root/admin permissions. - * - * This operation takes effect immediately. - * - * - * Current S-Class products don't provide unique LEDs for each unit. As such, both front - * and back LEDs will be toggled in unison regardless of which unit is specified with this command. - * - * See \ref nvmlLedColor_t for available colors. - * - * @param unit The identifier of the target unit - * @param color The target LED color - * - * @return - * - \ref NVML_SUCCESS if the LED color has been set - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a unit or \a color is invalid - * - \ref NVML_ERROR_NOT_SUPPORTED if this is not an S-class product - * - \ref NVML_ERROR_NO_PERMISSION if the user doesn't have permission to perform this operation - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - * - * @see nvmlUnitGetLedState() - */ -nvmlReturn_t DECLDIR nvmlUnitSetLedState(nvmlUnit_t unit, nvmlLedColor_t color); - -/** @} */ - -/***************************************************************************************************/ -/** @defgroup nvmlDeviceCommands Device Commands - * This chapter describes NVML operations that change the state of the device. - * Each of these requires root/admin access. Non-admin users will see an NVML_ERROR_NO_PERMISSION - * error code when invoking any of these methods. - * @{ - */ -/***************************************************************************************************/ - -/** - * Set the persistence mode for the device. - * - * For all products. - * For Linux only. - * Requires root/admin permissions. 
- * - * The persistence mode determines whether the GPU driver software is torn down after the last client - * exits. - * - * This operation takes effect immediately. It is not persistent across reboots. After each reboot the - * persistence mode is reset to "Disabled". - * - * See \ref nvmlEnableState_t for available modes. - * - * @param device The identifier of the target device - * @param mode The target persistence mode - * - * @return - * - \ref NVML_SUCCESS if the persistence mode was set - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a mode is invalid - * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature - * - \ref NVML_ERROR_NO_PERMISSION if the user doesn't have permission to perform this operation - * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - * - * @see nvmlDeviceGetPersistenceMode() - */ -nvmlReturn_t DECLDIR nvmlDeviceSetPersistenceMode(nvmlDevice_t device, nvmlEnableState_t mode); - -/** - * Set the compute mode for the device. - * - * For all products. - * Requires root/admin permissions. - * - * The compute mode determines whether a GPU can be used for compute operations and whether it can - * be shared across contexts. - * - * This operation takes effect immediately. Under Linux it is not persistent across reboots and - * always resets to "Default". Under windows it is persistent. - * - * Under windows compute mode may only be set to DEFAULT when running in WDDM - * - * See \ref nvmlComputeMode_t for details on available compute modes. 
- * - * @param device The identifier of the target device - * @param mode The target compute mode - * - * @return - * - \ref NVML_SUCCESS if the compute mode was set - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a mode is invalid - * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature - * - \ref NVML_ERROR_NO_PERMISSION if the user doesn't have permission to perform this operation - * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - * - * @see nvmlDeviceGetComputeMode() - */ -nvmlReturn_t DECLDIR nvmlDeviceSetComputeMode(nvmlDevice_t device, nvmlComputeMode_t mode); - -/** - * Set the ECC mode for the device. - * - * For Kepler &tm; or newer fully supported devices. - * Only applicable to devices with ECC. - * Requires \a NVML_INFOROM_ECC version 1.0 or higher. - * Requires root/admin permissions. - * - * The ECC mode determines whether the GPU enables its ECC support. - * - * This operation takes effect after the next reboot. - * - * See \ref nvmlEnableState_t for details on available modes. 
- * - * @param device The identifier of the target device - * @param ecc The target ECC mode - * - * @return - * - \ref NVML_SUCCESS if the ECC mode was set - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a ecc is invalid - * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature - * - \ref NVML_ERROR_NO_PERMISSION if the user doesn't have permission to perform this operation - * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - * - * @see nvmlDeviceGetEccMode() - */ -nvmlReturn_t DECLDIR nvmlDeviceSetEccMode(nvmlDevice_t device, nvmlEnableState_t ecc); - -/** - * Clear the ECC error and other memory error counts for the device. - * - * For Kepler &tm; or newer fully supported devices. - * Only applicable to devices with ECC. - * Requires \a NVML_INFOROM_ECC version 2.0 or higher to clear aggregate location-based ECC counts. - * Requires \a NVML_INFOROM_ECC version 1.0 or higher to clear all other ECC counts. - * Requires root/admin permissions. - * Requires ECC Mode to be enabled. - * - * Sets all of the specified ECC counters to 0, including both detailed and total counts. - * - * This operation takes effect immediately. - * - * See \ref nvmlMemoryErrorType_t for details on available counter types. - * - * @param device The identifier of the target device - * @param counterType Flag that indicates which type of errors should be cleared. 
- * - * @return - * - \ref NVML_SUCCESS if the error counts were cleared - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a counterType is invalid - * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature - * - \ref NVML_ERROR_NO_PERMISSION if the user doesn't have permission to perform this operation - * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - * - * @see - * - nvmlDeviceGetDetailedEccErrors() - * - nvmlDeviceGetTotalEccErrors() - */ -nvmlReturn_t DECLDIR nvmlDeviceClearEccErrorCounts(nvmlDevice_t device, nvmlEccCounterType_t counterType); - -/** - * Set the driver model for the device. - * - * For Fermi &tm; or newer fully supported devices. - * For windows only. - * Requires root/admin permissions. - * - * On Windows platforms the device driver can run in either WDDM or WDM (TCC) mode. If a display is attached - * to the device it must run in WDDM mode. - * - * It is possible to force the change to WDM (TCC) while the display is still attached with a force flag (nvmlFlagForce). - * This should only be done if the host is subsequently powered down and the display is detached from the device - * before the next reboot. - * - * This operation takes effect after the next reboot. - * - * Windows driver model may only be set to WDDM when running in DEFAULT compute mode. - * - * Change driver model to WDDM is not supported when GPU doesn't support graphics acceleration or - * will not support it after reboot. See \ref nvmlDeviceSetGpuOperationMode. - * - * See \ref nvmlDriverModel_t for details on available driver models. 
- * See \ref nvmlFlagDefault and \ref nvmlFlagForce - * - * @param device The identifier of the target device - * @param driverModel The target driver model - * @param flags Flags that change the default behavior - * - * @return - * - \ref NVML_SUCCESS if the driver model has been set - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a driverModel is invalid - * - \ref NVML_ERROR_NOT_SUPPORTED if the platform is not windows or the device does not support this feature - * - \ref NVML_ERROR_NO_PERMISSION if the user doesn't have permission to perform this operation - * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - * - * @see nvmlDeviceGetDriverModel() - */ -nvmlReturn_t DECLDIR nvmlDeviceSetDriverModel(nvmlDevice_t device, nvmlDriverModel_t driverModel, unsigned int flags); - -/** - * Set clocks that applications will lock to. - * - * Sets the clocks that compute and graphics applications will be running at. - * e.g. CUDA driver requests these clocks during context creation which means this property - * defines clocks at which CUDA applications will be running unless some overspec event - * occurs (e.g. over power, over thermal or external HW brake). - * - * Can be used as a setting to request constant performance. - * - * On Pascal and newer hardware, this will automatically disable automatic boosting of clocks. - * - * On K80 and newer Kepler and Maxwell GPUs, users desiring fixed performance should also call - * \ref nvmlDeviceSetAutoBoostedClocksEnabled to prevent clocks from automatically boosting - * above the clock value being set. - * - * For Kepler &tm; or newer non-GeForce fully supported devices and Maxwell or newer GeForce devices. - * Requires root/admin permissions. 
- * - * See \ref nvmlDeviceGetSupportedMemoryClocks and \ref nvmlDeviceGetSupportedGraphicsClocks - * for details on how to list available clocks combinations. - * - * After system reboot or driver reload applications clocks go back to their default value. - * See \ref nvmlDeviceResetApplicationsClocks. - * - * @param device The identifier of the target device - * @param memClockMHz Requested memory clock in MHz - * @param graphicsClockMHz Requested graphics clock in MHz - * - * @return - * - \ref NVML_SUCCESS if new settings were successfully set - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a memClockMHz and \a graphicsClockMHz - * is not a valid clock combination - * - \ref NVML_ERROR_NO_PERMISSION if the user doesn't have permission to perform this operation - * - \ref NVML_ERROR_NOT_SUPPORTED if the device doesn't support this feature - * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - */ -nvmlReturn_t DECLDIR nvmlDeviceSetApplicationsClocks(nvmlDevice_t device, unsigned int memClockMHz, unsigned int graphicsClockMHz); - -/** - * Set new power limit of this device. - * - * For Kepler &tm; or newer fully supported devices. - * Requires root/admin permissions. - * - * See \ref nvmlDeviceGetPowerManagementLimitConstraints to check the allowed ranges of values. - * - * \note Limit is not persistent across reboots or driver unloads. - * Enable persistent mode to prevent driver from unloading when no application is using the device. 
- * - * @param device The identifier of the target device - * @param limit Power management limit in milliwatts to set - * - * @return - * - \ref NVML_SUCCESS if \a limit has been set - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a limit is out of range - * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature - * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - * - * @see nvmlDeviceGetPowerManagementLimitConstraints - * @see nvmlDeviceGetPowerManagementDefaultLimit - */ -nvmlReturn_t DECLDIR nvmlDeviceSetPowerManagementLimit(nvmlDevice_t device, unsigned int limit); - -/** - * Sets new GOM. See \a nvmlGpuOperationMode_t for details. - * - * For GK110 M-class and X-class Tesla &tm; products from the Kepler family. - * Modes \ref NVML_GOM_LOW_DP and \ref NVML_GOM_ALL_ON are supported on fully supported GeForce products. - * Not supported on Quadro ® and Tesla &tm; C-class products. - * Requires root/admin permissions. - * - * Changing GOMs requires a reboot. - * The reboot requirement might be removed in the future. - * - * Compute only GOMs don't support graphics acceleration. Under windows switching to these GOMs when - * pending driver model is WDDM is not supported. See \ref nvmlDeviceSetDriverModel. 
- * - * @param device The identifier of the target device - * @param mode Target GOM - * - * @return - * - \ref NVML_SUCCESS if \a mode has been set - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a mode is incorrect - * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support GOM or specific mode - * - \ref NVML_ERROR_NO_PERMISSION if the user doesn't have permission to perform this operation - * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - * - * @see nvmlGpuOperationMode_t - * @see nvmlDeviceGetGpuOperationMode - */ -nvmlReturn_t DECLDIR nvmlDeviceSetGpuOperationMode(nvmlDevice_t device, nvmlGpuOperationMode_t mode); - -/** - * Changes the root/admin restrictions on certain APIs. See \a nvmlRestrictedAPI_t for the list of supported APIs. - * This method can be used by a root/admin user to give non-root/admin access to certain otherwise-restricted APIs. - * The new setting lasts for the lifetime of the NVIDIA driver; it is not persistent. See \a nvmlDeviceGetAPIRestriction - * to query the current restriction settings. - * - * For Kepler &tm; or newer fully supported devices. - * Requires root/admin permissions. - * - * @param device The identifier of the target device - * @param apiType Target API type for this operation - * @param isRestricted The target restriction - * - * @return - * - \ref NVML_SUCCESS if \a isRestricted has been set - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a apiType is incorrect - * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support changing API restrictions or the device does not support - * the feature that api restrictions are being set for (E.G.
Enabling/disabling auto - * boosted clocks is not supported by the device) - * - \ref NVML_ERROR_NO_PERMISSION if the user doesn't have permission to perform this operation - * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - * - * @see nvmlRestrictedAPI_t - */ -nvmlReturn_t DECLDIR nvmlDeviceSetAPIRestriction(nvmlDevice_t device, nvmlRestrictedAPI_t apiType, nvmlEnableState_t isRestricted); - -/** - * @} - */ - -/** @addtogroup nvmlAccountingStats - * @{ - */ - -/** - * Enables or disables per process accounting. - * - * For Kepler &tm; or newer fully supported devices. - * Requires root/admin permissions. - * - * @note This setting is not persistent and will default to disabled after driver unloads. - * Enable persistence mode to be sure the setting doesn't switch off to disabled. - * - * @note Enabling accounting mode has no negative impact on the GPU performance. - * - * @note Disabling accounting clears all accounting pids information. - * - * See \ref nvmlDeviceGetAccountingMode - * See \ref nvmlDeviceGetAccountingStats - * See \ref nvmlDeviceClearAccountingPids - * - * @param device The identifier of the target device - * @param mode The target accounting mode - * - * @return - * - \ref NVML_SUCCESS if the new mode has been set - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device or \a mode are invalid - * - \ref NVML_ERROR_NOT_SUPPORTED if the device doesn't support this feature - * - \ref NVML_ERROR_NO_PERMISSION if the user doesn't have permission to perform this operation - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - */ -nvmlReturn_t DECLDIR nvmlDeviceSetAccountingMode(nvmlDevice_t device, nvmlEnableState_t mode); - -/** - * Clears accounting information about all processes that have already terminated. 
- * - * For Kepler &tm; or newer fully supported devices. - * Requires root/admin permissions. - * - * See \ref nvmlDeviceGetAccountingMode - * See \ref nvmlDeviceGetAccountingStats - * See \ref nvmlDeviceSetAccountingMode - * - * @param device The identifier of the target device - * - * @return - * - \ref NVML_SUCCESS if accounting information has been cleared - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid - * - \ref NVML_ERROR_NOT_SUPPORTED if the device doesn't support this feature - * - \ref NVML_ERROR_NO_PERMISSION if the user doesn't have permission to perform this operation - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - */ -nvmlReturn_t DECLDIR nvmlDeviceClearAccountingPids(nvmlDevice_t device); - -/** @} */ - -/***************************************************************************************************/ -/** @defgroup NvLink NvLink Methods - * This chapter describes methods that NVML can perform on NVLINK enabled devices. - * @{ - */ -/***************************************************************************************************/ - -/** - * Retrieves the state of the device's NvLink for the link specified - * - * For Pascal &tm; or newer fully supported devices.
- * - * @param device The identifier of the target device - * @param link Specifies the NvLink link to be queried - * @param isActive \a nvmlEnableState_t where NVML_FEATURE_ENABLED indicates that - * the link is active and NVML_FEATURE_DISABLED indicates it - * is inactive - * - * @return - * - \ref NVML_SUCCESS if \a isActive has been set - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device or \a link is invalid or \a isActive is NULL - * - \ref NVML_ERROR_NOT_SUPPORTED if the device doesn't support this feature - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - */ -nvmlReturn_t DECLDIR nvmlDeviceGetNvLinkState(nvmlDevice_t device, unsigned int link, nvmlEnableState_t *isActive); - -/** - * Retrieves the version of the device's NvLink for the link specified - * - * For Pascal &tm; or newer fully supported devices. - * - * @param device The identifier of the target device - * @param link Specifies the NvLink link to be queried - * @param version Requested NvLink version - * - * @return - * - \ref NVML_SUCCESS if \a version has been set - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device or \a link is invalid or \a version is NULL - * - \ref NVML_ERROR_NOT_SUPPORTED if the device doesn't support this feature - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - */ -nvmlReturn_t DECLDIR nvmlDeviceGetNvLinkVersion(nvmlDevice_t device, unsigned int link, unsigned int *version); - -/** - * Retrieves the requested capability from the device's NvLink for the link specified - * Please refer to the \a nvmlNvLinkCapability_t structure for the specific caps that can be queried - * The return value should be treated as a boolean. - * - * For Pascal &tm; or newer fully supported devices. 
- * - * @param device The identifier of the target device - * @param link Specifies the NvLink link to be queried - * @param capability Specifies the \a nvmlNvLinkCapability_t to be queried - * @param capResult A boolean for the queried capability indicating that feature is available - * - * @return - * - \ref NVML_SUCCESS if \a capResult has been set - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device, \a link, or \a capability is invalid or \a capResult is NULL - * - \ref NVML_ERROR_NOT_SUPPORTED if the device doesn't support this feature - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - */ -nvmlReturn_t DECLDIR nvmlDeviceGetNvLinkCapability(nvmlDevice_t device, unsigned int link, - nvmlNvLinkCapability_t capability, unsigned int *capResult); - -/** - * Retrieves the PCI information for the remote node on a NvLink link - * Note: pciSubSystemId is not filled in this function and is indeterminate - * - * For Pascal &tm; or newer fully supported devices. - * - * @param device The identifier of the target device - * @param link Specifies the NvLink link to be queried - * @param pci \a nvmlPciInfo_t of the remote node for the specified link - * - * @return - * - \ref NVML_SUCCESS if \a pci has been set - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device or \a link is invalid or \a pci is NULL - * - \ref NVML_ERROR_NOT_SUPPORTED if the device doesn't support this feature - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - */ -nvmlReturn_t DECLDIR nvmlDeviceGetNvLinkRemotePciInfo(nvmlDevice_t device, unsigned int link, nvmlPciInfo_t *pci); - -/** - * Retrieves the specified error counter value - * Please refer to \a nvmlNvLinkErrorCounter_t for error counters that are available - * - * For Pascal &tm; or newer fully supported devices. 
- * - * @param device The identifier of the target device - * @param link Specifies the NvLink link to be queried - * @param counter Specifies the NvLink counter to be queried - * @param counterValue Returned counter value - * - * @return - * - \ref NVML_SUCCESS if \a counterValue has been set - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device, \a link, or \a counter is invalid or \a counterValue is NULL - * - \ref NVML_ERROR_NOT_SUPPORTED if the device doesn't support this feature - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - */ -nvmlReturn_t DECLDIR nvmlDeviceGetNvLinkErrorCounter(nvmlDevice_t device, unsigned int link, - nvmlNvLinkErrorCounter_t counter, unsigned long long *counterValue); - -/** - * Resets all error counters to zero - * Please refer to \a nvmlNvLinkErrorCounter_t for the list of error counters that are reset - * - * For Pascal &tm; or newer fully supported devices. - * - * @param device The identifier of the target device - * @param link Specifies the NvLink link whose error counters are to be reset - * - * @return - * - \ref NVML_SUCCESS if the reset is successful - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device or \a link is invalid - * - \ref NVML_ERROR_NOT_SUPPORTED if the device doesn't support this feature - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - */ -nvmlReturn_t DECLDIR nvmlDeviceResetNvLinkErrorCounters(nvmlDevice_t device, unsigned int link); - -/** - * Set the NVLINK utilization counter control information for the specified counter, 0 or 1. - * Please refer to \a nvmlNvLinkUtilizationControl_t for the structure definition. Performs a reset - * of the counters if the reset parameter is non-zero. - * - * For Pascal &tm; or newer fully supported devices.
- * - * @param device The identifier of the target device - * @param counter Specifies the counter that should be set (0 or 1). - * @param link Specifies the NvLink link to be queried - * @param control A reference to the \a nvmlNvLinkUtilizationControl_t to set - * @param reset Resets the counters on set if non-zero - * - * @return - * - \ref NVML_SUCCESS if the control has been set successfully - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device, \a counter, \a link, or \a control is invalid - * - \ref NVML_ERROR_NOT_SUPPORTED if the device doesn't support this feature - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - */ -nvmlReturn_t DECLDIR nvmlDeviceSetNvLinkUtilizationControl(nvmlDevice_t device, unsigned int link, unsigned int counter, - nvmlNvLinkUtilizationControl_t *control, unsigned int reset); - -/** - * Get the NVLINK utilization counter control information for the specified counter, 0 or 1. - * Please refer to \a nvmlNvLinkUtilizationControl_t for the structure definition - * - * For Pascal &tm; or newer fully supported devices. - * - * @param device The identifier of the target device - * @param counter Specifies the counter that should be set (0 or 1). 
- * @param link Specifies the NvLink link to be queried - * @param control A reference to the \a nvmlNvLinkUtilizationControl_t to place information - * - * @return - * - \ref NVML_SUCCESS if the control has been set successfully - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device, \a counter, \a link, or \a control is invalid - * - \ref NVML_ERROR_NOT_SUPPORTED if the device doesn't support this feature - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - */ -nvmlReturn_t DECLDIR nvmlDeviceGetNvLinkUtilizationControl(nvmlDevice_t device, unsigned int link, unsigned int counter, - nvmlNvLinkUtilizationControl_t *control); - - -/** - * Retrieve the NVLINK utilization counter based on the current control for a specified counter. - * In general it is good practice to use \a nvmlDeviceSetNvLinkUtilizationControl - * before reading the utilization counters as they have no default state - * - * For Pascal &tm; or newer fully supported devices. - * - * @param device The identifier of the target device - * @param link Specifies the NvLink link to be queried - * @param counter Specifies the counter that should be read (0 or 1). 
- * @param rxcounter Receive counter return value - * @param txcounter Transmit counter return value - * - * @return - * - \ref NVML_SUCCESS if \a rxcounter and \a txcounter have been successfully set - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device, \a counter, or \a link is invalid or \a rxcounter or \a txcounter are NULL - * - \ref NVML_ERROR_NOT_SUPPORTED if the device doesn't support this feature - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - */ -nvmlReturn_t DECLDIR nvmlDeviceGetNvLinkUtilizationCounter(nvmlDevice_t device, unsigned int link, unsigned int counter, - unsigned long long *rxcounter, unsigned long long *txcounter); - -/** - * Freeze the NVLINK utilization counters - * Both the receive and transmit counters are operated on by this function - * - * For Pascal &tm; or newer fully supported devices. - * - * @param device The identifier of the target device - * @param link Specifies the NvLink link to be queried - * @param counter Specifies the counter that should be frozen (0 or 1). 
- * @param freeze NVML_FEATURE_ENABLED = freeze the receive and transmit counters - * NVML_FEATURE_DISABLED = unfreeze the receive and transmit counters - * - * @return - * - \ref NVML_SUCCESS if counters were successfully frozen or unfrozen - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device, \a link, \a counter, or \a freeze is invalid - * - \ref NVML_ERROR_NOT_SUPPORTED if the device doesn't support this feature - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - */ -nvmlReturn_t DECLDIR nvmlDeviceFreezeNvLinkUtilizationCounter (nvmlDevice_t device, unsigned int link, - unsigned int counter, nvmlEnableState_t freeze); - -/** - * Reset the NVLINK utilization counters - * Both the receive and transmit counters are operated on by this function - * - * For Pascal &tm; or newer fully supported devices. - * - * @param device The identifier of the target device - * @param link Specifies the NvLink link to be reset - * @param counter Specifies the counter that should be reset (0 or 1) - * - * @return - * - \ref NVML_SUCCESS if counters were successfully reset - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device, \a link, or \a counter is invalid - * - \ref NVML_ERROR_NOT_SUPPORTED if the device doesn't support this feature - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - */ -nvmlReturn_t DECLDIR nvmlDeviceResetNvLinkUtilizationCounter (nvmlDevice_t device, unsigned int link, unsigned int counter); - -/** @} */ - -/***************************************************************************************************/ -/** @defgroup nvmlEvents Event Handling Methods - * This chapter describes methods that NVML can perform against each device to register and wait for - * some event to occur. 
- * @{ - */ -/***************************************************************************************************/ - -/** - * Create an empty set of events. - * Event set should be freed by \ref nvmlEventSetFree - * - * For Fermi &tm; or newer fully supported devices. - * @param set Reference in which to return the event handle - * - * @return - * - \ref NVML_SUCCESS if the event has been set - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a set is NULL - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - * - * @see nvmlEventSetFree - */ -nvmlReturn_t DECLDIR nvmlEventSetCreate(nvmlEventSet_t *set); - -/** - * Starts recording of events on a specified devices and add the events to specified \ref nvmlEventSet_t - * - * For Fermi &tm; or newer fully supported devices. - * Ecc events are available only on ECC enabled devices (see \ref nvmlDeviceGetTotalEccErrors) - * Power capping events are available only on Power Management enabled devices (see \ref nvmlDeviceGetPowerManagementMode) - * - * For Linux only. - * - * \b IMPORTANT: Operations on \a set are not thread safe - * - * This call starts recording of events on specific device. - * All events that occurred before this call are not recorded. - * Checking if some event occurred can be done with \ref nvmlEventSetWait - * - * If function reports NVML_ERROR_UNKNOWN, event set is in undefined state and should be freed. - * If function reports NVML_ERROR_NOT_SUPPORTED, event set can still be used. None of the requested eventTypes - * are registered in that case. 
- * - * @param device The identifier of the target device - * @param eventTypes Bitmask of \ref nvmlEventType to record - * @param set Set to which to add new event types - * - * @return - * - \ref NVML_SUCCESS if the event has been set - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a eventTypes is invalid or \a set is NULL - * - \ref NVML_ERROR_NOT_SUPPORTED if the platform does not support this feature or some of the requested event types - * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - * - * @see nvmlEventType - * @see nvmlDeviceGetSupportedEventTypes - * @see nvmlEventSetWait - * @see nvmlEventSetFree - */ -nvmlReturn_t DECLDIR nvmlDeviceRegisterEvents(nvmlDevice_t device, unsigned long long eventTypes, nvmlEventSet_t set); - -/** - * Returns information about events supported on device - * - * For Fermi &tm; or newer fully supported devices. - * - * Events are not supported on Windows. So this function returns an empty mask in \a eventTypes on Windows. - * - * @param device The identifier of the target device - * @param eventTypes Reference in which to return bitmask of supported events - * - * @return - * - \ref NVML_SUCCESS if \a eventTypes has been set - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a eventTypes is NULL - * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - * - * @see nvmlEventType - * @see nvmlDeviceRegisterEvents - */ -nvmlReturn_t DECLDIR nvmlDeviceGetSupportedEventTypes(nvmlDevice_t device, unsigned long long *eventTypes); - -/** - * Waits on events and delivers events - * - * For Fermi &tm; or newer fully supported devices.
- * - * If some events are ready to be delivered at the time of the call, function returns immediately. - * If there are no events ready to be delivered, function sleeps till event arrives - * but not longer than specified timeout. This function in certain conditions can return before - * specified timeout passes (e.g. when interrupt arrives) - * - * In case of xid error, the function returns the most recent xid error type seen by the system. If there are multiple - * xid errors generated before nvmlEventSetWait is invoked then the last seen xid error type is returned for all - * xid error events. - * - * @param set Reference to set of events to wait on - * @param data Reference in which to return event data - * @param timeoutms Maximum amount of wait time in milliseconds for registered event - * - * @return - * - \ref NVML_SUCCESS if the data has been set - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a data is NULL - * - \ref NVML_ERROR_TIMEOUT if no event arrived in specified timeout or interrupt arrived - * - \ref NVML_ERROR_GPU_IS_LOST if a GPU has fallen off the bus or is otherwise inaccessible - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - * - * @see nvmlEventType - * @see nvmlDeviceRegisterEvents - */ -nvmlReturn_t DECLDIR nvmlEventSetWait(nvmlEventSet_t set, nvmlEventData_t * data, unsigned int timeoutms); - -/** - * Releases events in the set - * - * For Fermi &tm; or newer fully supported devices. 
- * - * @param set Reference to events to be released - * - * @return - * - \ref NVML_SUCCESS if the event has been successfully released - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - * - * @see nvmlDeviceRegisterEvents - */ -nvmlReturn_t DECLDIR nvmlEventSetFree(nvmlEventSet_t set); - -/** @} */ - -/***************************************************************************************************/ -/** @defgroup nvmlZPI Drain states - * This chapter describes methods that NVML can perform against each device to control their drain state - * and recognition by NVML and NVIDIA kernel driver. These methods can be used with out-of-band tools to - * power on/off GPUs, enable robust reset scenarios, etc. - * @{ - */ -/***************************************************************************************************/ - -/** - * Modify the drain state of a GPU. This method forces a GPU to no longer accept new incoming requests. - * Any new NVML process will no longer see this GPU. Persistence mode for this GPU must be turned off before - * this call is made. - * Must be called as administrator. - * For Linux only. - * - * For Pascal &tm; or newer fully supported devices. - * Some Kepler devices supported. 
- * - * @param pciInfo The PCI address of the GPU drain state to be modified - * @param newState The drain state that should be entered, see \ref nvmlEnableState_t - * - * @return - * - \ref NVML_SUCCESS if the drain state was successfully modified - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a pciInfo or \a newState is invalid - * - \ref NVML_ERROR_NOT_SUPPORTED if the device doesn't support this feature - * - \ref NVML_ERROR_NO_PERMISSION if the calling process has insufficient permissions to perform operation - * - \ref NVML_ERROR_IN_USE if the device has persistence mode turned on - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - */ -nvmlReturn_t DECLDIR nvmlDeviceModifyDrainState (nvmlPciInfo_t *pciInfo, nvmlEnableState_t newState); - -/** - * Query the drain state of a GPU. This method is used to check if a GPU is in a currently draining - * state. - * For Linux only. - * - * For Pascal &tm; or newer fully supported devices. - * Some Kepler devices supported. - * - * @param pciInfo The PCI address of the GPU drain state to be queried - * @param currentState The current drain state for this GPU, see \ref nvmlEnableState_t - * - * @return - * - \ref NVML_SUCCESS if the drain state was successfully queried - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a pciInfo or \a currentState is invalid - * - \ref NVML_ERROR_NOT_SUPPORTED if the device doesn't support this feature - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - */ -nvmlReturn_t DECLDIR nvmlDeviceQueryDrainState (nvmlPciInfo_t *pciInfo, nvmlEnableState_t *currentState); - -/** - * This method will remove the specified GPU from the view of both NVML and the NVIDIA kernel driver - * as long as no other processes are attached.
If other processes are attached, this call will return - * NVML_ERROR_IN_USE and the GPU will be returned to its original "draining" state. Note: the - * only situation where a process can still be attached after nvmlDeviceModifyDrainState() is called - * to initiate the draining state is if that process was using, and is still using, a GPU before the - * call was made. Also note, persistence mode counts as an attachment to the GPU thus it must be disabled - * prior to this call. - * - * For long-running NVML processes please note that this will change the enumeration of current GPUs. - * For example, if there are four GPUs present and GPU1 is removed, the new enumeration will be 0-2. - * Also, device handles after the removed GPU will not be valid and must be re-established. - * Must be run as administrator. - * For Linux only. - * - * For Pascal &tm; or newer fully supported devices. - * Some Kepler devices supported. - * - * @param pciInfo The PCI address of the GPU to be removed - * @param gpuState Whether the GPU is to be removed, from the OS - * see \ref nvmlDetachGpuState_t - * @param linkState Requested upstream PCIe link state, see \ref nvmlPcieLinkState_t - * - * @return - * - \ref NVML_SUCCESS if counters were successfully reset - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a nvmlIndex is invalid - * - \ref NVML_ERROR_NOT_SUPPORTED if the device doesn't support this feature - * - \ref NVML_ERROR_IN_USE if the device is still in use and cannot be removed - */ -nvmlReturn_t DECLDIR nvmlDeviceRemoveGpu (nvmlPciInfo_t *pciInfo, nvmlDetachGpuState_t gpuState, nvmlPcieLinkState_t linkState); - -/** - * Request the OS and the NVIDIA kernel driver to rediscover a portion of the PCI subsystem looking for GPUs that - * were previously removed. The portion of the PCI tree can be narrowed by specifying a domain, bus, and device. 
- * If all are zeroes then the entire PCI tree will be searched. Please note that for long-running NVML processes - * the enumeration will change based on how many GPUs are discovered and where they are inserted in bus order. - * - * In addition, all newly discovered GPUs will be initialized and their ECC scrubbed which may take several seconds - * per GPU. Also, all device handles are no longer guaranteed to be valid post discovery. - * - * Must be run as administrator. - * For Linux only. - * - * For Pascal &tm; or newer fully supported devices. - * Some Kepler devices supported. - * - * @param pciInfo The PCI tree to be searched. Only the domain, bus, and device - * fields are used in this call. - * - * @return - * - \ref NVML_SUCCESS if counters were successfully reset - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a pciInfo is invalid - * - \ref NVML_ERROR_NOT_SUPPORTED if the operating system does not support this feature - * - \ref NVML_ERROR_OPERATING_SYSTEM if the operating system is denying this feature - * - \ref NVML_ERROR_NO_PERMISSION if the calling process has insufficient permissions to perform operation - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - */ -nvmlReturn_t DECLDIR nvmlDeviceDiscoverGpus (nvmlPciInfo_t *pciInfo); - -/** @} */ - -/***************************************************************************************************/ -/** @defgroup nvmlFieldValueQueries Field Value Queries - * This chapter describes NVML operations that are associated with retrieving Field Values from NVML - * @{ - */ -/***************************************************************************************************/ - -/** - * Request values for a list of fields for a device. This API allows multiple fields to be queried at once. 
- * If any of the underlying fieldIds are populated by the same driver call, the results for those field IDs - * will be populated from a single call rather than making a driver call for each fieldId. - * - * @param device The device handle of the GPU to request field values for - * @param valuesCount Number of entries in values that should be retrieved - * @param values Array of \a valuesCount structures to hold field values. - * Each value's fieldId must be populated prior to this call - * - * @return - * - \ref NVML_SUCCESS if any values in \a values were populated. Note that you must - * check the nvmlReturn field of each value for each individual - * status - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a values is NULL - */ -nvmlReturn_t DECLDIR nvmlDeviceGetFieldValues(nvmlDevice_t device, int valuesCount, nvmlFieldValue_t *values); - - -/** @} */ - -/***************************************************************************************************/ -/** @defgroup nvmlGridQueries Grid Queries - * This chapter describes NVML operations that are associated with NVIDIA GRID products. - * @{ - */ -/***************************************************************************************************/ - -/** - * This method is used to get the virtualization mode corresponding to the GPU. - * - * For Kepler &tm; or newer fully supported devices. - * - * @param device Identifier of the target device - * @param pVirtualMode Reference to virtualization mode. One of NVML_GPU_VIRTUALIZATION_? 
- * - * @return - * - \ref NVML_SUCCESS if \a pVirtualMode is fetched - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a pVirtualMode is NULL - * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - */ -nvmlReturn_t DECLDIR nvmlDeviceGetVirtualizationMode(nvmlDevice_t device, nvmlGpuVirtualizationMode_t *pVirtualMode); - -/** @} */ - -/***************************************************************************************************/ -/** @defgroup nvmlGridCommands Grid Commands - * This chapter describes NVML operations that are associated with NVIDIA GRID products. - * @{ - */ -/***************************************************************************************************/ - -/** - * This method is used to set the virtualization mode corresponding to the GPU. - * - * For Kepler &tm; or newer fully supported devices. - * - * @param device Identifier of the target device - * @param virtualMode virtualization mode. One of NVML_GPU_VIRTUALIZATION_? - * - * @return - * - \ref NVML_SUCCESS if \a pVirtualMode is set - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a pVirtualMode is NULL - * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible - * - \ref NVML_ERROR_NOT_SUPPORTED if setting of virtualization mode is not supported. - * - \ref NVML_ERROR_NO_PERMISSION if setting of virtualization mode is not allowed for this client. 
- */ -nvmlReturn_t DECLDIR nvmlDeviceSetVirtualizationMode(nvmlDevice_t device, nvmlGpuVirtualizationMode_t virtualMode); - -/** @} */ - -/***************************************************************************************************/ -/** @defgroup nvmlVgpu vGPU Management - * @{ - * - * Set of APIs supporting GRID vGPU - */ -/***************************************************************************************************/ - -/** - * Retrieve the supported vGPU types on a physical GPU (device). - * - * An array of supported vGPU types for the physical GPU indicated by \a device is returned in the caller-supplied buffer - * pointed at by \a vgpuTypeIds. The element count of nvmlVgpuTypeId_t array is passed in \a vgpuCount, and \a vgpuCount - * is used to return the number of vGPU types written to the buffer. - * - * If the supplied buffer is not large enough to accommodate the vGPU type array, the function returns - * NVML_ERROR_INSUFFICIENT_SIZE, with the element count of nvmlVgpuTypeId_t array required in \a vgpuCount. - * To query the number of vGPU types supported for the GPU, call this function with *vgpuCount = 0. - * The code will return NVML_ERROR_INSUFFICIENT_SIZE, or NVML_SUCCESS if no vGPU types are supported.
- * - * @param device The identifier of the target device - * @param vgpuCount Pointer to caller-supplied array size, and returns number of vGPU types - * @param vgpuTypeIds Pointer to caller-supplied array in which to return list of vGPU types - * - * @return - * - \ref NVML_SUCCESS successful completion - * - \ref NVML_ERROR_INSUFFICIENT_SIZE \a vgpuTypeIds buffer is too small, array element count is returned in \a vgpuCount - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a vgpuCount is NULL or \a device is invalid - * - \ref NVML_ERROR_NOT_SUPPORTED if vGPU is not supported by the device - * - \ref NVML_ERROR_VGPU_ECC_NOT_SUPPORTED if ECC is enabled on the device - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - */ -nvmlReturn_t DECLDIR nvmlDeviceGetSupportedVgpus(nvmlDevice_t device, unsigned int *vgpuCount, nvmlVgpuTypeId_t *vgpuTypeIds); - -/** - * Retrieve the currently creatable vGPU types on a physical GPU (device). - * - * An array of creatable vGPU types for the physical GPU indicated by \a device is returned in the caller-supplied buffer - * pointed at by \a vgpuTypeIds. The element count of nvmlVgpuTypeId_t array is passed in \a vgpuCount, and \a vgpuCount - * is used to return the number of vGPU types written to the buffer. - * - * The creatable vGPU types for a device may differ over time, as there may be restrictions on what type of vGPU types - * can concurrently run on a device. For example, if only one vGPU type is allowed at a time on a device, then the creatable - * list will be restricted to whatever vGPU type is already running on the device. - * - * If the supplied buffer is not large enough to accomodate the vGPU type array, the function returns - * NVML_ERROR_INSUFFICIENT_SIZE, with the element count of nvmlVgpuTypeId_t array required in \a vgpuCount. - * To query the number of vGPU types createable for the GPU, call this function with *vgpuCount = 0. 
- * The code will return NVML_ERROR_INSUFFICIENT_SIZE, or NVML_SUCCESS if no vGPU types are creatable. - * - * @param device The identifier of the target device - * @param vgpuCount Pointer to caller-supplied array size, and returns number of vGPU types - * @param vgpuTypeIds Pointer to caller-supplied array in which to return list of vGPU types - * - * @return - * - \ref NVML_SUCCESS successful completion - * - \ref NVML_ERROR_INSUFFICIENT_SIZE \a vgpuTypeIds buffer is too small, array element count is returned in \a vgpuCount - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a vgpuCount is NULL - * - \ref NVML_ERROR_NOT_SUPPORTED if vGPU is not supported by the device - * - \ref NVML_ERROR_VGPU_ECC_NOT_SUPPORTED if ECC is enabled on the device - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - */ -nvmlReturn_t DECLDIR nvmlDeviceGetCreatableVgpus(nvmlDevice_t device, unsigned int *vgpuCount, nvmlVgpuTypeId_t *vgpuTypeIds); - -/** - * Retrieve the class of a vGPU type. It will not exceed 64 characters in length (including the NUL terminator). - * See \ref nvmlConstants::NVML_DEVICE_NAME_BUFFER_SIZE. - * - * For Kepler &tm; or newer fully supported devices. - * - * @param vgpuTypeId Handle to vGPU type - * @param vgpuTypeClass Pointer to string array to return class in - * @param size Size of string - * - * @return - * - \ref NVML_SUCCESS successful completion - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a vgpuTypeId is invalid, or \a vgpuTypeClass is NULL - * - \ref NVML_ERROR_INSUFFICIENT_SIZE if \a size is too small - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - */ -nvmlReturn_t DECLDIR nvmlVgpuTypeGetClass(nvmlVgpuTypeId_t vgpuTypeId, char *vgpuTypeClass, unsigned int *size); - -/** - * Retrieve the vGPU type name. - * - * The name is an alphanumeric string that denotes a particular vGPU, e.g. GRID M60-2Q. It will not - * exceed 64 characters in length (including the NUL terminator). See \ref - * nvmlConstants::NVML_DEVICE_NAME_BUFFER_SIZE. 
- * - * For Kepler &tm; or newer fully supported devices. - * - * @param vgpuTypeId Handle to vGPU type - * @param vgpuTypeName Pointer to buffer to return name - * @param size Size of buffer - * - * @return - * - \ref NVML_SUCCESS successful completion - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a vgpuTypeId is invalid, or \a name is NULL - * - \ref NVML_ERROR_INSUFFICIENT_SIZE if \a size is too small - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - */ -nvmlReturn_t DECLDIR nvmlVgpuTypeGetName(nvmlVgpuTypeId_t vgpuTypeId, char *vgpuTypeName, unsigned int *size); - -/** - * Retrieve the device ID of a vGPU type. - * - * For Kepler &tm; or newer fully supported devices. - * - * @param vgpuTypeId Handle to vGPU type - * @param deviceID Device ID and vendor ID of the device contained in single 32 bit value - * @param subsystemID Subsytem ID and subsytem vendor ID of the device contained in single 32 bit value - * - * @return - * - \ref NVML_SUCCESS successful completion - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a vgpuTypeId is invalid, or \a deviceId or \a subsystemID are NULL - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - */ -nvmlReturn_t DECLDIR nvmlVgpuTypeGetDeviceID(nvmlVgpuTypeId_t vgpuTypeId, unsigned long long *deviceID, unsigned long long *subsystemID); - -/** - * Retrieve the vGPU framebuffer size in bytes. - * - * For Kepler &tm; or newer fully supported devices. 
- * - * @param vgpuTypeId Handle to vGPU type - * @param fbSize Pointer to framebuffer size in bytes - * - * @return - * - \ref NVML_SUCCESS successful completion - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a vgpuTypeId is invalid, or \a fbSize is NULL - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - */ -nvmlReturn_t DECLDIR nvmlVgpuTypeGetFramebufferSize(nvmlVgpuTypeId_t vgpuTypeId, unsigned long long *fbSize); - -/** - * Retrieve count of vGPU's supported display heads. - * - * For Kepler &tm; or newer fully supported devices. - * - * @param vgpuTypeId Handle to vGPU type - * @param numDisplayHeads Pointer to number of display heads - * - * @return - * - \ref NVML_SUCCESS successful completion - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a vgpuTypeId is invalid, or \a numDisplayHeads is NULL - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - */ -nvmlReturn_t DECLDIR nvmlVgpuTypeGetNumDisplayHeads(nvmlVgpuTypeId_t vgpuTypeId, unsigned int *numDisplayHeads); - -/** - * Retrieve vGPU display head's maximum supported resolution. - * - * For Kepler &tm; or newer fully supported devices. - * - * @param vgpuTypeId Handle to vGPU type - * @param displayIndex Zero-based index of display head - * @param xdim Pointer to maximum number of pixels in X dimension - * @param ydim Pointer to maximum number of pixels in Y dimension - * - * @return - * - \ref NVML_SUCCESS successful completion - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a vgpuTypeId is invalid, or \a xdim or \a ydim are NULL, or \a displayIndex - * is out of range. 
- * - \ref NVML_ERROR_UNKNOWN on any unexpected error - */ -nvmlReturn_t DECLDIR nvmlVgpuTypeGetResolution(nvmlVgpuTypeId_t vgpuTypeId, unsigned int displayIndex, unsigned int *xdim, unsigned int *ydim); - -/** - * Retrieve license requirements for a vGPU type - * - * The license type and version required to run the specified vGPU type is returned as an alphanumeric string, in the form - * ",", for example "GRID-Virtual-PC,2.0". If a vGPU is runnable with* more than one type of license, - * the licenses are delimited by a semicolon, for example "GRID-Virtual-PC,2.0;GRID-Virtual-WS,2.0;GRID-Virtual-WS-Ext,2.0". - * - * The total length of the returned string will not exceed 128 characters, including the NUL terminator. - * See \ref nvmlVgpuConstants::NVML_GRID_LICENSE_BUFFER_SIZE. - * - * For Kepler &tm; or newer fully supported devices. - * - * @param vgpuTypeId Handle to vGPU type - * @param vgpuTypeLicenseString Pointer to buffer to return license info - * @param size Size of \a vgpuTypeLicenseString buffer - * - * @return - * - \ref NVML_SUCCESS successful completion - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a vgpuTypeId is invalid, or \a vgpuTypeLicenseString is NULL - * - \ref NVML_ERROR_INSUFFICIENT_SIZE if \a size is too small - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - */ -nvmlReturn_t DECLDIR nvmlVgpuTypeGetLicense(nvmlVgpuTypeId_t vgpuTypeId, char *vgpuTypeLicenseString, unsigned int size); - -/** - * Retrieve the static frame rate limit value of the vGPU type - * - * For Kepler &tm; or newer fully supported devices. 
- * - * @param vgpuTypeId Handle to vGPU type - * @param frameRateLimit Reference to return the frame rate limit value - * @return - * - \ref NVML_SUCCESS successful completion - * - \ref NVML_ERROR_NOT_SUPPORTED if frame rate limiter is turned off for the vGPU type - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid, or \a frameRateLimit is NULL - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - */ -nvmlReturn_t DECLDIR nvmlVgpuTypeGetFrameRateLimit(nvmlVgpuTypeId_t vgpuTypeId, unsigned int *frameRateLimit); - -/** - * Retrieve the maximum number of vGPU instances creatable on a device for given vGPU type - * - * For Kepler &tm; or newer fully supported devices. - * - * @param device The identifier of the target device - * @param vgpuTypeId Handle to vGPU type - * @param vgpuInstanceCount Pointer to get the max number of vGPU instances - * that can be created on a deicve for given vgpuTypeId - * @return - * - \ref NVML_SUCCESS successful completion - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a vgpuTypeId is invalid or is not supported on target device, - * or \a vgpuInstanceCount is NULL - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - */ -nvmlReturn_t DECLDIR nvmlVgpuTypeGetMaxInstances(nvmlDevice_t device, nvmlVgpuTypeId_t vgpuTypeId, unsigned int *vgpuInstanceCount); - -/** - * Retrieve the active vGPU instances on a device. - * - * An array of active vGPU instances is returned in the caller-supplied buffer pointed at by \a vgpuInstances. The - * array elememt count is passed in \a vgpuCount, and \a vgpuCount is used to return the number of vGPU instances - * written to the buffer. 
- * - * If the supplied buffer is not large enough to accomodate the vGPU instance array, the function returns - * NVML_ERROR_INSUFFICIENT_SIZE, with the element count of nvmlVgpuInstance_t array required in \a vgpuCount. - * To query the number of active vGPU instances, call this function with *vgpuCount = 0. The code will return - * NVML_ERROR_INSUFFICIENT_SIZE, or NVML_SUCCESS if no vGPU Types are supported. - * - * For Kepler &tm; or newer fully supported devices. - * - * @param device The identifier of the target device - * @param vgpuCount Pointer which passes in the array size as well as get - * back the number of types - * @param vgpuInstances Pointer to array in which to return list of vGPU instances - * - * @return - * - \ref NVML_SUCCESS successful completion - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid, or \a vgpuCount is NULL - * - \ref NVML_ERROR_INSUFFICIENT_SIZE if \a size is too small - * - \ref NVML_ERROR_NOT_SUPPORTED if vGPU is not supported by the device - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - */ -nvmlReturn_t DECLDIR nvmlDeviceGetActiveVgpus(nvmlDevice_t device, unsigned int *vgpuCount, nvmlVgpuInstance_t *vgpuInstances); - -/** - * Retrieve the VM ID associated with a vGPU instance. - * - * The VM ID is returned as a string, not exceeding 80 characters in length (including the NUL terminator). - * See \ref nvmlConstants::NVML_DEVICE_UUID_BUFFER_SIZE. - * - * The format of the VM ID varies by platform, and is indicated by the type identifier returned in \a vmIdType. - * - * For Kepler &tm; or newer fully supported devices. 
- * - * @param vgpuInstance Identifier of the target vGPU instance - * @param vmId Pointer to caller-supplied buffer to hold VM ID - * @param size Size of buffer in bytes - * @param vmIdType Pointer to hold VM ID type - * - * @return - * - \ref NVML_SUCCESS successful completion - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a vgpuInstance is invalid, or \a vmId or \a vmIdType are NULL - * - \ref NVML_ERROR_INSUFFICIENT_SIZE if \a size is too small - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - */ -nvmlReturn_t DECLDIR nvmlVgpuInstanceGetVmID(nvmlVgpuInstance_t vgpuInstance, char *vmId, unsigned int size, nvmlVgpuVmIdType_t *vmIdType); - -/** - * Retrieve the UUID of a vGPU instance. - * - * The UUID is a globally unique identifier associated with the vGPU, and is returned as a 5-part hexadecimal string, - * not exceeding 80 characters in length (including the NULL terminator). - * See \ref nvmlConstants::NVML_DEVICE_UUID_BUFFER_SIZE. - * - * For Kepler &tm; or newer fully supported devices. - * - * @param vgpuInstance Identifier of the target vGPU instance - * @param uuid Pointer to caller-supplied buffer to hold vGPU UUID - * @param size Size of buffer in bytes - * - * @return - * - \ref NVML_SUCCESS successful completion - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a vgpuInstance is invalid, or \a uuid is NULL - * - \ref NVML_ERROR_INSUFFICIENT_SIZE if \a size is too small - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - */ -nvmlReturn_t DECLDIR nvmlVgpuInstanceGetUUID(nvmlVgpuInstance_t vgpuInstance, char *uuid, unsigned int size); - -/** - * Retrieve the NVIDIA driver version installed in the VM associated with a vGPU. - * - * The version is returned as an alphanumeric string in the caller-supplied buffer \a version. 
The length of the version - * string will not exceed 80 characters in length (including the NUL terminator). - * See \ref nvmlConstants::NVML_SYSTEM_DRIVER_VERSION_BUFFER_SIZE. - * - * nvmlVgpuInstanceGetVmDriverVersion() may be called at any time for a vGPU instance. The guest VM driver version is - * returned as "Unknown" if no NVIDIA driver is installed in the VM, or the VM has not yet booted to the point where the - * NVIDIA driver is loaded and initialized. - * - * For Kepler &tm; or newer fully supported devices. - * - * @param vgpuInstance Identifier of the target vGPU instance - * @param version Caller-supplied buffer to return driver version string - * @param length Size of \a version buffer - * - * @return - * - \ref NVML_SUCCESS if \a version has been set - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a vgpuInstance is invalid - * - \ref NVML_ERROR_INSUFFICIENT_SIZE if \a length is too small - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - */ -nvmlReturn_t DECLDIR nvmlVgpuInstanceGetVmDriverVersion(nvmlVgpuInstance_t vgpuInstance, char* version, unsigned int length); - -/** - * Retrieve the framebuffer usage in bytes. - * - * Framebuffer usage is the amont of vGPU framebuffer memory that is currently in use by the VM. - * - * For Kepler &tm; or newer fully supported devices. 
- * - * @param vgpuInstance The identifier of the target instance - * @param fbUsage Pointer to framebuffer usage in bytes - * - * @return - * - \ref NVML_SUCCESS successful completion - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a vgpuInstance is invalid, or \a fbUsage is NULL - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - */ -nvmlReturn_t DECLDIR nvmlVgpuInstanceGetFbUsage(nvmlVgpuInstance_t vgpuInstance, unsigned long long *fbUsage); - -/** - * Retrieve the current licensing state of the vGPU instance. - * - * If the vGPU is currently licensed, \a licensed is set to 1, otherwise it is set to 0. - * - * For Kepler &tm; or newer fully supported devices. - * - * @param vgpuInstance Identifier of the target vGPU instance - * @param licensed Reference to return the licensing status - * - * @return - * - \ref NVML_SUCCESS if \a licensed has been set - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a vgpuInstance is invalid, or \a licensed is NULL - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - */ -nvmlReturn_t DECLDIR nvmlVgpuInstanceGetLicenseStatus(nvmlVgpuInstance_t vgpuInstance, unsigned int *licensed); - -/** - * Retrieve the vGPU type of a vGPU instance. - * - * Returns the vGPU type ID of vgpu assigned to the vGPU instance. - * - * For Kepler &tm; or newer fully supported devices. 
- * - * @param vgpuInstance Identifier of the target vGPU instance - * @param vgpuTypeId Reference to return the vgpuTypeId - * - * @return - * - \ref NVML_SUCCESS if \a vgpuTypeId has been set - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a vgpuInstance is invalid, or \a vgpuTypeId is NULL - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - */ -nvmlReturn_t DECLDIR nvmlVgpuInstanceGetType(nvmlVgpuInstance_t vgpuInstance, nvmlVgpuTypeId_t *vgpuTypeId); - -/** - * Retrieve the frame rate limit set for the vGPU instance. - * - * Returns the value of the frame rate limit set for the vGPU instance - * - * For Kepler &tm; or newer fully supported devices. - * - * @param vgpuInstance Identifier of the target vGPU instance - * @param frameRateLimit Reference to return the frame rate limit - * - * @return - * - \ref NVML_SUCCESS if \a frameRateLimit has been set - * - \ref NVML_ERROR_NOT_SUPPORTED if frame rate limiter is turned off for the vGPU type - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a vgpuInstance is invalid, or \a frameRateLimit is NULL - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - */ -nvmlReturn_t DECLDIR nvmlVgpuInstanceGetFrameRateLimit(nvmlVgpuInstance_t vgpuInstance, unsigned int *frameRateLimit); - -/** - * Retrieve the encoder capacity of a vGPU instance, as a percentage of maximum encoder capacity with valid values in the range 0-100. - * - * For Maxwell &tm; or newer fully supported devices. 
- * - * @param vgpuInstance Identifier of the target vGPU instance - * @param encoderCapacity Reference to an unsigned int for the encoder capacity - * - * @return - * - \ref NVML_SUCCESS if \a encoderCapacity has been retrived - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a vgpuInstance is invalid, or \a encoderQueryType is invalid - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - */ -nvmlReturn_t DECLDIR nvmlVgpuInstanceGetEncoderCapacity(nvmlVgpuInstance_t vgpuInstance, unsigned int *encoderCapacity); - -/** - * Set the encoder capacity of a vGPU instance, as a percentage of maximum encoder capacity with valid values in the range 0-100. - * - * For Maxwell &tm; or newer fully supported devices. - * - * @param vgpuInstance Identifier of the target vGPU instance - * @param encoderCapacity Unsigned int for the encoder capacity value - * - * @return - * - \ref NVML_SUCCESS if \a encoderCapacity has been set - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a vgpuInstance is invalid - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - */ -nvmlReturn_t DECLDIR nvmlVgpuInstanceSetEncoderCapacity(nvmlVgpuInstance_t vgpuInstance, unsigned int encoderCapacity); - -/** - * Retrieves current utilization for vGPUs on a physical GPU (device). - * - * For Kepler &tm; or newer fully supported devices. - * - * Reads recent utilization of GPU SM (3D/Compute), framebuffer, video encoder, and video decoder for vGPU instances running - * on a device. Utilization values are returned as an array of utilization sample structures in the caller-supplied buffer - * pointed at by \a utilizationSamples. One utilization sample structure is returned per vGPU instance, and includes the - * CPU timestamp at which the samples were recorded. 
Individual utilization values are returned as "unsigned int" values - * in nvmlValue_t unions. The function sets the caller-supplied \a sampleValType to NVML_VALUE_TYPE_UNSIGNED_INT to - * indicate the returned value type. - * - * To read utilization values, first determine the size of buffer required to hold the samples by invoking the function with - * \a utilizationSamples set to NULL. The function will return NVML_ERROR_INSUFFICIENT_SIZE, with the current vGPU instance - * count in \a vgpuInstanceSamplesCount, or NVML_SUCCESS if the current vGPU instance count is zero. The caller should allocate - * a buffer of size vgpuInstanceSamplesCount * sizeof(nvmlVgpuInstanceUtilizationSample_t). Invoke the function again with - * the allocated buffer passed in \a utilizationSamples, and \a vgpuInstanceSamplesCount set to the number of entries the - * buffer is sized for. - * - * On successful return, the function updates \a vgpuInstanceSampleCount with the number of vGPU utilization sample - * structures that were actually written. This may differ from a previously read value as vGPU instances are created or - * destroyed. - * - * lastSeenTimeStamp represents the CPU timestamp in microseconds at which utilization samples were last read. Set it to 0 - * to read utilization based on all the samples maintained by the driver's internal sample buffer. Set lastSeenTimeStamp - * to a timeStamp retrieved from a previous query to read utilization since the previous query. - * - * @param device The identifier for the target device - * @param lastSeenTimeStamp Return only samples with timestamp greater than lastSeenTimeStamp. 
- * @param sampleValType Pointer to caller-supplied buffer to hold the type of returned sample values - * @param vgpuInstanceSamplesCount Pointer to caller-supplied array size, and returns number of vGPU instances - * @param utilizationSamples Pointer to caller-supplied buffer in which vGPU utilization samples are returned - - * @return - * - \ref NVML_SUCCESS if utilization samples are successfully retrieved - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid, \a vgpuInstanceSamplesCount or \a sampleValType is - * NULL, or a sample count of 0 is passed with a non-NULL \a utilizationSamples - * - \ref NVML_ERROR_INSUFFICIENT_SIZE if supplied \a vgpuInstanceSamplesCount is too small to return samples for all - * vGPU instances currently executing on the device - * - \ref NVML_ERROR_NOT_SUPPORTED if vGPU is not supported by the device - * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible - * - \ref NVML_ERROR_NOT_FOUND if sample entries are not found - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - */ -nvmlReturn_t DECLDIR nvmlDeviceGetVgpuUtilization(nvmlDevice_t device, unsigned long long lastSeenTimeStamp, - nvmlValueType_t *sampleValType, unsigned int *vgpuInstanceSamplesCount, - nvmlVgpuInstanceUtilizationSample_t *utilizationSamples); - -/** - * Retrieves current utilization for processes running on vGPUs on a physical GPU (device). - * - * For Maxwell &tm; or newer fully supported devices. - * - * Reads recent utilization of GPU SM (3D/Compute), framebuffer, video encoder, and video decoder for processes running on - * vGPU instances active on a device. Utilization values are returned as an array of utilization sample structures in the - * caller-supplied buffer pointed at by \a utilizationSamples. 
One utilization sample structure is returned per process running - * on vGPU instances, that had some non-zero utilization during the last sample period. It includes the CPU timestamp at which - * the samples were recorded. Individual utilization values are returned as "unsigned int" values. - * - * To read utilization values, first determine the size of buffer required to hold the samples by invoking the function with - * \a utilizationSamples set to NULL. The function will return NVML_ERROR_INSUFFICIENT_SIZE, with the current vGPU instance - * count in \a vgpuProcessSamplesCount. The caller should allocate a buffer of size - * vgpuProcessSamplesCount * sizeof(nvmlVgpuProcessUtilizationSample_t). Invoke the function again with - * the allocated buffer passed in \a utilizationSamples, and \a vgpuProcessSamplesCount set to the number of entries the - * buffer is sized for. - * - * On successful return, the function updates \a vgpuSubProcessSampleCount with the number of vGPU sub process utilization sample - * structures that were actually written. This may differ from a previously read value depending on the number of processes that are active - * in any given sample period. - * - * lastSeenTimeStamp represents the CPU timestamp in microseconds at which utilization samples were last read. Set it to 0 - * to read utilization based on all the samples maintained by the driver's internal sample buffer. Set lastSeenTimeStamp - * to a timeStamp retrieved from a previous query to read utilization since the previous query. - * - * @param device The identifier for the target device - * @param lastSeenTimeStamp Return only samples with timestamp greater than lastSeenTimeStamp. 
- * @param vgpuProcessSamplesCount Pointer to caller-supplied array size, and returns number of processes running on vGPU instances - * @param utilizationSamples Pointer to caller-supplied buffer in which vGPU sub process utilization samples are returned - - * @return - * - \ref NVML_SUCCESS if utilization samples are successfully retrieved - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid, \a vgpuProcessSamplesCount or a sample count of 0 is - * passed with a non-NULL \a utilizationSamples - * - \ref NVML_ERROR_INSUFFICIENT_SIZE if supplied \a vgpuProcessSamplesCount is too small to return samples for all - * vGPU instances currently executing on the device - * - \ref NVML_ERROR_NOT_SUPPORTED if vGPU is not supported by the device - * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible - * - \ref NVML_ERROR_NOT_FOUND if sample entries are not found - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - */ -nvmlReturn_t DECLDIR nvmlDeviceGetVgpuProcessUtilization(nvmlDevice_t device, unsigned long long lastSeenTimeStamp, - unsigned int *vgpuProcessSamplesCount, - nvmlVgpuProcessUtilizationSample_t *utilizationSamples); -/** - * Retrieve the GRID licensable features. - * - * Identifies whether the system supports GRID Software Licensing. If it does, return the list of licensable feature(s) - * and their current license status. 
- * - * @param device Identifier of the target device - * @param pGridLicensableFeatures Pointer to structure in which GRID licensable features are returned - * - * @return - * - \ref NVML_SUCCESS if licensable features are successfully retrieved - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a pGridLicensableFeatures is NULL - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - */ -nvmlReturn_t DECLDIR nvmlDeviceGetGridLicensableFeatures(nvmlDevice_t device, nvmlGridLicensableFeatures_t *pGridLicensableFeatures); - -/** - * Retrieves the current encoder statistics of a vGPU Instance - * - * For Maxwell &tm; or newer fully supported devices. - * - * @param vgpuInstance Identifier of the target vGPU instance - * @param sessionCount Reference to an unsigned int for count of active encoder sessions - * @param averageFps Reference to an unsigned int for trailing average FPS of all active sessions - * @param averageLatency Reference to an unsigned int for encode latency in microseconds - * - * @return - * - \ref NVML_SUCCESS if \a sessionCount, \a averageFps and \a averageLatency is fetched - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a sessionCount , or \a averageFps or \a averageLatency is NULL - * or \a vgpuInstance is invalid. - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - */ -nvmlReturn_t DECLDIR nvmlVgpuInstanceGetEncoderStats(nvmlVgpuInstance_t vgpuInstance, unsigned int *sessionCount, - unsigned int *averageFps, unsigned int *averageLatency); - -/** - * Retrieves information about all active encoder sessions on a vGPU Instance. - * - * An array of active encoder sessions is returned in the caller-supplied buffer pointed at by \a sessionInfo. The - * array elememt count is passed in \a sessionCount, and \a sessionCount is used to return the number of sessions - * written to the buffer. 
- * - * If the supplied buffer is not large enough to accomodate the active session array, the function returns - * NVML_ERROR_INSUFFICIENT_SIZE, with the element count of nvmlEncoderSessionInfo_t array required in \a sessionCount. - * To query the number of active encoder sessions, call this function with *sessionCount = 0. The code will return - * NVML_SUCCESS with number of active encoder sessions updated in *sessionCount. - * - * For Maxwell &tm; or newer fully supported devices. - * - * @param vgpuInstance Identifier of the target vGPU instance - * @param sessionCount Reference to caller supplied array size, and returns - * the number of sessions. - * @param sessionInfo Reference to caller supplied array in which the list - * of session information us returned. - * - * @return - * - \ref NVML_SUCCESS if \a sessionInfo is fetched - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INSUFFICIENT_SIZE if \a sessionCount is too small, array element count is - returned in \a sessionCount - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a sessionCount is NULL or \a vgpuInstance is invalid.. - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - */ -nvmlReturn_t DECLDIR nvmlVgpuInstanceGetEncoderSessions(nvmlVgpuInstance_t vgpuInstance, unsigned int *sessionCount, nvmlEncoderSessionInfo_t *sessionInfo); - -/** - * Retrieves the current utilization and process ID - * - * For Maxwell &tm; or newer fully supported devices. - * - * Reads recent utilization of GPU SM (3D/Compute), framebuffer, video encoder, and video decoder for processes running. - * Utilization values are returned as an array of utilization sample structures in the caller-supplied buffer pointed at - * by \a utilization. One utilization sample structure is returned per process running, that had some non-zero utilization - * during the last sample period. It includes the CPU timestamp at which the samples were recorded. 
Individual utilization values - * are returned as "unsigned int" values. - * - * To read utilization values, first determine the size of buffer required to hold the samples by invoking the function with - * \a utilization set to NULL. The caller should allocate a buffer of size - * processSamplesCount * sizeof(nvmlProcessUtilizationSample_t). Invoke the function again with the allocated buffer passed - * in \a utilization, and \a processSamplesCount set to the number of entries the buffer is sized for. - * - * On successful return, the function updates \a processSamplesCount with the number of process utilization sample - * structures that were actually written. This may differ from a previously read value as instances are created or - * destroyed. - * - * lastSeenTimeStamp represents the CPU timestamp in microseconds at which utilization samples were last read. Set it to 0 - * to read utilization based on all the samples maintained by the driver's internal sample buffer. Set lastSeenTimeStamp - * to a timeStamp retrieved from a previous query to read utilization since the previous query. - * - * @param device The identifier of the target device - * @param utilization Pointer to caller-supplied buffer in which guest process utilization samples are returned - * @param processSamplesCount Pointer to caller-supplied array size, and returns number of processes running - * @param lastSeenTimeStamp Return only samples with timestamp greater than lastSeenTimeStamp. 
- - * @return - * - \ref NVML_SUCCESS if \a utilization has been populated - * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid, \a utilization is NULL, or \a samplingPeriodUs is NULL - * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature - * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - */ -nvmlReturn_t DECLDIR nvmlDeviceGetProcessUtilization(nvmlDevice_t device, nvmlProcessUtilizationSample_t *utilization, - unsigned int *processSamplesCount, unsigned long long lastSeenTimeStamp); - -/** @} */ - -/***************************************************************************************************/ -/** @defgroup nvml vGPU Migration - * This chapter describes NVML operations that are associated with vGPU Migration. - * @{ - */ -/***************************************************************************************************/ - -/** - * vGPU metadata structure. 
- */ -typedef struct nvmlVgpuMetadata_st -{ - unsigned int version; //!< Current version of the structure - unsigned int revision; //!< Current revision of the structure - nvmlVgpuGuestInfoState_t guestInfoState; //!< Current state of Guest-dependent fields - char guestDriverVersion[NVML_SYSTEM_DRIVER_VERSION_BUFFER_SIZE]; //!< Version of driver installed in guest - char hostDriverVersion[NVML_SYSTEM_DRIVER_VERSION_BUFFER_SIZE]; //!< Version of driver installed in host - unsigned int reserved[8]; //!< Reserved for internal use - unsigned int opaqueDataSize; //!< Size of opaque data field in bytes - char opaqueData[4]; //!< Opaque data -} nvmlVgpuMetadata_t; - -/** - * Physical GPU metadata structure - */ -typedef struct nvmlVgpuPgpuMetadata_st -{ - unsigned int version; //!< Current version of the structure - unsigned int revision; //!< Current revision of the structure - char hostDriverVersion[NVML_SYSTEM_DRIVER_VERSION_BUFFER_SIZE]; //!< Host driver version - unsigned int pgpuVirtualizationCaps; //!< Pgpu virtualizaion capabilities bitfileld - unsigned int reserved[7]; //!< Reserved for internal use - unsigned int opaqueDataSize; //!< Size of opaque data field in bytes - char opaqueData[4]; //!< Opaque data -} nvmlVgpuPgpuMetadata_t; - -/** - * vGPU VM compatibility codes - */ -typedef enum nvmlVgpuVmCompatibility_enum -{ - NVML_VGPU_VM_COMPATIBILITY_NONE = 0x0, //!< vGPU is not runnable - NVML_VGPU_VM_COMPATIBILITY_COLD = 0x1, //!< vGPU is runnable from a cold / powered-off state (ACPI S5) - NVML_VGPU_VM_COMPATIBILITY_HIBERNATE = 0x2, //!< vGPU is runnable from a hibernated state (ACPI S4) - NVML_VGPU_VM_COMPATIBILITY_SLEEP = 0x4, //!< vGPU is runnable from a sleeped state (ACPI S3) - NVML_VGPU_VM_COMPATIBILITY_LIVE = 0x8, //!< vGPU is runnable from a live/paused (ACPI S0) -} nvmlVgpuVmCompatibility_t; - -/** - * vGPU-pGPU compatibility limit codes - */ -typedef enum nvmlVgpuPgpuCompatibilityLimitCode_enum -{ - NVML_VGPU_COMPATIBILITY_LIMIT_NONE = 0x0, //!< 
Compatibility is not limited. - NVML_VGPU_COMPATIBILITY_LIMIT_HOST_DRIVER = 0x1, //!< Compatibility is limited by host driver version. - NVML_VGPU_COMPATIBILITY_LIMIT_GUEST_DRIVER = 0x2, //!< Compatibility is limited by guest driver version. - NVML_VGPU_COMPATIBILITY_LIMIT_GPU = 0x4, //!< Compatibility is limited by GPU hardware. - NVML_VGPU_COMPATIBILITY_LIMIT_OTHER = 0x80000000, //!< Compatibility is limited by an undefined factor. -} nvmlVgpuPgpuCompatibilityLimitCode_t; - -/** - * vGPU-pGPU compatibility structure - */ -typedef struct nvmlVgpuPgpuCompatibility_st -{ - nvmlVgpuVmCompatibility_t vgpuVmCompatibility; //!< Compatibility of vGPU VM. See \ref nvmlVgpuVmCompatibility_t - nvmlVgpuPgpuCompatibilityLimitCode_t compatibilityLimitCode; //!< Limiting factor for vGPU-pGPU compatibility. See \ref nvmlVgpuPgpuCompatibilityLimitCode_t -} nvmlVgpuPgpuCompatibility_t; - -/** - * Returns vGPU metadata structure for a running vGPU. The structure contains information about the vGPU and its associated VM - * such as the currently installed NVIDIA guest driver version, together with host driver version and an opaque data section - * containing internal state. - * - * nvmlVgpuInstanceGetMetadata() may be called at any time for a vGPU instance. Some fields in the returned structure are - * dependent on information obtained from the guest VM, which may not yet have reached a state where that information - * is available. The current state of these dependent fields is reflected in the info structure's \ref guestInfoState field. - * - * The VMM may choose to read and save the vGPU's VM info as persistent metadata associated with the VM, and provide - * it to GRID Virtual GPU Manager when creating a vGPU for subsequent instances of the VM. - * - * The caller passes in a buffer via \a vgpuMetadata, with the size of the buffer in \a bufferSize. 
If the vGPU Metadata structure - * is too large to fit in the supplied buffer, the function returns NVML_ERROR_INSUFFICIENT_SIZE with the size needed - * in \a bufferSize. - * - * @param vgpuInstance vGPU instance handle - * @param vgpuMetadata Pointer to caller-supplied buffer into which vGPU metadata is written - * @param bufferSize Size of vgpuMetadata buffer - * - * @return - * - \ref NVML_SUCCESS vGPU metadata structure was successfully returned - * - \ref NVML_ERROR_INSUFFICIENT_SIZE vgpuMetadata buffer is too small, required size is returned in \a bufferSize - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a bufferSize is NULL or \a vgpuInstance is invalid; if \a vgpuMetadata is NULL and the value of \a bufferSize is not 0. - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - */ -nvmlReturn_t DECLDIR nvmlVgpuInstanceGetMetadata(nvmlVgpuInstance_t vgpuInstance, nvmlVgpuMetadata_t *vgpuMetadata, unsigned int *bufferSize); - -/** - * Returns a vGPU metadata structure for the physical GPU indicated by \a device. The structure contains information about - * the GPU and the currently installed NVIDIA host driver version that's controlling it, together with an opaque data section - * containing internal state. - * - * The caller passes in a buffer via \a pgpuMetadata, with the size of the buffer in \a bufferSize. If the \a pgpuMetadata - * structure is too large to fit in the supplied buffer, the function returns NVML_ERROR_INSUFFICIENT_SIZE with the size needed - * in \a bufferSize. 
- * - * @param device The identifier of the target device - * @param pgpuMetadata Pointer to caller-supplied buffer into which \a pgpuMetadata is written - * @param bufferSize Pointer to size of \a pgpuMetadata buffer - * - * @return - * - \ref NVML_SUCCESS GPU metadata structure was successfully returned - * - \ref NVML_ERROR_INSUFFICIENT_SIZE pgpuMetadata buffer is too small, required size is returned in \a bufferSize - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a bufferSize is NULL or \a device is invalid; if \a pgpuMetadata is NULL and the value of \a bufferSize is not 0. - * - \ref NVML_ERROR_NOT_SUPPORTED vGPU is not supported by the system - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - */ -nvmlReturn_t DECLDIR nvmlDeviceGetVgpuMetadata(nvmlDevice_t device, nvmlVgpuPgpuMetadata_t *pgpuMetadata, unsigned int *bufferSize); - -/** - * Takes a vGPU instance metadata structure read from \ref nvmlVgpuInstanceGetMetadata(), and a vGPU metadata structure for a - * physical GPU read from \ref nvmlDeviceGetVgpuMetadata(), and returns compatibility information of the vGPU instance and the - * physical GPU. - * - * The caller passes in a buffer via \a compatibilityInfo, into which a compatibility information structure is written. The - * structure defines the states in which the vGPU / VM may be booted on the physical GPU. If the vGPU / VM compatibility - * with the physical GPU is limited, a limit code indicates the factor limiting compability. - * (see \ref nvmlVgpuPgpuCompatibilityLimitCode_t for details). - * - * Note: vGPU compatibility does not take into account dynamic capacity conditions that may limit a system's ability to - * boot a given vGPU or associated VM. 
- * - * @param vgpuMetadata Pointer to caller-supplied vGPU metadata structure - * @param pgpuMetadata Pointer to caller-supplied GPU metadata structure - * @param compatibilityInfo Pointer to caller-supplied buffer to hold compatibility info - * - * @return - * - \ref NVML_SUCCESS vGPU metadata structure was successfully returned - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a vgpuMetadata or \a pgpuMetadata or \a bufferSize are NULL - * - \ref NVML_ERROR_UNKNOWN on any unexpected error - */ -nvmlReturn_t DECLDIR nvmlGetVgpuCompatibility(nvmlVgpuMetadata_t *vgpuMetadata, nvmlVgpuPgpuMetadata_t *pgpuMetadata, nvmlVgpuPgpuCompatibility_t *compatibilityInfo); - -/** @} */ - -/** - * NVML API versioning support - */ -#if defined(__NVML_API_VERSION_INTERNAL) -#undef nvmlDeviceRemoveGpu -#undef nvmlDeviceGetNvLinkRemotePciInfo -#undef nvmlDeviceGetPciInfo -#undef nvmlDeviceGetCount -#undef nvmlDeviceGetHandleByIndex -#undef nvmlDeviceGetHandleByPciBusId -#undef nvmlInit -#endif - -#ifdef __cplusplus -} -#endif - -#endif diff --git a/vendor/github.com/NVIDIA/gpu-monitoring-tools/bindings/go/nvml/nvml_dl.c b/vendor/github.com/NVIDIA/gpu-monitoring-tools/bindings/go/nvml/nvml_dl.c deleted file mode 100644 index a3d162c0e1bc..000000000000 --- a/vendor/github.com/NVIDIA/gpu-monitoring-tools/bindings/go/nvml/nvml_dl.c +++ /dev/null @@ -1,46 +0,0 @@ -// Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved. 
- -#include -#include - -#include "nvml_dl.h" - -#define DLSYM(x, sym) \ -do { \ - dlerror(); \ - x = dlsym(handle, #sym); \ - if (dlerror() != NULL) { \ - return (NVML_ERROR_FUNCTION_NOT_FOUND); \ - } \ -} while (0) - -typedef nvmlReturn_t (*nvmlSym_t)(); - -static void *handle; - -nvmlReturn_t NVML_DL(nvmlInit)(void) -{ - handle = dlopen("libnvidia-ml.so.1", RTLD_LAZY | RTLD_GLOBAL); - if (handle == NULL) { - return (NVML_ERROR_LIBRARY_NOT_FOUND); - } - return (nvmlInit()); -} - -nvmlReturn_t NVML_DL(nvmlShutdown)(void) -{ - nvmlReturn_t r = nvmlShutdown(); - if (r != NVML_SUCCESS) { - return (r); - } - return (dlclose(handle) ? NVML_ERROR_UNKNOWN : NVML_SUCCESS); -} - -nvmlReturn_t NVML_DL(nvmlDeviceGetTopologyCommonAncestor)( - nvmlDevice_t dev1, nvmlDevice_t dev2, nvmlGpuTopologyLevel_t *info) -{ - nvmlSym_t sym; - - DLSYM(sym, nvmlDeviceGetTopologyCommonAncestor); - return ((*sym)(dev1, dev2, info)); -} diff --git a/vendor/github.com/NVIDIA/gpu-monitoring-tools/bindings/go/nvml/nvml_dl.h b/vendor/github.com/NVIDIA/gpu-monitoring-tools/bindings/go/nvml/nvml_dl.h deleted file mode 100644 index 628f0b3a2c2b..000000000000 --- a/vendor/github.com/NVIDIA/gpu-monitoring-tools/bindings/go/nvml/nvml_dl.h +++ /dev/null @@ -1,15 +0,0 @@ -// Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved. 
- -#ifndef _NVML_DL_H_ -#define _NVML_DL_H_ - -#include "nvml.h" - -#define NVML_DL(x) x##_dl - -extern nvmlReturn_t NVML_DL(nvmlInit)(void); -extern nvmlReturn_t NVML_DL(nvmlShutdown)(void); -extern nvmlReturn_t NVML_DL(nvmlDeviceGetTopologyCommonAncestor)( - nvmlDevice_t, nvmlDevice_t, nvmlGpuTopologyLevel_t *); - -#endif // _NVML_DL_H_ diff --git a/vendor/modules.txt b/vendor/modules.txt index 4dfe88ee06be..6ca76bfd3187 100644 --- a/vendor/modules.txt +++ b/vendor/modules.txt @@ -52,9 +52,6 @@ github.com/Microsoft/go-winio/pkg/guid # github.com/Microsoft/hcsshim v0.8.8-0.20200312192636-fd0797d766b1 ## explicit github.com/Microsoft/hcsshim/osversion -# github.com/NVIDIA/gpu-monitoring-tools v0.0.0-20180829222009-86f2a9fac6c5 -## explicit -github.com/NVIDIA/gpu-monitoring-tools/bindings/go/nvml # github.com/NYTimes/gziphandler v1.0.1 => github.com/NYTimes/gziphandler v1.0.0 ## explicit github.com/NYTimes/gziphandler diff --git a/website/content/docs/devices/nvidia.mdx b/website/content/docs/devices/external/nvidia.mdx similarity index 100% rename from website/content/docs/devices/nvidia.mdx rename to website/content/docs/devices/external/nvidia.mdx diff --git a/website/data/docs-nav-data.json b/website/data/docs-nav-data.json index 1751a892208c..9616f25aec44 100644 --- a/website/data/docs-nav-data.json +++ b/website/data/docs-nav-data.json @@ -1437,10 +1437,6 @@ "title": "Overview", "path": "devices" }, - { - "title": "Nvidia", - "path": "devices/nvidia" - }, { "title": "Community", "routes": [ @@ -1448,6 +1444,10 @@ "title": "Overview", "path": "devices/external" }, + { + "title": "Nvidia", + "path": "devices/external/nvidia" + }, { "title": "USB Beta", "path": "devices/external/usb" @@ -1760,7 +1760,7 @@ { "title": "Overview", "path": "enterprise" - }, + }, { "title": "License", "routes": [