diff --git a/.circleci/config.yml b/.circleci/config.yml
index 0a4cec135dce..0fcf418656d6 100644
--- a/.circleci/config.yml
+++ b/.circleci/config.yml
@@ -620,13 +620,9 @@ workflows:
test_module: "api"
filters: *backend_test_branches_filter
enable_race_testing: true
- - test-container:
- name: "test-devices"
- test_packages: "./devices/..."
- filters: *backend_test_branches_filter
- test-machine:
name: "test-other"
- exclude_packages: "./api|./client|./drivers/docker|./drivers/exec|./drivers/shared/executor|./nomad|./devices|./e2e"
+ exclude_packages: "./api|./client|./drivers/docker|./drivers/exec|./drivers/shared/executor|./nomad|./e2e"
filters: *backend_test_branches_filter
- test-machine:
name: "test-docker"
diff --git a/devices/gpu/nvidia/README.md b/devices/gpu/nvidia/README.md
deleted file mode 100644
index 1035c7c89402..000000000000
--- a/devices/gpu/nvidia/README.md
+++ /dev/null
@@ -1,21 +0,0 @@
-This package provides an implementation of nvidia device plugin
-
-# Behavior
-
-Nvidia device plugin uses NVML bindings to get data regarding available nvidia devices and will expose them via Fingerprint RPC. GPUs can be excluded from fingerprinting by setting the `ignored_gpu_ids` field. Plugin sends statistics for fingerprinted devices every `stats_period` period.
-
-# Config
-
-The configuration should be passed via an HCL file that begins with a top level `config` stanza:
-
-```
-config {
- ignored_gpu_ids = ["uuid1", "uuid2"]
- fingerprint_period = "5s"
-}
-```
-
-The valid configuration options are:
-
-* `ignored_gpu_ids` (`list(string)`: `[]`): list of GPU UUIDs strings that should not be exposed to nomad
-* `fingerprint_period` (`string`: `"1m"`): interval to repeat the fingerprint process to identify possible changes.
diff --git a/devices/gpu/nvidia/cmd/main.go b/devices/gpu/nvidia/cmd/main.go
deleted file mode 100644
index 5c0bea6c4d86..000000000000
--- a/devices/gpu/nvidia/cmd/main.go
+++ /dev/null
@@ -1,20 +0,0 @@
-package main
-
-import (
- "context"
-
- log "github.com/hashicorp/go-hclog"
-
- "github.com/hashicorp/nomad/devices/gpu/nvidia"
- "github.com/hashicorp/nomad/plugins"
-)
-
-func main() {
- // Serve the plugin
- plugins.ServeCtx(factory)
-}
-
-// factory returns a new instance of the Nvidia GPU plugin
-func factory(ctx context.Context, log log.Logger) interface{} {
- return nvidia.NewNvidiaDevice(ctx, log)
-}
diff --git a/devices/gpu/nvidia/device.go b/devices/gpu/nvidia/device.go
deleted file mode 100644
index 67680dc2a0ee..000000000000
--- a/devices/gpu/nvidia/device.go
+++ /dev/null
@@ -1,228 +0,0 @@
-package nvidia
-
-import (
- "context"
- "fmt"
- "strings"
- "sync"
- "time"
-
- log "github.com/hashicorp/go-hclog"
- "github.com/hashicorp/nomad/devices/gpu/nvidia/nvml"
- "github.com/hashicorp/nomad/helper/pluginutils/loader"
- "github.com/hashicorp/nomad/plugins/base"
- "github.com/hashicorp/nomad/plugins/device"
- "github.com/hashicorp/nomad/plugins/shared/hclspec"
-)
-
-const (
- // pluginName is the name of the plugin
- pluginName = "nvidia-gpu"
-
- // vendor is the vendor providing the devices
- vendor = "nvidia"
-
- // deviceType is the type of device being returned
- deviceType = device.DeviceTypeGPU
-
- // notAvailable value is returned to nomad server in case some properties were
- // undetected by nvml driver
- notAvailable = "N/A"
-
- // Nvidia-container-runtime environment variable names
- NvidiaVisibleDevices = "NVIDIA_VISIBLE_DEVICES"
-)
-
-var (
- // PluginID is the nvidia plugin metadata registered in the plugin
- // catalog.
- PluginID = loader.PluginID{
- Name: pluginName,
- PluginType: base.PluginTypeDevice,
- }
-
- // PluginConfig is the nvidia factory function registered in the
- // plugin catalog.
- PluginConfig = &loader.InternalPluginConfig{
- Factory: func(ctx context.Context, l log.Logger) interface{} { return NewNvidiaDevice(ctx, l) },
- }
-
- // pluginInfo describes the plugin
- pluginInfo = &base.PluginInfoResponse{
- Type: base.PluginTypeDevice,
- PluginApiVersions: []string{device.ApiVersion010},
- PluginVersion: "0.1.0",
- Name: pluginName,
- }
-
- // configSpec is the specification of the plugin's configuration
- configSpec = hclspec.NewObject(map[string]*hclspec.Spec{
- "enabled": hclspec.NewDefault(
- hclspec.NewAttr("enabled", "bool", false),
- hclspec.NewLiteral("true"),
- ),
- "ignored_gpu_ids": hclspec.NewDefault(
- hclspec.NewAttr("ignored_gpu_ids", "list(string)", false),
- hclspec.NewLiteral("[]"),
- ),
- "fingerprint_period": hclspec.NewDefault(
- hclspec.NewAttr("fingerprint_period", "string", false),
- hclspec.NewLiteral("\"1m\""),
- ),
- })
-)
-
-// Config contains configuration information for the plugin.
-type Config struct {
- Enabled bool `codec:"enabled"`
- IgnoredGPUIDs []string `codec:"ignored_gpu_ids"`
- FingerprintPeriod string `codec:"fingerprint_period"`
-}
-
-// NvidiaDevice contains all plugin specific data
-type NvidiaDevice struct {
- // enabled indicates whether the plugin should be enabled
- enabled bool
-
- // nvmlClient is used to get data from nvidia
- nvmlClient nvml.NvmlClient
-
- // initErr holds an error retrieved during
- // nvmlClient initialization
- initErr error
-
- // ignoredGPUIDs is a set of UUIDs that would not be exposed to nomad
- ignoredGPUIDs map[string]struct{}
-
- // fingerprintPeriod is how often we should call nvml to get list of devices
- fingerprintPeriod time.Duration
-
- // devices is the set of detected eligible devices
- devices map[string]struct{}
- deviceLock sync.RWMutex
-
- logger log.Logger
-}
-
-// NewNvidiaDevice returns a new nvidia device plugin.
-func NewNvidiaDevice(_ context.Context, log log.Logger) *NvidiaDevice {
- nvmlClient, err := nvml.NewNvmlClient()
- logger := log.Named(pluginName)
- if err != nil && err.Error() != nvml.UnavailableLib.Error() {
- logger.Error("unable to initialize Nvidia driver", "reason", err)
- }
- return &NvidiaDevice{
- logger: logger,
- devices: make(map[string]struct{}),
- ignoredGPUIDs: make(map[string]struct{}),
- nvmlClient: nvmlClient,
- initErr: err,
- }
-}
-
-// PluginInfo returns information describing the plugin.
-func (d *NvidiaDevice) PluginInfo() (*base.PluginInfoResponse, error) {
- return pluginInfo, nil
-}
-
-// ConfigSchema returns the plugins configuration schema.
-func (d *NvidiaDevice) ConfigSchema() (*hclspec.Spec, error) {
- return configSpec, nil
-}
-
-// SetConfig is used to set the configuration of the plugin.
-func (d *NvidiaDevice) SetConfig(cfg *base.Config) error {
- var config Config
- if len(cfg.PluginConfig) != 0 {
- if err := base.MsgPackDecode(cfg.PluginConfig, &config); err != nil {
- return err
- }
- }
-
- d.enabled = config.Enabled
-
- for _, ignoredGPUId := range config.IgnoredGPUIDs {
- d.ignoredGPUIDs[ignoredGPUId] = struct{}{}
- }
-
- period, err := time.ParseDuration(config.FingerprintPeriod)
- if err != nil {
- return fmt.Errorf("failed to parse fingerprint period %q: %v", config.FingerprintPeriod, err)
- }
- d.fingerprintPeriod = period
-
- return nil
-}
-
-// Fingerprint streams detected devices. If device changes are detected or the
-// devices health changes, messages will be emitted.
-func (d *NvidiaDevice) Fingerprint(ctx context.Context) (<-chan *device.FingerprintResponse, error) {
- if !d.enabled {
- return nil, device.ErrPluginDisabled
- }
-
- outCh := make(chan *device.FingerprintResponse)
- go d.fingerprint(ctx, outCh)
- return outCh, nil
-}
-
-type reservationError struct {
- notExistingIDs []string
-}
-
-func (e *reservationError) Error() string {
- return fmt.Sprintf("unknown device IDs: %s", strings.Join(e.notExistingIDs, ","))
-}
-
-// Reserve returns information on how to mount given devices.
-// Assumption is made that nomad server is responsible for correctness of
-// GPU allocations, handling tricky cases such as double-allocation of single GPU
-func (d *NvidiaDevice) Reserve(deviceIDs []string) (*device.ContainerReservation, error) {
- if len(deviceIDs) == 0 {
- return &device.ContainerReservation{}, nil
- }
- if !d.enabled {
- return nil, device.ErrPluginDisabled
- }
-
- // Due to the asynchronous nature of NvidiaPlugin, there is a possibility
- // of race condition
- //
- // Timeline:
- // 1 - fingerprint reports that GPU with id "1" is present
- // 2 - the following events happen at the same time:
- // a) server decides to allocate GPU with id "1"
- // b) fingerprint check reports that GPU with id "1" is no more present
- //
- // The latest and always valid version of fingerprinted ids are stored in
- // d.devices map. To avoid this race condition an error is returned if
- // any of provided deviceIDs is not found in d.devices map
- d.deviceLock.RLock()
- var notExistingIDs []string
- for _, id := range deviceIDs {
- if _, deviceIDExists := d.devices[id]; !deviceIDExists {
- notExistingIDs = append(notExistingIDs, id)
- }
- }
- d.deviceLock.RUnlock()
- if len(notExistingIDs) != 0 {
- return nil, &reservationError{notExistingIDs}
- }
-
- return &device.ContainerReservation{
- Envs: map[string]string{
- NvidiaVisibleDevices: strings.Join(deviceIDs, ","),
- },
- }, nil
-}
-
-// Stats streams statistics for the detected devices.
-func (d *NvidiaDevice) Stats(ctx context.Context, interval time.Duration) (<-chan *device.StatsResponse, error) {
- if !d.enabled {
- return nil, device.ErrPluginDisabled
- }
-
- outCh := make(chan *device.StatsResponse)
- go d.stats(ctx, outCh, interval)
- return outCh, nil
-}
diff --git a/devices/gpu/nvidia/device_test.go b/devices/gpu/nvidia/device_test.go
deleted file mode 100644
index a5ec354e2432..000000000000
--- a/devices/gpu/nvidia/device_test.go
+++ /dev/null
@@ -1,140 +0,0 @@
-package nvidia
-
-import (
- "testing"
-
- hclog "github.com/hashicorp/go-hclog"
- "github.com/hashicorp/nomad/devices/gpu/nvidia/nvml"
- "github.com/hashicorp/nomad/plugins/device"
- "github.com/stretchr/testify/require"
-)
-
-type MockNvmlClient struct {
- FingerprintError error
- FingerprintResponseReturned *nvml.FingerprintData
-
- StatsError error
- StatsResponseReturned []*nvml.StatsData
-}
-
-func (c *MockNvmlClient) GetFingerprintData() (*nvml.FingerprintData, error) {
- return c.FingerprintResponseReturned, c.FingerprintError
-}
-
-func (c *MockNvmlClient) GetStatsData() ([]*nvml.StatsData, error) {
- return c.StatsResponseReturned, c.StatsError
-}
-
-func TestReserve(t *testing.T) {
- cases := []struct {
- Name string
- ExpectedReservation *device.ContainerReservation
- ExpectedError error
- Device *NvidiaDevice
- RequestedIDs []string
- }{
- {
- Name: "All RequestedIDs are not managed by Device",
- ExpectedReservation: nil,
- ExpectedError: &reservationError{[]string{
- "UUID1",
- "UUID2",
- "UUID3",
- }},
- RequestedIDs: []string{
- "UUID1",
- "UUID2",
- "UUID3",
- },
- Device: &NvidiaDevice{
- logger: hclog.NewNullLogger(),
- enabled: true,
- },
- },
- {
- Name: "Some RequestedIDs are not managed by Device",
- ExpectedReservation: nil,
- ExpectedError: &reservationError{[]string{
- "UUID1",
- "UUID2",
- }},
- RequestedIDs: []string{
- "UUID1",
- "UUID2",
- "UUID3",
- },
- Device: &NvidiaDevice{
- devices: map[string]struct{}{
- "UUID3": {},
- },
- logger: hclog.NewNullLogger(),
- enabled: true,
- },
- },
- {
- Name: "All RequestedIDs are managed by Device",
- ExpectedReservation: &device.ContainerReservation{
- Envs: map[string]string{
- NvidiaVisibleDevices: "UUID1,UUID2,UUID3",
- },
- },
- ExpectedError: nil,
- RequestedIDs: []string{
- "UUID1",
- "UUID2",
- "UUID3",
- },
- Device: &NvidiaDevice{
- devices: map[string]struct{}{
- "UUID1": {},
- "UUID2": {},
- "UUID3": {},
- },
- logger: hclog.NewNullLogger(),
- enabled: true,
- },
- },
- {
- Name: "No IDs requested",
- ExpectedReservation: &device.ContainerReservation{},
- ExpectedError: nil,
- RequestedIDs: nil,
- Device: &NvidiaDevice{
- devices: map[string]struct{}{
- "UUID1": {},
- "UUID2": {},
- "UUID3": {},
- },
- logger: hclog.NewNullLogger(),
- enabled: true,
- },
- },
- {
- Name: "Device is disabled",
- ExpectedReservation: nil,
- ExpectedError: device.ErrPluginDisabled,
- RequestedIDs: []string{
- "UUID1",
- "UUID2",
- "UUID3",
- },
- Device: &NvidiaDevice{
- devices: map[string]struct{}{
- "UUID1": {},
- "UUID2": {},
- "UUID3": {},
- },
- logger: hclog.NewNullLogger(),
- enabled: false,
- },
- },
- }
-
- for _, c := range cases {
- t.Run(c.Name, func(t *testing.T) {
- actualReservation, actualError := c.Device.Reserve(c.RequestedIDs)
- require.Equal(t, c.ExpectedReservation, actualReservation)
- require.Equal(t, c.ExpectedError, actualError)
- })
- }
-}
diff --git a/devices/gpu/nvidia/fingerprint.go b/devices/gpu/nvidia/fingerprint.go
deleted file mode 100644
index 45bb34fa3355..000000000000
--- a/devices/gpu/nvidia/fingerprint.go
+++ /dev/null
@@ -1,229 +0,0 @@
-package nvidia
-
-import (
- "context"
- "time"
-
- "github.com/hashicorp/nomad/devices/gpu/nvidia/nvml"
- "github.com/hashicorp/nomad/helper"
- "github.com/hashicorp/nomad/plugins/device"
- "github.com/hashicorp/nomad/plugins/shared/structs"
-)
-
-const (
- // Attribute names and units for reporting Fingerprint output
- MemoryAttr = "memory"
- PowerAttr = "power"
- BAR1Attr = "bar1"
- DriverVersionAttr = "driver_version"
- CoresClockAttr = "cores_clock"
- MemoryClockAttr = "memory_clock"
- PCIBandwidthAttr = "pci_bandwidth"
- DisplayStateAttr = "display_state"
- PersistenceModeAttr = "persistence_mode"
-)
-
-// fingerprint is the long running goroutine that detects hardware
-func (d *NvidiaDevice) fingerprint(ctx context.Context, devices chan<- *device.FingerprintResponse) {
- defer close(devices)
-
- if d.initErr != nil {
- if d.initErr.Error() != nvml.UnavailableLib.Error() {
- d.logger.Error("exiting fingerprinting due to problems with NVML loading", "error", d.initErr)
- devices <- device.NewFingerprintError(d.initErr)
- }
-
- // Just close the channel to let server know that there are no working
- // Nvidia GPU units
- return
- }
-
- // Create a timer that will fire immediately for the first detection
- ticker := time.NewTimer(0)
-
- for {
- select {
- case <-ctx.Done():
- return
- case <-ticker.C:
- ticker.Reset(d.fingerprintPeriod)
- }
- d.writeFingerprintToChannel(devices)
- }
-}
-
-// writeFingerprintToChannel makes nvml call and writes response to channel
-func (d *NvidiaDevice) writeFingerprintToChannel(devices chan<- *device.FingerprintResponse) {
- fingerprintData, err := d.nvmlClient.GetFingerprintData()
- if err != nil {
- d.logger.Error("failed to get fingerprint nvidia devices", "error", err)
- devices <- device.NewFingerprintError(err)
- return
- }
-
- // ignore devices from fingerprint output
- fingerprintDevices := ignoreFingerprintedDevices(fingerprintData.Devices, d.ignoredGPUIDs)
- // check if any device health was updated or any device was added to host
- if !d.fingerprintChanged(fingerprintDevices) {
- return
- }
-
- commonAttributes := map[string]*structs.Attribute{
- DriverVersionAttr: {
- String: helper.StringToPtr(fingerprintData.DriverVersion),
- },
- }
-
- // Group all FingerprintDevices by DeviceName attribute
- deviceListByDeviceName := make(map[string][]*nvml.FingerprintDeviceData)
- for _, device := range fingerprintDevices {
- deviceName := device.DeviceName
- if deviceName == nil {
- // nvml driver was not able to detect device name. This kind
- // of devices are placed to single group with 'notAvailable' name
- notAvailableCopy := notAvailable
- deviceName = ¬AvailableCopy
- }
-
- deviceListByDeviceName[*deviceName] = append(deviceListByDeviceName[*deviceName], device)
- }
-
- // Build Fingerprint response with computed groups and send it over the channel
- deviceGroups := make([]*device.DeviceGroup, 0, len(deviceListByDeviceName))
- for groupName, devices := range deviceListByDeviceName {
- deviceGroups = append(deviceGroups, deviceGroupFromFingerprintData(groupName, devices, commonAttributes))
- }
- devices <- device.NewFingerprint(deviceGroups...)
-}
-
-// ignoreFingerprintedDevices excludes ignored devices from fingerprint output
-func ignoreFingerprintedDevices(deviceData []*nvml.FingerprintDeviceData, ignoredGPUIDs map[string]struct{}) []*nvml.FingerprintDeviceData {
- var result []*nvml.FingerprintDeviceData
- for _, fingerprintDevice := range deviceData {
- if _, ignored := ignoredGPUIDs[fingerprintDevice.UUID]; !ignored {
- result = append(result, fingerprintDevice)
- }
- }
- return result
-}
-
-// fingerprintChanged checks if there are any previously unseen nvidia devices located
-// or any of fingerprinted nvidia devices disappeared since the last fingerprint run.
-// Also, this func updates device map on NvidiaDevice with the latest data
-func (d *NvidiaDevice) fingerprintChanged(allDevices []*nvml.FingerprintDeviceData) bool {
- d.deviceLock.Lock()
- defer d.deviceLock.Unlock()
-
- changeDetected := false
- // check if every device in allDevices is in d.devices
- for _, device := range allDevices {
- if _, ok := d.devices[device.UUID]; !ok {
- changeDetected = true
- }
- }
-
- // check if every device in d.devices is in allDevices
- fingerprintDeviceMap := make(map[string]struct{})
- for _, device := range allDevices {
- fingerprintDeviceMap[device.UUID] = struct{}{}
- }
- for id := range d.devices {
- if _, ok := fingerprintDeviceMap[id]; !ok {
- changeDetected = true
- }
- }
-
- d.devices = fingerprintDeviceMap
- return changeDetected
-}
-
-// deviceGroupFromFingerprintData composes deviceGroup from FingerprintDeviceData slice
-func deviceGroupFromFingerprintData(groupName string, deviceList []*nvml.FingerprintDeviceData, commonAttributes map[string]*structs.Attribute) *device.DeviceGroup {
- // deviceGroup without devices makes no sense -> return nil when no devices are provided
- if len(deviceList) == 0 {
- return nil
- }
-
- devices := make([]*device.Device, len(deviceList))
- for index, dev := range deviceList {
- devices[index] = &device.Device{
- ID: dev.UUID,
- // all fingerprinted devices are "healthy" for now
- // to get real health data -> dcgm bindings should be used
- Healthy: true,
- HwLocality: &device.DeviceLocality{
- PciBusID: dev.PCIBusID,
- },
- }
- }
-
- deviceGroup := &device.DeviceGroup{
- Vendor: vendor,
- Type: deviceType,
- Name: groupName,
- Devices: devices,
- // Assumption made that devices with the same DeviceName have the same
- // attributes like amount of memory, power, bar1memory etc
- Attributes: attributesFromFingerprintDeviceData(deviceList[0]),
- }
-
- // Extend attribute map with common attributes
- for attributeKey, attributeValue := range commonAttributes {
- deviceGroup.Attributes[attributeKey] = attributeValue
- }
-
- return deviceGroup
-}
-
-// attributesFromFingerprintDeviceData converts nvml.FingerprintDeviceData
-// struct to device.DeviceGroup.Attributes format (map[string]string)
-// this function performs all nil checks for FingerprintDeviceData pointers
-func attributesFromFingerprintDeviceData(d *nvml.FingerprintDeviceData) map[string]*structs.Attribute {
- attrs := map[string]*structs.Attribute{
- DisplayStateAttr: {
- String: helper.StringToPtr(d.DisplayState),
- },
- PersistenceModeAttr: {
- String: helper.StringToPtr(d.PersistenceMode),
- },
- }
-
- if d.MemoryMiB != nil {
- attrs[MemoryAttr] = &structs.Attribute{
- Int: helper.Int64ToPtr(int64(*d.MemoryMiB)),
- Unit: structs.UnitMiB,
- }
- }
- if d.PowerW != nil {
- attrs[PowerAttr] = &structs.Attribute{
- Int: helper.Int64ToPtr(int64(*d.PowerW)),
- Unit: structs.UnitW,
- }
- }
- if d.BAR1MiB != nil {
- attrs[BAR1Attr] = &structs.Attribute{
- Int: helper.Int64ToPtr(int64(*d.BAR1MiB)),
- Unit: structs.UnitMiB,
- }
- }
- if d.CoresClockMHz != nil {
- attrs[CoresClockAttr] = &structs.Attribute{
- Int: helper.Int64ToPtr(int64(*d.CoresClockMHz)),
- Unit: structs.UnitMHz,
- }
- }
- if d.MemoryClockMHz != nil {
- attrs[MemoryClockAttr] = &structs.Attribute{
- Int: helper.Int64ToPtr(int64(*d.MemoryClockMHz)),
- Unit: structs.UnitMHz,
- }
- }
- if d.PCIBandwidthMBPerS != nil {
- attrs[PCIBandwidthAttr] = &structs.Attribute{
- Int: helper.Int64ToPtr(int64(*d.PCIBandwidthMBPerS)),
- Unit: structs.UnitMBPerS,
- }
- }
-
- return attrs
-}
diff --git a/devices/gpu/nvidia/fingerprint_test.go b/devices/gpu/nvidia/fingerprint_test.go
deleted file mode 100644
index c85b5c8c90a3..000000000000
--- a/devices/gpu/nvidia/fingerprint_test.go
+++ /dev/null
@@ -1,1361 +0,0 @@
-package nvidia
-
-import (
- "context"
- "errors"
- "sort"
- "testing"
-
- hclog "github.com/hashicorp/go-hclog"
- "github.com/hashicorp/nomad/devices/gpu/nvidia/nvml"
- "github.com/hashicorp/nomad/helper"
- "github.com/hashicorp/nomad/plugins/device"
- "github.com/hashicorp/nomad/plugins/shared/structs"
- "github.com/stretchr/testify/require"
-)
-
-func TestIgnoreFingerprintedDevices(t *testing.T) {
- for _, testCase := range []struct {
- Name string
- DeviceData []*nvml.FingerprintDeviceData
- IgnoredGPUIds map[string]struct{}
- ExpectedResult []*nvml.FingerprintDeviceData
- }{
- {
- Name: "Odd ignored",
- DeviceData: []*nvml.FingerprintDeviceData{
- {
- DeviceData: &nvml.DeviceData{
- DeviceName: helper.StringToPtr("DeviceName1"),
- UUID: "UUID1",
- MemoryMiB: helper.Uint64ToPtr(1000),
- },
- },
- {
- DeviceData: &nvml.DeviceData{
- DeviceName: helper.StringToPtr("DeviceName2"),
- UUID: "UUID2",
- MemoryMiB: helper.Uint64ToPtr(1000),
- },
- },
- {
- DeviceData: &nvml.DeviceData{
- DeviceName: helper.StringToPtr("DeviceName3"),
- UUID: "UUID3",
- MemoryMiB: helper.Uint64ToPtr(1000),
- },
- },
- },
- IgnoredGPUIds: map[string]struct{}{
- "UUID2": {},
- },
- ExpectedResult: []*nvml.FingerprintDeviceData{
- {
- DeviceData: &nvml.DeviceData{
- DeviceName: helper.StringToPtr("DeviceName1"),
- UUID: "UUID1",
- MemoryMiB: helper.Uint64ToPtr(1000),
- },
- },
- {
- DeviceData: &nvml.DeviceData{
- DeviceName: helper.StringToPtr("DeviceName3"),
- UUID: "UUID3",
- MemoryMiB: helper.Uint64ToPtr(1000),
- },
- },
- },
- },
- {
- Name: "Even ignored",
- DeviceData: []*nvml.FingerprintDeviceData{
- {
- DeviceData: &nvml.DeviceData{
- DeviceName: helper.StringToPtr("DeviceName1"),
- UUID: "UUID1",
- MemoryMiB: helper.Uint64ToPtr(1000),
- },
- },
- {
- DeviceData: &nvml.DeviceData{
- DeviceName: helper.StringToPtr("DeviceName2"),
- UUID: "UUID2",
- MemoryMiB: helper.Uint64ToPtr(1000),
- },
- },
- {
- DeviceData: &nvml.DeviceData{
- DeviceName: helper.StringToPtr("DeviceName3"),
- UUID: "UUID3",
- MemoryMiB: helper.Uint64ToPtr(1000),
- },
- },
- },
- IgnoredGPUIds: map[string]struct{}{
- "UUID1": {},
- "UUID3": {},
- },
- ExpectedResult: []*nvml.FingerprintDeviceData{
- {
- DeviceData: &nvml.DeviceData{
- DeviceName: helper.StringToPtr("DeviceName2"),
- UUID: "UUID2",
- MemoryMiB: helper.Uint64ToPtr(1000),
- },
- },
- },
- },
- {
- Name: "All ignored",
- DeviceData: []*nvml.FingerprintDeviceData{
- {
- DeviceData: &nvml.DeviceData{
- DeviceName: helper.StringToPtr("DeviceName1"),
- UUID: "UUID1",
- MemoryMiB: helper.Uint64ToPtr(1000),
- },
- },
- {
- DeviceData: &nvml.DeviceData{
- DeviceName: helper.StringToPtr("DeviceName2"),
- UUID: "UUID2",
- MemoryMiB: helper.Uint64ToPtr(1000),
- },
- },
- {
- DeviceData: &nvml.DeviceData{
- DeviceName: helper.StringToPtr("DeviceName3"),
- UUID: "UUID3",
- MemoryMiB: helper.Uint64ToPtr(1000),
- },
- },
- },
- IgnoredGPUIds: map[string]struct{}{
- "UUID1": {},
- "UUID2": {},
- "UUID3": {},
- },
- ExpectedResult: nil,
- },
- {
- Name: "No ignored",
- DeviceData: []*nvml.FingerprintDeviceData{
- {
- DeviceData: &nvml.DeviceData{
- DeviceName: helper.StringToPtr("DeviceName1"),
- UUID: "UUID1",
- MemoryMiB: helper.Uint64ToPtr(1000),
- },
- },
- {
- DeviceData: &nvml.DeviceData{
- DeviceName: helper.StringToPtr("DeviceName2"),
- UUID: "UUID2",
- MemoryMiB: helper.Uint64ToPtr(1000),
- },
- },
- {
- DeviceData: &nvml.DeviceData{
- DeviceName: helper.StringToPtr("DeviceName3"),
- UUID: "UUID3",
- MemoryMiB: helper.Uint64ToPtr(1000),
- },
- },
- },
- IgnoredGPUIds: map[string]struct{}{},
- ExpectedResult: []*nvml.FingerprintDeviceData{
- {
- DeviceData: &nvml.DeviceData{
- DeviceName: helper.StringToPtr("DeviceName1"),
- UUID: "UUID1",
- MemoryMiB: helper.Uint64ToPtr(1000),
- },
- },
- {
- DeviceData: &nvml.DeviceData{
- DeviceName: helper.StringToPtr("DeviceName2"),
- UUID: "UUID2",
- MemoryMiB: helper.Uint64ToPtr(1000),
- },
- },
- {
- DeviceData: &nvml.DeviceData{
- DeviceName: helper.StringToPtr("DeviceName3"),
- UUID: "UUID3",
- MemoryMiB: helper.Uint64ToPtr(1000),
- },
- },
- },
- },
- {
- Name: "No DeviceData provided",
- DeviceData: nil,
- IgnoredGPUIds: map[string]struct{}{
- "UUID1": {},
- "UUID2": {},
- "UUID3": {},
- },
- ExpectedResult: nil,
- },
- } {
- t.Run(testCase.Name, func(t *testing.T) {
- actualResult := ignoreFingerprintedDevices(testCase.DeviceData, testCase.IgnoredGPUIds)
- require.New(t).Equal(testCase.ExpectedResult, actualResult)
- })
- }
-}
-
-func TestCheckFingerprintUpdates(t *testing.T) {
- for _, testCase := range []struct {
- Name string
- Device *NvidiaDevice
- AllDevices []*nvml.FingerprintDeviceData
- DeviceMapAfterMethodCall map[string]struct{}
- ExpectedResult bool
- }{
- {
- Name: "No updates",
- Device: &NvidiaDevice{devices: map[string]struct{}{
- "1": {},
- "2": {},
- "3": {},
- }},
- AllDevices: []*nvml.FingerprintDeviceData{
- {
- DeviceData: &nvml.DeviceData{
- UUID: "1",
- },
- },
- {
- DeviceData: &nvml.DeviceData{
- UUID: "2",
- },
- },
- {
- DeviceData: &nvml.DeviceData{
- UUID: "3",
- },
- },
- },
- ExpectedResult: false,
- DeviceMapAfterMethodCall: map[string]struct{}{
- "1": {},
- "2": {},
- "3": {},
- },
- },
- {
- Name: "New Device Appeared",
- Device: &NvidiaDevice{devices: map[string]struct{}{
- "1": {},
- "2": {},
- "3": {},
- }},
- AllDevices: []*nvml.FingerprintDeviceData{
- {
- DeviceData: &nvml.DeviceData{
- UUID: "1",
- },
- },
- {
- DeviceData: &nvml.DeviceData{
- UUID: "2",
- },
- },
- {
- DeviceData: &nvml.DeviceData{
- UUID: "3",
- },
- },
- {
- DeviceData: &nvml.DeviceData{
- UUID: "I am new",
- },
- },
- },
- ExpectedResult: true,
- DeviceMapAfterMethodCall: map[string]struct{}{
- "1": {},
- "2": {},
- "3": {},
- "I am new": {},
- },
- },
- {
- Name: "Device disappeared",
- Device: &NvidiaDevice{devices: map[string]struct{}{
- "1": {},
- "2": {},
- "3": {},
- }},
- AllDevices: []*nvml.FingerprintDeviceData{
- {
- DeviceData: &nvml.DeviceData{
- UUID: "1",
- },
- },
- {
- DeviceData: &nvml.DeviceData{
- UUID: "2",
- },
- },
- },
- ExpectedResult: true,
- DeviceMapAfterMethodCall: map[string]struct{}{
- "1": {},
- "2": {},
- },
- },
- {
- Name: "No devices in NvidiaDevice map",
- Device: &NvidiaDevice{},
- AllDevices: []*nvml.FingerprintDeviceData{
- {
- DeviceData: &nvml.DeviceData{
- UUID: "1",
- },
- },
- {
- DeviceData: &nvml.DeviceData{
- UUID: "2",
- },
- },
- {
- DeviceData: &nvml.DeviceData{
- UUID: "3",
- },
- },
- },
- ExpectedResult: true,
- DeviceMapAfterMethodCall: map[string]struct{}{
- "1": {},
- "2": {},
- "3": {},
- },
- },
- {
- Name: "No devices detected",
- Device: &NvidiaDevice{devices: map[string]struct{}{
- "1": {},
- "2": {},
- "3": {},
- }},
- AllDevices: nil,
- ExpectedResult: true,
- DeviceMapAfterMethodCall: map[string]struct{}{},
- },
- } {
- t.Run(testCase.Name, func(t *testing.T) {
- actualResult := testCase.Device.fingerprintChanged(testCase.AllDevices)
- req := require.New(t)
- // check that function returns valid "updated / not updated" state
- req.Equal(testCase.ExpectedResult, actualResult)
- // check that function propely updates devices map
- req.Equal(testCase.Device.devices, testCase.DeviceMapAfterMethodCall)
- })
- }
-}
-
-func TestAttributesFromFingerprintDeviceData(t *testing.T) {
- for _, testCase := range []struct {
- Name string
- FingerprintDeviceData *nvml.FingerprintDeviceData
- ExpectedResult map[string]*structs.Attribute
- }{
- {
- Name: "All attributes are not nil",
- FingerprintDeviceData: &nvml.FingerprintDeviceData{
- DeviceData: &nvml.DeviceData{
- UUID: "1",
- DeviceName: helper.StringToPtr("Type1"),
- MemoryMiB: helper.Uint64ToPtr(256),
- PowerW: helper.UintToPtr(2),
- BAR1MiB: helper.Uint64ToPtr(256),
- },
- PCIBusID: "pciBusID1",
- PCIBandwidthMBPerS: helper.UintToPtr(1),
- CoresClockMHz: helper.UintToPtr(1),
- MemoryClockMHz: helper.UintToPtr(1),
- DisplayState: "Enabled",
- PersistenceMode: "Enabled",
- },
- ExpectedResult: map[string]*structs.Attribute{
- MemoryAttr: {
- Int: helper.Int64ToPtr(256),
- Unit: structs.UnitMiB,
- },
- PowerAttr: {
- Int: helper.Int64ToPtr(2),
- Unit: structs.UnitW,
- },
- BAR1Attr: {
- Int: helper.Int64ToPtr(256),
- Unit: structs.UnitMiB,
- },
- PCIBandwidthAttr: {
- Int: helper.Int64ToPtr(1),
- Unit: structs.UnitMBPerS,
- },
- CoresClockAttr: {
- Int: helper.Int64ToPtr(1),
- Unit: structs.UnitMHz,
- },
- MemoryClockAttr: {
- Int: helper.Int64ToPtr(1),
- Unit: structs.UnitMHz,
- },
- DisplayStateAttr: {
- String: helper.StringToPtr("Enabled"),
- },
- PersistenceModeAttr: {
- String: helper.StringToPtr("Enabled"),
- },
- },
- },
- {
- Name: "nil values are omitted",
- FingerprintDeviceData: &nvml.FingerprintDeviceData{
- DeviceData: &nvml.DeviceData{
- UUID: "1",
- DeviceName: helper.StringToPtr("Type1"),
- MemoryMiB: nil,
- PowerW: helper.UintToPtr(2),
- BAR1MiB: helper.Uint64ToPtr(256),
- },
- PCIBusID: "pciBusID1",
- DisplayState: "Enabled",
- PersistenceMode: "Enabled",
- },
- ExpectedResult: map[string]*structs.Attribute{
- PowerAttr: {
- Int: helper.Int64ToPtr(2),
- Unit: structs.UnitW,
- },
- BAR1Attr: {
- Int: helper.Int64ToPtr(256),
- Unit: structs.UnitMiB,
- },
- DisplayStateAttr: {
- String: helper.StringToPtr("Enabled"),
- },
- PersistenceModeAttr: {
- String: helper.StringToPtr("Enabled"),
- },
- },
- },
- } {
- t.Run(testCase.Name, func(t *testing.T) {
- actualResult := attributesFromFingerprintDeviceData(testCase.FingerprintDeviceData)
- require.Equal(t, testCase.ExpectedResult, actualResult)
- })
- }
-}
-
-func TestDeviceGroupFromFingerprintData(t *testing.T) {
- for _, testCase := range []struct {
- Name string
- GroupName string
- Devices []*nvml.FingerprintDeviceData
- CommonAttributes map[string]*structs.Attribute
- ExpectedResult *device.DeviceGroup
- }{
- {
- Name: "Devices are provided",
- GroupName: "Type1",
- Devices: []*nvml.FingerprintDeviceData{
- {
- DeviceData: &nvml.DeviceData{
- UUID: "1",
- DeviceName: helper.StringToPtr("Type1"),
- MemoryMiB: helper.Uint64ToPtr(100),
- PowerW: helper.UintToPtr(2),
- BAR1MiB: helper.Uint64ToPtr(256),
- },
- PCIBusID: "pciBusID1",
- PCIBandwidthMBPerS: helper.UintToPtr(1),
- CoresClockMHz: helper.UintToPtr(1),
- MemoryClockMHz: helper.UintToPtr(1),
- DisplayState: "Enabled",
- PersistenceMode: "Enabled",
- },
- {
- DeviceData: &nvml.DeviceData{
- UUID: "2",
- DeviceName: helper.StringToPtr("Type1"),
- MemoryMiB: helper.Uint64ToPtr(100),
- PowerW: helper.UintToPtr(2),
- BAR1MiB: helper.Uint64ToPtr(256),
- },
- PCIBusID: "pciBusID2",
- PCIBandwidthMBPerS: helper.UintToPtr(1),
- CoresClockMHz: helper.UintToPtr(1),
- MemoryClockMHz: helper.UintToPtr(1),
- DisplayState: "Enabled",
- PersistenceMode: "Enabled",
- },
- },
- ExpectedResult: &device.DeviceGroup{
- Vendor: vendor,
- Type: deviceType,
- Name: "Type1",
- Devices: []*device.Device{
- {
- ID: "1",
- Healthy: true,
- HwLocality: &device.DeviceLocality{
- PciBusID: "pciBusID1",
- },
- },
- {
- ID: "2",
- Healthy: true,
- HwLocality: &device.DeviceLocality{
- PciBusID: "pciBusID2",
- },
- },
- },
- Attributes: map[string]*structs.Attribute{
- MemoryAttr: {
- Int: helper.Int64ToPtr(100),
- Unit: structs.UnitMiB,
- },
- PowerAttr: {
- Int: helper.Int64ToPtr(2),
- Unit: structs.UnitW,
- },
- BAR1Attr: {
- Int: helper.Int64ToPtr(256),
- Unit: structs.UnitMiB,
- },
- PCIBandwidthAttr: {
- Int: helper.Int64ToPtr(1),
- Unit: structs.UnitMBPerS,
- },
- CoresClockAttr: {
- Int: helper.Int64ToPtr(1),
- Unit: structs.UnitMHz,
- },
- MemoryClockAttr: {
- Int: helper.Int64ToPtr(1),
- Unit: structs.UnitMHz,
- },
- DisplayStateAttr: {
- String: helper.StringToPtr("Enabled"),
- },
- PersistenceModeAttr: {
- String: helper.StringToPtr("Enabled"),
- },
- },
- },
- },
- {
- Name: "Devices and common attributes are provided",
- GroupName: "Type1",
- Devices: []*nvml.FingerprintDeviceData{
- {
- DeviceData: &nvml.DeviceData{
- UUID: "1",
- DeviceName: helper.StringToPtr("Type1"),
- MemoryMiB: helper.Uint64ToPtr(100),
- PowerW: helper.UintToPtr(2),
- BAR1MiB: helper.Uint64ToPtr(256),
- },
- PCIBusID: "pciBusID1",
- PCIBandwidthMBPerS: helper.UintToPtr(1),
- CoresClockMHz: helper.UintToPtr(1),
- MemoryClockMHz: helper.UintToPtr(1),
- DisplayState: "Enabled",
- PersistenceMode: "Enabled",
- },
- {
- DeviceData: &nvml.DeviceData{
- UUID: "2",
- DeviceName: helper.StringToPtr("Type1"),
- MemoryMiB: helper.Uint64ToPtr(100),
- PowerW: helper.UintToPtr(2),
- BAR1MiB: helper.Uint64ToPtr(256),
- },
- PCIBusID: "pciBusID2",
- PCIBandwidthMBPerS: helper.UintToPtr(1),
- CoresClockMHz: helper.UintToPtr(1),
- MemoryClockMHz: helper.UintToPtr(1),
- DisplayState: "Enabled",
- PersistenceMode: "Enabled",
- },
- },
- CommonAttributes: map[string]*structs.Attribute{
- DriverVersionAttr: {
- String: helper.StringToPtr("1"),
- },
- },
- ExpectedResult: &device.DeviceGroup{
- Vendor: vendor,
- Type: deviceType,
- Name: "Type1",
- Devices: []*device.Device{
- {
- ID: "1",
- Healthy: true,
- HwLocality: &device.DeviceLocality{
- PciBusID: "pciBusID1",
- },
- },
- {
- ID: "2",
- Healthy: true,
- HwLocality: &device.DeviceLocality{
- PciBusID: "pciBusID2",
- },
- },
- },
- Attributes: map[string]*structs.Attribute{
- MemoryAttr: {
- Int: helper.Int64ToPtr(100),
- Unit: structs.UnitMiB,
- },
- PowerAttr: {
- Int: helper.Int64ToPtr(2),
- Unit: structs.UnitW,
- },
- BAR1Attr: {
- Int: helper.Int64ToPtr(256),
- Unit: structs.UnitMiB,
- },
- PCIBandwidthAttr: {
- Int: helper.Int64ToPtr(1),
- Unit: structs.UnitMBPerS,
- },
- CoresClockAttr: {
- Int: helper.Int64ToPtr(1),
- Unit: structs.UnitMHz,
- },
- MemoryClockAttr: {
- Int: helper.Int64ToPtr(1),
- Unit: structs.UnitMHz,
- },
- DisplayStateAttr: {
- String: helper.StringToPtr("Enabled"),
- },
- PersistenceModeAttr: {
- String: helper.StringToPtr("Enabled"),
- },
- DriverVersionAttr: {
- String: helper.StringToPtr("1"),
- },
- },
- },
- },
- {
- Name: "Devices are not provided",
- GroupName: "Type1",
- CommonAttributes: map[string]*structs.Attribute{
- DriverVersionAttr: {
- String: helper.StringToPtr("1"),
- },
- },
- Devices: nil,
- ExpectedResult: nil,
- },
- } {
- t.Run(testCase.Name, func(t *testing.T) {
- actualResult := deviceGroupFromFingerprintData(testCase.GroupName, testCase.Devices, testCase.CommonAttributes)
- require.New(t).Equal(testCase.ExpectedResult, actualResult)
- })
- }
-}
-
-func TestWriteFingerprintToChannel(t *testing.T) {
- for _, testCase := range []struct {
- Name string
- Device *NvidiaDevice
- ExpectedWriteToChannel *device.FingerprintResponse
- }{
- {
- Name: "Check that FingerprintError is handled properly",
- Device: &NvidiaDevice{
- nvmlClient: &MockNvmlClient{
- FingerprintError: errors.New(""),
- },
- logger: hclog.NewNullLogger(),
- },
- ExpectedWriteToChannel: &device.FingerprintResponse{
- Error: errors.New(""),
- },
- },
- {
- Name: "Check ignore devices works correctly",
- Device: &NvidiaDevice{
- nvmlClient: &MockNvmlClient{
- FingerprintResponseReturned: &nvml.FingerprintData{
- DriverVersion: "1",
- Devices: []*nvml.FingerprintDeviceData{
- {
- DeviceData: &nvml.DeviceData{
- UUID: "1",
- DeviceName: helper.StringToPtr("Name"),
- MemoryMiB: helper.Uint64ToPtr(10),
- PowerW: helper.UintToPtr(100),
- BAR1MiB: helper.Uint64ToPtr(256),
- },
- PCIBusID: "pciBusID1",
- PCIBandwidthMBPerS: helper.UintToPtr(1),
- CoresClockMHz: helper.UintToPtr(1),
- MemoryClockMHz: helper.UintToPtr(1),
- DisplayState: "Enabled",
- PersistenceMode: "Enabled",
- },
- {
- DeviceData: &nvml.DeviceData{
- UUID: "2",
- DeviceName: helper.StringToPtr("Name"),
- MemoryMiB: helper.Uint64ToPtr(10),
- PowerW: helper.UintToPtr(100),
- BAR1MiB: helper.Uint64ToPtr(256),
- },
- PCIBusID: "pciBusID2",
- PCIBandwidthMBPerS: helper.UintToPtr(1),
- CoresClockMHz: helper.UintToPtr(1),
- MemoryClockMHz: helper.UintToPtr(1),
- DisplayState: "Enabled",
- PersistenceMode: "Enabled",
- },
- },
- },
- },
- ignoredGPUIDs: map[string]struct{}{
- "1": {},
- },
- logger: hclog.NewNullLogger(),
- },
- ExpectedWriteToChannel: &device.FingerprintResponse{
- Devices: []*device.DeviceGroup{
- {
- Vendor: vendor,
- Type: deviceType,
- Name: "Name",
- Devices: []*device.Device{
- {
- ID: "2",
- Healthy: true,
- HwLocality: &device.DeviceLocality{
- PciBusID: "pciBusID2",
- },
- },
- },
- Attributes: map[string]*structs.Attribute{
- MemoryAttr: {
- Int: helper.Int64ToPtr(10),
- Unit: structs.UnitMiB,
- },
- PowerAttr: {
- Int: helper.Int64ToPtr(100),
- Unit: structs.UnitW,
- },
- BAR1Attr: {
- Int: helper.Int64ToPtr(256),
- Unit: structs.UnitMiB,
- },
- PCIBandwidthAttr: {
- Int: helper.Int64ToPtr(1),
- Unit: structs.UnitMBPerS,
- },
- CoresClockAttr: {
- Int: helper.Int64ToPtr(1),
- Unit: structs.UnitMHz,
- },
- MemoryClockAttr: {
- Int: helper.Int64ToPtr(1),
- Unit: structs.UnitMHz,
- },
- DisplayStateAttr: {
- String: helper.StringToPtr("Enabled"),
- },
- PersistenceModeAttr: {
- String: helper.StringToPtr("Enabled"),
- },
- DriverVersionAttr: {
- String: helper.StringToPtr("1"),
- },
- },
- },
- },
- },
- },
- {
- Name: "Check devices are split to multiple device groups 1",
- Device: &NvidiaDevice{
- nvmlClient: &MockNvmlClient{
- FingerprintResponseReturned: &nvml.FingerprintData{
- DriverVersion: "1",
- Devices: []*nvml.FingerprintDeviceData{
- {
- DeviceData: &nvml.DeviceData{
- UUID: "1",
- DeviceName: helper.StringToPtr("Name1"),
- MemoryMiB: helper.Uint64ToPtr(10),
- PowerW: helper.UintToPtr(100),
- BAR1MiB: helper.Uint64ToPtr(256),
- },
- PCIBusID: "pciBusID1",
- PCIBandwidthMBPerS: helper.UintToPtr(1),
- CoresClockMHz: helper.UintToPtr(1),
- MemoryClockMHz: helper.UintToPtr(1),
- DisplayState: "Enabled",
- PersistenceMode: "Enabled",
- },
- {
- DeviceData: &nvml.DeviceData{
- UUID: "2",
- DeviceName: helper.StringToPtr("Name2"),
- MemoryMiB: helper.Uint64ToPtr(11),
- PowerW: helper.UintToPtr(100),
- BAR1MiB: helper.Uint64ToPtr(256),
- },
- PCIBusID: "pciBusID2",
- PCIBandwidthMBPerS: helper.UintToPtr(1),
- CoresClockMHz: helper.UintToPtr(1),
- MemoryClockMHz: helper.UintToPtr(1),
- DisplayState: "Enabled",
- PersistenceMode: "Enabled",
- },
- {
- DeviceData: &nvml.DeviceData{
- UUID: "3",
- DeviceName: helper.StringToPtr("Name3"),
- MemoryMiB: helper.Uint64ToPtr(12),
- PowerW: helper.UintToPtr(100),
- BAR1MiB: helper.Uint64ToPtr(256),
- },
- PCIBusID: "pciBusID3",
- PCIBandwidthMBPerS: helper.UintToPtr(1),
- CoresClockMHz: helper.UintToPtr(1),
- MemoryClockMHz: helper.UintToPtr(1),
- DisplayState: "Enabled",
- PersistenceMode: "Enabled",
- },
- },
- },
- },
- logger: hclog.NewNullLogger(),
- },
- ExpectedWriteToChannel: &device.FingerprintResponse{
- Devices: []*device.DeviceGroup{
- {
- Vendor: vendor,
- Type: deviceType,
- Name: "Name1",
- Devices: []*device.Device{
- {
- ID: "1",
- Healthy: true,
- HwLocality: &device.DeviceLocality{
- PciBusID: "pciBusID1",
- },
- },
- },
- Attributes: map[string]*structs.Attribute{
- MemoryAttr: {
- Int: helper.Int64ToPtr(10),
- Unit: structs.UnitMiB,
- },
- PowerAttr: {
- Int: helper.Int64ToPtr(100),
- Unit: structs.UnitW,
- },
- BAR1Attr: {
- Int: helper.Int64ToPtr(256),
- Unit: structs.UnitMiB,
- },
- PCIBandwidthAttr: {
- Int: helper.Int64ToPtr(1),
- Unit: structs.UnitMBPerS,
- },
- CoresClockAttr: {
- Int: helper.Int64ToPtr(1),
- Unit: structs.UnitMHz,
- },
- MemoryClockAttr: {
- Int: helper.Int64ToPtr(1),
- Unit: structs.UnitMHz,
- },
- DisplayStateAttr: {
- String: helper.StringToPtr("Enabled"),
- },
- PersistenceModeAttr: {
- String: helper.StringToPtr("Enabled"),
- },
- DriverVersionAttr: {
- String: helper.StringToPtr("1"),
- },
- },
- },
- {
- Vendor: vendor,
- Type: deviceType,
- Name: "Name2",
- Devices: []*device.Device{
- {
- ID: "2",
- Healthy: true,
- HwLocality: &device.DeviceLocality{
- PciBusID: "pciBusID2",
- },
- },
- },
- Attributes: map[string]*structs.Attribute{
- MemoryAttr: {
- Int: helper.Int64ToPtr(11),
- Unit: structs.UnitMiB,
- },
- PowerAttr: {
- Int: helper.Int64ToPtr(100),
- Unit: structs.UnitW,
- },
- BAR1Attr: {
- Int: helper.Int64ToPtr(256),
- Unit: structs.UnitMiB,
- },
- PCIBandwidthAttr: {
- Int: helper.Int64ToPtr(1),
- Unit: structs.UnitMBPerS,
- },
- CoresClockAttr: {
- Int: helper.Int64ToPtr(1),
- Unit: structs.UnitMHz,
- },
- MemoryClockAttr: {
- Int: helper.Int64ToPtr(1),
- Unit: structs.UnitMHz,
- },
- DisplayStateAttr: {
- String: helper.StringToPtr("Enabled"),
- },
- PersistenceModeAttr: {
- String: helper.StringToPtr("Enabled"),
- },
- DriverVersionAttr: {
- String: helper.StringToPtr("1"),
- },
- },
- },
- {
- Vendor: vendor,
- Type: deviceType,
- Name: "Name3",
- Devices: []*device.Device{
- {
- ID: "3",
- Healthy: true,
- HwLocality: &device.DeviceLocality{
- PciBusID: "pciBusID3",
- },
- },
- },
- Attributes: map[string]*structs.Attribute{
- MemoryAttr: {
- Int: helper.Int64ToPtr(12),
- Unit: structs.UnitMiB,
- },
- PowerAttr: {
- Int: helper.Int64ToPtr(100),
- Unit: structs.UnitW,
- },
- BAR1Attr: {
- Int: helper.Int64ToPtr(256),
- Unit: structs.UnitMiB,
- },
- PCIBandwidthAttr: {
- Int: helper.Int64ToPtr(1),
- Unit: structs.UnitMBPerS,
- },
- CoresClockAttr: {
- Int: helper.Int64ToPtr(1),
- Unit: structs.UnitMHz,
- },
- MemoryClockAttr: {
- Int: helper.Int64ToPtr(1),
- Unit: structs.UnitMHz,
- },
- DisplayStateAttr: {
- String: helper.StringToPtr("Enabled"),
- },
- PersistenceModeAttr: {
- String: helper.StringToPtr("Enabled"),
- },
- DriverVersionAttr: {
- String: helper.StringToPtr("1"),
- },
- },
- },
- },
- },
- },
- {
- Name: "Check devices are split to multiple device groups 2",
- Device: &NvidiaDevice{
- nvmlClient: &MockNvmlClient{
- FingerprintResponseReturned: &nvml.FingerprintData{
- DriverVersion: "1",
- Devices: []*nvml.FingerprintDeviceData{
- {
- DeviceData: &nvml.DeviceData{
- UUID: "1",
- DeviceName: helper.StringToPtr("Name1"),
- MemoryMiB: helper.Uint64ToPtr(10),
- PowerW: helper.UintToPtr(100),
- BAR1MiB: helper.Uint64ToPtr(256),
- },
- PCIBusID: "pciBusID1",
- PCIBandwidthMBPerS: helper.UintToPtr(1),
- CoresClockMHz: helper.UintToPtr(1),
- MemoryClockMHz: helper.UintToPtr(1),
- DisplayState: "Enabled",
- PersistenceMode: "Enabled",
- },
- {
- DeviceData: &nvml.DeviceData{
- UUID: "2",
- DeviceName: helper.StringToPtr("Name2"),
- MemoryMiB: helper.Uint64ToPtr(11),
- PowerW: helper.UintToPtr(100),
- BAR1MiB: helper.Uint64ToPtr(256),
- },
- PCIBusID: "pciBusID2",
- PCIBandwidthMBPerS: helper.UintToPtr(1),
- CoresClockMHz: helper.UintToPtr(1),
- MemoryClockMHz: helper.UintToPtr(1),
- DisplayState: "Enabled",
- PersistenceMode: "Enabled",
- },
- {
- DeviceData: &nvml.DeviceData{
- UUID: "3",
- DeviceName: helper.StringToPtr("Name2"),
- MemoryMiB: helper.Uint64ToPtr(12),
- PowerW: helper.UintToPtr(100),
- BAR1MiB: helper.Uint64ToPtr(256),
- },
- PCIBusID: "pciBusID3",
- PCIBandwidthMBPerS: helper.UintToPtr(1),
- CoresClockMHz: helper.UintToPtr(1),
- MemoryClockMHz: helper.UintToPtr(1),
- DisplayState: "Enabled",
- PersistenceMode: "Enabled",
- },
- },
- },
- },
- logger: hclog.NewNullLogger(),
- },
- ExpectedWriteToChannel: &device.FingerprintResponse{
- Devices: []*device.DeviceGroup{
- {
- Vendor: vendor,
- Type: deviceType,
- Name: "Name1",
- Devices: []*device.Device{
- {
- ID: "1",
- Healthy: true,
- HwLocality: &device.DeviceLocality{
- PciBusID: "pciBusID1",
- },
- },
- },
- Attributes: map[string]*structs.Attribute{
- MemoryAttr: {
- Int: helper.Int64ToPtr(10),
- Unit: structs.UnitMiB,
- },
- PowerAttr: {
- Int: helper.Int64ToPtr(100),
- Unit: structs.UnitW,
- },
- BAR1Attr: {
- Int: helper.Int64ToPtr(256),
- Unit: structs.UnitMiB,
- },
- PCIBandwidthAttr: {
- Int: helper.Int64ToPtr(1),
- Unit: structs.UnitMBPerS,
- },
- CoresClockAttr: {
- Int: helper.Int64ToPtr(1),
- Unit: structs.UnitMHz,
- },
- MemoryClockAttr: {
- Int: helper.Int64ToPtr(1),
- Unit: structs.UnitMHz,
- },
- DisplayStateAttr: {
- String: helper.StringToPtr("Enabled"),
- },
- PersistenceModeAttr: {
- String: helper.StringToPtr("Enabled"),
- },
- DriverVersionAttr: {
- String: helper.StringToPtr("1"),
- },
- },
- },
- {
- Vendor: vendor,
- Type: deviceType,
- Name: "Name2",
- Devices: []*device.Device{
- {
- ID: "2",
- Healthy: true,
- HwLocality: &device.DeviceLocality{
- PciBusID: "pciBusID2",
- },
- },
- {
- ID: "3",
- Healthy: true,
- HwLocality: &device.DeviceLocality{
- PciBusID: "pciBusID3",
- },
- },
- },
- Attributes: map[string]*structs.Attribute{
- MemoryAttr: {
- Int: helper.Int64ToPtr(11),
- Unit: structs.UnitMiB,
- },
- PowerAttr: {
- Int: helper.Int64ToPtr(100),
- Unit: structs.UnitW,
- },
- BAR1Attr: {
- Int: helper.Int64ToPtr(256),
- Unit: structs.UnitMiB,
- },
- PCIBandwidthAttr: {
- Int: helper.Int64ToPtr(1),
- Unit: structs.UnitMBPerS,
- },
- CoresClockAttr: {
- Int: helper.Int64ToPtr(1),
- Unit: structs.UnitMHz,
- },
- MemoryClockAttr: {
- Int: helper.Int64ToPtr(1),
- Unit: structs.UnitMHz,
- },
- DisplayStateAttr: {
- String: helper.StringToPtr("Enabled"),
- },
- PersistenceModeAttr: {
- String: helper.StringToPtr("Enabled"),
- },
- DriverVersionAttr: {
- String: helper.StringToPtr("1"),
- },
- },
- },
- },
- },
- },
- } {
- t.Run(testCase.Name, func(t *testing.T) {
- channel := make(chan *device.FingerprintResponse, 1)
- testCase.Device.writeFingerprintToChannel(channel)
- actualResult := <-channel
- // writeFingerprintToChannel iterates over map keys
- // and insterts results to an array, so order of elements in output array
- // may be different
- // actualResult, expectedResult arrays has to be sorted firsted
- sort.Slice(actualResult.Devices, func(i, j int) bool {
- return actualResult.Devices[i].Name < actualResult.Devices[j].Name
- })
- sort.Slice(testCase.ExpectedWriteToChannel.Devices, func(i, j int) bool {
- return testCase.ExpectedWriteToChannel.Devices[i].Name < testCase.ExpectedWriteToChannel.Devices[j].Name
- })
- require.Equal(t, testCase.ExpectedWriteToChannel, actualResult)
- })
- }
-}
-
-// Test if nonworking driver returns empty fingerprint data
-func TestFingerprint(t *testing.T) {
- for _, testCase := range []struct {
- Name string
- Device *NvidiaDevice
- ExpectedWriteToChannel *device.FingerprintResponse
- }{
- {
- Name: "Check that working driver returns valid fingeprint data",
- Device: &NvidiaDevice{
- initErr: nil,
- nvmlClient: &MockNvmlClient{
- FingerprintResponseReturned: &nvml.FingerprintData{
- DriverVersion: "1",
- Devices: []*nvml.FingerprintDeviceData{
- {
- DeviceData: &nvml.DeviceData{
- UUID: "1",
- DeviceName: helper.StringToPtr("Name1"),
- MemoryMiB: helper.Uint64ToPtr(10),
- PowerW: helper.UintToPtr(100),
- BAR1MiB: helper.Uint64ToPtr(256),
- },
- PCIBusID: "pciBusID1",
- PCIBandwidthMBPerS: helper.UintToPtr(1),
- CoresClockMHz: helper.UintToPtr(1),
- MemoryClockMHz: helper.UintToPtr(1),
- DisplayState: "Enabled",
- PersistenceMode: "Enabled",
- },
- {
- DeviceData: &nvml.DeviceData{
- UUID: "2",
- DeviceName: helper.StringToPtr("Name1"),
- MemoryMiB: helper.Uint64ToPtr(10),
- PowerW: helper.UintToPtr(100),
- BAR1MiB: helper.Uint64ToPtr(256),
- },
- PCIBusID: "pciBusID2",
- PCIBandwidthMBPerS: helper.UintToPtr(1),
- CoresClockMHz: helper.UintToPtr(1),
- MemoryClockMHz: helper.UintToPtr(1),
- DisplayState: "Enabled",
- PersistenceMode: "Enabled",
- },
- {
- DeviceData: &nvml.DeviceData{
- UUID: "3",
- DeviceName: helper.StringToPtr("Name1"),
- MemoryMiB: helper.Uint64ToPtr(10),
- PowerW: helper.UintToPtr(100),
- BAR1MiB: helper.Uint64ToPtr(256),
- },
- PCIBusID: "pciBusID3",
- PCIBandwidthMBPerS: helper.UintToPtr(1),
- CoresClockMHz: helper.UintToPtr(1),
- MemoryClockMHz: helper.UintToPtr(1),
- DisplayState: "Enabled",
- PersistenceMode: "Enabled",
- },
- },
- },
- },
- logger: hclog.NewNullLogger(),
- },
- ExpectedWriteToChannel: &device.FingerprintResponse{
- Devices: []*device.DeviceGroup{
- {
- Vendor: vendor,
- Type: deviceType,
- Name: "Name1",
- Devices: []*device.Device{
- {
- ID: "1",
- Healthy: true,
- HwLocality: &device.DeviceLocality{
- PciBusID: "pciBusID1",
- },
- },
- {
- ID: "2",
- Healthy: true,
- HwLocality: &device.DeviceLocality{
- PciBusID: "pciBusID2",
- },
- },
- {
- ID: "3",
- Healthy: true,
- HwLocality: &device.DeviceLocality{
- PciBusID: "pciBusID3",
- },
- },
- },
- Attributes: map[string]*structs.Attribute{
- MemoryAttr: {
- Int: helper.Int64ToPtr(10),
- Unit: structs.UnitMiB,
- },
- PowerAttr: {
- Int: helper.Int64ToPtr(100),
- Unit: structs.UnitW,
- },
- BAR1Attr: {
- Int: helper.Int64ToPtr(256),
- Unit: structs.UnitMiB,
- },
- PCIBandwidthAttr: {
- Int: helper.Int64ToPtr(1),
- Unit: structs.UnitMBPerS,
- },
- CoresClockAttr: {
- Int: helper.Int64ToPtr(1),
- Unit: structs.UnitMHz,
- },
- MemoryClockAttr: {
- Int: helper.Int64ToPtr(1),
- Unit: structs.UnitMHz,
- },
- DisplayStateAttr: {
- String: helper.StringToPtr("Enabled"),
- },
- PersistenceModeAttr: {
- String: helper.StringToPtr("Enabled"),
- },
- DriverVersionAttr: {
- String: helper.StringToPtr("1"),
- },
- },
- },
- },
- },
- },
- {
- Name: "Check that not working driver returns error fingeprint data",
- Device: &NvidiaDevice{
- initErr: errors.New("foo"),
- nvmlClient: &MockNvmlClient{
- FingerprintResponseReturned: &nvml.FingerprintData{
- DriverVersion: "1",
- Devices: []*nvml.FingerprintDeviceData{
- {
- DeviceData: &nvml.DeviceData{
- UUID: "1",
- DeviceName: helper.StringToPtr("Name1"),
- MemoryMiB: helper.Uint64ToPtr(10),
- },
- },
- {
- DeviceData: &nvml.DeviceData{
- UUID: "2",
- DeviceName: helper.StringToPtr("Name1"),
- MemoryMiB: helper.Uint64ToPtr(10),
- },
- },
- {
- DeviceData: &nvml.DeviceData{
- UUID: "3",
- DeviceName: helper.StringToPtr("Name1"),
- MemoryMiB: helper.Uint64ToPtr(10),
- },
- },
- },
- },
- },
- logger: hclog.NewNullLogger(),
- },
- ExpectedWriteToChannel: &device.FingerprintResponse{
- Error: errors.New("foo"),
- },
- },
- } {
- t.Run(testCase.Name, func(t *testing.T) {
- outCh := make(chan *device.FingerprintResponse)
- ctx, cancel := context.WithCancel(context.Background())
- go testCase.Device.fingerprint(ctx, outCh)
- result := <-outCh
- cancel()
- require.New(t).Equal(result, testCase.ExpectedWriteToChannel)
- })
- }
-}
diff --git a/devices/gpu/nvidia/nvml/client.go b/devices/gpu/nvidia/nvml/client.go
deleted file mode 100644
index d18dcbe1a9f6..000000000000
--- a/devices/gpu/nvidia/nvml/client.go
+++ /dev/null
@@ -1,194 +0,0 @@
-package nvml
-
-import (
- "fmt"
-)
-
-// DeviceData represents common fields for Nvidia device
-type DeviceData struct {
- UUID string
- DeviceName *string
- MemoryMiB *uint64
- PowerW *uint
- BAR1MiB *uint64
-}
-
-// FingerprintDeviceData is a superset of DeviceData
-// it describes device specific fields returned from
-// nvml queries during fingerprinting call
-type FingerprintDeviceData struct {
- *DeviceData
- PCIBandwidthMBPerS *uint
- CoresClockMHz *uint
- MemoryClockMHz *uint
- DisplayState string
- PersistenceMode string
- PCIBusID string
-}
-
-// FingerprintData represets attributes of driver/devices
-type FingerprintData struct {
- Devices []*FingerprintDeviceData
- DriverVersion string
-}
-
-// StatsData is a superset of DeviceData
-// it represents statistics data returned for every Nvidia device
-type StatsData struct {
- *DeviceData
- PowerUsageW *uint
- GPUUtilization *uint
- MemoryUtilization *uint
- EncoderUtilization *uint
- DecoderUtilization *uint
- TemperatureC *uint
- UsedMemoryMiB *uint64
- BAR1UsedMiB *uint64
- ECCErrorsL1Cache *uint64
- ECCErrorsL2Cache *uint64
- ECCErrorsDevice *uint64
-}
-
-// NvmlClient describes how users would use nvml library
-type NvmlClient interface {
- GetFingerprintData() (*FingerprintData, error)
- GetStatsData() ([]*StatsData, error)
-}
-
-// nvmlClient implements NvmlClient
-// Users of this lib are expected to use this struct via NewNvmlClient func
-type nvmlClient struct {
- driver NvmlDriver
-}
-
-// NewNvmlClient function creates new nvmlClient with real
-// NvmlDriver implementation. Also, this func initializes NvmlDriver
-func NewNvmlClient() (*nvmlClient, error) {
- driver := &nvmlDriver{}
- err := driver.Initialize()
- if err != nil {
- return nil, err
- }
- return &nvmlClient{
- driver: driver,
- }, nil
-}
-
-// GetFingerprintData returns FingerprintData for available Nvidia devices
-func (c *nvmlClient) GetFingerprintData() (*FingerprintData, error) {
- /*
- nvml fields to be fingerprinted # nvml_library_call
- 1 - Driver Version # nvmlSystemGetDriverVersion
- 2 - Product Name # nvmlDeviceGetName
- 3 - GPU UUID # nvmlDeviceGetUUID
- 4 - Total Memory # nvmlDeviceGetMemoryInfo
- 5 - Power # nvmlDeviceGetPowerManagementLimit
- 6 - PCIBusID # nvmlDeviceGetPciInfo
- 7 - BAR1 Memory # nvmlDeviceGetBAR1MemoryInfo(
- 8 - PCI Bandwidth
- 9 - Memory, Cores Clock # nvmlDeviceGetMaxClockInfo
- 10 - Display Mode # nvmlDeviceGetDisplayMode
- 11 - Persistence Mode # nvmlDeviceGetPersistenceMode
- */
-
- // Assumed that this method is called with receiver retrieved from
- // NewNvmlClient
- // because this method handles initialization of NVML library
-
- driverVersion, err := c.driver.SystemDriverVersion()
- if err != nil {
- return nil, fmt.Errorf("nvidia nvml SystemDriverVersion() error: %v\n", err)
- }
-
- numDevices, err := c.driver.DeviceCount()
- if err != nil {
- return nil, fmt.Errorf("nvidia nvml DeviceCount() error: %v\n", err)
- }
-
- allNvidiaGPUResources := make([]*FingerprintDeviceData, numDevices)
-
- for i := 0; i < int(numDevices); i++ {
- deviceInfo, err := c.driver.DeviceInfoByIndex(uint(i))
- if err != nil {
- return nil, fmt.Errorf("nvidia nvml DeviceInfoByIndex() error: %v\n", err)
- }
-
- allNvidiaGPUResources[i] = &FingerprintDeviceData{
- DeviceData: &DeviceData{
- DeviceName: deviceInfo.Name,
- UUID: deviceInfo.UUID,
- MemoryMiB: deviceInfo.MemoryMiB,
- PowerW: deviceInfo.PowerW,
- BAR1MiB: deviceInfo.BAR1MiB,
- },
- PCIBandwidthMBPerS: deviceInfo.PCIBandwidthMBPerS,
- CoresClockMHz: deviceInfo.CoresClockMHz,
- MemoryClockMHz: deviceInfo.MemoryClockMHz,
- DisplayState: deviceInfo.DisplayState,
- PersistenceMode: deviceInfo.PersistenceMode,
- PCIBusID: deviceInfo.PCIBusID,
- }
- }
- return &FingerprintData{
- Devices: allNvidiaGPUResources,
- DriverVersion: driverVersion,
- }, nil
-}
-
-// GetStatsData returns statistics data for all devices on this machine
-func (c *nvmlClient) GetStatsData() ([]*StatsData, error) {
- /*
- nvml fields to be reported to stats api # nvml_library_call
- 1 - Used Memory # nvmlDeviceGetMemoryInfo
- 2 - Utilization of GPU # nvmlDeviceGetUtilizationRates
- 3 - Utilization of Memory # nvmlDeviceGetUtilizationRates
- 4 - Utilization of Decoder # nvmlDeviceGetDecoderUtilization
- 5 - Utilization of Encoder # nvmlDeviceGetEncoderUtilization
- 6 - Current GPU Temperature # nvmlDeviceGetTemperature
- 7 - Power Draw # nvmlDeviceGetPowerUsage
- 8 - BAR1 Used memory # nvmlDeviceGetBAR1MemoryInfo
- 9 - ECC Errors on requesting L1Cache # nvmlDeviceGetMemoryErrorCounter
- 10 - ECC Errors on requesting L2Cache # nvmlDeviceGetMemoryErrorCounter
- 11 - ECC Errors on requesting Device memory # nvmlDeviceGetMemoryErrorCounter
- */
-
- // Assumed that this method is called with receiver retrieved from
- // NewNvmlClient
- // because this method handles initialization of NVML library
-
- numDevices, err := c.driver.DeviceCount()
- if err != nil {
- return nil, fmt.Errorf("nvidia nvml DeviceCount() error: %v\n", err)
- }
-
- allNvidiaGPUStats := make([]*StatsData, numDevices)
-
- for i := 0; i < int(numDevices); i++ {
- deviceInfo, deviceStatus, err := c.driver.DeviceInfoAndStatusByIndex(uint(i))
- if err != nil {
- return nil, fmt.Errorf("nvidia nvml DeviceInfoAndStatusByIndex() error: %v\n", err)
- }
-
- allNvidiaGPUStats[i] = &StatsData{
- DeviceData: &DeviceData{
- DeviceName: deviceInfo.Name,
- UUID: deviceInfo.UUID,
- MemoryMiB: deviceInfo.MemoryMiB,
- PowerW: deviceInfo.PowerW,
- BAR1MiB: deviceInfo.BAR1MiB,
- },
- PowerUsageW: deviceStatus.PowerUsageW,
- GPUUtilization: deviceStatus.GPUUtilization,
- MemoryUtilization: deviceStatus.MemoryUtilization,
- EncoderUtilization: deviceStatus.EncoderUtilization,
- DecoderUtilization: deviceStatus.DecoderUtilization,
- TemperatureC: deviceStatus.TemperatureC,
- UsedMemoryMiB: deviceStatus.UsedMemoryMiB,
- BAR1UsedMiB: deviceStatus.BAR1UsedMiB,
- ECCErrorsL1Cache: deviceStatus.ECCErrorsL1Cache,
- ECCErrorsL2Cache: deviceStatus.ECCErrorsL2Cache,
- ECCErrorsDevice: deviceStatus.ECCErrorsDevice,
- }
- }
- return allNvidiaGPUStats, nil
-}
diff --git a/devices/gpu/nvidia/nvml/client_test.go b/devices/gpu/nvidia/nvml/client_test.go
deleted file mode 100644
index 23731f7b052e..000000000000
--- a/devices/gpu/nvidia/nvml/client_test.go
+++ /dev/null
@@ -1,399 +0,0 @@
-package nvml
-
-import (
- "errors"
- "testing"
-
- "github.com/hashicorp/nomad/helper"
- "github.com/stretchr/testify/require"
-)
-
-type MockNVMLDriver struct {
- systemDriverCallSuccessful bool
- deviceCountCallSuccessful bool
- deviceInfoByIndexCallSuccessful bool
- deviceInfoAndStatusByIndexCallSuccessful bool
- driverVersion string
- devices []*DeviceInfo
- deviceStatus []*DeviceStatus
-}
-
-func (m *MockNVMLDriver) Initialize() error {
- return nil
-}
-
-func (m *MockNVMLDriver) Shutdown() error {
- return nil
-}
-
-func (m *MockNVMLDriver) SystemDriverVersion() (string, error) {
- if !m.systemDriverCallSuccessful {
- return "", errors.New("failed to get system driver")
- }
- return m.driverVersion, nil
-}
-
-func (m *MockNVMLDriver) DeviceCount() (uint, error) {
- if !m.deviceCountCallSuccessful {
- return 0, errors.New("failed to get device length")
- }
- return uint(len(m.devices)), nil
-}
-
-func (m *MockNVMLDriver) DeviceInfoByIndex(index uint) (*DeviceInfo, error) {
- if index >= uint(len(m.devices)) {
- return nil, errors.New("index is out of range")
- }
- if !m.deviceInfoByIndexCallSuccessful {
- return nil, errors.New("failed to get device info by index")
- }
- return m.devices[index], nil
-}
-
-func (m *MockNVMLDriver) DeviceInfoAndStatusByIndex(index uint) (*DeviceInfo, *DeviceStatus, error) {
- if index >= uint(len(m.devices)) || index >= uint(len(m.deviceStatus)) {
- return nil, nil, errors.New("index is out of range")
- }
- if !m.deviceInfoAndStatusByIndexCallSuccessful {
- return nil, nil, errors.New("failed to get device info and status by index")
- }
- return m.devices[index], m.deviceStatus[index], nil
-}
-
-func TestGetFingerprintDataFromNVML(t *testing.T) {
- for _, testCase := range []struct {
- Name string
- DriverConfiguration *MockNVMLDriver
- ExpectedError bool
- ExpectedResult *FingerprintData
- }{
- {
- Name: "fail on systemDriverCallSuccessful",
- ExpectedError: true,
- ExpectedResult: nil,
- DriverConfiguration: &MockNVMLDriver{
- systemDriverCallSuccessful: false,
- deviceCountCallSuccessful: true,
- deviceInfoByIndexCallSuccessful: true,
- },
- },
- {
- Name: "fail on deviceCountCallSuccessful",
- ExpectedError: true,
- ExpectedResult: nil,
- DriverConfiguration: &MockNVMLDriver{
- systemDriverCallSuccessful: true,
- deviceCountCallSuccessful: false,
- deviceInfoByIndexCallSuccessful: true,
- },
- },
- {
- Name: "fail on deviceInfoByIndexCall",
- ExpectedError: true,
- ExpectedResult: nil,
- DriverConfiguration: &MockNVMLDriver{
- systemDriverCallSuccessful: true,
- deviceCountCallSuccessful: true,
- deviceInfoByIndexCallSuccessful: false,
- devices: []*DeviceInfo{
- {
- UUID: "UUID1",
- Name: helper.StringToPtr("ModelName1"),
- MemoryMiB: helper.Uint64ToPtr(16),
- PCIBusID: "busId",
- PowerW: helper.UintToPtr(100),
- BAR1MiB: helper.Uint64ToPtr(100),
- PCIBandwidthMBPerS: helper.UintToPtr(100),
- CoresClockMHz: helper.UintToPtr(100),
- MemoryClockMHz: helper.UintToPtr(100),
- }, {
- UUID: "UUID2",
- Name: helper.StringToPtr("ModelName2"),
- MemoryMiB: helper.Uint64ToPtr(8),
- PCIBusID: "busId",
- PowerW: helper.UintToPtr(100),
- BAR1MiB: helper.Uint64ToPtr(100),
- PCIBandwidthMBPerS: helper.UintToPtr(100),
- CoresClockMHz: helper.UintToPtr(100),
- MemoryClockMHz: helper.UintToPtr(100),
- },
- },
- },
- },
- {
- Name: "successful outcome",
- ExpectedError: false,
- ExpectedResult: &FingerprintData{
- DriverVersion: "driverVersion",
- Devices: []*FingerprintDeviceData{
- {
- DeviceData: &DeviceData{
- DeviceName: helper.StringToPtr("ModelName1"),
- UUID: "UUID1",
- MemoryMiB: helper.Uint64ToPtr(16),
- PowerW: helper.UintToPtr(100),
- BAR1MiB: helper.Uint64ToPtr(100),
- },
- PCIBusID: "busId1",
- PCIBandwidthMBPerS: helper.UintToPtr(100),
- CoresClockMHz: helper.UintToPtr(100),
- MemoryClockMHz: helper.UintToPtr(100),
- DisplayState: "Enabled",
- PersistenceMode: "Enabled",
- }, {
- DeviceData: &DeviceData{
- DeviceName: helper.StringToPtr("ModelName2"),
- UUID: "UUID2",
- MemoryMiB: helper.Uint64ToPtr(8),
- PowerW: helper.UintToPtr(200),
- BAR1MiB: helper.Uint64ToPtr(200),
- },
- PCIBusID: "busId2",
- PCIBandwidthMBPerS: helper.UintToPtr(200),
- CoresClockMHz: helper.UintToPtr(200),
- MemoryClockMHz: helper.UintToPtr(200),
- DisplayState: "Enabled",
- PersistenceMode: "Enabled",
- },
- },
- },
- DriverConfiguration: &MockNVMLDriver{
- systemDriverCallSuccessful: true,
- deviceCountCallSuccessful: true,
- deviceInfoByIndexCallSuccessful: true,
- driverVersion: "driverVersion",
- devices: []*DeviceInfo{
- {
- UUID: "UUID1",
- Name: helper.StringToPtr("ModelName1"),
- MemoryMiB: helper.Uint64ToPtr(16),
- PCIBusID: "busId1",
- PowerW: helper.UintToPtr(100),
- BAR1MiB: helper.Uint64ToPtr(100),
- PCIBandwidthMBPerS: helper.UintToPtr(100),
- CoresClockMHz: helper.UintToPtr(100),
- MemoryClockMHz: helper.UintToPtr(100),
- DisplayState: "Enabled",
- PersistenceMode: "Enabled",
- }, {
- UUID: "UUID2",
- Name: helper.StringToPtr("ModelName2"),
- MemoryMiB: helper.Uint64ToPtr(8),
- PCIBusID: "busId2",
- PowerW: helper.UintToPtr(200),
- BAR1MiB: helper.Uint64ToPtr(200),
- PCIBandwidthMBPerS: helper.UintToPtr(200),
- CoresClockMHz: helper.UintToPtr(200),
- MemoryClockMHz: helper.UintToPtr(200),
- DisplayState: "Enabled",
- PersistenceMode: "Enabled",
- },
- },
- },
- },
- } {
- cli := nvmlClient{driver: testCase.DriverConfiguration}
- fingerprintData, err := cli.GetFingerprintData()
- if testCase.ExpectedError && err == nil {
- t.Errorf("case '%s' : expected Error, but didn't get one", testCase.Name)
- }
- if !testCase.ExpectedError && err != nil {
- t.Errorf("case '%s' : unexpected Error '%v'", testCase.Name, err)
- }
- require.New(t).Equal(testCase.ExpectedResult, fingerprintData)
- }
-}
-
-func TestGetStatsDataFromNVML(t *testing.T) {
- for _, testCase := range []struct {
- Name string
- DriverConfiguration *MockNVMLDriver
- ExpectedError bool
- ExpectedResult []*StatsData
- }{
- {
- Name: "fail on deviceCountCallSuccessful",
- ExpectedError: true,
- ExpectedResult: nil,
- DriverConfiguration: &MockNVMLDriver{
- systemDriverCallSuccessful: true,
- deviceCountCallSuccessful: false,
- deviceInfoByIndexCallSuccessful: true,
- deviceInfoAndStatusByIndexCallSuccessful: true,
- },
- },
- {
- Name: "fail on DeviceInfoAndStatusByIndex call",
- ExpectedError: true,
- ExpectedResult: nil,
- DriverConfiguration: &MockNVMLDriver{
- systemDriverCallSuccessful: true,
- deviceCountCallSuccessful: true,
- deviceInfoAndStatusByIndexCallSuccessful: false,
- devices: []*DeviceInfo{
- {
- UUID: "UUID1",
- Name: helper.StringToPtr("ModelName1"),
- MemoryMiB: helper.Uint64ToPtr(16),
- PCIBusID: "busId1",
- PowerW: helper.UintToPtr(100),
- BAR1MiB: helper.Uint64ToPtr(100),
- PCIBandwidthMBPerS: helper.UintToPtr(100),
- CoresClockMHz: helper.UintToPtr(100),
- MemoryClockMHz: helper.UintToPtr(100),
- }, {
- UUID: "UUID2",
- Name: helper.StringToPtr("ModelName2"),
- MemoryMiB: helper.Uint64ToPtr(8),
- PCIBusID: "busId2",
- PowerW: helper.UintToPtr(200),
- BAR1MiB: helper.Uint64ToPtr(200),
- PCIBandwidthMBPerS: helper.UintToPtr(200),
- CoresClockMHz: helper.UintToPtr(200),
- MemoryClockMHz: helper.UintToPtr(200),
- },
- },
- deviceStatus: []*DeviceStatus{
- {
- TemperatureC: helper.UintToPtr(1),
- GPUUtilization: helper.UintToPtr(1),
- MemoryUtilization: helper.UintToPtr(1),
- EncoderUtilization: helper.UintToPtr(1),
- DecoderUtilization: helper.UintToPtr(1),
- UsedMemoryMiB: helper.Uint64ToPtr(1),
- ECCErrorsL1Cache: helper.Uint64ToPtr(1),
- ECCErrorsL2Cache: helper.Uint64ToPtr(1),
- ECCErrorsDevice: helper.Uint64ToPtr(1),
- PowerUsageW: helper.UintToPtr(1),
- BAR1UsedMiB: helper.Uint64ToPtr(1),
- },
- {
- TemperatureC: helper.UintToPtr(2),
- GPUUtilization: helper.UintToPtr(2),
- MemoryUtilization: helper.UintToPtr(2),
- EncoderUtilization: helper.UintToPtr(2),
- DecoderUtilization: helper.UintToPtr(2),
- UsedMemoryMiB: helper.Uint64ToPtr(2),
- ECCErrorsL1Cache: helper.Uint64ToPtr(2),
- ECCErrorsL2Cache: helper.Uint64ToPtr(2),
- ECCErrorsDevice: helper.Uint64ToPtr(2),
- PowerUsageW: helper.UintToPtr(2),
- BAR1UsedMiB: helper.Uint64ToPtr(2),
- },
- },
- },
- },
- {
- Name: "successful outcome",
- ExpectedError: false,
- ExpectedResult: []*StatsData{
- {
- DeviceData: &DeviceData{
- DeviceName: helper.StringToPtr("ModelName1"),
- UUID: "UUID1",
- MemoryMiB: helper.Uint64ToPtr(16),
- PowerW: helper.UintToPtr(100),
- BAR1MiB: helper.Uint64ToPtr(100),
- },
- TemperatureC: helper.UintToPtr(1),
- GPUUtilization: helper.UintToPtr(1),
- MemoryUtilization: helper.UintToPtr(1),
- EncoderUtilization: helper.UintToPtr(1),
- DecoderUtilization: helper.UintToPtr(1),
- UsedMemoryMiB: helper.Uint64ToPtr(1),
- ECCErrorsL1Cache: helper.Uint64ToPtr(1),
- ECCErrorsL2Cache: helper.Uint64ToPtr(1),
- ECCErrorsDevice: helper.Uint64ToPtr(1),
- PowerUsageW: helper.UintToPtr(1),
- BAR1UsedMiB: helper.Uint64ToPtr(1),
- },
- {
- DeviceData: &DeviceData{
- DeviceName: helper.StringToPtr("ModelName2"),
- UUID: "UUID2",
- MemoryMiB: helper.Uint64ToPtr(8),
- PowerW: helper.UintToPtr(200),
- BAR1MiB: helper.Uint64ToPtr(200),
- },
- TemperatureC: helper.UintToPtr(2),
- GPUUtilization: helper.UintToPtr(2),
- MemoryUtilization: helper.UintToPtr(2),
- EncoderUtilization: helper.UintToPtr(2),
- DecoderUtilization: helper.UintToPtr(2),
- UsedMemoryMiB: helper.Uint64ToPtr(2),
- ECCErrorsL1Cache: helper.Uint64ToPtr(2),
- ECCErrorsL2Cache: helper.Uint64ToPtr(2),
- ECCErrorsDevice: helper.Uint64ToPtr(2),
- PowerUsageW: helper.UintToPtr(2),
- BAR1UsedMiB: helper.Uint64ToPtr(2),
- },
- },
- DriverConfiguration: &MockNVMLDriver{
- deviceCountCallSuccessful: true,
- deviceInfoByIndexCallSuccessful: true,
- deviceInfoAndStatusByIndexCallSuccessful: true,
- devices: []*DeviceInfo{
- {
- UUID: "UUID1",
- Name: helper.StringToPtr("ModelName1"),
- MemoryMiB: helper.Uint64ToPtr(16),
- PCIBusID: "busId1",
- PowerW: helper.UintToPtr(100),
- BAR1MiB: helper.Uint64ToPtr(100),
- PCIBandwidthMBPerS: helper.UintToPtr(100),
- CoresClockMHz: helper.UintToPtr(100),
- MemoryClockMHz: helper.UintToPtr(100),
- }, {
- UUID: "UUID2",
- Name: helper.StringToPtr("ModelName2"),
- MemoryMiB: helper.Uint64ToPtr(8),
- PCIBusID: "busId2",
- PowerW: helper.UintToPtr(200),
- BAR1MiB: helper.Uint64ToPtr(200),
- PCIBandwidthMBPerS: helper.UintToPtr(200),
- CoresClockMHz: helper.UintToPtr(200),
- MemoryClockMHz: helper.UintToPtr(200),
- },
- },
- deviceStatus: []*DeviceStatus{
- {
- TemperatureC: helper.UintToPtr(1),
- GPUUtilization: helper.UintToPtr(1),
- MemoryUtilization: helper.UintToPtr(1),
- EncoderUtilization: helper.UintToPtr(1),
- DecoderUtilization: helper.UintToPtr(1),
- UsedMemoryMiB: helper.Uint64ToPtr(1),
- ECCErrorsL1Cache: helper.Uint64ToPtr(1),
- ECCErrorsL2Cache: helper.Uint64ToPtr(1),
- ECCErrorsDevice: helper.Uint64ToPtr(1),
- PowerUsageW: helper.UintToPtr(1),
- BAR1UsedMiB: helper.Uint64ToPtr(1),
- },
- {
- TemperatureC: helper.UintToPtr(2),
- GPUUtilization: helper.UintToPtr(2),
- MemoryUtilization: helper.UintToPtr(2),
- EncoderUtilization: helper.UintToPtr(2),
- DecoderUtilization: helper.UintToPtr(2),
- UsedMemoryMiB: helper.Uint64ToPtr(2),
- ECCErrorsL1Cache: helper.Uint64ToPtr(2),
- ECCErrorsL2Cache: helper.Uint64ToPtr(2),
- ECCErrorsDevice: helper.Uint64ToPtr(2),
- PowerUsageW: helper.UintToPtr(2),
- BAR1UsedMiB: helper.Uint64ToPtr(2),
- },
- },
- },
- },
- } {
- cli := nvmlClient{driver: testCase.DriverConfiguration}
- statsData, err := cli.GetStatsData()
- if testCase.ExpectedError && err == nil {
- t.Errorf("case '%s' : expected Error, but didn't get one", testCase.Name)
- }
- if !testCase.ExpectedError && err != nil {
- t.Errorf("case '%s' : unexpected Error '%v'", testCase.Name, err)
- }
- require.New(t).Equal(testCase.ExpectedResult, statsData)
- }
-}
diff --git a/devices/gpu/nvidia/nvml/driver_default.go b/devices/gpu/nvidia/nvml/driver_default.go
deleted file mode 100644
index e67efa22eeaf..000000000000
--- a/devices/gpu/nvidia/nvml/driver_default.go
+++ /dev/null
@@ -1,33 +0,0 @@
-// +build !linux
-
-package nvml
-
-// Initialize nvml library by locating nvml shared object file and calling ldopen
-func (n *nvmlDriver) Initialize() error {
- return UnavailableLib
-}
-
-// Shutdown stops any further interaction with nvml
-func (n *nvmlDriver) Shutdown() error {
- return UnavailableLib
-}
-
-// SystemDriverVersion returns installed driver version
-func (n *nvmlDriver) SystemDriverVersion() (string, error) {
- return "", UnavailableLib
-}
-
-// DeviceCount reports number of available GPU devices
-func (n *nvmlDriver) DeviceCount() (uint, error) {
- return 0, UnavailableLib
-}
-
-// DeviceInfoByIndex returns DeviceInfo for index GPU in system device list
-func (n *nvmlDriver) DeviceInfoByIndex(index uint) (*DeviceInfo, error) {
- return nil, UnavailableLib
-}
-
-// DeviceInfoByIndex returns DeviceInfo and DeviceStatus for index GPU in system device list
-func (n *nvmlDriver) DeviceInfoAndStatusByIndex(index uint) (*DeviceInfo, *DeviceStatus, error) {
- return nil, nil, UnavailableLib
-}
diff --git a/devices/gpu/nvidia/nvml/driver_linux.go b/devices/gpu/nvidia/nvml/driver_linux.go
deleted file mode 100644
index bdd777561bcf..000000000000
--- a/devices/gpu/nvidia/nvml/driver_linux.go
+++ /dev/null
@@ -1,85 +0,0 @@
-package nvml
-
-import (
- "github.com/NVIDIA/gpu-monitoring-tools/bindings/go/nvml"
-)
-
-// Initialize nvml library by locating nvml shared object file and calling ldopen
-func (n *nvmlDriver) Initialize() error {
- return nvml.Init()
-}
-
-// Shutdown stops any further interaction with nvml
-func (n *nvmlDriver) Shutdown() error {
- return nvml.Shutdown()
-}
-
-// SystemDriverVersion returns installed driver version
-func (n *nvmlDriver) SystemDriverVersion() (string, error) {
- return nvml.GetDriverVersion()
-}
-
-// DeviceCount reports number of available GPU devices
-func (n *nvmlDriver) DeviceCount() (uint, error) {
- return nvml.GetDeviceCount()
-}
-
-// DeviceInfoByIndex returns DeviceInfo for index GPU in system device list
-func (n *nvmlDriver) DeviceInfoByIndex(index uint) (*DeviceInfo, error) {
- device, err := nvml.NewDevice(index)
- if err != nil {
- return nil, err
- }
- deviceMode, err := device.GetDeviceMode()
- if err != nil {
- return nil, err
- }
- return &DeviceInfo{
- UUID: device.UUID,
- Name: device.Model,
- MemoryMiB: device.Memory,
- PowerW: device.Power,
- BAR1MiB: device.PCI.BAR1,
- PCIBandwidthMBPerS: device.PCI.Bandwidth,
- PCIBusID: device.PCI.BusID,
- CoresClockMHz: device.Clocks.Cores,
- MemoryClockMHz: device.Clocks.Memory,
- DisplayState: deviceMode.DisplayInfo.Mode.String(),
- PersistenceMode: deviceMode.Persistence.String(),
- }, nil
-}
-
-// DeviceInfoByIndex returns DeviceInfo and DeviceStatus for index GPU in system device list
-func (n *nvmlDriver) DeviceInfoAndStatusByIndex(index uint) (*DeviceInfo, *DeviceStatus, error) {
- device, err := nvml.NewDevice(index)
- if err != nil {
- return nil, nil, err
- }
- status, err := device.Status()
- if err != nil {
- return nil, nil, err
- }
- return &DeviceInfo{
- UUID: device.UUID,
- Name: device.Model,
- MemoryMiB: device.Memory,
- PowerW: device.Power,
- BAR1MiB: device.PCI.BAR1,
- PCIBandwidthMBPerS: device.PCI.Bandwidth,
- PCIBusID: device.PCI.BusID,
- CoresClockMHz: device.Clocks.Cores,
- MemoryClockMHz: device.Clocks.Memory,
- }, &DeviceStatus{
- TemperatureC: status.Temperature,
- GPUUtilization: status.Utilization.GPU,
- MemoryUtilization: status.Utilization.Memory,
- EncoderUtilization: status.Utilization.Encoder,
- DecoderUtilization: status.Utilization.Decoder,
- UsedMemoryMiB: status.Memory.Global.Used,
- ECCErrorsL1Cache: status.Memory.ECCErrors.L1Cache,
- ECCErrorsL2Cache: status.Memory.ECCErrors.L2Cache,
- ECCErrorsDevice: status.Memory.ECCErrors.Device,
- PowerUsageW: status.Power,
- BAR1UsedMiB: status.PCI.BAR1Used,
- }, nil
-}
diff --git a/devices/gpu/nvidia/nvml/shared.go b/devices/gpu/nvidia/nvml/shared.go
deleted file mode 100644
index a0bb04d22234..000000000000
--- a/devices/gpu/nvidia/nvml/shared.go
+++ /dev/null
@@ -1,61 +0,0 @@
-package nvml
-
-import "errors"
-
-var (
- // UnavailableLib is returned when the nvml library could not be loaded.
- UnavailableLib = errors.New("could not load NVML library")
-)
-
-// nvmlDriver implements NvmlDriver
-// Users are required to call Initialize method before using any other methods
-type nvmlDriver struct{}
-
-// NvmlDriver represents set of methods to query nvml library
-type NvmlDriver interface {
- Initialize() error
- Shutdown() error
- SystemDriverVersion() (string, error)
- DeviceCount() (uint, error)
- DeviceInfoByIndex(uint) (*DeviceInfo, error)
- DeviceInfoAndStatusByIndex(uint) (*DeviceInfo, *DeviceStatus, error)
-}
-
-// DeviceInfo represents nvml device data
-// this struct is returned by NvmlDriver DeviceInfoByIndex and
-// DeviceInfoAndStatusByIndex methods
-type DeviceInfo struct {
- // The following fields are guaranteed to be retrieved from nvml
- UUID string
- PCIBusID string
- DisplayState string
- PersistenceMode string
-
- // The following fields can be nil after call to nvml, because nvml was
- // not able to retrieve this fields for specific nvidia card
- Name *string
- MemoryMiB *uint64
- PowerW *uint
- BAR1MiB *uint64
- PCIBandwidthMBPerS *uint
- CoresClockMHz *uint
- MemoryClockMHz *uint
-}
-
-// DeviceStatus represents nvml device status
-// this struct is returned by NvmlDriver DeviceInfoAndStatusByIndex method
-type DeviceStatus struct {
- // The following fields can be nil after call to nvml, because nvml was
- // not able to retrieve this fields for specific nvidia card
- PowerUsageW *uint
- TemperatureC *uint
- GPUUtilization *uint // %
- MemoryUtilization *uint // %
- EncoderUtilization *uint // %
- DecoderUtilization *uint // %
- BAR1UsedMiB *uint64
- UsedMemoryMiB *uint64
- ECCErrorsL1Cache *uint64
- ECCErrorsL2Cache *uint64
- ECCErrorsDevice *uint64
-}
diff --git a/devices/gpu/nvidia/stats.go b/devices/gpu/nvidia/stats.go
deleted file mode 100644
index c6c447757916..000000000000
--- a/devices/gpu/nvidia/stats.go
+++ /dev/null
@@ -1,325 +0,0 @@
-package nvidia
-
-import (
- "context"
- "time"
-
- "github.com/hashicorp/nomad/devices/gpu/nvidia/nvml"
- "github.com/hashicorp/nomad/helper"
- "github.com/hashicorp/nomad/plugins/device"
- "github.com/hashicorp/nomad/plugins/shared/structs"
-)
-
-const (
- // Attribute names for reporting stats output
- PowerUsageAttr = "Power usage"
- PowerUsageUnit = "W"
- PowerUsageDesc = "Power usage for this GPU in watts and " +
- "its associated circuitry (e.g. memory) / Maximum GPU Power"
- GPUUtilizationAttr = "GPU utilization"
- GPUUtilizationUnit = "%"
- GPUUtilizationDesc = "Percent of time over the past sample period " +
- "during which one or more kernels were executing on the GPU."
- MemoryUtilizationAttr = "Memory utilization"
- MemoryUtilizationUnit = "%"
- MemoryUtilizationDesc = "Percentage of bandwidth used during the past sample period"
- EncoderUtilizationAttr = "Encoder utilization"
- EncoderUtilizationUnit = "%"
- EncoderUtilizationDesc = "Percent of time over the past sample period " +
- "during which GPU Encoder was used"
- DecoderUtilizationAttr = "Decoder utilization"
- DecoderUtilizationUnit = "%"
- DecoderUtilizationDesc = "Percent of time over the past sample period " +
- "during which GPU Decoder was used"
- TemperatureAttr = "Temperature"
- TemperatureUnit = "C" // Celsius degrees
- TemperatureDesc = "Temperature of the Unit"
- MemoryStateAttr = "Memory state"
- MemoryStateUnit = "MiB" // Mebibytes
- MemoryStateDesc = "UsedMemory / TotalMemory"
- BAR1StateAttr = "BAR1 buffer state"
- BAR1StateUnit = "MiB" // Mebibytes
- BAR1StateDesc = "UsedBAR1 / TotalBAR1"
- ECCErrorsL1CacheAttr = "ECC L1 errors"
- ECCErrorsL1CacheUnit = "#" // number of errors
- ECCErrorsL1CacheDesc = "Requested L1Cache error counter for the device"
- ECCErrorsL2CacheAttr = "ECC L2 errors"
- ECCErrorsL2CacheUnit = "#" // number of errors
- ECCErrorsL2CacheDesc = "Requested L2Cache error counter for the device"
- ECCErrorsDeviceAttr = "ECC memory errors"
- ECCErrorsDeviceUnit = "#" // number of errors
- ECCErrorsDeviceDesc = "Requested memory error counter for the device"
-)
-
-// stats is the long running goroutine that streams device statistics
-func (d *NvidiaDevice) stats(ctx context.Context, stats chan<- *device.StatsResponse, interval time.Duration) {
- defer close(stats)
-
- if d.initErr != nil {
- if d.initErr.Error() != nvml.UnavailableLib.Error() {
- d.logger.Error("exiting stats due to problems with NVML loading", "error", d.initErr)
- stats <- device.NewStatsError(d.initErr)
- }
-
- return
- }
-
- // Create a timer that will fire immediately for the first detection
- ticker := time.NewTimer(0)
-
- for {
- select {
- case <-ctx.Done():
- return
- case <-ticker.C:
- ticker.Reset(interval)
- }
-
- d.writeStatsToChannel(stats, time.Now())
- }
-}
-
-// filterStatsByID accepts list of StatsData and set of IDs
-// this function would return entries from StatsData with IDs found in the set
-func filterStatsByID(stats []*nvml.StatsData, ids map[string]struct{}) []*nvml.StatsData {
- var filteredStats []*nvml.StatsData
- for _, statsItem := range stats {
- if _, ok := ids[statsItem.UUID]; ok {
- filteredStats = append(filteredStats, statsItem)
- }
- }
- return filteredStats
-}
-
-// writeStatsToChannel collects StatsData from NVML backend, groups StatsData
-// by DeviceName attribute, populates DeviceGroupStats structure for every group
-// and sends data over provided channel
-func (d *NvidiaDevice) writeStatsToChannel(stats chan<- *device.StatsResponse, timestamp time.Time) {
- statsData, err := d.nvmlClient.GetStatsData()
- if err != nil {
- d.logger.Error("failed to get nvidia stats", "error", err)
- stats <- &device.StatsResponse{
- Error: err,
- }
- return
- }
-
- // filter only stats from devices that are stored in NvidiaDevice struct
- d.deviceLock.RLock()
- statsData = filterStatsByID(statsData, d.devices)
- d.deviceLock.RUnlock()
-
- // group stats by DeviceName struct field
- statsListByDeviceName := make(map[string][]*nvml.StatsData)
- for _, statsItem := range statsData {
- deviceName := statsItem.DeviceName
- if deviceName == nil {
- // nvml driver was not able to detect device name. This kind
- // of devices are placed to single group with 'notAvailable' name
- notAvailableCopy := notAvailable
- deviceName = &notAvailableCopy
- }
-
- statsListByDeviceName[*deviceName] = append(statsListByDeviceName[*deviceName], statsItem)
- }
-
- // place data device.DeviceGroupStats struct for every group of stats
- deviceGroupsStats := make([]*device.DeviceGroupStats, 0, len(statsListByDeviceName))
- for groupName, groupStats := range statsListByDeviceName {
- deviceGroupsStats = append(deviceGroupsStats, statsForGroup(groupName, groupStats, timestamp))
- }
-
- stats <- &device.StatsResponse{
- Groups: deviceGroupsStats,
- }
-}
-
-func newNotAvailableDeviceStats(unit, desc string) *structs.StatValue {
- return &structs.StatValue{Unit: unit, Desc: desc, StringVal: helper.StringToPtr(notAvailable)}
-}
-
-// statsForGroup is a helper function that populates device.DeviceGroupStats
-// for given groupName with groupStats list
-func statsForGroup(groupName string, groupStats []*nvml.StatsData, timestamp time.Time) *device.DeviceGroupStats {
- instanceStats := make(map[string]*device.DeviceStats)
- for _, statsItem := range groupStats {
- instanceStats[statsItem.UUID] = statsForItem(statsItem, timestamp)
- }
-
- return &device.DeviceGroupStats{
- Vendor: vendor,
- Type: deviceType,
- Name: groupName,
- InstanceStats: instanceStats,
- }
-}
-
-// statsForItem is a helper function that populates device.DeviceStats for given
-// nvml.StatsData
-func statsForItem(statsItem *nvml.StatsData, timestamp time.Time) *device.DeviceStats {
- // nvml.StatsData holds pointers to values that can be nil
- // In case they are nil return stats with 'notAvailable' constant
- var (
- powerUsageStat *structs.StatValue
- GPUUtilizationStat *structs.StatValue
- memoryUtilizationStat *structs.StatValue
- encoderUtilizationStat *structs.StatValue
- decoderUtilizationStat *structs.StatValue
- temperatureStat *structs.StatValue
- memoryStateStat *structs.StatValue
- BAR1StateStat *structs.StatValue
- ECCErrorsL1CacheStat *structs.StatValue
- ECCErrorsL2CacheStat *structs.StatValue
- ECCErrorsDeviceStat *structs.StatValue
- )
-
- if statsItem.PowerUsageW == nil || statsItem.PowerW == nil {
- powerUsageStat = newNotAvailableDeviceStats(PowerUsageUnit, PowerUsageDesc)
- } else {
- powerUsageStat = &structs.StatValue{
- Unit: PowerUsageUnit,
- Desc: PowerUsageDesc,
- IntNumeratorVal: helper.Int64ToPtr(int64(*statsItem.PowerUsageW)),
- IntDenominatorVal: uintToInt64Ptr(statsItem.PowerW),
- }
- }
-
- if statsItem.GPUUtilization == nil {
- GPUUtilizationStat = newNotAvailableDeviceStats(GPUUtilizationUnit, GPUUtilizationDesc)
- } else {
- GPUUtilizationStat = &structs.StatValue{
- Unit: GPUUtilizationUnit,
- Desc: GPUUtilizationDesc,
- IntNumeratorVal: uintToInt64Ptr(statsItem.GPUUtilization),
- }
- }
-
- if statsItem.MemoryUtilization == nil {
- memoryUtilizationStat = newNotAvailableDeviceStats(MemoryUtilizationUnit, MemoryUtilizationDesc)
- } else {
- memoryUtilizationStat = &structs.StatValue{
- Unit: MemoryUtilizationUnit,
- Desc: MemoryUtilizationDesc,
- IntNumeratorVal: uintToInt64Ptr(statsItem.MemoryUtilization),
- }
- }
-
- if statsItem.EncoderUtilization == nil {
- encoderUtilizationStat = newNotAvailableDeviceStats(EncoderUtilizationUnit, EncoderUtilizationDesc)
- } else {
- encoderUtilizationStat = &structs.StatValue{
- Unit: EncoderUtilizationUnit,
- Desc: EncoderUtilizationDesc,
- IntNumeratorVal: uintToInt64Ptr(statsItem.EncoderUtilization),
- }
- }
-
- if statsItem.DecoderUtilization == nil {
- decoderUtilizationStat = newNotAvailableDeviceStats(DecoderUtilizationUnit, DecoderUtilizationDesc)
- } else {
- decoderUtilizationStat = &structs.StatValue{
- Unit: DecoderUtilizationUnit,
- Desc: DecoderUtilizationDesc,
- IntNumeratorVal: uintToInt64Ptr(statsItem.DecoderUtilization),
- }
- }
-
- if statsItem.TemperatureC == nil {
- temperatureStat = newNotAvailableDeviceStats(TemperatureUnit, TemperatureDesc)
- } else {
- temperatureStat = &structs.StatValue{
- Unit: TemperatureUnit,
- Desc: TemperatureDesc,
- IntNumeratorVal: uintToInt64Ptr(statsItem.TemperatureC),
- }
- }
-
- if statsItem.UsedMemoryMiB == nil || statsItem.MemoryMiB == nil {
- memoryStateStat = newNotAvailableDeviceStats(MemoryStateUnit, MemoryStateDesc)
- } else {
- memoryStateStat = &structs.StatValue{
- Unit: MemoryStateUnit,
- Desc: MemoryStateDesc,
- IntNumeratorVal: uint64ToInt64Ptr(statsItem.UsedMemoryMiB),
- IntDenominatorVal: uint64ToInt64Ptr(statsItem.MemoryMiB),
- }
- }
-
- if statsItem.BAR1UsedMiB == nil || statsItem.BAR1MiB == nil {
- BAR1StateStat = newNotAvailableDeviceStats(BAR1StateUnit, BAR1StateDesc)
- } else {
- BAR1StateStat = &structs.StatValue{
- Unit: BAR1StateUnit,
- Desc: BAR1StateDesc,
- IntNumeratorVal: uint64ToInt64Ptr(statsItem.BAR1UsedMiB),
- IntDenominatorVal: uint64ToInt64Ptr(statsItem.BAR1MiB),
- }
- }
-
- if statsItem.ECCErrorsL1Cache == nil {
- ECCErrorsL1CacheStat = newNotAvailableDeviceStats(ECCErrorsL1CacheUnit, ECCErrorsL1CacheDesc)
- } else {
- ECCErrorsL1CacheStat = &structs.StatValue{
- Unit: ECCErrorsL1CacheUnit,
- Desc: ECCErrorsL1CacheDesc,
- IntNumeratorVal: uint64ToInt64Ptr(statsItem.ECCErrorsL1Cache),
- }
- }
-
- if statsItem.ECCErrorsL2Cache == nil {
- ECCErrorsL2CacheStat = newNotAvailableDeviceStats(ECCErrorsL2CacheUnit, ECCErrorsL2CacheDesc)
- } else {
- ECCErrorsL2CacheStat = &structs.StatValue{
- Unit: ECCErrorsL2CacheUnit,
- Desc: ECCErrorsL2CacheDesc,
- IntNumeratorVal: uint64ToInt64Ptr(statsItem.ECCErrorsL2Cache),
- }
- }
-
- if statsItem.ECCErrorsDevice == nil {
- ECCErrorsDeviceStat = newNotAvailableDeviceStats(ECCErrorsDeviceUnit, ECCErrorsDeviceDesc)
- } else {
- ECCErrorsDeviceStat = &structs.StatValue{
- Unit: ECCErrorsDeviceUnit,
- Desc: ECCErrorsDeviceDesc,
- IntNumeratorVal: uint64ToInt64Ptr(statsItem.ECCErrorsDevice),
- }
- }
- return &device.DeviceStats{
- Summary: memoryStateStat,
- Stats: &structs.StatObject{
- Attributes: map[string]*structs.StatValue{
- PowerUsageAttr: powerUsageStat,
- GPUUtilizationAttr: GPUUtilizationStat,
- MemoryUtilizationAttr: memoryUtilizationStat,
- EncoderUtilizationAttr: encoderUtilizationStat,
- DecoderUtilizationAttr: decoderUtilizationStat,
- TemperatureAttr: temperatureStat,
- MemoryStateAttr: memoryStateStat,
- BAR1StateAttr: BAR1StateStat,
- ECCErrorsL1CacheAttr: ECCErrorsL1CacheStat,
- ECCErrorsL2CacheAttr: ECCErrorsL2CacheStat,
- ECCErrorsDeviceAttr: ECCErrorsDeviceStat,
- },
- },
- Timestamp: timestamp,
- }
-}
-
-func uintToInt64Ptr(u *uint) *int64 {
- if u == nil {
- return nil
- }
-
- v := int64(*u)
- return &v
-}
-
-func uint64ToInt64Ptr(u *uint64) *int64 {
- if u == nil {
- return nil
- }
-
- v := int64(*u)
- return &v
-}
diff --git a/devices/gpu/nvidia/stats_test.go b/devices/gpu/nvidia/stats_test.go
deleted file mode 100644
index f6221e0f4801..000000000000
--- a/devices/gpu/nvidia/stats_test.go
+++ /dev/null
@@ -1,3041 +0,0 @@
-package nvidia
-
-import (
- "errors"
- "sort"
- "testing"
- "time"
-
- hclog "github.com/hashicorp/go-hclog"
- "github.com/hashicorp/nomad/devices/gpu/nvidia/nvml"
- "github.com/hashicorp/nomad/helper"
- "github.com/hashicorp/nomad/plugins/device"
- "github.com/hashicorp/nomad/plugins/shared/structs"
- "github.com/stretchr/testify/require"
-)
-
-func TestFilterStatsByID(t *testing.T) {
- for _, testCase := range []struct {
- Name string
- ProvidedStats []*nvml.StatsData
- ProvidedIDs map[string]struct{}
- ExpectedResult []*nvml.StatsData
- }{
- {
- Name: "All ids are in the map",
- ProvidedStats: []*nvml.StatsData{
- {
- DeviceData: &nvml.DeviceData{
- UUID: "UUID1",
- DeviceName: helper.StringToPtr("DeviceName1"),
- MemoryMiB: helper.Uint64ToPtr(1),
- PowerW: helper.UintToPtr(2),
- BAR1MiB: helper.Uint64ToPtr(256),
- },
- PowerUsageW: helper.UintToPtr(1),
- GPUUtilization: helper.UintToPtr(1),
- MemoryUtilization: helper.UintToPtr(1),
- EncoderUtilization: helper.UintToPtr(1),
- DecoderUtilization: helper.UintToPtr(1),
- TemperatureC: helper.UintToPtr(1),
- UsedMemoryMiB: helper.Uint64ToPtr(1),
- ECCErrorsL1Cache: helper.Uint64ToPtr(100),
- ECCErrorsL2Cache: helper.Uint64ToPtr(100),
- ECCErrorsDevice: helper.Uint64ToPtr(100),
- },
- {
- DeviceData: &nvml.DeviceData{
- UUID: "UUID2",
- DeviceName: helper.StringToPtr("DeviceName1"),
- MemoryMiB: helper.Uint64ToPtr(1),
- PowerW: helper.UintToPtr(2),
- BAR1MiB: helper.Uint64ToPtr(256),
- },
- PowerUsageW: helper.UintToPtr(1),
- GPUUtilization: helper.UintToPtr(1),
- MemoryUtilization: helper.UintToPtr(1),
- EncoderUtilization: helper.UintToPtr(1),
- DecoderUtilization: helper.UintToPtr(1),
- TemperatureC: helper.UintToPtr(1),
- UsedMemoryMiB: helper.Uint64ToPtr(1),
- ECCErrorsL1Cache: helper.Uint64ToPtr(100),
- ECCErrorsL2Cache: helper.Uint64ToPtr(100),
- ECCErrorsDevice: helper.Uint64ToPtr(100),
- },
- {
- DeviceData: &nvml.DeviceData{
- UUID: "UUID3",
- DeviceName: helper.StringToPtr("DeviceName1"),
- MemoryMiB: helper.Uint64ToPtr(1),
- PowerW: helper.UintToPtr(2),
- BAR1MiB: helper.Uint64ToPtr(256),
- },
- PowerUsageW: helper.UintToPtr(1),
- GPUUtilization: helper.UintToPtr(1),
- MemoryUtilization: helper.UintToPtr(1),
- EncoderUtilization: helper.UintToPtr(1),
- DecoderUtilization: helper.UintToPtr(1),
- TemperatureC: helper.UintToPtr(1),
- UsedMemoryMiB: helper.Uint64ToPtr(1),
- ECCErrorsL1Cache: helper.Uint64ToPtr(100),
- ECCErrorsL2Cache: helper.Uint64ToPtr(100),
- ECCErrorsDevice: helper.Uint64ToPtr(100),
- },
- },
- ProvidedIDs: map[string]struct{}{
- "UUID1": {},
- "UUID2": {},
- "UUID3": {},
- },
- ExpectedResult: []*nvml.StatsData{
- {
- DeviceData: &nvml.DeviceData{
- UUID: "UUID1",
- DeviceName: helper.StringToPtr("DeviceName1"),
- MemoryMiB: helper.Uint64ToPtr(1),
- PowerW: helper.UintToPtr(2),
- BAR1MiB: helper.Uint64ToPtr(256),
- },
- PowerUsageW: helper.UintToPtr(1),
- GPUUtilization: helper.UintToPtr(1),
- MemoryUtilization: helper.UintToPtr(1),
- EncoderUtilization: helper.UintToPtr(1),
- DecoderUtilization: helper.UintToPtr(1),
- TemperatureC: helper.UintToPtr(1),
- UsedMemoryMiB: helper.Uint64ToPtr(1),
- ECCErrorsL1Cache: helper.Uint64ToPtr(100),
- ECCErrorsL2Cache: helper.Uint64ToPtr(100),
- ECCErrorsDevice: helper.Uint64ToPtr(100),
- },
- {
- DeviceData: &nvml.DeviceData{
- UUID: "UUID2",
- DeviceName: helper.StringToPtr("DeviceName1"),
- MemoryMiB: helper.Uint64ToPtr(1),
- PowerW: helper.UintToPtr(2),
- BAR1MiB: helper.Uint64ToPtr(256),
- },
- PowerUsageW: helper.UintToPtr(1),
- GPUUtilization: helper.UintToPtr(1),
- MemoryUtilization: helper.UintToPtr(1),
- EncoderUtilization: helper.UintToPtr(1),
- DecoderUtilization: helper.UintToPtr(1),
- TemperatureC: helper.UintToPtr(1),
- UsedMemoryMiB: helper.Uint64ToPtr(1),
- ECCErrorsL1Cache: helper.Uint64ToPtr(100),
- ECCErrorsL2Cache: helper.Uint64ToPtr(100),
- ECCErrorsDevice: helper.Uint64ToPtr(100),
- },
- {
- DeviceData: &nvml.DeviceData{
- UUID: "UUID3",
- DeviceName: helper.StringToPtr("DeviceName1"),
- MemoryMiB: helper.Uint64ToPtr(1),
- PowerW: helper.UintToPtr(2),
- BAR1MiB: helper.Uint64ToPtr(256),
- },
- PowerUsageW: helper.UintToPtr(1),
- GPUUtilization: helper.UintToPtr(1),
- MemoryUtilization: helper.UintToPtr(1),
- EncoderUtilization: helper.UintToPtr(1),
- DecoderUtilization: helper.UintToPtr(1),
- TemperatureC: helper.UintToPtr(1),
- UsedMemoryMiB: helper.Uint64ToPtr(1),
- ECCErrorsL1Cache: helper.Uint64ToPtr(100),
- ECCErrorsL2Cache: helper.Uint64ToPtr(100),
- ECCErrorsDevice: helper.Uint64ToPtr(100),
- },
- },
- },
- {
- Name: "Odd are not provided in the map",
- ProvidedStats: []*nvml.StatsData{
- {
- DeviceData: &nvml.DeviceData{
- UUID: "UUID1",
- DeviceName: helper.StringToPtr("DeviceName1"),
- MemoryMiB: helper.Uint64ToPtr(1),
- PowerW: helper.UintToPtr(2),
- BAR1MiB: helper.Uint64ToPtr(256),
- },
- PowerUsageW: helper.UintToPtr(1),
- GPUUtilization: helper.UintToPtr(1),
- MemoryUtilization: helper.UintToPtr(1),
- EncoderUtilization: helper.UintToPtr(1),
- DecoderUtilization: helper.UintToPtr(1),
- TemperatureC: helper.UintToPtr(1),
- UsedMemoryMiB: helper.Uint64ToPtr(1),
- ECCErrorsL1Cache: helper.Uint64ToPtr(100),
- ECCErrorsL2Cache: helper.Uint64ToPtr(100),
- ECCErrorsDevice: helper.Uint64ToPtr(100),
- },
- {
- DeviceData: &nvml.DeviceData{
- UUID: "UUID2",
- DeviceName: helper.StringToPtr("DeviceName1"),
- MemoryMiB: helper.Uint64ToPtr(1),
- PowerW: helper.UintToPtr(2),
- BAR1MiB: helper.Uint64ToPtr(256),
- },
- PowerUsageW: helper.UintToPtr(1),
- GPUUtilization: helper.UintToPtr(1),
- MemoryUtilization: helper.UintToPtr(1),
- EncoderUtilization: helper.UintToPtr(1),
- DecoderUtilization: helper.UintToPtr(1),
- TemperatureC: helper.UintToPtr(1),
- UsedMemoryMiB: helper.Uint64ToPtr(1),
- ECCErrorsL1Cache: helper.Uint64ToPtr(100),
- ECCErrorsL2Cache: helper.Uint64ToPtr(100),
- ECCErrorsDevice: helper.Uint64ToPtr(100),
- },
- {
- DeviceData: &nvml.DeviceData{
- UUID: "UUID3",
- DeviceName: helper.StringToPtr("DeviceName1"),
- MemoryMiB: helper.Uint64ToPtr(1),
- PowerW: helper.UintToPtr(2),
- BAR1MiB: helper.Uint64ToPtr(256),
- },
- PowerUsageW: helper.UintToPtr(1),
- GPUUtilization: helper.UintToPtr(1),
- MemoryUtilization: helper.UintToPtr(1),
- EncoderUtilization: helper.UintToPtr(1),
- DecoderUtilization: helper.UintToPtr(1),
- TemperatureC: helper.UintToPtr(1),
- UsedMemoryMiB: helper.Uint64ToPtr(1),
- ECCErrorsL1Cache: helper.Uint64ToPtr(100),
- ECCErrorsL2Cache: helper.Uint64ToPtr(100),
- ECCErrorsDevice: helper.Uint64ToPtr(100),
- },
- },
- ProvidedIDs: map[string]struct{}{
- "UUID2": {},
- },
- ExpectedResult: []*nvml.StatsData{
- {
- DeviceData: &nvml.DeviceData{
- UUID: "UUID2",
- DeviceName: helper.StringToPtr("DeviceName1"),
- MemoryMiB: helper.Uint64ToPtr(1),
- PowerW: helper.UintToPtr(2),
- BAR1MiB: helper.Uint64ToPtr(256),
- },
- PowerUsageW: helper.UintToPtr(1),
- GPUUtilization: helper.UintToPtr(1),
- MemoryUtilization: helper.UintToPtr(1),
- EncoderUtilization: helper.UintToPtr(1),
- DecoderUtilization: helper.UintToPtr(1),
- TemperatureC: helper.UintToPtr(1),
- UsedMemoryMiB: helper.Uint64ToPtr(1),
- ECCErrorsL1Cache: helper.Uint64ToPtr(100),
- ECCErrorsL2Cache: helper.Uint64ToPtr(100),
- ECCErrorsDevice: helper.Uint64ToPtr(100),
- },
- },
- },
- {
- Name: "Even are not provided in the map",
- ProvidedStats: []*nvml.StatsData{
- {
- DeviceData: &nvml.DeviceData{
- UUID: "UUID1",
- DeviceName: helper.StringToPtr("DeviceName1"),
- MemoryMiB: helper.Uint64ToPtr(1),
- PowerW: helper.UintToPtr(2),
- BAR1MiB: helper.Uint64ToPtr(256),
- },
- PowerUsageW: helper.UintToPtr(1),
- GPUUtilization: helper.UintToPtr(1),
- MemoryUtilization: helper.UintToPtr(1),
- EncoderUtilization: helper.UintToPtr(1),
- DecoderUtilization: helper.UintToPtr(1),
- TemperatureC: helper.UintToPtr(1),
- UsedMemoryMiB: helper.Uint64ToPtr(1),
- ECCErrorsL1Cache: helper.Uint64ToPtr(100),
- ECCErrorsL2Cache: helper.Uint64ToPtr(100),
- ECCErrorsDevice: helper.Uint64ToPtr(100),
- },
- {
- DeviceData: &nvml.DeviceData{
- UUID: "UUID2",
- DeviceName: helper.StringToPtr("DeviceName1"),
- MemoryMiB: helper.Uint64ToPtr(1),
- PowerW: helper.UintToPtr(2),
- BAR1MiB: helper.Uint64ToPtr(256),
- },
- PowerUsageW: helper.UintToPtr(1),
- GPUUtilization: helper.UintToPtr(1),
- MemoryUtilization: helper.UintToPtr(1),
- EncoderUtilization: helper.UintToPtr(1),
- DecoderUtilization: helper.UintToPtr(1),
- TemperatureC: helper.UintToPtr(1),
- UsedMemoryMiB: helper.Uint64ToPtr(1),
- ECCErrorsL1Cache: helper.Uint64ToPtr(100),
- ECCErrorsL2Cache: helper.Uint64ToPtr(100),
- ECCErrorsDevice: helper.Uint64ToPtr(100),
- },
- {
- DeviceData: &nvml.DeviceData{
- UUID: "UUID3",
- DeviceName: helper.StringToPtr("DeviceName1"),
- MemoryMiB: helper.Uint64ToPtr(1),
- PowerW: helper.UintToPtr(2),
- BAR1MiB: helper.Uint64ToPtr(256),
- },
- PowerUsageW: helper.UintToPtr(1),
- GPUUtilization: helper.UintToPtr(1),
- MemoryUtilization: helper.UintToPtr(1),
- EncoderUtilization: helper.UintToPtr(1),
- DecoderUtilization: helper.UintToPtr(1),
- TemperatureC: helper.UintToPtr(1),
- UsedMemoryMiB: helper.Uint64ToPtr(1),
- ECCErrorsL1Cache: helper.Uint64ToPtr(100),
- ECCErrorsL2Cache: helper.Uint64ToPtr(100),
- ECCErrorsDevice: helper.Uint64ToPtr(100),
- },
- },
- ProvidedIDs: map[string]struct{}{
- "UUID1": {},
- "UUID3": {},
- },
- ExpectedResult: []*nvml.StatsData{
- {
- DeviceData: &nvml.DeviceData{
- UUID: "UUID1",
- DeviceName: helper.StringToPtr("DeviceName1"),
- MemoryMiB: helper.Uint64ToPtr(1),
- PowerW: helper.UintToPtr(2),
- BAR1MiB: helper.Uint64ToPtr(256),
- },
- PowerUsageW: helper.UintToPtr(1),
- GPUUtilization: helper.UintToPtr(1),
- MemoryUtilization: helper.UintToPtr(1),
- EncoderUtilization: helper.UintToPtr(1),
- DecoderUtilization: helper.UintToPtr(1),
- TemperatureC: helper.UintToPtr(1),
- UsedMemoryMiB: helper.Uint64ToPtr(1),
- ECCErrorsL1Cache: helper.Uint64ToPtr(100),
- ECCErrorsL2Cache: helper.Uint64ToPtr(100),
- ECCErrorsDevice: helper.Uint64ToPtr(100),
- },
- {
- DeviceData: &nvml.DeviceData{
- UUID: "UUID3",
- DeviceName: helper.StringToPtr("DeviceName1"),
- MemoryMiB: helper.Uint64ToPtr(1),
- PowerW: helper.UintToPtr(2),
- BAR1MiB: helper.Uint64ToPtr(256),
- },
- PowerUsageW: helper.UintToPtr(1),
- GPUUtilization: helper.UintToPtr(1),
- MemoryUtilization: helper.UintToPtr(1),
- EncoderUtilization: helper.UintToPtr(1),
- DecoderUtilization: helper.UintToPtr(1),
- TemperatureC: helper.UintToPtr(1),
- UsedMemoryMiB: helper.Uint64ToPtr(1),
- ECCErrorsL1Cache: helper.Uint64ToPtr(100),
- ECCErrorsL2Cache: helper.Uint64ToPtr(100),
- ECCErrorsDevice: helper.Uint64ToPtr(100),
- },
- },
- },
- {
- Name: "No Stats were provided",
- ProvidedIDs: map[string]struct{}{
- "UUID1": {},
- "UUID2": {},
- "UUID3": {},
- },
- },
- {
- Name: "No Ids were provided",
- ProvidedStats: []*nvml.StatsData{
- {
- DeviceData: &nvml.DeviceData{
- UUID: "UUID1",
- DeviceName: helper.StringToPtr("DeviceName1"),
- MemoryMiB: helper.Uint64ToPtr(1),
- PowerW: helper.UintToPtr(2),
- BAR1MiB: helper.Uint64ToPtr(256),
- },
- PowerUsageW: helper.UintToPtr(1),
- GPUUtilization: helper.UintToPtr(1),
- MemoryUtilization: helper.UintToPtr(1),
- EncoderUtilization: helper.UintToPtr(1),
- DecoderUtilization: helper.UintToPtr(1),
- TemperatureC: helper.UintToPtr(1),
- UsedMemoryMiB: helper.Uint64ToPtr(1),
- ECCErrorsL1Cache: helper.Uint64ToPtr(100),
- ECCErrorsL2Cache: helper.Uint64ToPtr(100),
- ECCErrorsDevice: helper.Uint64ToPtr(100),
- },
- {
- DeviceData: &nvml.DeviceData{
- UUID: "UUID2",
- DeviceName: helper.StringToPtr("DeviceName1"),
- MemoryMiB: helper.Uint64ToPtr(1),
- PowerW: helper.UintToPtr(2),
- BAR1MiB: helper.Uint64ToPtr(256),
- },
- PowerUsageW: helper.UintToPtr(1),
- GPUUtilization: helper.UintToPtr(1),
- MemoryUtilization: helper.UintToPtr(1),
- EncoderUtilization: helper.UintToPtr(1),
- DecoderUtilization: helper.UintToPtr(1),
- TemperatureC: helper.UintToPtr(1),
- UsedMemoryMiB: helper.Uint64ToPtr(1),
- ECCErrorsL1Cache: helper.Uint64ToPtr(100),
- ECCErrorsL2Cache: helper.Uint64ToPtr(100),
- ECCErrorsDevice: helper.Uint64ToPtr(100),
- },
- {
- DeviceData: &nvml.DeviceData{
- UUID: "UUID3",
- DeviceName: helper.StringToPtr("DeviceName1"),
- MemoryMiB: helper.Uint64ToPtr(1),
- PowerW: helper.UintToPtr(2),
- BAR1MiB: helper.Uint64ToPtr(256),
- },
- PowerUsageW: helper.UintToPtr(1),
- GPUUtilization: helper.UintToPtr(1),
- MemoryUtilization: helper.UintToPtr(1),
- EncoderUtilization: helper.UintToPtr(1),
- DecoderUtilization: helper.UintToPtr(1),
- TemperatureC: helper.UintToPtr(1),
- UsedMemoryMiB: helper.Uint64ToPtr(1),
- ECCErrorsL1Cache: helper.Uint64ToPtr(100),
- ECCErrorsL2Cache: helper.Uint64ToPtr(100),
- ECCErrorsDevice: helper.Uint64ToPtr(100),
- },
- },
- },
- } {
- actualResult := filterStatsByID(testCase.ProvidedStats, testCase.ProvidedIDs)
- require.New(t).Equal(testCase.ExpectedResult, actualResult)
- }
-}
-
-func TestStatsForItem(t *testing.T) {
- for _, testCase := range []struct {
- Name string
- Timestamp time.Time
- ItemStat *nvml.StatsData
- ExpectedResult *device.DeviceStats
- }{
- {
- Name: "All fields in ItemStat are not nil",
- Timestamp: time.Date(1974, time.May, 19, 1, 2, 3, 4, time.UTC),
- ItemStat: &nvml.StatsData{
- DeviceData: &nvml.DeviceData{
- UUID: "UUID1",
- DeviceName: helper.StringToPtr("DeviceName1"),
- MemoryMiB: helper.Uint64ToPtr(1),
- PowerW: helper.UintToPtr(1),
- BAR1MiB: helper.Uint64ToPtr(256),
- },
- PowerUsageW: helper.UintToPtr(1),
- GPUUtilization: helper.UintToPtr(1),
- MemoryUtilization: helper.UintToPtr(1),
- EncoderUtilization: helper.UintToPtr(1),
- DecoderUtilization: helper.UintToPtr(1),
- TemperatureC: helper.UintToPtr(1),
- UsedMemoryMiB: helper.Uint64ToPtr(1),
- BAR1UsedMiB: helper.Uint64ToPtr(1),
- ECCErrorsL1Cache: helper.Uint64ToPtr(100),
- ECCErrorsL2Cache: helper.Uint64ToPtr(100),
- ECCErrorsDevice: helper.Uint64ToPtr(100),
- },
- ExpectedResult: &device.DeviceStats{
- Summary: &structs.StatValue{
- Unit: MemoryStateUnit,
- Desc: MemoryStateDesc,
- IntNumeratorVal: helper.Int64ToPtr(1),
- IntDenominatorVal: helper.Int64ToPtr(1),
- },
- Stats: &structs.StatObject{
- Attributes: map[string]*structs.StatValue{
- PowerUsageAttr: {
- Unit: PowerUsageUnit,
- Desc: PowerUsageDesc,
- IntNumeratorVal: helper.Int64ToPtr(1),
- IntDenominatorVal: helper.Int64ToPtr(1),
- },
- GPUUtilizationAttr: {
- Unit: GPUUtilizationUnit,
- Desc: GPUUtilizationDesc,
- IntNumeratorVal: helper.Int64ToPtr(1),
- },
- MemoryUtilizationAttr: {
- Unit: MemoryUtilizationUnit,
- Desc: MemoryUtilizationDesc,
- IntNumeratorVal: helper.Int64ToPtr(1),
- },
- EncoderUtilizationAttr: {
- Unit: EncoderUtilizationUnit,
- Desc: EncoderUtilizationDesc,
- IntNumeratorVal: helper.Int64ToPtr(1),
- },
- DecoderUtilizationAttr: {
- Unit: DecoderUtilizationUnit,
- Desc: DecoderUtilizationDesc,
- IntNumeratorVal: helper.Int64ToPtr(1),
- },
- TemperatureAttr: {
- Unit: TemperatureUnit,
- Desc: TemperatureDesc,
- IntNumeratorVal: helper.Int64ToPtr(1),
- },
- MemoryStateAttr: {
- Unit: MemoryStateUnit,
- Desc: MemoryStateDesc,
- IntNumeratorVal: helper.Int64ToPtr(1),
- IntDenominatorVal: helper.Int64ToPtr(1),
- },
- BAR1StateAttr: {
- Unit: BAR1StateUnit,
- Desc: BAR1StateDesc,
- IntNumeratorVal: helper.Int64ToPtr(1),
- IntDenominatorVal: helper.Int64ToPtr(256),
- },
- ECCErrorsL1CacheAttr: {
- Unit: ECCErrorsL1CacheUnit,
- Desc: ECCErrorsL1CacheDesc,
- IntNumeratorVal: helper.Int64ToPtr(100),
- },
- ECCErrorsL2CacheAttr: {
- Unit: ECCErrorsL2CacheUnit,
- Desc: ECCErrorsL2CacheDesc,
- IntNumeratorVal: helper.Int64ToPtr(100),
- },
- ECCErrorsDeviceAttr: {
- Unit: ECCErrorsDeviceUnit,
- Desc: ECCErrorsDeviceDesc,
- IntNumeratorVal: helper.Int64ToPtr(100),
- },
- },
- },
- Timestamp: time.Date(1974, time.May, 19, 1, 2, 3, 4, time.UTC),
- },
- },
- {
- Name: "Power usage is nil",
- Timestamp: time.Date(1974, time.May, 19, 1, 2, 3, 4, time.UTC),
- ItemStat: &nvml.StatsData{
- DeviceData: &nvml.DeviceData{
- UUID: "UUID1",
- DeviceName: helper.StringToPtr("DeviceName1"),
- MemoryMiB: helper.Uint64ToPtr(1),
- PowerW: helper.UintToPtr(1),
- BAR1MiB: helper.Uint64ToPtr(256),
- },
- PowerUsageW: nil,
- GPUUtilization: helper.UintToPtr(1),
- MemoryUtilization: helper.UintToPtr(1),
- EncoderUtilization: helper.UintToPtr(1),
- DecoderUtilization: helper.UintToPtr(1),
- TemperatureC: helper.UintToPtr(1),
- UsedMemoryMiB: helper.Uint64ToPtr(1),
- BAR1UsedMiB: helper.Uint64ToPtr(1),
- ECCErrorsL1Cache: helper.Uint64ToPtr(100),
- ECCErrorsL2Cache: helper.Uint64ToPtr(100),
- ECCErrorsDevice: helper.Uint64ToPtr(100),
- },
- ExpectedResult: &device.DeviceStats{
- Summary: &structs.StatValue{
- Unit: MemoryStateUnit,
- Desc: MemoryStateDesc,
- IntNumeratorVal: helper.Int64ToPtr(1),
- IntDenominatorVal: helper.Int64ToPtr(1),
- },
- Stats: &structs.StatObject{
- Attributes: map[string]*structs.StatValue{
- PowerUsageAttr: {
- Unit: PowerUsageUnit,
- Desc: PowerUsageDesc,
- StringVal: helper.StringToPtr(notAvailable),
- },
- GPUUtilizationAttr: {
- Unit: GPUUtilizationUnit,
- Desc: GPUUtilizationDesc,
- IntNumeratorVal: helper.Int64ToPtr(1),
- },
- MemoryUtilizationAttr: {
- Unit: MemoryUtilizationUnit,
- Desc: MemoryUtilizationDesc,
- IntNumeratorVal: helper.Int64ToPtr(1),
- },
- EncoderUtilizationAttr: {
- Unit: EncoderUtilizationUnit,
- Desc: EncoderUtilizationDesc,
- IntNumeratorVal: helper.Int64ToPtr(1),
- },
- DecoderUtilizationAttr: {
- Unit: DecoderUtilizationUnit,
- Desc: DecoderUtilizationDesc,
- IntNumeratorVal: helper.Int64ToPtr(1),
- },
- TemperatureAttr: {
- Unit: TemperatureUnit,
- Desc: TemperatureDesc,
- IntNumeratorVal: helper.Int64ToPtr(1),
- },
- MemoryStateAttr: {
- Unit: MemoryStateUnit,
- Desc: MemoryStateDesc,
- IntNumeratorVal: helper.Int64ToPtr(1),
- IntDenominatorVal: helper.Int64ToPtr(1),
- },
- BAR1StateAttr: {
- Unit: BAR1StateUnit,
- Desc: BAR1StateDesc,
- IntNumeratorVal: helper.Int64ToPtr(1),
- IntDenominatorVal: helper.Int64ToPtr(256),
- },
- ECCErrorsL1CacheAttr: {
- Unit: ECCErrorsL1CacheUnit,
- Desc: ECCErrorsL1CacheDesc,
- IntNumeratorVal: helper.Int64ToPtr(100),
- },
- ECCErrorsL2CacheAttr: {
- Unit: ECCErrorsL2CacheUnit,
- Desc: ECCErrorsL2CacheDesc,
- IntNumeratorVal: helper.Int64ToPtr(100),
- },
- ECCErrorsDeviceAttr: {
- Unit: ECCErrorsDeviceUnit,
- Desc: ECCErrorsDeviceDesc,
- IntNumeratorVal: helper.Int64ToPtr(100),
- },
- },
- },
- Timestamp: time.Date(1974, time.May, 19, 1, 2, 3, 4, time.UTC),
- },
- },
- {
- Name: "PowerW is nil",
- Timestamp: time.Date(1974, time.May, 19, 1, 2, 3, 4, time.UTC),
- ItemStat: &nvml.StatsData{
- DeviceData: &nvml.DeviceData{
- UUID: "UUID1",
- DeviceName: helper.StringToPtr("DeviceName1"),
- MemoryMiB: helper.Uint64ToPtr(1),
- PowerW: nil,
- BAR1MiB: helper.Uint64ToPtr(256),
- },
- PowerUsageW: helper.UintToPtr(1),
- GPUUtilization: helper.UintToPtr(1),
- MemoryUtilization: helper.UintToPtr(1),
- EncoderUtilization: helper.UintToPtr(1),
- DecoderUtilization: helper.UintToPtr(1),
- TemperatureC: helper.UintToPtr(1),
- UsedMemoryMiB: helper.Uint64ToPtr(1),
- BAR1UsedMiB: helper.Uint64ToPtr(1),
- ECCErrorsL1Cache: helper.Uint64ToPtr(100),
- ECCErrorsL2Cache: helper.Uint64ToPtr(100),
- ECCErrorsDevice: helper.Uint64ToPtr(100),
- },
- ExpectedResult: &device.DeviceStats{
- Summary: &structs.StatValue{
- Unit: MemoryStateUnit,
- Desc: MemoryStateDesc,
- IntNumeratorVal: helper.Int64ToPtr(1),
- IntDenominatorVal: helper.Int64ToPtr(1),
- },
- Stats: &structs.StatObject{
- Attributes: map[string]*structs.StatValue{
- PowerUsageAttr: {
- Unit: PowerUsageUnit,
- Desc: PowerUsageDesc,
- StringVal: helper.StringToPtr(notAvailable),
- },
- GPUUtilizationAttr: {
- Unit: GPUUtilizationUnit,
- Desc: GPUUtilizationDesc,
- IntNumeratorVal: helper.Int64ToPtr(1),
- },
- MemoryUtilizationAttr: {
- Unit: MemoryUtilizationUnit,
- Desc: MemoryUtilizationDesc,
- IntNumeratorVal: helper.Int64ToPtr(1),
- },
- EncoderUtilizationAttr: {
- Unit: EncoderUtilizationUnit,
- Desc: EncoderUtilizationDesc,
- IntNumeratorVal: helper.Int64ToPtr(1),
- },
- DecoderUtilizationAttr: {
- Unit: DecoderUtilizationUnit,
- Desc: DecoderUtilizationDesc,
- IntNumeratorVal: helper.Int64ToPtr(1),
- },
- TemperatureAttr: {
- Unit: TemperatureUnit,
- Desc: TemperatureDesc,
- IntNumeratorVal: helper.Int64ToPtr(1),
- },
- MemoryStateAttr: {
- Unit: MemoryStateUnit,
- Desc: MemoryStateDesc,
- IntNumeratorVal: helper.Int64ToPtr(1),
- IntDenominatorVal: helper.Int64ToPtr(1),
- },
- BAR1StateAttr: {
- Unit: BAR1StateUnit,
- Desc: BAR1StateDesc,
- IntNumeratorVal: helper.Int64ToPtr(1),
- IntDenominatorVal: helper.Int64ToPtr(256),
- },
- ECCErrorsL1CacheAttr: {
- Unit: ECCErrorsL1CacheUnit,
- Desc: ECCErrorsL1CacheDesc,
- IntNumeratorVal: helper.Int64ToPtr(100),
- },
- ECCErrorsL2CacheAttr: {
- Unit: ECCErrorsL2CacheUnit,
- Desc: ECCErrorsL2CacheDesc,
- IntNumeratorVal: helper.Int64ToPtr(100),
- },
- ECCErrorsDeviceAttr: {
- Unit: ECCErrorsDeviceUnit,
- Desc: ECCErrorsDeviceDesc,
- IntNumeratorVal: helper.Int64ToPtr(100),
- },
- },
- },
- Timestamp: time.Date(1974, time.May, 19, 1, 2, 3, 4, time.UTC),
- },
- },
- {
- Name: "GPUUtilization is nil",
- Timestamp: time.Date(1974, time.May, 19, 1, 2, 3, 4, time.UTC),
- ItemStat: &nvml.StatsData{
- DeviceData: &nvml.DeviceData{
- UUID: "UUID1",
- DeviceName: helper.StringToPtr("DeviceName1"),
- MemoryMiB: helper.Uint64ToPtr(1),
- PowerW: helper.UintToPtr(1),
- BAR1MiB: helper.Uint64ToPtr(256),
- },
- PowerUsageW: helper.UintToPtr(1),
- GPUUtilization: nil,
- MemoryUtilization: helper.UintToPtr(1),
- EncoderUtilization: helper.UintToPtr(1),
- DecoderUtilization: helper.UintToPtr(1),
- TemperatureC: helper.UintToPtr(1),
- UsedMemoryMiB: helper.Uint64ToPtr(1),
- BAR1UsedMiB: helper.Uint64ToPtr(1),
- ECCErrorsL1Cache: helper.Uint64ToPtr(100),
- ECCErrorsL2Cache: helper.Uint64ToPtr(100),
- ECCErrorsDevice: helper.Uint64ToPtr(100),
- },
- ExpectedResult: &device.DeviceStats{
- Summary: &structs.StatValue{
- Unit: MemoryStateUnit,
- Desc: MemoryStateDesc,
- IntNumeratorVal: helper.Int64ToPtr(1),
- IntDenominatorVal: helper.Int64ToPtr(1),
- },
- Stats: &structs.StatObject{
- Attributes: map[string]*structs.StatValue{
- PowerUsageAttr: {
- Unit: PowerUsageUnit,
- Desc: PowerUsageDesc,
- IntNumeratorVal: helper.Int64ToPtr(1),
- IntDenominatorVal: helper.Int64ToPtr(1),
- },
- GPUUtilizationAttr: {
- Unit: GPUUtilizationUnit,
- Desc: GPUUtilizationDesc,
- StringVal: helper.StringToPtr(notAvailable),
- },
- MemoryUtilizationAttr: {
- Unit: MemoryUtilizationUnit,
- Desc: MemoryUtilizationDesc,
- IntNumeratorVal: helper.Int64ToPtr(1),
- },
- EncoderUtilizationAttr: {
- Unit: EncoderUtilizationUnit,
- Desc: EncoderUtilizationDesc,
- IntNumeratorVal: helper.Int64ToPtr(1),
- },
- DecoderUtilizationAttr: {
- Unit: DecoderUtilizationUnit,
- Desc: DecoderUtilizationDesc,
- IntNumeratorVal: helper.Int64ToPtr(1),
- },
- TemperatureAttr: {
- Unit: TemperatureUnit,
- Desc: TemperatureDesc,
- IntNumeratorVal: helper.Int64ToPtr(1),
- },
- MemoryStateAttr: {
- Unit: MemoryStateUnit,
- Desc: MemoryStateDesc,
- IntNumeratorVal: helper.Int64ToPtr(1),
- IntDenominatorVal: helper.Int64ToPtr(1),
- },
- BAR1StateAttr: {
- Unit: BAR1StateUnit,
- Desc: BAR1StateDesc,
- IntNumeratorVal: helper.Int64ToPtr(1),
- IntDenominatorVal: helper.Int64ToPtr(256),
- },
- ECCErrorsL1CacheAttr: {
- Unit: ECCErrorsL1CacheUnit,
- Desc: ECCErrorsL1CacheDesc,
- IntNumeratorVal: helper.Int64ToPtr(100),
- },
- ECCErrorsL2CacheAttr: {
- Unit: ECCErrorsL2CacheUnit,
- Desc: ECCErrorsL2CacheDesc,
- IntNumeratorVal: helper.Int64ToPtr(100),
- },
- ECCErrorsDeviceAttr: {
- Unit: ECCErrorsDeviceUnit,
- Desc: ECCErrorsDeviceDesc,
- IntNumeratorVal: helper.Int64ToPtr(100),
- },
- },
- },
- Timestamp: time.Date(1974, time.May, 19, 1, 2, 3, 4, time.UTC),
- },
- },
- {
- Name: "MemoryUtilization is nil",
- Timestamp: time.Date(1974, time.May, 19, 1, 2, 3, 4, time.UTC),
- ItemStat: &nvml.StatsData{
- DeviceData: &nvml.DeviceData{
- UUID: "UUID1",
- DeviceName: helper.StringToPtr("DeviceName1"),
- MemoryMiB: helper.Uint64ToPtr(1),
- PowerW: helper.UintToPtr(1),
- BAR1MiB: helper.Uint64ToPtr(256),
- },
- PowerUsageW: helper.UintToPtr(1),
- GPUUtilization: helper.UintToPtr(1),
- MemoryUtilization: nil,
- EncoderUtilization: helper.UintToPtr(1),
- DecoderUtilization: helper.UintToPtr(1),
- TemperatureC: helper.UintToPtr(1),
- UsedMemoryMiB: helper.Uint64ToPtr(1),
- BAR1UsedMiB: helper.Uint64ToPtr(1),
- ECCErrorsL1Cache: helper.Uint64ToPtr(100),
- ECCErrorsL2Cache: helper.Uint64ToPtr(100),
- ECCErrorsDevice: helper.Uint64ToPtr(100),
- },
- ExpectedResult: &device.DeviceStats{
- Summary: &structs.StatValue{
- Unit: MemoryStateUnit,
- Desc: MemoryStateDesc,
- IntNumeratorVal: helper.Int64ToPtr(1),
- IntDenominatorVal: helper.Int64ToPtr(1),
- },
- Stats: &structs.StatObject{
- Attributes: map[string]*structs.StatValue{
- PowerUsageAttr: {
- Unit: PowerUsageUnit,
- Desc: PowerUsageDesc,
- IntNumeratorVal: helper.Int64ToPtr(1),
- IntDenominatorVal: helper.Int64ToPtr(1),
- },
- GPUUtilizationAttr: {
- Unit: GPUUtilizationUnit,
- Desc: GPUUtilizationDesc,
- IntNumeratorVal: helper.Int64ToPtr(1),
- },
- MemoryUtilizationAttr: {
- Unit: MemoryUtilizationUnit,
- Desc: MemoryUtilizationDesc,
- StringVal: helper.StringToPtr(notAvailable),
- },
- EncoderUtilizationAttr: {
- Unit: EncoderUtilizationUnit,
- Desc: EncoderUtilizationDesc,
- IntNumeratorVal: helper.Int64ToPtr(1),
- },
- DecoderUtilizationAttr: {
- Unit: DecoderUtilizationUnit,
- Desc: DecoderUtilizationDesc,
- IntNumeratorVal: helper.Int64ToPtr(1),
- },
- TemperatureAttr: {
- Unit: TemperatureUnit,
- Desc: TemperatureDesc,
- IntNumeratorVal: helper.Int64ToPtr(1),
- },
- MemoryStateAttr: {
- Unit: MemoryStateUnit,
- Desc: MemoryStateDesc,
- IntNumeratorVal: helper.Int64ToPtr(1),
- IntDenominatorVal: helper.Int64ToPtr(1),
- },
- BAR1StateAttr: {
- Unit: BAR1StateUnit,
- Desc: BAR1StateDesc,
- IntNumeratorVal: helper.Int64ToPtr(1),
- IntDenominatorVal: helper.Int64ToPtr(256),
- },
- ECCErrorsL1CacheAttr: {
- Unit: ECCErrorsL1CacheUnit,
- Desc: ECCErrorsL1CacheDesc,
- IntNumeratorVal: helper.Int64ToPtr(100),
- },
- ECCErrorsL2CacheAttr: {
- Unit: ECCErrorsL2CacheUnit,
- Desc: ECCErrorsL2CacheDesc,
- IntNumeratorVal: helper.Int64ToPtr(100),
- },
- ECCErrorsDeviceAttr: {
- Unit: ECCErrorsDeviceUnit,
- Desc: ECCErrorsDeviceDesc,
- IntNumeratorVal: helper.Int64ToPtr(100),
- },
- },
- },
- Timestamp: time.Date(1974, time.May, 19, 1, 2, 3, 4, time.UTC),
- },
- },
- {
- Name: "EncoderUtilization is nil",
- Timestamp: time.Date(1974, time.May, 19, 1, 2, 3, 4, time.UTC),
- ItemStat: &nvml.StatsData{
- DeviceData: &nvml.DeviceData{
- UUID: "UUID1",
- DeviceName: helper.StringToPtr("DeviceName1"),
- MemoryMiB: helper.Uint64ToPtr(1),
- PowerW: helper.UintToPtr(1),
- BAR1MiB: helper.Uint64ToPtr(256),
- },
- PowerUsageW: helper.UintToPtr(1),
- GPUUtilization: helper.UintToPtr(1),
- MemoryUtilization: helper.UintToPtr(1),
- EncoderUtilization: nil,
- DecoderUtilization: helper.UintToPtr(1),
- TemperatureC: helper.UintToPtr(1),
- UsedMemoryMiB: helper.Uint64ToPtr(1),
- BAR1UsedMiB: helper.Uint64ToPtr(1),
- ECCErrorsL1Cache: helper.Uint64ToPtr(100),
- ECCErrorsL2Cache: helper.Uint64ToPtr(100),
- ECCErrorsDevice: helper.Uint64ToPtr(100),
- },
- ExpectedResult: &device.DeviceStats{
- Summary: &structs.StatValue{
- Unit: MemoryStateUnit,
- Desc: MemoryStateDesc,
- IntNumeratorVal: helper.Int64ToPtr(1),
- IntDenominatorVal: helper.Int64ToPtr(1),
- },
- Stats: &structs.StatObject{
- Attributes: map[string]*structs.StatValue{
- PowerUsageAttr: {
- Unit: PowerUsageUnit,
- Desc: PowerUsageDesc,
- IntNumeratorVal: helper.Int64ToPtr(1),
- IntDenominatorVal: helper.Int64ToPtr(1),
- },
- GPUUtilizationAttr: {
- Unit: GPUUtilizationUnit,
- Desc: GPUUtilizationDesc,
- IntNumeratorVal: helper.Int64ToPtr(1),
- },
- MemoryUtilizationAttr: {
- Unit: MemoryUtilizationUnit,
- Desc: MemoryUtilizationDesc,
- IntNumeratorVal: helper.Int64ToPtr(1),
- },
- EncoderUtilizationAttr: {
- Unit: EncoderUtilizationUnit,
- Desc: EncoderUtilizationDesc,
- StringVal: helper.StringToPtr(notAvailable),
- },
- DecoderUtilizationAttr: {
- Unit: DecoderUtilizationUnit,
- Desc: DecoderUtilizationDesc,
- IntNumeratorVal: helper.Int64ToPtr(1),
- },
- TemperatureAttr: {
- Unit: TemperatureUnit,
- Desc: TemperatureDesc,
- IntNumeratorVal: helper.Int64ToPtr(1),
- },
- MemoryStateAttr: {
- Unit: MemoryStateUnit,
- Desc: MemoryStateDesc,
- IntNumeratorVal: helper.Int64ToPtr(1),
- IntDenominatorVal: helper.Int64ToPtr(1),
- },
- BAR1StateAttr: {
- Unit: BAR1StateUnit,
- Desc: BAR1StateDesc,
- IntNumeratorVal: helper.Int64ToPtr(1),
- IntDenominatorVal: helper.Int64ToPtr(256),
- },
- ECCErrorsL1CacheAttr: {
- Unit: ECCErrorsL1CacheUnit,
- Desc: ECCErrorsL1CacheDesc,
- IntNumeratorVal: helper.Int64ToPtr(100),
- },
- ECCErrorsL2CacheAttr: {
- Unit: ECCErrorsL2CacheUnit,
- Desc: ECCErrorsL2CacheDesc,
- IntNumeratorVal: helper.Int64ToPtr(100),
- },
- ECCErrorsDeviceAttr: {
- Unit: ECCErrorsDeviceUnit,
- Desc: ECCErrorsDeviceDesc,
- IntNumeratorVal: helper.Int64ToPtr(100),
- },
- },
- },
- Timestamp: time.Date(1974, time.May, 19, 1, 2, 3, 4, time.UTC),
- },
- },
- {
- Name: "DecoderUtilization is nil",
- Timestamp: time.Date(1974, time.May, 19, 1, 2, 3, 4, time.UTC),
- ItemStat: &nvml.StatsData{
- DeviceData: &nvml.DeviceData{
- UUID: "UUID1",
- DeviceName: helper.StringToPtr("DeviceName1"),
- MemoryMiB: helper.Uint64ToPtr(1),
- PowerW: helper.UintToPtr(1),
- BAR1MiB: helper.Uint64ToPtr(256),
- },
- PowerUsageW: helper.UintToPtr(1),
- GPUUtilization: helper.UintToPtr(1),
- MemoryUtilization: helper.UintToPtr(1),
- EncoderUtilization: helper.UintToPtr(1),
- DecoderUtilization: nil,
- TemperatureC: helper.UintToPtr(1),
- UsedMemoryMiB: helper.Uint64ToPtr(1),
- BAR1UsedMiB: helper.Uint64ToPtr(1),
- ECCErrorsL1Cache: helper.Uint64ToPtr(100),
- ECCErrorsL2Cache: helper.Uint64ToPtr(100),
- ECCErrorsDevice: helper.Uint64ToPtr(100),
- },
- ExpectedResult: &device.DeviceStats{
- Summary: &structs.StatValue{
- Unit: MemoryStateUnit,
- Desc: MemoryStateDesc,
- IntNumeratorVal: helper.Int64ToPtr(1),
- IntDenominatorVal: helper.Int64ToPtr(1),
- },
- Stats: &structs.StatObject{
- Attributes: map[string]*structs.StatValue{
- PowerUsageAttr: {
- Unit: PowerUsageUnit,
- Desc: PowerUsageDesc,
- IntNumeratorVal: helper.Int64ToPtr(1),
- IntDenominatorVal: helper.Int64ToPtr(1),
- },
- GPUUtilizationAttr: {
- Unit: GPUUtilizationUnit,
- Desc: GPUUtilizationDesc,
- IntNumeratorVal: helper.Int64ToPtr(1),
- },
- MemoryUtilizationAttr: {
- Unit: MemoryUtilizationUnit,
- Desc: MemoryUtilizationDesc,
- IntNumeratorVal: helper.Int64ToPtr(1),
- },
- EncoderUtilizationAttr: {
- Unit: EncoderUtilizationUnit,
- Desc: EncoderUtilizationDesc,
- IntNumeratorVal: helper.Int64ToPtr(1),
- },
- DecoderUtilizationAttr: {
- Unit: DecoderUtilizationUnit,
- Desc: DecoderUtilizationDesc,
- StringVal: helper.StringToPtr(notAvailable),
- },
- TemperatureAttr: {
- Unit: TemperatureUnit,
- Desc: TemperatureDesc,
- IntNumeratorVal: helper.Int64ToPtr(1),
- },
- MemoryStateAttr: {
- Unit: MemoryStateUnit,
- Desc: MemoryStateDesc,
- IntNumeratorVal: helper.Int64ToPtr(1),
- IntDenominatorVal: helper.Int64ToPtr(1),
- },
- BAR1StateAttr: {
- Unit: BAR1StateUnit,
- Desc: BAR1StateDesc,
- IntNumeratorVal: helper.Int64ToPtr(1),
- IntDenominatorVal: helper.Int64ToPtr(256),
- },
- ECCErrorsL1CacheAttr: {
- Unit: ECCErrorsL1CacheUnit,
- Desc: ECCErrorsL1CacheDesc,
- IntNumeratorVal: helper.Int64ToPtr(100),
- },
- ECCErrorsL2CacheAttr: {
- Unit: ECCErrorsL2CacheUnit,
- Desc: ECCErrorsL2CacheDesc,
- IntNumeratorVal: helper.Int64ToPtr(100),
- },
- ECCErrorsDeviceAttr: {
- Unit: ECCErrorsDeviceUnit,
- Desc: ECCErrorsDeviceDesc,
- IntNumeratorVal: helper.Int64ToPtr(100),
- },
- },
- },
- Timestamp: time.Date(1974, time.May, 19, 1, 2, 3, 4, time.UTC),
- },
- },
- {
- Name: "Temperature is nil",
- Timestamp: time.Date(1974, time.May, 19, 1, 2, 3, 4, time.UTC),
- ItemStat: &nvml.StatsData{
- DeviceData: &nvml.DeviceData{
- UUID: "UUID1",
- DeviceName: helper.StringToPtr("DeviceName1"),
- MemoryMiB: helper.Uint64ToPtr(1),
- PowerW: helper.UintToPtr(1),
- BAR1MiB: helper.Uint64ToPtr(256),
- },
- PowerUsageW: helper.UintToPtr(1),
- GPUUtilization: helper.UintToPtr(1),
- MemoryUtilization: helper.UintToPtr(1),
- EncoderUtilization: helper.UintToPtr(1),
- DecoderUtilization: helper.UintToPtr(1),
- TemperatureC: nil,
- UsedMemoryMiB: helper.Uint64ToPtr(1),
- BAR1UsedMiB: helper.Uint64ToPtr(1),
- ECCErrorsL1Cache: helper.Uint64ToPtr(100),
- ECCErrorsL2Cache: helper.Uint64ToPtr(100),
- ECCErrorsDevice: helper.Uint64ToPtr(100),
- },
- ExpectedResult: &device.DeviceStats{
- Summary: &structs.StatValue{
- Unit: MemoryStateUnit,
- Desc: MemoryStateDesc,
- IntNumeratorVal: helper.Int64ToPtr(1),
- IntDenominatorVal: helper.Int64ToPtr(1),
- },
- Stats: &structs.StatObject{
- Attributes: map[string]*structs.StatValue{
- PowerUsageAttr: {
- Unit: PowerUsageUnit,
- Desc: PowerUsageDesc,
- IntNumeratorVal: helper.Int64ToPtr(1),
- IntDenominatorVal: helper.Int64ToPtr(1),
- },
- GPUUtilizationAttr: {
- Unit: GPUUtilizationUnit,
- Desc: GPUUtilizationDesc,
- IntNumeratorVal: helper.Int64ToPtr(1),
- },
- MemoryUtilizationAttr: {
- Unit: MemoryUtilizationUnit,
- Desc: MemoryUtilizationDesc,
- IntNumeratorVal: helper.Int64ToPtr(1),
- },
- EncoderUtilizationAttr: {
- Unit: EncoderUtilizationUnit,
- Desc: EncoderUtilizationDesc,
- IntNumeratorVal: helper.Int64ToPtr(1),
- },
- DecoderUtilizationAttr: {
- Unit: DecoderUtilizationUnit,
- Desc: DecoderUtilizationDesc,
- IntNumeratorVal: helper.Int64ToPtr(1),
- },
- TemperatureAttr: {
- Unit: TemperatureUnit,
- Desc: TemperatureDesc,
- StringVal: helper.StringToPtr(notAvailable),
- },
- MemoryStateAttr: {
- Unit: MemoryStateUnit,
- Desc: MemoryStateDesc,
- IntNumeratorVal: helper.Int64ToPtr(1),
- IntDenominatorVal: helper.Int64ToPtr(1),
- },
- BAR1StateAttr: {
- Unit: BAR1StateUnit,
- Desc: BAR1StateDesc,
- IntNumeratorVal: helper.Int64ToPtr(1),
- IntDenominatorVal: helper.Int64ToPtr(256),
- },
- ECCErrorsL1CacheAttr: {
- Unit: ECCErrorsL1CacheUnit,
- Desc: ECCErrorsL1CacheDesc,
- IntNumeratorVal: helper.Int64ToPtr(100),
- },
- ECCErrorsL2CacheAttr: {
- Unit: ECCErrorsL2CacheUnit,
- Desc: ECCErrorsL2CacheDesc,
- IntNumeratorVal: helper.Int64ToPtr(100),
- },
- ECCErrorsDeviceAttr: {
- Unit: ECCErrorsDeviceUnit,
- Desc: ECCErrorsDeviceDesc,
- IntNumeratorVal: helper.Int64ToPtr(100),
- },
- },
- },
- Timestamp: time.Date(1974, time.May, 19, 1, 2, 3, 4, time.UTC),
- },
- },
- {
- Name: "UsedMemoryMiB is nil",
- Timestamp: time.Date(1974, time.May, 19, 1, 2, 3, 4, time.UTC),
- ItemStat: &nvml.StatsData{
- DeviceData: &nvml.DeviceData{
- UUID: "UUID1",
- DeviceName: helper.StringToPtr("DeviceName1"),
- MemoryMiB: helper.Uint64ToPtr(1),
- PowerW: helper.UintToPtr(1),
- BAR1MiB: helper.Uint64ToPtr(256),
- },
- PowerUsageW: helper.UintToPtr(1),
- GPUUtilization: helper.UintToPtr(1),
- MemoryUtilization: helper.UintToPtr(1),
- EncoderUtilization: helper.UintToPtr(1),
- DecoderUtilization: helper.UintToPtr(1),
- TemperatureC: helper.UintToPtr(1),
- UsedMemoryMiB: nil,
- BAR1UsedMiB: helper.Uint64ToPtr(1),
- ECCErrorsL1Cache: helper.Uint64ToPtr(100),
- ECCErrorsL2Cache: helper.Uint64ToPtr(100),
- ECCErrorsDevice: helper.Uint64ToPtr(100),
- },
- ExpectedResult: &device.DeviceStats{
- Summary: &structs.StatValue{
- Unit: MemoryStateUnit,
- Desc: MemoryStateDesc,
- StringVal: helper.StringToPtr(notAvailable),
- },
- Stats: &structs.StatObject{
- Attributes: map[string]*structs.StatValue{
- PowerUsageAttr: {
- Unit: PowerUsageUnit,
- Desc: PowerUsageDesc,
- IntNumeratorVal: helper.Int64ToPtr(1),
- IntDenominatorVal: helper.Int64ToPtr(1),
- },
- GPUUtilizationAttr: {
- Unit: GPUUtilizationUnit,
- Desc: GPUUtilizationDesc,
- IntNumeratorVal: helper.Int64ToPtr(1),
- },
- MemoryUtilizationAttr: {
- Unit: MemoryUtilizationUnit,
- Desc: MemoryUtilizationDesc,
- IntNumeratorVal: helper.Int64ToPtr(1),
- },
- EncoderUtilizationAttr: {
- Unit: EncoderUtilizationUnit,
- Desc: EncoderUtilizationDesc,
- IntNumeratorVal: helper.Int64ToPtr(1),
- },
- DecoderUtilizationAttr: {
- Unit: DecoderUtilizationUnit,
- Desc: DecoderUtilizationDesc,
- IntNumeratorVal: helper.Int64ToPtr(1),
- },
- TemperatureAttr: {
- Unit: TemperatureUnit,
- Desc: TemperatureDesc,
- IntNumeratorVal: helper.Int64ToPtr(1),
- },
- MemoryStateAttr: {
- Unit: MemoryStateUnit,
- Desc: MemoryStateDesc,
- StringVal: helper.StringToPtr(notAvailable),
- },
- BAR1StateAttr: {
- Unit: BAR1StateUnit,
- Desc: BAR1StateDesc,
- IntNumeratorVal: helper.Int64ToPtr(1),
- IntDenominatorVal: helper.Int64ToPtr(256),
- },
- ECCErrorsL1CacheAttr: {
- Unit: ECCErrorsL1CacheUnit,
- Desc: ECCErrorsL1CacheDesc,
- IntNumeratorVal: helper.Int64ToPtr(100),
- },
- ECCErrorsL2CacheAttr: {
- Unit: ECCErrorsL2CacheUnit,
- Desc: ECCErrorsL2CacheDesc,
- IntNumeratorVal: helper.Int64ToPtr(100),
- },
- ECCErrorsDeviceAttr: {
- Unit: ECCErrorsDeviceUnit,
- Desc: ECCErrorsDeviceDesc,
- IntNumeratorVal: helper.Int64ToPtr(100),
- },
- },
- },
- Timestamp: time.Date(1974, time.May, 19, 1, 2, 3, 4, time.UTC),
- },
- },
- {
- Name: "MemoryMiB is nil",
- Timestamp: time.Date(1974, time.May, 19, 1, 2, 3, 4, time.UTC),
- ItemStat: &nvml.StatsData{
- DeviceData: &nvml.DeviceData{
- UUID: "UUID1",
- DeviceName: helper.StringToPtr("DeviceName1"),
- MemoryMiB: nil,
- PowerW: helper.UintToPtr(1),
- BAR1MiB: helper.Uint64ToPtr(256),
- },
- PowerUsageW: helper.UintToPtr(1),
- GPUUtilization: helper.UintToPtr(1),
- MemoryUtilization: helper.UintToPtr(1),
- EncoderUtilization: helper.UintToPtr(1),
- DecoderUtilization: helper.UintToPtr(1),
- TemperatureC: helper.UintToPtr(1),
- UsedMemoryMiB: helper.Uint64ToPtr(1),
- BAR1UsedMiB: helper.Uint64ToPtr(1),
- ECCErrorsL1Cache: helper.Uint64ToPtr(100),
- ECCErrorsL2Cache: helper.Uint64ToPtr(100),
- ECCErrorsDevice: helper.Uint64ToPtr(100),
- },
- ExpectedResult: &device.DeviceStats{
- Summary: &structs.StatValue{
- Unit: MemoryStateUnit,
- Desc: MemoryStateDesc,
- StringVal: helper.StringToPtr(notAvailable),
- },
- Stats: &structs.StatObject{
- Attributes: map[string]*structs.StatValue{
- PowerUsageAttr: {
- Unit: PowerUsageUnit,
- Desc: PowerUsageDesc,
- IntNumeratorVal: helper.Int64ToPtr(1),
- IntDenominatorVal: helper.Int64ToPtr(1),
- },
- GPUUtilizationAttr: {
- Unit: GPUUtilizationUnit,
- Desc: GPUUtilizationDesc,
- IntNumeratorVal: helper.Int64ToPtr(1),
- },
- MemoryUtilizationAttr: {
- Unit: MemoryUtilizationUnit,
- Desc: MemoryUtilizationDesc,
- IntNumeratorVal: helper.Int64ToPtr(1),
- },
- EncoderUtilizationAttr: {
- Unit: EncoderUtilizationUnit,
- Desc: EncoderUtilizationDesc,
- IntNumeratorVal: helper.Int64ToPtr(1),
- },
- DecoderUtilizationAttr: {
- Unit: DecoderUtilizationUnit,
- Desc: DecoderUtilizationDesc,
- IntNumeratorVal: helper.Int64ToPtr(1),
- },
- TemperatureAttr: {
- Unit: TemperatureUnit,
- Desc: TemperatureDesc,
- IntNumeratorVal: helper.Int64ToPtr(1),
- },
- MemoryStateAttr: {
- Unit: MemoryStateUnit,
- Desc: MemoryStateDesc,
- StringVal: helper.StringToPtr(notAvailable),
- },
- BAR1StateAttr: {
- Unit: BAR1StateUnit,
- Desc: BAR1StateDesc,
- IntNumeratorVal: helper.Int64ToPtr(1),
- IntDenominatorVal: helper.Int64ToPtr(256),
- },
- ECCErrorsL1CacheAttr: {
- Unit: ECCErrorsL1CacheUnit,
- Desc: ECCErrorsL1CacheDesc,
- IntNumeratorVal: helper.Int64ToPtr(100),
- },
- ECCErrorsL2CacheAttr: {
- Unit: ECCErrorsL2CacheUnit,
- Desc: ECCErrorsL2CacheDesc,
- IntNumeratorVal: helper.Int64ToPtr(100),
- },
- ECCErrorsDeviceAttr: {
- Unit: ECCErrorsDeviceUnit,
- Desc: ECCErrorsDeviceDesc,
- IntNumeratorVal: helper.Int64ToPtr(100),
- },
- },
- },
- Timestamp: time.Date(1974, time.May, 19, 1, 2, 3, 4, time.UTC),
- },
- },
- {
- Name: "BAR1UsedMiB is nil",
- Timestamp: time.Date(1974, time.May, 19, 1, 2, 3, 4, time.UTC),
- ItemStat: &nvml.StatsData{
- DeviceData: &nvml.DeviceData{
- UUID: "UUID1",
- DeviceName: helper.StringToPtr("DeviceName1"),
- MemoryMiB: helper.Uint64ToPtr(1),
- PowerW: helper.UintToPtr(1),
- BAR1MiB: helper.Uint64ToPtr(256),
- },
- PowerUsageW: helper.UintToPtr(1),
- GPUUtilization: helper.UintToPtr(1),
- MemoryUtilization: helper.UintToPtr(1),
- EncoderUtilization: helper.UintToPtr(1),
- DecoderUtilization: helper.UintToPtr(1),
- TemperatureC: helper.UintToPtr(1),
- UsedMemoryMiB: helper.Uint64ToPtr(1),
- BAR1UsedMiB: nil,
- ECCErrorsL1Cache: helper.Uint64ToPtr(100),
- ECCErrorsL2Cache: helper.Uint64ToPtr(100),
- ECCErrorsDevice: helper.Uint64ToPtr(100),
- },
- ExpectedResult: &device.DeviceStats{
- Summary: &structs.StatValue{
- Unit: MemoryStateUnit,
- Desc: MemoryStateDesc,
- IntNumeratorVal: helper.Int64ToPtr(1),
- IntDenominatorVal: helper.Int64ToPtr(1),
- },
- Stats: &structs.StatObject{
- Attributes: map[string]*structs.StatValue{
- PowerUsageAttr: {
- Unit: PowerUsageUnit,
- Desc: PowerUsageDesc,
- IntNumeratorVal: helper.Int64ToPtr(1),
- IntDenominatorVal: helper.Int64ToPtr(1),
- },
- GPUUtilizationAttr: {
- Unit: GPUUtilizationUnit,
- Desc: GPUUtilizationDesc,
- IntNumeratorVal: helper.Int64ToPtr(1),
- },
- MemoryUtilizationAttr: {
- Unit: MemoryUtilizationUnit,
- Desc: MemoryUtilizationDesc,
- IntNumeratorVal: helper.Int64ToPtr(1),
- },
- EncoderUtilizationAttr: {
- Unit: EncoderUtilizationUnit,
- Desc: EncoderUtilizationDesc,
- IntNumeratorVal: helper.Int64ToPtr(1),
- },
- DecoderUtilizationAttr: {
- Unit: DecoderUtilizationUnit,
- Desc: DecoderUtilizationDesc,
- IntNumeratorVal: helper.Int64ToPtr(1),
- },
- TemperatureAttr: {
- Unit: TemperatureUnit,
- Desc: TemperatureDesc,
- IntNumeratorVal: helper.Int64ToPtr(1),
- },
- MemoryStateAttr: {
- Unit: MemoryStateUnit,
- Desc: MemoryStateDesc,
- IntNumeratorVal: helper.Int64ToPtr(1),
- IntDenominatorVal: helper.Int64ToPtr(1),
- },
- BAR1StateAttr: {
- Unit: BAR1StateUnit,
- Desc: BAR1StateDesc,
- StringVal: helper.StringToPtr(notAvailable),
- },
- ECCErrorsL1CacheAttr: {
- Unit: ECCErrorsL1CacheUnit,
- Desc: ECCErrorsL1CacheDesc,
- IntNumeratorVal: helper.Int64ToPtr(100),
- },
- ECCErrorsL2CacheAttr: {
- Unit: ECCErrorsL2CacheUnit,
- Desc: ECCErrorsL2CacheDesc,
- IntNumeratorVal: helper.Int64ToPtr(100),
- },
- ECCErrorsDeviceAttr: {
- Unit: ECCErrorsDeviceUnit,
- Desc: ECCErrorsDeviceDesc,
- IntNumeratorVal: helper.Int64ToPtr(100),
- },
- },
- },
- Timestamp: time.Date(1974, time.May, 19, 1, 2, 3, 4, time.UTC),
- },
- },
- {
- Name: "BAR1MiB is nil",
- Timestamp: time.Date(1974, time.May, 19, 1, 2, 3, 4, time.UTC),
- ItemStat: &nvml.StatsData{
- DeviceData: &nvml.DeviceData{
- UUID: "UUID1",
- DeviceName: helper.StringToPtr("DeviceName1"),
- MemoryMiB: helper.Uint64ToPtr(1),
- PowerW: helper.UintToPtr(1),
- BAR1MiB: nil,
- },
- PowerUsageW: helper.UintToPtr(1),
- GPUUtilization: helper.UintToPtr(1),
- MemoryUtilization: helper.UintToPtr(1),
- EncoderUtilization: helper.UintToPtr(1),
- DecoderUtilization: helper.UintToPtr(1),
- TemperatureC: helper.UintToPtr(1),
- UsedMemoryMiB: helper.Uint64ToPtr(1),
- BAR1UsedMiB: helper.Uint64ToPtr(1),
- ECCErrorsL1Cache: helper.Uint64ToPtr(100),
- ECCErrorsL2Cache: helper.Uint64ToPtr(100),
- ECCErrorsDevice: helper.Uint64ToPtr(100),
- },
- ExpectedResult: &device.DeviceStats{
- Summary: &structs.StatValue{
- Unit: MemoryStateUnit,
- Desc: MemoryStateDesc,
- IntNumeratorVal: helper.Int64ToPtr(1),
- IntDenominatorVal: helper.Int64ToPtr(1),
- },
- Stats: &structs.StatObject{
- Attributes: map[string]*structs.StatValue{
- PowerUsageAttr: {
- Unit: PowerUsageUnit,
- Desc: PowerUsageDesc,
- IntNumeratorVal: helper.Int64ToPtr(1),
- IntDenominatorVal: helper.Int64ToPtr(1),
- },
- GPUUtilizationAttr: {
- Unit: GPUUtilizationUnit,
- Desc: GPUUtilizationDesc,
- IntNumeratorVal: helper.Int64ToPtr(1),
- },
- MemoryUtilizationAttr: {
- Unit: MemoryUtilizationUnit,
- Desc: MemoryUtilizationDesc,
- IntNumeratorVal: helper.Int64ToPtr(1),
- },
- EncoderUtilizationAttr: {
- Unit: EncoderUtilizationUnit,
- Desc: EncoderUtilizationDesc,
- IntNumeratorVal: helper.Int64ToPtr(1),
- },
- DecoderUtilizationAttr: {
- Unit: DecoderUtilizationUnit,
- Desc: DecoderUtilizationDesc,
- IntNumeratorVal: helper.Int64ToPtr(1),
- },
- TemperatureAttr: {
- Unit: TemperatureUnit,
- Desc: TemperatureDesc,
- IntNumeratorVal: helper.Int64ToPtr(1),
- },
- MemoryStateAttr: {
- Unit: MemoryStateUnit,
- Desc: MemoryStateDesc,
- IntNumeratorVal: helper.Int64ToPtr(1),
- IntDenominatorVal: helper.Int64ToPtr(1),
- },
- BAR1StateAttr: {
- Unit: BAR1StateUnit,
- Desc: BAR1StateDesc,
- StringVal: helper.StringToPtr(notAvailable),
- },
- ECCErrorsL1CacheAttr: {
- Unit: ECCErrorsL1CacheUnit,
- Desc: ECCErrorsL1CacheDesc,
- IntNumeratorVal: helper.Int64ToPtr(100),
- },
- ECCErrorsL2CacheAttr: {
- Unit: ECCErrorsL2CacheUnit,
- Desc: ECCErrorsL2CacheDesc,
- IntNumeratorVal: helper.Int64ToPtr(100),
- },
- ECCErrorsDeviceAttr: {
- Unit: ECCErrorsDeviceUnit,
- Desc: ECCErrorsDeviceDesc,
- IntNumeratorVal: helper.Int64ToPtr(100),
- },
- },
- },
- Timestamp: time.Date(1974, time.May, 19, 1, 2, 3, 4, time.UTC),
- },
- },
- {
- Name: "ECCErrorsL1Cache is nil",
- Timestamp: time.Date(1974, time.May, 19, 1, 2, 3, 4, time.UTC),
- ItemStat: &nvml.StatsData{
- DeviceData: &nvml.DeviceData{
- UUID: "UUID1",
- DeviceName: helper.StringToPtr("DeviceName1"),
- MemoryMiB: helper.Uint64ToPtr(1),
- PowerW: helper.UintToPtr(1),
- BAR1MiB: helper.Uint64ToPtr(256),
- },
- PowerUsageW: helper.UintToPtr(1),
- GPUUtilization: helper.UintToPtr(1),
- MemoryUtilization: helper.UintToPtr(1),
- EncoderUtilization: helper.UintToPtr(1),
- DecoderUtilization: helper.UintToPtr(1),
- TemperatureC: helper.UintToPtr(1),
- UsedMemoryMiB: helper.Uint64ToPtr(1),
- BAR1UsedMiB: helper.Uint64ToPtr(1),
- ECCErrorsL1Cache: nil,
- ECCErrorsL2Cache: helper.Uint64ToPtr(100),
- ECCErrorsDevice: helper.Uint64ToPtr(100),
- },
- ExpectedResult: &device.DeviceStats{
- Summary: &structs.StatValue{
- Unit: MemoryStateUnit,
- Desc: MemoryStateDesc,
- IntNumeratorVal: helper.Int64ToPtr(1),
- IntDenominatorVal: helper.Int64ToPtr(1),
- },
- Stats: &structs.StatObject{
- Attributes: map[string]*structs.StatValue{
- PowerUsageAttr: {
- Unit: PowerUsageUnit,
- Desc: PowerUsageDesc,
- IntNumeratorVal: helper.Int64ToPtr(1),
- IntDenominatorVal: helper.Int64ToPtr(1),
- },
- GPUUtilizationAttr: {
- Unit: GPUUtilizationUnit,
- Desc: GPUUtilizationDesc,
- IntNumeratorVal: helper.Int64ToPtr(1),
- },
- MemoryUtilizationAttr: {
- Unit: MemoryUtilizationUnit,
- Desc: MemoryUtilizationDesc,
- IntNumeratorVal: helper.Int64ToPtr(1),
- },
- EncoderUtilizationAttr: {
- Unit: EncoderUtilizationUnit,
- Desc: EncoderUtilizationDesc,
- IntNumeratorVal: helper.Int64ToPtr(1),
- },
- DecoderUtilizationAttr: {
- Unit: DecoderUtilizationUnit,
- Desc: DecoderUtilizationDesc,
- IntNumeratorVal: helper.Int64ToPtr(1),
- },
- TemperatureAttr: {
- Unit: TemperatureUnit,
- Desc: TemperatureDesc,
- IntNumeratorVal: helper.Int64ToPtr(1),
- },
- MemoryStateAttr: {
- Unit: MemoryStateUnit,
- Desc: MemoryStateDesc,
- IntNumeratorVal: helper.Int64ToPtr(1),
- IntDenominatorVal: helper.Int64ToPtr(1),
- },
- BAR1StateAttr: {
- Unit: BAR1StateUnit,
- Desc: BAR1StateDesc,
- IntNumeratorVal: helper.Int64ToPtr(1),
- IntDenominatorVal: helper.Int64ToPtr(256),
- },
- ECCErrorsL1CacheAttr: {
- Unit: ECCErrorsL1CacheUnit,
- Desc: ECCErrorsL1CacheDesc,
- StringVal: helper.StringToPtr(notAvailable),
- },
- ECCErrorsL2CacheAttr: {
- Unit: ECCErrorsL2CacheUnit,
- Desc: ECCErrorsL2CacheDesc,
- IntNumeratorVal: helper.Int64ToPtr(100),
- },
- ECCErrorsDeviceAttr: {
- Unit: ECCErrorsDeviceUnit,
- Desc: ECCErrorsDeviceDesc,
- IntNumeratorVal: helper.Int64ToPtr(100),
- },
- },
- },
- Timestamp: time.Date(1974, time.May, 19, 1, 2, 3, 4, time.UTC),
- },
- },
- {
- Name: "ECCErrorsL2Cache is nil",
- Timestamp: time.Date(1974, time.May, 19, 1, 2, 3, 4, time.UTC),
- ItemStat: &nvml.StatsData{
- DeviceData: &nvml.DeviceData{
- UUID: "UUID1",
- DeviceName: helper.StringToPtr("DeviceName1"),
- MemoryMiB: helper.Uint64ToPtr(1),
- PowerW: helper.UintToPtr(1),
- BAR1MiB: helper.Uint64ToPtr(256),
- },
- PowerUsageW: helper.UintToPtr(1),
- GPUUtilization: helper.UintToPtr(1),
- MemoryUtilization: helper.UintToPtr(1),
- EncoderUtilization: helper.UintToPtr(1),
- DecoderUtilization: helper.UintToPtr(1),
- TemperatureC: helper.UintToPtr(1),
- UsedMemoryMiB: helper.Uint64ToPtr(1),
- BAR1UsedMiB: helper.Uint64ToPtr(1),
- ECCErrorsL1Cache: helper.Uint64ToPtr(100),
- ECCErrorsL2Cache: nil,
- ECCErrorsDevice: helper.Uint64ToPtr(100),
- },
- ExpectedResult: &device.DeviceStats{
- Summary: &structs.StatValue{
- Unit: MemoryStateUnit,
- Desc: MemoryStateDesc,
- IntNumeratorVal: helper.Int64ToPtr(1),
- IntDenominatorVal: helper.Int64ToPtr(1),
- },
- Stats: &structs.StatObject{
- Attributes: map[string]*structs.StatValue{
- PowerUsageAttr: {
- Unit: PowerUsageUnit,
- Desc: PowerUsageDesc,
- IntNumeratorVal: helper.Int64ToPtr(1),
- IntDenominatorVal: helper.Int64ToPtr(1),
- },
- GPUUtilizationAttr: {
- Unit: GPUUtilizationUnit,
- Desc: GPUUtilizationDesc,
- IntNumeratorVal: helper.Int64ToPtr(1),
- },
- MemoryUtilizationAttr: {
- Unit: MemoryUtilizationUnit,
- Desc: MemoryUtilizationDesc,
- IntNumeratorVal: helper.Int64ToPtr(1),
- },
- EncoderUtilizationAttr: {
- Unit: EncoderUtilizationUnit,
- Desc: EncoderUtilizationDesc,
- IntNumeratorVal: helper.Int64ToPtr(1),
- },
- DecoderUtilizationAttr: {
- Unit: DecoderUtilizationUnit,
- Desc: DecoderUtilizationDesc,
- IntNumeratorVal: helper.Int64ToPtr(1),
- },
- TemperatureAttr: {
- Unit: TemperatureUnit,
- Desc: TemperatureDesc,
- IntNumeratorVal: helper.Int64ToPtr(1),
- },
- MemoryStateAttr: {
- Unit: MemoryStateUnit,
- Desc: MemoryStateDesc,
- IntNumeratorVal: helper.Int64ToPtr(1),
- IntDenominatorVal: helper.Int64ToPtr(1),
- },
- BAR1StateAttr: {
- Unit: BAR1StateUnit,
- Desc: BAR1StateDesc,
- IntNumeratorVal: helper.Int64ToPtr(1),
- IntDenominatorVal: helper.Int64ToPtr(256),
- },
- ECCErrorsL1CacheAttr: {
- Unit: ECCErrorsL1CacheUnit,
- Desc: ECCErrorsL1CacheDesc,
- IntNumeratorVal: helper.Int64ToPtr(100),
- },
- ECCErrorsL2CacheAttr: {
- Unit: ECCErrorsL2CacheUnit,
- Desc: ECCErrorsL2CacheDesc,
- StringVal: helper.StringToPtr(notAvailable),
- },
- ECCErrorsDeviceAttr: {
- Unit: ECCErrorsDeviceUnit,
- Desc: ECCErrorsDeviceDesc,
- IntNumeratorVal: helper.Int64ToPtr(100),
- },
- },
- },
- Timestamp: time.Date(1974, time.May, 19, 1, 2, 3, 4, time.UTC),
- },
- },
- {
- Name: "ECCErrorsDevice is nil",
- Timestamp: time.Date(1974, time.May, 19, 1, 2, 3, 4, time.UTC),
- ItemStat: &nvml.StatsData{
- DeviceData: &nvml.DeviceData{
- UUID: "UUID1",
- DeviceName: helper.StringToPtr("DeviceName1"),
- MemoryMiB: helper.Uint64ToPtr(1),
- PowerW: helper.UintToPtr(1),
- BAR1MiB: helper.Uint64ToPtr(256),
- },
- PowerUsageW: helper.UintToPtr(1),
- GPUUtilization: helper.UintToPtr(1),
- MemoryUtilization: helper.UintToPtr(1),
- EncoderUtilization: helper.UintToPtr(1),
- DecoderUtilization: helper.UintToPtr(1),
- TemperatureC: helper.UintToPtr(1),
- UsedMemoryMiB: helper.Uint64ToPtr(1),
- BAR1UsedMiB: helper.Uint64ToPtr(1),
- ECCErrorsL1Cache: helper.Uint64ToPtr(100),
- ECCErrorsL2Cache: helper.Uint64ToPtr(100),
- ECCErrorsDevice: nil,
- },
- ExpectedResult: &device.DeviceStats{
- Summary: &structs.StatValue{
- Unit: MemoryStateUnit,
- Desc: MemoryStateDesc,
- IntNumeratorVal: helper.Int64ToPtr(1),
- IntDenominatorVal: helper.Int64ToPtr(1),
- },
- Stats: &structs.StatObject{
- Attributes: map[string]*structs.StatValue{
- PowerUsageAttr: {
- Unit: PowerUsageUnit,
- Desc: PowerUsageDesc,
- IntNumeratorVal: helper.Int64ToPtr(1),
- IntDenominatorVal: helper.Int64ToPtr(1),
- },
- GPUUtilizationAttr: {
- Unit: GPUUtilizationUnit,
- Desc: GPUUtilizationDesc,
- IntNumeratorVal: helper.Int64ToPtr(1),
- },
- MemoryUtilizationAttr: {
- Unit: MemoryUtilizationUnit,
- Desc: MemoryUtilizationDesc,
- IntNumeratorVal: helper.Int64ToPtr(1),
- },
- EncoderUtilizationAttr: {
- Unit: EncoderUtilizationUnit,
- Desc: EncoderUtilizationDesc,
- IntNumeratorVal: helper.Int64ToPtr(1),
- },
- DecoderUtilizationAttr: {
- Unit: DecoderUtilizationUnit,
- Desc: DecoderUtilizationDesc,
- IntNumeratorVal: helper.Int64ToPtr(1),
- },
- TemperatureAttr: {
- Unit: TemperatureUnit,
- Desc: TemperatureDesc,
- IntNumeratorVal: helper.Int64ToPtr(1),
- },
- MemoryStateAttr: {
- Unit: MemoryStateUnit,
- Desc: MemoryStateDesc,
- IntNumeratorVal: helper.Int64ToPtr(1),
- IntDenominatorVal: helper.Int64ToPtr(1),
- },
- BAR1StateAttr: {
- Unit: BAR1StateUnit,
- Desc: BAR1StateDesc,
- IntNumeratorVal: helper.Int64ToPtr(1),
- IntDenominatorVal: helper.Int64ToPtr(256),
- },
- ECCErrorsL1CacheAttr: {
- Unit: ECCErrorsL1CacheUnit,
- Desc: ECCErrorsL1CacheDesc,
- IntNumeratorVal: helper.Int64ToPtr(100),
- },
- ECCErrorsL2CacheAttr: {
- Unit: ECCErrorsL2CacheUnit,
- Desc: ECCErrorsL2CacheDesc,
- IntNumeratorVal: helper.Int64ToPtr(100),
- },
- ECCErrorsDeviceAttr: {
- Unit: ECCErrorsDeviceUnit,
- Desc: ECCErrorsDeviceDesc,
- StringVal: helper.StringToPtr(notAvailable),
- },
- },
- },
- Timestamp: time.Date(1974, time.May, 19, 1, 2, 3, 4, time.UTC),
- },
- },
- } {
- actualResult := statsForItem(testCase.ItemStat, testCase.Timestamp)
- require.New(t).Equal(testCase.ExpectedResult, actualResult)
- }
-}
-
-func TestStatsForGroup(t *testing.T) {
- for _, testCase := range []struct {
- Name string
- Timestamp time.Time
- GroupStats []*nvml.StatsData
- GroupName string
- ExpectedResult *device.DeviceGroupStats
- }{
- {
- Name: "make sure that all data is transformed correctly",
- Timestamp: time.Date(1974, time.May, 19, 1, 2, 3, 4, time.UTC),
- GroupName: "DeviceName1",
- GroupStats: []*nvml.StatsData{
- {
- DeviceData: &nvml.DeviceData{
- UUID: "UUID1",
- DeviceName: helper.StringToPtr("DeviceName1"),
- MemoryMiB: helper.Uint64ToPtr(1),
- PowerW: helper.UintToPtr(1),
- BAR1MiB: helper.Uint64ToPtr(256),
- },
- PowerUsageW: helper.UintToPtr(1),
- GPUUtilization: helper.UintToPtr(1),
- MemoryUtilization: helper.UintToPtr(1),
- EncoderUtilization: helper.UintToPtr(1),
- DecoderUtilization: helper.UintToPtr(1),
- TemperatureC: helper.UintToPtr(1),
- UsedMemoryMiB: helper.Uint64ToPtr(1),
- BAR1UsedMiB: helper.Uint64ToPtr(1),
- ECCErrorsL1Cache: helper.Uint64ToPtr(100),
- ECCErrorsL2Cache: helper.Uint64ToPtr(100),
- ECCErrorsDevice: helper.Uint64ToPtr(100),
- },
- {
- DeviceData: &nvml.DeviceData{
- UUID: "UUID2",
- DeviceName: helper.StringToPtr("DeviceName2"),
- MemoryMiB: helper.Uint64ToPtr(2),
- PowerW: helper.UintToPtr(2),
- BAR1MiB: helper.Uint64ToPtr(256),
- },
- PowerUsageW: helper.UintToPtr(2),
- GPUUtilization: helper.UintToPtr(2),
- MemoryUtilization: helper.UintToPtr(2),
- EncoderUtilization: helper.UintToPtr(2),
- DecoderUtilization: helper.UintToPtr(2),
- TemperatureC: helper.UintToPtr(2),
- UsedMemoryMiB: helper.Uint64ToPtr(2),
- BAR1UsedMiB: helper.Uint64ToPtr(2),
- ECCErrorsL1Cache: helper.Uint64ToPtr(200),
- ECCErrorsL2Cache: helper.Uint64ToPtr(200),
- ECCErrorsDevice: helper.Uint64ToPtr(200),
- },
- {
- DeviceData: &nvml.DeviceData{
- UUID: "UUID3",
- DeviceName: helper.StringToPtr("DeviceName3"),
- MemoryMiB: helper.Uint64ToPtr(3),
- PowerW: helper.UintToPtr(3),
- BAR1MiB: helper.Uint64ToPtr(256),
- },
- PowerUsageW: helper.UintToPtr(3),
- GPUUtilization: helper.UintToPtr(3),
- MemoryUtilization: helper.UintToPtr(3),
- EncoderUtilization: helper.UintToPtr(3),
- DecoderUtilization: helper.UintToPtr(3),
- TemperatureC: helper.UintToPtr(3),
- UsedMemoryMiB: helper.Uint64ToPtr(3),
- BAR1UsedMiB: helper.Uint64ToPtr(3),
- ECCErrorsL1Cache: helper.Uint64ToPtr(300),
- ECCErrorsL2Cache: helper.Uint64ToPtr(300),
- ECCErrorsDevice: helper.Uint64ToPtr(300),
- },
- },
- ExpectedResult: &device.DeviceGroupStats{
- Vendor: vendor,
- Type: deviceType,
- Name: "DeviceName1",
- InstanceStats: map[string]*device.DeviceStats{
- "UUID1": {
- Summary: &structs.StatValue{
- Unit: MemoryStateUnit,
- Desc: MemoryStateDesc,
- IntNumeratorVal: helper.Int64ToPtr(1),
- IntDenominatorVal: helper.Int64ToPtr(1),
- },
- Stats: &structs.StatObject{
- Attributes: map[string]*structs.StatValue{
- PowerUsageAttr: {
- Unit: PowerUsageUnit,
- Desc: PowerUsageDesc,
- IntNumeratorVal: helper.Int64ToPtr(1),
- IntDenominatorVal: helper.Int64ToPtr(1),
- },
- GPUUtilizationAttr: {
- Unit: GPUUtilizationUnit,
- Desc: GPUUtilizationDesc,
- IntNumeratorVal: helper.Int64ToPtr(1),
- },
- MemoryUtilizationAttr: {
- Unit: MemoryUtilizationUnit,
- Desc: MemoryUtilizationDesc,
- IntNumeratorVal: helper.Int64ToPtr(1),
- },
- EncoderUtilizationAttr: {
- Unit: EncoderUtilizationUnit,
- Desc: EncoderUtilizationDesc,
- IntNumeratorVal: helper.Int64ToPtr(1),
- },
- DecoderUtilizationAttr: {
- Unit: DecoderUtilizationUnit,
- Desc: DecoderUtilizationDesc,
- IntNumeratorVal: helper.Int64ToPtr(1),
- },
- TemperatureAttr: {
- Unit: TemperatureUnit,
- Desc: TemperatureDesc,
- IntNumeratorVal: helper.Int64ToPtr(1),
- },
- MemoryStateAttr: {
- Unit: MemoryStateUnit,
- Desc: MemoryStateDesc,
- IntNumeratorVal: helper.Int64ToPtr(1),
- IntDenominatorVal: helper.Int64ToPtr(1),
- },
- BAR1StateAttr: {
- Unit: BAR1StateUnit,
- Desc: BAR1StateDesc,
- IntNumeratorVal: helper.Int64ToPtr(1),
- IntDenominatorVal: helper.Int64ToPtr(256),
- },
- ECCErrorsL1CacheAttr: {
- Unit: ECCErrorsL1CacheUnit,
- Desc: ECCErrorsL1CacheDesc,
- IntNumeratorVal: helper.Int64ToPtr(100),
- },
- ECCErrorsL2CacheAttr: {
- Unit: ECCErrorsL2CacheUnit,
- Desc: ECCErrorsL2CacheDesc,
- IntNumeratorVal: helper.Int64ToPtr(100),
- },
- ECCErrorsDeviceAttr: {
- Unit: ECCErrorsDeviceUnit,
- Desc: ECCErrorsDeviceDesc,
- IntNumeratorVal: helper.Int64ToPtr(100),
- },
- },
- },
- Timestamp: time.Date(1974, time.May, 19, 1, 2, 3, 4, time.UTC),
- },
- "UUID2": {
- Summary: &structs.StatValue{
- Unit: MemoryStateUnit,
- Desc: MemoryStateDesc,
- IntNumeratorVal: helper.Int64ToPtr(2),
- IntDenominatorVal: helper.Int64ToPtr(2),
- },
- Stats: &structs.StatObject{
- Attributes: map[string]*structs.StatValue{
- PowerUsageAttr: {
- Unit: PowerUsageUnit,
- Desc: PowerUsageDesc,
- IntNumeratorVal: helper.Int64ToPtr(2),
- IntDenominatorVal: helper.Int64ToPtr(2),
- },
- GPUUtilizationAttr: {
- Unit: GPUUtilizationUnit,
- Desc: GPUUtilizationDesc,
- IntNumeratorVal: helper.Int64ToPtr(2),
- },
- MemoryUtilizationAttr: {
- Unit: MemoryUtilizationUnit,
- Desc: MemoryUtilizationDesc,
- IntNumeratorVal: helper.Int64ToPtr(2),
- },
- EncoderUtilizationAttr: {
- Unit: EncoderUtilizationUnit,
- Desc: EncoderUtilizationDesc,
- IntNumeratorVal: helper.Int64ToPtr(2),
- },
- DecoderUtilizationAttr: {
- Unit: DecoderUtilizationUnit,
- Desc: DecoderUtilizationDesc,
- IntNumeratorVal: helper.Int64ToPtr(2),
- },
- TemperatureAttr: {
- Unit: TemperatureUnit,
- Desc: TemperatureDesc,
- IntNumeratorVal: helper.Int64ToPtr(2),
- },
- MemoryStateAttr: {
- Unit: MemoryStateUnit,
- Desc: MemoryStateDesc,
- IntNumeratorVal: helper.Int64ToPtr(2),
- IntDenominatorVal: helper.Int64ToPtr(2),
- },
- BAR1StateAttr: {
- Unit: BAR1StateUnit,
- Desc: BAR1StateDesc,
- IntNumeratorVal: helper.Int64ToPtr(2),
- IntDenominatorVal: helper.Int64ToPtr(256),
- },
- ECCErrorsL1CacheAttr: {
- Unit: ECCErrorsL1CacheUnit,
- Desc: ECCErrorsL1CacheDesc,
- IntNumeratorVal: helper.Int64ToPtr(200),
- },
- ECCErrorsL2CacheAttr: {
- Unit: ECCErrorsL2CacheUnit,
- Desc: ECCErrorsL2CacheDesc,
- IntNumeratorVal: helper.Int64ToPtr(200),
- },
- ECCErrorsDeviceAttr: {
- Unit: ECCErrorsDeviceUnit,
- Desc: ECCErrorsDeviceDesc,
- IntNumeratorVal: helper.Int64ToPtr(200),
- },
- },
- },
- Timestamp: time.Date(1974, time.May, 19, 1, 2, 3, 4, time.UTC),
- },
- "UUID3": {
- Summary: &structs.StatValue{
- Unit: MemoryStateUnit,
- Desc: MemoryStateDesc,
- IntNumeratorVal: helper.Int64ToPtr(3),
- IntDenominatorVal: helper.Int64ToPtr(3),
- },
- Stats: &structs.StatObject{
- Attributes: map[string]*structs.StatValue{
- PowerUsageAttr: {
- Unit: PowerUsageUnit,
- Desc: PowerUsageDesc,
- IntNumeratorVal: helper.Int64ToPtr(3),
- IntDenominatorVal: helper.Int64ToPtr(3),
- },
- GPUUtilizationAttr: {
- Unit: GPUUtilizationUnit,
- Desc: GPUUtilizationDesc,
- IntNumeratorVal: helper.Int64ToPtr(3),
- },
- MemoryUtilizationAttr: {
- Unit: MemoryUtilizationUnit,
- Desc: MemoryUtilizationDesc,
- IntNumeratorVal: helper.Int64ToPtr(3),
- },
- EncoderUtilizationAttr: {
- Unit: EncoderUtilizationUnit,
- Desc: EncoderUtilizationDesc,
- IntNumeratorVal: helper.Int64ToPtr(3),
- },
- DecoderUtilizationAttr: {
- Unit: DecoderUtilizationUnit,
- Desc: DecoderUtilizationDesc,
- IntNumeratorVal: helper.Int64ToPtr(3),
- },
- TemperatureAttr: {
- Unit: TemperatureUnit,
- Desc: TemperatureDesc,
- IntNumeratorVal: helper.Int64ToPtr(3),
- },
- MemoryStateAttr: {
- Unit: MemoryStateUnit,
- Desc: MemoryStateDesc,
- IntNumeratorVal: helper.Int64ToPtr(3),
- IntDenominatorVal: helper.Int64ToPtr(3),
- },
- BAR1StateAttr: {
- Unit: BAR1StateUnit,
- Desc: BAR1StateDesc,
- IntNumeratorVal: helper.Int64ToPtr(3),
- IntDenominatorVal: helper.Int64ToPtr(256),
- },
- ECCErrorsL1CacheAttr: {
- Unit: ECCErrorsL1CacheUnit,
- Desc: ECCErrorsL1CacheDesc,
- IntNumeratorVal: helper.Int64ToPtr(300),
- },
- ECCErrorsL2CacheAttr: {
- Unit: ECCErrorsL2CacheUnit,
- Desc: ECCErrorsL2CacheDesc,
- IntNumeratorVal: helper.Int64ToPtr(300),
- },
- ECCErrorsDeviceAttr: {
- Unit: ECCErrorsDeviceUnit,
- Desc: ECCErrorsDeviceDesc,
- IntNumeratorVal: helper.Int64ToPtr(300),
- },
- },
- },
- Timestamp: time.Date(1974, time.May, 19, 1, 2, 3, 4, time.UTC),
- },
- },
- },
- },
- } {
- actualResult := statsForGroup(testCase.GroupName, testCase.GroupStats, testCase.Timestamp)
- require.New(t).Equal(testCase.ExpectedResult, actualResult)
- }
-}
-
-func TestWriteStatsToChannel(t *testing.T) {
- for _, testCase := range []struct {
- Name string
- ExpectedWriteToChannel *device.StatsResponse
- Timestamp time.Time
- Device *NvidiaDevice
- }{
- {
- Name: "NVML wrapper returns error",
- Timestamp: time.Date(1974, time.May, 19, 1, 2, 3, 4, time.UTC),
- ExpectedWriteToChannel: &device.StatsResponse{
- Error: errors.New(""),
- },
- Device: &NvidiaDevice{
- nvmlClient: &MockNvmlClient{
- StatsError: errors.New(""),
- },
- logger: hclog.NewNullLogger(),
- },
- },
- {
- Name: "Check that stats with multiple DeviceNames are assigned to different groups",
- Timestamp: time.Date(1974, time.May, 19, 1, 2, 3, 4, time.UTC),
- Device: &NvidiaDevice{
- devices: map[string]struct{}{
- "UUID1": {},
- "UUID2": {},
- "UUID3": {},
- },
- nvmlClient: &MockNvmlClient{
- StatsResponseReturned: []*nvml.StatsData{
- {
- DeviceData: &nvml.DeviceData{
- UUID: "UUID1",
- DeviceName: helper.StringToPtr("DeviceName1"),
- MemoryMiB: helper.Uint64ToPtr(1),
- PowerW: helper.UintToPtr(1),
- BAR1MiB: helper.Uint64ToPtr(256),
- },
- PowerUsageW: helper.UintToPtr(1),
- GPUUtilization: helper.UintToPtr(1),
- MemoryUtilization: helper.UintToPtr(1),
- EncoderUtilization: helper.UintToPtr(1),
- DecoderUtilization: helper.UintToPtr(1),
- TemperatureC: helper.UintToPtr(1),
- UsedMemoryMiB: helper.Uint64ToPtr(1),
- BAR1UsedMiB: helper.Uint64ToPtr(1),
- ECCErrorsL1Cache: helper.Uint64ToPtr(100),
- ECCErrorsL2Cache: helper.Uint64ToPtr(100),
- ECCErrorsDevice: helper.Uint64ToPtr(100),
- },
- {
- DeviceData: &nvml.DeviceData{
- UUID: "UUID2",
- DeviceName: helper.StringToPtr("DeviceName2"),
- MemoryMiB: helper.Uint64ToPtr(2),
- PowerW: helper.UintToPtr(2),
- BAR1MiB: helper.Uint64ToPtr(256),
- },
- PowerUsageW: helper.UintToPtr(2),
- GPUUtilization: helper.UintToPtr(2),
- MemoryUtilization: helper.UintToPtr(2),
- EncoderUtilization: helper.UintToPtr(2),
- DecoderUtilization: helper.UintToPtr(2),
- TemperatureC: helper.UintToPtr(2),
- UsedMemoryMiB: helper.Uint64ToPtr(2),
- BAR1UsedMiB: helper.Uint64ToPtr(2),
- ECCErrorsL1Cache: helper.Uint64ToPtr(200),
- ECCErrorsL2Cache: helper.Uint64ToPtr(200),
- ECCErrorsDevice: helper.Uint64ToPtr(200),
- },
- {
- DeviceData: &nvml.DeviceData{
- UUID: "UUID3",
- DeviceName: helper.StringToPtr("DeviceName3"),
- MemoryMiB: helper.Uint64ToPtr(3),
- PowerW: helper.UintToPtr(3),
- BAR1MiB: helper.Uint64ToPtr(256),
- },
- PowerUsageW: helper.UintToPtr(3),
- GPUUtilization: helper.UintToPtr(3),
- MemoryUtilization: helper.UintToPtr(3),
- EncoderUtilization: helper.UintToPtr(3),
- DecoderUtilization: helper.UintToPtr(3),
- TemperatureC: helper.UintToPtr(3),
- UsedMemoryMiB: helper.Uint64ToPtr(3),
- BAR1UsedMiB: helper.Uint64ToPtr(3),
- ECCErrorsL1Cache: helper.Uint64ToPtr(300),
- ECCErrorsL2Cache: helper.Uint64ToPtr(300),
- ECCErrorsDevice: helper.Uint64ToPtr(300),
- },
- },
- },
- logger: hclog.NewNullLogger(),
- },
- ExpectedWriteToChannel: &device.StatsResponse{
- Groups: []*device.DeviceGroupStats{
- {
- Vendor: vendor,
- Type: deviceType,
- Name: "DeviceName1",
- InstanceStats: map[string]*device.DeviceStats{
- "UUID1": {
- Summary: &structs.StatValue{
- Unit: MemoryStateUnit,
- Desc: MemoryStateDesc,
- IntNumeratorVal: helper.Int64ToPtr(1),
- IntDenominatorVal: helper.Int64ToPtr(1),
- },
- Stats: &structs.StatObject{
- Attributes: map[string]*structs.StatValue{
- PowerUsageAttr: {
- Unit: PowerUsageUnit,
- Desc: PowerUsageDesc,
- IntNumeratorVal: helper.Int64ToPtr(1),
- IntDenominatorVal: helper.Int64ToPtr(1),
- },
- GPUUtilizationAttr: {
- Unit: GPUUtilizationUnit,
- Desc: GPUUtilizationDesc,
- IntNumeratorVal: helper.Int64ToPtr(1),
- },
- MemoryUtilizationAttr: {
- Unit: MemoryUtilizationUnit,
- Desc: MemoryUtilizationDesc,
- IntNumeratorVal: helper.Int64ToPtr(1),
- },
- EncoderUtilizationAttr: {
- Unit: EncoderUtilizationUnit,
- Desc: EncoderUtilizationDesc,
- IntNumeratorVal: helper.Int64ToPtr(1),
- },
- DecoderUtilizationAttr: {
- Unit: DecoderUtilizationUnit,
- Desc: DecoderUtilizationDesc,
- IntNumeratorVal: helper.Int64ToPtr(1),
- },
- TemperatureAttr: {
- Unit: TemperatureUnit,
- Desc: TemperatureDesc,
- IntNumeratorVal: helper.Int64ToPtr(1),
- },
- MemoryStateAttr: {
- Unit: MemoryStateUnit,
- Desc: MemoryStateDesc,
- IntNumeratorVal: helper.Int64ToPtr(1),
- IntDenominatorVal: helper.Int64ToPtr(1),
- },
- BAR1StateAttr: {
- Unit: BAR1StateUnit,
- Desc: BAR1StateDesc,
- IntNumeratorVal: helper.Int64ToPtr(1),
- IntDenominatorVal: helper.Int64ToPtr(256),
- },
- ECCErrorsL1CacheAttr: {
- Unit: ECCErrorsL1CacheUnit,
- Desc: ECCErrorsL1CacheDesc,
- IntNumeratorVal: helper.Int64ToPtr(100),
- },
- ECCErrorsL2CacheAttr: {
- Unit: ECCErrorsL2CacheUnit,
- Desc: ECCErrorsL2CacheDesc,
- IntNumeratorVal: helper.Int64ToPtr(100),
- },
- ECCErrorsDeviceAttr: {
- Unit: ECCErrorsDeviceUnit,
- Desc: ECCErrorsDeviceDesc,
- IntNumeratorVal: helper.Int64ToPtr(100),
- },
- },
- },
- Timestamp: time.Date(1974, time.May, 19, 1, 2, 3, 4, time.UTC),
- },
- },
- },
- {
- Vendor: vendor,
- Type: deviceType,
- Name: "DeviceName2",
- InstanceStats: map[string]*device.DeviceStats{
- "UUID2": {
- Summary: &structs.StatValue{
- Unit: MemoryStateUnit,
- Desc: MemoryStateDesc,
- IntNumeratorVal: helper.Int64ToPtr(2),
- IntDenominatorVal: helper.Int64ToPtr(2),
- },
- Stats: &structs.StatObject{
- Attributes: map[string]*structs.StatValue{
- PowerUsageAttr: {
- Unit: PowerUsageUnit,
- Desc: PowerUsageDesc,
- IntNumeratorVal: helper.Int64ToPtr(2),
- IntDenominatorVal: helper.Int64ToPtr(2),
- },
- GPUUtilizationAttr: {
- Unit: GPUUtilizationUnit,
- Desc: GPUUtilizationDesc,
- IntNumeratorVal: helper.Int64ToPtr(2),
- },
- MemoryUtilizationAttr: {
- Unit: MemoryUtilizationUnit,
- Desc: MemoryUtilizationDesc,
- IntNumeratorVal: helper.Int64ToPtr(2),
- },
- EncoderUtilizationAttr: {
- Unit: EncoderUtilizationUnit,
- Desc: EncoderUtilizationDesc,
- IntNumeratorVal: helper.Int64ToPtr(2),
- },
- DecoderUtilizationAttr: {
- Unit: DecoderUtilizationUnit,
- Desc: DecoderUtilizationDesc,
- IntNumeratorVal: helper.Int64ToPtr(2),
- },
- TemperatureAttr: {
- Unit: TemperatureUnit,
- Desc: TemperatureDesc,
- IntNumeratorVal: helper.Int64ToPtr(2),
- },
- MemoryStateAttr: {
- Unit: MemoryStateUnit,
- Desc: MemoryStateDesc,
- IntNumeratorVal: helper.Int64ToPtr(2),
- IntDenominatorVal: helper.Int64ToPtr(2),
- },
- BAR1StateAttr: {
- Unit: BAR1StateUnit,
- Desc: BAR1StateDesc,
- IntNumeratorVal: helper.Int64ToPtr(2),
- IntDenominatorVal: helper.Int64ToPtr(256),
- },
- ECCErrorsL1CacheAttr: {
- Unit: ECCErrorsL1CacheUnit,
- Desc: ECCErrorsL1CacheDesc,
- IntNumeratorVal: helper.Int64ToPtr(200),
- },
- ECCErrorsL2CacheAttr: {
- Unit: ECCErrorsL2CacheUnit,
- Desc: ECCErrorsL2CacheDesc,
- IntNumeratorVal: helper.Int64ToPtr(200),
- },
- ECCErrorsDeviceAttr: {
- Unit: ECCErrorsDeviceUnit,
- Desc: ECCErrorsDeviceDesc,
- IntNumeratorVal: helper.Int64ToPtr(200),
- },
- },
- },
- Timestamp: time.Date(1974, time.May, 19, 1, 2, 3, 4, time.UTC),
- },
- },
- },
- {
- Vendor: vendor,
- Type: deviceType,
- Name: "DeviceName3",
- InstanceStats: map[string]*device.DeviceStats{
- "UUID3": {
- Summary: &structs.StatValue{
- Unit: MemoryStateUnit,
- Desc: MemoryStateDesc,
- IntNumeratorVal: helper.Int64ToPtr(3),
- IntDenominatorVal: helper.Int64ToPtr(3),
- },
- Stats: &structs.StatObject{
- Attributes: map[string]*structs.StatValue{
- PowerUsageAttr: {
- Unit: PowerUsageUnit,
- Desc: PowerUsageDesc,
- IntNumeratorVal: helper.Int64ToPtr(3),
- IntDenominatorVal: helper.Int64ToPtr(3),
- },
- GPUUtilizationAttr: {
- Unit: GPUUtilizationUnit,
- Desc: GPUUtilizationDesc,
- IntNumeratorVal: helper.Int64ToPtr(3),
- },
- MemoryUtilizationAttr: {
- Unit: MemoryUtilizationUnit,
- Desc: MemoryUtilizationDesc,
- IntNumeratorVal: helper.Int64ToPtr(3),
- },
- EncoderUtilizationAttr: {
- Unit: EncoderUtilizationUnit,
- Desc: EncoderUtilizationDesc,
- IntNumeratorVal: helper.Int64ToPtr(3),
- },
- DecoderUtilizationAttr: {
- Unit: DecoderUtilizationUnit,
- Desc: DecoderUtilizationDesc,
- IntNumeratorVal: helper.Int64ToPtr(3),
- },
- TemperatureAttr: {
- Unit: TemperatureUnit,
- Desc: TemperatureDesc,
- IntNumeratorVal: helper.Int64ToPtr(3),
- },
- MemoryStateAttr: {
- Unit: MemoryStateUnit,
- Desc: MemoryStateDesc,
- IntNumeratorVal: helper.Int64ToPtr(3),
- IntDenominatorVal: helper.Int64ToPtr(3),
- },
- BAR1StateAttr: {
- Unit: BAR1StateUnit,
- Desc: BAR1StateDesc,
- IntNumeratorVal: helper.Int64ToPtr(3),
- IntDenominatorVal: helper.Int64ToPtr(256),
- },
- ECCErrorsL1CacheAttr: {
- Unit: ECCErrorsL1CacheUnit,
- Desc: ECCErrorsL1CacheDesc,
- IntNumeratorVal: helper.Int64ToPtr(300),
- },
- ECCErrorsL2CacheAttr: {
- Unit: ECCErrorsL2CacheUnit,
- Desc: ECCErrorsL2CacheDesc,
- IntNumeratorVal: helper.Int64ToPtr(300),
- },
- ECCErrorsDeviceAttr: {
- Unit: ECCErrorsDeviceUnit,
- Desc: ECCErrorsDeviceDesc,
- IntNumeratorVal: helper.Int64ToPtr(300),
- },
- },
- },
- Timestamp: time.Date(1974, time.May, 19, 1, 2, 3, 4, time.UTC),
- },
- },
- },
- },
- },
- },
- {
- Name: "Check that stats with multiple DeviceNames are assigned to different groups 2",
- Timestamp: time.Date(1974, time.May, 19, 1, 2, 3, 4, time.UTC),
- Device: &NvidiaDevice{
- devices: map[string]struct{}{
- "UUID1": {},
- "UUID2": {},
- "UUID3": {},
- },
- nvmlClient: &MockNvmlClient{
- StatsResponseReturned: []*nvml.StatsData{
- {
- DeviceData: &nvml.DeviceData{
- UUID: "UUID1",
- DeviceName: helper.StringToPtr("DeviceName1"),
- MemoryMiB: helper.Uint64ToPtr(1),
- PowerW: helper.UintToPtr(1),
- BAR1MiB: helper.Uint64ToPtr(256),
- },
- PowerUsageW: helper.UintToPtr(1),
- GPUUtilization: helper.UintToPtr(1),
- MemoryUtilization: helper.UintToPtr(1),
- EncoderUtilization: helper.UintToPtr(1),
- DecoderUtilization: helper.UintToPtr(1),
- TemperatureC: helper.UintToPtr(1),
- UsedMemoryMiB: helper.Uint64ToPtr(1),
- BAR1UsedMiB: helper.Uint64ToPtr(1),
- ECCErrorsL1Cache: helper.Uint64ToPtr(100),
- ECCErrorsL2Cache: helper.Uint64ToPtr(100),
- ECCErrorsDevice: helper.Uint64ToPtr(100),
- },
- {
- DeviceData: &nvml.DeviceData{
- UUID: "UUID2",
- DeviceName: helper.StringToPtr("DeviceName2"),
- MemoryMiB: helper.Uint64ToPtr(2),
- PowerW: helper.UintToPtr(2),
- BAR1MiB: helper.Uint64ToPtr(256),
- },
- PowerUsageW: helper.UintToPtr(2),
- GPUUtilization: helper.UintToPtr(2),
- MemoryUtilization: helper.UintToPtr(2),
- EncoderUtilization: helper.UintToPtr(2),
- DecoderUtilization: helper.UintToPtr(2),
- TemperatureC: helper.UintToPtr(2),
- UsedMemoryMiB: helper.Uint64ToPtr(2),
- BAR1UsedMiB: helper.Uint64ToPtr(2),
- ECCErrorsL1Cache: helper.Uint64ToPtr(200),
- ECCErrorsL2Cache: helper.Uint64ToPtr(200),
- ECCErrorsDevice: helper.Uint64ToPtr(200),
- },
- {
- DeviceData: &nvml.DeviceData{
- UUID: "UUID3",
- DeviceName: helper.StringToPtr("DeviceName2"),
- MemoryMiB: helper.Uint64ToPtr(3),
- PowerW: helper.UintToPtr(3),
- BAR1MiB: helper.Uint64ToPtr(256),
- },
- PowerUsageW: helper.UintToPtr(3),
- GPUUtilization: helper.UintToPtr(3),
- MemoryUtilization: helper.UintToPtr(3),
- EncoderUtilization: helper.UintToPtr(3),
- DecoderUtilization: helper.UintToPtr(3),
- TemperatureC: helper.UintToPtr(3),
- UsedMemoryMiB: helper.Uint64ToPtr(3),
- BAR1UsedMiB: helper.Uint64ToPtr(3),
- ECCErrorsL1Cache: helper.Uint64ToPtr(300),
- ECCErrorsL2Cache: helper.Uint64ToPtr(300),
- ECCErrorsDevice: helper.Uint64ToPtr(300),
- },
- },
- },
- logger: hclog.NewNullLogger(),
- },
- ExpectedWriteToChannel: &device.StatsResponse{
- Groups: []*device.DeviceGroupStats{
- {
- Vendor: vendor,
- Type: deviceType,
- Name: "DeviceName1",
- InstanceStats: map[string]*device.DeviceStats{
- "UUID1": {
- Summary: &structs.StatValue{
- Unit: MemoryStateUnit,
- Desc: MemoryStateDesc,
- IntNumeratorVal: helper.Int64ToPtr(1),
- IntDenominatorVal: helper.Int64ToPtr(1),
- },
- Stats: &structs.StatObject{
- Attributes: map[string]*structs.StatValue{
- PowerUsageAttr: {
- Unit: PowerUsageUnit,
- Desc: PowerUsageDesc,
- IntNumeratorVal: helper.Int64ToPtr(1),
- IntDenominatorVal: helper.Int64ToPtr(1),
- },
- GPUUtilizationAttr: {
- Unit: GPUUtilizationUnit,
- Desc: GPUUtilizationDesc,
- IntNumeratorVal: helper.Int64ToPtr(1),
- },
- MemoryUtilizationAttr: {
- Unit: MemoryUtilizationUnit,
- Desc: MemoryUtilizationDesc,
- IntNumeratorVal: helper.Int64ToPtr(1),
- },
- EncoderUtilizationAttr: {
- Unit: EncoderUtilizationUnit,
- Desc: EncoderUtilizationDesc,
- IntNumeratorVal: helper.Int64ToPtr(1),
- },
- DecoderUtilizationAttr: {
- Unit: DecoderUtilizationUnit,
- Desc: DecoderUtilizationDesc,
- IntNumeratorVal: helper.Int64ToPtr(1),
- },
- TemperatureAttr: {
- Unit: TemperatureUnit,
- Desc: TemperatureDesc,
- IntNumeratorVal: helper.Int64ToPtr(1),
- },
- MemoryStateAttr: {
- Unit: MemoryStateUnit,
- Desc: MemoryStateDesc,
- IntNumeratorVal: helper.Int64ToPtr(1),
- IntDenominatorVal: helper.Int64ToPtr(1),
- },
- BAR1StateAttr: {
- Unit: BAR1StateUnit,
- Desc: BAR1StateDesc,
- IntNumeratorVal: helper.Int64ToPtr(1),
- IntDenominatorVal: helper.Int64ToPtr(256),
- },
- ECCErrorsL1CacheAttr: {
- Unit: ECCErrorsL1CacheUnit,
- Desc: ECCErrorsL1CacheDesc,
- IntNumeratorVal: helper.Int64ToPtr(100),
- },
- ECCErrorsL2CacheAttr: {
- Unit: ECCErrorsL2CacheUnit,
- Desc: ECCErrorsL2CacheDesc,
- IntNumeratorVal: helper.Int64ToPtr(100),
- },
- ECCErrorsDeviceAttr: {
- Unit: ECCErrorsDeviceUnit,
- Desc: ECCErrorsDeviceDesc,
- IntNumeratorVal: helper.Int64ToPtr(100),
- },
- },
- },
- Timestamp: time.Date(1974, time.May, 19, 1, 2, 3, 4, time.UTC),
- },
- },
- },
- {
- Vendor: vendor,
- Type: deviceType,
- Name: "DeviceName2",
- InstanceStats: map[string]*device.DeviceStats{
- "UUID3": {
- Summary: &structs.StatValue{
- Unit: MemoryStateUnit,
- Desc: MemoryStateDesc,
- IntNumeratorVal: helper.Int64ToPtr(3),
- IntDenominatorVal: helper.Int64ToPtr(3),
- },
- Stats: &structs.StatObject{
- Attributes: map[string]*structs.StatValue{
- PowerUsageAttr: {
- Unit: PowerUsageUnit,
- Desc: PowerUsageDesc,
- IntNumeratorVal: helper.Int64ToPtr(3),
- IntDenominatorVal: helper.Int64ToPtr(3),
- },
- GPUUtilizationAttr: {
- Unit: GPUUtilizationUnit,
- Desc: GPUUtilizationDesc,
- IntNumeratorVal: helper.Int64ToPtr(3),
- },
- MemoryUtilizationAttr: {
- Unit: MemoryUtilizationUnit,
- Desc: MemoryUtilizationDesc,
- IntNumeratorVal: helper.Int64ToPtr(3),
- },
- EncoderUtilizationAttr: {
- Unit: EncoderUtilizationUnit,
- Desc: EncoderUtilizationDesc,
- IntNumeratorVal: helper.Int64ToPtr(3),
- },
- DecoderUtilizationAttr: {
- Unit: DecoderUtilizationUnit,
- Desc: DecoderUtilizationDesc,
- IntNumeratorVal: helper.Int64ToPtr(3),
- },
- TemperatureAttr: {
- Unit: TemperatureUnit,
- Desc: TemperatureDesc,
- IntNumeratorVal: helper.Int64ToPtr(3),
- },
- MemoryStateAttr: {
- Unit: MemoryStateUnit,
- Desc: MemoryStateDesc,
- IntNumeratorVal: helper.Int64ToPtr(3),
- IntDenominatorVal: helper.Int64ToPtr(3),
- },
- BAR1StateAttr: {
- Unit: BAR1StateUnit,
- Desc: BAR1StateDesc,
- IntNumeratorVal: helper.Int64ToPtr(3),
- IntDenominatorVal: helper.Int64ToPtr(256),
- },
- ECCErrorsL1CacheAttr: {
- Unit: ECCErrorsL1CacheUnit,
- Desc: ECCErrorsL1CacheDesc,
- IntNumeratorVal: helper.Int64ToPtr(300),
- },
- ECCErrorsL2CacheAttr: {
- Unit: ECCErrorsL2CacheUnit,
- Desc: ECCErrorsL2CacheDesc,
- IntNumeratorVal: helper.Int64ToPtr(300),
- },
- ECCErrorsDeviceAttr: {
- Unit: ECCErrorsDeviceUnit,
- Desc: ECCErrorsDeviceDesc,
- IntNumeratorVal: helper.Int64ToPtr(300),
- },
- },
- },
- Timestamp: time.Date(1974, time.May, 19, 1, 2, 3, 4, time.UTC),
- },
- "UUID2": {
- Summary: &structs.StatValue{
- Unit: MemoryStateUnit,
- Desc: MemoryStateDesc,
- IntNumeratorVal: helper.Int64ToPtr(2),
- IntDenominatorVal: helper.Int64ToPtr(2),
- },
- Stats: &structs.StatObject{
- Attributes: map[string]*structs.StatValue{
- PowerUsageAttr: {
- Unit: PowerUsageUnit,
- Desc: PowerUsageDesc,
- IntNumeratorVal: helper.Int64ToPtr(2),
- IntDenominatorVal: helper.Int64ToPtr(2),
- },
- GPUUtilizationAttr: {
- Unit: GPUUtilizationUnit,
- Desc: GPUUtilizationDesc,
- IntNumeratorVal: helper.Int64ToPtr(2),
- },
- MemoryUtilizationAttr: {
- Unit: MemoryUtilizationUnit,
- Desc: MemoryUtilizationDesc,
- IntNumeratorVal: helper.Int64ToPtr(2),
- },
- EncoderUtilizationAttr: {
- Unit: EncoderUtilizationUnit,
- Desc: EncoderUtilizationDesc,
- IntNumeratorVal: helper.Int64ToPtr(2),
- },
- DecoderUtilizationAttr: {
- Unit: DecoderUtilizationUnit,
- Desc: DecoderUtilizationDesc,
- IntNumeratorVal: helper.Int64ToPtr(2),
- },
- TemperatureAttr: {
- Unit: TemperatureUnit,
- Desc: TemperatureDesc,
- IntNumeratorVal: helper.Int64ToPtr(2),
- },
- MemoryStateAttr: {
- Unit: MemoryStateUnit,
- Desc: MemoryStateDesc,
- IntNumeratorVal: helper.Int64ToPtr(2),
- IntDenominatorVal: helper.Int64ToPtr(2),
- },
- BAR1StateAttr: {
- Unit: BAR1StateUnit,
- Desc: BAR1StateDesc,
- IntNumeratorVal: helper.Int64ToPtr(2),
- IntDenominatorVal: helper.Int64ToPtr(256),
- },
- ECCErrorsL1CacheAttr: {
- Unit: ECCErrorsL1CacheUnit,
- Desc: ECCErrorsL1CacheDesc,
- IntNumeratorVal: helper.Int64ToPtr(200),
- },
- ECCErrorsL2CacheAttr: {
- Unit: ECCErrorsL2CacheUnit,
- Desc: ECCErrorsL2CacheDesc,
- IntNumeratorVal: helper.Int64ToPtr(200),
- },
- ECCErrorsDeviceAttr: {
- Unit: ECCErrorsDeviceUnit,
- Desc: ECCErrorsDeviceDesc,
- IntNumeratorVal: helper.Int64ToPtr(200),
- },
- },
- },
- Timestamp: time.Date(1974, time.May, 19, 1, 2, 3, 4, time.UTC),
- },
- },
- },
- },
- },
- },
- {
- Name: "Check that only devices from NvidiaDevice.device map stats are reported",
- Timestamp: time.Date(1974, time.May, 19, 1, 2, 3, 4, time.UTC),
- Device: &NvidiaDevice{
- devices: map[string]struct{}{
- "UUID1": {},
- "UUID2": {},
- },
- nvmlClient: &MockNvmlClient{
- StatsResponseReturned: []*nvml.StatsData{
- {
- DeviceData: &nvml.DeviceData{
- UUID: "UUID1",
- DeviceName: helper.StringToPtr("DeviceName1"),
- MemoryMiB: helper.Uint64ToPtr(1),
- PowerW: helper.UintToPtr(1),
- BAR1MiB: helper.Uint64ToPtr(256),
- },
- PowerUsageW: helper.UintToPtr(1),
- GPUUtilization: helper.UintToPtr(1),
- MemoryUtilization: helper.UintToPtr(1),
- EncoderUtilization: helper.UintToPtr(1),
- DecoderUtilization: helper.UintToPtr(1),
- TemperatureC: helper.UintToPtr(1),
- UsedMemoryMiB: helper.Uint64ToPtr(1),
- BAR1UsedMiB: helper.Uint64ToPtr(1),
- ECCErrorsL1Cache: helper.Uint64ToPtr(100),
- ECCErrorsL2Cache: helper.Uint64ToPtr(100),
- ECCErrorsDevice: helper.Uint64ToPtr(100),
- },
- {
- DeviceData: &nvml.DeviceData{
- UUID: "UUID2",
- DeviceName: helper.StringToPtr("DeviceName2"),
- MemoryMiB: helper.Uint64ToPtr(2),
- PowerW: helper.UintToPtr(2),
- BAR1MiB: helper.Uint64ToPtr(256),
- },
- PowerUsageW: helper.UintToPtr(2),
- GPUUtilization: helper.UintToPtr(2),
- MemoryUtilization: helper.UintToPtr(2),
- EncoderUtilization: helper.UintToPtr(2),
- DecoderUtilization: helper.UintToPtr(2),
- TemperatureC: helper.UintToPtr(2),
- UsedMemoryMiB: helper.Uint64ToPtr(2),
- BAR1UsedMiB: helper.Uint64ToPtr(2),
- ECCErrorsL1Cache: helper.Uint64ToPtr(200),
- ECCErrorsL2Cache: helper.Uint64ToPtr(200),
- ECCErrorsDevice: helper.Uint64ToPtr(200),
- },
- {
- DeviceData: &nvml.DeviceData{
- UUID: "UUID3",
- DeviceName: helper.StringToPtr("DeviceName3"),
- MemoryMiB: helper.Uint64ToPtr(3),
- PowerW: helper.UintToPtr(3),
- BAR1MiB: helper.Uint64ToPtr(256),
- },
- PowerUsageW: helper.UintToPtr(3),
- GPUUtilization: helper.UintToPtr(3),
- MemoryUtilization: helper.UintToPtr(3),
- EncoderUtilization: helper.UintToPtr(3),
- DecoderUtilization: helper.UintToPtr(3),
- TemperatureC: helper.UintToPtr(3),
- UsedMemoryMiB: helper.Uint64ToPtr(3),
- BAR1UsedMiB: helper.Uint64ToPtr(3),
- ECCErrorsL1Cache: helper.Uint64ToPtr(300),
- ECCErrorsL2Cache: helper.Uint64ToPtr(300),
- ECCErrorsDevice: helper.Uint64ToPtr(300),
- },
- },
- },
- logger: hclog.NewNullLogger(),
- },
- ExpectedWriteToChannel: &device.StatsResponse{
- Groups: []*device.DeviceGroupStats{
- {
- Vendor: vendor,
- Type: deviceType,
- Name: "DeviceName1",
- InstanceStats: map[string]*device.DeviceStats{
- "UUID1": {
- Summary: &structs.StatValue{
- Unit: MemoryStateUnit,
- Desc: MemoryStateDesc,
- IntNumeratorVal: helper.Int64ToPtr(1),
- IntDenominatorVal: helper.Int64ToPtr(1),
- },
- Stats: &structs.StatObject{
- Attributes: map[string]*structs.StatValue{
- PowerUsageAttr: {
- Unit: PowerUsageUnit,
- Desc: PowerUsageDesc,
- IntNumeratorVal: helper.Int64ToPtr(1),
- IntDenominatorVal: helper.Int64ToPtr(1),
- },
- GPUUtilizationAttr: {
- Unit: GPUUtilizationUnit,
- Desc: GPUUtilizationDesc,
- IntNumeratorVal: helper.Int64ToPtr(1),
- },
- MemoryUtilizationAttr: {
- Unit: MemoryUtilizationUnit,
- Desc: MemoryUtilizationDesc,
- IntNumeratorVal: helper.Int64ToPtr(1),
- },
- EncoderUtilizationAttr: {
- Unit: EncoderUtilizationUnit,
- Desc: EncoderUtilizationDesc,
- IntNumeratorVal: helper.Int64ToPtr(1),
- },
- DecoderUtilizationAttr: {
- Unit: DecoderUtilizationUnit,
- Desc: DecoderUtilizationDesc,
- IntNumeratorVal: helper.Int64ToPtr(1),
- },
- TemperatureAttr: {
- Unit: TemperatureUnit,
- Desc: TemperatureDesc,
- IntNumeratorVal: helper.Int64ToPtr(1),
- },
- MemoryStateAttr: {
- Unit: MemoryStateUnit,
- Desc: MemoryStateDesc,
- IntNumeratorVal: helper.Int64ToPtr(1),
- IntDenominatorVal: helper.Int64ToPtr(1),
- },
- BAR1StateAttr: {
- Unit: BAR1StateUnit,
- Desc: BAR1StateDesc,
- IntNumeratorVal: helper.Int64ToPtr(1),
- IntDenominatorVal: helper.Int64ToPtr(256),
- },
- ECCErrorsL1CacheAttr: {
- Unit: ECCErrorsL1CacheUnit,
- Desc: ECCErrorsL1CacheDesc,
- IntNumeratorVal: helper.Int64ToPtr(100),
- },
- ECCErrorsL2CacheAttr: {
- Unit: ECCErrorsL2CacheUnit,
- Desc: ECCErrorsL2CacheDesc,
- IntNumeratorVal: helper.Int64ToPtr(100),
- },
- ECCErrorsDeviceAttr: {
- Unit: ECCErrorsDeviceUnit,
- Desc: ECCErrorsDeviceDesc,
- IntNumeratorVal: helper.Int64ToPtr(100),
- },
- },
- },
- Timestamp: time.Date(1974, time.May, 19, 1, 2, 3, 4, time.UTC),
- },
- },
- },
- {
- Vendor: vendor,
- Type: deviceType,
- Name: "DeviceName2",
- InstanceStats: map[string]*device.DeviceStats{
- "UUID2": {
- Summary: &structs.StatValue{
- Unit: MemoryStateUnit,
- Desc: MemoryStateDesc,
- IntNumeratorVal: helper.Int64ToPtr(2),
- IntDenominatorVal: helper.Int64ToPtr(2),
- },
- Stats: &structs.StatObject{
- Attributes: map[string]*structs.StatValue{
- PowerUsageAttr: {
- Unit: PowerUsageUnit,
- Desc: PowerUsageDesc,
- IntNumeratorVal: helper.Int64ToPtr(2),
- IntDenominatorVal: helper.Int64ToPtr(2),
- },
- GPUUtilizationAttr: {
- Unit: GPUUtilizationUnit,
- Desc: GPUUtilizationDesc,
- IntNumeratorVal: helper.Int64ToPtr(2),
- },
- MemoryUtilizationAttr: {
- Unit: MemoryUtilizationUnit,
- Desc: MemoryUtilizationDesc,
- IntNumeratorVal: helper.Int64ToPtr(2),
- },
- EncoderUtilizationAttr: {
- Unit: EncoderUtilizationUnit,
- Desc: EncoderUtilizationDesc,
- IntNumeratorVal: helper.Int64ToPtr(2),
- },
- DecoderUtilizationAttr: {
- Unit: DecoderUtilizationUnit,
- Desc: DecoderUtilizationDesc,
- IntNumeratorVal: helper.Int64ToPtr(2),
- },
- TemperatureAttr: {
- Unit: TemperatureUnit,
- Desc: TemperatureDesc,
- IntNumeratorVal: helper.Int64ToPtr(2),
- },
- MemoryStateAttr: {
- Unit: MemoryStateUnit,
- Desc: MemoryStateDesc,
- IntNumeratorVal: helper.Int64ToPtr(2),
- IntDenominatorVal: helper.Int64ToPtr(2),
- },
- BAR1StateAttr: {
- Unit: BAR1StateUnit,
- Desc: BAR1StateDesc,
- IntNumeratorVal: helper.Int64ToPtr(2),
- IntDenominatorVal: helper.Int64ToPtr(256),
- },
- ECCErrorsL1CacheAttr: {
- Unit: ECCErrorsL1CacheUnit,
- Desc: ECCErrorsL1CacheDesc,
- IntNumeratorVal: helper.Int64ToPtr(200),
- },
- ECCErrorsL2CacheAttr: {
- Unit: ECCErrorsL2CacheUnit,
- Desc: ECCErrorsL2CacheDesc,
- IntNumeratorVal: helper.Int64ToPtr(200),
- },
- ECCErrorsDeviceAttr: {
- Unit: ECCErrorsDeviceUnit,
- Desc: ECCErrorsDeviceDesc,
- IntNumeratorVal: helper.Int64ToPtr(200),
- },
- },
- },
- Timestamp: time.Date(1974, time.May, 19, 1, 2, 3, 4, time.UTC),
- },
- },
- },
- },
- },
- },
- } {
- channel := make(chan *device.StatsResponse, 1)
- testCase.Device.writeStatsToChannel(channel, testCase.Timestamp)
- actualResult := <-channel
- // writeStatsToChannel iterates over map keys
- // and insterts results to an array, so order of elements in output array
- // may be different
- // actualResult, expectedWriteToChannel arrays has to be sorted firsted
- sort.Slice(actualResult.Groups, func(i, j int) bool {
- return actualResult.Groups[i].Name < actualResult.Groups[j].Name
- })
- sort.Slice(testCase.ExpectedWriteToChannel.Groups, func(i, j int) bool {
- return testCase.ExpectedWriteToChannel.Groups[i].Name < testCase.ExpectedWriteToChannel.Groups[j].Name
- })
- require.New(t).Equal(testCase.ExpectedWriteToChannel, actualResult)
- }
-}
diff --git a/go.mod b/go.mod
index db72e0ab3e9c..911b72a7663c 100644
--- a/go.mod
+++ b/go.mod
@@ -23,7 +23,6 @@ require (
github.com/LK4D4/joincontext v0.0.0-20171026170139-1724345da6d5
github.com/Microsoft/go-winio v0.4.15-0.20200113171025-3fe6c5262873
github.com/Microsoft/hcsshim v0.8.8-0.20200312192636-fd0797d766b1 // indirect
- github.com/NVIDIA/gpu-monitoring-tools v0.0.0-20180829222009-86f2a9fac6c5
github.com/NYTimes/gziphandler v1.0.1
github.com/armon/circbuf v0.0.0-20150827004946-bbbad097214e
github.com/armon/go-metrics v0.3.4
diff --git a/go.sum b/go.sum
index d8f6cfda76a7..6dfe3b5a41c6 100644
--- a/go.sum
+++ b/go.sum
@@ -67,8 +67,6 @@ github.com/LK4D4/joincontext v0.0.0-20171026170139-1724345da6d5/go.mod h1:nxQPcN
github.com/Microsoft/hcsshim v0.8.7/go.mod h1:OHd7sQqRFrYd3RmSgbgji+ctCwkbq2wbEYNSzOYtcBQ=
github.com/Microsoft/hcsshim v0.8.8-0.20200312192636-fd0797d766b1 h1:2T9t72RkTRjAcuFc+4vaGWnRx/anVngE1/VGN/HFEVk=
github.com/Microsoft/hcsshim v0.8.8-0.20200312192636-fd0797d766b1/go.mod h1:LVvUcNYEzt59fFVTuiPEgM6dgF70yMGdy/Qc/UmCbuU=
-github.com/NVIDIA/gpu-monitoring-tools v0.0.0-20180829222009-86f2a9fac6c5 h1:WLyvLAM0QfjAarRzRTG9EgT5McqGWNZMvqqSUSoyUUY=
-github.com/NVIDIA/gpu-monitoring-tools v0.0.0-20180829222009-86f2a9fac6c5/go.mod h1:nMOvShGpWaf0bXwXmeu4k+O4uziuaEI8pWzIj3BUrOA=
github.com/NYTimes/gziphandler v1.0.0 h1:OswZCvpiFsNRCbeapdJxDuikAqVXTgV7XAht8S9olZo=
github.com/NYTimes/gziphandler v1.0.0/go.mod h1:3wb06e3pkSAbeQ52E9H9iFoQsEEwGN64994WTCIhntQ=
github.com/PuerkitoBio/purell v1.0.0/go.mod h1:c11w/QuzBsJSee3cPx9rAFu61PvFxuPbtSwDGJws/X0=
diff --git a/helper/pluginutils/catalog/register_nvidia_linux.go b/helper/pluginutils/catalog/register_nvidia_linux.go
deleted file mode 100644
index a50cbe833a75..000000000000
--- a/helper/pluginutils/catalog/register_nvidia_linux.go
+++ /dev/null
@@ -1,14 +0,0 @@
-// +build !nonvidia
-
-package catalog
-
-import (
- "github.com/hashicorp/nomad/devices/gpu/nvidia"
-)
-
-// This file is where all builtin plugins should be registered in the catalog.
-// Plugins with build restrictions should be placed in the appropriate
-// register_XXX.go file.
-func init() {
- Register(nvidia.PluginID, nvidia.PluginConfig)
-}
diff --git a/vendor/github.com/NVIDIA/gpu-monitoring-tools/LICENSE b/vendor/github.com/NVIDIA/gpu-monitoring-tools/LICENSE
deleted file mode 100644
index 2a718d63da7f..000000000000
--- a/vendor/github.com/NVIDIA/gpu-monitoring-tools/LICENSE
+++ /dev/null
@@ -1,29 +0,0 @@
-BSD 3-Clause License
-
-Copyright (c) 2018, NVIDIA Corporation
-All rights reserved.
-
-Redistribution and use in source and binary forms, with or without
-modification, are permitted provided that the following conditions are met:
-
-* Redistributions of source code must retain the above copyright notice, this
- list of conditions and the following disclaimer.
-
-* Redistributions in binary form must reproduce the above copyright notice,
- this list of conditions and the following disclaimer in the documentation
- and/or other materials provided with the distribution.
-
-* Neither the name of the copyright holder nor the names of its
- contributors may be used to endorse or promote products derived from
- this software without specific prior written permission.
-
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
-DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
-FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
-DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
-SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
-CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
-OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
diff --git a/vendor/github.com/NVIDIA/gpu-monitoring-tools/bindings/go/nvml/bindings.go b/vendor/github.com/NVIDIA/gpu-monitoring-tools/bindings/go/nvml/bindings.go
deleted file mode 100644
index 4bba898342f3..000000000000
--- a/vendor/github.com/NVIDIA/gpu-monitoring-tools/bindings/go/nvml/bindings.go
+++ /dev/null
@@ -1,634 +0,0 @@
-// Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
-
-package nvml
-
-// #cgo LDFLAGS: -ldl -Wl,--unresolved-symbols=ignore-in-object-files
-// #include "nvml_dl.h"
-import "C"
-
-import (
- "errors"
- "fmt"
- "io/ioutil"
- "os"
- "sort"
- "strconv"
- "strings"
-)
-
-const (
- szDriver = C.NVML_SYSTEM_DRIVER_VERSION_BUFFER_SIZE
- szName = C.NVML_DEVICE_NAME_BUFFER_SIZE
- szUUID = C.NVML_DEVICE_UUID_BUFFER_SIZE
- szProcs = 32
- szProcName = 64
-
- XidCriticalError = C.nvmlEventTypeXidCriticalError
-)
-
-type handle struct{ dev C.nvmlDevice_t }
-type EventSet struct{ set C.nvmlEventSet_t }
-type Event struct {
- UUID *string
- Etype uint64
- Edata uint64
-}
-
-func uintPtr(c C.uint) *uint {
- i := uint(c)
- return &i
-}
-
-func uint64Ptr(c C.ulonglong) *uint64 {
- i := uint64(c)
- return &i
-}
-
-func stringPtr(c *C.char) *string {
- s := C.GoString(c)
- return &s
-}
-
-func errorString(ret C.nvmlReturn_t) error {
- if ret == C.NVML_SUCCESS {
- return nil
- }
- err := C.GoString(C.nvmlErrorString(ret))
- return fmt.Errorf("nvml: %v", err)
-}
-
-func init_() error {
- r := C.nvmlInit_dl()
- if r == C.NVML_ERROR_LIBRARY_NOT_FOUND {
- return errors.New("could not load NVML library")
- }
- return errorString(r)
-}
-
-func NewEventSet() EventSet {
- var set C.nvmlEventSet_t
- C.nvmlEventSetCreate(&set)
-
- return EventSet{set}
-}
-
-func RegisterEvent(es EventSet, event int) error {
- n, err := deviceGetCount()
- if err != nil {
- return err
- }
-
- var i uint
- for i = 0; i < n; i++ {
- h, err := deviceGetHandleByIndex(i)
- if err != nil {
- return err
- }
-
- r := C.nvmlDeviceRegisterEvents(h.dev, C.ulonglong(event), es.set)
- if r != C.NVML_SUCCESS {
- return errorString(r)
- }
- }
-
- return nil
-}
-
-func RegisterEventForDevice(es EventSet, event int, uuid string) error {
- n, err := deviceGetCount()
- if err != nil {
- return err
- }
-
- var i uint
- for i = 0; i < n; i++ {
- h, err := deviceGetHandleByIndex(i)
- if err != nil {
- return err
- }
-
- duuid, err := h.deviceGetUUID()
- if err != nil {
- return err
- }
-
- if *duuid != uuid {
- continue
- }
-
- r := C.nvmlDeviceRegisterEvents(h.dev, C.ulonglong(event), es.set)
- if r != C.NVML_SUCCESS {
- return errorString(r)
- }
-
- return nil
- }
-
- return fmt.Errorf("nvml: device not found")
-}
-
-func DeleteEventSet(es EventSet) {
- C.nvmlEventSetFree(es.set)
-}
-
-func WaitForEvent(es EventSet, timeout uint) (Event, error) {
- var data C.nvmlEventData_t
-
- r := C.nvmlEventSetWait(es.set, &data, C.uint(timeout))
- uuid, _ := handle{data.device}.deviceGetUUID()
-
- return Event{
- UUID: uuid,
- Etype: uint64(data.eventType),
- Edata: uint64(data.eventData),
- },
- errorString(r)
-}
-
-func shutdown() error {
- return errorString(C.nvmlShutdown_dl())
-}
-
-func systemGetDriverVersion() (string, error) {
- var driver [szDriver]C.char
-
- r := C.nvmlSystemGetDriverVersion(&driver[0], szDriver)
- return C.GoString(&driver[0]), errorString(r)
-}
-
-func systemGetProcessName(pid uint) (string, error) {
- var proc [szProcName]C.char
-
- r := C.nvmlSystemGetProcessName(C.uint(pid), &proc[0], szProcName)
- return C.GoString(&proc[0]), errorString(r)
-}
-
-func deviceGetCount() (uint, error) {
- var n C.uint
-
- r := C.nvmlDeviceGetCount(&n)
- return uint(n), errorString(r)
-}
-
-func deviceGetHandleByIndex(idx uint) (handle, error) {
- var dev C.nvmlDevice_t
-
- r := C.nvmlDeviceGetHandleByIndex(C.uint(idx), &dev)
- return handle{dev}, errorString(r)
-}
-
-func deviceGetTopologyCommonAncestor(h1, h2 handle) (*uint, error) {
- var level C.nvmlGpuTopologyLevel_t
-
- r := C.nvmlDeviceGetTopologyCommonAncestor_dl(h1.dev, h2.dev, &level)
- if r == C.NVML_ERROR_FUNCTION_NOT_FOUND || r == C.NVML_ERROR_NOT_SUPPORTED {
- return nil, nil
- }
- return uintPtr(C.uint(level)), errorString(r)
-}
-
-func (h handle) deviceGetName() (*string, error) {
- var name [szName]C.char
-
- r := C.nvmlDeviceGetName(h.dev, &name[0], szName)
- if r == C.NVML_ERROR_NOT_SUPPORTED {
- return nil, nil
- }
- return stringPtr(&name[0]), errorString(r)
-}
-
-func (h handle) deviceGetUUID() (*string, error) {
- var uuid [szUUID]C.char
-
- r := C.nvmlDeviceGetUUID(h.dev, &uuid[0], szUUID)
- if r == C.NVML_ERROR_NOT_SUPPORTED {
- return nil, nil
- }
- return stringPtr(&uuid[0]), errorString(r)
-}
-
-func (h handle) deviceGetPciInfo() (*string, error) {
- var pci C.nvmlPciInfo_t
-
- r := C.nvmlDeviceGetPciInfo(h.dev, &pci)
- if r == C.NVML_ERROR_NOT_SUPPORTED {
- return nil, nil
- }
- return stringPtr(&pci.busId[0]), errorString(r)
-}
-
-func (h handle) deviceGetMinorNumber() (*uint, error) {
- var minor C.uint
-
- r := C.nvmlDeviceGetMinorNumber(h.dev, &minor)
- if r == C.NVML_ERROR_NOT_SUPPORTED {
- return nil, nil
- }
- return uintPtr(minor), errorString(r)
-}
-
-func (h handle) deviceGetBAR1MemoryInfo() (*uint64, *uint64, error) {
- var bar1 C.nvmlBAR1Memory_t
-
- r := C.nvmlDeviceGetBAR1MemoryInfo(h.dev, &bar1)
- if r == C.NVML_ERROR_NOT_SUPPORTED {
- return nil, nil, nil
- }
- return uint64Ptr(bar1.bar1Total), uint64Ptr(bar1.bar1Used), errorString(r)
-}
-
-func (h handle) deviceGetPowerManagementLimit() (*uint, error) {
- var power C.uint
-
- r := C.nvmlDeviceGetPowerManagementLimit(h.dev, &power)
- if r == C.NVML_ERROR_NOT_SUPPORTED {
- return nil, nil
- }
- return uintPtr(power), errorString(r)
-}
-
-func (h handle) deviceGetMaxClockInfo() (*uint, *uint, error) {
- var sm, mem C.uint
-
- r := C.nvmlDeviceGetMaxClockInfo(h.dev, C.NVML_CLOCK_SM, &sm)
- if r == C.NVML_ERROR_NOT_SUPPORTED {
- return nil, nil, nil
- }
- if r == C.NVML_SUCCESS {
- r = C.nvmlDeviceGetMaxClockInfo(h.dev, C.NVML_CLOCK_MEM, &mem)
- }
- return uintPtr(sm), uintPtr(mem), errorString(r)
-}
-
-func (h handle) deviceGetMaxPcieLinkGeneration() (*uint, error) {
- var link C.uint
-
- r := C.nvmlDeviceGetMaxPcieLinkGeneration(h.dev, &link)
- if r == C.NVML_ERROR_NOT_SUPPORTED {
- return nil, nil
- }
- return uintPtr(link), errorString(r)
-}
-
-func (h handle) deviceGetMaxPcieLinkWidth() (*uint, error) {
- var width C.uint
-
- r := C.nvmlDeviceGetMaxPcieLinkWidth(h.dev, &width)
- if r == C.NVML_ERROR_NOT_SUPPORTED {
- return nil, nil
- }
- return uintPtr(width), errorString(r)
-}
-
-func (h handle) deviceGetPowerUsage() (*uint, error) {
- var power C.uint
-
- r := C.nvmlDeviceGetPowerUsage(h.dev, &power)
- if r == C.NVML_ERROR_NOT_SUPPORTED {
- return nil, nil
- }
- return uintPtr(power), errorString(r)
-}
-
-func (h handle) deviceGetTemperature() (*uint, error) {
- var temp C.uint
-
- r := C.nvmlDeviceGetTemperature(h.dev, C.NVML_TEMPERATURE_GPU, &temp)
- if r == C.NVML_ERROR_NOT_SUPPORTED {
- return nil, nil
- }
- return uintPtr(temp), errorString(r)
-}
-
-func (h handle) deviceGetUtilizationRates() (*uint, *uint, error) {
- var usage C.nvmlUtilization_t
-
- r := C.nvmlDeviceGetUtilizationRates(h.dev, &usage)
- if r == C.NVML_ERROR_NOT_SUPPORTED {
- return nil, nil, nil
- }
- return uintPtr(usage.gpu), uintPtr(usage.memory), errorString(r)
-}
-
-func (h handle) deviceGetEncoderUtilization() (*uint, error) {
- var usage, sampling C.uint
-
- r := C.nvmlDeviceGetEncoderUtilization(h.dev, &usage, &sampling)
- if r == C.NVML_ERROR_NOT_SUPPORTED {
- return nil, nil
- }
- return uintPtr(usage), errorString(r)
-}
-
-func (h handle) deviceGetDecoderUtilization() (*uint, error) {
- var usage, sampling C.uint
-
- r := C.nvmlDeviceGetDecoderUtilization(h.dev, &usage, &sampling)
- if r == C.NVML_ERROR_NOT_SUPPORTED {
- return nil, nil
- }
- return uintPtr(usage), errorString(r)
-}
-
-func (h handle) deviceGetMemoryInfo() (totalMem *uint64, devMem DeviceMemory, err error) {
- var mem C.nvmlMemory_t
-
- r := C.nvmlDeviceGetMemoryInfo(h.dev, &mem)
- if r == C.NVML_ERROR_NOT_SUPPORTED {
- return
- }
-
- err = errorString(r)
- if r != C.NVML_SUCCESS {
- return
- }
-
- totalMem = uint64Ptr(mem.total)
- if totalMem != nil {
- *totalMem /= 1024 * 1024 // MiB
- }
-
- devMem = DeviceMemory{
- Used: uint64Ptr(mem.used),
- Free: uint64Ptr(mem.free),
- }
-
- if devMem.Used != nil {
- *devMem.Used /= 1024 * 1024 // MiB
- }
-
- if devMem.Free != nil {
- *devMem.Free /= 1024 * 1024 // MiB
- }
- return
-}
-
-func (h handle) deviceGetClockInfo() (*uint, *uint, error) {
- var sm, mem C.uint
-
- r := C.nvmlDeviceGetClockInfo(h.dev, C.NVML_CLOCK_SM, &sm)
- if r == C.NVML_ERROR_NOT_SUPPORTED {
- return nil, nil, nil
- }
- if r == C.NVML_SUCCESS {
- r = C.nvmlDeviceGetClockInfo(h.dev, C.NVML_CLOCK_MEM, &mem)
- }
- return uintPtr(sm), uintPtr(mem), errorString(r)
-}
-
-func (h handle) deviceGetMemoryErrorCounter() (*uint64, *uint64, *uint64, error) {
- var l1, l2, mem C.ulonglong
-
- r := C.nvmlDeviceGetMemoryErrorCounter(h.dev, C.NVML_MEMORY_ERROR_TYPE_UNCORRECTED,
- C.NVML_VOLATILE_ECC, C.NVML_MEMORY_LOCATION_L1_CACHE, &l1)
- if r == C.NVML_ERROR_NOT_SUPPORTED {
- return nil, nil, nil, nil
- }
- if r == C.NVML_SUCCESS {
- r = C.nvmlDeviceGetMemoryErrorCounter(h.dev, C.NVML_MEMORY_ERROR_TYPE_UNCORRECTED,
- C.NVML_VOLATILE_ECC, C.NVML_MEMORY_LOCATION_L2_CACHE, &l2)
- }
- if r == C.NVML_SUCCESS {
- r = C.nvmlDeviceGetMemoryErrorCounter(h.dev, C.NVML_MEMORY_ERROR_TYPE_UNCORRECTED,
- C.NVML_VOLATILE_ECC, C.NVML_MEMORY_LOCATION_DEVICE_MEMORY, &mem)
- }
- return uint64Ptr(l1), uint64Ptr(l2), uint64Ptr(mem), errorString(r)
-}
-
-func (h handle) deviceGetPcieThroughput() (*uint, *uint, error) {
- var rx, tx C.uint
-
- r := C.nvmlDeviceGetPcieThroughput(h.dev, C.NVML_PCIE_UTIL_RX_BYTES, &rx)
- if r == C.NVML_ERROR_NOT_SUPPORTED {
- return nil, nil, nil
- }
- if r == C.NVML_SUCCESS {
- r = C.nvmlDeviceGetPcieThroughput(h.dev, C.NVML_PCIE_UTIL_TX_BYTES, &tx)
- }
- return uintPtr(rx), uintPtr(tx), errorString(r)
-}
-
-func (h handle) deviceGetComputeRunningProcesses() ([]uint, []uint64, error) {
- var procs [szProcs]C.nvmlProcessInfo_t
- var count = C.uint(szProcs)
-
- r := C.nvmlDeviceGetComputeRunningProcesses(h.dev, &count, &procs[0])
- if r == C.NVML_ERROR_NOT_SUPPORTED {
- return nil, nil, nil
- }
- n := int(count)
- pids := make([]uint, n)
- mems := make([]uint64, n)
- for i := 0; i < n; i++ {
- pids[i] = uint(procs[i].pid)
- mems[i] = uint64(procs[i].usedGpuMemory)
- }
- return pids, mems, errorString(r)
-}
-
-func (h handle) deviceGetGraphicsRunningProcesses() ([]uint, []uint64, error) {
- var procs [szProcs]C.nvmlProcessInfo_t
- var count = C.uint(szProcs)
-
- r := C.nvmlDeviceGetGraphicsRunningProcesses(h.dev, &count, &procs[0])
- if r == C.NVML_ERROR_NOT_SUPPORTED {
- return nil, nil, nil
- }
- n := int(count)
- pids := make([]uint, n)
- mems := make([]uint64, n)
- for i := 0; i < n; i++ {
- pids[i] = uint(procs[i].pid)
- mems[i] = uint64(procs[i].usedGpuMemory)
- }
- return pids, mems, errorString(r)
-}
-
-func (h handle) deviceGetAllRunningProcesses() ([]ProcessInfo, error) {
- cPids, cpMems, err := h.deviceGetComputeRunningProcesses()
- if err != nil {
- return nil, err
- }
-
- gPids, gpMems, err := h.deviceGetGraphicsRunningProcesses()
- if err != nil {
- return nil, err
- }
-
- allPids := make(map[uint]ProcessInfo)
-
- for i, pid := range cPids {
- name, err := processName(pid)
- if err != nil {
- return nil, err
- }
- allPids[pid] = ProcessInfo{
- PID: pid,
- Name: name,
- MemoryUsed: cpMems[i] / (1024 * 1024), // MiB
- Type: Compute,
- }
-
- }
-
- for i, pid := range gPids {
- pInfo, exists := allPids[pid]
- if exists {
- pInfo.Type = ComputeAndGraphics
- allPids[pid] = pInfo
- } else {
- name, err := processName(pid)
- if err != nil {
- return nil, err
- }
- allPids[pid] = ProcessInfo{
- PID: pid,
- Name: name,
- MemoryUsed: gpMems[i] / (1024 * 1024), // MiB
- Type: Graphics,
- }
- }
- }
-
- var processInfo []ProcessInfo
- for _, v := range allPids {
- processInfo = append(processInfo, v)
- }
- sort.Slice(processInfo, func(i, j int) bool {
- return processInfo[i].PID < processInfo[j].PID
- })
-
- return processInfo, nil
-}
-
-func (h handle) getClocksThrottleReasons() (reason ThrottleReason, err error) {
- var clocksThrottleReasons C.ulonglong
-
- r := C.nvmlDeviceGetCurrentClocksThrottleReasons(h.dev, &clocksThrottleReasons)
-
- if r == C.NVML_ERROR_NOT_SUPPORTED {
- return ThrottleReasonUnknown, nil
- }
-
- if r != C.NVML_SUCCESS {
- return ThrottleReasonUnknown, errorString(r)
- }
-
- switch clocksThrottleReasons {
- case C.nvmlClocksThrottleReasonGpuIdle:
- reason = ThrottleReasonGpuIdle
- case C.nvmlClocksThrottleReasonApplicationsClocksSetting:
- reason = ThrottleReasonApplicationsClocksSetting
- case C.nvmlClocksThrottleReasonSwPowerCap:
- reason = ThrottleReasonSwPowerCap
- case C.nvmlClocksThrottleReasonHwSlowdown:
- reason = ThrottleReasonHwSlowdown
- case C.nvmlClocksThrottleReasonSyncBoost:
- reason = ThrottleReasonSyncBoost
- case C.nvmlClocksThrottleReasonSwThermalSlowdown:
- reason = ThrottleReasonSwThermalSlowdown
- case C.nvmlClocksThrottleReasonHwThermalSlowdown:
- reason = ThrottleReasonHwThermalSlowdown
- case C.nvmlClocksThrottleReasonHwPowerBrakeSlowdown:
- reason = ThrottleReasonHwPowerBrakeSlowdown
- case C.nvmlClocksThrottleReasonDisplayClockSetting:
- reason = ThrottleReasonDisplayClockSetting
- case C.nvmlClocksThrottleReasonNone:
- reason = ThrottleReasonNone
- }
- return
-}
-
-func (h handle) getPerformanceState() (PerfState, error) {
- var pstate C.nvmlPstates_t
-
- r := C.nvmlDeviceGetPerformanceState(h.dev, &pstate)
-
- if r == C.NVML_ERROR_NOT_SUPPORTED {
- return PerfStateUnknown, nil
- }
-
- if r != C.NVML_SUCCESS {
- return PerfStateUnknown, errorString(r)
- }
- return PerfState(pstate), nil
-}
-
-func processName(pid uint) (string, error) {
- f := `/proc/` + strconv.FormatUint(uint64(pid), 10) + `/comm`
- d, err := ioutil.ReadFile(f)
-
- if err != nil {
- // TOCTOU: process terminated
- if os.IsNotExist(err) {
- return "", nil
- }
- return "", err
- }
- return strings.TrimSuffix(string(d), "\n"), err
-}
-
-func (h handle) getAccountingInfo() (accountingInfo Accounting, err error) {
- var mode C.nvmlEnableState_t
- var buffer C.uint
-
- r := C.nvmlDeviceGetAccountingMode(h.dev, &mode)
- if r == C.NVML_ERROR_NOT_SUPPORTED {
- return
- }
-
- if r != C.NVML_SUCCESS {
- return accountingInfo, errorString(r)
- }
-
- r = C.nvmlDeviceGetAccountingBufferSize(h.dev, &buffer)
- if r == C.NVML_ERROR_NOT_SUPPORTED {
- return
- }
-
- if r != C.NVML_SUCCESS {
- return accountingInfo, errorString(r)
- }
-
- accountingInfo = Accounting{
- Mode: ModeState(mode),
- BufferSize: uintPtr(buffer),
- }
- return
-}
-
-func (h handle) getDisplayInfo() (display Display, err error) {
- var mode, isActive C.nvmlEnableState_t
-
- r := C.nvmlDeviceGetDisplayActive(h.dev, &mode)
- if r == C.NVML_ERROR_NOT_SUPPORTED {
- return
- }
-
- if r != C.NVML_SUCCESS {
- return display, errorString(r)
- }
-
- r = C.nvmlDeviceGetDisplayMode(h.dev, &isActive)
- if r == C.NVML_ERROR_NOT_SUPPORTED {
- return
- }
- if r != C.NVML_SUCCESS {
- return display, errorString(r)
- }
- display = Display{
- Mode: ModeState(mode),
- Active: ModeState(isActive),
- }
- return
-}
-
-func (h handle) getPeristenceMode() (state ModeState, err error) {
- var mode C.nvmlEnableState_t
-
- r := C.nvmlDeviceGetPersistenceMode(h.dev, &mode)
- if r == C.NVML_ERROR_NOT_SUPPORTED {
- return
- }
- return ModeState(mode), errorString(r)
-}
diff --git a/vendor/github.com/NVIDIA/gpu-monitoring-tools/bindings/go/nvml/nvml.go b/vendor/github.com/NVIDIA/gpu-monitoring-tools/bindings/go/nvml/nvml.go
deleted file mode 100644
index f6ec9e8fae39..000000000000
--- a/vendor/github.com/NVIDIA/gpu-monitoring-tools/bindings/go/nvml/nvml.go
+++ /dev/null
@@ -1,533 +0,0 @@
-// Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
-
-package nvml
-
-// #include "nvml_dl.h"
-import "C"
-
-import (
- "bytes"
- "errors"
- "fmt"
- "io/ioutil"
- "strconv"
- "strings"
-)
-
-var (
- ErrCPUAffinity = errors.New("failed to retrieve CPU affinity")
- ErrUnsupportedP2PLink = errors.New("unsupported P2P link type")
- ErrUnsupportedGPU = errors.New("unsupported GPU device")
-)
-
-type ModeState uint
-
-const (
- Enabled ModeState = iota
- Disabled
-)
-
-func (m ModeState) String() string {
- switch m {
- case Enabled:
- return "Enabled"
- case Disabled:
- return "Disabled"
- }
- return "N/A"
-}
-
-type Display struct {
- Mode ModeState
- Active ModeState
-}
-
-type Accounting struct {
- Mode ModeState
- BufferSize *uint
-}
-
-type DeviceMode struct {
- DisplayInfo Display
- Persistence ModeState
- AccountingInfo Accounting
-}
-
-type ThrottleReason uint
-
-const (
- ThrottleReasonGpuIdle ThrottleReason = iota
- ThrottleReasonApplicationsClocksSetting
- ThrottleReasonSwPowerCap
- ThrottleReasonHwSlowdown
- ThrottleReasonSyncBoost
- ThrottleReasonSwThermalSlowdown
- ThrottleReasonHwThermalSlowdown
- ThrottleReasonHwPowerBrakeSlowdown
- ThrottleReasonDisplayClockSetting
- ThrottleReasonNone
- ThrottleReasonUnknown
-)
-
-func (r ThrottleReason) String() string {
- switch r {
- case ThrottleReasonGpuIdle:
- return "Gpu Idle"
- case ThrottleReasonApplicationsClocksSetting:
- return "Applications Clocks Setting"
- case ThrottleReasonSwPowerCap:
- return "SW Power Cap"
- case ThrottleReasonHwSlowdown:
- return "HW Slowdown"
- case ThrottleReasonSyncBoost:
- return "Sync Boost"
- case ThrottleReasonSwThermalSlowdown:
- return "SW Thermal Slowdown"
- case ThrottleReasonHwThermalSlowdown:
- return "HW Thermal Slowdown"
- case ThrottleReasonHwPowerBrakeSlowdown:
- return "HW Power Brake Slowdown"
- case ThrottleReasonDisplayClockSetting:
- return "Display Clock Setting"
- case ThrottleReasonNone:
- return "No clocks throttling"
- }
- return "N/A"
-}
-
-type PerfState uint
-
-const (
- PerfStateMax = 0
- PerfStateMin = 15
- PerfStateUnknown = 32
-)
-
-func (p PerfState) String() string {
- if p >= PerfStateMax && p <= PerfStateMin {
- return fmt.Sprintf("P%d", p)
- }
- return "Unknown"
-}
-
-type ProcessType uint
-
-const (
- Compute ProcessType = iota
- Graphics
- ComputeAndGraphics
-)
-
-func (t ProcessType) String() string {
- typ := "C+G"
- if t == Compute {
- typ = "C"
- } else if t == Graphics {
- typ = "G"
- }
- return typ
-}
-
-type P2PLinkType uint
-
-const (
- P2PLinkUnknown P2PLinkType = iota
- P2PLinkCrossCPU
- P2PLinkSameCPU
- P2PLinkHostBridge
- P2PLinkMultiSwitch
- P2PLinkSingleSwitch
- P2PLinkSameBoard
-)
-
-type P2PLink struct {
- BusID string
- Link P2PLinkType
-}
-
-func (t P2PLinkType) String() string {
- switch t {
- case P2PLinkCrossCPU:
- return "Cross CPU socket"
- case P2PLinkSameCPU:
- return "Same CPU socket"
- case P2PLinkHostBridge:
- return "Host PCI bridge"
- case P2PLinkMultiSwitch:
- return "Multiple PCI switches"
- case P2PLinkSingleSwitch:
- return "Single PCI switch"
- case P2PLinkSameBoard:
- return "Same board"
- case P2PLinkUnknown:
- }
- return "N/A"
-}
-
-type ClockInfo struct {
- Cores *uint
- Memory *uint
-}
-
-type PCIInfo struct {
- BusID string
- BAR1 *uint64
- Bandwidth *uint
-}
-
-type Device struct {
- handle
-
- UUID string
- Path string
- Model *string
- Power *uint
- Memory *uint64
- CPUAffinity *uint
- PCI PCIInfo
- Clocks ClockInfo
- Topology []P2PLink
-}
-
-type UtilizationInfo struct {
- GPU *uint
- Memory *uint
- Encoder *uint
- Decoder *uint
-}
-
-type PCIThroughputInfo struct {
- RX *uint
- TX *uint
-}
-
-type PCIStatusInfo struct {
- BAR1Used *uint64
- Throughput PCIThroughputInfo
-}
-
-type ECCErrorsInfo struct {
- L1Cache *uint64
- L2Cache *uint64
- Device *uint64
-}
-
-type DeviceMemory struct {
- Used *uint64
- Free *uint64
-}
-
-type MemoryInfo struct {
- Global DeviceMemory
- ECCErrors ECCErrorsInfo
-}
-
-type ProcessInfo struct {
- PID uint
- Name string
- MemoryUsed uint64
- Type ProcessType
-}
-
-type DeviceStatus struct {
- Power *uint
- Temperature *uint
- Utilization UtilizationInfo
- Memory MemoryInfo
- Clocks ClockInfo
- PCI PCIStatusInfo
- Processes []ProcessInfo
- Throttle ThrottleReason
- Performance PerfState
-}
-
-func assert(err error) {
- if err != nil {
- panic(err)
- }
-}
-
-func Init() error {
- return init_()
-}
-
-func Shutdown() error {
- return shutdown()
-}
-
-func GetDeviceCount() (uint, error) {
- return deviceGetCount()
-}
-
-func GetDriverVersion() (string, error) {
- return systemGetDriverVersion()
-}
-
-func numaNode(busid string) (uint, error) {
- // discard leading zeros of busid
- b, err := ioutil.ReadFile(fmt.Sprintf("/sys/bus/pci/devices/%s/numa_node", strings.ToLower(busid[4:])))
- if err != nil {
- // XXX report node 0 if NUMA support isn't enabled
- return 0, nil
- }
- node, err := strconv.ParseInt(string(bytes.TrimSpace(b)), 10, 8)
- if err != nil {
- return 0, fmt.Errorf("%v: %v", ErrCPUAffinity, err)
- }
- if node < 0 {
- node = 0 // XXX report node 0 instead of NUMA_NO_NODE
- }
- return uint(node), nil
-}
-
-func pciBandwidth(gen, width *uint) *uint {
- m := map[uint]uint{
- 1: 250, // MB/s
- 2: 500,
- 3: 985,
- 4: 1969,
- }
- if gen == nil || width == nil {
- return nil
- }
- bw := m[*gen] * *width
- return &bw
-}
-
-func NewDevice(idx uint) (device *Device, err error) {
- defer func() {
- if r := recover(); r != nil {
- err = r.(error)
- }
- }()
-
- h, err := deviceGetHandleByIndex(idx)
- assert(err)
- model, err := h.deviceGetName()
- assert(err)
- uuid, err := h.deviceGetUUID()
- assert(err)
- minor, err := h.deviceGetMinorNumber()
- assert(err)
- power, err := h.deviceGetPowerManagementLimit()
- assert(err)
- totalMem, _, err := h.deviceGetMemoryInfo()
- assert(err)
- busid, err := h.deviceGetPciInfo()
- assert(err)
- bar1, _, err := h.deviceGetBAR1MemoryInfo()
- assert(err)
- pcig, err := h.deviceGetMaxPcieLinkGeneration()
- assert(err)
- pciw, err := h.deviceGetMaxPcieLinkWidth()
- assert(err)
- ccore, cmem, err := h.deviceGetMaxClockInfo()
- assert(err)
-
- if minor == nil || busid == nil || uuid == nil {
- return nil, ErrUnsupportedGPU
- }
- path := fmt.Sprintf("/dev/nvidia%d", *minor)
- node, err := numaNode(*busid)
- assert(err)
-
- device = &Device{
- handle: h,
- UUID: *uuid,
- Path: path,
- Model: model,
- Power: power,
- Memory: totalMem,
- CPUAffinity: &node,
- PCI: PCIInfo{
- BusID: *busid,
- BAR1: bar1,
- Bandwidth: pciBandwidth(pcig, pciw), // MB/s
- },
- Clocks: ClockInfo{
- Cores: ccore, // MHz
- Memory: cmem, // MHz
- },
- }
- if power != nil {
- *device.Power /= 1000 // W
- }
- if bar1 != nil {
- *device.PCI.BAR1 /= 1024 * 1024 // MiB
- }
- return
-}
-
-func NewDeviceLite(idx uint) (device *Device, err error) {
- defer func() {
- if r := recover(); r != nil {
- err = r.(error)
- }
- }()
-
- h, err := deviceGetHandleByIndex(idx)
- assert(err)
- uuid, err := h.deviceGetUUID()
- assert(err)
- minor, err := h.deviceGetMinorNumber()
- assert(err)
- busid, err := h.deviceGetPciInfo()
- assert(err)
-
- if minor == nil || busid == nil || uuid == nil {
- return nil, ErrUnsupportedGPU
- }
- path := fmt.Sprintf("/dev/nvidia%d", *minor)
-
- device = &Device{
- handle: h,
- UUID: *uuid,
- Path: path,
- PCI: PCIInfo{
- BusID: *busid,
- },
- }
- return
-}
-
-func (d *Device) Status() (status *DeviceStatus, err error) {
- defer func() {
- if r := recover(); r != nil {
- err = r.(error)
- }
- }()
-
- power, err := d.deviceGetPowerUsage()
- assert(err)
- temp, err := d.deviceGetTemperature()
- assert(err)
- ugpu, umem, err := d.deviceGetUtilizationRates()
- assert(err)
- uenc, err := d.deviceGetEncoderUtilization()
- assert(err)
- udec, err := d.deviceGetDecoderUtilization()
- assert(err)
- _, devMem, err := d.deviceGetMemoryInfo()
- assert(err)
- ccore, cmem, err := d.deviceGetClockInfo()
- assert(err)
- _, bar1, err := d.deviceGetBAR1MemoryInfo()
- assert(err)
- el1, el2, emem, err := d.deviceGetMemoryErrorCounter()
- assert(err)
- pcirx, pcitx, err := d.deviceGetPcieThroughput()
- assert(err)
- throttle, err := d.getClocksThrottleReasons()
- assert(err)
- perfState, err := d.getPerformanceState()
- assert(err)
- processInfo, err := d.deviceGetAllRunningProcesses()
- assert(err)
-
- status = &DeviceStatus{
- Power: power,
- Temperature: temp, // °C
- Utilization: UtilizationInfo{
- GPU: ugpu, // %
- Memory: umem, // %
- Encoder: uenc, // %
- Decoder: udec, // %
- },
- Memory: MemoryInfo{
- Global: devMem,
- ECCErrors: ECCErrorsInfo{
- L1Cache: el1,
- L2Cache: el2,
- Device: emem,
- },
- },
- Clocks: ClockInfo{
- Cores: ccore, // MHz
- Memory: cmem, // MHz
- },
- PCI: PCIStatusInfo{
- BAR1Used: bar1,
- Throughput: PCIThroughputInfo{
- RX: pcirx,
- TX: pcitx,
- },
- },
- Throttle: throttle,
- Performance: perfState,
- Processes: processInfo,
- }
- if power != nil {
- *status.Power /= 1000 // W
- }
- if bar1 != nil {
- *status.PCI.BAR1Used /= 1024 * 1024 // MiB
- }
- if pcirx != nil {
- *status.PCI.Throughput.RX /= 1000 // MB/s
- }
- if pcitx != nil {
- *status.PCI.Throughput.TX /= 1000 // MB/s
- }
- return
-}
-
-func GetP2PLink(dev1, dev2 *Device) (link P2PLinkType, err error) {
- level, err := deviceGetTopologyCommonAncestor(dev1.handle, dev2.handle)
- if err != nil || level == nil {
- return P2PLinkUnknown, err
- }
-
- switch *level {
- case C.NVML_TOPOLOGY_INTERNAL:
- link = P2PLinkSameBoard
- case C.NVML_TOPOLOGY_SINGLE:
- link = P2PLinkSingleSwitch
- case C.NVML_TOPOLOGY_MULTIPLE:
- link = P2PLinkMultiSwitch
- case C.NVML_TOPOLOGY_HOSTBRIDGE:
- link = P2PLinkHostBridge
- case C.NVML_TOPOLOGY_CPU:
- link = P2PLinkSameCPU
- case C.NVML_TOPOLOGY_SYSTEM:
- link = P2PLinkCrossCPU
- default:
- err = ErrUnsupportedP2PLink
- }
- return
-}
-
-func (d *Device) GetComputeRunningProcesses() ([]uint, []uint64, error) {
- return d.handle.deviceGetComputeRunningProcesses()
-}
-
-func (d *Device) GetGraphicsRunningProcesses() ([]uint, []uint64, error) {
- return d.handle.deviceGetGraphicsRunningProcesses()
-}
-
-func (d *Device) GetAllRunningProcesses() ([]ProcessInfo, error) {
- return d.handle.deviceGetAllRunningProcesses()
-}
-
-func (d *Device) GetDeviceMode() (mode *DeviceMode, err error) {
- defer func() {
- if r := recover(); r != nil {
- err = r.(error)
- }
- }()
-
- display, err := d.getDisplayInfo()
- assert(err)
-
- p, err := d.getPeristenceMode()
- assert(err)
-
- accounting, err := d.getAccountingInfo()
- assert(err)
-
- mode = &DeviceMode{
- DisplayInfo: display,
- Persistence: p,
- AccountingInfo: accounting,
- }
- return
-}
diff --git a/vendor/github.com/NVIDIA/gpu-monitoring-tools/bindings/go/nvml/nvml.h b/vendor/github.com/NVIDIA/gpu-monitoring-tools/bindings/go/nvml/nvml.h
deleted file mode 100644
index 60185dac239d..000000000000
--- a/vendor/github.com/NVIDIA/gpu-monitoring-tools/bindings/go/nvml/nvml.h
+++ /dev/null
@@ -1,5871 +0,0 @@
-/*
- * Copyright 1993-2017 NVIDIA Corporation. All rights reserved.
- *
- * NOTICE TO USER:
- *
- * This source code is subject to NVIDIA ownership rights under U.S. and
- * international Copyright laws. Users and possessors of this source code
- * are hereby granted a nonexclusive, royalty-free license to use this code
- * in individual and commercial software.
- *
- * NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE
- * CODE FOR ANY PURPOSE. IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR
- * IMPLIED WARRANTY OF ANY KIND. NVIDIA DISCLAIMS ALL WARRANTIES WITH
- * REGARD TO THIS SOURCE CODE, INCLUDING ALL IMPLIED WARRANTIES OF
- * MERCHANTABILITY, NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
- * IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY SPECIAL, INDIRECT, INCIDENTAL,
- * OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS
- * OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE
- * OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE
- * OR PERFORMANCE OF THIS SOURCE CODE.
- *
- * U.S. Government End Users. This source code is a "commercial item" as
- * that term is defined at 48 C.F.R. 2.101 (OCT 1995), consisting of
- * "commercial computer software" and "commercial computer software
- * documentation" as such terms are used in 48 C.F.R. 12.212 (SEPT 1995)
- * and is provided to the U.S. Government only as a commercial end item.
- * Consistent with 48 C.F.R.12.212 and 48 C.F.R. 227.7202-1 through
- * 227.7202-4 (JUNE 1995), all U.S. Government End Users acquire the
- * source code with only those rights set forth herein.
- *
- * Any use of this source code in individual and commercial software must
- * include, in the user documentation and internal comments to the code,
- * the above Disclaimer and U.S. Government End Users Notice.
- */
-
-/*
-NVML API Reference
-
-The NVIDIA Management Library (NVML) is a C-based programmatic interface for monitoring and
-managing various states within NVIDIA Tesla &tm; GPUs. It is intended to be a platform for building
-3rd party applications, and is also the underlying library for the NVIDIA-supported nvidia-smi
-tool. NVML is thread-safe so it is safe to make simultaneous NVML calls from multiple threads.
-
-API Documentation
-
-Supported platforms:
-- Windows: Windows Server 2008 R2 64bit, Windows Server 2012 R2 64bit, Windows 7 64bit, Windows 8 64bit, Windows 10 64bit
-- Linux: 32-bit and 64-bit
-- Hypervisors: Windows Server 2008R2/2012 Hyper-V 64bit, Citrix XenServer 6.2 SP1+, VMware ESX 5.1/5.5
-
-Supported products:
-- Full Support
- - All Tesla products, starting with the Fermi architecture
- - All Quadro products, starting with the Fermi architecture
- - All GRID products, starting with the Kepler architecture
- - Selected GeForce Titan products
-- Limited Support
- - All Geforce products, starting with the Fermi architecture
-
-The NVML library can be found at \%ProgramW6432\%\\"NVIDIA Corporation"\\NVSMI\\ on Windows. It is
-not be added to the system path by default. To dynamically link to NVML, add this path to the PATH
-environmental variable. To dynamically load NVML, call LoadLibrary with this path.
-
-On Linux the NVML library will be found on the standard library path. For 64 bit Linux, both the 32 bit
-and 64 bit NVML libraries will be installed.
-
-Online documentation for this library is available at http://docs.nvidia.com/deploy/nvml-api/index.html
-*/
-
-#ifndef __nvml_nvml_h__
-#define __nvml_nvml_h__
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-/*
- * On Windows, set up methods for DLL export
- * define NVML_STATIC_IMPORT when using nvml_loader library
- */
-#if defined _WINDOWS
- #if !defined NVML_STATIC_IMPORT
- #if defined NVML_LIB_EXPORT
- #define DECLDIR __declspec(dllexport)
- #else
- #define DECLDIR __declspec(dllimport)
- #endif
- #else
- #define DECLDIR
- #endif
-#else
- #define DECLDIR
-#endif
-
-/**
- * NVML API versioning support
- */
-#define NVML_API_VERSION 9
-#define NVML_API_VERSION_STR "9"
-#define nvmlInit nvmlInit_v2
-#define nvmlDeviceGetPciInfo nvmlDeviceGetPciInfo_v3
-#define nvmlDeviceGetCount nvmlDeviceGetCount_v2
-#define nvmlDeviceGetHandleByIndex nvmlDeviceGetHandleByIndex_v2
-#define nvmlDeviceGetHandleByPciBusId nvmlDeviceGetHandleByPciBusId_v2
-#define nvmlDeviceGetNvLinkRemotePciInfo nvmlDeviceGetNvLinkRemotePciInfo_v2
-#define nvmlDeviceRemoveGpu nvmlDeviceRemoveGpu_v2
-
-/***************************************************************************************************/
-/** @defgroup nvmlDeviceStructs Device Structs
- * @{
- */
-/***************************************************************************************************/
-
-/**
- * Special constant that some fields take when they are not available.
- * Used when only part of the struct is not available.
- *
- * Each structure explicitly states when to check for this value.
- */
-#define NVML_VALUE_NOT_AVAILABLE (-1)
-
-typedef struct nvmlDevice_st* nvmlDevice_t;
-
-/**
- * Buffer size guaranteed to be large enough for pci bus id
- */
-#define NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE 32
-
-/**
- * Buffer size guaranteed to be large enough for pci bus id for ::busIdLegacy
- */
-#define NVML_DEVICE_PCI_BUS_ID_BUFFER_V2_SIZE 16
-
-/**
- * PCI information about a GPU device.
- */
-typedef struct nvmlPciInfo_st
-{
- char busIdLegacy[NVML_DEVICE_PCI_BUS_ID_BUFFER_V2_SIZE]; //!< The legacy tuple domain:bus:device.function PCI identifier (& NULL terminator)
- unsigned int domain; //!< The PCI domain on which the device's bus resides, 0 to 0xffffffff
- unsigned int bus; //!< The bus on which the device resides, 0 to 0xff
- unsigned int device; //!< The device's id on the bus, 0 to 31
- unsigned int pciDeviceId; //!< The combined 16-bit device id and 16-bit vendor id
-
- // Added in NVML 2.285 API
- unsigned int pciSubSystemId; //!< The 32-bit Sub System Device ID
-
- char busId[NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE]; //!< The tuple domain:bus:device.function PCI identifier (& NULL terminator)
-} nvmlPciInfo_t;
-
-/**
- * Detailed ECC error counts for a device.
- *
- * @deprecated Different GPU families can have different memory error counters
- * See \ref nvmlDeviceGetMemoryErrorCounter
- */
-typedef struct nvmlEccErrorCounts_st
-{
- unsigned long long l1Cache; //!< L1 cache errors
- unsigned long long l2Cache; //!< L2 cache errors
- unsigned long long deviceMemory; //!< Device memory errors
- unsigned long long registerFile; //!< Register file errors
-} nvmlEccErrorCounts_t;
-
-/**
- * Utilization information for a device.
- * Each sample period may be between 1 second and 1/6 second, depending on the product being queried.
- */
-typedef struct nvmlUtilization_st
-{
- unsigned int gpu; //!< Percent of time over the past sample period during which one or more kernels was executing on the GPU
- unsigned int memory; //!< Percent of time over the past sample period during which global (device) memory was being read or written
-} nvmlUtilization_t;
-
-/**
- * Memory allocation information for a device.
- */
-typedef struct nvmlMemory_st
-{
- unsigned long long total; //!< Total installed FB memory (in bytes)
- unsigned long long free; //!< Unallocated FB memory (in bytes)
- unsigned long long used; //!< Allocated FB memory (in bytes). Note that the driver/GPU always sets aside a small amount of memory for bookkeeping
-} nvmlMemory_t;
-
-/**
- * BAR1 Memory allocation Information for a device
- */
-typedef struct nvmlBAR1Memory_st
-{
- unsigned long long bar1Total; //!< Total BAR1 Memory (in bytes)
- unsigned long long bar1Free; //!< Unallocated BAR1 Memory (in bytes)
- unsigned long long bar1Used; //!< Allocated Used Memory (in bytes)
-}nvmlBAR1Memory_t;
-
-/**
- * Information about running compute processes on the GPU
- */
-typedef struct nvmlProcessInfo_st
-{
- unsigned int pid; //!< Process ID
- unsigned long long usedGpuMemory; //!< Amount of used GPU memory in bytes.
- //! Under WDDM, \ref NVML_VALUE_NOT_AVAILABLE is always reported
- //! because Windows KMD manages all the memory and not the NVIDIA driver
-} nvmlProcessInfo_t;
-
-/**
- * Enum to represent type of bridge chip
- */
-typedef enum nvmlBridgeChipType_enum
-{
- NVML_BRIDGE_CHIP_PLX = 0,
- NVML_BRIDGE_CHIP_BRO4 = 1
-}nvmlBridgeChipType_t;
-
-/**
- * Maximum number of NvLink links supported
- */
-#define NVML_NVLINK_MAX_LINKS 6
-
-/**
- * Enum to represent the NvLink utilization counter packet units
- */
-typedef enum nvmlNvLinkUtilizationCountUnits_enum
-{
- NVML_NVLINK_COUNTER_UNIT_CYCLES = 0, // count by cycles
- NVML_NVLINK_COUNTER_UNIT_PACKETS = 1, // count by packets
- NVML_NVLINK_COUNTER_UNIT_BYTES = 2, // count by bytes
-
- // this must be last
- NVML_NVLINK_COUNTER_UNIT_COUNT
-} nvmlNvLinkUtilizationCountUnits_t;
-
-/**
- * Enum to represent the NvLink utilization counter packet types to count
- * ** this is ONLY applicable with the units as packets or bytes
- * ** as specified in \a nvmlNvLinkUtilizationCountUnits_t
- * ** all packet filter descriptions are target GPU centric
- * ** these can be "OR'd" together
- */
-typedef enum nvmlNvLinkUtilizationCountPktTypes_enum
-{
- NVML_NVLINK_COUNTER_PKTFILTER_NOP = 0x1, // no operation packets
- NVML_NVLINK_COUNTER_PKTFILTER_READ = 0x2, // read packets
- NVML_NVLINK_COUNTER_PKTFILTER_WRITE = 0x4, // write packets
- NVML_NVLINK_COUNTER_PKTFILTER_RATOM = 0x8, // reduction atomic requests
- NVML_NVLINK_COUNTER_PKTFILTER_NRATOM = 0x10, // non-reduction atomic requests
- NVML_NVLINK_COUNTER_PKTFILTER_FLUSH = 0x20, // flush requests
- NVML_NVLINK_COUNTER_PKTFILTER_RESPDATA = 0x40, // responses with data
- NVML_NVLINK_COUNTER_PKTFILTER_RESPNODATA = 0x80, // responses without data
- NVML_NVLINK_COUNTER_PKTFILTER_ALL = 0xFF // all packets
-} nvmlNvLinkUtilizationCountPktTypes_t;
-
-/**
- * Struct to define the NVLINK counter controls
- */
-typedef struct nvmlNvLinkUtilizationControl_st
-{
- nvmlNvLinkUtilizationCountUnits_t units;
- nvmlNvLinkUtilizationCountPktTypes_t pktfilter;
-} nvmlNvLinkUtilizationControl_t;
-
-/**
- * Enum to represent NvLink queryable capabilities
- */
-typedef enum nvmlNvLinkCapability_enum
-{
- NVML_NVLINK_CAP_P2P_SUPPORTED = 0, // P2P over NVLink is supported
- NVML_NVLINK_CAP_SYSMEM_ACCESS = 1, // Access to system memory is supported
- NVML_NVLINK_CAP_P2P_ATOMICS = 2, // P2P atomics are supported
- NVML_NVLINK_CAP_SYSMEM_ATOMICS= 3, // System memory atomics are supported
- NVML_NVLINK_CAP_SLI_BRIDGE = 4, // SLI is supported over this link
- NVML_NVLINK_CAP_VALID = 5, // Link is supported on this device
- // should be last
- NVML_NVLINK_CAP_COUNT
-} nvmlNvLinkCapability_t;
-
-/**
- * Enum to represent NvLink queryable error counters
- */
-typedef enum nvmlNvLinkErrorCounter_enum
-{
- NVML_NVLINK_ERROR_DL_REPLAY = 0, // Data link transmit replay error counter
- NVML_NVLINK_ERROR_DL_RECOVERY = 1, // Data link transmit recovery error counter
- NVML_NVLINK_ERROR_DL_CRC_FLIT = 2, // Data link receive flow control digit CRC error counter
- NVML_NVLINK_ERROR_DL_CRC_DATA = 3, // Data link receive data CRC error counter
-
- // this must be last
- NVML_NVLINK_ERROR_COUNT
-} nvmlNvLinkErrorCounter_t;
-
-/**
- * Represents level relationships within a system between two GPUs
- * The enums are spaced to allow for future relationships
- */
-typedef enum nvmlGpuLevel_enum
-{
- NVML_TOPOLOGY_INTERNAL = 0, // e.g. Tesla K80
- NVML_TOPOLOGY_SINGLE = 10, // all devices that only need traverse a single PCIe switch
- NVML_TOPOLOGY_MULTIPLE = 20, // all devices that need not traverse a host bridge
- NVML_TOPOLOGY_HOSTBRIDGE = 30, // all devices that are connected to the same host bridge
- NVML_TOPOLOGY_NODE = 40, // all devices that are connected to the same NUMA node but possibly multiple host bridges
- NVML_TOPOLOGY_SYSTEM = 50, // all devices in the system
-
- // there is purposefully no COUNT here because of the need for spacing above
-} nvmlGpuTopologyLevel_t;
-
-/* Compatibility for CPU->NODE renaming */
-#define NVML_TOPOLOGY_CPU NVML_TOPOLOGY_NODE
-
-/* P2P Capability Index Status*/
-typedef enum nvmlGpuP2PStatus_enum
-{
- NVML_P2P_STATUS_OK = 0,
- NVML_P2P_STATUS_CHIPSET_NOT_SUPPORED,
- NVML_P2P_STATUS_GPU_NOT_SUPPORTED,
- NVML_P2P_STATUS_IOH_TOPOLOGY_NOT_SUPPORTED,
- NVML_P2P_STATUS_DISABLED_BY_REGKEY,
- NVML_P2P_STATUS_NOT_SUPPORTED,
- NVML_P2P_STATUS_UNKNOWN
-
-} nvmlGpuP2PStatus_t;
-
-/* P2P Capability Index*/
-typedef enum nvmlGpuP2PCapsIndex_enum
-{
- NVML_P2P_CAPS_INDEX_READ = 0,
- NVML_P2P_CAPS_INDEX_WRITE,
- NVML_P2P_CAPS_INDEX_NVLINK,
- NVML_P2P_CAPS_INDEX_ATOMICS,
- NVML_P2P_CAPS_INDEX_PROP,
- NVML_P2P_CAPS_INDEX_UNKNOWN
-}nvmlGpuP2PCapsIndex_t;
-
-/**
- * Maximum limit on Physical Bridges per Board
- */
-#define NVML_MAX_PHYSICAL_BRIDGE (128)
-
-/**
- * Information about the Bridge Chip Firmware
- */
-typedef struct nvmlBridgeChipInfo_st
-{
- nvmlBridgeChipType_t type; //!< Type of Bridge Chip
- unsigned int fwVersion; //!< Firmware Version. 0=Version is unavailable
-}nvmlBridgeChipInfo_t;
-
-/**
- * This structure stores the complete Hierarchy of the Bridge Chip within the board. The immediate
- * bridge is stored at index 0 of bridgeInfoList, parent to immediate bridge is at index 1 and so forth.
- */
-typedef struct nvmlBridgeChipHierarchy_st
-{
- unsigned char bridgeCount; //!< Number of Bridge Chips on the Board
- nvmlBridgeChipInfo_t bridgeChipInfo[NVML_MAX_PHYSICAL_BRIDGE]; //!< Hierarchy of Bridge Chips on the board
-}nvmlBridgeChipHierarchy_t;
-
-/**
- * Represents Type of Sampling Event
- */
-typedef enum nvmlSamplingType_enum
-{
- NVML_TOTAL_POWER_SAMPLES = 0, //!< To represent total power drawn by GPU
- NVML_GPU_UTILIZATION_SAMPLES = 1, //!< To represent percent of time during which one or more kernels was executing on the GPU
- NVML_MEMORY_UTILIZATION_SAMPLES = 2, //!< To represent percent of time during which global (device) memory was being read or written
- NVML_ENC_UTILIZATION_SAMPLES = 3, //!< To represent percent of time during which NVENC remains busy
- NVML_DEC_UTILIZATION_SAMPLES = 4, //!< To represent percent of time during which NVDEC remains busy
- NVML_PROCESSOR_CLK_SAMPLES = 5, //!< To represent processor clock samples
- NVML_MEMORY_CLK_SAMPLES = 6, //!< To represent memory clock samples
-
- // Keep this last
- NVML_SAMPLINGTYPE_COUNT
-}nvmlSamplingType_t;
-
-/**
- * Represents the queryable PCIe utilization counters
- */
-typedef enum nvmlPcieUtilCounter_enum
-{
- NVML_PCIE_UTIL_TX_BYTES = 0, // 1KB granularity
- NVML_PCIE_UTIL_RX_BYTES = 1, // 1KB granularity
-
- // Keep this last
- NVML_PCIE_UTIL_COUNT
-} nvmlPcieUtilCounter_t;
-
-/**
- * Represents the type for sample value returned
- */
-typedef enum nvmlValueType_enum
-{
- NVML_VALUE_TYPE_DOUBLE = 0,
- NVML_VALUE_TYPE_UNSIGNED_INT = 1,
- NVML_VALUE_TYPE_UNSIGNED_LONG = 2,
- NVML_VALUE_TYPE_UNSIGNED_LONG_LONG = 3,
- NVML_VALUE_TYPE_SIGNED_LONG_LONG = 4,
-
- // Keep this last
- NVML_VALUE_TYPE_COUNT
-}nvmlValueType_t;
-
-
-/**
- * Union to represent different types of Value
- */
-typedef union nvmlValue_st
-{
- double dVal; //!< If the value is double
- unsigned int uiVal; //!< If the value is unsigned int
- unsigned long ulVal; //!< If the value is unsigned long
- unsigned long long ullVal; //!< If the value is unsigned long long
- signed long long sllVal; //!< If the value is signed long long
-}nvmlValue_t;
-
-/**
- * Information for Sample
- */
-typedef struct nvmlSample_st
-{
- unsigned long long timeStamp; //!< CPU Timestamp in microseconds
- nvmlValue_t sampleValue; //!< Sample Value
-}nvmlSample_t;
-
-/**
- * Represents type of perf policy for which violation times can be queried
- */
-typedef enum nvmlPerfPolicyType_enum
-{
- NVML_PERF_POLICY_POWER = 0, //!< How long did power violations cause the GPU to be below application clocks
- NVML_PERF_POLICY_THERMAL = 1, //!< How long did thermal violations cause the GPU to be below application clocks
- NVML_PERF_POLICY_SYNC_BOOST = 2, //!< How long did sync boost cause the GPU to be below application clocks
- NVML_PERF_POLICY_BOARD_LIMIT = 3, //!< How long did the board limit cause the GPU to be below application clocks
- NVML_PERF_POLICY_LOW_UTILIZATION = 4, //!< How long did low utilization cause the GPU to be below application clocks
- NVML_PERF_POLICY_RELIABILITY = 5, //!< How long did the board reliability limit cause the GPU to be below application clocks
-
- NVML_PERF_POLICY_TOTAL_APP_CLOCKS = 10, //!< Total time the GPU was held below application clocks by any limiter (0 - 5 above)
- NVML_PERF_POLICY_TOTAL_BASE_CLOCKS = 11, //!< Total time the GPU was held below base clocks
-
- // Keep this last
- NVML_PERF_POLICY_COUNT
-}nvmlPerfPolicyType_t;
-
-/**
- * Struct to hold perf policy violation status data
- */
-typedef struct nvmlViolationTime_st
-{
- unsigned long long referenceTime; //!< referenceTime represents CPU timestamp in microseconds
- unsigned long long violationTime; //!< violationTime in Nanoseconds
-}nvmlViolationTime_t;
-
-/** @} */
-
-/***************************************************************************************************/
-/** @defgroup nvmlDeviceEnumvs Device Enums
- * @{
- */
-/***************************************************************************************************/
-
-/**
- * Generic enable/disable enum.
- */
-typedef enum nvmlEnableState_enum
-{
- NVML_FEATURE_DISABLED = 0, //!< Feature disabled
- NVML_FEATURE_ENABLED = 1 //!< Feature enabled
-} nvmlEnableState_t;
-
-//! Generic flag used to specify the default behavior of some functions. See description of particular functions for details.
-#define nvmlFlagDefault 0x00
-//! Generic flag used to force some behavior. See description of particular functions for details.
-#define nvmlFlagForce 0x01
-
-/**
- * * The Brand of the GPU
- * */
-typedef enum nvmlBrandType_enum
-{
- NVML_BRAND_UNKNOWN = 0,
- NVML_BRAND_QUADRO = 1,
- NVML_BRAND_TESLA = 2,
- NVML_BRAND_NVS = 3,
- NVML_BRAND_GRID = 4,
- NVML_BRAND_GEFORCE = 5,
- NVML_BRAND_TITAN = 6,
-
- // Keep this last
- NVML_BRAND_COUNT
-} nvmlBrandType_t;
-
-/**
- * Temperature thresholds.
- */
-typedef enum nvmlTemperatureThresholds_enum
-{
- NVML_TEMPERATURE_THRESHOLD_SHUTDOWN = 0, // Temperature at which the GPU will shut down
- // for HW protection
- NVML_TEMPERATURE_THRESHOLD_SLOWDOWN = 1, // Temperature at which the GPU will begin HW slowdown
- NVML_TEMPERATURE_THRESHOLD_MEM_MAX = 2, // Memory Temperature at which the GPU will begin SW slowdown
- NVML_TEMPERATURE_THRESHOLD_GPU_MAX = 3, // GPU Temperature at which the GPU can be throttled below base clock
- // Keep this last
- NVML_TEMPERATURE_THRESHOLD_COUNT
-} nvmlTemperatureThresholds_t;
-
-/**
- * Temperature sensors.
- */
-typedef enum nvmlTemperatureSensors_enum
-{
- NVML_TEMPERATURE_GPU = 0, //!< Temperature sensor for the GPU die
-
- // Keep this last
- NVML_TEMPERATURE_COUNT
-} nvmlTemperatureSensors_t;
-
-/**
- * Compute mode.
- *
- * NVML_COMPUTEMODE_EXCLUSIVE_PROCESS was added in CUDA 4.0.
- * Earlier CUDA versions supported a single exclusive mode,
- * which is equivalent to NVML_COMPUTEMODE_EXCLUSIVE_THREAD in CUDA 4.0 and beyond.
- */
-typedef enum nvmlComputeMode_enum
-{
- NVML_COMPUTEMODE_DEFAULT = 0, //!< Default compute mode -- multiple contexts per device
- NVML_COMPUTEMODE_EXCLUSIVE_THREAD = 1, //!< Support Removed
- NVML_COMPUTEMODE_PROHIBITED = 2, //!< Compute-prohibited mode -- no contexts per device
- NVML_COMPUTEMODE_EXCLUSIVE_PROCESS = 3, //!< Compute-exclusive-process mode -- only one context per device, usable from multiple threads at a time
-
- // Keep this last
- NVML_COMPUTEMODE_COUNT
-} nvmlComputeMode_t;
-
-/**
- * ECC bit types.
- *
- * @deprecated See \ref nvmlMemoryErrorType_t for a more flexible type
- */
-#define nvmlEccBitType_t nvmlMemoryErrorType_t
-
-/**
- * Single bit ECC errors
- *
- * @deprecated Mapped to \ref NVML_MEMORY_ERROR_TYPE_CORRECTED
- */
-#define NVML_SINGLE_BIT_ECC NVML_MEMORY_ERROR_TYPE_CORRECTED
-
-/**
- * Double bit ECC errors
- *
- * @deprecated Mapped to \ref NVML_MEMORY_ERROR_TYPE_UNCORRECTED
- */
-#define NVML_DOUBLE_BIT_ECC NVML_MEMORY_ERROR_TYPE_UNCORRECTED
-
-/**
- * Memory error types
- */
-typedef enum nvmlMemoryErrorType_enum
-{
- /**
- * A memory error that was corrected
- *
- * For ECC errors, these are single bit errors
- * For Texture memory, these are errors fixed by resend
- */
- NVML_MEMORY_ERROR_TYPE_CORRECTED = 0,
- /**
- * A memory error that was not corrected
- *
- * For ECC errors, these are double bit errors
- * For Texture memory, these are errors where the resend fails
- */
- NVML_MEMORY_ERROR_TYPE_UNCORRECTED = 1,
-
-
- // Keep this last
- NVML_MEMORY_ERROR_TYPE_COUNT //!< Count of memory error types
-
-} nvmlMemoryErrorType_t;
-
-/**
- * ECC counter types.
- *
- * Note: Volatile counts are reset each time the driver loads. On Windows this is once per boot. On Linux this can be more frequent.
- * On Linux the driver unloads when no active clients exist. If persistence mode is enabled or there is always a driver
- * client active (e.g. X11), then Linux also sees per-boot behavior. If not, volatile counts are reset each time a compute app
- * is run.
- */
-typedef enum nvmlEccCounterType_enum
-{
- NVML_VOLATILE_ECC = 0, //!< Volatile counts are reset each time the driver loads.
- NVML_AGGREGATE_ECC = 1, //!< Aggregate counts persist across reboots (i.e. for the lifetime of the device)
-
- // Keep this last
- NVML_ECC_COUNTER_TYPE_COUNT //!< Count of memory counter types
-} nvmlEccCounterType_t;
-
-/**
- * Clock types.
- *
- * All speeds are in MHz.
- */
-typedef enum nvmlClockType_enum
-{
- NVML_CLOCK_GRAPHICS = 0, //!< Graphics clock domain
- NVML_CLOCK_SM = 1, //!< SM clock domain
- NVML_CLOCK_MEM = 2, //!< Memory clock domain
- NVML_CLOCK_VIDEO = 3, //!< Video encoder/decoder clock domain
-
- // Keep this last
- NVML_CLOCK_COUNT //usedGpuMemory is not supported
-
-
- unsigned long long time; //!< Amount of time in ms during which the compute context was active. The time is reported as 0 if
- //!< the process is not terminated
-
- unsigned long long startTime; //!< CPU Timestamp in usec representing start time for the process
-
- unsigned int isRunning; //!< Flag to represent if the process is running (1 for running, 0 for terminated)
-
- unsigned int reserved[5]; //!< Reserved for future use
-} nvmlAccountingStats_t;
-
-/** @} */
-
-/***************************************************************************************************/
-/** @defgroup nvmlVgpuConstants Vgpu Constants
- * @{
- */
-/***************************************************************************************************/
-
-/**
- * Buffer size guaranteed to be large enough for \ref nvmlVgpuTypeGetLicense
- */
-#define NVML_GRID_LICENSE_BUFFER_SIZE 128
-
-#define NVML_VGPU_NAME_BUFFER_SIZE 64
-
-#define NVML_GRID_LICENSE_FEATURE_MAX_COUNT 3
-
-/*!
- * Macros for pGPU's virtualization capabilities bitfield.
- */
-#define NVML_VGPU_PGPU_VIRTUALIZATION_CAP_MIGRATION 0:0
-#define NVML_VGPU_PGPU_VIRTUALIZATION_CAP_MIGRATION_NO 0x0
-#define NVML_VGPU_PGPU_VIRTUALIZATION_CAP_MIGRATION_YES 0x1
-
-/** @} */
-
-/***************************************************************************************************/
-/** @defgroup nvmlVgpuEnum Vgpu Enum
- * @{
- */
-/***************************************************************************************************/
-
-/*!
- * Types of VM identifiers
- */
-typedef enum nvmlVgpuVmIdType {
- NVML_VGPU_VM_ID_DOMAIN_ID = 0, //!< VM ID represents DOMAIN ID
- NVML_VGPU_VM_ID_UUID = 1, //!< VM ID represents UUID
-} nvmlVgpuVmIdType_t;
-
-// vGPU GUEST info state.
-typedef enum nvmlVgpuGuestInfoState_enum
-{
- NVML_VGPU_INSTANCE_GUEST_INFO_STATE_UNINITIALIZED = 0, //= 0 and < \a unitCount
- * @param unit Reference in which to return the unit handle
- *
- * @return
- * - \ref NVML_SUCCESS if \a unit has been set
- * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized
- * - \ref NVML_ERROR_INVALID_ARGUMENT if \a index is invalid or \a unit is NULL
- * - \ref NVML_ERROR_UNKNOWN on any unexpected error
- */
-nvmlReturn_t DECLDIR nvmlUnitGetHandleByIndex(unsigned int index, nvmlUnit_t *unit);
-
-/**
- * Retrieves the static information associated with a unit.
- *
- * For S-class products.
- *
- * See \ref nvmlUnitInfo_t for details on available unit info.
- *
- * @param unit The identifier of the target unit
- * @param info Reference in which to return the unit information
- *
- * @return
- * - \ref NVML_SUCCESS if \a info has been populated
- * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized
- * - \ref NVML_ERROR_INVALID_ARGUMENT if \a unit is invalid or \a info is NULL
- */
-nvmlReturn_t DECLDIR nvmlUnitGetUnitInfo(nvmlUnit_t unit, nvmlUnitInfo_t *info);
-
-/**
- * Retrieves the LED state associated with this unit.
- *
- * For S-class products.
- *
- * See \ref nvmlLedState_t for details on allowed states.
- *
- * @param unit The identifier of the target unit
- * @param state Reference in which to return the current LED state
- *
- * @return
- * - \ref NVML_SUCCESS if \a state has been set
- * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized
- * - \ref NVML_ERROR_INVALID_ARGUMENT if \a unit is invalid or \a state is NULL
- * - \ref NVML_ERROR_NOT_SUPPORTED if this is not an S-class product
- * - \ref NVML_ERROR_UNKNOWN on any unexpected error
- *
- * @see nvmlUnitSetLedState()
- */
-nvmlReturn_t DECLDIR nvmlUnitGetLedState(nvmlUnit_t unit, nvmlLedState_t *state);
-
-/**
- * Retrieves the PSU stats for the unit.
- *
- * For S-class products.
- *
- * See \ref nvmlPSUInfo_t for details on available PSU info.
- *
- * @param unit The identifier of the target unit
- * @param psu Reference in which to return the PSU information
- *
- * @return
- * - \ref NVML_SUCCESS if \a psu has been populated
- * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized
- * - \ref NVML_ERROR_INVALID_ARGUMENT if \a unit is invalid or \a psu is NULL
- * - \ref NVML_ERROR_NOT_SUPPORTED if this is not an S-class product
- * - \ref NVML_ERROR_UNKNOWN on any unexpected error
- */
-nvmlReturn_t DECLDIR nvmlUnitGetPsuInfo(nvmlUnit_t unit, nvmlPSUInfo_t *psu);
-
-/**
- * Retrieves the temperature readings for the unit, in degrees C.
- *
- * For S-class products.
- *
- * Depending on the product, readings may be available for intake (type=0),
- * exhaust (type=1) and board (type=2).
- *
- * @param unit The identifier of the target unit
- * @param type The type of reading to take
- * @param temp Reference in which to return the temperature reading for the requested \a type
- *
- * @return
- * - \ref NVML_SUCCESS if \a temp has been populated
- * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized
- * - \ref NVML_ERROR_INVALID_ARGUMENT if \a unit or \a type is invalid or \a temp is NULL
- * - \ref NVML_ERROR_NOT_SUPPORTED if this is not an S-class product
- * - \ref NVML_ERROR_UNKNOWN on any unexpected error
- */
-nvmlReturn_t DECLDIR nvmlUnitGetTemperature(nvmlUnit_t unit, unsigned int type, unsigned int *temp);
-
-/**
- * Retrieves the fan speed readings for the unit.
- *
- * For S-class products.
- *
- * See \ref nvmlUnitFanSpeeds_t for details on available fan speed info.
- *
- * @param unit The identifier of the target unit
- * @param fanSpeeds Reference in which to return the fan speed information
- *
- * @return
- * - \ref NVML_SUCCESS if \a fanSpeeds has been populated
- * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized
- * - \ref NVML_ERROR_INVALID_ARGUMENT if \a unit is invalid or \a fanSpeeds is NULL
- * - \ref NVML_ERROR_NOT_SUPPORTED if this is not an S-class product
- * - \ref NVML_ERROR_UNKNOWN on any unexpected error
- */
-nvmlReturn_t DECLDIR nvmlUnitGetFanSpeedInfo(nvmlUnit_t unit, nvmlUnitFanSpeeds_t *fanSpeeds);
-
-/**
- * Retrieves the set of GPU devices that are attached to the specified unit.
- *
- * For S-class products.
- *
- * The \a deviceCount argument is expected to be set to the size of the input \a devices array.
- *
- * @param unit The identifier of the target unit
- * @param deviceCount Reference in which to provide the \a devices array size, and
- * to return the number of attached GPU devices
- * @param devices Reference in which to return the references to the attached GPU devices
- *
- * @return
- * - \ref NVML_SUCCESS if \a deviceCount and \a devices have been populated
- * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized
- * - \ref NVML_ERROR_INSUFFICIENT_SIZE if \a deviceCount indicates that the \a devices array is too small
- * - \ref NVML_ERROR_INVALID_ARGUMENT if \a unit is invalid, either of \a deviceCount or \a devices is NULL
- * - \ref NVML_ERROR_UNKNOWN on any unexpected error
- */
-nvmlReturn_t DECLDIR nvmlUnitGetDevices(nvmlUnit_t unit, unsigned int *deviceCount, nvmlDevice_t *devices);
-
-/**
- * Retrieves the IDs and firmware versions for any Host Interface Cards (HICs) in the system.
- *
- * For S-class products.
- *
- * The \a hwbcCount argument is expected to be set to the size of the input \a hwbcEntries array.
- * The HIC must be connected to an S-class system for it to be reported by this function.
- *
- * @param hwbcCount Size of hwbcEntries array
- * @param hwbcEntries Array holding information about hwbc
- *
- * @return
- * - \ref NVML_SUCCESS if \a hwbcCount and \a hwbcEntries have been populated
- * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized
- * - \ref NVML_ERROR_INVALID_ARGUMENT if either \a hwbcCount or \a hwbcEntries is NULL
- * - \ref NVML_ERROR_INSUFFICIENT_SIZE if \a hwbcCount indicates that the \a hwbcEntries array is too small
- */
-nvmlReturn_t DECLDIR nvmlSystemGetHicVersion(unsigned int *hwbcCount, nvmlHwbcEntry_t *hwbcEntries);
-/** @} */
-
-/***************************************************************************************************/
-/** @defgroup nvmlDeviceQueries Device Queries
- * This chapter describes the queries that NVML can perform against each device.
- * In each case the device is identified with an nvmlDevice_t handle. This handle is obtained by
- * calling one of \ref nvmlDeviceGetHandleByIndex(), \ref nvmlDeviceGetHandleBySerial(),
- * \ref nvmlDeviceGetHandleByPciBusId(), or \ref nvmlDeviceGetHandleByUUID().
- * @{
- */
-/***************************************************************************************************/
-
- /**
- * Retrieves the number of compute devices in the system. A compute device is a single GPU.
- *
- * For all products.
- *
- * Note: New nvmlDeviceGetCount_v2 (default in NVML 5.319) returns count of all devices in the system
- * even if nvmlDeviceGetHandleByIndex_v2 returns NVML_ERROR_NO_PERMISSION for such device.
- * Update your code to handle this error, or use NVML 4.304 or older nvml header file.
- * For backward binary compatibility reasons _v1 version of the API is still present in the shared
- * library.
- * Old _v1 version of nvmlDeviceGetCount doesn't count devices that NVML has no permission to talk to.
- *
- * @param deviceCount Reference in which to return the number of accessible devices
- *
- * @return
- * - \ref NVML_SUCCESS if \a deviceCount has been set
- * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized
- * - \ref NVML_ERROR_INVALID_ARGUMENT if \a deviceCount is NULL
- * - \ref NVML_ERROR_UNKNOWN on any unexpected error
- */
-nvmlReturn_t DECLDIR nvmlDeviceGetCount(unsigned int *deviceCount);
-
-/**
- * Acquire the handle for a particular device, based on its index.
- *
- * For all products.
- *
- * Valid indices are derived from the \a accessibleDevices count returned by
- * \ref nvmlDeviceGetCount(). For example, if \a accessibleDevices is 2 the valid indices
- * are 0 and 1, corresponding to GPU 0 and GPU 1.
- *
- * The order in which NVML enumerates devices has no guarantees of consistency between reboots. For that reason it
- * is recommended that devices be looked up by their PCI ids or UUID. See
- * \ref nvmlDeviceGetHandleByUUID() and \ref nvmlDeviceGetHandleByPciBusId().
- *
- * Note: The NVML index may not correlate with other APIs, such as the CUDA device index.
- *
- * Starting from NVML 5, this API causes NVML to initialize the target GPU
- * NVML may initialize additional GPUs if:
- * - The target GPU is an SLI slave
- *
- * Note: New nvmlDeviceGetCount_v2 (default in NVML 5.319) returns count of all devices in the system
- * even if nvmlDeviceGetHandleByIndex_v2 returns NVML_ERROR_NO_PERMISSION for such device.
- * Update your code to handle this error, or use NVML 4.304 or older nvml header file.
- * For backward binary compatibility reasons _v1 version of the API is still present in the shared
- * library.
- * Old _v1 version of nvmlDeviceGetCount doesn't count devices that NVML has no permission to talk to.
- *
- * This means that nvmlDeviceGetHandleByIndex_v2 and _v1 can return different devices for the same index.
- * If you don't touch macros that map old (_v1) versions to _v2 versions at the top of the file you don't
- * need to worry about that.
- *
- * @param index The index of the target GPU, >= 0 and < \a accessibleDevices
- * @param device Reference in which to return the device handle
- *
- * @return
- * - \ref NVML_SUCCESS if \a device has been set
- * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized
- * - \ref NVML_ERROR_INVALID_ARGUMENT if \a index is invalid or \a device is NULL
- * - \ref NVML_ERROR_INSUFFICIENT_POWER if any attached devices have improperly attached external power cables
- * - \ref NVML_ERROR_NO_PERMISSION if the user doesn't have permission to talk to this device
- * - \ref NVML_ERROR_IRQ_ISSUE if NVIDIA kernel detected an interrupt issue with the attached GPUs
- * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible
- * - \ref NVML_ERROR_UNKNOWN on any unexpected error
- *
- * @see nvmlDeviceGetIndex
- * @see nvmlDeviceGetCount
- */
-nvmlReturn_t DECLDIR nvmlDeviceGetHandleByIndex(unsigned int index, nvmlDevice_t *device);
-
-/**
- * Acquire the handle for a particular device, based on its board serial number.
- *
- * For Fermi &tm; or newer fully supported devices.
- *
- * This number corresponds to the value printed directly on the board, and to the value returned by
- * \ref nvmlDeviceGetSerial().
- *
- * @deprecated Since more than one GPU can exist on a single board this function is deprecated in favor
- * of \ref nvmlDeviceGetHandleByUUID.
- * For dual GPU boards this function will return NVML_ERROR_INVALID_ARGUMENT.
- *
- * Starting from NVML 5, this API causes NVML to initialize the target GPU
- * NVML may initialize additional GPUs as it searches for the target GPU
- *
- * @param serial The board serial number of the target GPU
- * @param device Reference in which to return the device handle
- *
- * @return
- * - \ref NVML_SUCCESS if \a device has been set
- * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized
- * - \ref NVML_ERROR_INVALID_ARGUMENT if \a serial is invalid, \a device is NULL or more than one
- * device has the same serial (dual GPU boards)
- * - \ref NVML_ERROR_NOT_FOUND if \a serial does not match a valid device on the system
- * - \ref NVML_ERROR_INSUFFICIENT_POWER if any attached devices have improperly attached external power cables
- * - \ref NVML_ERROR_IRQ_ISSUE if NVIDIA kernel detected an interrupt issue with the attached GPUs
- * - \ref NVML_ERROR_GPU_IS_LOST if any GPU has fallen off the bus or is otherwise inaccessible
- * - \ref NVML_ERROR_UNKNOWN on any unexpected error
- *
- * @see nvmlDeviceGetSerial
- * @see nvmlDeviceGetHandleByUUID
- */
-nvmlReturn_t DECLDIR nvmlDeviceGetHandleBySerial(const char *serial, nvmlDevice_t *device);
-
-/**
- * Acquire the handle for a particular device, based on its globally unique immutable UUID associated with each device.
- *
- * For all products.
- *
- * @param uuid The UUID of the target GPU
- * @param device Reference in which to return the device handle
- *
- * Starting from NVML 5, this API causes NVML to initialize the target GPU
- * NVML may initialize additional GPUs as it searches for the target GPU
- *
- * @return
- * - \ref NVML_SUCCESS if \a device has been set
- * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized
- * - \ref NVML_ERROR_INVALID_ARGUMENT if \a uuid is invalid or \a device is NULL
- * - \ref NVML_ERROR_NOT_FOUND if \a uuid does not match a valid device on the system
- * - \ref NVML_ERROR_INSUFFICIENT_POWER if any attached devices have improperly attached external power cables
- * - \ref NVML_ERROR_IRQ_ISSUE if NVIDIA kernel detected an interrupt issue with the attached GPUs
- * - \ref NVML_ERROR_GPU_IS_LOST if any GPU has fallen off the bus or is otherwise inaccessible
- * - \ref NVML_ERROR_UNKNOWN on any unexpected error
- *
- * @see nvmlDeviceGetUUID
- */
-nvmlReturn_t DECLDIR nvmlDeviceGetHandleByUUID(const char *uuid, nvmlDevice_t *device);
-
-/**
- * Acquire the handle for a particular device, based on its PCI bus id.
- *
- * For all products.
- *
- * This value corresponds to the nvmlPciInfo_t::busId returned by \ref nvmlDeviceGetPciInfo().
- *
- * Starting from NVML 5, this API causes NVML to initialize the target GPU
- * NVML may initialize additional GPUs if:
- * - The target GPU is an SLI slave
- *
- * \note NVML 4.304 and older version of nvmlDeviceGetHandleByPciBusId"_v1" returns NVML_ERROR_NOT_FOUND
- * instead of NVML_ERROR_NO_PERMISSION.
- *
- * @param pciBusId The PCI bus id of the target GPU
- * @param device Reference in which to return the device handle
- *
- * @return
- * - \ref NVML_SUCCESS if \a device has been set
- * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized
- * - \ref NVML_ERROR_INVALID_ARGUMENT if \a pciBusId is invalid or \a device is NULL
- * - \ref NVML_ERROR_NOT_FOUND if \a pciBusId does not match a valid device on the system
- * - \ref NVML_ERROR_INSUFFICIENT_POWER if the attached device has improperly attached external power cables
- * - \ref NVML_ERROR_NO_PERMISSION if the user doesn't have permission to talk to this device
- * - \ref NVML_ERROR_IRQ_ISSUE if NVIDIA kernel detected an interrupt issue with the attached GPUs
- * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible
- * - \ref NVML_ERROR_UNKNOWN on any unexpected error
- */
-nvmlReturn_t DECLDIR nvmlDeviceGetHandleByPciBusId(const char *pciBusId, nvmlDevice_t *device);
-
-/**
- * Retrieves the name of this device.
- *
- * For all products.
- *
- * The name is an alphanumeric string that denotes a particular product, e.g. Tesla &tm; C2070. It will not
- * exceed 64 characters in length (including the NULL terminator). See \ref
- * nvmlConstants::NVML_DEVICE_NAME_BUFFER_SIZE.
- *
- * @param device The identifier of the target device
- * @param name Reference in which to return the product name
- * @param length The maximum allowed length of the string returned in \a name
- *
- * @return
- * - \ref NVML_SUCCESS if \a name has been set
- * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized
- * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid, or \a name is NULL
- * - \ref NVML_ERROR_INSUFFICIENT_SIZE if \a length is too small
- * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible
- * - \ref NVML_ERROR_UNKNOWN on any unexpected error
- */
-nvmlReturn_t DECLDIR nvmlDeviceGetName(nvmlDevice_t device, char *name, unsigned int length);
-
-/**
- * Retrieves the brand of this device.
- *
- * For all products.
- *
- * The type is a member of \ref nvmlBrandType_t defined above.
- *
- * @param device The identifier of the target device
- * @param type Reference in which to return the product brand type
- *
- * @return
- * - \ref NVML_SUCCESS if \a type has been set
- * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized
- * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid, or \a type is NULL
- * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible
- * - \ref NVML_ERROR_UNKNOWN on any unexpected error
- */
-nvmlReturn_t DECLDIR nvmlDeviceGetBrand(nvmlDevice_t device, nvmlBrandType_t *type);
-
-/**
- * Retrieves the NVML index of this device.
- *
- * For all products.
- *
- * Valid indices are derived from the \a accessibleDevices count returned by
- * \ref nvmlDeviceGetCount(). For example, if \a accessibleDevices is 2 the valid indices
- * are 0 and 1, corresponding to GPU 0 and GPU 1.
- *
- * The order in which NVML enumerates devices has no guarantees of consistency between reboots. For that reason it
- * is recommended that devices be looked up by their PCI ids or GPU UUID. See
- * \ref nvmlDeviceGetHandleByPciBusId() and \ref nvmlDeviceGetHandleByUUID().
- *
- * Note: The NVML index may not correlate with other APIs, such as the CUDA device index.
- *
- * @param device The identifier of the target device
- * @param index Reference in which to return the NVML index of the device
- *
- * @return
- * - \ref NVML_SUCCESS if \a index has been set
- * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized
- * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid, or \a index is NULL
- * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible
- * - \ref NVML_ERROR_UNKNOWN on any unexpected error
- *
- * @see nvmlDeviceGetHandleByIndex()
- * @see nvmlDeviceGetCount()
- */
-nvmlReturn_t DECLDIR nvmlDeviceGetIndex(nvmlDevice_t device, unsigned int *index);
-
-/**
- * Retrieves the globally unique board serial number associated with this device's board.
- *
- * For all products with an inforom.
- *
- * The serial number is an alphanumeric string that will not exceed 30 characters (including the NULL terminator).
- * This number matches the serial number tag that is physically attached to the board. See \ref
- * nvmlConstants::NVML_DEVICE_SERIAL_BUFFER_SIZE.
- *
- * @param device The identifier of the target device
- * @param serial Reference in which to return the board/module serial number
- * @param length The maximum allowed length of the string returned in \a serial
- *
- * @return
- * - \ref NVML_SUCCESS if \a serial has been set
- * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized
- * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid, or \a serial is NULL
- * - \ref NVML_ERROR_INSUFFICIENT_SIZE if \a length is too small
- * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature
- * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible
- * - \ref NVML_ERROR_UNKNOWN on any unexpected error
- */
-nvmlReturn_t DECLDIR nvmlDeviceGetSerial(nvmlDevice_t device, char *serial, unsigned int length);
-
-/**
- * Retrieves an array of unsigned ints (sized to cpuSetSize) of bitmasks with the ideal CPU affinity for the device
- * For example, if processors 0, 1, 32, and 33 are ideal for the device and cpuSetSize == 2,
- * result[0] = 0x3, result[1] = 0x3
- *
- * For Kepler &tm; or newer fully supported devices.
- * Supported on Linux only.
- *
- * @param device The identifier of the target device
- * @param cpuSetSize The size of the cpuSet array that is safe to access
- * @param cpuSet Array reference in which to return a bitmask of CPUs, 64 CPUs per
- * unsigned long on 64-bit machines, 32 on 32-bit machines
- *
- * @return
- * - \ref NVML_SUCCESS if \a cpuSet has been filled
- * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized
- * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid, cpuSetSize == 0, or cpuSet is NULL
- * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature
- * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible
- * - \ref NVML_ERROR_UNKNOWN on any unexpected error
- */
-nvmlReturn_t DECLDIR nvmlDeviceGetCpuAffinity(nvmlDevice_t device, unsigned int cpuSetSize, unsigned long *cpuSet);
-
-/**
- * Sets the ideal affinity for the calling thread and device using the guidelines
- * given in nvmlDeviceGetCpuAffinity(). Note, this is a change as of version 8.0.
- * Older versions set the affinity for a calling process and all children.
- * Currently supports up to 64 processors.
- *
- * For Kepler &tm; or newer fully supported devices.
- * Supported on Linux only.
- *
- * @param device The identifier of the target device
- *
- * @return
- * - \ref NVML_SUCCESS if the calling thread has been successfully bound
- * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized
- * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid
- * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature
- * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible
- * - \ref NVML_ERROR_UNKNOWN on any unexpected error
- */
-nvmlReturn_t DECLDIR nvmlDeviceSetCpuAffinity(nvmlDevice_t device);
-
-/**
- * Clear all affinity bindings for the calling thread. Note, this is a change as of version
- * 8.0 as older versions cleared the affinity for a calling process and all children.
- *
- * For Kepler &tm; or newer fully supported devices.
- * Supported on Linux only.
- *
- * @param device The identifier of the target device
- *
- * @return
- * - \ref NVML_SUCCESS if the calling thread has been successfully unbound
- * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid
- * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized
- * - \ref NVML_ERROR_UNKNOWN on any unexpected error
- */
-nvmlReturn_t DECLDIR nvmlDeviceClearCpuAffinity(nvmlDevice_t device);
-
-/**
- * Retrieve the common ancestor for two devices
- * For all products.
- * Supported on Linux only.
- *
- * @param device1 The identifier of the first device
- * @param device2 The identifier of the second device
- * @param pathInfo A \ref nvmlGpuTopologyLevel_t that gives the path type
- *
- * @return
- * - \ref NVML_SUCCESS if \a pathInfo has been set
- * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device1, or \a device2 is invalid, or \a pathInfo is NULL
- * - \ref NVML_ERROR_NOT_SUPPORTED if the device or OS does not support this feature
- * - \ref NVML_ERROR_UNKNOWN an error has occurred in underlying topology discovery
- */
-nvmlReturn_t DECLDIR nvmlDeviceGetTopologyCommonAncestor(nvmlDevice_t device1, nvmlDevice_t device2, nvmlGpuTopologyLevel_t *pathInfo);
-
-/**
- * Retrieve the set of GPUs that are nearest to a given device at a specific interconnectivity level
- * For all products.
- * Supported on Linux only.
- *
- * @param device The identifier of the target device
- * @param level The \ref nvmlGpuTopologyLevel_t level to search for other GPUs
- * @param count When zero, is set to the number of matching GPUs such that \a deviceArray
- * can be malloc'd. When non-zero, \a deviceArray will be filled with \a count
- * number of device handles.
- * @param deviceArray An array of device handles for GPUs found at \a level
- *
- * @return
- * - \ref NVML_SUCCESS if \a deviceArray or \a count (if initially zero) has been set
- * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device, \a level, or \a count is invalid, or \a deviceArray is NULL with a non-zero \a count
- * - \ref NVML_ERROR_NOT_SUPPORTED if the device or OS does not support this feature
- * - \ref NVML_ERROR_UNKNOWN an error has occurred in underlying topology discovery
- */
-nvmlReturn_t DECLDIR nvmlDeviceGetTopologyNearestGpus(nvmlDevice_t device, nvmlGpuTopologyLevel_t level, unsigned int *count, nvmlDevice_t *deviceArray);
-
-/**
- * Retrieve the set of GPUs that have a CPU affinity with the given CPU number
- * For all products.
- * Supported on Linux only.
- *
- * @param cpuNumber The CPU number
- * @param count When zero, is set to the number of matching GPUs such that \a deviceArray
- * can be malloc'd. When non-zero, \a deviceArray will be filled with \a count
- * number of device handles.
- * @param deviceArray An array of device handles for GPUs found with affinity to \a cpuNumber
- *
- * @return
- * - \ref NVML_SUCCESS if \a deviceArray or \a count (if initially zero) has been set
- * - \ref NVML_ERROR_INVALID_ARGUMENT if \a cpuNumber, or \a count is invalid, or \a deviceArray is NULL with a non-zero \a count
- * - \ref NVML_ERROR_NOT_SUPPORTED if the device or OS does not support this feature
- * - \ref NVML_ERROR_UNKNOWN an error has occurred in underlying topology discovery
- */
-nvmlReturn_t DECLDIR nvmlSystemGetTopologyGpuSet(unsigned int cpuNumber, unsigned int *count, nvmlDevice_t *deviceArray);
-
-/**
- * Retrieve the status for a given p2p capability index between a given pair of GPUs
- *
- * @param device1 The first device
- * @param device2 The second device
- * @param p2pIndex p2p Capability Index being looked for between \a device1 and \a device2
- * @param p2pStatus Reference in which to return the status of the \a p2pIndex
- * between \a device1 and \a device2
- * @return
- * - \ref NVML_SUCCESS if \a p2pStatus has been populated
- * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device1 or \a device2 or \a p2pIndex is invalid or \a p2pStatus is NULL
- * - \ref NVML_ERROR_UNKNOWN on any unexpected error
- */
-nvmlReturn_t DECLDIR nvmlDeviceGetP2PStatus(nvmlDevice_t device1, nvmlDevice_t device2, nvmlGpuP2PCapsIndex_t p2pIndex,nvmlGpuP2PStatus_t *p2pStatus);
-
-/**
- * Retrieves the globally unique immutable UUID associated with this device, as a 5 part hexadecimal string,
- * that augments the immutable, board serial identifier.
- *
- * For all products.
- *
- * The UUID is a globally unique identifier. It is the only available identifier for pre-Fermi-architecture products.
- * It does NOT correspond to any identifier printed on the board. It will not exceed 80 characters in length
- * (including the NULL terminator). See \ref nvmlConstants::NVML_DEVICE_UUID_BUFFER_SIZE.
- *
- * @param device The identifier of the target device
- * @param uuid Reference in which to return the GPU UUID
- * @param length The maximum allowed length of the string returned in \a uuid
- *
- * @return
- * - \ref NVML_SUCCESS if \a uuid has been set
- * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized
- * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid, or \a uuid is NULL
- * - \ref NVML_ERROR_INSUFFICIENT_SIZE if \a length is too small
- * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature
- * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible
- * - \ref NVML_ERROR_UNKNOWN on any unexpected error
- */
-nvmlReturn_t DECLDIR nvmlDeviceGetUUID(nvmlDevice_t device, char *uuid, unsigned int length);
-
-/**
- * Retrieves minor number for the device. The minor number for the device is such that the Nvidia device node file for
- * each GPU will have the form /dev/nvidia[minor number].
- *
- * For all products.
- * Supported only for Linux
- *
- * @param device The identifier of the target device
- * @param minorNumber Reference in which to return the minor number for the device
- * @return
- * - \ref NVML_SUCCESS if the minor number is successfully retrieved
- * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized
- * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a minorNumber is NULL
- * - \ref NVML_ERROR_NOT_SUPPORTED if this query is not supported by the device
- * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible
- * - \ref NVML_ERROR_UNKNOWN on any unexpected error
- */
-nvmlReturn_t DECLDIR nvmlDeviceGetMinorNumber(nvmlDevice_t device, unsigned int *minorNumber);
-
-/**
- * Retrieves the device board part number which is programmed into the board's InfoROM
- *
- * For all products.
- *
- * @param device Identifier of the target device
- * @param partNumber Reference to the buffer to return
- * @param length Length of the buffer reference
- *
- * @return
- * - \ref NVML_SUCCESS if \a partNumber has been set
- * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized
- * - \ref NVML_ERROR_NOT_SUPPORTED if the needed VBIOS fields have not been filled
- * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a partNumber is NULL
- * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible
- * - \ref NVML_ERROR_UNKNOWN on any unexpected error
- */
-nvmlReturn_t DECLDIR nvmlDeviceGetBoardPartNumber(nvmlDevice_t device, char* partNumber, unsigned int length);
-
-/**
- * Retrieves the version information for the device's infoROM object.
- *
- * For all products with an inforom.
- *
- * Fermi and higher parts have non-volatile on-board memory for persisting device info, such as aggregate
- * ECC counts. The version of the data structures in this memory may change from time to time. It will not
- * exceed 16 characters in length (including the NULL terminator).
- * See \ref nvmlConstants::NVML_DEVICE_INFOROM_VERSION_BUFFER_SIZE.
- *
- * See \ref nvmlInforomObject_t for details on the available infoROM objects.
- *
- * @param device The identifier of the target device
- * @param object The target infoROM object
- * @param version Reference in which to return the infoROM version
- * @param length The maximum allowed length of the string returned in \a version
- *
- * @return
- * - \ref NVML_SUCCESS if \a version has been set
- * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized
- * - \ref NVML_ERROR_INVALID_ARGUMENT if \a version is NULL
- * - \ref NVML_ERROR_INSUFFICIENT_SIZE if \a length is too small
- * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not have an infoROM
- * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible
- * - \ref NVML_ERROR_UNKNOWN on any unexpected error
- *
- * @see nvmlDeviceGetInforomImageVersion
- */
-nvmlReturn_t DECLDIR nvmlDeviceGetInforomVersion(nvmlDevice_t device, nvmlInforomObject_t object, char *version, unsigned int length);
-
-/**
- * Retrieves the global infoROM image version
- *
- * For all products with an inforom.
- *
- * Image version just like VBIOS version uniquely describes the exact version of the infoROM flashed on the board
- * in contrast to infoROM object version which is only an indicator of supported features.
- * Version string will not exceed 16 characters in length (including the NULL terminator).
- * See \ref nvmlConstants::NVML_DEVICE_INFOROM_VERSION_BUFFER_SIZE.
- *
- * @param device The identifier of the target device
- * @param version Reference in which to return the infoROM image version
- * @param length The maximum allowed length of the string returned in \a version
- *
- * @return
- * - \ref NVML_SUCCESS if \a version has been set
- * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized
- * - \ref NVML_ERROR_INVALID_ARGUMENT if \a version is NULL
- * - \ref NVML_ERROR_INSUFFICIENT_SIZE if \a length is too small
- * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not have an infoROM
- * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible
- * - \ref NVML_ERROR_UNKNOWN on any unexpected error
- *
- * @see nvmlDeviceGetInforomVersion
- */
-nvmlReturn_t DECLDIR nvmlDeviceGetInforomImageVersion(nvmlDevice_t device, char *version, unsigned int length);
-
-/**
- * Retrieves the checksum of the configuration stored in the device's infoROM.
- *
- * For all products with an inforom.
- *
- * Can be used to make sure that two GPUs have the exact same configuration.
- * Current checksum takes into account configuration stored in PWR and ECC infoROM objects.
- * Checksum can change between driver releases or when user changes configuration (e.g. disable/enable ECC)
- *
- * @param device The identifier of the target device
- * @param checksum Reference in which to return the infoROM configuration checksum
- *
- * @return
- * - \ref NVML_SUCCESS if \a checksum has been set
- * - \ref NVML_ERROR_CORRUPTED_INFOROM if the device's checksum couldn't be retrieved due to infoROM corruption
- * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized
- * - \ref NVML_ERROR_INVALID_ARGUMENT if \a checksum is NULL
- * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature
- * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible
- * - \ref NVML_ERROR_UNKNOWN on any unexpected error
- */
-nvmlReturn_t DECLDIR nvmlDeviceGetInforomConfigurationChecksum(nvmlDevice_t device, unsigned int *checksum);
-
-/**
- * Reads the infoROM from the flash and verifies the checksums.
- *
- * For all products with an inforom.
- *
- * @param device The identifier of the target device
- *
- * @return
- * - \ref NVML_SUCCESS if infoROM is not corrupted
- * - \ref NVML_ERROR_CORRUPTED_INFOROM if the device's infoROM is corrupted
- * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized
- * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature
- * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible
- * - \ref NVML_ERROR_UNKNOWN on any unexpected error
- */
-nvmlReturn_t DECLDIR nvmlDeviceValidateInforom(nvmlDevice_t device);
-
-/**
- * Retrieves the display mode for the device.
- *
- * For all products.
- *
- * This method indicates whether a physical display (e.g. monitor) is currently connected to
- * any of the device's connectors.
- *
- * See \ref nvmlEnableState_t for details on allowed modes.
- *
- * @param device The identifier of the target device
- * @param display Reference in which to return the display mode
- *
- * @return
- * - \ref NVML_SUCCESS if \a display has been set
- * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized
- * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a display is NULL
- * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature
- * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible
- * - \ref NVML_ERROR_UNKNOWN on any unexpected error
- */
-nvmlReturn_t DECLDIR nvmlDeviceGetDisplayMode(nvmlDevice_t device, nvmlEnableState_t *display);
-
-/**
- * Retrieves the display active state for the device.
- *
- * For all products.
- *
- * This method indicates whether a display is initialized on the device.
- * For example whether X Server is attached to this device and has allocated memory for the screen.
- *
- * Display can be active even when no monitor is physically attached.
- *
- * See \ref nvmlEnableState_t for details on allowed modes.
- *
- * @param device The identifier of the target device
- * @param isActive Reference in which to return the display active state
- *
- * @return
- * - \ref NVML_SUCCESS if \a isActive has been set
- * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized
- * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a isActive is NULL
- * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature
- * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible
- * - \ref NVML_ERROR_UNKNOWN on any unexpected error
- */
-nvmlReturn_t DECLDIR nvmlDeviceGetDisplayActive(nvmlDevice_t device, nvmlEnableState_t *isActive);
-
-/**
- * Retrieves the persistence mode associated with this device.
- *
- * For all products.
- * For Linux only.
- *
- * When driver persistence mode is enabled the driver software state is not torn down when the last
- * client disconnects. By default this feature is disabled.
- *
- * See \ref nvmlEnableState_t for details on allowed modes.
- *
- * @param device The identifier of the target device
- * @param mode Reference in which to return the current driver persistence mode
- *
- * @return
- * - \ref NVML_SUCCESS if \a mode has been set
- * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized
- * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a mode is NULL
- * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature
- * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible
- * - \ref NVML_ERROR_UNKNOWN on any unexpected error
- *
- * @see nvmlDeviceSetPersistenceMode()
- */
-nvmlReturn_t DECLDIR nvmlDeviceGetPersistenceMode(nvmlDevice_t device, nvmlEnableState_t *mode);
-
-/**
- * Retrieves the PCI attributes of this device.
- *
- * For all products.
- *
- * See \ref nvmlPciInfo_t for details on the available PCI info.
- *
- * @param device The identifier of the target device
- * @param pci Reference in which to return the PCI info
- *
- * @return
- * - \ref NVML_SUCCESS if \a pci has been populated
- * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized
- * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a pci is NULL
- * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible
- * - \ref NVML_ERROR_UNKNOWN on any unexpected error
- */
-nvmlReturn_t DECLDIR nvmlDeviceGetPciInfo(nvmlDevice_t device, nvmlPciInfo_t *pci);
-
-/**
- * Retrieves the maximum PCIe link generation possible with this device and system
- *
- * I.E. for a generation 2 PCIe device attached to a generation 1 PCIe bus the max link generation this function will
- * report is generation 1.
- *
- * For Fermi &tm; or newer fully supported devices.
- *
- * @param device The identifier of the target device
- * @param maxLinkGen Reference in which to return the max PCIe link generation
- *
- * @return
- * - \ref NVML_SUCCESS if \a maxLinkGen has been populated
- * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized
- * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a maxLinkGen is null
- * - \ref NVML_ERROR_NOT_SUPPORTED if PCIe link information is not available
- * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible
- * - \ref NVML_ERROR_UNKNOWN on any unexpected error
- */
-nvmlReturn_t DECLDIR nvmlDeviceGetMaxPcieLinkGeneration(nvmlDevice_t device, unsigned int *maxLinkGen);
-
-/**
- * Retrieves the maximum PCIe link width possible with this device and system
- *
- * I.E. for a device with a 16x PCIe bus width attached to a 8x PCIe system bus this function will report
- * a max link width of 8.
- *
- * For Fermi &tm; or newer fully supported devices.
- *
- * @param device The identifier of the target device
- * @param maxLinkWidth Reference in which to return the max PCIe link width
- *
- * @return
- * - \ref NVML_SUCCESS if \a maxLinkWidth has been populated
- * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized
- * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a maxLinkWidth is null
- * - \ref NVML_ERROR_NOT_SUPPORTED if PCIe link information is not available
- * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible
- * - \ref NVML_ERROR_UNKNOWN on any unexpected error
- */
-nvmlReturn_t DECLDIR nvmlDeviceGetMaxPcieLinkWidth(nvmlDevice_t device, unsigned int *maxLinkWidth);
-
-/**
- * Retrieves the current PCIe link generation
- *
- * For Fermi &tm; or newer fully supported devices.
- *
- * @param device The identifier of the target device
- * @param currLinkGen Reference in which to return the current PCIe link generation
- *
- * @return
- * - \ref NVML_SUCCESS if \a currLinkGen has been populated
- * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized
- * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a currLinkGen is null
- * - \ref NVML_ERROR_NOT_SUPPORTED if PCIe link information is not available
- * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible
- * - \ref NVML_ERROR_UNKNOWN on any unexpected error
- */
-nvmlReturn_t DECLDIR nvmlDeviceGetCurrPcieLinkGeneration(nvmlDevice_t device, unsigned int *currLinkGen);
-
-/**
- * Retrieves the current PCIe link width
- *
- * For Fermi &tm; or newer fully supported devices.
- *
- * @param device The identifier of the target device
- * @param currLinkWidth Reference in which to return the current PCIe link width
- *
- * @return
- * - \ref NVML_SUCCESS if \a currLinkWidth has been populated
- * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized
- * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a currLinkWidth is null
- * - \ref NVML_ERROR_NOT_SUPPORTED if PCIe link information is not available
- * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible
- * - \ref NVML_ERROR_UNKNOWN on any unexpected error
- */
-nvmlReturn_t DECLDIR nvmlDeviceGetCurrPcieLinkWidth(nvmlDevice_t device, unsigned int *currLinkWidth);
-
-/**
- * Retrieve PCIe utilization information.
- * This function is querying a byte counter over a 20ms interval and thus is the
- * PCIe throughput over that interval.
- *
- * For Maxwell &tm; or newer fully supported devices.
- *
- * This method is not supported in virtual machines running virtual GPU (vGPU).
- *
- * @param device The identifier of the target device
- * @param counter The specific counter that should be queried \ref nvmlPcieUtilCounter_t
- * @param value Reference in which to return throughput in KB/s
- *
- * @return
- * - \ref NVML_SUCCESS if \a value has been set
- * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized
- * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device or \a counter is invalid, or \a value is NULL
- * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature
- * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible
- * - \ref NVML_ERROR_UNKNOWN on any unexpected error
- */
-nvmlReturn_t DECLDIR nvmlDeviceGetPcieThroughput(nvmlDevice_t device, nvmlPcieUtilCounter_t counter, unsigned int *value);
-
-/**
- * Retrieve the PCIe replay counter.
- *
- * For Kepler &tm; or newer fully supported devices.
- *
- * @param device The identifier of the target device
- * @param value Reference in which to return the counter's value
- *
- * @return
- * - \ref NVML_SUCCESS if \a value has been set
- * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized
- * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a value is NULL
- * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature
- * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible
- * - \ref NVML_ERROR_UNKNOWN on any unexpected error
- */
-nvmlReturn_t DECLDIR nvmlDeviceGetPcieReplayCounter(nvmlDevice_t device, unsigned int *value);
-
-/**
- * Retrieves the current clock speeds for the device.
- *
- * For Fermi &tm; or newer fully supported devices.
- *
- * See \ref nvmlClockType_t for details on available clock information.
- *
- * @param device The identifier of the target device
- * @param type Identify which clock domain to query
- * @param clock Reference in which to return the clock speed in MHz
- *
- * @return
- * - \ref NVML_SUCCESS if \a clock has been set
- * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized
- * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a clock is NULL
- * - \ref NVML_ERROR_NOT_SUPPORTED if the device cannot report the specified clock
- * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible
- * - \ref NVML_ERROR_UNKNOWN on any unexpected error
- */
-nvmlReturn_t DECLDIR nvmlDeviceGetClockInfo(nvmlDevice_t device, nvmlClockType_t type, unsigned int *clock);
-
-/**
- * Retrieves the maximum clock speeds for the device.
- *
- * For Fermi &tm; or newer fully supported devices.
- *
- * See \ref nvmlClockType_t for details on available clock information.
- *
- * \note On GPUs from Fermi family current P0 clocks (reported by \ref nvmlDeviceGetClockInfo) can differ from max clocks
- * by few MHz.
- *
- * @param device The identifier of the target device
- * @param type Identify which clock domain to query
- * @param clock Reference in which to return the clock speed in MHz
- *
- * @return
- * - \ref NVML_SUCCESS if \a clock has been set
- * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized
- * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a clock is NULL
- * - \ref NVML_ERROR_NOT_SUPPORTED if the device cannot report the specified clock
- * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible
- * - \ref NVML_ERROR_UNKNOWN on any unexpected error
- */
-nvmlReturn_t DECLDIR nvmlDeviceGetMaxClockInfo(nvmlDevice_t device, nvmlClockType_t type, unsigned int *clock);
-
-/**
- * Retrieves the current setting of a clock that applications will use unless an overspec situation occurs.
- * Can be changed using \ref nvmlDeviceSetApplicationsClocks.
- *
- * For Kepler &tm; or newer fully supported devices.
- *
- * @param device The identifier of the target device
- * @param clockType Identify which clock domain to query
- * @param clockMHz Reference in which to return the clock in MHz
- *
- * @return
- * - \ref NVML_SUCCESS if \a clockMHz has been set
- * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized
- * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a clockMHz is NULL or \a clockType is invalid
- * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature
- * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible
- * - \ref NVML_ERROR_UNKNOWN on any unexpected error
- */
-nvmlReturn_t DECLDIR nvmlDeviceGetApplicationsClock(nvmlDevice_t device, nvmlClockType_t clockType, unsigned int *clockMHz);
-
-/**
- * Retrieves the default applications clock that GPU boots with or
- * defaults to after \ref nvmlDeviceResetApplicationsClocks call.
- *
- * For Kepler &tm; or newer fully supported devices.
- *
- * @param device The identifier of the target device
- * @param clockType Identify which clock domain to query
- * @param clockMHz Reference in which to return the default clock in MHz
- *
- * @return
- * - \ref NVML_SUCCESS if \a clockMHz has been set
- * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized
- * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a clockMHz is NULL or \a clockType is invalid
- * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature
- * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible
- * - \ref NVML_ERROR_UNKNOWN on any unexpected error
- *
- * \see nvmlDeviceGetApplicationsClock
- */
-nvmlReturn_t DECLDIR nvmlDeviceGetDefaultApplicationsClock(nvmlDevice_t device, nvmlClockType_t clockType, unsigned int *clockMHz);
-
-/**
- * Resets the application clock to the default value
- *
- * This is the applications clock that will be used after system reboot or driver reload.
- * Default value is constant, but the current value can be changed using \ref nvmlDeviceSetApplicationsClocks.
- *
- * On Pascal and newer hardware, if clocks were previously locked with \ref nvmlDeviceSetApplicationsClocks,
- * this call will unlock clocks. This returns clocks to their default behavior of automatically boosting above
- * base clocks as thermal limits allow.
- *
- * @see nvmlDeviceGetApplicationsClock
- * @see nvmlDeviceSetApplicationsClocks
- *
- * For Fermi &tm; or newer non-GeForce fully supported devices and Maxwell or newer GeForce devices.
- *
- * @param device The identifier of the target device
- *
- * @return
- * - \ref NVML_SUCCESS if new settings were successfully set
- * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized
- * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid
- * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature
- * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible
- * - \ref NVML_ERROR_UNKNOWN on any unexpected error
- */
-nvmlReturn_t DECLDIR nvmlDeviceResetApplicationsClocks(nvmlDevice_t device);
-
-/**
- * Retrieves the clock speed for the clock specified by the clock type and clock ID.
- *
- * For Kepler &tm; or newer fully supported devices.
- *
- * @param device The identifier of the target device
- * @param clockType Identify which clock domain to query
- * @param clockId Identify which clock in the domain to query
- * @param clockMHz Reference in which to return the clock in MHz
- *
- * @return
- * - \ref NVML_SUCCESS if \a clockMHz has been set
- * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized
- * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a clockMHz is NULL or \a clockType is invalid
- * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature
- * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible
- * - \ref NVML_ERROR_UNKNOWN on any unexpected error
- */
-nvmlReturn_t DECLDIR nvmlDeviceGetClock(nvmlDevice_t device, nvmlClockType_t clockType, nvmlClockId_t clockId, unsigned int *clockMHz);
-
-/**
- * Retrieves the customer defined maximum boost clock speed specified by the given clock type.
- *
- * For Pascal &tm; or newer fully supported devices.
- *
- * @param device The identifier of the target device
- * @param clockType Identify which clock domain to query
- * @param clockMHz Reference in which to return the clock in MHz
- *
- * @return
- * - \ref NVML_SUCCESS if \a clockMHz has been set
- * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized
- * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a clockMHz is NULL or \a clockType is invalid
- * - \ref NVML_ERROR_NOT_SUPPORTED if the device or the \a clockType on this device does not support this feature
- * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible
- * - \ref NVML_ERROR_UNKNOWN on any unexpected error
- */
-nvmlReturn_t DECLDIR nvmlDeviceGetMaxCustomerBoostClock(nvmlDevice_t device, nvmlClockType_t clockType, unsigned int *clockMHz);
-
-/**
- * Retrieves the list of possible memory clocks that can be used as an argument for \ref nvmlDeviceSetApplicationsClocks.
- *
- * For Kepler &tm; or newer fully supported devices.
- *
- * @param device The identifier of the target device
- * @param count Reference in which to provide the \a clocksMHz array size, and
- * to return the number of elements
- * @param clocksMHz Reference in which to return the clock in MHz
- *
- * @return
- * - \ref NVML_SUCCESS if \a count and \a clocksMHz have been populated
- * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized
- * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a count is NULL
- * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature
- * - \ref NVML_ERROR_INSUFFICIENT_SIZE if \a count is too small (\a count is set to the number of
- * required elements)
- * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible
- * - \ref NVML_ERROR_UNKNOWN on any unexpected error
- *
- * @see nvmlDeviceSetApplicationsClocks
- * @see nvmlDeviceGetSupportedGraphicsClocks
- */
-nvmlReturn_t DECLDIR nvmlDeviceGetSupportedMemoryClocks(nvmlDevice_t device, unsigned int *count, unsigned int *clocksMHz);
-
-/**
- * Retrieves the list of possible graphics clocks that can be used as an argument for \ref nvmlDeviceSetApplicationsClocks.
- *
- * For Kepler &tm; or newer fully supported devices.
- *
- * @param device The identifier of the target device
- * @param memoryClockMHz Memory clock for which to return possible graphics clocks
- * @param count Reference in which to provide the \a clocksMHz array size, and
- * to return the number of elements
- * @param clocksMHz Reference in which to return the clocks in MHz
- *
- * @return
- * - \ref NVML_SUCCESS if \a count and \a clocksMHz have been populated
- * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized
- * - \ref NVML_ERROR_NOT_FOUND if the specified \a memoryClockMHz is not a supported frequency
- * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a count is NULL
- * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature
- * - \ref NVML_ERROR_INSUFFICIENT_SIZE if \a count is too small
- * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible
- * - \ref NVML_ERROR_UNKNOWN on any unexpected error
- *
- * @see nvmlDeviceSetApplicationsClocks
- * @see nvmlDeviceGetSupportedMemoryClocks
- */
-nvmlReturn_t DECLDIR nvmlDeviceGetSupportedGraphicsClocks(nvmlDevice_t device, unsigned int memoryClockMHz, unsigned int *count, unsigned int *clocksMHz);
-
-/**
- * Retrieve the current state of Auto Boosted clocks on a device and store it in \a isEnabled
- *
- * For Kepler &tm; or newer fully supported devices.
- *
- * Auto Boosted clocks are enabled by default on some hardware, allowing the GPU to run at higher clock rates
- * to maximize performance as thermal limits allow.
- *
- * On Pascal and newer hardware, Auto Boosted clocks are controlled through application clocks.
- * Use \ref nvmlDeviceSetApplicationsClocks and \ref nvmlDeviceResetApplicationsClocks to control Auto Boost
- * behavior.
- *
- * @param device The identifier of the target device
- * @param isEnabled Where to store the current state of Auto Boosted clocks of the target device
- * @param defaultIsEnabled Where to store the default Auto Boosted clocks behavior of the target device that the device will
- * revert to when no applications are using the GPU
- *
- * @return
- * - \ref NVML_SUCCESS If \a isEnabled has been set with the Auto Boosted clocks state of \a device
- * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized
- * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a isEnabled is NULL
- * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support Auto Boosted clocks
- * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible
- * - \ref NVML_ERROR_UNKNOWN on any unexpected error
- *
- */
-nvmlReturn_t DECLDIR nvmlDeviceGetAutoBoostedClocksEnabled(nvmlDevice_t device, nvmlEnableState_t *isEnabled, nvmlEnableState_t *defaultIsEnabled);
-
-/**
- * Try to set the current state of Auto Boosted clocks on a device.
- *
- * For Kepler &tm; or newer fully supported devices.
- *
- * Auto Boosted clocks are enabled by default on some hardware, allowing the GPU to run at higher clock rates
- * to maximize performance as thermal limits allow. Auto Boosted clocks should be disabled if fixed clock
- * rates are desired.
- *
- * Non-root users may use this API by default but can be restricted by root from using this API by calling
- * \ref nvmlDeviceSetAPIRestriction with apiType=NVML_RESTRICTED_API_SET_AUTO_BOOSTED_CLOCKS.
- * Note: Persistence Mode is required to modify current Auto Boost settings, therefore, it must be enabled.
- *
- * On Pascal and newer hardware, Auto Boosted clocks are controlled through application clocks.
- * Use \ref nvmlDeviceSetApplicationsClocks and \ref nvmlDeviceResetApplicationsClocks to control Auto Boost
- * behavior.
- *
- * @param device The identifier of the target device
- * @param enabled What state to try to set Auto Boosted clocks of the target device to
- *
- * @return
- * - \ref NVML_SUCCESS If the Auto Boosted clocks were successfully set to the state specified by \a enabled
- * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized
- * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid
- * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support Auto Boosted clocks
- * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible
- * - \ref NVML_ERROR_UNKNOWN on any unexpected error
- *
- */
-nvmlReturn_t DECLDIR nvmlDeviceSetAutoBoostedClocksEnabled(nvmlDevice_t device, nvmlEnableState_t enabled);
-
-/**
- * Try to set the default state of Auto Boosted clocks on a device. This is the default state that Auto Boosted clocks will
- * return to when no compute processes (e.g. CUDA applications which have an active context) are running
- *
- * For Kepler &tm; or newer non-GeForce fully supported devices and Maxwell or newer GeForce devices.
- * Requires root/admin permissions.
- *
- * Auto Boosted clocks are enabled by default on some hardware, allowing the GPU to run at higher clock rates
- * to maximize performance as thermal limits allow. Auto Boosted clocks should be disabled if fixed clock
- * rates are desired.
- *
- * On Pascal and newer hardware, Auto Boosted clocks are controlled through application clocks.
- * Use \ref nvmlDeviceSetApplicationsClocks and \ref nvmlDeviceResetApplicationsClocks to control Auto Boost
- * behavior.
- *
- * @param device The identifier of the target device
- * @param enabled What state to try to set default Auto Boosted clocks of the target device to
- * @param flags Flags that change the default behavior. Currently Unused.
- *
- * @return
- * - \ref NVML_SUCCESS If the Auto Boosted clock's default state was successfully set to the state specified by \a enabled
- * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized
- * - \ref NVML_ERROR_NO_PERMISSION If the calling user does not have permission to change Auto Boosted clock's default state.
- * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid
- * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support Auto Boosted clocks
- * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible
- * - \ref NVML_ERROR_UNKNOWN on any unexpected error
- *
- */
-nvmlReturn_t DECLDIR nvmlDeviceSetDefaultAutoBoostedClocksEnabled(nvmlDevice_t device, nvmlEnableState_t enabled, unsigned int flags);
-
-
-/**
- * Retrieves the intended operating speed of the device's fan.
- *
- * Note: The reported speed is the intended fan speed. If the fan is physically blocked and unable to spin, the
- * output will not match the actual fan speed.
- *
- * For all discrete products with dedicated fans.
- *
- * The fan speed is expressed as a percent of the maximum, i.e. full speed is 100%.
- *
- * @param device The identifier of the target device
- * @param speed Reference in which to return the fan speed percentage
- *
- * @return
- * - \ref NVML_SUCCESS if \a speed has been set
- * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized
- * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a speed is NULL
- * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not have a fan
- * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible
- * - \ref NVML_ERROR_UNKNOWN on any unexpected error
- */
-nvmlReturn_t DECLDIR nvmlDeviceGetFanSpeed(nvmlDevice_t device, unsigned int *speed);
-
-/**
- * Retrieves the current temperature readings for the device, in degrees C.
- *
- * For all products.
- *
- * See \ref nvmlTemperatureSensors_t for details on available temperature sensors.
- *
- * @param device The identifier of the target device
- * @param sensorType Flag that indicates which sensor reading to retrieve
- * @param temp Reference in which to return the temperature reading
- *
- * @return
- * - \ref NVML_SUCCESS if \a temp has been set
- * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized
- * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid, \a sensorType is invalid or \a temp is NULL
- * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not have the specified sensor
- * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible
- * - \ref NVML_ERROR_UNKNOWN on any unexpected error
- */
-nvmlReturn_t DECLDIR nvmlDeviceGetTemperature(nvmlDevice_t device, nvmlTemperatureSensors_t sensorType, unsigned int *temp);
-
-/**
- * Retrieves the temperature threshold for the GPU with the specified threshold type in degrees C.
- *
- * For Kepler &tm; or newer fully supported devices.
- *
- * See \ref nvmlTemperatureThresholds_t for details on available temperature thresholds.
- *
- * @param device The identifier of the target device
- * @param thresholdType The type of threshold value queried
- * @param temp Reference in which to return the temperature reading
- * @return
- * - \ref NVML_SUCCESS if \a temp has been set
- * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized
- * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid, \a thresholdType is invalid or \a temp is NULL
- * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not have a temperature sensor or is unsupported
- * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible
- * - \ref NVML_ERROR_UNKNOWN on any unexpected error
- */
-nvmlReturn_t DECLDIR nvmlDeviceGetTemperatureThreshold(nvmlDevice_t device, nvmlTemperatureThresholds_t thresholdType, unsigned int *temp);
-
-/**
- * Retrieves the current performance state for the device.
- *
- * For Fermi &tm; or newer fully supported devices.
- *
- * See \ref nvmlPstates_t for details on allowed performance states.
- *
- * @param device The identifier of the target device
- * @param pState Reference in which to return the performance state reading
- *
- * @return
- * - \ref NVML_SUCCESS if \a pState has been set
- * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized
- * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a pState is NULL
- * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature
- * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible
- * - \ref NVML_ERROR_UNKNOWN on any unexpected error
- */
-nvmlReturn_t DECLDIR nvmlDeviceGetPerformanceState(nvmlDevice_t device, nvmlPstates_t *pState);
-
-/**
- * Retrieves current clocks throttling reasons.
- *
- * For all fully supported products.
- *
- * \note More than one bit can be enabled at the same time. Multiple reasons can be affecting clocks at once.
- *
- * @param device The identifier of the target device
- * @param clocksThrottleReasons Reference in which to return bitmask of active clocks throttle
- * reasons
- *
- * @return
- * - \ref NVML_SUCCESS if \a clocksThrottleReasons has been set
- * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized
- * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a clocksThrottleReasons is NULL
- * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature
- * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible
- * - \ref NVML_ERROR_UNKNOWN on any unexpected error
- *
- * @see nvmlClocksThrottleReasons
- * @see nvmlDeviceGetSupportedClocksThrottleReasons
- */
-nvmlReturn_t DECLDIR nvmlDeviceGetCurrentClocksThrottleReasons(nvmlDevice_t device, unsigned long long *clocksThrottleReasons);
-
-/**
- * Retrieves bitmask of supported clocks throttle reasons that can be returned by
- * \ref nvmlDeviceGetCurrentClocksThrottleReasons
- *
- * For all fully supported products.
- *
- * This method is not supported in virtual machines running virtual GPU (vGPU).
- *
- * @param device The identifier of the target device
- * @param supportedClocksThrottleReasons Reference in which to return bitmask of supported
- * clocks throttle reasons
- *
- * @return
- * - \ref NVML_SUCCESS if \a supportedClocksThrottleReasons has been set
- * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized
- * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a supportedClocksThrottleReasons is NULL
- * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible
- * - \ref NVML_ERROR_UNKNOWN on any unexpected error
- *
- * @see nvmlClocksThrottleReasons
- * @see nvmlDeviceGetCurrentClocksThrottleReasons
- */
-nvmlReturn_t DECLDIR nvmlDeviceGetSupportedClocksThrottleReasons(nvmlDevice_t device, unsigned long long *supportedClocksThrottleReasons);
-
-/**
- * Deprecated: Use \ref nvmlDeviceGetPerformanceState. This function exposes an incorrect generalization.
- *
- * Retrieve the current performance state for the device.
- *
- * For Fermi &tm; or newer fully supported devices.
- *
- * See \ref nvmlPstates_t for details on allowed performance states.
- *
- * @param device The identifier of the target device
- * @param pState Reference in which to return the performance state reading
- *
- * @return
- * - \ref NVML_SUCCESS if \a pState has been set
- * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized
- * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a pState is NULL
- * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature
- * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible
- * - \ref NVML_ERROR_UNKNOWN on any unexpected error
- */
-nvmlReturn_t DECLDIR nvmlDeviceGetPowerState(nvmlDevice_t device, nvmlPstates_t *pState);
-
-/**
- * This API has been deprecated.
- *
- * Retrieves the power management mode associated with this device.
- *
- * For products from the Fermi family.
- * - Requires \a NVML_INFOROM_POWER version 3.0 or higher.
- *
- * For from the Kepler or newer families.
- * - Does not require \a NVML_INFOROM_POWER object.
- *
- * This flag indicates whether any power management algorithm is currently active on the device. An
- * enabled state does not necessarily mean the device is being actively throttled -- only that
- * that the driver will do so if the appropriate conditions are met.
- *
- * See \ref nvmlEnableState_t for details on allowed modes.
- *
- * @param device The identifier of the target device
- * @param mode Reference in which to return the current power management mode
- *
- * @return
- * - \ref NVML_SUCCESS if \a mode has been set
- * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized
- * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a mode is NULL
- * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature
- * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible
- * - \ref NVML_ERROR_UNKNOWN on any unexpected error
- */
-nvmlReturn_t DECLDIR nvmlDeviceGetPowerManagementMode(nvmlDevice_t device, nvmlEnableState_t *mode);
-
-/**
- * Retrieves the power management limit associated with this device.
- *
- * For Fermi &tm; or newer fully supported devices.
- *
- * The power limit defines the upper boundary for the card's power draw. If
- * the card's total power draw reaches this limit the power management algorithm kicks in.
- *
- * This reading is only available if power management mode is supported.
- * See \ref nvmlDeviceGetPowerManagementMode.
- *
- * @param device The identifier of the target device
- * @param limit Reference in which to return the power management limit in milliwatts
- *
- * @return
- * - \ref NVML_SUCCESS if \a limit has been set
- * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized
- * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a limit is NULL
- * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature
- * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible
- * - \ref NVML_ERROR_UNKNOWN on any unexpected error
- */
-nvmlReturn_t DECLDIR nvmlDeviceGetPowerManagementLimit(nvmlDevice_t device, unsigned int *limit);
-
-/**
- * Retrieves information about possible values of power management limits on this device.
- *
- * For Kepler &tm; or newer fully supported devices.
- *
- * @param device The identifier of the target device
- * @param minLimit Reference in which to return the minimum power management limit in milliwatts
- * @param maxLimit Reference in which to return the maximum power management limit in milliwatts
- *
- * @return
- * - \ref NVML_SUCCESS if \a minLimit and \a maxLimit have been set
- * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized
- * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a minLimit or \a maxLimit is NULL
- * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature
- * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible
- * - \ref NVML_ERROR_UNKNOWN on any unexpected error
- *
- * @see nvmlDeviceSetPowerManagementLimit
- */
-nvmlReturn_t DECLDIR nvmlDeviceGetPowerManagementLimitConstraints(nvmlDevice_t device, unsigned int *minLimit, unsigned int *maxLimit);
-
-/**
- * Retrieves default power management limit on this device, in milliwatts.
- * Default power management limit is a power management limit that the device boots with.
- *
- * For Kepler &tm; or newer fully supported devices.
- *
- * @param device The identifier of the target device
- * @param defaultLimit Reference in which to return the default power management limit in milliwatts
- *
- * @return
- * - \ref NVML_SUCCESS if \a defaultLimit has been set
- * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized
- * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a defaultLimit is NULL
- * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature
- * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible
- * - \ref NVML_ERROR_UNKNOWN on any unexpected error
- */
-nvmlReturn_t DECLDIR nvmlDeviceGetPowerManagementDefaultLimit(nvmlDevice_t device, unsigned int *defaultLimit);
-
-/**
- * Retrieves power usage for this GPU in milliwatts and its associated circuitry (e.g. memory)
- *
- * For Fermi &tm; or newer fully supported devices.
- *
- * On Fermi and Kepler GPUs the reading is accurate to within +/- 5% of current power draw.
- *
- * It is only available if power management mode is supported. See \ref nvmlDeviceGetPowerManagementMode.
- *
- * @param device The identifier of the target device
- * @param power Reference in which to return the power usage information
- *
- * @return
- * - \ref NVML_SUCCESS if \a power has been populated
- * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized
- * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a power is NULL
- * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support power readings
- * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible
- * - \ref NVML_ERROR_UNKNOWN on any unexpected error
- */
-nvmlReturn_t DECLDIR nvmlDeviceGetPowerUsage(nvmlDevice_t device, unsigned int *power);
-
-/**
- * Retrieves total energy consumption for this GPU in millijoules (mJ) since the driver was last reloaded
- *
- * For newer than Pascal &tm; fully supported devices.
- *
- * @param device The identifier of the target device
- * @param energy Reference in which to return the energy consumption information
- *
- * @return
- * - \ref NVML_SUCCESS if \a energy has been populated
- * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized
- * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a energy is NULL
- * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support energy readings
- * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible
- * - \ref NVML_ERROR_UNKNOWN on any unexpected error
- */
-nvmlReturn_t DECLDIR nvmlDeviceGetTotalEnergyConsumption(nvmlDevice_t device, unsigned long long *energy);
-
-/**
- * Get the effective power limit that the driver enforces after taking into account all limiters
- *
- * Note: This can be different from the \ref nvmlDeviceGetPowerManagementLimit if other limits are set elsewhere
- * This includes the out of band power limit interface
- *
- * For Kepler &tm; or newer fully supported devices.
- *
- * @param device The device to communicate with
- * @param limit Reference in which to return the power management limit in milliwatts
- *
- * @return
- * - \ref NVML_SUCCESS if \a limit has been set
- * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized
- * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a limit is NULL
- * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature
- * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible
- * - \ref NVML_ERROR_UNKNOWN on any unexpected error
- */
-nvmlReturn_t DECLDIR nvmlDeviceGetEnforcedPowerLimit(nvmlDevice_t device, unsigned int *limit);
-
-/**
- * Retrieves the current GOM and pending GOM (the one that GPU will switch to after reboot).
- *
- * For GK110 M-class and X-class Tesla &tm; products from the Kepler family.
- * Modes \ref NVML_GOM_LOW_DP and \ref NVML_GOM_ALL_ON are supported on fully supported GeForce products.
- * Not supported on Quadro ® and Tesla &tm; C-class products.
- *
- * @param device The identifier of the target device
- * @param current Reference in which to return the current GOM
- * @param pending Reference in which to return the pending GOM
- *
- * @return
- * - \ref NVML_SUCCESS if \a mode has been populated
- * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized
- * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a current or \a pending is NULL
- * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature
- * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible
- * - \ref NVML_ERROR_UNKNOWN on any unexpected error
- *
- * @see nvmlGpuOperationMode_t
- * @see nvmlDeviceSetGpuOperationMode
- */
-nvmlReturn_t DECLDIR nvmlDeviceGetGpuOperationMode(nvmlDevice_t device, nvmlGpuOperationMode_t *current, nvmlGpuOperationMode_t *pending);
-
-/**
- * Retrieves the amount of used, free and total memory available on the device, in bytes.
- *
- * For all products.
- *
- * Enabling ECC reduces the amount of total available memory, due to the extra required parity bits.
- * Under WDDM most device memory is allocated and managed on startup by Windows.
- *
- * Under Linux and Windows TCC, the reported amount of used memory is equal to the sum of memory allocated
- * by all active channels on the device.
- *
- * See \ref nvmlMemory_t for details on available memory info.
- *
- * @param device The identifier of the target device
- * @param memory Reference in which to return the memory information
- *
- * @return
- * - \ref NVML_SUCCESS if \a memory has been populated
- * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized
- * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a memory is NULL
- * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible
- * - \ref NVML_ERROR_UNKNOWN on any unexpected error
- */
-nvmlReturn_t DECLDIR nvmlDeviceGetMemoryInfo(nvmlDevice_t device, nvmlMemory_t *memory);
-
-/**
- * Retrieves the current compute mode for the device.
- *
- * For all products.
- *
- * See \ref nvmlComputeMode_t for details on allowed compute modes.
- *
- * @param device The identifier of the target device
- * @param mode Reference in which to return the current compute mode
- *
- * @return
- * - \ref NVML_SUCCESS if \a mode has been set
- * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized
- * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a mode is NULL
- * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature
- * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible
- * - \ref NVML_ERROR_UNKNOWN on any unexpected error
- *
- * @see nvmlDeviceSetComputeMode()
- */
-nvmlReturn_t DECLDIR nvmlDeviceGetComputeMode(nvmlDevice_t device, nvmlComputeMode_t *mode);
-
-/**
- * Retrieves the CUDA compute capability of the device.
- *
- * For all products.
- *
- * Returns the major and minor compute capability version numbers of the
- * device. The major and minor versions are equivalent to the
- * CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR and
- * CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR attributes that would be
- * returned by CUDA's cuDeviceGetAttribute().
- *
- * @param device The identifier of the target device
- * @param major Reference in which to return the major CUDA compute capability
- * @param minor Reference in which to return the minor CUDA compute capability
- *
- * @return
- * - \ref NVML_SUCCESS if \a major and \a minor have been set
- * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized
- * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a major or \a minor are NULL
- * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible
- * - \ref NVML_ERROR_UNKNOWN on any unexpected error
- */
-nvmlReturn_t DECLDIR nvmlDeviceGetCudaComputeCapability(nvmlDevice_t device, int *major, int *minor);
-
-/**
- * Retrieves the current and pending ECC modes for the device.
- *
- * For Fermi &tm; or newer fully supported devices.
- * Only applicable to devices with ECC.
- * Requires \a NVML_INFOROM_ECC version 1.0 or higher.
- *
- * Changing ECC modes requires a reboot. The "pending" ECC mode refers to the target mode following
- * the next reboot.
- *
- * See \ref nvmlEnableState_t for details on allowed modes.
- *
- * @param device The identifier of the target device
- * @param current Reference in which to return the current ECC mode
- * @param pending Reference in which to return the pending ECC mode
- *
- * @return
- * - \ref NVML_SUCCESS if \a current and \a pending have been set
- * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized
- * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or either \a current or \a pending is NULL
- * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature
- * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible
- * - \ref NVML_ERROR_UNKNOWN on any unexpected error
- *
- * @see nvmlDeviceSetEccMode()
- */
-nvmlReturn_t DECLDIR nvmlDeviceGetEccMode(nvmlDevice_t device, nvmlEnableState_t *current, nvmlEnableState_t *pending);
-
-/**
- * Retrieves the device boardId from 0-N.
- * Devices with the same boardId indicate GPUs connected to the same PLX. Use in conjunction with
- * \ref nvmlDeviceGetMultiGpuBoard() to decide if they are on the same board as well.
- * The boardId returned is a unique ID for the current configuration. Uniqueness and ordering across
- * reboots and system configurations is not guaranteed (i.e. if a Tesla K40c returns 0x100 and
- * the two GPUs on a Tesla K10 in the same system returns 0x200 it is not guaranteed they will
- * always return those values but they will always be different from each other).
- *
- *
- * For Fermi &tm; or newer fully supported devices.
- *
- * @param device The identifier of the target device
- * @param boardId Reference in which to return the device's board ID
- *
- * @return
- * - \ref NVML_SUCCESS if \a boardId has been set
- * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized
- * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a boardId is NULL
- * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature
- * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible
- * - \ref NVML_ERROR_UNKNOWN on any unexpected error
- */
-nvmlReturn_t DECLDIR nvmlDeviceGetBoardId(nvmlDevice_t device, unsigned int *boardId);
-
-/**
- * Retrieves whether the device is on a Multi-GPU Board
- * Devices that are on multi-GPU boards will set \a multiGpuBool to a non-zero value.
- *
- * For Fermi &tm; or newer fully supported devices.
- *
- * @param device The identifier of the target device
- * @param multiGpuBool Reference in which to return a zero or non-zero value
- * to indicate whether the device is on a multi GPU board
- *
- * @return
- * - \ref NVML_SUCCESS if \a multiGpuBool has been set
- * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized
- * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a multiGpuBool is NULL
- * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature
- * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible
- * - \ref NVML_ERROR_UNKNOWN on any unexpected error
- */
-nvmlReturn_t DECLDIR nvmlDeviceGetMultiGpuBoard(nvmlDevice_t device, unsigned int *multiGpuBool);
-
-/**
- * Retrieves the total ECC error counts for the device.
- *
- * For Fermi &tm; or newer fully supported devices.
- * Only applicable to devices with ECC.
- * Requires \a NVML_INFOROM_ECC version 1.0 or higher.
- * Requires ECC Mode to be enabled.
- *
- * The total error count is the sum of errors across each of the separate memory systems, i.e. the total set of
- * errors across the entire device.
- *
- * See \ref nvmlMemoryErrorType_t for a description of available error types.\n
- * See \ref nvmlEccCounterType_t for a description of available counter types.
- *
- * @param device The identifier of the target device
- * @param errorType Flag that specifies the type of the errors.
- * @param counterType Flag that specifies the counter-type of the errors.
- * @param eccCounts Reference in which to return the specified ECC errors
- *
- * @return
- * - \ref NVML_SUCCESS if \a eccCounts has been set
- * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized
- * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device, \a errorType or \a counterType is invalid, or \a eccCounts is NULL
- * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature
- * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible
- * - \ref NVML_ERROR_UNKNOWN on any unexpected error
- *
- * @see nvmlDeviceClearEccErrorCounts()
- */
-nvmlReturn_t DECLDIR nvmlDeviceGetTotalEccErrors(nvmlDevice_t device, nvmlMemoryErrorType_t errorType, nvmlEccCounterType_t counterType, unsigned long long *eccCounts);
-
-/**
- * Retrieves the detailed ECC error counts for the device.
- *
- * @deprecated This API supports only a fixed set of ECC error locations
- * On different GPU architectures different locations are supported
- * See \ref nvmlDeviceGetMemoryErrorCounter
- *
- * For Fermi &tm; or newer fully supported devices.
- * Only applicable to devices with ECC.
- * Requires \a NVML_INFOROM_ECC version 2.0 or higher to report aggregate location-based ECC counts.
- * Requires \a NVML_INFOROM_ECC version 1.0 or higher to report all other ECC counts.
- * Requires ECC Mode to be enabled.
- *
- * Detailed errors provide separate ECC counts for specific parts of the memory system.
- *
- * Reports zero for unsupported ECC error counters when a subset of ECC error counters are supported.
- *
- * See \ref nvmlMemoryErrorType_t for a description of available bit types.\n
- * See \ref nvmlEccCounterType_t for a description of available counter types.\n
- * See \ref nvmlEccErrorCounts_t for a description of provided detailed ECC counts.
- *
- * @param device The identifier of the target device
- * @param errorType Flag that specifies the type of the errors.
- * @param counterType Flag that specifies the counter-type of the errors.
- * @param eccCounts Reference in which to return the specified ECC errors
- *
- * @return
- * - \ref NVML_SUCCESS if \a eccCounts has been populated
- * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized
- * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device, \a errorType or \a counterType is invalid, or \a eccCounts is NULL
- * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature
- * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible
- * - \ref NVML_ERROR_UNKNOWN on any unexpected error
- *
- * @see nvmlDeviceClearEccErrorCounts()
- */
-nvmlReturn_t DECLDIR nvmlDeviceGetDetailedEccErrors(nvmlDevice_t device, nvmlMemoryErrorType_t errorType, nvmlEccCounterType_t counterType, nvmlEccErrorCounts_t *eccCounts);
-
-/**
- * Retrieves the requested memory error counter for the device.
- *
- * For Fermi &tm; or newer fully supported devices.
- * Requires \a NVML_INFOROM_ECC version 2.0 or higher to report aggregate location-based memory error counts.
- * Requires \a NVML_INFOROM_ECC version 1.0 or higher to report all other memory error counts.
- *
- * Only applicable to devices with ECC.
- *
- * Requires ECC Mode to be enabled.
- *
- * See \ref nvmlMemoryErrorType_t for a description of available memory error types.\n
- * See \ref nvmlEccCounterType_t for a description of available counter types.\n
- * See \ref nvmlMemoryLocation_t for a description of available counter locations.\n
- *
- * @param device The identifier of the target device
- * @param errorType Flag that specifies the type of error.
- * @param counterType Flag that specifies the counter-type of the errors.
- * @param locationType Specifies the location of the counter.
- * @param count Reference in which to return the ECC counter
- *
- * @return
- * - \ref NVML_SUCCESS if \a count has been populated
- * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized
- * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device, \a bitTyp,e \a counterType or \a locationType is
- * invalid, or \a count is NULL
- * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support ECC error reporting in the specified memory
- * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible
- * - \ref NVML_ERROR_UNKNOWN on any unexpected error
- */
-nvmlReturn_t DECLDIR nvmlDeviceGetMemoryErrorCounter(nvmlDevice_t device, nvmlMemoryErrorType_t errorType,
- nvmlEccCounterType_t counterType,
- nvmlMemoryLocation_t locationType, unsigned long long *count);
-
-/**
- * Retrieves the current utilization rates for the device's major subsystems.
- *
- * For Fermi &tm; or newer fully supported devices.
- *
- * See \ref nvmlUtilization_t for details on available utilization rates.
- *
- * \note During driver initialization when ECC is enabled one can see high GPU and Memory Utilization readings.
- * This is caused by ECC Memory Scrubbing mechanism that is performed during driver initialization.
- *
- * @param device The identifier of the target device
- * @param utilization Reference in which to return the utilization information
- *
- * @return
- * - \ref NVML_SUCCESS if \a utilization has been populated
- * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized
- * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a utilization is NULL
- * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature
- * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible
- * - \ref NVML_ERROR_UNKNOWN on any unexpected error
- */
-nvmlReturn_t DECLDIR nvmlDeviceGetUtilizationRates(nvmlDevice_t device, nvmlUtilization_t *utilization);
-
-/**
- * Retrieves the current utilization and sampling size in microseconds for the Encoder
- *
- * For Kepler &tm; or newer fully supported devices.
- *
- * @param device The identifier of the target device
- * @param utilization Reference to an unsigned int for encoder utilization info
- * @param samplingPeriodUs Reference to an unsigned int for the sampling period in US
- *
- * @return
- * - \ref NVML_SUCCESS if \a utilization has been populated
- * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized
- * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid, \a utilization is NULL, or \a samplingPeriodUs is NULL
- * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature
- * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible
- * - \ref NVML_ERROR_UNKNOWN on any unexpected error
- */
-nvmlReturn_t DECLDIR nvmlDeviceGetEncoderUtilization(nvmlDevice_t device, unsigned int *utilization, unsigned int *samplingPeriodUs);
-
-/**
- * Retrieves the current capacity of the device's encoder, as a percentage of maximum encoder capacity with valid values in the range 0-100.
- *
- * For Maxwell &tm; or newer fully supported devices.
- *
- * @param device The identifier of the target device
- * @param encoderQueryType Type of encoder to query
- * @param encoderCapacity Reference to an unsigned int for the encoder capacity
- *
- * @return
- * - \ref NVML_SUCCESS if \a encoderCapacity is fetched
- * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized
- * - \ref NVML_ERROR_INVALID_ARGUMENT if \a encoderCapacity is NULL, or \a device or \a encoderQueryType
- * are invalid
- * - \ref NVML_ERROR_NOT_SUPPORTED if device does not support the encoder specified in \a encodeQueryType
- * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible
- * - \ref NVML_ERROR_UNKNOWN on any unexpected error
- */
-nvmlReturn_t DECLDIR nvmlDeviceGetEncoderCapacity (nvmlDevice_t device, nvmlEncoderType_t encoderQueryType, unsigned int *encoderCapacity);
-
-/**
- * Retrieves the current encoder statistics for a given device.
- *
- * For Maxwell &tm; or newer fully supported devices.
- *
- * @param device The identifier of the target device
- * @param sessionCount Reference to an unsigned int for count of active encoder sessions
- * @param averageFps Reference to an unsigned int for trailing average FPS of all active sessions
- * @param averageLatency Reference to an unsigned int for encode latency in microseconds
- *
- * @return
- * - \ref NVML_SUCCESS if \a sessionCount, \a averageFps and \a averageLatency is fetched
- * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized
- * - \ref NVML_ERROR_INVALID_ARGUMENT if \a sessionCount, or \a device or \a averageFps,
- * or \a averageLatency is NULL
- * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible
- * - \ref NVML_ERROR_UNKNOWN on any unexpected error
- */
-nvmlReturn_t DECLDIR nvmlDeviceGetEncoderStats (nvmlDevice_t device, unsigned int *sessionCount,
- unsigned int *averageFps, unsigned int *averageLatency);
-
-/**
- * Retrieves information about active encoder sessions on a target device.
- *
- * An array of active encoder sessions is returned in the caller-supplied buffer pointed at by \a sessionInfos. The
- * array elememt count is passed in \a sessionCount, and \a sessionCount is used to return the number of sessions
- * written to the buffer.
- *
- * If the supplied buffer is not large enough to accomodate the active session array, the function returns
- * NVML_ERROR_INSUFFICIENT_SIZE, with the element count of nvmlEncoderSessionInfo_t array required in \a sessionCount.
- * To query the number of active encoder sessions, call this function with *sessionCount = 0. The code will return
- * NVML_SUCCESS with number of active encoder sessions updated in *sessionCount.
- *
- * For Maxwell &tm; or newer fully supported devices.
- *
- * @param device The identifier of the target device
- * @param sessionCount Reference to caller supplied array size, and returns the number of sessions.
- * @param sessionInfos Reference in which to return the session information
- *
- * @return
- * - \ref NVML_SUCCESS if \a sessionInfos is fetched
- * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized
- * - \ref NVML_ERROR_INSUFFICIENT_SIZE if \a sessionCount is too small, array element count is returned in \a sessionCount
- * - \ref NVML_ERROR_INVALID_ARGUMENT if \a sessionCount is NULL.
- * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible
- * - \ref NVML_ERROR_UNKNOWN on any unexpected error
- */
-nvmlReturn_t DECLDIR nvmlDeviceGetEncoderSessions(nvmlDevice_t device, unsigned int *sessionCount, nvmlEncoderSessionInfo_t *sessionInfos);
-
-/**
- * Retrieves the current utilization and sampling size in microseconds for the Decoder
- *
- * For Kepler &tm; or newer fully supported devices.
- *
- * @param device The identifier of the target device
- * @param utilization Reference to an unsigned int for decoder utilization info
- * @param samplingPeriodUs Reference to an unsigned int for the sampling period in US
- *
- * @return
- * - \ref NVML_SUCCESS if \a utilization has been populated
- * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized
- * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid, \a utilization is NULL, or \a samplingPeriodUs is NULL
- * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature
- * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible
- * - \ref NVML_ERROR_UNKNOWN on any unexpected error
- */
-nvmlReturn_t DECLDIR nvmlDeviceGetDecoderUtilization(nvmlDevice_t device, unsigned int *utilization, unsigned int *samplingPeriodUs);
-
-/**
- * Retrieves the current and pending driver model for the device.
- *
- * For Fermi &tm; or newer fully supported devices.
- * For windows only.
- *
- * On Windows platforms the device driver can run in either WDDM or WDM (TCC) mode. If a display is attached
- * to the device it must run in WDDM mode. TCC mode is preferred if a display is not attached.
- *
- * See \ref nvmlDriverModel_t for details on available driver models.
- *
- * @param device The identifier of the target device
- * @param current Reference in which to return the current driver model
- * @param pending Reference in which to return the pending driver model
- *
- * @return
- * - \ref NVML_SUCCESS if either \a current and/or \a pending have been set
- * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized
- * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or both \a current and \a pending are NULL
- * - \ref NVML_ERROR_NOT_SUPPORTED if the platform is not windows
- * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible
- * - \ref NVML_ERROR_UNKNOWN on any unexpected error
- *
- * @see nvmlDeviceSetDriverModel()
- */
-nvmlReturn_t DECLDIR nvmlDeviceGetDriverModel(nvmlDevice_t device, nvmlDriverModel_t *current, nvmlDriverModel_t *pending);
-
-/**
- * Get VBIOS version of the device.
- *
- * For all products.
- *
- * The VBIOS version may change from time to time. It will not exceed 32 characters in length
- * (including the NULL terminator). See \ref nvmlConstants::NVML_DEVICE_VBIOS_VERSION_BUFFER_SIZE.
- *
- * @param device The identifier of the target device
- * @param version Reference to which to return the VBIOS version
- * @param length The maximum allowed length of the string returned in \a version
- *
- * @return
- * - \ref NVML_SUCCESS if \a version has been set
- * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized
- * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid, or \a version is NULL
- * - \ref NVML_ERROR_INSUFFICIENT_SIZE if \a length is too small
- * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible
- * - \ref NVML_ERROR_UNKNOWN on any unexpected error
- */
-nvmlReturn_t DECLDIR nvmlDeviceGetVbiosVersion(nvmlDevice_t device, char *version, unsigned int length);
-
-/**
- * Get Bridge Chip Information for all the bridge chips on the board.
- *
- * For all fully supported products.
- * Only applicable to multi-GPU products.
- *
- * @param device The identifier of the target device
- * @param bridgeHierarchy Reference to the returned bridge chip Hierarchy
- *
- * @return
- * - \ref NVML_SUCCESS if bridge chip exists
- * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized
- * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid, or \a bridgeHierarchy is NULL
- * - \ref NVML_ERROR_NOT_SUPPORTED if bridge chip not supported on the device
- * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible
- * - \ref NVML_ERROR_UNKNOWN on any unexpected error
- *
- */
-nvmlReturn_t DECLDIR nvmlDeviceGetBridgeChipInfo(nvmlDevice_t device, nvmlBridgeChipHierarchy_t *bridgeHierarchy);
-
-/**
- * Get information about processes with a compute context on a device
- *
- * For Fermi &tm; or newer fully supported devices.
- *
- * This function returns information only about compute running processes (e.g. CUDA applications which have an
- * active context). Any graphics applications (e.g. using OpenGL, DirectX) won't be listed by this function.
- *
- * To query the current number of running compute processes, call this function with *infoCount = 0. The
- * return code will be NVML_ERROR_INSUFFICIENT_SIZE, or NVML_SUCCESS if none are running. For this call
- * \a infos is allowed to be NULL.
- *
- * The usedGpuMemory field returned is all of the memory used by the application.
- *
- * Keep in mind that information returned by this call is dynamic and the number of elements might change in
- * time. Allocate more space for \a infos table in case new compute processes are spawned.
- *
- * @param device The identifier of the target device
- * @param infoCount Reference in which to provide the \a infos array size, and
- * to return the number of returned elements
- * @param infos Reference in which to return the process information
- *
- * @return
- * - \ref NVML_SUCCESS if \a infoCount and \a infos have been populated
- * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized
- * - \ref NVML_ERROR_INSUFFICIENT_SIZE if \a infoCount indicates that the \a infos array is too small
- * \a infoCount will contain minimal amount of space necessary for
- * the call to complete
- * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid, either of \a infoCount or \a infos is NULL
- * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible
- * - \ref NVML_ERROR_UNKNOWN on any unexpected error
- *
- * @see \ref nvmlSystemGetProcessName
- */
-nvmlReturn_t DECLDIR nvmlDeviceGetComputeRunningProcesses(nvmlDevice_t device, unsigned int *infoCount, nvmlProcessInfo_t *infos);
-
-/**
- * Get information about processes with a graphics context on a device
- *
- * For Kepler &tm; or newer fully supported devices.
- *
- * This function returns information only about graphics based processes
- * (e.g. applications using OpenGL, DirectX)
- *
- * To query the current number of running graphics processes, call this function with *infoCount = 0. The
- * return code will be NVML_ERROR_INSUFFICIENT_SIZE, or NVML_SUCCESS if none are running. For this call
- * \a infos is allowed to be NULL.
- *
- * The usedGpuMemory field returned is all of the memory used by the application.
- *
- * Keep in mind that information returned by this call is dynamic and the number of elements might change in
- * time. Allocate more space for \a infos table in case new graphics processes are spawned.
- *
- * @param device The identifier of the target device
- * @param infoCount Reference in which to provide the \a infos array size, and
- * to return the number of returned elements
- * @param infos Reference in which to return the process information
- *
- * @return
- * - \ref NVML_SUCCESS if \a infoCount and \a infos have been populated
- * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized
- * - \ref NVML_ERROR_INSUFFICIENT_SIZE if \a infoCount indicates that the \a infos array is too small
- * \a infoCount will contain minimal amount of space necessary for
- * the call to complete
- * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid, either of \a infoCount or \a infos is NULL
- * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible
- * - \ref NVML_ERROR_UNKNOWN on any unexpected error
- *
- * @see \ref nvmlSystemGetProcessName
- */
-nvmlReturn_t DECLDIR nvmlDeviceGetGraphicsRunningProcesses(nvmlDevice_t device, unsigned int *infoCount, nvmlProcessInfo_t *infos);
-
-/**
- * Check if the GPU devices are on the same physical board.
- *
- * For all fully supported products.
- *
- * @param device1 The first GPU device
- * @param device2 The second GPU device
- * @param onSameBoard Reference in which to return the status.
- * Non-zero indicates that the GPUs are on the same board.
- *
- * @return
- * - \ref NVML_SUCCESS if \a onSameBoard has been set
- * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized
- * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device1 or \a device2 are invalid or \a onSameBoard is NULL
- * - \ref NVML_ERROR_NOT_SUPPORTED if this check is not supported by the device
- * - \ref NVML_ERROR_GPU_IS_LOST if either GPU has fallen off the bus or is otherwise inaccessible
- * - \ref NVML_ERROR_UNKNOWN on any unexpected error
- */
-nvmlReturn_t DECLDIR nvmlDeviceOnSameBoard(nvmlDevice_t device1, nvmlDevice_t device2, int *onSameBoard);
-
-/**
- * Retrieves the root/admin permissions on the target API. See \a nvmlRestrictedAPI_t for the list of supported APIs.
- * If an API is restricted only root users can call that API. See \a nvmlDeviceSetAPIRestriction to change current permissions.
- *
- * For all fully supported products.
- *
- * @param device The identifier of the target device
- * @param apiType Target API type for this operation
- * @param isRestricted Reference in which to return the current restriction
- * NVML_FEATURE_ENABLED indicates that the API is root-only
- * NVML_FEATURE_DISABLED indicates that the API is accessible to all users
- *
- * @return
- * - \ref NVML_SUCCESS if \a isRestricted has been set
- * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized
- * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid, \a apiType incorrect or \a isRestricted is NULL
- * - \ref NVML_ERROR_NOT_SUPPORTED if this query is not supported by the device or the device does not support
- * the feature that is being queried (E.G. Enabling/disabling Auto Boosted clocks is
- * not supported by the device)
- * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible
- * - \ref NVML_ERROR_UNKNOWN on any unexpected error
- *
- * @see nvmlRestrictedAPI_t
- */
-nvmlReturn_t DECLDIR nvmlDeviceGetAPIRestriction(nvmlDevice_t device, nvmlRestrictedAPI_t apiType, nvmlEnableState_t *isRestricted);
-
-/**
- * Gets recent samples for the GPU.
- *
- * For Kepler &tm; or newer fully supported devices.
- *
- * Based on type, this method can be used to fetch the power, utilization or clock samples maintained in the buffer by
- * the driver.
- *
- * Power, Utilization and Clock samples are returned as type "unsigned int" for the union nvmlValue_t.
- *
- * To get the number of samples that the user needs to allocate, invoke the method with samples set to NULL.
- * The returned sampleCount will provide the number of samples that can be queried. The user needs to
- * allocate a buffer of size sampleCount * sizeof(nvmlSample_t).
- *
- * lastSeenTimeStamp represents CPU timestamp in microseconds. Set it to 0 to fetch all the samples maintained by the
- * underlying buffer. Set lastSeenTimeStamp to one of the timeStamps retrieved from the data of the previous query
- * to get more recent samples.
- *
- * This method fetches the number of entries which can be accommodated in the provided samples array, and the
- * reference sampleCount is updated to indicate how many samples were actually retrieved. The advantage of using this
- * method for samples in contrast to polling via existing methods is to get higher frequency data at lower polling cost.
- *
- * @param device The identifier for the target device
- * @param type Type of sampling event
- * @param lastSeenTimeStamp Return only samples with timestamp greater than lastSeenTimeStamp.
- * @param sampleValType Output parameter to represent the type of sample value as described in nvmlSampleVal_t
- * @param sampleCount Reference to provide the number of elements which can be queried in samples array
- * @param samples Reference in which samples are returned
- *
- * @return
- * - \ref NVML_SUCCESS if samples are successfully retrieved
- * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized
- * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid, \a sampleCount is NULL or
- * reference to \a sampleCount is 0 for non null \a samples
- * - \ref NVML_ERROR_NOT_SUPPORTED if this query is not supported by the device
- * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible
- * - \ref NVML_ERROR_NOT_FOUND if sample entries are not found
- * - \ref NVML_ERROR_UNKNOWN on any unexpected error
- */
-nvmlReturn_t DECLDIR nvmlDeviceGetSamples(nvmlDevice_t device, nvmlSamplingType_t type, unsigned long long lastSeenTimeStamp,
- nvmlValueType_t *sampleValType, unsigned int *sampleCount, nvmlSample_t *samples);
-
-/**
- * Gets Total, Available and Used size of BAR1 memory.
- *
- * BAR1 is used to map the FB (device memory) so that it can be directly accessed by the CPU or by 3rd party
- * devices (peer-to-peer on the PCIE bus).
- *
- * For Kepler &tm; or newer fully supported devices.
- *
- * @param device The identifier of the target device
- * @param bar1Memory Reference in which BAR1 memory
- * information is returned.
- *
- * @return
- * - \ref NVML_SUCCESS if BAR1 memory is successfully retrieved
- * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized
- * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid, \a bar1Memory is NULL
- * - \ref NVML_ERROR_NOT_SUPPORTED if this query is not supported by the device
- * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible
- * - \ref NVML_ERROR_UNKNOWN on any unexpected error
- *
- */
-nvmlReturn_t DECLDIR nvmlDeviceGetBAR1MemoryInfo(nvmlDevice_t device, nvmlBAR1Memory_t *bar1Memory);
-
-
-/**
- * Gets the duration of time during which the device was throttled (lower than requested clocks) due to power
- * or thermal constraints.
- *
- * The method is important to users who are trying to understand if their GPUs throttle at any point during their applications. The
- * difference in violation times at two different reference times gives the indication of GPU throttling event.
- *
- * Violation for thermal capping is not supported at this time.
- *
- * For Kepler &tm; or newer fully supported devices.
- *
- * @param device The identifier of the target device
- * @param perfPolicyType Represents Performance policy which can trigger GPU throttling
- * @param violTime Reference to which violation time related information is returned
- *
- *
- * @return
- * - \ref NVML_SUCCESS if violation time is successfully retrieved
- * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized
- * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid, \a perfPolicyType is invalid, or \a violTime is NULL
- * - \ref NVML_ERROR_NOT_SUPPORTED if this query is not supported by the device
- * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible
- *
- */
-nvmlReturn_t DECLDIR nvmlDeviceGetViolationStatus(nvmlDevice_t device, nvmlPerfPolicyType_t perfPolicyType, nvmlViolationTime_t *violTime);
-
-/**
- * @}
- */
-
-/** @addtogroup nvmlAccountingStats
- * @{
- */
-
-/**
- * Queries the state of per process accounting mode.
- *
- * For Kepler &tm; or newer fully supported devices.
- *
- * See \ref nvmlDeviceGetAccountingStats for more details.
- * See \ref nvmlDeviceSetAccountingMode
- *
- * @param device The identifier of the target device
- * @param mode Reference in which to return the current accounting mode
- *
- * @return
- * - \ref NVML_SUCCESS if the mode has been successfully retrieved
- * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized
- * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a mode is NULL
- * - \ref NVML_ERROR_NOT_SUPPORTED if the device doesn't support this feature
- * - \ref NVML_ERROR_UNKNOWN on any unexpected error
- */
-nvmlReturn_t DECLDIR nvmlDeviceGetAccountingMode(nvmlDevice_t device, nvmlEnableState_t *mode);
-
-/**
- * Queries process's accounting stats.
- *
- * For Kepler &tm; or newer fully supported devices.
- *
- * Accounting stats capture GPU utilization and other statistics across the lifetime of a process.
- * Accounting stats can be queried during life time of the process and after its termination.
- * The time field in \ref nvmlAccountingStats_t is reported as 0 during the lifetime of the process and
- * updated to actual running time after its termination.
- * Accounting stats are kept in a circular buffer, newly created processes overwrite information about old
- * processes.
- *
- * See \ref nvmlAccountingStats_t for description of each returned metric.
- * List of processes that can be queried can be retrieved from \ref nvmlDeviceGetAccountingPids.
- *
- * @note Accounting Mode needs to be on. See \ref nvmlDeviceGetAccountingMode.
- * @note Only compute and graphics applications stats can be queried. Monitoring applications stats can't be
- * queried since they don't contribute to GPU utilization.
- * @note In case of pid collision stats of only the latest process (that terminated last) will be reported
- *
- * @warning On Kepler devices per process statistics are accurate only if there's one process running on a GPU.
- *
- * @param device The identifier of the target device
- * @param pid Process Id of the target process to query stats for
- * @param stats Reference in which to return the process's accounting stats
- *
- * @return
- * - \ref NVML_SUCCESS if stats have been successfully retrieved
- * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized
- * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a stats is NULL
- * - \ref NVML_ERROR_NOT_FOUND if process stats were not found
- * - \ref NVML_ERROR_NOT_SUPPORTED if the device doesn't support this feature or accounting mode is disabled
- * - \ref NVML_ERROR_UNKNOWN on any unexpected error
- *
- * @see nvmlDeviceGetAccountingBufferSize
- */
-nvmlReturn_t DECLDIR nvmlDeviceGetAccountingStats(nvmlDevice_t device, unsigned int pid, nvmlAccountingStats_t *stats);
-
-/**
- * Queries list of processes that can be queried for accounting stats. The list of processes returned
- * can be in running or terminated state.
- *
- * For Kepler &tm; or newer fully supported devices.
- *
- * To just query the number of processes ready to be queried, call this function with *count = 0 and
- * pids=NULL. The return code will be NVML_ERROR_INSUFFICIENT_SIZE, or NVML_SUCCESS if list is empty.
- *
- * For more details see \ref nvmlDeviceGetAccountingStats.
- *
- * @note In case of PID collision some processes might not be accessible before the circular buffer is full.
- *
- * @param device The identifier of the target device
- * @param count Reference in which to provide the \a pids array size, and
- * to return the number of elements ready to be queried
- * @param pids Reference in which to return list of process ids
- *
- * @return
- * - \ref NVML_SUCCESS if pids were successfully retrieved
- * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized
- * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a count is NULL
- * - \ref NVML_ERROR_NOT_SUPPORTED if the device doesn't support this feature or accounting mode is disabled
- * - \ref NVML_ERROR_INSUFFICIENT_SIZE if \a count is too small (\a count is set to
- * expected value)
- * - \ref NVML_ERROR_UNKNOWN on any unexpected error
- *
- * @see nvmlDeviceGetAccountingBufferSize
- */
-nvmlReturn_t DECLDIR nvmlDeviceGetAccountingPids(nvmlDevice_t device, unsigned int *count, unsigned int *pids);
-
-/**
- * Returns the number of processes that the circular buffer with accounting pids can hold.
- *
- * For Kepler &tm; or newer fully supported devices.
- *
- * This is the maximum number of processes that accounting information will be stored for before information
- * about oldest processes will get overwritten by information about new processes.
- *
- * @param device The identifier of the target device
- * @param bufferSize Reference in which to provide the size (in number of elements)
- * of the circular buffer for accounting stats.
- *
- * @return
- * - \ref NVML_SUCCESS if buffer size was successfully retrieved
- * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized
- * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a bufferSize is NULL
- * - \ref NVML_ERROR_NOT_SUPPORTED if the device doesn't support this feature or accounting mode is disabled
- * - \ref NVML_ERROR_UNKNOWN on any unexpected error
- *
- * @see nvmlDeviceGetAccountingStats
- * @see nvmlDeviceGetAccountingPids
- */
-nvmlReturn_t DECLDIR nvmlDeviceGetAccountingBufferSize(nvmlDevice_t device, unsigned int *bufferSize);
-
-/** @} */
-
-/** @addtogroup nvmlDeviceQueries
- * @{
- */
-
-/**
- * Returns the list of retired pages by source, including pages that are pending retirement
- * The address information provided from this API is the hardware address of the page that was retired. Note
- * that this does not match the virtual address used in CUDA, but will match the address information in XID 63
- *
- * For Kepler &tm; or newer fully supported devices.
- *
- * @param device The identifier of the target device
- * @param cause Filter page addresses by cause of retirement
- * @param pageCount Reference in which to provide the \a addresses buffer size, and
- * to return the number of retired pages that match \a cause
- * Set to 0 to query the size without allocating an \a addresses buffer
- * @param addresses Buffer to write the page addresses into
- *
- * @return
- * - \ref NVML_SUCCESS if \a pageCount was populated and \a addresses was filled
- * - \ref NVML_ERROR_INSUFFICIENT_SIZE if \a pageCount indicates the buffer is not large enough to store all the
- * matching page addresses. \a pageCount is set to the needed size.
- * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized
- * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid, \a pageCount is NULL, \a cause is invalid, or
- * \a addresses is NULL
- * - \ref NVML_ERROR_NOT_SUPPORTED if the device doesn't support this feature
- * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible
- * - \ref NVML_ERROR_UNKNOWN on any unexpected error
- */
-nvmlReturn_t DECLDIR nvmlDeviceGetRetiredPages(nvmlDevice_t device, nvmlPageRetirementCause_t cause,
- unsigned int *pageCount, unsigned long long *addresses);
-
-/**
- * Check if any pages are pending retirement and need a reboot to fully retire.
- *
- * For Kepler &tm; or newer fully supported devices.
- *
- * @param device The identifier of the target device
- * @param isPending Reference in which to return the pending status
- *
- * @return
- * - \ref NVML_SUCCESS if \a isPending was populated
- * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized
- * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a isPending is NULL
- * - \ref NVML_ERROR_NOT_SUPPORTED if the device doesn't support this feature
- * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible
- * - \ref NVML_ERROR_UNKNOWN on any unexpected error
- */
-nvmlReturn_t DECLDIR nvmlDeviceGetRetiredPagesPendingStatus(nvmlDevice_t device, nvmlEnableState_t *isPending);
-
-/** @} */
-
-/***************************************************************************************************/
-/** @defgroup nvmlUnitCommands Unit Commands
- * This chapter describes NVML operations that change the state of the unit. For S-class products.
- * Each of these requires root/admin access. Non-admin users will see an NVML_ERROR_NO_PERMISSION
- * error code when invoking any of these methods.
- * @{
- */
-/***************************************************************************************************/
-
-/**
- * Set the LED state for the unit. The LED can be either green (0) or amber (1).
- *
- * For S-class products.
- * Requires root/admin permissions.
- *
- * This operation takes effect immediately.
- *
- *
- * Current S-Class products don't provide unique LEDs for each unit. As such, both front
- * and back LEDs will be toggled in unison regardless of which unit is specified with this command.
- *
- * See \ref nvmlLedColor_t for available colors.
- *
- * @param unit The identifier of the target unit
- * @param color The target LED color
- *
- * @return
- * - \ref NVML_SUCCESS if the LED color has been set
- * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized
- * - \ref NVML_ERROR_INVALID_ARGUMENT if \a unit or \a color is invalid
- * - \ref NVML_ERROR_NOT_SUPPORTED if this is not an S-class product
- * - \ref NVML_ERROR_NO_PERMISSION if the user doesn't have permission to perform this operation
- * - \ref NVML_ERROR_UNKNOWN on any unexpected error
- *
- * @see nvmlUnitGetLedState()
- */
-nvmlReturn_t DECLDIR nvmlUnitSetLedState(nvmlUnit_t unit, nvmlLedColor_t color);
-
-/** @} */
-
-/***************************************************************************************************/
-/** @defgroup nvmlDeviceCommands Device Commands
- * This chapter describes NVML operations that change the state of the device.
- * Each of these requires root/admin access. Non-admin users will see an NVML_ERROR_NO_PERMISSION
- * error code when invoking any of these methods.
- * @{
- */
-/***************************************************************************************************/
-
-/**
- * Set the persistence mode for the device.
- *
- * For all products.
- * For Linux only.
- * Requires root/admin permissions.
- *
- * The persistence mode determines whether the GPU driver software is torn down after the last client
- * exits.
- *
- * This operation takes effect immediately. It is not persistent across reboots. After each reboot the
- * persistence mode is reset to "Disabled".
- *
- * See \ref nvmlEnableState_t for available modes.
- *
- * @param device The identifier of the target device
- * @param mode The target persistence mode
- *
- * @return
- * - \ref NVML_SUCCESS if the persistence mode was set
- * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized
- * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a mode is invalid
- * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature
- * - \ref NVML_ERROR_NO_PERMISSION if the user doesn't have permission to perform this operation
- * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible
- * - \ref NVML_ERROR_UNKNOWN on any unexpected error
- *
- * @see nvmlDeviceGetPersistenceMode()
- */
-nvmlReturn_t DECLDIR nvmlDeviceSetPersistenceMode(nvmlDevice_t device, nvmlEnableState_t mode);
-
-/**
- * Set the compute mode for the device.
- *
- * For all products.
- * Requires root/admin permissions.
- *
- * The compute mode determines whether a GPU can be used for compute operations and whether it can
- * be shared across contexts.
- *
- * This operation takes effect immediately. Under Linux it is not persistent across reboots and
- * always resets to "Default". Under windows it is persistent.
- *
- * Under windows compute mode may only be set to DEFAULT when running in WDDM
- *
- * See \ref nvmlComputeMode_t for details on available compute modes.
- *
- * @param device The identifier of the target device
- * @param mode The target compute mode
- *
- * @return
- * - \ref NVML_SUCCESS if the compute mode was set
- * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized
- * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a mode is invalid
- * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature
- * - \ref NVML_ERROR_NO_PERMISSION if the user doesn't have permission to perform this operation
- * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible
- * - \ref NVML_ERROR_UNKNOWN on any unexpected error
- *
- * @see nvmlDeviceGetComputeMode()
- */
-nvmlReturn_t DECLDIR nvmlDeviceSetComputeMode(nvmlDevice_t device, nvmlComputeMode_t mode);
-
-/**
- * Set the ECC mode for the device.
- *
- * For Kepler &tm; or newer fully supported devices.
- * Only applicable to devices with ECC.
- * Requires \a NVML_INFOROM_ECC version 1.0 or higher.
- * Requires root/admin permissions.
- *
- * The ECC mode determines whether the GPU enables its ECC support.
- *
- * This operation takes effect after the next reboot.
- *
- * See \ref nvmlEnableState_t for details on available modes.
- *
- * @param device The identifier of the target device
- * @param ecc The target ECC mode
- *
- * @return
- * - \ref NVML_SUCCESS if the ECC mode was set
- * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized
- * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a ecc is invalid
- * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature
- * - \ref NVML_ERROR_NO_PERMISSION if the user doesn't have permission to perform this operation
- * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible
- * - \ref NVML_ERROR_UNKNOWN on any unexpected error
- *
- * @see nvmlDeviceGetEccMode()
- */
-nvmlReturn_t DECLDIR nvmlDeviceSetEccMode(nvmlDevice_t device, nvmlEnableState_t ecc);
-
-/**
- * Clear the ECC error and other memory error counts for the device.
- *
- * For Kepler &tm; or newer fully supported devices.
- * Only applicable to devices with ECC.
- * Requires \a NVML_INFOROM_ECC version 2.0 or higher to clear aggregate location-based ECC counts.
- * Requires \a NVML_INFOROM_ECC version 1.0 or higher to clear all other ECC counts.
- * Requires root/admin permissions.
- * Requires ECC Mode to be enabled.
- *
- * Sets all of the specified ECC counters to 0, including both detailed and total counts.
- *
- * This operation takes effect immediately.
- *
- * See \ref nvmlMemoryErrorType_t for details on available counter types.
- *
- * @param device The identifier of the target device
- * @param counterType Flag that indicates which type of errors should be cleared.
- *
- * @return
- * - \ref NVML_SUCCESS if the error counts were cleared
- * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized
- * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a counterType is invalid
- * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature
- * - \ref NVML_ERROR_NO_PERMISSION if the user doesn't have permission to perform this operation
- * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible
- * - \ref NVML_ERROR_UNKNOWN on any unexpected error
- *
- * @see
- * - nvmlDeviceGetDetailedEccErrors()
- * - nvmlDeviceGetTotalEccErrors()
- */
-nvmlReturn_t DECLDIR nvmlDeviceClearEccErrorCounts(nvmlDevice_t device, nvmlEccCounterType_t counterType);
-
-/**
- * Set the driver model for the device.
- *
- * For Fermi &tm; or newer fully supported devices.
- * For windows only.
- * Requires root/admin permissions.
- *
- * On Windows platforms the device driver can run in either WDDM or WDM (TCC) mode. If a display is attached
- * to the device it must run in WDDM mode.
- *
- * It is possible to force the change to WDM (TCC) while the display is still attached with a force flag (nvmlFlagForce).
- * This should only be done if the host is subsequently powered down and the display is detached from the device
- * before the next reboot.
- *
- * This operation takes effect after the next reboot.
- *
- * Windows driver model may only be set to WDDM when running in DEFAULT compute mode.
- *
- * Changing the driver model to WDDM is not supported when the GPU doesn't support graphics acceleration or
- * will not support it after reboot. See \ref nvmlDeviceSetGpuOperationMode.
- *
- * See \ref nvmlDriverModel_t for details on available driver models.
- * See \ref nvmlFlagDefault and \ref nvmlFlagForce
- *
- * @param device The identifier of the target device
- * @param driverModel The target driver model
- * @param flags Flags that change the default behavior
- *
- * @return
- * - \ref NVML_SUCCESS if the driver model has been set
- * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized
- * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a driverModel is invalid
- * - \ref NVML_ERROR_NOT_SUPPORTED if the platform is not windows or the device does not support this feature
- * - \ref NVML_ERROR_NO_PERMISSION if the user doesn't have permission to perform this operation
- * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible
- * - \ref NVML_ERROR_UNKNOWN on any unexpected error
- *
- * @see nvmlDeviceGetDriverModel()
- */
-nvmlReturn_t DECLDIR nvmlDeviceSetDriverModel(nvmlDevice_t device, nvmlDriverModel_t driverModel, unsigned int flags);
-
-/**
- * Set clocks that applications will lock to.
- *
- * Sets the clocks that compute and graphics applications will be running at.
- * e.g. CUDA driver requests these clocks during context creation which means this property
- * defines clocks at which CUDA applications will be running unless some overspec event
- * occurs (e.g. over power, over thermal or external HW brake).
- *
- * Can be used as a setting to request constant performance.
- *
- * On Pascal and newer hardware, this will automatically disable automatic boosting of clocks.
- *
- * On K80 and newer Kepler and Maxwell GPUs, users desiring fixed performance should also call
- * \ref nvmlDeviceSetAutoBoostedClocksEnabled to prevent clocks from automatically boosting
- * above the clock value being set.
- *
- * For Kepler &tm; or newer non-GeForce fully supported devices and Maxwell or newer GeForce devices.
- * Requires root/admin permissions.
- *
- * See \ref nvmlDeviceGetSupportedMemoryClocks and \ref nvmlDeviceGetSupportedGraphicsClocks
- * for details on how to list available clocks combinations.
- *
- * After system reboot or driver reload applications clocks go back to their default value.
- * See \ref nvmlDeviceResetApplicationsClocks.
- *
- * @param device The identifier of the target device
- * @param memClockMHz Requested memory clock in MHz
- * @param graphicsClockMHz Requested graphics clock in MHz
- *
- * @return
- * - \ref NVML_SUCCESS if new settings were successfully set
- * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized
- * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a memClockMHz and \a graphicsClockMHz
- * are not a valid clock combination
- * - \ref NVML_ERROR_NO_PERMISSION if the user doesn't have permission to perform this operation
- * - \ref NVML_ERROR_NOT_SUPPORTED if the device doesn't support this feature
- * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible
- * - \ref NVML_ERROR_UNKNOWN on any unexpected error
- */
-nvmlReturn_t DECLDIR nvmlDeviceSetApplicationsClocks(nvmlDevice_t device, unsigned int memClockMHz, unsigned int graphicsClockMHz);
-
-/**
- * Set new power limit of this device.
- *
- * For Kepler &tm; or newer fully supported devices.
- * Requires root/admin permissions.
- *
- * See \ref nvmlDeviceGetPowerManagementLimitConstraints to check the allowed ranges of values.
- *
- * \note Limit is not persistent across reboots or driver unloads.
- * Enable persistence mode to prevent driver from unloading when no application is using the device.
- *
- * @param device The identifier of the target device
- * @param limit Power management limit in milliwatts to set
- *
- * @return
- * - \ref NVML_SUCCESS if \a limit has been set
- * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized
- * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a limit is out of range
- * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature
- * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible
- * - \ref NVML_ERROR_UNKNOWN on any unexpected error
- *
- * @see nvmlDeviceGetPowerManagementLimitConstraints
- * @see nvmlDeviceGetPowerManagementDefaultLimit
- */
-nvmlReturn_t DECLDIR nvmlDeviceSetPowerManagementLimit(nvmlDevice_t device, unsigned int limit);
-
-/**
- * Sets new GOM. See \a nvmlGpuOperationMode_t for details.
- *
- * For GK110 M-class and X-class Tesla &tm; products from the Kepler family.
- * Modes \ref NVML_GOM_LOW_DP and \ref NVML_GOM_ALL_ON are supported on fully supported GeForce products.
- * Not supported on Quadro ® and Tesla &tm; C-class products.
- * Requires root/admin permissions.
- *
- * Changing GOMs requires a reboot.
- * The reboot requirement might be removed in the future.
- *
- * Compute only GOMs don't support graphics acceleration. Under windows switching to these GOMs when
- * pending driver model is WDDM is not supported. See \ref nvmlDeviceSetDriverModel.
- *
- * @param device The identifier of the target device
- * @param mode Target GOM
- *
- * @return
- * - \ref NVML_SUCCESS if \a mode has been set
- * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized
- * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a mode incorrect
- * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support GOM or specific mode
- * - \ref NVML_ERROR_NO_PERMISSION if the user doesn't have permission to perform this operation
- * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible
- * - \ref NVML_ERROR_UNKNOWN on any unexpected error
- *
- * @see nvmlGpuOperationMode_t
- * @see nvmlDeviceGetGpuOperationMode
- */
-nvmlReturn_t DECLDIR nvmlDeviceSetGpuOperationMode(nvmlDevice_t device, nvmlGpuOperationMode_t mode);
-
-/**
- * Changes the root/admin restrictions on certain APIs. See \a nvmlRestrictedAPI_t for the list of supported APIs.
- * This method can be used by a root/admin user to give non-root/admin access to certain otherwise-restricted APIs.
- * The new setting lasts for the lifetime of the NVIDIA driver; it is not persistent. See \a nvmlDeviceGetAPIRestriction
- * to query the current restriction settings.
- *
- * For Kepler &tm; or newer fully supported devices.
- * Requires root/admin permissions.
- *
- * @param device The identifier of the target device
- * @param apiType Target API type for this operation
- * @param isRestricted The target restriction
- *
- * @return
- * - \ref NVML_SUCCESS if \a isRestricted has been set
- * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized
- * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a apiType incorrect
- * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support changing API restrictions or the device does not support
- * the feature that api restrictions are being set for (E.G. Enabling/disabling auto
- * boosted clocks is not supported by the device)
- * - \ref NVML_ERROR_NO_PERMISSION if the user doesn't have permission to perform this operation
- * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible
- * - \ref NVML_ERROR_UNKNOWN on any unexpected error
- *
- * @see nvmlRestrictedAPI_t
- */
-nvmlReturn_t DECLDIR nvmlDeviceSetAPIRestriction(nvmlDevice_t device, nvmlRestrictedAPI_t apiType, nvmlEnableState_t isRestricted);
-
-/**
- * @}
- */
-
-/** @addtogroup nvmlAccountingStats
- * @{
- */
-
-/**
- * Enables or disables per process accounting.
- *
- * For Kepler &tm; or newer fully supported devices.
- * Requires root/admin permissions.
- *
- * @note This setting is not persistent and will default to disabled after driver unloads.
- * Enable persistence mode to be sure the setting doesn't switch off to disabled.
- *
- * @note Enabling accounting mode has no negative impact on the GPU performance.
- *
- * @note Disabling accounting clears all accounting pids information.
- *
- * See \ref nvmlDeviceGetAccountingMode
- * See \ref nvmlDeviceGetAccountingStats
- * See \ref nvmlDeviceClearAccountingPids
- *
- * @param device The identifier of the target device
- * @param mode The target accounting mode
- *
- * @return
- * - \ref NVML_SUCCESS if the new mode has been set
- * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized
- * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device or \a mode are invalid
- * - \ref NVML_ERROR_NOT_SUPPORTED if the device doesn't support this feature
- * - \ref NVML_ERROR_NO_PERMISSION if the user doesn't have permission to perform this operation
- * - \ref NVML_ERROR_UNKNOWN on any unexpected error
- */
-nvmlReturn_t DECLDIR nvmlDeviceSetAccountingMode(nvmlDevice_t device, nvmlEnableState_t mode);
-
-/**
- * Clears accounting information about all processes that have already terminated.
- *
- * For Kepler &tm; or newer fully supported devices.
- * Requires root/admin permissions.
- *
- * See \ref nvmlDeviceGetAccountingMode
- * See \ref nvmlDeviceGetAccountingStats
- * See \ref nvmlDeviceSetAccountingMode
- *
- * @param device The identifier of the target device
- *
- * @return
- * - \ref NVML_SUCCESS if accounting information has been cleared
- * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized
- * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid
- * - \ref NVML_ERROR_NOT_SUPPORTED if the device doesn't support this feature
- * - \ref NVML_ERROR_NO_PERMISSION if the user doesn't have permission to perform this operation
- * - \ref NVML_ERROR_UNKNOWN on any unexpected error
- */
-nvmlReturn_t DECLDIR nvmlDeviceClearAccountingPids(nvmlDevice_t device);
-
-/** @} */
-
-/***************************************************************************************************/
-/** @defgroup NvLink NvLink Methods
- * This chapter describes methods that NVML can perform on NVLINK enabled devices.
- * @{
- */
-/***************************************************************************************************/
-
-/**
- * Retrieves the state of the device's NvLink for the link specified
- *
- * For Pascal &tm; or newer fully supported devices.
- *
- * @param device The identifier of the target device
- * @param link Specifies the NvLink link to be queried
- * @param isActive \a nvmlEnableState_t where NVML_FEATURE_ENABLED indicates that
- * the link is active and NVML_FEATURE_DISABLED indicates it
- * is inactive
- *
- * @return
- * - \ref NVML_SUCCESS if \a isActive has been set
- * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized
- * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device or \a link is invalid or \a isActive is NULL
- * - \ref NVML_ERROR_NOT_SUPPORTED if the device doesn't support this feature
- * - \ref NVML_ERROR_UNKNOWN on any unexpected error
- */
-nvmlReturn_t DECLDIR nvmlDeviceGetNvLinkState(nvmlDevice_t device, unsigned int link, nvmlEnableState_t *isActive);
-
-/**
- * Retrieves the version of the device's NvLink for the link specified
- *
- * For Pascal &tm; or newer fully supported devices.
- *
- * @param device The identifier of the target device
- * @param link Specifies the NvLink link to be queried
- * @param version Requested NvLink version
- *
- * @return
- * - \ref NVML_SUCCESS if \a version has been set
- * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized
- * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device or \a link is invalid or \a version is NULL
- * - \ref NVML_ERROR_NOT_SUPPORTED if the device doesn't support this feature
- * - \ref NVML_ERROR_UNKNOWN on any unexpected error
- */
-nvmlReturn_t DECLDIR nvmlDeviceGetNvLinkVersion(nvmlDevice_t device, unsigned int link, unsigned int *version);
-
-/**
- * Retrieves the requested capability from the device's NvLink for the link specified
- * Please refer to the \a nvmlNvLinkCapability_t structure for the specific caps that can be queried
- * The return value should be treated as a boolean.
- *
- * For Pascal &tm; or newer fully supported devices.
- *
- * @param device The identifier of the target device
- * @param link Specifies the NvLink link to be queried
- * @param capability Specifies the \a nvmlNvLinkCapability_t to be queried
- * @param capResult A boolean for the queried capability indicating that feature is available
- *
- * @return
- * - \ref NVML_SUCCESS if \a capResult has been set
- * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized
- * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device, \a link, or \a capability is invalid or \a capResult is NULL
- * - \ref NVML_ERROR_NOT_SUPPORTED if the device doesn't support this feature
- * - \ref NVML_ERROR_UNKNOWN on any unexpected error
- */
-nvmlReturn_t DECLDIR nvmlDeviceGetNvLinkCapability(nvmlDevice_t device, unsigned int link,
- nvmlNvLinkCapability_t capability, unsigned int *capResult);
-
-/**
- * Retrieves the PCI information for the remote node on a NvLink link
- * Note: pciSubSystemId is not filled in this function and is indeterminate
- *
- * For Pascal &tm; or newer fully supported devices.
- *
- * @param device The identifier of the target device
- * @param link Specifies the NvLink link to be queried
- * @param pci \a nvmlPciInfo_t of the remote node for the specified link
- *
- * @return
- * - \ref NVML_SUCCESS if \a pci has been set
- * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized
- * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device or \a link is invalid or \a pci is NULL
- * - \ref NVML_ERROR_NOT_SUPPORTED if the device doesn't support this feature
- * - \ref NVML_ERROR_UNKNOWN on any unexpected error
- */
-nvmlReturn_t DECLDIR nvmlDeviceGetNvLinkRemotePciInfo(nvmlDevice_t device, unsigned int link, nvmlPciInfo_t *pci);
-
-/**
- * Retrieves the specified error counter value
- * Please refer to \a nvmlNvLinkErrorCounter_t for error counters that are available
- *
- * For Pascal &tm; or newer fully supported devices.
- *
- * @param device The identifier of the target device
- * @param link Specifies the NvLink link to be queried
- * @param counter Specifies the NvLink counter to be queried
- * @param counterValue Returned counter value
- *
- * @return
- * - \ref NVML_SUCCESS if \a counterValue has been set
- * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized
- * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device, \a link, or \a counter is invalid or \a counterValue is NULL
- * - \ref NVML_ERROR_NOT_SUPPORTED if the device doesn't support this feature
- * - \ref NVML_ERROR_UNKNOWN on any unexpected error
- */
-nvmlReturn_t DECLDIR nvmlDeviceGetNvLinkErrorCounter(nvmlDevice_t device, unsigned int link,
- nvmlNvLinkErrorCounter_t counter, unsigned long long *counterValue);
-
-/**
- * Resets all error counters to zero
- * Please refer to \a nvmlNvLinkErrorCounter_t for the list of error counters that are reset
- *
- * For Pascal &tm; or newer fully supported devices.
- *
- * @param device The identifier of the target device
- * @param link Specifies the NvLink link whose error counters are to be reset
- *
- * @return
- * - \ref NVML_SUCCESS if the reset is successful
- * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized
- * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device or \a link is invalid
- * - \ref NVML_ERROR_NOT_SUPPORTED if the device doesn't support this feature
- * - \ref NVML_ERROR_UNKNOWN on any unexpected error
- */
-nvmlReturn_t DECLDIR nvmlDeviceResetNvLinkErrorCounters(nvmlDevice_t device, unsigned int link);
-
-/**
- * Set the NVLINK utilization counter control information for the specified counter, 0 or 1.
- * Please refer to \a nvmlNvLinkUtilizationControl_t for the structure definition. Performs a reset
- * of the counters if the reset parameter is non-zero.
- *
- * For Pascal &tm; or newer fully supported devices.
- *
- * @param device The identifier of the target device
- * @param counter Specifies the counter that should be set (0 or 1).
- * @param link Specifies the NvLink link to be queried
- * @param control A reference to the \a nvmlNvLinkUtilizationControl_t to set
- * @param reset Resets the counters on set if non-zero
- *
- * @return
- * - \ref NVML_SUCCESS if the control has been set successfully
- * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized
- * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device, \a counter, \a link, or \a control is invalid
- * - \ref NVML_ERROR_NOT_SUPPORTED if the device doesn't support this feature
- * - \ref NVML_ERROR_UNKNOWN on any unexpected error
- */
-nvmlReturn_t DECLDIR nvmlDeviceSetNvLinkUtilizationControl(nvmlDevice_t device, unsigned int link, unsigned int counter,
- nvmlNvLinkUtilizationControl_t *control, unsigned int reset);
-
-/**
- * Get the NVLINK utilization counter control information for the specified counter, 0 or 1.
- * Please refer to \a nvmlNvLinkUtilizationControl_t for the structure definition
- *
- * For Pascal &tm; or newer fully supported devices.
- *
- * @param device The identifier of the target device
- * @param counter Specifies the counter that should be queried (0 or 1).
- * @param link Specifies the NvLink link to be queried
- * @param control A reference to the \a nvmlNvLinkUtilizationControl_t to place information
- *
- * @return
- * - \ref NVML_SUCCESS if \a control has been populated successfully
- * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized
- * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device, \a counter, \a link, or \a control is invalid
- * - \ref NVML_ERROR_NOT_SUPPORTED if the device doesn't support this feature
- * - \ref NVML_ERROR_UNKNOWN on any unexpected error
- */
-nvmlReturn_t DECLDIR nvmlDeviceGetNvLinkUtilizationControl(nvmlDevice_t device, unsigned int link, unsigned int counter,
- nvmlNvLinkUtilizationControl_t *control);
-
-
-/**
- * Retrieve the NVLINK utilization counter based on the current control for a specified counter.
- * In general it is good practice to use \a nvmlDeviceSetNvLinkUtilizationControl
- * before reading the utilization counters as they have no default state
- *
- * For Pascal &tm; or newer fully supported devices.
- *
- * @param device The identifier of the target device
- * @param link Specifies the NvLink link to be queried
- * @param counter Specifies the counter that should be read (0 or 1).
- * @param rxcounter Receive counter return value
- * @param txcounter Transmit counter return value
- *
- * @return
- * - \ref NVML_SUCCESS if \a rxcounter and \a txcounter have been successfully set
- * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized
- * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device, \a counter, or \a link is invalid or \a rxcounter or \a txcounter are NULL
- * - \ref NVML_ERROR_NOT_SUPPORTED if the device doesn't support this feature
- * - \ref NVML_ERROR_UNKNOWN on any unexpected error
- */
-nvmlReturn_t DECLDIR nvmlDeviceGetNvLinkUtilizationCounter(nvmlDevice_t device, unsigned int link, unsigned int counter,
- unsigned long long *rxcounter, unsigned long long *txcounter);
-
-/**
- * Freeze the NVLINK utilization counters
- * Both the receive and transmit counters are operated on by this function
- *
- * For Pascal &tm; or newer fully supported devices.
- *
- * @param device The identifier of the target device
- * @param link Specifies the NvLink link whose counters are to be frozen or unfrozen
- * @param counter Specifies the counter that should be frozen (0 or 1).
- * @param freeze NVML_FEATURE_ENABLED = freeze the receive and transmit counters
- * NVML_FEATURE_DISABLED = unfreeze the receive and transmit counters
- *
- * @return
- * - \ref NVML_SUCCESS if counters were successfully frozen or unfrozen
- * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized
- * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device, \a link, \a counter, or \a freeze is invalid
- * - \ref NVML_ERROR_NOT_SUPPORTED if the device doesn't support this feature
- * - \ref NVML_ERROR_UNKNOWN on any unexpected error
- */
-nvmlReturn_t DECLDIR nvmlDeviceFreezeNvLinkUtilizationCounter (nvmlDevice_t device, unsigned int link,
- unsigned int counter, nvmlEnableState_t freeze);
-
-/**
- * Reset the NVLINK utilization counters
- * Both the receive and transmit counters are operated on by this function
- *
- * For Pascal &tm; or newer fully supported devices.
- *
- * @param device The identifier of the target device
- * @param link Specifies the NvLink link to be reset
- * @param counter Specifies the counter that should be reset (0 or 1)
- *
- * @return
- * - \ref NVML_SUCCESS if counters were successfully reset
- * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized
- * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device, \a link, or \a counter is invalid
- * - \ref NVML_ERROR_NOT_SUPPORTED if the device doesn't support this feature
- * - \ref NVML_ERROR_UNKNOWN on any unexpected error
- */
-nvmlReturn_t DECLDIR nvmlDeviceResetNvLinkUtilizationCounter (nvmlDevice_t device, unsigned int link, unsigned int counter);
-
-/** @} */
-
-/***************************************************************************************************/
-/** @defgroup nvmlEvents Event Handling Methods
- * This chapter describes methods that NVML can perform against each device to register and wait for
- * some event to occur.
- * @{
- */
-/***************************************************************************************************/
-
-/**
- * Create an empty set of events.
- * Event set should be freed by \ref nvmlEventSetFree
- *
- * For Fermi &tm; or newer fully supported devices.
- * @param set Reference in which to return the event handle
- *
- * @return
- * - \ref NVML_SUCCESS if the event set has been created
- * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized
- * - \ref NVML_ERROR_INVALID_ARGUMENT if \a set is NULL
- * - \ref NVML_ERROR_UNKNOWN on any unexpected error
- *
- * @see nvmlEventSetFree
- */
-nvmlReturn_t DECLDIR nvmlEventSetCreate(nvmlEventSet_t *set);
-
-/**
- * Starts recording of events on a specified device and adds the events to the specified \ref nvmlEventSet_t
- *
- * For Fermi &tm; or newer fully supported devices.
- * Ecc events are available only on ECC enabled devices (see \ref nvmlDeviceGetTotalEccErrors)
- * Power capping events are available only on Power Management enabled devices (see \ref nvmlDeviceGetPowerManagementMode)
- *
- * For Linux only.
- *
- * \b IMPORTANT: Operations on \a set are not thread safe
- *
- * This call starts recording of events on specific device.
- * All events that occurred before this call are not recorded.
- * Checking if some event occurred can be done with \ref nvmlEventSetWait
- *
- * If function reports NVML_ERROR_UNKNOWN, event set is in undefined state and should be freed.
- * If function reports NVML_ERROR_NOT_SUPPORTED, event set can still be used. None of the requested eventTypes
- * are registered in that case.
- *
- * @param device The identifier of the target device
- * @param eventTypes Bitmask of \ref nvmlEventType to record
- * @param set Set to which add new event types
- *
- * @return
- * - \ref NVML_SUCCESS if the event has been set
- * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized
- * - \ref NVML_ERROR_INVALID_ARGUMENT if \a eventTypes is invalid or \a set is NULL
- * - \ref NVML_ERROR_NOT_SUPPORTED if the platform does not support this feature or some of requested event types
- * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible
- * - \ref NVML_ERROR_UNKNOWN on any unexpected error
- *
- * @see nvmlEventType
- * @see nvmlDeviceGetSupportedEventTypes
- * @see nvmlEventSetWait
- * @see nvmlEventSetFree
- */
-nvmlReturn_t DECLDIR nvmlDeviceRegisterEvents(nvmlDevice_t device, unsigned long long eventTypes, nvmlEventSet_t set);
-
-/**
- * Returns information about events supported on device
- *
- * For Fermi &tm; or newer fully supported devices.
- *
- * Events are not supported on Windows. So this function returns an empty mask in \a eventTypes on Windows.
- *
- * @param device The identifier of the target device
- * @param eventTypes Reference in which to return bitmask of supported events
- *
- * @return
- * - \ref NVML_SUCCESS if the eventTypes has been set
- * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized
- * - \ref NVML_ERROR_INVALID_ARGUMENT if \a eventTypes is NULL
- * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible
- * - \ref NVML_ERROR_UNKNOWN on any unexpected error
- *
- * @see nvmlEventType
- * @see nvmlDeviceRegisterEvents
- */
-nvmlReturn_t DECLDIR nvmlDeviceGetSupportedEventTypes(nvmlDevice_t device, unsigned long long *eventTypes);
-
-/**
- * Waits on events and delivers events
- *
- * For Fermi &tm; or newer fully supported devices.
- *
- * If some events are ready to be delivered at the time of the call, function returns immediately.
- * If there are no events ready to be delivered, function sleeps till event arrives
- * but not longer than specified timeout. This function in certain conditions can return before
- * specified timeout passes (e.g. when interrupt arrives)
- *
- * In case of xid error, the function returns the most recent xid error type seen by the system. If there are multiple
- * xid errors generated before nvmlEventSetWait is invoked then the last seen xid error type is returned for all
- * xid error events.
- *
- * @param set Reference to set of events to wait on
- * @param data Reference in which to return event data
- * @param timeoutms Maximum amount of wait time in milliseconds for registered event
- *
- * @return
- * - \ref NVML_SUCCESS if the data has been set
- * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized
- * - \ref NVML_ERROR_INVALID_ARGUMENT if \a data is NULL
- * - \ref NVML_ERROR_TIMEOUT if no event arrived in specified timeout or interrupt arrived
- * - \ref NVML_ERROR_GPU_IS_LOST if a GPU has fallen off the bus or is otherwise inaccessible
- * - \ref NVML_ERROR_UNKNOWN on any unexpected error
- *
- * @see nvmlEventType
- * @see nvmlDeviceRegisterEvents
- */
-nvmlReturn_t DECLDIR nvmlEventSetWait(nvmlEventSet_t set, nvmlEventData_t * data, unsigned int timeoutms);
-
-/**
- * Releases events in the set
- *
- * For Fermi &tm; or newer fully supported devices.
- *
- * @param set Reference to events to be released
- *
- * @return
- * - \ref NVML_SUCCESS if the event has been successfully released
- * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized
- * - \ref NVML_ERROR_UNKNOWN on any unexpected error
- *
- * @see nvmlDeviceRegisterEvents
- */
-nvmlReturn_t DECLDIR nvmlEventSetFree(nvmlEventSet_t set);
-
-/** @} */
-
-/***************************************************************************************************/
-/** @defgroup nvmlZPI Drain states
- * This chapter describes methods that NVML can perform against each device to control their drain state
- * and recognition by NVML and NVIDIA kernel driver. These methods can be used with out-of-band tools to
- * power on/off GPUs, enable robust reset scenarios, etc.
- * @{
- */
-/***************************************************************************************************/
-
-/**
- * Modify the drain state of a GPU. This method forces a GPU to no longer accept new incoming requests.
- * Any new NVML process will no longer see this GPU. Persistence mode for this GPU must be turned off before
- * this call is made.
- * Must be called as administrator.
- * For Linux only.
- *
- * For Pascal &tm; or newer fully supported devices.
- * Some Kepler devices supported.
- *
- * @param pciInfo The PCI address of the GPU drain state to be modified
- * @param newState The drain state that should be entered, see \ref nvmlEnableState_t
- *
- * @return
- * - \ref NVML_SUCCESS if the drain state was successfully modified
- * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized
- * - \ref NVML_ERROR_INVALID_ARGUMENT if \a pciInfo or \a newState is invalid
- * - \ref NVML_ERROR_NOT_SUPPORTED if the device doesn't support this feature
- * - \ref NVML_ERROR_NO_PERMISSION if the calling process has insufficient permissions to perform operation
- * - \ref NVML_ERROR_IN_USE if the device has persistence mode turned on
- * - \ref NVML_ERROR_UNKNOWN on any unexpected error
- */
-nvmlReturn_t DECLDIR nvmlDeviceModifyDrainState (nvmlPciInfo_t *pciInfo, nvmlEnableState_t newState);
-
-/**
- * Query the drain state of a GPU. This method is used to check if a GPU is in a currently draining
- * state.
- * For Linux only.
- *
- * For Pascal &tm; or newer fully supported devices.
- * Some Kepler devices supported.
- *
- * @param pciInfo The PCI address of the GPU drain state to be queried
- * @param currentState The current drain state for this GPU, see \ref nvmlEnableState_t
- *
- * @return
- * - \ref NVML_SUCCESS if \a currentState has been set
- * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized
- * - \ref NVML_ERROR_INVALID_ARGUMENT if \a pciInfo or \a currentState is invalid
- * - \ref NVML_ERROR_NOT_SUPPORTED if the device doesn't support this feature
- * - \ref NVML_ERROR_UNKNOWN on any unexpected error
- */
-nvmlReturn_t DECLDIR nvmlDeviceQueryDrainState (nvmlPciInfo_t *pciInfo, nvmlEnableState_t *currentState);
-
-/**
- * This method will remove the specified GPU from the view of both NVML and the NVIDIA kernel driver
- * as long as no other processes are attached. If other processes are attached, this call will return
- * NVML_ERROR_IN_USE and the GPU will be returned to its original "draining" state. Note: the
- * only situation where a process can still be attached after nvmlDeviceModifyDrainState() is called
- * to initiate the draining state is if that process was using, and is still using, a GPU before the
- * call was made. Also note, persistence mode counts as an attachment to the GPU thus it must be disabled
- * prior to this call.
- *
- * For long-running NVML processes please note that this will change the enumeration of current GPUs.
- * For example, if there are four GPUs present and GPU1 is removed, the new enumeration will be 0-2.
- * Also, device handles after the removed GPU will not be valid and must be re-established.
- * Must be run as administrator.
- * For Linux only.
- *
- * For Pascal &tm; or newer fully supported devices.
- * Some Kepler devices supported.
- *
- * @param pciInfo The PCI address of the GPU to be removed
- * @param gpuState Whether the GPU is to be removed, from the OS
- * see \ref nvmlDetachGpuState_t
- * @param linkState Requested upstream PCIe link state, see \ref nvmlPcieLinkState_t
- *
- * @return
- * - \ref NVML_SUCCESS if counters were successfully reset
- * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized
- * - \ref NVML_ERROR_INVALID_ARGUMENT if \a nvmlIndex is invalid
- * - \ref NVML_ERROR_NOT_SUPPORTED if the device doesn't support this feature
- * - \ref NVML_ERROR_IN_USE if the device is still in use and cannot be removed
- */
-nvmlReturn_t DECLDIR nvmlDeviceRemoveGpu (nvmlPciInfo_t *pciInfo, nvmlDetachGpuState_t gpuState, nvmlPcieLinkState_t linkState);
-
-/**
- * Request the OS and the NVIDIA kernel driver to rediscover a portion of the PCI subsystem looking for GPUs that
- * were previously removed. The portion of the PCI tree can be narrowed by specifying a domain, bus, and device.
- * If all are zeroes then the entire PCI tree will be searched. Please note that for long-running NVML processes
- * the enumeration will change based on how many GPUs are discovered and where they are inserted in bus order.
- *
- * In addition, all newly discovered GPUs will be initialized and their ECC scrubbed which may take several seconds
- * per GPU. Also, all device handles are no longer guaranteed to be valid post discovery.
- *
- * Must be run as administrator.
- * For Linux only.
- *
- * For Pascal &tm; or newer fully supported devices.
- * Some Kepler devices supported.
- *
- * @param pciInfo The PCI tree to be searched. Only the domain, bus, and device
- * fields are used in this call.
- *
- * @return
- * - \ref NVML_SUCCESS if counters were successfully reset
- * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized
- * - \ref NVML_ERROR_INVALID_ARGUMENT if \a pciInfo is invalid
- * - \ref NVML_ERROR_NOT_SUPPORTED if the operating system does not support this feature
- * - \ref NVML_ERROR_OPERATING_SYSTEM if the operating system is denying this feature
- * - \ref NVML_ERROR_NO_PERMISSION if the calling process has insufficient permissions to perform operation
- * - \ref NVML_ERROR_UNKNOWN on any unexpected error
- */
-nvmlReturn_t DECLDIR nvmlDeviceDiscoverGpus (nvmlPciInfo_t *pciInfo);
-
-/** @} */
-
-/***************************************************************************************************/
-/** @defgroup nvmlFieldValueQueries Field Value Queries
- * This chapter describes NVML operations that are associated with retrieving Field Values from NVML
- * @{
- */
-/***************************************************************************************************/
-
-/**
- * Request values for a list of fields for a device. This API allows multiple fields to be queried at once.
- * If any of the underlying fieldIds are populated by the same driver call, the results for those field IDs
- * will be populated from a single call rather than making a driver call for each fieldId.
- *
- * @param device The device handle of the GPU to request field values for
- * @param valuesCount Number of entries in values that should be retrieved
- * @param values Array of \a valuesCount structures to hold field values.
- * Each value's fieldId must be populated prior to this call
- *
- * @return
- * - \ref NVML_SUCCESS if any values in \a values were populated. Note that you must
- * check the nvmlReturn field of each value for each individual
- * status
- * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a values is NULL
- */
-nvmlReturn_t DECLDIR nvmlDeviceGetFieldValues(nvmlDevice_t device, int valuesCount, nvmlFieldValue_t *values);
-
-
-/** @} */
-
-/***************************************************************************************************/
-/** @defgroup nvmlGridQueries Grid Queries
- * This chapter describes NVML operations that are associated with NVIDIA GRID products.
- * @{
- */
-/***************************************************************************************************/
-
-/**
- * This method is used to get the virtualization mode corresponding to the GPU.
- *
- * For Kepler &tm; or newer fully supported devices.
- *
- * @param device Identifier of the target device
- * @param pVirtualMode Reference to virtualization mode. One of NVML_GPU_VIRTUALIZATION_?
- *
- * @return
- * - \ref NVML_SUCCESS if \a pVirtualMode is fetched
- * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized
- * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a pVirtualMode is NULL
- * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible
- * - \ref NVML_ERROR_UNKNOWN on any unexpected error
- */
-nvmlReturn_t DECLDIR nvmlDeviceGetVirtualizationMode(nvmlDevice_t device, nvmlGpuVirtualizationMode_t *pVirtualMode);
-
-/** @} */
-
-/***************************************************************************************************/
-/** @defgroup nvmlGridCommands Grid Commands
- * This chapter describes NVML operations that are associated with NVIDIA GRID products.
- * @{
- */
-/***************************************************************************************************/
-
-/**
- * This method is used to set the virtualization mode corresponding to the GPU.
- *
- * For Kepler &tm; or newer fully supported devices.
- *
- * @param device Identifier of the target device
- * @param virtualMode virtualization mode. One of NVML_GPU_VIRTUALIZATION_?
- *
- * @return
- * - \ref NVML_SUCCESS if \a pVirtualMode is set
- * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized
- * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a pVirtualMode is NULL
- * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible
- * - \ref NVML_ERROR_NOT_SUPPORTED if setting of virtualization mode is not supported.
- * - \ref NVML_ERROR_NO_PERMISSION if setting of virtualization mode is not allowed for this client.
- */
-nvmlReturn_t DECLDIR nvmlDeviceSetVirtualizationMode(nvmlDevice_t device, nvmlGpuVirtualizationMode_t virtualMode);
-
-/** @} */
-
-/***************************************************************************************************/
-/** @defgroup nvmlVgpu vGPU Management
- * @{
- *
- * Set of APIs supporting GRID vGPU
- */
-/***************************************************************************************************/
-
-/**
- * Retrieve the supported vGPU types on a physical GPU (device).
- *
- * An array of supported vGPU types for the physical GPU indicated by \a device is returned in the caller-supplied buffer
- * pointed at by \a vgpuTypeIds. The element count of nvmlVgpuTypeId_t array is passed in \a vgpuCount, and \a vgpuCount
- * is used to return the number of vGPU types written to the buffer.
- *
- * If the supplied buffer is not large enough to accomodate the vGPU type array, the function returns
- * NVML_ERROR_INSUFFICIENT_SIZE, with the element count of nvmlVgpuTypeId_t array required in \a vgpuCount.
- * To query the number of vGPU types supported for the GPU, call this function with *vgpuCount = 0.
- * The code will return NVML_ERROR_INSUFFICIENT_SIZE, or NVML_SUCCESS if no vGPU types are supported.
- *
- * @param device The identifier of the target device
- * @param vgpuCount Pointer to caller-supplied array size, and returns number of vGPU types
- * @param vgpuTypeIds Pointer to caller-supplied array in which to return list of vGPU types
- *
- * @return
- * - \ref NVML_SUCCESS successful completion
- * - \ref NVML_ERROR_INSUFFICIENT_SIZE \a vgpuTypeIds buffer is too small, array element count is returned in \a vgpuCount
- * - \ref NVML_ERROR_INVALID_ARGUMENT if \a vgpuCount is NULL or \a device is invalid
- * - \ref NVML_ERROR_NOT_SUPPORTED if vGPU is not supported by the device
- * - \ref NVML_ERROR_VGPU_ECC_NOT_SUPPORTED if ECC is enabled on the device
- * - \ref NVML_ERROR_UNKNOWN on any unexpected error
- */
-nvmlReturn_t DECLDIR nvmlDeviceGetSupportedVgpus(nvmlDevice_t device, unsigned int *vgpuCount, nvmlVgpuTypeId_t *vgpuTypeIds);
-
-/**
- * Retrieve the currently creatable vGPU types on a physical GPU (device).
- *
- * An array of creatable vGPU types for the physical GPU indicated by \a device is returned in the caller-supplied buffer
- * pointed at by \a vgpuTypeIds. The element count of nvmlVgpuTypeId_t array is passed in \a vgpuCount, and \a vgpuCount
- * is used to return the number of vGPU types written to the buffer.
- *
- * The creatable vGPU types for a device may differ over time, as there may be restrictions on what type of vGPU types
- * can concurrently run on a device. For example, if only one vGPU type is allowed at a time on a device, then the creatable
- * list will be restricted to whatever vGPU type is already running on the device.
- *
- * If the supplied buffer is not large enough to accomodate the vGPU type array, the function returns
- * NVML_ERROR_INSUFFICIENT_SIZE, with the element count of nvmlVgpuTypeId_t array required in \a vgpuCount.
- * To query the number of vGPU types createable for the GPU, call this function with *vgpuCount = 0.
- * The code will return NVML_ERROR_INSUFFICIENT_SIZE, or NVML_SUCCESS if no vGPU types are creatable.
- *
- * @param device The identifier of the target device
- * @param vgpuCount Pointer to caller-supplied array size, and returns number of vGPU types
- * @param vgpuTypeIds Pointer to caller-supplied array in which to return list of vGPU types
- *
- * @return
- * - \ref NVML_SUCCESS successful completion
- * - \ref NVML_ERROR_INSUFFICIENT_SIZE \a vgpuTypeIds buffer is too small, array element count is returned in \a vgpuCount
- * - \ref NVML_ERROR_INVALID_ARGUMENT if \a vgpuCount is NULL
- * - \ref NVML_ERROR_NOT_SUPPORTED if vGPU is not supported by the device
- * - \ref NVML_ERROR_VGPU_ECC_NOT_SUPPORTED if ECC is enabled on the device
- * - \ref NVML_ERROR_UNKNOWN on any unexpected error
- */
-nvmlReturn_t DECLDIR nvmlDeviceGetCreatableVgpus(nvmlDevice_t device, unsigned int *vgpuCount, nvmlVgpuTypeId_t *vgpuTypeIds);
-
-/**
- * Retrieve the class of a vGPU type. It will not exceed 64 characters in length (including the NUL terminator).
- * See \ref nvmlConstants::NVML_DEVICE_NAME_BUFFER_SIZE.
- *
- * For Kepler &tm; or newer fully supported devices.
- *
- * @param vgpuTypeId Handle to vGPU type
- * @param vgpuTypeClass Pointer to string array to return class in
- * @param size Size of string
- *
- * @return
- * - \ref NVML_SUCCESS successful completion
- * - \ref NVML_ERROR_INVALID_ARGUMENT if \a vgpuTypeId is invalid, or \a vgpuTypeClass is NULL
- * - \ref NVML_ERROR_INSUFFICIENT_SIZE if \a size is too small
- * - \ref NVML_ERROR_UNKNOWN on any unexpected error
- */
-nvmlReturn_t DECLDIR nvmlVgpuTypeGetClass(nvmlVgpuTypeId_t vgpuTypeId, char *vgpuTypeClass, unsigned int *size);
-
-/**
- * Retrieve the vGPU type name.
- *
- * The name is an alphanumeric string that denotes a particular vGPU, e.g. GRID M60-2Q. It will not
- * exceed 64 characters in length (including the NUL terminator). See \ref
- * nvmlConstants::NVML_DEVICE_NAME_BUFFER_SIZE.
- *
- * For Kepler &tm; or newer fully supported devices.
- *
- * @param vgpuTypeId Handle to vGPU type
- * @param vgpuTypeName Pointer to buffer to return name
- * @param size Size of buffer
- *
- * @return
- * - \ref NVML_SUCCESS successful completion
- * - \ref NVML_ERROR_INVALID_ARGUMENT if \a vgpuTypeId is invalid, or \a name is NULL
- * - \ref NVML_ERROR_INSUFFICIENT_SIZE if \a size is too small
- * - \ref NVML_ERROR_UNKNOWN on any unexpected error
- */
-nvmlReturn_t DECLDIR nvmlVgpuTypeGetName(nvmlVgpuTypeId_t vgpuTypeId, char *vgpuTypeName, unsigned int *size);
-
-/**
- * Retrieve the device ID of a vGPU type.
- *
- * For Kepler &tm; or newer fully supported devices.
- *
- * @param vgpuTypeId Handle to vGPU type
- * @param deviceID Device ID and vendor ID of the device contained in single 32 bit value
- * @param subsystemID Subsytem ID and subsytem vendor ID of the device contained in single 32 bit value
- *
- * @return
- * - \ref NVML_SUCCESS successful completion
- * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized
- * - \ref NVML_ERROR_INVALID_ARGUMENT if \a vgpuTypeId is invalid, or \a deviceId or \a subsystemID are NULL
- * - \ref NVML_ERROR_UNKNOWN on any unexpected error
- */
-nvmlReturn_t DECLDIR nvmlVgpuTypeGetDeviceID(nvmlVgpuTypeId_t vgpuTypeId, unsigned long long *deviceID, unsigned long long *subsystemID);
-
-/**
- * Retrieve the vGPU framebuffer size in bytes.
- *
- * For Kepler &tm; or newer fully supported devices.
- *
- * @param vgpuTypeId Handle to vGPU type
- * @param fbSize Pointer to framebuffer size in bytes
- *
- * @return
- * - \ref NVML_SUCCESS successful completion
- * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized
- * - \ref NVML_ERROR_INVALID_ARGUMENT if \a vgpuTypeId is invalid, or \a fbSize is NULL
- * - \ref NVML_ERROR_UNKNOWN on any unexpected error
- */
-nvmlReturn_t DECLDIR nvmlVgpuTypeGetFramebufferSize(nvmlVgpuTypeId_t vgpuTypeId, unsigned long long *fbSize);
-
-/**
- * Retrieve count of vGPU's supported display heads.
- *
- * For Kepler &tm; or newer fully supported devices.
- *
- * @param vgpuTypeId Handle to vGPU type
- * @param numDisplayHeads Pointer to number of display heads
- *
- * @return
- * - \ref NVML_SUCCESS successful completion
- * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized
- * - \ref NVML_ERROR_INVALID_ARGUMENT if \a vgpuTypeId is invalid, or \a numDisplayHeads is NULL
- * - \ref NVML_ERROR_UNKNOWN on any unexpected error
- */
-nvmlReturn_t DECLDIR nvmlVgpuTypeGetNumDisplayHeads(nvmlVgpuTypeId_t vgpuTypeId, unsigned int *numDisplayHeads);
-
-/**
- * Retrieve vGPU display head's maximum supported resolution.
- *
- * For Kepler &tm; or newer fully supported devices.
- *
- * @param vgpuTypeId Handle to vGPU type
- * @param displayIndex Zero-based index of display head
- * @param xdim Pointer to maximum number of pixels in X dimension
- * @param ydim Pointer to maximum number of pixels in Y dimension
- *
- * @return
- * - \ref NVML_SUCCESS successful completion
- * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized
- * - \ref NVML_ERROR_INVALID_ARGUMENT if \a vgpuTypeId is invalid, or \a xdim or \a ydim are NULL, or \a displayIndex
- * is out of range.
- * - \ref NVML_ERROR_UNKNOWN on any unexpected error
- */
-nvmlReturn_t DECLDIR nvmlVgpuTypeGetResolution(nvmlVgpuTypeId_t vgpuTypeId, unsigned int displayIndex, unsigned int *xdim, unsigned int *ydim);
-
-/**
- * Retrieve license requirements for a vGPU type
- *
- * The license type and version required to run the specified vGPU type is returned as an alphanumeric string, in the form
- * ",", for example "GRID-Virtual-PC,2.0". If a vGPU is runnable with* more than one type of license,
- * the licenses are delimited by a semicolon, for example "GRID-Virtual-PC,2.0;GRID-Virtual-WS,2.0;GRID-Virtual-WS-Ext,2.0".
- *
- * The total length of the returned string will not exceed 128 characters, including the NUL terminator.
- * See \ref nvmlVgpuConstants::NVML_GRID_LICENSE_BUFFER_SIZE.
- *
- * For Kepler &tm; or newer fully supported devices.
- *
- * @param vgpuTypeId Handle to vGPU type
- * @param vgpuTypeLicenseString Pointer to buffer to return license info
- * @param size Size of \a vgpuTypeLicenseString buffer
- *
- * @return
- * - \ref NVML_SUCCESS successful completion
- * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized
- * - \ref NVML_ERROR_INVALID_ARGUMENT if \a vgpuTypeId is invalid, or \a vgpuTypeLicenseString is NULL
- * - \ref NVML_ERROR_INSUFFICIENT_SIZE if \a size is too small
- * - \ref NVML_ERROR_UNKNOWN on any unexpected error
- */
-nvmlReturn_t DECLDIR nvmlVgpuTypeGetLicense(nvmlVgpuTypeId_t vgpuTypeId, char *vgpuTypeLicenseString, unsigned int size);
-
-/**
- * Retrieve the static frame rate limit value of the vGPU type
- *
- * For Kepler &tm; or newer fully supported devices.
- *
- * @param vgpuTypeId Handle to vGPU type
- * @param frameRateLimit Reference to return the frame rate limit value
- * @return
- * - \ref NVML_SUCCESS successful completion
- * - \ref NVML_ERROR_NOT_SUPPORTED if frame rate limiter is turned off for the vGPU type
- * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized
- * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid, or \a frameRateLimit is NULL
- * - \ref NVML_ERROR_UNKNOWN on any unexpected error
- */
-nvmlReturn_t DECLDIR nvmlVgpuTypeGetFrameRateLimit(nvmlVgpuTypeId_t vgpuTypeId, unsigned int *frameRateLimit);
-
-/**
- * Retrieve the maximum number of vGPU instances creatable on a device for given vGPU type
- *
- * For Kepler &tm; or newer fully supported devices.
- *
- * @param device The identifier of the target device
- * @param vgpuTypeId Handle to vGPU type
- * @param vgpuInstanceCount Pointer to get the max number of vGPU instances
- * that can be created on a deicve for given vgpuTypeId
- * @return
- * - \ref NVML_SUCCESS successful completion
- * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized
- * - \ref NVML_ERROR_INVALID_ARGUMENT if \a vgpuTypeId is invalid or is not supported on target device,
- * or \a vgpuInstanceCount is NULL
- * - \ref NVML_ERROR_UNKNOWN on any unexpected error
- */
-nvmlReturn_t DECLDIR nvmlVgpuTypeGetMaxInstances(nvmlDevice_t device, nvmlVgpuTypeId_t vgpuTypeId, unsigned int *vgpuInstanceCount);
-
-/**
- * Retrieve the active vGPU instances on a device.
- *
- * An array of active vGPU instances is returned in the caller-supplied buffer pointed at by \a vgpuInstances. The
- * array elememt count is passed in \a vgpuCount, and \a vgpuCount is used to return the number of vGPU instances
- * written to the buffer.
- *
- * If the supplied buffer is not large enough to accomodate the vGPU instance array, the function returns
- * NVML_ERROR_INSUFFICIENT_SIZE, with the element count of nvmlVgpuInstance_t array required in \a vgpuCount.
- * To query the number of active vGPU instances, call this function with *vgpuCount = 0. The code will return
- * NVML_ERROR_INSUFFICIENT_SIZE, or NVML_SUCCESS if no vGPU Types are supported.
- *
- * For Kepler &tm; or newer fully supported devices.
- *
- * @param device The identifier of the target device
- * @param vgpuCount Pointer which passes in the array size as well as get
- * back the number of types
- * @param vgpuInstances Pointer to array in which to return list of vGPU instances
- *
- * @return
- * - \ref NVML_SUCCESS successful completion
- * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized
- * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid, or \a vgpuCount is NULL
- * - \ref NVML_ERROR_INSUFFICIENT_SIZE if \a size is too small
- * - \ref NVML_ERROR_NOT_SUPPORTED if vGPU is not supported by the device
- * - \ref NVML_ERROR_UNKNOWN on any unexpected error
- */
-nvmlReturn_t DECLDIR nvmlDeviceGetActiveVgpus(nvmlDevice_t device, unsigned int *vgpuCount, nvmlVgpuInstance_t *vgpuInstances);
-
-/**
- * Retrieve the VM ID associated with a vGPU instance.
- *
- * The VM ID is returned as a string, not exceeding 80 characters in length (including the NUL terminator).
- * See \ref nvmlConstants::NVML_DEVICE_UUID_BUFFER_SIZE.
- *
- * The format of the VM ID varies by platform, and is indicated by the type identifier returned in \a vmIdType.
- *
- * For Kepler &tm; or newer fully supported devices.
- *
- * @param vgpuInstance Identifier of the target vGPU instance
- * @param vmId Pointer to caller-supplied buffer to hold VM ID
- * @param size Size of buffer in bytes
- * @param vmIdType Pointer to hold VM ID type
- *
- * @return
- * - \ref NVML_SUCCESS successful completion
- * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized
- * - \ref NVML_ERROR_INVALID_ARGUMENT if \a vgpuInstance is invalid, or \a vmId or \a vmIdType are NULL
- * - \ref NVML_ERROR_INSUFFICIENT_SIZE if \a size is too small
- * - \ref NVML_ERROR_UNKNOWN on any unexpected error
- */
-nvmlReturn_t DECLDIR nvmlVgpuInstanceGetVmID(nvmlVgpuInstance_t vgpuInstance, char *vmId, unsigned int size, nvmlVgpuVmIdType_t *vmIdType);
-
-/**
- * Retrieve the UUID of a vGPU instance.
- *
- * The UUID is a globally unique identifier associated with the vGPU, and is returned as a 5-part hexadecimal string,
- * not exceeding 80 characters in length (including the NULL terminator).
- * See \ref nvmlConstants::NVML_DEVICE_UUID_BUFFER_SIZE.
- *
- * For Kepler &tm; or newer fully supported devices.
- *
- * @param vgpuInstance Identifier of the target vGPU instance
- * @param uuid Pointer to caller-supplied buffer to hold vGPU UUID
- * @param size Size of buffer in bytes
- *
- * @return
- * - \ref NVML_SUCCESS successful completion
- * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized
- * - \ref NVML_ERROR_INVALID_ARGUMENT if \a vgpuInstance is invalid, or \a uuid is NULL
- * - \ref NVML_ERROR_INSUFFICIENT_SIZE if \a size is too small
- * - \ref NVML_ERROR_UNKNOWN on any unexpected error
- */
-nvmlReturn_t DECLDIR nvmlVgpuInstanceGetUUID(nvmlVgpuInstance_t vgpuInstance, char *uuid, unsigned int size);
-
-/**
- * Retrieve the NVIDIA driver version installed in the VM associated with a vGPU.
- *
- * The version is returned as an alphanumeric string in the caller-supplied buffer \a version. The length of the version
- * string will not exceed 80 characters in length (including the NUL terminator).
- * See \ref nvmlConstants::NVML_SYSTEM_DRIVER_VERSION_BUFFER_SIZE.
- *
- * nvmlVgpuInstanceGetVmDriverVersion() may be called at any time for a vGPU instance. The guest VM driver version is
- * returned as "Unknown" if no NVIDIA driver is installed in the VM, or the VM has not yet booted to the point where the
- * NVIDIA driver is loaded and initialized.
- *
- * For Kepler &tm; or newer fully supported devices.
- *
- * @param vgpuInstance Identifier of the target vGPU instance
- * @param version Caller-supplied buffer to return driver version string
- * @param length Size of \a version buffer
- *
- * @return
- * - \ref NVML_SUCCESS if \a version has been set
- * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized
- * - \ref NVML_ERROR_INVALID_ARGUMENT if \a vgpuInstance is invalid
- * - \ref NVML_ERROR_INSUFFICIENT_SIZE if \a length is too small
- * - \ref NVML_ERROR_UNKNOWN on any unexpected error
- */
-nvmlReturn_t DECLDIR nvmlVgpuInstanceGetVmDriverVersion(nvmlVgpuInstance_t vgpuInstance, char* version, unsigned int length);
-
-/**
- * Retrieve the framebuffer usage in bytes.
- *
- * Framebuffer usage is the amont of vGPU framebuffer memory that is currently in use by the VM.
- *
- * For Kepler &tm; or newer fully supported devices.
- *
- * @param vgpuInstance The identifier of the target instance
- * @param fbUsage Pointer to framebuffer usage in bytes
- *
- * @return
- * - \ref NVML_SUCCESS successful completion
- * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized
- * - \ref NVML_ERROR_INVALID_ARGUMENT if \a vgpuInstance is invalid, or \a fbUsage is NULL
- * - \ref NVML_ERROR_UNKNOWN on any unexpected error
- */
-nvmlReturn_t DECLDIR nvmlVgpuInstanceGetFbUsage(nvmlVgpuInstance_t vgpuInstance, unsigned long long *fbUsage);
-
-/**
- * Retrieve the current licensing state of the vGPU instance.
- *
- * If the vGPU is currently licensed, \a licensed is set to 1, otherwise it is set to 0.
- *
- * For Kepler &tm; or newer fully supported devices.
- *
- * @param vgpuInstance Identifier of the target vGPU instance
- * @param licensed Reference to return the licensing status
- *
- * @return
- * - \ref NVML_SUCCESS if \a licensed has been set
- * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized
- * - \ref NVML_ERROR_INVALID_ARGUMENT if \a vgpuInstance is invalid, or \a licensed is NULL
- * - \ref NVML_ERROR_UNKNOWN on any unexpected error
- */
-nvmlReturn_t DECLDIR nvmlVgpuInstanceGetLicenseStatus(nvmlVgpuInstance_t vgpuInstance, unsigned int *licensed);
-
-/**
- * Retrieve the vGPU type of a vGPU instance.
- *
- * Returns the vGPU type ID of vgpu assigned to the vGPU instance.
- *
- * For Kepler &tm; or newer fully supported devices.
- *
- * @param vgpuInstance Identifier of the target vGPU instance
- * @param vgpuTypeId Reference to return the vgpuTypeId
- *
- * @return
- * - \ref NVML_SUCCESS if \a vgpuTypeId has been set
- * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized
- * - \ref NVML_ERROR_INVALID_ARGUMENT if \a vgpuInstance is invalid, or \a vgpuTypeId is NULL
- * - \ref NVML_ERROR_UNKNOWN on any unexpected error
- */
-nvmlReturn_t DECLDIR nvmlVgpuInstanceGetType(nvmlVgpuInstance_t vgpuInstance, nvmlVgpuTypeId_t *vgpuTypeId);
-
-/**
- * Retrieve the frame rate limit set for the vGPU instance.
- *
- * Returns the value of the frame rate limit set for the vGPU instance
- *
- * For Kepler &tm; or newer fully supported devices.
- *
- * @param vgpuInstance Identifier of the target vGPU instance
- * @param frameRateLimit Reference to return the frame rate limit
- *
- * @return
- * - \ref NVML_SUCCESS if \a frameRateLimit has been set
- * - \ref NVML_ERROR_NOT_SUPPORTED if frame rate limiter is turned off for the vGPU type
- * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized
- * - \ref NVML_ERROR_INVALID_ARGUMENT if \a vgpuInstance is invalid, or \a frameRateLimit is NULL
- * - \ref NVML_ERROR_UNKNOWN on any unexpected error
- */
-nvmlReturn_t DECLDIR nvmlVgpuInstanceGetFrameRateLimit(nvmlVgpuInstance_t vgpuInstance, unsigned int *frameRateLimit);
-
-/**
- * Retrieve the encoder capacity of a vGPU instance, as a percentage of maximum encoder capacity with valid values in the range 0-100.
- *
- * For Maxwell &tm; or newer fully supported devices.
- *
- * @param vgpuInstance Identifier of the target vGPU instance
- * @param encoderCapacity Reference to an unsigned int for the encoder capacity
- *
- * @return
- * - \ref NVML_SUCCESS if \a encoderCapacity has been retrived
- * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized
- * - \ref NVML_ERROR_INVALID_ARGUMENT if \a vgpuInstance is invalid, or \a encoderQueryType is invalid
- * - \ref NVML_ERROR_UNKNOWN on any unexpected error
- */
-nvmlReturn_t DECLDIR nvmlVgpuInstanceGetEncoderCapacity(nvmlVgpuInstance_t vgpuInstance, unsigned int *encoderCapacity);
-
-/**
- * Set the encoder capacity of a vGPU instance, as a percentage of maximum encoder capacity with valid values in the range 0-100.
- *
- * For Maxwell &tm; or newer fully supported devices.
- *
- * @param vgpuInstance Identifier of the target vGPU instance
- * @param encoderCapacity Unsigned int for the encoder capacity value
- *
- * @return
- * - \ref NVML_SUCCESS if \a encoderCapacity has been set
- * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized
- * - \ref NVML_ERROR_INVALID_ARGUMENT if \a vgpuInstance is invalid
- * - \ref NVML_ERROR_UNKNOWN on any unexpected error
- */
-nvmlReturn_t DECLDIR nvmlVgpuInstanceSetEncoderCapacity(nvmlVgpuInstance_t vgpuInstance, unsigned int encoderCapacity);
-
-/**
- * Retrieves current utilization for vGPUs on a physical GPU (device).
- *
- * For Kepler &tm; or newer fully supported devices.
- *
- * Reads recent utilization of GPU SM (3D/Compute), framebuffer, video encoder, and video decoder for vGPU instances running
- * on a device. Utilization values are returned as an array of utilization sample structures in the caller-supplied buffer
- * pointed at by \a utilizationSamples. One utilization sample structure is returned per vGPU instance, and includes the
- * CPU timestamp at which the samples were recorded. Individual utilization values are returned as "unsigned int" values
- * in nvmlValue_t unions. The function sets the caller-supplied \a sampleValType to NVML_VALUE_TYPE_UNSIGNED_INT to
- * indicate the returned value type.
- *
- * To read utilization values, first determine the size of buffer required to hold the samples by invoking the function with
- * \a utilizationSamples set to NULL. The function will return NVML_ERROR_INSUFFICIENT_SIZE, with the current vGPU instance
- * count in \a vgpuInstanceSamplesCount, or NVML_SUCCESS if the current vGPU instance count is zero. The caller should allocate
- * a buffer of size vgpuInstanceSamplesCount * sizeof(nvmlVgpuInstanceUtilizationSample_t). Invoke the function again with
- * the allocated buffer passed in \a utilizationSamples, and \a vgpuInstanceSamplesCount set to the number of entries the
- * buffer is sized for.
- *
- * On successful return, the function updates \a vgpuInstanceSampleCount with the number of vGPU utilization sample
- * structures that were actually written. This may differ from a previously read value as vGPU instances are created or
- * destroyed.
- *
- * lastSeenTimeStamp represents the CPU timestamp in microseconds at which utilization samples were last read. Set it to 0
- * to read utilization based on all the samples maintained by the driver's internal sample buffer. Set lastSeenTimeStamp
- * to a timeStamp retrieved from a previous query to read utilization since the previous query.
- *
- * @param device The identifier for the target device
- * @param lastSeenTimeStamp Return only samples with timestamp greater than lastSeenTimeStamp.
- * @param sampleValType Pointer to caller-supplied buffer to hold the type of returned sample values
- * @param vgpuInstanceSamplesCount Pointer to caller-supplied array size, and returns number of vGPU instances
- * @param utilizationSamples Pointer to caller-supplied buffer in which vGPU utilization samples are returned
-
- * @return
- * - \ref NVML_SUCCESS if utilization samples are successfully retrieved
- * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized
- * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid, \a vgpuInstanceSamplesCount or \a sampleValType is
- * NULL, or a sample count of 0 is passed with a non-NULL \a utilizationSamples
- * - \ref NVML_ERROR_INSUFFICIENT_SIZE if supplied \a vgpuInstanceSamplesCount is too small to return samples for all
- * vGPU instances currently executing on the device
- * - \ref NVML_ERROR_NOT_SUPPORTED if vGPU is not supported by the device
- * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible
- * - \ref NVML_ERROR_NOT_FOUND if sample entries are not found
- * - \ref NVML_ERROR_UNKNOWN on any unexpected error
- */
-nvmlReturn_t DECLDIR nvmlDeviceGetVgpuUtilization(nvmlDevice_t device, unsigned long long lastSeenTimeStamp,
- nvmlValueType_t *sampleValType, unsigned int *vgpuInstanceSamplesCount,
- nvmlVgpuInstanceUtilizationSample_t *utilizationSamples);
-
-/**
- * Retrieves current utilization for processes running on vGPUs on a physical GPU (device).
- *
- * For Maxwell &tm; or newer fully supported devices.
- *
- * Reads recent utilization of GPU SM (3D/Compute), framebuffer, video encoder, and video decoder for processes running on
- * vGPU instances active on a device. Utilization values are returned as an array of utilization sample structures in the
- * caller-supplied buffer pointed at by \a utilizationSamples. One utilization sample structure is returned per process running
- * on vGPU instances, that had some non-zero utilization during the last sample period. It includes the CPU timestamp at which
- * the samples were recorded. Individual utilization values are returned as "unsigned int" values.
- *
- * To read utilization values, first determine the size of buffer required to hold the samples by invoking the function with
- * \a utilizationSamples set to NULL. The function will return NVML_ERROR_INSUFFICIENT_SIZE, with the current vGPU instance
- * count in \a vgpuProcessSamplesCount. The caller should allocate a buffer of size
- * vgpuProcessSamplesCount * sizeof(nvmlVgpuProcessUtilizationSample_t). Invoke the function again with
- * the allocated buffer passed in \a utilizationSamples, and \a vgpuProcessSamplesCount set to the number of entries the
- * buffer is sized for.
- *
- * On successful return, the function updates \a vgpuSubProcessSampleCount with the number of vGPU sub process utilization sample
- * structures that were actually written. This may differ from a previously read value depending on the number of processes that are active
- * in any given sample period.
- *
- * lastSeenTimeStamp represents the CPU timestamp in microseconds at which utilization samples were last read. Set it to 0
- * to read utilization based on all the samples maintained by the driver's internal sample buffer. Set lastSeenTimeStamp
- * to a timeStamp retrieved from a previous query to read utilization since the previous query.
- *
- * @param device The identifier for the target device
- * @param lastSeenTimeStamp Return only samples with timestamp greater than lastSeenTimeStamp.
- * @param vgpuProcessSamplesCount Pointer to caller-supplied array size, and returns number of processes running on vGPU instances
- * @param utilizationSamples Pointer to caller-supplied buffer in which vGPU sub process utilization samples are returned
-
- * @return
- * - \ref NVML_SUCCESS if utilization samples are successfully retrieved
- * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized
- * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid, \a vgpuProcessSamplesCount or a sample count of 0 is
- * passed with a non-NULL \a utilizationSamples
- * - \ref NVML_ERROR_INSUFFICIENT_SIZE if supplied \a vgpuProcessSamplesCount is too small to return samples for all
- * vGPU instances currently executing on the device
- * - \ref NVML_ERROR_NOT_SUPPORTED if vGPU is not supported by the device
- * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible
- * - \ref NVML_ERROR_NOT_FOUND if sample entries are not found
- * - \ref NVML_ERROR_UNKNOWN on any unexpected error
- */
-nvmlReturn_t DECLDIR nvmlDeviceGetVgpuProcessUtilization(nvmlDevice_t device, unsigned long long lastSeenTimeStamp,
- unsigned int *vgpuProcessSamplesCount,
- nvmlVgpuProcessUtilizationSample_t *utilizationSamples);
-/**
- * Retrieve the GRID licensable features.
- *
- * Identifies whether the system supports GRID Software Licensing. If it does, return the list of licensable feature(s)
- * and their current license status.
- *
- * @param device Identifier of the target device
- * @param pGridLicensableFeatures Pointer to structure in which GRID licensable features are returned
- *
- * @return
- * - \ref NVML_SUCCESS if licensable features are successfully retrieved
- * - \ref NVML_ERROR_INVALID_ARGUMENT if \a pGridLicensableFeatures is NULL
- * - \ref NVML_ERROR_UNKNOWN on any unexpected error
- */
-nvmlReturn_t DECLDIR nvmlDeviceGetGridLicensableFeatures(nvmlDevice_t device, nvmlGridLicensableFeatures_t *pGridLicensableFeatures);
-
-/**
- * Retrieves the current encoder statistics of a vGPU Instance
- *
- * For Maxwell &tm; or newer fully supported devices.
- *
- * @param vgpuInstance Identifier of the target vGPU instance
- * @param sessionCount Reference to an unsigned int for count of active encoder sessions
- * @param averageFps Reference to an unsigned int for trailing average FPS of all active sessions
- * @param averageLatency Reference to an unsigned int for encode latency in microseconds
- *
- * @return
- * - \ref NVML_SUCCESS if \a sessionCount, \a averageFps and \a averageLatency is fetched
- * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized
- * - \ref NVML_ERROR_INVALID_ARGUMENT if \a sessionCount , or \a averageFps or \a averageLatency is NULL
- * or \a vgpuInstance is invalid.
- * - \ref NVML_ERROR_UNKNOWN on any unexpected error
- */
-nvmlReturn_t DECLDIR nvmlVgpuInstanceGetEncoderStats(nvmlVgpuInstance_t vgpuInstance, unsigned int *sessionCount,
- unsigned int *averageFps, unsigned int *averageLatency);
-
-/**
- * Retrieves information about all active encoder sessions on a vGPU Instance.
- *
- * An array of active encoder sessions is returned in the caller-supplied buffer pointed at by \a sessionInfo. The
- * array elememt count is passed in \a sessionCount, and \a sessionCount is used to return the number of sessions
- * written to the buffer.
- *
- * If the supplied buffer is not large enough to accomodate the active session array, the function returns
- * NVML_ERROR_INSUFFICIENT_SIZE, with the element count of nvmlEncoderSessionInfo_t array required in \a sessionCount.
- * To query the number of active encoder sessions, call this function with *sessionCount = 0. The code will return
- * NVML_SUCCESS with number of active encoder sessions updated in *sessionCount.
- *
- * For Maxwell &tm; or newer fully supported devices.
- *
- * @param vgpuInstance Identifier of the target vGPU instance
- * @param sessionCount Reference to caller supplied array size, and returns
- * the number of sessions.
- * @param sessionInfo Reference to caller supplied array in which the list
- * of session information us returned.
- *
- * @return
- * - \ref NVML_SUCCESS if \a sessionInfo is fetched
- * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized
- * - \ref NVML_ERROR_INSUFFICIENT_SIZE if \a sessionCount is too small, array element count is
- returned in \a sessionCount
- * - \ref NVML_ERROR_INVALID_ARGUMENT if \a sessionCount is NULL or \a vgpuInstance is invalid..
- * - \ref NVML_ERROR_UNKNOWN on any unexpected error
- */
-nvmlReturn_t DECLDIR nvmlVgpuInstanceGetEncoderSessions(nvmlVgpuInstance_t vgpuInstance, unsigned int *sessionCount, nvmlEncoderSessionInfo_t *sessionInfo);
-
-/**
- * Retrieves the current utilization and process ID
- *
- * For Maxwell &tm; or newer fully supported devices.
- *
- * Reads recent utilization of GPU SM (3D/Compute), framebuffer, video encoder, and video decoder for processes running.
- * Utilization values are returned as an array of utilization sample structures in the caller-supplied buffer pointed at
- * by \a utilization. One utilization sample structure is returned per process running, that had some non-zero utilization
- * during the last sample period. It includes the CPU timestamp at which the samples were recorded. Individual utilization values
- * are returned as "unsigned int" values.
- *
- * To read utilization values, first determine the size of buffer required to hold the samples by invoking the function with
- * \a utilization set to NULL. The caller should allocate a buffer of size
- * processSamplesCount * sizeof(nvmlProcessUtilizationSample_t). Invoke the function again with the allocated buffer passed
- * in \a utilization, and \a processSamplesCount set to the number of entries the buffer is sized for.
- *
- * On successful return, the function updates \a processSamplesCount with the number of process utilization sample
- * structures that were actually written. This may differ from a previously read value as instances are created or
- * destroyed.
- *
- * lastSeenTimeStamp represents the CPU timestamp in microseconds at which utilization samples were last read. Set it to 0
- * to read utilization based on all the samples maintained by the driver's internal sample buffer. Set lastSeenTimeStamp
- * to a timeStamp retrieved from a previous query to read utilization since the previous query.
- *
- * @param device The identifier of the target device
- * @param utilization Pointer to caller-supplied buffer in which guest process utilization samples are returned
- * @param processSamplesCount Pointer to caller-supplied array size, and returns number of processes running
- * @param lastSeenTimeStamp Return only samples with timestamp greater than lastSeenTimeStamp.
-
- * @return
- * - \ref NVML_SUCCESS if \a utilization has been populated
- * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized
- * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid, \a utilization is NULL, or \a samplingPeriodUs is NULL
- * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature
- * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible
- * - \ref NVML_ERROR_UNKNOWN on any unexpected error
- */
-nvmlReturn_t DECLDIR nvmlDeviceGetProcessUtilization(nvmlDevice_t device, nvmlProcessUtilizationSample_t *utilization,
- unsigned int *processSamplesCount, unsigned long long lastSeenTimeStamp);
-
-/** @} */
-
-/***************************************************************************************************/
-/** @defgroup nvml vGPU Migration
- * This chapter describes NVML operations that are associated with vGPU Migration.
- * @{
- */
-/***************************************************************************************************/
-
-/**
- * vGPU metadata structure.
- */
-typedef struct nvmlVgpuMetadata_st
-{
- unsigned int version; //!< Current version of the structure
- unsigned int revision; //!< Current revision of the structure
- nvmlVgpuGuestInfoState_t guestInfoState; //!< Current state of Guest-dependent fields
- char guestDriverVersion[NVML_SYSTEM_DRIVER_VERSION_BUFFER_SIZE]; //!< Version of driver installed in guest
- char hostDriverVersion[NVML_SYSTEM_DRIVER_VERSION_BUFFER_SIZE]; //!< Version of driver installed in host
- unsigned int reserved[8]; //!< Reserved for internal use
- unsigned int opaqueDataSize; //!< Size of opaque data field in bytes
- char opaqueData[4]; //!< Opaque data
-} nvmlVgpuMetadata_t;
-
-/**
- * Physical GPU metadata structure
- */
-typedef struct nvmlVgpuPgpuMetadata_st
-{
- unsigned int version; //!< Current version of the structure
- unsigned int revision; //!< Current revision of the structure
- char hostDriverVersion[NVML_SYSTEM_DRIVER_VERSION_BUFFER_SIZE]; //!< Host driver version
- unsigned int pgpuVirtualizationCaps; //!< Pgpu virtualizaion capabilities bitfileld
- unsigned int reserved[7]; //!< Reserved for internal use
- unsigned int opaqueDataSize; //!< Size of opaque data field in bytes
- char opaqueData[4]; //!< Opaque data
-} nvmlVgpuPgpuMetadata_t;
-
-/**
- * vGPU VM compatibility codes
- */
-typedef enum nvmlVgpuVmCompatibility_enum
-{
- NVML_VGPU_VM_COMPATIBILITY_NONE = 0x0, //!< vGPU is not runnable
- NVML_VGPU_VM_COMPATIBILITY_COLD = 0x1, //!< vGPU is runnable from a cold / powered-off state (ACPI S5)
- NVML_VGPU_VM_COMPATIBILITY_HIBERNATE = 0x2, //!< vGPU is runnable from a hibernated state (ACPI S4)
- NVML_VGPU_VM_COMPATIBILITY_SLEEP = 0x4, //!< vGPU is runnable from a sleeped state (ACPI S3)
- NVML_VGPU_VM_COMPATIBILITY_LIVE = 0x8, //!< vGPU is runnable from a live/paused (ACPI S0)
-} nvmlVgpuVmCompatibility_t;
-
-/**
- * vGPU-pGPU compatibility limit codes
- */
-typedef enum nvmlVgpuPgpuCompatibilityLimitCode_enum
-{
- NVML_VGPU_COMPATIBILITY_LIMIT_NONE = 0x0, //!< Compatibility is not limited.
- NVML_VGPU_COMPATIBILITY_LIMIT_HOST_DRIVER = 0x1, //!< Compatibility is limited by host driver version.
- NVML_VGPU_COMPATIBILITY_LIMIT_GUEST_DRIVER = 0x2, //!< Compatibility is limited by guest driver version.
- NVML_VGPU_COMPATIBILITY_LIMIT_GPU = 0x4, //!< Compatibility is limited by GPU hardware.
- NVML_VGPU_COMPATIBILITY_LIMIT_OTHER = 0x80000000, //!< Compatibility is limited by an undefined factor.
-} nvmlVgpuPgpuCompatibilityLimitCode_t;
-
-/**
- * vGPU-pGPU compatibility structure
- */
-typedef struct nvmlVgpuPgpuCompatibility_st
-{
- nvmlVgpuVmCompatibility_t vgpuVmCompatibility; //!< Compatibility of vGPU VM. See \ref nvmlVgpuVmCompatibility_t
- nvmlVgpuPgpuCompatibilityLimitCode_t compatibilityLimitCode; //!< Limiting factor for vGPU-pGPU compatibility. See \ref nvmlVgpuPgpuCompatibilityLimitCode_t
-} nvmlVgpuPgpuCompatibility_t;
-
-/**
- * Returns vGPU metadata structure for a running vGPU. The structure contains information about the vGPU and its associated VM
- * such as the currently installed NVIDIA guest driver version, together with host driver version and an opaque data section
- * containing internal state.
- *
- * nvmlVgpuInstanceGetMetadata() may be called at any time for a vGPU instance. Some fields in the returned structure are
- * dependent on information obtained from the guest VM, which may not yet have reached a state where that information
- * is available. The current state of these dependent fields is reflected in the info structure's \ref guestInfoState field.
- *
- * The VMM may choose to read and save the vGPU's VM info as persistent metadata associated with the VM, and provide
- * it to GRID Virtual GPU Manager when creating a vGPU for subsequent instances of the VM.
- *
- * The caller passes in a buffer via \a vgpuMetadata, with the size of the buffer in \a bufferSize. If the vGPU Metadata structure
- * is too large to fit in the supplied buffer, the function returns NVML_ERROR_INSUFFICIENT_SIZE with the size needed
- * in \a bufferSize.
- *
- * @param vgpuInstance vGPU instance handle
- * @param vgpuMetadata Pointer to caller-supplied buffer into which vGPU metadata is written
- * @param bufferSize Size of vgpuMetadata buffer
- *
- * @return
- * - \ref NVML_SUCCESS vGPU metadata structure was successfully returned
- * - \ref NVML_ERROR_INSUFFICIENT_SIZE vgpuMetadata buffer is too small, required size is returned in \a bufferSize
- * - \ref NVML_ERROR_INVALID_ARGUMENT if \a bufferSize is NULL or \a vgpuInstance is invalid; if \a vgpuMetadata is NULL and the value of \a bufferSize is not 0.
- * - \ref NVML_ERROR_UNKNOWN on any unexpected error
- */
-nvmlReturn_t DECLDIR nvmlVgpuInstanceGetMetadata(nvmlVgpuInstance_t vgpuInstance, nvmlVgpuMetadata_t *vgpuMetadata, unsigned int *bufferSize);
-
-/**
- * Returns a vGPU metadata structure for the physical GPU indicated by \a device. The structure contains information about
- * the GPU and the currently installed NVIDIA host driver version that's controlling it, together with an opaque data section
- * containing internal state.
- *
- * The caller passes in a buffer via \a pgpuMetadata, with the size of the buffer in \a bufferSize. If the \a pgpuMetadata
- * structure is too large to fit in the supplied buffer, the function returns NVML_ERROR_INSUFFICIENT_SIZE with the size needed
- * in \a bufferSize.
- *
- * @param device The identifier of the target device
- * @param pgpuMetadata Pointer to caller-supplied buffer into which \a pgpuMetadata is written
- * @param bufferSize Pointer to size of \a pgpuMetadata buffer
- *
- * @return
- * - \ref NVML_SUCCESS GPU metadata structure was successfully returned
- * - \ref NVML_ERROR_INSUFFICIENT_SIZE pgpuMetadata buffer is too small, required size is returned in \a bufferSize
- * - \ref NVML_ERROR_INVALID_ARGUMENT if \a bufferSize is NULL or \a device is invalid; if \a pgpuMetadata is NULL and the value of \a bufferSize is not 0.
- * - \ref NVML_ERROR_NOT_SUPPORTED vGPU is not supported by the system
- * - \ref NVML_ERROR_UNKNOWN on any unexpected error
- */
-nvmlReturn_t DECLDIR nvmlDeviceGetVgpuMetadata(nvmlDevice_t device, nvmlVgpuPgpuMetadata_t *pgpuMetadata, unsigned int *bufferSize);
-
-/**
- * Takes a vGPU instance metadata structure read from \ref nvmlVgpuInstanceGetMetadata(), and a vGPU metadata structure for a
- * physical GPU read from \ref nvmlDeviceGetVgpuMetadata(), and returns compatibility information of the vGPU instance and the
- * physical GPU.
- *
- * The caller passes in a buffer via \a compatibilityInfo, into which a compatibility information structure is written. The
- * structure defines the states in which the vGPU / VM may be booted on the physical GPU. If the vGPU / VM compatibility
- * with the physical GPU is limited, a limit code indicates the factor limiting compability.
- * (see \ref nvmlVgpuPgpuCompatibilityLimitCode_t for details).
- *
- * Note: vGPU compatibility does not take into account dynamic capacity conditions that may limit a system's ability to
- * boot a given vGPU or associated VM.
- *
- * @param vgpuMetadata Pointer to caller-supplied vGPU metadata structure
- * @param pgpuMetadata Pointer to caller-supplied GPU metadata structure
- * @param compatibilityInfo Pointer to caller-supplied buffer to hold compatibility info
- *
- * @return
- * - \ref NVML_SUCCESS vGPU metadata structure was successfully returned
- * - \ref NVML_ERROR_INVALID_ARGUMENT if \a vgpuMetadata or \a pgpuMetadata or \a bufferSize are NULL
- * - \ref NVML_ERROR_UNKNOWN on any unexpected error
- */
-nvmlReturn_t DECLDIR nvmlGetVgpuCompatibility(nvmlVgpuMetadata_t *vgpuMetadata, nvmlVgpuPgpuMetadata_t *pgpuMetadata, nvmlVgpuPgpuCompatibility_t *compatibilityInfo);
-
-/** @} */
-
-/**
- * NVML API versioning support
- */
-#if defined(__NVML_API_VERSION_INTERNAL)
-#undef nvmlDeviceRemoveGpu
-#undef nvmlDeviceGetNvLinkRemotePciInfo
-#undef nvmlDeviceGetPciInfo
-#undef nvmlDeviceGetCount
-#undef nvmlDeviceGetHandleByIndex
-#undef nvmlDeviceGetHandleByPciBusId
-#undef nvmlInit
-#endif
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif
diff --git a/vendor/github.com/NVIDIA/gpu-monitoring-tools/bindings/go/nvml/nvml_dl.c b/vendor/github.com/NVIDIA/gpu-monitoring-tools/bindings/go/nvml/nvml_dl.c
deleted file mode 100644
index a3d162c0e1bc..000000000000
--- a/vendor/github.com/NVIDIA/gpu-monitoring-tools/bindings/go/nvml/nvml_dl.c
+++ /dev/null
@@ -1,46 +0,0 @@
-// Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
-
-#include
-#include
-
-#include "nvml_dl.h"
-
-#define DLSYM(x, sym) \
-do { \
- dlerror(); \
- x = dlsym(handle, #sym); \
- if (dlerror() != NULL) { \
- return (NVML_ERROR_FUNCTION_NOT_FOUND); \
- } \
-} while (0)
-
-typedef nvmlReturn_t (*nvmlSym_t)();
-
-static void *handle;
-
-nvmlReturn_t NVML_DL(nvmlInit)(void)
-{
- handle = dlopen("libnvidia-ml.so.1", RTLD_LAZY | RTLD_GLOBAL);
- if (handle == NULL) {
- return (NVML_ERROR_LIBRARY_NOT_FOUND);
- }
- return (nvmlInit());
-}
-
-nvmlReturn_t NVML_DL(nvmlShutdown)(void)
-{
- nvmlReturn_t r = nvmlShutdown();
- if (r != NVML_SUCCESS) {
- return (r);
- }
- return (dlclose(handle) ? NVML_ERROR_UNKNOWN : NVML_SUCCESS);
-}
-
-nvmlReturn_t NVML_DL(nvmlDeviceGetTopologyCommonAncestor)(
- nvmlDevice_t dev1, nvmlDevice_t dev2, nvmlGpuTopologyLevel_t *info)
-{
- nvmlSym_t sym;
-
- DLSYM(sym, nvmlDeviceGetTopologyCommonAncestor);
- return ((*sym)(dev1, dev2, info));
-}
diff --git a/vendor/github.com/NVIDIA/gpu-monitoring-tools/bindings/go/nvml/nvml_dl.h b/vendor/github.com/NVIDIA/gpu-monitoring-tools/bindings/go/nvml/nvml_dl.h
deleted file mode 100644
index 628f0b3a2c2b..000000000000
--- a/vendor/github.com/NVIDIA/gpu-monitoring-tools/bindings/go/nvml/nvml_dl.h
+++ /dev/null
@@ -1,15 +0,0 @@
-// Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
-
-#ifndef _NVML_DL_H_
-#define _NVML_DL_H_
-
-#include "nvml.h"
-
-#define NVML_DL(x) x##_dl
-
-extern nvmlReturn_t NVML_DL(nvmlInit)(void);
-extern nvmlReturn_t NVML_DL(nvmlShutdown)(void);
-extern nvmlReturn_t NVML_DL(nvmlDeviceGetTopologyCommonAncestor)(
- nvmlDevice_t, nvmlDevice_t, nvmlGpuTopologyLevel_t *);
-
-#endif // _NVML_DL_H_
diff --git a/vendor/modules.txt b/vendor/modules.txt
index 4dfe88ee06be..6ca76bfd3187 100644
--- a/vendor/modules.txt
+++ b/vendor/modules.txt
@@ -52,9 +52,6 @@ github.com/Microsoft/go-winio/pkg/guid
# github.com/Microsoft/hcsshim v0.8.8-0.20200312192636-fd0797d766b1
## explicit
github.com/Microsoft/hcsshim/osversion
-# github.com/NVIDIA/gpu-monitoring-tools v0.0.0-20180829222009-86f2a9fac6c5
-## explicit
-github.com/NVIDIA/gpu-monitoring-tools/bindings/go/nvml
# github.com/NYTimes/gziphandler v1.0.1 => github.com/NYTimes/gziphandler v1.0.0
## explicit
github.com/NYTimes/gziphandler
diff --git a/website/content/docs/devices/nvidia.mdx b/website/content/docs/devices/external/nvidia.mdx
similarity index 100%
rename from website/content/docs/devices/nvidia.mdx
rename to website/content/docs/devices/external/nvidia.mdx
diff --git a/website/data/docs-nav-data.json b/website/data/docs-nav-data.json
index 1751a892208c..9616f25aec44 100644
--- a/website/data/docs-nav-data.json
+++ b/website/data/docs-nav-data.json
@@ -1437,10 +1437,6 @@
"title": "Overview",
"path": "devices"
},
- {
- "title": "Nvidia",
- "path": "devices/nvidia"
- },
{
"title": "Community",
"routes": [
@@ -1448,6 +1444,10 @@
"title": "Overview",
"path": "devices/external"
},
+ {
+ "title": "Nvidia",
+ "path": "devices/external/nvidia"
+ },
{
"title": "USB Beta",
"path": "devices/external/usb"
@@ -1760,7 +1760,7 @@
{
"title": "Overview",
"path": "enterprise"
- },
+ },
{
"title": "License",
"routes": [