diff --git a/devices/gpu/nvidia/README.md b/devices/gpu/nvidia/README.md deleted file mode 100644 index 1035c7c89402..000000000000 --- a/devices/gpu/nvidia/README.md +++ /dev/null @@ -1,21 +0,0 @@ -This package provides an implementation of nvidia device plugin - -# Behavior - -Nvidia device plugin uses NVML bindings to get data regarding available nvidia devices and will expose them via Fingerprint RPC. GPUs can be excluded from fingerprinting by setting the `ignored_gpu_ids` field. Plugin sends statistics for fingerprinted devices every `stats_period` period. - -# Config - -The configuration should be passed via an HCL file that begins with a top level `config` stanza: - -``` -config { - ignored_gpu_ids = ["uuid1", "uuid2"] - fingerprint_period = "5s" -} -``` - -The valid configuration options are: - -* `ignored_gpu_ids` (`list(string)`: `[]`): list of GPU UUIDs strings that should not be exposed to nomad -* `fingerprint_period` (`string`: `"1m"`): interval to repeat the fingerprint process to identify possible changes. 
diff --git a/devices/gpu/nvidia/cmd/main.go b/devices/gpu/nvidia/cmd/main.go deleted file mode 100644 index 5c0bea6c4d86..000000000000 --- a/devices/gpu/nvidia/cmd/main.go +++ /dev/null @@ -1,20 +0,0 @@ -package main - -import ( - "context" - - log "github.com/hashicorp/go-hclog" - - "github.com/hashicorp/nomad/devices/gpu/nvidia" - "github.com/hashicorp/nomad/plugins" -) - -func main() { - // Serve the plugin - plugins.ServeCtx(factory) -} - -// factory returns a new instance of the Nvidia GPU plugin -func factory(ctx context.Context, log log.Logger) interface{} { - return nvidia.NewNvidiaDevice(ctx, log) -} diff --git a/devices/gpu/nvidia/device.go b/devices/gpu/nvidia/device.go deleted file mode 100644 index 67680dc2a0ee..000000000000 --- a/devices/gpu/nvidia/device.go +++ /dev/null @@ -1,228 +0,0 @@ -package nvidia - -import ( - "context" - "fmt" - "strings" - "sync" - "time" - - log "github.com/hashicorp/go-hclog" - "github.com/hashicorp/nomad/devices/gpu/nvidia/nvml" - "github.com/hashicorp/nomad/helper/pluginutils/loader" - "github.com/hashicorp/nomad/plugins/base" - "github.com/hashicorp/nomad/plugins/device" - "github.com/hashicorp/nomad/plugins/shared/hclspec" -) - -const ( - // pluginName is the name of the plugin - pluginName = "nvidia-gpu" - - // vendor is the vendor providing the devices - vendor = "nvidia" - - // deviceType is the type of device being returned - deviceType = device.DeviceTypeGPU - - // notAvailable value is returned to nomad server in case some properties were - // undetected by nvml driver - notAvailable = "N/A" - - // Nvidia-container-runtime environment variable names - NvidiaVisibleDevices = "NVIDIA_VISIBLE_DEVICES" -) - -var ( - // PluginID is the nvidia plugin metadata registered in the plugin - // catalog. - PluginID = loader.PluginID{ - Name: pluginName, - PluginType: base.PluginTypeDevice, - } - - // PluginConfig is the nvidia factory function registered in the - // plugin catalog. 
- PluginConfig = &loader.InternalPluginConfig{ - Factory: func(ctx context.Context, l log.Logger) interface{} { return NewNvidiaDevice(ctx, l) }, - } - - // pluginInfo describes the plugin - pluginInfo = &base.PluginInfoResponse{ - Type: base.PluginTypeDevice, - PluginApiVersions: []string{device.ApiVersion010}, - PluginVersion: "0.1.0", - Name: pluginName, - } - - // configSpec is the specification of the plugin's configuration - configSpec = hclspec.NewObject(map[string]*hclspec.Spec{ - "enabled": hclspec.NewDefault( - hclspec.NewAttr("enabled", "bool", false), - hclspec.NewLiteral("true"), - ), - "ignored_gpu_ids": hclspec.NewDefault( - hclspec.NewAttr("ignored_gpu_ids", "list(string)", false), - hclspec.NewLiteral("[]"), - ), - "fingerprint_period": hclspec.NewDefault( - hclspec.NewAttr("fingerprint_period", "string", false), - hclspec.NewLiteral("\"1m\""), - ), - }) -) - -// Config contains configuration information for the plugin. -type Config struct { - Enabled bool `codec:"enabled"` - IgnoredGPUIDs []string `codec:"ignored_gpu_ids"` - FingerprintPeriod string `codec:"fingerprint_period"` -} - -// NvidiaDevice contains all plugin specific data -type NvidiaDevice struct { - // enabled indicates whether the plugin should be enabled - enabled bool - - // nvmlClient is used to get data from nvidia - nvmlClient nvml.NvmlClient - - // initErr holds an error retrieved during - // nvmlClient initialization - initErr error - - // ignoredGPUIDs is a set of UUIDs that would not be exposed to nomad - ignoredGPUIDs map[string]struct{} - - // fingerprintPeriod is how often we should call nvml to get list of devices - fingerprintPeriod time.Duration - - // devices is the set of detected eligible devices - devices map[string]struct{} - deviceLock sync.RWMutex - - logger log.Logger -} - -// NewNvidiaDevice returns a new nvidia device plugin. 
-func NewNvidiaDevice(_ context.Context, log log.Logger) *NvidiaDevice { - nvmlClient, err := nvml.NewNvmlClient() - logger := log.Named(pluginName) - if err != nil && err.Error() != nvml.UnavailableLib.Error() { - logger.Error("unable to initialize Nvidia driver", "reason", err) - } - return &NvidiaDevice{ - logger: logger, - devices: make(map[string]struct{}), - ignoredGPUIDs: make(map[string]struct{}), - nvmlClient: nvmlClient, - initErr: err, - } -} - -// PluginInfo returns information describing the plugin. -func (d *NvidiaDevice) PluginInfo() (*base.PluginInfoResponse, error) { - return pluginInfo, nil -} - -// ConfigSchema returns the plugins configuration schema. -func (d *NvidiaDevice) ConfigSchema() (*hclspec.Spec, error) { - return configSpec, nil -} - -// SetConfig is used to set the configuration of the plugin. -func (d *NvidiaDevice) SetConfig(cfg *base.Config) error { - var config Config - if len(cfg.PluginConfig) != 0 { - if err := base.MsgPackDecode(cfg.PluginConfig, &config); err != nil { - return err - } - } - - d.enabled = config.Enabled - - for _, ignoredGPUId := range config.IgnoredGPUIDs { - d.ignoredGPUIDs[ignoredGPUId] = struct{}{} - } - - period, err := time.ParseDuration(config.FingerprintPeriod) - if err != nil { - return fmt.Errorf("failed to parse fingerprint period %q: %v", config.FingerprintPeriod, err) - } - d.fingerprintPeriod = period - - return nil -} - -// Fingerprint streams detected devices. If device changes are detected or the -// devices health changes, messages will be emitted. 
-func (d *NvidiaDevice) Fingerprint(ctx context.Context) (<-chan *device.FingerprintResponse, error) { - if !d.enabled { - return nil, device.ErrPluginDisabled - } - - outCh := make(chan *device.FingerprintResponse) - go d.fingerprint(ctx, outCh) - return outCh, nil -} - -type reservationError struct { - notExistingIDs []string -} - -func (e *reservationError) Error() string { - return fmt.Sprintf("unknown device IDs: %s", strings.Join(e.notExistingIDs, ",")) -} - -// Reserve returns information on how to mount given devices. -// Assumption is made that nomad server is responsible for correctness of -// GPU allocations, handling tricky cases such as double-allocation of single GPU -func (d *NvidiaDevice) Reserve(deviceIDs []string) (*device.ContainerReservation, error) { - if len(deviceIDs) == 0 { - return &device.ContainerReservation{}, nil - } - if !d.enabled { - return nil, device.ErrPluginDisabled - } - - // Due to the asynchronous nature of NvidiaPlugin, there is a possibility - // of race condition - // - // Timeline: - // 1 - fingerprint reports that GPU with id "1" is present - // 2 - the following events happen at the same time: - // a) server decides to allocate GPU with id "1" - // b) fingerprint check reports that GPU with id "1" is no more present - // - // The latest and always valid version of fingerprinted ids are stored in - // d.devices map. To avoid this race condition an error is returned if - // any of provided deviceIDs is not found in d.devices map - d.deviceLock.RLock() - var notExistingIDs []string - for _, id := range deviceIDs { - if _, deviceIDExists := d.devices[id]; !deviceIDExists { - notExistingIDs = append(notExistingIDs, id) - } - } - d.deviceLock.RUnlock() - if len(notExistingIDs) != 0 { - return nil, &reservationError{notExistingIDs} - } - - return &device.ContainerReservation{ - Envs: map[string]string{ - NvidiaVisibleDevices: strings.Join(deviceIDs, ","), - }, - }, nil -} - -// Stats streams statistics for the detected devices. 
-func (d *NvidiaDevice) Stats(ctx context.Context, interval time.Duration) (<-chan *device.StatsResponse, error) { - if !d.enabled { - return nil, device.ErrPluginDisabled - } - - outCh := make(chan *device.StatsResponse) - go d.stats(ctx, outCh, interval) - return outCh, nil -} diff --git a/devices/gpu/nvidia/device_test.go b/devices/gpu/nvidia/device_test.go deleted file mode 100644 index a5ec354e2432..000000000000 --- a/devices/gpu/nvidia/device_test.go +++ /dev/null @@ -1,140 +0,0 @@ -package nvidia - -import ( - "testing" - - hclog "github.com/hashicorp/go-hclog" - "github.com/hashicorp/nomad/devices/gpu/nvidia/nvml" - "github.com/hashicorp/nomad/plugins/device" - "github.com/stretchr/testify/require" -) - -type MockNvmlClient struct { - FingerprintError error - FingerprintResponseReturned *nvml.FingerprintData - - StatsError error - StatsResponseReturned []*nvml.StatsData -} - -func (c *MockNvmlClient) GetFingerprintData() (*nvml.FingerprintData, error) { - return c.FingerprintResponseReturned, c.FingerprintError -} - -func (c *MockNvmlClient) GetStatsData() ([]*nvml.StatsData, error) { - return c.StatsResponseReturned, c.StatsError -} - -func TestReserve(t *testing.T) { - cases := []struct { - Name string - ExpectedReservation *device.ContainerReservation - ExpectedError error - Device *NvidiaDevice - RequestedIDs []string - }{ - { - Name: "All RequestedIDs are not managed by Device", - ExpectedReservation: nil, - ExpectedError: &reservationError{[]string{ - "UUID1", - "UUID2", - "UUID3", - }}, - RequestedIDs: []string{ - "UUID1", - "UUID2", - "UUID3", - }, - Device: &NvidiaDevice{ - logger: hclog.NewNullLogger(), - enabled: true, - }, - }, - { - Name: "Some RequestedIDs are not managed by Device", - ExpectedReservation: nil, - ExpectedError: &reservationError{[]string{ - "UUID1", - "UUID2", - }}, - RequestedIDs: []string{ - "UUID1", - "UUID2", - "UUID3", - }, - Device: &NvidiaDevice{ - devices: map[string]struct{}{ - "UUID3": {}, - }, - logger: 
hclog.NewNullLogger(), - enabled: true, - }, - }, - { - Name: "All RequestedIDs are managed by Device", - ExpectedReservation: &device.ContainerReservation{ - Envs: map[string]string{ - NvidiaVisibleDevices: "UUID1,UUID2,UUID3", - }, - }, - ExpectedError: nil, - RequestedIDs: []string{ - "UUID1", - "UUID2", - "UUID3", - }, - Device: &NvidiaDevice{ - devices: map[string]struct{}{ - "UUID1": {}, - "UUID2": {}, - "UUID3": {}, - }, - logger: hclog.NewNullLogger(), - enabled: true, - }, - }, - { - Name: "No IDs requested", - ExpectedReservation: &device.ContainerReservation{}, - ExpectedError: nil, - RequestedIDs: nil, - Device: &NvidiaDevice{ - devices: map[string]struct{}{ - "UUID1": {}, - "UUID2": {}, - "UUID3": {}, - }, - logger: hclog.NewNullLogger(), - enabled: true, - }, - }, - { - Name: "Device is disabled", - ExpectedReservation: nil, - ExpectedError: device.ErrPluginDisabled, - RequestedIDs: []string{ - "UUID1", - "UUID2", - "UUID3", - }, - Device: &NvidiaDevice{ - devices: map[string]struct{}{ - "UUID1": {}, - "UUID2": {}, - "UUID3": {}, - }, - logger: hclog.NewNullLogger(), - enabled: false, - }, - }, - } - - for _, c := range cases { - t.Run(c.Name, func(t *testing.T) { - actualReservation, actualError := c.Device.Reserve(c.RequestedIDs) - require.Equal(t, c.ExpectedReservation, actualReservation) - require.Equal(t, c.ExpectedError, actualError) - }) - } -} diff --git a/devices/gpu/nvidia/fingerprint.go b/devices/gpu/nvidia/fingerprint.go deleted file mode 100644 index 45bb34fa3355..000000000000 --- a/devices/gpu/nvidia/fingerprint.go +++ /dev/null @@ -1,229 +0,0 @@ -package nvidia - -import ( - "context" - "time" - - "github.com/hashicorp/nomad/devices/gpu/nvidia/nvml" - "github.com/hashicorp/nomad/helper" - "github.com/hashicorp/nomad/plugins/device" - "github.com/hashicorp/nomad/plugins/shared/structs" -) - -const ( - // Attribute names and units for reporting Fingerprint output - MemoryAttr = "memory" - PowerAttr = "power" - BAR1Attr = "bar1" - 
DriverVersionAttr = "driver_version" - CoresClockAttr = "cores_clock" - MemoryClockAttr = "memory_clock" - PCIBandwidthAttr = "pci_bandwidth" - DisplayStateAttr = "display_state" - PersistenceModeAttr = "persistence_mode" -) - -// fingerprint is the long running goroutine that detects hardware -func (d *NvidiaDevice) fingerprint(ctx context.Context, devices chan<- *device.FingerprintResponse) { - defer close(devices) - - if d.initErr != nil { - if d.initErr.Error() != nvml.UnavailableLib.Error() { - d.logger.Error("exiting fingerprinting due to problems with NVML loading", "error", d.initErr) - devices <- device.NewFingerprintError(d.initErr) - } - - // Just close the channel to let server know that there are no working - // Nvidia GPU units - return - } - - // Create a timer that will fire immediately for the first detection - ticker := time.NewTimer(0) - - for { - select { - case <-ctx.Done(): - return - case <-ticker.C: - ticker.Reset(d.fingerprintPeriod) - } - d.writeFingerprintToChannel(devices) - } -} - -// writeFingerprintToChannel makes nvml call and writes response to channel -func (d *NvidiaDevice) writeFingerprintToChannel(devices chan<- *device.FingerprintResponse) { - fingerprintData, err := d.nvmlClient.GetFingerprintData() - if err != nil { - d.logger.Error("failed to get fingerprint nvidia devices", "error", err) - devices <- device.NewFingerprintError(err) - return - } - - // ignore devices from fingerprint output - fingerprintDevices := ignoreFingerprintedDevices(fingerprintData.Devices, d.ignoredGPUIDs) - // check if any device health was updated or any device was added to host - if !d.fingerprintChanged(fingerprintDevices) { - return - } - - commonAttributes := map[string]*structs.Attribute{ - DriverVersionAttr: { - String: helper.StringToPtr(fingerprintData.DriverVersion), - }, - } - - // Group all FingerprintDevices by DeviceName attribute - deviceListByDeviceName := make(map[string][]*nvml.FingerprintDeviceData) - for _, device := range 
fingerprintDevices { - deviceName := device.DeviceName - if deviceName == nil { - // nvml driver was not able to detect device name. This kind - // of devices are placed to single group with 'notAvailable' name - notAvailableCopy := notAvailable - deviceName = &notAvailableCopy - } - - deviceListByDeviceName[*deviceName] = append(deviceListByDeviceName[*deviceName], device) - } - - // Build Fingerprint response with computed groups and send it over the channel - deviceGroups := make([]*device.DeviceGroup, 0, len(deviceListByDeviceName)) - for groupName, devices := range deviceListByDeviceName { - deviceGroups = append(deviceGroups, deviceGroupFromFingerprintData(groupName, devices, commonAttributes)) - } - devices <- device.NewFingerprint(deviceGroups...) -} - -// ignoreFingerprintedDevices excludes ignored devices from fingerprint output -func ignoreFingerprintedDevices(deviceData []*nvml.FingerprintDeviceData, ignoredGPUIDs map[string]struct{}) []*nvml.FingerprintDeviceData { - var result []*nvml.FingerprintDeviceData - for _, fingerprintDevice := range deviceData { - if _, ignored := ignoredGPUIDs[fingerprintDevice.UUID]; !ignored { - result = append(result, fingerprintDevice) - } - } - return result -} - -// fingerprintChanged checks if there are any previously unseen nvidia devices located -// or any of fingerprinted nvidia devices disappeared since the last fingerprint run. 
-// Also, this func updates device map on NvidiaDevice with the latest data -func (d *NvidiaDevice) fingerprintChanged(allDevices []*nvml.FingerprintDeviceData) bool { - d.deviceLock.Lock() - defer d.deviceLock.Unlock() - - changeDetected := false - // check if every device in allDevices is in d.devices - for _, device := range allDevices { - if _, ok := d.devices[device.UUID]; !ok { - changeDetected = true - } - } - - // check if every device in d.devices is in allDevices - fingerprintDeviceMap := make(map[string]struct{}) - for _, device := range allDevices { - fingerprintDeviceMap[device.UUID] = struct{}{} - } - for id := range d.devices { - if _, ok := fingerprintDeviceMap[id]; !ok { - changeDetected = true - } - } - - d.devices = fingerprintDeviceMap - return changeDetected -} - -// deviceGroupFromFingerprintData composes deviceGroup from FingerprintDeviceData slice -func deviceGroupFromFingerprintData(groupName string, deviceList []*nvml.FingerprintDeviceData, commonAttributes map[string]*structs.Attribute) *device.DeviceGroup { - // deviceGroup without devices makes no sense -> return nil when no devices are provided - if len(deviceList) == 0 { - return nil - } - - devices := make([]*device.Device, len(deviceList)) - for index, dev := range deviceList { - devices[index] = &device.Device{ - ID: dev.UUID, - // all fingerprinted devices are "healthy" for now - // to get real health data -> dcgm bindings should be used - Healthy: true, - HwLocality: &device.DeviceLocality{ - PciBusID: dev.PCIBusID, - }, - } - } - - deviceGroup := &device.DeviceGroup{ - Vendor: vendor, - Type: deviceType, - Name: groupName, - Devices: devices, - // Assumption made that devices with the same DeviceName have the same - // attributes like amount of memory, power, bar1memory etc - Attributes: attributesFromFingerprintDeviceData(deviceList[0]), - } - - // Extend attribute map with common attributes - for attributeKey, attributeValue := range commonAttributes { - 
deviceGroup.Attributes[attributeKey] = attributeValue - } - - return deviceGroup -} - -// attributesFromFingerprintDeviceData converts nvml.FingerprintDeviceData -// struct to device.DeviceGroup.Attributes format (map[string]string) -// this function performs all nil checks for FingerprintDeviceData pointers -func attributesFromFingerprintDeviceData(d *nvml.FingerprintDeviceData) map[string]*structs.Attribute { - attrs := map[string]*structs.Attribute{ - DisplayStateAttr: { - String: helper.StringToPtr(d.DisplayState), - }, - PersistenceModeAttr: { - String: helper.StringToPtr(d.PersistenceMode), - }, - } - - if d.MemoryMiB != nil { - attrs[MemoryAttr] = &structs.Attribute{ - Int: helper.Int64ToPtr(int64(*d.MemoryMiB)), - Unit: structs.UnitMiB, - } - } - if d.PowerW != nil { - attrs[PowerAttr] = &structs.Attribute{ - Int: helper.Int64ToPtr(int64(*d.PowerW)), - Unit: structs.UnitW, - } - } - if d.BAR1MiB != nil { - attrs[BAR1Attr] = &structs.Attribute{ - Int: helper.Int64ToPtr(int64(*d.BAR1MiB)), - Unit: structs.UnitMiB, - } - } - if d.CoresClockMHz != nil { - attrs[CoresClockAttr] = &structs.Attribute{ - Int: helper.Int64ToPtr(int64(*d.CoresClockMHz)), - Unit: structs.UnitMHz, - } - } - if d.MemoryClockMHz != nil { - attrs[MemoryClockAttr] = &structs.Attribute{ - Int: helper.Int64ToPtr(int64(*d.MemoryClockMHz)), - Unit: structs.UnitMHz, - } - } - if d.PCIBandwidthMBPerS != nil { - attrs[PCIBandwidthAttr] = &structs.Attribute{ - Int: helper.Int64ToPtr(int64(*d.PCIBandwidthMBPerS)), - Unit: structs.UnitMBPerS, - } - } - - return attrs -} diff --git a/devices/gpu/nvidia/fingerprint_test.go b/devices/gpu/nvidia/fingerprint_test.go deleted file mode 100644 index c85b5c8c90a3..000000000000 --- a/devices/gpu/nvidia/fingerprint_test.go +++ /dev/null @@ -1,1361 +0,0 @@ -package nvidia - -import ( - "context" - "errors" - "sort" - "testing" - - hclog "github.com/hashicorp/go-hclog" - "github.com/hashicorp/nomad/devices/gpu/nvidia/nvml" - "github.com/hashicorp/nomad/helper" - 
"github.com/hashicorp/nomad/plugins/device" - "github.com/hashicorp/nomad/plugins/shared/structs" - "github.com/stretchr/testify/require" -) - -func TestIgnoreFingerprintedDevices(t *testing.T) { - for _, testCase := range []struct { - Name string - DeviceData []*nvml.FingerprintDeviceData - IgnoredGPUIds map[string]struct{} - ExpectedResult []*nvml.FingerprintDeviceData - }{ - { - Name: "Odd ignored", - DeviceData: []*nvml.FingerprintDeviceData{ - { - DeviceData: &nvml.DeviceData{ - DeviceName: helper.StringToPtr("DeviceName1"), - UUID: "UUID1", - MemoryMiB: helper.Uint64ToPtr(1000), - }, - }, - { - DeviceData: &nvml.DeviceData{ - DeviceName: helper.StringToPtr("DeviceName2"), - UUID: "UUID2", - MemoryMiB: helper.Uint64ToPtr(1000), - }, - }, - { - DeviceData: &nvml.DeviceData{ - DeviceName: helper.StringToPtr("DeviceName3"), - UUID: "UUID3", - MemoryMiB: helper.Uint64ToPtr(1000), - }, - }, - }, - IgnoredGPUIds: map[string]struct{}{ - "UUID2": {}, - }, - ExpectedResult: []*nvml.FingerprintDeviceData{ - { - DeviceData: &nvml.DeviceData{ - DeviceName: helper.StringToPtr("DeviceName1"), - UUID: "UUID1", - MemoryMiB: helper.Uint64ToPtr(1000), - }, - }, - { - DeviceData: &nvml.DeviceData{ - DeviceName: helper.StringToPtr("DeviceName3"), - UUID: "UUID3", - MemoryMiB: helper.Uint64ToPtr(1000), - }, - }, - }, - }, - { - Name: "Even ignored", - DeviceData: []*nvml.FingerprintDeviceData{ - { - DeviceData: &nvml.DeviceData{ - DeviceName: helper.StringToPtr("DeviceName1"), - UUID: "UUID1", - MemoryMiB: helper.Uint64ToPtr(1000), - }, - }, - { - DeviceData: &nvml.DeviceData{ - DeviceName: helper.StringToPtr("DeviceName2"), - UUID: "UUID2", - MemoryMiB: helper.Uint64ToPtr(1000), - }, - }, - { - DeviceData: &nvml.DeviceData{ - DeviceName: helper.StringToPtr("DeviceName3"), - UUID: "UUID3", - MemoryMiB: helper.Uint64ToPtr(1000), - }, - }, - }, - IgnoredGPUIds: map[string]struct{}{ - "UUID1": {}, - "UUID3": {}, - }, - ExpectedResult: []*nvml.FingerprintDeviceData{ - { - DeviceData: 
&nvml.DeviceData{ - DeviceName: helper.StringToPtr("DeviceName2"), - UUID: "UUID2", - MemoryMiB: helper.Uint64ToPtr(1000), - }, - }, - }, - }, - { - Name: "All ignored", - DeviceData: []*nvml.FingerprintDeviceData{ - { - DeviceData: &nvml.DeviceData{ - DeviceName: helper.StringToPtr("DeviceName1"), - UUID: "UUID1", - MemoryMiB: helper.Uint64ToPtr(1000), - }, - }, - { - DeviceData: &nvml.DeviceData{ - DeviceName: helper.StringToPtr("DeviceName2"), - UUID: "UUID2", - MemoryMiB: helper.Uint64ToPtr(1000), - }, - }, - { - DeviceData: &nvml.DeviceData{ - DeviceName: helper.StringToPtr("DeviceName3"), - UUID: "UUID3", - MemoryMiB: helper.Uint64ToPtr(1000), - }, - }, - }, - IgnoredGPUIds: map[string]struct{}{ - "UUID1": {}, - "UUID2": {}, - "UUID3": {}, - }, - ExpectedResult: nil, - }, - { - Name: "No ignored", - DeviceData: []*nvml.FingerprintDeviceData{ - { - DeviceData: &nvml.DeviceData{ - DeviceName: helper.StringToPtr("DeviceName1"), - UUID: "UUID1", - MemoryMiB: helper.Uint64ToPtr(1000), - }, - }, - { - DeviceData: &nvml.DeviceData{ - DeviceName: helper.StringToPtr("DeviceName2"), - UUID: "UUID2", - MemoryMiB: helper.Uint64ToPtr(1000), - }, - }, - { - DeviceData: &nvml.DeviceData{ - DeviceName: helper.StringToPtr("DeviceName3"), - UUID: "UUID3", - MemoryMiB: helper.Uint64ToPtr(1000), - }, - }, - }, - IgnoredGPUIds: map[string]struct{}{}, - ExpectedResult: []*nvml.FingerprintDeviceData{ - { - DeviceData: &nvml.DeviceData{ - DeviceName: helper.StringToPtr("DeviceName1"), - UUID: "UUID1", - MemoryMiB: helper.Uint64ToPtr(1000), - }, - }, - { - DeviceData: &nvml.DeviceData{ - DeviceName: helper.StringToPtr("DeviceName2"), - UUID: "UUID2", - MemoryMiB: helper.Uint64ToPtr(1000), - }, - }, - { - DeviceData: &nvml.DeviceData{ - DeviceName: helper.StringToPtr("DeviceName3"), - UUID: "UUID3", - MemoryMiB: helper.Uint64ToPtr(1000), - }, - }, - }, - }, - { - Name: "No DeviceData provided", - DeviceData: nil, - IgnoredGPUIds: map[string]struct{}{ - "UUID1": {}, - "UUID2": {}, - 
"UUID3": {}, - }, - ExpectedResult: nil, - }, - } { - t.Run(testCase.Name, func(t *testing.T) { - actualResult := ignoreFingerprintedDevices(testCase.DeviceData, testCase.IgnoredGPUIds) - require.New(t).Equal(testCase.ExpectedResult, actualResult) - }) - } -} - -func TestCheckFingerprintUpdates(t *testing.T) { - for _, testCase := range []struct { - Name string - Device *NvidiaDevice - AllDevices []*nvml.FingerprintDeviceData - DeviceMapAfterMethodCall map[string]struct{} - ExpectedResult bool - }{ - { - Name: "No updates", - Device: &NvidiaDevice{devices: map[string]struct{}{ - "1": {}, - "2": {}, - "3": {}, - }}, - AllDevices: []*nvml.FingerprintDeviceData{ - { - DeviceData: &nvml.DeviceData{ - UUID: "1", - }, - }, - { - DeviceData: &nvml.DeviceData{ - UUID: "2", - }, - }, - { - DeviceData: &nvml.DeviceData{ - UUID: "3", - }, - }, - }, - ExpectedResult: false, - DeviceMapAfterMethodCall: map[string]struct{}{ - "1": {}, - "2": {}, - "3": {}, - }, - }, - { - Name: "New Device Appeared", - Device: &NvidiaDevice{devices: map[string]struct{}{ - "1": {}, - "2": {}, - "3": {}, - }}, - AllDevices: []*nvml.FingerprintDeviceData{ - { - DeviceData: &nvml.DeviceData{ - UUID: "1", - }, - }, - { - DeviceData: &nvml.DeviceData{ - UUID: "2", - }, - }, - { - DeviceData: &nvml.DeviceData{ - UUID: "3", - }, - }, - { - DeviceData: &nvml.DeviceData{ - UUID: "I am new", - }, - }, - }, - ExpectedResult: true, - DeviceMapAfterMethodCall: map[string]struct{}{ - "1": {}, - "2": {}, - "3": {}, - "I am new": {}, - }, - }, - { - Name: "Device disappeared", - Device: &NvidiaDevice{devices: map[string]struct{}{ - "1": {}, - "2": {}, - "3": {}, - }}, - AllDevices: []*nvml.FingerprintDeviceData{ - { - DeviceData: &nvml.DeviceData{ - UUID: "1", - }, - }, - { - DeviceData: &nvml.DeviceData{ - UUID: "2", - }, - }, - }, - ExpectedResult: true, - DeviceMapAfterMethodCall: map[string]struct{}{ - "1": {}, - "2": {}, - }, - }, - { - Name: "No devices in NvidiaDevice map", - Device: &NvidiaDevice{}, - 
AllDevices: []*nvml.FingerprintDeviceData{ - { - DeviceData: &nvml.DeviceData{ - UUID: "1", - }, - }, - { - DeviceData: &nvml.DeviceData{ - UUID: "2", - }, - }, - { - DeviceData: &nvml.DeviceData{ - UUID: "3", - }, - }, - }, - ExpectedResult: true, - DeviceMapAfterMethodCall: map[string]struct{}{ - "1": {}, - "2": {}, - "3": {}, - }, - }, - { - Name: "No devices detected", - Device: &NvidiaDevice{devices: map[string]struct{}{ - "1": {}, - "2": {}, - "3": {}, - }}, - AllDevices: nil, - ExpectedResult: true, - DeviceMapAfterMethodCall: map[string]struct{}{}, - }, - } { - t.Run(testCase.Name, func(t *testing.T) { - actualResult := testCase.Device.fingerprintChanged(testCase.AllDevices) - req := require.New(t) - // check that function returns valid "updated / not updated" state - req.Equal(testCase.ExpectedResult, actualResult) - // check that function propely updates devices map - req.Equal(testCase.Device.devices, testCase.DeviceMapAfterMethodCall) - }) - } -} - -func TestAttributesFromFingerprintDeviceData(t *testing.T) { - for _, testCase := range []struct { - Name string - FingerprintDeviceData *nvml.FingerprintDeviceData - ExpectedResult map[string]*structs.Attribute - }{ - { - Name: "All attributes are not nil", - FingerprintDeviceData: &nvml.FingerprintDeviceData{ - DeviceData: &nvml.DeviceData{ - UUID: "1", - DeviceName: helper.StringToPtr("Type1"), - MemoryMiB: helper.Uint64ToPtr(256), - PowerW: helper.UintToPtr(2), - BAR1MiB: helper.Uint64ToPtr(256), - }, - PCIBusID: "pciBusID1", - PCIBandwidthMBPerS: helper.UintToPtr(1), - CoresClockMHz: helper.UintToPtr(1), - MemoryClockMHz: helper.UintToPtr(1), - DisplayState: "Enabled", - PersistenceMode: "Enabled", - }, - ExpectedResult: map[string]*structs.Attribute{ - MemoryAttr: { - Int: helper.Int64ToPtr(256), - Unit: structs.UnitMiB, - }, - PowerAttr: { - Int: helper.Int64ToPtr(2), - Unit: structs.UnitW, - }, - BAR1Attr: { - Int: helper.Int64ToPtr(256), - Unit: structs.UnitMiB, - }, - PCIBandwidthAttr: { - Int: 
helper.Int64ToPtr(1), - Unit: structs.UnitMBPerS, - }, - CoresClockAttr: { - Int: helper.Int64ToPtr(1), - Unit: structs.UnitMHz, - }, - MemoryClockAttr: { - Int: helper.Int64ToPtr(1), - Unit: structs.UnitMHz, - }, - DisplayStateAttr: { - String: helper.StringToPtr("Enabled"), - }, - PersistenceModeAttr: { - String: helper.StringToPtr("Enabled"), - }, - }, - }, - { - Name: "nil values are omitted", - FingerprintDeviceData: &nvml.FingerprintDeviceData{ - DeviceData: &nvml.DeviceData{ - UUID: "1", - DeviceName: helper.StringToPtr("Type1"), - MemoryMiB: nil, - PowerW: helper.UintToPtr(2), - BAR1MiB: helper.Uint64ToPtr(256), - }, - PCIBusID: "pciBusID1", - DisplayState: "Enabled", - PersistenceMode: "Enabled", - }, - ExpectedResult: map[string]*structs.Attribute{ - PowerAttr: { - Int: helper.Int64ToPtr(2), - Unit: structs.UnitW, - }, - BAR1Attr: { - Int: helper.Int64ToPtr(256), - Unit: structs.UnitMiB, - }, - DisplayStateAttr: { - String: helper.StringToPtr("Enabled"), - }, - PersistenceModeAttr: { - String: helper.StringToPtr("Enabled"), - }, - }, - }, - } { - t.Run(testCase.Name, func(t *testing.T) { - actualResult := attributesFromFingerprintDeviceData(testCase.FingerprintDeviceData) - require.Equal(t, testCase.ExpectedResult, actualResult) - }) - } -} - -func TestDeviceGroupFromFingerprintData(t *testing.T) { - for _, testCase := range []struct { - Name string - GroupName string - Devices []*nvml.FingerprintDeviceData - CommonAttributes map[string]*structs.Attribute - ExpectedResult *device.DeviceGroup - }{ - { - Name: "Devices are provided", - GroupName: "Type1", - Devices: []*nvml.FingerprintDeviceData{ - { - DeviceData: &nvml.DeviceData{ - UUID: "1", - DeviceName: helper.StringToPtr("Type1"), - MemoryMiB: helper.Uint64ToPtr(100), - PowerW: helper.UintToPtr(2), - BAR1MiB: helper.Uint64ToPtr(256), - }, - PCIBusID: "pciBusID1", - PCIBandwidthMBPerS: helper.UintToPtr(1), - CoresClockMHz: helper.UintToPtr(1), - MemoryClockMHz: helper.UintToPtr(1), - DisplayState: 
"Enabled", - PersistenceMode: "Enabled", - }, - { - DeviceData: &nvml.DeviceData{ - UUID: "2", - DeviceName: helper.StringToPtr("Type1"), - MemoryMiB: helper.Uint64ToPtr(100), - PowerW: helper.UintToPtr(2), - BAR1MiB: helper.Uint64ToPtr(256), - }, - PCIBusID: "pciBusID2", - PCIBandwidthMBPerS: helper.UintToPtr(1), - CoresClockMHz: helper.UintToPtr(1), - MemoryClockMHz: helper.UintToPtr(1), - DisplayState: "Enabled", - PersistenceMode: "Enabled", - }, - }, - ExpectedResult: &device.DeviceGroup{ - Vendor: vendor, - Type: deviceType, - Name: "Type1", - Devices: []*device.Device{ - { - ID: "1", - Healthy: true, - HwLocality: &device.DeviceLocality{ - PciBusID: "pciBusID1", - }, - }, - { - ID: "2", - Healthy: true, - HwLocality: &device.DeviceLocality{ - PciBusID: "pciBusID2", - }, - }, - }, - Attributes: map[string]*structs.Attribute{ - MemoryAttr: { - Int: helper.Int64ToPtr(100), - Unit: structs.UnitMiB, - }, - PowerAttr: { - Int: helper.Int64ToPtr(2), - Unit: structs.UnitW, - }, - BAR1Attr: { - Int: helper.Int64ToPtr(256), - Unit: structs.UnitMiB, - }, - PCIBandwidthAttr: { - Int: helper.Int64ToPtr(1), - Unit: structs.UnitMBPerS, - }, - CoresClockAttr: { - Int: helper.Int64ToPtr(1), - Unit: structs.UnitMHz, - }, - MemoryClockAttr: { - Int: helper.Int64ToPtr(1), - Unit: structs.UnitMHz, - }, - DisplayStateAttr: { - String: helper.StringToPtr("Enabled"), - }, - PersistenceModeAttr: { - String: helper.StringToPtr("Enabled"), - }, - }, - }, - }, - { - Name: "Devices and common attributes are provided", - GroupName: "Type1", - Devices: []*nvml.FingerprintDeviceData{ - { - DeviceData: &nvml.DeviceData{ - UUID: "1", - DeviceName: helper.StringToPtr("Type1"), - MemoryMiB: helper.Uint64ToPtr(100), - PowerW: helper.UintToPtr(2), - BAR1MiB: helper.Uint64ToPtr(256), - }, - PCIBusID: "pciBusID1", - PCIBandwidthMBPerS: helper.UintToPtr(1), - CoresClockMHz: helper.UintToPtr(1), - MemoryClockMHz: helper.UintToPtr(1), - DisplayState: "Enabled", - PersistenceMode: "Enabled", - }, - { 
- DeviceData: &nvml.DeviceData{ - UUID: "2", - DeviceName: helper.StringToPtr("Type1"), - MemoryMiB: helper.Uint64ToPtr(100), - PowerW: helper.UintToPtr(2), - BAR1MiB: helper.Uint64ToPtr(256), - }, - PCIBusID: "pciBusID2", - PCIBandwidthMBPerS: helper.UintToPtr(1), - CoresClockMHz: helper.UintToPtr(1), - MemoryClockMHz: helper.UintToPtr(1), - DisplayState: "Enabled", - PersistenceMode: "Enabled", - }, - }, - CommonAttributes: map[string]*structs.Attribute{ - DriverVersionAttr: { - String: helper.StringToPtr("1"), - }, - }, - ExpectedResult: &device.DeviceGroup{ - Vendor: vendor, - Type: deviceType, - Name: "Type1", - Devices: []*device.Device{ - { - ID: "1", - Healthy: true, - HwLocality: &device.DeviceLocality{ - PciBusID: "pciBusID1", - }, - }, - { - ID: "2", - Healthy: true, - HwLocality: &device.DeviceLocality{ - PciBusID: "pciBusID2", - }, - }, - }, - Attributes: map[string]*structs.Attribute{ - MemoryAttr: { - Int: helper.Int64ToPtr(100), - Unit: structs.UnitMiB, - }, - PowerAttr: { - Int: helper.Int64ToPtr(2), - Unit: structs.UnitW, - }, - BAR1Attr: { - Int: helper.Int64ToPtr(256), - Unit: structs.UnitMiB, - }, - PCIBandwidthAttr: { - Int: helper.Int64ToPtr(1), - Unit: structs.UnitMBPerS, - }, - CoresClockAttr: { - Int: helper.Int64ToPtr(1), - Unit: structs.UnitMHz, - }, - MemoryClockAttr: { - Int: helper.Int64ToPtr(1), - Unit: structs.UnitMHz, - }, - DisplayStateAttr: { - String: helper.StringToPtr("Enabled"), - }, - PersistenceModeAttr: { - String: helper.StringToPtr("Enabled"), - }, - DriverVersionAttr: { - String: helper.StringToPtr("1"), - }, - }, - }, - }, - { - Name: "Devices are not provided", - GroupName: "Type1", - CommonAttributes: map[string]*structs.Attribute{ - DriverVersionAttr: { - String: helper.StringToPtr("1"), - }, - }, - Devices: nil, - ExpectedResult: nil, - }, - } { - t.Run(testCase.Name, func(t *testing.T) { - actualResult := deviceGroupFromFingerprintData(testCase.GroupName, testCase.Devices, testCase.CommonAttributes) - 
require.New(t).Equal(testCase.ExpectedResult, actualResult) - }) - } -} - -func TestWriteFingerprintToChannel(t *testing.T) { - for _, testCase := range []struct { - Name string - Device *NvidiaDevice - ExpectedWriteToChannel *device.FingerprintResponse - }{ - { - Name: "Check that FingerprintError is handled properly", - Device: &NvidiaDevice{ - nvmlClient: &MockNvmlClient{ - FingerprintError: errors.New(""), - }, - logger: hclog.NewNullLogger(), - }, - ExpectedWriteToChannel: &device.FingerprintResponse{ - Error: errors.New(""), - }, - }, - { - Name: "Check ignore devices works correctly", - Device: &NvidiaDevice{ - nvmlClient: &MockNvmlClient{ - FingerprintResponseReturned: &nvml.FingerprintData{ - DriverVersion: "1", - Devices: []*nvml.FingerprintDeviceData{ - { - DeviceData: &nvml.DeviceData{ - UUID: "1", - DeviceName: helper.StringToPtr("Name"), - MemoryMiB: helper.Uint64ToPtr(10), - PowerW: helper.UintToPtr(100), - BAR1MiB: helper.Uint64ToPtr(256), - }, - PCIBusID: "pciBusID1", - PCIBandwidthMBPerS: helper.UintToPtr(1), - CoresClockMHz: helper.UintToPtr(1), - MemoryClockMHz: helper.UintToPtr(1), - DisplayState: "Enabled", - PersistenceMode: "Enabled", - }, - { - DeviceData: &nvml.DeviceData{ - UUID: "2", - DeviceName: helper.StringToPtr("Name"), - MemoryMiB: helper.Uint64ToPtr(10), - PowerW: helper.UintToPtr(100), - BAR1MiB: helper.Uint64ToPtr(256), - }, - PCIBusID: "pciBusID2", - PCIBandwidthMBPerS: helper.UintToPtr(1), - CoresClockMHz: helper.UintToPtr(1), - MemoryClockMHz: helper.UintToPtr(1), - DisplayState: "Enabled", - PersistenceMode: "Enabled", - }, - }, - }, - }, - ignoredGPUIDs: map[string]struct{}{ - "1": {}, - }, - logger: hclog.NewNullLogger(), - }, - ExpectedWriteToChannel: &device.FingerprintResponse{ - Devices: []*device.DeviceGroup{ - { - Vendor: vendor, - Type: deviceType, - Name: "Name", - Devices: []*device.Device{ - { - ID: "2", - Healthy: true, - HwLocality: &device.DeviceLocality{ - PciBusID: "pciBusID2", - }, - }, - }, - Attributes: 
map[string]*structs.Attribute{ - MemoryAttr: { - Int: helper.Int64ToPtr(10), - Unit: structs.UnitMiB, - }, - PowerAttr: { - Int: helper.Int64ToPtr(100), - Unit: structs.UnitW, - }, - BAR1Attr: { - Int: helper.Int64ToPtr(256), - Unit: structs.UnitMiB, - }, - PCIBandwidthAttr: { - Int: helper.Int64ToPtr(1), - Unit: structs.UnitMBPerS, - }, - CoresClockAttr: { - Int: helper.Int64ToPtr(1), - Unit: structs.UnitMHz, - }, - MemoryClockAttr: { - Int: helper.Int64ToPtr(1), - Unit: structs.UnitMHz, - }, - DisplayStateAttr: { - String: helper.StringToPtr("Enabled"), - }, - PersistenceModeAttr: { - String: helper.StringToPtr("Enabled"), - }, - DriverVersionAttr: { - String: helper.StringToPtr("1"), - }, - }, - }, - }, - }, - }, - { - Name: "Check devices are split to multiple device groups 1", - Device: &NvidiaDevice{ - nvmlClient: &MockNvmlClient{ - FingerprintResponseReturned: &nvml.FingerprintData{ - DriverVersion: "1", - Devices: []*nvml.FingerprintDeviceData{ - { - DeviceData: &nvml.DeviceData{ - UUID: "1", - DeviceName: helper.StringToPtr("Name1"), - MemoryMiB: helper.Uint64ToPtr(10), - PowerW: helper.UintToPtr(100), - BAR1MiB: helper.Uint64ToPtr(256), - }, - PCIBusID: "pciBusID1", - PCIBandwidthMBPerS: helper.UintToPtr(1), - CoresClockMHz: helper.UintToPtr(1), - MemoryClockMHz: helper.UintToPtr(1), - DisplayState: "Enabled", - PersistenceMode: "Enabled", - }, - { - DeviceData: &nvml.DeviceData{ - UUID: "2", - DeviceName: helper.StringToPtr("Name2"), - MemoryMiB: helper.Uint64ToPtr(11), - PowerW: helper.UintToPtr(100), - BAR1MiB: helper.Uint64ToPtr(256), - }, - PCIBusID: "pciBusID2", - PCIBandwidthMBPerS: helper.UintToPtr(1), - CoresClockMHz: helper.UintToPtr(1), - MemoryClockMHz: helper.UintToPtr(1), - DisplayState: "Enabled", - PersistenceMode: "Enabled", - }, - { - DeviceData: &nvml.DeviceData{ - UUID: "3", - DeviceName: helper.StringToPtr("Name3"), - MemoryMiB: helper.Uint64ToPtr(12), - PowerW: helper.UintToPtr(100), - BAR1MiB: helper.Uint64ToPtr(256), - }, - 
PCIBusID: "pciBusID3", - PCIBandwidthMBPerS: helper.UintToPtr(1), - CoresClockMHz: helper.UintToPtr(1), - MemoryClockMHz: helper.UintToPtr(1), - DisplayState: "Enabled", - PersistenceMode: "Enabled", - }, - }, - }, - }, - logger: hclog.NewNullLogger(), - }, - ExpectedWriteToChannel: &device.FingerprintResponse{ - Devices: []*device.DeviceGroup{ - { - Vendor: vendor, - Type: deviceType, - Name: "Name1", - Devices: []*device.Device{ - { - ID: "1", - Healthy: true, - HwLocality: &device.DeviceLocality{ - PciBusID: "pciBusID1", - }, - }, - }, - Attributes: map[string]*structs.Attribute{ - MemoryAttr: { - Int: helper.Int64ToPtr(10), - Unit: structs.UnitMiB, - }, - PowerAttr: { - Int: helper.Int64ToPtr(100), - Unit: structs.UnitW, - }, - BAR1Attr: { - Int: helper.Int64ToPtr(256), - Unit: structs.UnitMiB, - }, - PCIBandwidthAttr: { - Int: helper.Int64ToPtr(1), - Unit: structs.UnitMBPerS, - }, - CoresClockAttr: { - Int: helper.Int64ToPtr(1), - Unit: structs.UnitMHz, - }, - MemoryClockAttr: { - Int: helper.Int64ToPtr(1), - Unit: structs.UnitMHz, - }, - DisplayStateAttr: { - String: helper.StringToPtr("Enabled"), - }, - PersistenceModeAttr: { - String: helper.StringToPtr("Enabled"), - }, - DriverVersionAttr: { - String: helper.StringToPtr("1"), - }, - }, - }, - { - Vendor: vendor, - Type: deviceType, - Name: "Name2", - Devices: []*device.Device{ - { - ID: "2", - Healthy: true, - HwLocality: &device.DeviceLocality{ - PciBusID: "pciBusID2", - }, - }, - }, - Attributes: map[string]*structs.Attribute{ - MemoryAttr: { - Int: helper.Int64ToPtr(11), - Unit: structs.UnitMiB, - }, - PowerAttr: { - Int: helper.Int64ToPtr(100), - Unit: structs.UnitW, - }, - BAR1Attr: { - Int: helper.Int64ToPtr(256), - Unit: structs.UnitMiB, - }, - PCIBandwidthAttr: { - Int: helper.Int64ToPtr(1), - Unit: structs.UnitMBPerS, - }, - CoresClockAttr: { - Int: helper.Int64ToPtr(1), - Unit: structs.UnitMHz, - }, - MemoryClockAttr: { - Int: helper.Int64ToPtr(1), - Unit: structs.UnitMHz, - }, - 
DisplayStateAttr: { - String: helper.StringToPtr("Enabled"), - }, - PersistenceModeAttr: { - String: helper.StringToPtr("Enabled"), - }, - DriverVersionAttr: { - String: helper.StringToPtr("1"), - }, - }, - }, - { - Vendor: vendor, - Type: deviceType, - Name: "Name3", - Devices: []*device.Device{ - { - ID: "3", - Healthy: true, - HwLocality: &device.DeviceLocality{ - PciBusID: "pciBusID3", - }, - }, - }, - Attributes: map[string]*structs.Attribute{ - MemoryAttr: { - Int: helper.Int64ToPtr(12), - Unit: structs.UnitMiB, - }, - PowerAttr: { - Int: helper.Int64ToPtr(100), - Unit: structs.UnitW, - }, - BAR1Attr: { - Int: helper.Int64ToPtr(256), - Unit: structs.UnitMiB, - }, - PCIBandwidthAttr: { - Int: helper.Int64ToPtr(1), - Unit: structs.UnitMBPerS, - }, - CoresClockAttr: { - Int: helper.Int64ToPtr(1), - Unit: structs.UnitMHz, - }, - MemoryClockAttr: { - Int: helper.Int64ToPtr(1), - Unit: structs.UnitMHz, - }, - DisplayStateAttr: { - String: helper.StringToPtr("Enabled"), - }, - PersistenceModeAttr: { - String: helper.StringToPtr("Enabled"), - }, - DriverVersionAttr: { - String: helper.StringToPtr("1"), - }, - }, - }, - }, - }, - }, - { - Name: "Check devices are split to multiple device groups 2", - Device: &NvidiaDevice{ - nvmlClient: &MockNvmlClient{ - FingerprintResponseReturned: &nvml.FingerprintData{ - DriverVersion: "1", - Devices: []*nvml.FingerprintDeviceData{ - { - DeviceData: &nvml.DeviceData{ - UUID: "1", - DeviceName: helper.StringToPtr("Name1"), - MemoryMiB: helper.Uint64ToPtr(10), - PowerW: helper.UintToPtr(100), - BAR1MiB: helper.Uint64ToPtr(256), - }, - PCIBusID: "pciBusID1", - PCIBandwidthMBPerS: helper.UintToPtr(1), - CoresClockMHz: helper.UintToPtr(1), - MemoryClockMHz: helper.UintToPtr(1), - DisplayState: "Enabled", - PersistenceMode: "Enabled", - }, - { - DeviceData: &nvml.DeviceData{ - UUID: "2", - DeviceName: helper.StringToPtr("Name2"), - MemoryMiB: helper.Uint64ToPtr(11), - PowerW: helper.UintToPtr(100), - BAR1MiB: helper.Uint64ToPtr(256), - 
}, - PCIBusID: "pciBusID2", - PCIBandwidthMBPerS: helper.UintToPtr(1), - CoresClockMHz: helper.UintToPtr(1), - MemoryClockMHz: helper.UintToPtr(1), - DisplayState: "Enabled", - PersistenceMode: "Enabled", - }, - { - DeviceData: &nvml.DeviceData{ - UUID: "3", - DeviceName: helper.StringToPtr("Name2"), - MemoryMiB: helper.Uint64ToPtr(12), - PowerW: helper.UintToPtr(100), - BAR1MiB: helper.Uint64ToPtr(256), - }, - PCIBusID: "pciBusID3", - PCIBandwidthMBPerS: helper.UintToPtr(1), - CoresClockMHz: helper.UintToPtr(1), - MemoryClockMHz: helper.UintToPtr(1), - DisplayState: "Enabled", - PersistenceMode: "Enabled", - }, - }, - }, - }, - logger: hclog.NewNullLogger(), - }, - ExpectedWriteToChannel: &device.FingerprintResponse{ - Devices: []*device.DeviceGroup{ - { - Vendor: vendor, - Type: deviceType, - Name: "Name1", - Devices: []*device.Device{ - { - ID: "1", - Healthy: true, - HwLocality: &device.DeviceLocality{ - PciBusID: "pciBusID1", - }, - }, - }, - Attributes: map[string]*structs.Attribute{ - MemoryAttr: { - Int: helper.Int64ToPtr(10), - Unit: structs.UnitMiB, - }, - PowerAttr: { - Int: helper.Int64ToPtr(100), - Unit: structs.UnitW, - }, - BAR1Attr: { - Int: helper.Int64ToPtr(256), - Unit: structs.UnitMiB, - }, - PCIBandwidthAttr: { - Int: helper.Int64ToPtr(1), - Unit: structs.UnitMBPerS, - }, - CoresClockAttr: { - Int: helper.Int64ToPtr(1), - Unit: structs.UnitMHz, - }, - MemoryClockAttr: { - Int: helper.Int64ToPtr(1), - Unit: structs.UnitMHz, - }, - DisplayStateAttr: { - String: helper.StringToPtr("Enabled"), - }, - PersistenceModeAttr: { - String: helper.StringToPtr("Enabled"), - }, - DriverVersionAttr: { - String: helper.StringToPtr("1"), - }, - }, - }, - { - Vendor: vendor, - Type: deviceType, - Name: "Name2", - Devices: []*device.Device{ - { - ID: "2", - Healthy: true, - HwLocality: &device.DeviceLocality{ - PciBusID: "pciBusID2", - }, - }, - { - ID: "3", - Healthy: true, - HwLocality: &device.DeviceLocality{ - PciBusID: "pciBusID3", - }, - }, - }, - 
Attributes: map[string]*structs.Attribute{ - MemoryAttr: { - Int: helper.Int64ToPtr(11), - Unit: structs.UnitMiB, - }, - PowerAttr: { - Int: helper.Int64ToPtr(100), - Unit: structs.UnitW, - }, - BAR1Attr: { - Int: helper.Int64ToPtr(256), - Unit: structs.UnitMiB, - }, - PCIBandwidthAttr: { - Int: helper.Int64ToPtr(1), - Unit: structs.UnitMBPerS, - }, - CoresClockAttr: { - Int: helper.Int64ToPtr(1), - Unit: structs.UnitMHz, - }, - MemoryClockAttr: { - Int: helper.Int64ToPtr(1), - Unit: structs.UnitMHz, - }, - DisplayStateAttr: { - String: helper.StringToPtr("Enabled"), - }, - PersistenceModeAttr: { - String: helper.StringToPtr("Enabled"), - }, - DriverVersionAttr: { - String: helper.StringToPtr("1"), - }, - }, - }, - }, - }, - }, - } { - t.Run(testCase.Name, func(t *testing.T) { - channel := make(chan *device.FingerprintResponse, 1) - testCase.Device.writeFingerprintToChannel(channel) - actualResult := <-channel - // writeFingerprintToChannel iterates over map keys - // and insterts results to an array, so order of elements in output array - // may be different - // actualResult, expectedResult arrays has to be sorted firsted - sort.Slice(actualResult.Devices, func(i, j int) bool { - return actualResult.Devices[i].Name < actualResult.Devices[j].Name - }) - sort.Slice(testCase.ExpectedWriteToChannel.Devices, func(i, j int) bool { - return testCase.ExpectedWriteToChannel.Devices[i].Name < testCase.ExpectedWriteToChannel.Devices[j].Name - }) - require.Equal(t, testCase.ExpectedWriteToChannel, actualResult) - }) - } -} - -// Test if nonworking driver returns empty fingerprint data -func TestFingerprint(t *testing.T) { - for _, testCase := range []struct { - Name string - Device *NvidiaDevice - ExpectedWriteToChannel *device.FingerprintResponse - }{ - { - Name: "Check that working driver returns valid fingeprint data", - Device: &NvidiaDevice{ - initErr: nil, - nvmlClient: &MockNvmlClient{ - FingerprintResponseReturned: &nvml.FingerprintData{ - DriverVersion: "1", - 
Devices: []*nvml.FingerprintDeviceData{ - { - DeviceData: &nvml.DeviceData{ - UUID: "1", - DeviceName: helper.StringToPtr("Name1"), - MemoryMiB: helper.Uint64ToPtr(10), - PowerW: helper.UintToPtr(100), - BAR1MiB: helper.Uint64ToPtr(256), - }, - PCIBusID: "pciBusID1", - PCIBandwidthMBPerS: helper.UintToPtr(1), - CoresClockMHz: helper.UintToPtr(1), - MemoryClockMHz: helper.UintToPtr(1), - DisplayState: "Enabled", - PersistenceMode: "Enabled", - }, - { - DeviceData: &nvml.DeviceData{ - UUID: "2", - DeviceName: helper.StringToPtr("Name1"), - MemoryMiB: helper.Uint64ToPtr(10), - PowerW: helper.UintToPtr(100), - BAR1MiB: helper.Uint64ToPtr(256), - }, - PCIBusID: "pciBusID2", - PCIBandwidthMBPerS: helper.UintToPtr(1), - CoresClockMHz: helper.UintToPtr(1), - MemoryClockMHz: helper.UintToPtr(1), - DisplayState: "Enabled", - PersistenceMode: "Enabled", - }, - { - DeviceData: &nvml.DeviceData{ - UUID: "3", - DeviceName: helper.StringToPtr("Name1"), - MemoryMiB: helper.Uint64ToPtr(10), - PowerW: helper.UintToPtr(100), - BAR1MiB: helper.Uint64ToPtr(256), - }, - PCIBusID: "pciBusID3", - PCIBandwidthMBPerS: helper.UintToPtr(1), - CoresClockMHz: helper.UintToPtr(1), - MemoryClockMHz: helper.UintToPtr(1), - DisplayState: "Enabled", - PersistenceMode: "Enabled", - }, - }, - }, - }, - logger: hclog.NewNullLogger(), - }, - ExpectedWriteToChannel: &device.FingerprintResponse{ - Devices: []*device.DeviceGroup{ - { - Vendor: vendor, - Type: deviceType, - Name: "Name1", - Devices: []*device.Device{ - { - ID: "1", - Healthy: true, - HwLocality: &device.DeviceLocality{ - PciBusID: "pciBusID1", - }, - }, - { - ID: "2", - Healthy: true, - HwLocality: &device.DeviceLocality{ - PciBusID: "pciBusID2", - }, - }, - { - ID: "3", - Healthy: true, - HwLocality: &device.DeviceLocality{ - PciBusID: "pciBusID3", - }, - }, - }, - Attributes: map[string]*structs.Attribute{ - MemoryAttr: { - Int: helper.Int64ToPtr(10), - Unit: structs.UnitMiB, - }, - PowerAttr: { - Int: helper.Int64ToPtr(100), - Unit: 
structs.UnitW, - }, - BAR1Attr: { - Int: helper.Int64ToPtr(256), - Unit: structs.UnitMiB, - }, - PCIBandwidthAttr: { - Int: helper.Int64ToPtr(1), - Unit: structs.UnitMBPerS, - }, - CoresClockAttr: { - Int: helper.Int64ToPtr(1), - Unit: structs.UnitMHz, - }, - MemoryClockAttr: { - Int: helper.Int64ToPtr(1), - Unit: structs.UnitMHz, - }, - DisplayStateAttr: { - String: helper.StringToPtr("Enabled"), - }, - PersistenceModeAttr: { - String: helper.StringToPtr("Enabled"), - }, - DriverVersionAttr: { - String: helper.StringToPtr("1"), - }, - }, - }, - }, - }, - }, - { - Name: "Check that not working driver returns error fingeprint data", - Device: &NvidiaDevice{ - initErr: errors.New("foo"), - nvmlClient: &MockNvmlClient{ - FingerprintResponseReturned: &nvml.FingerprintData{ - DriverVersion: "1", - Devices: []*nvml.FingerprintDeviceData{ - { - DeviceData: &nvml.DeviceData{ - UUID: "1", - DeviceName: helper.StringToPtr("Name1"), - MemoryMiB: helper.Uint64ToPtr(10), - }, - }, - { - DeviceData: &nvml.DeviceData{ - UUID: "2", - DeviceName: helper.StringToPtr("Name1"), - MemoryMiB: helper.Uint64ToPtr(10), - }, - }, - { - DeviceData: &nvml.DeviceData{ - UUID: "3", - DeviceName: helper.StringToPtr("Name1"), - MemoryMiB: helper.Uint64ToPtr(10), - }, - }, - }, - }, - }, - logger: hclog.NewNullLogger(), - }, - ExpectedWriteToChannel: &device.FingerprintResponse{ - Error: errors.New("foo"), - }, - }, - } { - t.Run(testCase.Name, func(t *testing.T) { - outCh := make(chan *device.FingerprintResponse) - ctx, cancel := context.WithCancel(context.Background()) - go testCase.Device.fingerprint(ctx, outCh) - result := <-outCh - cancel() - require.New(t).Equal(result, testCase.ExpectedWriteToChannel) - }) - } -} diff --git a/devices/gpu/nvidia/nvml/client.go b/devices/gpu/nvidia/nvml/client.go deleted file mode 100644 index d18dcbe1a9f6..000000000000 --- a/devices/gpu/nvidia/nvml/client.go +++ /dev/null @@ -1,194 +0,0 @@ -package nvml - -import ( - "fmt" -) - -// DeviceData represents 
common fields for Nvidia device -type DeviceData struct { - UUID string - DeviceName *string - MemoryMiB *uint64 - PowerW *uint - BAR1MiB *uint64 -} - -// FingerprintDeviceData is a superset of DeviceData -// it describes device specific fields returned from -// nvml queries during fingerprinting call -type FingerprintDeviceData struct { - *DeviceData - PCIBandwidthMBPerS *uint - CoresClockMHz *uint - MemoryClockMHz *uint - DisplayState string - PersistenceMode string - PCIBusID string -} - -// FingerprintData represets attributes of driver/devices -type FingerprintData struct { - Devices []*FingerprintDeviceData - DriverVersion string -} - -// StatsData is a superset of DeviceData -// it represents statistics data returned for every Nvidia device -type StatsData struct { - *DeviceData - PowerUsageW *uint - GPUUtilization *uint - MemoryUtilization *uint - EncoderUtilization *uint - DecoderUtilization *uint - TemperatureC *uint - UsedMemoryMiB *uint64 - BAR1UsedMiB *uint64 - ECCErrorsL1Cache *uint64 - ECCErrorsL2Cache *uint64 - ECCErrorsDevice *uint64 -} - -// NvmlClient describes how users would use nvml library -type NvmlClient interface { - GetFingerprintData() (*FingerprintData, error) - GetStatsData() ([]*StatsData, error) -} - -// nvmlClient implements NvmlClient -// Users of this lib are expected to use this struct via NewNvmlClient func -type nvmlClient struct { - driver NvmlDriver -} - -// NewNvmlClient function creates new nvmlClient with real -// NvmlDriver implementation. 
Also, this func initializes NvmlDriver -func NewNvmlClient() (*nvmlClient, error) { - driver := &nvmlDriver{} - err := driver.Initialize() - if err != nil { - return nil, err - } - return &nvmlClient{ - driver: driver, - }, nil -} - -// GetFingerprintData returns FingerprintData for available Nvidia devices -func (c *nvmlClient) GetFingerprintData() (*FingerprintData, error) { - /* - nvml fields to be fingerprinted # nvml_library_call - 1 - Driver Version # nvmlSystemGetDriverVersion - 2 - Product Name # nvmlDeviceGetName - 3 - GPU UUID # nvmlDeviceGetUUID - 4 - Total Memory # nvmlDeviceGetMemoryInfo - 5 - Power # nvmlDeviceGetPowerManagementLimit - 6 - PCIBusID # nvmlDeviceGetPciInfo - 7 - BAR1 Memory # nvmlDeviceGetBAR1MemoryInfo( - 8 - PCI Bandwidth - 9 - Memory, Cores Clock # nvmlDeviceGetMaxClockInfo - 10 - Display Mode # nvmlDeviceGetDisplayMode - 11 - Persistence Mode # nvmlDeviceGetPersistenceMode - */ - - // Assumed that this method is called with receiver retrieved from - // NewNvmlClient - // because this method handles initialization of NVML library - - driverVersion, err := c.driver.SystemDriverVersion() - if err != nil { - return nil, fmt.Errorf("nvidia nvml SystemDriverVersion() error: %v\n", err) - } - - numDevices, err := c.driver.DeviceCount() - if err != nil { - return nil, fmt.Errorf("nvidia nvml DeviceCount() error: %v\n", err) - } - - allNvidiaGPUResources := make([]*FingerprintDeviceData, numDevices) - - for i := 0; i < int(numDevices); i++ { - deviceInfo, err := c.driver.DeviceInfoByIndex(uint(i)) - if err != nil { - return nil, fmt.Errorf("nvidia nvml DeviceInfoByIndex() error: %v\n", err) - } - - allNvidiaGPUResources[i] = &FingerprintDeviceData{ - DeviceData: &DeviceData{ - DeviceName: deviceInfo.Name, - UUID: deviceInfo.UUID, - MemoryMiB: deviceInfo.MemoryMiB, - PowerW: deviceInfo.PowerW, - BAR1MiB: deviceInfo.BAR1MiB, - }, - PCIBandwidthMBPerS: deviceInfo.PCIBandwidthMBPerS, - CoresClockMHz: deviceInfo.CoresClockMHz, - MemoryClockMHz: 
deviceInfo.MemoryClockMHz, - DisplayState: deviceInfo.DisplayState, - PersistenceMode: deviceInfo.PersistenceMode, - PCIBusID: deviceInfo.PCIBusID, - } - } - return &FingerprintData{ - Devices: allNvidiaGPUResources, - DriverVersion: driverVersion, - }, nil -} - -// GetStatsData returns statistics data for all devices on this machine -func (c *nvmlClient) GetStatsData() ([]*StatsData, error) { - /* - nvml fields to be reported to stats api # nvml_library_call - 1 - Used Memory # nvmlDeviceGetMemoryInfo - 2 - Utilization of GPU # nvmlDeviceGetUtilizationRates - 3 - Utilization of Memory # nvmlDeviceGetUtilizationRates - 4 - Utilization of Decoder # nvmlDeviceGetDecoderUtilization - 5 - Utilization of Encoder # nvmlDeviceGetEncoderUtilization - 6 - Current GPU Temperature # nvmlDeviceGetTemperature - 7 - Power Draw # nvmlDeviceGetPowerUsage - 8 - BAR1 Used memory # nvmlDeviceGetBAR1MemoryInfo - 9 - ECC Errors on requesting L1Cache # nvmlDeviceGetMemoryErrorCounter - 10 - ECC Errors on requesting L2Cache # nvmlDeviceGetMemoryErrorCounter - 11 - ECC Errors on requesting Device memory # nvmlDeviceGetMemoryErrorCounter - */ - - // Assumed that this method is called with receiver retrieved from - // NewNvmlClient - // because this method handles initialization of NVML library - - numDevices, err := c.driver.DeviceCount() - if err != nil { - return nil, fmt.Errorf("nvidia nvml DeviceCount() error: %v\n", err) - } - - allNvidiaGPUStats := make([]*StatsData, numDevices) - - for i := 0; i < int(numDevices); i++ { - deviceInfo, deviceStatus, err := c.driver.DeviceInfoAndStatusByIndex(uint(i)) - if err != nil { - return nil, fmt.Errorf("nvidia nvml DeviceInfoAndStatusByIndex() error: %v\n", err) - } - - allNvidiaGPUStats[i] = &StatsData{ - DeviceData: &DeviceData{ - DeviceName: deviceInfo.Name, - UUID: deviceInfo.UUID, - MemoryMiB: deviceInfo.MemoryMiB, - PowerW: deviceInfo.PowerW, - BAR1MiB: deviceInfo.BAR1MiB, - }, - PowerUsageW: deviceStatus.PowerUsageW, - GPUUtilization: 
deviceStatus.GPUUtilization, - MemoryUtilization: deviceStatus.MemoryUtilization, - EncoderUtilization: deviceStatus.EncoderUtilization, - DecoderUtilization: deviceStatus.DecoderUtilization, - TemperatureC: deviceStatus.TemperatureC, - UsedMemoryMiB: deviceStatus.UsedMemoryMiB, - BAR1UsedMiB: deviceStatus.BAR1UsedMiB, - ECCErrorsL1Cache: deviceStatus.ECCErrorsL1Cache, - ECCErrorsL2Cache: deviceStatus.ECCErrorsL2Cache, - ECCErrorsDevice: deviceStatus.ECCErrorsDevice, - } - } - return allNvidiaGPUStats, nil -} diff --git a/devices/gpu/nvidia/nvml/client_test.go b/devices/gpu/nvidia/nvml/client_test.go deleted file mode 100644 index 23731f7b052e..000000000000 --- a/devices/gpu/nvidia/nvml/client_test.go +++ /dev/null @@ -1,399 +0,0 @@ -package nvml - -import ( - "errors" - "testing" - - "github.com/hashicorp/nomad/helper" - "github.com/stretchr/testify/require" -) - -type MockNVMLDriver struct { - systemDriverCallSuccessful bool - deviceCountCallSuccessful bool - deviceInfoByIndexCallSuccessful bool - deviceInfoAndStatusByIndexCallSuccessful bool - driverVersion string - devices []*DeviceInfo - deviceStatus []*DeviceStatus -} - -func (m *MockNVMLDriver) Initialize() error { - return nil -} - -func (m *MockNVMLDriver) Shutdown() error { - return nil -} - -func (m *MockNVMLDriver) SystemDriverVersion() (string, error) { - if !m.systemDriverCallSuccessful { - return "", errors.New("failed to get system driver") - } - return m.driverVersion, nil -} - -func (m *MockNVMLDriver) DeviceCount() (uint, error) { - if !m.deviceCountCallSuccessful { - return 0, errors.New("failed to get device length") - } - return uint(len(m.devices)), nil -} - -func (m *MockNVMLDriver) DeviceInfoByIndex(index uint) (*DeviceInfo, error) { - if index >= uint(len(m.devices)) { - return nil, errors.New("index is out of range") - } - if !m.deviceInfoByIndexCallSuccessful { - return nil, errors.New("failed to get device info by index") - } - return m.devices[index], nil -} - -func (m *MockNVMLDriver) 
DeviceInfoAndStatusByIndex(index uint) (*DeviceInfo, *DeviceStatus, error) { - if index >= uint(len(m.devices)) || index >= uint(len(m.deviceStatus)) { - return nil, nil, errors.New("index is out of range") - } - if !m.deviceInfoAndStatusByIndexCallSuccessful { - return nil, nil, errors.New("failed to get device info and status by index") - } - return m.devices[index], m.deviceStatus[index], nil -} - -func TestGetFingerprintDataFromNVML(t *testing.T) { - for _, testCase := range []struct { - Name string - DriverConfiguration *MockNVMLDriver - ExpectedError bool - ExpectedResult *FingerprintData - }{ - { - Name: "fail on systemDriverCallSuccessful", - ExpectedError: true, - ExpectedResult: nil, - DriverConfiguration: &MockNVMLDriver{ - systemDriverCallSuccessful: false, - deviceCountCallSuccessful: true, - deviceInfoByIndexCallSuccessful: true, - }, - }, - { - Name: "fail on deviceCountCallSuccessful", - ExpectedError: true, - ExpectedResult: nil, - DriverConfiguration: &MockNVMLDriver{ - systemDriverCallSuccessful: true, - deviceCountCallSuccessful: false, - deviceInfoByIndexCallSuccessful: true, - }, - }, - { - Name: "fail on deviceInfoByIndexCall", - ExpectedError: true, - ExpectedResult: nil, - DriverConfiguration: &MockNVMLDriver{ - systemDriverCallSuccessful: true, - deviceCountCallSuccessful: true, - deviceInfoByIndexCallSuccessful: false, - devices: []*DeviceInfo{ - { - UUID: "UUID1", - Name: helper.StringToPtr("ModelName1"), - MemoryMiB: helper.Uint64ToPtr(16), - PCIBusID: "busId", - PowerW: helper.UintToPtr(100), - BAR1MiB: helper.Uint64ToPtr(100), - PCIBandwidthMBPerS: helper.UintToPtr(100), - CoresClockMHz: helper.UintToPtr(100), - MemoryClockMHz: helper.UintToPtr(100), - }, { - UUID: "UUID2", - Name: helper.StringToPtr("ModelName2"), - MemoryMiB: helper.Uint64ToPtr(8), - PCIBusID: "busId", - PowerW: helper.UintToPtr(100), - BAR1MiB: helper.Uint64ToPtr(100), - PCIBandwidthMBPerS: helper.UintToPtr(100), - CoresClockMHz: helper.UintToPtr(100), - 
MemoryClockMHz: helper.UintToPtr(100), - }, - }, - }, - }, - { - Name: "successful outcome", - ExpectedError: false, - ExpectedResult: &FingerprintData{ - DriverVersion: "driverVersion", - Devices: []*FingerprintDeviceData{ - { - DeviceData: &DeviceData{ - DeviceName: helper.StringToPtr("ModelName1"), - UUID: "UUID1", - MemoryMiB: helper.Uint64ToPtr(16), - PowerW: helper.UintToPtr(100), - BAR1MiB: helper.Uint64ToPtr(100), - }, - PCIBusID: "busId1", - PCIBandwidthMBPerS: helper.UintToPtr(100), - CoresClockMHz: helper.UintToPtr(100), - MemoryClockMHz: helper.UintToPtr(100), - DisplayState: "Enabled", - PersistenceMode: "Enabled", - }, { - DeviceData: &DeviceData{ - DeviceName: helper.StringToPtr("ModelName2"), - UUID: "UUID2", - MemoryMiB: helper.Uint64ToPtr(8), - PowerW: helper.UintToPtr(200), - BAR1MiB: helper.Uint64ToPtr(200), - }, - PCIBusID: "busId2", - PCIBandwidthMBPerS: helper.UintToPtr(200), - CoresClockMHz: helper.UintToPtr(200), - MemoryClockMHz: helper.UintToPtr(200), - DisplayState: "Enabled", - PersistenceMode: "Enabled", - }, - }, - }, - DriverConfiguration: &MockNVMLDriver{ - systemDriverCallSuccessful: true, - deviceCountCallSuccessful: true, - deviceInfoByIndexCallSuccessful: true, - driverVersion: "driverVersion", - devices: []*DeviceInfo{ - { - UUID: "UUID1", - Name: helper.StringToPtr("ModelName1"), - MemoryMiB: helper.Uint64ToPtr(16), - PCIBusID: "busId1", - PowerW: helper.UintToPtr(100), - BAR1MiB: helper.Uint64ToPtr(100), - PCIBandwidthMBPerS: helper.UintToPtr(100), - CoresClockMHz: helper.UintToPtr(100), - MemoryClockMHz: helper.UintToPtr(100), - DisplayState: "Enabled", - PersistenceMode: "Enabled", - }, { - UUID: "UUID2", - Name: helper.StringToPtr("ModelName2"), - MemoryMiB: helper.Uint64ToPtr(8), - PCIBusID: "busId2", - PowerW: helper.UintToPtr(200), - BAR1MiB: helper.Uint64ToPtr(200), - PCIBandwidthMBPerS: helper.UintToPtr(200), - CoresClockMHz: helper.UintToPtr(200), - MemoryClockMHz: helper.UintToPtr(200), - DisplayState: "Enabled", - 
PersistenceMode: "Enabled", - }, - }, - }, - }, - } { - cli := nvmlClient{driver: testCase.DriverConfiguration} - fingerprintData, err := cli.GetFingerprintData() - if testCase.ExpectedError && err == nil { - t.Errorf("case '%s' : expected Error, but didn't get one", testCase.Name) - } - if !testCase.ExpectedError && err != nil { - t.Errorf("case '%s' : unexpected Error '%v'", testCase.Name, err) - } - require.New(t).Equal(testCase.ExpectedResult, fingerprintData) - } -} - -func TestGetStatsDataFromNVML(t *testing.T) { - for _, testCase := range []struct { - Name string - DriverConfiguration *MockNVMLDriver - ExpectedError bool - ExpectedResult []*StatsData - }{ - { - Name: "fail on deviceCountCallSuccessful", - ExpectedError: true, - ExpectedResult: nil, - DriverConfiguration: &MockNVMLDriver{ - systemDriverCallSuccessful: true, - deviceCountCallSuccessful: false, - deviceInfoByIndexCallSuccessful: true, - deviceInfoAndStatusByIndexCallSuccessful: true, - }, - }, - { - Name: "fail on DeviceInfoAndStatusByIndex call", - ExpectedError: true, - ExpectedResult: nil, - DriverConfiguration: &MockNVMLDriver{ - systemDriverCallSuccessful: true, - deviceCountCallSuccessful: true, - deviceInfoAndStatusByIndexCallSuccessful: false, - devices: []*DeviceInfo{ - { - UUID: "UUID1", - Name: helper.StringToPtr("ModelName1"), - MemoryMiB: helper.Uint64ToPtr(16), - PCIBusID: "busId1", - PowerW: helper.UintToPtr(100), - BAR1MiB: helper.Uint64ToPtr(100), - PCIBandwidthMBPerS: helper.UintToPtr(100), - CoresClockMHz: helper.UintToPtr(100), - MemoryClockMHz: helper.UintToPtr(100), - }, { - UUID: "UUID2", - Name: helper.StringToPtr("ModelName2"), - MemoryMiB: helper.Uint64ToPtr(8), - PCIBusID: "busId2", - PowerW: helper.UintToPtr(200), - BAR1MiB: helper.Uint64ToPtr(200), - PCIBandwidthMBPerS: helper.UintToPtr(200), - CoresClockMHz: helper.UintToPtr(200), - MemoryClockMHz: helper.UintToPtr(200), - }, - }, - deviceStatus: []*DeviceStatus{ - { - TemperatureC: helper.UintToPtr(1), - 
GPUUtilization: helper.UintToPtr(1), - MemoryUtilization: helper.UintToPtr(1), - EncoderUtilization: helper.UintToPtr(1), - DecoderUtilization: helper.UintToPtr(1), - UsedMemoryMiB: helper.Uint64ToPtr(1), - ECCErrorsL1Cache: helper.Uint64ToPtr(1), - ECCErrorsL2Cache: helper.Uint64ToPtr(1), - ECCErrorsDevice: helper.Uint64ToPtr(1), - PowerUsageW: helper.UintToPtr(1), - BAR1UsedMiB: helper.Uint64ToPtr(1), - }, - { - TemperatureC: helper.UintToPtr(2), - GPUUtilization: helper.UintToPtr(2), - MemoryUtilization: helper.UintToPtr(2), - EncoderUtilization: helper.UintToPtr(2), - DecoderUtilization: helper.UintToPtr(2), - UsedMemoryMiB: helper.Uint64ToPtr(2), - ECCErrorsL1Cache: helper.Uint64ToPtr(2), - ECCErrorsL2Cache: helper.Uint64ToPtr(2), - ECCErrorsDevice: helper.Uint64ToPtr(2), - PowerUsageW: helper.UintToPtr(2), - BAR1UsedMiB: helper.Uint64ToPtr(2), - }, - }, - }, - }, - { - Name: "successful outcome", - ExpectedError: false, - ExpectedResult: []*StatsData{ - { - DeviceData: &DeviceData{ - DeviceName: helper.StringToPtr("ModelName1"), - UUID: "UUID1", - MemoryMiB: helper.Uint64ToPtr(16), - PowerW: helper.UintToPtr(100), - BAR1MiB: helper.Uint64ToPtr(100), - }, - TemperatureC: helper.UintToPtr(1), - GPUUtilization: helper.UintToPtr(1), - MemoryUtilization: helper.UintToPtr(1), - EncoderUtilization: helper.UintToPtr(1), - DecoderUtilization: helper.UintToPtr(1), - UsedMemoryMiB: helper.Uint64ToPtr(1), - ECCErrorsL1Cache: helper.Uint64ToPtr(1), - ECCErrorsL2Cache: helper.Uint64ToPtr(1), - ECCErrorsDevice: helper.Uint64ToPtr(1), - PowerUsageW: helper.UintToPtr(1), - BAR1UsedMiB: helper.Uint64ToPtr(1), - }, - { - DeviceData: &DeviceData{ - DeviceName: helper.StringToPtr("ModelName2"), - UUID: "UUID2", - MemoryMiB: helper.Uint64ToPtr(8), - PowerW: helper.UintToPtr(200), - BAR1MiB: helper.Uint64ToPtr(200), - }, - TemperatureC: helper.UintToPtr(2), - GPUUtilization: helper.UintToPtr(2), - MemoryUtilization: helper.UintToPtr(2), - EncoderUtilization: helper.UintToPtr(2), - 
DecoderUtilization: helper.UintToPtr(2), - UsedMemoryMiB: helper.Uint64ToPtr(2), - ECCErrorsL1Cache: helper.Uint64ToPtr(2), - ECCErrorsL2Cache: helper.Uint64ToPtr(2), - ECCErrorsDevice: helper.Uint64ToPtr(2), - PowerUsageW: helper.UintToPtr(2), - BAR1UsedMiB: helper.Uint64ToPtr(2), - }, - }, - DriverConfiguration: &MockNVMLDriver{ - deviceCountCallSuccessful: true, - deviceInfoByIndexCallSuccessful: true, - deviceInfoAndStatusByIndexCallSuccessful: true, - devices: []*DeviceInfo{ - { - UUID: "UUID1", - Name: helper.StringToPtr("ModelName1"), - MemoryMiB: helper.Uint64ToPtr(16), - PCIBusID: "busId1", - PowerW: helper.UintToPtr(100), - BAR1MiB: helper.Uint64ToPtr(100), - PCIBandwidthMBPerS: helper.UintToPtr(100), - CoresClockMHz: helper.UintToPtr(100), - MemoryClockMHz: helper.UintToPtr(100), - }, { - UUID: "UUID2", - Name: helper.StringToPtr("ModelName2"), - MemoryMiB: helper.Uint64ToPtr(8), - PCIBusID: "busId2", - PowerW: helper.UintToPtr(200), - BAR1MiB: helper.Uint64ToPtr(200), - PCIBandwidthMBPerS: helper.UintToPtr(200), - CoresClockMHz: helper.UintToPtr(200), - MemoryClockMHz: helper.UintToPtr(200), - }, - }, - deviceStatus: []*DeviceStatus{ - { - TemperatureC: helper.UintToPtr(1), - GPUUtilization: helper.UintToPtr(1), - MemoryUtilization: helper.UintToPtr(1), - EncoderUtilization: helper.UintToPtr(1), - DecoderUtilization: helper.UintToPtr(1), - UsedMemoryMiB: helper.Uint64ToPtr(1), - ECCErrorsL1Cache: helper.Uint64ToPtr(1), - ECCErrorsL2Cache: helper.Uint64ToPtr(1), - ECCErrorsDevice: helper.Uint64ToPtr(1), - PowerUsageW: helper.UintToPtr(1), - BAR1UsedMiB: helper.Uint64ToPtr(1), - }, - { - TemperatureC: helper.UintToPtr(2), - GPUUtilization: helper.UintToPtr(2), - MemoryUtilization: helper.UintToPtr(2), - EncoderUtilization: helper.UintToPtr(2), - DecoderUtilization: helper.UintToPtr(2), - UsedMemoryMiB: helper.Uint64ToPtr(2), - ECCErrorsL1Cache: helper.Uint64ToPtr(2), - ECCErrorsL2Cache: helper.Uint64ToPtr(2), - ECCErrorsDevice: helper.Uint64ToPtr(2), - 
PowerUsageW: helper.UintToPtr(2), - BAR1UsedMiB: helper.Uint64ToPtr(2), - }, - }, - }, - }, - } { - cli := nvmlClient{driver: testCase.DriverConfiguration} - statsData, err := cli.GetStatsData() - if testCase.ExpectedError && err == nil { - t.Errorf("case '%s' : expected Error, but didn't get one", testCase.Name) - } - if !testCase.ExpectedError && err != nil { - t.Errorf("case '%s' : unexpected Error '%v'", testCase.Name, err) - } - require.New(t).Equal(testCase.ExpectedResult, statsData) - } -} diff --git a/devices/gpu/nvidia/nvml/driver_default.go b/devices/gpu/nvidia/nvml/driver_default.go deleted file mode 100644 index e67efa22eeaf..000000000000 --- a/devices/gpu/nvidia/nvml/driver_default.go +++ /dev/null @@ -1,33 +0,0 @@ -// +build !linux - -package nvml - -// Initialize nvml library by locating nvml shared object file and calling ldopen -func (n *nvmlDriver) Initialize() error { - return UnavailableLib -} - -// Shutdown stops any further interaction with nvml -func (n *nvmlDriver) Shutdown() error { - return UnavailableLib -} - -// SystemDriverVersion returns installed driver version -func (n *nvmlDriver) SystemDriverVersion() (string, error) { - return "", UnavailableLib -} - -// DeviceCount reports number of available GPU devices -func (n *nvmlDriver) DeviceCount() (uint, error) { - return 0, UnavailableLib -} - -// DeviceInfoByIndex returns DeviceInfo for index GPU in system device list -func (n *nvmlDriver) DeviceInfoByIndex(index uint) (*DeviceInfo, error) { - return nil, UnavailableLib -} - -// DeviceInfoByIndex returns DeviceInfo and DeviceStatus for index GPU in system device list -func (n *nvmlDriver) DeviceInfoAndStatusByIndex(index uint) (*DeviceInfo, *DeviceStatus, error) { - return nil, nil, UnavailableLib -} diff --git a/devices/gpu/nvidia/nvml/driver_linux.go b/devices/gpu/nvidia/nvml/driver_linux.go deleted file mode 100644 index bdd777561bcf..000000000000 --- a/devices/gpu/nvidia/nvml/driver_linux.go +++ /dev/null @@ -1,85 +0,0 @@ -package 
nvml - -import ( - "github.com/NVIDIA/gpu-monitoring-tools/bindings/go/nvml" -) - -// Initialize nvml library by locating nvml shared object file and calling ldopen -func (n *nvmlDriver) Initialize() error { - return nvml.Init() -} - -// Shutdown stops any further interaction with nvml -func (n *nvmlDriver) Shutdown() error { - return nvml.Shutdown() -} - -// SystemDriverVersion returns installed driver version -func (n *nvmlDriver) SystemDriverVersion() (string, error) { - return nvml.GetDriverVersion() -} - -// DeviceCount reports number of available GPU devices -func (n *nvmlDriver) DeviceCount() (uint, error) { - return nvml.GetDeviceCount() -} - -// DeviceInfoByIndex returns DeviceInfo for index GPU in system device list -func (n *nvmlDriver) DeviceInfoByIndex(index uint) (*DeviceInfo, error) { - device, err := nvml.NewDevice(index) - if err != nil { - return nil, err - } - deviceMode, err := device.GetDeviceMode() - if err != nil { - return nil, err - } - return &DeviceInfo{ - UUID: device.UUID, - Name: device.Model, - MemoryMiB: device.Memory, - PowerW: device.Power, - BAR1MiB: device.PCI.BAR1, - PCIBandwidthMBPerS: device.PCI.Bandwidth, - PCIBusID: device.PCI.BusID, - CoresClockMHz: device.Clocks.Cores, - MemoryClockMHz: device.Clocks.Memory, - DisplayState: deviceMode.DisplayInfo.Mode.String(), - PersistenceMode: deviceMode.Persistence.String(), - }, nil -} - -// DeviceInfoByIndex returns DeviceInfo and DeviceStatus for index GPU in system device list -func (n *nvmlDriver) DeviceInfoAndStatusByIndex(index uint) (*DeviceInfo, *DeviceStatus, error) { - device, err := nvml.NewDevice(index) - if err != nil { - return nil, nil, err - } - status, err := device.Status() - if err != nil { - return nil, nil, err - } - return &DeviceInfo{ - UUID: device.UUID, - Name: device.Model, - MemoryMiB: device.Memory, - PowerW: device.Power, - BAR1MiB: device.PCI.BAR1, - PCIBandwidthMBPerS: device.PCI.Bandwidth, - PCIBusID: device.PCI.BusID, - CoresClockMHz: 
device.Clocks.Cores, - MemoryClockMHz: device.Clocks.Memory, - }, &DeviceStatus{ - TemperatureC: status.Temperature, - GPUUtilization: status.Utilization.GPU, - MemoryUtilization: status.Utilization.Memory, - EncoderUtilization: status.Utilization.Encoder, - DecoderUtilization: status.Utilization.Decoder, - UsedMemoryMiB: status.Memory.Global.Used, - ECCErrorsL1Cache: status.Memory.ECCErrors.L1Cache, - ECCErrorsL2Cache: status.Memory.ECCErrors.L2Cache, - ECCErrorsDevice: status.Memory.ECCErrors.Device, - PowerUsageW: status.Power, - BAR1UsedMiB: status.PCI.BAR1Used, - }, nil -} diff --git a/devices/gpu/nvidia/nvml/shared.go b/devices/gpu/nvidia/nvml/shared.go deleted file mode 100644 index a0bb04d22234..000000000000 --- a/devices/gpu/nvidia/nvml/shared.go +++ /dev/null @@ -1,61 +0,0 @@ -package nvml - -import "errors" - -var ( - // UnavailableLib is returned when the nvml library could not be loaded. - UnavailableLib = errors.New("could not load NVML library") -) - -// nvmlDriver implements NvmlDriver -// Users are required to call Initialize method before using any other methods -type nvmlDriver struct{} - -// NvmlDriver represents set of methods to query nvml library -type NvmlDriver interface { - Initialize() error - Shutdown() error - SystemDriverVersion() (string, error) - DeviceCount() (uint, error) - DeviceInfoByIndex(uint) (*DeviceInfo, error) - DeviceInfoAndStatusByIndex(uint) (*DeviceInfo, *DeviceStatus, error) -} - -// DeviceInfo represents nvml device data -// this struct is returned by NvmlDriver DeviceInfoByIndex and -// DeviceInfoAndStatusByIndex methods -type DeviceInfo struct { - // The following fields are guaranteed to be retrieved from nvml - UUID string - PCIBusID string - DisplayState string - PersistenceMode string - - // The following fields can be nil after call to nvml, because nvml was - // not able to retrieve this fields for specific nvidia card - Name *string - MemoryMiB *uint64 - PowerW *uint - BAR1MiB *uint64 - PCIBandwidthMBPerS 
*uint - CoresClockMHz *uint - MemoryClockMHz *uint -} - -// DeviceStatus represents nvml device status -// this struct is returned by NvmlDriver DeviceInfoAndStatusByIndex method -type DeviceStatus struct { - // The following fields can be nil after call to nvml, because nvml was - // not able to retrieve this fields for specific nvidia card - PowerUsageW *uint - TemperatureC *uint - GPUUtilization *uint // % - MemoryUtilization *uint // % - EncoderUtilization *uint // % - DecoderUtilization *uint // % - BAR1UsedMiB *uint64 - UsedMemoryMiB *uint64 - ECCErrorsL1Cache *uint64 - ECCErrorsL2Cache *uint64 - ECCErrorsDevice *uint64 -} diff --git a/devices/gpu/nvidia/stats.go b/devices/gpu/nvidia/stats.go deleted file mode 100644 index c6c447757916..000000000000 --- a/devices/gpu/nvidia/stats.go +++ /dev/null @@ -1,325 +0,0 @@ -package nvidia - -import ( - "context" - "time" - - "github.com/hashicorp/nomad/devices/gpu/nvidia/nvml" - "github.com/hashicorp/nomad/helper" - "github.com/hashicorp/nomad/plugins/device" - "github.com/hashicorp/nomad/plugins/shared/structs" -) - -const ( - // Attribute names for reporting stats output - PowerUsageAttr = "Power usage" - PowerUsageUnit = "W" - PowerUsageDesc = "Power usage for this GPU in watts and " + - "its associated circuitry (e.g. memory) / Maximum GPU Power" - GPUUtilizationAttr = "GPU utilization" - GPUUtilizationUnit = "%" - GPUUtilizationDesc = "Percent of time over the past sample period " + - "during which one or more kernels were executing on the GPU." 
- MemoryUtilizationAttr = "Memory utilization" - MemoryUtilizationUnit = "%" - MemoryUtilizationDesc = "Percentage of bandwidth used during the past sample period" - EncoderUtilizationAttr = "Encoder utilization" - EncoderUtilizationUnit = "%" - EncoderUtilizationDesc = "Percent of time over the past sample period " + - "during which GPU Encoder was used" - DecoderUtilizationAttr = "Decoder utilization" - DecoderUtilizationUnit = "%" - DecoderUtilizationDesc = "Percent of time over the past sample period " + - "during which GPU Decoder was used" - TemperatureAttr = "Temperature" - TemperatureUnit = "C" // Celsius degrees - TemperatureDesc = "Temperature of the Unit" - MemoryStateAttr = "Memory state" - MemoryStateUnit = "MiB" // Mebibytes - MemoryStateDesc = "UsedMemory / TotalMemory" - BAR1StateAttr = "BAR1 buffer state" - BAR1StateUnit = "MiB" // Mebibytes - BAR1StateDesc = "UsedBAR1 / TotalBAR1" - ECCErrorsL1CacheAttr = "ECC L1 errors" - ECCErrorsL1CacheUnit = "#" // number of errors - ECCErrorsL1CacheDesc = "Requested L1Cache error counter for the device" - ECCErrorsL2CacheAttr = "ECC L2 errors" - ECCErrorsL2CacheUnit = "#" // number of errors - ECCErrorsL2CacheDesc = "Requested L2Cache error counter for the device" - ECCErrorsDeviceAttr = "ECC memory errors" - ECCErrorsDeviceUnit = "#" // number of errors - ECCErrorsDeviceDesc = "Requested memory error counter for the device" -) - -// stats is the long running goroutine that streams device statistics -func (d *NvidiaDevice) stats(ctx context.Context, stats chan<- *device.StatsResponse, interval time.Duration) { - defer close(stats) - - if d.initErr != nil { - if d.initErr.Error() != nvml.UnavailableLib.Error() { - d.logger.Error("exiting stats due to problems with NVML loading", "error", d.initErr) - stats <- device.NewStatsError(d.initErr) - } - - return - } - - // Create a timer that will fire immediately for the first detection - ticker := time.NewTimer(0) - - for { - select { - case <-ctx.Done(): - return 
- case <-ticker.C: - ticker.Reset(interval) - } - - d.writeStatsToChannel(stats, time.Now()) - } -} - -// filterStatsByID accepts list of StatsData and set of IDs -// this function would return entries from StatsData with IDs found in the set -func filterStatsByID(stats []*nvml.StatsData, ids map[string]struct{}) []*nvml.StatsData { - var filteredStats []*nvml.StatsData - for _, statsItem := range stats { - if _, ok := ids[statsItem.UUID]; ok { - filteredStats = append(filteredStats, statsItem) - } - } - return filteredStats -} - -// writeStatsToChannel collects StatsData from NVML backend, groups StatsData -// by DeviceName attribute, populates DeviceGroupStats structure for every group -// and sends data over provided channel -func (d *NvidiaDevice) writeStatsToChannel(stats chan<- *device.StatsResponse, timestamp time.Time) { - statsData, err := d.nvmlClient.GetStatsData() - if err != nil { - d.logger.Error("failed to get nvidia stats", "error", err) - stats <- &device.StatsResponse{ - Error: err, - } - return - } - - // filter only stats from devices that are stored in NvidiaDevice struct - d.deviceLock.RLock() - statsData = filterStatsByID(statsData, d.devices) - d.deviceLock.RUnlock() - - // group stats by DeviceName struct field - statsListByDeviceName := make(map[string][]*nvml.StatsData) - for _, statsItem := range statsData { - deviceName := statsItem.DeviceName - if deviceName == nil { - // nvml driver was not able to detect device name. 
This kind - // of devices are placed to single group with 'notAvailable' name - notAvailableCopy := notAvailable - deviceName = &notAvailableCopy - } - - statsListByDeviceName[*deviceName] = append(statsListByDeviceName[*deviceName], statsItem) - } - - // place data device.DeviceGroupStats struct for every group of stats - deviceGroupsStats := make([]*device.DeviceGroupStats, 0, len(statsListByDeviceName)) - for groupName, groupStats := range statsListByDeviceName { - deviceGroupsStats = append(deviceGroupsStats, statsForGroup(groupName, groupStats, timestamp)) - } - - stats <- &device.StatsResponse{ - Groups: deviceGroupsStats, - } -} - -func newNotAvailableDeviceStats(unit, desc string) *structs.StatValue { - return &structs.StatValue{Unit: unit, Desc: desc, StringVal: helper.StringToPtr(notAvailable)} -} - -// statsForGroup is a helper function that populates device.DeviceGroupStats -// for given groupName with groupStats list -func statsForGroup(groupName string, groupStats []*nvml.StatsData, timestamp time.Time) *device.DeviceGroupStats { - instanceStats := make(map[string]*device.DeviceStats) - for _, statsItem := range groupStats { - instanceStats[statsItem.UUID] = statsForItem(statsItem, timestamp) - } - - return &device.DeviceGroupStats{ - Vendor: vendor, - Type: deviceType, - Name: groupName, - InstanceStats: instanceStats, - } -} - -// statsForItem is a helper function that populates device.DeviceStats for given -// nvml.StatsData -func statsForItem(statsItem *nvml.StatsData, timestamp time.Time) *device.DeviceStats { - // nvml.StatsData holds pointers to values that can be nil - // In case they are nil return stats with 'notAvailable' constant - var ( - powerUsageStat *structs.StatValue - GPUUtilizationStat *structs.StatValue - memoryUtilizationStat *structs.StatValue - encoderUtilizationStat *structs.StatValue - decoderUtilizationStat *structs.StatValue - temperatureStat *structs.StatValue - memoryStateStat *structs.StatValue - BAR1StateStat 
*structs.StatValue - ECCErrorsL1CacheStat *structs.StatValue - ECCErrorsL2CacheStat *structs.StatValue - ECCErrorsDeviceStat *structs.StatValue - ) - - if statsItem.PowerUsageW == nil || statsItem.PowerW == nil { - powerUsageStat = newNotAvailableDeviceStats(PowerUsageUnit, PowerUsageDesc) - } else { - powerUsageStat = &structs.StatValue{ - Unit: PowerUsageUnit, - Desc: PowerUsageDesc, - IntNumeratorVal: helper.Int64ToPtr(int64(*statsItem.PowerUsageW)), - IntDenominatorVal: uintToInt64Ptr(statsItem.PowerW), - } - } - - if statsItem.GPUUtilization == nil { - GPUUtilizationStat = newNotAvailableDeviceStats(GPUUtilizationUnit, GPUUtilizationDesc) - } else { - GPUUtilizationStat = &structs.StatValue{ - Unit: GPUUtilizationUnit, - Desc: GPUUtilizationDesc, - IntNumeratorVal: uintToInt64Ptr(statsItem.GPUUtilization), - } - } - - if statsItem.MemoryUtilization == nil { - memoryUtilizationStat = newNotAvailableDeviceStats(MemoryUtilizationUnit, MemoryUtilizationDesc) - } else { - memoryUtilizationStat = &structs.StatValue{ - Unit: MemoryUtilizationUnit, - Desc: MemoryUtilizationDesc, - IntNumeratorVal: uintToInt64Ptr(statsItem.MemoryUtilization), - } - } - - if statsItem.EncoderUtilization == nil { - encoderUtilizationStat = newNotAvailableDeviceStats(EncoderUtilizationUnit, EncoderUtilizationDesc) - } else { - encoderUtilizationStat = &structs.StatValue{ - Unit: EncoderUtilizationUnit, - Desc: EncoderUtilizationDesc, - IntNumeratorVal: uintToInt64Ptr(statsItem.EncoderUtilization), - } - } - - if statsItem.DecoderUtilization == nil { - decoderUtilizationStat = newNotAvailableDeviceStats(DecoderUtilizationUnit, DecoderUtilizationDesc) - } else { - decoderUtilizationStat = &structs.StatValue{ - Unit: DecoderUtilizationUnit, - Desc: DecoderUtilizationDesc, - IntNumeratorVal: uintToInt64Ptr(statsItem.DecoderUtilization), - } - } - - if statsItem.TemperatureC == nil { - temperatureStat = newNotAvailableDeviceStats(TemperatureUnit, TemperatureDesc) - } else { - temperatureStat = 
&structs.StatValue{ - Unit: TemperatureUnit, - Desc: TemperatureDesc, - IntNumeratorVal: uintToInt64Ptr(statsItem.TemperatureC), - } - } - - if statsItem.UsedMemoryMiB == nil || statsItem.MemoryMiB == nil { - memoryStateStat = newNotAvailableDeviceStats(MemoryStateUnit, MemoryStateDesc) - } else { - memoryStateStat = &structs.StatValue{ - Unit: MemoryStateUnit, - Desc: MemoryStateDesc, - IntNumeratorVal: uint64ToInt64Ptr(statsItem.UsedMemoryMiB), - IntDenominatorVal: uint64ToInt64Ptr(statsItem.MemoryMiB), - } - } - - if statsItem.BAR1UsedMiB == nil || statsItem.BAR1MiB == nil { - BAR1StateStat = newNotAvailableDeviceStats(BAR1StateUnit, BAR1StateDesc) - } else { - BAR1StateStat = &structs.StatValue{ - Unit: BAR1StateUnit, - Desc: BAR1StateDesc, - IntNumeratorVal: uint64ToInt64Ptr(statsItem.BAR1UsedMiB), - IntDenominatorVal: uint64ToInt64Ptr(statsItem.BAR1MiB), - } - } - - if statsItem.ECCErrorsL1Cache == nil { - ECCErrorsL1CacheStat = newNotAvailableDeviceStats(ECCErrorsL1CacheUnit, ECCErrorsL1CacheDesc) - } else { - ECCErrorsL1CacheStat = &structs.StatValue{ - Unit: ECCErrorsL1CacheUnit, - Desc: ECCErrorsL1CacheDesc, - IntNumeratorVal: uint64ToInt64Ptr(statsItem.ECCErrorsL1Cache), - } - } - - if statsItem.ECCErrorsL2Cache == nil { - ECCErrorsL2CacheStat = newNotAvailableDeviceStats(ECCErrorsL2CacheUnit, ECCErrorsL2CacheDesc) - } else { - ECCErrorsL2CacheStat = &structs.StatValue{ - Unit: ECCErrorsL2CacheUnit, - Desc: ECCErrorsL2CacheDesc, - IntNumeratorVal: uint64ToInt64Ptr(statsItem.ECCErrorsL2Cache), - } - } - - if statsItem.ECCErrorsDevice == nil { - ECCErrorsDeviceStat = newNotAvailableDeviceStats(ECCErrorsDeviceUnit, ECCErrorsDeviceDesc) - } else { - ECCErrorsDeviceStat = &structs.StatValue{ - Unit: ECCErrorsDeviceUnit, - Desc: ECCErrorsDeviceDesc, - IntNumeratorVal: uint64ToInt64Ptr(statsItem.ECCErrorsDevice), - } - } - return &device.DeviceStats{ - Summary: memoryStateStat, - Stats: &structs.StatObject{ - Attributes: map[string]*structs.StatValue{ - 
PowerUsageAttr: powerUsageStat, - GPUUtilizationAttr: GPUUtilizationStat, - MemoryUtilizationAttr: memoryUtilizationStat, - EncoderUtilizationAttr: encoderUtilizationStat, - DecoderUtilizationAttr: decoderUtilizationStat, - TemperatureAttr: temperatureStat, - MemoryStateAttr: memoryStateStat, - BAR1StateAttr: BAR1StateStat, - ECCErrorsL1CacheAttr: ECCErrorsL1CacheStat, - ECCErrorsL2CacheAttr: ECCErrorsL2CacheStat, - ECCErrorsDeviceAttr: ECCErrorsDeviceStat, - }, - }, - Timestamp: timestamp, - } -} - -func uintToInt64Ptr(u *uint) *int64 { - if u == nil { - return nil - } - - v := int64(*u) - return &v -} - -func uint64ToInt64Ptr(u *uint64) *int64 { - if u == nil { - return nil - } - - v := int64(*u) - return &v -} diff --git a/devices/gpu/nvidia/stats_test.go b/devices/gpu/nvidia/stats_test.go deleted file mode 100644 index f6221e0f4801..000000000000 --- a/devices/gpu/nvidia/stats_test.go +++ /dev/null @@ -1,3041 +0,0 @@ -package nvidia - -import ( - "errors" - "sort" - "testing" - "time" - - hclog "github.com/hashicorp/go-hclog" - "github.com/hashicorp/nomad/devices/gpu/nvidia/nvml" - "github.com/hashicorp/nomad/helper" - "github.com/hashicorp/nomad/plugins/device" - "github.com/hashicorp/nomad/plugins/shared/structs" - "github.com/stretchr/testify/require" -) - -func TestFilterStatsByID(t *testing.T) { - for _, testCase := range []struct { - Name string - ProvidedStats []*nvml.StatsData - ProvidedIDs map[string]struct{} - ExpectedResult []*nvml.StatsData - }{ - { - Name: "All ids are in the map", - ProvidedStats: []*nvml.StatsData{ - { - DeviceData: &nvml.DeviceData{ - UUID: "UUID1", - DeviceName: helper.StringToPtr("DeviceName1"), - MemoryMiB: helper.Uint64ToPtr(1), - PowerW: helper.UintToPtr(2), - BAR1MiB: helper.Uint64ToPtr(256), - }, - PowerUsageW: helper.UintToPtr(1), - GPUUtilization: helper.UintToPtr(1), - MemoryUtilization: helper.UintToPtr(1), - EncoderUtilization: helper.UintToPtr(1), - DecoderUtilization: helper.UintToPtr(1), - TemperatureC: 
helper.UintToPtr(1), - UsedMemoryMiB: helper.Uint64ToPtr(1), - ECCErrorsL1Cache: helper.Uint64ToPtr(100), - ECCErrorsL2Cache: helper.Uint64ToPtr(100), - ECCErrorsDevice: helper.Uint64ToPtr(100), - }, - { - DeviceData: &nvml.DeviceData{ - UUID: "UUID2", - DeviceName: helper.StringToPtr("DeviceName1"), - MemoryMiB: helper.Uint64ToPtr(1), - PowerW: helper.UintToPtr(2), - BAR1MiB: helper.Uint64ToPtr(256), - }, - PowerUsageW: helper.UintToPtr(1), - GPUUtilization: helper.UintToPtr(1), - MemoryUtilization: helper.UintToPtr(1), - EncoderUtilization: helper.UintToPtr(1), - DecoderUtilization: helper.UintToPtr(1), - TemperatureC: helper.UintToPtr(1), - UsedMemoryMiB: helper.Uint64ToPtr(1), - ECCErrorsL1Cache: helper.Uint64ToPtr(100), - ECCErrorsL2Cache: helper.Uint64ToPtr(100), - ECCErrorsDevice: helper.Uint64ToPtr(100), - }, - { - DeviceData: &nvml.DeviceData{ - UUID: "UUID3", - DeviceName: helper.StringToPtr("DeviceName1"), - MemoryMiB: helper.Uint64ToPtr(1), - PowerW: helper.UintToPtr(2), - BAR1MiB: helper.Uint64ToPtr(256), - }, - PowerUsageW: helper.UintToPtr(1), - GPUUtilization: helper.UintToPtr(1), - MemoryUtilization: helper.UintToPtr(1), - EncoderUtilization: helper.UintToPtr(1), - DecoderUtilization: helper.UintToPtr(1), - TemperatureC: helper.UintToPtr(1), - UsedMemoryMiB: helper.Uint64ToPtr(1), - ECCErrorsL1Cache: helper.Uint64ToPtr(100), - ECCErrorsL2Cache: helper.Uint64ToPtr(100), - ECCErrorsDevice: helper.Uint64ToPtr(100), - }, - }, - ProvidedIDs: map[string]struct{}{ - "UUID1": {}, - "UUID2": {}, - "UUID3": {}, - }, - ExpectedResult: []*nvml.StatsData{ - { - DeviceData: &nvml.DeviceData{ - UUID: "UUID1", - DeviceName: helper.StringToPtr("DeviceName1"), - MemoryMiB: helper.Uint64ToPtr(1), - PowerW: helper.UintToPtr(2), - BAR1MiB: helper.Uint64ToPtr(256), - }, - PowerUsageW: helper.UintToPtr(1), - GPUUtilization: helper.UintToPtr(1), - MemoryUtilization: helper.UintToPtr(1), - EncoderUtilization: helper.UintToPtr(1), - DecoderUtilization: helper.UintToPtr(1), 
- TemperatureC: helper.UintToPtr(1), - UsedMemoryMiB: helper.Uint64ToPtr(1), - ECCErrorsL1Cache: helper.Uint64ToPtr(100), - ECCErrorsL2Cache: helper.Uint64ToPtr(100), - ECCErrorsDevice: helper.Uint64ToPtr(100), - }, - { - DeviceData: &nvml.DeviceData{ - UUID: "UUID2", - DeviceName: helper.StringToPtr("DeviceName1"), - MemoryMiB: helper.Uint64ToPtr(1), - PowerW: helper.UintToPtr(2), - BAR1MiB: helper.Uint64ToPtr(256), - }, - PowerUsageW: helper.UintToPtr(1), - GPUUtilization: helper.UintToPtr(1), - MemoryUtilization: helper.UintToPtr(1), - EncoderUtilization: helper.UintToPtr(1), - DecoderUtilization: helper.UintToPtr(1), - TemperatureC: helper.UintToPtr(1), - UsedMemoryMiB: helper.Uint64ToPtr(1), - ECCErrorsL1Cache: helper.Uint64ToPtr(100), - ECCErrorsL2Cache: helper.Uint64ToPtr(100), - ECCErrorsDevice: helper.Uint64ToPtr(100), - }, - { - DeviceData: &nvml.DeviceData{ - UUID: "UUID3", - DeviceName: helper.StringToPtr("DeviceName1"), - MemoryMiB: helper.Uint64ToPtr(1), - PowerW: helper.UintToPtr(2), - BAR1MiB: helper.Uint64ToPtr(256), - }, - PowerUsageW: helper.UintToPtr(1), - GPUUtilization: helper.UintToPtr(1), - MemoryUtilization: helper.UintToPtr(1), - EncoderUtilization: helper.UintToPtr(1), - DecoderUtilization: helper.UintToPtr(1), - TemperatureC: helper.UintToPtr(1), - UsedMemoryMiB: helper.Uint64ToPtr(1), - ECCErrorsL1Cache: helper.Uint64ToPtr(100), - ECCErrorsL2Cache: helper.Uint64ToPtr(100), - ECCErrorsDevice: helper.Uint64ToPtr(100), - }, - }, - }, - { - Name: "Odd are not provided in the map", - ProvidedStats: []*nvml.StatsData{ - { - DeviceData: &nvml.DeviceData{ - UUID: "UUID1", - DeviceName: helper.StringToPtr("DeviceName1"), - MemoryMiB: helper.Uint64ToPtr(1), - PowerW: helper.UintToPtr(2), - BAR1MiB: helper.Uint64ToPtr(256), - }, - PowerUsageW: helper.UintToPtr(1), - GPUUtilization: helper.UintToPtr(1), - MemoryUtilization: helper.UintToPtr(1), - EncoderUtilization: helper.UintToPtr(1), - DecoderUtilization: helper.UintToPtr(1), - TemperatureC: 
helper.UintToPtr(1), - UsedMemoryMiB: helper.Uint64ToPtr(1), - ECCErrorsL1Cache: helper.Uint64ToPtr(100), - ECCErrorsL2Cache: helper.Uint64ToPtr(100), - ECCErrorsDevice: helper.Uint64ToPtr(100), - }, - { - DeviceData: &nvml.DeviceData{ - UUID: "UUID2", - DeviceName: helper.StringToPtr("DeviceName1"), - MemoryMiB: helper.Uint64ToPtr(1), - PowerW: helper.UintToPtr(2), - BAR1MiB: helper.Uint64ToPtr(256), - }, - PowerUsageW: helper.UintToPtr(1), - GPUUtilization: helper.UintToPtr(1), - MemoryUtilization: helper.UintToPtr(1), - EncoderUtilization: helper.UintToPtr(1), - DecoderUtilization: helper.UintToPtr(1), - TemperatureC: helper.UintToPtr(1), - UsedMemoryMiB: helper.Uint64ToPtr(1), - ECCErrorsL1Cache: helper.Uint64ToPtr(100), - ECCErrorsL2Cache: helper.Uint64ToPtr(100), - ECCErrorsDevice: helper.Uint64ToPtr(100), - }, - { - DeviceData: &nvml.DeviceData{ - UUID: "UUID3", - DeviceName: helper.StringToPtr("DeviceName1"), - MemoryMiB: helper.Uint64ToPtr(1), - PowerW: helper.UintToPtr(2), - BAR1MiB: helper.Uint64ToPtr(256), - }, - PowerUsageW: helper.UintToPtr(1), - GPUUtilization: helper.UintToPtr(1), - MemoryUtilization: helper.UintToPtr(1), - EncoderUtilization: helper.UintToPtr(1), - DecoderUtilization: helper.UintToPtr(1), - TemperatureC: helper.UintToPtr(1), - UsedMemoryMiB: helper.Uint64ToPtr(1), - ECCErrorsL1Cache: helper.Uint64ToPtr(100), - ECCErrorsL2Cache: helper.Uint64ToPtr(100), - ECCErrorsDevice: helper.Uint64ToPtr(100), - }, - }, - ProvidedIDs: map[string]struct{}{ - "UUID2": {}, - }, - ExpectedResult: []*nvml.StatsData{ - { - DeviceData: &nvml.DeviceData{ - UUID: "UUID2", - DeviceName: helper.StringToPtr("DeviceName1"), - MemoryMiB: helper.Uint64ToPtr(1), - PowerW: helper.UintToPtr(2), - BAR1MiB: helper.Uint64ToPtr(256), - }, - PowerUsageW: helper.UintToPtr(1), - GPUUtilization: helper.UintToPtr(1), - MemoryUtilization: helper.UintToPtr(1), - EncoderUtilization: helper.UintToPtr(1), - DecoderUtilization: helper.UintToPtr(1), - TemperatureC: 
helper.UintToPtr(1), - UsedMemoryMiB: helper.Uint64ToPtr(1), - ECCErrorsL1Cache: helper.Uint64ToPtr(100), - ECCErrorsL2Cache: helper.Uint64ToPtr(100), - ECCErrorsDevice: helper.Uint64ToPtr(100), - }, - }, - }, - { - Name: "Even are not provided in the map", - ProvidedStats: []*nvml.StatsData{ - { - DeviceData: &nvml.DeviceData{ - UUID: "UUID1", - DeviceName: helper.StringToPtr("DeviceName1"), - MemoryMiB: helper.Uint64ToPtr(1), - PowerW: helper.UintToPtr(2), - BAR1MiB: helper.Uint64ToPtr(256), - }, - PowerUsageW: helper.UintToPtr(1), - GPUUtilization: helper.UintToPtr(1), - MemoryUtilization: helper.UintToPtr(1), - EncoderUtilization: helper.UintToPtr(1), - DecoderUtilization: helper.UintToPtr(1), - TemperatureC: helper.UintToPtr(1), - UsedMemoryMiB: helper.Uint64ToPtr(1), - ECCErrorsL1Cache: helper.Uint64ToPtr(100), - ECCErrorsL2Cache: helper.Uint64ToPtr(100), - ECCErrorsDevice: helper.Uint64ToPtr(100), - }, - { - DeviceData: &nvml.DeviceData{ - UUID: "UUID2", - DeviceName: helper.StringToPtr("DeviceName1"), - MemoryMiB: helper.Uint64ToPtr(1), - PowerW: helper.UintToPtr(2), - BAR1MiB: helper.Uint64ToPtr(256), - }, - PowerUsageW: helper.UintToPtr(1), - GPUUtilization: helper.UintToPtr(1), - MemoryUtilization: helper.UintToPtr(1), - EncoderUtilization: helper.UintToPtr(1), - DecoderUtilization: helper.UintToPtr(1), - TemperatureC: helper.UintToPtr(1), - UsedMemoryMiB: helper.Uint64ToPtr(1), - ECCErrorsL1Cache: helper.Uint64ToPtr(100), - ECCErrorsL2Cache: helper.Uint64ToPtr(100), - ECCErrorsDevice: helper.Uint64ToPtr(100), - }, - { - DeviceData: &nvml.DeviceData{ - UUID: "UUID3", - DeviceName: helper.StringToPtr("DeviceName1"), - MemoryMiB: helper.Uint64ToPtr(1), - PowerW: helper.UintToPtr(2), - BAR1MiB: helper.Uint64ToPtr(256), - }, - PowerUsageW: helper.UintToPtr(1), - GPUUtilization: helper.UintToPtr(1), - MemoryUtilization: helper.UintToPtr(1), - EncoderUtilization: helper.UintToPtr(1), - DecoderUtilization: helper.UintToPtr(1), - TemperatureC: 
helper.UintToPtr(1), - UsedMemoryMiB: helper.Uint64ToPtr(1), - ECCErrorsL1Cache: helper.Uint64ToPtr(100), - ECCErrorsL2Cache: helper.Uint64ToPtr(100), - ECCErrorsDevice: helper.Uint64ToPtr(100), - }, - }, - ProvidedIDs: map[string]struct{}{ - "UUID1": {}, - "UUID3": {}, - }, - ExpectedResult: []*nvml.StatsData{ - { - DeviceData: &nvml.DeviceData{ - UUID: "UUID1", - DeviceName: helper.StringToPtr("DeviceName1"), - MemoryMiB: helper.Uint64ToPtr(1), - PowerW: helper.UintToPtr(2), - BAR1MiB: helper.Uint64ToPtr(256), - }, - PowerUsageW: helper.UintToPtr(1), - GPUUtilization: helper.UintToPtr(1), - MemoryUtilization: helper.UintToPtr(1), - EncoderUtilization: helper.UintToPtr(1), - DecoderUtilization: helper.UintToPtr(1), - TemperatureC: helper.UintToPtr(1), - UsedMemoryMiB: helper.Uint64ToPtr(1), - ECCErrorsL1Cache: helper.Uint64ToPtr(100), - ECCErrorsL2Cache: helper.Uint64ToPtr(100), - ECCErrorsDevice: helper.Uint64ToPtr(100), - }, - { - DeviceData: &nvml.DeviceData{ - UUID: "UUID3", - DeviceName: helper.StringToPtr("DeviceName1"), - MemoryMiB: helper.Uint64ToPtr(1), - PowerW: helper.UintToPtr(2), - BAR1MiB: helper.Uint64ToPtr(256), - }, - PowerUsageW: helper.UintToPtr(1), - GPUUtilization: helper.UintToPtr(1), - MemoryUtilization: helper.UintToPtr(1), - EncoderUtilization: helper.UintToPtr(1), - DecoderUtilization: helper.UintToPtr(1), - TemperatureC: helper.UintToPtr(1), - UsedMemoryMiB: helper.Uint64ToPtr(1), - ECCErrorsL1Cache: helper.Uint64ToPtr(100), - ECCErrorsL2Cache: helper.Uint64ToPtr(100), - ECCErrorsDevice: helper.Uint64ToPtr(100), - }, - }, - }, - { - Name: "No Stats were provided", - ProvidedIDs: map[string]struct{}{ - "UUID1": {}, - "UUID2": {}, - "UUID3": {}, - }, - }, - { - Name: "No Ids were provided", - ProvidedStats: []*nvml.StatsData{ - { - DeviceData: &nvml.DeviceData{ - UUID: "UUID1", - DeviceName: helper.StringToPtr("DeviceName1"), - MemoryMiB: helper.Uint64ToPtr(1), - PowerW: helper.UintToPtr(2), - BAR1MiB: helper.Uint64ToPtr(256), - }, - 
PowerUsageW: helper.UintToPtr(1), - GPUUtilization: helper.UintToPtr(1), - MemoryUtilization: helper.UintToPtr(1), - EncoderUtilization: helper.UintToPtr(1), - DecoderUtilization: helper.UintToPtr(1), - TemperatureC: helper.UintToPtr(1), - UsedMemoryMiB: helper.Uint64ToPtr(1), - ECCErrorsL1Cache: helper.Uint64ToPtr(100), - ECCErrorsL2Cache: helper.Uint64ToPtr(100), - ECCErrorsDevice: helper.Uint64ToPtr(100), - }, - { - DeviceData: &nvml.DeviceData{ - UUID: "UUID2", - DeviceName: helper.StringToPtr("DeviceName1"), - MemoryMiB: helper.Uint64ToPtr(1), - PowerW: helper.UintToPtr(2), - BAR1MiB: helper.Uint64ToPtr(256), - }, - PowerUsageW: helper.UintToPtr(1), - GPUUtilization: helper.UintToPtr(1), - MemoryUtilization: helper.UintToPtr(1), - EncoderUtilization: helper.UintToPtr(1), - DecoderUtilization: helper.UintToPtr(1), - TemperatureC: helper.UintToPtr(1), - UsedMemoryMiB: helper.Uint64ToPtr(1), - ECCErrorsL1Cache: helper.Uint64ToPtr(100), - ECCErrorsL2Cache: helper.Uint64ToPtr(100), - ECCErrorsDevice: helper.Uint64ToPtr(100), - }, - { - DeviceData: &nvml.DeviceData{ - UUID: "UUID3", - DeviceName: helper.StringToPtr("DeviceName1"), - MemoryMiB: helper.Uint64ToPtr(1), - PowerW: helper.UintToPtr(2), - BAR1MiB: helper.Uint64ToPtr(256), - }, - PowerUsageW: helper.UintToPtr(1), - GPUUtilization: helper.UintToPtr(1), - MemoryUtilization: helper.UintToPtr(1), - EncoderUtilization: helper.UintToPtr(1), - DecoderUtilization: helper.UintToPtr(1), - TemperatureC: helper.UintToPtr(1), - UsedMemoryMiB: helper.Uint64ToPtr(1), - ECCErrorsL1Cache: helper.Uint64ToPtr(100), - ECCErrorsL2Cache: helper.Uint64ToPtr(100), - ECCErrorsDevice: helper.Uint64ToPtr(100), - }, - }, - }, - } { - actualResult := filterStatsByID(testCase.ProvidedStats, testCase.ProvidedIDs) - require.New(t).Equal(testCase.ExpectedResult, actualResult) - } -} - -func TestStatsForItem(t *testing.T) { - for _, testCase := range []struct { - Name string - Timestamp time.Time - ItemStat *nvml.StatsData - ExpectedResult 
*device.DeviceStats - }{ - { - Name: "All fields in ItemStat are not nil", - Timestamp: time.Date(1974, time.May, 19, 1, 2, 3, 4, time.UTC), - ItemStat: &nvml.StatsData{ - DeviceData: &nvml.DeviceData{ - UUID: "UUID1", - DeviceName: helper.StringToPtr("DeviceName1"), - MemoryMiB: helper.Uint64ToPtr(1), - PowerW: helper.UintToPtr(1), - BAR1MiB: helper.Uint64ToPtr(256), - }, - PowerUsageW: helper.UintToPtr(1), - GPUUtilization: helper.UintToPtr(1), - MemoryUtilization: helper.UintToPtr(1), - EncoderUtilization: helper.UintToPtr(1), - DecoderUtilization: helper.UintToPtr(1), - TemperatureC: helper.UintToPtr(1), - UsedMemoryMiB: helper.Uint64ToPtr(1), - BAR1UsedMiB: helper.Uint64ToPtr(1), - ECCErrorsL1Cache: helper.Uint64ToPtr(100), - ECCErrorsL2Cache: helper.Uint64ToPtr(100), - ECCErrorsDevice: helper.Uint64ToPtr(100), - }, - ExpectedResult: &device.DeviceStats{ - Summary: &structs.StatValue{ - Unit: MemoryStateUnit, - Desc: MemoryStateDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - IntDenominatorVal: helper.Int64ToPtr(1), - }, - Stats: &structs.StatObject{ - Attributes: map[string]*structs.StatValue{ - PowerUsageAttr: { - Unit: PowerUsageUnit, - Desc: PowerUsageDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - IntDenominatorVal: helper.Int64ToPtr(1), - }, - GPUUtilizationAttr: { - Unit: GPUUtilizationUnit, - Desc: GPUUtilizationDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - }, - MemoryUtilizationAttr: { - Unit: MemoryUtilizationUnit, - Desc: MemoryUtilizationDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - }, - EncoderUtilizationAttr: { - Unit: EncoderUtilizationUnit, - Desc: EncoderUtilizationDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - }, - DecoderUtilizationAttr: { - Unit: DecoderUtilizationUnit, - Desc: DecoderUtilizationDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - }, - TemperatureAttr: { - Unit: TemperatureUnit, - Desc: TemperatureDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - }, - MemoryStateAttr: { - Unit: MemoryStateUnit, - Desc: MemoryStateDesc, 
- IntNumeratorVal: helper.Int64ToPtr(1), - IntDenominatorVal: helper.Int64ToPtr(1), - }, - BAR1StateAttr: { - Unit: BAR1StateUnit, - Desc: BAR1StateDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - IntDenominatorVal: helper.Int64ToPtr(256), - }, - ECCErrorsL1CacheAttr: { - Unit: ECCErrorsL1CacheUnit, - Desc: ECCErrorsL1CacheDesc, - IntNumeratorVal: helper.Int64ToPtr(100), - }, - ECCErrorsL2CacheAttr: { - Unit: ECCErrorsL2CacheUnit, - Desc: ECCErrorsL2CacheDesc, - IntNumeratorVal: helper.Int64ToPtr(100), - }, - ECCErrorsDeviceAttr: { - Unit: ECCErrorsDeviceUnit, - Desc: ECCErrorsDeviceDesc, - IntNumeratorVal: helper.Int64ToPtr(100), - }, - }, - }, - Timestamp: time.Date(1974, time.May, 19, 1, 2, 3, 4, time.UTC), - }, - }, - { - Name: "Power usage is nil", - Timestamp: time.Date(1974, time.May, 19, 1, 2, 3, 4, time.UTC), - ItemStat: &nvml.StatsData{ - DeviceData: &nvml.DeviceData{ - UUID: "UUID1", - DeviceName: helper.StringToPtr("DeviceName1"), - MemoryMiB: helper.Uint64ToPtr(1), - PowerW: helper.UintToPtr(1), - BAR1MiB: helper.Uint64ToPtr(256), - }, - PowerUsageW: nil, - GPUUtilization: helper.UintToPtr(1), - MemoryUtilization: helper.UintToPtr(1), - EncoderUtilization: helper.UintToPtr(1), - DecoderUtilization: helper.UintToPtr(1), - TemperatureC: helper.UintToPtr(1), - UsedMemoryMiB: helper.Uint64ToPtr(1), - BAR1UsedMiB: helper.Uint64ToPtr(1), - ECCErrorsL1Cache: helper.Uint64ToPtr(100), - ECCErrorsL2Cache: helper.Uint64ToPtr(100), - ECCErrorsDevice: helper.Uint64ToPtr(100), - }, - ExpectedResult: &device.DeviceStats{ - Summary: &structs.StatValue{ - Unit: MemoryStateUnit, - Desc: MemoryStateDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - IntDenominatorVal: helper.Int64ToPtr(1), - }, - Stats: &structs.StatObject{ - Attributes: map[string]*structs.StatValue{ - PowerUsageAttr: { - Unit: PowerUsageUnit, - Desc: PowerUsageDesc, - StringVal: helper.StringToPtr(notAvailable), - }, - GPUUtilizationAttr: { - Unit: GPUUtilizationUnit, - Desc: GPUUtilizationDesc, - 
IntNumeratorVal: helper.Int64ToPtr(1), - }, - MemoryUtilizationAttr: { - Unit: MemoryUtilizationUnit, - Desc: MemoryUtilizationDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - }, - EncoderUtilizationAttr: { - Unit: EncoderUtilizationUnit, - Desc: EncoderUtilizationDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - }, - DecoderUtilizationAttr: { - Unit: DecoderUtilizationUnit, - Desc: DecoderUtilizationDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - }, - TemperatureAttr: { - Unit: TemperatureUnit, - Desc: TemperatureDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - }, - MemoryStateAttr: { - Unit: MemoryStateUnit, - Desc: MemoryStateDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - IntDenominatorVal: helper.Int64ToPtr(1), - }, - BAR1StateAttr: { - Unit: BAR1StateUnit, - Desc: BAR1StateDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - IntDenominatorVal: helper.Int64ToPtr(256), - }, - ECCErrorsL1CacheAttr: { - Unit: ECCErrorsL1CacheUnit, - Desc: ECCErrorsL1CacheDesc, - IntNumeratorVal: helper.Int64ToPtr(100), - }, - ECCErrorsL2CacheAttr: { - Unit: ECCErrorsL2CacheUnit, - Desc: ECCErrorsL2CacheDesc, - IntNumeratorVal: helper.Int64ToPtr(100), - }, - ECCErrorsDeviceAttr: { - Unit: ECCErrorsDeviceUnit, - Desc: ECCErrorsDeviceDesc, - IntNumeratorVal: helper.Int64ToPtr(100), - }, - }, - }, - Timestamp: time.Date(1974, time.May, 19, 1, 2, 3, 4, time.UTC), - }, - }, - { - Name: "PowerW is nil", - Timestamp: time.Date(1974, time.May, 19, 1, 2, 3, 4, time.UTC), - ItemStat: &nvml.StatsData{ - DeviceData: &nvml.DeviceData{ - UUID: "UUID1", - DeviceName: helper.StringToPtr("DeviceName1"), - MemoryMiB: helper.Uint64ToPtr(1), - PowerW: nil, - BAR1MiB: helper.Uint64ToPtr(256), - }, - PowerUsageW: helper.UintToPtr(1), - GPUUtilization: helper.UintToPtr(1), - MemoryUtilization: helper.UintToPtr(1), - EncoderUtilization: helper.UintToPtr(1), - DecoderUtilization: helper.UintToPtr(1), - TemperatureC: helper.UintToPtr(1), - UsedMemoryMiB: helper.Uint64ToPtr(1), - BAR1UsedMiB: 
helper.Uint64ToPtr(1), - ECCErrorsL1Cache: helper.Uint64ToPtr(100), - ECCErrorsL2Cache: helper.Uint64ToPtr(100), - ECCErrorsDevice: helper.Uint64ToPtr(100), - }, - ExpectedResult: &device.DeviceStats{ - Summary: &structs.StatValue{ - Unit: MemoryStateUnit, - Desc: MemoryStateDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - IntDenominatorVal: helper.Int64ToPtr(1), - }, - Stats: &structs.StatObject{ - Attributes: map[string]*structs.StatValue{ - PowerUsageAttr: { - Unit: PowerUsageUnit, - Desc: PowerUsageDesc, - StringVal: helper.StringToPtr(notAvailable), - }, - GPUUtilizationAttr: { - Unit: GPUUtilizationUnit, - Desc: GPUUtilizationDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - }, - MemoryUtilizationAttr: { - Unit: MemoryUtilizationUnit, - Desc: MemoryUtilizationDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - }, - EncoderUtilizationAttr: { - Unit: EncoderUtilizationUnit, - Desc: EncoderUtilizationDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - }, - DecoderUtilizationAttr: { - Unit: DecoderUtilizationUnit, - Desc: DecoderUtilizationDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - }, - TemperatureAttr: { - Unit: TemperatureUnit, - Desc: TemperatureDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - }, - MemoryStateAttr: { - Unit: MemoryStateUnit, - Desc: MemoryStateDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - IntDenominatorVal: helper.Int64ToPtr(1), - }, - BAR1StateAttr: { - Unit: BAR1StateUnit, - Desc: BAR1StateDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - IntDenominatorVal: helper.Int64ToPtr(256), - }, - ECCErrorsL1CacheAttr: { - Unit: ECCErrorsL1CacheUnit, - Desc: ECCErrorsL1CacheDesc, - IntNumeratorVal: helper.Int64ToPtr(100), - }, - ECCErrorsL2CacheAttr: { - Unit: ECCErrorsL2CacheUnit, - Desc: ECCErrorsL2CacheDesc, - IntNumeratorVal: helper.Int64ToPtr(100), - }, - ECCErrorsDeviceAttr: { - Unit: ECCErrorsDeviceUnit, - Desc: ECCErrorsDeviceDesc, - IntNumeratorVal: helper.Int64ToPtr(100), - }, - }, - }, - Timestamp: time.Date(1974, time.May, 19, 1, 2, 3, 4, 
time.UTC), - }, - }, - { - Name: "GPUUtilization is nil", - Timestamp: time.Date(1974, time.May, 19, 1, 2, 3, 4, time.UTC), - ItemStat: &nvml.StatsData{ - DeviceData: &nvml.DeviceData{ - UUID: "UUID1", - DeviceName: helper.StringToPtr("DeviceName1"), - MemoryMiB: helper.Uint64ToPtr(1), - PowerW: helper.UintToPtr(1), - BAR1MiB: helper.Uint64ToPtr(256), - }, - PowerUsageW: helper.UintToPtr(1), - GPUUtilization: nil, - MemoryUtilization: helper.UintToPtr(1), - EncoderUtilization: helper.UintToPtr(1), - DecoderUtilization: helper.UintToPtr(1), - TemperatureC: helper.UintToPtr(1), - UsedMemoryMiB: helper.Uint64ToPtr(1), - BAR1UsedMiB: helper.Uint64ToPtr(1), - ECCErrorsL1Cache: helper.Uint64ToPtr(100), - ECCErrorsL2Cache: helper.Uint64ToPtr(100), - ECCErrorsDevice: helper.Uint64ToPtr(100), - }, - ExpectedResult: &device.DeviceStats{ - Summary: &structs.StatValue{ - Unit: MemoryStateUnit, - Desc: MemoryStateDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - IntDenominatorVal: helper.Int64ToPtr(1), - }, - Stats: &structs.StatObject{ - Attributes: map[string]*structs.StatValue{ - PowerUsageAttr: { - Unit: PowerUsageUnit, - Desc: PowerUsageDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - IntDenominatorVal: helper.Int64ToPtr(1), - }, - GPUUtilizationAttr: { - Unit: GPUUtilizationUnit, - Desc: GPUUtilizationDesc, - StringVal: helper.StringToPtr(notAvailable), - }, - MemoryUtilizationAttr: { - Unit: MemoryUtilizationUnit, - Desc: MemoryUtilizationDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - }, - EncoderUtilizationAttr: { - Unit: EncoderUtilizationUnit, - Desc: EncoderUtilizationDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - }, - DecoderUtilizationAttr: { - Unit: DecoderUtilizationUnit, - Desc: DecoderUtilizationDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - }, - TemperatureAttr: { - Unit: TemperatureUnit, - Desc: TemperatureDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - }, - MemoryStateAttr: { - Unit: MemoryStateUnit, - Desc: MemoryStateDesc, - IntNumeratorVal: 
helper.Int64ToPtr(1), - IntDenominatorVal: helper.Int64ToPtr(1), - }, - BAR1StateAttr: { - Unit: BAR1StateUnit, - Desc: BAR1StateDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - IntDenominatorVal: helper.Int64ToPtr(256), - }, - ECCErrorsL1CacheAttr: { - Unit: ECCErrorsL1CacheUnit, - Desc: ECCErrorsL1CacheDesc, - IntNumeratorVal: helper.Int64ToPtr(100), - }, - ECCErrorsL2CacheAttr: { - Unit: ECCErrorsL2CacheUnit, - Desc: ECCErrorsL2CacheDesc, - IntNumeratorVal: helper.Int64ToPtr(100), - }, - ECCErrorsDeviceAttr: { - Unit: ECCErrorsDeviceUnit, - Desc: ECCErrorsDeviceDesc, - IntNumeratorVal: helper.Int64ToPtr(100), - }, - }, - }, - Timestamp: time.Date(1974, time.May, 19, 1, 2, 3, 4, time.UTC), - }, - }, - { - Name: "MemoryUtilization is nil", - Timestamp: time.Date(1974, time.May, 19, 1, 2, 3, 4, time.UTC), - ItemStat: &nvml.StatsData{ - DeviceData: &nvml.DeviceData{ - UUID: "UUID1", - DeviceName: helper.StringToPtr("DeviceName1"), - MemoryMiB: helper.Uint64ToPtr(1), - PowerW: helper.UintToPtr(1), - BAR1MiB: helper.Uint64ToPtr(256), - }, - PowerUsageW: helper.UintToPtr(1), - GPUUtilization: helper.UintToPtr(1), - MemoryUtilization: nil, - EncoderUtilization: helper.UintToPtr(1), - DecoderUtilization: helper.UintToPtr(1), - TemperatureC: helper.UintToPtr(1), - UsedMemoryMiB: helper.Uint64ToPtr(1), - BAR1UsedMiB: helper.Uint64ToPtr(1), - ECCErrorsL1Cache: helper.Uint64ToPtr(100), - ECCErrorsL2Cache: helper.Uint64ToPtr(100), - ECCErrorsDevice: helper.Uint64ToPtr(100), - }, - ExpectedResult: &device.DeviceStats{ - Summary: &structs.StatValue{ - Unit: MemoryStateUnit, - Desc: MemoryStateDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - IntDenominatorVal: helper.Int64ToPtr(1), - }, - Stats: &structs.StatObject{ - Attributes: map[string]*structs.StatValue{ - PowerUsageAttr: { - Unit: PowerUsageUnit, - Desc: PowerUsageDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - IntDenominatorVal: helper.Int64ToPtr(1), - }, - GPUUtilizationAttr: { - Unit: GPUUtilizationUnit, - Desc: 
GPUUtilizationDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - }, - MemoryUtilizationAttr: { - Unit: MemoryUtilizationUnit, - Desc: MemoryUtilizationDesc, - StringVal: helper.StringToPtr(notAvailable), - }, - EncoderUtilizationAttr: { - Unit: EncoderUtilizationUnit, - Desc: EncoderUtilizationDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - }, - DecoderUtilizationAttr: { - Unit: DecoderUtilizationUnit, - Desc: DecoderUtilizationDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - }, - TemperatureAttr: { - Unit: TemperatureUnit, - Desc: TemperatureDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - }, - MemoryStateAttr: { - Unit: MemoryStateUnit, - Desc: MemoryStateDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - IntDenominatorVal: helper.Int64ToPtr(1), - }, - BAR1StateAttr: { - Unit: BAR1StateUnit, - Desc: BAR1StateDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - IntDenominatorVal: helper.Int64ToPtr(256), - }, - ECCErrorsL1CacheAttr: { - Unit: ECCErrorsL1CacheUnit, - Desc: ECCErrorsL1CacheDesc, - IntNumeratorVal: helper.Int64ToPtr(100), - }, - ECCErrorsL2CacheAttr: { - Unit: ECCErrorsL2CacheUnit, - Desc: ECCErrorsL2CacheDesc, - IntNumeratorVal: helper.Int64ToPtr(100), - }, - ECCErrorsDeviceAttr: { - Unit: ECCErrorsDeviceUnit, - Desc: ECCErrorsDeviceDesc, - IntNumeratorVal: helper.Int64ToPtr(100), - }, - }, - }, - Timestamp: time.Date(1974, time.May, 19, 1, 2, 3, 4, time.UTC), - }, - }, - { - Name: "EncoderUtilization is nil", - Timestamp: time.Date(1974, time.May, 19, 1, 2, 3, 4, time.UTC), - ItemStat: &nvml.StatsData{ - DeviceData: &nvml.DeviceData{ - UUID: "UUID1", - DeviceName: helper.StringToPtr("DeviceName1"), - MemoryMiB: helper.Uint64ToPtr(1), - PowerW: helper.UintToPtr(1), - BAR1MiB: helper.Uint64ToPtr(256), - }, - PowerUsageW: helper.UintToPtr(1), - GPUUtilization: helper.UintToPtr(1), - MemoryUtilization: helper.UintToPtr(1), - EncoderUtilization: nil, - DecoderUtilization: helper.UintToPtr(1), - TemperatureC: helper.UintToPtr(1), - UsedMemoryMiB: 
helper.Uint64ToPtr(1), - BAR1UsedMiB: helper.Uint64ToPtr(1), - ECCErrorsL1Cache: helper.Uint64ToPtr(100), - ECCErrorsL2Cache: helper.Uint64ToPtr(100), - ECCErrorsDevice: helper.Uint64ToPtr(100), - }, - ExpectedResult: &device.DeviceStats{ - Summary: &structs.StatValue{ - Unit: MemoryStateUnit, - Desc: MemoryStateDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - IntDenominatorVal: helper.Int64ToPtr(1), - }, - Stats: &structs.StatObject{ - Attributes: map[string]*structs.StatValue{ - PowerUsageAttr: { - Unit: PowerUsageUnit, - Desc: PowerUsageDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - IntDenominatorVal: helper.Int64ToPtr(1), - }, - GPUUtilizationAttr: { - Unit: GPUUtilizationUnit, - Desc: GPUUtilizationDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - }, - MemoryUtilizationAttr: { - Unit: MemoryUtilizationUnit, - Desc: MemoryUtilizationDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - }, - EncoderUtilizationAttr: { - Unit: EncoderUtilizationUnit, - Desc: EncoderUtilizationDesc, - StringVal: helper.StringToPtr(notAvailable), - }, - DecoderUtilizationAttr: { - Unit: DecoderUtilizationUnit, - Desc: DecoderUtilizationDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - }, - TemperatureAttr: { - Unit: TemperatureUnit, - Desc: TemperatureDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - }, - MemoryStateAttr: { - Unit: MemoryStateUnit, - Desc: MemoryStateDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - IntDenominatorVal: helper.Int64ToPtr(1), - }, - BAR1StateAttr: { - Unit: BAR1StateUnit, - Desc: BAR1StateDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - IntDenominatorVal: helper.Int64ToPtr(256), - }, - ECCErrorsL1CacheAttr: { - Unit: ECCErrorsL1CacheUnit, - Desc: ECCErrorsL1CacheDesc, - IntNumeratorVal: helper.Int64ToPtr(100), - }, - ECCErrorsL2CacheAttr: { - Unit: ECCErrorsL2CacheUnit, - Desc: ECCErrorsL2CacheDesc, - IntNumeratorVal: helper.Int64ToPtr(100), - }, - ECCErrorsDeviceAttr: { - Unit: ECCErrorsDeviceUnit, - Desc: ECCErrorsDeviceDesc, - IntNumeratorVal: 
helper.Int64ToPtr(100), - }, - }, - }, - Timestamp: time.Date(1974, time.May, 19, 1, 2, 3, 4, time.UTC), - }, - }, - { - Name: "DecoderUtilization is nil", - Timestamp: time.Date(1974, time.May, 19, 1, 2, 3, 4, time.UTC), - ItemStat: &nvml.StatsData{ - DeviceData: &nvml.DeviceData{ - UUID: "UUID1", - DeviceName: helper.StringToPtr("DeviceName1"), - MemoryMiB: helper.Uint64ToPtr(1), - PowerW: helper.UintToPtr(1), - BAR1MiB: helper.Uint64ToPtr(256), - }, - PowerUsageW: helper.UintToPtr(1), - GPUUtilization: helper.UintToPtr(1), - MemoryUtilization: helper.UintToPtr(1), - EncoderUtilization: helper.UintToPtr(1), - DecoderUtilization: nil, - TemperatureC: helper.UintToPtr(1), - UsedMemoryMiB: helper.Uint64ToPtr(1), - BAR1UsedMiB: helper.Uint64ToPtr(1), - ECCErrorsL1Cache: helper.Uint64ToPtr(100), - ECCErrorsL2Cache: helper.Uint64ToPtr(100), - ECCErrorsDevice: helper.Uint64ToPtr(100), - }, - ExpectedResult: &device.DeviceStats{ - Summary: &structs.StatValue{ - Unit: MemoryStateUnit, - Desc: MemoryStateDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - IntDenominatorVal: helper.Int64ToPtr(1), - }, - Stats: &structs.StatObject{ - Attributes: map[string]*structs.StatValue{ - PowerUsageAttr: { - Unit: PowerUsageUnit, - Desc: PowerUsageDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - IntDenominatorVal: helper.Int64ToPtr(1), - }, - GPUUtilizationAttr: { - Unit: GPUUtilizationUnit, - Desc: GPUUtilizationDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - }, - MemoryUtilizationAttr: { - Unit: MemoryUtilizationUnit, - Desc: MemoryUtilizationDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - }, - EncoderUtilizationAttr: { - Unit: EncoderUtilizationUnit, - Desc: EncoderUtilizationDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - }, - DecoderUtilizationAttr: { - Unit: DecoderUtilizationUnit, - Desc: DecoderUtilizationDesc, - StringVal: helper.StringToPtr(notAvailable), - }, - TemperatureAttr: { - Unit: TemperatureUnit, - Desc: TemperatureDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - }, 
- MemoryStateAttr: { - Unit: MemoryStateUnit, - Desc: MemoryStateDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - IntDenominatorVal: helper.Int64ToPtr(1), - }, - BAR1StateAttr: { - Unit: BAR1StateUnit, - Desc: BAR1StateDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - IntDenominatorVal: helper.Int64ToPtr(256), - }, - ECCErrorsL1CacheAttr: { - Unit: ECCErrorsL1CacheUnit, - Desc: ECCErrorsL1CacheDesc, - IntNumeratorVal: helper.Int64ToPtr(100), - }, - ECCErrorsL2CacheAttr: { - Unit: ECCErrorsL2CacheUnit, - Desc: ECCErrorsL2CacheDesc, - IntNumeratorVal: helper.Int64ToPtr(100), - }, - ECCErrorsDeviceAttr: { - Unit: ECCErrorsDeviceUnit, - Desc: ECCErrorsDeviceDesc, - IntNumeratorVal: helper.Int64ToPtr(100), - }, - }, - }, - Timestamp: time.Date(1974, time.May, 19, 1, 2, 3, 4, time.UTC), - }, - }, - { - Name: "Temperature is nil", - Timestamp: time.Date(1974, time.May, 19, 1, 2, 3, 4, time.UTC), - ItemStat: &nvml.StatsData{ - DeviceData: &nvml.DeviceData{ - UUID: "UUID1", - DeviceName: helper.StringToPtr("DeviceName1"), - MemoryMiB: helper.Uint64ToPtr(1), - PowerW: helper.UintToPtr(1), - BAR1MiB: helper.Uint64ToPtr(256), - }, - PowerUsageW: helper.UintToPtr(1), - GPUUtilization: helper.UintToPtr(1), - MemoryUtilization: helper.UintToPtr(1), - EncoderUtilization: helper.UintToPtr(1), - DecoderUtilization: helper.UintToPtr(1), - TemperatureC: nil, - UsedMemoryMiB: helper.Uint64ToPtr(1), - BAR1UsedMiB: helper.Uint64ToPtr(1), - ECCErrorsL1Cache: helper.Uint64ToPtr(100), - ECCErrorsL2Cache: helper.Uint64ToPtr(100), - ECCErrorsDevice: helper.Uint64ToPtr(100), - }, - ExpectedResult: &device.DeviceStats{ - Summary: &structs.StatValue{ - Unit: MemoryStateUnit, - Desc: MemoryStateDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - IntDenominatorVal: helper.Int64ToPtr(1), - }, - Stats: &structs.StatObject{ - Attributes: map[string]*structs.StatValue{ - PowerUsageAttr: { - Unit: PowerUsageUnit, - Desc: PowerUsageDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - IntDenominatorVal: 
helper.Int64ToPtr(1), - }, - GPUUtilizationAttr: { - Unit: GPUUtilizationUnit, - Desc: GPUUtilizationDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - }, - MemoryUtilizationAttr: { - Unit: MemoryUtilizationUnit, - Desc: MemoryUtilizationDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - }, - EncoderUtilizationAttr: { - Unit: EncoderUtilizationUnit, - Desc: EncoderUtilizationDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - }, - DecoderUtilizationAttr: { - Unit: DecoderUtilizationUnit, - Desc: DecoderUtilizationDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - }, - TemperatureAttr: { - Unit: TemperatureUnit, - Desc: TemperatureDesc, - StringVal: helper.StringToPtr(notAvailable), - }, - MemoryStateAttr: { - Unit: MemoryStateUnit, - Desc: MemoryStateDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - IntDenominatorVal: helper.Int64ToPtr(1), - }, - BAR1StateAttr: { - Unit: BAR1StateUnit, - Desc: BAR1StateDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - IntDenominatorVal: helper.Int64ToPtr(256), - }, - ECCErrorsL1CacheAttr: { - Unit: ECCErrorsL1CacheUnit, - Desc: ECCErrorsL1CacheDesc, - IntNumeratorVal: helper.Int64ToPtr(100), - }, - ECCErrorsL2CacheAttr: { - Unit: ECCErrorsL2CacheUnit, - Desc: ECCErrorsL2CacheDesc, - IntNumeratorVal: helper.Int64ToPtr(100), - }, - ECCErrorsDeviceAttr: { - Unit: ECCErrorsDeviceUnit, - Desc: ECCErrorsDeviceDesc, - IntNumeratorVal: helper.Int64ToPtr(100), - }, - }, - }, - Timestamp: time.Date(1974, time.May, 19, 1, 2, 3, 4, time.UTC), - }, - }, - { - Name: "UsedMemoryMiB is nil", - Timestamp: time.Date(1974, time.May, 19, 1, 2, 3, 4, time.UTC), - ItemStat: &nvml.StatsData{ - DeviceData: &nvml.DeviceData{ - UUID: "UUID1", - DeviceName: helper.StringToPtr("DeviceName1"), - MemoryMiB: helper.Uint64ToPtr(1), - PowerW: helper.UintToPtr(1), - BAR1MiB: helper.Uint64ToPtr(256), - }, - PowerUsageW: helper.UintToPtr(1), - GPUUtilization: helper.UintToPtr(1), - MemoryUtilization: helper.UintToPtr(1), - EncoderUtilization: helper.UintToPtr(1), - 
DecoderUtilization: helper.UintToPtr(1), - TemperatureC: helper.UintToPtr(1), - UsedMemoryMiB: nil, - BAR1UsedMiB: helper.Uint64ToPtr(1), - ECCErrorsL1Cache: helper.Uint64ToPtr(100), - ECCErrorsL2Cache: helper.Uint64ToPtr(100), - ECCErrorsDevice: helper.Uint64ToPtr(100), - }, - ExpectedResult: &device.DeviceStats{ - Summary: &structs.StatValue{ - Unit: MemoryStateUnit, - Desc: MemoryStateDesc, - StringVal: helper.StringToPtr(notAvailable), - }, - Stats: &structs.StatObject{ - Attributes: map[string]*structs.StatValue{ - PowerUsageAttr: { - Unit: PowerUsageUnit, - Desc: PowerUsageDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - IntDenominatorVal: helper.Int64ToPtr(1), - }, - GPUUtilizationAttr: { - Unit: GPUUtilizationUnit, - Desc: GPUUtilizationDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - }, - MemoryUtilizationAttr: { - Unit: MemoryUtilizationUnit, - Desc: MemoryUtilizationDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - }, - EncoderUtilizationAttr: { - Unit: EncoderUtilizationUnit, - Desc: EncoderUtilizationDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - }, - DecoderUtilizationAttr: { - Unit: DecoderUtilizationUnit, - Desc: DecoderUtilizationDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - }, - TemperatureAttr: { - Unit: TemperatureUnit, - Desc: TemperatureDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - }, - MemoryStateAttr: { - Unit: MemoryStateUnit, - Desc: MemoryStateDesc, - StringVal: helper.StringToPtr(notAvailable), - }, - BAR1StateAttr: { - Unit: BAR1StateUnit, - Desc: BAR1StateDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - IntDenominatorVal: helper.Int64ToPtr(256), - }, - ECCErrorsL1CacheAttr: { - Unit: ECCErrorsL1CacheUnit, - Desc: ECCErrorsL1CacheDesc, - IntNumeratorVal: helper.Int64ToPtr(100), - }, - ECCErrorsL2CacheAttr: { - Unit: ECCErrorsL2CacheUnit, - Desc: ECCErrorsL2CacheDesc, - IntNumeratorVal: helper.Int64ToPtr(100), - }, - ECCErrorsDeviceAttr: { - Unit: ECCErrorsDeviceUnit, - Desc: ECCErrorsDeviceDesc, - IntNumeratorVal: 
helper.Int64ToPtr(100), - }, - }, - }, - Timestamp: time.Date(1974, time.May, 19, 1, 2, 3, 4, time.UTC), - }, - }, - { - Name: "MemoryMiB is nil", - Timestamp: time.Date(1974, time.May, 19, 1, 2, 3, 4, time.UTC), - ItemStat: &nvml.StatsData{ - DeviceData: &nvml.DeviceData{ - UUID: "UUID1", - DeviceName: helper.StringToPtr("DeviceName1"), - MemoryMiB: nil, - PowerW: helper.UintToPtr(1), - BAR1MiB: helper.Uint64ToPtr(256), - }, - PowerUsageW: helper.UintToPtr(1), - GPUUtilization: helper.UintToPtr(1), - MemoryUtilization: helper.UintToPtr(1), - EncoderUtilization: helper.UintToPtr(1), - DecoderUtilization: helper.UintToPtr(1), - TemperatureC: helper.UintToPtr(1), - UsedMemoryMiB: helper.Uint64ToPtr(1), - BAR1UsedMiB: helper.Uint64ToPtr(1), - ECCErrorsL1Cache: helper.Uint64ToPtr(100), - ECCErrorsL2Cache: helper.Uint64ToPtr(100), - ECCErrorsDevice: helper.Uint64ToPtr(100), - }, - ExpectedResult: &device.DeviceStats{ - Summary: &structs.StatValue{ - Unit: MemoryStateUnit, - Desc: MemoryStateDesc, - StringVal: helper.StringToPtr(notAvailable), - }, - Stats: &structs.StatObject{ - Attributes: map[string]*structs.StatValue{ - PowerUsageAttr: { - Unit: PowerUsageUnit, - Desc: PowerUsageDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - IntDenominatorVal: helper.Int64ToPtr(1), - }, - GPUUtilizationAttr: { - Unit: GPUUtilizationUnit, - Desc: GPUUtilizationDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - }, - MemoryUtilizationAttr: { - Unit: MemoryUtilizationUnit, - Desc: MemoryUtilizationDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - }, - EncoderUtilizationAttr: { - Unit: EncoderUtilizationUnit, - Desc: EncoderUtilizationDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - }, - DecoderUtilizationAttr: { - Unit: DecoderUtilizationUnit, - Desc: DecoderUtilizationDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - }, - TemperatureAttr: { - Unit: TemperatureUnit, - Desc: TemperatureDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - }, - MemoryStateAttr: { - Unit: MemoryStateUnit, - Desc: 
MemoryStateDesc, - StringVal: helper.StringToPtr(notAvailable), - }, - BAR1StateAttr: { - Unit: BAR1StateUnit, - Desc: BAR1StateDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - IntDenominatorVal: helper.Int64ToPtr(256), - }, - ECCErrorsL1CacheAttr: { - Unit: ECCErrorsL1CacheUnit, - Desc: ECCErrorsL1CacheDesc, - IntNumeratorVal: helper.Int64ToPtr(100), - }, - ECCErrorsL2CacheAttr: { - Unit: ECCErrorsL2CacheUnit, - Desc: ECCErrorsL2CacheDesc, - IntNumeratorVal: helper.Int64ToPtr(100), - }, - ECCErrorsDeviceAttr: { - Unit: ECCErrorsDeviceUnit, - Desc: ECCErrorsDeviceDesc, - IntNumeratorVal: helper.Int64ToPtr(100), - }, - }, - }, - Timestamp: time.Date(1974, time.May, 19, 1, 2, 3, 4, time.UTC), - }, - }, - { - Name: "BAR1UsedMiB is nil", - Timestamp: time.Date(1974, time.May, 19, 1, 2, 3, 4, time.UTC), - ItemStat: &nvml.StatsData{ - DeviceData: &nvml.DeviceData{ - UUID: "UUID1", - DeviceName: helper.StringToPtr("DeviceName1"), - MemoryMiB: helper.Uint64ToPtr(1), - PowerW: helper.UintToPtr(1), - BAR1MiB: helper.Uint64ToPtr(256), - }, - PowerUsageW: helper.UintToPtr(1), - GPUUtilization: helper.UintToPtr(1), - MemoryUtilization: helper.UintToPtr(1), - EncoderUtilization: helper.UintToPtr(1), - DecoderUtilization: helper.UintToPtr(1), - TemperatureC: helper.UintToPtr(1), - UsedMemoryMiB: helper.Uint64ToPtr(1), - BAR1UsedMiB: nil, - ECCErrorsL1Cache: helper.Uint64ToPtr(100), - ECCErrorsL2Cache: helper.Uint64ToPtr(100), - ECCErrorsDevice: helper.Uint64ToPtr(100), - }, - ExpectedResult: &device.DeviceStats{ - Summary: &structs.StatValue{ - Unit: MemoryStateUnit, - Desc: MemoryStateDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - IntDenominatorVal: helper.Int64ToPtr(1), - }, - Stats: &structs.StatObject{ - Attributes: map[string]*structs.StatValue{ - PowerUsageAttr: { - Unit: PowerUsageUnit, - Desc: PowerUsageDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - IntDenominatorVal: helper.Int64ToPtr(1), - }, - GPUUtilizationAttr: { - Unit: GPUUtilizationUnit, - Desc: 
GPUUtilizationDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - }, - MemoryUtilizationAttr: { - Unit: MemoryUtilizationUnit, - Desc: MemoryUtilizationDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - }, - EncoderUtilizationAttr: { - Unit: EncoderUtilizationUnit, - Desc: EncoderUtilizationDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - }, - DecoderUtilizationAttr: { - Unit: DecoderUtilizationUnit, - Desc: DecoderUtilizationDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - }, - TemperatureAttr: { - Unit: TemperatureUnit, - Desc: TemperatureDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - }, - MemoryStateAttr: { - Unit: MemoryStateUnit, - Desc: MemoryStateDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - IntDenominatorVal: helper.Int64ToPtr(1), - }, - BAR1StateAttr: { - Unit: BAR1StateUnit, - Desc: BAR1StateDesc, - StringVal: helper.StringToPtr(notAvailable), - }, - ECCErrorsL1CacheAttr: { - Unit: ECCErrorsL1CacheUnit, - Desc: ECCErrorsL1CacheDesc, - IntNumeratorVal: helper.Int64ToPtr(100), - }, - ECCErrorsL2CacheAttr: { - Unit: ECCErrorsL2CacheUnit, - Desc: ECCErrorsL2CacheDesc, - IntNumeratorVal: helper.Int64ToPtr(100), - }, - ECCErrorsDeviceAttr: { - Unit: ECCErrorsDeviceUnit, - Desc: ECCErrorsDeviceDesc, - IntNumeratorVal: helper.Int64ToPtr(100), - }, - }, - }, - Timestamp: time.Date(1974, time.May, 19, 1, 2, 3, 4, time.UTC), - }, - }, - { - Name: "BAR1MiB is nil", - Timestamp: time.Date(1974, time.May, 19, 1, 2, 3, 4, time.UTC), - ItemStat: &nvml.StatsData{ - DeviceData: &nvml.DeviceData{ - UUID: "UUID1", - DeviceName: helper.StringToPtr("DeviceName1"), - MemoryMiB: helper.Uint64ToPtr(1), - PowerW: helper.UintToPtr(1), - BAR1MiB: nil, - }, - PowerUsageW: helper.UintToPtr(1), - GPUUtilization: helper.UintToPtr(1), - MemoryUtilization: helper.UintToPtr(1), - EncoderUtilization: helper.UintToPtr(1), - DecoderUtilization: helper.UintToPtr(1), - TemperatureC: helper.UintToPtr(1), - UsedMemoryMiB: helper.Uint64ToPtr(1), - BAR1UsedMiB: helper.Uint64ToPtr(1), - 
ECCErrorsL1Cache: helper.Uint64ToPtr(100), - ECCErrorsL2Cache: helper.Uint64ToPtr(100), - ECCErrorsDevice: helper.Uint64ToPtr(100), - }, - ExpectedResult: &device.DeviceStats{ - Summary: &structs.StatValue{ - Unit: MemoryStateUnit, - Desc: MemoryStateDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - IntDenominatorVal: helper.Int64ToPtr(1), - }, - Stats: &structs.StatObject{ - Attributes: map[string]*structs.StatValue{ - PowerUsageAttr: { - Unit: PowerUsageUnit, - Desc: PowerUsageDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - IntDenominatorVal: helper.Int64ToPtr(1), - }, - GPUUtilizationAttr: { - Unit: GPUUtilizationUnit, - Desc: GPUUtilizationDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - }, - MemoryUtilizationAttr: { - Unit: MemoryUtilizationUnit, - Desc: MemoryUtilizationDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - }, - EncoderUtilizationAttr: { - Unit: EncoderUtilizationUnit, - Desc: EncoderUtilizationDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - }, - DecoderUtilizationAttr: { - Unit: DecoderUtilizationUnit, - Desc: DecoderUtilizationDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - }, - TemperatureAttr: { - Unit: TemperatureUnit, - Desc: TemperatureDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - }, - MemoryStateAttr: { - Unit: MemoryStateUnit, - Desc: MemoryStateDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - IntDenominatorVal: helper.Int64ToPtr(1), - }, - BAR1StateAttr: { - Unit: BAR1StateUnit, - Desc: BAR1StateDesc, - StringVal: helper.StringToPtr(notAvailable), - }, - ECCErrorsL1CacheAttr: { - Unit: ECCErrorsL1CacheUnit, - Desc: ECCErrorsL1CacheDesc, - IntNumeratorVal: helper.Int64ToPtr(100), - }, - ECCErrorsL2CacheAttr: { - Unit: ECCErrorsL2CacheUnit, - Desc: ECCErrorsL2CacheDesc, - IntNumeratorVal: helper.Int64ToPtr(100), - }, - ECCErrorsDeviceAttr: { - Unit: ECCErrorsDeviceUnit, - Desc: ECCErrorsDeviceDesc, - IntNumeratorVal: helper.Int64ToPtr(100), - }, - }, - }, - Timestamp: time.Date(1974, time.May, 19, 1, 2, 3, 4, time.UTC), - }, - }, - { - 
Name: "ECCErrorsL1Cache is nil", - Timestamp: time.Date(1974, time.May, 19, 1, 2, 3, 4, time.UTC), - ItemStat: &nvml.StatsData{ - DeviceData: &nvml.DeviceData{ - UUID: "UUID1", - DeviceName: helper.StringToPtr("DeviceName1"), - MemoryMiB: helper.Uint64ToPtr(1), - PowerW: helper.UintToPtr(1), - BAR1MiB: helper.Uint64ToPtr(256), - }, - PowerUsageW: helper.UintToPtr(1), - GPUUtilization: helper.UintToPtr(1), - MemoryUtilization: helper.UintToPtr(1), - EncoderUtilization: helper.UintToPtr(1), - DecoderUtilization: helper.UintToPtr(1), - TemperatureC: helper.UintToPtr(1), - UsedMemoryMiB: helper.Uint64ToPtr(1), - BAR1UsedMiB: helper.Uint64ToPtr(1), - ECCErrorsL1Cache: nil, - ECCErrorsL2Cache: helper.Uint64ToPtr(100), - ECCErrorsDevice: helper.Uint64ToPtr(100), - }, - ExpectedResult: &device.DeviceStats{ - Summary: &structs.StatValue{ - Unit: MemoryStateUnit, - Desc: MemoryStateDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - IntDenominatorVal: helper.Int64ToPtr(1), - }, - Stats: &structs.StatObject{ - Attributes: map[string]*structs.StatValue{ - PowerUsageAttr: { - Unit: PowerUsageUnit, - Desc: PowerUsageDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - IntDenominatorVal: helper.Int64ToPtr(1), - }, - GPUUtilizationAttr: { - Unit: GPUUtilizationUnit, - Desc: GPUUtilizationDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - }, - MemoryUtilizationAttr: { - Unit: MemoryUtilizationUnit, - Desc: MemoryUtilizationDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - }, - EncoderUtilizationAttr: { - Unit: EncoderUtilizationUnit, - Desc: EncoderUtilizationDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - }, - DecoderUtilizationAttr: { - Unit: DecoderUtilizationUnit, - Desc: DecoderUtilizationDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - }, - TemperatureAttr: { - Unit: TemperatureUnit, - Desc: TemperatureDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - }, - MemoryStateAttr: { - Unit: MemoryStateUnit, - Desc: MemoryStateDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - IntDenominatorVal: 
helper.Int64ToPtr(1), - }, - BAR1StateAttr: { - Unit: BAR1StateUnit, - Desc: BAR1StateDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - IntDenominatorVal: helper.Int64ToPtr(256), - }, - ECCErrorsL1CacheAttr: { - Unit: ECCErrorsL1CacheUnit, - Desc: ECCErrorsL1CacheDesc, - StringVal: helper.StringToPtr(notAvailable), - }, - ECCErrorsL2CacheAttr: { - Unit: ECCErrorsL2CacheUnit, - Desc: ECCErrorsL2CacheDesc, - IntNumeratorVal: helper.Int64ToPtr(100), - }, - ECCErrorsDeviceAttr: { - Unit: ECCErrorsDeviceUnit, - Desc: ECCErrorsDeviceDesc, - IntNumeratorVal: helper.Int64ToPtr(100), - }, - }, - }, - Timestamp: time.Date(1974, time.May, 19, 1, 2, 3, 4, time.UTC), - }, - }, - { - Name: "ECCErrorsL2Cache is nil", - Timestamp: time.Date(1974, time.May, 19, 1, 2, 3, 4, time.UTC), - ItemStat: &nvml.StatsData{ - DeviceData: &nvml.DeviceData{ - UUID: "UUID1", - DeviceName: helper.StringToPtr("DeviceName1"), - MemoryMiB: helper.Uint64ToPtr(1), - PowerW: helper.UintToPtr(1), - BAR1MiB: helper.Uint64ToPtr(256), - }, - PowerUsageW: helper.UintToPtr(1), - GPUUtilization: helper.UintToPtr(1), - MemoryUtilization: helper.UintToPtr(1), - EncoderUtilization: helper.UintToPtr(1), - DecoderUtilization: helper.UintToPtr(1), - TemperatureC: helper.UintToPtr(1), - UsedMemoryMiB: helper.Uint64ToPtr(1), - BAR1UsedMiB: helper.Uint64ToPtr(1), - ECCErrorsL1Cache: helper.Uint64ToPtr(100), - ECCErrorsL2Cache: nil, - ECCErrorsDevice: helper.Uint64ToPtr(100), - }, - ExpectedResult: &device.DeviceStats{ - Summary: &structs.StatValue{ - Unit: MemoryStateUnit, - Desc: MemoryStateDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - IntDenominatorVal: helper.Int64ToPtr(1), - }, - Stats: &structs.StatObject{ - Attributes: map[string]*structs.StatValue{ - PowerUsageAttr: { - Unit: PowerUsageUnit, - Desc: PowerUsageDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - IntDenominatorVal: helper.Int64ToPtr(1), - }, - GPUUtilizationAttr: { - Unit: GPUUtilizationUnit, - Desc: GPUUtilizationDesc, - IntNumeratorVal: 
helper.Int64ToPtr(1), - }, - MemoryUtilizationAttr: { - Unit: MemoryUtilizationUnit, - Desc: MemoryUtilizationDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - }, - EncoderUtilizationAttr: { - Unit: EncoderUtilizationUnit, - Desc: EncoderUtilizationDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - }, - DecoderUtilizationAttr: { - Unit: DecoderUtilizationUnit, - Desc: DecoderUtilizationDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - }, - TemperatureAttr: { - Unit: TemperatureUnit, - Desc: TemperatureDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - }, - MemoryStateAttr: { - Unit: MemoryStateUnit, - Desc: MemoryStateDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - IntDenominatorVal: helper.Int64ToPtr(1), - }, - BAR1StateAttr: { - Unit: BAR1StateUnit, - Desc: BAR1StateDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - IntDenominatorVal: helper.Int64ToPtr(256), - }, - ECCErrorsL1CacheAttr: { - Unit: ECCErrorsL1CacheUnit, - Desc: ECCErrorsL1CacheDesc, - IntNumeratorVal: helper.Int64ToPtr(100), - }, - ECCErrorsL2CacheAttr: { - Unit: ECCErrorsL2CacheUnit, - Desc: ECCErrorsL2CacheDesc, - StringVal: helper.StringToPtr(notAvailable), - }, - ECCErrorsDeviceAttr: { - Unit: ECCErrorsDeviceUnit, - Desc: ECCErrorsDeviceDesc, - IntNumeratorVal: helper.Int64ToPtr(100), - }, - }, - }, - Timestamp: time.Date(1974, time.May, 19, 1, 2, 3, 4, time.UTC), - }, - }, - { - Name: "ECCErrorsDevice is nil", - Timestamp: time.Date(1974, time.May, 19, 1, 2, 3, 4, time.UTC), - ItemStat: &nvml.StatsData{ - DeviceData: &nvml.DeviceData{ - UUID: "UUID1", - DeviceName: helper.StringToPtr("DeviceName1"), - MemoryMiB: helper.Uint64ToPtr(1), - PowerW: helper.UintToPtr(1), - BAR1MiB: helper.Uint64ToPtr(256), - }, - PowerUsageW: helper.UintToPtr(1), - GPUUtilization: helper.UintToPtr(1), - MemoryUtilization: helper.UintToPtr(1), - EncoderUtilization: helper.UintToPtr(1), - DecoderUtilization: helper.UintToPtr(1), - TemperatureC: helper.UintToPtr(1), - UsedMemoryMiB: helper.Uint64ToPtr(1), - BAR1UsedMiB: 
helper.Uint64ToPtr(1), - ECCErrorsL1Cache: helper.Uint64ToPtr(100), - ECCErrorsL2Cache: helper.Uint64ToPtr(100), - ECCErrorsDevice: nil, - }, - ExpectedResult: &device.DeviceStats{ - Summary: &structs.StatValue{ - Unit: MemoryStateUnit, - Desc: MemoryStateDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - IntDenominatorVal: helper.Int64ToPtr(1), - }, - Stats: &structs.StatObject{ - Attributes: map[string]*structs.StatValue{ - PowerUsageAttr: { - Unit: PowerUsageUnit, - Desc: PowerUsageDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - IntDenominatorVal: helper.Int64ToPtr(1), - }, - GPUUtilizationAttr: { - Unit: GPUUtilizationUnit, - Desc: GPUUtilizationDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - }, - MemoryUtilizationAttr: { - Unit: MemoryUtilizationUnit, - Desc: MemoryUtilizationDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - }, - EncoderUtilizationAttr: { - Unit: EncoderUtilizationUnit, - Desc: EncoderUtilizationDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - }, - DecoderUtilizationAttr: { - Unit: DecoderUtilizationUnit, - Desc: DecoderUtilizationDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - }, - TemperatureAttr: { - Unit: TemperatureUnit, - Desc: TemperatureDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - }, - MemoryStateAttr: { - Unit: MemoryStateUnit, - Desc: MemoryStateDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - IntDenominatorVal: helper.Int64ToPtr(1), - }, - BAR1StateAttr: { - Unit: BAR1StateUnit, - Desc: BAR1StateDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - IntDenominatorVal: helper.Int64ToPtr(256), - }, - ECCErrorsL1CacheAttr: { - Unit: ECCErrorsL1CacheUnit, - Desc: ECCErrorsL1CacheDesc, - IntNumeratorVal: helper.Int64ToPtr(100), - }, - ECCErrorsL2CacheAttr: { - Unit: ECCErrorsL2CacheUnit, - Desc: ECCErrorsL2CacheDesc, - IntNumeratorVal: helper.Int64ToPtr(100), - }, - ECCErrorsDeviceAttr: { - Unit: ECCErrorsDeviceUnit, - Desc: ECCErrorsDeviceDesc, - StringVal: helper.StringToPtr(notAvailable), - }, - }, - }, - Timestamp: time.Date(1974, 
time.May, 19, 1, 2, 3, 4, time.UTC), - }, - }, - } { - actualResult := statsForItem(testCase.ItemStat, testCase.Timestamp) - require.New(t).Equal(testCase.ExpectedResult, actualResult) - } -} - -func TestStatsForGroup(t *testing.T) { - for _, testCase := range []struct { - Name string - Timestamp time.Time - GroupStats []*nvml.StatsData - GroupName string - ExpectedResult *device.DeviceGroupStats - }{ - { - Name: "make sure that all data is transformed correctly", - Timestamp: time.Date(1974, time.May, 19, 1, 2, 3, 4, time.UTC), - GroupName: "DeviceName1", - GroupStats: []*nvml.StatsData{ - { - DeviceData: &nvml.DeviceData{ - UUID: "UUID1", - DeviceName: helper.StringToPtr("DeviceName1"), - MemoryMiB: helper.Uint64ToPtr(1), - PowerW: helper.UintToPtr(1), - BAR1MiB: helper.Uint64ToPtr(256), - }, - PowerUsageW: helper.UintToPtr(1), - GPUUtilization: helper.UintToPtr(1), - MemoryUtilization: helper.UintToPtr(1), - EncoderUtilization: helper.UintToPtr(1), - DecoderUtilization: helper.UintToPtr(1), - TemperatureC: helper.UintToPtr(1), - UsedMemoryMiB: helper.Uint64ToPtr(1), - BAR1UsedMiB: helper.Uint64ToPtr(1), - ECCErrorsL1Cache: helper.Uint64ToPtr(100), - ECCErrorsL2Cache: helper.Uint64ToPtr(100), - ECCErrorsDevice: helper.Uint64ToPtr(100), - }, - { - DeviceData: &nvml.DeviceData{ - UUID: "UUID2", - DeviceName: helper.StringToPtr("DeviceName2"), - MemoryMiB: helper.Uint64ToPtr(2), - PowerW: helper.UintToPtr(2), - BAR1MiB: helper.Uint64ToPtr(256), - }, - PowerUsageW: helper.UintToPtr(2), - GPUUtilization: helper.UintToPtr(2), - MemoryUtilization: helper.UintToPtr(2), - EncoderUtilization: helper.UintToPtr(2), - DecoderUtilization: helper.UintToPtr(2), - TemperatureC: helper.UintToPtr(2), - UsedMemoryMiB: helper.Uint64ToPtr(2), - BAR1UsedMiB: helper.Uint64ToPtr(2), - ECCErrorsL1Cache: helper.Uint64ToPtr(200), - ECCErrorsL2Cache: helper.Uint64ToPtr(200), - ECCErrorsDevice: helper.Uint64ToPtr(200), - }, - { - DeviceData: &nvml.DeviceData{ - UUID: "UUID3", - DeviceName: 
helper.StringToPtr("DeviceName3"), - MemoryMiB: helper.Uint64ToPtr(3), - PowerW: helper.UintToPtr(3), - BAR1MiB: helper.Uint64ToPtr(256), - }, - PowerUsageW: helper.UintToPtr(3), - GPUUtilization: helper.UintToPtr(3), - MemoryUtilization: helper.UintToPtr(3), - EncoderUtilization: helper.UintToPtr(3), - DecoderUtilization: helper.UintToPtr(3), - TemperatureC: helper.UintToPtr(3), - UsedMemoryMiB: helper.Uint64ToPtr(3), - BAR1UsedMiB: helper.Uint64ToPtr(3), - ECCErrorsL1Cache: helper.Uint64ToPtr(300), - ECCErrorsL2Cache: helper.Uint64ToPtr(300), - ECCErrorsDevice: helper.Uint64ToPtr(300), - }, - }, - ExpectedResult: &device.DeviceGroupStats{ - Vendor: vendor, - Type: deviceType, - Name: "DeviceName1", - InstanceStats: map[string]*device.DeviceStats{ - "UUID1": { - Summary: &structs.StatValue{ - Unit: MemoryStateUnit, - Desc: MemoryStateDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - IntDenominatorVal: helper.Int64ToPtr(1), - }, - Stats: &structs.StatObject{ - Attributes: map[string]*structs.StatValue{ - PowerUsageAttr: { - Unit: PowerUsageUnit, - Desc: PowerUsageDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - IntDenominatorVal: helper.Int64ToPtr(1), - }, - GPUUtilizationAttr: { - Unit: GPUUtilizationUnit, - Desc: GPUUtilizationDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - }, - MemoryUtilizationAttr: { - Unit: MemoryUtilizationUnit, - Desc: MemoryUtilizationDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - }, - EncoderUtilizationAttr: { - Unit: EncoderUtilizationUnit, - Desc: EncoderUtilizationDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - }, - DecoderUtilizationAttr: { - Unit: DecoderUtilizationUnit, - Desc: DecoderUtilizationDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - }, - TemperatureAttr: { - Unit: TemperatureUnit, - Desc: TemperatureDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - }, - MemoryStateAttr: { - Unit: MemoryStateUnit, - Desc: MemoryStateDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - IntDenominatorVal: helper.Int64ToPtr(1), - }, - 
BAR1StateAttr: { - Unit: BAR1StateUnit, - Desc: BAR1StateDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - IntDenominatorVal: helper.Int64ToPtr(256), - }, - ECCErrorsL1CacheAttr: { - Unit: ECCErrorsL1CacheUnit, - Desc: ECCErrorsL1CacheDesc, - IntNumeratorVal: helper.Int64ToPtr(100), - }, - ECCErrorsL2CacheAttr: { - Unit: ECCErrorsL2CacheUnit, - Desc: ECCErrorsL2CacheDesc, - IntNumeratorVal: helper.Int64ToPtr(100), - }, - ECCErrorsDeviceAttr: { - Unit: ECCErrorsDeviceUnit, - Desc: ECCErrorsDeviceDesc, - IntNumeratorVal: helper.Int64ToPtr(100), - }, - }, - }, - Timestamp: time.Date(1974, time.May, 19, 1, 2, 3, 4, time.UTC), - }, - "UUID2": { - Summary: &structs.StatValue{ - Unit: MemoryStateUnit, - Desc: MemoryStateDesc, - IntNumeratorVal: helper.Int64ToPtr(2), - IntDenominatorVal: helper.Int64ToPtr(2), - }, - Stats: &structs.StatObject{ - Attributes: map[string]*structs.StatValue{ - PowerUsageAttr: { - Unit: PowerUsageUnit, - Desc: PowerUsageDesc, - IntNumeratorVal: helper.Int64ToPtr(2), - IntDenominatorVal: helper.Int64ToPtr(2), - }, - GPUUtilizationAttr: { - Unit: GPUUtilizationUnit, - Desc: GPUUtilizationDesc, - IntNumeratorVal: helper.Int64ToPtr(2), - }, - MemoryUtilizationAttr: { - Unit: MemoryUtilizationUnit, - Desc: MemoryUtilizationDesc, - IntNumeratorVal: helper.Int64ToPtr(2), - }, - EncoderUtilizationAttr: { - Unit: EncoderUtilizationUnit, - Desc: EncoderUtilizationDesc, - IntNumeratorVal: helper.Int64ToPtr(2), - }, - DecoderUtilizationAttr: { - Unit: DecoderUtilizationUnit, - Desc: DecoderUtilizationDesc, - IntNumeratorVal: helper.Int64ToPtr(2), - }, - TemperatureAttr: { - Unit: TemperatureUnit, - Desc: TemperatureDesc, - IntNumeratorVal: helper.Int64ToPtr(2), - }, - MemoryStateAttr: { - Unit: MemoryStateUnit, - Desc: MemoryStateDesc, - IntNumeratorVal: helper.Int64ToPtr(2), - IntDenominatorVal: helper.Int64ToPtr(2), - }, - BAR1StateAttr: { - Unit: BAR1StateUnit, - Desc: BAR1StateDesc, - IntNumeratorVal: helper.Int64ToPtr(2), - IntDenominatorVal: 
helper.Int64ToPtr(256), - }, - ECCErrorsL1CacheAttr: { - Unit: ECCErrorsL1CacheUnit, - Desc: ECCErrorsL1CacheDesc, - IntNumeratorVal: helper.Int64ToPtr(200), - }, - ECCErrorsL2CacheAttr: { - Unit: ECCErrorsL2CacheUnit, - Desc: ECCErrorsL2CacheDesc, - IntNumeratorVal: helper.Int64ToPtr(200), - }, - ECCErrorsDeviceAttr: { - Unit: ECCErrorsDeviceUnit, - Desc: ECCErrorsDeviceDesc, - IntNumeratorVal: helper.Int64ToPtr(200), - }, - }, - }, - Timestamp: time.Date(1974, time.May, 19, 1, 2, 3, 4, time.UTC), - }, - "UUID3": { - Summary: &structs.StatValue{ - Unit: MemoryStateUnit, - Desc: MemoryStateDesc, - IntNumeratorVal: helper.Int64ToPtr(3), - IntDenominatorVal: helper.Int64ToPtr(3), - }, - Stats: &structs.StatObject{ - Attributes: map[string]*structs.StatValue{ - PowerUsageAttr: { - Unit: PowerUsageUnit, - Desc: PowerUsageDesc, - IntNumeratorVal: helper.Int64ToPtr(3), - IntDenominatorVal: helper.Int64ToPtr(3), - }, - GPUUtilizationAttr: { - Unit: GPUUtilizationUnit, - Desc: GPUUtilizationDesc, - IntNumeratorVal: helper.Int64ToPtr(3), - }, - MemoryUtilizationAttr: { - Unit: MemoryUtilizationUnit, - Desc: MemoryUtilizationDesc, - IntNumeratorVal: helper.Int64ToPtr(3), - }, - EncoderUtilizationAttr: { - Unit: EncoderUtilizationUnit, - Desc: EncoderUtilizationDesc, - IntNumeratorVal: helper.Int64ToPtr(3), - }, - DecoderUtilizationAttr: { - Unit: DecoderUtilizationUnit, - Desc: DecoderUtilizationDesc, - IntNumeratorVal: helper.Int64ToPtr(3), - }, - TemperatureAttr: { - Unit: TemperatureUnit, - Desc: TemperatureDesc, - IntNumeratorVal: helper.Int64ToPtr(3), - }, - MemoryStateAttr: { - Unit: MemoryStateUnit, - Desc: MemoryStateDesc, - IntNumeratorVal: helper.Int64ToPtr(3), - IntDenominatorVal: helper.Int64ToPtr(3), - }, - BAR1StateAttr: { - Unit: BAR1StateUnit, - Desc: BAR1StateDesc, - IntNumeratorVal: helper.Int64ToPtr(3), - IntDenominatorVal: helper.Int64ToPtr(256), - }, - ECCErrorsL1CacheAttr: { - Unit: ECCErrorsL1CacheUnit, - Desc: ECCErrorsL1CacheDesc, - IntNumeratorVal: 
helper.Int64ToPtr(300), - }, - ECCErrorsL2CacheAttr: { - Unit: ECCErrorsL2CacheUnit, - Desc: ECCErrorsL2CacheDesc, - IntNumeratorVal: helper.Int64ToPtr(300), - }, - ECCErrorsDeviceAttr: { - Unit: ECCErrorsDeviceUnit, - Desc: ECCErrorsDeviceDesc, - IntNumeratorVal: helper.Int64ToPtr(300), - }, - }, - }, - Timestamp: time.Date(1974, time.May, 19, 1, 2, 3, 4, time.UTC), - }, - }, - }, - }, - } { - actualResult := statsForGroup(testCase.GroupName, testCase.GroupStats, testCase.Timestamp) - require.New(t).Equal(testCase.ExpectedResult, actualResult) - } -} - -func TestWriteStatsToChannel(t *testing.T) { - for _, testCase := range []struct { - Name string - ExpectedWriteToChannel *device.StatsResponse - Timestamp time.Time - Device *NvidiaDevice - }{ - { - Name: "NVML wrapper returns error", - Timestamp: time.Date(1974, time.May, 19, 1, 2, 3, 4, time.UTC), - ExpectedWriteToChannel: &device.StatsResponse{ - Error: errors.New(""), - }, - Device: &NvidiaDevice{ - nvmlClient: &MockNvmlClient{ - StatsError: errors.New(""), - }, - logger: hclog.NewNullLogger(), - }, - }, - { - Name: "Check that stats with multiple DeviceNames are assigned to different groups", - Timestamp: time.Date(1974, time.May, 19, 1, 2, 3, 4, time.UTC), - Device: &NvidiaDevice{ - devices: map[string]struct{}{ - "UUID1": {}, - "UUID2": {}, - "UUID3": {}, - }, - nvmlClient: &MockNvmlClient{ - StatsResponseReturned: []*nvml.StatsData{ - { - DeviceData: &nvml.DeviceData{ - UUID: "UUID1", - DeviceName: helper.StringToPtr("DeviceName1"), - MemoryMiB: helper.Uint64ToPtr(1), - PowerW: helper.UintToPtr(1), - BAR1MiB: helper.Uint64ToPtr(256), - }, - PowerUsageW: helper.UintToPtr(1), - GPUUtilization: helper.UintToPtr(1), - MemoryUtilization: helper.UintToPtr(1), - EncoderUtilization: helper.UintToPtr(1), - DecoderUtilization: helper.UintToPtr(1), - TemperatureC: helper.UintToPtr(1), - UsedMemoryMiB: helper.Uint64ToPtr(1), - BAR1UsedMiB: helper.Uint64ToPtr(1), - ECCErrorsL1Cache: helper.Uint64ToPtr(100), - 
ECCErrorsL2Cache: helper.Uint64ToPtr(100), - ECCErrorsDevice: helper.Uint64ToPtr(100), - }, - { - DeviceData: &nvml.DeviceData{ - UUID: "UUID2", - DeviceName: helper.StringToPtr("DeviceName2"), - MemoryMiB: helper.Uint64ToPtr(2), - PowerW: helper.UintToPtr(2), - BAR1MiB: helper.Uint64ToPtr(256), - }, - PowerUsageW: helper.UintToPtr(2), - GPUUtilization: helper.UintToPtr(2), - MemoryUtilization: helper.UintToPtr(2), - EncoderUtilization: helper.UintToPtr(2), - DecoderUtilization: helper.UintToPtr(2), - TemperatureC: helper.UintToPtr(2), - UsedMemoryMiB: helper.Uint64ToPtr(2), - BAR1UsedMiB: helper.Uint64ToPtr(2), - ECCErrorsL1Cache: helper.Uint64ToPtr(200), - ECCErrorsL2Cache: helper.Uint64ToPtr(200), - ECCErrorsDevice: helper.Uint64ToPtr(200), - }, - { - DeviceData: &nvml.DeviceData{ - UUID: "UUID3", - DeviceName: helper.StringToPtr("DeviceName3"), - MemoryMiB: helper.Uint64ToPtr(3), - PowerW: helper.UintToPtr(3), - BAR1MiB: helper.Uint64ToPtr(256), - }, - PowerUsageW: helper.UintToPtr(3), - GPUUtilization: helper.UintToPtr(3), - MemoryUtilization: helper.UintToPtr(3), - EncoderUtilization: helper.UintToPtr(3), - DecoderUtilization: helper.UintToPtr(3), - TemperatureC: helper.UintToPtr(3), - UsedMemoryMiB: helper.Uint64ToPtr(3), - BAR1UsedMiB: helper.Uint64ToPtr(3), - ECCErrorsL1Cache: helper.Uint64ToPtr(300), - ECCErrorsL2Cache: helper.Uint64ToPtr(300), - ECCErrorsDevice: helper.Uint64ToPtr(300), - }, - }, - }, - logger: hclog.NewNullLogger(), - }, - ExpectedWriteToChannel: &device.StatsResponse{ - Groups: []*device.DeviceGroupStats{ - { - Vendor: vendor, - Type: deviceType, - Name: "DeviceName1", - InstanceStats: map[string]*device.DeviceStats{ - "UUID1": { - Summary: &structs.StatValue{ - Unit: MemoryStateUnit, - Desc: MemoryStateDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - IntDenominatorVal: helper.Int64ToPtr(1), - }, - Stats: &structs.StatObject{ - Attributes: map[string]*structs.StatValue{ - PowerUsageAttr: { - Unit: PowerUsageUnit, - Desc: 
PowerUsageDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - IntDenominatorVal: helper.Int64ToPtr(1), - }, - GPUUtilizationAttr: { - Unit: GPUUtilizationUnit, - Desc: GPUUtilizationDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - }, - MemoryUtilizationAttr: { - Unit: MemoryUtilizationUnit, - Desc: MemoryUtilizationDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - }, - EncoderUtilizationAttr: { - Unit: EncoderUtilizationUnit, - Desc: EncoderUtilizationDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - }, - DecoderUtilizationAttr: { - Unit: DecoderUtilizationUnit, - Desc: DecoderUtilizationDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - }, - TemperatureAttr: { - Unit: TemperatureUnit, - Desc: TemperatureDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - }, - MemoryStateAttr: { - Unit: MemoryStateUnit, - Desc: MemoryStateDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - IntDenominatorVal: helper.Int64ToPtr(1), - }, - BAR1StateAttr: { - Unit: BAR1StateUnit, - Desc: BAR1StateDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - IntDenominatorVal: helper.Int64ToPtr(256), - }, - ECCErrorsL1CacheAttr: { - Unit: ECCErrorsL1CacheUnit, - Desc: ECCErrorsL1CacheDesc, - IntNumeratorVal: helper.Int64ToPtr(100), - }, - ECCErrorsL2CacheAttr: { - Unit: ECCErrorsL2CacheUnit, - Desc: ECCErrorsL2CacheDesc, - IntNumeratorVal: helper.Int64ToPtr(100), - }, - ECCErrorsDeviceAttr: { - Unit: ECCErrorsDeviceUnit, - Desc: ECCErrorsDeviceDesc, - IntNumeratorVal: helper.Int64ToPtr(100), - }, - }, - }, - Timestamp: time.Date(1974, time.May, 19, 1, 2, 3, 4, time.UTC), - }, - }, - }, - { - Vendor: vendor, - Type: deviceType, - Name: "DeviceName2", - InstanceStats: map[string]*device.DeviceStats{ - "UUID2": { - Summary: &structs.StatValue{ - Unit: MemoryStateUnit, - Desc: MemoryStateDesc, - IntNumeratorVal: helper.Int64ToPtr(2), - IntDenominatorVal: helper.Int64ToPtr(2), - }, - Stats: &structs.StatObject{ - Attributes: map[string]*structs.StatValue{ - PowerUsageAttr: { - Unit: PowerUsageUnit, - Desc: 
PowerUsageDesc, - IntNumeratorVal: helper.Int64ToPtr(2), - IntDenominatorVal: helper.Int64ToPtr(2), - }, - GPUUtilizationAttr: { - Unit: GPUUtilizationUnit, - Desc: GPUUtilizationDesc, - IntNumeratorVal: helper.Int64ToPtr(2), - }, - MemoryUtilizationAttr: { - Unit: MemoryUtilizationUnit, - Desc: MemoryUtilizationDesc, - IntNumeratorVal: helper.Int64ToPtr(2), - }, - EncoderUtilizationAttr: { - Unit: EncoderUtilizationUnit, - Desc: EncoderUtilizationDesc, - IntNumeratorVal: helper.Int64ToPtr(2), - }, - DecoderUtilizationAttr: { - Unit: DecoderUtilizationUnit, - Desc: DecoderUtilizationDesc, - IntNumeratorVal: helper.Int64ToPtr(2), - }, - TemperatureAttr: { - Unit: TemperatureUnit, - Desc: TemperatureDesc, - IntNumeratorVal: helper.Int64ToPtr(2), - }, - MemoryStateAttr: { - Unit: MemoryStateUnit, - Desc: MemoryStateDesc, - IntNumeratorVal: helper.Int64ToPtr(2), - IntDenominatorVal: helper.Int64ToPtr(2), - }, - BAR1StateAttr: { - Unit: BAR1StateUnit, - Desc: BAR1StateDesc, - IntNumeratorVal: helper.Int64ToPtr(2), - IntDenominatorVal: helper.Int64ToPtr(256), - }, - ECCErrorsL1CacheAttr: { - Unit: ECCErrorsL1CacheUnit, - Desc: ECCErrorsL1CacheDesc, - IntNumeratorVal: helper.Int64ToPtr(200), - }, - ECCErrorsL2CacheAttr: { - Unit: ECCErrorsL2CacheUnit, - Desc: ECCErrorsL2CacheDesc, - IntNumeratorVal: helper.Int64ToPtr(200), - }, - ECCErrorsDeviceAttr: { - Unit: ECCErrorsDeviceUnit, - Desc: ECCErrorsDeviceDesc, - IntNumeratorVal: helper.Int64ToPtr(200), - }, - }, - }, - Timestamp: time.Date(1974, time.May, 19, 1, 2, 3, 4, time.UTC), - }, - }, - }, - { - Vendor: vendor, - Type: deviceType, - Name: "DeviceName3", - InstanceStats: map[string]*device.DeviceStats{ - "UUID3": { - Summary: &structs.StatValue{ - Unit: MemoryStateUnit, - Desc: MemoryStateDesc, - IntNumeratorVal: helper.Int64ToPtr(3), - IntDenominatorVal: helper.Int64ToPtr(3), - }, - Stats: &structs.StatObject{ - Attributes: map[string]*structs.StatValue{ - PowerUsageAttr: { - Unit: PowerUsageUnit, - Desc: 
PowerUsageDesc, - IntNumeratorVal: helper.Int64ToPtr(3), - IntDenominatorVal: helper.Int64ToPtr(3), - }, - GPUUtilizationAttr: { - Unit: GPUUtilizationUnit, - Desc: GPUUtilizationDesc, - IntNumeratorVal: helper.Int64ToPtr(3), - }, - MemoryUtilizationAttr: { - Unit: MemoryUtilizationUnit, - Desc: MemoryUtilizationDesc, - IntNumeratorVal: helper.Int64ToPtr(3), - }, - EncoderUtilizationAttr: { - Unit: EncoderUtilizationUnit, - Desc: EncoderUtilizationDesc, - IntNumeratorVal: helper.Int64ToPtr(3), - }, - DecoderUtilizationAttr: { - Unit: DecoderUtilizationUnit, - Desc: DecoderUtilizationDesc, - IntNumeratorVal: helper.Int64ToPtr(3), - }, - TemperatureAttr: { - Unit: TemperatureUnit, - Desc: TemperatureDesc, - IntNumeratorVal: helper.Int64ToPtr(3), - }, - MemoryStateAttr: { - Unit: MemoryStateUnit, - Desc: MemoryStateDesc, - IntNumeratorVal: helper.Int64ToPtr(3), - IntDenominatorVal: helper.Int64ToPtr(3), - }, - BAR1StateAttr: { - Unit: BAR1StateUnit, - Desc: BAR1StateDesc, - IntNumeratorVal: helper.Int64ToPtr(3), - IntDenominatorVal: helper.Int64ToPtr(256), - }, - ECCErrorsL1CacheAttr: { - Unit: ECCErrorsL1CacheUnit, - Desc: ECCErrorsL1CacheDesc, - IntNumeratorVal: helper.Int64ToPtr(300), - }, - ECCErrorsL2CacheAttr: { - Unit: ECCErrorsL2CacheUnit, - Desc: ECCErrorsL2CacheDesc, - IntNumeratorVal: helper.Int64ToPtr(300), - }, - ECCErrorsDeviceAttr: { - Unit: ECCErrorsDeviceUnit, - Desc: ECCErrorsDeviceDesc, - IntNumeratorVal: helper.Int64ToPtr(300), - }, - }, - }, - Timestamp: time.Date(1974, time.May, 19, 1, 2, 3, 4, time.UTC), - }, - }, - }, - }, - }, - }, - { - Name: "Check that stats with multiple DeviceNames are assigned to different groups 2", - Timestamp: time.Date(1974, time.May, 19, 1, 2, 3, 4, time.UTC), - Device: &NvidiaDevice{ - devices: map[string]struct{}{ - "UUID1": {}, - "UUID2": {}, - "UUID3": {}, - }, - nvmlClient: &MockNvmlClient{ - StatsResponseReturned: []*nvml.StatsData{ - { - DeviceData: &nvml.DeviceData{ - UUID: "UUID1", - DeviceName: 
helper.StringToPtr("DeviceName1"), - MemoryMiB: helper.Uint64ToPtr(1), - PowerW: helper.UintToPtr(1), - BAR1MiB: helper.Uint64ToPtr(256), - }, - PowerUsageW: helper.UintToPtr(1), - GPUUtilization: helper.UintToPtr(1), - MemoryUtilization: helper.UintToPtr(1), - EncoderUtilization: helper.UintToPtr(1), - DecoderUtilization: helper.UintToPtr(1), - TemperatureC: helper.UintToPtr(1), - UsedMemoryMiB: helper.Uint64ToPtr(1), - BAR1UsedMiB: helper.Uint64ToPtr(1), - ECCErrorsL1Cache: helper.Uint64ToPtr(100), - ECCErrorsL2Cache: helper.Uint64ToPtr(100), - ECCErrorsDevice: helper.Uint64ToPtr(100), - }, - { - DeviceData: &nvml.DeviceData{ - UUID: "UUID2", - DeviceName: helper.StringToPtr("DeviceName2"), - MemoryMiB: helper.Uint64ToPtr(2), - PowerW: helper.UintToPtr(2), - BAR1MiB: helper.Uint64ToPtr(256), - }, - PowerUsageW: helper.UintToPtr(2), - GPUUtilization: helper.UintToPtr(2), - MemoryUtilization: helper.UintToPtr(2), - EncoderUtilization: helper.UintToPtr(2), - DecoderUtilization: helper.UintToPtr(2), - TemperatureC: helper.UintToPtr(2), - UsedMemoryMiB: helper.Uint64ToPtr(2), - BAR1UsedMiB: helper.Uint64ToPtr(2), - ECCErrorsL1Cache: helper.Uint64ToPtr(200), - ECCErrorsL2Cache: helper.Uint64ToPtr(200), - ECCErrorsDevice: helper.Uint64ToPtr(200), - }, - { - DeviceData: &nvml.DeviceData{ - UUID: "UUID3", - DeviceName: helper.StringToPtr("DeviceName2"), - MemoryMiB: helper.Uint64ToPtr(3), - PowerW: helper.UintToPtr(3), - BAR1MiB: helper.Uint64ToPtr(256), - }, - PowerUsageW: helper.UintToPtr(3), - GPUUtilization: helper.UintToPtr(3), - MemoryUtilization: helper.UintToPtr(3), - EncoderUtilization: helper.UintToPtr(3), - DecoderUtilization: helper.UintToPtr(3), - TemperatureC: helper.UintToPtr(3), - UsedMemoryMiB: helper.Uint64ToPtr(3), - BAR1UsedMiB: helper.Uint64ToPtr(3), - ECCErrorsL1Cache: helper.Uint64ToPtr(300), - ECCErrorsL2Cache: helper.Uint64ToPtr(300), - ECCErrorsDevice: helper.Uint64ToPtr(300), - }, - }, - }, - logger: hclog.NewNullLogger(), - }, - 
ExpectedWriteToChannel: &device.StatsResponse{ - Groups: []*device.DeviceGroupStats{ - { - Vendor: vendor, - Type: deviceType, - Name: "DeviceName1", - InstanceStats: map[string]*device.DeviceStats{ - "UUID1": { - Summary: &structs.StatValue{ - Unit: MemoryStateUnit, - Desc: MemoryStateDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - IntDenominatorVal: helper.Int64ToPtr(1), - }, - Stats: &structs.StatObject{ - Attributes: map[string]*structs.StatValue{ - PowerUsageAttr: { - Unit: PowerUsageUnit, - Desc: PowerUsageDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - IntDenominatorVal: helper.Int64ToPtr(1), - }, - GPUUtilizationAttr: { - Unit: GPUUtilizationUnit, - Desc: GPUUtilizationDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - }, - MemoryUtilizationAttr: { - Unit: MemoryUtilizationUnit, - Desc: MemoryUtilizationDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - }, - EncoderUtilizationAttr: { - Unit: EncoderUtilizationUnit, - Desc: EncoderUtilizationDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - }, - DecoderUtilizationAttr: { - Unit: DecoderUtilizationUnit, - Desc: DecoderUtilizationDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - }, - TemperatureAttr: { - Unit: TemperatureUnit, - Desc: TemperatureDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - }, - MemoryStateAttr: { - Unit: MemoryStateUnit, - Desc: MemoryStateDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - IntDenominatorVal: helper.Int64ToPtr(1), - }, - BAR1StateAttr: { - Unit: BAR1StateUnit, - Desc: BAR1StateDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - IntDenominatorVal: helper.Int64ToPtr(256), - }, - ECCErrorsL1CacheAttr: { - Unit: ECCErrorsL1CacheUnit, - Desc: ECCErrorsL1CacheDesc, - IntNumeratorVal: helper.Int64ToPtr(100), - }, - ECCErrorsL2CacheAttr: { - Unit: ECCErrorsL2CacheUnit, - Desc: ECCErrorsL2CacheDesc, - IntNumeratorVal: helper.Int64ToPtr(100), - }, - ECCErrorsDeviceAttr: { - Unit: ECCErrorsDeviceUnit, - Desc: ECCErrorsDeviceDesc, - IntNumeratorVal: helper.Int64ToPtr(100), - }, - }, - }, - 
Timestamp: time.Date(1974, time.May, 19, 1, 2, 3, 4, time.UTC), - }, - }, - }, - { - Vendor: vendor, - Type: deviceType, - Name: "DeviceName2", - InstanceStats: map[string]*device.DeviceStats{ - "UUID3": { - Summary: &structs.StatValue{ - Unit: MemoryStateUnit, - Desc: MemoryStateDesc, - IntNumeratorVal: helper.Int64ToPtr(3), - IntDenominatorVal: helper.Int64ToPtr(3), - }, - Stats: &structs.StatObject{ - Attributes: map[string]*structs.StatValue{ - PowerUsageAttr: { - Unit: PowerUsageUnit, - Desc: PowerUsageDesc, - IntNumeratorVal: helper.Int64ToPtr(3), - IntDenominatorVal: helper.Int64ToPtr(3), - }, - GPUUtilizationAttr: { - Unit: GPUUtilizationUnit, - Desc: GPUUtilizationDesc, - IntNumeratorVal: helper.Int64ToPtr(3), - }, - MemoryUtilizationAttr: { - Unit: MemoryUtilizationUnit, - Desc: MemoryUtilizationDesc, - IntNumeratorVal: helper.Int64ToPtr(3), - }, - EncoderUtilizationAttr: { - Unit: EncoderUtilizationUnit, - Desc: EncoderUtilizationDesc, - IntNumeratorVal: helper.Int64ToPtr(3), - }, - DecoderUtilizationAttr: { - Unit: DecoderUtilizationUnit, - Desc: DecoderUtilizationDesc, - IntNumeratorVal: helper.Int64ToPtr(3), - }, - TemperatureAttr: { - Unit: TemperatureUnit, - Desc: TemperatureDesc, - IntNumeratorVal: helper.Int64ToPtr(3), - }, - MemoryStateAttr: { - Unit: MemoryStateUnit, - Desc: MemoryStateDesc, - IntNumeratorVal: helper.Int64ToPtr(3), - IntDenominatorVal: helper.Int64ToPtr(3), - }, - BAR1StateAttr: { - Unit: BAR1StateUnit, - Desc: BAR1StateDesc, - IntNumeratorVal: helper.Int64ToPtr(3), - IntDenominatorVal: helper.Int64ToPtr(256), - }, - ECCErrorsL1CacheAttr: { - Unit: ECCErrorsL1CacheUnit, - Desc: ECCErrorsL1CacheDesc, - IntNumeratorVal: helper.Int64ToPtr(300), - }, - ECCErrorsL2CacheAttr: { - Unit: ECCErrorsL2CacheUnit, - Desc: ECCErrorsL2CacheDesc, - IntNumeratorVal: helper.Int64ToPtr(300), - }, - ECCErrorsDeviceAttr: { - Unit: ECCErrorsDeviceUnit, - Desc: ECCErrorsDeviceDesc, - IntNumeratorVal: helper.Int64ToPtr(300), - }, - }, - }, - Timestamp: 
time.Date(1974, time.May, 19, 1, 2, 3, 4, time.UTC), - }, - "UUID2": { - Summary: &structs.StatValue{ - Unit: MemoryStateUnit, - Desc: MemoryStateDesc, - IntNumeratorVal: helper.Int64ToPtr(2), - IntDenominatorVal: helper.Int64ToPtr(2), - }, - Stats: &structs.StatObject{ - Attributes: map[string]*structs.StatValue{ - PowerUsageAttr: { - Unit: PowerUsageUnit, - Desc: PowerUsageDesc, - IntNumeratorVal: helper.Int64ToPtr(2), - IntDenominatorVal: helper.Int64ToPtr(2), - }, - GPUUtilizationAttr: { - Unit: GPUUtilizationUnit, - Desc: GPUUtilizationDesc, - IntNumeratorVal: helper.Int64ToPtr(2), - }, - MemoryUtilizationAttr: { - Unit: MemoryUtilizationUnit, - Desc: MemoryUtilizationDesc, - IntNumeratorVal: helper.Int64ToPtr(2), - }, - EncoderUtilizationAttr: { - Unit: EncoderUtilizationUnit, - Desc: EncoderUtilizationDesc, - IntNumeratorVal: helper.Int64ToPtr(2), - }, - DecoderUtilizationAttr: { - Unit: DecoderUtilizationUnit, - Desc: DecoderUtilizationDesc, - IntNumeratorVal: helper.Int64ToPtr(2), - }, - TemperatureAttr: { - Unit: TemperatureUnit, - Desc: TemperatureDesc, - IntNumeratorVal: helper.Int64ToPtr(2), - }, - MemoryStateAttr: { - Unit: MemoryStateUnit, - Desc: MemoryStateDesc, - IntNumeratorVal: helper.Int64ToPtr(2), - IntDenominatorVal: helper.Int64ToPtr(2), - }, - BAR1StateAttr: { - Unit: BAR1StateUnit, - Desc: BAR1StateDesc, - IntNumeratorVal: helper.Int64ToPtr(2), - IntDenominatorVal: helper.Int64ToPtr(256), - }, - ECCErrorsL1CacheAttr: { - Unit: ECCErrorsL1CacheUnit, - Desc: ECCErrorsL1CacheDesc, - IntNumeratorVal: helper.Int64ToPtr(200), - }, - ECCErrorsL2CacheAttr: { - Unit: ECCErrorsL2CacheUnit, - Desc: ECCErrorsL2CacheDesc, - IntNumeratorVal: helper.Int64ToPtr(200), - }, - ECCErrorsDeviceAttr: { - Unit: ECCErrorsDeviceUnit, - Desc: ECCErrorsDeviceDesc, - IntNumeratorVal: helper.Int64ToPtr(200), - }, - }, - }, - Timestamp: time.Date(1974, time.May, 19, 1, 2, 3, 4, time.UTC), - }, - }, - }, - }, - }, - }, - { - Name: "Check that only devices from 
NvidiaDevice.device map stats are reported", - Timestamp: time.Date(1974, time.May, 19, 1, 2, 3, 4, time.UTC), - Device: &NvidiaDevice{ - devices: map[string]struct{}{ - "UUID1": {}, - "UUID2": {}, - }, - nvmlClient: &MockNvmlClient{ - StatsResponseReturned: []*nvml.StatsData{ - { - DeviceData: &nvml.DeviceData{ - UUID: "UUID1", - DeviceName: helper.StringToPtr("DeviceName1"), - MemoryMiB: helper.Uint64ToPtr(1), - PowerW: helper.UintToPtr(1), - BAR1MiB: helper.Uint64ToPtr(256), - }, - PowerUsageW: helper.UintToPtr(1), - GPUUtilization: helper.UintToPtr(1), - MemoryUtilization: helper.UintToPtr(1), - EncoderUtilization: helper.UintToPtr(1), - DecoderUtilization: helper.UintToPtr(1), - TemperatureC: helper.UintToPtr(1), - UsedMemoryMiB: helper.Uint64ToPtr(1), - BAR1UsedMiB: helper.Uint64ToPtr(1), - ECCErrorsL1Cache: helper.Uint64ToPtr(100), - ECCErrorsL2Cache: helper.Uint64ToPtr(100), - ECCErrorsDevice: helper.Uint64ToPtr(100), - }, - { - DeviceData: &nvml.DeviceData{ - UUID: "UUID2", - DeviceName: helper.StringToPtr("DeviceName2"), - MemoryMiB: helper.Uint64ToPtr(2), - PowerW: helper.UintToPtr(2), - BAR1MiB: helper.Uint64ToPtr(256), - }, - PowerUsageW: helper.UintToPtr(2), - GPUUtilization: helper.UintToPtr(2), - MemoryUtilization: helper.UintToPtr(2), - EncoderUtilization: helper.UintToPtr(2), - DecoderUtilization: helper.UintToPtr(2), - TemperatureC: helper.UintToPtr(2), - UsedMemoryMiB: helper.Uint64ToPtr(2), - BAR1UsedMiB: helper.Uint64ToPtr(2), - ECCErrorsL1Cache: helper.Uint64ToPtr(200), - ECCErrorsL2Cache: helper.Uint64ToPtr(200), - ECCErrorsDevice: helper.Uint64ToPtr(200), - }, - { - DeviceData: &nvml.DeviceData{ - UUID: "UUID3", - DeviceName: helper.StringToPtr("DeviceName3"), - MemoryMiB: helper.Uint64ToPtr(3), - PowerW: helper.UintToPtr(3), - BAR1MiB: helper.Uint64ToPtr(256), - }, - PowerUsageW: helper.UintToPtr(3), - GPUUtilization: helper.UintToPtr(3), - MemoryUtilization: helper.UintToPtr(3), - EncoderUtilization: helper.UintToPtr(3), - 
DecoderUtilization: helper.UintToPtr(3), - TemperatureC: helper.UintToPtr(3), - UsedMemoryMiB: helper.Uint64ToPtr(3), - BAR1UsedMiB: helper.Uint64ToPtr(3), - ECCErrorsL1Cache: helper.Uint64ToPtr(300), - ECCErrorsL2Cache: helper.Uint64ToPtr(300), - ECCErrorsDevice: helper.Uint64ToPtr(300), - }, - }, - }, - logger: hclog.NewNullLogger(), - }, - ExpectedWriteToChannel: &device.StatsResponse{ - Groups: []*device.DeviceGroupStats{ - { - Vendor: vendor, - Type: deviceType, - Name: "DeviceName1", - InstanceStats: map[string]*device.DeviceStats{ - "UUID1": { - Summary: &structs.StatValue{ - Unit: MemoryStateUnit, - Desc: MemoryStateDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - IntDenominatorVal: helper.Int64ToPtr(1), - }, - Stats: &structs.StatObject{ - Attributes: map[string]*structs.StatValue{ - PowerUsageAttr: { - Unit: PowerUsageUnit, - Desc: PowerUsageDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - IntDenominatorVal: helper.Int64ToPtr(1), - }, - GPUUtilizationAttr: { - Unit: GPUUtilizationUnit, - Desc: GPUUtilizationDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - }, - MemoryUtilizationAttr: { - Unit: MemoryUtilizationUnit, - Desc: MemoryUtilizationDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - }, - EncoderUtilizationAttr: { - Unit: EncoderUtilizationUnit, - Desc: EncoderUtilizationDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - }, - DecoderUtilizationAttr: { - Unit: DecoderUtilizationUnit, - Desc: DecoderUtilizationDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - }, - TemperatureAttr: { - Unit: TemperatureUnit, - Desc: TemperatureDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - }, - MemoryStateAttr: { - Unit: MemoryStateUnit, - Desc: MemoryStateDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - IntDenominatorVal: helper.Int64ToPtr(1), - }, - BAR1StateAttr: { - Unit: BAR1StateUnit, - Desc: BAR1StateDesc, - IntNumeratorVal: helper.Int64ToPtr(1), - IntDenominatorVal: helper.Int64ToPtr(256), - }, - ECCErrorsL1CacheAttr: { - Unit: ECCErrorsL1CacheUnit, - Desc: 
ECCErrorsL1CacheDesc, - IntNumeratorVal: helper.Int64ToPtr(100), - }, - ECCErrorsL2CacheAttr: { - Unit: ECCErrorsL2CacheUnit, - Desc: ECCErrorsL2CacheDesc, - IntNumeratorVal: helper.Int64ToPtr(100), - }, - ECCErrorsDeviceAttr: { - Unit: ECCErrorsDeviceUnit, - Desc: ECCErrorsDeviceDesc, - IntNumeratorVal: helper.Int64ToPtr(100), - }, - }, - }, - Timestamp: time.Date(1974, time.May, 19, 1, 2, 3, 4, time.UTC), - }, - }, - }, - { - Vendor: vendor, - Type: deviceType, - Name: "DeviceName2", - InstanceStats: map[string]*device.DeviceStats{ - "UUID2": { - Summary: &structs.StatValue{ - Unit: MemoryStateUnit, - Desc: MemoryStateDesc, - IntNumeratorVal: helper.Int64ToPtr(2), - IntDenominatorVal: helper.Int64ToPtr(2), - }, - Stats: &structs.StatObject{ - Attributes: map[string]*structs.StatValue{ - PowerUsageAttr: { - Unit: PowerUsageUnit, - Desc: PowerUsageDesc, - IntNumeratorVal: helper.Int64ToPtr(2), - IntDenominatorVal: helper.Int64ToPtr(2), - }, - GPUUtilizationAttr: { - Unit: GPUUtilizationUnit, - Desc: GPUUtilizationDesc, - IntNumeratorVal: helper.Int64ToPtr(2), - }, - MemoryUtilizationAttr: { - Unit: MemoryUtilizationUnit, - Desc: MemoryUtilizationDesc, - IntNumeratorVal: helper.Int64ToPtr(2), - }, - EncoderUtilizationAttr: { - Unit: EncoderUtilizationUnit, - Desc: EncoderUtilizationDesc, - IntNumeratorVal: helper.Int64ToPtr(2), - }, - DecoderUtilizationAttr: { - Unit: DecoderUtilizationUnit, - Desc: DecoderUtilizationDesc, - IntNumeratorVal: helper.Int64ToPtr(2), - }, - TemperatureAttr: { - Unit: TemperatureUnit, - Desc: TemperatureDesc, - IntNumeratorVal: helper.Int64ToPtr(2), - }, - MemoryStateAttr: { - Unit: MemoryStateUnit, - Desc: MemoryStateDesc, - IntNumeratorVal: helper.Int64ToPtr(2), - IntDenominatorVal: helper.Int64ToPtr(2), - }, - BAR1StateAttr: { - Unit: BAR1StateUnit, - Desc: BAR1StateDesc, - IntNumeratorVal: helper.Int64ToPtr(2), - IntDenominatorVal: helper.Int64ToPtr(256), - }, - ECCErrorsL1CacheAttr: { - Unit: ECCErrorsL1CacheUnit, - Desc: 
ECCErrorsL1CacheDesc, - IntNumeratorVal: helper.Int64ToPtr(200), - }, - ECCErrorsL2CacheAttr: { - Unit: ECCErrorsL2CacheUnit, - Desc: ECCErrorsL2CacheDesc, - IntNumeratorVal: helper.Int64ToPtr(200), - }, - ECCErrorsDeviceAttr: { - Unit: ECCErrorsDeviceUnit, - Desc: ECCErrorsDeviceDesc, - IntNumeratorVal: helper.Int64ToPtr(200), - }, - }, - }, - Timestamp: time.Date(1974, time.May, 19, 1, 2, 3, 4, time.UTC), - }, - }, - }, - }, - }, - }, - } { - channel := make(chan *device.StatsResponse, 1) - testCase.Device.writeStatsToChannel(channel, testCase.Timestamp) - actualResult := <-channel - // writeStatsToChannel iterates over map keys - // and insterts results to an array, so order of elements in output array - // may be different - // actualResult, expectedWriteToChannel arrays has to be sorted firsted - sort.Slice(actualResult.Groups, func(i, j int) bool { - return actualResult.Groups[i].Name < actualResult.Groups[j].Name - }) - sort.Slice(testCase.ExpectedWriteToChannel.Groups, func(i, j int) bool { - return testCase.ExpectedWriteToChannel.Groups[i].Name < testCase.ExpectedWriteToChannel.Groups[j].Name - }) - require.New(t).Equal(testCase.ExpectedWriteToChannel, actualResult) - } -} diff --git a/helper/pluginutils/catalog/register_nvidia_linux.go b/helper/pluginutils/catalog/register_nvidia_linux.go deleted file mode 100644 index a50cbe833a75..000000000000 --- a/helper/pluginutils/catalog/register_nvidia_linux.go +++ /dev/null @@ -1,14 +0,0 @@ -// +build !nonvidia - -package catalog - -import ( - "github.com/hashicorp/nomad/devices/gpu/nvidia" -) - -// This file is where all builtin plugins should be registered in the catalog. -// Plugins with build restrictions should be placed in the appropriate -// register_XXX.go file. 
-func init() { - Register(nvidia.PluginID, nvidia.PluginConfig) -} diff --git a/website/content/docs/devices/nvidia.mdx b/website/content/docs/devices/external/nvidia.mdx similarity index 100% rename from website/content/docs/devices/nvidia.mdx rename to website/content/docs/devices/external/nvidia.mdx diff --git a/website/data/docs-nav-data.json b/website/data/docs-nav-data.json index 1751a892208c..9616f25aec44 100644 --- a/website/data/docs-nav-data.json +++ b/website/data/docs-nav-data.json @@ -1437,10 +1437,6 @@ "title": "Overview", "path": "devices" }, - { - "title": "Nvidia", - "path": "devices/nvidia" - }, { "title": "Community", "routes": [ @@ -1448,6 +1444,10 @@ "title": "Overview", "path": "devices/external" }, + { + "title": "Nvidia", + "path": "devices/external/nvidia" + }, { "title": "USB Beta", "path": "devices/external/usb" @@ -1760,7 +1760,7 @@ { "title": "Overview", "path": "enterprise" - }, + }, { "title": "License", "routes": [