go.d nvidia_smi remove "csv" mode (netdata#18311)
ilyam8 authored Aug 12, 2024
1 parent f7a6066 commit 80c0093
Showing 15 changed files with 356 additions and 1,253 deletions.
47 changes: 3 additions & 44 deletions src/go/plugin/go.d/modules/nvidia_smi/charts.go
@@ -53,16 +53,6 @@ var (
migDeviceFrameBufferMemoryUsageChartTmpl.Copy(),
migDeviceBAR1MemoryUsageChartTmpl.Copy(),
}
gpuCSVCharts = module.Charts{
gpuFanSpeedPercChartTmpl.Copy(),
gpuUtilizationChartTmpl.Copy(),
gpuMemUtilizationChartTmpl.Copy(),
gpuFrameBufferMemoryUsageChartTmpl.Copy(),
gpuTemperatureChartTmpl.Copy(),
gpuClockFreqChartTmpl.Copy(),
gpuPowerDrawChartTmpl.Copy(),
gpuPerformanceStateChartTmpl.Copy(),
}
)

var (
@@ -271,7 +261,7 @@ var (
}
)

func (nv *NvidiaSMI) addGPUXMLCharts(gpu xmlGPUInfo) {
func (nv *NvidiaSmi) addGPUXMLCharts(gpu gpuInfo) {
charts := gpuXMLCharts.Copy()

if !isValidValue(gpu.Utilization.GpuUtil) {
@@ -318,37 +308,6 @@ func (nv *NvidiaSMI) addGPUXMLCharts(gpu xmlGPUInfo) {
}
}

func (nv *NvidiaSMI) addGPUCSVCharts(gpu csvGPUInfo) {
charts := gpuCSVCharts.Copy()

if !isValidValue(gpu.utilizationGPU) {
_ = charts.Remove(gpuUtilizationChartTmpl.ID)
}
if !isValidValue(gpu.utilizationMemory) {
_ = charts.Remove(gpuMemUtilizationChartTmpl.ID)
}
if !isValidValue(gpu.fanSpeed) {
_ = charts.Remove(gpuFanSpeedPercChartTmpl.ID)
}
if !isValidValue(gpu.powerDraw) {
_ = charts.Remove(gpuPowerDrawChartTmpl.ID)
}

for _, c := range *charts {
c.ID = fmt.Sprintf(c.ID, strings.ToLower(gpu.uuid))
c.Labels = []module.Label{
{Key: "product_name", Value: gpu.name},
}
for _, d := range c.Dims {
d.ID = fmt.Sprintf(d.ID, gpu.uuid)
}
}

if err := nv.Charts().Add(*charts...); err != nil {
nv.Warning(err)
}
}

var (
migDeviceFrameBufferMemoryUsageChartTmpl = module.Chart{
ID: "mig_instance_%s_gpu_%s_frame_buffer_memory_usage",
@@ -379,7 +338,7 @@ var (
}
)

func (nv *NvidiaSMI) addMIGDeviceXMLCharts(gpu xmlGPUInfo, mig xmlMIGDeviceInfo) {
func (nv *NvidiaSmi) addMIGDeviceCharts(gpu gpuInfo, mig gpuMIGDeviceInfo) {
charts := migDeviceXMLCharts.Copy()

for _, c := range *charts {
@@ -399,7 +358,7 @@ func (nv *NvidiaSMI) addMIGDeviceXMLCharts(gpu xmlGPUInfo, mig xmlMIGDeviceInfo)
}
}

func (nv *NvidiaSMI) removeCharts(prefix string) {
func (nv *NvidiaSmi) removeCharts(prefix string) {
prefix = strings.ToLower(prefix)

for _, c := range *nv.Charts() {
142 changes: 137 additions & 5 deletions src/go/plugin/go.d/modules/nvidia_smi/collect.go
@@ -3,12 +3,14 @@
package nvidia_smi

import (
"encoding/xml"
"errors"
"fmt"
"strconv"
"strings"
)

func (nv *NvidiaSMI) collect() (map[string]int64, error) {
func (nv *NvidiaSmi) collect() (map[string]int64, error) {
if nv.exec == nil {
return nil, errors.New("nvidia-smi exec is not initialized")
}
@@ -22,11 +24,141 @@ func (nv *NvidiaSMI) collect() (map[string]int64, error) {
return mx, nil
}

func (nv *NvidiaSMI) collectGPUInfo(mx map[string]int64) error {
if nv.UseCSVFormat {
return nv.collectGPUInfoCSV(mx)
func (nv *NvidiaSmi) collectGPUInfo(mx map[string]int64) error {
bs, err := nv.exec.queryGPUInfo()
if err != nil {
return fmt.Errorf("error on quering XML GPU info: %v", err)
}

info := &gpusInfo{}
if err := xml.Unmarshal(bs, info); err != nil {
return fmt.Errorf("error on unmarshaling XML GPU info response: %v", err)
}

seenGPU := make(map[string]bool)
seenMIG := make(map[string]bool)

for _, gpu := range info.GPUs {
if !isValidValue(gpu.UUID) {
continue
}

px := "gpu_" + gpu.UUID + "_"

seenGPU[px] = true

if !nv.gpus[px] {
nv.gpus[px] = true
nv.addGPUXMLCharts(gpu)
}

addMetric(mx, px+"pcie_bandwidth_usage_rx", gpu.PCI.RxUtil, 1024) // KB => bytes
addMetric(mx, px+"pcie_bandwidth_usage_tx", gpu.PCI.TxUtil, 1024) // KB => bytes
if maxBw := calcMaxPCIEBandwidth(gpu); maxBw > 0 {
rx := parseFloat(gpu.PCI.RxUtil) * 1024 // KB => bytes
tx := parseFloat(gpu.PCI.TxUtil) * 1024 // KB => bytes
mx[px+"pcie_bandwidth_utilization_rx"] = int64((rx * 100 / maxBw) * 100)
mx[px+"pcie_bandwidth_utilization_tx"] = int64((tx * 100 / maxBw) * 100)
}
addMetric(mx, px+"fan_speed_perc", gpu.FanSpeed, 0)
addMetric(mx, px+"gpu_utilization", gpu.Utilization.GpuUtil, 0)
addMetric(mx, px+"mem_utilization", gpu.Utilization.MemoryUtil, 0)
addMetric(mx, px+"decoder_utilization", gpu.Utilization.DecoderUtil, 0)
addMetric(mx, px+"encoder_utilization", gpu.Utilization.EncoderUtil, 0)
addMetric(mx, px+"frame_buffer_memory_usage_free", gpu.FBMemoryUsage.Free, 1024*1024) // MiB => bytes
addMetric(mx, px+"frame_buffer_memory_usage_used", gpu.FBMemoryUsage.Used, 1024*1024) // MiB => bytes
addMetric(mx, px+"frame_buffer_memory_usage_reserved", gpu.FBMemoryUsage.Reserved, 1024*1024) // MiB => bytes
addMetric(mx, px+"bar1_memory_usage_free", gpu.Bar1MemoryUsage.Free, 1024*1024) // MiB => bytes
addMetric(mx, px+"bar1_memory_usage_used", gpu.Bar1MemoryUsage.Used, 1024*1024) // MiB => bytes
addMetric(mx, px+"temperature", gpu.Temperature.GpuTemp, 0)
addMetric(mx, px+"graphics_clock", gpu.Clocks.GraphicsClock, 0)
addMetric(mx, px+"video_clock", gpu.Clocks.VideoClock, 0)
addMetric(mx, px+"sm_clock", gpu.Clocks.SmClock, 0)
addMetric(mx, px+"mem_clock", gpu.Clocks.MemClock, 0)
if gpu.PowerReadings != nil {
addMetric(mx, px+"power_draw", gpu.PowerReadings.PowerDraw, 0)
} else if gpu.GPUPowerReadings != nil {
addMetric(mx, px+"power_draw", gpu.GPUPowerReadings.PowerDraw, 0)
}
addMetric(mx, px+"voltage", gpu.Voltage.GraphicsVolt, 0)
for i := 0; i < 16; i++ {
s := "P" + strconv.Itoa(i)
mx[px+"performance_state_"+s] = boolToInt(gpu.PerformanceState == s)
}
if isValidValue(gpu.MIGMode.CurrentMIG) {
mode := strings.ToLower(gpu.MIGMode.CurrentMIG)
mx[px+"mig_current_mode_enabled"] = boolToInt(mode == "enabled")
mx[px+"mig_current_mode_disabled"] = boolToInt(mode == "disabled")
mx[px+"mig_devices_count"] = int64(len(gpu.MIGDevices.MIGDevice))
}

for _, mig := range gpu.MIGDevices.MIGDevice {
if !isValidValue(mig.GPUInstanceID) {
continue
}

px := "mig_instance_" + mig.GPUInstanceID + "_" + px

seenMIG[px] = true

if !nv.migs[px] {
nv.migs[px] = true
nv.addMIGDeviceCharts(gpu, mig)
}

addMetric(mx, px+"ecc_error_sram_uncorrectable", mig.ECCErrorCount.VolatileCount.SRAMUncorrectable, 0)
addMetric(mx, px+"frame_buffer_memory_usage_free", mig.FBMemoryUsage.Free, 1024*1024) // MiB => bytes
addMetric(mx, px+"frame_buffer_memory_usage_used", mig.FBMemoryUsage.Used, 1024*1024) // MiB => bytes
addMetric(mx, px+"frame_buffer_memory_usage_reserved", mig.FBMemoryUsage.Reserved, 1024*1024) // MiB => bytes
addMetric(mx, px+"bar1_memory_usage_free", mig.BAR1MemoryUsage.Free, 1024*1024) // MiB => bytes
addMetric(mx, px+"bar1_memory_usage_used", mig.BAR1MemoryUsage.Used, 1024*1024) // MiB => bytes
}
}

for px := range nv.gpus {
if !seenGPU[px] {
delete(nv.gpus, px)
nv.removeCharts(px)
}
}

for px := range nv.migs {
if !seenMIG[px] {
delete(nv.migs, px)
nv.removeCharts(px)
}
}
return nv.collectGPUInfoXML(mx)

return nil
}

func calcMaxPCIEBandwidth(gpu gpuInfo) float64 {
gen := gpu.PCI.PCIGPULinkInfo.PCIEGen.MaxLinkGen
width := strings.TrimSuffix(gpu.PCI.PCIGPULinkInfo.LinkWidths.MaxLinkWidth, "x")

if !isValidValue(gen) || !isValidValue(width) {
return 0
}

// https://enterprise-support.nvidia.com/s/article/understanding-pcie-configuration-for-maximum-performance
var speed, enc float64
switch gen {
case "1":
speed, enc = 2.5, 1.0/5.0
case "2":
speed, enc = 5, 1.0/5.0
case "3":
speed, enc = 8, 2.0/130.0
case "4":
speed, enc = 16, 2.0/130.0
case "5":
speed, enc = 32, 2.0/130.0
default:
return 0
}

// Maximum PCIe Bandwidth = SPEED * WIDTH * (1 - ENCODING) - 1Gb/s
return (speed*parseFloat(width)*(1-enc) - 1) * 1e9 / 8 // Gb/s => bytes
}

func addMetric(mx map[string]int64, key, value string, mul int) {
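As a sanity check on the bandwidth formula in calcMaxPCIEBandwidth above, here is a minimal standalone sketch (the function and program names are illustrative, not part of this commit) that reproduces the calculation for two common link configurations:

package main

import "fmt"

// maxPCIeBandwidthBytes mirrors the formula used in calcMaxPCIEBandwidth:
// max bandwidth = SPEED * WIDTH * (1 - ENCODING overhead) - 1 Gb/s, converted to bytes/s.
func maxPCIeBandwidthBytes(speedGTs, width, enc float64) float64 {
	return (speedGTs*width*(1-enc) - 1) * 1e9 / 8
}

func main() {
	// PCIe Gen3 x16: 8 GT/s per lane, 128b/130b encoding -> roughly 15.6 GB/s
	fmt.Printf("Gen3 x16: %.1f GB/s\n", maxPCIeBandwidthBytes(8, 16, 2.0/130.0)/1e9)
	// PCIe Gen4 x16: 16 GT/s per lane, 128b/130b encoding -> roughly 31.4 GB/s
	fmt.Printf("Gen4 x16: %.1f GB/s\n", maxPCIeBandwidthBytes(16, 16, 2.0/130.0)/1e9)
}

These figures are in line with the commonly quoted ~16 GB/s and ~31.5 GB/s theoretical maxima for Gen3 and Gen4 x16 links, so the utilization percentages derived from maxBw in collectGPUInfo stay on a sensible scale.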
(Diffs for the remaining 13 changed files are not shown.)
