diff --git a/README.md b/README.md index f344e6a..ff40d62 100644 --- a/README.md +++ b/README.md @@ -46,6 +46,8 @@ Priority order is `binary flag` ➡️ `env var` ➡️ `default value`. Nvidia-smi persistence mod is very useful, the option permits to run `nvidia-smi` as a daemon in background to prevent 100% of GPU load at each request. Enabling this option requires root. +All logs are written to syslog. + ## Metrics 📈 There are 6 differents metrics fetched, this number will grow in the future. @@ -59,12 +61,14 @@ There are 6 differents metrics fetched, this number will grow in the future. It creates an amount of time series equal to GPU amount with the label `gpu_id` + a GPU average. +A `bus_id` label also exists to identify your GPUs at the hardware level. + Example for 2 GPUs with `temperature.gpu` query, it will create: -| gpu_id | Value | -|---|---| -| gpu_0 | 50 | -| gpu_1 | 60 | -| gpu_avg | 55 | +| gpu_id | bus_id | Value | +|---|---|---| +| gpu_0 | 00000000:00:04.0 | 50 | +| gpu_1 | 00000000:00:05.0 | 60 | +| gpu_avg | null | 55 | ## Compile gcp-gpu-metrics ⚙ diff --git a/metrics.go b/metrics.go index 5dcf3cf..d481f65 100644 --- a/metrics.go +++ b/metrics.go @@ -108,6 +108,11 @@ func (s *service) createMetricsDescriptors() error { ValueType: label.LabelDescriptor_STRING, Description: "related gpu_id for " + fquery + " metric", }, + { + Key: "bus_id", + ValueType: label.LabelDescriptor_STRING, + Description: "related bus_id for " + fquery + " metric", + }, }, }, } @@ -154,19 +159,23 @@ func (s *service) fetchMetric(q nvidiasmiQuery, id int) { _ = s.slog.Err(err.Error()) } - s.createTimeSeries(value, &q, fmt.Sprint(id)) + if id >= 0 { + busID, err := getGPUbusID(id) + if err != nil { + _ = s.slog.Err(err.Error()) + } + + s.createTimeSeries(value, &q, fmt.Sprint(id), busID) + } else { + s.createTimeSeries(value, &q, "avg", "null") + } } -func (s *service) createTimeSeries(value int64, q *nvidiasmiQuery, id string) { +func (s *service) 
createTimeSeries(value int64, q *nvidiasmiQuery, id string, busID string) { now := time.Now() fquery := q.gcpFormat() - // dirty hack to set the label to average - if id == "-1" { - id = "avg" - } - req := &monitoringpb.CreateTimeSeriesRequest{ Name: "projects/" + s.projectID, TimeSeries: []*monitoringpb.TimeSeries{ @@ -175,6 +184,7 @@ func (s *service) createTimeSeries(value int64, q *nvidiasmiQuery, id string) { Type: "custom.googleapis.com/gpu/" + fquery, Labels: map[string]string{ "gpu_id": "gpu_" + id, + "bus_id": busID, }, }, Resource: &monitoredres.MonitoredResource{ diff --git a/nvidiasmi.go b/nvidiasmi.go index 21af9f2..fd4b8a2 100644 --- a/nvidiasmi.go +++ b/nvidiasmi.go @@ -64,10 +64,14 @@ func (q *nvidiasmiQuery) gcpFormat() string { return strings.ReplaceAll(q.Name, ".", "_") } +const ( + queryFormat string = "-u --format=csv,noheader" +) + func getGPUAmount() (int, error) { o, err := exec.Command("/bin/sh", "-c", - "nvidia-smi --query-gpu=index -u --format=csv,noheader", + "nvidia-smi --query-gpu=index "+queryFormat, ).Output() if err != nil { return 0, fmt.Errorf("%s - %s", err.Error(), string(o)) @@ -81,14 +85,27 @@ func getGPUAmount() (int, error) { return amount, nil } +func getGPUbusID(id int) (string, error) { + o, err := exec.Command("/bin/sh", + "-c", + fmt.Sprintf("nvidia-smi --query-gpu=pci.bus_id --id=%d "+queryFormat, + id), + ).Output() + if err != nil { + return "", fmt.Errorf("%s - %s", err.Error(), string(o)) + } + + return strings.Split(string(o), "\n")[0], nil +} + func getGPUMetric(query string, id int) (int64, string, error) { var cmd string if id >= 0 { - cmd = fmt.Sprintf("nvidia-smi --id=%d --query-gpu=%s -u --format=csv,noheader", + cmd = fmt.Sprintf("nvidia-smi --id=%d --query-gpu=%s "+queryFormat, id, query) } else { - cmd = fmt.Sprintf("nvidia-smi --query-gpu=%s -u --format=csv,noheader", + cmd = fmt.Sprintf("nvidia-smi --query-gpu=%s "+queryFormat, query) }