Skip to content

Commit

Permalink
feat: add bus_id label to identify the gpu at hardware level
Browse files Browse the repository at this point in the history
Signed-off-by: Valentin Pichard <7628998+w3st3ry@users.noreply.github.com>
  • Loading branch information
w3st3ry committed Jan 6, 2021
1 parent d5b9b3c commit 3eb0000
Show file tree
Hide file tree
Showing 3 changed files with 46 additions and 15 deletions.
14 changes: 9 additions & 5 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,8 @@ Priority order is `binary flag` ➡️ `env var` ➡️ `default value`.

Nvidia-smi persistence mode is very useful: this option runs `nvidia-smi` as a daemon in the background, which prevents the GPU load from hitting 100% at each request. Enabling this option requires root.

All logs are written to syslog.

## Metrics 📈

There are 6 different metrics fetched; this number will grow in the future.
Expand All @@ -59,12 +61,14 @@ There are 6 differents metrics fetched, this number will grow in the future.

It creates one time series per GPU, identified by the `gpu_id` label, plus one extra time series for the average across all GPUs.

A `bus_id` label also exists to identify your GPUs at the hardware level.

Example for 2 GPUs with the `temperature.gpu` query. It will create:
| gpu_id | Value |
|---|---|
| gpu_0 | 50 |
| gpu_1 | 60 |
| gpu_avg | 55 |
| gpu_id | bus_id | Value |
|---|---|---|
| gpu_0 | 00000000:00:04.0 | 50 |
| gpu_1 | 00000000:00:05.0 | 60 |
| gpu_avg | null | 55 |

## Compile gcp-gpu-metrics ⚙

Expand Down
24 changes: 17 additions & 7 deletions metrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -108,6 +108,11 @@ func (s *service) createMetricsDescriptors() error {
ValueType: label.LabelDescriptor_STRING,
Description: "related gpu_id for " + fquery + " metric",
},
{
Key: "bus_id",
ValueType: label.LabelDescriptor_STRING,
Description: "related bus_id for " + fquery + " metric",
},
},
},
}
Expand Down Expand Up @@ -154,19 +159,23 @@ func (s *service) fetchMetric(q nvidiasmiQuery, id int) {
_ = s.slog.Err(err.Error())
}

s.createTimeSeries(value, &q, fmt.Sprint(id))
if id >= 0 {
busID, err := getGPUbusID(id)
if err != nil {
_ = s.slog.Err(err.Error())
}

s.createTimeSeries(value, &q, fmt.Sprint(id), busID)
} else {
s.createTimeSeries(value, &q, "avg", "null")
}
}

func (s *service) createTimeSeries(value int64, q *nvidiasmiQuery, id string) {
func (s *service) createTimeSeries(value int64, q *nvidiasmiQuery, id string, busID string) {
now := time.Now()

fquery := q.gcpFormat()

// dirty hack to set the label to average
if id == "-1" {
id = "avg"
}

req := &monitoringpb.CreateTimeSeriesRequest{
Name: "projects/" + s.projectID,
TimeSeries: []*monitoringpb.TimeSeries{
Expand All @@ -175,6 +184,7 @@ func (s *service) createTimeSeries(value int64, q *nvidiasmiQuery, id string) {
Type: "custom.googleapis.com/gpu/" + fquery,
Labels: map[string]string{
"gpu_id": "gpu_" + id,
"bus_id": busID,
},
},
Resource: &monitoredres.MonitoredResource{
Expand Down
23 changes: 20 additions & 3 deletions nvidiasmi.go
Original file line number Diff line number Diff line change
Expand Up @@ -64,10 +64,14 @@ func (q *nvidiasmiQuery) gcpFormat() string {
return strings.ReplaceAll(q.Name, ".", "_")
}

// queryFormat is the trailing argument string appended to every nvidia-smi
// query command built in this file: the -u flag followed by CSV output
// without a header row.
const queryFormat string = "-u --format=csv,noheader"

func getGPUAmount() (int, error) {
o, err := exec.Command("/bin/sh",
"-c",
"nvidia-smi --query-gpu=index -u --format=csv,noheader",
"nvidia-smi --query-gpu=index "+queryFormat,
).Output()
if err != nil {
return 0, fmt.Errorf("%s - %s", err.Error(), string(o))
Expand All @@ -81,14 +85,27 @@ func getGPUAmount() (int, error) {
return amount, nil
}

// getGPUbusID returns the PCI bus ID that nvidia-smi reports for the GPU at
// index id.
//
// It runs `nvidia-smi --query-gpu=pci.bus_id --id=<id>` followed by
// queryFormat through /bin/sh and returns the first line of the command's
// standard output with surrounding whitespace removed.
//
// An error is returned when the command fails, or when it succeeds but prints
// no bus ID on its first line; in both cases the returned string is empty.
func getGPUbusID(id int) (string, error) {
	cmd := fmt.Sprintf("nvidia-smi --query-gpu=pci.bus_id --id=%d "+queryFormat, id)

	o, err := exec.Command("/bin/sh", "-c", cmd).Output()
	if err != nil {
		// %w renders the same message as err.Error() while keeping the
		// underlying error reachable through errors.Is / errors.As.
		return "", fmt.Errorf("%w - %s", err, string(o))
	}

	// Only the first output line is used. Trimming drops a trailing "\r" or
	// padding that would otherwise end up inside the bus_id label value.
	busID := strings.TrimSpace(strings.Split(string(o), "\n")[0])
	if busID == "" {
		// Report the problem instead of silently handing back an empty label.
		return "", fmt.Errorf("nvidia-smi returned no bus id for gpu %d", id)
	}

	return busID, nil
}

func getGPUMetric(query string, id int) (int64, string, error) {
var cmd string

if id >= 0 {
cmd = fmt.Sprintf("nvidia-smi --id=%d --query-gpu=%s -u --format=csv,noheader",
cmd = fmt.Sprintf("nvidia-smi --id=%d --query-gpu=%s "+queryFormat,
id, query)
} else {
cmd = fmt.Sprintf("nvidia-smi --query-gpu=%s -u --format=csv,noheader",
cmd = fmt.Sprintf("nvidia-smi --query-gpu=%s "+queryFormat,
query)
}

Expand Down

0 comments on commit 3eb0000

Please sign in to comment.