forked from NVIDIA/k8s-device-plugin
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathnvidia.go
98 lines (80 loc) · 1.93 KB
/
nvidia.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
// Copyright (c) 2017, NVIDIA CORPORATION. All rights reserved.
package main
import (
"log"
"strings"
"github.com/NVIDIA/nvidia-docker/src/nvml"
"golang.org/x/net/context"
pluginapi "k8s.io/kubernetes/pkg/kubelet/apis/deviceplugin/v1alpha"
)
// check aborts the program via log.Panicln when err is non-nil and is a
// no-op otherwise.
func check(err error) {
	if err == nil {
		return
	}
	log.Panicln("Fatal:", err)
}
// getDevices asks nvml for the device count and returns one pluginapi.Device
// per device index, with ID set to the device UUID and Health set to
// pluginapi.Healthy. Any nvml error is passed to check, which panics.
func getDevices() []*pluginapi.Device {
	count, err := nvml.GetDeviceCount()
	check(err)

	var devices []*pluginapi.Device
	for idx := uint(0); idx < count; idx++ {
		gpu, err := nvml.NewDeviceLite(idx)
		check(err)

		dev := &pluginapi.Device{
			ID:     gpu.UUID,
			Health: pluginapi.Healthy,
		}
		devices = append(devices, dev)
	}
	return devices
}
// deviceExists reports whether any device in devs has an ID equal to id.
func deviceExists(devs []*pluginapi.Device, id string) bool {
	for i := range devs {
		if devs[i].ID == id {
			return true
		}
	}
	return false
}
// watchXIDs registers for XID critical-error events on every device in devs
// and sends each device it considers unhealthy on xids. It returns once ctx
// is cancelled.
//
// A device whose event registration fails with an error ending in
// "Not Supported" is sent on xids immediately and is not monitored further;
// any other registration error panics. Events carrying XID 31, 43 or 45 are
// ignored. An event without a UUID causes every device in devs to be sent.
func watchXIDs(ctx context.Context, devs []*pluginapi.Device, xids chan<- *pluginapi.Device) {
	eventSet := nvml.NewEventSet()
	defer nvml.DeleteEventSet(eventSet)

	// report delivers d on xids but gives up when ctx is cancelled first, so
	// a send can never block forever after the receiver has stopped reading.
	// It returns false when the context ended before the send completed.
	report := func(d *pluginapi.Device) bool {
		select {
		case xids <- d:
			return true
		case <-ctx.Done():
			return false
		}
	}

	for _, d := range devs {
		err := nvml.RegisterEventForDevice(eventSet, nvml.XidCriticalError, d.ID)
		if err != nil && strings.HasSuffix(err.Error(), "Not Supported") {
			// The format string has two verbs: the UUID and the error.
			log.Printf("Warning: GPU with UUID %s is too old to support healtchecking with error: %s. Marking it unhealthy.", d.ID, err)
			if !report(d) {
				return
			}
			continue
		}

		if err != nil {
			log.Panicln("Fatal:", err)
		}
	}

	for {
		// Non-blocking cancellation check before each wait.
		select {
		case <-ctx.Done():
			return
		default:
		}

		// NOTE(review): the timeout of 5000 is presumably in milliseconds;
		// confirm against the nvml binding.
		e, err := nvml.WaitForEvent(eventSet, 5000)
		// Skip the iteration when the wait failed OR the event is not an XID
		// critical error; only a successful XID event is acted upon.
		if err != nil || e.Etype != nvml.XidCriticalError {
			continue
		}

		// FIXME: formalize the full list and document it.
		// http://docs.nvidia.com/deploy/xid-errors/index.html#topic_4
		// Application errors: the GPU should still be healthy
		if e.Edata == 31 || e.Edata == 43 || e.Edata == 45 {
			continue
		}

		if e.UUID == nil || len(*e.UUID) == 0 {
			// All devices are unhealthy
			for _, d := range devs {
				if !report(d) {
					return
				}
			}
			continue
		}

		for _, d := range devs {
			if d.ID == *e.UUID {
				if !report(d) {
					return
				}
			}
		}
	}
}