forked from antonzhelyazkov/nagiosScripts
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathcheck_nvidiasmi.sh
109 lines (86 loc) · 4.29 KB
/
check_nvidiasmi.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
#!/bin/bash
#
# Nagios check for NVIDIA GPU load, based on the XML report of nvidia-smi.
#
# Options (each one takes a value):
#   -w, --warning  N    utilization percentage treated as WARNING  (default 85)
#   -c, --critical N    utilization percentage treated as CRITICAL (default 95)
#   -t, --tmp      DIR  directory for the temporary XML dump       (default /tmp)

# Defaults; parse_options overrides them from the command line.
warning="85"
critical="95"
tmpXmlFileName=nvidia.xml
tmpXmlDir=/tmp

#######################################
# Parses the command line into the globals warning, critical and tmpXmlDir.
# Arguments: the script's own arguments ("$@").
# Exits:     3 (UNKNOWN) when getopt rejects the command line or a threshold
#            is not a non-negative integer.
#######################################
parse_options() {
  local OPTS
  # The ':' after each short option makes it take a value, like its long form.
  if ! OPTS=$(getopt -o w:c:t: --long warning:,critical:,tmp: -n 'parse-options' -- "$@"); then
    echo "Failed parsing options." >&2
    exit 3
  fi

  # getopt emits a shell-quoted, normalized argument list; eval re-reads it
  # into this function's positional parameters.
  eval set -- "$OPTS"

  while true; do
    case "$1" in
      -w | --warning)
        warning="$2"
        shift 2
        ;;
      -c | --critical)
        critical="$2"
        shift 2
        ;;
      -t | --tmp)
        tmpXmlDir="$2"
        shift 2
        ;;
      --)
        shift
        break
        ;;
      *)
        break
        ;;
    esac
  done

  # The thresholds are later used in integer comparisons, so reject anything
  # that is not a plain non-negative integer right away.
  local name
  for name in warning critical; do
    if ! [[ ${!name} =~ ^[0-9]+$ ]]; then
      echo "UNKNOWN --$name must be a non-negative integer, got '${!name}'"
      exit 3
    fi
  done
}

parse_options "$@"
# Directory for the XML dump with every trailing slash removed, so that
# joining it with a file name never produces a doubled separator.
tmpDirTrimmed=$tmpXmlDir
while [[ "$tmpDirTrimmed" == */ ]]; do
  tmpDirTrimmed=${tmpDirTrimmed%/}
done

# Temperature is judged as a percentage of the GPU's own maximum threshold.
temperatureWarningTreshold=85
temperatureCriticalTreshold=95

# Per-metric warning flags; they start cleared and are raised during evaluation.
encoderWarning=0
decoderWarning=0
gpuWarning=0
memoryWarning=0
temperatureWarning=0

# Both external tools are mandatory; report CRITICAL if either is missing.
for requiredTool in xmlstarlet nvidia-smi; do
  if ! command -v "$requiredTool" >/dev/null 2>&1; then
    echo "CRITICAL $requiredTool not found. Try to install $requiredTool"
    exit 2
  fi
done

# Use an unpredictable file name: a fixed name in a shared directory can be
# pre-created or symlinked by another user.
if ! tmpXml=$(mktemp "${tmpDirTrimmed}/${tmpXmlFileName}.XXXXXX"); then
  echo "CRITICAL could not create temporary file in $tmpDirTrimmed with user $USER"
  exit 2
fi
# Remove the dump on every exit path, including the failures below.
trap 'rm -f -- "$tmpXml"' EXIT

if ! nvidia-smi -q -x --filename="$tmpXml"; then
  echo "CRITICAL could not create $tmpXml with user $USER"
  exit 2
fi

# Read the report once, without its DOCTYPE, and reuse it for every field.
if ! gpuXml=$(xmlstarlet fo --dropdtd "$tmpXml"); then
  echo "UNKNOWN could not parse nvidia-smi XML output"
  exit 3
fi
rm -f -- "$tmpXml"
trap - EXIT

#######################################
# Prints one element of the report with its unit suffix removed.
# Globals:   gpuXml (read)
# Arguments: $1 - element path below nvidia_smi_log/gpu
#            $2 - literal unit suffix to strip, e.g. ' %' or ' C'
#######################################
gpu_field() {
  local rawValue
  rawValue=$(printf '%s\n' "$gpuXml" | xmlstarlet sel -t -v "nvidia_smi_log/gpu/$1")
  printf '%s\n' "${rawValue%"$2"}"
}

encoderUtil=$(gpu_field utilization/encoder_util ' %')
gpuUtil=$(gpu_field utilization/gpu_util ' %')
memoryUtil=$(gpu_field utilization/memory_util ' %')
decoderUtil=$(gpu_field utilization/decoder_util ' %')
temperature=$(gpu_field temperature/gpu_temp ' C')
temperatureMax=$(gpu_field temperature/gpu_temp_max_threshold ' C')

# Every reading is used in integer arithmetic later on. Anything that is not
# a single non-negative integer (for example a value such as "N/A", or several
# lines when more than one <gpu> element is present) cannot be evaluated.
for readingName in encoderUtil gpuUtil memoryUtil decoderUtil temperature temperatureMax; do
  readingValue=${!readingName}
  if ! [[ "$readingValue" =~ ^[0-9]+$ ]]; then
    readingValue=${readingValue//$'\n'/ }
    echo "UNKNOWN unexpected value for $readingName in nvidia-smi output: '$readingValue'"
    exit 3
  fi
done

# Guard the division below.
if ! [ "$temperatureMax" -gt 0 ]; then
  echo "UNKNOWN maximum GPU temperature threshold reported as $temperatureMax"
  exit 3
fi

# Temperature as a percentage of the maximum threshold, rounded half up.
# The values are passed with -v instead of being pasted into the awk program.
temperatureTresholdPercent=$(awk -v temp="$temperature" -v max="$temperatureMax" \
  'BEGIN { pc = 100 * temp / max; i = int(pc); print (pc - i < 0.5) ? i : i + 1 }')
# Evaluate the readings and emit the Nagios status line.
#
# A metric is CRITICAL when it is >= its critical threshold and WARNING when
# it is >= its warning threshold. The worst level across all metrics decides
# the result, so a CRITICAL metric is never hidden by a WARNING on another.
# Exit codes: 0 OK, 1 WARNING, 2 CRITICAL, 3 UNKNOWN.

# Integer comparisons below need clean numbers; report UNKNOWN otherwise
# instead of falling through without any output.
for checkedNumber in "$warning" "$critical" \
  "$temperatureWarningTreshold" "$temperatureCriticalTreshold" \
  "$encoderUtil" "$gpuUtil" "$memoryUtil" "$decoderUtil" \
  "$temperatureTresholdPercent"; do
  if ! [[ "$checkedNumber" =~ ^[0-9]+$ ]]; then
    checkedNumber=${checkedNumber//$'\n'/ }
    echo "UNKNOWN non-numeric threshold or GPU reading: '$checkedNumber'"
    exit 3
  fi
done

# Human-readable summary followed by the performance data after the '|'.
statusLine="GPU - $gpuUtil%; Memory - $memoryUtil%; Encoder - $encoderUtil%; Decoder - $decoderUtil%; Temperature - $temperature | gpu=$gpuUtil% memory=$memoryUtil% encoder=$encoderUtil% decoder=$decoderUtil% temperature=$temperature"

worstLevel=0

#######################################
# Raises worstLevel to the level of one metric if that level is higher.
# Globals:   worstLevel (read, written)
# Arguments: $1 - measured value
#            $2 - warning threshold
#            $3 - critical threshold
#######################################
raise_level() {
  local metricLevel=0
  if [ "$1" -ge "$3" ]; then
    metricLevel=2
  elif [ "$1" -ge "$2" ]; then
    metricLevel=1
  fi
  if [ "$metricLevel" -gt "$worstLevel" ]; then
    worstLevel=$metricLevel
  fi
}

for utilization in "$encoderUtil" "$decoderUtil" "$gpuUtil" "$memoryUtil"; do
  raise_level "$utilization" "$warning" "$critical"
done
# Temperature has its own fixed thresholds, expressed as a percentage.
raise_level "$temperatureTresholdPercent" "$temperatureWarningTreshold" "$temperatureCriticalTreshold"

case "$worstLevel" in
  0) echo "OK $statusLine" ;;
  1) echo "WARNING $statusLine" ;;
  *) echo "CRITICAL $statusLine" ;;
esac
exit "$worstLevel"