-
Notifications
You must be signed in to change notification settings - Fork 0
/
health
284 lines (272 loc) · 13.9 KB
/
health
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
#!/usr/bin/env bash
#系统健康检测参数开始
#希望报告的SMART属性
SMARTAttributes="05 C4 C5 C6 C7"
#不可用的值显示为
Unavailable="N/A"
#温度单位
TemperatureUnit="°C"
#处理器传感器名称
CPUSensorsName="package"
#核心传感器名称
CoreSensorsName="core"
#处理器低温阈值
CPULowTemperature="15"
#处理器高温阈值
CPUHighTemperature="65"
#处理器过热温度
CPUOverheatedTemperature="80"
#驱动器低温阈值
DeviceLowTemperature="10"
#驱动器高温阈值
DeviceHighTemperature="50"
#驱动器过热温度
DeviceOverheatedTemperature="60"
#风扇传感器名称
FanSensorsName="fan"
#风扇转速单位
FanRotationalUnit=" rpm"
#风扇故障转速阈值
FanStopRotational="500"
#风扇转速过低阈值
FanMinRotational="1000"
#风扇转速过高阈值
FanMaxRotational="4000"
#简报推送时间,每天该时间后的第一次运行推送简报,为空则不推送简报。
PushTime="12:00"
#推送时间记录文件,为空则每次运行均推送简报。
PushTimeStamp="/tmp/health.push.timestamp"
#如果处理器和驱动器高于过热温度或风扇转速低于故障转速,无论上面两个参数如何设置,都会推送简报。
#健康简报发送命令
PushCMD="dingtalkrobot"
SplitLine="======================================================"
#系统健康检测参数结束
ScriptDir="$(dirname "$(readlink -nf "$0")")"
[[ ${PushCMD:0:1} != "/" ]] && PushCMD="${ScriptDir}/${PushCMD}"
exec 2>"/dev/null"
function _red() { echo "\033[1;31m${1}\033[0m"; }
function _green() { echo "\033[1;32m${1}\033[0m"; }
function _yellow() { echo "\033[1;33m${1}\033[0m"; }
function _blue() { echo "\033[1;34m${1}\033[0m"; }
function _purple() { echo "\033[1;35m${1}\033[0m"; }
function _cyan() { echo "\033[1;36m${1}\033[0m"; }
function _white() { echo "\033[1;37m${1}\033[0m"; }
function FormatSensorsValue() {
if [[ "${2}" == "SMART" ]]; then
SMARTWhenFailed="$(echo ${1} | cut -d ' ' -f 9)"
Attribute="$(echo ${1} | cut -d ' ' -f 10-)"
[[ -z "${Attribute}" ]] && SMARTAttribute="$(_white "${Unavailable}")" && Attribute="${Unavailable}" && return
case "${SMARTWhenFailed,,}" in
"-") [[ "${Attribute}" == "0" ]] && SMARTAttribute="$(_green "${Attribute}")" || SMARTAttribute="$(_yellow "${Attribute}")";;
*) SMARTAttribute="$(_red "${Attribute}")";;&
"failing_now") Alert="即将故障,";;
"in_the_past") Critical="曾经出错,";;
*) Error="出现问题,";;
esac
[[ -n "${Alert}${Critical}${Error}" ]] && SMARTStatus="${Error}${Critical}${Alert}"
return
elif [[ "${2}" == "${FanSensorsName}" ]]; then
Rotational="${1}"
if [[ "${1}" -lt "${FanStopRotational}" ]]; then
RotationalStatus="Failed"
RotationalColor="$(_red "${1}${FanRotationalUnit}")"
elif [[ "${1}" -lt "${FanMinRotational}" ]]; then
RotationalColor="$(_yellow "${1}${FanRotationalUnit}")"
elif [[ "${1}" -lt "${FanMaxRotational}" ]]; then
RotationalColor="$(_green "${1}${FanRotationalUnit}")"
else
RotationalColor="$(_yellow "${1}${FanRotationalUnit}")"
fi
return
elif [[ "${2}" == "${CPUSensorsName}" ]]; then
Temperature="${1}"
LowTemperature="${CPULowTemperature}"
HighTemperature="${CPUHighTemperature}"
OverheatedTemperature="${CPUOverheatedTemperature}"
else
Temperature="${1}"
LowTemperature="${DeviceLowTemperature}"
HighTemperature="${DeviceHighTemperature}"
OverheatedTemperature="${DeviceOverheatedTemperature}"
fi
if [[ "${1}" == "${Unavailable}" ]]; then
TemperatureColor="$(_white "${Unavailable}")"
elif [[ "${1%.*}" -lt "${LowTemperature}" ]]; then
TemperatureStatus="Microtherm"
TemperatureColor="$(_cyan "${1}${TemperatureUnit}")"
elif [[ "${1%.*}" -lt "${HighTemperature}" ]]; then
TemperatureColor="$(_green "${1}${TemperatureUnit}")"
elif [[ "${1%.*}" -lt "${OverheatedTemperature}" ]]; then
TemperatureColor="$(_yellow "${1}${TemperatureUnit}")"
else
TemperatureStatus="Overheated"
TemperatureColor="$(_red "${1}${TemperatureUnit}")"
fi
}
function ValueRange() {
[[ "${1}" == "${Unavailable}" ]] && Range="${2}" || Range="${1}"
Min="${Range%%|**}" && Max="${Range##*|}"
[[ "${2//./}" -lt "${Min//./}" ]] && Min="${2}"
[[ "${2//./}" -gt "${Max//./}" ]] && Max="${2}"
[[ "${Min}" == "${Max}" ]] && echo "${Min}" || echo "${Min}|${Max}"
}
Briefing=""
hash sensors || Briefing="${Briefing}找不到命令 $(_white "sensors"),请先安装 $(_white "lm_sensors") 软件包。\n"
hash smartctl || Briefing="${Briefing}找不到命令 $(_white "smartctl"),请先安装 $(_white "smartmontools") 软件包。\n"
[[ -n "${Briefing}" ]] && echo -en "${Briefing}" && exit 1
TemperatureRange="${Unavailable}"
RotationalRange="${Unavailable}"
MicrothermList=""
OverheatedList=""
FailedFanList=""
DetailedList=""
Briefing="${Briefing}${SplitLine}\n\t\t$(_white "散热情况简报")\n"
MotherBoardSensors="$(sensors | grep -i "^${CPUSensorsName} \|^${CoreSensorsName} \|^${FanSensorsName}")"
for line in ${MotherBoardSensors// /}; do
SensorsName="$(expr "${line}" : '\(^[^:]*\)')"
SensorsName="${SensorsName,,}"
SensorsValue=""
SensorsNumber="$(expr "${SensorsName}" : '^[a-z]*\([0-9]*\)')"
if [[ "${SensorsName}" == "${CPUSensorsName}"* ]]; then
SensorsValue="Temperature $(expr "${line}" : '^.*:+\([0-9\.\-]*\)')"
SensorsName="处理器${SensorsNumber}:\t"
elif [[ "${SensorsName}" == "${CoreSensorsName}"* ]]; then
SensorsValue="Temperature $(expr "${line}" : '^.*:+\([0-9\.\-]*\)')"
SensorsName="核心${SensorsNumber}:\t\t"
elif [[ "${SensorsName}" == "${FanSensorsName}"* ]]; then
SensorsValue="Rotational $(expr "${line}" : '^.*:\([0-9]*\).*')"
SensorsName="风扇${SensorsNumber}:\t\t"
fi
case "${SensorsValue%% *}" in
"Temperature")
FormatSensorsValue "${SensorsValue#* }" "${CPUSensorsName}"
TemperatureRange="$(ValueRange "${TemperatureRange}" "${Temperature}")"
[[ "${TemperatureStatus}" == "Microtherm" ]] && MicrothermList="${MicrothermList}${SensorsName%%:*} "
[[ "${TemperatureStatus}" == "Overheated" ]] && OverheatedList="${OverheatedList}${SensorsName%%:*} "
DetailedList="${DetailedList}${SensorsName}${TemperatureColor}\n"
PushNormal="${PushNormal} ${SensorsName} ${Temperature}${TemperatureUnit}\n"
TemperatureStatus=""
;;
"Rotational")
FormatSensorsValue "${SensorsValue#* }" "${FanSensorsName}"
if [[ "${Rotational}" -gt "0" ]]; then
RotationalRange="$(ValueRange "${RotationalRange}" "${Rotational}")"
[[ "${RotationalStatus}" == "Failed" ]] && FailedFanList="${FailedFanList}${SensorsName%%:*} "
DetailedList="${DetailedList}${SensorsName}${RotationalColor}\n"
PushNormal="${PushNormal} ${SensorsName} ${Rotational}${FanRotationalUnit}\n"
fi
RotationalStatus=""
;;
esac
done
[[ "${TemperatureRange}" == "${Unavailable}" ]] && UnavailableList="无法获取处理器温度" && PushUnavailable=" ${UnavailableList}\n"
[[ -z "${UnavailableList}${FailedFanList}${MicrothermList}${OverheatedList}${Abnormal}" ]] && Briefing="${Briefing} 散热系统检测正常"
FormatSensorsValue ${TemperatureRange##*|} "${CPUSensorsName}"
[[ -z "${UnavailableList}" ]] && Briefing="${Briefing} 处理器最高温度:${TemperatureColor}\n" || Briefing="${Briefing} $(_white "${UnavailableList}")\n"
[[ -n "${FailedFanList}" ]] && Briefing="${Briefing} $(_white "${FailedFanList}")$(_red "转速过低")\n" && PushFailed=" ${FailedFanList}转速过低 [${RotationalRange%%|*} ${FanRotationalUnit}]\n"
[[ -n "${OverheatedList}" ]] && Briefing="${Briefing} $(_white "${OverheatedList}")$(_red "过热")\n" && PushOverheated=" ${OverheatedList}过热 [${TemperatureRange##*|} ${TemperatureUnit}]\n"
[[ -n "${MicrothermList}" ]] && Briefing="${Briefing} $(_white "${MicrothermList}")$(_cyan "过冷")\n" && PushMicrotherm=" ${MicrothermList}过冷 [${TemperatureRange%%|*} ${TemperatureUnit}]\n"
[[ -n "${DetailedList}" ]] && Briefing="${Briefing}${SplitLine}\n${DetailedList}"
TemperatureStatus=""
TemperatureRange="${Unavailable}"
OverheatedList=""
MicrothermList=""
DisableDevice=""
FailedDevice=""
DetailedList=""
Briefing="${Briefing}${SplitLine}\n\t\t$(_white "磁盘健康简报")\n"
for i in {a..z}; do
DevicePath="/dev/sd${i}"
[[ -b "${DevicePath}" ]] || continue
Information="$(smartctl -a "${DevicePath}")"
DeviceName="$(expr "$(echo "${Information}" | grep 'Device Model')" : '.*: *\(.*\)')"
SerialNumber="$(expr "$(echo "${Information}" | grep 'Serial Number')" : '.*: *\(.*\)')"
Capacity="$(expr "$(echo "${Information}" | grep 'User Capacity')" : '.*:.*\[\(.*\)\]')"
Support="$(expr "$(echo "${Information}" | grep 'SMART support is')" : '.*: *\([^ ]*\)')"
DetailedList="${DetailedList}${SplitLine}\n设备: $(_white "${DevicePath##*/}")\t容量: $(_white "${Capacity}")\t"
if [[ "${Support,,}" == "enabled" ]]; then
DetailedList="${DetailedList}S.M.A.R.T. $(_green "已启用")"
else
DisableDevice="${DisableDevice}${DevicePath##*/} "
if [[ "${Support,,}" == "disabled" ]]; then
DetailedList="${DetailedList}S.M.A.R.T. $(_yellow "未启用")"
else
DetailedList="${DetailedList}S.M.A.R.T. $(_red "不支持")"
fi
Support=""
fi
DetailedList="${DetailedList}\n型号: $(_white "${DeviceName}")\t序列号: $(_white "${SerialNumber}")\n"
[[ -z "${Support}" ]] && continue
SMARTStatus="$(expr "$(echo "${Information}" | grep 'SMART overall-health self-assessment test result')" : '.*: *\(.*\)')"
Alert="" ; Critical=""; Error=""
Information="$(expr "${Information}" : '.*\(SMART Attributes Data.*\)SMART Error')"
FormatSensorsValue "$(echo "${Information}" | grep "^194 ")" "SMART"
FormatSensorsValue "${Attribute%% *}"
TemperatureRange="$(ValueRange "${TemperatureRange}" "${Temperature}")"
[[ "${TemperatureStatus}" == "Microtherm" ]] && MicrothermList="${MicrothermList}${DevicePath##*/} "
[[ "${TemperatureStatus}" == "Overheated" ]] && OverheatedList="${OverheatedList}${DevicePath##*/} "
TemperatureStatus=""
FormatSensorsValue "$(echo "${Information}" | grep "^ *9 ")" "SMART"
PowerOnHours="${Attribute%% *}"
FormatSensorsValue "$(echo "${Information}" | grep "^ *12 ")" "SMART"
PowerCycleCount="${Attribute%% *}"
DetailedList="${DetailedList}温度: ${TemperatureColor}\t通电: $(_white "${PowerCycleCount}")次,累计$(_white "${PowerOnHours}")小时\n自检: "
PushNormal="${PushNormal} 设备: ${DevicePath##*/}\t容量: ${Capacity}\t温度: ${Temperature}${TemperatureUnit}\n \t"
for n in ${SMARTAttributes}; do
FormatSensorsValue "$(echo "${Information}" | grep "^ *$((16#${n})) ")" "SMART"
DetailedList="${DetailedList}${n}[${SMARTAttribute}]\t"
PushNormal="${PushNormal}${n}[${Attribute}] "
done
if [[ "${SMARTStatus,,}" == "passed" ]]; then
DetailedList="${DetailedList}\n\t\t$(_green "自检通过")\n"
else
FailedDevice="${FailedDevice}${DevicePath##*/} "
DetailedList="${DetailedList}\n\t设备$(_white "${SMARTStatus%%,}"),以下是详细自检数据:\n${Information}\n"
fi
PushNormal="${PushNormal}\n"
done
[[ -z "${DisableDevice}${FailedDevice}${MicrothermList}${OverheatedList}" ]] && Briefing="${Briefing} 存储设备检测正常"
FormatSensorsValue ${TemperatureRange##*|}
Briefing="${Briefing} 设备最高温度:${TemperatureColor}\n"
[[ -n "${DisableDevice}" ]] && Briefing="${Briefing} 设备 $(_white "${DisableDevice}")无法获取 S.M.A.R.T. 信息\n" && PushUnavailable="${PushUnavailable} 设备${DisableDevice}无法获取 S.M.A.R.T. 信息\n"
[[ -n "${FailedDevice}" ]] && Briefing="${Briefing} 设备 $(_white "${FailedDevice}")发现问题\n" && PushFailed="${PushFailed} 设备 ${FailedDevice}发现问题\n"
[[ -n "${OverheatedList}" ]] && Briefing="${Briefing} 设备 $(_white "${OverheatedList}")$(_red "过热")\n" && PushOverheated="${PushOverheated} 设备 ${OverheatedList}过热[${TemperatureRange##*|} ${TemperatureUnit}]\n"
[[ -n "${MicrothermList}" ]] && Briefing="${Briefing} 设备 $(_white "${MicrothermList}")$(_cyan "过冷")\n" && PushMicrotherm="${PushMicrotherm} 设备 ${MicrothermList}过冷[${TemperatureRange%%|*} ${TemperatureUnit}]\n"
[[ -n "${DetailedList}" ]] && Briefing="${Briefing}${DetailedList}"
DiskUsage="$(df -t vfat -t ext4 -t xfs -t btrfs -h)"
DiskUsage="${DiskUsage//Filesystem/ 文件系统}"
DiskUsage="${DiskUsage//Size/大小}"
DiskUsage="${DiskUsage//Used/已用}"
DiskUsage="${DiskUsage//Avail/ 可用}"
DiskUsage="${DiskUsage//Use%/占用}"
DiskUsage="${DiskUsage//Mounted on/挂载点}"
Briefing="${Briefing}${SplitLine}\n\t\t$(_white "磁盘使用情况")\n"
Briefing="${Briefing}${SplitLine}\n${DiskUsage}\n${SplitLine}"
[[ -r "${PushTimeStamp}" ]] && LastPushTime="$(cat "${PushTimeStamp}")" || NeedPush="True"
[[ "$(expr "${PushTime//:/}" : '0*\([0-9]*\)')" -le "$(expr "$(date "+%H%M")" : '0*\([0-9]*\)')" ]] && NeedPush="True"
[[ "$(expr "${PushTime//:/}" : '0*\([0-9]*\)')" -le "$(expr "${LastPushTime:8:4}" : '0*\([0-9]*\)')" ]] && [[ "${LastPushTime:0:8}" -eq "$(date +%Y%m%d)" ]] && NeedPush=""
[[ -z "${PushTime}" ]] && NeedPush=""
NeedPush="${NeedPush}${PushUnavailable}${PushFailed}${PushOverheated}${PushMicrotherm}"
[[ "${1,,}" == "cron" ]] || echo -e "${Briefing}"
[[ -z "${NeedPush}" ]] && echo -e "$(date +%F\ %T) 系统健康检测完成" && exit
[[ -n "${PushUnavailable}" ]] && Overview="检测失败、"
[[ -n "${PushFailed}" ]] && Overview="${Overview}设备故障、"
[[ -n "${PushOverheated}${PushMicrotherm}" ]] && Overview="${Overview}温度异常、"
if [[ -z "${Overview}" ]]; then
Overview="${_green}运行正常</font>"
Details="${PushNormal}"
else
Overview="${_red}${Overview%%、}</font>"
Details="${PushUnavailable}${PushFailed}${PushOverheated}${PushMicrotherm}${PushSMARTList}\n${PushNormal}"
fi
MK_title="主机${HostName}的健康简报"
MK_text="${_blue}$(date +"%F %T")</font>\n\n"
MK_text="${MK_text}来自主机${_green}${HostName}</font>的系统健康简报\n"
MK_text="${MK_text}## ${Overview}\n\n详细信息:\n\n${Details}"
Message_Data="{\"msgtype\":\"markdown\",\"markdown\":{\"title\":\"${MK_title}\",\"text\":\"${MK_text}\"}}"
if "${PushCMD}" -q "${Message_Data}"; then
echo -e "$(date +%F\ %T) 系统健康简报推送成功"
[[ -n "${PushTimeStamp}" ]] && echo -n "$(date +%Y%m%d%H%M)" > "${PushTimeStamp}"
else
echo -e "$(date +%F\ %T) 系统健康简报推送失败"
fi