-
Notifications
You must be signed in to change notification settings - Fork 0
/
nhc.conf
197 lines (140 loc) · 6.19 KB
/
nhc.conf
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
# NHC Configuration File (sample)
#
# Lines are in the form "<hostmask>||<check>"
# Hostmask is a glob, /regexp/, or {noderange}
# Comments begin with '#'
#
#######################################################################
###
### NHC Configuration Variables
###
# Explicitly instruct NHC to assume PBS (TORQUE, PBSPro) is the Resource Manager
# * || export NHC_RM=pbs
# Do not mark nodes offline
# * || export MARK_OFFLINE=0
# Activate debugging mode
# * || export DEBUG=1
# Set watchdog timer to 15 seconds
# * || export TIMEOUT=15
# In out-of-band contexts, enable all checks
# * || export NHC_CHECK_ALL=1
# Make sure $PATH contains important directories for diagnostic commands
# * || export MOABHOMEDIR="/opt/moab"
# * || export PATH="$MOABHOMEDIR/bin:$PATH"
#######################################################################
###
### Hardware checks
###
# Set these to your correct socket, core, and thread counts.
# * || check_hw_cpuinfo 2 12 24
# Set these to the amount of physical RAM you have (leave the fudge factor).
# * || check_hw_physmem 32gb 32gb 5%
# Set these to the amount of swap you have (leave the fudge factor).
# * || check_hw_swap 8g 8g 3%
# Alternatively, check total memory (physical RAM plus swap) with this one check instead of the previous two.
# * || check_hw_mem 40g 40g 5%
# Check specifically for free physical memory.
# * || check_hw_physmem_free 1MB
# Same, but for swap space.
# * || check_hw_swap_free 1MB
# Check for a minimum amount of free memory overall (physical and swap combined).
* || check_hw_mem_free 1mb
# Checks that there's a QDR IB interface that's ACTIVE and shows LinkUp.
# * || check_hw_ib 40
# Checks for an active Myrinet interface named "myri0."
# * || check_hw_gm myri0
# Checks for an active ethernet interface named "eth1."
# * || check_hw_eth eth1
# Make sure we're running the correct BIOS version on all nodes.
# * || check_dmi_data_match "BIOS Information: Version: 2.0.1"
# Make sure our RAM is running at the correct bus rate.
# * || check_dmi_data_match -t "Memory Device" "*Speed: 1866 MHz"
# Check the mcelog daemon for any pending errors.
# * || check_hw_mcelog
#######################################################################
###
### Filesystem checks
###
# All nodes should have their root filesystem mounted read/write.
* || check_fs_mount_rw -f /
# Assert that /tmp is a mounted filesystem of type "tmpfs."
# * || check_fs_mount_rw -t tmpfs -f /tmp
# Controlling TTYs are a good thing!
# * || check_fs_mount_rw -t devpts -s '/(none|devpts)/' -f /dev/pts
# Make sure the root filesystem doesn't get too full.
# * || check_fs_free / 3%
# Free inodes are also important.
# * || check_fs_ifree / 1k
# The following illustrates how to assert an NFSv3 mount (or any other specific mount option).
# * || check_fs_mount -s bluearc0:/home -t nfs -o '/(^|,)vers=3(,|$)/' -f /home
#######################################################################
###
### File/metadata checks
###
# These should always be directories and always be read/write/execute and sticky.
* || check_file_test -r -w -x -d -k /tmp /var/tmp
# These should always be readable and should never be empty.
* || check_file_test -r -s /etc/passwd /etc/group
# Assert common properties for /dev/null and /dev/zero (/dev/null occasionally gets clobbered).
* || check_file_test -c -r -w /dev/null /dev/zero
# * || check_file_stat -m 0666 -u 0 -g 0 -t 1 -T 3 /dev/null
# Make sure there's relatively recent activity from the syslog.
# * || check_file_stat -n 7200 /var/log/messages
# Validate a couple important accounts in the passwd file.
# * || check_file_contents /etc/passwd "/^root:x:0:0:/" "sshd:*"
#######################################################################
###
### Process checks
###
# Everybody needs sshd running, right? But don't use -r (restart)!
* || check_ps_service -u root -S sshd
# The cron daemon is another useful critter...
# * || check_ps_service -r crond
# Check for wulfd but don't manage it.
# * || check_ps_daemon wulfd root
# Make sure no users are SSH'd in, but don't kill them.
# * || check_ps_blacklist sshd '!root'
# Flag and kill any processes which are owned by unauthorized users.
# * || check_ps_unauth_users log syslog kill
# Flag any user processes not properly parented.
# * || check_ps_userproc_lineage log syslog
# Most systems also need NFS locking services.
# * || check_ps_service -d rpc.statd -r nfslock
# The audit daemon can sometimes disappear if things get hairy.
# * || check_ps_service -r auditd
# This is only valid for RHEL6 and similar/newer systems.
# * || check_ps_service -d rsyslogd -r rsyslog
# In the case of MySQL, it's typically better to cycle.
# * || check_ps_service -c mysqld
# Twice your core count is a good rule of thumb for the maximum load average.
# * || check_ps_loadavg 24
# Computing the limit from $HW_CORES should work if you place it after one of the check_hw_*() checks.
# * || check_ps_loadavg $((2*HW_CORES))
#######################################################################
###
### TORQUE/Moab checks
###
# Monitor trqauthd to make sure it's always running.
# * || check_ps_service -u root -r trqauthd
# Same for pbs_mom...just make sure NHC runs out-of-band in some way!
# * || check_ps_service -u root -r pbs_mom
# On the master node, pbs_server gets monitored too.
# * || check_ps_service -u root -r pbs_server
# Verify Moab status and version.
# * || check_moab_sched -t 10 -v 7.2.3 -m '!/PAUSED/'
# RM engine sanity checks.
# * || check_moab_rm -t 10
# TORQUE configuration sanity checks.
# * || check_moab_torque -t 10
# Assert specific TORQUE settings that are critical to operation.
# * || check_file_contents $PBS_SERVER_HOME/mom_priv/config '/^\$pbsserver master$/' '/^\$spool_as_final_name true$/' '!/localhost/'
#######################################################################
###
### Other checks
###
# Check to verify that SELinux is disabled. (Remove the "-r 1" to verify it's enabled.)
# * || check_cmd_status -t 1 -r 1 selinuxenabled
# Verify settings for an Ethernet interface.
# * || check_cmd_output -m '/addr:10\.0\.0\.1/' -m '/Bcast:10\.0\.0\.255/' -m '/Mask:255\.255\.255\.0/' -m '/^[[:space:]]*UP /' /sbin/ifconfig eth3
# nVidia HealthMon GPU health checks (requires Tesla Development Kit)
# * || check_nv_healthmon