#!/bin/bash
###############################################################################
# Author: Robert Bärhold
# Date: 18.05.2015
#
# Starts or stops a distributed XtreemFS farm on SLURM.
# It is also possible to clone a git repository or to clean up the local
# XtreemFS folder on each allocated node.
#
# Call:
# ./xtreemfs_slurm.sh [env.sh] (start|stop [-savelogs]|cleanup|clone)
#
# Parameter:
#   -savelogs [optional]
#      > only valid after "stop"
#
###############################################################################
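# Examples (a sketch, assuming an active SLURM allocation shell and the default
# env.sh next to this script; "my_env.sh" is only a placeholder name):
#   ./xtreemfs_slurm.sh start
#   ./xtreemfs_slurm.sh stop -savelogs
#   ./xtreemfs_slurm.sh my_env.sh cleanup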
shopt -s extglob
BASEDIR=$(dirname "$0")
SOURCE_FILE="$BASEDIR/env.sh"
if [[ "$1" = *.sh ]] && [[ -f "$1" ]]; then
SOURCE_FILE="$1"
shift
fi
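# The sourced env file is assumed to provide the node list (XTREEMFS_NODES),
# the counters (NUMBER_OF_NODES, SKIP_NODE_COUNT), the *_GENERIC path/name
# templates and the helper functions (substitudeJobID, substitudeName,
# substitudeProperty, outputDebug) used below.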
source "$SOURCE_FILE"
LOCAL_DIR=""
SERVICE_PREFIX=""
# clones the git repository specified in the source file
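# (runs only if IS_CLONE_REPO is "true"; GITHUB_REPO and REPO_CLONE_LOCATION are
#  expected to be set in the sourced env file)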
function cloneGitHubRepository() {
echo "Clone GitHub Repository"
if [[ "$IS_CLONE_REPO" != "true" ]]; then
return 0
fi
command -v git >/dev/null 2>&1 || {
echo >&2 "Git is required to clone current GitHub repository. Aborting."
exit 1
}
if [[ ! -z "$REPO_CLONE_LOCATION" ]] && [[ -d "$REPO_CLONE_LOCATION" ]]; then
git clone "$GITHUB_REPO" "$REPO_CLONE_LOCATION"
else
echo "The given path $REPO_CLONE_LOCATION isn't a valid directory"
exit 1
fi
return 0
}
# checks generic variables and XtreemFS components and defines basic variables
function initializeEnvironment() {
outputDebug "Setup: Checking variables and environment"
if [[ -z $SLURM_JOB_ID ]]; then
echo "You're not inside an executing SLURM allocation"
echo "Please alloc SLURM nodes with an active SLURM shell (no '--no-shell' argument)"
exit 1
fi
if [[ ! -d "$XTREEMFS_DIRECTORY" ]] || [[ ! -f "$XTREEMFS_DIRECTORY/java/xtreemfs-servers/target/xtreemfs.jar" ]]; then
echo -n "XtreemFS Directory $XTREEMFS_DIRECTORY does not exist"
echo " or doesn't include XtreemFS (missing ./java/xtreemfs-servers/target/xtreemfs.jar)."
echo "Please adjust the SOURCE_FILE ($SOURCE_FILE) or the XTREEMFS_DIRECTORY ($XTREEMFS_DIRECTORY)."
exit 1
fi
if [[ ! -e "$XTREEMFS_DIRECTORY/bin/mkfs.xtreemfs" ]] || [[ ! -e "$XTREEMFS_DIRECTORY/bin/rmfs.xtreemfs" ]] ||
[[ ! -e "$XTREEMFS_DIRECTORY/bin/mount.xtreemfs" ]] || [[ ! -e "$XTREEMFS_DIRECTORY/bin/umount.xtreemfs" ]]; then
echo "XtreemFS client hasn't been built correctly. One of the following files couldn't be found:"
echo "mkfs.xtreemfs, rmfs.xtreemfs, mount.xtreemfs or umount.xtreemfs ."
echo "Please run 'make client' inside the $XTREEMFS_DIRECTORY ."
exit 1
fi
for var in $CURRENT_JOB_FOLDER_GENERIC $LOCAL_DIR_GENERIC $LOCAL_MOUNT_PATH_GENERIC $SERVICE_PREFIX_GENERIC $VOLUME_NAME_GENERIC; do
echo "$var" | grep %JOBID% > /dev/null || {
echo "%JOBID% parameter was not found in variable: $var"
exit 1
}
done
for var in $CONFIG_FILENAME_GENERIC $LOG_FILENAME_GENERIC $PID_FILENAME_GENERIC; do
echo "$var" | grep %NAME% > /dev/null || {
echo "%NAME% parameter was not found in variable: $var"
exit 1
}
done
LOCAL_DIR=$(substitudeJobID "$LOCAL_DIR_GENERIC")
SERVICE_PREFIX=$(substitudeJobID "$SERVICE_PREFIX_GENERIC")
return 0
}
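# Note: substitudeName and substitudeProperty are assumed to be helpers from the
# sourced env file; substitudeProperty presumably rewrites the matching line of
# the given properties file with the supplied replacement.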
# copies the dummy configurations and adjusts several parameters inside the configuration files
function prepareConfigs() {
if [[ "$GENERATE_CONFIGS" == true ]]; then
outputDebug "Preparing configurations for DIR, MRC and OSD*"
else
outputDebug "Using given configurations for DIR, MRC and OSD*"
return 0
fi
CURRENT_JOB_FOLDER=$(substitudeJobID "$CURRENT_JOB_FOLDER_GENERIC")
mkdir -p "$CURRENT_JOB_FOLDER/config"
## DIR
DIR_CONFIG_FILE="$CURRENT_JOB_FOLDER/config/$(substitudeName "$CONFIG_FILENAME_GENERIC" "DIR")"
cp "$XTREEMFS_DIRECTORY/etc/xos/xtreemfs/dirconfig.properties" "$DIR_CONFIG_FILE"
substitudeProperty "$DIR_CONFIG_FILE" "uuid" "uuid = $SERVICE_PREFIX-DIR"
substitudeProperty "$DIR_CONFIG_FILE" "babudb.baseDir" "babudb.baseDir = $LOCAL_DIR/dir/datebase"
substitudeProperty "$DIR_CONFIG_FILE" "babudb.logDir" "babudb.logDir = $LOCAL_DIR/dir/db-log"
substitudeProperty "$DIR_CONFIG_FILE" "policy_dir" "policy_dir = $XTREEMFS_DIRECTORY/etc/xos/xtreemfs/policies"
substitudeProperty "$DIR_CONFIG_FILE" "#debug.level =" "debug.level = $DEBUG_DIR_LEVEL"
substitudeProperty "$DIR_CONFIG_FILE" "#debug.categories =" "debug.categories = $DEBUG_DIR_CATEGORIES"
DIR_HOSTNAME="${XTREEMFS_NODES[@]:0:1}"
## MRC
MRC_CONFIG_FILE="$CURRENT_JOB_FOLDER/config/$(substitudeName "$CONFIG_FILENAME_GENERIC" "MRC")"
cp "$XTREEMFS_DIRECTORY/etc/xos/xtreemfs/mrcconfig.properties" "$MRC_CONFIG_FILE"
substitudeProperty "$MRC_CONFIG_FILE" "dir_service.host" "dir_service.host = $DIR_HOSTNAME"
substitudeProperty "$MRC_CONFIG_FILE" "uuid" "uuid = $SERVICE_PREFIX-MRC"
substitudeProperty "$MRC_CONFIG_FILE" "babudb.baseDir" "babudb.baseDir = $LOCAL_DIR/mrc/datebase"
substitudeProperty "$MRC_CONFIG_FILE" "babudb.logDir" "babudb.logDir = $LOCAL_DIR/mrc/db-log"
substitudeProperty "$MRC_CONFIG_FILE" "#debug.level =" "debug.level = $DEBUG_MRC_LEVEL"
substitudeProperty "$MRC_CONFIG_FILE" "#debug.categories =" "debug.categories = $DEBUG_MRC_CATEGORIES"
substitudeProperty "$MRC_CONFIG_FILE" "policy_dir" "policy_dir = $XTREEMFS_DIRECTORY/etc/xos/xtreemfs/policies"
substitudeProperty "$MRC_CONFIG_FILE" "ssl.service_creds =" "ssl.service_creds = $XTREEMFS_DIRECTORY/etc/xos/xtreemfs/truststore/certs/mrc.p12"
substitudeProperty "$MRC_CONFIG_FILE" "ssl.trusted_certs =" "ssl.trusted_certs = $XTREEMFS_DIRECTORY/etc/xos/xtreemfs/truststore/certs/trusted.jks"
## OSDs
for (( counter=1; $counter <= $(( $NUMBER_OF_NODES - ($SKIP_NODE_COUNT + 1) )); counter++ )) {
OSDNAME="OSD${counter}"
OSD_CONFIG_FILE="$CURRENT_JOB_FOLDER/config/$(substitudeName "$CONFIG_FILENAME_GENERIC" "$OSDNAME")"
cp "$XTREEMFS_DIRECTORY/etc/xos/xtreemfs/osdconfig.properties" "$OSD_CONFIG_FILE"
substitudeProperty "$OSD_CONFIG_FILE" "dir_service.host" "dir_service.host = $DIR_HOSTNAME"
substitudeProperty "$OSD_CONFIG_FILE" "uuid" "uuid = $SERVICE_PREFIX-$OSDNAME"
substitudeProperty "$OSD_CONFIG_FILE" "object_dir" "object_dir = $LOCAL_DIR/$OSDNAME/objs"
substitudeProperty "$OSD_CONFIG_FILE" "#debug.level =" "debug.level = $DEBUG_OSD_LEVEL"
substitudeProperty "$OSD_CONFIG_FILE" "#debug.categories =" "debug.categories = $DEBUG_OSD_CATEGORIES"
substitudeProperty "$OSD_CONFIG_FILE" "policy_dir" "policy_dir = $XTREEMFS_DIRECTORY/etc/xos/xtreemfs/policies"
substitudeProperty "$OSD_CONFIG_FILE" "ssl.service_creds =" "ssl.service_creds = $XTREEMFS_DIRECTORY/etc/xos/xtreemfs/truststore/certs/osd.p12"
substitudeProperty "$OSD_CONFIG_FILE" "ssl.trusted_certs =" "ssl.trusted_certs = $XTREEMFS_DIRECTORY/etc/xos/xtreemfs/truststore/certs/trusted.jks"
}
return 0
}
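# Node layout implied by the slicing below: the first entry of XTREEMFS_NODES
# runs the DIR, the entry at index SKIP_NODE_COUNT runs the MRC, and the next
# NUMBER_OF_NODES - (SKIP_NODE_COUNT + 1) entries run OSD1..OSDn.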
# calls the start script remotely on each slurm node
function startServer() {
  DIR_HOSTNAME="${XTREEMFS_NODES[@]:0:1}"
  srun -k -N1-1 --nodelist=$DIR_HOSTNAME $BASEDIR/xtreemfs_slurm_rstart.sh "$SOURCE_FILE" "DIR" "DIR"
  stopOnStartError $?

  MRC_HOSTNAME="${XTREEMFS_NODES[@]:$SKIP_NODE_COUNT:1}"
  srun -k -N1-1 --nodelist=$MRC_HOSTNAME $BASEDIR/xtreemfs_slurm_rstart.sh "$SOURCE_FILE" "MRC" "MRC"
  stopOnStartError $?

  counter=1
  for osd_hostname in "${XTREEMFS_NODES[@]:$(( $SKIP_NODE_COUNT + 1 )):$(( $NUMBER_OF_NODES - ($SKIP_NODE_COUNT + 1) ))}"; do
    OSDNAME="OSD${counter}"
    srun -k -N1-1 --nodelist=$osd_hostname $BASEDIR/xtreemfs_slurm_rstart.sh "$SOURCE_FILE" "OSD" "$OSDNAME"
    stopOnStartError $?
    counter=$(($counter+1))
  done
  return 0
}
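# aborts the whole run if the passed exit code is 1: stops XtreemFS, saves the logs and exits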
function stopOnStartError(){
if [[ "$1" == "1" ]]; then
echo "An error occured during startup. XtreemFS will be stopped and the logs will be saved."
stop "-savelogs"
exit 1
fi
}
# deletes the local job folder on each slurm node
function cleanUp() {
echo "Cleanup job folder incl. mount point"
for slurm_host in "${XTREEMFS_NODES[@]}"; do
outputDebug "Cleaning... $slurm_host of JOB: $JOB_ID"
srun -k -N1-1 --nodelist="$slurm_host" rm -r "$LOCAL_DIR"
done
return 0
}
# calls the stop script remotely on each slurm node, passing the savelogs flag if activated
function stopServerAndSaveLogs() {
SAVE_LOGS=""
if [[ ! -z "$1" ]] && [[ "$1" == "-savelogs" ]]; then
outputDebug "Saving logs is active"
SAVE_LOGS="-savelogs"
fi
counter=1
for osd_hostname in "${XTREEMFS_NODES[@]:$(( $SKIP_NODE_COUNT + 1 )):$(( $NUMBER_OF_NODES - ($SKIP_NODE_COUNT + 1) ))}"; do
OSDNAME="OSD${counter}"
srun -k -N1-1 --nodelist=$osd_hostname $BASEDIR/xtreemfs_slurm_rstop.sh "$SOURCE_FILE" "$OSDNAME" "$SAVE_LOGS"
counter=$(($counter+1))
done
MRC_HOSTNAME="${XTREEMFS_NODES[@]:$SKIP_NODE_COUNT:1}"
srun -k -N1-1 --nodelist=$MRC_HOSTNAME $BASEDIR/xtreemfs_slurm_rstop.sh "$SOURCE_FILE" "MRC" "$SAVE_LOGS"
DIR_HOSTNAME="${XTREEMFS_NODES[@]:0:1}"
srun -k -N1-1 --nodelist=$DIR_HOSTNAME $BASEDIR/xtreemfs_slurm_rstop.sh "$SOURCE_FILE" "DIR" "$SAVE_LOGS"
return 0
}
# unmounts the volume, deletes it, stops the servers and cleans up the XtreemFS farm setup
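# (teardown order: watchdogs, client log copy/unmount, volume removal, OSD/MRC/DIR servers, local cleanup)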
function stop() {
  initializeEnvironment

  MRC_HOSTNAME="${XTREEMFS_NODES[@]:$SKIP_NODE_COUNT:1}"
  VOLUME_NAME=$(substitudeJobID "$VOLUME_NAME_GENERIC")
  LOCAL_MOUNT_PATH=$(substitudeJobID "$LOCAL_MOUNT_PATH_GENERIC")
  CURRENT_JOB_FOLDER=$(substitudeJobID "$CURRENT_JOB_FOLDER_GENERIC")

  echo "Stopping XtreemFS $JOB_ID on slurm..."

  # stop watchdog
  for slurm_host in "${XTREEMFS_NODES[@]}"; do
    srun -k -N1-1 --nodelist=$slurm_host $BASEDIR/xtreemfs_slurm_rstop.sh "$SOURCE_FILE" "watchdog"
  done

  for slurm_host in "${XTREEMFS_NODES[@]}"; do
    if [[ ! -z "$1" ]] && [[ "$1" == "-savelogs" ]] && [[ "$DEBUG_CLIENT_ACTIVE" == true ]]; then
      mkdir -p "$CURRENT_JOB_FOLDER/savedLogs"
      srun -k -N1-1 --nodelist="$slurm_host" cp "$LOCAL_DIR/$slurm_host-client.log" "$CURRENT_JOB_FOLDER/savedLogs/$slurm_host-client.log"
    fi
    srun -k -N1-1 --nodelist="$slurm_host" $XTREEMFS_DIRECTORY/bin/umount.xtreemfs "$LOCAL_MOUNT_PATH"
  done

  $XTREEMFS_DIRECTORY/cpp/build/rmfs.xtreemfs -f $MRC_HOSTNAME/$VOLUME_NAME
  stopServerAndSaveLogs $1
  cleanUp
  return 0
}
# starts the XtreemFS farm setup, creates a volume and mounts it to the specified location
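# (startup order: generate configs, start DIR/MRC/OSDs, create the volume, create and
#  mount the local mount path on every node, start the watchdogs, write the job env file)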
function start() {
  initializeEnvironment

  DIR_HOSTNAME="${XTREEMFS_NODES[@]:0:1}"
  MRC_HOSTNAME="${XTREEMFS_NODES[@]:$SKIP_NODE_COUNT:1}"
  VOLUME_NAME=$(substitudeJobID "$VOLUME_NAME_GENERIC")
  LOCAL_MOUNT_PATH=$(substitudeJobID "$LOCAL_MOUNT_PATH_GENERIC")

  echo "Starting XtreemFS $JOB_ID on slurm..."

  prepareConfigs
  startServer

  $XTREEMFS_DIRECTORY/cpp/build/mkfs.xtreemfs $VOLUME_PARAMETER $MRC_HOSTNAME/$VOLUME_NAME
  stopOnStartError $?

  # create local mount path on each node
  for slurm_host in "${XTREEMFS_NODES[@]}"; do
    srun -k -N1-1 --nodelist="$slurm_host" mkdir -p $LOCAL_MOUNT_PATH
    stopOnStartError $?
  done

  # mount volume on each node
  for slurm_host in "${XTREEMFS_NODES[@]}"; do
    _MOUNT_OPTIONS="$MOUNT_OPTIONS"
    if [[ "$DEBUG_CLIENT_ACTIVE" == true ]]; then
      _MOUNT_OPTIONS="$_MOUNT_OPTIONS -d $DEBUG_CLIENT_LEVEL -l $LOCAL_DIR/$slurm_host-client.log"
    fi
    srun -k -N1-1 --nodelist="$slurm_host" $XTREEMFS_DIRECTORY/cpp/build/mount.xtreemfs $_MOUNT_OPTIONS $DIR_HOSTNAME/$VOLUME_NAME "$LOCAL_MOUNT_PATH"
    stopOnStartError $?
  done

  # start watchdog
  for slurm_host in "${XTREEMFS_NODES[@]}"; do
    srun -k -N1-1 --nodelist="$slurm_host" $BASEDIR/xtreemfs_slurm_rstart.sh "$SOURCE_FILE" "watchdog"
    stopOnStartError $?
  done

  dir_hostname_ip=$(nslookup "$DIR_HOSTNAME" | awk -F': *' '/Address:/&&NR>2{print $2;exit}')
  CURRENT_JOB_ENV_FILE=$(substitudeJobID "$CURRENT_JOB_ENV_FILE_GENERIC")
  echo "export WORK=\"$LOCAL_MOUNT_PATH\"" > "$CURRENT_JOB_ENV_FILE"
  # This variable is set for running a Hadoop cluster
  echo "export DEFAULT_VOLUME=\"xtreemfs://$dir_hostname_ip:32638/$VOLUME_NAME\"" >> "$CURRENT_JOB_ENV_FILE"
  echo "export NODES_FILESYSTEM=\"${XTREEMFS_NODES[@]:0:$NUMBER_OF_NODES}\"" >> "$CURRENT_JOB_ENV_FILE"
  echo "export NODES_REMAINING=\"${XTREEMFS_NODES[@]:$NUMBER_OF_NODES}\"" >> "$CURRENT_JOB_ENV_FILE"

  outputSummary
  return 0
}
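# prints the assigned DIR/MRC/OSD hosts, the volume name, the mount path and the per-job env file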
function outputSummary() {
echo "DIR HOST: ${XTREEMFS_NODES[@]:0:1}"
echo "MRC HOST: ${XTREEMFS_NODES[@]:$SKIP_NODE_COUNT:1}"
counter=1
for osd_hostname in ${XTREEMFS_NODES[@]:$(( $SKIP_NODE_COUNT + 1 )):$(( $NUMBER_OF_NODES - ($SKIP_NODE_COUNT + 1) ))}; do
echo "OSD${counter} HOST: $osd_hostname"
counter=$(($counter+1))
done
echo "VOLUME: $(substitudeJobID $VOLUME_NAME_GENERIC)"
echo "MOUNT PATH: $(substitudeJobID $LOCAL_MOUNT_PATH_GENERIC)"
echo "JOB ENV FILE: $(substitudeJobID $CURRENT_JOB_ENV_FILE_GENERIC)"
return 0
}
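# main dispatch: an optional env file argument was already consumed above, so $1 is the command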
result=0
case "$1" in
start)
start
result=$?
;;
stop)
stop $2
result=$?
;;
cleanup)
initializeEnvironment
cleanUp
result=$?
;;
clone)
cloneGitHubRepository
result=$?
;;
*)
echo -e "Usage: $0 [env.sh] {start|stop [-savelogs]|cleanup|clone}\n"
result=1
;;
esac
exit $result