-
Notifications
You must be signed in to change notification settings - Fork 0
/
xtreemfs_slurm_watchdog.sh
executable file
·76 lines (61 loc) · 2.12 KB
/
xtreemfs_slurm_watchdog.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
#!/bin/bash
###############################################################################
# Author: Robert Bärhold
# Concept by: Dr. Thorsten Schütt
# Date: 21.09.2015
#
# The watchdog script runs in the background and periodically checks whether
# the current allocation (batch job) is still running. Once it is not, the
# script unmounts the volume, stops all running XtreemFS servers and, if
# requested in env.sh, saves the logs.
#
# Call:
# ./xtreemfs_slurm_watchdog.sh /path/to/env.sh
#
# Parameter:
# $1 path to source file (env.sh)
#
###############################################################################
# Validate the command line: exactly one argument (the path to env.sh) is
# expected.
if [[ "$#" -ne 1 ]]; then
  echo "Wrong parameter count!"
  echo "Expecting 1 arguments; found: $#"
  exit 1
fi

# Directory this script was started from; used later to locate the sibling
# xtreemfs_slurm_rstop.sh. Quoted so a path with whitespace is not split.
BASEDIR=$(dirname -- "$0")
SOURCE_FILE="$1"

if [[ ! -f "$SOURCE_FILE" ]]; then
  echo "SOURCE_FILE $SOURCE_FILE not found!"
  exit 1
fi

# Load the job configuration. The variables and the substitudeJobID helper
# used by the rest of this script are not defined here, so they presumably
# come from this file -- verify against env.sh.
# shellcheck source=/dev/null
source "$SOURCE_FILE"
#######################################
# Tear down the XtreemFS setup on this node: unmount the volume, optionally
# save the client log, stop every server that left a PID file in the
# job-local folder and finally delete that folder.
# Globals (read):
#   XTREEMFS_DIRECTORY, BASEDIR, SOURCE_FILE, PID_FILENAME_EXTENSION,
#   WATCHDOG_SAVELOGS, DEBUG_CLIENT_ACTIVE, LOCAL_MOUNT_PATH_GENERIC,
#   LOCAL_DIR_GENERIC, CURRENT_JOB_FOLDER_GENERIC
# Globals (written):
#   LOCAL_MOUNT_PATH, LOCAL_DIR, CURRENT_JOB_FOLDER, CURRENT_LOCAL_FOLDER
# Arguments:
#   None
# Returns:
#   Always 0; failures of the individual steps are not propagated.
#######################################
cleanup_node() {
  local pid_file pid_filename server_name

  # unmount
  LOCAL_MOUNT_PATH=$(substitudeJobID "$LOCAL_MOUNT_PATH_GENERIC")
  "$XTREEMFS_DIRECTORY/bin/umount.xtreemfs" "$LOCAL_MOUNT_PATH"

  # save client-log? Only when log saving was requested and client debugging
  # is active.
  if [[ "$WATCHDOG_SAVELOGS" == "-savelogs" && "$DEBUG_CLIENT_ACTIVE" == true ]]; then
    LOCAL_DIR=$(substitudeJobID "$LOCAL_DIR_GENERIC")
    CURRENT_JOB_FOLDER=$(substitudeJobID "$CURRENT_JOB_FOLDER_GENERIC")

    mkdir -p "$CURRENT_JOB_FOLDER/savedLogs"
    cp "$LOCAL_DIR/$(hostname)-client.log" "$CURRENT_JOB_FOLDER/savedLogs/"
  fi

  CURRENT_LOCAL_FOLDER=$(substitudeJobID "$LOCAL_DIR_GENERIC")

  # An empty folder name would turn the glob below into "/*<ext>" and leave
  # rm without a sane operand, so stop here instead of touching anything.
  if [[ -z "$CURRENT_LOCAL_FOLDER" ]]; then
    echo "cleanup_node: local folder is empty; skipping server stop and removal" >&2
    return 0
  fi

  # stop server
  for pid_file in "$CURRENT_LOCAL_FOLDER"/*"$PID_FILENAME_EXTENSION"; do
    # Without a match the glob stays literal; skip it so that "*" is never
    # handed to the stop script as a server name.
    [[ -e "$pid_file" ]] || continue

    pid_filename=$(basename -- "$pid_file")
    server_name=${pid_filename%.*}

    # The watchdog's own PID file must not trigger a stop of the watchdog.
    if [[ "$server_name" != "watchdog" ]]; then
      "$BASEDIR/xtreemfs_slurm_rstop.sh" "$SOURCE_FILE" "$server_name" "$WATCHDOG_SAVELOGS"
    fi
  done

  # cleanup files
  rm -r -- "$CURRENT_LOCAL_FOLDER"

  return 0
}
#######################################
# Print the second column of every `sacct -b` output line that mentions the
# job ID, one value per line.
# Globals (read): JOB_ID
# Outputs: the selected column values on stdout
#######################################
query_job_states() {
  sacct -j "$JOB_ID" -b | grep -- "$JOB_ID" | awk '{print $2}'
}

# Poll until the first reported value is no longer "RUNNING", then clean up.
# NOTE(review): an empty result (e.g. sacct failing) also ends the loop and
# triggers the cleanup -- confirm that this is intended.
# Word splitting of the helper output into array elements is intentional.
# shellcheck disable=SC2207
IS_RUNNING=($(query_job_states))
while [[ "${IS_RUNNING[0]}" == "RUNNING" ]]; do
  sleep "$WATCHDOG_INTERVAL"
  # shellcheck disable=SC2207
  IS_RUNNING=($(query_job_states))
done

cleanup_node
exit $?