-
Notifications
You must be signed in to change notification settings - Fork 1
/
run_224_workflow.sh
147 lines (111 loc) · 4.69 KB
/
run_224_workflow.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
#!/bin/bash
# PBS batch script. The sections below start a Ray cluster across the
# allocated nodes, launch a Jupyter notebook on the node running this
# script, run `python -m mlwkf` and finally remove generated files.
#------------------------------------------------------------------------
#------------------------- define nci cluster size ----------------------
#------------------------------------------------------------------------
# The #PBS lines are scheduler directives, not ordinary comments: they must
# stay ahead of the first executable line and their text must not be
# reformatted.
#PBS -N mlflowpbs
#PBS -P ge3
#PBS -q normalbw
#PBS -l walltime=12:00:00
#PBS -l ncpus=224
#PBS -l mem=800GB
#PBS -l jobfs=200GB
#PBS -l storage=gdata/ge3
#PBS -l other=hyperthread
#------------------------------------------------------------------------
#------------------------- define task parameters -----------------------
#------------------------------------------------------------------------
# Installation root holding venvs/MLWorkflow and github/MLWorkflow.
MLHOME="/g/data/ge3/${USER}"

# The iteration digit is appended to each base port so that every
# iteration value yields its own set of ports.
iteration=3
jupyterPort="838${iteration}"
rayDashboardPort="848${iteration}"
rayPort="637${iteration}"

# Taken unchanged from the environment the job was started with.
inputConfigFile="${inputConfigFile}"
#------------------------------------------------------------------------
#------------------------- load nci modules -----------------------------
#------------------------------------------------------------------------
# Fail fast when no config file was handed to the job: without this check
# the omission only surfaces at the final `python -m mlwkf -c ...` step,
# after the Ray cluster and Jupyter have already been started.
# Presumably the value arrives through the submission environment
# (e.g. `qsub -v inputConfigFile=...`) - confirm against how jobs are queued.
: "${inputConfigFile:?inputConfigFile is not set; provide it in the job environment}"

module purge
module load pbs
module load python3-as-python
module load gdal/3.0.2

# NOTE(review): errexit is enabled only after the module commands, so a
# failed `module load` does not abort the job - confirm this is intended.
set -e
ulimit -s unlimited

# Record the effective settings in the job log.
echo "$MLHOME"
echo "$inputConfigFile"

# Quoted so the paths survive word splitting and globbing.
source "$MLHOME/venvs/MLWorkflow/bin/activate"
cd "$MLHOME/github/MLWorkflow"
#------------------------------------------------------------------------
#------------------------- setup ray cluster ----------------------------
#------------------------------------------------------------------------
cd "$PBS_O_WORKDIR"

# Distinct host names allocated to this job; the node file lists a host
# once per slot, so adjacent duplicates are collapsed.
nodeDnsIps=$(uniq "$PBS_NODEFILE")
# Name and address of the node executing this script (the Ray head node).
hostNodeDnsIp=$(uname -n)
hostNodeIp=$(hostname -i)
# The head node is started without --redis-password, so workers must use
# whatever Ray falls back to.
# NOTE(review): this literal looks like Ray's built-in default Redis
# password - confirm against the installed Ray version.
rayPassword='5241590000000000'

# Generate the script each worker node runs. The heredoc delimiter is
# quoted, so nothing below is expanded here; every variable is resolved
# when the worker executes the script.
workerScript="$PBS_O_WORKDIR/${iteration}_setupRayWorkerNode.sh"
cat > "$workerScript" << 'EOF'
#!/bin/bash -l
# Joins the node this script runs on to an existing Ray cluster.
# Arguments:
#   $1 - IP address of the Ray head node
#   $2 - port of the Ray head node
#   $3 - Redis password (falls back to the value previously hard-coded here)
#   $4 - MLHOME installation root
set -e
ulimit -s unlimited
# Falls back to $HOME when PBS_O_WORKDIR is absent, which is what an
# unquoted `cd $PBS_O_WORKDIR` did.
cd "${PBS_O_WORKDIR:-$HOME}"
hostNodeIp=${1}
rayPort=${2}
rayPassword=${3:-5241590000000000}
MLHOME=${4}
hostIpNPort="$hostNodeIp:$rayPort"
module purge
module load pbs
module load python3-as-python
module load gdal/3.0.2
source "$MLHOME/venvs/MLWorkflow/bin/activate"
cd "$MLHOME/github/MLWorkflow"
echo "running node to ray cluster"
uname -n
hostname -i
# Run in the foreground: --block keeps this process (and therefore the
# remote session that launched it) alive. Wrapping the call in
# `echo $(... &)` also blocked, but withheld all output until exit and
# discarded ray's exit status.
ray start --address="$hostIpNPort" --num-cpus=112 --redis-password="$rayPassword" --block
EOF
chmod +x "$workerScript"
echo "set up ray cluster......."
# Start Ray on every allocated node: the node running this script becomes
# the head, every other node is started remotely as a worker.
# ${nodeDnsIps} is deliberately unquoted: it holds one host name per word.
for nodeDnsIp in ${nodeDnsIps}; do
  if [[ "${nodeDnsIp}" == "${hostNodeDnsIp}" ]]; then
    echo "Starting ray cluster on head node ..."
    module purge
    module load pbs
    module load python3-as-python
    module load gdal/3.0.2
    source "$MLHOME/venvs/MLWorkflow/bin/activate"
    cd "$MLHOME/github/MLWorkflow"
    # NOTE(review): --dashboard-host=0.0.0.0 presumably exposes the dashboard
    # on every interface of the node - confirm this is acceptable.
    ray start --head --num-cpus=112 --include-dashboard=true --dashboard-host=0.0.0.0 --dashboard-port="${rayDashboardPort}" --port="${rayPort}"
    # Pause before the next node is handled.
    sleep 10
  else
    echo "Starting ray cluster on worker node ..."
    # Backgrounded so the loop can continue while the worker script runs.
    pbs_tmrsh "${nodeDnsIp}" "$PBS_O_WORKDIR/${iteration}_setupRayWorkerNode.sh" "${hostNodeIp}" "${rayPort}" "${rayPassword}" "${MLHOME}" &
    sleep 5
  fi
done

echo "Creating ray connection string ..."
# Overwrites (">") the connection-strings file with the ssh port-forward
# command for the Ray dashboard.
echo "ssh -N -L ${rayDashboardPort}:${hostNodeDnsIp}:${rayDashboardPort} ${USER}@gadi.nci.org.au &" > "${PBS_O_WORKDIR}/${iteration}_connection_strings.txt"
#------------------------------------------------------------------------
#------------------------- setup jupyter notebook -----------------------
#------------------------------------------------------------------------
hostNodeDnsIp=$(uname -n)
echo "Starting Jupyter lab ..."
# SECURITY NOTE(review): both the token and the password are set to the
# empty string, so the notebook is started without authentication while
# listening on this node's host name. Anyone able to reach that address and
# port can presumably execute code as this user - confirm this is acceptable
# on the cluster network before reuse.
# Started in the background so the workflow below can run alongside it.
jupyter notebook --no-browser --port "${jupyterPort}" --ip="${hostNodeDnsIp}" --NotebookApp.token='' --NotebookApp.password='' &
echo "Creating jupyter connection string ..."
# Appends (">>") the ssh port-forward command for Jupyter to the
# connection-strings file.
echo "ssh -N -L ${jupyterPort}:${hostNodeDnsIp}:${jupyterPort} ${USER}@gadi.nci.org.au &" >> "${PBS_O_WORKDIR}/${iteration}_connection_strings.txt"
#------------------------------------------------------------------------
#------------------------- run ml workflow ------------------------------
#------------------------------------------------------------------------
# Run the workflow from the repository checkout. Both expansions are quoted
# so a path containing whitespace or glob characters reaches its command as
# a single, unmodified argument.
cd "$MLHOME/github/MLWorkflow"
python -m mlwkf -c "$inputConfigFile"
# sleep infinity # this allows the pbs nodes to persist until requested wall timeout, therefore you can run jupyter notebook and terminal in a browser
#------------------------------------------------------------------------
#------------------------- gracefully exit ------------------------------
#------------------------------------------------------------------------
# Remove generated scripts, connection strings, job logs and core files
# from the current directory. Options are placed before the operands and
# followed by "--" so the command does not depend on GNU argument
# permutation; the "./" prefix keeps any match from being read as an option.
# An unmatched pattern is passed literally and silently ignored by -f.
# NOTE(review): these globs are not restricted to this job's iteration, so
# they also remove matching files belonging to other runs in this directory
# - confirm that is intended.
rm -f -- ./*setupRayWorkerNode.sh
rm -f -- ./*connection_strings*
rm -f -- ./mlflowpbs*
rm -f -- ./core.ray:*
rm -f -- ./core.raylet*
rm -f -- ./core.store*

# The worker script and the connection strings were written to
# $PBS_O_WORKDIR, while the current directory at this point is
# $MLHOME/github/MLWorkflow. When the two differ the globs above miss them,
# so remove this job's own files by their exact paths as well.
if [[ -n "${PBS_O_WORKDIR:-}" ]]; then
  rm -f -- "${PBS_O_WORKDIR}/${iteration}_setupRayWorkerNode.sh" \
    "${PBS_O_WORKDIR}/${iteration}_connection_strings.txt"
fi