#!/bin/bash
#SBATCH --mem=16gb
#SBATCH --time=24:00:00
#SBATCH --cpus-per-task=4
#SBATCH --array=1-10%1
#SBATCH --gres=gpu:1
#SBATCH --account=your-account
#
# RunBench_cedar.slurm
#
# Slurm batch script. This first section builds a throw-away Python
# virtualenv under the job's $SLURM_TMPDIR, installs the Python packages
# listed below into it, and defines the workspace paths ($ws, $SCRATCH).
#
# Required environment:
#   TRAIN_HOME   - workspace root; utils/torch_qe is installed from it.
#   SLURM_TMPDIR - per-job scratch directory (provided by Slurm).
#
# NOTE: the '#!' line must stay the very first line of the file and the
# #SBATCH directives must precede the first command; do not put anything
# above this header.

echo "started the RunBench at $(date)"

# ---- INIT PATHS ---------------
# Defined before the installs because the torch_qe install below reads $ws.
# The ':?' expansion aborts the job with a message when TRAIN_HOME is
# unset or empty, instead of silently building paths relative to '/'.
: "${TRAIN_HOME:?TRAIN_HOME must be set to the training workspace root}"
ws=${TRAIN_HOME}
SCRATCH=$ws

# ---- PYTHON ENVIRONMENT -------
module load python/3.6
virtualenv --no-download "$SLURM_TMPDIR/env"
source "$SLURM_TMPDIR/env/bin/activate"
# --no-index: resolve these packages without querying the package index.
pip install torch torchvision --no-index
pip install matplotlib tensorboardX --no-index
pip install pandas numpy --no-index
pip install progress
pip install --extra-index-url https://developer.download.nvidia.com/compute/redist/cuda/10.0 nvidia-dali
# compile the QE code under utils/torch_qe
# pip is given the project directory (the one holding setup.py), not the
# setup.py file itself.
pip install "$ws/utils/torch_qe"
#------- PARAMETERS --------------
# Every run parameter is encoded in SLURM_JOB_NAME, split first on '@':
#   FOLDER   - text after the last '@'
#   CONFIG   - last '@'-separated piece of what precedes FOLDER
#   BENCH    - what precedes CONFIG
#   BASENAME - text before the first '@'
# If a string contains no '@' the corresponding expansion leaves it
# unchanged, so the pieces simply repeat the input.
_rb_head="${SLURM_JOB_NAME%@*}"
FOLDER="${SLURM_JOB_NAME##*@}"
BASENAME="${_rb_head%%@*}"
BENCH="${_rb_head%@*}"
CONFIG="${_rb_head##*@}"

# BENCH and CONFIG are '_'-separated lists. Each loop pass stores the
# leading field in the next named variable and strips it from REMAINING.
# Once no '_' is left the strip is a no-op, so any names beyond the
# available fields all receive the last field.

#----- BENCH ------------------
REMAINING=${BENCH}
for _rb_var in IN_DUMMY MD; do
  printf -v "$_rb_var" '%s' "${REMAINING%%_*}"
  REMAINING="${REMAINING#*_}"
done

#----- CONFIGS ------------------
# Field order: 1 LR, 2 LD, 3 LS, 4 ID, 5 TS, 6 EP, 7 BS, 8 WD, 9 MO,
#              10 DB, 11 ST, 12 QI, 13 QS, 14 QF
REMAINING=${CONFIG}
for _rb_var in LR LD LS ID TS EP BS WD MO DB ST QI QS QF; do
  printf -v "$_rb_var" '%s' "${REMAINING%%_*}"
  REMAINING="${REMAINING#*_}"
done
unset _rb_var _rb_head
# ----- PATHS -----------------------
# Per-job working directory: $SCRATCH/$FOLDER/tmps/$SLURM_JOB_NAME.
# The ':?' expansions abort the job when a component is unset or empty,
# so a malformed job name cannot redirect the directory tree elsewhere.
tmpPath=${SCRATCH:?SCRATCH is not set}/${FOLDER:?FOLDER is empty}/tmps/${SLURM_JOB_NAME:?SLURM_JOB_NAME is not set}
mkdir -p "$tmpPath"
# Stop here if the directory is unusable; continuing would run the rest of
# the job from whatever directory the shell happens to be in.
cd "$tmpPath" || { echo "cannot enter $tmpPath" >&2; exit 1; }
# The schedule file is looked up by the full job name and stored in the
# working directory under a fixed name.
cp "$ws/schedules/${SLURM_JOB_NAME}.csv" "$tmpPath/schedule.csv"
# Parallel copy of the whole ImageNet dataset to node-local scratch
# ($SLURM_TMPDIR), followed by the tuning run. When local scratch is too
# small, fall back to a copy reported by check_node.sh.
#
# Reads:  SLURM_TMPDIR, ws, tmpPath and the run parameters
#         LR LD LS ID TS EP BS WD MO DB ST QI QS QF MD.
# Writes: IN (dataset root handed to run_tune.sh).
IMAGENET='/path_to/IMAGENET-UNCROPPED'

# Succeeds only when $1 and $2 are both non-negative integers and $1 > $2.
# Inside [[ ]] the '>' operator compares strings lexicographically, which
# is wrong for sizes and counts, so the comparison is done arithmetically.
# Surrounding whitespace (e.g. df column padding) is ignored; '10#' keeps
# leading zeros from being read as octal.
num_gt() {
  local lhs=${1//[[:space:]]/} rhs=${2//[[:space:]]/}
  [[ $lhs =~ ^[0-9]+$ && $rhs =~ ^[0-9]+$ ]] || return 1
  (( 10#$lhs > 10#$rhs ))
}

# Copies every entry of $IMAGENET/<split> into local scratch with up to 16
# rsync processes in parallel. $1 is the split name ("val" or "train").
# xargs -I already runs one command per input line, so no -n1 is given.
copy_split() {
  local split=$1
  ls -1 "$IMAGENET/$split" \
    | xargs -P16 -I% rsync --info=progress2 --chown=your-user:your-group -r \
        "$IMAGENET/$split/%" "$SLURM_TMPDIR/IMAGENET-UNCROPPED/$split"
}

# Runs run_tune.sh from $tmpPath against the dataset root given as $1.
# Every argument is quoted so that an empty parameter keeps its position
# instead of shifting all later arguments.
launch_run_tune() {
  IN=$1
  nvidia-smi
  cd "$tmpPath" || { echo "cannot enter $tmpPath" >&2; return 1; }
  echo "running at:$(pwd)"
  bash "$ws/run_tune.sh" "$LR" "$LD" "$LS" "$ID" "$TS" "$EP" "$BS" "$WD" \
    "$MO" "$DB" "$ST" "$QI" "$QS" "$QF" "$MD" "$IN"
}

# Stages the dataset and starts the run.
# Returns: the status of run_tune.sh when it is launched; non-zero when the
#          dataset could be neither copied nor found, or SLURM_TMPDIR is
#          not set.
stage_dataset_and_run() {
  local dest avail need copied prev_copy

  if [[ -z "${SLURM_TMPDIR:-}" ]]; then
    echo 'SLURM_TMPDIR is not set; refusing to stage the dataset' >&2
    return 1
  fi
  dest=$SLURM_TMPDIR/IMAGENET-UNCROPPED

  echo "started copying to $SLURM_TMPDIR at: $(date)"
  mkdir -p "$dest/train" "$dest/val"

  # df reports 1K blocks by default, so the requirement below is 130 GiB.
  avail=$(df --output=avail "$SLURM_TMPDIR" | tail -n 1)
  need=$(( 130 * 1024 * 1024 ))

  if num_gt "$avail" "$need"; then
    echo 'Enough space on localscratch!'
    copy_split val
    echo "Number of copied val files: $(ls -R "$dest/" | wc -l)"
    echo "finished copying val at: $(date)"
    copy_split train
    # Line count of a recursive listing (this also counts the directory
    # header and blank lines that ls -R prints); the threshold below is
    # compared against this figure.
    copied=$(ls -R "$dest/" | wc -l)
    echo "Number of copied files total : $copied"
    echo "finished copying at: $(date)"
    if num_gt "$copied" 1330000; then
      echo 'properly copied!'
      echo "This machine is cedar: $(hostname)"
      # run on local node => faster speed
      launch_run_tune "$dest"
    else
      echo 'not enough space or dead rsync or bad copy!'
      return 1
    fi
  else
    echo 'not enough space on localscratch'
    echo 'trying to find a prev copy!'
    prev_copy=$(bash "$ws/check_node.sh")
    if [[ -n "$prev_copy" ]]; then
      echo "found prev copy here: $prev_copy"
      launch_run_tune "$prev_copy"
    else
      echo 'no copy no space! Wth!'
      return 1
    fi
  fi
}

stage_dataset_and_run