-
Notifications
You must be signed in to change notification settings - Fork 23
/
Copy path: run_all_256.sh
executable file
·129 lines (106 loc) · 3.71 KB
/
run_all_256.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
#!/bin/bash
# Driver script for CheckFreq image-classification experiments.
# For each (arch, workers, batch) combination it performs three training runs
# (CheckFreq adaptive, epoch-boundary checkpointing, synchronous at the tuned
# frequency) and collects CPU/memory/disk utilization logs per run.
#
# Usage: ./run_all_256.sh <data-dir> <out-dir> <worker-counts>
if [ "$#" -ne 3 ]; then
	# Fixed: the message previously referred to the wrong script name (run_img.sh).
	echo "Usage : ./run_all_256.sh <data-dir> <out-dir> <worker>"
	exit 1
fi
# jq is required later to read the tuned checkpoint frequency from the cache
# file; -y avoids an interactive prompt when run unattended.
apt-get install -y jq
DATA_DIR=$1
OUT_DIR=$2
WORKER=$3   # space-separated list of worker counts, iterated over below
SRC="models/image_classification/"
SCRIPTS="scripts/"
mkdir -p "$OUT_DIR"
num_gpu=8   # GPUs per node, passed to torch.distributed.launch
echo " Data dir is $DATA_DIR"
echo " Out dir is $OUT_DIR"
# Architectures that need a reduced batch size (do not fit at batch 256).
resnext="resnext101"
densenet="densenet121"
# --- helpers -----------------------------------------------------------------

# start_monitors: launch CPU, memory, and disk utilization monitors in the
# background. Their output files are swept into the result dir afterwards.
start_monitors() {
	mpstat -P ALL 1 > cpu_util.out 2>&1 &
	"${SCRIPTS}/free.sh" &
	#"${SCRIPTS}/gpulog.sh" &
	dstat -cdnmgyr --output all-utils.csv 2>&1 &
}

# stop_and_collect <result-dir>: kill the monitors (and any straggling
# training processes), then move all logs into <result-dir>.
stop_and_collect() {
	local result_dir=$1
	pkill -f mpstat
	pkill -f dstat
	# Match the script name, not the bare word "free": `pkill -f free` would
	# kill ANY process whose command line contains "free".
	pkill -f free.sh
	pkill -f gpulog
	pkill -f nvidia-smi
	pkill -f pytorch-imagenet
	sleep 2   # give the monitors a moment to flush and exit
	mv ./*.out "$result_dir"/
	mv ./*.log "$result_dir"/
	mv ./*.csv "$result_dir"/
}

# --- experiment sweep --------------------------------------------------------
#for arch in 'resnet50' 'resnet18' 'inception_v3' 'resnext101' 'densenet121' 'vgg16'; do
for arch in 'vgg16' ; do
for workers in $WORKER; do   # intentionally unquoted: WORKER is a list of counts
for batch in 256; do
	# These larger models do not fit in memory at batch 256; halve the batch.
	if [ "$arch" = "$resnext" ] || [ "$arch" = "$densenet" ]; then
		batch=128
	fi

	# RUN 1 : CheckFreq adaptive checkpointing (2 epochs; the first tunes
	# the checkpoint frequency, cached in .cache_<arch>_<batch> for RUN 3).
	result_dir="${OUT_DIR}/${arch}_b${batch}_w${workers}_g${num_gpu}_dali_fp32_cf"
	echo "result dir is $result_dir"
	mkdir -p "$result_dir"
	echo "Now running $arch for $workers workers and $batch batch"
	start_monitors
	python -m torch.distributed.launch --nproc_per_node="$num_gpu" \
		"${SRC}/pytorch-imagenet-cf.py" --dali -a "$arch" -b "$batch" \
		--workers "$workers" --epochs 2 --deterministic --noeval --barrier \
		--checkfreq --chk-prefix ./chk/ --cf_iterator --data "$DATA_DIR" \
		> stdout.out 2>&1
	sync
	echo "RAN $arch for $workers workers, $batch batch with DDP" >> stdout.out
	stop_and_collect "$result_dir"

	# RUN 2 : checkpoint only at epoch boundaries (--chk-freq 0, baseline mode).
	result_dir="${OUT_DIR}/${arch}_b${batch}_w${workers}_g${num_gpu}_dali_fp32_epoch_chk"
	echo "result dir is $result_dir"
	mkdir -p "$result_dir"
	echo "Now running $arch for $workers workers and $batch batch"
	start_monitors
	python -m torch.distributed.launch --nproc_per_node="$num_gpu" \
		"${SRC}/pytorch-imagenet-cf.py" --dali -a "$arch" -b "$batch" \
		--workers "$workers" --epochs 1 --deterministic --noeval --barrier \
		--chk-freq 0 --chk_mode_baseline \
		--checkfreq --chk-prefix ./chk/ --cf_iterator --data "$DATA_DIR" \
		> stdout.out 2>&1
	sync
	echo "RAN $arch for $workers workers, $batch batch with DDP" >> stdout.out
	stop_and_collect "$result_dir"

	# RUN 3 : synchronous checkpointing at the frequency CheckFreq chose in
	# RUN 1, read back from the cache file.
	result_dir="${OUT_DIR}/${arch}_b${batch}_w${workers}_g${num_gpu}_dali_fp32_iter_chk_baseline_persist"
	echo "result dir is $result_dir"
	mkdir -p "$result_dir"
	echo "Now running $arch for $workers workers and $batch batch"
	cache_file=".cache_${arch}_${batch}"
	if [ ! -f "$cache_file" ]; then
		# Without the cache we would pass an empty --chk-freq; skip instead.
		echo "WARNING: $cache_file not found; skipping synchronous baseline" >&2
		continue
	fi
	CHK=$(jq '.chk_freq' "$cache_file")
	echo "Setting CHK freq = $CHK"
	start_monitors
	python -m torch.distributed.launch --nproc_per_node="$num_gpu" \
		"${SRC}/pytorch-imagenet-cf.py" --dali -a "$arch" -b "$batch" \
		--workers "$workers" --epochs 1 --deterministic --noeval --barrier \
		--chk-freq "$CHK" --chk_mode_baseline --persist \
		--checkfreq --chk-prefix ./chk/ --cf_iterator --data "$DATA_DIR" \
		> stdout.out 2>&1
	sync
	echo "RAN $arch for $workers workers, $batch batch with DDP" >> stdout.out
	stop_and_collect "$result_dir"
done
done
done