[Feature]: Add multi machine dist_train
YuanLiuuuuuu committed Mar 8, 2022
1 parent 47f6feb commit 900bc42
Showing 6 changed files with 83 additions and 10 deletions.
16 changes: 15 additions & 1 deletion docs/en/getting_started.md
@@ -66,7 +66,21 @@ GPUS_PER_NODE=8 GPUS=8 bash tools/srun_train.sh Dummy Test_job configs/selfsup/o

### Train with multiple machines

If you launch with multiple machines simply connected with ethernet, you have to modify `tools/dist_train.sh` or create a new script, please refer to PyTorch [Launch utility](https://pytorch.org/docs/stable/distributed.html#launch-utility). Usually it is slow if you do not have high speed networking like InfiniBand.
If you launch with multiple machines simply connected with Ethernet, you can run the following commands:

On the first machine:

```shell
NNODES=2 NODE_RANK=0 PORT=$MASTER_PORT MASTER_ADDR=$MASTER_ADDR bash tools/dist_train.sh $CONFIG $GPUS
```

On the second machine:

```shell
NNODES=2 NODE_RANK=1 PORT=$MASTER_PORT MASTER_ADDR=$MASTER_ADDR bash tools/dist_train.sh $CONFIG $GPUS
```
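For example, assuming the first machine is reachable at the hypothetical address `10.1.1.10` and port `29500` is free on it, a two-machine run with 8 GPUs per machine could be launched as:

```shell
# on the first machine (hypothetical address and port);
# repeat the same command with NODE_RANK=1 on the second machine
NNODES=2 NODE_RANK=0 PORT=29500 MASTER_ADDR=10.1.1.10 bash tools/dist_train.sh $CONFIG 8
```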

Training this way is usually slow if you do not have high-speed networking like InfiniBand.

If you launch with slurm, the command is the same as on a single machine as described above, but you need to refer to [slurm_train.sh](https://github.com/open-mmlab/mmselfsup/blob/master/tools/slurm_train.sh) to set the appropriate parameters and environment variables.
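A minimal sketch, assuming `slurm_train.sh` follows the common OpenMMLab argument order (partition, job name, config, work dir); verify the exact signature against the script itself:

```shell
# 16 GPUs across 2 nodes, 8 GPUs per node; all names here are placeholders
GPUS_PER_NODE=8 GPUS=16 bash tools/slurm_train.sh $PARTITION $JOB_NAME $CONFIG $WORK_DIR
```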

16 changes: 15 additions & 1 deletion docs/zh_cn/getting_started.md
@@ -62,7 +62,21 @@ GPUS_PER_NODE=8 GPUS=8 bash tools/srun_train.sh Dummy Test_job configs/selfsup/o

### Train with multiple machines

If you want to use multiple machines connected by Ethernet, you can refer to the PyTorch [Launch utility](https://pytorch.org/docs/stable/distributed.html#launch-utility) to modify `tools/dist_train.sh`. However, if these machines are not connected with a high-speed network, training will be very slow.
If you want to use multiple machines connected by Ethernet, you can run the following commands:

On the first machine:

```shell
NNODES=2 NODE_RANK=0 PORT=$MASTER_PORT MASTER_ADDR=$MASTER_ADDR bash tools/dist_train.sh $CONFIG $GPUS
```

On the second machine:

```shell
NNODES=2 NODE_RANK=1 PORT=$MASTER_PORT MASTER_ADDR=$MASTER_ADDR bash tools/dist_train.sh $CONFIG $GPUS
```

However, if these machines are not connected with a high-speed network, training will be very slow.

If you use slurm to manage the machines, you can launch the job with the same command as on a single machine, but you must set the appropriate environment variables and parameters; please refer to [slurm_train.sh](../../tools/slurm_train.sh) for details.

15 changes: 13 additions & 2 deletions tools/benchmarks/classification/dist_train_linear.sh
@@ -7,13 +7,24 @@ CFG=$1 # use cfgs under "configs/benchmarks/classification/imagenet/*.py"
PRETRAIN=$2 # pretrained model
PY_ARGS=${@:3}
GPUS=${GPUS:-8} # When changing GPUS, please also change samples_per_gpu in the config file accordingly to ensure the total batch size is 256.
NNODES=${NNODES:-1}
NODE_RANK=${NODE_RANK:-0}
PORT=${PORT:-29500}
MASTER_ADDR=${MASTER_ADDR:-"127.0.0.1"}

# set work_dir according to config path and pretrained model to distinguish different models
WORK_DIR="$(echo ${CFG%.*} | sed -e "s/configs/work_dirs/g")/$(echo $PRETRAIN | rev | cut -d/ -f 1 | rev)"
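# For example (hypothetical paths): CFG=configs/benchmarks/classification/imagenet/foo.py and
# PRETRAIN=work_dirs/selfsup/bar/latest.pth yield
# WORK_DIR=work_dirs/benchmarks/classification/imagenet/foo/latest.pth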

python -m torch.distributed.launch --nproc_per_node=$GPUS --master_port=$PORT \
python -m torch.distributed.launch \
--nnodes=$NNODES \
--node_rank=$NODE_RANK \
--master_addr=$MASTER_ADDR \
--nproc_per_node=$GPUS \
--master_port=$PORT \
tools/train.py $CFG \
--cfg-options model.backbone.init_cfg.type=Pretrained \
model.backbone.init_cfg.checkpoint=$PRETRAIN \
--work_dir $WORK_DIR --seed 0 --launcher="pytorch" ${PY_ARGS}
--work_dir $WORK_DIR \
--seed 0 \
--launcher="pytorch" \
${PY_ARGS}
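The same NNODES/NODE_RANK/MASTER_ADDR variables drive multi-node runs of this benchmark script; a minimal sketch with hypothetical values:

```shell
# on node 0 (hypothetical address, default PORT=29500);
# run the same command with NODE_RANK=1 on node 1
NNODES=2 NODE_RANK=0 MASTER_ADDR=10.1.1.10 bash tools/benchmarks/classification/dist_train_linear.sh $CFG $PRETRAIN
```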
15 changes: 13 additions & 2 deletions tools/benchmarks/classification/dist_train_semi.sh
@@ -7,14 +7,25 @@ CFG=$1 # use cfgs under "configs/benchmarks/classification/imagenet/imagenet_*p
PRETRAIN=$2 # pretrained model
PY_ARGS=${@:3}
GPUS=${GPUS:-4} # in the standard setting, GPUS=4
NNODES=${NNODES:-1}
NODE_RANK=${NODE_RANK:-0}
PORT=${PORT:-29500}
MASTER_ADDR=${MASTER_ADDR:-"127.0.0.1"}

# set work_dir according to config path and pretrained model to distinguish different models
WORK_DIR="$(echo ${CFG%.*} | sed -e "s/configs/work_dirs/g")/$(echo $PRETRAIN | rev | cut -d/ -f 1 | rev)"

# train
python -m torch.distributed.launch --nproc_per_node=$GPUS --master_port=$PORT \
python -m torch.distributed.launch \
--nnodes=$NNODES \
--node_rank=$NODE_RANK \
--master_addr=$MASTER_ADDR \
--nproc_per_node=$GPUS \
--master_port=$PORT \
tools/train.py $CFG \
--cfg-options model.backbone.init_cfg.type=Pretrained \
model.backbone.init_cfg.checkpoint=$PRETRAIN \
--work_dir $WORK_DIR --seed 0 --launcher="pytorch" ${PY_ARGS}
--work_dir $WORK_DIR \
--seed 0 \
--launcher="pytorch" \
${PY_ARGS}
16 changes: 14 additions & 2 deletions tools/dist_test.sh
@@ -3,8 +3,20 @@
CONFIG=$1
CHECKPOINT=$2
GPUS=$3
NNODES=${NNODES:-1}
NODE_RANK=${NODE_RANK:-0}
PORT=${PORT:-29500}
MASTER_ADDR=${MASTER_ADDR:-"127.0.0.1"}

PYTHONPATH="$(dirname $0)/..":$PYTHONPATH \
python -m torch.distributed.launch --nproc_per_node=$GPUS --master_port=$PORT \
$(dirname "$0")/test.py $CONFIG $CHECKPOINT --launcher pytorch ${@:4}
python -m torch.distributed.launch \
--nnodes=$NNODES \
--node_rank=$NODE_RANK \
--master_addr=$MASTER_ADDR \
--nproc_per_node=$GPUS \
--master_port=$PORT \
$(dirname "$0")/test.py \
$CONFIG \
$CHECKPOINT \
--launcher pytorch \
${@:4}
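The test script reads the same variables, so a single-machine run can rely on the defaults; a minimal sketch:

```shell
# defaults NNODES=1, NODE_RANK=0, MASTER_ADDR=127.0.0.1 apply; 8 GPUs on one machine
bash tools/dist_test.sh $CONFIG $CHECKPOINT 8
```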
15 changes: 13 additions & 2 deletions tools/dist_train.sh
@@ -2,8 +2,19 @@

CONFIG=$1
GPUS=$2
NNODES=${NNODES:-1}
NODE_RANK=${NODE_RANK:-0}
PORT=${PORT:-29500}
MASTER_ADDR=${MASTER_ADDR:-"127.0.0.1"}
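# With the defaults above (NNODES=1, NODE_RANK=0, MASTER_ADDR=127.0.0.1), the script
# behaves like the previous single-machine version; override them for multi-node runs.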

PYTHONPATH="$(dirname $0)/..":$PYTHONPATH \
python -m torch.distributed.launch --nproc_per_node=$GPUS --master_port=$PORT \
$(dirname "$0")/train.py $CONFIG --seed 0 --launcher pytorch ${@:3}
python -m torch.distributed.launch \
--nnodes=$NNODES \
--node_rank=$NODE_RANK \
--master_addr=$MASTER_ADDR \
--nproc_per_node=$GPUS \
--master_port=$PORT \
$(dirname "$0")/train.py \
$CONFIG \
--seed 0 \
--launcher pytorch ${@:3}
