#!/usr/bin/env python
# forked from launch_8gpu.py
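# Example invocation (all flags are optional; defaults are defined in the argparse
# block below, values here are only illustrative):
#   python launch_pretrain_bert.py --machines 4 --instance_type p3dn.24xlarge --num_rings 16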
import argparse
import ncluster

parser = argparse.ArgumentParser()
parser.add_argument('--name', type=str, default='pretrain_bert',
                    help="name of the current run, used for machine naming and tensorboard visualization")
parser.add_argument('--machines', type=int, default=1,
                    help="how many machines to use")
parser.add_argument('--instance_type', type=str, default="p3dn.24xlarge",
                    help="which instance type to use")
parser.add_argument('--image_name', type=str,
                    default='Deep Learning AMI (Ubuntu) Version 22.0',
                    help="name of AMI to use")
parser.add_argument('--num_rings', type=int, default=10,
                    help='how many rings to use in multi-machine setting')
args = parser.parse_args()
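
# run everything on AWS: make_job() below provisions (or reuses) EC2 instances
# of the requested instance type from the given AMI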
ncluster.set_backend('aws')


# routines to build NCCL ring orders
def get_nccl_params(num_tasks, _num_gpus):
    params = 'NCCL_DEBUG=VERSION '
    # todo(y): try NCCL_SINGLE_RING_THRESHOLD=10, custom ring definition
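    # NCCL_MIN_NRINGS asks NCCL to build at least this many rings; on multi-node
    # runs, more rings generally help saturate the inter-node network bandwidth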
    if num_tasks > 1:
        params += f'NCCL_MIN_NRINGS={args.num_rings} '
    return params


def main():
    job = ncluster.make_job(name=args.name,
                            run_name=f"{args.name}",
                            num_tasks=args.machines,
                            image_name=args.image_name,
                            instance_type=args.instance_type)
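
    # sync the working directory (pretrain_bert.py, requirements.txt, etc. used
    # by the remote commands below) to every task before launching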
    job.upload('*')
    job.run('killall python || echo failed')  # kill previous run
    job.run('source activate pytorch_p36')
    job.run('export NCCL_SOCKET_IFNAME=ens5')  # tip from cakarak@amazon.com
    job.run('pip install -r requirements.txt')

    # workaround for https://github.com/tensorflow/models/issues/3995
    job.run('pip install -U protobuf')

    num_gpus = 8
    assert args.instance_type in ['p3.16xlarge', 'p3dn.24xlarge'], \
        f"{args.instance_type} is not an 8-GPU instance type"

    # WORLD_SIZE = num_gpus * args.machines
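    # the first task's IP is passed as --master_addr, i.e. it is the rendezvous
    # address that torch.distributed workers on every node connect to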
    MASTER_ADDR = job.tasks[0].ip
    MASTER_PORT = 6016
    NNODES = args.machines

    train = open('bookcorpus.filelist.train').read().strip()
    validate = "/ncluster/data/bookcorpus.tfrecords/final_tfrecords_sharded/tf_examples.tfrecord000163"
    test = "/ncluster/data/bookcorpus.tfrecords/final_tfrecords_sharded/tf_examples.tfrecord000164"

    nccl_params = get_nccl_params(args.machines, num_gpus)
    lr = 0.0001  # original learning rate for 256 global batch size / 64 GPUs
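
    # one torch.distributed.launch invocation per machine; the launcher then spawns
    # num_gpus worker processes on that node (typically one per GPU)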
    for i, task in enumerate(job.tasks):
        NODE_RANK = i
        DISTRIBUTED_ARGS = f"--nproc_per_node {num_gpus} --nnodes {NNODES} --node_rank {NODE_RANK} " \
                           f"--master_addr {MASTER_ADDR} --master_port {MASTER_PORT}"
cmd = (f"{nccl_params} python -m torch.distributed.launch {DISTRIBUTED_ARGS} "
f"pretrain_bert.py "
f"--batch-size 4 "
f"--tokenizer-type BertWordPieceTokenizer "
f"--cache-dir cache_dir "
f"--tokenizer-model-type bert-large-uncased "
f"--vocab-size 30522 "
f"--use-tfrecords "
f"--train-data {train} "
f"--valid-data {validate} "
f"--test-data {test} "
f"--max-preds-per-seq 80 "
f"--seq-length 512 "
f"--max-position-embeddings 512 "
f"--num-layers 24 "
f"--hidden-size 1024 "
f"--intermediate-size 4096 "
f"--num-attention-heads 16 "
f"--hidden-dropout 0.1 "
f"--attention-dropout 0.1 "
f"--train-iters 1000000 "
f"--lr {lr} "
f"--lr-decay-style linear "
f"--lr-decay-iters 990000 "
f"--warmup .01 "
f"--weight-decay 1e-2 "
f"--clip-grad 1.0 "
f"--fp16 "
f"--fp32-layernorm "
f"--fp32-embedding "
f"--hysteresis 2 "
f"--num-workers 2 ")
        # new params
        cmd += f"--logdir {job.logdir} "
        task.run(f'echo {cmd} > {job.logdir}/task-{i}.cmd')  # save command-line
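        # non_blocking=True returns immediately, so training starts on all nodes in parallel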
        task.run(cmd, non_blocking=True)

    print(f"Logging to {job.logdir}")


if __name__ == '__main__':
    main()