-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathprepare_efa_image.py
116 lines (86 loc) · 4.1 KB
/
prepare_efa_image.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
#!/usr/bin/env python
"""
Script that builds an EFA-enabled image. It wraps indu_build.sh with extra logging. Because Python is needed for logging, conda env setup is duplicated here and in indu_build.sh.
# usage (Python 3.6)
pip install -r https://raw.githubusercontent.com/cybertronai/aws-network-benchmarks/master/requirements.txt
export AWS_ACCESS_KEY_ID=<access key id>
export AWS_SECRET_ACCESS_KEY=<secret key>
export AWS_DEFAULT_REGION=us-east-1
export NCLUSTER_ZONE=us-east-1b
"""
import argparse
import os
import shlex
import sys
import wandb
import util
# Name of the marker file that worker() writes after running the build script.
SETUP_COMPLETED_FN = 'ncluster_setup_completed'


def _build_parser():
    """Create the command-line parser used by both the launcher and worker roles."""
    cli = argparse.ArgumentParser()

    # user-facing options
    cli.add_argument('--name', default='0.prepare_efa_image', type=str)
    cli.add_argument('--instance_type', default="p3dn.24xlarge", type=str)
    cli.add_argument('--spot', help='use spot instances', action='store_true')
    cli.add_argument(
        '--skip_setup',
        help='can use this option on reruns for slightly faster turn-around',
        action='store_true',
    )
    cli.add_argument(
        '--image_name',
        default='amzn2-ami-hvm-2.0.20190612-x86_64-gp2',
        help='base image to build upon',
        type=str,
    )
    cli.add_argument(
        '--use_tmpfs',
        default=0,
        help='use tmpfs for slightly faster build',
        type=int,
    )
    cli.add_argument(
        '--use_io2',
        default=1,
        help='use io2 disk for faster building',
        type=int,
    )

    # internal flags
    cli.add_argument('--internal_role', default='launcher', type=str)
    cli.add_argument('--internal_cmd', default='echo whoami', type=str)
    # NOTE(review): the help text says base64, but the default value looks
    # hex-encoded -- confirm against util.text_pickle / util.text_unpickle.
    cli.add_argument(
        '--internal_config',
        default='800358020000007B7D71002E',
        help='base64 encoded dict of additional config attributes to log',
        type=str,
    )
    cli.add_argument(
        '--internal_config_fn',
        default='ncluster_config_dict',
        help='location of filename with extra info to log',
        type=str,
    )
    return cli


# Parsed at import time; launcher(), worker() and main() read `args` as a global.
parser = _build_parser()
args = parser.parse_args()
def launcher():
    """Create an ncluster task and drive the image build through it.

    Steps, in order:
      1. Log the client environment and set local environment variables.
      2. Create the task from the command-line config, rsync the current
         directory to it and write the pickled config to
         ``args.internal_config_fn``.
      3. Run the toolchain / Anaconda / conda-env setup commands.
      4. Re-invoke this script on the task with ``--internal_role=worker``.
      5. When tmpfs was used, copy the install root to ``~/install_root``.

    NOTE(review): the export/pushd/popd/activate commands only make sense if
    ``task.run`` executes in one persistent shell session -- confirm against
    the ncluster documentation.
    """
    import ncluster

    # save command-line args
    config = vars(args)
    util.log_client_environment(config)

    if args.use_io2:
        os.environ['NCLUSTER_AWS_FAST_ROOTDISK'] = '1'
    os.environ['WANDB_SILENT'] = '1'

    task = ncluster.make_task(**config)
    task.rsync('.')
    task.write(args.internal_config_fn, util.text_pickle(config))

    install_root = '/home/ec2-user'
    if args.use_tmpfs:
        task.run('sudo mkdir -p /tmpfs && sudo chown `whoami` /tmpfs && sudo mount -t tmpfs -o size=50G tmpfs /tmpfs')
        install_root = '/tmpfs'

    # Ordered setup commands; "|| echo ignore(d)" keeps reruns from failing on
    # steps that were already completed.
    setup_commands = (
        f'export INSTALL_ROOT={install_root}',
        'export WANDB_SILENT=1',
        f'mkdir -p {install_root}/packages',
        f'pushd {install_root}/packages',
        'sudo yum groupinstall "Development Tools" -y',
        'sudo update-alternatives --set gcc "/usr/bin/gcc48" || echo ignored',
        'sudo update-alternatives --set g++ "/usr/bin/g++48" || echo ignored',
        'wget https://repo.anaconda.com/archive/Anaconda3-2019.03-Linux-x86_64.sh',
        'bash Anaconda3-2019.03-Linux-x86_64.sh -b || echo ignore',
        '/home/ec2-user/anaconda3/bin/conda init bash && source ~/.bashrc',
        'conda create -n pytorch_p36 python=3.6 -y || echo ignore',
        'source activate pytorch_p36',
        'popd',
        'pip install -r worker_requirements.txt',
    )
    for command in setup_commands:
        task.run(command)

    # Hand over to the worker role of this same script.
    script_name = os.path.basename(__file__)
    task.run(f'python {script_name} --internal_role=worker')

    if args.use_tmpfs:
        task.run(f'cp -R {install_root} ~/install_root')
def worker():
    """Run the build script and log the run to wandb.

    Reads the config that the launcher wrote to ``args.internal_config_fn``,
    records it in the wandb run config, runs ``indu_build.sh`` and finally
    writes the ``SETUP_COMPLETED_FN`` marker file containing ``ok``.

    Both files are opened with ``with`` so the handles are closed (and the
    marker is flushed to disk) deterministically instead of relying on
    garbage collection.
    """
    name = util.get_script_name(__file__)
    wandb.init(project='nccl_bench', name=name)
    util.install_pdb_handler()

    # log info propagated from the launcher
    # NOTE(review): util.text_unpickle presumably unpickles this text; only
    # feed it files produced by the launcher.
    with open(args.internal_config_fn) as config_file:
        config = util.text_unpickle(config_file.read())
    print(config)
    wandb.config.update(config)
    util.log_worker_environment()

    util.ossystem2('bash indu_build.sh')

    with open(SETUP_COMPLETED_FN, 'w') as marker_file:
        marker_file.write('ok')
def main():
    """Dispatch to the role selected by ``--internal_role``.

    Raises:
        AssertionError: if the role is neither ``'launcher'`` nor ``'worker'``.
    """
    role = args.internal_role
    if role == 'launcher':
        launcher()
    elif role == 'worker':
        worker()
    else:
        # Raised explicitly instead of `assert False` so an unknown role is
        # still rejected when Python runs with -O (which strips asserts).
        # AssertionError is kept so the exception type is unchanged.
        raise AssertionError(f'unknown role {role}')


if __name__ == '__main__':
    main()