Work around the issue of not being able to export variables in a launcher-agnostic way by simply passing them to the Python script as arguments and having the script set them in its environment. Print a clear error if neither SLURM's srun nor OpenMPI's mpirun is used; we still rely on these to get the local rank, as there is no other way.
Caspar van Leeuwen committed Mar 29, 2024
1 parent 8bb9bbd commit a6bf34d
Showing 2 changed files with 33 additions and 20 deletions.
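The commit message describes a launcher-agnostic workaround: the DDP rendezvous settings travel as plain script arguments, and the script itself exports them before initializing torch.distributed. Below is a condensed sketch of that pattern, distilled from the two changed files; the gloo backend and the default port value are illustrative choices here, not taken from the repository.

    import argparse
    import os

    import torch.distributed as dist

    parser = argparse.ArgumentParser()
    parser.add_argument('--master-address', type=str, default='localhost')
    parser.add_argument('--master-port', type=int, default=29500)
    parser.add_argument('--world-size', type=int, default=1)
    args = parser.parse_args()

    # Export the rendezvous settings from inside the script, so the launcher never
    # has to 'export' anything itself.
    os.environ['MASTER_ADDR'] = args.master_address
    os.environ['MASTER_PORT'] = str(args.master_port)

    # The rank still has to come from the launcher: OpenMPI's mpirun or SLURM's srun.
    rank = int(os.environ.get('OMPI_COMM_WORLD_RANK', os.environ.get('SLURM_PROCID', 0)))

    # The default env:// init method reads MASTER_ADDR and MASTER_PORT from the environment.
    dist.init_process_group(backend='gloo', rank=rank, world_size=args.world_size)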
19 changes: 7 additions & 12 deletions eessi/testsuite/tests/apps/PyTorch/PyTorch_torchvision.py
@@ -2,7 +2,7 @@
import reframe.utility.sanity as sn

from eessi.testsuite import hooks
from eessi.testsuite.constants import SCALES, TAGS, DEVICE_TYPES, COMPUTE_UNIT, CPU, CPU_SOCKET, GPU
from eessi.testsuite.constants import SCALES, TAGS, DEVICE_TYPES, COMPUTE_UNIT, CPU, NUMA_NODE, GPU, INVALID_SYSTEM
from eessi.testsuite.utils import find_modules, log

class PyTorch_torchvision(rfm.RunOnlyRegressionTest):
@@ -66,7 +66,7 @@ def apply_setup_hooks(self):
else:
# Hybrid code, so launch 1 rank per socket.
# Probably, launching 1 task per NUMA domain is even better, but the current hook doesn't support it
hooks.assign_tasks_per_compute_unit(test=self, compute_unit=COMPUTE_UNIT[CPU_SOCKET])
hooks.assign_tasks_per_compute_unit(test=self, compute_unit=COMPUTE_UNIT[NUMA_NODE])

# This is a hybrid test, binding is important for performance
hooks.set_compact_process_binding(self)
@@ -76,15 +76,10 @@ def set_ddp_env_vars(self):
# Set environment variables for PyTorch DDP
### TODO: THIS WILL ONLY WORK WITH SLURM, WE SHOULD MAKE A SKIP_IF BASED ON THE SCHEDULER
if self.parallel_strategy == 'ddp':
self.prerun_cmds = [
'export MASTER_PORT=$(expr 10000 + $(echo -n $SLURM_JOBID | tail -c 4))',
'export WORLD_SIZE=%s' % self.num_tasks,
'echo "WORLD_SIZE="${WORLD_SIZE}',
'master_addr=$(scontrol show hostnames "$SLURM_JOB_NODELIST" | head -n 1)',
'export MASTER_ADDR=${master_addr}',
'echo "MASTER_ADDR"=${master_addr}',
]

# Set additional options required by DDP
self.executable_opts += ["--master-port $(python python_get_free_socket.py)"]
self.executable_opts += ["--master-address $(hostname --fqdn)"]
self.executable_opts += ["--world-size %s" % self.num_tasks]

@run_after('setup')
def filter_invalid_parameter_combinations(self):
@@ -149,5 +144,5 @@ def prepare_gpu_test(self):
def skip_hvd_plus_amp(self):
'''Skip combination of horovod and AMP, it does not work see https://github.com/horovod/horovod/issues/1417'''
if self.parallel_strategy == 'horovod' and self.precision == 'mixed':
self.valid_systems = []
self.valid_systems = [INVALID_SYSTEM]
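
The new --master-port option above is filled in with $(python python_get_free_socket.py). That helper's contents are not part of this diff; a typical free-port lookup (an illustrative sketch, not necessarily the repository's implementation) binds a socket to port 0 and prints whatever port the OS assigns:

    import socket

    # Bind to port 0 so the OS picks a currently unused port, then report it.
    with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
        s.bind(('', 0))
        print(s.getsockname()[1])

The port is released again when this snippet exits, so there is a small window in which another process could claim it before PyTorch binds to it; for a benchmark test that race is usually acceptable.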

@@ -1,14 +1,15 @@
from __future__ import print_function

import argparse
import timeit
import os
import random

import numpy as np

import torch.backends.cudnn as cudnn
import torch.nn.functional as F
import torch.optim as optim
import torch.utils.data.distributed
from torchvision import models
import timeit
import numpy as np
import os

# Benchmark settings
parser = argparse.ArgumentParser(description='PyTorch Synthetic Benchmark',
Expand Down Expand Up @@ -38,6 +39,12 @@

parser.add_argument('--use-amp', action='store_true', default=False,
help='Use PyTorch Automatic Mixed Precision (AMP)')
parser.add_argument('--world-size', type=int, default=1,
help='Define the world size for ddp')
parser.add_argument('--master-port', type=int, default=False,
help='Define a master port for ddp')
parser.add_argument('--master-address', type=str, default='localhost',
help='Define a master address for ddp')

args = parser.parse_args()
args.cuda = not args.no_cuda and torch.cuda.is_available()
@@ -46,9 +53,15 @@
print("You can't specify to use both Horovod and Pytorch DDP, exiting...")
exit(1)

# Set MASTER_ADDR and MASTER_PORT environment variables
# By doing it as part of this python script, we don't need to have the launchers export them
# This saves us from having to find a launcher-agnostic way of exporting variables
os.environ['MASTER_ADDR'] = args.master_address
os.environ['MASTER_PORT'] = '%s' % args.master_port

# Set a default rank and world size, also for when ddp and horovod are not used
rank = 0
world_size=1
world_size = args.world_size
if args.use_horovod:
import horovod.torch as hvd
hvd.init()
@@ -86,12 +99,17 @@ def cleanup():
# clean up the distributed environment
dist.destroy_process_group()

world_size = int(os.environ["SLURM_NTASKS"])
# world_size = int(os.environ["SLURM_NTASKS"])  ## No longer needed now that we pass it as an argument?
# If launched with mpirun, get rank from this
rank = int(os.environ.get("OMPI_COMM_WORLD_RANK", -1))
if rank == -1:
# Else it's launched with srun, get rank from this
rank = int(os.environ["SLURM_PROCID"])
rank = int(os.environ.get("SLURM_PROCID", -1))
if rank == -1:
err_msg = "ERROR: cannot determine local rank. This test currently only supports OpenMPI"
err_msg += " and srun as launchers. If you've configured a different launcher for your system"
err_msg += " this test will need to be extended with a method to get it's local rank for that launcher."
print(err_msg)

setup(rank, world_size)
# log(f"Group initialized? {dist.is_initialized()}", rank)
