diff --git a/eessi/testsuite/tests/apps/PyTorch/PyTorch_torchvision.py b/eessi/testsuite/tests/apps/PyTorch/PyTorch_torchvision.py
index 91d0a708..6367c63e 100644
--- a/eessi/testsuite/tests/apps/PyTorch/PyTorch_torchvision.py
+++ b/eessi/testsuite/tests/apps/PyTorch/PyTorch_torchvision.py
@@ -2,7 +2,7 @@
 import reframe.utility.sanity as sn
 
 from eessi.testsuite import hooks
-from eessi.testsuite.constants import SCALES, TAGS, DEVICE_TYPES, COMPUTE_UNIT, CPU, CPU_SOCKET, GPU
+from eessi.testsuite.constants import SCALES, TAGS, DEVICE_TYPES, COMPUTE_UNIT, CPU, NUMA_NODE, GPU, INVALID_SYSTEM
 from eessi.testsuite.utils import find_modules, log
 
 class PyTorch_torchvision(rfm.RunOnlyRegressionTest):
@@ -66,7 +66,7 @@ def apply_setup_hooks(self):
         else:
             # Hybrid code, so launch 1 rank per socket.
             # Probably, launching 1 task per NUMA domain is even better, but the current hook doesn't support it
-            hooks.assign_tasks_per_compute_unit(test=self, compute_unit=COMPUTE_UNIT[CPU_SOCKET])
+            hooks.assign_tasks_per_compute_unit(test=self, compute_unit=COMPUTE_UNIT[NUMA_NODE])
 
         # This is a hybrid test, binding is important for performance
         hooks.set_compact_process_binding(self)
@@ -76,15 +76,10 @@ def set_ddp_env_vars(self):
         # Set environment variables for PyTorch DDP
         ### TODO: THIS WILL ONLY WORK WITH SLURM, WE SHOULD MAKE A SKIP_IF BASED ON THE SCHEDULER
         if self.parallel_strategy == 'ddp':
-            self.prerun_cmds = [
-                'export MASTER_PORT=$(expr 10000 + $(echo -n $SLURM_JOBID | tail -c 4))',
-                'export WORLD_SIZE=%s' % self.num_tasks,
-                'echo "WORLD_SIZE="${WORLD_SIZE}',
-                'master_addr=$(scontrol show hostnames "$SLURM_JOB_NODELIST" | head -n 1)',
-                'export MASTER_ADDR=${master_addr}',
-                'echo "MASTER_ADDR"=${master_addr}',
-            ]
-
+            # Set additional options required by DDP
+            self.executable_opts += ["--master-port $(python python_get_free_socket.py)"]
+            self.executable_opts += ["--master-address $(hostname --fqdn)"]
+            self.executable_opts += ["--world-size %s" % self.num_tasks]
 
     @run_after('setup')
     def filter_invalid_parameter_combinations(self):
@@ -149,5 +144,5 @@ def prepare_gpu_test(self):
     def skip_hvd_plus_amp(self):
         '''Skip combination of horovod and AMP, it does not work see https://github.com/horovod/horovod/issues/1417'''
        if self.parallel_strategy == 'horovod' and self.precision == 'mixed':
-            self.valid_systems = []
+            self.valid_systems = [INVALID_SYSTEM]
 
diff --git a/eessi/testsuite/tests/apps/PyTorch/src/pytorch_synthetic_benchmark.py b/eessi/testsuite/tests/apps/PyTorch/src/pytorch_synthetic_benchmark.py
index f3237e7d..4c0db3be 100644
--- a/eessi/testsuite/tests/apps/PyTorch/src/pytorch_synthetic_benchmark.py
+++ b/eessi/testsuite/tests/apps/PyTorch/src/pytorch_synthetic_benchmark.py
@@ -1,14 +1,15 @@
-from __future__ import print_function
-
 import argparse
+import timeit
+import os
+import random
+
+import numpy as np
+
 import torch.backends.cudnn as cudnn
 import torch.nn.functional as F
 import torch.optim as optim
 import torch.utils.data.distributed
 from torchvision import models
-import timeit
-import numpy as np
-import os
 
 # Benchmark settings
 parser = argparse.ArgumentParser(description='PyTorch Synthetic Benchmark',
@@ -38,6 +39,12 @@
 
 parser.add_argument('--use-amp', action='store_true', default=False,
                     help='Use PyTorch Automatic Mixed Precision (AMP)')
+parser.add_argument('--world-size', type=int, default=1,
+                    help='Define the world size for ddp')
+parser.add_argument('--master-port', type=int, default=False,
+                    help='Define a master port for ddp')
+parser.add_argument('--master-address', type=str, default='localhost',
+                    help='Define a master address for ddp')
 
 args = parser.parse_args()
 args.cuda = not args.no_cuda and torch.cuda.is_available()
@@ -46,9 +53,15 @@
     print("You can't specify to use both Horovod and Pytorch DDP, exiting...")
     exit(1)
 
+# Set MASTER_ADDR and MASTER_PORT environment variables
+# By doing it as part of this python script, we don't need to have the launchers export them
+# This saves us from having to find a launcher-agnostic way of exporting variables
+os.environ['MASTER_ADDR'] = args.master_address
+os.environ['MASTER_PORT'] = '%s' % args.master_port
+
 # Set a default rank and world size, also for when ddp and horovod are not used
 rank = 0
-world_size=1
+world_size = args.world_size
 if args.use_horovod:
     import horovod.torch as hvd
     hvd.init()
@@ -86,12 +99,17 @@ def cleanup():
         # clean up the distributed environment
         dist.destroy_process_group()
 
-    world_size = int(os.environ["SLURM_NTASKS"])
+    # world_size = int(os.environ["SLURM_NTASKS"])  # No longer needed now that we pass it as an argument
     # If launched with mpirun, get rank from this
    rank = int(os.environ.get("OMPI_COMM_WORLD_RANK", -1))
     if rank == -1:
         # Else it's launched with srun, get rank from this
-        rank = int(os.environ["SLURM_PROCID"])
+        rank = int(os.environ.get("SLURM_PROCID", -1))
+        if rank == -1:
+            err_msg = "ERROR: cannot determine local rank. This test currently only supports OpenMPI"
+            err_msg += " and srun as launchers. If you've configured a different launcher for your system,"
+            err_msg += " this test will need to be extended with a method to get its local rank for that launcher."
+            print(err_msg)
 
     setup(rank, world_size)
     # log(f"Group initialized? {dist.is_initialized()}", rank)
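Note on the new --master-port option: the test now passes "--master-port $(python python_get_free_socket.py)", but that helper script is not included in this diff. Purely as an illustration (the actual helper in the test suite may differ), a minimal sketch of such a script could ask the kernel for a free ephemeral port and print it:

# python_get_free_socket.py -- hypothetical sketch, not part of this diff.
# Bind to port 0 so the OS assigns a free ephemeral port, then print the
# assigned port so the job script can capture it via command substitution.
import socket

with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
    s.bind(('', 0))                # port 0: kernel picks any free port
    print(s.getsockname()[1])      # print the chosen port number

Printing the port from a separate process leaves a small window in which another process could claim it before torch.distributed binds MASTER_PORT, so this is a best-effort approach rather than a guarantee.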