diff --git a/eessi/testsuite/hooks.py b/eessi/testsuite/hooks.py
index a354cba8..fd82087f 100644
--- a/eessi/testsuite/hooks.py
+++ b/eessi/testsuite/hooks.py
@@ -58,7 +58,8 @@ def _assign_default_num_gpus_per_node(test: rfm.RegressionTest):
 
 def assign_tasks_per_compute_unit(test: rfm.RegressionTest, compute_unit: str, num_per: int = 1):
     """
-    Assign one task per compute unit.
+    Assign one task per compute unit. More than 1 task per compute unit can be assigned with
+    num_per for compute units that support it.
     Automatically sets num_tasks, num_tasks_per_node, num_cpus_per_task, and num_gpus_per_node,
     based on the current scale and the current partition's num_cpus, max_avail_gpus_per_node and num_nodes.
     For GPU tests, one task per GPU is set, and num_cpus_per_task is based on the ratio of CPU-cores/GPUs.
@@ -80,7 +81,7 @@ def assign_tasks_per_compute_unit(test: rfm.RegressionTest, compute_unit: str, n
     - assign_tasks_per_compute_unit(test, COMPUTE_UNIT[CPU_SOCKET]) will launch 2 tasks with 64 threads per task
     """
 
-    if num_per != 1 and compute_unit in [COMPUTE_UNIT[GPU], COMPUTE_UNIT[CPU], COMPUTE_UNIT[CPU_SOCKET]]:
+    if num_per != 1 and compute_unit not in [COMPUTE_UNIT[NODE]]:
         raise NotImplementedError(
             f'Non-default num_per {num_per} is not implemented for compute_unit {compute_unit}.')
 
diff --git a/eessi/testsuite/tests/apps/PyTorch/PyTorch_torchvision.py b/eessi/testsuite/tests/apps/PyTorch/PyTorch_torchvision.py
index 575527cf..2235ac36 100644
--- a/eessi/testsuite/tests/apps/PyTorch/PyTorch_torchvision.py
+++ b/eessi/testsuite/tests/apps/PyTorch/PyTorch_torchvision.py
@@ -63,19 +63,18 @@ def apply_setup_hooks(self):
         if self.compute_device == DEVICE_TYPES[GPU]:
             hooks.assign_tasks_per_compute_unit(test=self, compute_unit=COMPUTE_UNIT[GPU])
         else:
-            # Hybrid code, so launch 1 rank per socket.
-            # Probably, launching 1 task per NUMA domain is even better, but the current hook doesn't support it
+            # Hybrid code, for which launching one task per NUMA_NODE is typically the most efficient
            hooks.assign_tasks_per_compute_unit(test=self, compute_unit=COMPUTE_UNIT[NUMA_NODE])
 
         # This is a hybrid test, binding is important for performance
         hooks.set_compact_process_binding(self)
 
     @run_after('setup')
-    def set_ddp_env_vars(self):
+    def set_ddp_options(self):
         # Set environment variables for PyTorch DDP
         if self.parallel_strategy == 'ddp':
             # Set additional options required by DDP
-            self.executable_opts += ["--master-port $(python python_get_free_socket.py)"]
+            self.executable_opts += ["--master-port $(python get_free_socket.py)"]
             self.executable_opts += ["--master-address $(hostname --fqdn)"]
             self.executable_opts += ["--world-size %s" % self.num_tasks]
 
@@ -96,15 +95,6 @@ def pass_parallel_strategy(self):
         if self.num_tasks != 1:
             self.executable_opts += ['--use-%s' % self.parallel_strategy]
 
-    @run_after('setup')
-    def avoid_horovod_cpu_contention(self):
-        # Horovod had issues with CPU performance, see https://github.com/horovod/horovod/issues/2804
-        # The root cause is Horovod having two threads with very high utilization, which interferes with
-        # the compute threads. It was fixed, but seems to be broken again in Horovod 0.28.1
-        # The easiest workaround is to reduce the number of compute threads by 2
-        if self.compute_device == DEVICE_TYPES[CPU] and self.parallel_strategy == 'horovod':
-            self.env_vars['OMP_NUM_THREADS'] = max(self.num_cpus_per_task - 2, 2)  # Never go below 2 compute threads
-
     @sanity_function
     def assert_num_ranks(self):
         '''Assert that the number of reported CPUs/GPUs used is correct'''
@@ -140,8 +130,3 @@ def prepare_gpu_test(self):
         if self.precision == 'mixed':
             self.executable_opts += ['--use-amp']
 
-    @run_after('init')
-    def skip_hvd_plus_amp(self):
-        '''Skip combination of horovod and AMP, it does not work see https://github.com/horovod/horovod/issues/1417'''
-        if self.parallel_strategy == 'horovod' and self.precision == 'mixed':
-            self.valid_systems = [INVALID_SYSTEM]
diff --git a/eessi/testsuite/tests/apps/PyTorch/src/python_get_free_socket.py b/eessi/testsuite/tests/apps/PyTorch/src/get_free_socket.py
similarity index 100%
rename from eessi/testsuite/tests/apps/PyTorch/src/python_get_free_socket.py
rename to eessi/testsuite/tests/apps/PyTorch/src/get_free_socket.py
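
For illustration only, not part of the patch: a minimal sketch of a ReFrame test that relies on the relaxed num_per check in assign_tasks_per_compute_unit. The test class, executable and sanity check are hypothetical, the hooks/constants imports are assumed to follow the eessi.testsuite layout used by the files above, and the scale parameterisation of the real EESSI tests is omitted.

import reframe as rfm
import reframe.utility.sanity as sn
from reframe.core.builtins import run_after, sanity_function

from eessi.testsuite import hooks
from eessi.testsuite.constants import COMPUTE_UNIT, NODE


@rfm.simple_test
class ExampleTasksPerNodeTest(rfm.RunOnlyRegressionTest):
    # Hypothetical minimal test: report the hostname from every launched task
    valid_systems = ['*']
    valid_prog_environs = ['*']
    executable = 'hostname'

    @run_after('setup')
    def assign_tasks(self):
        # Launch 4 tasks per node; after this change, num_per != 1 is accepted only
        # for COMPUTE_UNIT[NODE], any other compute unit raises NotImplementedError
        hooks.assign_tasks_per_compute_unit(test=self, compute_unit=COMPUTE_UNIT[NODE], num_per=4)

    @sanity_function
    def assert_output(self):
        return sn.assert_found(r'\S+', self.stdout)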