Skip to content

Commit

Permalink
Fix comments from review by Sam
Browse files Browse the repository at this point in the history
  • Loading branch information
Caspar van Leeuwen committed Jul 1, 2024
1 parent 11146ef commit 8298e6a
Show file tree
Hide file tree
Showing 3 changed files with 6 additions and 20 deletions.
5 changes: 3 additions & 2 deletions eessi/testsuite/hooks.py
Original file line number Diff line number Diff line change
Expand Up @@ -58,7 +58,8 @@ def _assign_default_num_gpus_per_node(test: rfm.RegressionTest):

def assign_tasks_per_compute_unit(test: rfm.RegressionTest, compute_unit: str, num_per: int = 1):
"""
Assign one task per compute unit.
Assign one task per compute unit. More than 1 task per compute unit can be assigned with
num_per for compute units that support it.
Automatically sets num_tasks, num_tasks_per_node, num_cpus_per_task, and num_gpus_per_node,
based on the current scale and the current partition’s num_cpus, max_avail_gpus_per_node and num_nodes.
For GPU tests, one task per GPU is set, and num_cpus_per_task is based on the ratio of CPU-cores/GPUs.
Expand All @@ -80,7 +81,7 @@ def assign_tasks_per_compute_unit(test: rfm.RegressionTest, compute_unit: str, n
- assign_tasks_per_compute_unit(test, COMPUTE_UNIT[CPU_SOCKET]) will launch 2 tasks with 64 threads per task
"""
if num_per != 1 and compute_unit in [COMPUTE_UNIT[GPU], COMPUTE_UNIT[CPU], COMPUTE_UNIT[CPU_SOCKET]]:
if num_per != 1 and compute_unit not in [COMPUTE_UNIT[NODE]]:
raise NotImplementedError(
f'Non-default num_per {num_per} is not implemented for compute_unit {compute_unit}.')

Expand Down
21 changes: 3 additions & 18 deletions eessi/testsuite/tests/apps/PyTorch/PyTorch_torchvision.py
Original file line number Diff line number Diff line change
Expand Up @@ -63,19 +63,18 @@ def apply_setup_hooks(self):
if self.compute_device == DEVICE_TYPES[GPU]:
hooks.assign_tasks_per_compute_unit(test=self, compute_unit=COMPUTE_UNIT[GPU])
else:
# Hybrid code, so launch 1 rank per socket.
# Probably, launching 1 task per NUMA domain is even better, but the current hook doesn't support it
# Hybrid code, for which launching one task per NUMA_NODE is typically the most efficient
hooks.assign_tasks_per_compute_unit(test=self, compute_unit=COMPUTE_UNIT[NUMA_NODE])

# This is a hybrid test, binding is important for performance
hooks.set_compact_process_binding(self)

@run_after('setup')
def set_ddp_options(self):
    """
    Pass the extra launch options that PyTorch DDP requires.

    Only applies when parallel_strategy is 'ddp': appends the master port,
    master address and world size to the executable options. The port and
    address are resolved at job runtime via shell command substitution,
    since neither can be known at submission time.
    """
    # Set environment variables for PyTorch DDP
    if self.parallel_strategy == 'ddp':
        # Set additional options required by DDP
        self.executable_opts += ["--master-port $(python get_free_socket.py)"]
        self.executable_opts += ["--master-address $(hostname --fqdn)"]
        self.executable_opts += ["--world-size %s" % self.num_tasks]

Expand All @@ -96,15 +95,6 @@ def pass_parallel_strategy(self):
if self.num_tasks != 1:
self.executable_opts += ['--use-%s' % self.parallel_strategy]

@run_after('setup')
def avoid_horovod_cpu_contention(self):
    """
    Work around Horovod CPU contention by reserving two cores for its
    service threads.

    Horovod had issues with CPU performance, see
    https://github.com/horovod/horovod/issues/2804
    The root cause is Horovod having two threads with very high utilization,
    which interferes with the compute threads. It was fixed, but seems to be
    broken again in Horovod 0.28.1. The easiest workaround is to reduce the
    number of compute threads by 2.
    """
    # Only relevant for CPU runs that use the horovod parallel strategy
    if self.compute_device != DEVICE_TYPES[CPU] or self.parallel_strategy != 'horovod':
        return
    # Never go below 2 compute threads
    compute_threads = max(self.num_cpus_per_task - 2, 2)
    self.env_vars['OMP_NUM_THREADS'] = compute_threads

@sanity_function
def assert_num_ranks(self):
'''Assert that the number of reported CPUs/GPUs used is correct'''
Expand Down Expand Up @@ -140,8 +130,3 @@ def prepare_gpu_test(self):
if self.precision == 'mixed':
self.executable_opts += ['--use-amp']

@run_after('init')
def skip_hvd_plus_amp(self):
    '''Skip combination of horovod and AMP, it does not work see https://github.com/horovod/horovod/issues/1417'''
    uses_horovod = self.parallel_strategy == 'horovod'
    uses_amp = self.precision == 'mixed'
    if uses_horovod and uses_amp:
        # Invalidate the test for all systems rather than letting it fail
        self.valid_systems = [INVALID_SYSTEM]

0 comments on commit 8298e6a

Please sign in to comment.