diff --git a/eessi/testsuite/hooks.py b/eessi/testsuite/hooks.py
index a354cba8..fd82087f 100644
--- a/eessi/testsuite/hooks.py
+++ b/eessi/testsuite/hooks.py
@@ -58,7 +58,8 @@ def _assign_default_num_gpus_per_node(test: rfm.RegressionTest):
 
 def assign_tasks_per_compute_unit(test: rfm.RegressionTest, compute_unit: str, num_per: int = 1):
     """
-    Assign one task per compute unit.
+    Assign one task per compute unit. More than 1 task per compute unit can be assigned with
+    num_per for compute units that support it.
     Automatically sets num_tasks, num_tasks_per_node, num_cpus_per_task, and num_gpus_per_node,
     based on the current scale and the current partition's num_cpus, max_avail_gpus_per_node and num_nodes.
     For GPU tests, one task per GPU is set, and num_cpus_per_task is based on the ratio of CPU-cores/GPUs.
@@ -80,7 +81,7 @@ def assign_tasks_per_compute_unit(test: rfm.RegressionTest, compute_unit: str, n
     - assign_tasks_per_compute_unit(test, COMPUTE_UNIT[CPU_SOCKET]) will launch 2 tasks with 64 threads per task
     """
 
-    if num_per != 1 and compute_unit in [COMPUTE_UNIT[GPU], COMPUTE_UNIT[CPU], COMPUTE_UNIT[CPU_SOCKET]]:
+    if num_per != 1 and compute_unit not in [COMPUTE_UNIT[NODE]]:
         raise NotImplementedError(
             f'Non-default num_per {num_per} is not implemented for compute_unit {compute_unit}.')
 
diff --git a/eessi/testsuite/tests/apps/PyTorch/PyTorch_torchvision.py b/eessi/testsuite/tests/apps/PyTorch/PyTorch_torchvision.py
index 575527cf..2235ac36 100644
--- a/eessi/testsuite/tests/apps/PyTorch/PyTorch_torchvision.py
+++ b/eessi/testsuite/tests/apps/PyTorch/PyTorch_torchvision.py
@@ -63,19 +63,18 @@ def apply_setup_hooks(self):
         if self.compute_device == DEVICE_TYPES[GPU]:
             hooks.assign_tasks_per_compute_unit(test=self, compute_unit=COMPUTE_UNIT[GPU])
         else:
-            # Hybrid code, so launch 1 rank per socket.
-            # Probably, launching 1 task per NUMA domain is even better, but the current hook doesn't support it
+            # Hybrid code, for which launching one task per NUMA_NODE is typically the most efficient
            hooks.assign_tasks_per_compute_unit(test=self, compute_unit=COMPUTE_UNIT[NUMA_NODE])
 
         # This is a hybrid test, binding is important for performance
         hooks.set_compact_process_binding(self)
 
     @run_after('setup')
-    def set_ddp_env_vars(self):
+    def set_ddp_options(self):
         # Set environment variables for PyTorch DDP
         if self.parallel_strategy == 'ddp':
             # Set additional options required by DDP
-            self.executable_opts += ["--master-port $(python python_get_free_socket.py)"]
+            self.executable_opts += ["--master-port $(python get_free_socket.py)"]
             self.executable_opts += ["--master-address $(hostname --fqdn)"]
             self.executable_opts += ["--world-size %s" % self.num_tasks]
 
@@ -96,15 +95,6 @@ def pass_parallel_strategy(self):
         if self.num_tasks != 1:
             self.executable_opts += ['--use-%s' % self.parallel_strategy]
 
-    @run_after('setup')
-    def avoid_horovod_cpu_contention(self):
-        # Horovod had issues with CPU performance, see https://github.com/horovod/horovod/issues/2804
-        # The root cause is Horovod having two threads with very high utilization, which interferes with
-        # the compute threads. It was fixed, but seems to be broken again in Horovod 0.28.1
-        # The easiest workaround is to reduce the number of compute threads by 2
-        if self.compute_device == DEVICE_TYPES[CPU] and self.parallel_strategy == 'horovod':
-            self.env_vars['OMP_NUM_THREADS'] = max(self.num_cpus_per_task - 2, 2)  # Never go below 2 compute threads
-
     @sanity_function
     def assert_num_ranks(self):
         '''Assert that the number of reported CPUs/GPUs used is correct'''
@@ -140,8 +130,3 @@ def prepare_gpu_test(self):
         if self.precision == 'mixed':
             self.executable_opts += ['--use-amp']
 
-    @run_after('init')
-    def skip_hvd_plus_amp(self):
-        '''Skip combination of horovod and AMP, it does not work see https://github.com/horovod/horovod/issues/1417'''
-        if self.parallel_strategy == 'horovod' and self.precision == 'mixed':
-            self.valid_systems = [INVALID_SYSTEM]
diff --git a/eessi/testsuite/tests/apps/PyTorch/src/python_get_free_socket.py b/eessi/testsuite/tests/apps/PyTorch/src/get_free_socket.py
similarity index 100%
rename from eessi/testsuite/tests/apps/PyTorch/src/python_get_free_socket.py
rename to eessi/testsuite/tests/apps/PyTorch/src/get_free_socket.py
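
For illustration only, not part of the patch: a minimal sketch of a ReFrame test that relies on the relaxed num_per check in assign_tasks_per_compute_unit. The test class, executable and sanity check are hypothetical, the hooks/constants imports are assumed to follow the eessi.testsuite layout used by the files above, and the scale parameterisation of the real EESSI tests is omitted.

import reframe as rfm
import reframe.utility.sanity as sn
from reframe.core.builtins import run_after, sanity_function

from eessi.testsuite import hooks
from eessi.testsuite.constants import COMPUTE_UNIT, NODE


@rfm.simple_test
class ExampleTasksPerNodeTest(rfm.RunOnlyRegressionTest):
    # Hypothetical minimal test: report the hostname from every launched task
    valid_systems = ['*']
    valid_prog_environs = ['*']
    executable = 'hostname'

    @run_after('setup')
    def assign_tasks(self):
        # Launch 4 tasks per node; after this change, num_per != 1 is accepted only
        # for COMPUTE_UNIT[NODE], any other compute unit raises NotImplementedError
        hooks.assign_tasks_per_compute_unit(test=self, compute_unit=COMPUTE_UNIT[NODE], num_per=4)

    @sanity_function
    def assert_output(self):
        return sn.assert_found(r'\S+', self.stdout)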