From b2037a9f302fbde5e6401585acc28d4451c0b25e Mon Sep 17 00:00:00 2001 From: Satish Kamath Date: Wed, 31 May 2023 12:03:20 +0200 Subject: [PATCH 01/23] Adding 1st draft of OSU test. --- .../reframe/eessi_checks/applications/osu.py | 48 +++++++++++++++++++ 1 file changed, 48 insertions(+) create mode 100644 eessi/reframe/eessi_checks/applications/osu.py diff --git a/eessi/reframe/eessi_checks/applications/osu.py b/eessi/reframe/eessi_checks/applications/osu.py new file mode 100644 index 00000000..dd6b4001 --- /dev/null +++ b/eessi/reframe/eessi_checks/applications/osu.py @@ -0,0 +1,48 @@ +""" +This module tests the binary 'osu' in available modules containing substring 'OSU-Micro-Benchmarks'. +The basic application class is taken from the hpctestlib to which extra features are added. +""" + +import reframe as rfm + +from hpctestlib.microbenchmarks.mpi.osu import osu_benchmark +from eessi_utils import hooks, utils + +@rfm.simple_test +class OSU_EESSI(osu_benchmark): + scale = parameter(utils.SCALES) + valid_prog_environs = ['default'] + valid_systems = [] + time_limit = '30m' + module_name = parameter(utils.find_modules('OSU-Mirco-Benchmarks')) + + @run_after('init') + def run_after_init(self): + """hooks to run after init phase""" + hooks.filter_tests_by_device_type( + self, + required_device_type=self.device_buffers) + hooks.set_modules(self) + hooks.set_tag_scale(self) + + @run_after('init') + def set_tag_ci(self): + if self.benchmark_info[0] =='mpi.pt2pt.osu_latency': + self.tags.add('CI') + +# TODO: Set slurm options per rack, switch. +# TODO: Override already existing message sizes if specified. + @run_after('init') + def set_executable_opts(self): + """ + Add extra executable_opts or ones that override default ones such as + message sizes, unless specified via --setvar executable_opts= + """ + bench, bench_metric = self.benchmark_info + if bench.startswith('mpi.pt2pt'): + num_default = 8 # normalized number of executable opts added by parent class (osu_benchmark) + elif self.device_buffers != 'cpu': + num_default = 10 + else: + num_default = 6 + hooks.check_custom_executable_opts(self, num_default=num_default) From 317c929e70af4425611645699da33476b464f876 Mon Sep 17 00:00:00 2001 From: Satish Kamath Date: Wed, 31 May 2023 12:15:50 +0200 Subject: [PATCH 02/23] Adjusting to changed SCALES constant --- eessi/reframe/eessi_checks/applications/osu.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/eessi/reframe/eessi_checks/applications/osu.py b/eessi/reframe/eessi_checks/applications/osu.py index dd6b4001..cb927f0c 100644 --- a/eessi/reframe/eessi_checks/applications/osu.py +++ b/eessi/reframe/eessi_checks/applications/osu.py @@ -7,10 +7,12 @@ from hpctestlib.microbenchmarks.mpi.osu import osu_benchmark from eessi_utils import hooks, utils +from eessi_utils.constants import SCALES, TAGS + @rfm.simple_test class OSU_EESSI(osu_benchmark): - scale = parameter(utils.SCALES) + scale = parameter(SCALES.keys()) valid_prog_environs = ['default'] valid_systems = [] time_limit = '30m' From 7cb3e698ffa11a2b644a94e31957f7cfa5e897a9 Mon Sep 17 00:00:00 2001 From: Satish Kamath Date: Sat, 3 Jun 2023 00:37:03 +0200 Subject: [PATCH 03/23] The OSU test is getting listed. 
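For context, "getting listed" here means that ReFrame now expands every combination of the scale and module_name parameters of the test into its own variant when the file is processed. A minimal, self-contained sketch of that mechanism (the check below is illustrative only and is not part of this change):

    import reframe as rfm
    import reframe.utility.sanity as sn


    @rfm.simple_test
    class parameter_demo(rfm.RunOnlyRegressionTest):
        # Each value of 'word' becomes a separate test variant, so two
        # entries show up when listing the checks in this file with ReFrame.
        word = parameter(['hello', 'world'])
        valid_systems = ['*']
        valid_prog_environs = ['*']
        executable = 'echo'

        @run_after('init')
        def set_opts(self):
            self.executable_opts = [self.word]

        @sanity_function
        def validate(self):
            return sn.assert_found(self.word, self.stdout)

In the OSU test the same expansion happens for every scale and for every matching OSU-Micro-Benchmarks module returned by find_modules.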
--- .../reframe/eessi_checks/applications/osu.py | 56 ++++++++++++------- 1 file changed, 37 insertions(+), 19 deletions(-) diff --git a/eessi/reframe/eessi_checks/applications/osu.py b/eessi/reframe/eessi_checks/applications/osu.py index cb927f0c..5c71d15f 100644 --- a/eessi/reframe/eessi_checks/applications/osu.py +++ b/eessi/reframe/eessi_checks/applications/osu.py @@ -3,27 +3,35 @@ The basic application class is taken from the hpctestlib to which extra features are added. """ +import os import reframe as rfm +import reframe.utility.sanity as sn from hpctestlib.microbenchmarks.mpi.osu import osu_benchmark + from eessi_utils import hooks, utils from eessi_utils.constants import SCALES, TAGS @rfm.simple_test -class OSU_EESSI(osu_benchmark): +class osu_run(osu_benchmark): + ''' Run-only OSU test ''' scale = parameter(SCALES.keys()) valid_prog_environs = ['default'] valid_systems = [] time_limit = '30m' - module_name = parameter(utils.find_modules('OSU-Mirco-Benchmarks')) + module_name = parameter(utils.find_modules('OSU-Micro-Benchmarks')) +# This is required by the base class and needs to at least have a default value +# which means that this needs to be assigned or re-assigned in the class based +# on other options +# osu_benchmark.num_tasks = 2 @run_after('init') def run_after_init(self): """hooks to run after init phase""" - hooks.filter_tests_by_device_type( - self, - required_device_type=self.device_buffers) + hooks.filter_valid_systems_by_device_type( + self, + required_device_type=self.device_buffers) hooks.set_modules(self) hooks.set_tag_scale(self) @@ -32,19 +40,29 @@ def set_tag_ci(self): if self.benchmark_info[0] =='mpi.pt2pt.osu_latency': self.tags.add('CI') +# @run_after('setup') +# def set_executable_opts(self): +# """ +# Add extra executable_opts or ones that override default ones such as +# message sizes, unless specified via --setvar executable_opts= +# """ +# bench, bench_metric = self.benchmark_info +# if bench.startswith('mpi.pt2pt'): +# num_default = 8 # normalized number of executable opts added by parent class (osu_benchmark) +# elif self.device_buffers != 'cpu': +# num_default = 10 +# else: +# num_default = 6 +# hooks.check_custom_executable_opts(self, num_default=num_default) + +# @run_after('setup') +# def run_after_setup(self): +# """Hooks to run after the setup phase""" +# +# # Calculate default requested resources based on the scale: +# # 1 task per CPU for CPU-only tests, 1 task per GPU for GPU tests. +# # Also support setting the resources on the cmd line. +# hooks.assign_one_task_per_compute_unit(test=self, compute_unit=self.nb_impl) + # TODO: Set slurm options per rack, switch. # TODO: Override already existing message sizes if specified. - @run_after('init') - def set_executable_opts(self): - """ - Add extra executable_opts or ones that override default ones such as - message sizes, unless specified via --setvar executable_opts= - """ - bench, bench_metric = self.benchmark_info - if bench.startswith('mpi.pt2pt'): - num_default = 8 # normalized number of executable opts added by parent class (osu_benchmark) - elif self.device_buffers != 'cpu': - num_default = 10 - else: - num_default = 6 - hooks.check_custom_executable_opts(self, num_default=num_default) From d952151f6b3a08ffc75b7141de5a4bb53dd4b215 Mon Sep 17 00:00:00 2001 From: Satish Kamath Date: Thu, 10 Aug 2023 13:34:54 +0200 Subject: [PATCH 04/23] Stashing for changing the branch. 
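The diff below adds a first helper that narrows down which SCALES entries make sense for a 2-process point-to-point benchmark. A standalone sketch of the idea, detached from ReFrame (the SCALES entries here are illustrative, not the real constants from eessi.testsuite.constants):

    # Illustrative stand-in for the SCALES constant
    SCALES = {
        '1_core': {'num_nodes': 1, 'num_cpus_per_node': 1},
        '2_cores': {'num_nodes': 1, 'num_cpus_per_node': 2},
        '1_node': {'num_nodes': 1, 'node_part': 1},
        '2_nodes': {'num_nodes': 2, 'node_part': 1},
        '4_nodes': {'num_nodes': 4, 'node_part': 1},
    }

    def filter_scales_pt2pt(scales):
        """Keep only scales that can host exactly two communicating ranks."""
        return {
            name: spec for name, spec in scales.items()
            if name not in ('1_core', '4_cores') and spec['num_nodes'] <= 2
        }

    print(list(filter_scales_pt2pt(SCALES)))  # ['2_cores', '1_node', '2_nodes']

Building a new dict instead of popping keys from the shared constant avoids mutating SCALES for other tests; a later commit in this series adds a copy() for the same reason.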
--- eessi/reframe/eessi_checks/applications/osu.py | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) diff --git a/eessi/reframe/eessi_checks/applications/osu.py b/eessi/reframe/eessi_checks/applications/osu.py index 5c71d15f..792bdd71 100644 --- a/eessi/reframe/eessi_checks/applications/osu.py +++ b/eessi/reframe/eessi_checks/applications/osu.py @@ -12,11 +12,21 @@ from eessi_utils import hooks, utils from eessi_utils.constants import SCALES, TAGS +def my_filtering_function(test: rfm.RegressionTest): + """ + Filtering function for filtering scales for the OSU test + """ + for key in list(SCALES): + if(key == '1_core' or key == '4_cores' or SCALES.get(key).get('num_nodes') > 2): + test.scale_filtered.pop(key) + return test.scale_filtered + @rfm.simple_test -class osu_run(osu_benchmark): +class osu_pt_2_pt(osu_benchmark): ''' Run-only OSU test ''' - scale = parameter(SCALES.keys()) + scale_filtered = SCALES + scale = parameter(my_filtering_function()) valid_prog_environs = ['default'] valid_systems = [] time_limit = '30m' @@ -33,13 +43,13 @@ def run_after_init(self): self, required_device_type=self.device_buffers) hooks.set_modules(self) - hooks.set_tag_scale(self) @run_after('init') def set_tag_ci(self): if self.benchmark_info[0] =='mpi.pt2pt.osu_latency': self.tags.add('CI') + # @run_after('setup') # def set_executable_opts(self): # """ From 4165aff882c249463ed258716c7ce3e67a98b6d4 Mon Sep 17 00:00:00 2001 From: Satish Kamath Date: Tue, 22 Aug 2023 00:12:21 +0200 Subject: [PATCH 05/23] Completed point to point tests, on Snellius. Yet to be tested on other systems --- .../reframe/eessi_checks/applications/osu.py | 41 +++++++++++++++---- 1 file changed, 33 insertions(+), 8 deletions(-) diff --git a/eessi/reframe/eessi_checks/applications/osu.py b/eessi/reframe/eessi_checks/applications/osu.py index 792bdd71..4359b153 100644 --- a/eessi/reframe/eessi_checks/applications/osu.py +++ b/eessi/reframe/eessi_checks/applications/osu.py @@ -10,22 +10,26 @@ from hpctestlib.microbenchmarks.mpi.osu import osu_benchmark from eessi_utils import hooks, utils -from eessi_utils.constants import SCALES, TAGS +from eessi_utils.constants import SCALES, TAGS, DEVICES -def my_filtering_function(test: rfm.RegressionTest): +def my_filtering_function(): """ Filtering function for filtering scales for the OSU test """ + scale_filtered = SCALES for key in list(SCALES): - if(key == '1_core' or key == '4_cores' or SCALES.get(key).get('num_nodes') > 2): - test.scale_filtered.pop(key) - return test.scale_filtered + if(key == '1_core' or key == '4_cores' or + SCALES.get(key).get('num_nodes') > 2): + scale_filtered.pop(key) + elif('node_part' in SCALES.get(key)): + if(SCALES.get(key).get('node_part') > 1): + scale_filtered.pop(key) + return scale_filtered @rfm.simple_test class osu_pt_2_pt(osu_benchmark): ''' Run-only OSU test ''' - scale_filtered = SCALES scale = parameter(my_filtering_function()) valid_prog_environs = ['default'] valid_systems = [] @@ -35,7 +39,6 @@ class osu_pt_2_pt(osu_benchmark): # which means that this needs to be assigned or re-assigned in the class based # on other options # osu_benchmark.num_tasks = 2 - @run_after('init') def run_after_init(self): """hooks to run after init phase""" @@ -46,10 +49,32 @@ def run_after_init(self): @run_after('init') def set_tag_ci(self): - if self.benchmark_info[0] =='mpi.pt2pt.osu_latency': + if (self.benchmark_info[0] == 'mpi.pt2pt.osu_latency' or + self.benchmark_info[0] == 'mpi.pt2pt.osu_bw'): self.tags.add('CI') + @run_after('init') + def 
set_num_tasks_per_node(self): + if(SCALES.get(self.scale).get('num_nodes') == 1): + self.num_tasks_per_node = 2 + + + @run_after('setup') + def set_num_gpus_per_node(self): + """ + This test does not require gpus and is for host to host within GPU + nodes. But some systems do require a GPU allocation for to perform any + activity in the GPU nodes. + """ + if('gpu' in self.current_partition.features): + if(SCALES.get(self.scale).get('num_nodes') == 1): + self.num_gpus_per_node = 1 + else: + self.num_gpus_per_node = \ + self.current_partition.devices[0].num_devices + + # @run_after('setup') # def set_executable_opts(self): # """ From 8d5586e5611d5f86d86f00e4d02ecf0ed4e63add Mon Sep 17 00:00:00 2001 From: Satish Kamath Date: Fri, 25 Aug 2023 19:54:56 +0200 Subject: [PATCH 06/23] OSU pt2pt works generically for both CUDA and non-CUDA modules. There can of course be further improvements where more types of devices are present and also for slurm options based on system architechture. --- eessi/testsuite/tests/apps/osu.py | 42 +++++++++++++++++++++++++------ 1 file changed, 34 insertions(+), 8 deletions(-) diff --git a/eessi/testsuite/tests/apps/osu.py b/eessi/testsuite/tests/apps/osu.py index 4359b153..a33b2fdb 100644 --- a/eessi/testsuite/tests/apps/osu.py +++ b/eessi/testsuite/tests/apps/osu.py @@ -9,8 +9,8 @@ from hpctestlib.microbenchmarks.mpi.osu import osu_benchmark -from eessi_utils import hooks, utils -from eessi_utils.constants import SCALES, TAGS, DEVICES +from eessi.testsuite import hooks, utils +from eessi.testsuite.constants import SCALES, TAGS, DEVICE_TYPES def my_filtering_function(): """ @@ -35,16 +35,26 @@ class osu_pt_2_pt(osu_benchmark): valid_systems = [] time_limit = '30m' module_name = parameter(utils.find_modules('OSU-Micro-Benchmarks')) -# This is required by the base class and needs to at least have a default value -# which means that this needs to be assigned or re-assigned in the class based -# on other options -# osu_benchmark.num_tasks = 2 + # Device type for non-cuda OSU-Micro-Benchmarks should run on hosts of both + # node types. To do this the default device type is set to GPU. + device_type = DEVICE_TYPES['GPU'] + + @run_after('init') def run_after_init(self): """hooks to run after init phase""" hooks.filter_valid_systems_by_device_type( self, - required_device_type=self.device_buffers) + required_device_type=self.device_type) + is_cuda_module = utils.is_cuda_required_module(self.module_name) + # This part of the hook is meant to be for the OSU cpu tests. + if not is_cuda_module and self.device_type == DEVICE_TYPES['GPU']: + self.valid_systems = ['*'] + self.device_buffers = 'cpu' + elif is_cuda_module and self.device_type == DEVICE_TYPES['GPU']: + # Currently the device buffer is hard coded to be cuda. More + # options need to be introduced based on vendor and device type. + self.device_buffers = 'cuda' hooks.set_modules(self) @run_after('init') @@ -67,14 +77,30 @@ def set_num_gpus_per_node(self): nodes. But some systems do require a GPU allocation for to perform any activity in the GPU nodes. """ - if('gpu' in self.current_partition.features): + if('gpu' in self.current_partition.features and + not utils.is_cuda_required_module(self.module_name)): if(SCALES.get(self.scale).get('num_nodes') == 1): self.num_gpus_per_node = 1 else: + # The devices section is sort of hard coded. This needs to be + # amended for a more heterogeneous system with more than one + # device type. 
+ self.num_gpus_per_node = \ + self.current_partition.devices[0].num_devices + elif('gpu' in self.current_partition.features and + utils.is_cuda_required_module(self.module_name)): + if(SCALES.get(self.scale).get('num_nodes') == 1): + self.num_gpus_per_node = 2 + else: + # The devices section is sort of hard coded. This needs to be + # amended for a more heterogeneous system with more than one + # device type. self.num_gpus_per_node = \ self.current_partition.devices[0].num_devices + + # @run_after('setup') # def set_executable_opts(self): # """ From 52b7dd9bb76434bc40392ee4877ed0fc4fe21aed Mon Sep 17 00:00:00 2001 From: Satish Kamath Date: Thu, 7 Sep 2023 11:09:07 +0200 Subject: [PATCH 07/23] Removed the collective communication tests from the pt2pt. --- eessi/testsuite/tests/apps/osu.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/eessi/testsuite/tests/apps/osu.py b/eessi/testsuite/tests/apps/osu.py index a33b2fdb..86fbfd38 100644 --- a/eessi/testsuite/tests/apps/osu.py +++ b/eessi/testsuite/tests/apps/osu.py @@ -55,6 +55,10 @@ def run_after_init(self): # Currently the device buffer is hard coded to be cuda. More # options need to be introduced based on vendor and device type. self.device_buffers = 'cuda' + # This part of the code removes the collective communication calls out + # of the run list since this test is only meant for pt2pt. + if not self.benchmark_info[0].startswith('mpi.pt2pt'): + self.valid_systems = [] hooks.set_modules(self) @run_after('init') From ad385392ea322732f8d1d5d47fe9bb56fe2d0425 Mon Sep 17 00:00:00 2001 From: Satish Kamath Date: Tue, 12 Sep 2023 05:54:59 +0200 Subject: [PATCH 08/23] Added filtering function to set num_tasks and num_tasks_per_node. --- eessi/testsuite/tests/apps/osu.py | 109 +++++++++++++++++++++++++++++- 1 file changed, 106 insertions(+), 3 deletions(-) diff --git a/eessi/testsuite/tests/apps/osu.py b/eessi/testsuite/tests/apps/osu.py index 86fbfd38..21795720 100644 --- a/eessi/testsuite/tests/apps/osu.py +++ b/eessi/testsuite/tests/apps/osu.py @@ -5,6 +5,7 @@ import os import reframe as rfm +from reframe.core.meta import parameters import reframe.utility.sanity as sn from hpctestlib.microbenchmarks.mpi.osu import osu_benchmark @@ -14,9 +15,9 @@ def my_filtering_function(): """ - Filtering function for filtering scales for the OSU test + Filtering function for filtering scales for the pt2pt OSU test """ - scale_filtered = SCALES + scale_filtered = SCALES.copy() for key in list(SCALES): if(key == '1_core' or key == '4_cores' or SCALES.get(key).get('num_nodes') > 2): @@ -27,6 +28,17 @@ def my_filtering_function(): return scale_filtered +def my_filtering_function_coll(): + """ + Filtering function for filtering scales for collective the OSU test + """ + scale_filtered = SCALES.copy() + for key in list(SCALES): + if(key == '1_core'): + scale_filtered.pop(key) + return scale_filtered + + @rfm.simple_test class osu_pt_2_pt(osu_benchmark): ''' Run-only OSU test ''' @@ -103,6 +115,97 @@ def set_num_gpus_per_node(self): self.current_partition.devices[0].num_devices +@rfm.simple_test +class osu_coll(osu_benchmark): + ''' Run-only OSU test ''' + scale = parameter(my_filtering_function_coll()) + #scale = parameter(SCALES.keys()) + valid_prog_environs = ['default'] + valid_systems = [] + time_limit = '30m' + module_name = parameter(utils.find_modules('OSU-Micro-Benchmarks')) + # Device type for non-cuda OSU-Micro-Benchmarks should run on hosts of both + # node types. To do this the default device type is set to GPU. 
+ device_type = DEVICE_TYPES['GPU'] + + + @run_after('init') + def run_after_init(self): + """hooks to run after init phase""" + hooks.filter_valid_systems_by_device_type( + self, + required_device_type=self.device_type) + is_cuda_module = utils.is_cuda_required_module(self.module_name) + # This part of the hook is meant to be for the OSU cpu tests. + if not is_cuda_module and self.device_type == DEVICE_TYPES['GPU']: + self.valid_systems = ['*'] + self.device_buffers = 'cpu' + elif is_cuda_module and self.device_type == DEVICE_TYPES['GPU']: + # Currently the device buffer is hard coded to be cuda. More + # options need to be introduced based on vendor and device type. + self.device_buffers = 'cuda' + # This part of the code removes the collective communication calls out + # of the run list since this test is only meant for collective. + if not self.benchmark_info[0].startswith('mpi.collective'): + self.valid_systems = [] + hooks.set_modules(self) + + + @run_after('init') + def set_tag_ci(self): + if (self.benchmark_info[0] == 'mpi.collective.osu_allreduce' or + self.benchmark_info[0] == 'mpi.collective.osu_alltoall'): + self.tags.add('CI') + + + @run_after('init') + def set_num_tasks(self): + hooks.set_tag_scale(self) + + + @run_after('setup') + def run_after_setup(self): + """Hooks to run after the setup phase""" + # Calculate default requested resources based on the scale: + # 1 task per CPU for CPU-only tests, 1 task per GPU for GPU tests. + # Also support setting the resources on the cmd line. + # CPU settings for cpu based tests + # Setting num_tasks + max_avail_cpus_per_node = self.current_partition.processor.num_cpus + self.num_tasks = max_avail_cpus_per_node * SCALES.get(self.scale).get('num_nodes') + if (SCALES.get(self.scale).get('node_part') is not None): + self.num_tasks = int(self.num_tasks/SCALES.get(self.scale).get('node_part')) + elif (SCALES.get(self.scale).get('num_cpus_per_node') is not None): + self.num_tasks = SCALES.get(self.scale).get('num_cpus_per_node') + + # Setting num_tasks_per_node + if (SCALES.get(self.scale).get('num_nodes') == 1): + self.num_tasks_per_node = self.num_tasks + else: + self.num_tasks_per_node = max_avail_cpus_per_node + + + @run_after('setup') + def set_num_gpus_per_node(self): + """ + This test does not require gpus and is for host to host within GPU + nodes. But some systems do require a GPU allocation for to perform any + activity in the GPU nodes. + """ + if('gpu' in self.current_partition.features and + not utils.is_cuda_required_module(self.module_name)): + if(SCALES.get(self.scale).get('num_nodes') == 1): + self.num_gpus_per_node = 1 + else: + # The devices section is sort of hard coded. This needs to be + # amended for a more heterogeneous system with more than one + # device type. + self.num_gpus_per_node = \ + self.current_partition.devices[0].num_devices + elif('gpu' in self.current_partition.features and + utils.is_cuda_required_module(self.module_name)): + self.num_gpus_per_node = \ + self.current_partition.devices[0].num_devices # @run_after('setup') @@ -119,7 +222,7 @@ def set_num_gpus_per_node(self): # else: # num_default = 6 # hooks.check_custom_executable_opts(self, num_default=num_default) - + # @run_after('setup') # def run_after_setup(self): # """Hooks to run after the setup phase""" From b83afa4c7effdc83e84d8eecf1689c271652eb02 Mon Sep 17 00:00:00 2001 From: Satish Kamath Date: Wed, 11 Oct 2023 16:20:51 +0200 Subject: [PATCH 09/23] Adding a sub section for the CUDA tests which divides the node based on the CPUs specified. 
For example, if a 2 core option is chosen then, 2 GPU devices are chosen if available otherwise the test is skipped. --- eessi/testsuite/tests/apps/osu.py | 62 ++++++++++++++++++++++++++++++- 1 file changed, 61 insertions(+), 1 deletion(-) diff --git a/eessi/testsuite/tests/apps/osu.py b/eessi/testsuite/tests/apps/osu.py index 21795720..b75ba38e 100644 --- a/eessi/testsuite/tests/apps/osu.py +++ b/eessi/testsuite/tests/apps/osu.py @@ -184,6 +184,41 @@ def run_after_setup(self): else: self.num_tasks_per_node = max_avail_cpus_per_node + # The above setting is for all CPU tests including the ones occurring + # in the GPU nodes. This section is specifically for GPU tests the + # num_tasks should be equal to num gpus per node. + if('gpu' in self.current_partition.features and + utils.is_cuda_required_module(self.module_name)): + max_avail_gpus_per_node = \ + self.current_partition.devices[0].num_devices + if(max_avail_gpus_per_node == 1 and + SCALES.get(self.scale).get('num_nodes') == 1): + raise ValueError( + "There is only 1 device within the node." + "There is no point of performing collective\ + operations on 1 device." + ) + else: + if (SCALES.get(self.scale).get('num_nodes') == 1): + if (SCALES.get(self.scale).get('node_part') is not None): + self.num_tasks = int(max_avail_gpus_per_node / + SCALES.get(self.scale).get('node_part')) + self.skip_if(self.num_tasks <= 1, + msg="There are not enough GPU cards to be divided") + elif (SCALES.get(self.scale).get('num_cpus_per_node') is not None): + if(SCALES.get(self.scale).get('num_cpus_per_node') >= + max_avail_gpus_per_node): + self.num_tasks = self.num_tasks_per_node =\ + max_avail_gpus_per_node + else: + self.num_tasks = \ + SCALES.get(self.scale).get('num_cpus_per_node') + self.num_tasks_per_node = self.num_tasks + + else: + self.num_tasks = SCALES.get(self.scale).get('num_nodes') *\ + max_avail_gpus_per_node + self.num_tasks_per_node = max_avail_gpus_per_node @run_after('setup') def set_num_gpus_per_node(self): @@ -204,8 +239,33 @@ def set_num_gpus_per_node(self): self.current_partition.devices[0].num_devices elif('gpu' in self.current_partition.features and utils.is_cuda_required_module(self.module_name)): - self.num_gpus_per_node = \ + max_avail_gpus_per_node = \ self.current_partition.devices[0].num_devices + if(max_avail_gpus_per_node == 1 and + SCALES.get(self.scale).get('num_nodes') == 1): + raise ValueError( + "There is only 1 device within the node." + "There is no point of performing collective\ + operations on 1 device." + ) + else: + if (SCALES.get(self.scale).get('num_nodes') == 1): + if (SCALES.get(self.scale).get('node_part') is not None): + self.num_gpus_per_node = int(max_avail_gpus_per_node / + SCALES.get(self.scale).get('node_part')) + self.skip_if(self.num_gpus_per_node <= 1, + msg="There are not enough GPU cards to be divided") + elif (SCALES.get(self.scale).get('num_cpus_per_node') is not None): + if(SCALES.get(self.scale).get('num_cpus_per_node') >= + max_avail_gpus_per_node): + self.num_gpus_per_node =\ + max_avail_gpus_per_node + else: + self.num_gpus_per_node = \ + SCALES.get(self.scale).get('num_cpus_per_node') + + else: + self.num_gpus_per_node = max_avail_gpus_per_node # @run_after('setup') From dcc8a86c842259674349ea2d24580d19f37db417 Mon Sep 17 00:00:00 2001 From: Satish Kamath Date: Mon, 27 Nov 2023 17:33:12 +0100 Subject: [PATCH 10/23] The point to point should work for all settings now. Even the option of memory is included. 
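The extra memory request added here uses ReFrame's extra_resources, which only takes effect if the site configuration defines a matching scheduler resource. Roughly, the partition entry needs something like the sketch below (system, partition and environment names as well as the Slurm flag are illustrative, not taken from any real site configuration):

    site_configuration = {
        'systems': [
            {
                'name': 'example_system',
                'descr': 'illustrative system definition',
                'hostnames': ['login.*'],
                'partitions': [
                    {
                        'name': 'cpu',
                        'scheduler': 'slurm',
                        'launcher': 'mpirun',
                        'environs': ['default'],
                        # lets extra_resources = {'memory': {'size': '8GB'}}
                        # translate into a --mem=8GB job option
                        'resources': [
                            {'name': 'memory', 'options': ['--mem={size}']},
                        ],
                    },
                ],
            },
        ],
        'environments': [
            {'name': 'default'},
        ],
    }

To the best of my understanding, without such a resource definition the request has no effect on the generated job script, so the memory setting in the diff is best-effort.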
--- eessi/testsuite/tests/apps/osu.py | 127 ++++++++++++++++-------------- 1 file changed, 66 insertions(+), 61 deletions(-) diff --git a/eessi/testsuite/tests/apps/osu.py b/eessi/testsuite/tests/apps/osu.py index b75ba38e..c8be64b9 100644 --- a/eessi/testsuite/tests/apps/osu.py +++ b/eessi/testsuite/tests/apps/osu.py @@ -2,41 +2,33 @@ This module tests the binary 'osu' in available modules containing substring 'OSU-Micro-Benchmarks'. The basic application class is taken from the hpctestlib to which extra features are added. """ - -import os import reframe as rfm -from reframe.core.meta import parameters -import reframe.utility.sanity as sn - from hpctestlib.microbenchmarks.mpi.osu import osu_benchmark from eessi.testsuite import hooks, utils -from eessi.testsuite.constants import SCALES, TAGS, DEVICE_TYPES +from eessi.testsuite.constants import CPU, SCALES, TAGS, DEVICE_TYPES, COMPUTE_UNIT def my_filtering_function(): """ Filtering function for filtering scales for the pt2pt OSU test """ - scale_filtered = SCALES.copy() - for key in list(SCALES): - if(key == '1_core' or key == '4_cores' or - SCALES.get(key).get('num_nodes') > 2): - scale_filtered.pop(key) - elif('node_part' in SCALES.get(key)): - if(SCALES.get(key).get('node_part') > 1): - scale_filtered.pop(key) - return scale_filtered + return [ + k for (k, v) in SCALES.items() + if v['num_nodes'] * v.get('num_cpus_per_node', 0) == 2 + or (v['num_nodes'] == 2 and v.get('node_part', 0) == 1) + or (v['num_nodes'] == 1 and v.get('node_part', 0) == 1) + ] def my_filtering_function_coll(): """ Filtering function for filtering scales for collective the OSU test """ - scale_filtered = SCALES.copy() - for key in list(SCALES): - if(key == '1_core'): - scale_filtered.pop(key) - return scale_filtered + return [ + k for (k, v) in SCALES.items() + if (v['num_nodes'] * v.get('num_cpus_per_node', 1) > 1) + or (v.get('node_part', 0) > 0) + ] @rfm.simple_test @@ -75,16 +67,46 @@ def run_after_init(self): @run_after('init') def set_tag_ci(self): + """ Setting tests under CI tag. """ if (self.benchmark_info[0] == 'mpi.pt2pt.osu_latency' or self.benchmark_info[0] == 'mpi.pt2pt.osu_bw'): self.tags.add('CI') + if (self.benchmark_info[0] == 'mpi.pt2pt.osu_bw'): + self.tags.add('osu_bw') + + if (self.benchmark_info[0] == 'mpi.pt2pt.osu_latency'): + self.tags.add('osu_latency') + + @run_after('init') + def set_mem(self): + """ Setting an extra job option of memory. This test has only 4 + possibilities: 1_node, 2_nodes, 2_cores and 1_cpn_2_nodes. Only the + last 2 require the memory to be set. """ + is_cuda_module = utils.is_cuda_required_module(self.module_name) + if(SCALES.get(self.scale).get('node_part', 0) == 0): + self.extra_resources = {'memory': {'size': '8GB'}} + + @run_after('init') + def set_num_tasks(self): + """ Setting scales as tags. """ + hooks.set_tag_scale(self) @run_after('init') def set_num_tasks_per_node(self): if(SCALES.get(self.scale).get('num_nodes') == 1): self.num_tasks_per_node = 2 + else: + self.num_tasks_per_node = 1 + @run_after('setup') + def set_num_cpus_per_task(self): + """ Since num_tasks_per_node is already set. This function sets + num_cpus_per_task for 1 node and 2 node options. For """ + if(SCALES.get(self.scale).get('num_nodes') == 1 and + SCALES.get(self.scale).get('node_part', 0) == 1): + hooks.assign_one_task_per_compute_unit(self, + COMPUTE_UNIT.get(CPU, 'cpu')) @run_after('setup') def set_num_gpus_per_node(self): @@ -101,18 +123,37 @@ def set_num_gpus_per_node(self): # The devices section is sort of hard coded. 
This needs to be # amended for a more heterogeneous system with more than one # device type. + + # Even for 1_cpn_2_nodes, the gpus requested are for the full + # nodes. On Snellius 1 GPU card cannot be reserved on 2 + # different nodes which can be different on different systems. self.num_gpus_per_node = \ self.current_partition.devices[0].num_devices elif('gpu' in self.current_partition.features and utils.is_cuda_required_module(self.module_name)): + max_avail_gpus_per_node = \ + self.current_partition.devices[0].num_devices if(SCALES.get(self.scale).get('num_nodes') == 1): - self.num_gpus_per_node = 2 + # Skip the single node test if there is only 1 device in the + # node. + if(max_avail_gpus_per_node == 1): + self.skip(msg="There is only 1 device within the node. Skipping tests involving only 1 node.") + else: + self.num_gpus_per_node = 2 else: # The devices section is sort of hard coded. This needs to be # amended for a more heterogeneous system with more than one # device type. - self.num_gpus_per_node = \ - self.current_partition.devices[0].num_devices + + # Note these settings are for 1_cpn_2_nodes. In that case we + # want to test for only 1 GPU per node since we have not + # requested for full nodes. + if(SCALES.get(self.scale).get('num_gpus_per_node', 0)): + self.num_gpus_per_node = \ + SCALES.get(self.scale).get('num_gpus_per_node', 0) + else: + self.num_gpus_per_node = \ + self.current_partition.devices[0].num_devices @rfm.simple_test @@ -193,11 +234,7 @@ def run_after_setup(self): self.current_partition.devices[0].num_devices if(max_avail_gpus_per_node == 1 and SCALES.get(self.scale).get('num_nodes') == 1): - raise ValueError( - "There is only 1 device within the node." - "There is no point of performing collective\ - operations on 1 device." - ) + self.skip(msg="There is only 1 device within the node. Skipping collective tests involving only 1 node.") else: if (SCALES.get(self.scale).get('num_nodes') == 1): if (SCALES.get(self.scale).get('node_part') is not None): @@ -243,11 +280,7 @@ def set_num_gpus_per_node(self): self.current_partition.devices[0].num_devices if(max_avail_gpus_per_node == 1 and SCALES.get(self.scale).get('num_nodes') == 1): - raise ValueError( - "There is only 1 device within the node." - "There is no point of performing collective\ - operations on 1 device." - ) + self.skip(msg="There is only 1 device within the node. Skipping collective tests involving only 1 node.") else: if (SCALES.get(self.scale).get('num_nodes') == 1): if (SCALES.get(self.scale).get('node_part') is not None): @@ -266,31 +299,3 @@ def set_num_gpus_per_node(self): else: self.num_gpus_per_node = max_avail_gpus_per_node - - -# @run_after('setup') -# def set_executable_opts(self): -# """ -# Add extra executable_opts or ones that override default ones such as -# message sizes, unless specified via --setvar executable_opts= -# """ -# bench, bench_metric = self.benchmark_info -# if bench.startswith('mpi.pt2pt'): -# num_default = 8 # normalized number of executable opts added by parent class (osu_benchmark) -# elif self.device_buffers != 'cpu': -# num_default = 10 -# else: -# num_default = 6 -# hooks.check_custom_executable_opts(self, num_default=num_default) - -# @run_after('setup') -# def run_after_setup(self): -# """Hooks to run after the setup phase""" -# -# # Calculate default requested resources based on the scale: -# # 1 task per CPU for CPU-only tests, 1 task per GPU for GPU tests. -# # Also support setting the resources on the cmd line. 
-# hooks.assign_one_task_per_compute_unit(test=self, compute_unit=self.nb_impl) - -# TODO: Set slurm options per rack, switch. -# TODO: Override already existing message sizes if specified. From 3b3fed4dbb324d7baaa96dda449cf7da0234d01c Mon Sep 17 00:00:00 2001 From: Satish Kamath Date: Mon, 4 Dec 2023 14:24:39 +0100 Subject: [PATCH 11/23] Changing the required memory per node to 32GB from 8GB because it is just not enough. --- eessi/testsuite/tests/apps/osu.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/eessi/testsuite/tests/apps/osu.py b/eessi/testsuite/tests/apps/osu.py index c8be64b9..5e484076 100644 --- a/eessi/testsuite/tests/apps/osu.py +++ b/eessi/testsuite/tests/apps/osu.py @@ -85,7 +85,7 @@ def set_mem(self): last 2 require the memory to be set. """ is_cuda_module = utils.is_cuda_required_module(self.module_name) if(SCALES.get(self.scale).get('node_part', 0) == 0): - self.extra_resources = {'memory': {'size': '8GB'}} + self.extra_resources = {'memory': {'size': '32GB'}} @run_after('init') def set_num_tasks(self): From 5c30cfe507855955cd6461eea2edb1d3e645bda0 Mon Sep 17 00:00:00 2001 From: Satish Kamath Date: Mon, 4 Dec 2023 16:08:30 +0100 Subject: [PATCH 12/23] Small logical error was not asking for 2 FULL nodes with cpus-per-task. Corrrected. --- eessi/testsuite/tests/apps/osu.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/eessi/testsuite/tests/apps/osu.py b/eessi/testsuite/tests/apps/osu.py index 5e484076..95a82a18 100644 --- a/eessi/testsuite/tests/apps/osu.py +++ b/eessi/testsuite/tests/apps/osu.py @@ -102,8 +102,9 @@ def set_num_tasks_per_node(self): @run_after('setup') def set_num_cpus_per_task(self): """ Since num_tasks_per_node is already set. This function sets - num_cpus_per_task for 1 node and 2 node options. For """ - if(SCALES.get(self.scale).get('num_nodes') == 1 and + num_cpus_per_task for 1 node and 2 node options where the request is + for full nodes.""" + if(SCALES.get(self.scale).get('num_nodes') >= 1 and SCALES.get(self.scale).get('node_part', 0) == 1): hooks.assign_one_task_per_compute_unit(self, COMPUTE_UNIT.get(CPU, 'cpu')) From 47c3c9e96af4b7d97d0b9179f05742d73562dc03 Mon Sep 17 00:00:00 2001 From: Satish Kamath Date: Wed, 6 Dec 2023 17:42:23 +0100 Subject: [PATCH 13/23] Committing changes suggested by Sam. There are some open where some discussion is needed. --- eessi/testsuite/tests/apps/osu.py | 30 +++++++++++++++++++++--------- 1 file changed, 21 insertions(+), 9 deletions(-) diff --git a/eessi/testsuite/tests/apps/osu.py b/eessi/testsuite/tests/apps/osu.py index 95a82a18..eb308db7 100644 --- a/eessi/testsuite/tests/apps/osu.py +++ b/eessi/testsuite/tests/apps/osu.py @@ -6,7 +6,10 @@ from hpctestlib.microbenchmarks.mpi.osu import osu_benchmark from eessi.testsuite import hooks, utils -from eessi.testsuite.constants import CPU, SCALES, TAGS, DEVICE_TYPES, COMPUTE_UNIT +from eessi.testsuite.constants import CPU, SCALES, TAGS, DEVICE_TYPES,\ + COMPUTE_UNIT, GPU, GPU_VENDOR, FEATURES, GPU_VENDORS, NVIDIA +from eessi.testsuite.utils import find_modules, log + def my_filtering_function(): """ @@ -38,10 +41,10 @@ class osu_pt_2_pt(osu_benchmark): valid_prog_environs = ['default'] valid_systems = [] time_limit = '30m' - module_name = parameter(utils.find_modules('OSU-Micro-Benchmarks')) + module_name = parameter(find_modules('OSU-Micro-Benchmarks')) # Device type for non-cuda OSU-Micro-Benchmarks should run on hosts of both # node types. To do this the default device type is set to GPU. 
- device_type = DEVICE_TYPES['GPU'] + device_type = parameter([DEVICE_TYPES[CPU], DEVICE_TYPES[GPU]]) @run_after('init') @@ -51,14 +54,22 @@ def run_after_init(self): self, required_device_type=self.device_type) is_cuda_module = utils.is_cuda_required_module(self.module_name) - # This part of the hook is meant to be for the OSU cpu tests. - if not is_cuda_module and self.device_type == DEVICE_TYPES['GPU']: - self.valid_systems = ['*'] + # This part of the hook is meant to be for the OSU cpu tests. This is + # required since the non CUDA module should be able to run in the GPU + # partition as well. This is specific for this test and not covered by + # the function above. + if not is_cuda_module and self.device_type == DEVICE_TYPES[GPU]: + self.valid_systems = [f'+{FEATURES[GPU]} %{GPU_VENDOR}={GPU_VENDORS[NVIDIA]}'] self.device_buffers = 'cpu' - elif is_cuda_module and self.device_type == DEVICE_TYPES['GPU']: + elif is_cuda_module and self.device_type == DEVICE_TYPES[GPU]: # Currently the device buffer is hard coded to be cuda. More # options need to be introduced based on vendor and device type. self.device_buffers = 'cuda' + + # If the device_type is CPU then device buffer should always be CPU. + if self.device_type == DEVICE_TYPES[CPU]: + self.device_buffers = 'cpu' + # This part of the code removes the collective communication calls out # of the run list since this test is only meant for pt2pt. if not self.benchmark_info[0].startswith('mpi.pt2pt'): @@ -68,9 +79,10 @@ def run_after_init(self): @run_after('init') def set_tag_ci(self): """ Setting tests under CI tag. """ - if (self.benchmark_info[0] == 'mpi.pt2pt.osu_latency' or - self.benchmark_info[0] == 'mpi.pt2pt.osu_bw'): + if (self.benchmark_info[0] in ['mpi.pt2pt.osu_latency', + 'mpi.pt2pt.osu_bw']): self.tags.add('CI') + log(f'tags set to {self.tags}') if (self.benchmark_info[0] == 'mpi.pt2pt.osu_bw'): self.tags.add('osu_bw') From efe91980bc1484555b40aadefd51fe4c0901f0f5 Mon Sep 17 00:00:00 2001 From: Satish Kamath Date: Wed, 13 Dec 2023 11:33:20 +0100 Subject: [PATCH 14/23] Committing the import constants part which is trivial and just code cleaning. --- eessi/testsuite/tests/apps/osu.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/eessi/testsuite/tests/apps/osu.py b/eessi/testsuite/tests/apps/osu.py index eb308db7..42451700 100644 --- a/eessi/testsuite/tests/apps/osu.py +++ b/eessi/testsuite/tests/apps/osu.py @@ -6,8 +6,7 @@ from hpctestlib.microbenchmarks.mpi.osu import osu_benchmark from eessi.testsuite import hooks, utils -from eessi.testsuite.constants import CPU, SCALES, TAGS, DEVICE_TYPES,\ - COMPUTE_UNIT, GPU, GPU_VENDOR, FEATURES, GPU_VENDORS, NVIDIA +from eessi.testsuite.constants import * from eessi.testsuite.utils import find_modules, log From 57d6e52e898672964fdc53ae3dde4e86fac323ac Mon Sep 17 00:00:00 2001 From: Satish Kamath Date: Wed, 13 Dec 2023 18:12:37 +0100 Subject: [PATCH 15/23] Adding the latest commit with minor modifications to Sam's PR #97 within hooks. 
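For reference, the bookkeeping that _assign_num_tasks_per_node is meant to do can be sketched as a plain function outside ReFrame (numbers below are illustrative):

    def assign_num_tasks_per_node(default_num_cpus_per_node,
                                  num_tasks_per_node=None,
                                  num_cpus_per_task=None,
                                  num_per=1):
        """Fill in whichever of the two settings is missing, based on the
        cpus available per node (mirrors the hook's three base cases)."""
        if not num_tasks_per_node and not num_cpus_per_task:
            num_tasks_per_node = num_per
            num_cpus_per_task = default_num_cpus_per_node // num_tasks_per_node
        elif not num_tasks_per_node:
            num_tasks_per_node = default_num_cpus_per_node // num_cpus_per_task
        elif not num_cpus_per_task:
            num_cpus_per_task = default_num_cpus_per_node // num_tasks_per_node
        return num_tasks_per_node, num_cpus_per_task

    print(assign_num_tasks_per_node(128, num_per=2))              # (2, 64)
    print(assign_num_tasks_per_node(128, num_cpus_per_task=32))   # (4, 32)

The extra condition added in the diff below exists only because the OSU base class from hpctestlib pre-sets num_tasks_per_node = 1; a later commit in this series drops it again by unsetting that default in the test itself.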
--- eessi/testsuite/hooks.py | 6 ++++++ eessi/testsuite/tests/apps/osu.py | 22 +++++++++------------- 2 files changed, 15 insertions(+), 13 deletions(-) diff --git a/eessi/testsuite/hooks.py b/eessi/testsuite/hooks.py index 751b3ffb..65c51c87 100644 --- a/eessi/testsuite/hooks.py +++ b/eessi/testsuite/hooks.py @@ -105,6 +105,12 @@ def _assign_num_tasks_per_node(test: rfm.RegressionTest, num_per: int = 1): - num_tasks_per_node = num_per - num_cpus_per_task = test.default_num_cpus_per_node / num_tasks_per_node """ + + # OSU test inherits a default num_tasks_per_node = 1 from hpctestlib. + # There has to be a condition where a non-default value of num_per replaces the original num_tasks_per_node. + if test.num_tasks_per_node and num_per != 1: + if test.num_tasks_per_node != num_per: + test.num_tasks_per_node = num_per # neither num_tasks_per_node nor num_cpus_per_task are set if not test.num_tasks_per_node and not test.num_cpus_per_task: test.num_tasks_per_node = num_per diff --git a/eessi/testsuite/tests/apps/osu.py b/eessi/testsuite/tests/apps/osu.py index 42451700..28e6b2f8 100644 --- a/eessi/testsuite/tests/apps/osu.py +++ b/eessi/testsuite/tests/apps/osu.py @@ -103,22 +103,18 @@ def set_num_tasks(self): """ Setting scales as tags. """ hooks.set_tag_scale(self) - @run_after('init') + @run_after('setup') def set_num_tasks_per_node(self): + """ Setting number of tasks per node and cpus per task in this function. + This function sets num_cpus_per_task for 1 node and 2 node options where + the request is for full nodes.""" if(SCALES.get(self.scale).get('num_nodes') == 1): - self.num_tasks_per_node = 2 + print("test: ", self.num_tasks_per_node) + hooks.assign_tasks_per_compute_unit(self, + COMPUTE_UNIT.get(NODE, + 'node'), 2) else: - self.num_tasks_per_node = 1 - - @run_after('setup') - def set_num_cpus_per_task(self): - """ Since num_tasks_per_node is already set. This function sets - num_cpus_per_task for 1 node and 2 node options where the request is - for full nodes.""" - if(SCALES.get(self.scale).get('num_nodes') >= 1 and - SCALES.get(self.scale).get('node_part', 0) == 1): - hooks.assign_one_task_per_compute_unit(self, - COMPUTE_UNIT.get(CPU, 'cpu')) + hooks.assign_tasks_per_compute_unit(self, COMPUTE_UNIT.get(NODE, 'node')) @run_after('setup') def set_num_gpus_per_node(self): From fd48cc4caaf372bffb665835c819575511d01268 Mon Sep 17 00:00:00 2001 From: Satish Kamath Date: Wed, 13 Dec 2023 18:25:59 +0100 Subject: [PATCH 16/23] Minimal changing indentation in comments. --- eessi/testsuite/hooks.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/eessi/testsuite/hooks.py b/eessi/testsuite/hooks.py index 65c51c87..bda8ad30 100644 --- a/eessi/testsuite/hooks.py +++ b/eessi/testsuite/hooks.py @@ -106,8 +106,9 @@ def _assign_num_tasks_per_node(test: rfm.RegressionTest, num_per: int = 1): - num_cpus_per_task = test.default_num_cpus_per_node / num_tasks_per_node """ - # OSU test inherits a default num_tasks_per_node = 1 from hpctestlib. - # There has to be a condition where a non-default value of num_per replaces the original num_tasks_per_node. + # OSU test inherits a default num_tasks_per_node = 1 from hpctestlib. There + # has to be a condition where a non-default value of num_per replaces the + # original num_tasks_per_node. 
if test.num_tasks_per_node and num_per != 1: if test.num_tasks_per_node != num_per: test.num_tasks_per_node = num_per From 400c55757941c1660c0105d894142033e4f53bd4 Mon Sep 17 00:00:00 2001 From: Satish Kamath Date: Wed, 3 Jan 2024 18:04:39 +0100 Subject: [PATCH 17/23] Committing the latest changes which correspond to changes suggested by Sam. Another important change is made in the init phase to avoid OSU CUDA module being executed on a pure `cpu` node which does not have CUDA drivers. --- eessi/testsuite/hooks.py | 11 +++------- eessi/testsuite/tests/apps/osu.py | 35 ++++++++++++++++++++++--------- 2 files changed, 28 insertions(+), 18 deletions(-) diff --git a/eessi/testsuite/hooks.py b/eessi/testsuite/hooks.py index bda8ad30..c9b9e5d4 100644 --- a/eessi/testsuite/hooks.py +++ b/eessi/testsuite/hooks.py @@ -106,20 +106,15 @@ def _assign_num_tasks_per_node(test: rfm.RegressionTest, num_per: int = 1): - num_cpus_per_task = test.default_num_cpus_per_node / num_tasks_per_node """ - # OSU test inherits a default num_tasks_per_node = 1 from hpctestlib. There - # has to be a condition where a non-default value of num_per replaces the - # original num_tasks_per_node. - if test.num_tasks_per_node and num_per != 1: - if test.num_tasks_per_node != num_per: - test.num_tasks_per_node = num_per # neither num_tasks_per_node nor num_cpus_per_task are set if not test.num_tasks_per_node and not test.num_cpus_per_task: test.num_tasks_per_node = num_per - test.num_cpus_per_task = test.default_num_cpus_per_node / test.num_tasks_per_node + test.num_cpus_per_task = int(test.default_num_cpus_per_node / + test.num_tasks_per_node) # num_tasks_per_node is not set, but num_cpus_per_task is elif not test.num_tasks_per_node: - test.num_tasks_per_node = test.default_num_cpus_per_node / test.num_cpus_per_task + test.num_tasks_per_node = int(test.default_num_cpus_per_node / test.num_cpus_per_task) # num_cpus_per_task is not set, but num_tasks_per_node is elif not test.num_cpus_per_task: diff --git a/eessi/testsuite/tests/apps/osu.py b/eessi/testsuite/tests/apps/osu.py index 28e6b2f8..557767a3 100644 --- a/eessi/testsuite/tests/apps/osu.py +++ b/eessi/testsuite/tests/apps/osu.py @@ -1,6 +1,11 @@ """ -This module tests the binary 'osu' in available modules containing substring 'OSU-Micro-Benchmarks'. -The basic application class is taken from the hpctestlib to which extra features are added. +This module tests the binary 'osu' in available modules containing substring +'OSU-Micro-Benchmarks'. The basic application class is taken from the +hpctestlib to which extra features are added. + +Note: OSU-Micro-Benchmarks CUDA module binaries must be linked to stubs so that +it at the least finds libcuda.so.1 on non-GPU nodes. Otherwise those tests will +FAIL. """ import reframe as rfm from hpctestlib.microbenchmarks.mpi.osu import osu_benchmark @@ -44,7 +49,7 @@ class osu_pt_2_pt(osu_benchmark): # Device type for non-cuda OSU-Micro-Benchmarks should run on hosts of both # node types. To do this the default device type is set to GPU. device_type = parameter([DEVICE_TYPES[CPU], DEVICE_TYPES[GPU]]) - + num_tasks_per_node = None @run_after('init') def run_after_init(self): @@ -57,13 +62,23 @@ def run_after_init(self): # required since the non CUDA module should be able to run in the GPU # partition as well. This is specific for this test and not covered by # the function above. 
- if not is_cuda_module and self.device_type == DEVICE_TYPES[GPU]: - self.valid_systems = [f'+{FEATURES[GPU]} %{GPU_VENDOR}={GPU_VENDORS[NVIDIA]}'] - self.device_buffers = 'cpu' - elif is_cuda_module and self.device_type == DEVICE_TYPES[GPU]: + # if not is_cuda_module and self.device_type == DEVICE_TYPES[GPU]: + # self.valid_systems = [f'+{FEATURES[GPU]} %{GPU_VENDOR}={GPU_VENDORS[NVIDIA]}'] + # self.device_buffers = 'cpu' + # elif is_cuda_module and self.device_type == DEVICE_TYPES[GPU]: + # # Currently the device buffer is hard coded to be cuda. More + # # options need to be introduced based on vendor and device type. + # self.device_buffers = 'cuda' + if is_cuda_module and self.device_type == DEVICE_TYPES[GPU]: # Currently the device buffer is hard coded to be cuda. More # options need to be introduced based on vendor and device type. self.device_buffers = 'cuda' + elif is_cuda_module and self.device_type == DEVICE_TYPES[CPU]: + # This if condition had to be added since the CUDA compiled osu + # tests do not run on cpu partitions. The binaries need + # libcuda.so.1 during runtime which can only be found in a + # partition with CUDA drivers. + self.valid_systems = [f'+{FEATURES[CPU]} +{FEATURES[GPU]} %{GPU_VENDOR}={GPU_VENDORS[NVIDIA]}'] # If the device_type is CPU then device buffer should always be CPU. if self.device_type == DEVICE_TYPES[CPU]: @@ -106,15 +121,15 @@ def set_num_tasks(self): @run_after('setup') def set_num_tasks_per_node(self): """ Setting number of tasks per node and cpus per task in this function. - This function sets num_cpus_per_task for 1 node and 2 node options where + This function sets num_cpus_per_task for 1 node and 2 node options where the request is for full nodes.""" if(SCALES.get(self.scale).get('num_nodes') == 1): - print("test: ", self.num_tasks_per_node) hooks.assign_tasks_per_compute_unit(self, COMPUTE_UNIT.get(NODE, 'node'), 2) else: - hooks.assign_tasks_per_compute_unit(self, COMPUTE_UNIT.get(NODE, 'node')) + hooks.assign_tasks_per_compute_unit(self, COMPUTE_UNIT.get(NODE, + 'node')) @run_after('setup') def set_num_gpus_per_node(self): From 591d1464accc07a687679adfb41e261623e3911b Mon Sep 17 00:00:00 2001 From: Satish Kamath Date: Wed, 3 Jan 2024 18:27:52 +0100 Subject: [PATCH 18/23] Added a small comment. --- eessi/testsuite/tests/apps/osu.py | 1 + 1 file changed, 1 insertion(+) diff --git a/eessi/testsuite/tests/apps/osu.py b/eessi/testsuite/tests/apps/osu.py index 557767a3..7a385b6e 100644 --- a/eessi/testsuite/tests/apps/osu.py +++ b/eessi/testsuite/tests/apps/osu.py @@ -49,6 +49,7 @@ class osu_pt_2_pt(osu_benchmark): # Device type for non-cuda OSU-Micro-Benchmarks should run on hosts of both # node types. To do this the default device type is set to GPU. device_type = parameter([DEVICE_TYPES[CPU], DEVICE_TYPES[GPU]]) + # unset num_tasks_per_node from the hpctestlib. num_tasks_per_node = None @run_after('init') From 14371a5836b30c11e2aaff560abd75089666c965 Mon Sep 17 00:00:00 2001 From: Satish Kamath Date: Thu, 4 Jan 2024 20:37:00 +0100 Subject: [PATCH 19/23] Fixed collective tests as well. osu.py ready to be tested in a complete manner (both point to point and collectives). 
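For the collective runs the per-node rank count now follows the scale definition instead of a hard-coded value. A simplified sketch of that arithmetic, detached from ReFrame (core counts and scale specs are illustrative; the real code in the diff also handles GPU devices and skips degenerate cases):

    def collective_tasks_per_node(max_cpus_per_node, scale_spec):
        """Ranks placed on each node for a collective run at a given scale."""
        if scale_spec.get('num_cpus_per_node'):
            return scale_spec['num_cpus_per_node']
        node_part = scale_spec.get('node_part', 1)
        return max(1, max_cpus_per_node // node_part)

    print(collective_tasks_per_node(128, {'node_part': 1}))          # 128
    print(collective_tasks_per_node(128, {'node_part': 4}))          # 32
    print(collective_tasks_per_node(128, {'num_cpus_per_node': 2}))  # 2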
--- eessi/testsuite/tests/apps/osu.py | 172 ++++++++++++++---------------- 1 file changed, 79 insertions(+), 93 deletions(-) diff --git a/eessi/testsuite/tests/apps/osu.py b/eessi/testsuite/tests/apps/osu.py index 7a385b6e..7edcaae6 100644 --- a/eessi/testsuite/tests/apps/osu.py +++ b/eessi/testsuite/tests/apps/osu.py @@ -110,7 +110,6 @@ def set_mem(self): """ Setting an extra job option of memory. This test has only 4 possibilities: 1_node, 2_nodes, 2_cores and 1_cpn_2_nodes. Only the last 2 require the memory to be set. """ - is_cuda_module = utils.is_cuda_required_module(self.module_name) if(SCALES.get(self.scale).get('node_part', 0) == 0): self.extra_resources = {'memory': {'size': '32GB'}} @@ -158,7 +157,7 @@ def set_num_gpus_per_node(self): max_avail_gpus_per_node = \ self.current_partition.devices[0].num_devices if(SCALES.get(self.scale).get('num_nodes') == 1): - # Skip the single node test if there is only 1 device in the + # Skip the single node test if there is only 1 device in the # node. if(max_avail_gpus_per_node == 1): self.skip(msg="There is only 1 device within the node. Skipping tests involving only 1 node.") @@ -191,7 +190,9 @@ class osu_coll(osu_benchmark): module_name = parameter(utils.find_modules('OSU-Micro-Benchmarks')) # Device type for non-cuda OSU-Micro-Benchmarks should run on hosts of both # node types. To do this the default device type is set to GPU. - device_type = DEVICE_TYPES['GPU'] + device_type = parameter([DEVICE_TYPES[CPU], DEVICE_TYPES[GPU]]) + # unset num_tasks_per_node from hpctestlib + num_tasks_per_node = None @run_after('init') @@ -202,13 +203,27 @@ def run_after_init(self): required_device_type=self.device_type) is_cuda_module = utils.is_cuda_required_module(self.module_name) # This part of the hook is meant to be for the OSU cpu tests. - if not is_cuda_module and self.device_type == DEVICE_TYPES['GPU']: - self.valid_systems = ['*'] - self.device_buffers = 'cpu' - elif is_cuda_module and self.device_type == DEVICE_TYPES['GPU']: +# if not is_cuda_module and self.device_type == DEVICE_TYPES['GPU']: +# self.valid_systems = ['*'] +# self.device_buffers = 'cpu' +# elif is_cuda_module and self.device_type == DEVICE_TYPES['GPU']: +# # Currently the device buffer is hard coded to be cuda. More +# # options need to be introduced based on vendor and device type. +# self.device_buffers = 'cuda' + if is_cuda_module and self.device_type == DEVICE_TYPES[GPU]: # Currently the device buffer is hard coded to be cuda. More # options need to be introduced based on vendor and device type. self.device_buffers = 'cuda' + elif is_cuda_module and self.device_type == DEVICE_TYPES[CPU]: + # This if condition had to be added since the CUDA compiled osu + # tests do not run on cpu partitions. The binaries need + # libcuda.so.1 during runtime which can only be found in a + # partition with CUDA drivers. + self.valid_systems = [f'+{FEATURES[CPU]} +{FEATURES[GPU]} %{GPU_VENDOR}={GPU_VENDORS[NVIDIA]}'] + + # If the device_type is CPU then device buffer should always be CPU. + if self.device_type == DEVICE_TYPES[CPU]: + self.device_buffers = 'cpu' # This part of the code removes the collective communication calls out # of the run list since this test is only meant for collective. 
if not self.benchmark_info[0].startswith('mpi.collective'): @@ -221,105 +236,76 @@ def set_tag_ci(self): if (self.benchmark_info[0] == 'mpi.collective.osu_allreduce' or self.benchmark_info[0] == 'mpi.collective.osu_alltoall'): self.tags.add('CI') + if (self.benchmark_info[0] == 'mpi.collective.osu_allreduce'): + self.tags.add('osu_allreduce') + + if (self.benchmark_info[0] == 'mpi.collective.osu_alltoall'): + self.tags.add('osu_alltoall') + @run_after('init') + def set_mem(self): + """ Setting an extra job option of memory.""" + if(SCALES.get(self.scale).get('node_part', 0) != 1): + self.extra_resources = {'memory': {'size': '64GB'}} + @run_after('init') def set_num_tasks(self): hooks.set_tag_scale(self) - @run_after('setup') - def run_after_setup(self): - """Hooks to run after the setup phase""" - # Calculate default requested resources based on the scale: - # 1 task per CPU for CPU-only tests, 1 task per GPU for GPU tests. - # Also support setting the resources on the cmd line. - # CPU settings for cpu based tests - # Setting num_tasks + def set_num_tasks_per_node(self): + """ Setting number of tasks per node, cpus per task and gpus per node + in this function. This function sets num_cpus_per_task for 1 node and 2 + node options where the request is for full nodes.""" max_avail_cpus_per_node = self.current_partition.processor.num_cpus - self.num_tasks = max_avail_cpus_per_node * SCALES.get(self.scale).get('num_nodes') - if (SCALES.get(self.scale).get('node_part') is not None): - self.num_tasks = int(self.num_tasks/SCALES.get(self.scale).get('node_part')) - elif (SCALES.get(self.scale).get('num_cpus_per_node') is not None): - self.num_tasks = SCALES.get(self.scale).get('num_cpus_per_node') - - # Setting num_tasks_per_node - if (SCALES.get(self.scale).get('num_nodes') == 1): - self.num_tasks_per_node = self.num_tasks - else: - self.num_tasks_per_node = max_avail_cpus_per_node - - # The above setting is for all CPU tests including the ones occurring - # in the GPU nodes. This section is specifically for GPU tests the - # num_tasks should be equal to num gpus per node. - if('gpu' in self.current_partition.features and - utils.is_cuda_required_module(self.module_name)): - max_avail_gpus_per_node = \ - self.current_partition.devices[0].num_devices - if(max_avail_gpus_per_node == 1 and - SCALES.get(self.scale).get('num_nodes') == 1): - self.skip(msg="There is only 1 device within the node. 
Skipping collective tests involving only 1 node.") - else: - if (SCALES.get(self.scale).get('num_nodes') == 1): - if (SCALES.get(self.scale).get('node_part') is not None): - self.num_tasks = int(max_avail_gpus_per_node / - SCALES.get(self.scale).get('node_part')) - self.skip_if(self.num_tasks <= 1, - msg="There are not enough GPU cards to be divided") - elif (SCALES.get(self.scale).get('num_cpus_per_node') is not None): - if(SCALES.get(self.scale).get('num_cpus_per_node') >= - max_avail_gpus_per_node): - self.num_tasks = self.num_tasks_per_node =\ - max_avail_gpus_per_node - else: - self.num_tasks = \ - SCALES.get(self.scale).get('num_cpus_per_node') - self.num_tasks_per_node = self.num_tasks - + if(self.device_buffers == 'cpu'): + # Setting num_tasks and num_tasks_per_node for the CPU tests + if(SCALES.get(self.scale).get('num_cpus_per_node', 0)): + hooks.assign_tasks_per_compute_unit(self, + COMPUTE_UNIT.get(NODE, + 'node'), + self.default_num_cpus_per_node) + elif(SCALES.get(self.scale).get('node_part', 0)): + pass_num_per = int(max_avail_cpus_per_node / + SCALES.get(self.scale).get('node_part', 0)) + hooks.assign_tasks_per_compute_unit(self, + COMPUTE_UNIT.get(NODE, + 'node'), + pass_num_per) + + if('gpu' in self.current_partition.features): + # Setting number of GPU for a cpu test on a GPU node. + if(SCALES.get(self.scale).get('num_nodes') == 1): + self.num_gpus_per_node = 1 else: - self.num_tasks = SCALES.get(self.scale).get('num_nodes') *\ - max_avail_gpus_per_node - self.num_tasks_per_node = max_avail_gpus_per_node - - @run_after('setup') - def set_num_gpus_per_node(self): - """ - This test does not require gpus and is for host to host within GPU - nodes. But some systems do require a GPU allocation for to perform any - activity in the GPU nodes. - """ - if('gpu' in self.current_partition.features and - not utils.is_cuda_required_module(self.module_name)): - if(SCALES.get(self.scale).get('num_nodes') == 1): - self.num_gpus_per_node = 1 - else: - # The devices section is sort of hard coded. This needs to be - # amended for a more heterogeneous system with more than one - # device type. - self.num_gpus_per_node = \ - self.current_partition.devices[0].num_devices - elif('gpu' in self.current_partition.features and - utils.is_cuda_required_module(self.module_name)): + # The devices section is sort of hard coded. This needs to be + # amended for a more heterogeneous system with more than one + # device type. + self.num_gpus_per_node = \ + self.current_partition.devices[0].num_devices + elif(self.device_buffers == 'cuda'): + # Setting num_tasks and num_tasks_per_node for the GPU tests max_avail_gpus_per_node = \ self.current_partition.devices[0].num_devices if(max_avail_gpus_per_node == 1 and SCALES.get(self.scale).get('num_nodes') == 1): self.skip(msg="There is only 1 device within the node. 
Skipping collective tests involving only 1 node.") else: - if (SCALES.get(self.scale).get('num_nodes') == 1): - if (SCALES.get(self.scale).get('node_part') is not None): - self.num_gpus_per_node = int(max_avail_gpus_per_node / - SCALES.get(self.scale).get('node_part')) - self.skip_if(self.num_gpus_per_node <= 1, - msg="There are not enough GPU cards to be divided") - elif (SCALES.get(self.scale).get('num_cpus_per_node') is not None): - if(SCALES.get(self.scale).get('num_cpus_per_node') >= - max_avail_gpus_per_node): - self.num_gpus_per_node =\ - max_avail_gpus_per_node - else: - self.num_gpus_per_node = \ - SCALES.get(self.scale).get('num_cpus_per_node') - + if(SCALES.get(self.scale).get('num_gpus_per_node', 0) * + SCALES.get(self.scale).get('num_nodes', 0) > 1): + hooks.assign_tasks_per_compute_unit(self, + COMPUTE_UNIT.get(GPU, + 'gpu')) + elif(SCALES.get(self.scale).get('node_part', 0)): + pass_num_per = int(max_avail_gpus_per_node / + SCALES.get(self.scale).get('node_part', 0)) + if(pass_num_per > 1): + hooks.assign_tasks_per_compute_unit(self, + COMPUTE_UNIT.get(GPU, + 'gpu')) + else: + self.skip(msg="Total GPUs (max_avail_gpus_per_node / node_part) is 1 less.") else: - self.num_gpus_per_node = max_avail_gpus_per_node + self.skip(msg="Total GPUs (num_nodes * num_gpus_per_node) = 1") + From 54364ebdef3ed118a566208addf56741f5753d2b Mon Sep 17 00:00:00 2001 From: Satish Kamath Date: Thu, 4 Jan 2024 20:53:22 +0100 Subject: [PATCH 20/23] Line 272 did not account for the case where the number of cores were too few and would result in 0 num_tasks_per_node. Now it is fixed. --- eessi/testsuite/tests/apps/osu.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/eessi/testsuite/tests/apps/osu.py b/eessi/testsuite/tests/apps/osu.py index 7edcaae6..b2cb687d 100644 --- a/eessi/testsuite/tests/apps/osu.py +++ b/eessi/testsuite/tests/apps/osu.py @@ -269,10 +269,13 @@ def set_num_tasks_per_node(self): elif(SCALES.get(self.scale).get('node_part', 0)): pass_num_per = int(max_avail_cpus_per_node / SCALES.get(self.scale).get('node_part', 0)) - hooks.assign_tasks_per_compute_unit(self, + if(pass_num_per > 1): + hooks.assign_tasks_per_compute_unit(self, COMPUTE_UNIT.get(NODE, 'node'), pass_num_per) + else: + self.skip(msg="Too few cores available for a collective operation.") if('gpu' in self.current_partition.features): # Setting number of GPU for a cpu test on a GPU node. From b0a5a45e35eac9250be738ede3d7d14b1b199c6a Mon Sep 17 00:00:00 2001 From: Satish Kamath Date: Wed, 24 Jan 2024 14:53:57 +0100 Subject: [PATCH 21/23] Have accounted for all the comments. I hope I have not missed much. --- eessi/testsuite/tests/apps/osu.py | 241 ++++++++++++------------------ 1 file changed, 99 insertions(+), 142 deletions(-) diff --git a/eessi/testsuite/tests/apps/osu.py b/eessi/testsuite/tests/apps/osu.py index b2cb687d..34e654f0 100644 --- a/eessi/testsuite/tests/apps/osu.py +++ b/eessi/testsuite/tests/apps/osu.py @@ -1,21 +1,20 @@ """ -This module tests the binary 'osu' in available modules containing substring -'OSU-Micro-Benchmarks'. The basic application class is taken from the -hpctestlib to which extra features are added. +This module tests the binary 'osu' in available modules containing substring 'OSU-Micro-Benchmarks'. The basic +application class is taken from the hpctestlib to which extra features are added. -Note: OSU-Micro-Benchmarks CUDA module binaries must be linked to stubs so that -it at the least finds libcuda.so.1 on non-GPU nodes. Otherwise those tests will -FAIL. 
+Note: OSU-Micro-Benchmarks CUDA module binaries must be linked to stubs so that it at the least finds libcuda.so.1 on +non-GPU nodes. Otherwise those tests will FAIL. """ import reframe as rfm from hpctestlib.microbenchmarks.mpi.osu import osu_benchmark +from reframe.utility import reframe from eessi.testsuite import hooks, utils from eessi.testsuite.constants import * from eessi.testsuite.utils import find_modules, log -def my_filtering_function(): +def filter_scales_pt2pt(): """ Filtering function for filtering scales for the pt2pt OSU test """ @@ -27,7 +26,7 @@ def my_filtering_function(): ] -def my_filtering_function_coll(): +def filter_scales_coll(): """ Filtering function for filtering scales for collective the OSU test """ @@ -41,61 +40,53 @@ def my_filtering_function_coll(): @rfm.simple_test class osu_pt_2_pt(osu_benchmark): ''' Run-only OSU test ''' - scale = parameter(my_filtering_function()) + scale = parameter(filter_scales_pt2pt()) valid_prog_environs = ['default'] valid_systems = [] time_limit = '30m' module_name = parameter(find_modules('OSU-Micro-Benchmarks')) - # Device type for non-cuda OSU-Micro-Benchmarks should run on hosts of both - # node types. To do this the default device type is set to GPU. + # Device type for non-cuda OSU-Micro-Benchmarks should run on hosts of both node types. To do this the default + # device type is set to GPU. device_type = parameter([DEVICE_TYPES[CPU], DEVICE_TYPES[GPU]]) # unset num_tasks_per_node from the hpctestlib. num_tasks_per_node = None + # Note: device_buffers variable is inherited from the hpctestlib class and adds options to the launcher + # commands based on what device is set. + device_buffers = 'cpu' @run_after('init') def run_after_init(self): """hooks to run after init phase""" - hooks.filter_valid_systems_by_device_type( - self, - required_device_type=self.device_type) + hooks.filter_valid_systems_by_device_type(self, required_device_type=self.device_type) is_cuda_module = utils.is_cuda_required_module(self.module_name) - # This part of the hook is meant to be for the OSU cpu tests. This is - # required since the non CUDA module should be able to run in the GPU - # partition as well. This is specific for this test and not covered by - # the function above. - # if not is_cuda_module and self.device_type == DEVICE_TYPES[GPU]: - # self.valid_systems = [f'+{FEATURES[GPU]} %{GPU_VENDOR}={GPU_VENDORS[NVIDIA]}'] - # self.device_buffers = 'cpu' - # elif is_cuda_module and self.device_type == DEVICE_TYPES[GPU]: - # # Currently the device buffer is hard coded to be cuda. More - # # options need to be introduced based on vendor and device type. - # self.device_buffers = 'cuda' + # This part of the hook is meant to be for the OSU cpu tests. This is required since the non CUDA module should + # be able to run in the GPU partition as well. This is specific for this test and not covered by the function + # above. if is_cuda_module and self.device_type == DEVICE_TYPES[GPU]: - # Currently the device buffer is hard coded to be cuda. More - # options need to be introduced based on vendor and device type. + # Sets to cuda as device buffer only if the module is compiled with CUDA. self.device_buffers = 'cuda' - elif is_cuda_module and self.device_type == DEVICE_TYPES[CPU]: - # This if condition had to be added since the CUDA compiled osu - # tests do not run on cpu partitions. The binaries need - # libcuda.so.1 during runtime which can only be found in a - # partition with CUDA drivers. 
- self.valid_systems = [f'+{FEATURES[CPU]} +{FEATURES[GPU]} %{GPU_VENDOR}={GPU_VENDORS[NVIDIA]}'] # If the device_type is CPU then device buffer should always be CPU. if self.device_type == DEVICE_TYPES[CPU]: self.device_buffers = 'cpu' - # This part of the code removes the collective communication calls out - # of the run list since this test is only meant for pt2pt. + # This part of the code removes the collective communication calls out of the run list since this test is only + # meant for pt2pt. if not self.benchmark_info[0].startswith('mpi.pt2pt'): self.valid_systems = [] hooks.set_modules(self) + @run_after('setup') + def adjust_executable_opts(self): + """The option "D D" is only meant for Devices if and not for CPU tests. This option is added by hpctestlib to + all pt2pt tests which is not required.""" + if(self.device_type == DEVICE_TYPES[CPU]): + self.executable_opts = [ele for ele in self.executable_opts if ele != 'D'] + @run_after('init') def set_tag_ci(self): """ Setting tests under CI tag. """ - if (self.benchmark_info[0] in ['mpi.pt2pt.osu_latency', - 'mpi.pt2pt.osu_bw']): + if (self.benchmark_info[0] in ['mpi.pt2pt.osu_latency', 'mpi.pt2pt.osu_bw']): self.tags.add('CI') log(f'tags set to {self.tags}') @@ -107,125 +98,89 @@ def set_tag_ci(self): @run_after('init') def set_mem(self): - """ Setting an extra job option of memory. This test has only 4 - possibilities: 1_node, 2_nodes, 2_cores and 1_cpn_2_nodes. Only the - last 2 require the memory to be set. """ - if(SCALES.get(self.scale).get('node_part', 0) == 0): - self.extra_resources = {'memory': {'size': '32GB'}} + """ Setting an extra job option of memory. This test has only 4 possibilities: 1_node, 2_nodes, 2_cores and + 1_cpn_2_nodes. This is implemented for all cases including full node cases. The requested memory may seem large + and the test requires at least 4.5 GB per core at the minimum for the full test when run with validation (-c + option for osu_bw or osu_latency). We run till message size 8 (-m 8) which significantly reduces memory + requirement.""" + self.extra_resources = {'memory': {'size': '16GB'}} @run_after('init') def set_num_tasks(self): """ Setting scales as tags. """ hooks.set_tag_scale(self) + @run_after('setup') + def set_environment(self): + """ Setting environment variable for CUDA module tests that run on pure cpu nodes.""" + is_cuda_module = utils.is_cuda_required_module(self.module_name) + if (is_cuda_module and self.device_type == DEVICE_TYPES[CPU] and + (not FEATURES[GPU] in self.current_partition.features)): + self.env_vars = {'LD_LIBRARY_PATH': '$EBROOTCUDA/stubs/lib64:$LD_LIBRARY_PATH'} + @run_after('setup') def set_num_tasks_per_node(self): - """ Setting number of tasks per node and cpus per task in this function. - This function sets num_cpus_per_task for 1 node and 2 node options where - the request is for full nodes.""" + """ Setting number of tasks per node and cpus per task in this function. 
This function sets num_cpus_per_task + for 1 node and 2 node options where the request is for full nodes.""" if(SCALES.get(self.scale).get('num_nodes') == 1): - hooks.assign_tasks_per_compute_unit(self, - COMPUTE_UNIT.get(NODE, - 'node'), 2) + hooks.assign_tasks_per_compute_unit(self, COMPUTE_UNIT.get(NODE, 'node'), 2) else: - hooks.assign_tasks_per_compute_unit(self, COMPUTE_UNIT.get(NODE, - 'node')) + hooks.assign_tasks_per_compute_unit(self, COMPUTE_UNIT.get(NODE, 'node')) @run_after('setup') def set_num_gpus_per_node(self): """ - This test does not require gpus and is for host to host within GPU - nodes. But some systems do require a GPU allocation for to perform any - activity in the GPU nodes. + This test does not require gpus and is for host to host within GPU nodes. But some systems do require a GPU + allocation for to perform any activity in the GPU nodes. """ - if('gpu' in self.current_partition.features and - not utils.is_cuda_required_module(self.module_name)): + if(FEATURES[GPU] in self.current_partition.features and not utils.is_cuda_required_module(self.module_name)): + max_avail_gpus_per_node = utils.get_max_avail_gpus_per_node(self) + # Here for the 2_node test we assign max_avail_gpus_per_node but some systems cannot allocate 1_cpn_2_nodes + # for GPUs but need all gpus allocated within the 2 nodes for this work which. The test may fail under such + # conditions for the scale 1_cpn_2_nodes because it is simply not allowed. + self.num_gpus_per_node = self.default_num_gpus_per_node or max_avail_gpus_per_node + elif(FEATURES[GPU] in self.current_partition.features and utils.is_cuda_required_module(self.module_name)): + max_avail_gpus_per_node = utils.get_max_avail_gpus_per_node(self) if(SCALES.get(self.scale).get('num_nodes') == 1): - self.num_gpus_per_node = 1 - else: - # The devices section is sort of hard coded. This needs to be - # amended for a more heterogeneous system with more than one - # device type. - - # Even for 1_cpn_2_nodes, the gpus requested are for the full - # nodes. On Snellius 1 GPU card cannot be reserved on 2 - # different nodes which can be different on different systems. - self.num_gpus_per_node = \ - self.current_partition.devices[0].num_devices - elif('gpu' in self.current_partition.features and - utils.is_cuda_required_module(self.module_name)): - max_avail_gpus_per_node = \ - self.current_partition.devices[0].num_devices - if(SCALES.get(self.scale).get('num_nodes') == 1): - # Skip the single node test if there is only 1 device in the - # node. + # Skip the single node test if there is only 1 device in the node. if(max_avail_gpus_per_node == 1): self.skip(msg="There is only 1 device within the node. Skipping tests involving only 1 node.") else: self.num_gpus_per_node = 2 else: - # The devices section is sort of hard coded. This needs to be - # amended for a more heterogeneous system with more than one - # device type. - - # Note these settings are for 1_cpn_2_nodes. In that case we - # want to test for only 1 GPU per node since we have not - # requested for full nodes. - if(SCALES.get(self.scale).get('num_gpus_per_node', 0)): - self.num_gpus_per_node = \ - SCALES.get(self.scale).get('num_gpus_per_node', 0) - else: - self.num_gpus_per_node = \ - self.current_partition.devices[0].num_devices + # Note these settings are for 1_cpn_2_nodes. In that case we want to test for only 1 GPU per node since + # we have not requested for full nodes. 
+ self.num_gpus_per_node = self.default_num_gpus_per_node or max_avail_gpus_per_node @rfm.simple_test class osu_coll(osu_benchmark): ''' Run-only OSU test ''' - scale = parameter(my_filtering_function_coll()) - #scale = parameter(SCALES.keys()) + scale = parameter(filter_scales_coll()) valid_prog_environs = ['default'] valid_systems = [] time_limit = '30m' module_name = parameter(utils.find_modules('OSU-Micro-Benchmarks')) - # Device type for non-cuda OSU-Micro-Benchmarks should run on hosts of both - # node types. To do this the default device type is set to GPU. + # Device type for non-cuda OSU-Micro-Benchmarks should run on hosts of both node types. To do this the default + # device type is set to GPU. device_type = parameter([DEVICE_TYPES[CPU], DEVICE_TYPES[GPU]]) - # unset num_tasks_per_node from hpctestlib + # Unset num_tasks_per_node from hpctestlib num_tasks_per_node = None @run_after('init') def run_after_init(self): """hooks to run after init phase""" - hooks.filter_valid_systems_by_device_type( - self, - required_device_type=self.device_type) + hooks.filter_valid_systems_by_device_type( self, required_device_type=self.device_type) is_cuda_module = utils.is_cuda_required_module(self.module_name) - # This part of the hook is meant to be for the OSU cpu tests. -# if not is_cuda_module and self.device_type == DEVICE_TYPES['GPU']: -# self.valid_systems = ['*'] -# self.device_buffers = 'cpu' -# elif is_cuda_module and self.device_type == DEVICE_TYPES['GPU']: -# # Currently the device buffer is hard coded to be cuda. More -# # options need to be introduced based on vendor and device type. -# self.device_buffers = 'cuda' if is_cuda_module and self.device_type == DEVICE_TYPES[GPU]: - # Currently the device buffer is hard coded to be cuda. More - # options need to be introduced based on vendor and device type. self.device_buffers = 'cuda' - elif is_cuda_module and self.device_type == DEVICE_TYPES[CPU]: - # This if condition had to be added since the CUDA compiled osu - # tests do not run on cpu partitions. The binaries need - # libcuda.so.1 during runtime which can only be found in a - # partition with CUDA drivers. - self.valid_systems = [f'+{FEATURES[CPU]} +{FEATURES[GPU]} %{GPU_VENDOR}={GPU_VENDORS[NVIDIA]}'] # If the device_type is CPU then device buffer should always be CPU. if self.device_type == DEVICE_TYPES[CPU]: self.device_buffers = 'cpu' - # This part of the code removes the collective communication calls out - # of the run list since this test is only meant for collective. + # This part of the code removes the collective communication calls out of the run list since this test is only + # meant for collective. if not self.benchmark_info[0].startswith('mpi.collective'): self.valid_systems = [] hooks.set_modules(self) @@ -253,62 +208,64 @@ def set_mem(self): def set_num_tasks(self): hooks.set_tag_scale(self) + @run_after('setup') + def set_environment(self): + """ Setting environment variable for CUDA module tests that run on pure cpu nodes.""" + is_cuda_module = utils.is_cuda_required_module(self.module_name) + if (is_cuda_module and self.device_type == DEVICE_TYPES[CPU] and + (not FEATURES[GPU] in self.current_partition.features)): + self.env_vars = {'LD_LIBRARY_PATH': '$EBROOTCUDA/stubs/lib64:$LD_LIBRARY_PATH'} + @run_after('setup') def set_num_tasks_per_node(self): - """ Setting number of tasks per node, cpus per task and gpus per node - in this function. 
This function sets num_cpus_per_task for 1 node and 2 - node options where the request is for full nodes.""" + """ Setting number of tasks per node, cpus per task and gpus per node in this function. This function sets + num_cpus_per_task for 1 node and 2 node options where the request is for full nodes.""" max_avail_cpus_per_node = self.current_partition.processor.num_cpus if(self.device_buffers == 'cpu'): # Setting num_tasks and num_tasks_per_node for the CPU tests if(SCALES.get(self.scale).get('num_cpus_per_node', 0)): - hooks.assign_tasks_per_compute_unit(self, - COMPUTE_UNIT.get(NODE, - 'node'), + hooks.assign_tasks_per_compute_unit(self, COMPUTE_UNIT.get(NODE, 'node'), self.default_num_cpus_per_node) elif(SCALES.get(self.scale).get('node_part', 0)): - pass_num_per = int(max_avail_cpus_per_node / - SCALES.get(self.scale).get('node_part', 0)) + pass_num_per = int(max_avail_cpus_per_node / SCALES.get(self.scale).get('node_part', 0)) if(pass_num_per > 1): - hooks.assign_tasks_per_compute_unit(self, - COMPUTE_UNIT.get(NODE, - 'node'), - pass_num_per) + hooks.assign_tasks_per_compute_unit(self, COMPUTE_UNIT.get(NODE, 'node'), pass_num_per) else: self.skip(msg="Too few cores available for a collective operation.") - if('gpu' in self.current_partition.features): + if(FEATURES[GPU] in self.current_partition.features): + max_avail_gpus_per_node = utils.get_max_avail_gpus_per_node(self) # Setting number of GPU for a cpu test on a GPU node. if(SCALES.get(self.scale).get('num_nodes') == 1): self.num_gpus_per_node = 1 else: - # The devices section is sort of hard coded. This needs to be - # amended for a more heterogeneous system with more than one - # device type. - self.num_gpus_per_node = \ - self.current_partition.devices[0].num_devices + self.num_gpus_per_node = max_avail_gpus_per_node elif(self.device_buffers == 'cuda'): + max_avail_gpus_per_node = utils.get_max_avail_gpus_per_node(self) # Setting num_tasks and num_tasks_per_node for the GPU tests - max_avail_gpus_per_node = \ - self.current_partition.devices[0].num_devices if(max_avail_gpus_per_node == 1 and SCALES.get(self.scale).get('num_nodes') == 1): self.skip(msg="There is only 1 device within the node. Skipping collective tests involving only 1 node.") else: - if(SCALES.get(self.scale).get('num_gpus_per_node', 0) * - SCALES.get(self.scale).get('num_nodes', 0) > 1): - hooks.assign_tasks_per_compute_unit(self, - COMPUTE_UNIT.get(GPU, - 'gpu')) + if(SCALES.get(self.scale).get('num_gpus_per_node', 0) * SCALES.get(self.scale).get('num_nodes', 0) > 1): + hooks.assign_tasks_per_compute_unit(self, COMPUTE_UNIT.get(GPU, FEATURES[GPU])) elif(SCALES.get(self.scale).get('node_part', 0)): - pass_num_per = int(max_avail_gpus_per_node / - SCALES.get(self.scale).get('node_part', 0)) + pass_num_per = int(max_avail_gpus_per_node / SCALES.get(self.scale).get('node_part', 0)) if(pass_num_per > 1): - hooks.assign_tasks_per_compute_unit(self, - COMPUTE_UNIT.get(GPU, - 'gpu')) + hooks.assign_tasks_per_compute_unit(self, COMPUTE_UNIT.get(GPU, FEATURES[GPU])) else: self.skip(msg="Total GPUs (max_avail_gpus_per_node / node_part) is 1 less.") else: self.skip(msg="Total GPUs (num_nodes * num_gpus_per_node) = 1") +# Note: This is code to setup launcher options if needed later to pass LD_LIBRARY_PATH to mpirun for the stubs solution. +# Currently this is experimental therefore commented out and moved here. +# @run_after('setup') +# def launcher_options(self): +# """ Setting launcher options for CUDA module tests that run on pure cpu nodes. 
Note this way of setting +# environment variable only works for OpenMPI.""" +# is_cuda_module = utils.is_cuda_required_module(self.module_name) +# if (is_cuda_module and self.device_type == DEVICE_TYPES[CPU] and +# isinstance(self.job.launcher, rfm.core.backends.getlauncher('mpirun')().__class__) and +# (not FEATURES[GPU] in self.current_partition.features)): +# self.job.launcher.options = ["-x LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$EBROOTCUDA/stubs/lib64/libcuda.so.1"] From b0363ffd9e5121ece7f7c2993534bfd766d6782b Mon Sep 17 00:00:00 2001 From: Satish Kamath Date: Thu, 25 Jan 2024 16:08:47 +0100 Subject: [PATCH 22/23] Adding (and removing) a few comments more for explanation of functions. No change in code base. --- eessi/testsuite/tests/apps/osu.py | 20 +++++--------------- 1 file changed, 5 insertions(+), 15 deletions(-) diff --git a/eessi/testsuite/tests/apps/osu.py b/eessi/testsuite/tests/apps/osu.py index 34e654f0..92c5f0a8 100644 --- a/eessi/testsuite/tests/apps/osu.py +++ b/eessi/testsuite/tests/apps/osu.py @@ -200,9 +200,11 @@ def set_tag_ci(self): @run_after('init') def set_mem(self): - """ Setting an extra job option of memory.""" - if(SCALES.get(self.scale).get('node_part', 0) != 1): - self.extra_resources = {'memory': {'size': '64GB'}} + """ Setting an extra job option of memory. The alltoall operation takes maximum memory of 0.1 GB per core for a + message size of 8 and almost 0.5 GB per core for the maximum message size the test allows. But we limit the + message sizes to 8 and for a safety net we take 64 GB assuming dense nodes works for all the tests and node + types.""" + self.extra_resources = {'memory': {'size': '64GB'}} @run_after('init') def set_num_tasks(self): @@ -257,15 +259,3 @@ def set_num_tasks_per_node(self): self.skip(msg="Total GPUs (max_avail_gpus_per_node / node_part) is 1 less.") else: self.skip(msg="Total GPUs (num_nodes * num_gpus_per_node) = 1") - -# Note: This is code to setup launcher options if needed later to pass LD_LIBRARY_PATH to mpirun for the stubs solution. -# Currently this is experimental therefore commented out and moved here. -# @run_after('setup') -# def launcher_options(self): -# """ Setting launcher options for CUDA module tests that run on pure cpu nodes. Note this way of setting -# environment variable only works for OpenMPI.""" -# is_cuda_module = utils.is_cuda_required_module(self.module_name) -# if (is_cuda_module and self.device_type == DEVICE_TYPES[CPU] and -# isinstance(self.job.launcher, rfm.core.backends.getlauncher('mpirun')().__class__) and -# (not FEATURES[GPU] in self.current_partition.features)): -# self.job.launcher.options = ["-x LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$EBROOTCUDA/stubs/lib64/libcuda.so.1"] From fc1e11893aa6bce6aed8a310337b63a7ef6296dc Mon Sep 17 00:00:00 2001 From: Satish Kamath Date: Mon, 29 Jan 2024 14:09:24 +0100 Subject: [PATCH 23/23] Used the constants for `node` strings and reduced the mem size for pt2pt to 12 GB. --- eessi/testsuite/tests/apps/osu.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/eessi/testsuite/tests/apps/osu.py b/eessi/testsuite/tests/apps/osu.py index 92c5f0a8..4f6af605 100644 --- a/eessi/testsuite/tests/apps/osu.py +++ b/eessi/testsuite/tests/apps/osu.py @@ -103,7 +103,7 @@ def set_mem(self): and the test requires at least 4.5 GB per core at the minimum for the full test when run with validation (-c option for osu_bw or osu_latency). 
We run till message size 8 (-m 8) which significantly reduces memory requirement.""" - self.extra_resources = {'memory': {'size': '16GB'}} + self.extra_resources = {'memory': {'size': '12GB'}} @run_after('init') def set_num_tasks(self): @@ -123,9 +123,9 @@ def set_num_tasks_per_node(self): """ Setting number of tasks per node and cpus per task in this function. This function sets num_cpus_per_task for 1 node and 2 node options where the request is for full nodes.""" if(SCALES.get(self.scale).get('num_nodes') == 1): - hooks.assign_tasks_per_compute_unit(self, COMPUTE_UNIT.get(NODE, 'node'), 2) + hooks.assign_tasks_per_compute_unit(self, COMPUTE_UNIT[NODE], 2) else: - hooks.assign_tasks_per_compute_unit(self, COMPUTE_UNIT.get(NODE, 'node')) + hooks.assign_tasks_per_compute_unit(self, COMPUTE_UNIT[NODE]) @run_after('setup') def set_num_gpus_per_node(self): @@ -226,12 +226,12 @@ def set_num_tasks_per_node(self): if(self.device_buffers == 'cpu'): # Setting num_tasks and num_tasks_per_node for the CPU tests if(SCALES.get(self.scale).get('num_cpus_per_node', 0)): - hooks.assign_tasks_per_compute_unit(self, COMPUTE_UNIT.get(NODE, 'node'), + hooks.assign_tasks_per_compute_unit(self, COMPUTE_UNIT[NODE], self.default_num_cpus_per_node) elif(SCALES.get(self.scale).get('node_part', 0)): pass_num_per = int(max_avail_cpus_per_node / SCALES.get(self.scale).get('node_part', 0)) if(pass_num_per > 1): - hooks.assign_tasks_per_compute_unit(self, COMPUTE_UNIT.get(NODE, 'node'), pass_num_per) + hooks.assign_tasks_per_compute_unit(self, COMPUTE_UNIT[NODE], pass_num_per) else: self.skip(msg="Too few cores available for a collective operation.")
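The collective test derives its task count from the node_part entry of the selected scale via
int(max_avail_cpus_per_node / node_part), and patch 20 guards this division so that partitions with
very few cores do not end up with 0 or 1 tasks. A small worked example, using made-up core counts
rather than any real partition, makes the guard concrete:

    # Sketch of the node_part arithmetic used above; the core counts are hypothetical.
    def tasks_for_scale(max_avail_cpus_per_node, node_part):
        pass_num_per = int(max_avail_cpus_per_node / node_part)
        return pass_num_per if pass_num_per > 1 else None  # None -> the test should be skipped

    print(tasks_for_scale(128, 4))  # 32: a quarter-node scale on a 128-core node
    print(tasks_for_scale(2, 4))    # None: too few cores for a collective operation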
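The memory requests in set_mem follow from the per-core figures quoted in the docstrings: roughly
0.5 GB per core for osu_alltoall at the largest message size, and much less once the message size is
capped at 8. A quick back-of-the-envelope check, assuming a hypothetical 128-core node (the node size
is an assumption, not a measured value):

    cores_per_node = 128                  # assumed dense node
    gb_per_core_max_message = 0.5         # worst case quoted for osu_alltoall at the largest message size
    print(cores_per_node * gb_per_core_max_message)  # 64.0 -> matches the 64GB safety-net request

    # The pt2pt tests run at most 2 tasks per node and are limited to message size 8 (-m 8),
    # so the 12GB request leaves comfortable headroom per task.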
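The pt2pt scale filter only has to keep scales that can be mapped onto two MPI tasks; per the
osu_pt_2_pt docstring these are 1_node, 2_nodes, 2_cores and 1_cpn_2_nodes. The following is a
minimal, runnable sketch of what such a filter could look like; the SCALES entries and the selection
criterion are illustrative assumptions, not the constants from eessi.testsuite.constants.

    # Illustrative SCALES entries; the real dictionary lives in the test suite's constants module.
    SCALES = {
        '2_cores':       {'num_nodes': 1, 'num_cpus_per_node': 2},
        '1_cpn_2_nodes': {'num_nodes': 2, 'num_cpus_per_node': 1},
        '1_node':        {'num_nodes': 1, 'node_part': 1},
        '2_nodes':       {'num_nodes': 2, 'node_part': 1},
        '4_nodes':       {'num_nodes': 4, 'node_part': 1},
    }

    def filter_scales_pt2pt():
        """Keep only scales that make sense for a two-task point-to-point benchmark."""
        return [
            name for name, spec in SCALES.items()
            if spec['num_nodes'] <= 2 and spec.get('num_cpus_per_node', 2) <= 2
        ]

    print(filter_scales_pt2pt())  # ['2_cores', '1_cpn_2_nodes', '1_node', '2_nodes']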