Merge pull request #116 from smoors/request_gpus
support always requesting GPUs on partitions that require it
casparvl authored Feb 29, 2024
2 parents c8e917c + 82891ba commit ba35eb2
Showing 4 changed files with 109 additions and 81 deletions.
2 changes: 2 additions & 0 deletions eessi/testsuite/constants.py
@@ -11,6 +11,7 @@
INTEL = 'INTEL'
NODE = 'NODE'
NVIDIA = 'NVIDIA'
ALWAYS_REQUEST_GPUS = 'ALWAYS_REQUEST_GPUS'

DEVICE_TYPES = {
CPU: 'cpu',
@@ -31,6 +32,7 @@
FEATURES = {
CPU: 'cpu',
GPU: 'gpu',
ALWAYS_REQUEST_GPUS: 'always_request_gpus',
}

GPU_VENDORS = {
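
The new FEATURES entry is matched against the feature list of a partition in the ReFrame site configuration. A minimal sketch of such a partition follows; the partition name, access flag, and device count are illustrative assumptions, not part of this PR:

    # Hypothetical ReFrame site-configuration snippet: a GPU partition that
    # opts in to the new policy by declaring the 'always_request_gpus' feature.
    'partitions': [
        {
            'name': 'gpu',                 # illustrative partition name
            'scheduler': 'slurm',
            'launcher': 'mpirun',
            'access': ['-p gpu'],          # assumed scheduler access flag
            'environs': ['default'],
            'features': ['gpu', 'always_request_gpus'],  # FEATURES[GPU] + FEATURES[ALWAYS_REQUEST_GPUS]
            'devices': [{'type': 'gpu', 'num_devices': 4}],
        },
    ],
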
78 changes: 46 additions & 32 deletions eessi/testsuite/hooks.py
@@ -7,12 +7,12 @@

import reframe as rfm

from eessi.testsuite.constants import * # noqa
from eessi.testsuite.constants import *
from eessi.testsuite.utils import (get_max_avail_gpus_per_node, is_cuda_required_module, log,
check_proc_attribute_defined)


def assign_default_num_cpus_per_node(test: rfm.RegressionTest):
def _assign_default_num_cpus_per_node(test: rfm.RegressionTest):
"""
Check if the default number of cpus per node is already defined in the test
(e.g. by earlier hooks like set_tag_scale).
@@ -34,6 +34,27 @@ def assign_default_num_cpus_per_node(test: rfm.RegressionTest):
log(f'default_num_cpus_per_node set to {test.default_num_cpus_per_node}')


def _assign_default_num_gpus_per_node(test: rfm.RegressionTest):
"""
Check if the default number of gpus per node is already defined in the test
(e.g. by earlier hooks like set_tag_scale).
If so, check if it doesn't exceed the maximum available.
If not, set default_num_gpus_per_node based on the maximum available gpus and node_part
"""

test.max_avail_gpus_per_node = get_max_avail_gpus_per_node(test)
if test.default_num_gpus_per_node:
# may skip if not enough GPUs
test.skip_if(
test.default_num_gpus_per_node > test.max_avail_gpus_per_node,
f'Number of GPUs per node in selected scale ({test.default_num_gpus_per_node}) is higher than max available'
f' ({test.max_avail_gpus_per_node}) in current partition ({test.current_partition.name}).'
)
else:
# no default set yet, so setting one
test.default_num_gpus_per_node = math.ceil(test.max_avail_gpus_per_node / test.node_part)
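# For example (illustrative numbers): with max_avail_gpus_per_node = 4 and
# node_part = 2 (half a node), this yields default_num_gpus_per_node = math.ceil(4 / 2) = 2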


def assign_tasks_per_compute_unit(test: rfm.RegressionTest, compute_unit: str, num_per: int = 1):
"""
Assign one task per compute unit (COMPUTE_UNIT[CPU], COMPUTE_UNIT[CPU_SOCKET] or COMPUTE_UNIT[GPU]).
@@ -69,15 +90,18 @@ def assign_tasks_per_compute_unit(test: rfm.RegressionTest, compute_unit: str, num_per: int = 1):

# Check if either node_part, or default_num_cpus_per_node and default_num_gpus_per_node are set correctly
if not (
type(test.node_part) == int or
(type(test.default_num_cpus_per_node) == int and type(test.default_num_gpus_per_node) == int)
type(test.node_part) == int
or (type(test.default_num_cpus_per_node) == int and type(test.default_num_gpus_per_node) == int)
):
raise ValueError(
f'Either node_part ({test.node_part}), or default_num_cpus_per_node ({test.default_num_cpus_per_node}) and'
f' default num_gpus_per_node ({test.default_num_gpus_per_node}) must be defined and have integer values.'
)

assign_default_num_cpus_per_node(test)
_assign_default_num_cpus_per_node(test)

if FEATURES[GPU] in test.current_partition.features:
_assign_default_num_gpus_per_node(test)

if compute_unit == COMPUTE_UNIT[GPU]:
_assign_one_task_per_gpu(test)
@@ -90,6 +114,8 @@ def assign_tasks_per_compute_unit(test: rfm.RegressionTest, compute_unit: str, num_per: int = 1):
else:
raise ValueError(f'compute unit {compute_unit} is currently not supported')

_check_always_request_gpus(test)


def _assign_num_tasks_per_node(test: rfm.RegressionTest, num_per: int = 1):
"""
@@ -112,7 +138,6 @@ def _assign_num_tasks_per_node(test: rfm.RegressionTest, num_per: int = 1):
test.num_tasks_per_node = num_per
test.num_cpus_per_task = int(test.default_num_cpus_per_node / test.num_tasks_per_node)


# num_tasks_per_node is not set, but num_cpus_per_task is
elif not test.num_tasks_per_node:
test.num_tasks_per_node = int(test.default_num_cpus_per_node / test.num_cpus_per_task)
@@ -222,11 +247,6 @@ def _assign_one_task_per_gpu(test: rfm.RegressionTest):
--setvar num_cpus_per_task=<y> and/or
--setvar num_gpus_per_node=<z>.
Variables:
- max_avail_gpus_per_node: maximum available number of GPUs per node
- default_num_gpus_per_node: default number of GPUs per node as defined in the test
(e.g. by earlier hooks like set_tag_scale)
Default resources requested:
- num_gpus_per_node = default_num_gpus_per_node
- num_tasks_per_node = num_gpus_per_node
@@ -235,22 +255,6 @@ def _assign_one_task_per_gpu(test: rfm.RegressionTest):
If num_tasks_per_node is set, set num_gpus_per_node equal to either num_tasks_per_node or default_num_gpus_per_node
(whichever is smallest), unless num_gpus_per_node is also set.
"""
max_avail_gpus_per_node = get_max_avail_gpus_per_node(test)

# Check if the default number of gpus per node is already defined in the test
# (e.g. by earlier hooks like set_tag_scale).
# If so, check if it doesn't exceed the maximum available.
# If not, set default_num_gpus_per_node based on the maximum available gpus and node_part
if test.default_num_gpus_per_node:
# may skip if not enough GPUs
test.skip_if(
test.default_num_gpus_per_node > max_avail_gpus_per_node,
f'Requested GPUs per node ({test.default_num_gpus_per_node}) is higher than max available'
f' ({max_avail_gpus_per_node}) in current partition ({test.current_partition.name}).'
)
else:
# no default set yet, so setting one
test.default_num_gpus_per_node = math.ceil(max_avail_gpus_per_node / test.node_part)

# neither num_tasks_per_node nor num_gpus_per_node are set
if not test.num_tasks_per_node and not test.num_gpus_per_node:
@@ -273,7 +277,7 @@ def _assign_one_task_per_gpu(test: rfm.RegressionTest):
# limit num_cpus_per_task to the maximum available cpus per gpu
test.num_cpus_per_task = min(
int(test.default_num_cpus_per_node / test.num_tasks_per_node),
int(test.max_avail_cpus_per_node / max_avail_gpus_per_node)
int(test.max_avail_cpus_per_node / test.max_avail_gpus_per_node)
)

test.num_tasks = test.num_nodes * test.num_tasks_per_node
@@ -303,8 +307,8 @@ def _set_or_append_valid_systems(test: rfm.RegressionTest, valid_systems: str):
return

# test.valid_systems wasn't set yet, so set it
if len(test.valid_systems) == 0:
# test.valid_systems is empty, meaning all tests are filtered out. This hook shouldn't change that
if len(test.valid_systems) == 0 or test.valid_systems == [INVALID_SYSTEM]:
# test.valid_systems is empty or invalid, meaning all tests are filtered out. This hook shouldn't change that
return
# test.valid_systems still at default value, so overwrite
elif len(test.valid_systems) == 1 and test.valid_systems[0] == '*':
@@ -314,8 +318,8 @@ def _set_or_append_valid_systems(test: rfm.RegressionTest, valid_systems: str):
test.valid_systems[0] = f'{test.valid_systems[0]} {valid_systems}'
else:
warn_msg = f"valid_systems has multiple ({len(test.valid_systems)}) items,"
warn_msg += f" which is not supported by this hook."
warn_msg += f" Make sure to handle filtering yourself."
warn_msg += " which is not supported by this hook."
warn_msg += " Make sure to handle filtering yourself."
warnings.warn(warn_msg)
return

@@ -333,6 +337,7 @@ def filter_supported_scales(test: rfm.RegressionTest):

log(f'valid_systems set to {test.valid_systems}')


def filter_valid_systems_by_device_type(test: rfm.RegressionTest, required_device_type: str):
"""
Filter valid_systems by required device type and by whether the module supports CUDA,
@@ -459,3 +464,12 @@ def set_compact_thread_binding(test: rfm.RegressionTest):
log(f'Set environment variable OMP_PLACES to {test.env_vars["OMP_PLACES"]}')
log(f'Set environment variable OMP_PROC_BIND to {test.env_vars["OMP_PROC_BIND"]}')
log(f'Set environment variable KMP_AFFINITY to {test.env_vars["KMP_AFFINITY"]}')


def _check_always_request_gpus(test: rfm.RegressionTest):
"""
Make sure we always request enough GPUs if required for the current GPU partition (cluster-specific policy)
"""
if FEATURES[ALWAYS_REQUEST_GPUS] in test.current_partition.features and not test.num_gpus_per_node:
test.num_gpus_per_node = test.default_num_gpus_per_node
log(f'num_gpus_per_node set to {test.num_gpus_per_node} for partition {test.current_partition.name}')
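
Taken together, a CPU-oriented test that calls assign_tasks_per_compute_unit() now also requests GPUs on partitions carrying the new feature. A minimal sketch of such a test; the class name, executable, and import paths are assumptions for illustration, not code from this PR:

    import reframe as rfm
    from reframe.core.builtins import parameter, run_after

    from eessi.testsuite import hooks
    from eessi.testsuite.constants import COMPUTE_UNIT, NODE, SCALES

    @rfm.simple_test
    class ExampleCpuTest(rfm.RunOnlyRegressionTest):
        # Hypothetical test: nothing here asks for GPUs explicitly.
        scale = parameter(SCALES.keys())
        valid_systems = ['*']
        valid_prog_environs = ['default']
        executable = 'hostname'

        @run_after('init')
        def set_scale(self):
            hooks.set_tag_scale(self)  # sets node_part and the default_num_*_per_node values

        @run_after('setup')
        def assign_resources(self):
            # One task per node; on a partition whose features include
            # 'always_request_gpus', the hook chain ends in
            # _check_always_request_gpus(), which sets num_gpus_per_node
            # to default_num_gpus_per_node.
            hooks.assign_tasks_per_compute_unit(self, COMPUTE_UNIT[NODE])
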
104 changes: 55 additions & 49 deletions eessi/testsuite/tests/apps/osu.py
@@ -51,40 +51,50 @@ class EESSI_OSU_Micro_Benchmarks_pt2pt(osu_benchmark):
# unset num_tasks_per_node from the hpctestlib.
num_tasks_per_node = None

@run_after('init')
def filter_scales_2gpus(self):
"""Filter out scales with < 2 GPUs if running on GPUs"""
if (
self.device_type == DEVICE_TYPES[GPU]
and SCALES[self.scale]['num_nodes'] == 1
and SCALES[self.scale].get('num_gpus_per_node', 2) < 2
):
self.valid_systems = [INVALID_SYSTEM]
log(f'valid_systems set to {self.valid_systems} for scale {self.scale} and device_type {self.device_type}')

@run_after('init')
def filter_benchmark_pt2pt(self):
""" Filter out all non-mpi.pt2pt benchmarks """
if not self.benchmark_info[0].startswith('mpi.pt2pt'):
self.valid_systems = [INVALID_SYSTEM]

@run_after('init')
def run_after_init(self):
"""hooks to run after init phase"""
# Note: device_buffers variable is inherited from the hpctestlib class and adds options to the launcher
# commands (before setup) if not equal to 'cpu'. We set it to 'cpu' initially and change it later in this hook depending on the test.
self.device_buffers = 'cpu'

# Filter on which scales are supported by the partitions defined in the ReFrame configuration
hooks.filter_supported_scales(self)

hooks.filter_valid_systems_by_device_type(self, required_device_type=self.device_type)
is_cuda_module = utils.is_cuda_required_module(self.module_name)
# This part of the hook is meant to be for the OSU cpu tests. This is required since the non CUDA module should
# be able to run in the GPU partition as well. This is specific for this test and not covered by the function
# above.
if is_cuda_module and self.device_type == DEVICE_TYPES[GPU]:
# Sets to cuda as device buffer only if the module is compiled with CUDA.
self.device_buffers = 'cuda'

# If the device_type is CPU then device buffer should always be CPU.
if self.device_type == DEVICE_TYPES[CPU]:
self.device_buffers = 'cpu'

# This part of the code removes the collective communication calls out of the run list since this test is only
# meant for pt2pt.
if not self.benchmark_info[0].startswith('mpi.pt2pt'):
self.valid_systems = []
hooks.set_modules(self)

@run_after('setup')
def adjust_executable_opts(self):
"""The option "D D" is only meant for Devices if and not for CPU tests. This option is added by hpctestlib to
all pt2pt tests which is not required."""
if(self.device_type == DEVICE_TYPES[CPU]):
self.executable_opts = [ele for ele in self.executable_opts if ele != 'D']
# Set scales as tags
hooks.set_tag_scale(self)

@run_after('init')
def set_device_buffers(self):
"""
device_buffers is inherited from the hpctestlib class and adds options to the launcher
commands in a @run_before('setup') hook if not equal to 'cpu'.
Therefore, we must set device_buffers *before* the @run_before('setup') hooks.
"""
if self.device_type == DEVICE_TYPES[GPU]:
self.device_buffers = 'cuda'

else:
# If the device_type is CPU then device_buffers should always be CPU.
self.device_buffers = 'cpu'

@run_after('init')
def set_tag_ci(self):
@@ -108,44 +118,40 @@ def set_mem(self):
requirement."""
self.extra_resources = {'memory': {'size': '12GB'}}

@run_after('init')
def set_num_tasks(self):
""" Setting scales as tags. """
hooks.set_tag_scale(self)
@run_after('setup')
def adjust_executable_opts(self):
"""The option "D D" is only meant for device runs, not for CPU tests.
This option is added by hpctestlib in a @run_before('setup') hook to all pt2pt tests, which is not required here.
Therefore we must override it *after* the 'setup' phase.
"""
if self.device_type == DEVICE_TYPES[CPU]:
self.executable_opts = [ele for ele in self.executable_opts if ele != 'D']


@run_after('setup')
def set_num_tasks_per_node(self):
""" Setting number of tasks per node and cpus per task in this function. This function sets num_cpus_per_task
for 1 node and 2 node options where the request is for full nodes."""
if(SCALES.get(self.scale).get('num_nodes') == 1):
if SCALES.get(self.scale).get('num_nodes') == 1:
hooks.assign_tasks_per_compute_unit(self, COMPUTE_UNIT[NODE], 2)
else:
hooks.assign_tasks_per_compute_unit(self, COMPUTE_UNIT[NODE])

@run_after('setup')
def set_num_gpus_per_node(self):
"""
This test does not require gpus and is for host to host within GPU nodes. But some systems do require a GPU
allocation for to perform any activity in the GPU nodes.
Set number of GPUs per node for GPU-to-GPU tests
"""
if(FEATURES[GPU] in self.current_partition.features and not utils.is_cuda_required_module(self.module_name)):
max_avail_gpus_per_node = utils.get_max_avail_gpus_per_node(self)
# Here for the 2_node test we assign max_avail_gpus_per_node but some systems cannot allocate 1_cpn_2_nodes
# for GPUs but need all gpus allocated within the 2 nodes for this work which. The test may fail under such
# conditions for the scale 1_cpn_2_nodes because it is simply not allowed.
self.num_gpus_per_node = self.default_num_gpus_per_node or max_avail_gpus_per_node
elif(FEATURES[GPU] in self.current_partition.features and utils.is_cuda_required_module(self.module_name)):
max_avail_gpus_per_node = utils.get_max_avail_gpus_per_node(self)
if(SCALES.get(self.scale).get('num_nodes') == 1):
# Skip the single node test if there is only 1 device in the node.
if(max_avail_gpus_per_node == 1):
self.skip(msg="There is only 1 device within the node. Skipping tests involving only 1 node.")
else:
self.num_gpus_per_node = 2
else:
# Note these settings are for 1_cpn_2_nodes. In that case we want to test for only 1 GPU per node since
# we have not requested for full nodes.
self.num_gpus_per_node = self.default_num_gpus_per_node or max_avail_gpus_per_node
if self.device_type == DEVICE_TYPES[GPU]:
# Skip single-node tests with less than 2 GPU devices in the node
self.skip_if(
SCALES[self.scale]['num_nodes'] == 1 and self.default_num_gpus_per_node < 2,
"There are < 2 GPU devices present in the node."
f" Skipping tests with device_type={DEVICE_TYPES[GPU]} involving < 2 GPUs and 1 node."
)
if not self.num_gpus_per_node:
self.num_gpus_per_node = self.default_num_gpus_per_node
log(f'num_gpus_per_node set to {self.num_gpus_per_node} for partition {self.current_partition.name}')


@rfm.simple_test
6 changes: 6 additions & 0 deletions setup.cfg
@@ -20,3 +20,9 @@ namespace_packages = eessi

[options.packages.find]
include = eessi*

[flake8]
max-line-length = 120
# ignore star imports (F403, F405)
# ignore obsolete warning (W503)
ignore = F403, F405, W503
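
With this configuration in place, running flake8 from the repository root (e.g. flake8 eessi/) picks up the 120-character line limit and the listed ignores, which is why the '# noqa' comment on the star import in hooks.py could be dropped.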
