Set mem hook #1

Merged
8 commits merged on May 23, 2024
8 changes: 7 additions & 1 deletion config/github_actions.py
@@ -25,7 +25,13 @@
'options': ['--mem={size}'],
}
],
'max_jobs': 1
'max_jobs': 1,
'extras': {
# Make sure to round down, otherwise a job might ask for more mem than is available
# per node
# This is a fictional amount; GitHub Actions probably has less, but this config only does --dry-run anyway
'mem_per_node': 30 # in GiB
},
}
]
}
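For context: the 'memory' resource shown above and the new 'mem_per_node' extra are used as a pair by the req_memory_per_node hook introduced in this PR: the hook reads 'mem_per_node' to compute a sensible request and fills the resource's {size} placeholder with the result. A minimal illustrative partition snippet (the partition name and sizes are made up, not taken from any of these configs) could look like:

```python
# Illustrative ReFrame partition snippet, not taken from this PR; names and sizes are examples only
{
    'name': 'example_cpu',
    'scheduler': 'slurm',
    'resources': [
        {
            # The req_memory_per_node hook fills in {size}, e.g. --mem=42950M
            'name': 'memory',
            'options': ['--mem={size}'],
        }
    ],
    'extras': {
        # Total usable memory per node in GiB; round down so a job never asks
        # for more memory than a node actually has
        'mem_per_node': 64,
    },
}
```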
5 changes: 5 additions & 0 deletions config/it4i_karolina.py
@@ -53,6 +53,11 @@
'features': [
FEATURES[CPU],
] + list(SCALES.keys()),
'extras': {
# Make sure to round down, otherwise a job might ask for more mem than is available
# per node
'mem_per_node': 219.345 # in GiB
},
'descr': 'CPU Universal Compute Nodes, see https://docs.it4i.cz/karolina/hardware-overview/'
},
# We don't have GPU budget on Karolina at this time
88 changes: 49 additions & 39 deletions config/izum_vega.py
@@ -59,47 +59,57 @@
'features': [
FEATURES[CPU],
] + list(SCALES.keys()),
'extras': {
# Make sure to round down, otherwise a job might ask for more mem than is available
# per node
'mem_per_node': 238.418 # in GiB
},
'descr': 'CPU partition Standard, see https://en-doc.vega.izum.si/architecture/'
},
{
'name': 'gpu',
'scheduler': 'slurm',
'prepare_cmds': [
'source %s' % common_eessi_init(),
# Pass job environment variables like $PATH, etc., into job steps
'export SLURM_EXPORT_ENV=ALL',
# Needed when using srun launcher
# 'export SLURM_MPI_TYPE=pmix', # WARNING: this broke the GROMACS on Vega
# Avoid https://github.com/EESSI/software-layer/issues/136
# Can be taken out once we don't care about old OpenMPI versions anymore (pre-4.1.1)
'export OMPI_MCA_pml=ucx',
],
'launcher': 'mpirun',
# Use --export=None to avoid that login environment is passed down to submitted jobs
'access': ['-p gpu', '--export=None'],
'environs': ['default'],
'max_jobs': 60,
'devices': [
{
'type': DEVICE_TYPES[GPU],
'num_devices': 4,
}
],
'resources': [
{
'name': '_rfm_gpu',
'options': ['--gpus-per-node={num_gpus_per_node}'],
},
{
'name': 'memory',
'options': ['--mem={size}'],
}
],
'features': [
FEATURES[GPU],
] + list(SCALES.keys()),
'descr': 'GPU partition, see https://en-doc.vega.izum.si/architecture/'
},
# {
# 'name': 'gpu',
# 'scheduler': 'slurm',
# 'prepare_cmds': [
# 'source %s' % common_eessi_init(),
# # Pass job environment variables like $PATH, etc., into job steps
# 'export SLURM_EXPORT_ENV=ALL',
# # Needed when using srun launcher
# # 'export SLURM_MPI_TYPE=pmix', # WARNING: this broke the GROMACS on Vega
# # Avoid https://github.com/EESSI/software-layer/issues/136
# # Can be taken out once we don't care about old OpenMPI versions anymore (pre-4.1.1)
# 'export OMPI_MCA_pml=ucx',
# ],
# 'launcher': 'mpirun',
# # Use --export=None to avoid that login environment is passed down to submitted jobs
# 'access': ['-p gpu', '--export=None'],
# 'environs': ['default'],
# 'max_jobs': 60,
# 'devices': [
# {
# 'type': DEVICE_TYPES[GPU],
# 'num_devices': 4,
# }
# ],
# 'resources': [
# {
# 'name': '_rfm_gpu',
# 'options': ['--gpus-per-node={num_gpus_per_node}'],
# },
# {
# 'name': 'memory',
# 'options': ['--mem={size}'],
# }
# ],
# 'features': [
# FEATURES[GPU],
# ] + list(SCALES.keys()),
# 'extras': {
# # Make sure to round down, otherwise a job might ask for more mem than is available
# # per node
# 'mem_per_node': 476.837 # in GiB (should be checked, it's unclear from slurm.conf)
# },
# 'descr': 'GPU partition, see https://en-doc.vega.izum.si/architecture/'
# },
]
},
],
13 changes: 13 additions & 0 deletions config/surf_snellius.py
@@ -53,6 +53,11 @@
'features': [
FEATURES[CPU],
] + list(SCALES.keys()),
'extras': {
# Make sure to round down, otherwise a job might ask for more mem than is available
# per node
'mem_per_node': 213.623 # in GiB
},
'descr': 'AMD Rome CPU partition with native EESSI stack'
},
{
@@ -72,6 +77,11 @@
'features': [
FEATURES[CPU],
] + list(SCALES.keys()),
'extras': {
# Make sure to round down, otherwise a job might ask for more mem than is available
# per node
'mem_per_node': 320.434 # in GiB
},
'descr': 'AMD Genoa CPU partition with native EESSI stack'
},

@@ -105,6 +115,9 @@
] + valid_scales_snellius_gpu,
'extras': {
GPU_VENDOR: GPU_VENDORS[NVIDIA],
# Make sure to round down, otherwise a job might ask for more mem than is available
# per node
'mem_per_node': 457.763 # in GiB
},
'descr': 'Nvidia A100 GPU partition with native EESSI stack'
},
88 changes: 87 additions & 1 deletion eessi/testsuite/hooks.py
@@ -9,7 +9,7 @@

from eessi.testsuite.constants import *
from eessi.testsuite.utils import (get_max_avail_gpus_per_node, is_cuda_required_module, log,
check_proc_attribute_defined)
check_proc_attribute_defined, check_extras_key_defined)


def _assign_default_num_cpus_per_node(test: rfm.RegressionTest):
@@ -373,6 +373,92 @@ def filter_valid_systems_by_device_type(test: rfm.RegressionTest, required_devic
log(f'valid_systems set to {test.valid_systems}')


def req_memory_per_node(test: rfm.RegressionTest, app_mem_req):
"""
This hook requests a specific amount of memory per node from the batch scheduler.
First, it computes which fraction of the node's CPUs is requested, and the corresponding (proportional)
amount of memory.
Then, the hook compares this to how much memory the application claims to need per node (app_mem_req)
and passes the maximum of these two numbers to the batch scheduler as the memory request.

Note: using this hook requires that the ReFrame configuration defines system.partition.extras['mem_per_node']
That field should be defined in GiB

Arguments:
- test: the ReFrame test to which this hook should apply
- app_mem_req: the amount of memory this application needs (per node) in GiB

Example 1:
- A system with 128 cores and 64 GiB per node.
- The test is launched on 64 cores
- The app_mem_req is 40 (GiB)
In this case, the test requests 50% of the CPUs. Thus, the proportional amount of memory is 32 GiB.
The app_mem_req is higher. Thus, 40 GiB (per node) is requested from the batch scheduler.

Example 2:
- A system with 128 cores and 128 GiB per node.
- The test is launched on 64 cores
- the app_mem_req is 40 (GiB)
In this case, the test requests 50% of the CPUs. Thus, the proportional amount of memory is 64 GiB.
This is higher than the app_mem_req. Thus, 64 GiB (per node) is requested from the batch scheduler.
"""
# Check that the systems.partitions.extras dict in the ReFrame config contains mem_per_node
check_extras_key_defined(test, 'mem_per_node')
# Skip if the current partition doesn't have sufficient memory to run the application
msg = f"Skipping test: nodes in this partition only have {test.current_partition.extras['mem_per_node']} GiB"
msg += " memory available (per node) accodring to the current ReFrame configuration,"
msg += f" but {app_mem_req} GiB is needed"
test.skip_if(test.current_partition.extras['mem_per_node'] < app_mem_req, msg)

# Compute what is higher: the requested memory, or the memory available proportional to requested CPUs
# Fraction of CPU cores requested
check_proc_attribute_defined(test, 'num_cpus')
cpu_fraction = test.num_tasks_per_node * test.num_cpus_per_task / test.current_partition.processor.num_cpus
proportional_mem = cpu_fraction * test.current_partition.extras['mem_per_node']

scheduler_name = test.current_partition.scheduler.registered_name
if scheduler_name == 'slurm' or scheduler_name == 'squeue':
# SLURM's --mem defines memory per node, see https://slurm.schedmd.com/sbatch.html
# SLURM uses megabytes and gigabytes, i.e. base-10, so the conversion factor is 1000-based, not 1024-based
# Thus, we convert from GiB (gibibytes) to MB (megabytes) (1024 * 1024 * 1024 / (1000 * 1000) = 1073.741824)
app_mem_req = math.ceil(1073.741824 * app_mem_req)
log(f"Memory requested by application: {app_mem_req} MB")
proportional_mem = math.floor(1073.741824 * proportional_mem)
log(f"Memory proportional to the core count: {proportional_mem} MB")

# Request the maximum of proportional_mem and app_mem_req from the scheduler
req_mem_per_node = max(proportional_mem, app_mem_req)

test.extra_resources = {'memory': {'size': f'{req_mem_per_node}M'}}
log(f"Requested {req_mem_per_node} MB per node from the SLURM batch scheduler")

elif scheduler_name == 'torque':
# Torque/Moab requires asking for --pmem (--mem only works for single-node jobs and thus doesn't generalize)
# See https://docs.adaptivecomputing.com/10-0-1/Torque/torque.htm#topics/torque/3-jobs/3.1.3-requestingRes.htm
# Units are MiB according to the documentation, so we simply multiply by 1024
# We immediately divide by num_tasks_per_node (before rounding), since --pmem specifies memory _per process_
app_mem_req_task = math.ceil(1024 * app_mem_req / test.num_tasks_per_node)
proportional_mem_task = math.floor(1024 * proportional_mem / test.num_tasks_per_node)

# Request the maximum of proportional_mem_task and app_mem_req_task from the scheduler
req_mem_per_task = max(proportional_mem_task, app_mem_req_task)

# We assume here that the ReFrame config defines the extra resource 'memory' as asking for --pmem
# i.e. 'options': ['--pmem={size}']
test.extra_resources = {'memory': {'size': f'{req_mem_per_task}mb'}}
log(f"Requested {req_mem_per_task} MiB per task from the torque batch scheduler")

else:
logger = rflog.getlogger()
msg = "hooks.req_memory_per_node does not support the scheduler you configured"
msg += f" ({test.current_partition.scheduler.registered_name})."
msg += " The test will run, but since it doesn't request the required amount of memory explicitely,"
msg += " it may result in an out-of-memory error."
msg += " Please expand the functionality of hooks.req_memory_per_node for your scheduler."
# Warnings will, at the default log level, be printed on stdout when executing the ReFrame command
logger.warning(msg)


def set_modules(test: rfm.RegressionTest):
"""
Skip current test if module_name is not among a list of modules,
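To make the docstring's Example 1 concrete, here is a standalone sketch (not part of the PR's code) of the arithmetic req_memory_per_node performs for Slurm, assuming 64 single-CPU tasks on a 128-core node with 64 GiB of memory and an application that claims 40 GiB:

```python
import math

# Numbers from Example 1 in the req_memory_per_node docstring
num_tasks_per_node = 64
num_cpus_per_task = 1
num_cpus = 128           # cores per node
mem_per_node = 64        # GiB, as set in the partition 'extras'
app_mem_req = 40         # GiB, claimed by the application

cpu_fraction = num_tasks_per_node * num_cpus_per_task / num_cpus  # 0.5
proportional_mem = cpu_fraction * mem_per_node                    # 32 GiB

# SLURM's --mem takes megabytes; the GiB -> MB factor is 1024**3 / 1000**2
gib_to_mb = 1073.741824
app_mem_req_mb = math.ceil(gib_to_mb * app_mem_req)               # 42950 MB
proportional_mem_mb = math.floor(gib_to_mb * proportional_mem)    # 34359 MB

# The larger of the two wins: here the application's 40 GiB request
print(f"--mem={max(proportional_mem_mb, app_mem_req_mb)}M")       # --mem=42950M
```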
5 changes: 5 additions & 0 deletions eessi/testsuite/tests/apps/QuantumESPRESSO.py
@@ -97,6 +97,11 @@ def run_after_setup(self):
else:
hooks.assign_tasks_per_compute_unit(test=self, compute_unit=COMPUTE_UNIT[CPU])

@run_after('setup')
def request_mem(self):
# Requested memory in GiB: roughly 0.9 GiB per task plus a 4 GiB base per node
memory_required = self.num_tasks_per_node * 0.9 + 4
hooks.req_memory_per_node(test=self, app_mem_req=memory_required)

@run_after('setup')
def set_omp_num_threads(self):
"""
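As a rough illustration (again not part of the PR), the formula in request_mem scales linearly with the task count; the hook then compares the result against the memory share proportional to the requested cores:

```python
# Illustration only: evaluate the request_mem formula for a few task counts
for num_tasks_per_node in (16, 64, 128):
    memory_required = num_tasks_per_node * 0.9 + 4  # GiB, same formula as in request_mem
    print(f"{num_tasks_per_node} tasks/node -> {memory_required:.1f} GiB requested")
# 16 tasks/node -> 18.4 GiB requested
# 64 tasks/node -> 61.6 GiB requested
# 128 tasks/node -> 119.2 GiB requested
```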
38 changes: 36 additions & 2 deletions eessi/testsuite/utils.py
@@ -145,7 +145,41 @@ def check_proc_attribute_defined(test: rfm.RegressionTest, attribute) -> bool:
else:
msg = (
"This test's current_partition is not set yet. "
"The function utils.proc_attribute_defined should only be called after the setup() phase of ReFrame."
"The function utils.check_proc_attribute_defined should only be called after the setup() phase of ReFrame."
"This is a programming error, please report this issue."
)
raise AttributeError(msg)


def check_extras_key_defined(test: rfm.RegressionTest, extra_key) -> bool:
"""
Checks if a specific key is defined in the 'extras' dictionary for the current partition
(i.e. if test.current_partition.extras[extra_key] is defined).
If not, raises an informative error.
Note that partition extras are defined by free text keys, so any string is (potentially) valid.

Arguments:
- test: the ReFrame regression test instance for which to check whether the key is defined in 'extras'
- extra_key: the key to look for in the 'extras' dictionary

Return:
- True (bool) if the key is defined
- Function does not return (but raises an error) if the key is undefined
"""

if test.current_partition:
if extra_key in test.current_partition.extras:
return True
else:
msg = (
f"Key '{extra_key}' missing in the 'extras' dictionary for partition '{test.current_partition.name}'."
"Please define this key for the relevant partition in the ReFrame configuration file (see "
"https://reframe-hpc.readthedocs.io/en/stable/config_reference.html#config.systems.partitions.extras)."
)
else:
msg = (
"This test's current_partition is not set yet. "
"The function utils.check_extras_key_defined should only be called after the setup() phase of ReFrame."
"This is a programming error, please report this issue."
)
raise AttributeError(msg)
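Finally, a minimal standalone sketch of the error path that check_extras_key_defined guards; it assumes the EESSI test suite (and thus eessi.testsuite.utils) is importable, and the two classes are simple stand-ins for ReFrame objects used only for this demonstration:

```python
from eessi.testsuite.utils import check_extras_key_defined

class FakePartition:
    # Stand-in for a ReFrame partition: 'mem_per_node' is deliberately missing from 'extras'
    name = 'cpu'
    extras = {}

class FakeTest:
    # Stand-in for a ReFrame test after the setup() phase
    current_partition = FakePartition()

try:
    check_extras_key_defined(FakeTest(), 'mem_per_node')
except AttributeError as err:
    # The message points the user to the 'extras' section of the ReFrame configuration
    print(err)
```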