Commit 11146ef

Merge branch 'main' into pytorch

Caspar van Leeuwen committed Jul 1, 2024
2 parents 07b2c1b + 926c3e4

Showing 14 changed files with 476 additions and 55 deletions.
2 changes: 1 addition & 1 deletion CI/run_reframe.sh
@@ -48,7 +48,7 @@ if [ -z "${EESSI_TESTSUITE_URL}" ]; then
EESSI_TESTSUITE_URL='https://github.com/EESSI/test-suite.git'
fi
if [ -z "${EESSI_TESTSUITE_BRANCH}" ]; then
- EESSI_TESTSUITE_BRANCH='v0.2.0'
+ EESSI_TESTSUITE_BRANCH='v0.3.2'
fi
if [ -z "${EESSI_CVMFS_REPO}" ]; then
export EESSI_CVMFS_REPO=/cvmfs/software.eessi.io
1 change: 1 addition & 0 deletions README.md
@@ -104,6 +104,7 @@ are ok with that_ before doing so!
When a release of the EESSI test suite is made, the following things must be taken care of:

- Version bump: in both `pyproject.toml` and `setup.cfg`;
- Version bump the default `EESSI_TESTSUITE_BRANCH` in `CI/run_reframe.sh`;
- Release notes: in `RELEASE_NOTES` + in GitHub release (cfr. https://github.com/EESSI/test-suite/releases/tag/v0.2.0);
- Tag release on GitHub + publish release (incl. release notes);
- Publishing release to PyPI:
44 changes: 44 additions & 0 deletions RELEASE_NOTES
@@ -1,6 +1,50 @@
This file contains a description of the major changes to the EESSI test suite.
For more detailed information, please see the git log.

v0.3.2 (29 June 2024)
---------------------

This is a bugfix release of the EESSI test suite.

It includes:

- Add config for Deucalion (#162)

v0.3.1 (28 June 2024)
---------------------

This is a bugfix release of the EESSI test suite.

It includes:

- Correct required memory per node to MiB in ESPResSo test (#158)
- Change behavior for assign_tasks_per_compute_unit(test, COMPUTE_UNIT[CPU]) on hyperthreading-enabled systems (#160)
- Use compact process binding in ESPResSo test (#160)

v0.3.0 (27 June 2024)
---------------------

This is a minor release of the EESSI test suite.

It includes:

* Update config AWS MC cluster to use `software.eessi.io` (#126)
* Add test for QuantumESPRESSO (pw.x) (#128)
* Fix compact process binding for OpenMPI mpirun (#137)
* Use compact process binding for GROMACS (#139)
* Rename scale tags 1_cpn_2_nodes and 1_cpn_4_nodes (#140)
* Set SRUN_CPUS_PER_TASK for srun launcher (#141)
* Fix for "Failed to modify UD QP to INIT on mlx5_0" on Karolina CI runs (#142)
* Reduce the iteration count to make the OSU tests run faster, especially on slower interconnects (#143)
* Add test for ESPResSo (P3M) (#144)
* Use software.eessi.io repo in CI (#146)
* Add notes on release management to README (#148)
* Fix memory_per_node for Hortense (#151)
* Use MiB units for memory per node (#152)
* Added / updated memory for various systems in MiB units (#153)
* Add additional test for ESPResSo (LJ) (#155)
* Bump default version used in CI (#157)

v0.2.0 (7 March 2024)
---------------------

5 changes: 5 additions & 0 deletions config/aws_mc.py
@@ -105,6 +105,11 @@
# steps inherit environment. It doesn't hurt to define this even if srun is not used
'export SLURM_EXPORT_ENV=ALL'
],
'extras': {
# Node types have somewhat varying amounts of memory, but we'll make it easy on ourselves
# All should _at least_ have this amount (30GB * 1E9 / (1024*1024) = 28610 MiB)
'mem_per_node': 28610
},
}
for system in site_configuration['systems']:
for partition in system['partitions']:
2 changes: 1 addition & 1 deletion config/it4i_karolina.py
@@ -62,7 +62,7 @@
'extras': {
# Make sure to round down, otherwise a job might ask for more mem than is available
# per node
- 'mem_per_node': 219.345 * 1024 # in MiB
+ 'mem_per_node': 235520 # in MiB
},
'descr': 'CPU Universal Compute Nodes, see https://docs.it4i.cz/karolina/hardware-overview/'
},
4 changes: 3 additions & 1 deletion config/izum_vega.py
@@ -62,7 +62,9 @@
'extras': {
# Make sure to round down, otherwise a job might ask for more mem than is available
# per node
- 'mem_per_node': 238.418 * 1024 # in MiB
+ # NB: Vega's MaxMemPerNode is set to 256000, but this MUST be a MB/MiB units mistake
+ # Most likely, it is 256 GB, so 256*1E9/(1024*1024) MiB
+ 'mem_per_node': 244140 # in MiB
},
'descr': 'CPU partition Standard, see https://en-doc.vega.izum.si/architecture/'
},
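The memory comments above (the 30 GB value for the AWS MC cluster and the 256 GB note for Vega) both use the same decimal-GB-to-MiB, round-down arithmetic. A minimal illustration of that conversion; the helper function is hypothetical and not part of the test suite itself:

```python
# Illustrative helper for the GB -> MiB arithmetic used in these config comments;
# the function name is made up for this sketch.
import math

def gb_to_mib(gb: float) -> int:
    """Convert a vendor-advertised amount of GB (1e9 bytes) to MiB, rounding down."""
    return math.floor(gb * 1e9 / (1024 * 1024))

print(gb_to_mib(30))   # 28610  -> 'mem_per_node' used for the AWS MC cluster nodes
print(gb_to_mib(256))  # 244140 -> 'mem_per_node' used for Vega's standard CPU partition
```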
75 changes: 75 additions & 0 deletions config/macc_deucalion.py
@@ -0,0 +1,75 @@
import os

from eessi.testsuite.common_config import common_logging_config, common_general_config, common_eessi_init
from eessi.testsuite.constants import * # noqa: F403

# This config will write all staging, output and logging to subdirs under this prefix
# Override with RFM_PREFIX environment variable
reframe_prefix = os.path.join(os.environ['HOME'], 'reframe_runs')

# This is an example configuration file
site_configuration = {
'systems': [
{
'name': 'deucalion',
'descr': 'Deucalion, a EuroHPC JU system',
'modules_system': 'lmod',
'hostnames': ['ln*', 'cn*', 'gn*'],
'prefix': reframe_prefix,
'partitions': [
{
'name': 'arm',
'scheduler': 'slurm',
'prepare_cmds': [
'wrap.sh << EOF',
# bypass CPU autodetection for now aarch64/a64fx,
# see https://github.com/EESSI/software-layer/pull/608
'export EESSI_SOFTWARE_SUBDIR_OVERRIDE=aarch64/a64fx',
'source %s' % common_eessi_init(),
# Pass job environment variables like $PATH, etc., into job steps
'export SLURM_EXPORT_ENV=HOME,PATH,LD_LIBRARY_PATH,PYTHONPATH',
],
'launcher': 'mpirun',
# Use --export=None to avoid that login environment is passed down to submitted jobs
'access': ['-p normal-arm', '--export=None'],
'environs': ['default'],
'max_jobs': 120,
'resources': [
{
'name': 'memory',
'options': ['--mem={size}'],
}
],
'features': [
FEATURES[CPU],
] + list(SCALES.keys()),
'extras': {
# Make sure to round down, otherwise a job might ask for more mem than is available
# per node
# NB: Deucalion's MaxMemPerNode is undefined. Experimentally I found you cannot submit with
# more than --mem=30000M
'mem_per_node': 30000 # in MiB
},
'descr': 'CPU ARM A64FX partition, see https://docs.macc.fccn.pt/deucalion/#compute-nodes'
},
]
},
],
'environments': [
{
'name': 'default',
'cc': 'cc',
'cxx': '',
'ftn': '',
},
],
'logging': common_logging_config(reframe_prefix),
'general': [
{
# Enable automatic detection of CPU architecture for each partition
# See https://reframe-hpc.readthedocs.io/en/stable/configure.html#auto-detecting-processor-information
'remote_detect': True,
**common_general_config(reframe_prefix)
}
],
}
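The `mem_per_node` extra and the `memory` resource defined in this config are what memory-aware tests read at run time. Below is a minimal sketch of that pattern using ReFrame's `current_partition.extras` and `extra_resources`; the test class and the per-task memory figure are hypothetical, and the EESSI test suite provides its own hook for this rather than inline code:

```python
# Hypothetical sketch: cap a job's memory request at the partition's 'mem_per_node'
# extra defined above; real EESSI tests do this through a dedicated hook.
import reframe as rfm
from reframe.core.builtins import run_after


@rfm.simple_test
class MemAwareSketch(rfm.RunOnlyRegressionTest):
    valid_systems = ['deucalion:arm']
    valid_prog_environs = ['default']
    executable = 'true'
    num_tasks_per_node = 8

    @run_after('setup')
    def request_memory(self):
        # 'mem_per_node' (in MiB) comes from the partition 'extras' in the config
        max_mib = self.current_partition.extras['mem_per_node']
        wanted_mib = 2000 * self.num_tasks_per_node  # hypothetical per-task requirement
        # The 'memory' resource declared above injects '--mem={size}' into the job script
        self.extra_resources = {'memory': {'size': f'{min(wanted_mib, max_mib)}M'}}
```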
6 changes: 3 additions & 3 deletions config/surf_snellius.py
@@ -56,7 +56,7 @@
'extras': {
# Make sure to round down, otherwise a job might ask for more mem than is available
# per node
- 'mem_per_node': 213.623 * 1024 # in MiB
+ 'mem_per_node': 229376 # in MiB
},
'descr': 'AMD Rome CPU partition with native EESSI stack'
},
@@ -80,7 +80,7 @@
'extras': {
# Make sure to round down, otherwise a job might ask for more mem than is available
# per node
- 'mem_per_node': 320.434 * 1024 # in MiB
+ 'mem_per_node': 344064 # in MiB
},
'descr': 'AMD Genoa CPU partition with native EESSI stack'
},
@@ -117,7 +117,7 @@
GPU_VENDOR: GPU_VENDORS[NVIDIA],
# Make sure to round down, otherwise a job might ask for more mem than is available
# per node
- 'mem_per_node': 457.763 * 1024 # in MiB
+ 'mem_per_node': 491520 # in MiB
},
'descr': 'Nvidia A100 GPU partition with native EESSI stack'
},
2 changes: 2 additions & 0 deletions eessi/testsuite/constants.py
@@ -4,6 +4,7 @@

AMD = 'AMD'
CI = 'CI'
HWTHREAD = 'HWTHREAD'
CPU = 'CPU'
CPU_SOCKET = 'CPU_SOCKET'
NUMA_NODE = 'NUMA_NODE'
@@ -20,6 +21,7 @@
}

COMPUTE_UNIT = {
HWTHREAD: 'hwthread',
CPU: 'cpu',
CPU_SOCKET: 'cpu_socket',
NUMA_NODE: 'numa_node',
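The new `HWTHREAD` constant is meant to be passed to `hooks.assign_tasks_per_compute_unit` (see the `hooks.py` changes below). A minimal, hypothetical test sketch; a real EESSI test would also define the usual scale parameter and tags before this hook runs:

```python
# Hypothetical sketch of a test requesting one task per hardware thread via the
# new COMPUTE_UNIT[HWTHREAD] constant; not an actual test from the suite.
import reframe as rfm
from reframe.core.builtins import run_after

from eessi.testsuite import hooks
from eessi.testsuite.constants import COMPUTE_UNIT, HWTHREAD


@rfm.simple_test
class HwthreadSketch(rfm.RunOnlyRegressionTest):
    valid_systems = ['*']
    valid_prog_environs = ['default']
    executable = 'true'

    @run_after('setup')
    def assign_tasks(self):
        # On the docstring's example node (2 sockets, 64 cores, 128 hwthreads)
        # this yields 128 tasks with 1 cpu per task
        hooks.assign_tasks_per_compute_unit(self, COMPUTE_UNIT[HWTHREAD])
```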
70 changes: 63 additions & 7 deletions eessi/testsuite/hooks.py
Expand Up @@ -66,20 +66,19 @@ def assign_tasks_per_compute_unit(test: rfm.RegressionTest, compute_unit: str, n
Total task count is determined based on the number of nodes to be used in the test.
Behaviour of this function is (usually) sensible for MPI tests.
WARNING: when using COMPUTE_UNIT[HWTHREAD] and invoking a hook for process binding, please verify that process
binding happens correctly.
Arguments:
- test: the ReFrame test to which this hook should apply
- compute_unit: a device as listed in eessi.testsuite.constants.COMPUTE_UNIT
Examples:
On a single node with 2 sockets, 64 cores and 128 hyperthreads:
- assign_tasks_per_compute_unit(test, COMPUTE_UNIT[CPU]) will launch 64 tasks with 1 thread
- assign_tasks_per_compute_unit(test, COMPUTE_UNIT[CPU_SOCKET]) will launch 2 tasks with 32 threads per task
- assign_tasks_per_compute_unit(test, COMPUTE_UNIT[HWTHREAD]) will launch 128 tasks with 1 thread per task
- assign_tasks_per_compute_unit(test, COMPUTE_UNIT[CPU]) will launch 64 tasks with 2 threads per task
- assign_tasks_per_compute_unit(test, COMPUTE_UNIT[CPU_SOCKET]) will launch 2 tasks with 64 threads per task
Future work:
Currently, on a single node with 2 sockets, 64 cores and 128 hyperthreads, this
- assign_one_task_per_compute_unit(test, COMPUTE_UNIT[CPU], true) launches 128 tasks with 1 thread
- assign_one_task_per_compute_unit(test, COMPUTE_UNIT[CPU_SOCKET], true) launches 2 tasks with 64 threads per task
In the future, we'd like to add an argument that disables spawning tasks for hyperthreads.
"""
if num_per != 1 and compute_unit in [COMPUTE_UNIT[GPU], COMPUTE_UNIT[CPU], COMPUTE_UNIT[CPU_SOCKET]]:
raise NotImplementedError(
@@ -100,12 +99,26 @@ def assign_tasks_per_compute_unit(test: rfm.RegressionTest, compute_unit: str, n
)

_assign_default_num_cpus_per_node(test)
# If on
# - a hyperthreading system
# - num_cpus_per_node was set by the scale
# - compute_unit != COMPUTE_UNIT[HWTHREAD]
# double the default_num_cpus_per_node. In this scenario, if the scale asks for e.g. 1 num_cpus_per_node and
# the test doesn't state it wants to use hwthreads, we want to launch on two hyperthreads, i.e. one physical core
if SCALES[test.scale].get('num_cpus_per_node') is not None and compute_unit != COMPUTE_UNIT[HWTHREAD]:
check_proc_attribute_defined(test, 'num_cpus_per_core')
num_cpus_per_core = test.current_partition.processor.num_cpus_per_core
# On a hyperthreading system?
if num_cpus_per_core > 1:
test.default_num_cpus_per_node = test.default_num_cpus_per_node * num_cpus_per_core

if FEATURES[GPU] in test.current_partition.features:
_assign_default_num_gpus_per_node(test)

if compute_unit == COMPUTE_UNIT[GPU]:
_assign_one_task_per_gpu(test)
elif compute_unit == COMPUTE_UNIT[HWTHREAD]:
_assign_one_task_per_hwthread(test)
elif compute_unit == COMPUTE_UNIT[CPU]:
_assign_one_task_per_cpu(test)
elif compute_unit == COMPUTE_UNIT[CPU_SOCKET]:
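A standalone illustration (plain Python, no ReFrame) of the task and thread counts described in the updated docstring, including the effect of the hyperthread doubling added above when the compute unit is not `HWTHREAD`:

```python
# Illustration of the docstring's examples for a node with 2 sockets, 64 cores
# and 128 hardware threads (SMT-2); values only, not the hook itself.
num_sockets = 2
num_cores = 64
num_cpus_per_core = 2                      # hardware threads per core
num_hwthreads = num_cores * num_cpus_per_core

def tasks_and_cpus(compute_unit: str):
    # 'cpus' means hardware threads, following ReFrame's processor info
    if compute_unit == 'hwthread':
        return num_hwthreads, 1                            # 128 tasks x 1 thread
    if compute_unit == 'cpu':
        return num_cores, num_cpus_per_core                # 64 tasks x 2 threads
    if compute_unit == 'cpu_socket':
        return num_sockets, num_hwthreads // num_sockets   # 2 tasks x 64 threads
    raise ValueError(f'unsupported compute unit: {compute_unit}')

for cu in ('hwthread', 'cpu', 'cpu_socket'):
    print(cu, tasks_and_cpus(cu))
```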
@@ -273,6 +286,44 @@ def _assign_one_task_per_cpu(test: rfm.RegressionTest):
--setvar num_tasks_per_node=<x> and/or
--setvar num_cpus_per_task=<y>.
Default resources requested:
- num_tasks_per_node = default_num_cpus_per_node
- num_cpus_per_task = default_num_cpus_per_node / num_tasks_per_node
"""
# neither num_tasks_per_node nor num_cpus_per_task are set
if not test.num_tasks_per_node and not test.num_cpus_per_task:
check_proc_attribute_defined(test, 'num_cpus_per_core')
test.num_tasks_per_node = max(
int(test.default_num_cpus_per_node / test.current_partition.processor.num_cpus_per_core),
1
)
test.num_cpus_per_task = int(test.default_num_cpus_per_node / test.num_tasks_per_node)

# num_tasks_per_node is not set, but num_cpus_per_task is
elif not test.num_tasks_per_node:
test.num_tasks_per_node = int(test.default_num_cpus_per_node / test.num_cpus_per_task)

# num_cpus_per_task is not set, but num_tasks_per_node is
elif not test.num_cpus_per_task:
test.num_cpus_per_task = int(test.default_num_cpus_per_node / test.num_tasks_per_node)

else:
pass # both num_tasks_per_node and num_cpus_per_task are already set

test.num_tasks = test.num_nodes * test.num_tasks_per_node

log(f'num_tasks_per_node set to {test.num_tasks_per_node}')
log(f'num_cpus_per_task set to {test.num_cpus_per_task}')
log(f'num_tasks set to {test.num_tasks}')


def _assign_one_task_per_hwthread(test: rfm.RegressionTest):
"""
Sets num_tasks_per_node and num_cpus_per_task such that it will run one task per hardware thread,
unless specified with:
--setvar num_tasks_per_node=<x> and/or
--setvar num_cpus_per_task=<y>.
Default resources requested:
- num_tasks_per_node = default_num_cpus_per_node
- num_cpus_per_task = default_num_cpus_per_node / num_tasks_per_node
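A standalone sketch of the branching in `_assign_one_task_per_cpu` above, showing the defaults for a 128-hwthread (SMT-2) node and how a single `--setvar` override is completed:

```python
# Illustration of the fallback logic in _assign_one_task_per_cpu above; plain Python,
# with values chosen to match a 128-hwthread node with 2 hwthreads per core.
default_num_cpus_per_node = 128
num_cpus_per_core = 2

def resolve(num_tasks_per_node=None, num_cpus_per_task=None):
    if not num_tasks_per_node and not num_cpus_per_task:
        num_tasks_per_node = max(int(default_num_cpus_per_node / num_cpus_per_core), 1)
        num_cpus_per_task = int(default_num_cpus_per_node / num_tasks_per_node)
    elif not num_tasks_per_node:
        num_tasks_per_node = int(default_num_cpus_per_node / num_cpus_per_task)
    elif not num_cpus_per_task:
        num_cpus_per_task = int(default_num_cpus_per_node / num_tasks_per_node)
    return num_tasks_per_node, num_cpus_per_task

print(resolve())                       # (64, 2): one task per physical core
print(resolve(num_cpus_per_task=4))    # (32, 4): --setvar num_cpus_per_task=4
print(resolve(num_tasks_per_node=16))  # (16, 8): --setvar num_tasks_per_node=16
```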
@@ -558,6 +609,10 @@ def set_compact_process_binding(test: rfm.RegressionTest):
This hook sets a binding policy for process binding.
More specifically, it will bind each process to subsequent domains of test.num_cpus_per_task cores.
Arguments:
- test: the ReFrame test to which this hook should apply
A few examples:
- Pure MPI (test.num_cpus_per_task = 1) will result in binding 1 process to each core.
This happens in a compact way, i.e. rank 0 to core 0, rank 1 to core 1, etc.
@@ -572,6 +627,7 @@

# Check if hyperthreading is enabled. If so, divide the number of cpus per task by the number
# of hw threads per core to get a physical core count
# TODO: check if this also leads to sensible binding when using COMPUTE_UNIT[HWTHREAD]
check_proc_attribute_defined(test, 'num_cpus_per_core')
num_cpus_per_core = test.current_partition.processor.num_cpus_per_core
physical_cpus_per_task = int(test.num_cpus_per_task / num_cpus_per_core)
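A short standalone illustration of the core-count arithmetic above: on an SMT system, each rank's binding domain is `num_cpus_per_task` hardware threads divided by the number of hardware threads per core:

```python
# Illustration of the physical-core arithmetic above (plain Python, example values).
num_cpus_per_core = 2     # hardware threads per physical core (SMT-2)
num_cpus_per_task = 16    # hardware threads assigned to each task

physical_cpus_per_task = int(num_cpus_per_task / num_cpus_per_core)  # 8 cores per rank
# Compact binding then maps rank 0 to cores 0-7, rank 1 to cores 8-15, and so on.
print(f'bind each rank to a domain of {physical_cpus_per_task} physical cores')
```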