Merge branch 'main' into pytorch

casparvl · Jun 13, 2024 · 07b2c1b · 07b2c1b
2 parents 73b7e84 + 46db269
commit 07b2c1b
Show file tree

Hide file tree

Showing 20 changed files with 697 additions and 86 deletions.
diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
@@ -1,5 +1,5 @@
 # documentation: https://help.github.com/en/articles/workflow-syntax-for-github-actions
-name: Tests for EESSI test suite, using EESSI pilot repo
+name: Tests for EESSI test suite, using EESSI production repo
 on: [push, pull_request, workflow_dispatch]
 permissions: read-all
 jobs:
@@ -9,39 +9,23 @@ jobs:
       fail-fast: false
       matrix:
         EESSI_VERSION:
-        - "2021.12"
+        - '2023.06'
     steps:
         - name: Check out software-layer repository
           uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1
           with:
             persist-credentials: false
 
-        - name: Mount EESSI CernVM-FS pilot repository
-          uses: cvmfs-contrib/github-action-cvmfs@55899ca74cf78ab874bdf47f5a804e47c198743c # v4.0
+        - name: Mount EESSI CernVM-FS production repository
+          uses: eessi/github-action-eessi@e1f8f20638ea417a18d23ab29443ee34794ff900 # v3.1.0
           with:
-              cvmfs_config_package: https://github.com/EESSI/filesystem-layer/releases/download/latest/cvmfs-config-eessi_latest_all.deb
-              cvmfs_http_proxy: DIRECT
-              cvmfs_repositories: pilot.eessi-hpc.org
+              eessi_stack_version: ${{matrix.EESSI_VERSION}}
 
         - name: Run test suite
           run: |
-            source /cvmfs/pilot.eessi-hpc.org/versions/${{matrix.EESSI_VERSION}}/init/bash
+            source /cvmfs/software.eessi.io/versions/${{matrix.EESSI_VERSION}}/init/bash
 
-            # install latest version of EasyBuild, to install ReFrame with it,
-            # since that includes the ReFrame test library (hpctestlib) that we rely on
-            python3 -m venv venv
-            source venv/bin/activate
-            pip3 install easybuild
-            eb --version
-            export EASYBUILD_PREFIX=$HOME/easybuild
-            # need to force module generation with --module-only --force because 'pip check' fails
-            # in EESSI pilot 2021.12, see https://github.com/EESSI/compatibility-layer/issues/152
-            eb ReFrame-4.3.3.eb || eb ReFrame-4.3.3.eb --module-only --force
-
-            # load ReFrame
-            module use $HOME/easybuild/modules/all
-
-            module load ReFrame/4.3.3
+            module load ReFrame
             reframe --version
 
             # configure ReFrame (cfr. https://reframe-hpc.readthedocs.io/en/stable/manpage.html#environment)

diff --git a/README.md b/README.md
@@ -98,3 +98,17 @@ is that it is easy to pull in updates from a feature branch using `git pull`.
 You can also push back changes to the feature branch directly, but note that
 you are pushing to the GitHub fork of another GitHub user, so _make sure they
 are ok with that_ before doing so!
+
+## Release management
+
+When a release of the EESSI test suite is made, the following things must be taken care of:
+
+- Version bump: in both `pyproject.toml` and `setup.cfg`;
+- Release notes: in `RELEASE_NOTES` and in the GitHub release (cf. https://github.com/EESSI/test-suite/releases/tag/v0.2.0);
+- Tag release on GitHub + publish release (incl. release notes);
+- Publishing release to PyPI:
+  ```
+  # example for version 0.2.0
+  python setup.py sdist
+  twine upload dist/eessi_testsuite-0.2.0.tar.gz
+  ```
diff --git a/RELEASE_NOTES b/RELEASE_NOTES
@@ -1,7 +1,7 @@
 This file contains a description of the major changes to the EESSI test suite.
 For more detailed information, please see the git log.
 
-v0.2.0 (7 march 2024)
+v0.2.0 (7 March 2024)
 ---------------------
 
 This is a minor release of the EESSI test-suite

diff --git a/config/github_actions.py b/config/github_actions.py
@@ -18,14 +18,23 @@
                     'launcher': 'local',
                     'environs': ['default'],
                     'features': [FEATURES[CPU]] + list(SCALES.keys()),
-                    'processor': {'num_cpus': 2},
+                    'processor': {
+                        'num_cpus': 2,
+                        'num_cpus_per_core': 1,
+                    },
                     'resources': [
                         {
                             'name': 'memory',
                             'options': ['--mem={size}'],
                         }
                     ],
-                    'max_jobs': 1
+                    'max_jobs': 1,
+                    'extras': {
+                        # Make sure to round down, otherwise a job might ask for more mem than is available
+                        # per node
+                        # This is a fictional amount: GitHub Actions runners probably have less, but only --dry-run is done there
+                        'mem_per_node': 30 * 1024  # in MiB
+                    },
                 }
             ]
         }

diff --git a/config/it4i_karolina.py b/config/it4i_karolina.py
@@ -44,6 +44,12 @@
                         # Avoid https://github.com/EESSI/software-layer/issues/136
                         # Can be taken out once we don't care about old OpenMPI versions anymore (pre-4.1.1)
                         'export OMPI_MCA_pml=ucx',
+                        # Work around "Failed to modify UD QP to INIT on mlx5_0: Operation not permitted" issue
+                        # until we can resolve this through an Lmod hook in host_injections
+                        # (then these OMPI_MCA_mtl & OMPI_MCA_btl settings can be removed again).
+                        # See https://github.com/EESSI/software-layer/issues/456#issuecomment-2107755266
+                        'export OMPI_MCA_mtl="^ofi"',
+                        'export OMPI_MCA_btl="^ofi"',
                     ],
                     'launcher': 'mpirun',
                     # Use --export=None to avoid that login environment is passed down to submitted jobs
@@ -53,6 +59,11 @@
                     'features': [
                         FEATURES[CPU],
                     ] + list(SCALES.keys()),
+                    'extras': {
+                        # Make sure to round down, otherwise a job might ask for more mem than is available
+                        # per node
+                        'mem_per_node': 219.345 * 1024  # in MiB
+                    },
                     'descr': 'CPU Universal Compute Nodes, see https://docs.it4i.cz/karolina/hardware-overview/'
                 },
                 # We don't have GPU budget on Karolina at this time

diff --git a/config/izum_vega.py b/config/izum_vega.py
@@ -59,47 +59,57 @@
                     'features': [
                         FEATURES[CPU],
                     ] + list(SCALES.keys()),
+                    'extras': {
+                        # Make sure to round down, otherwise a job might ask for more mem than is available
+                        # per node
+                        'mem_per_node': 238.418 * 1024  # in MiB
+                    },
                     'descr': 'CPU partition Standard, see https://en-doc.vega.izum.si/architecture/'
                 },
-                {
-                    'name': 'gpu',
-                    'scheduler': 'slurm',
-                    'prepare_cmds': [
-                        'source %s' % common_eessi_init(),
-                        # Pass job environment variables like $PATH, etc., into job steps
-                        'export SLURM_EXPORT_ENV=ALL',
-                        # Needed when using srun launcher
-                        # 'export SLURM_MPI_TYPE=pmix',  # WARNING: this broke the GROMACS on Vega
-                        # Avoid https://github.com/EESSI/software-layer/issues/136
-                        # Can be taken out once we don't care about old OpenMPI versions anymore (pre-4.1.1)
-                        'export OMPI_MCA_pml=ucx',
-                    ],
-                    'launcher': 'mpirun',
-                    # Use --export=None to avoid that login environment is passed down to submitted jobs
-                    'access': ['-p gpu', '--export=None'],
-                    'environs': ['default'],
-                    'max_jobs': 60,
-                    'devices': [
-                        {
-                            'type': DEVICE_TYPES[GPU],
-                            'num_devices': 4,
-                        }
-                    ],
-                    'resources': [
-                        {
-                            'name': '_rfm_gpu',
-                            'options': ['--gpus-per-node={num_gpus_per_node}'],
-                        },
-                        {
-                            'name': 'memory',
-                            'options': ['--mem={size}'],
-                        }
-                    ],
-                    'features': [
-                        FEATURES[GPU],
-                    ] + list(SCALES.keys()),
-                    'descr': 'GPU partition, see https://en-doc.vega.izum.si/architecture/'
-                },
+                # {
+                #     'name': 'gpu',
+                #     'scheduler': 'slurm',
+                #     'prepare_cmds': [
+                #         'source %s' % common_eessi_init(),
+                #         # Pass job environment variables like $PATH, etc., into job steps
+                #         'export SLURM_EXPORT_ENV=ALL',
+                #         # Needed when using srun launcher
+                #         # 'export SLURM_MPI_TYPE=pmix',  # WARNING: this broke the GROMACS on Vega
+                #         # Avoid https://github.com/EESSI/software-layer/issues/136
+                #         # Can be taken out once we don't care about old OpenMPI versions anymore (pre-4.1.1)
+                #         'export OMPI_MCA_pml=ucx',
+                #     ],
+                #     'launcher': 'mpirun',
+                #     # Use --export=None to avoid that login environment is passed down to submitted jobs
+                #     'access': ['-p gpu', '--export=None'],
+                #     'environs': ['default'],
+                #     'max_jobs': 60,
+                #     'devices': [
+                #         {
+                #             'type': DEVICE_TYPES[GPU],
+                #             'num_devices': 4,
+                #         }
+                #     ],
+                #     'resources': [
+                #         {
+                #             'name': '_rfm_gpu',
+                #             'options': ['--gpus-per-node={num_gpus_per_node}'],
+                #         },
+                #         {
+                #             'name': 'memory',
+                #             'options': ['--mem={size}'],
+                #         }
+                #     ],
+                #     'features': [
+                #         FEATURES[GPU],
+                #     ] + list(SCALES.keys()),
+                #     'extras': {
+                #         # Make sure to round down, otherwise a job might ask for more mem than is available
+                #         # per node
+                #         'mem_per_node': 476.837 * 1024  # in MiB (should be checked, it's unclear from slurm.conf)
+                #     },
+                #     'descr': 'GPU partition, see https://en-doc.vega.izum.si/architecture/'
+                # },
             ]
         },
     ],

diff --git a/config/surf_snellius.py b/config/surf_snellius.py
@@ -53,6 +53,11 @@
                     'features': [
                         FEATURES[CPU],
                     ] + list(SCALES.keys()),
+                    'extras': {
+                        # Make sure to round down, otherwise a job might ask for more mem than is available
+                        # per node
+                        'mem_per_node': 213.623 * 1024  # in MiB
+                    },
                     'descr': 'AMD Rome CPU partition with native EESSI stack'
                 },
                 {
@@ -72,6 +77,11 @@
                     'features': [
                         FEATURES[CPU],
                     ] + list(SCALES.keys()),
+                    'extras': {
+                        # Make sure to round down, otherwise a job might ask for more mem than is available
+                        # per node
+                        'mem_per_node': 320.434 * 1024  # in MiB
+                    },
                     'descr': 'AMD Genoa CPU partition with native EESSI stack'
                 },
 
@@ -105,6 +115,9 @@
                     ] + valid_scales_snellius_gpu,
                     'extras': {
                         GPU_VENDOR: GPU_VENDORS[NVIDIA],
+                        # Make sure to round down, otherwise a job might ask for more mem than is available
+                        # per node
+                        'mem_per_node': 457.763 * 1024  # in MiB
                     },
                     'descr': 'Nvidia A100 GPU partition with native EESSI stack'
                 },

diff --git a/config/vsc_hortense.py b/config/vsc_hortense.py
@@ -6,7 +6,9 @@
 from reframe.core.backends import register_launcher
 from reframe.core.launchers import JobLauncher
 
-from eessi.testsuite.common_config import common_logging_config, common_general_config, common_eessi_init
+from eessi.testsuite.common_config import (common_eessi_init,
+                                           common_general_config,
+                                           common_logging_config)
 from eessi.testsuite.constants import *  # noqa: F403
 
 account = "my-slurm-account"
@@ -54,6 +56,11 @@ def command(self, job):
                     'features': [
                         FEATURES[CPU],
                     ] + list(SCALES.keys()),
+                    'extras': {
+                        # Make sure to round down, otherwise a job might ask for more mem than is available
+                        # per node
+                        'mem_per_node': 252160,  # in MiB
+                    },
                 },
                 {
                     'name': 'cpu_rome_512gb',
@@ -81,6 +88,11 @@ def command(self, job):
                     'features': [
                         FEATURES[CPU],
                     ] + list(SCALES.keys()),
+                    'extras': {
+                        # Make sure to round down, otherwise a job might ask for more mem than is available
+                        # per node
+                        'mem_per_node': 508160,  # in MiB
+                    },
                 },
                 {
                     'name': 'cpu_milan',
@@ -108,6 +120,11 @@ def command(self, job):
                     'features': [
                         FEATURES[CPU],
                     ] + list(SCALES.keys()),
+                    'extras': {
+                        # Make sure to round down, otherwise a job might ask for more mem than is available
+                        # per node
+                        'mem_per_node': 252160,  # in MiB
+                    },
                 },
                 {
                     'name': 'gpu_rome_a100_40gb',
@@ -131,6 +148,9 @@ def command(self, job):
                     ] + list(SCALES.keys()),
                     'extras': {
                         GPU_VENDOR: GPU_VENDORS[NVIDIA],
+                        # Make sure to round down, otherwise a job might ask for more mem than is available
+                        # per node
+                        'mem_per_node': 254400,  # in MiB
                     },
                     'resources': [
                         {
@@ -172,6 +192,9 @@ def command(self, job):
                     ] + list(SCALES.keys()),
                     'extras': {
                         GPU_VENDOR: GPU_VENDORS[NVIDIA],
+                        # Make sure to round down, otherwise a job might ask for more mem than is available
+                        # per node
+                        'mem_per_node': 510720,  # in MiB
                     },
                     'resources': [
                         {

diff --git a/eessi/testsuite/constants.py b/eessi/testsuite/constants.py
@@ -53,8 +53,10 @@
     '1_core': {'num_nodes': 1, 'num_cpus_per_node': 1, 'num_gpus_per_node': 1},
     '2_cores': {'num_nodes': 1, 'num_cpus_per_node': 2, 'num_gpus_per_node': 1},
     '4_cores': {'num_nodes': 1, 'num_cpus_per_node': 4, 'num_gpus_per_node': 1},
-    '1_cpn_2_nodes': {'num_nodes': 2, 'num_cpus_per_node': 1, 'num_gpus_per_node': 1},
-    '1_cpn_4_nodes': {'num_nodes': 4, 'num_cpus_per_node': 1, 'num_gpus_per_node': 1},
+    # renamed from 1_cpn_2_nodes after v0.2.0 to make the name more distinct
+    '1cpn_2nodes': {'num_nodes': 2, 'num_cpus_per_node': 1, 'num_gpus_per_node': 1},
+    # renamed from 1_cpn_4_nodes after v0.2.0 to make the name more distinct
+    '1cpn_4nodes': {'num_nodes': 4, 'num_cpus_per_node': 1, 'num_gpus_per_node': 1},
     '1_8_node': {'num_nodes': 1, 'node_part': 8},  # 1/8 node
     '1_4_node': {'num_nodes': 1, 'node_part': 4},  # 1/4 node
     '1_2_node': {'num_nodes': 1, 'node_part': 2},  # 1/2 node