From 12914c01c4d8567f5699a3ddafb2350fc561619c Mon Sep 17 00:00:00 2001 From: Caspar van Leeuwen Date: Wed, 4 Sep 2024 14:51:00 +0200 Subject: [PATCH 1/5] Add ci_config for Azure. We only have a single zen4 node for now, so limit to 1_node test --- CI/azure_mc/ci_config.sh | 4 ++++ 1 file changed, 4 insertions(+) create mode 100644 CI/azure_mc/ci_config.sh diff --git a/CI/azure_mc/ci_config.sh b/CI/azure_mc/ci_config.sh new file mode 100644 index 00000000..eaf2971f --- /dev/null +++ b/CI/azure_mc/ci_config.sh @@ -0,0 +1,4 @@ +# Configurable items +if [ -z "${REFRAME_ARGS}" ]; then + REFRAME_ARGS="--tag CI --tag 1_node" +fi From 854b09725431808a38de19a8f2cad17357051864 Mon Sep 17 00:00:00 2001 From: Caspar van Leeuwen Date: Wed, 4 Sep 2024 14:58:08 +0200 Subject: [PATCH 2/5] Add azure config --- config/azure_mc.py | 86 ++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 86 insertions(+) create mode 100644 config/azure_mc.py diff --git a/config/azure_mc.py b/config/azure_mc.py new file mode 100644 index 00000000..90e557f5 --- /dev/null +++ b/config/azure_mc.py @@ -0,0 +1,86 @@ +# WARNING: for CPU autodetect to work correctly you need to +# 1. Either use ReFrame >= 4.3.3 or temporarily change the 'launcher' for each partition to srun +# 2. Either use ReFrame >= 4.3.3 or run from a clone of the ReFrame repository + +# Without this, the autodetect job fails because +# 1. A missing mpirun command +# 2. An incorrect directory structure is assumed when preparing the stagedir for the autodetect job + +# Related issues +# 1. https://github.com/reframe-hpc/reframe/issues/2926 +# 2. 
https://github.com/reframe-hpc/reframe/issues/2914 + +import os + +from eessi.testsuite.common_config import common_logging_config, common_general_config, common_eessi_init +from eessi.testsuite.constants import FEATURES, SCALES + +# This config will write all staging, output and logging to subdirs under this prefix +# Override with RFM_PREFIX environment variable +reframe_prefix = os.path.join(os.environ['HOME'], 'reframe_runs') + +# Magic Castle (Azure) site configuration +site_configuration = { + 'systems': [ + { + 'name': 'Magic_Castle_Azure', + 'descr': 'Magic Castle build and test environment on Azure', + 'modules_system': 'lmod', + 'hostnames': ['login*', '*-node'], + 'prefix': reframe_prefix, + 'partitions': [ + { + 'name': 'x86_64-amd-zen4-node1', + 'access': ['--partition=x86-64-amd-zen4-node1', '--export=NONE'], + 'descr': 'Zen4, 16 cores, 30 GB', + }, + { + 'name': 'aarch64-neoverse-N1-16c-32gb', + 'access': ['--partition=aarch64-neoverse-n1-node', '--export=NONE'], + 'descr': 'Neoverse N1, 16 cores, 32 GiB', + }, + ] + }, + ], + 'environments': [ + { + 'name': 'default', + 'cc': 'cc', + 'cxx': '', + 'ftn': '', + }, + ], + 'logging': common_logging_config(reframe_prefix), + 'general': [ + { + # Enable automatic detection of CPU architecture for each partition + # See https://reframe-hpc.readthedocs.io/en/stable/configure.html#auto-detecting-processor-information + 'remote_detect': True, + **common_general_config(reframe_prefix) + } + ], +} + +# Add default things to each partition: +partition_defaults = { + 'scheduler': 'slurm', + 'launcher': 'mpirun', + 'environs': ['default'], + 'features': [ + FEATURES['CPU'] + ] + list(SCALES.keys()), + 'prepare_cmds': [ + common_eessi_init(), + # Required when using srun as launcher with --export=NONE in partition access, in order to ensure job + # steps inherit environment. 
It doesn't hurt to define this even if srun is not used + 'export SLURM_EXPORT_ENV=ALL' + ], + 'extras': { + # Node types have strongly varying amounts of memory, but we'll make it easy on ourselves + # All should _at least_ have this amount (30 GB * 1E9 / (1024 * 1024) = 28610 MiB) + 'mem_per_node': 28610 + }, +} +for system in site_configuration['systems']: + for partition in system['partitions']: + partition.update(partition_defaults) From d5be59edf89bbc676172ffd4e78d8865137517ef Mon Sep 17 00:00:00 2001 From: Caspar van Leeuwen Date: Wed, 4 Sep 2024 15:27:16 +0200 Subject: [PATCH 3/5] See if rewriting the node pattern helps --- config/azure_mc.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/config/azure_mc.py b/config/azure_mc.py index 90e557f5..ce6f1806 100644 --- a/config/azure_mc.py +++ b/config/azure_mc.py @@ -26,7 +26,7 @@ 'name': 'Magic_Castle_Azure', 'descr': 'Magic Castle build and test environment on Azure', 'modules_system': 'lmod', - 'hostnames': ['login*', '*-node'], + 'hostnames': ['login.*', '.*-node'], 'prefix': reframe_prefix, 'partitions': [ { From 240265386c124fce5edbe708daa3505ca30f555e Mon Sep 17 00:00:00 2001 From: Caspar van Leeuwen Date: Wed, 4 Sep 2024 15:29:12 +0200 Subject: [PATCH 4/5] Fix partition name --- config/azure_mc.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/config/azure_mc.py b/config/azure_mc.py index ce6f1806..c54b32f1 100644 --- a/config/azure_mc.py +++ b/config/azure_mc.py @@ -30,7 +30,7 @@ 'prefix': reframe_prefix, 'partitions': [ { - 'name': 'x86_64-amd-zen4-node1', + 'name': 'x86_64-amd-zen4-node', 'access': ['--partition=x86-64-amd-zen4-node1', '--export=NONE'], 'descr': 'Zen4, 16 cores, 30 GB', }, From 2f3f67c1142870ad9b252212f7f8179733735463 Mon Sep 17 00:00:00 2001 From: Caspar van Leeuwen Date: Wed, 4 Sep 2024 15:30:30 +0200 Subject: [PATCH 5/5] Fix partition name in the right place, fix description for neoverse --- config/azure_mc.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git 
a/config/azure_mc.py b/config/azure_mc.py index c54b32f1..3afcbddb 100644 --- a/config/azure_mc.py +++ b/config/azure_mc.py @@ -31,13 +31,13 @@ 'partitions': [ { 'name': 'x86_64-amd-zen4-node', - 'access': ['--partition=x86-64-amd-zen4-node1', '--export=NONE'], + 'access': ['--partition=x86-64-amd-zen4-node', '--export=NONE'], 'descr': 'Zen4, 16 cores, 30 GB', }, { - 'name': 'aarch64-neoverse-N1-16c-32gb', + 'name': 'aarch64-neoverse-N1-16c-62gb', 'access': ['--partition=aarch64-neoverse-n1-node', '--export=NONE'], - 'descr': 'Neoverse N1, 16 cores, 32 GiB', + 'descr': 'Neoverse N1, 16 cores, 62 GiB', }, ] },