From 82e6812d5cf0c3126e344465e27264b6f7c78acb Mon Sep 17 00:00:00 2001 From: crivella Date: Tue, 19 Mar 2024 11:24:39 +0100 Subject: [PATCH 01/55] Added test for QE `pw.x` --- eessi/testsuite/tests/apps/QuantumESPRESSO.py | 93 +++++++++++++++++++ 1 file changed, 93 insertions(+) create mode 100644 eessi/testsuite/tests/apps/QuantumESPRESSO.py diff --git a/eessi/testsuite/tests/apps/QuantumESPRESSO.py b/eessi/testsuite/tests/apps/QuantumESPRESSO.py new file mode 100644 index 00000000..a7bb6dd4 --- /dev/null +++ b/eessi/testsuite/tests/apps/QuantumESPRESSO.py @@ -0,0 +1,93 @@ +""" +This module tests the binary 'pw.x' in available modules containing substring 'QuantumESPRESSO'. +Test input files are defined in the ReFrame test library, +see https://github.com/reframe-hpc/reframe/blob/develop/hpctestlib/sciapps/qespresso/benchmarks.py + +ReFrame terminology: + +"pipeline stages": +https://reframe-hpc.readthedocs.io/en/stable/regression_test_api.html#pipeline-hooks + +"test parameter": a list of values, which will generate different test variants. +https://reframe-hpc.readthedocs.io/en/stable/regression_test_api.html#reframe.core.builtins.parameter + +"test variant": a version of a test with a specific value for each test parameter +https://reframe-hpc.readthedocs.io/en/stable/regression_test_api.html#test-variants + +"concrete test cases": all test combinations that will actually run: +- test variants +- valid system:partition+programming environment combinations +https://reframe-hpc.readthedocs.io/en/stable/tutorial_deps.html#listing-dependencies + +Tests can be filtered by name, tag, programming environment, system, partition, or maintainer, +see https://reframe-hpc.readthedocs.io/en/stable/manpage.html#test-filtering + +Hooks acting on all possible test combinations (before filtering) are called after the 'init' stage. +Hooks acting on concrete test cases (after filtering) are called after the 'setup' stage. + +See also https://reframe-hpc.readthedocs.io/en/stable/pipeline.html +""" + +import reframe as rfm +from hpctestlib.sciapps.qespresso.benchmarks import QEspressoPWCheck +from reframe.core.builtins import ( # added only to make the linter happy + parameter, run_after) + +from eessi.testsuite import hooks +from eessi.testsuite.constants import SCALES, TAGS +from eessi.testsuite.utils import find_modules, log + + +@rfm.simple_test +class EESSI_QuantumESPRESSO_PW(QEspressoPWCheck): + scale = parameter(SCALES.keys()) + valid_prog_environs = ['default'] + valid_systems = ['*'] + time_limit = '30m' + module_name = parameter(find_modules('QuantumESPRESSO')) + + @run_after('init') + def run_after_init(self): + """Hooks to run after the init phase""" + + # Filter on which scales are supported by the partitions defined in the ReFrame configuration + hooks.filter_supported_scales(self) + + # Make sure that GPU tests run in partitions that support running on a GPU, + # and that CPU-only tests run in partitions that support running CPU-only. + # Also support setting valid_systems on the cmd line. + hooks.filter_valid_systems_by_device_type(self, required_device_type=self.nb_impl) + + # Support selecting modules on the cmd line. + hooks.set_modules(self) + + # Support selecting scales on the cmd line via tags. 
+ hooks.set_tag_scale(self) + + @run_after('init') + def set_tag_ci(self): + """Set tag CI on smallest benchmark, so it can be selected on the cmd line via --tag CI""" + min_ecut = min(QEspressoPWCheck.ecut.values) + min_nbnd = min(QEspressoPWCheck.nbnd.values) + if self.ecut == min_ecut and self.nbnd == min_nbnd: + self.tags.add(TAGS['CI']) + log(f'tags set to {self.tags}') + + @run_after('setup') + def run_after_setup(self): + """Hooks to run after the setup phase""" + + # Calculate default requested resources based on the scale: + # 1 task per CPU for CPU-only tests, 1 task per GPU for GPU tests. + # Also support setting the resources on the cmd line. + hooks.assign_tasks_per_compute_unit(test=self, compute_unit=self.nb_impl) + + @run_after('setup') + def set_omp_num_threads(self): + """ + Set number of OpenMP threads via OMP_NUM_THREADS. + Set default number of OpenMP threads equal to number of CPUs per task. + """ + + self.env_vars['OMP_NUM_THREADS'] = self.num_cpus_per_task + log(f'env_vars set to {self.env_vars}') From 67e61e44ead58f449898a22102681e8e69c2062f Mon Sep 17 00:00:00 2001 From: crivella Date: Wed, 3 Apr 2024 17:24:44 +0200 Subject: [PATCH 02/55] Implemented use of `DEVICE_TYPES` --- eessi/testsuite/tests/apps/QuantumESPRESSO.py | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/eessi/testsuite/tests/apps/QuantumESPRESSO.py b/eessi/testsuite/tests/apps/QuantumESPRESSO.py index a7bb6dd4..b5b3db25 100644 --- a/eessi/testsuite/tests/apps/QuantumESPRESSO.py +++ b/eessi/testsuite/tests/apps/QuantumESPRESSO.py @@ -34,7 +34,8 @@ parameter, run_after) from eessi.testsuite import hooks -from eessi.testsuite.constants import SCALES, TAGS +from eessi.testsuite.constants import (COMPUTE_UNIT, CPU, DEVICE_TYPES, GPU, + SCALES, TAGS) from eessi.testsuite.utils import find_modules, log @@ -45,6 +46,9 @@ class EESSI_QuantumESPRESSO_PW(QEspressoPWCheck): valid_systems = ['*'] time_limit = '30m' module_name = parameter(find_modules('QuantumESPRESSO')) + # For now, QE is being build for CPU targets only + # compute_device = parameter([DEVICE_TYPES[CPU], DEVICE_TYPES[GPU]]) + compute_device = parameter([DEVICE_TYPES[CPU], ]) @run_after('init') def run_after_init(self): @@ -56,7 +60,7 @@ def run_after_init(self): # Make sure that GPU tests run in partitions that support running on a GPU, # and that CPU-only tests run in partitions that support running CPU-only. # Also support setting valid_systems on the cmd line. - hooks.filter_valid_systems_by_device_type(self, required_device_type=self.nb_impl) + hooks.filter_valid_systems_by_device_type(self, required_device_type=self.compute_device) # Support selecting modules on the cmd line. hooks.set_modules(self) @@ -80,7 +84,10 @@ def run_after_setup(self): # Calculate default requested resources based on the scale: # 1 task per CPU for CPU-only tests, 1 task per GPU for GPU tests. # Also support setting the resources on the cmd line. - hooks.assign_tasks_per_compute_unit(test=self, compute_unit=self.nb_impl) + if self.compute_device == DEVICE_TYPES[GPU]: + hooks.assign_tasks_per_compute_unit(test=self, compute_unit=COMPUTE_UNIT[GPU]) + else: + hooks.assign_tasks_per_compute_unit(test=self, compute_unit=COMPUTE_UNIT[CPU]) @run_after('setup') def set_omp_num_threads(self): From 4d8e10fd4a3b130339e5e2374a4162114e29c637 Mon Sep 17 00:00:00 2001 From: Caspar van Leeuwen Date: Fri, 3 May 2024 18:30:59 +0200 Subject: [PATCH 03/55] Set process binding for GROMACS. 
It is single core, but if launched with mpirun it is currently free to migrate between cores within a numa domain. On Snellius I've seen some strange issues with occassionally very slow performance (10x slower than normal), potentially due to the OS thread schedulling being silly. Process binding leads to better _and_ more reproducible results --- eessi/testsuite/tests/apps/gromacs.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/eessi/testsuite/tests/apps/gromacs.py b/eessi/testsuite/tests/apps/gromacs.py index a3d9e625..c10da7c6 100644 --- a/eessi/testsuite/tests/apps/gromacs.py +++ b/eessi/testsuite/tests/apps/gromacs.py @@ -113,3 +113,11 @@ def set_omp_num_threads(self): self.env_vars['OMP_NUM_THREADS'] = omp_num_threads log(f'env_vars set to {self.env_vars}') + + @run_after('setup') + def set_binding_policy(self): + """ + Default process binding may depend on the launcher used. We've seen some variable performance. + Better set it explicitely to make sure process migration cannot cause such variations. + """ + hooks.set_compact_process_binding(self) From 241eabbce760570bdc01ad76f1ea60fe249807da Mon Sep 17 00:00:00 2001 From: Caspar van Leeuwen Date: Fri, 3 May 2024 18:46:18 +0200 Subject: [PATCH 04/55] Explicitely add num_cpus_per_core=1. I don't actually know if hyperthreading is enabled in the github CI environment, but it doesn't really matter for the dry-runs anyway. --- config/github_actions.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/config/github_actions.py b/config/github_actions.py index 5328f6f3..9371e376 100644 --- a/config/github_actions.py +++ b/config/github_actions.py @@ -18,7 +18,10 @@ 'launcher': 'local', 'environs': ['default'], 'features': [FEATURES[CPU]] + list(SCALES.keys()), - 'processor': {'num_cpus': 2}, + 'processor': { + 'num_cpus': 2, + 'num_cpus_per_core': 1, + }, 'resources': [ { 'name': 'memory', From 2ff34841b0bd4978ab2af2a0128c5784244cba7a Mon Sep 17 00:00:00 2001 From: Samuel Moors Date: Sun, 5 May 2024 09:58:46 +0200 Subject: [PATCH 05/55] rename 1_cpn_2_nodes and 1_cpn_4_nodes to ensure unique tag matching --- eessi/testsuite/constants.py | 4 ++-- eessi/testsuite/tests/apps/osu.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/eessi/testsuite/constants.py b/eessi/testsuite/constants.py index 9b7d6ac3..dbf97f54 100644 --- a/eessi/testsuite/constants.py +++ b/eessi/testsuite/constants.py @@ -51,8 +51,8 @@ '1_core': {'num_nodes': 1, 'num_cpus_per_node': 1, 'num_gpus_per_node': 1}, '2_cores': {'num_nodes': 1, 'num_cpus_per_node': 2, 'num_gpus_per_node': 1}, '4_cores': {'num_nodes': 1, 'num_cpus_per_node': 4, 'num_gpus_per_node': 1}, - '1_cpn_2_nodes': {'num_nodes': 2, 'num_cpus_per_node': 1, 'num_gpus_per_node': 1}, - '1_cpn_4_nodes': {'num_nodes': 4, 'num_cpus_per_node': 1, 'num_gpus_per_node': 1}, + '1cpn_2nodes': {'num_nodes': 2, 'num_cpus_per_node': 1, 'num_gpus_per_node': 1}, + '1cpn_4nodes': {'num_nodes': 4, 'num_cpus_per_node': 1, 'num_gpus_per_node': 1}, '1_8_node': {'num_nodes': 1, 'node_part': 8}, # 1/8 node '1_4_node': {'num_nodes': 1, 'node_part': 4}, # 1/4 node '1_2_node': {'num_nodes': 1, 'node_part': 2}, # 1/2 node diff --git a/eessi/testsuite/tests/apps/osu.py b/eessi/testsuite/tests/apps/osu.py index 3a6f9dbe..2fab94d7 100644 --- a/eessi/testsuite/tests/apps/osu.py +++ b/eessi/testsuite/tests/apps/osu.py @@ -114,7 +114,7 @@ def set_tag_ci(self): @run_after('init') def set_mem(self): """ Setting an extra job option of memory. 
This test has only 4 possibilities: 1_node, 2_nodes, 2_cores and - 1_cpn_2_nodes. This is implemented for all cases including full node cases. The requested memory may seem large + 1cpn_2nodes. This is implemented for all cases including full node cases. The requested memory may seem large and the test requires at least 4.5 GB per core at the minimum for the full test when run with validation (-c option for osu_bw or osu_latency). We run till message size 8 (-m 8) which significantly reduces memory requirement.""" From 458379e0e1f5178944d2ff76b29d63d16ab6113c Mon Sep 17 00:00:00 2001 From: Samuel Moors Date: Sun, 5 May 2024 11:57:24 +0200 Subject: [PATCH 06/55] add comments --- eessi/testsuite/constants.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/eessi/testsuite/constants.py b/eessi/testsuite/constants.py index dbf97f54..19ad4a3d 100644 --- a/eessi/testsuite/constants.py +++ b/eessi/testsuite/constants.py @@ -51,7 +51,9 @@ '1_core': {'num_nodes': 1, 'num_cpus_per_node': 1, 'num_gpus_per_node': 1}, '2_cores': {'num_nodes': 1, 'num_cpus_per_node': 2, 'num_gpus_per_node': 1}, '4_cores': {'num_nodes': 1, 'num_cpus_per_node': 4, 'num_gpus_per_node': 1}, + # renamed after v0.2.0 from 1_cpn_2_nodes to make more unique '1cpn_2nodes': {'num_nodes': 2, 'num_cpus_per_node': 1, 'num_gpus_per_node': 1}, + # renamed after v0.2.0 from 1_cpn_4_nodes to make more unique '1cpn_4nodes': {'num_nodes': 4, 'num_cpus_per_node': 1, 'num_gpus_per_node': 1}, '1_8_node': {'num_nodes': 1, 'node_part': 8}, # 1/8 node '1_4_node': {'num_nodes': 1, 'node_part': 4}, # 1/4 node From 393f270a27998a863ffb3ac5b5c57920f1faaded Mon Sep 17 00:00:00 2001 From: Caspar van Leeuwen Date: Tue, 7 May 2024 13:25:33 +0200 Subject: [PATCH 07/55] Fix indentation of rais in check_proc_attribute_defined --- eessi/testsuite/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/eessi/testsuite/utils.py b/eessi/testsuite/utils.py index 9357cc60..be9dec4d 100644 --- a/eessi/testsuite/utils.py +++ b/eessi/testsuite/utils.py @@ -148,4 +148,4 @@ def check_proc_attribute_defined(test: rfm.RegressionTest, attribute) -> bool: "The function utils.proc_attribute_defined should only be called after the setup() phase of ReFrame." "This is a programming error, please report this issue." ) - raise AttributeError(msg) + raise AttributeError(msg) From 829d6ff0d1ecdbabb2d35fbc32ae33e8cc50262a Mon Sep 17 00:00:00 2001 From: Caspar van Leeuwen Date: Tue, 7 May 2024 13:26:21 +0200 Subject: [PATCH 08/55] Set block distribution in sockets as well when srun is used as parallel launcher --- eessi/testsuite/hooks.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/eessi/testsuite/hooks.py b/eessi/testsuite/hooks.py index 0b73cd58..25abfa84 100644 --- a/eessi/testsuite/hooks.py +++ b/eessi/testsuite/hooks.py @@ -437,9 +437,11 @@ def set_compact_process_binding(test: rfm.RegressionTest): test.env_vars['I_MPI_PIN_CELL'] = 'core' # Don't bind to hyperthreads, only to physcial cores test.env_vars['I_MPI_PIN_DOMAIN'] = '%s:compact' % physical_cpus_per_task test.env_vars['OMPI_MCA_rmaps_base_mapping_policy'] = 'slot:PE=%s' % physical_cpus_per_task - # Default binding for SLURM. Only effective if the task/affinity plugin is enabled - # and when number of tasks times cpus per task equals either socket, core or thread count - test.env_vars['SLURM_CPU_BIND'] = 'verbose' + if test.current_partition.launcher_type().registered_name == 'srun': + # Set compact binding for SLURM. 
Only effective if the task/affinity plugin is enabled + # and when number of tasks times cpus per task equals either socket, core or thread count + test.env_vars['SLURM_DISTRIBUTION'] = 'block:block' + test.env_vars['SLURM_CPU_BIND'] = 'verbose' log(f'Set environment variable I_MPI_PIN_DOMAIN to {test.env_vars["I_MPI_PIN_DOMAIN"]}') log('Set environment variable OMPI_MCA_rmaps_base_mapping_policy to ' f'{test.env_vars["OMPI_MCA_rmaps_base_mapping_policy"]}') From 23705ac380509ef3b5c81a6ef4982fe9872c472a Mon Sep 17 00:00:00 2001 From: Caspar van Leeuwen Date: Tue, 7 May 2024 16:25:24 +0200 Subject: [PATCH 09/55] Make if-statements based on launcher, and warn if an unsupported launcher is used that binding might not be effective --- eessi/testsuite/hooks.py | 32 ++++++++++++++++++++++---------- 1 file changed, 22 insertions(+), 10 deletions(-) diff --git a/eessi/testsuite/hooks.py b/eessi/testsuite/hooks.py index 25abfa84..bbb527de 100644 --- a/eessi/testsuite/hooks.py +++ b/eessi/testsuite/hooks.py @@ -6,6 +6,7 @@ import warnings import reframe as rfm +import reframe.core.logging as rflog from eessi.testsuite.constants import * from eessi.testsuite.utils import (get_max_avail_gpus_per_node, is_cuda_required_module, log, @@ -432,20 +433,31 @@ def set_compact_process_binding(test: rfm.RegressionTest): num_cpus_per_core = test.current_partition.processor.num_cpus_per_core physical_cpus_per_task = int(test.num_cpus_per_task / num_cpus_per_core) - # Do binding for intel and OpenMPI's mpirun, and srun - # Other launchers may or may not do the correct binding - test.env_vars['I_MPI_PIN_CELL'] = 'core' # Don't bind to hyperthreads, only to physcial cores - test.env_vars['I_MPI_PIN_DOMAIN'] = '%s:compact' % physical_cpus_per_task - test.env_vars['OMPI_MCA_rmaps_base_mapping_policy'] = 'slot:PE=%s' % physical_cpus_per_task - if test.current_partition.launcher_type().registered_name == 'srun': + if test.current_partition.launcher_type().registered_name == 'mpirun': + # Do binding for intel and OpenMPI's mpirun, and srun + test.env_vars['I_MPI_PIN_CELL'] = 'core' # Don't bind to hyperthreads, only to physcial cores + test.env_vars['I_MPI_PIN_DOMAIN'] = '%s:compact' % physical_cpus_per_task + test.env_vars['OMPI_MCA_rmaps_base_mapping_policy'] = 'slot:PE=%s' % physical_cpus_per_task + log(f'Set environment variable I_MPI_PIN_CELL to {test.env_vars["I_MPI_PIN_CELL"]}') + log(f'Set environment variable I_MPI_PIN_DOMAIN to {test.env_vars["I_MPI_PIN_DOMAIN"]}') + log('Set environment variable OMPI_MCA_rmaps_base_mapping_policy to ' + f'{test.env_vars["OMPI_MCA_rmaps_base_mapping_policy"]}') + elif test.current_partition.launcher_type().registered_name == 'srun': # Set compact binding for SLURM. 
Only effective if the task/affinity plugin is enabled # and when number of tasks times cpus per task equals either socket, core or thread count test.env_vars['SLURM_DISTRIBUTION'] = 'block:block' test.env_vars['SLURM_CPU_BIND'] = 'verbose' - log(f'Set environment variable I_MPI_PIN_DOMAIN to {test.env_vars["I_MPI_PIN_DOMAIN"]}') - log('Set environment variable OMPI_MCA_rmaps_base_mapping_policy to ' - f'{test.env_vars["OMPI_MCA_rmaps_base_mapping_policy"]}') - log(f'Set environment variable SLURM_CPU_BIND to {test.env_vars["SLURM_CPU_BIND"]}') + log(f'Set environment variable SLURM_DISTRIBUTION to {test.env_vars["SLURM_DISTRIBUTION"]}') + log(f'Set environment variable SLURM_CPU_BIND to {test.env_vars["SLURM_CPU_BIND"]}') + else: + logger = rflog.getlogger() + msg = "hooks.set_compact_process_binding does not support the current launcher" + msg += f" ({test.current_partition.launcher_type().registered_name})." + msg += " The test will run, but using the default binding strategy of your parallel launcher." + msg += " This may lead to suboptimal performance." + msg += " Please expand the functionality of hooks.set_compact_process_binding for your parallel launcher." + # Warnings will, at default loglevel, be printed on stdout when executing the ReFrame command + logger.warning(msg) def set_compact_thread_binding(test: rfm.RegressionTest): From 34e78709d6dcc058599fbabe4967b2fccab70847 Mon Sep 17 00:00:00 2001 From: Caspar van Leeuwen Date: Tue, 7 May 2024 16:29:19 +0200 Subject: [PATCH 10/55] Make linter happy --- eessi/testsuite/hooks.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/eessi/testsuite/hooks.py b/eessi/testsuite/hooks.py index bbb527de..cfa968f0 100644 --- a/eessi/testsuite/hooks.py +++ b/eessi/testsuite/hooks.py @@ -441,7 +441,7 @@ def set_compact_process_binding(test: rfm.RegressionTest): log(f'Set environment variable I_MPI_PIN_CELL to {test.env_vars["I_MPI_PIN_CELL"]}') log(f'Set environment variable I_MPI_PIN_DOMAIN to {test.env_vars["I_MPI_PIN_DOMAIN"]}') log('Set environment variable OMPI_MCA_rmaps_base_mapping_policy to ' - f'{test.env_vars["OMPI_MCA_rmaps_base_mapping_policy"]}') + f'{test.env_vars["OMPI_MCA_rmaps_base_mapping_policy"]}') elif test.current_partition.launcher_type().registered_name == 'srun': # Set compact binding for SLURM. Only effective if the task/affinity plugin is enabled # and when number of tasks times cpus per task equals either socket, core or thread count From 9f4666787bf816f033bb57ea8485396842a4ec2b Mon Sep 17 00:00:00 2001 From: Caspar van Leeuwen Date: Tue, 7 May 2024 16:42:20 +0200 Subject: [PATCH 11/55] Make linter happy --- eessi/testsuite/hooks.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/eessi/testsuite/hooks.py b/eessi/testsuite/hooks.py index cfa968f0..01b84206 100644 --- a/eessi/testsuite/hooks.py +++ b/eessi/testsuite/hooks.py @@ -441,7 +441,7 @@ def set_compact_process_binding(test: rfm.RegressionTest): log(f'Set environment variable I_MPI_PIN_CELL to {test.env_vars["I_MPI_PIN_CELL"]}') log(f'Set environment variable I_MPI_PIN_DOMAIN to {test.env_vars["I_MPI_PIN_DOMAIN"]}') log('Set environment variable OMPI_MCA_rmaps_base_mapping_policy to ' - f'{test.env_vars["OMPI_MCA_rmaps_base_mapping_policy"]}') + f'{test.env_vars["OMPI_MCA_rmaps_base_mapping_policy"]}') elif test.current_partition.launcher_type().registered_name == 'srun': # Set compact binding for SLURM. 
Only effective if the task/affinity plugin is enabled # and when number of tasks times cpus per task equals either socket, core or thread count From c4d403892b69fba85bd9d6bc423482ee942df53e Mon Sep 17 00:00:00 2001 From: crivella Date: Tue, 7 May 2024 18:57:23 +0200 Subject: [PATCH 12/55] Increase time for largest test --- eessi/testsuite/tests/apps/QuantumESPRESSO.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/eessi/testsuite/tests/apps/QuantumESPRESSO.py b/eessi/testsuite/tests/apps/QuantumESPRESSO.py index b5b3db25..c8c7b96d 100644 --- a/eessi/testsuite/tests/apps/QuantumESPRESSO.py +++ b/eessi/testsuite/tests/apps/QuantumESPRESSO.py @@ -77,6 +77,14 @@ def set_tag_ci(self): self.tags.add(TAGS['CI']) log(f'tags set to {self.tags}') + @run_after('init') + def set_increased_walltime(self): + """Increase the amount of time for the largest benchmark, so it can complete successfully.""" + max_ecut = max(QEspressoPWCheck.ecut.values) + max_nbnd = max(QEspressoPWCheck.nbnd.values) + if self.ecut == max_ecut and self.nbnd == max_nbnd: + self.time_limit = '60m' + @run_after('setup') def run_after_setup(self): """Hooks to run after the setup phase""" From d1921b4b29dbd45206a5a697df5f160d0d465680 Mon Sep 17 00:00:00 2001 From: Samuel Moors Date: Tue, 7 May 2024 20:24:28 +0200 Subject: [PATCH 13/55] set SRUN_CPUS_PER_TASK --- eessi/testsuite/hooks.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/eessi/testsuite/hooks.py b/eessi/testsuite/hooks.py index 0b73cd58..686f8b24 100644 --- a/eessi/testsuite/hooks.py +++ b/eessi/testsuite/hooks.py @@ -116,6 +116,15 @@ def assign_tasks_per_compute_unit(test: rfm.RegressionTest, compute_unit: str, n _check_always_request_gpus(test) + if test.current_partition.launcher_type().registered_name == 'srun': + # Make sure srun inherits --cpus-per-task from the job environment for Slurm versions >= 22.05 < 23.11, + # ensuring the same task binding across all Slurm versions. + # https://bugs.schedmd.com/show_bug.cgi?id=13351 + # https://bugs.schedmd.com/show_bug.cgi?id=11275 + # https://bugs.schedmd.com/show_bug.cgi?id=15632#c43 + test.env_vars['SRUN_CPUS_PER_TASK'] = test.num_cpus_per_task + log(f'Set environment variable SRUN_CPUS_PER_TASK to {test.env_vars["SRUN_CPUS_PER_TASK"]}') + def _assign_num_tasks_per_node(test: rfm.RegressionTest, num_per: int = 1): """ From aa6cd57e323145795f28b2167cc598594180c218 Mon Sep 17 00:00:00 2001 From: Caspar van Leeuwen Date: Mon, 13 May 2024 16:50:06 +0200 Subject: [PATCH 14/55] Fix https://github.com/EESSI/software-layer/issues/456#issuecomment-2107755266 until we can more permanently fix it through an LMOD hook in host_injections --- config/it4i_karolina.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/config/it4i_karolina.py b/config/it4i_karolina.py index 90062c85..2207561e 100644 --- a/config/it4i_karolina.py +++ b/config/it4i_karolina.py @@ -44,6 +44,11 @@ # Avoid https://github.com/EESSI/software-layer/issues/136 # Can be taken out once we don't care about old OpenMPI versions anymore (pre-4.1.1) 'export OMPI_MCA_pml=ucx', + # Work around "Failed to modify UD QP to INIT on mlx5_0: Operation not permitted" issue + # until we can resolve this through an LMOD hook in host_injections. 
+ # See https://github.com/EESSI/software-layer/issues/456#issuecomment-2107755266 + 'export OMPI_MCA_mtl="^ofi"', + 'export OMPI_MCA_btl="^ofi"', ], 'launcher': 'mpirun', # Use --export=None to avoid that login environment is passed down to submitted jobs From 955ce034dc137beea485e906ffec0c5bba94a855 Mon Sep 17 00:00:00 2001 From: Caspar van Leeuwen Date: Mon, 13 May 2024 16:53:58 +0200 Subject: [PATCH 15/55] Make linter happy, explain better when this can be removed again --- config/it4i_karolina.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/config/it4i_karolina.py b/config/it4i_karolina.py index 2207561e..cdd6957e 100644 --- a/config/it4i_karolina.py +++ b/config/it4i_karolina.py @@ -46,8 +46,9 @@ 'export OMPI_MCA_pml=ucx', # Work around "Failed to modify UD QP to INIT on mlx5_0: Operation not permitted" issue # until we can resolve this through an LMOD hook in host_injections. + # (then these OMPI_MCA_btl & mtl can be removed again) # See https://github.com/EESSI/software-layer/issues/456#issuecomment-2107755266 - 'export OMPI_MCA_mtl="^ofi"', + 'export OMPI_MCA_mtl="^ofi"', 'export OMPI_MCA_btl="^ofi"', ], 'launcher': 'mpirun', From 625a6138fb049b5e56ee0fe82ef2b1978fa47aa0 Mon Sep 17 00:00:00 2001 From: Caspar van Leeuwen Date: Mon, 13 May 2024 18:05:03 +0200 Subject: [PATCH 16/55] Reduce the iteration count to make the OSU tests run faster, especially on slower interconnects --- eessi/testsuite/tests/apps/osu.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/eessi/testsuite/tests/apps/osu.py b/eessi/testsuite/tests/apps/osu.py index 2fab94d7..83bbd0f4 100644 --- a/eessi/testsuite/tests/apps/osu.py +++ b/eessi/testsuite/tests/apps/osu.py @@ -53,6 +53,11 @@ class EESSI_OSU_Micro_Benchmarks_pt2pt(osu_benchmark): # unset num_tasks_per_node from the hpctestlib. num_tasks_per_node = None + # Set num_warmup_iters to 5 to reduce execution time, especially on slower interconnects + num_warmup_iters = 5 + # Set num_iters to 10 to reduce execution time, especially on slower interconnects + num_iters = 10 + @run_after('init') def filter_scales_2gpus(self): """Filter out scales with < 2 GPUs if running on GPUs""" @@ -169,6 +174,11 @@ class EESSI_OSU_Micro_Benchmarks_coll(osu_benchmark): # Unset num_tasks_per_node from hpctestlib num_tasks_per_node = None + # Set num_warmup_iters to 5 to reduce execution time, especially on slower interconnects + num_warmup_iters = 5 + # Set num_iters to 10 to reduce execution time, especially on slower interconnects + num_iters = 10 + @run_after('init') def run_after_init(self): """hooks to run after init phase""" From e882e6ddeed5041059287402f3fa67107723c28e Mon Sep 17 00:00:00 2001 From: Satish Kamath Date: Tue, 14 May 2024 09:37:24 +0200 Subject: [PATCH 17/55] First commit ESPResSo test. Still WIP and needs more polishing. 
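For orientation, the problem sizes exercised by this new test follow directly from the lattice setup in the madelung.py script added below: the ion count is the product of a cubic lattice of edge 2*size, scaled per dimension by the MPI node grid in the weak-scaling case. A minimal illustration of that arithmetic (the helper below is only illustrative and does not appear in the patch):

import numpy as np

def n_particles(size, node_grid=(1, 1, 1), weak_scaling=False):
    # Cubic lattice of edge 2*size, as set up in madelung.py; for weak scaling the
    # lattice is multiplied per dimension by the MPI node grid.
    lattice_size = np.array(3 * [2 * size])
    if weak_scaling:
        lattice_size = np.multiply(lattice_size, node_grid)
    return int(np.prod(lattice_size))

print(n_particles(9))                                # 5832, the strong-scaling rows in benchmarks.csv
print(n_particles(6, (2, 2, 1), weak_scaling=True))  # 6912, the 4-core weak-scaling rows

The default sizes used in this patch (6 for weak scaling, 9 for strong scaling, see job.sh and espresso.py below) reproduce the particle counts listed in benchmarks.csv.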
--- .../tests/apps/espresso/benchmarks.csv | 27 ++++ .../testsuite/tests/apps/espresso/espresso.py | 97 +++++++++++++ eessi/testsuite/tests/apps/espresso/job.sh | 10 ++ .../testsuite/tests/apps/espresso/madelung.py | 132 ++++++++++++++++++ eessi/testsuite/tests/apps/espresso/plot.py | 39 ++++++ .../apps/espresso/scripts_Espresso.tar.gz | Bin 0 -> 3089 bytes 6 files changed, 305 insertions(+) create mode 100644 eessi/testsuite/tests/apps/espresso/benchmarks.csv create mode 100644 eessi/testsuite/tests/apps/espresso/espresso.py create mode 100644 eessi/testsuite/tests/apps/espresso/job.sh create mode 100644 eessi/testsuite/tests/apps/espresso/madelung.py create mode 100644 eessi/testsuite/tests/apps/espresso/plot.py create mode 100644 eessi/testsuite/tests/apps/espresso/scripts_Espresso.tar.gz diff --git a/eessi/testsuite/tests/apps/espresso/benchmarks.csv b/eessi/testsuite/tests/apps/espresso/benchmarks.csv new file mode 100644 index 00000000..95724751 --- /dev/null +++ b/eessi/testsuite/tests/apps/espresso/benchmarks.csv @@ -0,0 +1,27 @@ +"mode","cores","mpi.x","mpi.y","mpi.z","particles","mean","std" +"weak scaling",4,2,2,1,6912,2.341e-01,8.081e-03 +"strong scaling",4,2,2,1,5832,2.496e-01,9.019e-03 +"weak scaling",16,4,2,2,27648,2.417e+00,9.576e-02 +"strong scaling",16,4,2,2,5832,3.853e-02,1.991e-03 +"weak scaling",32,4,4,2,55296,4.263e+00,1.161e+00 +"strong scaling",32,4,4,2,5832,2.194e-02,7.303e-04 +"weak scaling",1,1,1,1,1728,7.655e-02,3.434e-03 +"weak scaling",2,2,1,1,3456,1.456e-01,4.679e-03 +"strong scaling",2,2,1,1,5832,3.936e-01,1.098e-02 +"strong scaling",1,1,1,1,5832,6.333e-01,1.194e-01 +"strong scaling",64,4,4,4,5832,1.910e-02,6.132e-04 +"weak scaling",1,1,1,1,1728,9.482e-02,2.956e-03 +"weak scaling",2,2,1,1,3456,2.111e-01,6.614e-03 +"strong scaling",1,1,1,1,5832,9.133e-01,2.868e-02 +"strong scaling",16,4,2,2,5832,4.285e-02,1.327e-03 +"strong scaling",64,4,4,4,5832,1.715e-02,5.776e-04 +"strong scaling",128,8,4,4,5832,1.980e-02,7.013e-04 +"weak scaling",64,4,4,4,110592,4.375e-01,1.414e-02 +"weak scaling",100,5,5,4,172800,4.450e-01,1.437e-02 +"weak scaling",128,8,4,4,221184,8.720e+00,2.753e-01 +"weak scaling",128,8,4,4,221184,8.760e+00,3.110e-01 +"weak scaling",4,2,2,1,6912,2.626e-01,8.142e-03 +"weak scaling",4,2,2,1,6912,2.780e-01,8.683e-03 +"weak scaling",4,2,2,1,6912,2.627e-01,8.391e-03 +"weak scaling",4,2,2,1,6912,2.617e-01,8.155e-03 +"weak scaling",2,2,1,1,3456,2.028e-01,6.255e-03 diff --git a/eessi/testsuite/tests/apps/espresso/espresso.py b/eessi/testsuite/tests/apps/espresso/espresso.py new file mode 100644 index 00000000..494abf67 --- /dev/null +++ b/eessi/testsuite/tests/apps/espresso/espresso.py @@ -0,0 +1,97 @@ +""" +This module tests Espresso in available modules containing substring 'ESPResSo' which is different from Quantum Espresso. +Tests included: +- P3M benchmark - Ionic crystals + - Weak scaling + - Strong scaling +Weak and strong scaling are options that are needed to be provided tothe script and the system is either scaled based on +number of cores or kept constant. 
+""" + +import reframe as rfm +from reframe.core.builtins import parameter, run_after # added only to make the linter happy +from reframe.utility import reframe + +from hpctestlib.microbenchmarks.mpi.osu import osu_benchmark + +from eessi.testsuite import hooks, utils +from eessi.testsuite.constants import * +from eessi.testsuite.utils import find_modules, log + +@rfm.simple_test +class EESSI_ESPRESSO_P3M_IONIC_CRYSTALS(rfm.RunOnlyRegressionTest): + '''''' + scale = parameter(SCALES.keys()) + valid_prog_environs = ['default'] + valid_systems = ['*'] + time_limit = '30m' + # Need to check if QuantumESPRESSO also gets listed. + module_name = parameter(find_modules('ESPResSo')) + # device type is parameterized for an impending CUDA ESPResSo module. + device_type = parameter([DEVICE_TYPES[CPU]]) + + executable = 'python3 madelung.py' + + default_strong_scaling_system_size = 9 + default_weak_scaling_system_size = 6 + + benchmark_info = parameter([ + ('mpi.ionic_crystals.p3m'), + ], fmt=lambda x: x[0], loggable=True) + + + @run_after('init') + def run_after_init(self): + """hooks to run after init phase""" + + # Filter on which scales are supported by the partitions defined in the ReFrame configuration + hooks.filter_supported_scales(self) + + hooks.filter_valid_systems_by_device_type(self, required_device_type=self.device_type) + + hooks.set_modules(self) + + # Set scales as tags + hooks.set_tag_scale(self) + + @run_after('init') + def set_tag_ci(self): + """ Setting tests under CI tag. """ + if (self.benchmark_info[0] in ['mpi.ionic_crystals.p3m']): + self.tags.add('CI') + log(f'tags set to {self.tags}') + + if (self.benchmark_info[0] == 'mpi.ionic_crystals.p3m'): + self.tags.add('ionic_crystals_p3m') + + + @run_after('init') + def set_mem(self): + """ Setting an extra job option of memory. """ + self.extra_resources = {'memory': {'size': '50GB'}} + + @run_after('init') + def set_executable_opts(self): + """Set executable opts based on device_type parameter""" + num_default = 0 # If this test already has executable opts, they must have come from the command line + hooks.check_custom_executable_opts(self, num_default=num_default) + if not self.has_custom_executable_opts: + # By default we run weak scaling since the strong scaling sizes need to change based on max node size and a + # corresponding min node size has to be chozen. + self.executable_opts += ['--size', self.default_weak_scaling_system_size, '--weak-scaling'] + utils.log(f'executable_opts set to {self.executable_opts}') + + @run_after('setup') + def set_num_tasks_per_node(self): + """ Setting number of tasks per node and cpus per task in this function. 
This function sets num_cpus_per_task + for 1 node and 2 node options where the request is for full nodes.""" + hooks.assign_tasks_per_compute_unit(self, COMPUTE_UNIT[CPU]) + + @sanity_function + def assert_sanity(self): + '''Check all sanity criteria''' + return sn.all([ + self.assert_completion(), + self.assert_convergence(), + ]) + diff --git a/eessi/testsuite/tests/apps/espresso/job.sh b/eessi/testsuite/tests/apps/espresso/job.sh new file mode 100644 index 00000000..17399c52 --- /dev/null +++ b/eessi/testsuite/tests/apps/espresso/job.sh @@ -0,0 +1,10 @@ +#!/bin/bash +#SBATCH --time=00:40:00 +#SBATCH --output %j.stdout +#SBATCH --error %j.stderr +module load spack/default gcc/12.3.0 cuda/12.3.0 openmpi/4.1.6 \ + fftw/3.3.10 boost/1.83.0 python/3.12.1 +source ../espresso-4.3/venv/bin/activate +srun --cpu-bind=cores python3 madelung.py --size 6 --weak-scaling +srun --cpu-bind=cores python3 madelung.py --size 9 --strong-scaling +deactivate diff --git a/eessi/testsuite/tests/apps/espresso/madelung.py b/eessi/testsuite/tests/apps/espresso/madelung.py new file mode 100644 index 00000000..4bfb1df1 --- /dev/null +++ b/eessi/testsuite/tests/apps/espresso/madelung.py @@ -0,0 +1,132 @@ +# +# Copyright (C) 2013-2024 The ESPResSo project +# +# This file is part of ESPResSo. +# +# ESPResSo is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# ESPResSo is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . 
+# + +import espressomd +import espressomd.version +import espressomd.electrostatics +import argparse +import pathlib +import time +import numpy as np + +parser = argparse.ArgumentParser(description="Benchmark P3M simulations.") +parser.add_argument("--size", metavar="S", action="store", + default=9, required=False, type=int, + help="Problem size, such that the number of particles N is " + "equal to (2*S)^2; with --weak-scaling this number N " + "is multiplied by the number of cores!") +parser.add_argument("--gpu", action=argparse.BooleanOptionalAction, + default=False, required=False, help="Use GPU implementation") +parser.add_argument("--topology", metavar=("X", "Y", "Z"), nargs=3, action="store", + default=None, required=False, type=int, help="Cartesian topology") +parser.add_argument("--output", metavar="FILEPATH", action="store", + type=str, required=False, default="benchmarks.csv", + help="Output file (default: benchmarks.csv)") +group = parser.add_mutually_exclusive_group() +group.add_argument("--weak-scaling", action="store_true", + help="Weak scaling benchmark (Gustafson's law: constant work per core)") +group.add_argument("--strong-scaling", action="store_true", + help="Strong scaling benchmark (Amdahl's law: constant total work)") +args = parser.parse_args() + +def get_reference_values_per_ion(base_vector): + madelung_constant = -1.74756459463318219 + base_tensor = base_vector * np.eye(3) + ref_energy = madelung_constant + ref_pressure = madelung_constant * base_tensor / np.trace(base_tensor) + return ref_energy, ref_pressure + +def get_normalized_values_per_ion(system): + energy = system.analysis.energy()["coulomb"] + p_scalar = system.analysis.pressure()["coulomb"] + p_tensor = system.analysis.pressure_tensor()["coulomb"] + N = len(system.part) + V = system.volume() + return 2. * energy / N, 2. * p_scalar * V / N, 2. * p_tensor * V / N + +# initialize system +system = espressomd.System(box_l=[100., 100., 100.]) +system.time_step = 0.01 +system.cell_system.skin = 0.4 + +# set MPI Cartesian topology +node_grid = system.cell_system.node_grid.copy() +n_cores = int(np.prod(node_grid)) +if args.topology: + system.cell_system.node_grid = node_grid = args.topology + +# place ions on a cubic lattice +base_vector = np.array([1., 1., 1.]) +lattice_size = 3 * [2 * args.size] +if args.weak_scaling: + lattice_size = np.multiply(lattice_size, node_grid) +system.box_l = np.multiply(lattice_size, base_vector) +for j in range(lattice_size[0]): + for k in range(lattice_size[1]): + for l in range(lattice_size[2]): + _ = system.part.add(pos=np.multiply([j, k, l], base_vector), + q=(-1.)**(j + k + l), fix=3 * [True]) + +# setup P3M algorithm +algorithm = espressomd.electrostatics.P3M +if args.gpu: + algorithm = espressomd.electrostatics.P3MGPU +solver = algorithm(prefactor=1., accuracy=1e-6) +if (espressomd.version.major(), espressomd.version.minor()) == (4, 2): + system.actors.add(solver) +else: + system.electrostatics.solver = solver + +# run checks +forces = np.copy(system.part.all().f) +energy, p_scalar, p_tensor = get_normalized_values_per_ion(system) +ref_energy, ref_pressure = get_reference_values_per_ion(base_vector) +np.testing.assert_allclose(energy, ref_energy, atol=1e-12, rtol=5e-6) +np.testing.assert_allclose(p_scalar, np.trace(ref_pressure) / 3., + atol=1e-12, rtol=2e-5) +np.testing.assert_allclose(p_tensor, ref_pressure, atol=1e-12, rtol=2e-5) +np.testing.assert_allclose(forces, 0., atol=1e-5, rtol=0.) +np.testing.assert_allclose(np.median(np.abs(forces)), 0., atol=2e-6, rtol=0.) 
+ + +print("Executing sanity ...\n") +print (np.all([np.allclose(energy, ref_energy, atol=1e-12, rtol=5e-6), + np.allclose(p_scalar, np.trace(ref_pressure) / 3., + atol=1e-12, rtol=2e-5), + np.allclose(p_tensor, ref_pressure, atol=1e-12, rtol=2e-5), + np.allclose(forces, 0., atol=1e-5, rtol=0.), + np.allclose(np.median(np.abs(forces)), 0., atol=2e-6, rtol=0.)])) + +print("Sanity checking ...\n") +# sample runtime +n_steps = 10 +timings = [] +for _ in range(10): + tick = time.time() + system.integrator.run(n_steps) + tock = time.time() + timings.append((tock - tick) / n_steps) + +# write results to file +header = '"mode","cores","mpi.x","mpi.y","mpi.z","particles","mean","std"\n' +report = f'"{"weak scaling" if args.weak_scaling else "strong scaling"}",{n_cores},{node_grid[0]},{node_grid[1]},{node_grid[2]},{len(system.part)},{np.mean(timings):.3e},{np.std(timings, ddof=1):.3e}\n' +if pathlib.Path(args.output).is_file(): + header = "" +with open(args.output, "a") as f: + f.write(header + report) diff --git a/eessi/testsuite/tests/apps/espresso/plot.py b/eessi/testsuite/tests/apps/espresso/plot.py new file mode 100644 index 00000000..c9a023c4 --- /dev/null +++ b/eessi/testsuite/tests/apps/espresso/plot.py @@ -0,0 +1,39 @@ +import numpy as np +import pandas as pd +import matplotlib.pyplot as plt +import matplotlib.ticker as mtick + +df = pd.read_csv("benchmarks.csv") +df = df.sort_values(by=["mode", "cores", "mpi.x", "mpi.y", "mpi.z"]) + +group = df.query(f"mode == 'strong scaling'") + +fig = plt.figure(figsize=(12, 6)) +ax = fig.subplots().axes +xdata = group["cores"].to_numpy() +ydata = group["mean"].to_numpy() +ax.axline((xdata[0], xdata[0]), slope=1, linestyle="--", color="grey", label="Theoretical maximum") +ax.plot(xdata, ydata[0] / ydata, "o-", label="Measurements") +ax.set_title("Strong scaling") +ax.set_xlabel("Number of cores") +ax.set_ylabel("Speed-up") +ax.set_xscale("log", base=2) +ax.set_yscale("log", base=10) +ax.legend() +plt.show() + +group = df.query(f"mode == 'weak scaling'") + +fig = plt.figure(figsize=(12, 6)) +ax = fig.subplots().axes +xdata = group["cores"].to_numpy() +ydata = group["mean"].to_numpy() +ax.axline((-np.inf, 1), slope=0, linestyle="--", color="grey", label="Theoretical maximum") +ax.plot(xdata, ydata[0] / ydata, "o-", label="Measurements") +ax.set_title("Weak scaling") +ax.set_xlabel("Number of cores") +ax.set_ylabel("Efficiency") +ax.set_xscale("log", base=2) +ax.yaxis.set_major_formatter(mtick.PercentFormatter(1)) +ax.legend() +plt.show() diff --git a/eessi/testsuite/tests/apps/espresso/scripts_Espresso.tar.gz b/eessi/testsuite/tests/apps/espresso/scripts_Espresso.tar.gz new file mode 100644 index 0000000000000000000000000000000000000000..24e2621fec80e082c1830617209e16fa63c8df4e GIT binary patch literal 3089 zcmV+s4DRzEiwFP!000001MM1XZ`(MspYRr0(9qv~LhNs*Q44Y*X-3GCt9eGlNARK` z{T}LkZ+NObC9il|WYK&fiSf!LuI+ez*LHn!zF_3_*~uvrXPl%NzhZ&Zkl=g~2{Maf zM&O##OcFk8FfE1B&>?2V7!iCXZ)wK%Nx^dx(1c_xjD*ahX)b}0Bs2*JJR?gUMzaD~ zz)+rqEF%*1kStpY6oU_Mjz5w&EMXaq$w@wqqkz1N0+t9yC@6#@V!=W(MK(xqh#H(# z8jwQ{e5s6h(jzQ__ZhiinSf{F)gmf>J;F2KVNi)`XN0FnW`eE-iK$#sZq9l&xsXJO zV!Gfd%wz$~U~acj9Fr*{xnQ$A?g2I6k{^%G-+uUbP7aPgksl9EPY;gIKm8qW<$}Xo zcEd_e(K3xAU<~ugXd(+x8yLKQefsJxP#nBGdUtgG33MliN9V_{&(6rh_82K0~Eb*4N6#tWIGQ7p>&egant&@2wgg5EH2X~3cz z(2J4)g6hlu0v`nkTu%v-uz{ zd5FP3Q5y|x!XXryNHZn`Uxu3_R^_cNZ&(b`opHzpG73bEK(jf-H|SYCr&KQDXj(tY zXxY3<@+Hg|;x9=xO%Y{e+%T{XvU$E_i9AtnjF5>Si&Qj?^_MIO7E794lN0Yf5z#V_ zRd@XSQLGj?3JXn+gjcx`DOkvXCbj%4+5HmrJ 
zaVh9q87k002Z##EiQzs!Gym(B!A|=9TSl+@BA{^u;h`d<;(z=oKfnX?$*5c()8d|H z0a!--@UU6uX}&UKvb#AIvE3HrK^&+f5+&sXiRQ-cerz}=R#bBSTGcaK`^Ni;y ztk>48mbrw`jEf1o3*uZrUYjT^qr%>pdOKq8Pj(`+{DB#liv9@1qz?B+KVrPW24yCA z@=QR*x!s2dPGBT~0>a^a3So^g-OOP#Q9xEp=D%C7Gey2lvx8+w7x9K#l1o_7s2u8q zLAf%kx}bIfjHrP-$ehVZ#%3%7zHD+s21ON% z9E#u;B;eVWH$Vw8qX9Eo!?iS-XGu$FZ;kuPFbU6=VD_I`c;7Oj5Rxq`qcobAqZWk} zRftGfdchYPNNv*^-qQ;aD5mgwQw8uacvs!Ho16i8lUn_1m zJcgJx)=cDDVAg7S19IFe2O9OC1ATkAQm`7mb&1(xgDKJYiQMC)OAu}5gUC=0nMuTRWS~SCULs+$lGCCgP4b0m2Yt*QI8bbgR z>_iAO$I#T}Q|R5GIf3fVwDyV`W4vQCS{N4&8czKK(^be5Y#0E`1Mgh8@S|km;ANvX z=Fdr$XiDAh5rc@TwiibGRc|$m##hRjkJ7ZWi-!H}753@SPS4pI=7oJ(Ei^>BegMg- zk!r;F0YtZrh%i|N04@wn;6}>DxUKocRgYZvNPM}j_V&uxhR&T)*9M1vrQTw07B)*q@sH#VDkYe}y#`Si9l{MIr`@SRDzhR@Q`k?OBgT^M*{9^>@A?voz(pUAAcdMNXwNdju;&O}$eQ+t#C`xQJN@i68Ii zR8+i7v&9YM4_n-DF;d(ZU_7ZEW}=D?Mw4S}Fam<`;o?%It;s6UICd4*5TVzAfz+yM zRb{mbLMR{%_SoQh3q%+d`w~U+t(28ii>4_{Lc>sKeI*B-*RTNPZ(#-FQGkHtC2(hl z&9}B-G*rvwnZAT%t@m_Q&fvOCBkQiZ7S;7=^6J4zJvih%j@0Y5o{t9<>34wgS_UNoXy7+j%nL!NnH2Ar)l(}zt ztQ-cd>rvRmgnTx3%C|U`XjUI8SSN5blpP)ubH@1i`LFFbzIX5Z*Y#Y#JOBM2@P95wZGG|e z{GJPcPH#AZ?CM&c51G<-dV7|=hj*TKe+@PiJ9{2d_@kks7(shIDk;__IKzs>-5vUS zNa5_V7q$%)JG)5hZpvtAln~F_+wl;w=UAhWvrPzK`HEy`#~lH&{_0U;{1)7)PK9~9$*c3b`-v6`JTT`snSkP&+~VNpa}dbOZnDtceLHM zHAOY;kylbUmOa{gz*5zpq8M79=T#KtY@AJsq2DY26bV?=vDGw&mgBi<+JAJ2ckl?L zF_+)++aJOg^1thO8{hwI*Xi>A_kb-QFPcvsNeBb@L9O|cvZN9N2Ah?%z&rIimRsIq zX9yh!ykFw2riC*+2MnzYT9FC#8pi!&T(d+8XRx(`7P{Q87}H{WQ57jtS1D4LDN@%d zQWYxP%hpG^!02<%vci}tdffRvThCt4K&Cc}=BQXKEx5yzFSy`FeQe+cVQ88fy~9p3 znp+~D;`9V;-sl|@+FeK`#f=t9yr{Ihv?QM>dqV46tYfIkxrRXRfEDP%3`22)=CeoY zJJ=+M7<#<116zN<5weKcSnu~?!U2zAbF9xZhSQJfl*MEHe8E5mus4mt7kAMzU+PK* zs$B}`k)q-bZF^CU=)Aw;@t#rKC*kq2D2WB^BV{C^550F9yn217hzb2+{bdxR*N z^@!6r-2SsTy!~0?cewlYY!(F(tn=b49A3Z#ghEj#Ig=S4en`nOL!}Vwgu&K1k%!H& zV?G!Pe{^2`tH2eXg2}$Ej4#xGd+6M+|2DMpUH$(a@brhlG)e}r3K!bbvzG_wuilb= z9}j-Uw!QD$`>@qnjhCkk3G$bOW!S$f}{7U9Q zFmPaVXxSvlLt5YCVFpz3fp0n1ko>fE7&x2B+kppYj!mYV3psGCJtRsCx!?)B0w#_o zcn&FoSk|CE0`B{kH@IQRjhZHY`n;i%X#$UG`~4uz`!F7k%TFsMGq3r$2Tcws|EC2R z-uoX3zn#N~-2WD Date: Tue, 14 May 2024 22:15:57 +0200 Subject: [PATCH 18/55] Moved some files and added __init__.py to mark the directories and sub-dirs as part of the package. Sanity for the weak scaling is already added. Next step is to extract the performance timing and log it. 
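One possible way to pick up the "extract the performance timing" follow-up mentioned above, purely as a sketch and not part of this patch series, is to parse the report line that madelung.py prints (the same line it appends to benchmarks.csv). Inside the ReFrame test this would naturally live in a @performance_function built on sn.extractsingle; the standalone version below uses plain re so it can be run on its own:

import re

# Report line format produced by madelung.py; this sample is taken from benchmarks.csv:
# "mode",cores,mpi.x,mpi.y,mpi.z,particles,mean,std
sample = '"weak scaling",4,2,2,1,6912,2.341e-01,8.081e-03'

patt = r'^"(?:weak|strong) scaling",(?:\d+,){5}(?P<mean>\S+),(?P<std>\S+)$'
match = re.match(patt, sample)
mean_time_per_step = float(match.group('mean'))  # seconds per MD integration step
std_time_per_step = float(match.group('std'))
print(mean_time_per_step, std_time_per_step)     # 0.2341 0.008081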
--- .../testsuite/tests/apps/espresso/__init__.py | 0 .../tests/apps/espresso/benchmarks.csv | 7 +++ .../testsuite/tests/apps/espresso/espresso.py | 19 +++++++- .../tests/apps/espresso/src/__init__.py | 0 .../tests/apps/espresso/{ => src}/job.sh | 0 .../tests/apps/espresso/{ => src}/madelung.py | 44 +++++++++++++----- .../tests/apps/espresso/{ => src}/plot.py | 0 .../{ => src}/scripts_Espresso.tar.gz | Bin 8 files changed, 56 insertions(+), 14 deletions(-) create mode 100644 eessi/testsuite/tests/apps/espresso/__init__.py create mode 100644 eessi/testsuite/tests/apps/espresso/src/__init__.py rename eessi/testsuite/tests/apps/espresso/{ => src}/job.sh (100%) rename eessi/testsuite/tests/apps/espresso/{ => src}/madelung.py (76%) rename eessi/testsuite/tests/apps/espresso/{ => src}/plot.py (100%) rename eessi/testsuite/tests/apps/espresso/{ => src}/scripts_Espresso.tar.gz (100%) diff --git a/eessi/testsuite/tests/apps/espresso/__init__.py b/eessi/testsuite/tests/apps/espresso/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/eessi/testsuite/tests/apps/espresso/benchmarks.csv b/eessi/testsuite/tests/apps/espresso/benchmarks.csv index 95724751..9091534b 100644 --- a/eessi/testsuite/tests/apps/espresso/benchmarks.csv +++ b/eessi/testsuite/tests/apps/espresso/benchmarks.csv @@ -25,3 +25,10 @@ "weak scaling",4,2,2,1,6912,2.627e-01,8.391e-03 "weak scaling",4,2,2,1,6912,2.617e-01,8.155e-03 "weak scaling",2,2,1,1,3456,2.028e-01,6.255e-03 +"weak scaling",2,2,1,1,3456,3.247e-01,1.026e-02 +"weak scaling",2,2,1,1,3456,3.249e-01,1.029e-02 +"weak scaling",2,2,1,1,3456,3.257e-01,1.028e-02 +"weak scaling",2,2,1,1,3456,3.375e-01,1.095e-02 +"weak scaling",2,2,1,1,3456,3.367e-01,1.086e-02 +"weak scaling",2,2,1,1,3456,3.241e-01,1.048e-02 +"weak scaling",2,2,1,1,3456,3.243e-01,1.038e-02 diff --git a/eessi/testsuite/tests/apps/espresso/espresso.py b/eessi/testsuite/tests/apps/espresso/espresso.py index 494abf67..37f81344 100644 --- a/eessi/testsuite/tests/apps/espresso/espresso.py +++ b/eessi/testsuite/tests/apps/espresso/espresso.py @@ -9,6 +9,8 @@ """ import reframe as rfm +import reframe.utility.sanity as sn + from reframe.core.builtins import parameter, run_after # added only to make the linter happy from reframe.utility import reframe @@ -36,7 +38,7 @@ class EESSI_ESPRESSO_P3M_IONIC_CRYSTALS(rfm.RunOnlyRegressionTest): default_weak_scaling_system_size = 6 benchmark_info = parameter([ - ('mpi.ionic_crystals.p3m'), + ('mpi.ionic_crystals.p3m', 'p3m'), ], fmt=lambda x: x[0], loggable=True) @@ -78,7 +80,7 @@ def set_executable_opts(self): if not self.has_custom_executable_opts: # By default we run weak scaling since the strong scaling sizes need to change based on max node size and a # corresponding min node size has to be chozen. 
- self.executable_opts += ['--size', self.default_weak_scaling_system_size, '--weak-scaling'] + self.executable_opts += ['--size', str(self.default_weak_scaling_system_size), '--weak-scaling'] utils.log(f'executable_opts set to {self.executable_opts}') @run_after('setup') @@ -87,6 +89,19 @@ def set_num_tasks_per_node(self): for 1 node and 2 node options where the request is for full nodes.""" hooks.assign_tasks_per_compute_unit(self, COMPUTE_UNIT[CPU]) + @deferrable + def assert_completion(self): + '''Check completion''' + cao = sn.extractsingle(r'^resulting parameters:.*cao: (?P\S+),', self.stdout, 'cao', int) + return (sn.assert_found(r'^Algorithm executed.', self.stdout) and cao) + + @deferrable + def assert_convergence(self): + '''Check convergence''' + check_string = sn.assert_found(r'Final convergence met with tolerances:', self.stdout) + energy = sn.extractsingle(r'^\s+energy:\s+(?P\S+)', self.stdout, 'energy', float) + return (check_string and (energy != 0.0)) + @sanity_function def assert_sanity(self): '''Check all sanity criteria''' diff --git a/eessi/testsuite/tests/apps/espresso/src/__init__.py b/eessi/testsuite/tests/apps/espresso/src/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/eessi/testsuite/tests/apps/espresso/job.sh b/eessi/testsuite/tests/apps/espresso/src/job.sh similarity index 100% rename from eessi/testsuite/tests/apps/espresso/job.sh rename to eessi/testsuite/tests/apps/espresso/src/job.sh diff --git a/eessi/testsuite/tests/apps/espresso/madelung.py b/eessi/testsuite/tests/apps/espresso/src/madelung.py similarity index 76% rename from eessi/testsuite/tests/apps/espresso/madelung.py rename to eessi/testsuite/tests/apps/espresso/src/madelung.py index 4bfb1df1..628d8eab 100644 --- a/eessi/testsuite/tests/apps/espresso/madelung.py +++ b/eessi/testsuite/tests/apps/espresso/src/madelung.py @@ -93,27 +93,46 @@ def get_normalized_values_per_ion(system): else: system.electrostatics.solver = solver + +print("Algorithm executed. \n") + +atol_energy = atol_pressure = 1e-12 +atol_forces = 1e-5 +atol_abs_forces = 2e-6 + +rtol_energy = 5e-6 +rtol_pressure = 2e-5 +rtol_forces = 0. +rtol_abs_forces = 0. # run checks forces = np.copy(system.part.all().f) energy, p_scalar, p_tensor = get_normalized_values_per_ion(system) ref_energy, ref_pressure = get_reference_values_per_ion(base_vector) -np.testing.assert_allclose(energy, ref_energy, atol=1e-12, rtol=5e-6) +np.testing.assert_allclose(energy, ref_energy, atol=atol_energy, rtol=rtol_energy) np.testing.assert_allclose(p_scalar, np.trace(ref_pressure) / 3., - atol=1e-12, rtol=2e-5) -np.testing.assert_allclose(p_tensor, ref_pressure, atol=1e-12, rtol=2e-5) -np.testing.assert_allclose(forces, 0., atol=1e-5, rtol=0.) -np.testing.assert_allclose(np.median(np.abs(forces)), 0., atol=2e-6, rtol=0.) 
+ atol=atol_pressure, rtol=rtol_pressure) +np.testing.assert_allclose(p_tensor, ref_pressure, atol=atol_pressure, rtol=rtol_pressure) +np.testing.assert_allclose(forces, 0., atol=atol_forces, rtol=rtol_forces) +np.testing.assert_allclose(np.median(np.abs(forces)), 0., atol=atol_abs_forces, rtol=rtol_abs_forces) -print("Executing sanity ...\n") -print (np.all([np.allclose(energy, ref_energy, atol=1e-12, rtol=5e-6), +print("Executing sanity checks...\n") +if (np.all([np.allclose(energy, ref_energy, atol=atol_energy, rtol=rtol_energy), np.allclose(p_scalar, np.trace(ref_pressure) / 3., - atol=1e-12, rtol=2e-5), - np.allclose(p_tensor, ref_pressure, atol=1e-12, rtol=2e-5), - np.allclose(forces, 0., atol=1e-5, rtol=0.), - np.allclose(np.median(np.abs(forces)), 0., atol=2e-6, rtol=0.)])) + atol=atol_pressure, rtol=rtol_pressure), + np.allclose(p_tensor, ref_pressure, atol=atol_pressure, rtol=rtol_pressure), + np.allclose(forces, 0., atol=atol_forces, rtol=rtol_forces), + np.allclose(np.median(np.abs(forces)), 0., atol=atol_abs_forces, rtol=rtol_abs_forces)])): + print("Final convergence met with tolerances: \n\ + energy: ", atol_energy, "\n\ + p_scalar: ", atol_pressure, "\n\ + p_tensor: ", atol_pressure, "\n\ + forces: ", atol_forces, "\n\ + abs_forces: ", atol_abs_forces, "\n") +else: + print("At least one parameter did not meet the tolerance, see the log above.\n") -print("Sanity checking ...\n") +print("Sampling runtime...\n") # sample runtime n_steps = 10 timings = [] @@ -126,6 +145,7 @@ def get_normalized_values_per_ion(system): # write results to file header = '"mode","cores","mpi.x","mpi.y","mpi.z","particles","mean","std"\n' report = f'"{"weak scaling" if args.weak_scaling else "strong scaling"}",{n_cores},{node_grid[0]},{node_grid[1]},{node_grid[2]},{len(system.part)},{np.mean(timings):.3e},{np.std(timings, ddof=1):.3e}\n' +print(report) if pathlib.Path(args.output).is_file(): header = "" with open(args.output, "a") as f: diff --git a/eessi/testsuite/tests/apps/espresso/plot.py b/eessi/testsuite/tests/apps/espresso/src/plot.py similarity index 100% rename from eessi/testsuite/tests/apps/espresso/plot.py rename to eessi/testsuite/tests/apps/espresso/src/plot.py diff --git a/eessi/testsuite/tests/apps/espresso/scripts_Espresso.tar.gz b/eessi/testsuite/tests/apps/espresso/src/scripts_Espresso.tar.gz similarity index 100% rename from eessi/testsuite/tests/apps/espresso/scripts_Espresso.tar.gz rename to eessi/testsuite/tests/apps/espresso/src/scripts_Espresso.tar.gz From 6f2c2db13341c6763fea1e0e106c72e410b15c85 Mon Sep 17 00:00:00 2001 From: Caspar van Leeuwen Date: Thu, 16 May 2024 11:53:19 +0200 Subject: [PATCH 19/55] Add available memory for one of the Snellius partitions, for testing --- config/surf_snellius.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/config/surf_snellius.py b/config/surf_snellius.py index 9e4ee269..542f3ee2 100644 --- a/config/surf_snellius.py +++ b/config/surf_snellius.py @@ -72,6 +72,9 @@ 'features': [ FEATURES[CPU], ] + list(SCALES.keys()), + 'extras': { + 'mem_per_node': 336 + }, 'descr': 'AMD Genoa CPU partition with native EESSI stack' }, From d1c7f74656c9e5f4a340fb5a854f3b2aa9f5672b Mon Sep 17 00:00:00 2001 From: Caspar van Leeuwen Date: Thu, 16 May 2024 11:53:53 +0200 Subject: [PATCH 20/55] Implement hook for requesting memory from the scheduler --- eessi/testsuite/hooks.py | 75 ++++++++++++++++++- eessi/testsuite/tests/apps/QuantumESPRESSO.py | 5 ++ eessi/testsuite/utils.py | 38 +++++++++- 3 files changed, 115 insertions(+), 3 deletions(-) diff 
--git a/eessi/testsuite/hooks.py b/eessi/testsuite/hooks.py index c06ff572..dd7ea2bb 100644 --- a/eessi/testsuite/hooks.py +++ b/eessi/testsuite/hooks.py @@ -9,7 +9,7 @@ from eessi.testsuite.constants import * from eessi.testsuite.utils import (get_max_avail_gpus_per_node, is_cuda_required_module, log, - check_proc_attribute_defined) + check_proc_attribute_defined, check_extras_key_defined) def _assign_default_num_cpus_per_node(test: rfm.RegressionTest): @@ -373,6 +373,79 @@ def filter_valid_systems_by_device_type(test: rfm.RegressionTest, required_devic log(f'valid_systems set to {test.valid_systems}') +## TODO: function should take everything in MB, as schedulers (at least slurm) does not except asking for fractional memory +## ie --mem=7.0G is invalid. This should be done as --mem=7168M. +## It's probably better if this function does everything in MB, and the ReFrame config also specifies available mem per node in MB. +## Then, we should make sure the numbers are integers by rounding up for app_mem_req (1 MB more should never really be an issue) +## and probably down for the default_mem (as to not ask for more than the equivalent share of a core) +def req_memory_per_node(test: rfm.RegressionTest, app_mem_req): + """ + This hook will request a specific amount of memory per node to the batch scheduler. + First, it computes which fraction of CPUs is requested from a node, and how much the corresponding (proportional) + amount of memory would be. + Then, the hook compares this to how much memory the application claims to need per node (app_mem_req). + It then passes the maximum of these two numbers to the batch scheduler as a memory request. + + Note: using this hook requires that the ReFrame configuration defines system.partition.extras['mem_per_node'] + + Arguments: + - test: the ReFrame test to which this hook should apply + - app_mem_req: the amount of memory this application needs (per node) in gigabytes + + Example 1: + - A system with 128 cores per node, 64 GB mem per node is used. + - The test is launched on 64 cores + - The app_mem_req is 40 (GB) + In this case, the test requests 50% of the CPUs. Thus, the proportional amount of memory is 32 GB. + The app_mem_req is higher. Thus, 40GB (per node) is requested from the batch scheduler. + + Example 2: + - A system with 128 cores per node, 128 GB mem per node is used. + - The test is launched on 64 cores + - the app_mem_req is 40 (GB) + In this case, the test requests 50% of the CPUs. Thus, the proportional amount of memory is 64 GB. + This is higher than the app_mem_req. Thus, 64 GB (per node) is requested from the batch scheduler. 
+ """ + # Check that the systems.partitions.extra dict in the ReFrame config contains mem_per_node + check_extras_key_defined(test, 'mem_per_node') + + # Skip if the current partition doesn't have sufficient memory to run the application + msg = f"Skipping test: nodes in this partition only have {test.current_partition.extras['mem_per_node']} GB" + msg += " memory available (per node) accodring to the current ReFrame configuration," + msg += f" but {app_mem_req} GB is needed" + test.skip_if(test.current_partition.extras['mem_per_node'] < app_mem_req, msg) + + # Compute what is higher: the requested memory, or the memory available proportional to requested CPUs + # Fraction of CPU cores requested + check_proc_attribute_defined(test, 'num_cpus') + cpu_fraction = test.num_tasks_per_node * test.num_cpus_per_task / test.current_partition.processor.num_cpus + default_mem = cpu_fraction * test.current_partition.extras['mem_per_node'] + + # Request the maximum of the default_mem, and app_mem_req to the scheduler + req_mem_per_node = max(default_mem, app_mem_req) + if test.current_partition.scheduler.registered_name == 'slurm' or test.current_partition.scheduler.registered_name == 'squeue': + # SLURMs --mem defines memory per node, see https://slurm.schedmd.com/sbatch.html + test.extra_resources = {'memory': {'size': '%sG' % req_mem_per_node }} + log(f"Requested {req_mem_per_node}GB per node from the SLURM batch scheduler") + elif test.current_partition.scheduler.registered_name == 'torque': + # Torque/moab requires asking for --pmem (--mem only works single node and thus doesnt generalize) + # See https://docs.adaptivecomputing.com/10-0-1/Torque/torque.htm#topics/torque/3-jobs/3.1.3-requestingRes.htm + req_mem_per_task = req_mem_per_node / test.num_tasks_per_node + # We assume here the reframe config defines the extra resource memory as asking for pmem + # i.e. 'options': ['--pmem={size}'] + test.extra_resources = {'memory': {'size': '%sgb' % req_mem_per_task }} + log(f"Requested {req_mem_per_task}GB per task from the torque batch scheduler") + else: + logger = rflog.getlogger() + msg = "hooks.req_memory_per_node does not support the scheduler you configured" + msg += f" ({test.current_partition.scheduler.registered_name})." + msg += " The test will run, but since it doesn't request the required amount of memory explicitely," + msg += " it may result in an out-of-memory error." + msg += " Please expand the functionality of hooks.req_memory_per_node for your scheduler." 
+ # Warnings will, at default loglevel, be printed on stdout when executing the ReFrame command + logger.warning(msg) + + def set_modules(test: rfm.RegressionTest): """ Skip current test if module_name is not among a list of modules, diff --git a/eessi/testsuite/tests/apps/QuantumESPRESSO.py b/eessi/testsuite/tests/apps/QuantumESPRESSO.py index c8c7b96d..b98807d5 100644 --- a/eessi/testsuite/tests/apps/QuantumESPRESSO.py +++ b/eessi/testsuite/tests/apps/QuantumESPRESSO.py @@ -97,6 +97,11 @@ def run_after_setup(self): else: hooks.assign_tasks_per_compute_unit(test=self, compute_unit=COMPUTE_UNIT[CPU]) + @run_after('setup') + def request_mem(self): + memory_required = self.num_tasks_per_node * 1 + 2 + hooks.req_memory_per_node(test=self, app_mem_req=memory_required) + @run_after('setup') def set_omp_num_threads(self): """ diff --git a/eessi/testsuite/utils.py b/eessi/testsuite/utils.py index 9357cc60..ee679295 100644 --- a/eessi/testsuite/utils.py +++ b/eessi/testsuite/utils.py @@ -145,7 +145,41 @@ def check_proc_attribute_defined(test: rfm.RegressionTest, attribute) -> bool: else: msg = ( "This test's current_partition is not set yet. " - "The function utils.proc_attribute_defined should only be called after the setup() phase of ReFrame." + "The function utils.check_proc_attribute_defined should only be called after the setup() phase of ReFrame." "This is a programming error, please report this issue." ) - raise AttributeError(msg) + raise AttributeError(msg) + + +def check_extras_key_defined(test: rfm.RegressionTest, extra_key) -> bool: + """ + Checks if a specific key is defined in the 'extras' dictionary for the current partition + (i.e. if test.current_partition.extras[extra_key] is defined) + If not, throws an informative error message. + Note that partition extras are defined by free text keys, so any string is (potentially) valid. + + Arguments: + - test: the reframe regression test instance for which should be checked if the key is defined in 'extras' + - extra_key: key for which to check in the 'extras' dictionary + + Return: + - True (bool) if the key is defined + - Function does not return (but raises an error) if the attribute is undefined + """ + + if test.current_partition: + if extra_key in test.current_partition.extras: + return True + else: + msg = ( + f"Key '{extra_key}' missing in the 'extras' dictionary for partition '{test.current_partition.name}'." + "Please define this key for the relevant partition in the ReFrame configuration file (see " + "https://reframe-hpc.readthedocs.io/en/stable/config_reference.html#config.systems.partitions.extras)." + ) + else: + msg = ( + "This test's current_partition is not set yet. " + "The function utils.check_extras_key_defined should only be called after the setup() phase of ReFrame." + "This is a programming error, please report this issue." + ) + raise AttributeError(msg) From 4242927ff621066d4429a48e9e22582685762789 Mon Sep 17 00:00:00 2001 From: Caspar van Leeuwen Date: Thu, 16 May 2024 13:47:07 +0200 Subject: [PATCH 21/55] Convert to MB before requesting the scheduler, as schedulers (at least slurm) don't accept fractional memory requests. 
Rounding in MB we introduce less error --- eessi/testsuite/hooks.py | 27 +++++++++++++++++---------- 1 file changed, 17 insertions(+), 10 deletions(-) diff --git a/eessi/testsuite/hooks.py b/eessi/testsuite/hooks.py index dd7ea2bb..abac264e 100644 --- a/eessi/testsuite/hooks.py +++ b/eessi/testsuite/hooks.py @@ -377,7 +377,7 @@ def filter_valid_systems_by_device_type(test: rfm.RegressionTest, required_devic ## ie --mem=7.0G is invalid. This should be done as --mem=7168M. ## It's probably better if this function does everything in MB, and the ReFrame config also specifies available mem per node in MB. ## Then, we should make sure the numbers are integers by rounding up for app_mem_req (1 MB more should never really be an issue) -## and probably down for the default_mem (as to not ask for more than the equivalent share of a core) +## and probably down for the proportional_mem (as to not ask for more than the equivalent share of a core) def req_memory_per_node(test: rfm.RegressionTest, app_mem_req): """ This hook will request a specific amount of memory per node to the batch scheduler. @@ -390,10 +390,10 @@ def req_memory_per_node(test: rfm.RegressionTest, app_mem_req): Arguments: - test: the ReFrame test to which this hook should apply - - app_mem_req: the amount of memory this application needs (per node) in gigabytes + - app_mem_req: the amount of memory this application needs (per node) in megabytes Example 1: - - A system with 128 cores per node, 64 GB mem per node is used. + - A system with 128 cores and 64 GB per node. - The test is launched on 64 cores - The app_mem_req is 40 (GB) In this case, the test requests 50% of the CPUs. Thus, the proportional amount of memory is 32 GB. @@ -419,22 +419,29 @@ def req_memory_per_node(test: rfm.RegressionTest, app_mem_req): # Fraction of CPU cores requested check_proc_attribute_defined(test, 'num_cpus') cpu_fraction = test.num_tasks_per_node * test.num_cpus_per_task / test.current_partition.processor.num_cpus - default_mem = cpu_fraction * test.current_partition.extras['mem_per_node'] + proportional_mem = cpu_fraction * test.current_partition.extras['mem_per_node'] - # Request the maximum of the default_mem, and app_mem_req to the scheduler - req_mem_per_node = max(default_mem, app_mem_req) + # First convert to MB and round - schedulers typically don't allow fractional numbers + # (and we want to reduce roundoff error, hence MB) + # Round up for app_mem_req to be on the save side: + app_mem_req = math.ceil(1024 * app_mem_req) + # Round down for proportional_mem, so we don't ask more than what is available per node + proportional_mem = math.floor(1024 * proportional_mem) + + # Request the maximum of the proportional_mem, and app_mem_req to the scheduler + req_mem_per_node = max(proportional_mem, app_mem_req) if test.current_partition.scheduler.registered_name == 'slurm' or test.current_partition.scheduler.registered_name == 'squeue': # SLURMs --mem defines memory per node, see https://slurm.schedmd.com/sbatch.html - test.extra_resources = {'memory': {'size': '%sG' % req_mem_per_node }} - log(f"Requested {req_mem_per_node}GB per node from the SLURM batch scheduler") + test.extra_resources = {'memory': {'size': '%sM' % req_mem_per_node }} + log(f"Requested {req_mem_per_node}MB per node from the SLURM batch scheduler") elif test.current_partition.scheduler.registered_name == 'torque': # Torque/moab requires asking for --pmem (--mem only works single node and thus doesnt generalize) # See 
https://docs.adaptivecomputing.com/10-0-1/Torque/torque.htm#topics/torque/3-jobs/3.1.3-requestingRes.htm req_mem_per_task = req_mem_per_node / test.num_tasks_per_node # We assume here the reframe config defines the extra resource memory as asking for pmem # i.e. 'options': ['--pmem={size}'] - test.extra_resources = {'memory': {'size': '%sgb' % req_mem_per_task }} - log(f"Requested {req_mem_per_task}GB per task from the torque batch scheduler") + test.extra_resources = {'memory': {'size': '%smb' % req_mem_per_task }} + log(f"Requested {req_mem_per_task}MB per task from the torque batch scheduler") else: logger = rflog.getlogger() msg = "hooks.req_memory_per_node does not support the scheduler you configured" From 4b87213947bdda8b5c1391997dcab11ddd0b0a01 Mon Sep 17 00:00:00 2001 From: Caspar van Leeuwen Date: Thu, 16 May 2024 16:27:51 +0200 Subject: [PATCH 22/55] Make sure all user defined inputs are in base-2, i.e. gibibytes etc. SLURM takes gigabytes (base-10). Torque supposedly takes mebibytes (base-2). --- config/izum_vega.py | 5 ++++ eessi/testsuite/hooks.py | 57 ++++++++++++++++++++++++---------------- 2 files changed, 39 insertions(+), 23 deletions(-) diff --git a/config/izum_vega.py b/config/izum_vega.py index 4c67792b..77d75796 100644 --- a/config/izum_vega.py +++ b/config/izum_vega.py @@ -59,6 +59,11 @@ 'features': [ FEATURES[CPU], ] + list(SCALES.keys()), + 'extras': { + # Make sure to round down, otherwise a job might ask for more mem than is available + # per node + 'mem_per_node': 238.418 # in GiB + }, 'descr': 'CPU partition Standard, see https://en-doc.vega.izum.si/architecture/' }, { diff --git a/eessi/testsuite/hooks.py b/eessi/testsuite/hooks.py index abac264e..a97e7d45 100644 --- a/eessi/testsuite/hooks.py +++ b/eessi/testsuite/hooks.py @@ -387,32 +387,33 @@ def req_memory_per_node(test: rfm.RegressionTest, app_mem_req): It then passes the maximum of these two numbers to the batch scheduler as a memory request. Note: using this hook requires that the ReFrame configuration defines system.partition.extras['mem_per_node'] + That field should be defined in GiB Arguments: - test: the ReFrame test to which this hook should apply - - app_mem_req: the amount of memory this application needs (per node) in megabytes + - app_mem_req: the amount of memory this application needs (per node) in GiB Example 1: - - A system with 128 cores and 64 GB per node. + - A system with 128 cores and 64 GiB per node. - The test is launched on 64 cores - - The app_mem_req is 40 (GB) - In this case, the test requests 50% of the CPUs. Thus, the proportional amount of memory is 32 GB. - The app_mem_req is higher. Thus, 40GB (per node) is requested from the batch scheduler. + - The app_mem_req is 40 (GiB) + In this case, the test requests 50% of the CPUs. Thus, the proportional amount of memory is 32 GiB. + The app_mem_req is higher. Thus, 40GiB (per node) is requested from the batch scheduler. Example 2: - - A system with 128 cores per node, 128 GB mem per node is used. + - A system with 128 cores per node, 128 GiB mem per node is used. - The test is launched on 64 cores - - the app_mem_req is 40 (GB) - In this case, the test requests 50% of the CPUs. Thus, the proportional amount of memory is 64 GB. - This is higher than the app_mem_req. Thus, 64 GB (per node) is requested from the batch scheduler. + - the app_mem_req is 40 (GiB) + In this case, the test requests 50% of the CPUs. Thus, the proportional amount of memory is 64 GiB. + This is higher than the app_mem_req. 
Thus, 64 GiB (per node) is requested from the batch scheduler. """ # Check that the systems.partitions.extra dict in the ReFrame config contains mem_per_node check_extras_key_defined(test, 'mem_per_node') # Skip if the current partition doesn't have sufficient memory to run the application - msg = f"Skipping test: nodes in this partition only have {test.current_partition.extras['mem_per_node']} GB" + msg = f"Skipping test: nodes in this partition only have {test.current_partition.extras['mem_per_node']} GiB" msg += " memory available (per node) accodring to the current ReFrame configuration," - msg += f" but {app_mem_req} GB is needed" + msg += f" but {app_mem_req} GiB is needed" test.skip_if(test.current_partition.extras['mem_per_node'] < app_mem_req, msg) # Compute what is higher: the requested memory, or the memory available proportional to requested CPUs @@ -421,27 +422,37 @@ def req_memory_per_node(test: rfm.RegressionTest, app_mem_req): cpu_fraction = test.num_tasks_per_node * test.num_cpus_per_task / test.current_partition.processor.num_cpus proportional_mem = cpu_fraction * test.current_partition.extras['mem_per_node'] - # First convert to MB and round - schedulers typically don't allow fractional numbers - # (and we want to reduce roundoff error, hence MB) - # Round up for app_mem_req to be on the save side: - app_mem_req = math.ceil(1024 * app_mem_req) - # Round down for proportional_mem, so we don't ask more than what is available per node - proportional_mem = math.floor(1024 * proportional_mem) - - # Request the maximum of the proportional_mem, and app_mem_req to the scheduler - req_mem_per_node = max(proportional_mem, app_mem_req) if test.current_partition.scheduler.registered_name == 'slurm' or test.current_partition.scheduler.registered_name == 'squeue': # SLURMs --mem defines memory per node, see https://slurm.schedmd.com/sbatch.html + # SLURM uses megabytes and gigabytes, i.e. 
base-10, so conversion is 1000, not 1024 + # Thus, we convert from GiB (gibibytes) to MB (megabytes) (1024 * 1024 * 1024 / (1000 * 1000) = 1073.741824) + app_mem_req = math.ceil(1073.741824 * app_mem_req) + log(f"Memory requested by application: {app_mem_req} MB") + proportional_mem = math.floor(1073.741824 * proportional_mem) + log(f"Memory proportional to the core count: {proportional_mem} MB") + + # Request the maximum of the proportional_mem, and app_mem_req to the scheduler + req_mem_per_node = max(proportional_mem, app_mem_req) + test.extra_resources = {'memory': {'size': '%sM' % req_mem_per_node }} - log(f"Requested {req_mem_per_node}MB per node from the SLURM batch scheduler") + log(f"Requested {req_mem_per_node} MB per node from the SLURM batch scheduler") + elif test.current_partition.scheduler.registered_name == 'torque': # Torque/moab requires asking for --pmem (--mem only works single node and thus doesnt generalize) # See https://docs.adaptivecomputing.com/10-0-1/Torque/torque.htm#topics/torque/3-jobs/3.1.3-requestingRes.htm - req_mem_per_task = req_mem_per_node / test.num_tasks_per_node + # Units are MiB according to the documentation, thus, we simply multiply with 1024 + # We immediately divide by num_tasks_per_node (before rounding), since -pmem specifies memroy _per process_ + app_mem_req_task = math.ceil(1024 * app_mem_req / test.num_tasks_per_node) + proportional_mem_task = math.floor(1024 * proportional_mem / test.num_tasks_per_node) + + # Request the maximum of the proportional_mem, and app_mem_req to the scheduler + req_mem_per_task = max(proportional_mem_task, app_mem_req_task) + # We assume here the reframe config defines the extra resource memory as asking for pmem # i.e. 'options': ['--pmem={size}'] test.extra_resources = {'memory': {'size': '%smb' % req_mem_per_task }} - log(f"Requested {req_mem_per_task}MB per task from the torque batch scheduler") + log(f"Requested {req_mem_per_task} MiB per task from the torque batch scheduler") + else: logger = rflog.getlogger() msg = "hooks.req_memory_per_node does not support the scheduler you configured" From e2de0ee69dae59531a23d876f319f09cc295d365 Mon Sep 17 00:00:00 2001 From: Caspar van Leeuwen Date: Thu, 16 May 2024 16:28:40 +0200 Subject: [PATCH 23/55] Make memory requirement tighter, so that we don't exceed the 1 GB/core for large task counts --- eessi/testsuite/tests/apps/QuantumESPRESSO.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/eessi/testsuite/tests/apps/QuantumESPRESSO.py b/eessi/testsuite/tests/apps/QuantumESPRESSO.py index b98807d5..050e43d3 100644 --- a/eessi/testsuite/tests/apps/QuantumESPRESSO.py +++ b/eessi/testsuite/tests/apps/QuantumESPRESSO.py @@ -99,7 +99,7 @@ def run_after_setup(self): @run_after('setup') def request_mem(self): - memory_required = self.num_tasks_per_node * 1 + 2 + memory_required = self.num_tasks_per_node * 0.9 + 4 hooks.req_memory_per_node(test=self, app_mem_req=memory_required) @run_after('setup') From 48878d187f4f035fd4fc45b05e92e07ace98d8ca Mon Sep 17 00:00:00 2001 From: Caspar van Leeuwen Date: Thu, 16 May 2024 16:55:37 +0200 Subject: [PATCH 24/55] Add some memory for standard system configs. Will do aws_mc later... 
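As an aside: the `mem_per_node` values added to the site configs below are exactly what the `req_memory_per_node` hook from the patches above consumes. A minimal sketch of that arithmetic, assuming a hypothetical partition with 128 cores and 256 GiB per node and a half-node job that claims to need 40 GiB per node (all numbers illustrative, not taken from any real configuration):

```python
import math

# Hypothetical partition, as it would appear in a ReFrame config (illustrative values only)
num_cpus_per_node = 128
mem_per_node_gib = 256        # extras['mem_per_node'], in GiB

# Hypothetical job: half a node, application claims to need 40 GiB per node
num_tasks_per_node = 64
num_cpus_per_task = 1
app_mem_req_gib = 40

# Memory proportional to the requested share of cores
cpu_fraction = num_tasks_per_node * num_cpus_per_task / num_cpus_per_node
proportional_mem_gib = cpu_fraction * mem_per_node_gib                # 0.5 * 256 = 128 GiB

# Convert GiB -> MB for Slurm's --mem (1 GiB = 1073.741824 MB), rounding up for the
# application request (safe side) and down for the proportional share (don't over-ask)
app_mem_req_mb = math.ceil(1073.741824 * app_mem_req_gib)             # 42950
proportional_mem_mb = math.floor(1073.741824 * proportional_mem_gib)  # 137438

req_mem_per_node_mb = max(proportional_mem_mb, app_mem_req_mb)
print(f"--mem={req_mem_per_node_mb}M")                                # --mem=137438M
```

Whichever of the two numbers is larger ends up in the scheduler request, so a job on a memory-rich partition never gets less than its proportional per-core share.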
--- config/github_actions.py | 6 +++ config/it4i_karolina.py | 5 +++ config/izum_vega.py | 83 +++++++++++++++++++++------------------- config/surf_snellius.py | 12 +++++- 4 files changed, 66 insertions(+), 40 deletions(-) diff --git a/config/github_actions.py b/config/github_actions.py index 5328f6f3..0060b7ab 100644 --- a/config/github_actions.py +++ b/config/github_actions.py @@ -26,6 +26,12 @@ } ], 'max_jobs': 1 + 'extras': { + # Make sure to round down, otherwise a job might ask for more mem than is available + # per node + # This is a fictional amount, GH actions probably has less, but only does --dry-run + 'mem_per_node': 30 # in GiB + }, } ] } diff --git a/config/it4i_karolina.py b/config/it4i_karolina.py index 90062c85..d395d911 100644 --- a/config/it4i_karolina.py +++ b/config/it4i_karolina.py @@ -53,6 +53,11 @@ 'features': [ FEATURES[CPU], ] + list(SCALES.keys()), + 'extras': { + # Make sure to round down, otherwise a job might ask for more mem than is available + # per node + 'mem_per_node': 219.345 # in GiB + }, 'descr': 'CPU Universal Compute Nodes, see https://docs.it4i.cz/karolina/hardware-overview/' }, # We don't have GPU budget on Karolina at this time diff --git a/config/izum_vega.py b/config/izum_vega.py index 77d75796..3adb1504 100644 --- a/config/izum_vega.py +++ b/config/izum_vega.py @@ -66,45 +66,50 @@ }, 'descr': 'CPU partition Standard, see https://en-doc.vega.izum.si/architecture/' }, - { - 'name': 'gpu', - 'scheduler': 'slurm', - 'prepare_cmds': [ - 'source %s' % common_eessi_init(), - # Pass job environment variables like $PATH, etc., into job steps - 'export SLURM_EXPORT_ENV=ALL', - # Needed when using srun launcher - # 'export SLURM_MPI_TYPE=pmix', # WARNING: this broke the GROMACS on Vega - # Avoid https://github.com/EESSI/software-layer/issues/136 - # Can be taken out once we don't care about old OpenMPI versions anymore (pre-4.1.1) - 'export OMPI_MCA_pml=ucx', - ], - 'launcher': 'mpirun', - # Use --export=None to avoid that login environment is passed down to submitted jobs - 'access': ['-p gpu', '--export=None'], - 'environs': ['default'], - 'max_jobs': 60, - 'devices': [ - { - 'type': DEVICE_TYPES[GPU], - 'num_devices': 4, - } - ], - 'resources': [ - { - 'name': '_rfm_gpu', - 'options': ['--gpus-per-node={num_gpus_per_node}'], - }, - { - 'name': 'memory', - 'options': ['--mem={size}'], - } - ], - 'features': [ - FEATURES[GPU], - ] + list(SCALES.keys()), - 'descr': 'GPU partition, see https://en-doc.vega.izum.si/architecture/' - }, +# { +# 'name': 'gpu', +# 'scheduler': 'slurm', +# 'prepare_cmds': [ +# 'source %s' % common_eessi_init(), +# # Pass job environment variables like $PATH, etc., into job steps +# 'export SLURM_EXPORT_ENV=ALL', +# # Needed when using srun launcher +# # 'export SLURM_MPI_TYPE=pmix', # WARNING: this broke the GROMACS on Vega +# # Avoid https://github.com/EESSI/software-layer/issues/136 +# # Can be taken out once we don't care about old OpenMPI versions anymore (pre-4.1.1) +# 'export OMPI_MCA_pml=ucx', +# ], +# 'launcher': 'mpirun', +# # Use --export=None to avoid that login environment is passed down to submitted jobs +# 'access': ['-p gpu', '--export=None'], +# 'environs': ['default'], +# 'max_jobs': 60, +# 'devices': [ +# { +# 'type': DEVICE_TYPES[GPU], +# 'num_devices': 4, +# } +# ], +# 'resources': [ +# { +# 'name': '_rfm_gpu', +# 'options': ['--gpus-per-node={num_gpus_per_node}'], +# }, +# { +# 'name': 'memory', +# 'options': ['--mem={size}'], +# } +# ], +# 'features': [ +# FEATURES[GPU], +# ] + list(SCALES.keys()), +# 
'extras': { +# # Make sure to round down, otherwise a job might ask for more mem than is available +# # per node +# 'mem_per_node': 476.837 # in GiB (should be checked, its unclear from slurm.conf) +# }, +# 'descr': 'GPU partition, see https://en-doc.vega.izum.si/architecture/' +# }, ] }, ], diff --git a/config/surf_snellius.py b/config/surf_snellius.py index 542f3ee2..d8bcc36c 100644 --- a/config/surf_snellius.py +++ b/config/surf_snellius.py @@ -53,6 +53,11 @@ 'features': [ FEATURES[CPU], ] + list(SCALES.keys()), + 'extras': { + # Make sure to round down, otherwise a job might ask for more mem than is available + # per node + 'mem_per_node': 213.623 # in GiB + }, 'descr': 'AMD Rome CPU partition with native EESSI stack' }, { @@ -73,7 +78,9 @@ FEATURES[CPU], ] + list(SCALES.keys()), 'extras': { - 'mem_per_node': 336 + # Make sure to round down, otherwise a job might ask for more mem than is available + # per node + 'mem_per_node': 320.434 # in GiB }, 'descr': 'AMD Genoa CPU partition with native EESSI stack' }, @@ -108,6 +115,9 @@ ] + valid_scales_snellius_gpu, 'extras': { GPU_VENDOR: GPU_VENDORS[NVIDIA], + # Make sure to round down, otherwise a job might ask for more mem than is available + # per node + 'mem_per_node': 457.763 # in GiB }, 'descr': 'Nvidia A100 GPU partition with native EESSI stack' }, From 94e616197a89e6dd9ccbf073fc3be9c7970ae8f9 Mon Sep 17 00:00:00 2001 From: Caspar van Leeuwen <33718780+casparvl@users.noreply.github.com> Date: Thu, 23 May 2024 14:03:10 +0200 Subject: [PATCH 25/55] Apply suggestions from code review Fix CI issues Co-authored-by: Davide Grassano <34096612+Crivella@users.noreply.github.com> --- config/github_actions.py | 2 +- config/izum_vega.py | 88 ++++++++++++++++++++-------------------- eessi/testsuite/hooks.py | 11 +++-- 3 files changed, 50 insertions(+), 51 deletions(-) diff --git a/config/github_actions.py b/config/github_actions.py index 0060b7ab..b6555d28 100644 --- a/config/github_actions.py +++ b/config/github_actions.py @@ -25,7 +25,7 @@ 'options': ['--mem={size}'], } ], - 'max_jobs': 1 + 'max_jobs': 1, 'extras': { # Make sure to round down, otherwise a job might ask for more mem than is available # per node diff --git a/config/izum_vega.py b/config/izum_vega.py index 3adb1504..765d6a69 100644 --- a/config/izum_vega.py +++ b/config/izum_vega.py @@ -66,50 +66,50 @@ }, 'descr': 'CPU partition Standard, see https://en-doc.vega.izum.si/architecture/' }, -# { -# 'name': 'gpu', -# 'scheduler': 'slurm', -# 'prepare_cmds': [ -# 'source %s' % common_eessi_init(), -# # Pass job environment variables like $PATH, etc., into job steps -# 'export SLURM_EXPORT_ENV=ALL', -# # Needed when using srun launcher -# # 'export SLURM_MPI_TYPE=pmix', # WARNING: this broke the GROMACS on Vega -# # Avoid https://github.com/EESSI/software-layer/issues/136 -# # Can be taken out once we don't care about old OpenMPI versions anymore (pre-4.1.1) -# 'export OMPI_MCA_pml=ucx', -# ], -# 'launcher': 'mpirun', -# # Use --export=None to avoid that login environment is passed down to submitted jobs -# 'access': ['-p gpu', '--export=None'], -# 'environs': ['default'], -# 'max_jobs': 60, -# 'devices': [ -# { -# 'type': DEVICE_TYPES[GPU], -# 'num_devices': 4, -# } -# ], -# 'resources': [ -# { -# 'name': '_rfm_gpu', -# 'options': ['--gpus-per-node={num_gpus_per_node}'], -# }, -# { -# 'name': 'memory', -# 'options': ['--mem={size}'], -# } -# ], -# 'features': [ -# FEATURES[GPU], -# ] + list(SCALES.keys()), -# 'extras': { -# # Make sure to round down, otherwise a job might ask for more 
mem than is available -# # per node -# 'mem_per_node': 476.837 # in GiB (should be checked, its unclear from slurm.conf) -# }, -# 'descr': 'GPU partition, see https://en-doc.vega.izum.si/architecture/' -# }, +{ + # 'name': 'gpu', + # 'scheduler': 'slurm', + # 'prepare_cmds': [ + # 'source %s' % common_eessi_init(), + # # Pass job environment variables like $PATH, etc., into job steps + # 'export SLURM_EXPORT_ENV=ALL', + # # Needed when using srun launcher + # # 'export SLURM_MPI_TYPE=pmix', # WARNING: this broke the GROMACS on Vega + # # Avoid https://github.com/EESSI/software-layer/issues/136 + # # Can be taken out once we don't care about old OpenMPI versions anymore (pre-4.1.1) + # 'export OMPI_MCA_pml=ucx', + # ], + # 'launcher': 'mpirun', + # # Use --export=None to avoid that login environment is passed down to submitted jobs + # 'access': ['-p gpu', '--export=None'], + # 'environs': ['default'], + # 'max_jobs': 60, + # 'devices': [ + # { + # 'type': DEVICE_TYPES[GPU], + # 'num_devices': 4, + # } + # ], + # 'resources': [ + # { + # 'name': '_rfm_gpu', + # 'options': ['--gpus-per-node={num_gpus_per_node}'], + # }, + # { + # 'name': 'memory', + # 'options': ['--mem={size}'], + # } + # ], + # 'features': [ + # FEATURES[GPU], + # ] + list(SCALES.keys()), + # 'extras': { + # # Make sure to round down, otherwise a job might ask for more mem than is available + # # per node + # 'mem_per_node': 476.837 # in GiB (should be checked, its unclear from slurm.conf) + # }, + # 'descr': 'GPU partition, see https://en-doc.vega.izum.si/architecture/' + # }, ] }, ], diff --git a/eessi/testsuite/hooks.py b/eessi/testsuite/hooks.py index a97e7d45..9e838904 100644 --- a/eessi/testsuite/hooks.py +++ b/eessi/testsuite/hooks.py @@ -409,7 +409,6 @@ def req_memory_per_node(test: rfm.RegressionTest, app_mem_req): """ # Check that the systems.partitions.extra dict in the ReFrame config contains mem_per_node check_extras_key_defined(test, 'mem_per_node') - # Skip if the current partition doesn't have sufficient memory to run the application msg = f"Skipping test: nodes in this partition only have {test.current_partition.extras['mem_per_node']} GiB" msg += " memory available (per node) accodring to the current ReFrame configuration," @@ -422,7 +421,8 @@ def req_memory_per_node(test: rfm.RegressionTest, app_mem_req): cpu_fraction = test.num_tasks_per_node * test.num_cpus_per_task / test.current_partition.processor.num_cpus proportional_mem = cpu_fraction * test.current_partition.extras['mem_per_node'] - if test.current_partition.scheduler.registered_name == 'slurm' or test.current_partition.scheduler.registered_name == 'squeue': + scheduler_name = test.current_partition.scheduler.registered_name + if scheduler_name == 'slurm' or scheduler_name == 'squeue': # SLURMs --mem defines memory per node, see https://slurm.schedmd.com/sbatch.html # SLURM uses megabytes and gigabytes, i.e. 
base-10, so conversion is 1000, not 1024 # Thus, we convert from GiB (gibibytes) to MB (megabytes) (1024 * 1024 * 1024 / (1000 * 1000) = 1073.741824) @@ -434,10 +434,10 @@ def req_memory_per_node(test: rfm.RegressionTest, app_mem_req): # Request the maximum of the proportional_mem, and app_mem_req to the scheduler req_mem_per_node = max(proportional_mem, app_mem_req) - test.extra_resources = {'memory': {'size': '%sM' % req_mem_per_node }} + test.extra_resources = {'memory': {'size': f'{req_mem_per_node}M' }} log(f"Requested {req_mem_per_node} MB per node from the SLURM batch scheduler") - elif test.current_partition.scheduler.registered_name == 'torque': + elif scheduler_name == 'torque': # Torque/moab requires asking for --pmem (--mem only works single node and thus doesnt generalize) # See https://docs.adaptivecomputing.com/10-0-1/Torque/torque.htm#topics/torque/3-jobs/3.1.3-requestingRes.htm # Units are MiB according to the documentation, thus, we simply multiply with 1024 @@ -450,7 +450,7 @@ def req_memory_per_node(test: rfm.RegressionTest, app_mem_req): # We assume here the reframe config defines the extra resource memory as asking for pmem # i.e. 'options': ['--pmem={size}'] - test.extra_resources = {'memory': {'size': '%smb' % req_mem_per_task }} + test.extra_resources = {'memory': {'size': f'{req_mem_per_task}mb'}} log(f"Requested {req_mem_per_task} MiB per task from the torque batch scheduler") else: @@ -462,7 +462,6 @@ def req_memory_per_node(test: rfm.RegressionTest, app_mem_req): msg += " Please expand the functionality of hooks.req_memory_per_node for your scheduler." # Warnings will, at default loglevel, be printed on stdout when executing the ReFrame command logger.warning(msg) - def set_modules(test: rfm.RegressionTest): """ From 360df652893fa8913a6e7fcad860f029355e6b91 Mon Sep 17 00:00:00 2001 From: Davide Grassano <34096612+Crivella@users.noreply.github.com> Date: Thu, 23 May 2024 14:15:47 +0200 Subject: [PATCH 26/55] Apply suggestions from code review Apply fixes --- config/izum_vega.py | 2 +- eessi/testsuite/hooks.py | 8 ++------ 2 files changed, 3 insertions(+), 7 deletions(-) diff --git a/config/izum_vega.py b/config/izum_vega.py index 765d6a69..f7193aed 100644 --- a/config/izum_vega.py +++ b/config/izum_vega.py @@ -66,7 +66,7 @@ }, 'descr': 'CPU partition Standard, see https://en-doc.vega.izum.si/architecture/' }, -{ + # { # 'name': 'gpu', # 'scheduler': 'slurm', # 'prepare_cmds': [ diff --git a/eessi/testsuite/hooks.py b/eessi/testsuite/hooks.py index 9e838904..72a72569 100644 --- a/eessi/testsuite/hooks.py +++ b/eessi/testsuite/hooks.py @@ -373,11 +373,6 @@ def filter_valid_systems_by_device_type(test: rfm.RegressionTest, required_devic log(f'valid_systems set to {test.valid_systems}') -## TODO: function should take everything in MB, as schedulers (at least slurm) does not except asking for fractional memory -## ie --mem=7.0G is invalid. This should be done as --mem=7168M. -## It's probably better if this function does everything in MB, and the ReFrame config also specifies available mem per node in MB. -## Then, we should make sure the numbers are integers by rounding up for app_mem_req (1 MB more should never really be an issue) -## and probably down for the proportional_mem (as to not ask for more than the equivalent share of a core) def req_memory_per_node(test: rfm.RegressionTest, app_mem_req): """ This hook will request a specific amount of memory per node to the batch scheduler. 
@@ -434,7 +429,7 @@ def req_memory_per_node(test: rfm.RegressionTest, app_mem_req): # Request the maximum of the proportional_mem, and app_mem_req to the scheduler req_mem_per_node = max(proportional_mem, app_mem_req) - test.extra_resources = {'memory': {'size': f'{req_mem_per_node}M' }} + test.extra_resources = {'memory': {'size': f'{req_mem_per_node}M'}} log(f"Requested {req_mem_per_node} MB per node from the SLURM batch scheduler") elif scheduler_name == 'torque': @@ -463,6 +458,7 @@ def req_memory_per_node(test: rfm.RegressionTest, app_mem_req): # Warnings will, at default loglevel, be printed on stdout when executing the ReFrame command logger.warning(msg) + def set_modules(test: rfm.RegressionTest): """ Skip current test if module_name is not among a list of modules, From b66c62722133ce561196b0b201a0c5436702a89d Mon Sep 17 00:00:00 2001 From: crivella Date: Thu, 23 May 2024 14:21:28 +0200 Subject: [PATCH 27/55] Linting --- eessi/testsuite/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/eessi/testsuite/utils.py b/eessi/testsuite/utils.py index ac408b45..ee679295 100644 --- a/eessi/testsuite/utils.py +++ b/eessi/testsuite/utils.py @@ -150,7 +150,7 @@ def check_proc_attribute_defined(test: rfm.RegressionTest, attribute) -> bool: ) raise AttributeError(msg) - + def check_extras_key_defined(test: rfm.RegressionTest, extra_key) -> bool: """ Checks if a specific key is defined in the 'extras' dictionary for the current partition From 387bd80891b832013ce17e26476953f031c5e2e3 Mon Sep 17 00:00:00 2001 From: Kenneth Hoste Date: Thu, 23 May 2024 14:46:37 +0200 Subject: [PATCH 28/55] use software.eessi.io repo in CI --- .github/workflows/test.yml | 30 +++++++----------------------- 1 file changed, 7 insertions(+), 23 deletions(-) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index f5686848..73471776 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -1,5 +1,5 @@ # documentation: https://help.github.com/en/articles/workflow-syntax-for-github-actions -name: Tests for EESSI test suite, using EESSI pilot repo +name: Tests for EESSI test suite, using EESSI production repo on: [push, pull_request, workflow_dispatch] permissions: read-all jobs: @@ -9,39 +9,23 @@ jobs: fail-fast: false matrix: EESSI_VERSION: - - "2021.12" + - '2023.06' steps: - name: Check out software-layer repository uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1 with: persist-credentials: false - - name: Mount EESSI CernVM-FS pilot repository - uses: cvmfs-contrib/github-action-cvmfs@55899ca74cf78ab874bdf47f5a804e47c198743c # v4.0 + - name: Mount EESSI CernVM-FS production repository + uses: eessi/github-action-eessi@e1f8f20638ea417a18d23ab29443ee34794ff900 # v3.1.0 with: - cvmfs_config_package: https://github.com/EESSI/filesystem-layer/releases/download/latest/cvmfs-config-eessi_latest_all.deb - cvmfs_http_proxy: DIRECT - cvmfs_repositories: pilot.eessi-hpc.org + eessi_stack_version: ${{matrix.EESSI_VERSION}} - name: Run test suite run: | - source /cvmfs/pilot.eessi-hpc.org/versions/${{matrix.EESSI_VERSION}}/init/bash + source /cvmfs/software.eessi.io/versions/${{matrix.EESSI_VERSION}}/init/bash - # install latest version of EasyBuild, to install ReFrame with it, - # since that includes the ReFrame test library (hpctestlib) that we rely on - python3 -m venv venv - source venv/bin/activate - pip3 install easybuild - eb --version - export EASYBUILD_PREFIX=$HOME/easybuild - # need to force module generation with --module-only --force 
because 'pip check' fails - # in EESSI pilot 2021.12, see https://github.com/EESSI/compatibility-layer/issues/152 - eb ReFrame-4.3.3.eb || eb ReFrame-4.3.3.eb --module-only --force - - # load ReFrame - module use $HOME/easybuild/modules/all - - module load ReFrame/4.3.3 + module load ReFrame reframe --version # configure ReFrame (cfr. https://reframe-hpc.readthedocs.io/en/stable/manpage.html#environment) From 471becc9c14d7ccc367feb4de4f8443ac10fa6f2 Mon Sep 17 00:00:00 2001 From: Kenneth Hoste Date: Thu, 23 May 2024 15:12:34 +0200 Subject: [PATCH 29/55] bump version to 0.2.0 --- RELEASE_NOTES | 2 +- pyproject.toml | 2 +- setup.cfg | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/RELEASE_NOTES b/RELEASE_NOTES index 3efe3cc4..79bd3dab 100644 --- a/RELEASE_NOTES +++ b/RELEASE_NOTES @@ -1,7 +1,7 @@ This file contains a description of the major changes to the EESSI test suite. For more detailed information, please see the git log. -v0.2.0 (7 march 2024) +v0.2.0 (7 March 2024) --------------------- This is a minor release of the EESSI test-suite diff --git a/pyproject.toml b/pyproject.toml index 3c374a5c..2b3b607c 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta" [project] name = "eessi-testsuite" -version = "0.1.0" +version = "0.2.0" description = "Test suite for the EESSI software stack" readme = "README.md" license = {file = "LICENSE"} diff --git a/setup.cfg b/setup.cfg index 87b688e7..49a7b178 100644 --- a/setup.cfg +++ b/setup.cfg @@ -1,6 +1,6 @@ [metadata] name = eessi-testsuite -version = 0.1.0 +version = 0.2.0 description = Test suite for the EESSI software stack long_description = file: README.md long_description_content_type = text/markdown From 8fcdb704d9a065f5a22d869312345bc10d1cdafc Mon Sep 17 00:00:00 2001 From: Kenneth Hoste Date: Thu, 23 May 2024 15:32:21 +0200 Subject: [PATCH 30/55] add notes on release management to README --- README.md | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/README.md b/README.md index 5049bbf8..72878010 100644 --- a/README.md +++ b/README.md @@ -98,3 +98,17 @@ is that it is easy to pull in updates from a feature branch using `git pull`. You can also push back changes to the feature branch directly, but note that you are pushing to the Github fork of another Github user, so _make sure they are ok with that_ before doing so! + +## Release management + +When a release of the EESSI test suite is made, the following things must be taken care of: + +- Version bump: in both `pyproject.toml` and `setup.cfg`; +- Release notes: in `RELEASE_NOTES` + in GitHub release (cfr. https://github.com/EESSI/test-suite/releases/tag/v0.2.0); +- Tag release on GitHub + publish release (incl. 
release notes); +- Publishing release to PyPI: + ``` + # example for version 0.2.0 + python setup.py sdist + twine upload dist/eessi_testsuite-0.2.0.tar.gz + ``` From e4d8a5183e9246e597c38ddd12f2be11940d86e5 Mon Sep 17 00:00:00 2001 From: vsc46128 vscuser Date: Fri, 24 May 2024 16:26:51 +0200 Subject: [PATCH 31/55] update hortense config --- config/vsc_hortense.py | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/config/vsc_hortense.py b/config/vsc_hortense.py index fbfa9e4c..f8a83b4f 100644 --- a/config/vsc_hortense.py +++ b/config/vsc_hortense.py @@ -54,6 +54,11 @@ def command(self, job): 'features': [ FEATURES[CPU], ] + list(SCALES.keys()), + 'extras': { + # Make sure to round down, otherwise a job might ask for more mem than is available + # per node + 'mem_per_node': 256.000 # in GiB (should be checked, its unclear from slurm.conf) + }, }, { 'name': 'cpu_rome_512gb', @@ -81,6 +86,11 @@ def command(self, job): 'features': [ FEATURES[CPU], ] + list(SCALES.keys()), + 'extras': { + # Make sure to round down, otherwise a job might ask for more mem than is available + # per node + 'mem_per_node': 511.983 # in GiB + }, }, { 'name': 'cpu_milan', @@ -108,6 +118,11 @@ def command(self, job): 'features': [ FEATURES[CPU], ] + list(SCALES.keys()), + 'extras': { + # Make sure to round down, otherwise a job might ask for more mem than is available + # per node + 'mem_per_node': 256.000 # in GiB (should be checked, its unclear from slurm.conf) + }, }, { 'name': 'gpu_rome_a100_40gb', @@ -131,6 +146,9 @@ def command(self, job): ] + list(SCALES.keys()), 'extras': { GPU_VENDOR: GPU_VENDORS[NVIDIA], + # Make sure to round down, otherwise a job might ask for more mem than is available + # per node + 'mem_per_node': 256.000 # in GiB }, 'resources': [ { @@ -172,6 +190,9 @@ def command(self, job): ] + list(SCALES.keys()), 'extras': { GPU_VENDOR: GPU_VENDORS[NVIDIA], + # Make sure to round down, otherwise a job might ask for more mem than is available + # per node + 'mem_per_node': 511.983 # in GiB }, 'resources': [ { From 58a716d1225384ad678514ecb53fe4e577dd278b Mon Sep 17 00:00:00 2001 From: crivella Date: Fri, 24 May 2024 17:31:28 +0200 Subject: [PATCH 32/55] lint --- config/vsc_hortense.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/config/vsc_hortense.py b/config/vsc_hortense.py index f8a83b4f..f349bf60 100644 --- a/config/vsc_hortense.py +++ b/config/vsc_hortense.py @@ -6,7 +6,9 @@ from reframe.core.backends import register_launcher from reframe.core.launchers import JobLauncher -from eessi.testsuite.common_config import common_logging_config, common_general_config, common_eessi_init +from eessi.testsuite.common_config import (common_eessi_init, + common_general_config, + common_logging_config) from eessi.testsuite.constants import * # noqa: F403 account = "my-slurm-account" @@ -56,8 +58,8 @@ def command(self, job): ] + list(SCALES.keys()), 'extras': { # Make sure to round down, otherwise a job might ask for more mem than is available - # per node - 'mem_per_node': 256.000 # in GiB (should be checked, its unclear from slurm.conf) + # per node + 'mem_per_node': 256.000 # in GiB (should be checked, its unclear from slurm.conf) }, }, { From 6e429af96f2297f5f881ca178360350a99e4a9f8 Mon Sep 17 00:00:00 2001 From: Satish Kamath Date: Wed, 29 May 2024 19:05:35 +0200 Subject: [PATCH 33/55] 1. Scaled memory with the number of tasks per node. 2. Increased time limit to account for tuning that takes longer on large number of cores. 
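The change below scales the requested memory linearly with the number of tasks per node, under the stated assumption of roughly 1 GB of memory per core. A minimal sketch of that scaling, with made-up task counts:

```python
# Illustrative only: per-node memory request growing linearly with tasks per node,
# assuming ~1 GB per task is sufficient for this benchmark (assumption from the commit above)
def mem_request_per_node(num_tasks_per_node, gb_per_task=1):
    return f"{num_tasks_per_node * gb_per_task}GB"

for ntasks in (16, 32, 64, 128):          # hypothetical task counts for a few scales
    print(f"{ntasks:>3} tasks/node -> memory request {mem_request_per_node(ntasks)}")
```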
--- .../testsuite/tests/apps/espresso/espresso.py | 15 ++++++------ .../tests/apps/espresso/src/madelung.py | 23 +++++++------------ 2 files changed, 16 insertions(+), 22 deletions(-) diff --git a/eessi/testsuite/tests/apps/espresso/espresso.py b/eessi/testsuite/tests/apps/espresso/espresso.py index 37f81344..98b0017e 100644 --- a/eessi/testsuite/tests/apps/espresso/espresso.py +++ b/eessi/testsuite/tests/apps/espresso/espresso.py @@ -26,7 +26,7 @@ class EESSI_ESPRESSO_P3M_IONIC_CRYSTALS(rfm.RunOnlyRegressionTest): scale = parameter(SCALES.keys()) valid_prog_environs = ['default'] valid_systems = ['*'] - time_limit = '30m' + time_limit = '180m' # Need to check if QuantumESPRESSO also gets listed. module_name = parameter(find_modules('ESPResSo')) # device type is parameterized for an impending CUDA ESPResSo module. @@ -66,12 +66,6 @@ def set_tag_ci(self): if (self.benchmark_info[0] == 'mpi.ionic_crystals.p3m'): self.tags.add('ionic_crystals_p3m') - - @run_after('init') - def set_mem(self): - """ Setting an extra job option of memory. """ - self.extra_resources = {'memory': {'size': '50GB'}} - @run_after('init') def set_executable_opts(self): """Set executable opts based on device_type parameter""" @@ -89,6 +83,13 @@ def set_num_tasks_per_node(self): for 1 node and 2 node options where the request is for full nodes.""" hooks.assign_tasks_per_compute_unit(self, COMPUTE_UNIT[CPU]) + @run_after('setup') + def set_mem(self): + """ Setting an extra job option of memory. Here the assumption made is that HPC systems will contain at + least 1 GB per core of memory.""" + mem_required_per_node = str(self.num_tasks_per_node * 1) + 'GB' + self.extra_resources = {'memory': {'size': mem_required_per_node}} + @deferrable def assert_completion(self): '''Check completion''' diff --git a/eessi/testsuite/tests/apps/espresso/src/madelung.py b/eessi/testsuite/tests/apps/espresso/src/madelung.py index 628d8eab..2cf6fea0 100644 --- a/eessi/testsuite/tests/apps/espresso/src/madelung.py +++ b/eessi/testsuite/tests/apps/espresso/src/madelung.py @@ -105,6 +105,7 @@ def get_normalized_values_per_ion(system): rtol_forces = 0. rtol_abs_forces = 0. 
# run checks +print("Executing sanity checks...\n") forces = np.copy(system.part.all().f) energy, p_scalar, p_tensor = get_normalized_values_per_ion(system) ref_energy, ref_pressure = get_reference_values_per_ion(base_vector) @@ -115,22 +116,12 @@ def get_normalized_values_per_ion(system): np.testing.assert_allclose(forces, 0., atol=atol_forces, rtol=rtol_forces) np.testing.assert_allclose(np.median(np.abs(forces)), 0., atol=atol_abs_forces, rtol=rtol_abs_forces) - -print("Executing sanity checks...\n") -if (np.all([np.allclose(energy, ref_energy, atol=atol_energy, rtol=rtol_energy), - np.allclose(p_scalar, np.trace(ref_pressure) / 3., - atol=atol_pressure, rtol=rtol_pressure), - np.allclose(p_tensor, ref_pressure, atol=atol_pressure, rtol=rtol_pressure), - np.allclose(forces, 0., atol=atol_forces, rtol=rtol_forces), - np.allclose(np.median(np.abs(forces)), 0., atol=atol_abs_forces, rtol=rtol_abs_forces)])): - print("Final convergence met with tolerances: \n\ +print("Final convergence met with tolerances: \n\ energy: ", atol_energy, "\n\ p_scalar: ", atol_pressure, "\n\ p_tensor: ", atol_pressure, "\n\ forces: ", atol_forces, "\n\ abs_forces: ", atol_abs_forces, "\n") -else: - print("At least one parameter did not meet the tolerance, see the log above.\n") print("Sampling runtime...\n") # sample runtime @@ -142,11 +133,13 @@ def get_normalized_values_per_ion(system): tock = time.time() timings.append((tock - tick) / n_steps) +print("10 steps executed...\n") # write results to file header = '"mode","cores","mpi.x","mpi.y","mpi.z","particles","mean","std"\n' report = f'"{"weak scaling" if args.weak_scaling else "strong scaling"}",{n_cores},{node_grid[0]},{node_grid[1]},{node_grid[2]},{len(system.part)},{np.mean(timings):.3e},{np.std(timings, ddof=1):.3e}\n' print(report) -if pathlib.Path(args.output).is_file(): - header = "" -with open(args.output, "a") as f: - f.write(header + report) + +# if pathlib.Path(args.output).is_file(): +# header = "" +# with open(args.output, "a") as f: +# f.write(header + report) From e4321a8cc4088634983a853539fd91d978f9f44c Mon Sep 17 00:00:00 2001 From: Satish Kamath Date: Wed, 29 May 2024 19:08:43 +0200 Subject: [PATCH 34/55] 1. Increased time limit again to 5 hours for 16 node tests. This is a temporary fix until the mesh size can be fixed based on extrapolation. --- eessi/testsuite/tests/apps/espresso/espresso.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/eessi/testsuite/tests/apps/espresso/espresso.py b/eessi/testsuite/tests/apps/espresso/espresso.py index 98b0017e..d39c4aaa 100644 --- a/eessi/testsuite/tests/apps/espresso/espresso.py +++ b/eessi/testsuite/tests/apps/espresso/espresso.py @@ -26,7 +26,7 @@ class EESSI_ESPRESSO_P3M_IONIC_CRYSTALS(rfm.RunOnlyRegressionTest): scale = parameter(SCALES.keys()) valid_prog_environs = ['default'] valid_systems = ['*'] - time_limit = '180m' + time_limit = '300m' # Need to check if QuantumESPRESSO also gets listed. module_name = parameter(find_modules('ESPResSo')) # device type is parameterized for an impending CUDA ESPResSo module. From c06e32f2d0a692144954b7338b3183253c484b23 Mon Sep 17 00:00:00 2001 From: Satish Kamath Date: Wed, 29 May 2024 19:18:31 +0200 Subject: [PATCH 35/55] Deleting the tar file. 
--- .../apps/espresso/src/scripts_Espresso.tar.gz | Bin 3089 -> 0 bytes
 1 file changed, 0 insertions(+), 0 deletions(-)
 delete mode 100644 eessi/testsuite/tests/apps/espresso/src/scripts_Espresso.tar.gz

diff --git a/eessi/testsuite/tests/apps/espresso/src/scripts_Espresso.tar.gz b/eessi/testsuite/tests/apps/espresso/src/scripts_Espresso.tar.gz
deleted file mode 100644
index 24e2621fec80e082c1830617209e16fa63c8df4e..0000000000000000000000000000000000000000
GIT binary patch
[3089 bytes of base85-encoded binary data omitted]

Date: Tue, 4 Jun 2024 12:50:55 +0200
Subject: [PATCH 36/55] Increasing the pressure tolerance for higher node counts as instructed by Jean-Noel.
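The tolerance relaxed in the diff below acts through NumPy's allclose criterion, |actual - desired| <= atol + rtol * |desired|. A short sketch, with a made-up reference value, of why raising rtol_pressure from 2e-5 to 1e-4 accepts the slightly noisier pressures observed at high node counts:

```python
import numpy as np

ref_pressure = -0.58                    # made-up reference value, for illustration only
measured = ref_pressure * (1 + 5e-5)    # pretend a 5e-5 relative deviation on many nodes
atol = 1e-12

# np.isclose passes when |measured - ref| <= atol + rtol * |ref|
print(np.isclose(measured, ref_pressure, rtol=2e-5, atol=atol))   # False: old tolerance too tight
print(np.isclose(measured, ref_pressure, rtol=1e-4, atol=atol))   # True: relaxed tolerance accepts it
```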
--- eessi/testsuite/tests/apps/espresso/src/madelung.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/eessi/testsuite/tests/apps/espresso/src/madelung.py b/eessi/testsuite/tests/apps/espresso/src/madelung.py index 2cf6fea0..0c848dfc 100644 --- a/eessi/testsuite/tests/apps/espresso/src/madelung.py +++ b/eessi/testsuite/tests/apps/espresso/src/madelung.py @@ -101,7 +101,8 @@ def get_normalized_values_per_ion(system): atol_abs_forces = 2e-6 rtol_energy = 5e-6 -rtol_pressure = 2e-5 +#rtol_pressure = 2e-5 +rtol_pressure = 1e-4 rtol_forces = 0. rtol_abs_forces = 0. # run checks From eea35380407b6b8dfce398831881ab79b7387483 Mon Sep 17 00:00:00 2001 From: Satish Kamath Date: Tue, 4 Jun 2024 14:29:30 +0200 Subject: [PATCH 37/55] Introduced a performance function for weak scaling which is the mean time per step. --- eessi/testsuite/tests/apps/espresso/espresso.py | 4 ++++ eessi/testsuite/tests/apps/espresso/src/madelung.py | 3 +++ 2 files changed, 7 insertions(+) diff --git a/eessi/testsuite/tests/apps/espresso/espresso.py b/eessi/testsuite/tests/apps/espresso/espresso.py index d39c4aaa..aee07353 100644 --- a/eessi/testsuite/tests/apps/espresso/espresso.py +++ b/eessi/testsuite/tests/apps/espresso/espresso.py @@ -111,3 +111,7 @@ def assert_sanity(self): self.assert_convergence(), ]) + @performance_function('s/step') + def perf(self): + return sn.extractsingle(r'^Performance:\s+(?P\S+)', self.stdout, 'perf', float) + diff --git a/eessi/testsuite/tests/apps/espresso/src/madelung.py b/eessi/testsuite/tests/apps/espresso/src/madelung.py index 0c848dfc..7d55bd0d 100644 --- a/eessi/testsuite/tests/apps/espresso/src/madelung.py +++ b/eessi/testsuite/tests/apps/espresso/src/madelung.py @@ -138,8 +138,11 @@ def get_normalized_values_per_ion(system): # write results to file header = '"mode","cores","mpi.x","mpi.y","mpi.z","particles","mean","std"\n' report = f'"{"weak scaling" if args.weak_scaling else "strong scaling"}",{n_cores},{node_grid[0]},{node_grid[1]},{node_grid[2]},{len(system.part)},{np.mean(timings):.3e},{np.std(timings, ddof=1):.3e}\n' +print(header) print(report) +print(f"Performance: {np.mean(timings):.3e} \n") + # if pathlib.Path(args.output).is_file(): # header = "" # with open(args.output, "a") as f: From a2c660faa87d0689360d815b2c9a92eb8ed10d46 Mon Sep 17 00:00:00 2001 From: Satish Kamath Date: Tue, 4 Jun 2024 14:57:32 +0200 Subject: [PATCH 38/55] Trying to make the linter happy. --- eessi/testsuite/tests/apps/espresso/espresso.py | 15 ++++++--------- .../testsuite/tests/apps/espresso/src/madelung.py | 14 +++++++------- 2 files changed, 13 insertions(+), 16 deletions(-) diff --git a/eessi/testsuite/tests/apps/espresso/espresso.py b/eessi/testsuite/tests/apps/espresso/espresso.py index aee07353..9a3a8ecb 100644 --- a/eessi/testsuite/tests/apps/espresso/espresso.py +++ b/eessi/testsuite/tests/apps/espresso/espresso.py @@ -1,11 +1,10 @@ """ -This module tests Espresso in available modules containing substring 'ESPResSo' which is different from Quantum Espresso. -Tests included: +This module tests Espresso in available modules containing substring 'ESPResSo' which is different from Quantum +Espresso. Tests included: - P3M benchmark - Ionic crystals - Weak scaling - - Strong scaling -Weak and strong scaling are options that are needed to be provided tothe script and the system is either scaled based on -number of cores or kept constant. 
+ - Strong scaling Weak and strong scaling are options that are needed to be provided to the script and the system is + either scaled based on number of cores or kept constant. """ import reframe as rfm @@ -14,15 +13,14 @@ from reframe.core.builtins import parameter, run_after # added only to make the linter happy from reframe.utility import reframe -from hpctestlib.microbenchmarks.mpi.osu import osu_benchmark - from eessi.testsuite import hooks, utils from eessi.testsuite.constants import * from eessi.testsuite.utils import find_modules, log @rfm.simple_test class EESSI_ESPRESSO_P3M_IONIC_CRYSTALS(rfm.RunOnlyRegressionTest): - '''''' + + scale = parameter(SCALES.keys()) valid_prog_environs = ['default'] valid_systems = ['*'] @@ -45,7 +43,6 @@ class EESSI_ESPRESSO_P3M_IONIC_CRYSTALS(rfm.RunOnlyRegressionTest): @run_after('init') def run_after_init(self): """hooks to run after init phase""" - # Filter on which scales are supported by the partitions defined in the ReFrame configuration hooks.filter_supported_scales(self) diff --git a/eessi/testsuite/tests/apps/espresso/src/madelung.py b/eessi/testsuite/tests/apps/espresso/src/madelung.py index 7d55bd0d..ce41d61a 100644 --- a/eessi/testsuite/tests/apps/espresso/src/madelung.py +++ b/eessi/testsuite/tests/apps/espresso/src/madelung.py @@ -21,7 +21,6 @@ import espressomd.version import espressomd.electrostatics import argparse -import pathlib import time import numpy as np @@ -45,6 +44,7 @@ help="Strong scaling benchmark (Amdahl's law: constant total work)") args = parser.parse_args() + def get_reference_values_per_ion(base_vector): madelung_constant = -1.74756459463318219 base_tensor = base_vector * np.eye(3) @@ -52,6 +52,7 @@ def get_reference_values_per_ion(base_vector): ref_pressure = madelung_constant * base_tensor / np.trace(base_tensor) return ref_energy, ref_pressure + def get_normalized_values_per_ion(system): energy = system.analysis.energy()["coulomb"] p_scalar = system.analysis.pressure()["coulomb"] @@ -60,6 +61,7 @@ def get_normalized_values_per_ion(system): V = system.volume() return 2. * energy / N, 2. * p_scalar * V / N, 2. * p_tensor * V / N + # initialize system system = espressomd.System(box_l=[100., 100., 100.]) system.time_step = 0.01 @@ -96,12 +98,15 @@ def get_normalized_values_per_ion(system): print("Algorithm executed. \n") +# Old rtol_pressure = 2e-5 +# This resulted in failures especially at high number of nodes therefore increased +# to a larger value. + atol_energy = atol_pressure = 1e-12 atol_forces = 1e-5 atol_abs_forces = 2e-6 rtol_energy = 5e-6 -#rtol_pressure = 2e-5 rtol_pressure = 1e-4 rtol_forces = 0. rtol_abs_forces = 0. @@ -142,8 +147,3 @@ def get_normalized_values_per_ion(system): print(report) print(f"Performance: {np.mean(timings):.3e} \n") - -# if pathlib.Path(args.output).is_file(): -# header = "" -# with open(args.output, "a") as f: -# f.write(header + report) From cbe7fe9865bdeaa142a7e4cc37d38d4dacf3ab45 Mon Sep 17 00:00:00 2001 From: Satish Kamath Date: Tue, 4 Jun 2024 15:00:39 +0200 Subject: [PATCH 39/55] Linter changes. 
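For context: the performance function added in patch 37 above only works because madelung.py now prints a "Performance: ..." line that the regular expression can pick up. A self-contained sketch of that pairing using plain `re` (ReFrame's `sn.extractsingle` applies the same pattern to the job's stdout):

```python
import re

stdout = "10 steps executed...\nPerformance: 4.123e-02 \n"   # made-up timing value

# Same pattern as the test's performance_function
match = re.search(r'^Performance:\s+(?P<perf>\S+)', stdout, re.MULTILINE)
if match:
    time_per_step = float(match.group('perf'))
    print(f"{time_per_step:.3e} s/step")    # -> 4.123e-02 s/step
```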
--- eessi/testsuite/tests/apps/espresso/espresso.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/eessi/testsuite/tests/apps/espresso/espresso.py b/eessi/testsuite/tests/apps/espresso/espresso.py index 9a3a8ecb..9fbccf58 100644 --- a/eessi/testsuite/tests/apps/espresso/espresso.py +++ b/eessi/testsuite/tests/apps/espresso/espresso.py @@ -17,10 +17,10 @@ from eessi.testsuite.constants import * from eessi.testsuite.utils import find_modules, log + @rfm.simple_test class EESSI_ESPRESSO_P3M_IONIC_CRYSTALS(rfm.RunOnlyRegressionTest): - scale = parameter(SCALES.keys()) valid_prog_environs = ['default'] valid_systems = ['*'] @@ -39,7 +39,6 @@ class EESSI_ESPRESSO_P3M_IONIC_CRYSTALS(rfm.RunOnlyRegressionTest): ('mpi.ionic_crystals.p3m', 'p3m'), ], fmt=lambda x: x[0], loggable=True) - @run_after('init') def run_after_init(self): """hooks to run after init phase""" @@ -111,4 +110,3 @@ def assert_sanity(self): @performance_function('s/step') def perf(self): return sn.extractsingle(r'^Performance:\s+(?P\S+)', self.stdout, 'perf', float) - From f434c48a68512d9a8f3fcc5ff786d21f76957c83 Mon Sep 17 00:00:00 2001 From: Satish Kamath Date: Wed, 5 Jun 2024 00:55:03 +0200 Subject: [PATCH 40/55] Removed plot.py as it served no purpose and improved formatting in madelung.py for the linter. --- .../tests/apps/espresso/src/madelung.py | 4 +- .../testsuite/tests/apps/espresso/src/plot.py | 39 ------------------- 2 files changed, 3 insertions(+), 40 deletions(-) delete mode 100644 eessi/testsuite/tests/apps/espresso/src/plot.py diff --git a/eessi/testsuite/tests/apps/espresso/src/madelung.py b/eessi/testsuite/tests/apps/espresso/src/madelung.py index ce41d61a..1c019e29 100644 --- a/eessi/testsuite/tests/apps/espresso/src/madelung.py +++ b/eessi/testsuite/tests/apps/espresso/src/madelung.py @@ -142,7 +142,9 @@ def get_normalized_values_per_ion(system): print("10 steps executed...\n") # write results to file header = '"mode","cores","mpi.x","mpi.y","mpi.z","particles","mean","std"\n' -report = f'"{"weak scaling" if args.weak_scaling else "strong scaling"}",{n_cores},{node_grid[0]},{node_grid[1]},{node_grid[2]},{len(system.part)},{np.mean(timings):.3e},{np.std(timings, ddof=1):.3e}\n' +report = f'''"{"weak scaling" if args.weak_scaling else "strong scaling"}",\ +{n_cores},{node_grid[0]},{node_grid[1]},{node_grid[2]},{len(system.part)},\ +{np.mean(timings):.3e},{np.std(timings,ddof=1):.3e}\n''' print(header) print(report) diff --git a/eessi/testsuite/tests/apps/espresso/src/plot.py b/eessi/testsuite/tests/apps/espresso/src/plot.py deleted file mode 100644 index c9a023c4..00000000 --- a/eessi/testsuite/tests/apps/espresso/src/plot.py +++ /dev/null @@ -1,39 +0,0 @@ -import numpy as np -import pandas as pd -import matplotlib.pyplot as plt -import matplotlib.ticker as mtick - -df = pd.read_csv("benchmarks.csv") -df = df.sort_values(by=["mode", "cores", "mpi.x", "mpi.y", "mpi.z"]) - -group = df.query(f"mode == 'strong scaling'") - -fig = plt.figure(figsize=(12, 6)) -ax = fig.subplots().axes -xdata = group["cores"].to_numpy() -ydata = group["mean"].to_numpy() -ax.axline((xdata[0], xdata[0]), slope=1, linestyle="--", color="grey", label="Theoretical maximum") -ax.plot(xdata, ydata[0] / ydata, "o-", label="Measurements") -ax.set_title("Strong scaling") -ax.set_xlabel("Number of cores") -ax.set_ylabel("Speed-up") -ax.set_xscale("log", base=2) -ax.set_yscale("log", base=10) -ax.legend() -plt.show() - -group = df.query(f"mode == 'weak scaling'") - -fig = plt.figure(figsize=(12, 6)) -ax = fig.subplots().axes 
-xdata = group["cores"].to_numpy() -ydata = group["mean"].to_numpy() -ax.axline((-np.inf, 1), slope=0, linestyle="--", color="grey", label="Theoretical maximum") -ax.plot(xdata, ydata[0] / ydata, "o-", label="Measurements") -ax.set_title("Weak scaling") -ax.set_xlabel("Number of cores") -ax.set_ylabel("Efficiency") -ax.set_xscale("log", base=2) -ax.yaxis.set_major_formatter(mtick.PercentFormatter(1)) -ax.legend() -plt.show() From bd04f8340bacb434cf2db908d17091e1c74f93d8 Mon Sep 17 00:00:00 2001 From: Satish Kamath Date: Wed, 5 Jun 2024 01:02:14 +0200 Subject: [PATCH 41/55] Making linter happy again. --- eessi/testsuite/tests/apps/espresso/src/madelung.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/eessi/testsuite/tests/apps/espresso/src/madelung.py b/eessi/testsuite/tests/apps/espresso/src/madelung.py index 1c019e29..37d0b44a 100644 --- a/eessi/testsuite/tests/apps/espresso/src/madelung.py +++ b/eessi/testsuite/tests/apps/espresso/src/madelung.py @@ -79,11 +79,11 @@ def get_normalized_values_per_ion(system): if args.weak_scaling: lattice_size = np.multiply(lattice_size, node_grid) system.box_l = np.multiply(lattice_size, base_vector) -for j in range(lattice_size[0]): - for k in range(lattice_size[1]): - for l in range(lattice_size[2]): - _ = system.part.add(pos=np.multiply([j, k, l], base_vector), - q=(-1.)**(j + k + l), fix=3 * [True]) +for var_j in range(lattice_size[0]): + for var_k in range(lattice_size[1]): + for var_l in range(lattice_size[2]): + _ = system.part.add(pos=np.multiply([var_j, var_k, var_l], base_vector), + q=(-1.)**(var_j + var_k + var_l), fix=3 * [True]) # setup P3M algorithm algorithm = espressomd.electrostatics.P3M From ef21ed5259c787b3993732df78e9d4c81b042f56 Mon Sep 17 00:00:00 2001 From: Satish Kamath Date: Thu, 6 Jun 2024 23:59:19 +0200 Subject: [PATCH 42/55] Using mem_required_per_node from the hooks. Tested on Snellius and it works properly. --- eessi/testsuite/tests/apps/espresso/espresso.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/eessi/testsuite/tests/apps/espresso/espresso.py b/eessi/testsuite/tests/apps/espresso/espresso.py index 9fbccf58..2fbb5ce3 100644 --- a/eessi/testsuite/tests/apps/espresso/espresso.py +++ b/eessi/testsuite/tests/apps/espresso/espresso.py @@ -83,8 +83,9 @@ def set_num_tasks_per_node(self): def set_mem(self): """ Setting an extra job option of memory. Here the assumption made is that HPC systems will contain at least 1 GB per core of memory.""" - mem_required_per_node = str(self.num_tasks_per_node * 1) + 'GB' - self.extra_resources = {'memory': {'size': mem_required_per_node}} + mem_required_per_node = self.num_tasks_per_node * 0.9 + hooks.req_memory_per_node(test=self, app_mem_req=mem_required_per_node) + @deferrable def assert_completion(self): From 140b9e424e81694f22207303eda7e54467d932b3 Mon Sep 17 00:00:00 2001 From: Satish Kamath Date: Fri, 7 Jun 2024 00:03:31 +0200 Subject: [PATCH 43/55] Making the linter happy. 
--- eessi/testsuite/tests/apps/espresso/espresso.py | 1 - 1 file changed, 1 deletion(-) diff --git a/eessi/testsuite/tests/apps/espresso/espresso.py b/eessi/testsuite/tests/apps/espresso/espresso.py index 2fbb5ce3..7db09ff9 100644 --- a/eessi/testsuite/tests/apps/espresso/espresso.py +++ b/eessi/testsuite/tests/apps/espresso/espresso.py @@ -86,7 +86,6 @@ def set_mem(self): mem_required_per_node = self.num_tasks_per_node * 0.9 hooks.req_memory_per_node(test=self, app_mem_req=mem_required_per_node) - @deferrable def assert_completion(self): '''Check completion''' From dcd268810440fc458cab43f538faf8ecd473cea4 Mon Sep 17 00:00:00 2001 From: vsc46128 vscuser Date: Sat, 8 Jun 2024 11:47:24 +0200 Subject: [PATCH 44/55] fix memory per node --- config/vsc_hortense.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/config/vsc_hortense.py b/config/vsc_hortense.py index f349bf60..1615330b 100644 --- a/config/vsc_hortense.py +++ b/config/vsc_hortense.py @@ -59,7 +59,7 @@ def command(self, job): 'extras': { # Make sure to round down, otherwise a job might ask for more mem than is available # per node - 'mem_per_node': 256.000 # in GiB (should be checked, its unclear from slurm.conf) + 'mem_per_node': 234 # in GiB }, }, { @@ -91,7 +91,7 @@ def command(self, job): 'extras': { # Make sure to round down, otherwise a job might ask for more mem than is available # per node - 'mem_per_node': 511.983 # in GiB + 'mem_per_node': 473 # in GiB }, }, { @@ -123,7 +123,7 @@ def command(self, job): 'extras': { # Make sure to round down, otherwise a job might ask for more mem than is available # per node - 'mem_per_node': 256.000 # in GiB (should be checked, its unclear from slurm.conf) + 'mem_per_node': 234 # in GiB }, }, { @@ -150,7 +150,7 @@ def command(self, job): GPU_VENDOR: GPU_VENDORS[NVIDIA], # Make sure to round down, otherwise a job might ask for more mem than is available # per node - 'mem_per_node': 256.000 # in GiB + 'mem_per_node': 236 # in GiB }, 'resources': [ { @@ -194,7 +194,7 @@ def command(self, job): GPU_VENDOR: GPU_VENDORS[NVIDIA], # Make sure to round down, otherwise a job might ask for more mem than is available # per node - 'mem_per_node': 511.983 # in GiB + 'mem_per_node': 475 # in GiB }, 'resources': [ { From 889e518deacb39efe8b0f94007c2477c14a6d0d6 Mon Sep 17 00:00:00 2001 From: Samuel Moors Date: Sun, 9 Jun 2024 10:23:00 +0200 Subject: [PATCH 45/55] use MiB units for slurm --- eessi/testsuite/hooks.py | 43 +++++++++++++++++++--------------------- 1 file changed, 20 insertions(+), 23 deletions(-) diff --git a/eessi/testsuite/hooks.py b/eessi/testsuite/hooks.py index 5dd98a7f..c40613e4 100644 --- a/eessi/testsuite/hooks.py +++ b/eessi/testsuite/hooks.py @@ -383,7 +383,7 @@ def filter_valid_systems_by_device_type(test: rfm.RegressionTest, required_devic log(f'valid_systems set to {test.valid_systems}') -def req_memory_per_node(test: rfm.RegressionTest, app_mem_req): +def req_memory_per_node(test: rfm.RegressionTest, app_mem_req: int): """ This hook will request a specific amount of memory per node to the batch scheduler. 
First, it computes which fraction of CPUs is requested from a node, and how much the corresponding (proportional) @@ -396,59 +396,56 @@ def req_memory_per_node(test: rfm.RegressionTest, app_mem_req): Arguments: - test: the ReFrame test to which this hook should apply - - app_mem_req: the amount of memory this application needs (per node) in GiB + - app_mem_req: the amount of memory this application needs (per node) in MiB Example 1: - A system with 128 cores and 64 GiB per node. - The test is launched on 64 cores - - The app_mem_req is 40 (GiB) - In this case, the test requests 50% of the CPUs. Thus, the proportional amount of memory is 32 GiB. - The app_mem_req is higher. Thus, 40GiB (per node) is requested from the batch scheduler. + - The app_mem_req is 40,000 (MiB) + In this case, the test requests 50% of the CPUs. Thus, the proportional amount of memory is 32,000 MiB. + The app_mem_req is higher. Thus, 40,000 MiB (per node) is requested from the batch scheduler. Example 2: - - A system with 128 cores per node, 128 GiB mem per node is used. + - A system with 128 cores per node, 128,000 MiB mem per node is used. - The test is launched on 64 cores - - the app_mem_req is 40 (GiB) - In this case, the test requests 50% of the CPUs. Thus, the proportional amount of memory is 64 GiB. - This is higher than the app_mem_req. Thus, 64 GiB (per node) is requested from the batch scheduler. + - the app_mem_req is 40,000 (MiB) + In this case, the test requests 50% of the CPUs. Thus, the proportional amount of memory is 64,000 MiB. + This is higher than the app_mem_req. Thus, 64,000 MiB (per node) is requested from the batch scheduler. """ # Check that the systems.partitions.extra dict in the ReFrame config contains mem_per_node check_extras_key_defined(test, 'mem_per_node') # Skip if the current partition doesn't have sufficient memory to run the application - msg = f"Skipping test: nodes in this partition only have {test.current_partition.extras['mem_per_node']} GiB" + msg = f"Skipping test: nodes in this partition only have {test.current_partition.extras['mem_per_node']} MiB" msg += " memory available (per node) accodring to the current ReFrame configuration," - msg += f" but {app_mem_req} GiB is needed" + msg += f" but {app_mem_req} MiB is needed" test.skip_if(test.current_partition.extras['mem_per_node'] < app_mem_req, msg) # Compute what is higher: the requested memory, or the memory available proportional to requested CPUs # Fraction of CPU cores requested check_proc_attribute_defined(test, 'num_cpus') cpu_fraction = test.num_tasks_per_node * test.num_cpus_per_task / test.current_partition.processor.num_cpus - proportional_mem = cpu_fraction * test.current_partition.extras['mem_per_node'] + proportional_mem = math.floor(cpu_fraction * test.current_partition.extras['mem_per_node']) scheduler_name = test.current_partition.scheduler.registered_name if scheduler_name == 'slurm' or scheduler_name == 'squeue': - # SLURMs --mem defines memory per node, see https://slurm.schedmd.com/sbatch.html - # SLURM uses megabytes and gigabytes, i.e. 
base-10, so conversion is 1000, not 1024 - # Thus, we convert from GiB (gibibytes) to MB (megabytes) (1024 * 1024 * 1024 / (1000 * 1000) = 1073.741824) - app_mem_req = math.ceil(1073.741824 * app_mem_req) - log(f"Memory requested by application: {app_mem_req} MB") - proportional_mem = math.floor(1073.741824 * proportional_mem) - log(f"Memory proportional to the core count: {proportional_mem} MB") + # SLURM defines --mem as memory per node, see https://slurm.schedmd.com/sbatch.html + # SLURM uses MiB units by default + log(f"Memory requested by application: {app_mem_req} MiB") + log(f"Memory proportional to the core count: {proportional_mem} MiB") # Request the maximum of the proportional_mem, and app_mem_req to the scheduler req_mem_per_node = max(proportional_mem, app_mem_req) test.extra_resources = {'memory': {'size': f'{req_mem_per_node}M'}} - log(f"Requested {req_mem_per_node} MB per node from the SLURM batch scheduler") + log(f"Requested {req_mem_per_node} MiB per node from the SLURM batch scheduler") elif scheduler_name == 'torque': # Torque/moab requires asking for --pmem (--mem only works single node and thus doesnt generalize) # See https://docs.adaptivecomputing.com/10-0-1/Torque/torque.htm#topics/torque/3-jobs/3.1.3-requestingRes.htm - # Units are MiB according to the documentation, thus, we simply multiply with 1024 + # Units are MiB according to the documentation # We immediately divide by num_tasks_per_node (before rounding), since -pmem specifies memroy _per process_ - app_mem_req_task = math.ceil(1024 * app_mem_req / test.num_tasks_per_node) - proportional_mem_task = math.floor(1024 * proportional_mem / test.num_tasks_per_node) + app_mem_req_task = math.ceil(app_mem_req / test.num_tasks_per_node) + proportional_mem_task = math.floor(proportional_mem / test.num_tasks_per_node) # Request the maximum of the proportional_mem, and app_mem_req to the scheduler req_mem_per_task = max(proportional_mem_task, app_mem_req_task) From 0bacba813ee58330597e7ff871c02c17d49ca190 Mon Sep 17 00:00:00 2001 From: Samuel Moors Date: Sun, 9 Jun 2024 12:12:57 +0200 Subject: [PATCH 46/55] use MiB units for memory per node --- eessi/testsuite/hooks.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/eessi/testsuite/hooks.py b/eessi/testsuite/hooks.py index c40613e4..d6829d08 100644 --- a/eessi/testsuite/hooks.py +++ b/eessi/testsuite/hooks.py @@ -383,7 +383,7 @@ def filter_valid_systems_by_device_type(test: rfm.RegressionTest, required_devic log(f'valid_systems set to {test.valid_systems}') -def req_memory_per_node(test: rfm.RegressionTest, app_mem_req: int): +def req_memory_per_node(test: rfm.RegressionTest, app_mem_req: float): """ This hook will request a specific amount of memory per node to the batch scheduler. 
First, it computes which fraction of CPUs is requested from a node, and how much the corresponding (proportional) @@ -425,6 +425,7 @@ def req_memory_per_node(test: rfm.RegressionTest, app_mem_req: int): check_proc_attribute_defined(test, 'num_cpus') cpu_fraction = test.num_tasks_per_node * test.num_cpus_per_task / test.current_partition.processor.num_cpus proportional_mem = math.floor(cpu_fraction * test.current_partition.extras['mem_per_node']) + app_mem_req = math.ceil(app_mem_req) scheduler_name = test.current_partition.scheduler.registered_name if scheduler_name == 'slurm' or scheduler_name == 'squeue': From e5234583ec01cda67927471554753a9b5043f20d Mon Sep 17 00:00:00 2001 From: Satish Kamath Date: Mon, 10 Jun 2024 16:25:55 +0200 Subject: [PATCH 47/55] Removing 16 node test case for now since it takes way too long and have dialing down the scales within the CI tests since they should not take too much time. --- eessi/testsuite/tests/apps/espresso/espresso.py | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) diff --git a/eessi/testsuite/tests/apps/espresso/espresso.py b/eessi/testsuite/tests/apps/espresso/espresso.py index 7db09ff9..7213ee6c 100644 --- a/eessi/testsuite/tests/apps/espresso/espresso.py +++ b/eessi/testsuite/tests/apps/espresso/espresso.py @@ -17,11 +17,23 @@ from eessi.testsuite.constants import * from eessi.testsuite.utils import find_modules, log +def filter_scales_P3M(): + """ + Filtering function for filtering scales for P3M test. + This is currently required because the 16 node test takes way too long and always fails due to time limit. + Once a solution to mesh tuning algorithm is found, where we can specify the mesh sizes for a particular scale, + this function can be removed. + """ + return [ + k for (k, v) in SCALES.items() + if v['num_nodes'] != 16 + ] + @rfm.simple_test class EESSI_ESPRESSO_P3M_IONIC_CRYSTALS(rfm.RunOnlyRegressionTest): - scale = parameter(SCALES.keys()) + scale = parameter(filter_scales_P3M()) valid_prog_environs = ['default'] valid_systems = ['*'] time_limit = '300m' @@ -55,7 +67,8 @@ def run_after_init(self): @run_after('init') def set_tag_ci(self): """ Setting tests under CI tag. """ - if (self.benchmark_info[0] in ['mpi.ionic_crystals.p3m']): + if (self.benchmark_info[0] in ['mpi.ionic_crystals.p3m'] + and SCALES[self.scale]['num_nodes'] < 2): self.tags.add('CI') log(f'tags set to {self.tags}') From c5e02458a45e39047dd5a81c7ad7b1a20304a139 Mon Sep 17 00:00:00 2001 From: Satish Kamath Date: Mon, 10 Jun 2024 16:32:29 +0200 Subject: [PATCH 48/55] Trying to make the linter happy. --- eessi/testsuite/tests/apps/espresso/espresso.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/eessi/testsuite/tests/apps/espresso/espresso.py b/eessi/testsuite/tests/apps/espresso/espresso.py index 7213ee6c..5366fe5b 100644 --- a/eessi/testsuite/tests/apps/espresso/espresso.py +++ b/eessi/testsuite/tests/apps/espresso/espresso.py @@ -17,6 +17,7 @@ from eessi.testsuite.constants import * from eessi.testsuite.utils import find_modules, log + def filter_scales_P3M(): """ Filtering function for filtering scales for P3M test. @@ -67,8 +68,8 @@ def run_after_init(self): @run_after('init') def set_tag_ci(self): """ Setting tests under CI tag. 
""" - if (self.benchmark_info[0] in ['mpi.ionic_crystals.p3m'] - and SCALES[self.scale]['num_nodes'] < 2): + if (self.benchmark_info[0] in ['mpi.ionic_crystals.p3m'] and + SCALES[self.scale]['num_nodes'] < 2): self.tags.add('CI') log(f'tags set to {self.tags}') From df8873c69fcc4f7ff47b486b8fd8ebfc2fa9e9f0 Mon Sep 17 00:00:00 2001 From: Satish Kamath Date: Mon, 10 Jun 2024 16:36:04 +0200 Subject: [PATCH 49/55] Making the linter happy. --- eessi/testsuite/tests/apps/espresso/espresso.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/eessi/testsuite/tests/apps/espresso/espresso.py b/eessi/testsuite/tests/apps/espresso/espresso.py index 5366fe5b..a1675afd 100644 --- a/eessi/testsuite/tests/apps/espresso/espresso.py +++ b/eessi/testsuite/tests/apps/espresso/espresso.py @@ -68,8 +68,7 @@ def run_after_init(self): @run_after('init') def set_tag_ci(self): """ Setting tests under CI tag. """ - if (self.benchmark_info[0] in ['mpi.ionic_crystals.p3m'] and - SCALES[self.scale]['num_nodes'] < 2): + if (self.benchmark_info[0] in ['mpi.ionic_crystals.p3m'] and SCALES[self.scale]['num_nodes'] < 2): self.tags.add('CI') log(f'tags set to {self.tags}') From aebfdc189c5fa6104bc6a0e588d2914a32b9eab0 Mon Sep 17 00:00:00 2001 From: Satish Kamath Date: Tue, 11 Jun 2024 16:55:28 +0200 Subject: [PATCH 50/55] Removing files that are not relevant: job.sh and benchmark.csv and removing the statement from madelung that puts benchmark.csv as path within the output parameter. --- .../tests/apps/espresso/benchmarks.csv | 34 ------------------- .../testsuite/tests/apps/espresso/src/job.sh | 10 ------ .../tests/apps/espresso/src/madelung.py | 3 -- 3 files changed, 47 deletions(-) delete mode 100644 eessi/testsuite/tests/apps/espresso/benchmarks.csv delete mode 100644 eessi/testsuite/tests/apps/espresso/src/job.sh diff --git a/eessi/testsuite/tests/apps/espresso/benchmarks.csv b/eessi/testsuite/tests/apps/espresso/benchmarks.csv deleted file mode 100644 index 9091534b..00000000 --- a/eessi/testsuite/tests/apps/espresso/benchmarks.csv +++ /dev/null @@ -1,34 +0,0 @@ -"mode","cores","mpi.x","mpi.y","mpi.z","particles","mean","std" -"weak scaling",4,2,2,1,6912,2.341e-01,8.081e-03 -"strong scaling",4,2,2,1,5832,2.496e-01,9.019e-03 -"weak scaling",16,4,2,2,27648,2.417e+00,9.576e-02 -"strong scaling",16,4,2,2,5832,3.853e-02,1.991e-03 -"weak scaling",32,4,4,2,55296,4.263e+00,1.161e+00 -"strong scaling",32,4,4,2,5832,2.194e-02,7.303e-04 -"weak scaling",1,1,1,1,1728,7.655e-02,3.434e-03 -"weak scaling",2,2,1,1,3456,1.456e-01,4.679e-03 -"strong scaling",2,2,1,1,5832,3.936e-01,1.098e-02 -"strong scaling",1,1,1,1,5832,6.333e-01,1.194e-01 -"strong scaling",64,4,4,4,5832,1.910e-02,6.132e-04 -"weak scaling",1,1,1,1,1728,9.482e-02,2.956e-03 -"weak scaling",2,2,1,1,3456,2.111e-01,6.614e-03 -"strong scaling",1,1,1,1,5832,9.133e-01,2.868e-02 -"strong scaling",16,4,2,2,5832,4.285e-02,1.327e-03 -"strong scaling",64,4,4,4,5832,1.715e-02,5.776e-04 -"strong scaling",128,8,4,4,5832,1.980e-02,7.013e-04 -"weak scaling",64,4,4,4,110592,4.375e-01,1.414e-02 -"weak scaling",100,5,5,4,172800,4.450e-01,1.437e-02 -"weak scaling",128,8,4,4,221184,8.720e+00,2.753e-01 -"weak scaling",128,8,4,4,221184,8.760e+00,3.110e-01 -"weak scaling",4,2,2,1,6912,2.626e-01,8.142e-03 -"weak scaling",4,2,2,1,6912,2.780e-01,8.683e-03 -"weak scaling",4,2,2,1,6912,2.627e-01,8.391e-03 -"weak scaling",4,2,2,1,6912,2.617e-01,8.155e-03 -"weak scaling",2,2,1,1,3456,2.028e-01,6.255e-03 -"weak scaling",2,2,1,1,3456,3.247e-01,1.026e-02 -"weak 
scaling",2,2,1,1,3456,3.249e-01,1.029e-02 -"weak scaling",2,2,1,1,3456,3.257e-01,1.028e-02 -"weak scaling",2,2,1,1,3456,3.375e-01,1.095e-02 -"weak scaling",2,2,1,1,3456,3.367e-01,1.086e-02 -"weak scaling",2,2,1,1,3456,3.241e-01,1.048e-02 -"weak scaling",2,2,1,1,3456,3.243e-01,1.038e-02 diff --git a/eessi/testsuite/tests/apps/espresso/src/job.sh b/eessi/testsuite/tests/apps/espresso/src/job.sh deleted file mode 100644 index 17399c52..00000000 --- a/eessi/testsuite/tests/apps/espresso/src/job.sh +++ /dev/null @@ -1,10 +0,0 @@ -#!/bin/bash -#SBATCH --time=00:40:00 -#SBATCH --output %j.stdout -#SBATCH --error %j.stderr -module load spack/default gcc/12.3.0 cuda/12.3.0 openmpi/4.1.6 \ - fftw/3.3.10 boost/1.83.0 python/3.12.1 -source ../espresso-4.3/venv/bin/activate -srun --cpu-bind=cores python3 madelung.py --size 6 --weak-scaling -srun --cpu-bind=cores python3 madelung.py --size 9 --strong-scaling -deactivate diff --git a/eessi/testsuite/tests/apps/espresso/src/madelung.py b/eessi/testsuite/tests/apps/espresso/src/madelung.py index 37d0b44a..3f73b5d5 100644 --- a/eessi/testsuite/tests/apps/espresso/src/madelung.py +++ b/eessi/testsuite/tests/apps/espresso/src/madelung.py @@ -34,9 +34,6 @@ default=False, required=False, help="Use GPU implementation") parser.add_argument("--topology", metavar=("X", "Y", "Z"), nargs=3, action="store", default=None, required=False, type=int, help="Cartesian topology") -parser.add_argument("--output", metavar="FILEPATH", action="store", - type=str, required=False, default="benchmarks.csv", - help="Output file (default: benchmarks.csv)") group = parser.add_mutually_exclusive_group() group.add_argument("--weak-scaling", action="store_true", help="Weak scaling benchmark (Gustafson's law: constant work per core)") From 8ccec25cb86379014f033dc7f62b38b72dcd599f Mon Sep 17 00:00:00 2001 From: Samuel Moors Date: Tue, 11 Jun 2024 20:57:13 +0200 Subject: [PATCH 51/55] also fix config files and QE hook --- config/github_actions.py | 2 +- config/it4i_karolina.py | 2 +- config/izum_vega.py | 4 ++-- config/surf_snellius.py | 6 +++--- config/vsc_hortense.py | 10 +++++----- eessi/testsuite/tests/apps/QuantumESPRESSO.py | 2 +- 6 files changed, 13 insertions(+), 13 deletions(-) diff --git a/config/github_actions.py b/config/github_actions.py index b2196b6b..7ed97422 100644 --- a/config/github_actions.py +++ b/config/github_actions.py @@ -33,7 +33,7 @@ # Make sure to round down, otherwise a job might ask for more mem than is available # per node # This is a fictional amount, GH actions probably has less, but only does --dry-run - 'mem_per_node': 30 # in GiB + 'mem_per_node': 30 * 1024 # in MiB }, } ] diff --git a/config/it4i_karolina.py b/config/it4i_karolina.py index 2bdfa035..4904bf1d 100644 --- a/config/it4i_karolina.py +++ b/config/it4i_karolina.py @@ -62,7 +62,7 @@ 'extras': { # Make sure to round down, otherwise a job might ask for more mem than is available # per node - 'mem_per_node': 219.345 # in GiB + 'mem_per_node': 219.345 * 1024 # in MiB }, 'descr': 'CPU Universal Compute Nodes, see https://docs.it4i.cz/karolina/hardware-overview/' }, diff --git a/config/izum_vega.py b/config/izum_vega.py index f7193aed..e3b53752 100644 --- a/config/izum_vega.py +++ b/config/izum_vega.py @@ -62,7 +62,7 @@ 'extras': { # Make sure to round down, otherwise a job might ask for more mem than is available # per node - 'mem_per_node': 238.418 # in GiB + 'mem_per_node': 238.418 * 1024 # in MiB }, 'descr': 'CPU partition Standard, see https://en-doc.vega.izum.si/architecture/' }, @@ -106,7 
+106,7 @@ # 'extras': { # # Make sure to round down, otherwise a job might ask for more mem than is available # # per node - # 'mem_per_node': 476.837 # in GiB (should be checked, its unclear from slurm.conf) + # 'mem_per_node': 476.837 * 1024 # in MiB (should be checked, its unclear from slurm.conf) # }, # 'descr': 'GPU partition, see https://en-doc.vega.izum.si/architecture/' # }, diff --git a/config/surf_snellius.py b/config/surf_snellius.py index d8bcc36c..c4c0623a 100644 --- a/config/surf_snellius.py +++ b/config/surf_snellius.py @@ -56,7 +56,7 @@ 'extras': { # Make sure to round down, otherwise a job might ask for more mem than is available # per node - 'mem_per_node': 213.623 # in GiB + 'mem_per_node': 213.623 * 1024 # in MiB }, 'descr': 'AMD Rome CPU partition with native EESSI stack' }, @@ -80,7 +80,7 @@ 'extras': { # Make sure to round down, otherwise a job might ask for more mem than is available # per node - 'mem_per_node': 320.434 # in GiB + 'mem_per_node': 320.434 * 1024 # in MiB }, 'descr': 'AMD Genoa CPU partition with native EESSI stack' }, @@ -117,7 +117,7 @@ GPU_VENDOR: GPU_VENDORS[NVIDIA], # Make sure to round down, otherwise a job might ask for more mem than is available # per node - 'mem_per_node': 457.763 # in GiB + 'mem_per_node': 457.763 * 1024 # in MiB }, 'descr': 'Nvidia A100 GPU partition with native EESSI stack' }, diff --git a/config/vsc_hortense.py b/config/vsc_hortense.py index 1615330b..9d52d1c9 100644 --- a/config/vsc_hortense.py +++ b/config/vsc_hortense.py @@ -59,7 +59,7 @@ def command(self, job): 'extras': { # Make sure to round down, otherwise a job might ask for more mem than is available # per node - 'mem_per_node': 234 # in GiB + 'mem_per_node': 234 * 1024 # in MiB }, }, { @@ -91,7 +91,7 @@ def command(self, job): 'extras': { # Make sure to round down, otherwise a job might ask for more mem than is available # per node - 'mem_per_node': 473 # in GiB + 'mem_per_node': 473 * 1024 # in MiB }, }, { @@ -123,7 +123,7 @@ def command(self, job): 'extras': { # Make sure to round down, otherwise a job might ask for more mem than is available # per node - 'mem_per_node': 234 # in GiB + 'mem_per_node': 234 * 1024 # in MiB }, }, { @@ -150,7 +150,7 @@ def command(self, job): GPU_VENDOR: GPU_VENDORS[NVIDIA], # Make sure to round down, otherwise a job might ask for more mem than is available # per node - 'mem_per_node': 236 # in GiB + 'mem_per_node': 236 * 1024 # in MiB }, 'resources': [ { @@ -194,7 +194,7 @@ def command(self, job): GPU_VENDOR: GPU_VENDORS[NVIDIA], # Make sure to round down, otherwise a job might ask for more mem than is available # per node - 'mem_per_node': 475 # in GiB + 'mem_per_node': 475 * 1024 # in MiB }, 'resources': [ { diff --git a/eessi/testsuite/tests/apps/QuantumESPRESSO.py b/eessi/testsuite/tests/apps/QuantumESPRESSO.py index 050e43d3..288354b2 100644 --- a/eessi/testsuite/tests/apps/QuantumESPRESSO.py +++ b/eessi/testsuite/tests/apps/QuantumESPRESSO.py @@ -100,7 +100,7 @@ def run_after_setup(self): @run_after('setup') def request_mem(self): memory_required = self.num_tasks_per_node * 0.9 + 4 - hooks.req_memory_per_node(test=self, app_mem_req=memory_required) + hooks.req_memory_per_node(test=self, app_mem_req=memory_required * 1024) @run_after('setup') def set_omp_num_threads(self): From 13a6312cf5250b9b7d3e6ada7c1310af39002bce Mon Sep 17 00:00:00 2001 From: Samuel Moors Date: Tue, 11 Jun 2024 21:07:18 +0200 Subject: [PATCH 52/55] update hortense config according to slurm.conf --- config/vsc_hortense.py | 10 +++++----- 1 file changed, 
5 insertions(+), 5 deletions(-) diff --git a/config/vsc_hortense.py b/config/vsc_hortense.py index 9d52d1c9..312c72f4 100644 --- a/config/vsc_hortense.py +++ b/config/vsc_hortense.py @@ -59,7 +59,7 @@ def command(self, job): 'extras': { # Make sure to round down, otherwise a job might ask for more mem than is available # per node - 'mem_per_node': 234 * 1024 # in MiB + 'mem_per_node': 252160, # in MiB }, }, { @@ -91,7 +91,7 @@ def command(self, job): 'extras': { # Make sure to round down, otherwise a job might ask for more mem than is available # per node - 'mem_per_node': 473 * 1024 # in MiB + 'mem_per_node': 508160, # in MiB }, }, { @@ -123,7 +123,7 @@ def command(self, job): 'extras': { # Make sure to round down, otherwise a job might ask for more mem than is available # per node - 'mem_per_node': 234 * 1024 # in MiB + 'mem_per_node': 252160, # in MiB }, }, { @@ -150,7 +150,7 @@ def command(self, job): GPU_VENDOR: GPU_VENDORS[NVIDIA], # Make sure to round down, otherwise a job might ask for more mem than is available # per node - 'mem_per_node': 236 * 1024 # in MiB + 'mem_per_node': 254400, # in MiB }, 'resources': [ { @@ -194,7 +194,7 @@ def command(self, job): GPU_VENDOR: GPU_VENDORS[NVIDIA], # Make sure to round down, otherwise a job might ask for more mem than is available # per node - 'mem_per_node': 475 * 1024 # in MiB + 'mem_per_node': 510720, # in MiB }, 'resources': [ { From 79d604b563e60bfb16b7c25b7008f91532d8a7bb Mon Sep 17 00:00:00 2001 From: Samuel Moors Date: Tue, 11 Jun 2024 22:57:23 +0200 Subject: [PATCH 53/55] also update espresso test --- eessi/testsuite/tests/apps/espresso/espresso.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/eessi/testsuite/tests/apps/espresso/espresso.py b/eessi/testsuite/tests/apps/espresso/espresso.py index a1675afd..20ea5e7e 100644 --- a/eessi/testsuite/tests/apps/espresso/espresso.py +++ b/eessi/testsuite/tests/apps/espresso/espresso.py @@ -97,7 +97,7 @@ def set_mem(self): """ Setting an extra job option of memory. Here the assumption made is that HPC systems will contain at least 1 GB per core of memory.""" mem_required_per_node = self.num_tasks_per_node * 0.9 - hooks.req_memory_per_node(test=self, app_mem_req=mem_required_per_node) + hooks.req_memory_per_node(test=self, app_mem_req=mem_required_per_node * 1024) @deferrable def assert_completion(self): From 8075ae9b2f8159f079457ad9a50e85108cf8b60b Mon Sep 17 00:00:00 2001 From: Sam Moors Date: Thu, 13 Jun 2024 13:58:43 +0200 Subject: [PATCH 54/55] fix comment Co-authored-by: Caspar van Leeuwen <33718780+casparvl@users.noreply.github.com> --- eessi/testsuite/hooks.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/eessi/testsuite/hooks.py b/eessi/testsuite/hooks.py index d6829d08..c4d658df 100644 --- a/eessi/testsuite/hooks.py +++ b/eessi/testsuite/hooks.py @@ -399,7 +399,7 @@ def req_memory_per_node(test: rfm.RegressionTest, app_mem_req: float): - app_mem_req: the amount of memory this application needs (per node) in MiB Example 1: - - A system with 128 cores and 64 GiB per node. + - A system with 128 cores and 64,000 MiB per node. - The test is launched on 64 cores - The app_mem_req is 40,000 (MiB) In this case, the test requests 50% of the CPUs. Thus, the proportional amount of memory is 32,000 MiB. 
From 96f6b9d136b5298ca2c1dd1b5846b2935581d467 Mon Sep 17 00:00:00 2001 From: Sam Moors Date: Thu, 13 Jun 2024 14:00:05 +0200 Subject: [PATCH 55/55] fix comment Co-authored-by: Caspar van Leeuwen <33718780+casparvl@users.noreply.github.com> --- eessi/testsuite/hooks.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/eessi/testsuite/hooks.py b/eessi/testsuite/hooks.py index c4d658df..ab711955 100644 --- a/eessi/testsuite/hooks.py +++ b/eessi/testsuite/hooks.py @@ -406,7 +406,7 @@ def req_memory_per_node(test: rfm.RegressionTest, app_mem_req: float): The app_mem_req is higher. Thus, 40,000 MiB (per node) is requested from the batch scheduler. Example 2: - - A system with 128 cores per node, 128,000 MiB mem per node is used. + - A system with 128 cores per node, 128,000 MiB mem per node. - The test is launched on 64 cores - the app_mem_req is 40,000 (MiB) In this case, the test requests 50% of the CPUs. Thus, the proportional amount of memory is 64,000 MiB.
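
Note on the memory-request logic above: the hooks.py changes in the last patches all revolve around computing a per-node memory request in MiB as the maximum of (a) the memory proportional to the fraction of cores used and (b) the application's own requirement. A minimal, standalone Python sketch of that calculation is given below for reference; the function and argument names are illustrative only and are not part of the test suite's API, and the worked values are the ones from the docstring examples in the diffs above.

import math


def mem_to_request_per_node(num_tasks_per_node, num_cpus_per_task,
                            cpus_per_node, mem_per_node_mib, app_mem_req_mib):
    """Return the per-node memory request (in MiB) to pass to the batch scheduler."""
    # Fraction of the node's cores that the test actually uses
    cpu_fraction = num_tasks_per_node * num_cpus_per_task / cpus_per_node
    # Memory proportional to the requested cores, rounded down
    proportional_mem = math.floor(cpu_fraction * mem_per_node_mib)
    # Application requirement, rounded up to whole MiB
    app_mem_req = math.ceil(app_mem_req_mib)
    # Request whichever is larger, matching the documented examples
    return max(proportional_mem, app_mem_req)


# Example 1 from the docstring: 64 of 128 cores on a 64,000 MiB node, app needs 40,000 MiB
# -> the proportional share is 32,000 MiB, so 40,000 MiB is requested
assert mem_to_request_per_node(64, 1, 128, 64000, 40000) == 40000
# Example 2: the same request on a 128,000 MiB node -> the 64,000 MiB proportional share wins
assert mem_to_request_per_node(64, 1, 128, 128000, 40000) == 64000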