From 6352623ff3bb2184df9544b43574fa7b0e422125 Mon Sep 17 00:00:00 2001 From: Samuel Moors Date: Tue, 6 Dec 2022 15:56:04 +0100 Subject: [PATCH 01/29] use reframe features to select valid_systems --- .gitignore | 1 + eessi/reframe/config/settings_example.py | 12 +++-- .../applications/gromacs_check.py | 47 ++++++++++++++----- eessi/reframe/eessi_utils/utils.py | 19 +++++--- 4 files changed, 56 insertions(+), 23 deletions(-) create mode 100644 .gitignore diff --git a/.gitignore b/.gitignore new file mode 100644 index 00000000..c18dd8d8 --- /dev/null +++ b/.gitignore @@ -0,0 +1 @@ +__pycache__/ diff --git a/eessi/reframe/config/settings_example.py b/eessi/reframe/config/settings_example.py index 2c103dae..e8fe9131 100644 --- a/eessi/reframe/config/settings_example.py +++ b/eessi/reframe/config/settings_example.py @@ -25,6 +25,7 @@ 'num_cpus_per_socket': 64, 'arch': 'znver2', }, + 'features': ['cpu'], 'descr': 'CPU partition' }, { @@ -46,11 +47,12 @@ 'num_devices': 4, } ], + 'features': ['gpu'], 'descr': 'GPU partition' }, - ] - }, - ], + ] + }, + ], 'environments': [ { 'name': 'builtin', @@ -58,8 +60,8 @@ 'cxx': '', 'ftn': '', }, - ], - 'logging': [ + ], + 'logging': [ { 'level': 'debug', 'handlers': [ diff --git a/eessi/reframe/eessi-checks/applications/gromacs_check.py b/eessi/reframe/eessi-checks/applications/gromacs_check.py index 5c5fd4df..84466aa7 100644 --- a/eessi/reframe/eessi-checks/applications/gromacs_check.py +++ b/eessi/reframe/eessi-checks/applications/gromacs_check.py @@ -4,12 +4,25 @@ # SPDX-License-Identifier: BSD-3-Clause import reframe as rfm -from reframe.utility import find_modules +import reframe.core.runtime as rt +from reframe.utility import OrderedSet from hpctestlib.sciapps.gromacs.benchmarks import gromacs_check import eessi_utils.hooks as hooks import eessi_utils.utils as utils + +def my_find_modules(substr): + """Return all modules in the current system that contain ``substr`` in their name.""" + if not isinstance(substr, str): + raise TypeError("'substr' argument must be a string") + + ms = rt.runtime().modules_system + modules = OrderedSet(ms.available_modules(substr)) + for m in modules: + yield m + + @rfm.simple_test class GROMACS_EESSI(gromacs_check): @@ -18,7 +31,9 @@ class GROMACS_EESSI(gromacs_check): ('n_small', 2), ('n_medium', 8), ('n_large', 16)]) - module_info = parameter(find_modules('GROMACS', environ_mapping={r'.*': 'builtin'})) + + module_name = parameter(my_find_modules('GROMACS')) + valid_prog_environs = ['builtin'] omp_num_threads = 1 executable_opts += ['-dlb yes', '-ntomp %s' % omp_num_threads, '-npme -1'] @@ -29,11 +44,17 @@ class GROMACS_EESSI(gromacs_check): time_limit = '30m' @run_after('init') - def apply_module_info(self): - self.s, self.e, self.m = self.module_info - self.valid_systems = [self.s] - self.modules = [self.m] - self.valid_prog_environs = [self.e] + def select_valid_systems(self): + cuda = utils.is_cuda_required_module(self.module_name) + if self.nb_impl == 'gpu' and cuda: + valid_systems = '+gpu' + elif self.nb_impl == 'cpu' and not cuda: + valid_systems = '+cpu' + else: + valid_systems = 'nonexisting' + + self.valid_systems = [valid_systems] + self.modules = [self.module_name] @run_after('init') def set_test_scale(self): @@ -54,7 +75,8 @@ def set_test_purpose(self): def skip_nb_impl_gpu_on_cpu_nodes(self): self.skip_if( (self.nb_impl == 'gpu' and not utils.is_gpu_present(self)), - "Skipping test variant with non-bonded interactions on GPUs, as this partition (%s) does not have GPU nodes" % self.current_partition.name + "Skipping test variant with non-bonded interactions on GPUs, " + "as this partition (%s) does not have GPU nodes" % self.current_partition.name ) # Sckip testing when nb_impl=gpu and this is not a GPU build of GROMACS @@ -70,11 +92,12 @@ def skip_nb_impl_gpu_on_non_cuda_builds(self): def skip_gpu_test_on_cpu_nodes(self): hooks.skip_gpu_test_on_cpu_nodes(self) - # Assign num_tasks, num_tasks_per_node and num_cpus_per_task automatically based on current partition's num_cpus and gpus + # Assign num_tasks, num_tasks_per_node and num_cpus_per_task automatically + # based on current partition's num_cpus and gpus # Only when running nb_impl on GPU do we want one task per GPU @run_after('setup') def set_num_tasks(self): - if(self.nb_impl == 'gpu'): - hooks.assign_one_task_per_gpu(test = self, num_nodes = self.num_nodes) + if self.nb_impl == 'gpu': + hooks.assign_one_task_per_gpu(test=self, num_nodes=self.num_nodes) else: - hooks.assign_one_task_per_cpu(test = self, num_nodes = self.num_nodes) + hooks.assign_one_task_per_cpu(test=self, num_nodes=self.num_nodes) diff --git a/eessi/reframe/eessi_utils/utils.py b/eessi/reframe/eessi_utils/utils.py index 8b5cd319..ceee4777 100644 --- a/eessi/reframe/eessi_utils/utils.py +++ b/eessi/reframe/eessi_utils/utils.py @@ -2,12 +2,13 @@ import reframe as rfm - gpu_dev_name = 'gpu' + def _get_gpu_list(test: rfm.RegressionTest): return [ dev.num_devices for dev in test.current_partition.devices if dev.device_type == gpu_dev_name ] + def get_num_gpus(test: rfm.RegressionTest) -> int: '''Returns the number of GPUs for the current partition''' gpu_list = _get_gpu_list(test) @@ -22,14 +23,20 @@ def get_num_gpus(test: rfm.RegressionTest) -> int: return gpu_list[0] + def is_gpu_present(test: rfm.RegressionTest) -> bool: '''Checks if GPUs are present in the current partition''' return ( len(_get_gpu_list(test)) >= 1 ) -def is_cuda_required(test: rfm.RegressionTest) -> bool: - '''Checks if CUDA seems to be required by current module''' + +def is_cuda_required_module(module_name): + '''Checks if CUDA seems to be required by given module''' requires_cuda = False - for module in test.modules: - if re.search("(?i)cuda", module): - requires_cuda = True + if re.search("(?i)cuda", module_name): + requires_cuda = True return requires_cuda + + +def is_cuda_required(test: rfm.RegressionTest) -> bool: + '''Checks if CUDA seems to be required by current module''' + return any([is_cuda_required_module(x) for x in test.modules]) From 476721108b4cbe7ce38d722e6c6c8fa16f70e8c8 Mon Sep 17 00:00:00 2001 From: Samuel Moors Date: Thu, 8 Dec 2022 14:56:54 +0100 Subject: [PATCH 02/29] add support for specifying (a list of) modules --- eessi/reframe/eessi-checks/applications/gromacs_check.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/eessi/reframe/eessi-checks/applications/gromacs_check.py b/eessi/reframe/eessi-checks/applications/gromacs_check.py index 84466aa7..2b824099 100644 --- a/eessi/reframe/eessi-checks/applications/gromacs_check.py +++ b/eessi/reframe/eessi-checks/applications/gromacs_check.py @@ -44,7 +44,7 @@ class GROMACS_EESSI(gromacs_check): time_limit = '30m' @run_after('init') - def select_valid_systems(self): + def fiter_tests(self): cuda = utils.is_cuda_required_module(self.module_name) if self.nb_impl == 'gpu' and cuda: valid_systems = '+gpu' @@ -53,6 +53,11 @@ def select_valid_systems(self): else: valid_systems = 'nonexisting' + # filter out this test if the module is not among a list of specified modules + # modules can be specified with '--setvar modules="" + if self.modules and self.module_name not in self.modules: + valid_systems = 'nonexisting' + self.valid_systems = [valid_systems] self.modules = [self.module_name] From d78e2dc1d2e90c165bdd00ed77ef7d1320bc9aac Mon Sep 17 00:00:00 2001 From: Samuel Moors Date: Thu, 8 Dec 2022 15:53:30 +0100 Subject: [PATCH 03/29] add support for specifying tasks per node --- eessi/reframe/eessi_utils/hooks.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/eessi/reframe/eessi_utils/hooks.py b/eessi/reframe/eessi_utils/hooks.py index da49c8b4..2c94d2d5 100644 --- a/eessi/reframe/eessi_utils/hooks.py +++ b/eessi/reframe/eessi_utils/hooks.py @@ -21,10 +21,14 @@ def skip_gpu_test_on_cpu_nodes(test: rfm.RegressionTest): test.skip_if(True, "Test requires CUDA, but no GPU is present in this partition (%s). Skipping test..." % test.current_partition.name) def assign_one_task_per_cpu(test: rfm.RegressionTest, num_nodes: int) -> rfm.RegressionTest: - '''Sets num_tasks_per_node and num_cpus_per_task such that it will run one task per core''' - if test.current_partition.processor.num_cpus is None: - raise AttributeError(processor_info_missing) - test.num_tasks_per_node = test.current_partition.processor.num_cpus + ''' + Sets num_tasks_per_node and num_cpus_per_task such that it will run one task per core unless specified + (with --setvar num_tasks_per_node=) + ''' + if not test.num_tasks_per_node: + if test.current_partition.processor.num_cpus is None: + raise AttributeError(processor_info_missing) + test.num_tasks_per_node = test.current_partition.processor.num_cpus test.num_cpus_per_task = 1 test.num_tasks = num_nodes * test.num_tasks_per_node From 45531b255ac42bec6e8409b288613e2e5bc03756 Mon Sep 17 00:00:00 2001 From: Samuel Moors Date: Mon, 12 Dec 2022 15:10:45 +0100 Subject: [PATCH 04/29] fix code style for eessi_utils/utils.py --- eessi/reframe/eessi_utils/utils.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/eessi/reframe/eessi_utils/utils.py b/eessi/reframe/eessi_utils/utils.py index ceee4777..7d02b388 100644 --- a/eessi/reframe/eessi_utils/utils.py +++ b/eessi/reframe/eessi_utils/utils.py @@ -2,11 +2,11 @@ import reframe as rfm -gpu_dev_name = 'gpu' +GPU_DEV_NAME = 'gpu' def _get_gpu_list(test: rfm.RegressionTest): - return [ dev.num_devices for dev in test.current_partition.devices if dev.device_type == gpu_dev_name ] + return [dev.num_devices for dev in test.current_partition.devices if dev.device_type == GPU_DEV_NAME] def get_num_gpus(test: rfm.RegressionTest) -> int: @@ -14,9 +14,9 @@ def get_num_gpus(test: rfm.RegressionTest) -> int: gpu_list = _get_gpu_list(test) # If multiple devices are called 'GPU' in the current partition, # we don't know for which to return the device count... - if(len(gpu_list) != 1): - raise ValueError(f"Multiple different devices exist with the name " - f"'{gpu_dev_name}' for partition '{test.current_partition.name}'. " + if len(gpu_list) != 1: + raise ValueError(f"Multiple different devices exist with the name " + f"'{GPU_DEV_NAME}' for partition '{test.current_partition.name}'. " f"Cannot determine number of GPUs available for the test. " f"Please check the definition of partition '{test.current_partition.name}' " f"in your ReFrame config file.") @@ -26,7 +26,7 @@ def get_num_gpus(test: rfm.RegressionTest) -> int: def is_gpu_present(test: rfm.RegressionTest) -> bool: '''Checks if GPUs are present in the current partition''' - return ( len(_get_gpu_list(test)) >= 1 ) + return len(_get_gpu_list(test)) >= 1 def is_cuda_required_module(module_name): From 1cac4d4e959b7aba87b00ce1a05ba6f2d5bd4d8b Mon Sep 17 00:00:00 2001 From: Samuel Moors Date: Mon, 12 Dec 2022 15:12:45 +0100 Subject: [PATCH 05/29] add support for specifying valid_systems --- eessi/reframe/eessi-checks/applications/gromacs_check.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/eessi/reframe/eessi-checks/applications/gromacs_check.py b/eessi/reframe/eessi-checks/applications/gromacs_check.py index 2b824099..648d3eb2 100644 --- a/eessi/reframe/eessi-checks/applications/gromacs_check.py +++ b/eessi/reframe/eessi-checks/applications/gromacs_check.py @@ -34,6 +34,7 @@ class GROMACS_EESSI(gromacs_check): module_name = parameter(my_find_modules('GROMACS')) valid_prog_environs = ['builtin'] + valid_systems = [] omp_num_threads = 1 executable_opts += ['-dlb yes', '-ntomp %s' % omp_num_threads, '-npme -1'] @@ -46,6 +47,7 @@ class GROMACS_EESSI(gromacs_check): @run_after('init') def fiter_tests(self): cuda = utils.is_cuda_required_module(self.module_name) + valid_systems = '' if self.nb_impl == 'gpu' and cuda: valid_systems = '+gpu' elif self.nb_impl == 'cpu' and not cuda: @@ -53,12 +55,13 @@ def fiter_tests(self): else: valid_systems = 'nonexisting' - # filter out this test if the module is not among a list of specified modules + # filter out this test if the module is not among a list of manually specified modules # modules can be specified with '--setvar modules="" if self.modules and self.module_name not in self.modules: valid_systems = 'nonexisting' - self.valid_systems = [valid_systems] + if not self.valid_systems: + self.valid_systems = [valid_systems] self.modules = [self.module_name] @run_after('init') From 1ea9e18afb7f16aac11d7c571e2466d734558e77 Mon Sep 17 00:00:00 2001 From: Samuel Moors Date: Mon, 12 Dec 2022 16:44:37 +0100 Subject: [PATCH 06/29] set omp_num_threads equal to cpus_per_task --- .../applications/gromacs_check.py | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) diff --git a/eessi/reframe/eessi-checks/applications/gromacs_check.py b/eessi/reframe/eessi-checks/applications/gromacs_check.py index 648d3eb2..2e167b37 100644 --- a/eessi/reframe/eessi-checks/applications/gromacs_check.py +++ b/eessi/reframe/eessi-checks/applications/gromacs_check.py @@ -36,12 +36,6 @@ class GROMACS_EESSI(gromacs_check): valid_prog_environs = ['builtin'] valid_systems = [] - omp_num_threads = 1 - executable_opts += ['-dlb yes', '-ntomp %s' % omp_num_threads, '-npme -1'] - variables = { - 'OMP_NUM_THREADS': '%s' % omp_num_threads, - } - time_limit = '30m' @run_after('init') @@ -84,7 +78,7 @@ def skip_nb_impl_gpu_on_cpu_nodes(self): self.skip_if( (self.nb_impl == 'gpu' and not utils.is_gpu_present(self)), "Skipping test variant with non-bonded interactions on GPUs, " - "as this partition (%s) does not have GPU nodes" % self.current_partition.name + f"as this partition ({self.current_partition.name}) does not have GPU nodes" ) # Sckip testing when nb_impl=gpu and this is not a GPU build of GROMACS @@ -92,7 +86,8 @@ def skip_nb_impl_gpu_on_cpu_nodes(self): def skip_nb_impl_gpu_on_non_cuda_builds(self): self.skip_if( (self.nb_impl == 'gpu' and not utils.is_cuda_required(self)), - "Skipping test variant with non-bonded interaction on GPUs, as this GROMACS was not build with GPU support" + "Skipping test variant with non-bonded interactions on GPUs, " + f"as this module ({self.module_name}) was not build with GPU support" ) # Skip testing GPU-based modules on CPU-based nodes @@ -109,3 +104,11 @@ def set_num_tasks(self): hooks.assign_one_task_per_gpu(test=self, num_nodes=self.num_nodes) else: hooks.assign_one_task_per_cpu(test=self, num_nodes=self.num_nodes) + + @run_after('setup') + def set_omp_num_threads(self): + omp_num_threads = self.num_cpus_per_task + self.executable_opts += ['-dlb yes', f'-ntomp {omp_num_threads}', '-npme -1'] + self.variables = { + 'OMP_NUM_THREADS': f'{omp_num_threads}', + } From 1c7043ca3c9b284866f0a5e35c521b03d2d4621f Mon Sep 17 00:00:00 2001 From: Samuel Moors Date: Tue, 13 Dec 2022 18:24:06 +0100 Subject: [PATCH 07/29] add support for setting custom environment variables --- eessi/reframe/eessi-checks/applications/gromacs_check.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/eessi/reframe/eessi-checks/applications/gromacs_check.py b/eessi/reframe/eessi-checks/applications/gromacs_check.py index 2e167b37..ad371578 100644 --- a/eessi/reframe/eessi-checks/applications/gromacs_check.py +++ b/eessi/reframe/eessi-checks/applications/gromacs_check.py @@ -109,6 +109,4 @@ def set_num_tasks(self): def set_omp_num_threads(self): omp_num_threads = self.num_cpus_per_task self.executable_opts += ['-dlb yes', f'-ntomp {omp_num_threads}', '-npme -1'] - self.variables = { - 'OMP_NUM_THREADS': f'{omp_num_threads}', - } + self.variables['OMP_NUM_THREADS'] = f'{omp_num_threads}' From 9d71af9c1cf33ada2b6349e5f3a7d55eec762a56 Mon Sep 17 00:00:00 2001 From: Samuel Moors Date: Wed, 14 Dec 2022 10:25:28 +0100 Subject: [PATCH 08/29] simplify valid systems logic --- .../eessi-checks/applications/gromacs_check.py | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/eessi/reframe/eessi-checks/applications/gromacs_check.py b/eessi/reframe/eessi-checks/applications/gromacs_check.py index ad371578..15568a50 100644 --- a/eessi/reframe/eessi-checks/applications/gromacs_check.py +++ b/eessi/reframe/eessi-checks/applications/gromacs_check.py @@ -39,23 +39,24 @@ class GROMACS_EESSI(gromacs_check): time_limit = '30m' @run_after('init') - def fiter_tests(self): + def filter_tests(self): cuda = utils.is_cuda_required_module(self.module_name) - valid_systems = '' + valid_systems = [] + + # CUDA modules should only run in partitions with 'gpu' feature, + # non-CUDA modules should only run in partitions with 'cpu' feature if self.nb_impl == 'gpu' and cuda: - valid_systems = '+gpu' + valid_systems = ['+gpu'] elif self.nb_impl == 'cpu' and not cuda: - valid_systems = '+cpu' - else: - valid_systems = 'nonexisting' + valid_systems = ['+cpu'] # filter out this test if the module is not among a list of manually specified modules # modules can be specified with '--setvar modules="" if self.modules and self.module_name not in self.modules: - valid_systems = 'nonexisting' + valid_systems = [] if not self.valid_systems: - self.valid_systems = [valid_systems] + self.valid_systems = valid_systems self.modules = [self.module_name] @run_after('init') From c0d233377291e7bcbb874e37f1d70948b6978755 Mon Sep 17 00:00:00 2001 From: Samuel Moors Date: Wed, 14 Dec 2022 10:39:52 +0100 Subject: [PATCH 09/29] fix code style for eessi_utils/hooks.py --- eessi/reframe/eessi_utils/hooks.py | 38 ++++++++++++++++++++++-------- 1 file changed, 28 insertions(+), 10 deletions(-) diff --git a/eessi/reframe/eessi_utils/hooks.py b/eessi/reframe/eessi_utils/hooks.py index 2c94d2d5..5633bdf0 100644 --- a/eessi/reframe/eessi_utils/hooks.py +++ b/eessi/reframe/eessi_utils/hooks.py @@ -1,24 +1,31 @@ import reframe as rfm import eessi_utils.utils as utils -processor_info_missing = '''This test requires the number of CPUs to be known for the partition it runs on. -Check that processor information is either autodetected -(see https://reframe-hpc.readthedocs.io/en/stable/configure.html#proc-autodetection), -or manually set in the ReFrame configuration file +processor_info_missing = '''This test requires the number of CPUs to be known for the partition it runs on. +Check that processor information is either autodetected +(see https://reframe-hpc.readthedocs.io/en/stable/configure.html#proc-autodetection), +or manually set in the ReFrame configuration file (see https://reframe-hpc.readthedocs.io/en/stable/config_reference.html?highlight=processor%20info#processor-info). ''' + def skip_cpu_test_on_gpu_nodes(test: rfm.RegressionTest): '''Skip test if GPUs are present, but no CUDA is required''' - skip = ( utils.is_gpu_present(test) and not utils.is_cuda_required(test) ) + skip = (utils.is_gpu_present(test) and not utils.is_cuda_required(test)) if skip: - test.skip_if(True, "GPU is present on this partition (%s), skipping CPU-based test" % test.current_partition.name) + test.skip_if(True, f"GPU is present on this partition ({test.current_partition.name}), skipping CPU-based test") + def skip_gpu_test_on_cpu_nodes(test: rfm.RegressionTest): '''Skip test if CUDA is required, but no GPU is present''' - skip = ( utils.is_cuda_required(test) and not utils.is_gpu_present(test) ) + skip = (utils.is_cuda_required(test) and not utils.is_gpu_present(test)) if skip: - test.skip_if(True, "Test requires CUDA, but no GPU is present in this partition (%s). Skipping test..." % test.current_partition.name) + test.skip_if( + True, + f"Test requires CUDA, but no GPU is present in this partition ({test.current_partition.name}). " + "Skipping test..." + ) + def assign_one_task_per_cpu(test: rfm.RegressionTest, num_nodes: int) -> rfm.RegressionTest: ''' @@ -32,16 +39,27 @@ def assign_one_task_per_cpu(test: rfm.RegressionTest, num_nodes: int) -> rfm.Reg test.num_cpus_per_task = 1 test.num_tasks = num_nodes * test.num_tasks_per_node + def assign_one_task_per_gpu(test: rfm.RegressionTest, num_nodes: int) -> rfm.RegressionTest: - '''Sets num_tasks_per_node to the number of gpus, and num_cpus_per_task to the number of CPUs available per GPU in this partition''' + ''' + Sets num_tasks_per_node to the number of gpus, + and num_cpus_per_task to the number of CPUs available per GPU in this partition + ''' if test.current_partition.processor.num_cpus is None: raise AttributeError(processor_info_missing) test.num_tasks_per_node = utils.get_num_gpus(test) test.num_cpus_per_task = int(test.current_partition.processor.num_cpus / test.num_tasks_per_node) test.num_tasks = num_nodes * test.num_tasks_per_node + def auto_assign_num_tasks_MPI(test: rfm.RegressionTest, num_nodes: int) -> rfm.RegressionTest: - '''Automatically sets num_tasks, tasks_per_node and cpus_per_task based on the current partitions num_cpus, number of GPUs and test.num_nodes. For GPU tests, one task per GPU is set, and num_cpus_per_task is based on the ratio of CPU cores/GPUs. For CPU tests, one task per CPU is set, and num_cpus_per_task is set to 1. Total task count is determined based on the number of nodes to be used in the test. Behaviour of this function is (usually) sensible for pure MPI tests.''' + ''' + Automatically sets num_tasks, tasks_per_node and cpus_per_task based on the current partitions num_cpus, number of + GPUs and test.num_nodes. For GPU tests, one task per GPU is set, and num_cpus_per_task is based on the ratio of CPU + cores/GPUs. For CPU tests, one task per CPU is set, and num_cpus_per_task is set to 1. Total task count is + determined based on the number of nodes to be used in the test. Behaviour of this function is (usually) sensible for + pure MPI tests. + ''' if utils.is_cuda_required(test): assign_one_task_per_gpu(test, num_nodes) else: From cc7342110b68f58afc718cd9956e10ef978954c2 Mon Sep 17 00:00:00 2001 From: Samuel Moors Date: Wed, 14 Dec 2022 10:49:54 +0100 Subject: [PATCH 10/29] also add support for specifying num_cpus_per_task --- eessi/reframe/eessi_utils/hooks.py | 21 +++++++++++++++------ 1 file changed, 15 insertions(+), 6 deletions(-) diff --git a/eessi/reframe/eessi_utils/hooks.py b/eessi/reframe/eessi_utils/hooks.py index 5633bdf0..e23f7c30 100644 --- a/eessi/reframe/eessi_utils/hooks.py +++ b/eessi/reframe/eessi_utils/hooks.py @@ -29,26 +29,35 @@ def skip_gpu_test_on_cpu_nodes(test: rfm.RegressionTest): def assign_one_task_per_cpu(test: rfm.RegressionTest, num_nodes: int) -> rfm.RegressionTest: ''' - Sets num_tasks_per_node and num_cpus_per_task such that it will run one task per core unless specified - (with --setvar num_tasks_per_node=) + Sets num_tasks_per_node and num_cpus_per_task such that it will run one task per core, + unless specified with --setvar num_tasks_per_node= and/or --setvar num_cpus_per_task= ''' if not test.num_tasks_per_node: if test.current_partition.processor.num_cpus is None: raise AttributeError(processor_info_missing) test.num_tasks_per_node = test.current_partition.processor.num_cpus - test.num_cpus_per_task = 1 + + if not test.num_cpus_per_task: + test.num_cpus_per_task = 1 + test.num_tasks = num_nodes * test.num_tasks_per_node def assign_one_task_per_gpu(test: rfm.RegressionTest, num_nodes: int) -> rfm.RegressionTest: ''' Sets num_tasks_per_node to the number of gpus, - and num_cpus_per_task to the number of CPUs available per GPU in this partition + and num_cpus_per_task to the number of CPUs available per GPU in this partition, + unless specified with --setvar num_tasks_per_node= and/or --setvar num_cpus_per_task= ''' if test.current_partition.processor.num_cpus is None: raise AttributeError(processor_info_missing) - test.num_tasks_per_node = utils.get_num_gpus(test) - test.num_cpus_per_task = int(test.current_partition.processor.num_cpus / test.num_tasks_per_node) + + if not test.num_tasks_per_node: + test.num_tasks_per_node = utils.get_num_gpus(test) + + if not test.num_cpus_per_task: + test.num_cpus_per_task = int(test.current_partition.processor.num_cpus / test.num_tasks_per_node) + test.num_tasks = num_nodes * test.num_tasks_per_node From c4b285827f81f45d194e8980e18728bc17a14c51 Mon Sep 17 00:00:00 2001 From: Samuel Moors Date: Wed, 14 Dec 2022 11:39:11 +0100 Subject: [PATCH 11/29] rearrange valid_systems filtering logic --- .../applications/gromacs_check.py | 26 ++++++++++--------- 1 file changed, 14 insertions(+), 12 deletions(-) diff --git a/eessi/reframe/eessi-checks/applications/gromacs_check.py b/eessi/reframe/eessi-checks/applications/gromacs_check.py index 15568a50..96bc972b 100644 --- a/eessi/reframe/eessi-checks/applications/gromacs_check.py +++ b/eessi/reframe/eessi-checks/applications/gromacs_check.py @@ -40,23 +40,25 @@ class GROMACS_EESSI(gromacs_check): @run_after('init') def filter_tests(self): - cuda = utils.is_cuda_required_module(self.module_name) - valid_systems = [] + # filter valid_systems, unless specified with --setvar valid_systems= + if not self.valid_systems: + cuda = utils.is_cuda_required_module(self.module_name) + valid_systems = [] - # CUDA modules should only run in partitions with 'gpu' feature, - # non-CUDA modules should only run in partitions with 'cpu' feature - if self.nb_impl == 'gpu' and cuda: - valid_systems = ['+gpu'] - elif self.nb_impl == 'cpu' and not cuda: - valid_systems = ['+cpu'] + # CUDA modules should only run in partitions with 'gpu' feature, + # non-CUDA modules should only run in partitions with 'cpu' feature + if self.nb_impl == 'gpu' and cuda: + valid_systems = ['+gpu'] + elif self.nb_impl == 'cpu' and not cuda: + valid_systems = ['+cpu'] + + self.valid_systems = valid_systems # filter out this test if the module is not among a list of manually specified modules - # modules can be specified with '--setvar modules="" + # modules can be specified with --setvar modules= if self.modules and self.module_name not in self.modules: - valid_systems = [] + self.valid_systems = [] - if not self.valid_systems: - self.valid_systems = valid_systems self.modules = [self.module_name] @run_after('init') From 0f77a7adcce4368196d4d4fcb529e9a377b9baf7 Mon Sep 17 00:00:00 2001 From: Samuel Moors Date: Sun, 12 Feb 2023 08:17:41 +0100 Subject: [PATCH 12/29] support testing non-gpu jobs on gpu nodes --- .../applications/gromacs_check.py | 27 +++++++++++-------- 1 file changed, 16 insertions(+), 11 deletions(-) diff --git a/eessi/reframe/eessi-checks/applications/gromacs_check.py b/eessi/reframe/eessi-checks/applications/gromacs_check.py index 96bc972b..2bec6706 100644 --- a/eessi/reframe/eessi-checks/applications/gromacs_check.py +++ b/eessi/reframe/eessi-checks/applications/gromacs_check.py @@ -42,17 +42,22 @@ class GROMACS_EESSI(gromacs_check): def filter_tests(self): # filter valid_systems, unless specified with --setvar valid_systems= if not self.valid_systems: - cuda = utils.is_cuda_required_module(self.module_name) - valid_systems = [] - - # CUDA modules should only run in partitions with 'gpu' feature, - # non-CUDA modules should only run in partitions with 'cpu' feature - if self.nb_impl == 'gpu' and cuda: - valid_systems = ['+gpu'] - elif self.nb_impl == 'cpu' and not cuda: - valid_systems = ['+cpu'] - - self.valid_systems = valid_systems + cuda_module = utils.is_cuda_required_module(self.module_name) + valid_systems = '' + + # CUDA modules and when using a GPU for non-bonded interactions require partitions with 'gpu' feature + # non-CUDA modules require partitions with 'cpu' feature + if cuda_module: + valid_systems = '+gpu' + if self.nb_impl == 'cpu': + valid_systems += ' +cpu' + else: + valid_systems += '+cpu' + if self.nb_impl == 'gpu': + valid_systems = '' # impossible combination + + if valid_systems: + self.valid_systems = [valid_systems] # filter out this test if the module is not among a list of manually specified modules # modules can be specified with --setvar modules= From 33324af8c29b2758ef4815d6e8a602324ab6adbe Mon Sep 17 00:00:00 2001 From: Samuel Moors Date: Sun, 12 Feb 2023 08:38:11 +0100 Subject: [PATCH 13/29] use env_vars rather than variables for Reframe 4 --- eessi/reframe/eessi-checks/applications/gromacs_check.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/eessi/reframe/eessi-checks/applications/gromacs_check.py b/eessi/reframe/eessi-checks/applications/gromacs_check.py index 2bec6706..e0ce0fbc 100644 --- a/eessi/reframe/eessi-checks/applications/gromacs_check.py +++ b/eessi/reframe/eessi-checks/applications/gromacs_check.py @@ -116,5 +116,6 @@ def set_num_tasks(self): @run_after('setup') def set_omp_num_threads(self): omp_num_threads = self.num_cpus_per_task + # set both OMP_NUM_THREADS and -ntomp explicitly to avoid conflicting values self.executable_opts += ['-dlb yes', f'-ntomp {omp_num_threads}', '-npme -1'] - self.variables['OMP_NUM_THREADS'] = f'{omp_num_threads}' + self.env_vars['OMP_NUM_THREADS'] = f'{omp_num_threads}' From 7853ff17a2ce4d8aa02e8e075d6040b51017308b Mon Sep 17 00:00:00 2001 From: Samuel Moors Date: Sun, 12 Feb 2023 08:38:50 +0100 Subject: [PATCH 14/29] update readme --- README.md | 19 +++---------------- 1 file changed, 3 insertions(+), 16 deletions(-) diff --git a/README.md b/README.md index bf7654fb..3c73139b 100644 --- a/README.md +++ b/README.md @@ -1,9 +1,9 @@ # test-suite A portable test suite for software installations, using ReFrame -## Getting started (@casparvl, commited 2022-12-06) +## Getting started -- install ReFrame >=3.11, <4 +- install ReFrame >=4.0 - clone the test suite @@ -22,7 +22,7 @@ git clone git@github.com:EESSI/test-suite.git but skips CUDA modules in non-GPU nodes, and skips non-CUDA modules in GPU nodes ``` -module load ReFrame/3.12.0 +module load ReFrame/4.0.1 eessiroot= eessihome=$eessiroot/eessi/reframe @@ -34,16 +34,3 @@ PYTHONPATH=$PYTHONPATH:$EBROOTREFRAME:$eessihome reframe \ -r --performance-report ``` -## Improvements in PR #11 (2022-12-14) - -- features to filter out CUDA modules in non-GPU nodes and non-CUDA modules in GPU nodes - - requires adding `features` `cpu` and/or `gpu` to the partitions in the site config file -- support for specifying modules - - via `--setvar modules=` -- support for specifying systems:partitions - - via `--setvar valid_systems=` -- support for overriding tasks, cpus - - via `--setvar num_tasks_per_node=` and/or `--setvar num_cpus_per_task=` -- support for setting additional environment variables - - via `--setvar variables=:` - From 2d93a78f73d7f3a2daf624f1d63041a607ec027f Mon Sep 17 00:00:00 2001 From: Samuel Moors Date: Sun, 12 Feb 2023 08:55:37 +0100 Subject: [PATCH 15/29] improve variable name --- eessi/reframe/eessi-checks/applications/gromacs_check.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/eessi/reframe/eessi-checks/applications/gromacs_check.py b/eessi/reframe/eessi-checks/applications/gromacs_check.py index e0ce0fbc..2b7827fc 100644 --- a/eessi/reframe/eessi-checks/applications/gromacs_check.py +++ b/eessi/reframe/eessi-checks/applications/gromacs_check.py @@ -42,12 +42,12 @@ class GROMACS_EESSI(gromacs_check): def filter_tests(self): # filter valid_systems, unless specified with --setvar valid_systems= if not self.valid_systems: - cuda_module = utils.is_cuda_required_module(self.module_name) + is_cuda_module = utils.is_cuda_required_module(self.module_name) valid_systems = '' # CUDA modules and when using a GPU for non-bonded interactions require partitions with 'gpu' feature # non-CUDA modules require partitions with 'cpu' feature - if cuda_module: + if is_cuda_module: valid_systems = '+gpu' if self.nb_impl == 'cpu': valid_systems += ' +cpu' From 4f9ae422a76a88b0d050cc19d69c1927ddca2bf4 Mon Sep 17 00:00:00 2001 From: Samuel Moors Date: Sun, 12 Feb 2023 09:49:28 +0100 Subject: [PATCH 16/29] update readme --- README.md | 40 ++++++++++++++++++++++-- eessi/reframe/config/settings_example.py | 2 +- 2 files changed, 39 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index 3c73139b..0a8f9389 100644 --- a/README.md +++ b/README.md @@ -18,8 +18,7 @@ git clone git@github.com:EESSI/test-suite.git - run the tests the example below runs a gromacs simulation using GROMACS modules available in the system, - in combination with all available system:partitions as defined in the site config file, - but skips CUDA modules in non-GPU nodes, and skips non-CUDA modules in GPU nodes + in combination with all available system:partitions as defined in the site config file ``` module load ReFrame/4.0.1 @@ -34,3 +33,40 @@ PYTHONPATH=$PYTHONPATH:$EBROOTREFRAME:$eessihome reframe \ -r --performance-report ``` +## Configuring GPU/non-GPU partitions in your site config file: + +- running GPU jobs in GPU nodes + - add feature `gpu` to the GPU partitions + +- running non-GPU jobs in non-GPU nodes + - add feature `cpu` to the non-GPU partitions + +- running GPU jobs and non-GPU jobs on gpu nodes + - add both features `cpu` and `gpu` to the GPU partitions + ``` + 'features': ['cpu', 'gpu'], + ``` + +- setting the number of GPUS per node for a partition: + ``` + 'access': ['-p --gpus-per-node='], + 'devices': [ + {'type': 'gpu', 'num_devices': } + ], + ``` + +## Changing the default test behavior on the cmd line + +- specifying modules + - `--setvar modules=` + +- specifying systems:partitions + - `--setvar valid_systems=` + +- overriding tasks, cpus + - `--setvar num_tasks_per_node=` and/or + - `--setvar num_cpus_per_task=` + +- setting additional environment variables + - `--setvar env_vars=:` + diff --git a/eessi/reframe/config/settings_example.py b/eessi/reframe/config/settings_example.py index e8fe9131..c0ebe8b7 100644 --- a/eessi/reframe/config/settings_example.py +++ b/eessi/reframe/config/settings_example.py @@ -47,7 +47,7 @@ 'num_devices': 4, } ], - 'features': ['gpu'], + 'features': ['cpu', 'gpu'], 'descr': 'GPU partition' }, ] From 55088e996ab312fff2bda1518f609fbe61e62f6d Mon Sep 17 00:00:00 2001 From: Caspar van Leeuwen Date: Tue, 14 Feb 2023 17:25:22 +0100 Subject: [PATCH 17/29] Updated the logic that selects the default valid partition, module and nb_impl combinations. Stripped the skipping hooks, as those are no longer needed now that we use partition features for partition selection. --- .../applications/gromacs_check.py | 72 +++++++++++-------- 1 file changed, 41 insertions(+), 31 deletions(-) diff --git a/eessi/reframe/eessi-checks/applications/gromacs_check.py b/eessi/reframe/eessi-checks/applications/gromacs_check.py index 2b7827fc..f79b0277 100644 --- a/eessi/reframe/eessi-checks/applications/gromacs_check.py +++ b/eessi/reframe/eessi-checks/applications/gromacs_check.py @@ -45,16 +45,26 @@ def filter_tests(self): is_cuda_module = utils.is_cuda_required_module(self.module_name) valid_systems = '' - # CUDA modules and when using a GPU for non-bonded interactions require partitions with 'gpu' feature - # non-CUDA modules require partitions with 'cpu' feature - if is_cuda_module: + if is_cuda_module and self.nb_impl == 'gpu': + # CUDA modules and when using a GPU for non-bonded interactions require partitions with 'gpu' feature valid_systems = '+gpu' - if self.nb_impl == 'cpu': - valid_systems += ' +cpu' - else: - valid_systems += '+cpu' - if self.nb_impl == 'gpu': - valid_systems = '' # impossible combination + elif self.nb_impl == 'cpu': + # Non-bonded interactions on the CPU require partitions with 'cpu' feature + # Note: making 'cpu' an explicit feature allows e.g. skipping CPU-based tests on GPU partitions + + valid_systems = '+cpu' + elif not is_cuda_module and self.nb_impl == 'gpu': + # Invalid combination: a module without GPU support cannot compute non-bonded interactions on GPU + valid_systems = '' + +# if is_cuda_module: +# valid_systems = '+gpu' +# if self.nb_impl == 'cpu': +# valid_systems += ' +cpu' +# else: +# valid_systems += '+cpu' +# if self.nb_impl == 'gpu': +# valid_systems = '' # impossible combination if valid_systems: self.valid_systems = [valid_systems] @@ -80,28 +90,28 @@ def set_test_purpose(self): if self.benchmark_info[0] == 'HECBioSim/hEGFRDimer': self.tags.add('CI') - # Skip testing for when nb_impl=gpu and this is not a GPU node - @run_after('setup') - def skip_nb_impl_gpu_on_cpu_nodes(self): - self.skip_if( - (self.nb_impl == 'gpu' and not utils.is_gpu_present(self)), - "Skipping test variant with non-bonded interactions on GPUs, " - f"as this partition ({self.current_partition.name}) does not have GPU nodes" - ) - - # Sckip testing when nb_impl=gpu and this is not a GPU build of GROMACS - @run_after('setup') - def skip_nb_impl_gpu_on_non_cuda_builds(self): - self.skip_if( - (self.nb_impl == 'gpu' and not utils.is_cuda_required(self)), - "Skipping test variant with non-bonded interactions on GPUs, " - f"as this module ({self.module_name}) was not build with GPU support" - ) - - # Skip testing GPU-based modules on CPU-based nodes - @run_after('setup') - def skip_gpu_test_on_cpu_nodes(self): - hooks.skip_gpu_test_on_cpu_nodes(self) +# # Skip testing for when nb_impl=gpu and this is not a GPU node +# @run_after('setup') +# def skip_nb_impl_gpu_on_cpu_nodes(self): +# self.skip_if( +# (self.nb_impl == 'gpu' and not utils.is_gpu_present(self)), +# "Skipping test variant with non-bonded interactions on GPUs, " +# f"as this partition ({self.current_partition.name}) does not have GPU nodes" +# ) +# +# # Sckip testing when nb_impl=gpu and this is not a GPU build of GROMACS +# @run_after('setup') +# def skip_nb_impl_gpu_on_non_cuda_builds(self): +# self.skip_if( +# (self.nb_impl == 'gpu' and not utils.is_cuda_required(self)), +# "Skipping test variant with non-bonded interactions on GPUs, " +# f"as this module ({self.module_name}) was not build with GPU support" +# ) +# +# # Skip testing GPU-based modules on CPU-based nodes +# @run_after('setup') +# def skip_gpu_test_on_cpu_nodes(self): +# hooks.skip_gpu_test_on_cpu_nodes(self) # Assign num_tasks, num_tasks_per_node and num_cpus_per_task automatically # based on current partition's num_cpus and gpus From dca8ff87802e2dc390a237c88429577fe786866d Mon Sep 17 00:00:00 2001 From: Caspar van Leeuwen Date: Tue, 14 Feb 2023 17:28:35 +0100 Subject: [PATCH 18/29] Removed comments --- .../applications/gromacs_check.py | 32 ------------------- 1 file changed, 32 deletions(-) diff --git a/eessi/reframe/eessi-checks/applications/gromacs_check.py b/eessi/reframe/eessi-checks/applications/gromacs_check.py index f79b0277..0793b7cd 100644 --- a/eessi/reframe/eessi-checks/applications/gromacs_check.py +++ b/eessi/reframe/eessi-checks/applications/gromacs_check.py @@ -57,15 +57,6 @@ def filter_tests(self): # Invalid combination: a module without GPU support cannot compute non-bonded interactions on GPU valid_systems = '' -# if is_cuda_module: -# valid_systems = '+gpu' -# if self.nb_impl == 'cpu': -# valid_systems += ' +cpu' -# else: -# valid_systems += '+cpu' -# if self.nb_impl == 'gpu': -# valid_systems = '' # impossible combination - if valid_systems: self.valid_systems = [valid_systems] @@ -90,29 +81,6 @@ def set_test_purpose(self): if self.benchmark_info[0] == 'HECBioSim/hEGFRDimer': self.tags.add('CI') -# # Skip testing for when nb_impl=gpu and this is not a GPU node -# @run_after('setup') -# def skip_nb_impl_gpu_on_cpu_nodes(self): -# self.skip_if( -# (self.nb_impl == 'gpu' and not utils.is_gpu_present(self)), -# "Skipping test variant with non-bonded interactions on GPUs, " -# f"as this partition ({self.current_partition.name}) does not have GPU nodes" -# ) -# -# # Sckip testing when nb_impl=gpu and this is not a GPU build of GROMACS -# @run_after('setup') -# def skip_nb_impl_gpu_on_non_cuda_builds(self): -# self.skip_if( -# (self.nb_impl == 'gpu' and not utils.is_cuda_required(self)), -# "Skipping test variant with non-bonded interactions on GPUs, " -# f"as this module ({self.module_name}) was not build with GPU support" -# ) -# -# # Skip testing GPU-based modules on CPU-based nodes -# @run_after('setup') -# def skip_gpu_test_on_cpu_nodes(self): -# hooks.skip_gpu_test_on_cpu_nodes(self) - # Assign num_tasks, num_tasks_per_node and num_cpus_per_task automatically # based on current partition's num_cpus and gpus # Only when running nb_impl on GPU do we want one task per GPU From b1e4c864900c2d1304eef0fa609fdec261bd82bb Mon Sep 17 00:00:00 2001 From: Samuel Moors Date: Sat, 18 Feb 2023 13:19:41 +0100 Subject: [PATCH 19/29] replace 'builtin' prog env with 'default' to avoid reframe-4 warning --- eessi/reframe/config/settings_example.py | 6 +++--- eessi/reframe/eessi-checks/applications/gromacs_check.py | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/eessi/reframe/config/settings_example.py b/eessi/reframe/config/settings_example.py index c0ebe8b7..8425fe52 100644 --- a/eessi/reframe/config/settings_example.py +++ b/eessi/reframe/config/settings_example.py @@ -17,7 +17,7 @@ 'scheduler': 'slurm', 'launcher': 'mpirun', 'access': ['-p cpu'], - 'environs': ['builtin'], + 'environs': ['default'], 'max_jobs': 4, 'processor': { 'num_cpus': 128, @@ -33,7 +33,7 @@ 'scheduler': 'slurm', 'launcher': 'mpirun', 'access': ['-p gpu'], - 'environs': ['builtin'], + 'environs': ['default'], 'max_jobs': 4, 'processor': { 'num_cpus': 72, @@ -55,7 +55,7 @@ ], 'environments': [ { - 'name': 'builtin', + 'name': 'default', 'cc': 'cc', 'cxx': '', 'ftn': '', diff --git a/eessi/reframe/eessi-checks/applications/gromacs_check.py b/eessi/reframe/eessi-checks/applications/gromacs_check.py index 0793b7cd..606bdbac 100644 --- a/eessi/reframe/eessi-checks/applications/gromacs_check.py +++ b/eessi/reframe/eessi-checks/applications/gromacs_check.py @@ -33,7 +33,7 @@ class GROMACS_EESSI(gromacs_check): ('n_large', 16)]) module_name = parameter(my_find_modules('GROMACS')) - valid_prog_environs = ['builtin'] + valid_prog_environs = ['default'] valid_systems = [] time_limit = '30m' From cabf75170074786e5dd05cf10c07d216d6cc7041 Mon Sep 17 00:00:00 2001 From: Samuel Moors Date: Sat, 18 Feb 2023 14:18:55 +0100 Subject: [PATCH 20/29] add custom variables module_regex_select, module_regex_skip, run_mode --- .../applications/gromacs_check.py | 26 +++++++++++++++++-- 1 file changed, 24 insertions(+), 2 deletions(-) diff --git a/eessi/reframe/eessi-checks/applications/gromacs_check.py b/eessi/reframe/eessi-checks/applications/gromacs_check.py index 606bdbac..c9929a63 100644 --- a/eessi/reframe/eessi-checks/applications/gromacs_check.py +++ b/eessi/reframe/eessi-checks/applications/gromacs_check.py @@ -3,6 +3,8 @@ # # SPDX-License-Identifier: BSD-3-Clause +import re + import reframe as rfm import reframe.core.runtime as rt from reframe.utility import OrderedSet @@ -36,6 +38,10 @@ class GROMACS_EESSI(gromacs_check): valid_prog_environs = ['default'] valid_systems = [] + module_regex_select = variable(str, value='') + module_regex_skip = variable(str, value='') + run_mode = variable(str, value='') + time_limit = '30m' @run_after('init') @@ -48,11 +54,12 @@ def filter_tests(self): if is_cuda_module and self.nb_impl == 'gpu': # CUDA modules and when using a GPU for non-bonded interactions require partitions with 'gpu' feature valid_systems = '+gpu' + elif self.nb_impl == 'cpu': # Non-bonded interactions on the CPU require partitions with 'cpu' feature # Note: making 'cpu' an explicit feature allows e.g. skipping CPU-based tests on GPU partitions - valid_systems = '+cpu' + elif not is_cuda_module and self.nb_impl == 'gpu': # Invalid combination: a module without GPU support cannot compute non-bonded interactions on GPU valid_systems = '' @@ -60,11 +67,26 @@ def filter_tests(self): if valid_systems: self.valid_systems = [valid_systems] - # filter out this test if the module is not among a list of manually specified modules + # skip this test if nb_impl is not equal to run_mode + if self.run_mode: + if self.nb_impl != self.run_mode: + self.valid_systems = [] + + # skip this test if the module is not among a list of manually specified modules # modules can be specified with --setvar modules= if self.modules and self.module_name not in self.modules: self.valid_systems = [] + # skip this test if the module does not match module_regex_select + if self.module_regex_select: + if not re.search(r'{}'.format(self.module_regex_select), self.module_name): + self.valid_systems = [] + + # skip this test if the module matches module_regex_skip + if self.module_regex_skip: + if re.search(r'{}'.format(self.module_regex_skip), self.module_name): + self.valid_systems = [] + self.modules = [self.module_name] @run_after('init') From 24f22d93d2360eb3e129cc40edbd11d7254d1ba5 Mon Sep 17 00:00:00 2001 From: Samuel Moors Date: Thu, 23 Feb 2023 13:01:34 +0100 Subject: [PATCH 21/29] revert commit adding custom variables --- .../applications/gromacs_check.py | 21 --- .../gromacs_check.py.bak20230217_203527 | 132 ++++++++++++++++++ .../gromacs_check.py.bak20230218_131214 | 105 ++++++++++++++ 3 files changed, 237 insertions(+), 21 deletions(-) create mode 100644 eessi/reframe/eessi-checks/applications/gromacs_check.py.bak20230217_203527 create mode 100644 eessi/reframe/eessi-checks/applications/gromacs_check.py.bak20230218_131214 diff --git a/eessi/reframe/eessi-checks/applications/gromacs_check.py b/eessi/reframe/eessi-checks/applications/gromacs_check.py index c9929a63..2a31c4a6 100644 --- a/eessi/reframe/eessi-checks/applications/gromacs_check.py +++ b/eessi/reframe/eessi-checks/applications/gromacs_check.py @@ -3,8 +3,6 @@ # # SPDX-License-Identifier: BSD-3-Clause -import re - import reframe as rfm import reframe.core.runtime as rt from reframe.utility import OrderedSet @@ -38,10 +36,6 @@ class GROMACS_EESSI(gromacs_check): valid_prog_environs = ['default'] valid_systems = [] - module_regex_select = variable(str, value='') - module_regex_skip = variable(str, value='') - run_mode = variable(str, value='') - time_limit = '30m' @run_after('init') @@ -67,26 +61,11 @@ def filter_tests(self): if valid_systems: self.valid_systems = [valid_systems] - # skip this test if nb_impl is not equal to run_mode - if self.run_mode: - if self.nb_impl != self.run_mode: - self.valid_systems = [] - # skip this test if the module is not among a list of manually specified modules # modules can be specified with --setvar modules= if self.modules and self.module_name not in self.modules: self.valid_systems = [] - # skip this test if the module does not match module_regex_select - if self.module_regex_select: - if not re.search(r'{}'.format(self.module_regex_select), self.module_name): - self.valid_systems = [] - - # skip this test if the module matches module_regex_skip - if self.module_regex_skip: - if re.search(r'{}'.format(self.module_regex_skip), self.module_name): - self.valid_systems = [] - self.modules = [self.module_name] @run_after('init') diff --git a/eessi/reframe/eessi-checks/applications/gromacs_check.py.bak20230217_203527 b/eessi/reframe/eessi-checks/applications/gromacs_check.py.bak20230217_203527 new file mode 100644 index 00000000..881b4316 --- /dev/null +++ b/eessi/reframe/eessi-checks/applications/gromacs_check.py.bak20230217_203527 @@ -0,0 +1,132 @@ +# Copyright 2016-2021 Swiss National Supercomputing Centre (CSCS/ETH Zurich) +# ReFrame Project Developers. See the top-level LICENSE file for details. +# +# SPDX-License-Identifier: BSD-3-Clause + +import re + +import reframe as rfm +import reframe.core.runtime as rt +from reframe.utility import OrderedSet + +from hpctestlib.sciapps.gromacs.benchmarks import gromacs_check +import eessi_utils.hooks as hooks +import eessi_utils.utils as utils + + +def my_find_modules(substr): + """Return all modules in the current system that contain ``substr`` in their name.""" + if not isinstance(substr, str): + raise TypeError("'substr' argument must be a string") + + ms = rt.runtime().modules_system + modules = OrderedSet(ms.available_modules(substr)) + for m in modules: + yield m + + +@rfm.simple_test +class GROMACS_EESSI(gromacs_check): + + scale = parameter([ + ('singlenode', 1), + ('n_small', 2), + ('n_medium', 8), + ('n_large', 16)]) + + exclude_modules = variable(str, value='') + module_name = parameter(my_find_modules('GROMACS')) + valid_prog_environs = ['builtin'] + valid_systems = [] + + time_limit = '30m' + + @run_after('init') + def filter_tests(self): + + # filter valid_systems, unless specified with --setvar valid_systems= + if not self.valid_systems: + is_cuda_module = utils.is_cuda_required_module(self.module_name) + valid_systems = '' + + # CUDA modules and when using a GPU for non-bonded interactions require partitions with 'gpu' feature + # non-CUDA modules require partitions with 'cpu' feature + if is_cuda_module: + valid_systems = '+gpu' + if self.nb_impl == 'cpu': + valid_systems += ' +cpu' + else: + valid_systems += '+cpu' + if self.nb_impl == 'gpu': + valid_systems = '' # impossible combination + + if valid_systems: + self.valid_systems = [valid_systems] + + # filter out this test if the module is not among a list of manually specified modules + # modules can be specified with --setvar modules= + if self.modules and self.module_name not in self.modules: + self.valid_systems = [] + + # filter out this test if the module matches optional exclude_modules regex + print(self.module_name) + if self.exclude_modules: + if re.search(r'{}'.format(self.exclude_modules), self.module_name): + self.valid_systems = [] + print('filtered') + + self.modules = [self.module_name] + + @run_after('init') + def set_test_scale(self): + scale_variant, self.num_nodes = self.scale + self.tags.add(scale_variant) + + # Set correct tags for monitoring & CI + @run_after('init') + def set_test_purpose(self): + # Run all tests from the testlib for monitoring + self.tags.add('monitoring') + # Select one test for CI + if self.benchmark_info[0] == 'HECBioSim/hEGFRDimer': + self.tags.add('CI') + + # Skip testing for when nb_impl=gpu and this is not a GPU node + @run_after('setup') + def skip_nb_impl_gpu_on_cpu_nodes(self): + self.skip_if( + (self.nb_impl == 'gpu' and not utils.is_gpu_present(self)), + "Skipping test variant with non-bonded interactions on GPUs, " + f"as this partition ({self.current_partition.name}) does not have GPU nodes" + ) + + # Sckip testing when nb_impl=gpu and this is not a GPU build of GROMACS + @run_after('setup') + def skip_nb_impl_gpu_on_non_cuda_builds(self): + self.skip_if( + (self.nb_impl == 'gpu' and not utils.is_cuda_required(self)), + "Skipping test variant with non-bonded interactions on GPUs, " + f"as this module ({self.module_name}) was not build with GPU support" + ) + + # Skip testing GPU-based modules on CPU-based nodes + @run_after('setup') + def skip_gpu_test_on_cpu_nodes(self): + hooks.skip_gpu_test_on_cpu_nodes(self) + + # Assign num_tasks, num_tasks_per_node and num_cpus_per_task automatically + # based on current partition's num_cpus and gpus + # Only when running nb_impl on GPU do we want one task per GPU + @run_after('setup') + def set_num_tasks(self): + if self.nb_impl == 'gpu': + hooks.assign_one_task_per_gpu(test=self, num_nodes=self.num_nodes) + else: + hooks.assign_one_task_per_cpu(test=self, num_nodes=self.num_nodes) + + @run_after('setup') + def set_omp_num_threads(self): + omp_num_threads = self.num_cpus_per_task + # set both OMP_NUM_THREADS and -ntomp explicitly to avoid conflicting values + self.executable_opts += ['-dlb yes', f'-ntomp {omp_num_threads}', '-npme -1'] + self.env_vars['OMP_NUM_THREADS'] = f'{omp_num_threads}' diff --git a/eessi/reframe/eessi-checks/applications/gromacs_check.py.bak20230218_131214 b/eessi/reframe/eessi-checks/applications/gromacs_check.py.bak20230218_131214 new file mode 100644 index 00000000..ca9638bf --- /dev/null +++ b/eessi/reframe/eessi-checks/applications/gromacs_check.py.bak20230218_131214 @@ -0,0 +1,105 @@ +# Copyright 2016-2021 Swiss National Supercomputing Centre (CSCS/ETH Zurich) +# ReFrame Project Developers. See the top-level LICENSE file for details. +# +# SPDX-License-Identifier: BSD-3-Clause + +import re + +import reframe as rfm +import reframe.core.runtime as rt +from reframe.utility import OrderedSet +from reframe.utility.typecheck import Bool + +from hpctestlib.sciapps.gromacs.benchmarks import gromacs_check +import eessi_utils.hooks as hooks +import eessi_utils.utils as utils + + +def my_find_modules(substr): + """Return all modules in the current system that contain ``substr`` in their name.""" + if not isinstance(substr, str): + raise TypeError("'substr' argument must be a string") + + ms = rt.runtime().modules_system + modules = OrderedSet(ms.available_modules(substr)) + for m in modules: + yield m + + +@rfm.simple_test +class GROMACS_EESSI(gromacs_check): + + scale = parameter([ + ('singlenode', 1), + ('n_small', 2), + ('n_medium', 8), + ('n_large', 16)]) + + module_name = parameter(my_find_modules('GROMACS')) + valid_prog_environs = ['default'] + valid_systems = [] + cuda_module_on_cpu = variable(Bool, value=True) + + time_limit = '30m' + + @run_after('init') + def filter_tests(self): + # filter valid_systems, unless specified with --setvar valid_systems= + if not self.valid_systems: + is_cuda_module = utils.is_cuda_required_module(self.module_name) + valid_systems = '' + + if is_cuda_module and self.nb_impl == 'gpu': + # CUDA modules and when using a GPU for non-bonded interactions require partitions with 'gpu' feature + valid_systems = '+gpu' + + elif self.nb_impl == 'cpu' and (self.cuda_module_on_cpu or not is_cuda_module): + # Non-bonded interactions on the CPU require partitions with 'cpu' feature + # Note: making 'cpu' an explicit feature allows e.g. skipping CPU-based tests on GPU partitions + # Note: cuda_module_on_cpu=True allows running CUDA modules on partitions with 'cpu' feature + valid_systems = '+cpu' + + elif not is_cuda_module and self.nb_impl == 'gpu': + # Invalid combination: a module without GPU support cannot compute non-bonded interactions on GPU + valid_systems = '' + + if valid_systems: + self.valid_systems = [valid_systems] + + # skip this module if it is not among a list of manually specified modules + # modules can be specified with --setvar modules= + if self.modules and self.module_name not in self.modules: + self.valid_systems = [] + + self.modules = [self.module_name] + + @run_after('init') + def set_test_scale(self): + scale_variant, self.num_nodes = self.scale + self.tags.add(scale_variant) + + # Set correct tags for monitoring & CI + @run_after('init') + def set_test_purpose(self): + # Run all tests from the testlib for monitoring + self.tags.add('monitoring') + # Select one test for CI + if self.benchmark_info[0] == 'HECBioSim/hEGFRDimer': + self.tags.add('CI') + + # Assign num_tasks, num_tasks_per_node and num_cpus_per_task automatically + # based on current partition's num_cpus and gpus + # Only when running nb_impl on GPU do we want one task per GPU + @run_after('setup') + def set_num_tasks(self): + if self.nb_impl == 'gpu': + hooks.assign_one_task_per_gpu(test=self, num_nodes=self.num_nodes) + else: + hooks.assign_one_task_per_cpu(test=self, num_nodes=self.num_nodes) + + @run_after('setup') + def set_omp_num_threads(self): + omp_num_threads = self.num_cpus_per_task + # set both OMP_NUM_THREADS and -ntomp explicitly to avoid conflicting values + self.executable_opts += ['-dlb yes', f'-ntomp {omp_num_threads}', '-npme -1'] + self.env_vars['OMP_NUM_THREADS'] = f'{omp_num_threads}' From c986198c3a6b3956c75d39c330b37581c9104cf4 Mon Sep 17 00:00:00 2001 From: Samuel Moors Date: Thu, 23 Feb 2023 15:34:23 +0100 Subject: [PATCH 22/29] remove backup files --- .../gromacs_check.py.bak20230217_203527 | 132 ------------------ .../gromacs_check.py.bak20230218_131214 | 105 -------------- 2 files changed, 237 deletions(-) delete mode 100644 eessi/reframe/eessi-checks/applications/gromacs_check.py.bak20230217_203527 delete mode 100644 eessi/reframe/eessi-checks/applications/gromacs_check.py.bak20230218_131214 diff --git a/eessi/reframe/eessi-checks/applications/gromacs_check.py.bak20230217_203527 b/eessi/reframe/eessi-checks/applications/gromacs_check.py.bak20230217_203527 deleted file mode 100644 index 881b4316..00000000 --- a/eessi/reframe/eessi-checks/applications/gromacs_check.py.bak20230217_203527 +++ /dev/null @@ -1,132 +0,0 @@ -# Copyright 2016-2021 Swiss National Supercomputing Centre (CSCS/ETH Zurich) -# ReFrame Project Developers. See the top-level LICENSE file for details. -# -# SPDX-License-Identifier: BSD-3-Clause - -import re - -import reframe as rfm -import reframe.core.runtime as rt -from reframe.utility import OrderedSet - -from hpctestlib.sciapps.gromacs.benchmarks import gromacs_check -import eessi_utils.hooks as hooks -import eessi_utils.utils as utils - - -def my_find_modules(substr): - """Return all modules in the current system that contain ``substr`` in their name.""" - if not isinstance(substr, str): - raise TypeError("'substr' argument must be a string") - - ms = rt.runtime().modules_system - modules = OrderedSet(ms.available_modules(substr)) - for m in modules: - yield m - - -@rfm.simple_test -class GROMACS_EESSI(gromacs_check): - - scale = parameter([ - ('singlenode', 1), - ('n_small', 2), - ('n_medium', 8), - ('n_large', 16)]) - - exclude_modules = variable(str, value='') - module_name = parameter(my_find_modules('GROMACS')) - valid_prog_environs = ['builtin'] - valid_systems = [] - - time_limit = '30m' - - @run_after('init') - def filter_tests(self): - - # filter valid_systems, unless specified with --setvar valid_systems= - if not self.valid_systems: - is_cuda_module = utils.is_cuda_required_module(self.module_name) - valid_systems = '' - - # CUDA modules and when using a GPU for non-bonded interactions require partitions with 'gpu' feature - # non-CUDA modules require partitions with 'cpu' feature - if is_cuda_module: - valid_systems = '+gpu' - if self.nb_impl == 'cpu': - valid_systems += ' +cpu' - else: - valid_systems += '+cpu' - if self.nb_impl == 'gpu': - valid_systems = '' # impossible combination - - if valid_systems: - self.valid_systems = [valid_systems] - - # filter out this test if the module is not among a list of manually specified modules - # modules can be specified with --setvar modules= - if self.modules and self.module_name not in self.modules: - self.valid_systems = [] - - # filter out this test if the module matches optional exclude_modules regex - print(self.module_name) - if self.exclude_modules: - if re.search(r'{}'.format(self.exclude_modules), self.module_name): - self.valid_systems = [] - print('filtered') - - self.modules = [self.module_name] - - @run_after('init') - def set_test_scale(self): - scale_variant, self.num_nodes = self.scale - self.tags.add(scale_variant) - - # Set correct tags for monitoring & CI - @run_after('init') - def set_test_purpose(self): - # Run all tests from the testlib for monitoring - self.tags.add('monitoring') - # Select one test for CI - if self.benchmark_info[0] == 'HECBioSim/hEGFRDimer': - self.tags.add('CI') - - # Skip testing for when nb_impl=gpu and this is not a GPU node - @run_after('setup') - def skip_nb_impl_gpu_on_cpu_nodes(self): - self.skip_if( - (self.nb_impl == 'gpu' and not utils.is_gpu_present(self)), - "Skipping test variant with non-bonded interactions on GPUs, " - f"as this partition ({self.current_partition.name}) does not have GPU nodes" - ) - - # Sckip testing when nb_impl=gpu and this is not a GPU build of GROMACS - @run_after('setup') - def skip_nb_impl_gpu_on_non_cuda_builds(self): - self.skip_if( - (self.nb_impl == 'gpu' and not utils.is_cuda_required(self)), - "Skipping test variant with non-bonded interactions on GPUs, " - f"as this module ({self.module_name}) was not build with GPU support" - ) - - # Skip testing GPU-based modules on CPU-based nodes - @run_after('setup') - def skip_gpu_test_on_cpu_nodes(self): - hooks.skip_gpu_test_on_cpu_nodes(self) - - # Assign num_tasks, num_tasks_per_node and num_cpus_per_task automatically - # based on current partition's num_cpus and gpus - # Only when running nb_impl on GPU do we want one task per GPU - @run_after('setup') - def set_num_tasks(self): - if self.nb_impl == 'gpu': - hooks.assign_one_task_per_gpu(test=self, num_nodes=self.num_nodes) - else: - hooks.assign_one_task_per_cpu(test=self, num_nodes=self.num_nodes) - - @run_after('setup') - def set_omp_num_threads(self): - omp_num_threads = self.num_cpus_per_task - # set both OMP_NUM_THREADS and -ntomp explicitly to avoid conflicting values - self.executable_opts += ['-dlb yes', f'-ntomp {omp_num_threads}', '-npme -1'] - self.env_vars['OMP_NUM_THREADS'] = f'{omp_num_threads}' diff --git a/eessi/reframe/eessi-checks/applications/gromacs_check.py.bak20230218_131214 b/eessi/reframe/eessi-checks/applications/gromacs_check.py.bak20230218_131214 deleted file mode 100644 index ca9638bf..00000000 --- a/eessi/reframe/eessi-checks/applications/gromacs_check.py.bak20230218_131214 +++ /dev/null @@ -1,105 +0,0 @@ -# Copyright 2016-2021 Swiss National Supercomputing Centre (CSCS/ETH Zurich) -# ReFrame Project Developers. See the top-level LICENSE file for details. -# -# SPDX-License-Identifier: BSD-3-Clause - -import re - -import reframe as rfm -import reframe.core.runtime as rt -from reframe.utility import OrderedSet -from reframe.utility.typecheck import Bool - -from hpctestlib.sciapps.gromacs.benchmarks import gromacs_check -import eessi_utils.hooks as hooks -import eessi_utils.utils as utils - - -def my_find_modules(substr): - """Return all modules in the current system that contain ``substr`` in their name.""" - if not isinstance(substr, str): - raise TypeError("'substr' argument must be a string") - - ms = rt.runtime().modules_system - modules = OrderedSet(ms.available_modules(substr)) - for m in modules: - yield m - - -@rfm.simple_test -class GROMACS_EESSI(gromacs_check): - - scale = parameter([ - ('singlenode', 1), - ('n_small', 2), - ('n_medium', 8), - ('n_large', 16)]) - - module_name = parameter(my_find_modules('GROMACS')) - valid_prog_environs = ['default'] - valid_systems = [] - cuda_module_on_cpu = variable(Bool, value=True) - - time_limit = '30m' - - @run_after('init') - def filter_tests(self): - # filter valid_systems, unless specified with --setvar valid_systems= - if not self.valid_systems: - is_cuda_module = utils.is_cuda_required_module(self.module_name) - valid_systems = '' - - if is_cuda_module and self.nb_impl == 'gpu': - # CUDA modules and when using a GPU for non-bonded interactions require partitions with 'gpu' feature - valid_systems = '+gpu' - - elif self.nb_impl == 'cpu' and (self.cuda_module_on_cpu or not is_cuda_module): - # Non-bonded interactions on the CPU require partitions with 'cpu' feature - # Note: making 'cpu' an explicit feature allows e.g. skipping CPU-based tests on GPU partitions - # Note: cuda_module_on_cpu=True allows running CUDA modules on partitions with 'cpu' feature - valid_systems = '+cpu' - - elif not is_cuda_module and self.nb_impl == 'gpu': - # Invalid combination: a module without GPU support cannot compute non-bonded interactions on GPU - valid_systems = '' - - if valid_systems: - self.valid_systems = [valid_systems] - - # skip this module if it is not among a list of manually specified modules - # modules can be specified with --setvar modules= - if self.modules and self.module_name not in self.modules: - self.valid_systems = [] - - self.modules = [self.module_name] - - @run_after('init') - def set_test_scale(self): - scale_variant, self.num_nodes = self.scale - self.tags.add(scale_variant) - - # Set correct tags for monitoring & CI - @run_after('init') - def set_test_purpose(self): - # Run all tests from the testlib for monitoring - self.tags.add('monitoring') - # Select one test for CI - if self.benchmark_info[0] == 'HECBioSim/hEGFRDimer': - self.tags.add('CI') - - # Assign num_tasks, num_tasks_per_node and num_cpus_per_task automatically - # based on current partition's num_cpus and gpus - # Only when running nb_impl on GPU do we want one task per GPU - @run_after('setup') - def set_num_tasks(self): - if self.nb_impl == 'gpu': - hooks.assign_one_task_per_gpu(test=self, num_nodes=self.num_nodes) - else: - hooks.assign_one_task_per_cpu(test=self, num_nodes=self.num_nodes) - - @run_after('setup') - def set_omp_num_threads(self): - omp_num_threads = self.num_cpus_per_task - # set both OMP_NUM_THREADS and -ntomp explicitly to avoid conflicting values - self.executable_opts += ['-dlb yes', f'-ntomp {omp_num_threads}', '-npme -1'] - self.env_vars['OMP_NUM_THREADS'] = f'{omp_num_threads}' From ded8cc8c6006fa6bfa82515be9faa06d7439864a Mon Sep 17 00:00:00 2001 From: Samuel Moors Date: Thu, 23 Feb 2023 16:47:57 +0100 Subject: [PATCH 23/29] properly handle gpus-per-node --- eessi/reframe/config/settings_example.py | 6 ++++++ eessi/reframe/eessi_utils/hooks.py | 7 ++++++- eessi/reframe/eessi_utils/utils.py | 7 +++++-- 3 files changed, 17 insertions(+), 3 deletions(-) diff --git a/eessi/reframe/config/settings_example.py b/eessi/reframe/config/settings_example.py index 8425fe52..3fc6ddaf 100644 --- a/eessi/reframe/config/settings_example.py +++ b/eessi/reframe/config/settings_example.py @@ -41,6 +41,12 @@ 'num_cpus_per_socket': 36, 'arch': 'icelake', }, + 'resources': [ + { + 'name': '_rfm_gpu', + 'options': ['--gpus-per-node={num_gpus_per_node}'], + } + ], 'devices': [ { 'type': 'gpu', diff --git a/eessi/reframe/eessi_utils/hooks.py b/eessi/reframe/eessi_utils/hooks.py index e23f7c30..7bcd06de 100644 --- a/eessi/reframe/eessi_utils/hooks.py +++ b/eessi/reframe/eessi_utils/hooks.py @@ -53,7 +53,12 @@ def assign_one_task_per_gpu(test: rfm.RegressionTest, num_nodes: int) -> rfm.Reg raise AttributeError(processor_info_missing) if not test.num_tasks_per_node: - test.num_tasks_per_node = utils.get_num_gpus(test) + if not test.num_gpus_per_node: + test.num_gpus_per_node = utils.get_num_gpus_per_node(test) + test.num_tasks_per_node = test.num_gpus_per_node + + elif not test.num_gpus_per_node: + test.num_gpus_per_node = test.num_tasks_per_node if not test.num_cpus_per_task: test.num_cpus_per_task = int(test.current_partition.processor.num_cpus / test.num_tasks_per_node) diff --git a/eessi/reframe/eessi_utils/utils.py b/eessi/reframe/eessi_utils/utils.py index 7d02b388..400b36d8 100644 --- a/eessi/reframe/eessi_utils/utils.py +++ b/eessi/reframe/eessi_utils/utils.py @@ -9,8 +9,11 @@ def _get_gpu_list(test: rfm.RegressionTest): return [dev.num_devices for dev in test.current_partition.devices if dev.device_type == GPU_DEV_NAME] -def get_num_gpus(test: rfm.RegressionTest) -> int: - '''Returns the number of GPUs for the current partition''' +def get_num_gpus_per_node(test: rfm.RegressionTest) -> int: + ''' + Returns the number of GPUs per node for the current partition, + taken from 'num_devices' of device GPU_DEV_NAME in the 'devices' attribute of the current partition + ''' gpu_list = _get_gpu_list(test) # If multiple devices are called 'GPU' in the current partition, # we don't know for which to return the device count... From 65fcb42f30e5b1beb32043b93f705ddc858ab71c Mon Sep 17 00:00:00 2001 From: Samuel Moors Date: Thu, 23 Feb 2023 17:09:26 +0100 Subject: [PATCH 24/29] update readme --- README.md | 18 ++++++++++++++---- 1 file changed, 14 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index 0a8f9389..95dfc675 100644 --- a/README.md +++ b/README.md @@ -47,13 +47,22 @@ PYTHONPATH=$PYTHONPATH:$EBROOTREFRAME:$eessihome reframe \ 'features': ['cpu', 'gpu'], ``` -- setting the number of GPUS per node for a partition: +- setting the number of GPUS per node for a partition: ``` - 'access': ['-p --gpus-per-node='], + 'access': ['-p '], 'devices': [ {'type': 'gpu', 'num_devices': } ], ``` +- requesting GPUs per node for a partition: + ``` + 'resources': [ + { + 'name': '_rfm_gpu', + 'options': ['--gpus-per-node={num_gpus_per_node}'], + } + ], + ``` ## Changing the default test behavior on the cmd line @@ -63,9 +72,10 @@ PYTHONPATH=$PYTHONPATH:$EBROOTREFRAME:$eessihome reframe \ - specifying systems:partitions - `--setvar valid_systems=` -- overriding tasks, cpus - - `--setvar num_tasks_per_node=` and/or +- overriding tasks, cpus, gpus + - `--setvar num_tasks_per_node=` - `--setvar num_cpus_per_task=` + - `--setvar num_gpus_per_node=` - setting additional environment variables - `--setvar env_vars=:` From 0d2554b44b08260988432c6dfd29626cd1e3a423 Mon Sep 17 00:00:00 2001 From: Samuel Moors Date: Sat, 25 Feb 2023 11:21:08 +0100 Subject: [PATCH 25/29] limit gpus per node to the maximum available --- eessi/reframe/eessi_utils/hooks.py | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/eessi/reframe/eessi_utils/hooks.py b/eessi/reframe/eessi_utils/hooks.py index 7bcd06de..a7dc526a 100644 --- a/eessi/reframe/eessi_utils/hooks.py +++ b/eessi/reframe/eessi_utils/hooks.py @@ -46,19 +46,25 @@ def assign_one_task_per_cpu(test: rfm.RegressionTest, num_nodes: int) -> rfm.Reg def assign_one_task_per_gpu(test: rfm.RegressionTest, num_nodes: int) -> rfm.RegressionTest: ''' Sets num_tasks_per_node to the number of gpus, - and num_cpus_per_task to the number of CPUs available per GPU in this partition, + and num_cpus_per_task to the number of CPUs per node available per GPU in this partition, unless specified with --setvar num_tasks_per_node= and/or --setvar num_cpus_per_task= + Also sets num_gpus_per_node unless specified with --setvar num_gpus_per_node=: + - if num_tasks_per_node is not set, set num_gpus_per_node equal to nb of GPUs per node available in this partition + - if num_tasks_per_node is set, set num_gpus_per_node equal to either num_tasks_per_node or nb of GPUs per node + available in this partition (whatever is smallest). ''' if test.current_partition.processor.num_cpus is None: raise AttributeError(processor_info_missing) + max_gpus_per_node = utils.get_num_gpus_per_node(test) + if not test.num_tasks_per_node: if not test.num_gpus_per_node: - test.num_gpus_per_node = utils.get_num_gpus_per_node(test) + test.num_gpus_per_node = max_gpus_per_node test.num_tasks_per_node = test.num_gpus_per_node elif not test.num_gpus_per_node: - test.num_gpus_per_node = test.num_tasks_per_node + test.num_gpus_per_node = min(test.num_tasks_per_node, max_gpus_per_node) if not test.num_cpus_per_task: test.num_cpus_per_task = int(test.current_partition.processor.num_cpus / test.num_tasks_per_node) From 3835a8730e21e0fa5a9c2c5c3651ab339a060687 Mon Sep 17 00:00:00 2001 From: Samuel Moors Date: Tue, 28 Feb 2023 12:17:39 +0100 Subject: [PATCH 26/29] scale number of cpus per node with number of GPUs requested --- eessi/reframe/eessi_utils/hooks.py | 36 ++++++++++++++++++------------ 1 file changed, 22 insertions(+), 14 deletions(-) diff --git a/eessi/reframe/eessi_utils/hooks.py b/eessi/reframe/eessi_utils/hooks.py index a7dc526a..f8788d36 100644 --- a/eessi/reframe/eessi_utils/hooks.py +++ b/eessi/reframe/eessi_utils/hooks.py @@ -45,31 +45,39 @@ def assign_one_task_per_cpu(test: rfm.RegressionTest, num_nodes: int) -> rfm.Reg def assign_one_task_per_gpu(test: rfm.RegressionTest, num_nodes: int) -> rfm.RegressionTest: ''' - Sets num_tasks_per_node to the number of gpus, - and num_cpus_per_task to the number of CPUs per node available per GPU in this partition, - unless specified with --setvar num_tasks_per_node= and/or --setvar num_cpus_per_task= - Also sets num_gpus_per_node unless specified with --setvar num_gpus_per_node=: - - if num_tasks_per_node is not set, set num_gpus_per_node equal to nb of GPUs per node available in this partition + Sets num_tasks_per_node, num_cpus_per_task, and num_gpus_per_node, + unless specified with + --setvar num_tasks_per_node= and/or + --setvar num_cpus_per_task= and/or + --setvar num_gpus_per_node= + - default num_gpus_per_node = total nb of GPUs per node available in this partition + - default num_tasks_per_node = num_gpus_per_node + - default num_cpus_per_task = total nb of CPUs per GPU available in this partition, divided by num_tasks_per_node - if num_tasks_per_node is set, set num_gpus_per_node equal to either num_tasks_per_node or nb of GPUs per node available in this partition (whatever is smallest). ''' - if test.current_partition.processor.num_cpus is None: + num_cpus = test.current_partition.processor.num_cpus + if num_cpus is None: raise AttributeError(processor_info_missing) + num_tasks_per_node = test.num_tasks_per_node + num_gpus_per_node = test.num_gpus_per_node max_gpus_per_node = utils.get_num_gpus_per_node(test) - if not test.num_tasks_per_node: - if not test.num_gpus_per_node: - test.num_gpus_per_node = max_gpus_per_node - test.num_tasks_per_node = test.num_gpus_per_node + if not num_tasks_per_node: + if not num_gpus_per_node: + num_gpus_per_node = max_gpus_per_node + num_tasks_per_node = num_gpus_per_node - elif not test.num_gpus_per_node: - test.num_gpus_per_node = min(test.num_tasks_per_node, max_gpus_per_node) + elif not num_gpus_per_node: + num_gpus_per_node = min(num_tasks_per_node, max_gpus_per_node) if not test.num_cpus_per_task: - test.num_cpus_per_task = int(test.current_partition.processor.num_cpus / test.num_tasks_per_node) + test.num_cpus_per_task = int((num_cpus * num_gpus_per_node) / (num_tasks_per_node * max_gpus_per_node)) - test.num_tasks = num_nodes * test.num_tasks_per_node + test.num_gpus_per_node = num_gpus_per_node + test.num_tasks_per_node = num_tasks_per_node + test.num_tasks = num_nodes * num_tasks_per_node def auto_assign_num_tasks_MPI(test: rfm.RegressionTest, num_nodes: int) -> rfm.RegressionTest: From 01378f01feb50ef0819d407030d937fcade7f49b Mon Sep 17 00:00:00 2001 From: Samuel Moors Date: Fri, 3 Mar 2023 15:14:28 +0100 Subject: [PATCH 27/29] properly scale num_tasks_per_node and num_cpus_per_task for cpu jobs --- .../applications/gromacs_check.py | 12 ++-- eessi/reframe/eessi_utils/hooks.py | 60 ++++++++++++------- 2 files changed, 44 insertions(+), 28 deletions(-) diff --git a/eessi/reframe/eessi-checks/applications/gromacs_check.py b/eessi/reframe/eessi-checks/applications/gromacs_check.py index 2a31c4a6..3dee5881 100644 --- a/eessi/reframe/eessi-checks/applications/gromacs_check.py +++ b/eessi/reframe/eessi-checks/applications/gromacs_check.py @@ -82,15 +82,13 @@ def set_test_purpose(self): if self.benchmark_info[0] == 'HECBioSim/hEGFRDimer': self.tags.add('CI') - # Assign num_tasks, num_tasks_per_node and num_cpus_per_task automatically - # based on current partition's num_cpus and gpus - # Only when running nb_impl on GPU do we want one task per GPU + # Assign default values for num_tasks, num_tasks_per_node, num_cpus_per_task, and num_gpus_per_node, + # based on current partition's num_cpus and gpus + # when running nb_impl on CPU, we request one task per CPU + # when running nb_impl on GPU, we request one task per GPU @run_after('setup') def set_num_tasks(self): - if self.nb_impl == 'gpu': - hooks.assign_one_task_per_gpu(test=self, num_nodes=self.num_nodes) - else: - hooks.assign_one_task_per_cpu(test=self, num_nodes=self.num_nodes) + hooks.assign_one_task_per_feature(test=self, feature=self.nb_impl) @run_after('setup') def set_omp_num_threads(self): diff --git a/eessi/reframe/eessi_utils/hooks.py b/eessi/reframe/eessi_utils/hooks.py index f8788d36..302e2f0b 100644 --- a/eessi/reframe/eessi_utils/hooks.py +++ b/eessi/reframe/eessi_utils/hooks.py @@ -11,7 +11,7 @@ def skip_cpu_test_on_gpu_nodes(test: rfm.RegressionTest): '''Skip test if GPUs are present, but no CUDA is required''' - skip = (utils.is_gpu_present(test) and not utils.is_cuda_required(test)) + skip = (utils.is_gpu_cresent(test) and not utils.is_cuda_required(test)) if skip: test.skip_if(True, f"GPU is present on this partition ({test.current_partition.name}), skipping CPU-based test") @@ -27,24 +27,43 @@ def skip_gpu_test_on_cpu_nodes(test: rfm.RegressionTest): ) -def assign_one_task_per_cpu(test: rfm.RegressionTest, num_nodes: int) -> rfm.RegressionTest: - ''' +def assign_one_task_per_feature(test: rfm.RegressionTest, feature) -> rfm.RegressionTest: + """assign on task per feature ('gpu' or 'cpu')""" + test.max_cpus_per_node = test.current_partition.processor.num_cpus + if test.max_cpus_per_node is None: + raise AttributeError(processor_info_missing) + + if feature == 'gpu': + assign_one_task_per_gpu(test) + else: + assign_one_task_per_cpu(test) + + +def assign_one_task_per_cpu(test: rfm.RegressionTest) -> rfm.RegressionTest: + """ Sets num_tasks_per_node and num_cpus_per_task such that it will run one task per core, unless specified with --setvar num_tasks_per_node= and/or --setvar num_cpus_per_task= - ''' - if not test.num_tasks_per_node: - if test.current_partition.processor.num_cpus is None: - raise AttributeError(processor_info_missing) - test.num_tasks_per_node = test.current_partition.processor.num_cpus + """ + max_cpus_per_node = test.max_cpus_per_node + num_tasks_per_node = test.num_tasks_per_node + num_cpus_per_task = test.num_cpus_per_task - if not test.num_cpus_per_task: - test.num_cpus_per_task = 1 + if not num_tasks_per_node: + if not num_cpus_per_task: + num_tasks_per_node = max_cpus_per_node + else: + num_tasks_per_node = int(max_cpus_per_node / num_cpus_per_task) - test.num_tasks = num_nodes * test.num_tasks_per_node + if not num_cpus_per_task: + num_cpus_per_task = int(max_cpus_per_node / num_tasks_per_node) + test.num_tasks_per_node = num_tasks_per_node + test.num_tasks = test.num_nodes * test.num_tasks_per_node + test.num_cpus_per_task = num_cpus_per_task -def assign_one_task_per_gpu(test: rfm.RegressionTest, num_nodes: int) -> rfm.RegressionTest: - ''' + +def assign_one_task_per_gpu(test: rfm.RegressionTest) -> rfm.RegressionTest: + """ Sets num_tasks_per_node, num_cpus_per_task, and num_gpus_per_node, unless specified with --setvar num_tasks_per_node= and/or @@ -55,14 +74,11 @@ def assign_one_task_per_gpu(test: rfm.RegressionTest, num_nodes: int) -> rfm.Reg - default num_cpus_per_task = total nb of CPUs per GPU available in this partition, divided by num_tasks_per_node - if num_tasks_per_node is set, set num_gpus_per_node equal to either num_tasks_per_node or nb of GPUs per node available in this partition (whatever is smallest). - ''' - num_cpus = test.current_partition.processor.num_cpus - if num_cpus is None: - raise AttributeError(processor_info_missing) - + """ + max_gpus_per_node = utils.get_num_gpus_per_node(test) + max_cpus_per_node = test.max_cpus_per_node num_tasks_per_node = test.num_tasks_per_node num_gpus_per_node = test.num_gpus_per_node - max_gpus_per_node = utils.get_num_gpus_per_node(test) if not num_tasks_per_node: if not num_gpus_per_node: @@ -73,11 +89,13 @@ def assign_one_task_per_gpu(test: rfm.RegressionTest, num_nodes: int) -> rfm.Reg num_gpus_per_node = min(num_tasks_per_node, max_gpus_per_node) if not test.num_cpus_per_task: - test.num_cpus_per_task = int((num_cpus * num_gpus_per_node) / (num_tasks_per_node * max_gpus_per_node)) + test.num_cpus_per_task = int( + (max_cpus_per_node * num_gpus_per_node) / (num_tasks_per_node * max_gpus_per_node) + ) test.num_gpus_per_node = num_gpus_per_node test.num_tasks_per_node = num_tasks_per_node - test.num_tasks = num_nodes * num_tasks_per_node + test.num_tasks = test.num_nodes * num_tasks_per_node def auto_assign_num_tasks_MPI(test: rfm.RegressionTest, num_nodes: int) -> rfm.RegressionTest: From fa889ecc8fb54df10a453b3a382356d64c0f777d Mon Sep 17 00:00:00 2001 From: Samuel Moors Date: Sat, 4 Mar 2023 11:10:23 +0100 Subject: [PATCH 28/29] fix typo --- eessi/reframe/eessi_utils/hooks.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/eessi/reframe/eessi_utils/hooks.py b/eessi/reframe/eessi_utils/hooks.py index 302e2f0b..bb68c67d 100644 --- a/eessi/reframe/eessi_utils/hooks.py +++ b/eessi/reframe/eessi_utils/hooks.py @@ -11,7 +11,7 @@ def skip_cpu_test_on_gpu_nodes(test: rfm.RegressionTest): '''Skip test if GPUs are present, but no CUDA is required''' - skip = (utils.is_gpu_cresent(test) and not utils.is_cuda_required(test)) + skip = (utils.is_gpu_present(test) and not utils.is_cuda_required(test)) if skip: test.skip_if(True, f"GPU is present on this partition ({test.current_partition.name}), skipping CPU-based test") From 0972cc12af37cc17fde1cd35b81add00d48a2807 Mon Sep 17 00:00:00 2001 From: Samuel Moors Date: Sat, 4 Mar 2023 11:14:28 +0100 Subject: [PATCH 29/29] make resource assignment more future proof against new features --- eessi/reframe/eessi_utils/hooks.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/eessi/reframe/eessi_utils/hooks.py b/eessi/reframe/eessi_utils/hooks.py index bb68c67d..7f39082c 100644 --- a/eessi/reframe/eessi_utils/hooks.py +++ b/eessi/reframe/eessi_utils/hooks.py @@ -28,15 +28,17 @@ def skip_gpu_test_on_cpu_nodes(test: rfm.RegressionTest): def assign_one_task_per_feature(test: rfm.RegressionTest, feature) -> rfm.RegressionTest: - """assign on task per feature ('gpu' or 'cpu')""" + """assign one task per feature ('gpu' or 'cpu')""" test.max_cpus_per_node = test.current_partition.processor.num_cpus if test.max_cpus_per_node is None: raise AttributeError(processor_info_missing) if feature == 'gpu': assign_one_task_per_gpu(test) - else: + elif feature == 'cpu': assign_one_task_per_cpu(test) + else: + raise ValueError(f'Feature {feature} is currently not supported') def assign_one_task_per_cpu(test: rfm.RegressionTest) -> rfm.RegressionTest: