Merge pull request #116 from smoors/request_gpus
support always requesting GPUs on partitions that require it
casparvl authored Feb 29, 2024
2 parents c8e917c + 82891ba commit ba35eb2
Showing 4 changed files with 109 additions and 81 deletions.
2 changes: 2 additions & 0 deletions eessi/testsuite/constants.py
@@ -11,6 +11,7 @@
INTEL = 'INTEL'
NODE = 'NODE'
NVIDIA = 'NVIDIA'
ALWAYS_REQUEST_GPUS = 'ALWAYS_REQUEST_GPUS'

DEVICE_TYPES = {
CPU: 'cpu',
@@ -31,6 +32,7 @@
FEATURES = {
CPU: 'cpu',
GPU: 'gpu',
ALWAYS_REQUEST_GPUS: 'always_request_gpus',
}

GPU_VENDORS = {
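
The new FEATURES entry is matched against the feature list of a partition in the ReFrame site configuration. A minimal sketch of such a partition follows; the partition name, access flag, and device count are illustrative assumptions, not part of this PR:

    # Hypothetical ReFrame site-configuration snippet: a GPU partition that
    # opts in to the new policy by declaring the 'always_request_gpus' feature.
    'partitions': [
        {
            'name': 'gpu',                 # illustrative partition name
            'scheduler': 'slurm',
            'launcher': 'mpirun',
            'access': ['-p gpu'],          # assumed scheduler access flag
            'environs': ['default'],
            'features': ['gpu', 'always_request_gpus'],  # FEATURES[GPU] + FEATURES[ALWAYS_REQUEST_GPUS]
            'devices': [{'type': 'gpu', 'num_devices': 4}],
        },
    ],
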
78 changes: 46 additions & 32 deletions eessi/testsuite/hooks.py
@@ -7,12 +7,12 @@

import reframe as rfm

from eessi.testsuite.constants import * # noqa
from eessi.testsuite.constants import *
from eessi.testsuite.utils import (get_max_avail_gpus_per_node, is_cuda_required_module, log,
check_proc_attribute_defined)


def assign_default_num_cpus_per_node(test: rfm.RegressionTest):
def _assign_default_num_cpus_per_node(test: rfm.RegressionTest):
"""
Check if the default number of cpus per node is already defined in the test
(e.g. by earlier hooks like set_tag_scale).
@@ -34,6 +34,27 @@ def assign_default_num_cpus_per_node(test: rfm.RegressionTest):
log(f'default_num_cpus_per_node set to {test.default_num_cpus_per_node}')


def _assign_default_num_gpus_per_node(test: rfm.RegressionTest):
"""
Check if the default number of gpus per node is already defined in the test
(e.g. by earlier hooks like set_tag_scale).
If so, check if it doesn't exceed the maximum available.
If not, set default_num_gpus_per_node based on the maximum available gpus and node_part
"""

test.max_avail_gpus_per_node = get_max_avail_gpus_per_node(test)
if test.default_num_gpus_per_node:
# may skip if not enough GPUs
test.skip_if(
test.default_num_gpus_per_node > test.max_avail_gpus_per_node,
f'Number of GPUs per node in selected scale ({test.default_num_gpus_per_node}) is higher than max available'
f' ({test.max_avail_gpus_per_node}) in current partition ({test.current_partition.name}).'
)
else:
# no default set yet, so setting one
test.default_num_gpus_per_node = math.ceil(test.max_avail_gpus_per_node / test.node_part)
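# For example (illustrative numbers): with max_avail_gpus_per_node = 4 and
# node_part = 2 (half a node), this yields default_num_gpus_per_node = math.ceil(4 / 2) = 2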


def assign_tasks_per_compute_unit(test: rfm.RegressionTest, compute_unit: str, num_per: int = 1):
"""
Assign one task per compute unit (COMPUTE_UNIT[CPU], COMPUTE_UNIT[CPU_SOCKET] or COMPUTE_UNIT[GPU]).
@@ -69,15 +90,18 @@ def assign_tasks_per_compute_unit(test: rfm.RegressionTest, compute_unit: str, num_per: int = 1):

# Check if either node_part, or default_num_cpus_per_node and default_num_gpus_per_node are set correctly
if not (
type(test.node_part) == int or
(type(test.default_num_cpus_per_node) == int and type(test.default_num_gpus_per_node) == int)
type(test.node_part) == int
or (type(test.default_num_cpus_per_node) == int and type(test.default_num_gpus_per_node) == int)
):
raise ValueError(
f'Either node_part ({test.node_part}), or default_num_cpus_per_node ({test.default_num_cpus_per_node}) and'
f' default num_gpus_per_node ({test.default_num_gpus_per_node}) must be defined and have integer values.'
)

assign_default_num_cpus_per_node(test)
_assign_default_num_cpus_per_node(test)

if FEATURES[GPU] in test.current_partition.features:
_assign_default_num_gpus_per_node(test)

if compute_unit == COMPUTE_UNIT[GPU]:
_assign_one_task_per_gpu(test)
@@ -90,6 +114,8 @@ def assign_tasks_per_compute_unit(test: rfm.RegressionTest, compute_unit: str, num_per: int = 1):
else:
raise ValueError(f'compute unit {compute_unit} is currently not supported')

_check_always_request_gpus(test)


def _assign_num_tasks_per_node(test: rfm.RegressionTest, num_per: int = 1):
"""
@@ -112,7 +138,6 @@ def _assign_num_tasks_per_node(test: rfm.RegressionTest, num_per: int = 1):
test.num_tasks_per_node = num_per
test.num_cpus_per_task = int(test.default_num_cpus_per_node / test.num_tasks_per_node)


# num_tasks_per_node is not set, but num_cpus_per_task is
elif not test.num_tasks_per_node:
test.num_tasks_per_node = int(test.default_num_cpus_per_node / test.num_cpus_per_task)
@@ -222,11 +247,6 @@ def _assign_one_task_per_gpu(test: rfm.RegressionTest):
--setvar num_cpus_per_task=<y> and/or
--setvar num_gpus_per_node=<z>.
Variables:
- max_avail_gpus_per_node: maximum available number of GPUs per node
- default_num_gpus_per_node: default number of GPUs per node as defined in the test
(e.g. by earlier hooks like set_tag_scale)
Default resources requested:
- num_gpus_per_node = default_num_gpus_per_node
- num_tasks_per_node = num_gpus_per_node
@@ -235,22 +255,6 @@ def _assign_one_task_per_gpu(test: rfm.RegressionTest):
If num_tasks_per_node is set, set num_gpus_per_node equal to either num_tasks_per_node or default_num_gpus_per_node
(whichever is smallest), unless num_gpus_per_node is also set.
"""
max_avail_gpus_per_node = get_max_avail_gpus_per_node(test)

# Check if the default number of gpus per node is already defined in the test
# (e.g. by earlier hooks like set_tag_scale).
# If so, check if it doesn't exceed the maximum available.
# If not, set default_num_gpus_per_node based on the maximum available gpus and node_part
if test.default_num_gpus_per_node:
# may skip if not enough GPUs
test.skip_if(
test.default_num_gpus_per_node > max_avail_gpus_per_node,
f'Requested GPUs per node ({test.default_num_gpus_per_node}) is higher than max available'
f' ({max_avail_gpus_per_node}) in current partition ({test.current_partition.name}).'
)
else:
# no default set yet, so setting one
test.default_num_gpus_per_node = math.ceil(max_avail_gpus_per_node / test.node_part)

# neither num_tasks_per_node nor num_gpus_per_node are set
if not test.num_tasks_per_node and not test.num_gpus_per_node:
@@ -273,7 +277,7 @@ def _assign_one_task_per_gpu(test: rfm.RegressionTest):
# limit num_cpus_per_task to the maximum available cpus per gpu
test.num_cpus_per_task = min(
int(test.default_num_cpus_per_node / test.num_tasks_per_node),
int(test.max_avail_cpus_per_node / max_avail_gpus_per_node)
int(test.max_avail_cpus_per_node / test.max_avail_gpus_per_node)
)

test.num_tasks = test.num_nodes * test.num_tasks_per_node
@@ -303,8 +307,8 @@ def _set_or_append_valid_systems(test: rfm.RegressionTest, valid_systems: str):
return

# test.valid_systems wasn't set yet, so set it
if len(test.valid_systems) == 0:
# test.valid_systems is empty, meaning all tests are filtered out. This hook shouldn't change that
if len(test.valid_systems) == 0 or test.valid_systems == [INVALID_SYSTEM]:
# test.valid_systems is empty or invalid, meaning all tests are filtered out. This hook shouldn't change that
return
# test.valid_systems still at default value, so overwrite
elif len(test.valid_systems) == 1 and test.valid_systems[0] == '*':
@@ -314,8 +318,8 @@ def _set_or_append_valid_systems(test: rfm.RegressionTest, valid_systems: str):
test.valid_systems[0] = f'{test.valid_systems[0]} {valid_systems}'
else:
warn_msg = f"valid_systems has multiple ({len(test.valid_systems)}) items,"
warn_msg += f" which is not supported by this hook."
warn_msg += f" Make sure to handle filtering yourself."
warn_msg += " which is not supported by this hook."
warn_msg += " Make sure to handle filtering yourself."
warnings.warn(warn_msg)
return

@@ -333,6 +337,7 @@ def filter_supported_scales(test: rfm.RegressionTest):

log(f'valid_systems set to {test.valid_systems}')


def filter_valid_systems_by_device_type(test: rfm.RegressionTest, required_device_type: str):
"""
Filter valid_systems by required device type and by whether the module supports CUDA,
@@ -459,3 +464,12 @@ def set_compact_thread_binding(test: rfm.RegressionTest):
log(f'Set environment variable OMP_PLACES to {test.env_vars["OMP_PLACES"]}')
log(f'Set environment variable OMP_PROC_BIND to {test.env_vars["OMP_PROC_BIND"]}')
log(f'Set environment variable KMP_AFFINITY to {test.env_vars["KMP_AFFINITY"]}')


def _check_always_request_gpus(test: rfm.RegressionTest):
"""
Make sure we always request enough GPUs if required for the current GPU partition (cluster-specific policy)
"""
if FEATURES[ALWAYS_REQUEST_GPUS] in test.current_partition.features and not test.num_gpus_per_node:
test.num_gpus_per_node = test.default_num_gpus_per_node
log(f'num_gpus_per_node set to {test.num_gpus_per_node} for partition {test.current_partition.name}')
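
Taken together, a CPU-oriented test that calls assign_tasks_per_compute_unit() now also requests GPUs on partitions carrying the new feature. A minimal sketch of such a test; the class name, executable, and import paths are assumptions for illustration, not code from this PR:

    import reframe as rfm
    from reframe.core.builtins import parameter, run_after

    from eessi.testsuite import hooks
    from eessi.testsuite.constants import COMPUTE_UNIT, NODE, SCALES

    @rfm.simple_test
    class ExampleCpuTest(rfm.RunOnlyRegressionTest):
        # Hypothetical test: nothing here asks for GPUs explicitly.
        scale = parameter(SCALES.keys())
        valid_systems = ['*']
        valid_prog_environs = ['default']
        executable = 'hostname'

        @run_after('init')
        def set_scale(self):
            hooks.set_tag_scale(self)  # sets node_part and the default_num_*_per_node values

        @run_after('setup')
        def assign_resources(self):
            # One task per node; on a partition whose features include
            # 'always_request_gpus', the hook chain ends in
            # _check_always_request_gpus(), which sets num_gpus_per_node
            # to default_num_gpus_per_node.
            hooks.assign_tasks_per_compute_unit(self, COMPUTE_UNIT[NODE])
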
104 changes: 55 additions & 49 deletions eessi/testsuite/tests/apps/osu.py
@@ -51,40 +51,50 @@ class EESSI_OSU_Micro_Benchmarks_pt2pt(osu_benchmark):
# unset num_tasks_per_node from the hpctestlib.
num_tasks_per_node = None

@run_after('init')
def filter_scales_2gpus(self):
"""Filter out scales with < 2 GPUs if running on GPUs"""
if (
self.device_type == DEVICE_TYPES[GPU]
and SCALES[self.scale]['num_nodes'] == 1
and SCALES[self.scale].get('num_gpus_per_node', 2) < 2
):
self.valid_systems = [INVALID_SYSTEM]
log(f'valid_systems set to {self.valid_systems} for scale {self.scale} and device_type {self.device_type}')

@run_after('init')
def filter_benchmark_pt2pt(self):
""" Filter out all non-mpi.pt2pt benchmarks """
if not self.benchmark_info[0].startswith('mpi.pt2pt'):
self.valid_systems = [INVALID_SYSTEM]

@run_after('init')
def run_after_init(self):
"""hooks to run after init phase"""
# Note: device_buffers variable is inherited from the hpctestlib class and adds options to the launcher
# commands (before setup) if not equal to 'cpu'. We set it to 'cpu' initially and change it later in this hook depending on the test.
self.device_buffers = 'cpu'

# Filter on which scales are supported by the partitions defined in the ReFrame configuration
hooks.filter_supported_scales(self)

hooks.filter_valid_systems_by_device_type(self, required_device_type=self.device_type)
is_cuda_module = utils.is_cuda_required_module(self.module_name)
# This part of the hook is meant to be for the OSU cpu tests. This is required since the non CUDA module should
# be able to run in the GPU partition as well. This is specific for this test and not covered by the function
# above.
if is_cuda_module and self.device_type == DEVICE_TYPES[GPU]:
# Sets to cuda as device buffer only if the module is compiled with CUDA.
self.device_buffers = 'cuda'

# If the device_type is CPU then device buffer should always be CPU.
if self.device_type == DEVICE_TYPES[CPU]:
self.device_buffers = 'cpu'

# This part of the code removes the collective communication calls out of the run list since this test is only
# meant for pt2pt.
if not self.benchmark_info[0].startswith('mpi.pt2pt'):
self.valid_systems = []
hooks.set_modules(self)

@run_after('setup')
def adjust_executable_opts(self):
"""The option "D D" is only meant for Devices if and not for CPU tests. This option is added by hpctestlib to
all pt2pt tests which is not required."""
if(self.device_type == DEVICE_TYPES[CPU]):
self.executable_opts = [ele for ele in self.executable_opts if ele != 'D']
# Set scales as tags
hooks.set_tag_scale(self)

@run_after('init')
def set_device_buffers(self):
"""
device_buffers is inherited from the hpctestlib class and adds options to the launcher
commands in a @run_before('setup') hook if not equal to 'cpu'.
Therefore, we must set device_buffers *before* the @run_before('setup') hooks.
"""
if self.device_type == DEVICE_TYPES[GPU]:
self.device_buffers = 'cuda'

else:
# If the device_type is CPU then device_buffers should always be CPU.
self.device_buffers = 'cpu'

@run_after('init')
def set_tag_ci(self):
@@ -108,44 +118,40 @@ def set_mem(self):
requirement."""
self.extra_resources = {'memory': {'size': '12GB'}}

@run_after('init')
def set_num_tasks(self):
""" Setting scales as tags. """
hooks.set_tag_scale(self)
@run_after('setup')
def adjust_executable_opts(self):
"""The option "D D" is only meant for device runs, not for CPU tests.
This option is added by hpctestlib in a @run_before('setup') hook to all pt2pt tests, which is not required here.
Therefore we must override it *after* the 'setup' phase.
"""
if self.device_type == DEVICE_TYPES[CPU]:
self.executable_opts = [ele for ele in self.executable_opts if ele != 'D']


@run_after('setup')
def set_num_tasks_per_node(self):
""" Setting number of tasks per node and cpus per task in this function. This function sets num_cpus_per_task
for 1 node and 2 node options where the request is for full nodes."""
if(SCALES.get(self.scale).get('num_nodes') == 1):
if SCALES.get(self.scale).get('num_nodes') == 1:
hooks.assign_tasks_per_compute_unit(self, COMPUTE_UNIT[NODE], 2)
else:
hooks.assign_tasks_per_compute_unit(self, COMPUTE_UNIT[NODE])

@run_after('setup')
def set_num_gpus_per_node(self):
"""
This test does not require gpus and is for host to host within GPU nodes. But some systems do require a GPU
allocation for to perform any activity in the GPU nodes.
Set number of GPUs per node for GPU-to-GPU tests
"""
if(FEATURES[GPU] in self.current_partition.features and not utils.is_cuda_required_module(self.module_name)):
max_avail_gpus_per_node = utils.get_max_avail_gpus_per_node(self)
# Here for the 2_node test we assign max_avail_gpus_per_node but some systems cannot allocate 1_cpn_2_nodes
# for GPUs but need all gpus allocated within the 2 nodes for this work which. The test may fail under such
# conditions for the scale 1_cpn_2_nodes because it is simply not allowed.
self.num_gpus_per_node = self.default_num_gpus_per_node or max_avail_gpus_per_node
elif(FEATURES[GPU] in self.current_partition.features and utils.is_cuda_required_module(self.module_name)):
max_avail_gpus_per_node = utils.get_max_avail_gpus_per_node(self)
if(SCALES.get(self.scale).get('num_nodes') == 1):
# Skip the single node test if there is only 1 device in the node.
if(max_avail_gpus_per_node == 1):
self.skip(msg="There is only 1 device within the node. Skipping tests involving only 1 node.")
else:
self.num_gpus_per_node = 2
else:
# Note these settings are for 1_cpn_2_nodes. In that case we want to test for only 1 GPU per node since
# we have not requested for full nodes.
self.num_gpus_per_node = self.default_num_gpus_per_node or max_avail_gpus_per_node
if self.device_type == DEVICE_TYPES[GPU]:
# Skip single-node tests with less than 2 GPU devices in the node
self.skip_if(
SCALES[self.scale]['num_nodes'] == 1 and self.default_num_gpus_per_node < 2,
"There are < 2 GPU devices present in the node."
f" Skipping tests with device_type={DEVICE_TYPES[GPU]} involving < 2 GPUs and 1 node."
)
if not self.num_gpus_per_node:
self.num_gpus_per_node = self.default_num_gpus_per_node
log(f'num_gpus_per_node set to {self.num_gpus_per_node} for partition {self.current_partition.name}')


@rfm.simple_test
6 changes: 6 additions & 0 deletions setup.cfg
@@ -20,3 +20,9 @@ namespace_packages = eessi

[options.packages.find]
include = eessi*

[flake8]
max-line-length = 120
# ignore star imports (F403, F405)
# ignore obsolete warning (W503)
ignore = F403, F405, W503
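
With this configuration in place, running flake8 from the repository root (e.g. flake8 eessi/) picks up the 120-character line limit and the listed ignores, which is why the '# noqa' comment on the star import in hooks.py could be dropped.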
