Set mem hook #1

Merged
8 commits merged on May 23, 2024
8 changes: 7 additions & 1 deletion config/github_actions.py
@@ -25,7 +25,13 @@
'options': ['--mem={size}'],
}
],
'max_jobs': 1
'max_jobs': 1,
'extras': {
# Make sure to round down, otherwise a job might ask for more mem than is available
# per node
# This is a fictional amount; GitHub Actions probably has less, but this config only does --dry-run anyway
'mem_per_node': 30 # in GiB
},
}
]
}
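For context: the 'memory' resource shown above and the new 'mem_per_node' extra are used as a pair by the req_memory_per_node hook introduced in this PR: the hook reads 'mem_per_node' to compute a sensible request and fills the resource's {size} placeholder with the result. A minimal illustrative partition snippet (the partition name and sizes are made up, not taken from any of these configs) could look like:

```python
# Illustrative ReFrame partition snippet, not taken from this PR; names and sizes are examples only
{
    'name': 'example_cpu',
    'scheduler': 'slurm',
    'resources': [
        {
            # The req_memory_per_node hook fills in {size}, e.g. --mem=42950M
            'name': 'memory',
            'options': ['--mem={size}'],
        }
    ],
    'extras': {
        # Total usable memory per node in GiB; round down so a job never asks
        # for more memory than a node actually has
        'mem_per_node': 64,
    },
}
```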
5 changes: 5 additions & 0 deletions config/it4i_karolina.py
@@ -53,6 +53,11 @@
'features': [
FEATURES[CPU],
] + list(SCALES.keys()),
'extras': {
# Make sure to round down, otherwise a job might ask for more mem than is available
# per node
'mem_per_node': 219.345 # in GiB
},
'descr': 'CPU Universal Compute Nodes, see https://docs.it4i.cz/karolina/hardware-overview/'
},
# We don't have GPU budget on Karolina at this time
88 changes: 49 additions & 39 deletions config/izum_vega.py
@@ -59,47 +59,57 @@
'features': [
FEATURES[CPU],
] + list(SCALES.keys()),
'extras': {
# Make sure to round down, otherwise a job might ask for more mem than is available
# per node
'mem_per_node': 238.418 # in GiB
},
'descr': 'CPU partition Standard, see https://en-doc.vega.izum.si/architecture/'
},
{
'name': 'gpu',
'scheduler': 'slurm',
'prepare_cmds': [
'source %s' % common_eessi_init(),
# Pass job environment variables like $PATH, etc., into job steps
'export SLURM_EXPORT_ENV=ALL',
# Needed when using srun launcher
# 'export SLURM_MPI_TYPE=pmix', # WARNING: this broke the GROMACS on Vega
# Avoid https://github.com/EESSI/software-layer/issues/136
# Can be taken out once we don't care about old OpenMPI versions anymore (pre-4.1.1)
'export OMPI_MCA_pml=ucx',
],
'launcher': 'mpirun',
# Use --export=None to avoid that login environment is passed down to submitted jobs
'access': ['-p gpu', '--export=None'],
'environs': ['default'],
'max_jobs': 60,
'devices': [
{
'type': DEVICE_TYPES[GPU],
'num_devices': 4,
}
],
'resources': [
{
'name': '_rfm_gpu',
'options': ['--gpus-per-node={num_gpus_per_node}'],
},
{
'name': 'memory',
'options': ['--mem={size}'],
}
],
'features': [
FEATURES[GPU],
] + list(SCALES.keys()),
'descr': 'GPU partition, see https://en-doc.vega.izum.si/architecture/'
},
# {
# 'name': 'gpu',
# 'scheduler': 'slurm',
# 'prepare_cmds': [
# 'source %s' % common_eessi_init(),
# # Pass job environment variables like $PATH, etc., into job steps
# 'export SLURM_EXPORT_ENV=ALL',
# # Needed when using srun launcher
# # 'export SLURM_MPI_TYPE=pmix', # WARNING: this broke the GROMACS on Vega
# # Avoid https://github.com/EESSI/software-layer/issues/136
# # Can be taken out once we don't care about old OpenMPI versions anymore (pre-4.1.1)
# 'export OMPI_MCA_pml=ucx',
# ],
# 'launcher': 'mpirun',
# # Use --export=None to avoid that login environment is passed down to submitted jobs
# 'access': ['-p gpu', '--export=None'],
# 'environs': ['default'],
# 'max_jobs': 60,
# 'devices': [
# {
# 'type': DEVICE_TYPES[GPU],
# 'num_devices': 4,
# }
# ],
# 'resources': [
# {
# 'name': '_rfm_gpu',
# 'options': ['--gpus-per-node={num_gpus_per_node}'],
# },
# {
# 'name': 'memory',
# 'options': ['--mem={size}'],
# }
# ],
# 'features': [
# FEATURES[GPU],
# ] + list(SCALES.keys()),
# 'extras': {
# # Make sure to round down, otherwise a job might ask for more mem than is available
# # per node
# 'mem_per_node': 476.837 # in GiB (should be checked, it's unclear from slurm.conf)
# },
# 'descr': 'GPU partition, see https://en-doc.vega.izum.si/architecture/'
# },
]
},
],
13 changes: 13 additions & 0 deletions config/surf_snellius.py
@@ -53,6 +53,11 @@
'features': [
FEATURES[CPU],
] + list(SCALES.keys()),
'extras': {
# Make sure to round down, otherwise a job might ask for more mem than is available
# per node
'mem_per_node': 213.623 # in GiB
},
'descr': 'AMD Rome CPU partition with native EESSI stack'
},
{
@@ -72,6 +77,11 @@
'features': [
FEATURES[CPU],
] + list(SCALES.keys()),
'extras': {
# Make sure to round down, otherwise a job might ask for more mem than is available
# per node
'mem_per_node': 320.434 # in GiB
},
'descr': 'AMD Genoa CPU partition with native EESSI stack'
},

@@ -105,6 +115,9 @@
] + valid_scales_snellius_gpu,
'extras': {
GPU_VENDOR: GPU_VENDORS[NVIDIA],
# Make sure to round down, otherwise a job might ask for more mem than is available
# per node
'mem_per_node': 457.763 # in GiB
},
'descr': 'Nvidia A100 GPU partition with native EESSI stack'
},
88 changes: 87 additions & 1 deletion eessi/testsuite/hooks.py
@@ -9,7 +9,7 @@

from eessi.testsuite.constants import *
from eessi.testsuite.utils import (get_max_avail_gpus_per_node, is_cuda_required_module, log,
check_proc_attribute_defined)
check_proc_attribute_defined, check_extras_key_defined)


def _assign_default_num_cpus_per_node(test: rfm.RegressionTest):
@@ -373,6 +373,92 @@ def filter_valid_systems_by_device_type(test: rfm.RegressionTest, required_devic
log(f'valid_systems set to {test.valid_systems}')


def req_memory_per_node(test: rfm.RegressionTest, app_mem_req):
"""
This hook requests a specific amount of memory per node from the batch scheduler.
First, it computes which fraction of the node's CPUs is requested, and the corresponding (proportional)
amount of memory.
Then, the hook compares this to how much memory the application claims to need per node (app_mem_req)
and passes the maximum of these two numbers to the batch scheduler as the memory request.

Note: using this hook requires that the ReFrame configuration defines system.partition.extras['mem_per_node']
That field should be defined in GiB

Arguments:
- test: the ReFrame test to which this hook should apply
- app_mem_req: the amount of memory this application needs (per node) in GiB

Example 1:
- A system with 128 cores and 64 GiB per node.
- The test is launched on 64 cores
- The app_mem_req is 40 (GiB)
In this case, the test requests 50% of the CPUs. Thus, the proportional amount of memory is 32 GiB.
The app_mem_req is higher. Thus, 40 GiB (per node) is requested from the batch scheduler.

Example 2:
- A system with 128 cores and 128 GiB per node.
- The test is launched on 64 cores
- the app_mem_req is 40 (GiB)
In this case, the test requests 50% of the CPUs. Thus, the proportional amount of memory is 64 GiB.
This is higher than the app_mem_req. Thus, 64 GiB (per node) is requested from the batch scheduler.
"""
# Check that the systems.partitions.extras dict in the ReFrame config contains mem_per_node
check_extras_key_defined(test, 'mem_per_node')
# Skip if the current partition doesn't have sufficient memory to run the application
msg = f"Skipping test: nodes in this partition only have {test.current_partition.extras['mem_per_node']} GiB"
msg += " memory available (per node) accodring to the current ReFrame configuration,"
msg += f" but {app_mem_req} GiB is needed"
test.skip_if(test.current_partition.extras['mem_per_node'] < app_mem_req, msg)

# Compute what is higher: the requested memory, or the memory available proportional to requested CPUs
# Fraction of CPU cores requested
check_proc_attribute_defined(test, 'num_cpus')
cpu_fraction = test.num_tasks_per_node * test.num_cpus_per_task / test.current_partition.processor.num_cpus
proportional_mem = cpu_fraction * test.current_partition.extras['mem_per_node']

scheduler_name = test.current_partition.scheduler.registered_name
if scheduler_name == 'slurm' or scheduler_name == 'squeue':
# SLURM's --mem defines memory per node, see https://slurm.schedmd.com/sbatch.html
# SLURM uses megabytes and gigabytes, i.e. base-10, so the conversion factor is 1000-based, not 1024-based
# Thus, we convert from GiB (gibibytes) to MB (megabytes) (1024 * 1024 * 1024 / (1000 * 1000) = 1073.741824)
app_mem_req = math.ceil(1073.741824 * app_mem_req)
log(f"Memory requested by application: {app_mem_req} MB")
proportional_mem = math.floor(1073.741824 * proportional_mem)
log(f"Memory proportional to the core count: {proportional_mem} MB")

# Request the maximum of proportional_mem and app_mem_req from the scheduler
req_mem_per_node = max(proportional_mem, app_mem_req)

test.extra_resources = {'memory': {'size': f'{req_mem_per_node}M'}}
log(f"Requested {req_mem_per_node} MB per node from the SLURM batch scheduler")

elif scheduler_name == 'torque':
# Torque/Moab requires asking for --pmem (--mem only works for single-node jobs and thus doesn't generalize)
# See https://docs.adaptivecomputing.com/10-0-1/Torque/torque.htm#topics/torque/3-jobs/3.1.3-requestingRes.htm
# Units are MiB according to the documentation, so we simply multiply by 1024
# We immediately divide by num_tasks_per_node (before rounding), since --pmem specifies memory _per process_
app_mem_req_task = math.ceil(1024 * app_mem_req / test.num_tasks_per_node)
proportional_mem_task = math.floor(1024 * proportional_mem / test.num_tasks_per_node)

# Request the maximum of proportional_mem_task and app_mem_req_task from the scheduler
req_mem_per_task = max(proportional_mem_task, app_mem_req_task)

# We assume here that the ReFrame config defines the extra resource 'memory' as asking for --pmem
# i.e. 'options': ['--pmem={size}']
test.extra_resources = {'memory': {'size': f'{req_mem_per_task}mb'}}
log(f"Requested {req_mem_per_task} MiB per task from the torque batch scheduler")

else:
logger = rflog.getlogger()
msg = "hooks.req_memory_per_node does not support the scheduler you configured"
msg += f" ({test.current_partition.scheduler.registered_name})."
msg += " The test will run, but since it doesn't request the required amount of memory explicitely,"
msg += " it may result in an out-of-memory error."
msg += " Please expand the functionality of hooks.req_memory_per_node for your scheduler."
# Warnings will, at the default log level, be printed on stdout when executing the ReFrame command
logger.warning(msg)


def set_modules(test: rfm.RegressionTest):
"""
Skip current test if module_name is not among a list of modules,
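To make the docstring's Example 1 concrete, here is a standalone sketch (not part of the PR's code) of the arithmetic req_memory_per_node performs for Slurm, assuming 64 single-CPU tasks on a 128-core node with 64 GiB of memory and an application that claims 40 GiB:

```python
import math

# Numbers from Example 1 in the req_memory_per_node docstring
num_tasks_per_node = 64
num_cpus_per_task = 1
num_cpus = 128           # cores per node
mem_per_node = 64        # GiB, as set in the partition 'extras'
app_mem_req = 40         # GiB, claimed by the application

cpu_fraction = num_tasks_per_node * num_cpus_per_task / num_cpus  # 0.5
proportional_mem = cpu_fraction * mem_per_node                    # 32 GiB

# SLURM's --mem takes megabytes; the GiB -> MB factor is 1024**3 / 1000**2
gib_to_mb = 1073.741824
app_mem_req_mb = math.ceil(gib_to_mb * app_mem_req)               # 42950 MB
proportional_mem_mb = math.floor(gib_to_mb * proportional_mem)    # 34359 MB

# The larger of the two wins: here the application's 40 GiB request
print(f"--mem={max(proportional_mem_mb, app_mem_req_mb)}M")       # --mem=42950M
```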
5 changes: 5 additions & 0 deletions eessi/testsuite/tests/apps/QuantumESPRESSO.py
@@ -97,6 +97,11 @@ def run_after_setup(self):
else:
hooks.assign_tasks_per_compute_unit(test=self, compute_unit=COMPUTE_UNIT[CPU])

@run_after('setup')
def request_mem(self):
# Requested memory in GiB: roughly 0.9 GiB per task plus a 4 GiB base per node
memory_required = self.num_tasks_per_node * 0.9 + 4
hooks.req_memory_per_node(test=self, app_mem_req=memory_required)

@run_after('setup')
def set_omp_num_threads(self):
"""
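As a rough illustration (again not part of the PR), the formula in request_mem scales linearly with the task count; the hook then compares the result against the memory share proportional to the requested cores:

```python
# Illustration only: evaluate the request_mem formula for a few task counts
for num_tasks_per_node in (16, 64, 128):
    memory_required = num_tasks_per_node * 0.9 + 4  # GiB, same formula as in request_mem
    print(f"{num_tasks_per_node} tasks/node -> {memory_required:.1f} GiB requested")
# 16 tasks/node -> 18.4 GiB requested
# 64 tasks/node -> 61.6 GiB requested
# 128 tasks/node -> 119.2 GiB requested
```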
38 changes: 36 additions & 2 deletions eessi/testsuite/utils.py
@@ -145,7 +145,41 @@ def check_proc_attribute_defined(test: rfm.RegressionTest, attribute) -> bool:
else:
msg = (
"This test's current_partition is not set yet. "
"The function utils.proc_attribute_defined should only be called after the setup() phase of ReFrame."
"The function utils.check_proc_attribute_defined should only be called after the setup() phase of ReFrame."
"This is a programming error, please report this issue."
)
raise AttributeError(msg)


def check_extras_key_defined(test: rfm.RegressionTest, extra_key) -> bool:
"""
Checks if a specific key is defined in the 'extras' dictionary for the current partition
(i.e. if test.current_partition.extras[extra_key] is defined).
If not, raises an informative error.
Note that partition extras are defined by free text keys, so any string is (potentially) valid.

Arguments:
- test: the ReFrame regression test instance for which to check whether the key is defined in 'extras'
- extra_key: the key to look for in the 'extras' dictionary

Return:
- True (bool) if the key is defined
- Function does not return (but raises an error) if the key is undefined
"""

if test.current_partition:
if extra_key in test.current_partition.extras:
return True
else:
msg = (
f"Key '{extra_key}' missing in the 'extras' dictionary for partition '{test.current_partition.name}'."
"Please define this key for the relevant partition in the ReFrame configuration file (see "
"https://reframe-hpc.readthedocs.io/en/stable/config_reference.html#config.systems.partitions.extras)."
)
else:
msg = (
"This test's current_partition is not set yet. "
"The function utils.check_extras_key_defined should only be called after the setup() phase of ReFrame."
"This is a programming error, please report this issue."
)
raise AttributeError(msg)
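Finally, a minimal standalone sketch of the error path that check_extras_key_defined guards; it assumes the EESSI test suite (and thus eessi.testsuite.utils) is importable, and the two classes are simple stand-ins for ReFrame objects used only for this demonstration:

```python
from eessi.testsuite.utils import check_extras_key_defined

class FakePartition:
    # Stand-in for a ReFrame partition: 'mem_per_node' is deliberately missing from 'extras'
    name = 'cpu'
    extras = {}

class FakeTest:
    # Stand-in for a ReFrame test after the setup() phase
    current_partition = FakePartition()

try:
    check_extras_key_defined(FakeTest(), 'mem_per_node')
except AttributeError as err:
    # The message points the user to the 'extras' section of the ReFrame configuration
    print(err)
```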