Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Handle node allocation errors gracefully, log details, and exit on failure #264

Merged
merged 3 commits into from
Oct 15, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
49 changes: 33 additions & 16 deletions src/cloudai/systems/slurm/slurm_system.py
Original file line number Diff line number Diff line change
Expand Up @@ -467,21 +467,28 @@ def get_available_nodes_from_group(
available nodes.
"""
self.validate_partition_and_group(partition_name, group_name)

self.update_node_states()

grouped_nodes = self.group_nodes_by_state(partition_name, group_name)
allocated_nodes = self.allocate_nodes(grouped_nodes, number_of_nodes, group_name)

# Log allocation details
logging.info(
"Allocated nodes from group '{}' in partition '{}': {}".format(
group_name,
partition_name,
[node.name for node in allocated_nodes],

try:
allocated_nodes = self.allocate_nodes(grouped_nodes, number_of_nodes, group_name)

logging.info(
f"Allocated nodes from group '{group_name}' in partition '{partition_name}': "
f"{[node.name for node in allocated_nodes]}"
)
)

return allocated_nodes
return allocated_nodes

except ValueError as e:
logging.error(
f"Error occurred while allocating nodes from group '{group_name}' in partition '{partition_name}': {e}",
exc_info=True,
)

return []

def validate_partition_and_group(self, partition_name: str, group_name: str) -> None:
"""
Expand Down Expand Up @@ -542,13 +549,20 @@ def allocate_nodes(
Raises:
ValueError: If the requested number of nodes exceeds the available nodes.
"""
# Allocate nodes based on priority: idle, then completing, then allocated
allocated_nodes = []

if isinstance(number_of_nodes, str) and number_of_nodes == "max_avail":
allocated_nodes.extend(grouped_nodes[SlurmNodeState.IDLE])
allocated_nodes.extend(grouped_nodes[SlurmNodeState.COMPLETING])

if len(allocated_nodes) == 0:
raise ValueError(f"No available nodes in group '{group_name}'.")
raise ValueError(
f"CloudAI is requesting the maximum available nodes from the group '{group_name}', "
f"but no nodes are available. Please review the available nodes in the system and ensure "
f"there are sufficient resources to meet the requirements of the test scenario. Additionally, "
f"verify that the system is capable of hosting the maximum number of nodes specified in the test "
"scenario."
)

elif isinstance(number_of_nodes, int):
for state in grouped_nodes:
Expand All @@ -557,13 +571,16 @@ def allocate_nodes(

if len(allocated_nodes) < number_of_nodes:
raise ValueError(
"Requested number of nodes ({}) exceeds the number of nodes in group '{}'.".format(
number_of_nodes, group_name
)
f"CloudAI is requesting {number_of_nodes} nodes from the group '{group_name}', but only "
f"{len(allocated_nodes)} nodes are available. Please review the available nodes in the system "
f"and ensure there are enough resources to meet the requested node count. Additionally, "
f"verify that the system can accommodate the number of nodes required by the test scenario."
)
else:
raise ValueError(
f"number of nodes should either be an int or 'max_avail', number of nodes : {number_of_nodes}"
f"The 'number_of_nodes' argument must be either an integer specifying the number of nodes to allocate,"
f" or 'max_avail' to allocate all available nodes. Received: '{number_of_nodes}'. "
"Please correct the input."
)

return allocated_nodes
Expand Down
20 changes: 17 additions & 3 deletions tests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@
from pathlib import Path

import pytest
from cloudai.systems.slurm.slurm_system import SlurmPartition, SlurmSystem
from cloudai.systems.slurm.slurm_system import SlurmGroup, SlurmPartition, SlurmSystem


@pytest.fixture
Expand All @@ -28,8 +28,22 @@ def slurm_system() -> SlurmSystem:
output_path=Path("/fake/output"),
default_partition="main",
partitions=[
SlurmPartition(name="main", nodes=["node-[033-064]"]),
SlurmPartition(name="backup", nodes=["node0[1-8]"]),
SlurmPartition(
name="main",
nodes=["node-[033-064]"],
groups=[
SlurmGroup(name="group1", nodes=["node-[033-048]"]),
SlurmGroup(name="group2", nodes=["node-[049-064]"]),
],
),
SlurmPartition(
name="backup",
nodes=["node0[1-8]"],
groups=[
SlurmGroup(name="group1", nodes=["node0[1-4]"]),
SlurmGroup(name="group2", nodes=["node0[5-8]"]),
],
),
],
)
return system
23 changes: 20 additions & 3 deletions tests/test_slurm_system.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@
# limitations under the License.

import re
from typing import List
from typing import Dict, List
from unittest.mock import patch

import pytest
Expand Down Expand Up @@ -150,7 +150,7 @@ def grouped_nodes() -> dict[SlurmNodeState, list[SlurmNode]]:
"""
Helper function to set up a mock Slurm system with nodes and their states.
"""
partition_name = "partition_name"
partition_name = "main"

grouped_nodes = {
SlurmNodeState.IDLE: [
Expand All @@ -166,6 +166,19 @@ def grouped_nodes() -> dict[SlurmNodeState, list[SlurmNode]]:
return grouped_nodes


def test_get_available_nodes_exceeding_limit_no_callstack(
    slurm_system: SlurmSystem, grouped_nodes: Dict[SlurmNodeState, List[SlurmNode]], caplog
):
    """
    Check that an over-sized node request is reported through the log.

    Requests 5 nodes from group 'group1' of partition 'main' via
    ``get_available_nodes_from_group`` and verifies that the captured log
    text contains the "requesting 5 nodes ... only 0 nodes are available"
    message. The call is made outside any ``pytest.raises`` context, so the
    test also fails if the call lets an exception propagate.
    """
    # Arguments are passed positionally: partition, group, requested node count.
    slurm_system.get_available_nodes_from_group("main", "group1", 5)

    expected_fragment = "CloudAI is requesting 5 nodes from the group 'group1', but only 0 nodes are available."
    assert expected_fragment in caplog.text


def test_allocate_nodes_max_avail(slurm_system: SlurmSystem, grouped_nodes: dict[SlurmNodeState, list[SlurmNode]]):
group_name = "group_name"

Expand Down Expand Up @@ -202,11 +215,15 @@ def test_allocate_nodes_exceeding_limit(
):
group_name = "group_name"
num_nodes = 5
available_nodes = 4

with pytest.raises(
ValueError,
match=re.escape(
f"Requested number of nodes ({num_nodes}) exceeds the number of nodes in group '{group_name}'."
f"CloudAI is requesting {num_nodes} nodes from the group '{group_name}', but only "
f"{available_nodes} nodes are available. Please review the available nodes in the system "
f"and ensure there are enough resources to meet the requested node count. Additionally, "
f"verify that the system can accommodate the number of nodes required by the test scenario."
),
):
slurm_system.allocate_nodes(grouped_nodes, num_nodes, group_name)