diff --git a/src/cloudai/systems/slurm/slurm_system.py b/src/cloudai/systems/slurm/slurm_system.py index f9e5473e..dc248826 100644 --- a/src/cloudai/systems/slurm/slurm_system.py +++ b/src/cloudai/systems/slurm/slurm_system.py @@ -467,21 +467,28 @@ def get_available_nodes_from_group( available nodes. """ self.validate_partition_and_group(partition_name, group_name) + self.update_node_states() grouped_nodes = self.group_nodes_by_state(partition_name, group_name) - allocated_nodes = self.allocate_nodes(grouped_nodes, number_of_nodes, group_name) - - # Log allocation details - logging.info( - "Allocated nodes from group '{}' in partition '{}': {}".format( - group_name, - partition_name, - [node.name for node in allocated_nodes], + + try: + allocated_nodes = self.allocate_nodes(grouped_nodes, number_of_nodes, group_name) + + logging.info( + f"Allocated nodes from group '{group_name}' in partition '{partition_name}': " + f"{[node.name for node in allocated_nodes]}" ) - ) - return allocated_nodes + return allocated_nodes + + except ValueError as e: + logging.error( + f"Error occurred while allocating nodes from group '{group_name}' in partition '{partition_name}': {e}", + exc_info=True, + ) + + return [] def validate_partition_and_group(self, partition_name: str, group_name: str) -> None: """ @@ -542,13 +549,20 @@ def allocate_nodes( Raises: ValueError: If the requested number of nodes exceeds the available nodes. """ - # Allocate nodes based on priority: idle, then completing, then allocated allocated_nodes = [] + if isinstance(number_of_nodes, str) and number_of_nodes == "max_avail": allocated_nodes.extend(grouped_nodes[SlurmNodeState.IDLE]) allocated_nodes.extend(grouped_nodes[SlurmNodeState.COMPLETING]) + if len(allocated_nodes) == 0: - raise ValueError(f"No available nodes in group '{group_name}'.") + raise ValueError( + f"CloudAI is requesting the maximum available nodes from the group '{group_name}', " + f"but no nodes are available. Please review the available nodes in the system and ensure " + f"there are sufficient resources to meet the requirements of the test scenario. Additionally, " + f"verify that the system is capable of hosting the maximum number of nodes specified in the test " + "scenario." + ) elif isinstance(number_of_nodes, int): for state in grouped_nodes: @@ -557,13 +571,16 @@ def allocate_nodes( if len(allocated_nodes) < number_of_nodes: raise ValueError( - "Requested number of nodes ({}) exceeds the number of nodes in group '{}'.".format( - number_of_nodes, group_name - ) + f"CloudAI is requesting {number_of_nodes} nodes from the group '{group_name}', but only " + f"{len(allocated_nodes)} nodes are available. Please review the available nodes in the system " + f"and ensure there are enough resources to meet the requested node count. Additionally, " + f"verify that the system can accommodate the number of nodes required by the test scenario." ) else: raise ValueError( - f"number of nodes should either be an int or 'max_avail', number of nodes : {number_of_nodes}" + f"The 'number_of_nodes' argument must be either an integer specifying the number of nodes to allocate," + f" or 'max_avail' to allocate all available nodes. Received: '{number_of_nodes}'. " + "Please correct the input." ) return allocated_nodes diff --git a/tests/conftest.py b/tests/conftest.py index 37b46099..d4937624 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -17,7 +17,7 @@ from pathlib import Path import pytest -from cloudai.systems.slurm.slurm_system import SlurmPartition, SlurmSystem +from cloudai.systems.slurm.slurm_system import SlurmGroup, SlurmPartition, SlurmSystem @pytest.fixture @@ -28,8 +28,22 @@ def slurm_system() -> SlurmSystem: output_path=Path("/fake/output"), default_partition="main", partitions=[ - SlurmPartition(name="main", nodes=["node-[033-064]"]), - SlurmPartition(name="backup", nodes=["node0[1-8]"]), + SlurmPartition( + name="main", + nodes=["node-[033-064]"], + groups=[ + SlurmGroup(name="group1", nodes=["node-[033-048]"]), + SlurmGroup(name="group2", nodes=["node-[049-064]"]), + ], + ), + SlurmPartition( + name="backup", + nodes=["node0[1-8]"], + groups=[ + SlurmGroup(name="group1", nodes=["node0[1-4]"]), + SlurmGroup(name="group2", nodes=["node0[5-8]"]), + ], + ), ], ) return system diff --git a/tests/test_slurm_system.py b/tests/test_slurm_system.py index bf8a790c..a8060e7f 100644 --- a/tests/test_slurm_system.py +++ b/tests/test_slurm_system.py @@ -15,7 +15,7 @@ # limitations under the License. import re -from typing import List +from typing import Dict, List from unittest.mock import patch import pytest @@ -150,7 +150,7 @@ def grouped_nodes() -> dict[SlurmNodeState, list[SlurmNode]]: """ Helper function to set up a mock Slurm system with nodes and their states. """ - partition_name = "partition_name" + partition_name = "main" grouped_nodes = { SlurmNodeState.IDLE: [ @@ -166,6 +166,19 @@ def grouped_nodes() -> dict[SlurmNodeState, list[SlurmNode]]: return grouped_nodes +def test_get_available_nodes_exceeding_limit_no_callstack( + slurm_system: SlurmSystem, grouped_nodes: Dict[SlurmNodeState, List[SlurmNode]], caplog +): + group_name = "group1" + partition_name = "main" + num_nodes = 5 + + slurm_system.get_available_nodes_from_group(partition_name, group_name, num_nodes) + + log_message = "CloudAI is requesting 5 nodes from the group 'group1', but only 0 nodes are available." + assert log_message in caplog.text + + def test_allocate_nodes_max_avail(slurm_system: SlurmSystem, grouped_nodes: dict[SlurmNodeState, list[SlurmNode]]): group_name = "group_name" @@ -202,11 +215,15 @@ def test_allocate_nodes_exceeding_limit( ): group_name = "group_name" num_nodes = 5 + available_nodes = 4 with pytest.raises( ValueError, match=re.escape( - f"Requested number of nodes ({num_nodes}) exceeds the number of nodes in group '{group_name}'." + f"CloudAI is requesting {num_nodes} nodes from the group '{group_name}', but only " + f"{available_nodes} nodes are available. Please review the available nodes in the system " + f"and ensure there are enough resources to meet the requested node count. Additionally, " + f"verify that the system can accommodate the number of nodes required by the test scenario." ), ): slurm_system.allocate_nodes(grouped_nodes, num_nodes, group_name)