Raise dedicated exception types instead of bare re-raises / generic Exception.

* clusterstatusmgtd: failures inside ClusterStatusManager._call_update_event
  (fleet status translation and update event handler execution) are re-raised
  as ClusterStatusManager.ClusterStatusUpdateEventError, explicitly chained to
  the original error with `from e`.
* pcluster_fleet_config_generator: a compute resource with neither Instances
  nor InstanceType raises ConfigurationFieldNotFoundError instead of Exception.
* Unit tests assert on the specific exception types.

NOTE(review): indentation of context lines was reconstructed from a
whitespace-collapsed copy of this patch; confirm against the target tree
before applying. The post-image blob hashes on the first two `index` lines
predate the `from e` / docstring-period adjustments.

diff --git a/cookbooks/aws-parallelcluster-install/files/default/clusterstatusmgtd/clusterstatusmgtd.py b/cookbooks/aws-parallelcluster-install/files/default/clusterstatusmgtd/clusterstatusmgtd.py
index a9c1134251..f36519212c 100644
--- a/cookbooks/aws-parallelcluster-install/files/default/clusterstatusmgtd/clusterstatusmgtd.py
+++ b/cookbooks/aws-parallelcluster-install/files/default/clusterstatusmgtd/clusterstatusmgtd.py
@@ -323,6 +323,11 @@ def __init__(self, config):
         self._compute_fleet_data = {}
         self.set_config(config)
 
+    class ClusterStatusUpdateEventError(Exception):
+        """Raised when there is a failure in updating the status due to an error on update event handler execution."""
+
+        pass
+
     def set_config(self, config):  # noqa: D102
         if self._config != config:
             log.info("Applying new clusterstatusmgtd config: %s", config)
@@ -366,7 +371,7 @@ def _call_update_event(self):
             _write_json_to_file(self._config.computefleet_status_path, compute_fleet_data)
         except Exception as e:
             log.error("Update event handler failed during fleet status translation: %s", e)
-            raise
+            raise ClusterStatusManager.ClusterStatusUpdateEventError(e) from e
 
         cinc_log_file = "/var/log/chef-client.log"
         log.info("Calling update event handler, log can be found at %s", cinc_log_file)
@@ -384,9 +389,9 @@ def _call_update_event(self):
         )
         try:
             _run_command(cmd, self._config.update_event_timeout_minutes)
-        except Exception:
+        except Exception as e:
             log.error("Update event handler failed. Check log file %s", cinc_log_file)
-            raise
+            raise ClusterStatusManager.ClusterStatusUpdateEventError(e) from e
 
     def _update_status(self, request_status, in_progress_status, final_status):
         if self._compute_fleet_status == request_status:
diff --git a/cookbooks/aws-parallelcluster-slurm/files/default/head_node_slurm/slurm/pcluster_fleet_config_generator.py b/cookbooks/aws-parallelcluster-slurm/files/default/head_node_slurm/slurm/pcluster_fleet_config_generator.py
index f8ea639c41..b1079a9e81 100644
--- a/cookbooks/aws-parallelcluster-slurm/files/default/head_node_slurm/slurm/pcluster_fleet_config_generator.py
+++ b/cookbooks/aws-parallelcluster-slurm/files/default/head_node_slurm/slurm/pcluster_fleet_config_generator.py
@@ -25,6 +25,12 @@ class CriticalError(Exception):
     pass
 
 
+class ConfigurationFieldNotFoundError(Exception):
+    """Field not found in configuration."""
+
+    pass
+
+
 def generate_fleet_config_file(output_file, input_file):
     """
     Generate configuration file used by Fleet Manager in node daemon package.
@@ -87,7 +93,7 @@ def generate_fleet_config_file(output_file, input_file):
                     }
 
                 else:
-                    raise Exception(
+                    raise ConfigurationFieldNotFoundError(
                         "Instances or InstanceType field not found "
                         f"in queue: {queue}, compute resource: {compute_resource} configuration"
                     )
diff --git a/test/unit/clusterstatusmgtd/test_clusterstatusmgtd.py b/test/unit/clusterstatusmgtd/test_clusterstatusmgtd.py
index 15852d0344..df6b33b8c0 100644
--- a/test/unit/clusterstatusmgtd/test_clusterstatusmgtd.py
+++ b/test/unit/clusterstatusmgtd/test_clusterstatusmgtd.py
@@ -398,7 +398,7 @@ def test_call_update_event(self, mocker, status, translated_status, exception):
         run_command_mock = mocker.patch("clusterstatusmgtd._run_command")
         if isinstance(exception, Exception):
             run_command_mock.side_effect = exception
-            with pytest.raises(Exception):
+            with pytest.raises(ClusterStatusManager.ClusterStatusUpdateEventError):
                 clusterstatus_manager._call_update_event()
         else:
             file_writer_mock = mocker.mock_open()
diff --git a/test/unit/slurm/test_fleet_config_generator.py b/test/unit/slurm/test_fleet_config_generator.py
index ade2e41bda..cf22946cbc 100644
--- a/test/unit/slurm/test_fleet_config_generator.py
+++ b/test/unit/slurm/test_fleet_config_generator.py
@@ -12,34 +12,42 @@
 
 import pytest
 from assertpy import assert_that
-from slurm.pcluster_fleet_config_generator import generate_fleet_config_file
+from slurm.pcluster_fleet_config_generator import (
+    ConfigurationFieldNotFoundError,
+    CriticalError,
+    generate_fleet_config_file,
+)
 
 
 @pytest.mark.parametrize(
-    "cluster_config, expected_message",
+    "cluster_config, expected_exception, expected_message",
     [
-        ({}, "Unable to find key 'Scheduling' in the configuration file"),
-        ({"Scheduling": {}}, "Unable to find key 'SlurmQueues' in the configuration file"),
-        ({"Scheduling": {"SlurmQueues": []}}, None),
+        ({}, CriticalError, "Unable to find key 'Scheduling' in the configuration file"),
+        ({"Scheduling": {}}, CriticalError, "Unable to find key 'SlurmQueues' in the configuration file"),
+        ({"Scheduling": {"SlurmQueues": []}}, None, None),
         (
             {"Scheduling": {"SlurmQueues": [{"ComputeResources": []}]}},
+            CriticalError,
             "Unable to find key 'Name' in the configuration file",
         ),
         (
             {"Scheduling": {"SlurmQueues": [{"Name": "q1"}]}},
+            CriticalError,
             "Unable to find key 'CapacityType' in the configuration of queue: q1",
         ),
         (
             {"Scheduling": {"SlurmQueues": [{"Name": "q1", "CapacityType": "ONDEMAND"}]}},
+            CriticalError,
             "Unable to find key 'ComputeResources' in the configuration of queue: q1",
         ),
-        ({"Scheduling": {"SlurmQueues": [{"Name": "q1", "CapacityType": "SPOT", "ComputeResources": []}]}}, None),
+        ({"Scheduling": {"SlurmQueues": [{"Name": "q1", "CapacityType": "SPOT", "ComputeResources": []}]}}, None, None),
         (
             {
                 "Scheduling": {
                     "SlurmQueues": [{"Name": "q1", "CapacityType": "SPOT", "ComputeResources": [{"Instances": []}]}]
                 }
             },
+            CriticalError,
             "Unable to find key 'Name' in the configuration of queue: q1",
         ),
         (
@@ -55,6 +63,7 @@
                     ]
                 }
             },
+            ConfigurationFieldNotFoundError,
             "Instances or InstanceType field not found in queue: q1, compute resource: cr1 configuration",
         ),
         (
@@ -74,6 +83,7 @@
                 }
             },
             None,
+            None,
         ),
         (
             {
@@ -92,6 +102,7 @@
                 }
             },
             None,
+            None,
         ),
         (
             {
@@ -114,6 +125,7 @@
                 }
             },
             None,
+            None,
         ),
         (
             {
@@ -128,6 +140,7 @@
                     ]
                 }
             },
+            CriticalError,
             "Unable to find key 'SpotPrice' in the configuration of queue: q1, compute resource: cr1",
         ),
         (
@@ -146,6 +159,7 @@
                 }
             },
             None,
+            None,
         ),
         (
             {
@@ -161,6 +175,7 @@
                     ]
                 }
             },
+            CriticalError,
             "Unable to find key 'Networking' in the configuration of queue: q1, compute resource: cr1",
         ),
         (
@@ -178,16 +193,17 @@
                     ]
                 }
             },
+            CriticalError,
             "Unable to find key 'SubnetIds' in the configuration of queue: q1, compute resource: cr1",
         ),
     ],
 )
-def test_generate_fleet_config_file_error_cases(mocker, tmpdir, cluster_config, expected_message):
+def test_generate_fleet_config_file_error_cases(mocker, tmpdir, cluster_config, expected_exception, expected_message):
     mocker.patch("slurm.pcluster_fleet_config_generator._load_cluster_config", return_value=cluster_config)
     output_file = f"{tmpdir}/fleet-config.json"
 
     if expected_message:
-        with pytest.raises(Exception, match=expected_message):
+        with pytest.raises(expected_exception, match=expected_message):
             generate_fleet_config_file(output_file, input_file="fake")
     else:
         generate_fleet_config_file(output_file, input_file="fake")