Commit

Tox fixes.
EddyMM authored and lukeseawalker committed Jan 22, 2024
1 parent 9168213 commit d1b8ed7
Showing 4 changed files with 21 additions and 18 deletions.
12 changes: 7 additions & 5 deletions tests/integration-tests/conftest_networking.py
@@ -307,10 +307,10 @@ def vpc_stack(vpc_stacks_shared, region, az_id):

 def _is_scaling_test(tests_config):
     logging.info(f"Checking any scaling stress tests in {tests_config}")
-    return tests_config.get(
-        "test-suites", {}).get(
-        "performance_tests", {}).get(
-        "test_scaling.py::test_scaling_stress_test"
+    return (
+        tests_config.get("test-suites", {})
+        .get("performance_tests", {})
+        .get("test_scaling.py::test_scaling_stress_test")
     )
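
Note: the reformatted lookup is behavior-preserving. Each .get(..., {}) falls back to an empty dict, so a partially populated config yields None instead of raising KeyError. A minimal sketch of the pattern, with an assumed config shape matching the keys above:

    # Assumed shape; the real tests_config is parsed from the test-suites config.
    tests_config = {
        "test-suites": {
            "performance_tests": {
                "test_scaling.py::test_scaling_stress_test": {"dimensions": [{}]},
            }
        }
    }

    def _is_scaling_test(tests_config):
        # Every level defaults to {}, so the chain is safe on partial configs.
        return (
            tests_config.get("test-suites", {})
            .get("performance_tests", {})
            .get("test_scaling.py::test_scaling_stress_test")
        )

    assert _is_scaling_test(tests_config)  # truthy when the test is configured
    assert not _is_scaling_test({})        # None when it is not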


@@ -351,7 +351,9 @@ def vpc_stacks_shared(cfn_stacks_factory, request, key_name):
             subnets.append(
                 SubnetConfig(
                     name=subnet_name(visibility="Private", az_id=az_id),
-                    cidr=CIDR_FOR_PRIVATE_SUBNETS_SCALING[index] if is_scaling_test else CIDR_FOR_PRIVATE_SUBNETS[index],
+                    cidr=CIDR_FOR_PRIVATE_SUBNETS_SCALING[index]
+                    if is_scaling_test
+                    else CIDR_FOR_PRIVATE_SUBNETS[index],
                     map_public_ip_on_launch=False,
                     has_nat_gateway=False,
                     availability_zone=az_name,
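Note: the CIDR selection itself is unchanged here; the one-line conditional is only wrapped to satisfy the formatter's line-length limit. A sketch of the equivalent forms, with illustrative CIDR values (the real constants are defined in the networking fixtures):

    # Illustrative values; the real CIDR_FOR_PRIVATE_SUBNETS* constants live elsewhere.
    CIDR_FOR_PRIVATE_SUBNETS = ["10.0.16.0/20", "10.0.32.0/20"]
    CIDR_FOR_PRIVATE_SUBNETS_SCALING = ["10.1.16.0/20", "10.1.32.0/20"]

    def pick_cidr(index, is_scaling_test):
        # Wrapped conditional expression, same result as the former one-liner.
        return (
            CIDR_FOR_PRIVATE_SUBNETS_SCALING[index]
            if is_scaling_test
            else CIDR_FOR_PRIVATE_SUBNETS[index]
        )

    assert pick_cidr(0, True) == "10.1.16.0/20"
    assert pick_cidr(0, False) == "10.0.16.0/20"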
5 changes: 3 additions & 2 deletions tests/integration-tests/tests/common/scaling_common.py
@@ -26,8 +26,9 @@
 def scaling_target_condition(
     ec2_capacity_time_series,
     compute_nodes_time_series,
-    target_cluster_size, use_ec2_limit=True, # Stop monitoring after all EC2 instances have been launched
-    use_compute_nodes_limit=True # Stop monitoring after all nodes have joined the cluster
+    target_cluster_size,
+    use_ec2_limit=True,  # Stop monitoring after all EC2 instances have been launched
+    use_compute_nodes_limit=True,  # Stop monitoring after all nodes have joined the cluster
 ):
     return (
         (use_ec2_limit and ec2_capacity_time_series[-1] != target_cluster_size)
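Note: the rest of the condition is truncated in this view; presumably it ORs an analogous check on compute_nodes_time_series, per the use_compute_nodes_limit comment. A hedged sketch of how such a stop condition can drive a metrics-polling loop (illustrative only, not the repository's get_scaling_metrics):

    import time

    def scaling_target_condition_sketch(
        ec2_capacity_time_series,
        compute_nodes_time_series,
        target_cluster_size,
        use_ec2_limit=True,
        use_compute_nodes_limit=True,
    ):
        # Assumed completion of the truncated body: keep monitoring while any
        # enabled limit has not yet reached the target.
        return (use_ec2_limit and ec2_capacity_time_series[-1] != target_cluster_size) or (
            use_compute_nodes_limit and compute_nodes_time_series[-1] != target_cluster_size
        )

    def monitor_until_target(sample_metrics, target_cluster_size, poll_secs=60):
        # Illustrative polling loop: sample_metrics() returns (ec2_capacity, compute_nodes).
        ec2_series, nodes_series = [0], [0]
        while scaling_target_condition_sketch(ec2_series, nodes_series, target_cluster_size):
            time.sleep(poll_secs)
            ec2_capacity, compute_nodes = sample_metrics()
            ec2_series.append(ec2_capacity)
            nodes_series.append(compute_nodes)
        return ec2_series, nodes_series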
17 changes: 8 additions & 9 deletions tests/integration-tests/tests/performance_tests/test_scaling.py
@@ -6,10 +6,10 @@
 from benchmarks.common.metrics_reporter import produce_benchmark_metrics_report
 from remote_command_executor import RemoteCommandExecutor
 from time_utils import minutes
+from utils import disable_protected_mode
 
 from tests.common.assertions import assert_no_msg_in_logs
 from tests.common.scaling_common import get_scaling_metrics
-from utils import disable_protected_mode
 
 
 @pytest.mark.parametrize(
@@ -76,7 +76,7 @@ def _get_scaling_time(ec2_capacity_time_series: list, timestamps: list, scaling_


 @pytest.mark.parametrize(
-    "scaling_max_time_in_mins, scaling_target, shared_headnode_storage, head_node_instance_type, scaling_strategy",
+    "max_monitoring_time_in_mins, scaling_target, shared_headnode_storage, head_node_instance_type, scaling_strategy",
     [
         (20, 1000, "Efs", "c5.24xlarge", "best-effort"),  # TODO: Pass these values from an external source
         (20, 2000, "Efs", "c5.24xlarge", "best-effort"),
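
Note: the rename has to touch both this string and the test signature below, because pytest matches the comma-separated names in the first parametrize argument against the test function's parameters by name. A minimal sketch of that contract:

    import pytest

    @pytest.mark.parametrize(
        "max_monitoring_time_in_mins, scaling_target",
        [(20, 1000), (20, 2000)],
    )
    def test_example(max_monitoring_time_in_mins, scaling_target):
        # pytest injects each tuple into the identically named parameters;
        # renaming one side without the other fails at collection time.
        assert scaling_target % max_monitoring_time_in_mins == 0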
@@ -93,7 +93,7 @@ def test_scaling_stress_test(
     pcluster_config_reader,
     scheduler_commands_factory,
     clusters_factory,
-    scaling_max_time_in_mins,
+    max_monitoring_time_in_mins,
     scaling_target,
     shared_headnode_storage,
     head_node_instance_type,
@@ -116,13 +116,12 @@
     # Creating cluster with intended head node instance type and scaling parameters
     cluster_config = pcluster_config_reader(
         # Prevent nodes being set down before we start monitoring the scale down metrics
-        scaledown_idletime=scaling_max_time_in_mins,
+        scaledown_idletime=max_monitoring_time_in_mins,
         scaling_target=scaling_target,
         head_node_instance_type=head_node_instance_type,
         shared_headnode_storage=shared_headnode_storage,
         scaling_strategy=scaling_strategy,
     )
-    logging.info(f"Cluster config: {cluster_config}")
     cluster = clusters_factory(cluster_config)
     remote_command_executor = RemoteCommandExecutor(cluster)
     scheduler_commands = scheduler_commands_factory(remote_command_executor)
@@ -133,7 +132,7 @@
     # Submit a simple job to trigger the launch all compute nodes
     scaling_job = {
         # Keep job running until we explicitly cancel it and start monitoring scale down
-        "command": f"srun sleep {minutes(scaling_max_time_in_mins) // 1000}",
+        "command": f"srun sleep {minutes(max_monitoring_time_in_mins) // 1000}",
         "nodes": scaling_target,
     }
     job_id = scheduler_commands.submit_command_and_assert_job_accepted(scaling_job)
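
Note: minutes() in this suite appears to return milliseconds (it is also passed to max_monitoring_time below), so // 1000 converts the monitoring window into the seconds that sleep expects. A quick worked check under that assumed convention:

    def minutes(value):
        # Assumed convention from time_utils: minutes -> milliseconds.
        return value * 60 * 1000

    max_monitoring_time_in_mins = 20
    sleep_secs = minutes(max_monitoring_time_in_mins) // 1000
    assert sleep_secs == 1200  # 20 min * 60 s/min: the job outlives the monitoring window
    command = f"srun sleep {sleep_secs}"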
@@ -144,7 +143,7 @@
     # Monitor the cluster during scale up
     ec2_capacity_time_series, compute_nodes_time_series, timestamps, end_time = get_scaling_metrics(
         remote_command_executor,
-        max_monitoring_time=minutes(scaling_max_time_in_mins),
+        max_monitoring_time=minutes(max_monitoring_time_in_mins),
         region=region,
         cluster_name=cluster.name,
         publish_metrics=True,
@@ -155,15 +154,15 @@
         ec2_capacity_time_series, timestamps, scaling_target, start_time
     )
 
-    # Cancel the running job and scale dow the cluster using the update-compute-fleet command
+    # Cancel the running job and scale down the cluster using the update-compute-fleet command
     scheduler_commands.cancel_job(job_id)
     cluster.stop()
 
     # Monitor the cluster during scale down
     scale_down_start_timestamp = _datetime_to_minute_granularity(datetime.datetime.now(tz=datetime.timezone.utc))
     ec2_capacity_time_series, compute_nodes_time_series, timestamps, end_time = get_scaling_metrics(
         remote_command_executor,
-        max_monitoring_time=minutes(scaling_max_time_in_mins),
+        max_monitoring_time=minutes(max_monitoring_time_in_mins),
         region=region,
         cluster_name=cluster.name,
         publish_metrics=True,
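Note: the scaling time asserted by the test comes from _get_scaling_time (signature in the hunk header above). A hedged sketch of one plausible implementation, assuming it reports the first timestamp at which EC2 capacity hits the target plus the elapsed seconds from start_time:

    import datetime

    def _get_scaling_time_sketch(ec2_capacity_time_series, timestamps, scaling_target, start_time):
        # Assumed behavior: locate the first sample where capacity reached the
        # target and report how long that took from the start of monitoring.
        index = ec2_capacity_time_series.index(scaling_target)
        scaling_target_time = timestamps[index]
        return scaling_target_time, int((scaling_target_time - start_time).total_seconds())

    start = datetime.datetime(2024, 1, 22, 12, 0, tzinfo=datetime.timezone.utc)
    stamps = [start + datetime.timedelta(minutes=m) for m in range(4)]
    reached_at, secs = _get_scaling_time_sketch([0, 400, 900, 1000], stamps, 1000, start)
    assert secs == 180  # capacity hit 1000 instances three minutes in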
5 changes: 3 additions & 2 deletions tests/integration-tests/tests/schedulers/test_slurm.py
@@ -26,8 +26,10 @@
     check_status,
     get_compute_nodes_instance_ids,
     get_instance_info,
+    retrieve_clustermgtd_conf_path,
+    set_protected_failure_count,
     test_cluster_health_metric,
-    wait_for_computefleet_changed, set_protected_failure_count, retrieve_clustermgtd_conf_path,
+    wait_for_computefleet_changed,
 )
 
 from tests.common.assertions import (
@@ -1911,7 +1913,6 @@ def _inject_bootstrap_failures(cluster, bucket_name, pcluster_config_reader, sca
     _update_and_start_cluster(cluster, updated_config_file)
 
 
-
 @retry(wait_fixed=seconds(30), stop_max_delay=minutes(20))
 def _wait_until_protected_mode_failure_count_set(cluster):
     """Retry setting the protected failure count until the clustermgtd is running."""
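Note: the @retry arguments follow the retrying library's convention of milliseconds, which is why the seconds()/minutes() helpers are used. A minimal sketch of the same pattern, with the helpers written out as assumptions:

    from retrying import retry

    def seconds(value):
        return value * 1000       # assumed convention: seconds -> milliseconds

    def minutes(value):
        return value * 60 * 1000  # assumed convention: minutes -> milliseconds

    @retry(wait_fixed=seconds(30), stop_max_delay=minutes(20))
    def wait_until_ready(check):
        # retrying re-invokes this every 30 s, for at most 20 min, until it
        # stops raising.
        if not check():
            raise AssertionError("not ready yet")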
