From 292af08fe3edb5c68507ab8ee19975554b4aff90 Mon Sep 17 00:00:00 2001
From: Justin Yu
Date: Fri, 16 Jun 2023 00:16:16 -0700
Subject: [PATCH] [release/air] Fix release test timeout for
 `tune_scalability_network_overhead` (#36360)

There are 2 versions of the test: a smoke test version with 20 nodes/trials
and a full version with 100 nodes/trials. The smoke test version timed out
recently.

The test setups are slightly different:
- Smoke test runs with m5a.large (2 cpus) x 20 nodes.
- Full version runs with m5a.4xlarge (16 cpus) head node + 99 x m5a.large
  worker nodes.
- The smoke test takes **longer** than the full run due to syncing overhead
  at the end, caused by the smaller head node instance (since all the
  syncing goes there).

This PR bumps the smoke test's head node instance size and forces trials to
run on worker nodes -- the head node is purely meant to handle syncing in
this test. This also fixes a problem that existed before, where the full
100-trial release test would schedule 8 trials on the head node rather than
utilizing every node in the cluster.

See https://github.com/ray-project/ray/issues/36346 for more context.

Signed-off-by: Justin Yu
---
 python/ray/tune/utils/release_test_util.py       |  8 ++++++--
 release/release_tests.yaml                       |  4 ++--
 .../tune_tests/scalability_tests/tpl_100x2.yaml  | 11 ++++++++---
 .../tune_tests/scalability_tests/tpl_20x2.yaml   | 13 +++++++++----
 .../scalability_tests/tpl_gce_100x2.yaml         | 13 +++++++++----
 .../workloads/test_network_overhead.py           | 16 +++++++++++-----
 6 files changed, 45 insertions(+), 20 deletions(-)

diff --git a/python/ray/tune/utils/release_test_util.py b/python/ray/tune/utils/release_test_util.py
index 5355b3a1b1d8..1b9ad5918349 100644
--- a/python/ray/tune/utils/release_test_util.py
+++ b/python/ray/tune/utils/release_test_util.py
@@ -108,7 +108,7 @@ def timed_tune_run(
     checkpoint_size_b: int = 0,
     checkpoint_num_files: int = 1,
     **tune_kwargs,
-):
+) -> bool:
     durable = (
         "storage_path" in tune_kwargs
         and tune_kwargs["storage_path"]
@@ -164,7 +164,9 @@ def timed_tune_run(
     with open(test_output_json, "wt") as f:
         json.dump(result, f)

-    if time_taken > max_runtime:
+    success = time_taken <= max_runtime
+
+    if not success:
         print(
             f"The {name} test took {time_taken:.2f} seconds, but should not "
             f"have exceeded {max_runtime:.2f} seconds. Test failed. \n\n"
@@ -179,3 +181,5 @@
         f"--- PASSED: {name.upper()} ::: "
         f"{time_taken:.2f} <= {max_runtime:.2f} ---"
     )
+
+    return success
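Note on the API change above: `timed_tune_run` now returns whether the run beat `max_runtime` instead of only printing a pass/fail message. A minimal caller sketch (the kwargs mirror the ones this patch's workload passes; they are illustrative, not the utility's full signature):

```python
# Sketch of consuming the new boolean return value. Kwargs mirror this
# patch's workload; they are illustrative, not the full signature.
from ray.tune.utils.release_test_util import timed_tune_run

success = timed_tune_run(
    name="result network overhead",
    num_samples=20,
    results_per_second=0.01,
    trial_length_s=300,
    max_runtime=500,
    resources_per_trial={"cpu": 2},
)
if not success:
    # Fail the release job loudly instead of relying on printed output.
    raise RuntimeError("timed_tune_run exceeded max_runtime; see logs above.")
```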
\n\n" @@ -179,3 +181,5 @@ def timed_tune_run( f"--- PASSED: {name.upper()} ::: " f"{time_taken:.2f} <= {max_runtime:.2f} ---" ) + + return success diff --git a/release/release_tests.yaml b/release/release_tests.yaml index fbe1fffb4a1c..79a5aa84947b 100644 --- a/release/release_tests.yaml +++ b/release/release_tests.yaml @@ -2023,7 +2023,7 @@ cluster_compute: tpl_100x2.yaml run: - timeout: 900 + timeout: 750 prepare_timeout: 1200 script: python workloads/test_network_overhead.py wait_for_nodes: @@ -2039,7 +2039,7 @@ cluster_env: app_config.yaml cluster_compute: tpl_20x2.yaml run: - timeout: 500 + timeout: 750 prepare_timeout: 600 script: python workloads/test_network_overhead.py --smoke-test wait_for_nodes: diff --git a/release/tune_tests/scalability_tests/tpl_100x2.yaml b/release/tune_tests/scalability_tests/tpl_100x2.yaml index 7b59d177ad69..f01952b49efc 100644 --- a/release/tune_tests/scalability_tests/tpl_100x2.yaml +++ b/release/tune_tests/scalability_tests/tpl_100x2.yaml @@ -1,15 +1,20 @@ cloud_id: {{env["ANYSCALE_CLOUD_ID"]}} region: us-west-2 -max_workers: 99 +max_workers: 100 head_node_type: name: head_node instance_type: m5a.4xlarge + # See `tune_scalability_network_overhead` + # Don't schedule any trials to run on the head node. + # The head node is only used for synchronization purposes. + resources: + cpu: 0 worker_node_types: - name: worker_node instance_type: m5a.large - min_workers: 99 - max_workers: 99 + min_workers: 100 + max_workers: 100 use_spot: true diff --git a/release/tune_tests/scalability_tests/tpl_20x2.yaml b/release/tune_tests/scalability_tests/tpl_20x2.yaml index ead23a1a142b..b50e088eab57 100644 --- a/release/tune_tests/scalability_tests/tpl_20x2.yaml +++ b/release/tune_tests/scalability_tests/tpl_20x2.yaml @@ -1,15 +1,20 @@ cloud_id: {{env["ANYSCALE_CLOUD_ID"]}} region: us-west-2 -max_workers: 19 +max_workers: 20 head_node_type: name: head_node - instance_type: m5a.large + instance_type: m5a.4xlarge + # See `tune_scalability_network_overhead` + # Don't schedule any trials to run on the head node. + # The head node is only used for synchronization purposes. + resources: + cpu: 0 worker_node_types: - name: worker_node instance_type: m5a.large - min_workers: 19 - max_workers: 19 + min_workers: 20 + max_workers: 20 use_spot: true diff --git a/release/tune_tests/scalability_tests/tpl_gce_100x2.yaml b/release/tune_tests/scalability_tests/tpl_gce_100x2.yaml index 14f6b70b279f..f2a446862bfd 100644 --- a/release/tune_tests/scalability_tests/tpl_gce_100x2.yaml +++ b/release/tune_tests/scalability_tests/tpl_gce_100x2.yaml @@ -1,17 +1,22 @@ cloud_id: cld_tPsS3nQz8p5cautbyWgEdr4y region: us-west1 -allowed_azs: +allowed_azs: - us-west1-c -max_workers: 99 +max_workers: 100 head_node_type: name: head_node + # See `tune_scalability_network_overhead` + # Don't schedule any trials to run on the head node. + # The head node is only used for synchronization purposes. 
diff --git a/release/tune_tests/scalability_tests/tpl_gce_100x2.yaml b/release/tune_tests/scalability_tests/tpl_gce_100x2.yaml
index 14f6b70b279f..f2a446862bfd 100644
--- a/release/tune_tests/scalability_tests/tpl_gce_100x2.yaml
+++ b/release/tune_tests/scalability_tests/tpl_gce_100x2.yaml
@@ -1,17 +1,22 @@
 cloud_id: cld_tPsS3nQz8p5cautbyWgEdr4y
 region: us-west1

-allowed_azs: 
+allowed_azs:
 - us-west1-c

-max_workers: 99
+max_workers: 100

 head_node_type:
   name: head_node
+  # See `tune_scalability_network_overhead`
+  # Don't schedule any trials to run on the head node.
+  # The head node is only used for synchronization purposes.
   instance_type: n2d-standard-16
+  resources:
+    cpu: 0

 worker_node_types:
 - name: worker_node
   instance_type: n2d-standard-2
-  min_workers: 99
-  max_workers: 99
+  min_workers: 100
+  max_workers: 100
   use_spot: true

diff --git a/release/tune_tests/scalability_tests/workloads/test_network_overhead.py b/release/tune_tests/scalability_tests/workloads/test_network_overhead.py
index 8e63d250a3de..1543ec1777a1 100644
--- a/release/tune_tests/scalability_tests/workloads/test_network_overhead.py
+++ b/release/tune_tests/scalability_tests/workloads/test_network_overhead.py
@@ -4,8 +4,6 @@
 This test will thus measure the overhead that comes with network communication
 and specifically log synchronization.

-Cluster: cluster_100x2.yaml
-
 Test owner: krfricke

 Acceptance criteria: Should run faster than 500 seconds.
@@ -26,18 +24,26 @@ def main(smoke_test: bool = False):
     results_per_second = 0.01
     trial_length_s = 300

-    max_runtime = 1000
+    max_runtime = 500

-    timed_tune_run(
+    success = timed_tune_run(
         name="result network overhead",
         num_samples=num_samples,
         results_per_second=results_per_second,
         trial_length_s=trial_length_s,
         max_runtime=max_runtime,
-        resources_per_trial={"cpu": 2},  # One per node
+        # One trial per worker node, none get scheduled on the head node.
+        # See the compute config.
+        resources_per_trial={"cpu": 2},
         sync_config=tune.SyncConfig(syncer="auto"),
     )

+    if not success:
+        raise RuntimeError(
+            f"Test did not finish within the max_runtime ({max_runtime} s). "
+            "See above for details."
+        )
+

 if __name__ == "__main__":
     parser = argparse.ArgumentParser()
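The diff's trailing context stops at the parser. For reference, a plausible completion of the `__main__` block (hypothetical; only the `--smoke-test` flag is grounded in this patch, via the release_tests.yaml script line and the `main(smoke_test: bool = False)` signature):

```python
# Hypothetical continuation of the truncated context above.
parser.add_argument("--smoke-test", action="store_true", default=False)
args = parser.parse_args()
main(smoke_test=args.smoke_test)
```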