[release/air] Fix release test timeout for `tune_scalability_network_overhead` (#36360)

There are 2 versions of the test: a smoke test version with 20 nodes/trials and a full version with 100 nodes/trials. The smoke test version timed out recently.

The test setup differs slightly between the two versions:
- The smoke test runs with 20 x m5a.large (2 CPUs) nodes.
- The full version runs with an m5a.4xlarge (16 CPUs) head node + 99 x m5a.large worker nodes.
- The smoke test takes **longer** than the full run due to syncing overhead at the end, caused by the smaller head node instance (since all syncing goes through it).

This PR bumps the smoke test's head node instance size and forces trials to run on worker nodes -- the head node is purely meant to handle syncing in this test. This also fixes a pre-existing problem where the full 100-trial release test would schedule 8 trials on the head node rather than utilizing every node in the cluster.
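For illustration, here is a minimal sketch (not this test's actual entry point, which goes through `timed_tune_run`) of why a zero-CPU head node keeps trials off it: each trial requests 2 CPUs, and Ray can only place a trial on a node that advertises the CPUs it asks for.

```python
from ray import tune


def train_fn(config):
    pass  # placeholder trainable; the real test reports results over time


# Each trial requests 2 CPUs -- exactly one m5a.large worker (2 vCPUs).
# With the head node configured as `resources: cpu: 0`, the scheduler has
# no head-node CPUs to hand out, so every trial lands on a worker node.
trainable = tune.with_resources(train_fn, {"cpu": 2})
tuner = tune.Tuner(trainable, tune_config=tune.TuneConfig(num_samples=20))
tuner.fit()
```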

See #36346 for more context.

Signed-off-by: Justin Yu <justinvyu@anyscale.com>
justinvyu authored Jun 16, 2023
1 parent 43c0812 commit 292af08
Showing 6 changed files with 45 additions and 20 deletions.
8 changes: 6 additions & 2 deletions python/ray/tune/utils/release_test_util.py
@@ -108,7 +108,7 @@ def timed_tune_run(
     checkpoint_size_b: int = 0,
     checkpoint_num_files: int = 1,
     **tune_kwargs,
-):
+) -> bool:
     durable = (
         "storage_path" in tune_kwargs
         and tune_kwargs["storage_path"]
@@ -164,7 +164,9 @@ def timed_tune_run(
     with open(test_output_json, "wt") as f:
         json.dump(result, f)
 
-    if time_taken > max_runtime:
+    success = time_taken <= max_runtime
+
+    if not success:
         print(
             f"The {name} test took {time_taken:.2f} seconds, but should not "
             f"have exceeded {max_runtime:.2f} seconds. Test failed. \n\n"
@@ -179,3 +181,5 @@ def timed_tune_run(
             f"--- PASSED: {name.upper()} ::: "
             f"{time_taken:.2f} <= {max_runtime:.2f} ---"
         )
+
+    return success
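With `timed_tune_run` now returning a bool instead of only printing, callers can fail the job explicitly. A hedged usage sketch (argument values are illustrative, not taken from a real test):

```python
from ray.tune.utils.release_test_util import timed_tune_run

# Illustrative values only; each release test passes its own configuration.
success = timed_tune_run(
    name="example scalability test",
    num_samples=20,
    results_per_second=0.01,
    trial_length_s=300,
    max_runtime=500,
)
if not success:
    raise RuntimeError("Test exceeded max_runtime; see the logs above.")
```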
4 changes: 2 additions & 2 deletions release/release_tests.yaml
@@ -2023,7 +2023,7 @@
     cluster_compute: tpl_100x2.yaml
 
   run:
-    timeout: 900
+    timeout: 750
     prepare_timeout: 1200
     script: python workloads/test_network_overhead.py
     wait_for_nodes:
@@ -2039,7 +2039,7 @@
     cluster_env: app_config.yaml
     cluster_compute: tpl_20x2.yaml
   run:
-    timeout: 500
+    timeout: 750
     prepare_timeout: 600
     script: python workloads/test_network_overhead.py --smoke-test
     wait_for_nodes:
11 changes: 8 additions & 3 deletions release/tune_tests/scalability_tests/tpl_100x2.yaml
@@ -1,15 +1,20 @@
 cloud_id: {{env["ANYSCALE_CLOUD_ID"]}}
 region: us-west-2
 
-max_workers: 99
+max_workers: 100
 
 head_node_type:
   name: head_node
   instance_type: m5a.4xlarge
+  # See `tune_scalability_network_overhead`
+  # Don't schedule any trials to run on the head node.
+  # The head node is only used for synchronization purposes.
+  resources:
+    cpu: 0
 
 worker_node_types:
 - name: worker_node
   instance_type: m5a.large
-  min_workers: 99
-  max_workers: 99
+  min_workers: 100
+  max_workers: 100
   use_spot: true
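As context for the `resources: cpu: 0` override above: it makes the head node advertise zero CPUs to Ray's scheduler. On an open-source cluster, the rough equivalent would be starting the head with `ray start --head --num-cpus=0`; a sketch of verifying the effect (assuming such a cluster is running):

```python
import ray

# Connect to the running cluster; assumes the head was started with
# `ray start --head --num-cpus=0`, mirroring `resources: cpu: 0` above.
ray.init(address="auto")

# The head contributes no CPUs, so the cluster-wide total comes entirely
# from the 100 m5a.large workers (2 vCPUs each -> 200 CPUs).
print(ray.cluster_resources().get("CPU", 0))
```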
13 changes: 9 additions & 4 deletions release/tune_tests/scalability_tests/tpl_20x2.yaml
@@ -1,15 +1,20 @@
 cloud_id: {{env["ANYSCALE_CLOUD_ID"]}}
 region: us-west-2
 
-max_workers: 19
+max_workers: 20
 
 head_node_type:
   name: head_node
-  instance_type: m5a.large
+  instance_type: m5a.4xlarge
+  # See `tune_scalability_network_overhead`
+  # Don't schedule any trials to run on the head node.
+  # The head node is only used for synchronization purposes.
+  resources:
+    cpu: 0
 
 worker_node_types:
 - name: worker_node
   instance_type: m5a.large
-  min_workers: 19
-  max_workers: 19
+  min_workers: 20
+  max_workers: 20
   use_spot: true
13 changes: 9 additions & 4 deletions release/tune_tests/scalability_tests/tpl_gce_100x2.yaml
@@ -1,17 +1,22 @@
 cloud_id: cld_tPsS3nQz8p5cautbyWgEdr4y
 region: us-west1
-allowed_azs: 
+allowed_azs:
 - us-west1-c
 
-max_workers: 99
+max_workers: 100
 
 head_node_type:
   name: head_node
+  # See `tune_scalability_network_overhead`
+  # Don't schedule any trials to run on the head node.
+  # The head node is only used for synchronization purposes.
   instance_type: n2d-standard-16
+  resources:
+    cpu: 0
 
 worker_node_types:
 - name: worker_node
   instance_type: n2d-standard-2
-  min_workers: 99
-  max_workers: 99
+  min_workers: 100
+  max_workers: 100
   use_spot: true
16 changes: 11 additions & 5 deletions release/tune_tests/scalability_tests/workloads/test_network_overhead.py
@@ -4,8 +4,6 @@
 This test will thus measure the overhead that comes with network communication
 and specifically log synchronization.
 
-Cluster: cluster_100x2.yaml
-
 Test owner: krfricke
 
 Acceptance criteria: Should run faster than 500 seconds.
@@ -26,18 +24,26 @@ def main(smoke_test: bool = False):
     results_per_second = 0.01
     trial_length_s = 300
 
-    max_runtime = 1000
+    max_runtime = 500
 
-    timed_tune_run(
+    success = timed_tune_run(
         name="result network overhead",
         num_samples=num_samples,
         results_per_second=results_per_second,
         trial_length_s=trial_length_s,
         max_runtime=max_runtime,
-        resources_per_trial={"cpu": 2},  # One per node
+        # One trial per worker node, none get scheduled on the head node.
+        # See the compute config.
+        resources_per_trial={"cpu": 2},
         sync_config=tune.SyncConfig(syncer="auto"),
     )
 
+    if not success:
+        raise RuntimeError(
+            f"Test did not finish within the max_runtime ({max_runtime} s). "
+            "See above for details."
+        )
+
 
 if __name__ == "__main__":
     parser = argparse.ArgumentParser()
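The argparse block is truncated above; the wiring implied by the release_tests.yaml entries (`--smoke-test` selects the 20-node variant) would look roughly like this hypothetical sketch:

```python
# Hypothetical reconstruction of the truncated tail of the workload script;
# `argparse` is already imported at the top of the file.
parser = argparse.ArgumentParser()
parser.add_argument("--smoke-test", action="store_true", default=False)
args = parser.parse_args()
main(smoke_test=args.smoke_test)
```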
