From 292af08fe3edb5c68507ab8ee19975554b4aff90 Mon Sep 17 00:00:00 2001
From: Justin Yu
Date: Fri, 16 Jun 2023 00:16:16 -0700
Subject: [PATCH] [release/air] Fix release test timeout for
 `tune_scalability_network_overhead` (#36360)

There are 2 versions of the test: a smoke test version with 20 nodes/trials
and a full version with 100 nodes/trials. The smoke test version timed out
recently.

The test setups are slightly different:
- Smoke test runs with m5a.large (2 cpus) x 20 nodes.
- Full version runs with m5a.4xlarge (16 cpus) head node + 99 x m5a.large
  worker nodes.
- The smoke test takes **longer** than the full run due to syncing overhead
  at the end, caused by the smaller head node instance (since all the
  syncing goes there).

This PR bumps the smoke test's head node instance size and forces trials to
run on worker nodes -- the head node is purely meant to handle syncing in
this test. This also fixes a problem that existed before, where the full
100-trial release test would schedule 8 trials on the head node rather than
utilizing every node in the cluster.

See https://github.com/ray-project/ray/issues/36346 for more context.

Signed-off-by: Justin Yu
---
 python/ray/tune/utils/release_test_util.py       |  8 ++++++--
 release/release_tests.yaml                       |  4 ++--
 .../tune_tests/scalability_tests/tpl_100x2.yaml  | 11 ++++++++---
 .../tune_tests/scalability_tests/tpl_20x2.yaml   | 13 +++++++++----
 .../scalability_tests/tpl_gce_100x2.yaml         | 13 +++++++++----
 .../workloads/test_network_overhead.py           | 16 +++++++++++-----
 6 files changed, 45 insertions(+), 20 deletions(-)

diff --git a/python/ray/tune/utils/release_test_util.py b/python/ray/tune/utils/release_test_util.py
index 5355b3a1b1d8..1b9ad5918349 100644
--- a/python/ray/tune/utils/release_test_util.py
+++ b/python/ray/tune/utils/release_test_util.py
@@ -108,7 +108,7 @@ def timed_tune_run(
     checkpoint_size_b: int = 0,
     checkpoint_num_files: int = 1,
     **tune_kwargs,
-):
+) -> bool:
     durable = (
         "storage_path" in tune_kwargs
         and tune_kwargs["storage_path"]
@@ -164,7 +164,9 @@ def timed_tune_run(
     with open(test_output_json, "wt") as f:
         json.dump(result, f)

-    if time_taken > max_runtime:
+    success = time_taken <= max_runtime
+
+    if not success:
         print(
             f"The {name} test took {time_taken:.2f} seconds, but should not "
             f"have exceeded {max_runtime:.2f} seconds. Test failed. \n\n"
@@ -179,3 +181,5 @@
         f"--- PASSED: {name.upper()} ::: "
         f"{time_taken:.2f} <= {max_runtime:.2f} ---"
     )
+
+    return success
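Note on the API change above: `timed_tune_run` now returns whether the run beat `max_runtime` instead of only printing a pass/fail message. A minimal caller sketch (the kwargs mirror the ones this patch's workload passes; they are illustrative, not the utility's full signature):

```python
# Sketch of consuming the new boolean return value. Kwargs mirror this
# patch's workload; they are illustrative, not the full signature.
from ray.tune.utils.release_test_util import timed_tune_run

success = timed_tune_run(
    name="result network overhead",
    num_samples=20,
    results_per_second=0.01,
    trial_length_s=300,
    max_runtime=500,
    resources_per_trial={"cpu": 2},
)
if not success:
    # Fail the release job loudly instead of relying on printed output.
    raise RuntimeError("timed_tune_run exceeded max_runtime; see logs above.")
```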
\n\n" @@ -179,3 +181,5 @@ def timed_tune_run( f"--- PASSED: {name.upper()} ::: " f"{time_taken:.2f} <= {max_runtime:.2f} ---" ) + + return success diff --git a/release/release_tests.yaml b/release/release_tests.yaml index fbe1fffb4a1c..79a5aa84947b 100644 --- a/release/release_tests.yaml +++ b/release/release_tests.yaml @@ -2023,7 +2023,7 @@ cluster_compute: tpl_100x2.yaml run: - timeout: 900 + timeout: 750 prepare_timeout: 1200 script: python workloads/test_network_overhead.py wait_for_nodes: @@ -2039,7 +2039,7 @@ cluster_env: app_config.yaml cluster_compute: tpl_20x2.yaml run: - timeout: 500 + timeout: 750 prepare_timeout: 600 script: python workloads/test_network_overhead.py --smoke-test wait_for_nodes: diff --git a/release/tune_tests/scalability_tests/tpl_100x2.yaml b/release/tune_tests/scalability_tests/tpl_100x2.yaml index 7b59d177ad69..f01952b49efc 100644 --- a/release/tune_tests/scalability_tests/tpl_100x2.yaml +++ b/release/tune_tests/scalability_tests/tpl_100x2.yaml @@ -1,15 +1,20 @@ cloud_id: {{env["ANYSCALE_CLOUD_ID"]}} region: us-west-2 -max_workers: 99 +max_workers: 100 head_node_type: name: head_node instance_type: m5a.4xlarge + # See `tune_scalability_network_overhead` + # Don't schedule any trials to run on the head node. + # The head node is only used for synchronization purposes. + resources: + cpu: 0 worker_node_types: - name: worker_node instance_type: m5a.large - min_workers: 99 - max_workers: 99 + min_workers: 100 + max_workers: 100 use_spot: true diff --git a/release/tune_tests/scalability_tests/tpl_20x2.yaml b/release/tune_tests/scalability_tests/tpl_20x2.yaml index ead23a1a142b..b50e088eab57 100644 --- a/release/tune_tests/scalability_tests/tpl_20x2.yaml +++ b/release/tune_tests/scalability_tests/tpl_20x2.yaml @@ -1,15 +1,20 @@ cloud_id: {{env["ANYSCALE_CLOUD_ID"]}} region: us-west-2 -max_workers: 19 +max_workers: 20 head_node_type: name: head_node - instance_type: m5a.large + instance_type: m5a.4xlarge + # See `tune_scalability_network_overhead` + # Don't schedule any trials to run on the head node. + # The head node is only used for synchronization purposes. + resources: + cpu: 0 worker_node_types: - name: worker_node instance_type: m5a.large - min_workers: 19 - max_workers: 19 + min_workers: 20 + max_workers: 20 use_spot: true diff --git a/release/tune_tests/scalability_tests/tpl_gce_100x2.yaml b/release/tune_tests/scalability_tests/tpl_gce_100x2.yaml index 14f6b70b279f..f2a446862bfd 100644 --- a/release/tune_tests/scalability_tests/tpl_gce_100x2.yaml +++ b/release/tune_tests/scalability_tests/tpl_gce_100x2.yaml @@ -1,17 +1,22 @@ cloud_id: cld_tPsS3nQz8p5cautbyWgEdr4y region: us-west1 -allowed_azs: +allowed_azs: - us-west1-c -max_workers: 99 +max_workers: 100 head_node_type: name: head_node + # See `tune_scalability_network_overhead` + # Don't schedule any trials to run on the head node. + # The head node is only used for synchronization purposes. 
diff --git a/release/tune_tests/scalability_tests/tpl_gce_100x2.yaml b/release/tune_tests/scalability_tests/tpl_gce_100x2.yaml
index 14f6b70b279f..f2a446862bfd 100644
--- a/release/tune_tests/scalability_tests/tpl_gce_100x2.yaml
+++ b/release/tune_tests/scalability_tests/tpl_gce_100x2.yaml
@@ -1,17 +1,22 @@
 cloud_id: cld_tPsS3nQz8p5cautbyWgEdr4y
 region: us-west1

-allowed_azs: 
+allowed_azs:
 - us-west1-c

-max_workers: 99
+max_workers: 100

 head_node_type:
   name: head_node
+  # See `tune_scalability_network_overhead`
+  # Don't schedule any trials to run on the head node.
+  # The head node is only used for synchronization purposes.
   instance_type: n2d-standard-16
+  resources:
+    cpu: 0

 worker_node_types:
 - name: worker_node
   instance_type: n2d-standard-2
-  min_workers: 99
-  max_workers: 99
+  min_workers: 100
+  max_workers: 100
   use_spot: true

diff --git a/release/tune_tests/scalability_tests/workloads/test_network_overhead.py b/release/tune_tests/scalability_tests/workloads/test_network_overhead.py
index 8e63d250a3de..1543ec1777a1 100644
--- a/release/tune_tests/scalability_tests/workloads/test_network_overhead.py
+++ b/release/tune_tests/scalability_tests/workloads/test_network_overhead.py
@@ -4,8 +4,6 @@
 This test will thus measure the overhead that comes with network communication
 and specifically log synchronization.

-Cluster: cluster_100x2.yaml
-
 Test owner: krfricke

 Acceptance criteria: Should run faster than 500 seconds.
@@ -26,18 +24,26 @@ def main(smoke_test: bool = False):
     results_per_second = 0.01
     trial_length_s = 300

-    max_runtime = 1000
+    max_runtime = 500

-    timed_tune_run(
+    success = timed_tune_run(
         name="result network overhead",
         num_samples=num_samples,
         results_per_second=results_per_second,
         trial_length_s=trial_length_s,
         max_runtime=max_runtime,
-        resources_per_trial={"cpu": 2},  # One per node
+        # One trial per worker node, none get scheduled on the head node.
+        # See the compute config.
+        resources_per_trial={"cpu": 2},
         sync_config=tune.SyncConfig(syncer="auto"),
     )

+    if not success:
+        raise RuntimeError(
+            f"Test did not finish within the max_runtime ({max_runtime} s). "
+            "See above for details."
+        )
+

 if __name__ == "__main__":
     parser = argparse.ArgumentParser()
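The diff's trailing context stops at the parser. For reference, a plausible completion of the `__main__` block (hypothetical; only the `--smoke-test` flag is grounded in this patch, via the release_tests.yaml script line and the `main(smoke_test: bool = False)` signature):

```python
# Hypothetical continuation of the truncated context above.
parser.add_argument("--smoke-test", action="store_true", default=False)
args = parser.parse_args()
main(smoke_test=args.smoke_test)
```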