Skip to content

Commit

Permalink
[nightly] Start to populate metrics for many_nodes_actor_test (ray-pr…
Browse files Browse the repository at this point in the history
…oject#32498)

Now that the v2 stack supports metrics population, this PR turns it on for our nightly test.
  • Loading branch information
fishbone authored and peytondmurray committed Mar 22, 2023
1 parent 659a97d commit 11ca107
Showing 1 changed file with 22 additions and 17 deletions.
39 changes: 22 additions & 17 deletions release/benchmarks/distributed/many_nodes_tests/actor_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -93,39 +93,44 @@ def main():
args, unknown = parse_script_args()
args.total_actors.sort()

ray.init(address="auto")
from distributed.dashboard_test import DashboardTestAtScale

addr = ray.init(address="auto")
dashboard_test = DashboardTestAtScale(addr)

dashboard_test = None
# Enable it once v2 support prometheus
# dashboard_test = DashboardTestAtScale(addr)
result = {}
for i in args.total_actors:
result[f"many_nodes_actor_tests_{i}"] = run_one(
i, args.cpus_per_actor, args.no_wait
)

# Print the results early so that if something fails later, we can
# still see them in the log.
print(f"Result: {json.dumps(result, indent=2)}")

if "TEST_OUTPUT_JSON" in os.environ and not args.no_report:
out_file = open(os.environ["TEST_OUTPUT_JSON"], "w")
if dashboard_test is not None:
perf = [
{
"perf_metric_name": name,
"perf_metric_value": r["throughput"],
"perf_metric_type": "THROUGHPUT",
}
for (name, r) in result.items()
]
result["perf_metrics"] = perf
dashboard_test.update_release_test_result(result)
perf = [
{
"perf_metric_name": name,
"perf_metric_value": r["throughput"],
"perf_metric_type": "THROUGHPUT",
}
for (name, r) in result.items()
]
result["perf_metrics"] = perf
dashboard_test.update_release_test_result(result)

print(f"Writing data into file: {os.environ['TEST_OUTPUT_JSON']}")
json.dump(result, out_file)
print(f"Result: {json.dumps(result, indent=2)}")

print("Test finished successfully!")
ray.shutdown()

# We need to make sure GCS cools down; otherwise, the testing infra
# might time out when fetching the result.
# might time out when fetching the result, because when the driver
# shuts down, many actors need to be terminated, which will
# overload GCS.
print("Sleep for 60s, waiting for the cluster to cool down.")
sleep(60)

Expand Down

0 comments on commit 11ca107

Please sign in to comment.