diff --git a/release/BUILD b/release/BUILD
index 7387db0d8697..ad3e31f03b7c 100644
--- a/release/BUILD
+++ b/release/BUILD
@@ -523,6 +523,21 @@ py_test(
     ],
 )
 
+py_test(
+    name = "test_byod_build",
+    size = "small",
+    srcs = ["ray_release/tests/test_byod_build.py"],
+    exec_compatible_with = [":hermetic_python"],
+    tags = [
+        "release_unit",
+        "team:ci",
+    ],
+    deps = [
+        ":ray_release",
+        bk_require("pytest"),
+    ],
+)
+
 py_test(
     name = "test_cluster_manager",
     size = "small",
diff --git a/release/ray_release/byod/build.py b/release/ray_release/byod/build.py
index 4a49e2123e29..2e163ed278e8 100644
--- a/release/ray_release/byod/build.py
+++ b/release/ray_release/byod/build.py
@@ -19,7 +19,47 @@
 RELEASE_BYOD_DIR = os.path.join(RELEASE_PACKAGE_DIR, "ray_release/byod")
 
 
-def build_anyscale_byod_images(tests: List[Test]) -> None:
+def build_anyscale_custom_byod_image(test: Test) -> None:
+    """
+    Builds and pushes the custom Anyscale BYOD image for the given test, unless
+    the test needs no custom image or the image already exists.
+    """
+    if not test.require_custom_byod_image():
+        logger.info(f"Test {test.get_name()} does not require a custom byod image")
+        return
+    byod_image = test.get_anyscale_byod_image()
+    if _byod_image_exist(test, base_image=False):
+        logger.info(f"Image {byod_image} already exists")
+        return
+
+    env = os.environ.copy()
+    env["DOCKER_BUILDKIT"] = "1"
+    subprocess.check_call(
+        [
+            "docker",
+            "build",
+            "--build-arg",
+            f"BASE_IMAGE={test.get_anyscale_base_byod_image()}",
+            "--build-arg",
+            f"POST_BUILD_SCRIPT={test.get_byod_post_build_script()}",
+            "-t",
+            byod_image,
+            "-f",
+            os.path.join(RELEASE_BYOD_DIR, "byod.custom.Dockerfile"),
+            RELEASE_BYOD_DIR,
+        ],
+        stdout=sys.stderr,
+        env=env,
+    )
+    # push the image to ecr, the image will have a tag in this format
+    # {commit_sha}-py{version}-gpu-{custom_information_dict_hash}
+    subprocess.check_call(
+        ["docker", "push", byod_image],
+        stdout=sys.stderr,
+    )
+
+
+def build_anyscale_base_byod_images(tests: List[Test]) -> None:
     """
     Builds the Anyscale BYOD images for the given tests.
     """
@@ -40,7 +80,7 @@ def build_anyscale_byod_images(tests: List[Test]) -> None:
         and int(time.time()) - start < BASE_IMAGE_WAIT_TIMEOUT
     ):
         for ray_image, test in to_be_built.items():
-            byod_image = test.get_anyscale_byod_image()
+            byod_image = test.get_anyscale_base_byod_image()
             if _byod_image_exist(test):
                 logger.info(f"Image {byod_image} already exists")
                 built.add(ray_image)
@@ -124,15 +164,18 @@ def _ray_image_exist(ray_image: str) -> bool:
     return p.returncode == 0
 
 
-def _byod_image_exist(test: Test) -> bool:
+def _byod_image_exist(test: Test, base_image: bool = True) -> bool:
     """
     Checks if the given Anyscale BYOD image exists.
     """
     client = boto3.client("ecr")
+    image_tag = (
+        test.get_byod_base_image_tag() if base_image else test.get_byod_image_tag()
+    )
     try:
         client.describe_images(
             repositoryName=test.get_byod_repo(),
-            imageIds=[{"imageTag": test.get_byod_image_tag()}],
+            imageIds=[{"imageTag": image_tag}],
         )
         return True
     except client.exceptions.ImageNotFoundException:
diff --git a/release/ray_release/byod/byod.custom.Dockerfile b/release/ray_release/byod/byod.custom.Dockerfile
new file mode 100644
index 000000000000..432ddeef3138
--- /dev/null
+++ b/release/ray_release/byod/byod.custom.Dockerfile
@@ -0,0 +1,10 @@
+# syntax=docker/dockerfile:1.3-labs
+# shellcheck disable=SC2148
+
+ARG BASE_IMAGE
+FROM "$BASE_IMAGE"
+
+ARG POST_BUILD_SCRIPT
+
+COPY "$POST_BUILD_SCRIPT" /tmp/post_build_script.sh
+RUN /tmp/post_build_script.sh
diff --git a/release/ray_release/byod/byod_agent_stress_test.sh b/release/ray_release/byod/byod_agent_stress_test.sh
new file mode 100755
index 000000000000..95bfb32ff3a2
--- /dev/null
+++ b/release/ray_release/byod/byod_agent_stress_test.sh
@@ -0,0 +1,8 @@
+#!/bin/bash
+# This script is used to build an extra layer on top of the base anyscale/ray image
+# to run the agent stress test.
+
+set -exo pipefail
+
+echo "deb [signed-by=/usr/share/keyrings/cloud.google.gpg] https://packages.cloud.google.com/apt cloud-sdk main" | sudo tee -a /etc/apt/sources.list.d/google-cloud-sdk.list
+curl https://packages.cloud.google.com/apt/doc/apt-key.gpg | sudo apt-key --keyring /usr/share/keyrings/cloud.google.gpg add -
diff --git a/release/ray_release/glue.py b/release/ray_release/glue.py
index 8b022209106b..b1c519c941aa 100644
--- a/release/ray_release/glue.py
+++ b/release/ray_release/glue.py
@@ -294,14 +294,6 @@ def _prepare_remote_environment(
         except CommandTimeout as e:
             raise PrepareCommandTimeout(e)
 
-    for pre_run_cmd in test.get_byod_pre_run_cmds():
-        try:
-            command_runner.run_prepare_command(pre_run_cmd, timeout=300)
-        except CommandError as e:
-            raise PrepareCommandError(e)
-        except CommandTimeout as e:
-            raise PrepareCommandTimeout(e)
-
 
 def _running_test_script(
     test: Test,
diff --git a/release/ray_release/schema.json b/release/ray_release/schema.json
index a812fd709fbf..4734a3c84975 100644
--- a/release/ray_release/schema.json
+++ b/release/ray_release/schema.json
@@ -109,8 +109,8 @@
                                 "gpu"
                             ]
                         },
-                        "pre_run_cmds": {
-                            "type": "array"
+                        "post_build_script":{
+                            "type": "string"
                         },
                         "pip": {
                             "type": "array"
diff --git a/release/ray_release/scripts/build_pipeline.py b/release/ray_release/scripts/build_pipeline.py
index e72957d9fed2..b04691c52e9a 100644
--- a/release/ray_release/scripts/build_pipeline.py
+++ b/release/ray_release/scripts/build_pipeline.py
@@ -11,7 +11,10 @@
 from ray_release.buildkite.filter import filter_tests, group_tests
 from ray_release.buildkite.settings import get_pipeline_settings
 from ray_release.buildkite.step import get_step
-from ray_release.byod.build import build_anyscale_byod_images
+from ray_release.byod.build import (
+    build_anyscale_base_byod_images,
+    build_anyscale_custom_byod_image,
+)
 from ray_release.config import (
     read_and_validate_release_test_collection,
     DEFAULT_WHEEL_WAIT_TIMEOUT,
@@ -150,8 +153,12 @@ def main(
             "Empty test collection. The selected frequency or filter did "
             "not return any tests to run. Adjust your filters."
         )
-    logger.info("Build anyscale BYOD images")
-    build_anyscale_byod_images([test for test, _ in filtered_tests])
+    tests = [test for test, _ in filtered_tests]
+    logger.info("Build anyscale base BYOD images")
+    build_anyscale_base_byod_images(tests)
+    logger.info("Build anyscale custom BYOD images")
+    for test in tests:
+        build_anyscale_custom_byod_image(test)
     grouped_tests = group_tests(filtered_tests)
 
     group_str = ""
diff --git a/release/ray_release/scripts/ray_bisect.py b/release/ray_release/scripts/ray_bisect.py
index d82cee22d088..6e3df088268b 100644
--- a/release/ray_release/scripts/ray_bisect.py
+++ b/release/ray_release/scripts/ray_bisect.py
@@ -8,7 +8,10 @@
 
 from ray_release.logger import logger
 from ray_release.buildkite.step import get_step
-from ray_release.byod.build import build_anyscale_byod_images
+from ray_release.byod.build import (
+    build_anyscale_base_byod_images,
+    build_anyscale_custom_byod_image,
+)
 from ray_release.config import (
     read_and_validate_release_test_collection,
     parse_python_version,
@@ -162,7 +165,8 @@ def _trigger_test_run(
     if test.is_byod_cluster():
         ray_wheels_url = None
         os.environ["COMMIT_TO_TEST"] = commit
-        build_anyscale_byod_images([test])
+        build_anyscale_base_byod_images([test])
+        build_anyscale_custom_byod_image(test)
     else:
         ray_wheels_url = find_and_wait_for_ray_wheels_url(
             commit, timeout=DEFAULT_WHEEL_WAIT_TIMEOUT, python_version=python_version
diff --git a/release/ray_release/test.py b/release/ray_release/test.py
index b52b1bea85ca..6d428fcb4fee 100644
--- a/release/ray_release/test.py
+++ b/release/ray_release/test.py
@@ -14,6 +14,7 @@
     Result,
 )
 from ray_release.logger import logger
+from ray_release.util import dict_hash
 
 AWS_BUCKET = "ray-ci-results"
 AWS_TEST_KEY = "ray_tests"
@@ -125,13 +126,13 @@ def get_byod_type(self) -> Optional[str]:
             return None
         return self["cluster"]["byod"].get("type", "cpu")
 
-    def get_byod_pre_run_cmds(self) -> List[str]:
+    def get_byod_post_build_script(self) -> Optional[str]:
         """
-        Returns the list of pre-run commands for the BYOD cluster.
+        Returns the post-build script for the BYOD cluster.
         """
         if not self.is_byod_cluster():
-            return []
-        return self["cluster"]["byod"].get("pre_run_cmds", [])
+            return None
+        return self["cluster"]["byod"].get("post_build_script")
 
     def get_byod_runtime_env(self) -> Dict[str, str]:
         """
@@ -200,7 +201,7 @@ def get_python_version(self) -> str:
         """
         return self.get("python", ".".join(str(v) for v in DEFAULT_PYTHON_VERSION))
 
-    def get_byod_image_tag(self) -> str:
+    def get_byod_base_image_tag(self) -> str:
         """
         Returns the byod image tag to use for this test.
         """
@@ -223,6 +224,17 @@ def get_byod_image_tag(self) -> str:
         python_version = f"py{self.get_python_version().replace('.', '')}"
         return f"{ray_version}-{python_version}{image_suffix}"
 
+    def get_byod_image_tag(self) -> str:
+        """
+        Returns the byod custom image tag to use for this test.
+        """
+        if not self.require_custom_byod_image():
+            return self.get_byod_base_image_tag()
+        custom_info = {
+            "post_build_script": self.get_byod_post_build_script(),
+        }
+        return f"{self.get_byod_base_image_tag()}-{dict_hash(custom_info)}"
+
     def get_byod_repo(self) -> str:
         """
         Returns the byod repo to use for this test.
@@ -238,7 +250,21 @@ def get_ray_image(self) -> str:
         Returns the ray docker image to use for this test.
         """
         ray_project = "ray-ml" if self.get_byod_type() == "gpu" else "ray"
-        return f"rayproject/{ray_project}:{self.get_byod_image_tag()}"
+        return f"rayproject/{ray_project}:{self.get_byod_base_image_tag()}"
+
+    def get_anyscale_base_byod_image(self) -> str:
+        """
+        Returns the anyscale base byod image to use for this test.
+        """
+        return (
+            f"{DATAPLANE_ECR}/{self.get_byod_repo()}:{self.get_byod_base_image_tag()}"
+        )
+
+    def require_custom_byod_image(self) -> bool:
+        """
+        Returns whether this test requires a custom byod image.
+        """
+        return self.get_byod_post_build_script() is not None
 
     def get_anyscale_byod_image(self) -> str:
         """
diff --git a/release/ray_release/tests/test_byod_build.py b/release/ray_release/tests/test_byod_build.py
new file mode 100644
index 000000000000..3cea0f1496e8
--- /dev/null
+++ b/release/ray_release/tests/test_byod_build.py
@@ -0,0 +1,48 @@
+import sys
+
+import pytest
+from unittest.mock import patch
+from typing import List
+
+from ray_release.test import Test
+from ray_release.byod.build import build_anyscale_custom_byod_image
+
+
+def test_build_anyscale_custom_byod_image() -> None:
+    cmds = []
+
+    def _mock_check_call(
+        cmd: List[str],
+        *args,
+        **kwargs,
+    ) -> None:
+        cmds.append(cmd)
+
+    with patch(
+        "ray_release.byod.build._byod_image_exist", return_value=False
+    ), patch.dict(
+        "os.environ",
+        {"BUILDKITE_COMMIT": "abc123", "BUILDKITE_BRANCH": "master"},
+    ), patch(
+        "subprocess.check_call",
+        side_effect=_mock_check_call,
+    ):
+        test = Test(
+            name="name",
+            cluster={"byod": {"post_build_script": "foo.sh"}},
+        )
+        build_anyscale_custom_byod_image(test)
+        byod_image = test.get_anyscale_byod_image()
+        # Parenthesized so the whole expected command is compared; a bare
+        # multi-line string after `assert` only tests its first literal.
+        assert (
+            "docker build "
+            f"--build-arg BASE_IMAGE={test.get_anyscale_base_byod_image()} "
+            "--build-arg POST_BUILD_SCRIPT=foo.sh "
+            f"-t {byod_image} "
+        ) in " ".join(cmds[0])
+        assert cmds[1] == ["docker", "push", byod_image]
+
+
+if __name__ == "__main__":
+    sys.exit(pytest.main(["-v", __file__]))
diff --git a/release/ray_release/tests/test_test.py b/release/ray_release/tests/test_test.py
index f096fefe1e84..878bae12f147 100644
--- a/release/ray_release/tests/test_test.py
+++ b/release/ray_release/tests/test_test.py
@@ -90,6 +90,21 @@ def test_get_anyscale_byod_image():
         ).get_anyscale_byod_image()
         == f"{DATAPLANE_ECR}/{DATAPLANE_ECR_ML_REPO}:123456-py38-gpu"
     )
+    assert (
+        _stub_test(
+            {
+                "python": "3.8",
+                "cluster": {
+                    "byod": {
+                        "type": "gpu",
+                        "post_build_script": "foo.sh",
+                    }
+                },
+            }
+        ).get_anyscale_byod_image()
+        == f"{DATAPLANE_ECR}/{DATAPLANE_ECR_ML_REPO}:123456-py38-gpu-"
+        "ab7ed2b7a7e8d3f855a7925b0d296b0f9c75fac91882aba47854d92d27e13e53"
+    )
 
 
 @patch("github.Repository")
diff --git a/release/release_tests.yaml b/release/release_tests.yaml
index 3ce30fed2027..f889eb6845ab 100644
--- a/release/release_tests.yaml
+++ b/release/release_tests.yaml
@@ -6363,11 +6363,10 @@
   team: core
   cluster:
     byod:
+      type: gpu
       runtime_env:
         - RAY_INTERNAL_MEM_PROFILE_COMPONENTS=dashboard_agent
-      pre_run_cmds:
-        - echo "deb [signed-by=/usr/share/keyrings/cloud.google.gpg] https://packages.cloud.google.com/apt cloud-sdk main" | sudo tee -a /etc/apt/sources.list.d/google-cloud-sdk.list
-        - curl https://packages.cloud.google.com/apt/doc/apt-key.gpg | sudo apt-key --keyring /usr/share/keyrings/cloud.google.gpg add -
+      post_build_script: byod_agent_stress_test.sh
     cluster_env: agent_stress_app_config.yaml
     cluster_compute: agent_stress_compute.yaml
 