Skip to content

Commit

Permalink
Support post build script for byod (ray-project#36457)
Browse files Browse the repository at this point in the history
Signed-off-by: can <can@anyscale.com>
Signed-off-by: e428265 <arvind.chandramouli@lmco.com>
  • Loading branch information
can-anyscale authored and arvind-chandra committed Aug 31, 2023
1 parent a89092b commit 2a77981
Show file tree
Hide file tree
Showing 12 changed files with 185 additions and 28 deletions.
15 changes: 15 additions & 0 deletions release/BUILD
Original file line number Diff line number Diff line change
Expand Up @@ -523,6 +523,21 @@ py_test(
],
)

# Unit test target for ray_release/tests/test_byod_build.py.
# Tagged "release_unit" and "team:ci"; depends on the :ray_release library and pytest.
py_test(
name = "test_byod_build",
size = "small",
srcs = ["ray_release/tests/test_byod_build.py"],
exec_compatible_with = [":hermetic_python"],
tags = [
"release_unit",
"team:ci",
],
deps = [
":ray_release",
bk_require("pytest"),
],
)

py_test(
name = "test_cluster_manager",
size = "small",
Expand Down
47 changes: 43 additions & 4 deletions release/ray_release/byod/build.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,43 @@
RELEASE_BYOD_DIR = os.path.join(RELEASE_PACKAGE_DIR, "ray_release/byod")


def build_anyscale_byod_images(tests: List[Test]) -> None:
def build_anyscale_custom_byod_image(test: Test) -> None:
    """
    Builds the custom Anyscale BYOD image of a test and pushes it.

    The image is produced from byod.custom.Dockerfile, which receives the
    test's base BYOD image and its post-build script as build arguments.
    The function returns without building when the test does not require a
    custom image, or when `_byod_image_exist` reports that the custom image
    is already there.

    Raises subprocess.CalledProcessError if `docker build` or `docker push`
    exits with a non-zero status.
    """
    if not test.require_custom_byod_image():
        logger.info(f"Test {test.get_name()} does not require a custom byod image")
        return

    image_name = test.get_anyscale_byod_image()
    if _byod_image_exist(test, base_image=False):
        logger.info(f"Image {image_name} already exists")
        return

    # Same environment as the current process, with BuildKit switched on.
    build_env = {**os.environ, "DOCKER_BUILDKIT": "1"}

    build_cmd = ["docker", "build"]
    for build_arg in (
        f"BASE_IMAGE={test.get_anyscale_base_byod_image()}",
        f"POST_BUILD_SCRIPT={test.get_byod_post_build_script()}",
    ):
        build_cmd += ["--build-arg", build_arg]
    build_cmd += [
        "-t",
        image_name,
        "-f",
        os.path.join(RELEASE_BYOD_DIR, "byod.custom.Dockerfile"),
        # The byod directory is the build context, so the post-build script
        # path is resolved relative to it.
        RELEASE_BYOD_DIR,
    ]
    # Docker output is sent to stderr rather than stdout.
    subprocess.check_call(build_cmd, stdout=sys.stderr, env=build_env)

    # push the image to ecr, the image will have a tag in this format
    # {commit_sha}-py{version}-gpu-{custom_information_dict_hash}
    push_cmd = ["docker", "push", image_name]
    subprocess.check_call(push_cmd, stdout=sys.stderr)


def build_anyscale_base_byod_images(tests: List[Test]) -> None:
"""
Builds the Anyscale BYOD images for the given tests.
"""
Expand All @@ -40,7 +76,7 @@ def build_anyscale_byod_images(tests: List[Test]) -> None:
and int(time.time()) - start < BASE_IMAGE_WAIT_TIMEOUT
):
for ray_image, test in to_be_built.items():
byod_image = test.get_anyscale_byod_image()
byod_image = test.get_anyscale_base_byod_image()
if _byod_image_exist(test):
logger.info(f"Image {byod_image} already exists")
built.add(ray_image)
Expand Down Expand Up @@ -124,15 +160,18 @@ def _ray_image_exist(ray_image: str) -> bool:
return p.returncode == 0


def _byod_image_exist(test: Test) -> bool:
def _byod_image_exist(test: Test, base_image: bool = True) -> bool:
"""
Checks if the given Anyscale BYOD image exists.
"""
client = boto3.client("ecr")
image_tag = (
test.get_byod_base_image_tag() if base_image else test.get_byod_image_tag()
)
try:
client.describe_images(
repositoryName=test.get_byod_repo(),
imageIds=[{"imageTag": test.get_byod_image_tag()}],
imageIds=[{"imageTag": image_tag}],
)
return True
except client.exceptions.ImageNotFoundException:
Expand Down
10 changes: 10 additions & 0 deletions release/ray_release/byod/byod.custom.Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
# syntax=docker/dockerfile:1.3-labs
# shellcheck disable=SC2148

# Extends a base image with one extra layer produced by running a
# post-build script inside it.

# Image to extend. It has no default, so it must be passed with --build-arg.
ARG BASE_IMAGE
FROM "$BASE_IMAGE"

# Path of the script to run. Declared after FROM so that the value is
# available to the instructions below; COPY resolves it relative to the
# build context.
ARG POST_BUILD_SCRIPT

COPY "$POST_BUILD_SCRIPT" /tmp/post_build_script.sh
# The script is invoked directly, so it must be executable and start with a
# shebang line.
RUN /tmp/post_build_script.sh
8 changes: 8 additions & 0 deletions release/ray_release/byod/byod_agent_stress_test.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
#!/bin/bash
# This script is used to build an extra layer on top of the base anyscale/ray image
# to run the agent stress test.

# -e: stop at the first failing command; -x: trace each command as it runs;
# -o pipefail: a pipeline fails if any of its stages fails, not only the last.
set -exo pipefail

# Append the Google Cloud SDK apt source, pinned to the cloud.google.gpg keyring.
# tee runs under sudo because the redirect target is under /etc/apt.
echo "deb [signed-by=/usr/share/keyrings/cloud.google.gpg] https://packages.cloud.google.com/apt cloud-sdk main" | sudo tee -a /etc/apt/sources.list.d/google-cloud-sdk.list
# Download the Google apt signing key and add it to that same keyring
# ("add -" reads the key from stdin).
curl https://packages.cloud.google.com/apt/doc/apt-key.gpg | sudo apt-key --keyring /usr/share/keyrings/cloud.google.gpg add -
8 changes: 0 additions & 8 deletions release/ray_release/glue.py
Original file line number Diff line number Diff line change
Expand Up @@ -294,14 +294,6 @@ def _prepare_remote_environment(
except CommandTimeout as e:
raise PrepareCommandTimeout(e)

for pre_run_cmd in test.get_byod_pre_run_cmds():
try:
command_runner.run_prepare_command(pre_run_cmd, timeout=300)
except CommandError as e:
raise PrepareCommandError(e)
except CommandTimeout as e:
raise PrepareCommandTimeout(e)


def _running_test_script(
test: Test,
Expand Down
4 changes: 2 additions & 2 deletions release/ray_release/schema.json
Original file line number Diff line number Diff line change
Expand Up @@ -109,8 +109,8 @@
"gpu"
]
},
"pre_run_cmds": {
"type": "array"
"post_build_script":{
"type": "string"
},
"pip": {
"type": "array"
Expand Down
13 changes: 10 additions & 3 deletions release/ray_release/scripts/build_pipeline.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,10 @@
from ray_release.buildkite.filter import filter_tests, group_tests
from ray_release.buildkite.settings import get_pipeline_settings
from ray_release.buildkite.step import get_step
from ray_release.byod.build import build_anyscale_byod_images
from ray_release.byod.build import (
build_anyscale_base_byod_images,
build_anyscale_custom_byod_image,
)
from ray_release.config import (
read_and_validate_release_test_collection,
DEFAULT_WHEEL_WAIT_TIMEOUT,
Expand Down Expand Up @@ -150,8 +153,12 @@ def main(
"Empty test collection. The selected frequency or filter did "
"not return any tests to run. Adjust your filters."
)
logger.info("Build anyscale BYOD images")
build_anyscale_byod_images([test for test, _ in filtered_tests])
tests = [test for test, _ in filtered_tests]
logger.info("Build anyscale base BYOD images")
build_anyscale_base_byod_images(tests)
logger.info("Build anyscale custom BYOD images")
for test in tests:
build_anyscale_custom_byod_image(test)
grouped_tests = group_tests(filtered_tests)

group_str = ""
Expand Down
8 changes: 6 additions & 2 deletions release/ray_release/scripts/ray_bisect.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,10 @@

from ray_release.logger import logger
from ray_release.buildkite.step import get_step
from ray_release.byod.build import build_anyscale_byod_images
from ray_release.byod.build import (
build_anyscale_base_byod_images,
build_anyscale_custom_byod_image,
)
from ray_release.config import (
read_and_validate_release_test_collection,
parse_python_version,
Expand Down Expand Up @@ -162,7 +165,8 @@ def _trigger_test_run(
if test.is_byod_cluster():
ray_wheels_url = None
os.environ["COMMIT_TO_TEST"] = commit
build_anyscale_byod_images([test])
build_anyscale_base_byod_images([test])
build_anyscale_custom_byod_image(test)
else:
ray_wheels_url = find_and_wait_for_ray_wheels_url(
commit, timeout=DEFAULT_WHEEL_WAIT_TIMEOUT, python_version=python_version
Expand Down
38 changes: 32 additions & 6 deletions release/ray_release/test.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
Result,
)
from ray_release.logger import logger
from ray_release.util import dict_hash

AWS_BUCKET = "ray-ci-results"
AWS_TEST_KEY = "ray_tests"
Expand Down Expand Up @@ -125,13 +126,13 @@ def get_byod_type(self) -> Optional[str]:
return None
return self["cluster"]["byod"].get("type", "cpu")

def get_byod_pre_run_cmds(self) -> List[str]:
def get_byod_post_build_script(self) -> Optional[str]:
"""
Returns the list of pre-run commands for the BYOD cluster.
Returns the post-build script for the BYOD cluster.
"""
if not self.is_byod_cluster():
return []
return self["cluster"]["byod"].get("pre_run_cmds", [])
return None
return self["cluster"]["byod"].get("post_build_script")

def get_byod_runtime_env(self) -> Dict[str, str]:
"""
Expand Down Expand Up @@ -200,7 +201,7 @@ def get_python_version(self) -> str:
"""
return self.get("python", ".".join(str(v) for v in DEFAULT_PYTHON_VERSION))

def get_byod_image_tag(self) -> str:
def get_byod_base_image_tag(self) -> str:
"""
Returns the byod image tag to use for this test.
"""
Expand All @@ -223,6 +224,17 @@ def get_byod_image_tag(self) -> str:
python_version = f"py{self.get_python_version().replace('.', '')}"
return f"{ray_version}-{python_version}{image_suffix}"

def get_byod_image_tag(self) -> str:
    """
    Returns the tag of the byod image this test runs on.

    For a test without custom requirements this is the base image tag.
    Otherwise the base tag is followed by "-" and the hash of the test's
    custom build information (currently only the post-build script).
    """
    if self.require_custom_byod_image():
        custom_info = {
            "post_build_script": self.get_byod_post_build_script(),
        }
        base_tag = self.get_byod_base_image_tag()
        custom_suffix = dict_hash(custom_info)
        return f"{base_tag}-{custom_suffix}"
    return self.get_byod_base_image_tag()

def get_byod_repo(self) -> str:
"""
Returns the byod repo to use for this test.
Expand All @@ -238,7 +250,21 @@ def get_ray_image(self) -> str:
Returns the ray docker image to use for this test.
"""
ray_project = "ray-ml" if self.get_byod_type() == "gpu" else "ray"
return f"rayproject/{ray_project}:{self.get_byod_image_tag()}"
return f"rayproject/{ray_project}:{self.get_byod_base_image_tag()}"

def get_anyscale_base_byod_image(self) -> str:
    """
    Returns the full name of the anyscale *base* byod image for this test,
    in the form {DATAPLANE_ECR}/{byod repo}:{base image tag}.
    """
    registry = DATAPLANE_ECR
    repo = self.get_byod_repo()
    base_tag = self.get_byod_base_image_tag()
    return f"{registry}/{repo}:{base_tag}"

def require_custom_byod_image(self) -> bool:
    """
    Tells whether a custom byod image has to be built for this test.

    That is the case exactly when the test declares a post-build script.
    """
    post_build_script = self.get_byod_post_build_script()
    return post_build_script is not None

def get_anyscale_byod_image(self) -> str:
"""
Expand Down
42 changes: 42 additions & 0 deletions release/ray_release/tests/test_byod_build.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
import sys

import pytest
from unittest.mock import patch
from typing import List

from ray_release.test import Test
from ray_release.byod.build import build_anyscale_custom_byod_image


def test_build_anyscale_custom_byod_image() -> None:
    """
    Checks the docker commands issued for a test with a post-build script.

    `_byod_image_exist` is patched to report the image as missing and
    `subprocess.check_call` is replaced by a recorder, so neither docker nor
    ECR is actually contacted. The recorded commands are then compared with
    the image names computed by the Test object itself.
    """
    cmds = []

    def _mock_check_call(
        cmd: List[str],
        *args,
        **kwargs,
    ) -> None:
        # Record the command instead of executing it.
        cmds.append(cmd)

    with patch(
        "ray_release.byod.build._byod_image_exist", return_value=False
    ), patch.dict(
        "os.environ",
        {"BUILDKITE_COMMIT": "abc123", "BUILDKITE_BRANCH": "master"},
    ), patch(
        "subprocess.check_call",
        side_effect=_mock_check_call,
    ):
        test = Test(
            name="name",
            cluster={"byod": {"post_build_script": "foo.sh"}},
        )
        build_anyscale_custom_byod_image(test)
        # Resolve the image names while the environment is still patched,
        # since the tags are derived from it.
        base_image = test.get_anyscale_base_byod_image()
        custom_image = test.get_anyscale_byod_image()

    # The previous check was `assert "<literal>"` followed by bare string
    # statements: adjacent literals on separate lines are not concatenated
    # without parentheses, so it always passed. Its expected text also lacked
    # the POST_BUILD_SCRIPT build arg. Compare the argument lists instead.
    assert len(cmds) == 2
    build_cmd, push_cmd = cmds
    assert build_cmd[:2] == ["docker", "build"]
    assert f"BASE_IMAGE={base_image}" in build_cmd
    assert "POST_BUILD_SCRIPT=foo.sh" in build_cmd
    assert build_cmd[build_cmd.index("-t") + 1] == custom_image
    # The custom tag is the base tag plus a hash suffix.
    assert custom_image.startswith(f"{base_image}-")
    assert push_cmd == ["docker", "push", custom_image]


if __name__ == "__main__":
sys.exit(pytest.main(["-v", __file__]))
15 changes: 15 additions & 0 deletions release/ray_release/tests/test_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -90,6 +90,21 @@ def test_get_anyscale_byod_image():
).get_anyscale_byod_image()
== f"{DATAPLANE_ECR}/{DATAPLANE_ECR_ML_REPO}:123456-py38-gpu"
)
assert (
_stub_test(
{
"python": "3.8",
"cluster": {
"byod": {
"type": "gpu",
"post_build_script": "foo.sh",
}
},
}
).get_anyscale_byod_image()
== f"{DATAPLANE_ECR}/{DATAPLANE_ECR_ML_REPO}:123456-py38-gpu-"
"ab7ed2b7a7e8d3f855a7925b0d296b0f9c75fac91882aba47854d92d27e13e53"
)


@patch("github.Repository")
Expand Down
5 changes: 2 additions & 3 deletions release/release_tests.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -6363,11 +6363,10 @@
team: core
cluster:
byod:
type: gpu
runtime_env:
- RAY_INTERNAL_MEM_PROFILE_COMPONENTS=dashboard_agent
pre_run_cmds:
- echo "deb [signed-by=/usr/share/keyrings/cloud.google.gpg] https://packages.cloud.google.com/apt cloud-sdk main" | sudo tee -a /etc/apt/sources.list.d/google-cloud-sdk.list
- curl https://packages.cloud.google.com/apt/doc/apt-key.gpg | sudo apt-key --keyring /usr/share/keyrings/cloud.google.gpg add -
post_build_script: byod_agent_stress_test.sh
cluster_env: agent_stress_app_config.yaml
cluster_compute: agent_stress_compute.yaml

Expand Down

0 comments on commit 2a77981

Please sign in to comment.