Skip to content

Commit

Permalink
Migrate all serve tests to byod (ray-project#36380)
Browse files (browse the repository at this point in the history)
Signed-off-by: can <can@anyscale.com>
Signed-off-by: e428265 <arvind.chandramouli@lmco.com>
  • Loading branch information
can-anyscale authored and arvind-chandra committed Aug 31, 2023
1 parent 2a77981 commit 8beb15e
Show file tree
Hide file tree
Showing 9 changed files with 338 additions and 25 deletions.
10 changes: 10 additions & 0 deletions release/BUILD
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,16 @@ compile_pip_requirements(
visibility = ["//visibility:private"],
)

# Pip-compile target for the "ml" byod requirements: reads
# ray_release/byod/requirements_ml_byod.in and maintains
# ray_release/byod/requirements_ml_byod.txt.
# NOTE(review): presumably the .txt is the pinned lock file that the byod image
# build installs — confirm against the compile_pip_requirements rule in use.
compile_pip_requirements(
name = "requirements_ml_byod",
requirements_in = "ray_release/byod/requirements_ml_byod.in",
requirements_txt = "ray_release/byod/requirements_ml_byod.txt",
tags = [
"team:ci",
],
visibility = ["//visibility:private"],
)

test_srcs = glob(["**/*.py"])

####
Expand Down
9 changes: 8 additions & 1 deletion release/ray_release/byod/build.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,8 @@
# NOTE(review): wait budget and polling interval for the base image; units are
# presumably seconds — confirm against the wait loop that reads them.
BASE_IMAGE_WAIT_TIMEOUT = 7200
BASE_IMAGE_WAIT_DURATION = 30
# Directory inside the release package that holds the byod build files.
RELEASE_BYOD_DIR = os.path.join(RELEASE_PACKAGE_DIR, "ray_release/byod")
# Requirements file names passed to docker build as the PIP_REQUIREMENTS build arg.
# REQUIREMENTS_BYOD is used when a test's byod type is "cpu"; every other type
# gets REQUIREMENTS_ML_BYOD.
REQUIREMENTS_BYOD = "requirements_byod.txt"
REQUIREMENTS_ML_BYOD = "requirements_ml_byod.txt"


def build_anyscale_custom_byod_image(test: Test) -> None:
Expand Down Expand Up @@ -77,6 +79,11 @@ def build_anyscale_base_byod_images(tests: List[Test]) -> None:
):
for ray_image, test in to_be_built.items():
byod_image = test.get_anyscale_base_byod_image()
byod_requirements = (
REQUIREMENTS_BYOD
if test.get_byod_type() == "cpu"
else REQUIREMENTS_ML_BYOD
)
if _byod_image_exist(test):
logger.info(f"Image {byod_image} already exists")
built.add(ray_image)
Expand Down Expand Up @@ -114,7 +121,7 @@ def build_anyscale_base_byod_images(tests: List[Test]) -> None:
"--build-arg",
f"BASE_IMAGE={byod_image}",
"--build-arg",
"PIP_REQUIREMENTS=requirements_byod.txt",
f"PIP_REQUIREMENTS={byod_requirements}",
"--build-arg",
"DEBIAN_REQUIREMENTS=requirements_debian_byod.txt",
"-t",
Expand Down
7 changes: 6 additions & 1 deletion release/ray_release/byod/byod.Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -15,8 +15,13 @@ sudo apt-get update -y \
&& sudo apt-get install -y --no-install-recommends $(cat requirements_debian_byod.txt) \
&& sudo apt-get autoclean

rm -rf /tmp/wrk
git clone --branch 4.2.0 https://github.com/wg/wrk.git /tmp/wrk
make -C /tmp/wrk -j
sudo cp /tmp/wrk/wrk /usr/local/bin/wrk

EOF

COPY "$PIP_REQUIREMENTS" .
RUN "$HOME"/anaconda3/bin/pip install --no-cache-dir https://ray-ci-deps-wheels.s3.us-west-2.amazonaws.com/AutoROM.accept_rom_license-0.5.4-py3-none-any.whl
RUN "$HOME"/anaconda3/bin/pip install --no-cache-dir -r requirements_byod.txt
RUN "$HOME"/anaconda3/bin/pip install --no-cache-dir -r "${PIP_REQUIREMENTS}"
2 changes: 1 addition & 1 deletion release/ray_release/byod/requirements_byod.in
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Python requirements to run release tests from anyscale byod
# Python requirements to run release tests from anyscale byod (cpu type)
ale-py
anyscale
boto3
Expand Down
9 changes: 9 additions & 0 deletions release/ray_release/byod/requirements_ml_byod.in
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
# Python requirements to run release tests from anyscale byod (gpu type)
boto3
cmake
crc32c
numpy
pyarrow
pytest
tqdm
validators
231 changes: 231 additions & 0 deletions release/ray_release/byod/requirements_ml_byod.txt

Large diffs are not rendered by default.

18 changes: 13 additions & 5 deletions release/ray_release/cluster_manager/cluster_manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -62,11 +62,19 @@ def set_cluster_env(self, cluster_env: Dict[str, Any]):
"RAY_USAGE_STATS_EXTRA_TAGS"
] = f"test_name={self.test.get_name()};smoke_test={self.smoke_test}"

self.cluster_env_name = (
f"{self.project_name}_{self.project_id[4:8]}"
f"__env__{self.test.get_name().replace('.', '_')}__"
f"{dict_hash(self.cluster_env)}"
)
if self.test.is_byod_cluster():
self.cluster_env_name = (
self.test.get_anyscale_byod_image()
.replace("/", "_")
.replace(":", "_")
.replace(".", "_")
)
else:
self.cluster_env_name = (
f"{self.project_name}_{self.project_id[4:8]}"
f"__env__{self.test.get_name().replace('.', '_')}__"
f"{dict_hash(self.cluster_env)}"
)

def set_cluster_compute(
self,
Expand Down
4 changes: 2 additions & 2 deletions release/ray_release/tests/test_cluster_manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -131,7 +131,7 @@ def testSetClusterEnv(self):
sdk.returns["get_project"] = APIDict(result=APIDict(name="release_unit_tests"))
sdk.returns["get_cloud"] = APIDict(result=APIDict(provider="AWS"))
cluster_manager = self.cls(
test=Test({"name": "test"}),
test=Test({"name": "test", "cluster": {}}),
project_id=UNIT_TEST_PROJECT_ID,
smoke_test=False,
sdk=sdk,
Expand All @@ -142,7 +142,7 @@ def testSetClusterEnv(self):
"test_name=test;smoke_test=False",
)
cluster_manager = self.cls(
test=Test({"name": "Test"}),
test=Test({"name": "Test", "cluster": {}}),
project_id=UNIT_TEST_PROJECT_ID,
smoke_test=True,
sdk=sdk,
Expand Down
73 changes: 58 additions & 15 deletions release/release_tests.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -2573,7 +2573,11 @@

frequency: weekly
team: serve
python: "3.8"
cluster:
byod:
runtime_env:
- RLLIB_TEST_NO_JAX_IMPORT=1
cluster_env: app_config.yaml
cluster_compute: tpl_cpu_1.yaml

Expand Down Expand Up @@ -2611,7 +2615,11 @@

frequency: weekly
team: serve
python: "3.8"
cluster:
byod:
runtime_env:
- RLLIB_TEST_NO_JAX_IMPORT=1
cluster_env: app_config.yaml
cluster_compute: tpl_cpu_1_c5.yaml

Expand Down Expand Up @@ -2649,7 +2657,11 @@

frequency: weekly
team: serve
python: "3.8"
cluster:
byod:
runtime_env:
- RLLIB_TEST_NO_JAX_IMPORT=1
cluster_env: app_config.yaml
cluster_compute: tpl_cpu_1.yaml

Expand Down Expand Up @@ -2724,8 +2736,10 @@

frequency: nightly
team: serve

python: "3.8"
cluster:
byod:
type: gpu
cluster_env: app_config.yaml
cluster_compute: compute_tpl_4_xlarge.yaml

Expand Down Expand Up @@ -2753,8 +2767,10 @@

frequency: nightly
team: serve

python: "3.8"
cluster:
byod:
type: gpu
cluster_env: app_config.yaml
cluster_compute: compute_tpl_4_xlarge.yaml

Expand All @@ -2781,7 +2797,10 @@
team: serve
frequency: nightly
working_dir: jobs_tests
python: "3.8"
cluster:
byod:
type: gpu
cluster_env: app_config.yaml
cluster_compute: compute_tpl_4_xlarge.yaml
run:
Expand All @@ -2802,9 +2821,12 @@
- name: jobs_check_cuda_available
group: Jobs tests
team: serve
python: "3.8"
frequency: nightly
working_dir: jobs_tests
cluster:
byod:
type: gpu
cluster_env: app_config.yaml
cluster_compute: compute_tpl_gpu_node.yaml
run:
Expand All @@ -2825,9 +2847,12 @@
- name: jobs_specify_num_gpus
group: Jobs tests
team: serve
python: "3.8"
frequency: nightly
working_dir: jobs_tests
cluster:
byod:
type: gpu
cluster_env: app_config.yaml
cluster_compute: compute_tpl_gpu_worker.yaml
run:
Expand All @@ -2854,8 +2879,10 @@

frequency: nightly
team: serve
python: "3.8"

cluster:
byod: {}
cluster_env: app_config.yaml
cluster_compute: rte_small.yaml

Expand Down Expand Up @@ -2883,8 +2910,9 @@

frequency: nightly
team: serve

python: "3.8"
cluster:
byod: {}
cluster_env: app_config.yaml
cluster_compute: rte_minimal.yaml

Expand Down Expand Up @@ -2938,8 +2966,9 @@

frequency: nightly
team: serve

python: "3.8"
cluster:
byod: {}
cluster_env: app_config.yaml
cluster_compute: compute_tpl_32_cpu.yaml

Expand All @@ -2965,8 +2994,9 @@

frequency: nightly
team: serve

python: "3.8"
cluster:
byod: {}
cluster_env: app_config.yaml
cluster_compute: compute_tpl_32_cpu.yaml

Expand All @@ -2992,8 +3022,9 @@

frequency: nightly
team: serve

python: "3.8"
cluster:
byod: {}
cluster_env: app_config.yaml
cluster_compute: compute_tpl_8_cpu_autoscaling.yaml

Expand All @@ -3019,8 +3050,9 @@

frequency: nightly
team: serve

python: "3.8"
cluster:
byod: {}
cluster_env: app_config.yaml
cluster_compute: compute_tpl_32_cpu_autoscaling.yaml

Expand All @@ -3046,8 +3078,9 @@

frequency: nightly
team: serve

python: "3.8"
cluster:
byod: {}
cluster_env: app_config.yaml
cluster_compute: compute_tpl_single_node.yaml

Expand Down Expand Up @@ -3093,8 +3126,9 @@

frequency: nightly
team: serve

python: "3.8"
cluster:
byod: {}
cluster_env: app_config.yaml
cluster_compute: compute_tpl_single_node_32_cpu.yaml

Expand All @@ -3121,8 +3155,9 @@

frequency: nightly
team: serve

python: "3.8"
cluster:
byod: {}
cluster_env: app_config.yaml
cluster_compute: compute_tpl_single_node_32_cpu.yaml

Expand All @@ -3149,8 +3184,9 @@

frequency: nightly
team: serve

python: "3.8"
cluster:
byod: {}
cluster_env: app_config.yaml
cluster_compute: compute_tpl_single_node_32_cpu.yaml

Expand All @@ -3177,8 +3213,9 @@

frequency: nightly
team: serve

python: "3.8"
cluster:
byod: {}
cluster_env: app_config.yaml
cluster_compute: compute_tpl_single_node_32_cpu.yaml

Expand Down Expand Up @@ -3206,8 +3243,9 @@

frequency: nightly
team: serve

python: "3.8"
cluster:
byod: {}
cluster_env: app_config.yaml
cluster_compute: compute_tpl_single_node.yaml

Expand All @@ -3233,8 +3271,9 @@

frequency: nightly
team: serve

python: "3.8"
cluster:
byod: {}
cluster_env: app_config.yaml
cluster_compute: compute_tpl_single_node.yaml

Expand All @@ -3261,8 +3300,10 @@

frequency: nightly
team: serve

python: "3.8"
cluster:
byod:
type: gpu
cluster_env: gpu_app_config.yaml
cluster_compute: compute_tpl_gpu_node.yaml

Expand Down Expand Up @@ -5977,6 +6018,8 @@
byod:
runtime_env:
- RAY_worker_killing_policy=retriable_lifo
pip:
- ray[default]
cluster_env: shuffle/shuffle_app_config.yaml
cluster_compute: shuffle/datasets_large_scale_compute_small_instances.yaml

Expand Down

0 comments on commit 8beb15e

Please sign in to comment.