Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[ci][byod] byod the rest non-rllib of release tests #36737

Merged
merged 3 commits into from
Jun 28, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
26 changes: 26 additions & 0 deletions release/ray_release/byod/byod_alpa_test.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
#!/bin/bash
# This script is used to build an extra layer on top of the base anyscale/ray image
# to run the Alpa tests.
#
# TODO(can): once the library versions can be resolved cleanly with other dependencies,
# they should be managed through requirements_byod.in file

# -e: abort on the first failing command; -x: echo each command as it runs;
# -o pipefail: a pipeline fails if any stage in it fails.
set -exo pipefail

# Pinned versions of the base numeric / serialization packages.
pip3 install cupy-cuda113 numpy==1.21.0 protobuf==3.20.0

pip3 install --upgrade pip
# Install Alpa from source for now.
# TODO(jungong) : pip install alpa after next release.
git clone https://github.com/alpa-projects/alpa.git
pip3 install -e alpa
pip3 install -e alpa/examples
# Install custom built jaxlib.
# -f adds the Alpa wheel index as an extra place to look for this local-version build.
pip3 install jaxlib==0.3.22+cuda113.cudnn820 -f https://alpa-projects.github.io/wheels.html
# Install nvidia dependencies.
pip3 install --no-cache-dir nvidia-pyindex
pip3 install --no-cache-dir nvidia-tensorrt==7.2.3.4
# Huggingface transformers.
# TODO(jungong) : bring llm_serving up to date with latest transformers library.
pip3 install -U transformers==4.23.1
pip3 install -U accelerate
can-anyscale marked this conversation as resolved.
Show resolved Hide resolved
9 changes: 9 additions & 0 deletions release/ray_release/byod/byod_dask_test.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
#!/bin/bash
# This script is used to build an extra layer on top of the base anyscale/ray image
# to run the dask tests.

# -e: abort on the first failing command; -x: echo each command as it runs;
# -o pipefail: a pipeline fails if any stage in it fails.
set -exo pipefail

# The requirement is quoted so the shell passes "dask[complete]" to pip verbatim.
# Unquoted, [complete] is a glob character class and could expand to a matching
# file name in the working directory.
pip3 install 'dask[complete]'
pip3 install boto3 s3fs
54 changes: 52 additions & 2 deletions release/release_tests.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -496,7 +496,10 @@

frequency: nightly
team: ml
python: "3.8"
cluster:
byod:
type: gpu
cluster_env: app_config.yaml
cluster_compute: compute_gpu_4x4_aws.yaml

Expand Down Expand Up @@ -697,9 +700,12 @@

frequency: nightly
team: ml
python: "3.8"
stable: false

cluster:
byod:
type: gpu
cluster_env: app_config.yaml
cluster_compute: compute_gpu_4x4_aws.yaml

Expand Down Expand Up @@ -835,7 +841,10 @@

frequency: weekly
team: ml
python: "3.8"
cluster:
byod:
type: gpu
cluster_env: dreambooth_env.yaml
cluster_compute: dreambooth_compute_aws.yaml

Expand Down Expand Up @@ -888,6 +897,12 @@
frequency: weekly
team: ml
cluster:
byod:
type: cu118
pip:
- myst-parser==0.15.2
- myst-nb==0.13.1
- jupytext==1.13.6
cluster_env: dolly_v2_fsdp_env.yaml
cluster_compute: dolly_v2_fsdp_compute_aws.yaml

Expand Down Expand Up @@ -3531,7 +3546,11 @@

frequency: nightly
team: ml
python: "3.8"
cluster:
byod:
type: gpu
post_build_script: byod_alpa_test.sh
cluster_env: app_config.yaml
cluster_compute: gpu_2x4_t4_aws.yaml

Expand Down Expand Up @@ -3565,7 +3584,11 @@

frequency: nightly
team: ml
python: "3.8"
cluster:
byod:
type: gpu
post_build_script: byod_alpa_test.sh
cluster_env: app_config.yaml
cluster_compute: gpu_1x8_v100_aws.yaml

Expand Down Expand Up @@ -4954,7 +4977,12 @@

frequency: nightly
team: core
python: "3.8"
cluster:
byod:
runtime_env:
- RAY_worker_killing_policy=retriable_lifo
post_build_script: byod_dask_test.sh
cluster_env: dask_on_ray/dask_on_ray_app_config.yaml
cluster_compute: dask_on_ray/dask_on_ray_sort_compute_template.yaml

Expand All @@ -4981,6 +5009,10 @@
team: data
python: "3.8"
cluster:
byod:
runtime_env:
- RAY_worker_killing_policy=retriable_lifo
post_build_script: byod_dask_test.sh
cluster_env: dask_on_ray/large_scale_dask_on_ray_app_config.yaml
cluster_compute: dask_on_ray/dask_on_ray_stress_compute.yaml

Expand Down Expand Up @@ -5269,6 +5301,10 @@
team: core
python: "3.8"
cluster:
byod:
runtime_env:
- RAY_memory_usage_threshold=0.7
- RAY_task_oom_retries=-1
cluster_env: oom/stress_tests_tune_air_oom_app_config.yaml
cluster_compute: oom/stress_tests_tune_air_oom_compute.yaml

Expand All @@ -5283,7 +5319,12 @@

frequency: nightly-3x
team: core
python: "3.8"
cluster:
byod:
runtime_env:
- RAY_worker_killing_policy=retriable_lifo
post_build_script: byod_dask_test.sh
cluster_env: dask_on_ray/dask_on_ray_app_config.yaml
cluster_compute: dask_on_ray/1tb_sort_compute.yaml

Expand Down Expand Up @@ -6428,7 +6469,12 @@

frequency: nightly
team: data
python: "3.8"
cluster:
byod:
runtime_env:
- RAY_lineage_pinning_enabled=1
post_build_script: byod_dask_test.sh
cluster_env: chaos_test/dask_on_ray_app_config_reconstruction.yaml
cluster_compute: dask_on_ray/chaos_dask_on_ray_stress_compute.yaml

Expand All @@ -6455,7 +6501,12 @@

frequency: nightly
team: data
python: "3.8"
cluster:
byod:
runtime_env:
- RAY_lineage_pinning_enabled=1
post_build_script: byod_dask_test.sh
cluster_env: chaos_test/dask_on_ray_app_config_reconstruction.yaml
cluster_compute: dask_on_ray/dask_on_ray_stress_compute.yaml

Expand Down Expand Up @@ -6699,11 +6750,10 @@
group: cluster-launcher-test
working_dir: ../python/ray/autoscaler/

stable: true

frequency: nightly
team: core
cluster:
byod: {}
cluster_env: aws/tests/aws_config.yaml
cluster_compute: aws/tests/aws_compute.yaml

Expand Down