Skip to content

Commit

Permalink
[ci][byod] byod the rest non-rllib of release tests (ray-project#36737)
Browse files Browse the repository at this point in the history
Signed-off-by: e428265 <arvind.chandramouli@lmco.com>
  • Loading branch information
can-anyscale authored and arvind-chandra committed Aug 31, 2023
1 parent f918b64 commit ca3ac1b
Show file tree
Hide file tree
Showing 3 changed files with 87 additions and 2 deletions.
26 changes: 26 additions & 0 deletions release/ray_release/byod/byod_alpa_test.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
#!/bin/bash
# This script is used to build an extra layer on top of the base anyscale/ray image
# to run the alpa tests
#
# TODO(can): once the library versions can be resolved cleanly with other dependencies,
# they should be managed through requirements_byod.in file

# -e: stop at the first failing command; -x: echo each command as it runs;
# -o pipefail: a pipeline fails when any of its stages fails.
set -exo pipefail

# Every install below goes through pip3 so that all packages land in the same
# interpreter, even on an image where `pip` and `pip3` resolve differently.
pip3 install cupy-cuda113 numpy==1.21.0 protobuf==3.20.0

pip3 install --upgrade pip
# Install Alpa from source for now.
# TODO(jungong) : pip install alpa after next release.
git clone https://github.com/alpa-projects/alpa.git
pip3 install -e alpa
pip3 install -e alpa/examples
# Install custom built jaxlib.
# The wheel is not looked up on PyPI alone: -f adds the Alpa wheel index as an
# extra place to find this exact build.
pip3 install jaxlib==0.3.22+cuda113.cudnn820 -f https://alpa-projects.github.io/wheels.html
# Install nvidia dependencies.
# nvidia-pyindex is installed first, as a separate step, before nvidia-tensorrt.
pip3 install --no-cache-dir nvidia-pyindex
pip3 install --no-cache-dir nvidia-tensorrt==7.2.3.4
# Huggingface transformers.
# TODO(jungong) : bring llm_serving up to date with latest transformers library.
pip3 install -U transformers==4.23.1
pip3 install -U accelerate
9 changes: 9 additions & 0 deletions release/ray_release/byod/byod_dask_test.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
#!/bin/bash
# This script is used to build an extra layer on top of the base anyscale/ray image
# to run the dask tests

# -e: stop at the first failing command; -x: echo each command as it runs;
# -o pipefail: a pipeline fails when any of its stages fails.
set -exo pipefail

# The requirement is quoted because an unquoted [complete] is a shell glob
# character class: it could be expanded against file names in the working
# directory before pip ever sees it. Quoting also removes the need for the
# ShellCheck SC2102 suppression.
pip3 install "dask[complete]"
pip3 install boto3 s3fs
54 changes: 52 additions & 2 deletions release/release_tests.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -496,7 +496,10 @@

frequency: nightly
team: ml
python: "3.8"
cluster:
byod:
type: gpu
cluster_env: app_config.yaml
cluster_compute: compute_gpu_4x4_aws.yaml

Expand Down Expand Up @@ -697,9 +700,12 @@

frequency: nightly
team: ml
python: "3.8"
stable: false

cluster:
byod:
type: gpu
cluster_env: app_config.yaml
cluster_compute: compute_gpu_4x4_aws.yaml

Expand Down Expand Up @@ -835,7 +841,10 @@

frequency: weekly
team: ml
python: "3.8"
cluster:
byod:
type: gpu
cluster_env: dreambooth_env.yaml
cluster_compute: dreambooth_compute_aws.yaml

Expand Down Expand Up @@ -888,6 +897,12 @@
frequency: weekly
team: ml
cluster:
byod:
type: cu118
pip:
- myst-parser==0.15.2
- myst-nb==0.13.1
- jupytext==1.13.6
cluster_env: dolly_v2_fsdp_env.yaml
cluster_compute: dolly_v2_fsdp_compute_aws.yaml

Expand Down Expand Up @@ -3531,7 +3546,11 @@

frequency: nightly
team: ml
python: "3.8"
cluster:
byod:
type: gpu
post_build_script: byod_alpa_test.sh
cluster_env: app_config.yaml
cluster_compute: gpu_2x4_t4_aws.yaml

Expand Down Expand Up @@ -3565,7 +3584,11 @@

frequency: nightly
team: ml
python: "3.8"
cluster:
byod:
type: gpu
post_build_script: byod_alpa_test.sh
cluster_env: app_config.yaml
cluster_compute: gpu_1x8_v100_aws.yaml

Expand Down Expand Up @@ -4954,7 +4977,12 @@

frequency: nightly
team: core
python: "3.8"
cluster:
byod:
runtime_env:
- RAY_worker_killing_policy=retriable_lifo
post_build_script: byod_dask_test.sh
cluster_env: dask_on_ray/dask_on_ray_app_config.yaml
cluster_compute: dask_on_ray/dask_on_ray_sort_compute_template.yaml

Expand All @@ -4981,6 +5009,10 @@
team: data
python: "3.8"
cluster:
byod:
runtime_env:
- RAY_worker_killing_policy=retriable_lifo
post_build_script: byod_dask_test.sh
cluster_env: dask_on_ray/large_scale_dask_on_ray_app_config.yaml
cluster_compute: dask_on_ray/dask_on_ray_stress_compute.yaml

Expand Down Expand Up @@ -5269,6 +5301,10 @@
team: core
python: "3.8"
cluster:
byod:
runtime_env:
- RAY_memory_usage_threshold=0.7
- RAY_task_oom_retries=-1
cluster_env: oom/stress_tests_tune_air_oom_app_config.yaml
cluster_compute: oom/stress_tests_tune_air_oom_compute.yaml

Expand All @@ -5283,7 +5319,12 @@

frequency: nightly-3x
team: core
python: "3.8"
cluster:
byod:
runtime_env:
- RAY_worker_killing_policy=retriable_lifo
post_build_script: byod_dask_test.sh
cluster_env: dask_on_ray/dask_on_ray_app_config.yaml
cluster_compute: dask_on_ray/1tb_sort_compute.yaml

Expand Down Expand Up @@ -6428,7 +6469,12 @@

frequency: nightly
team: data
python: "3.8"
cluster:
byod:
runtime_env:
- RAY_lineage_pinning_enabled=1
post_build_script: byod_dask_test.sh
cluster_env: chaos_test/dask_on_ray_app_config_reconstruction.yaml
cluster_compute: dask_on_ray/chaos_dask_on_ray_stress_compute.yaml

Expand All @@ -6455,7 +6501,12 @@

frequency: nightly
team: data
python: "3.8"
cluster:
byod:
runtime_env:
- RAY_lineage_pinning_enabled=1
post_build_script: byod_dask_test.sh
cluster_env: chaos_test/dask_on_ray_app_config_reconstruction.yaml
cluster_compute: dask_on_ray/dask_on_ray_stress_compute.yaml

Expand Down Expand Up @@ -6699,11 +6750,10 @@
group: cluster-launcher-test
working_dir: ../python/ray/autoscaler/

stable: true

frequency: nightly
team: core
cluster:
byod: {}
cluster_env: aws/tests/aws_config.yaml
cluster_compute: aws/tests/aws_compute.yaml

Expand Down

0 comments on commit ca3ac1b

Please sign in to comment.