diff --git a/release/ray_release/byod/byod_alpa_test.sh b/release/ray_release/byod/byod_alpa_test.sh
new file mode 100755
index 000000000000..d992030a7cf7
--- /dev/null
+++ b/release/ray_release/byod/byod_alpa_test.sh
@@ -0,0 +1,26 @@
+#!/bin/bash
+# This script is used to build an extra layer on top of the base anyscale/ray image
+# to run the alpa tests
+#
+# TODO(can): once the library versions can be resolved cleanly with other dependencies,
+# they should be managed through requirements_byod.in file
+
+set -exo pipefail
+
+pip3 install cupy-cuda113 numpy==1.21.0 protobuf==3.20.0
+
+pip3 install --upgrade pip
+# Install Alpa from source for now.
+# TODO(jungong) : pip install alpa after next release.
+git clone https://github.com/alpa-projects/alpa.git
+pip3 install -e alpa
+pip3 install -e alpa/examples
+# Install custom built jaxlib (pip3, like every other step, so it lands in the same environment).
+pip3 install jaxlib==0.3.22+cuda113.cudnn820 -f https://alpa-projects.github.io/wheels.html
+# Install nvidia dependencies.
+pip3 install --no-cache-dir nvidia-pyindex
+pip3 install --no-cache-dir nvidia-tensorrt==7.2.3.4
+# Huggingface transformers.
+# TODO(jungong) : bring llm_serving up to date with latest transformers library.
+pip3 install -U transformers==4.23.1
+pip3 install -U accelerate
diff --git a/release/ray_release/byod/byod_dask_test.sh b/release/ray_release/byod/byod_dask_test.sh
new file mode 100755
index 000000000000..1a95d34c0637
--- /dev/null
+++ b/release/ray_release/byod/byod_dask_test.sh
@@ -0,0 +1,9 @@
+#!/bin/bash
+# This script is used to build an extra layer on top of the base anyscale/ray image
+# to run the dask on ray tests
+
+set -exo pipefail
+
+# Quote the requirement so the shell never treats the [complete] extra as a glob pattern.
+pip3 install "dask[complete]"
+pip3 install boto3 s3fs
diff --git a/release/release_tests.yaml b/release/release_tests.yaml index 5037d4f1f941..85dc284d3c1b 100644 --- a/release/release_tests.yaml +++ b/release/release_tests.yaml @@ -496,7 +496,10 @@ frequency: nightly team: ml + python: "3.8" cluster: + byod: + type: gpu cluster_env: app_config.yaml cluster_compute: compute_gpu_4x4_aws.yaml @@ -697,9 +700,12 @@ frequency: nightly team: ml + python: "3.8" stable: false cluster: + byod: + type: gpu cluster_env: app_config.yaml cluster_compute: compute_gpu_4x4_aws.yaml @@ -835,7 +841,10 @@ frequency: weekly team: ml + python: "3.8" cluster: + byod: + type: gpu cluster_env: dreambooth_env.yaml cluster_compute: dreambooth_compute_aws.yaml @@ -888,6 +897,12 @@ frequency: weekly team: ml cluster: + byod: + type: cu118 + pip: + - myst-parser==0.15.2 + - myst-nb==0.13.1 + - jupytext==1.13.6 cluster_env: dolly_v2_fsdp_env.yaml cluster_compute: dolly_v2_fsdp_compute_aws.yaml @@ -3531,7 +3546,11 @@ frequency: nightly team: ml + python: "3.8" cluster: + byod: + type: gpu + post_build_script: byod_alpa_test.sh cluster_env: app_config.yaml cluster_compute: gpu_2x4_t4_aws.yaml @@ -3565,7 +3584,11 @@ frequency: nightly team: ml + python: "3.8" cluster: + byod: + type: gpu + post_build_script: byod_alpa_test.sh cluster_env: app_config.yaml cluster_compute: gpu_1x8_v100_aws.yaml @@ -4954,7 +4977,12 @@ frequency: nightly team: core + python: "3.8" cluster: + byod: + runtime_env: + - 
RAY_worker_killing_policy=retriable_lifo + post_build_script: byod_dask_test.sh cluster_env: dask_on_ray/dask_on_ray_app_config.yaml cluster_compute: dask_on_ray/dask_on_ray_sort_compute_template.yaml @@ -4981,6 +5009,10 @@ team: data python: "3.8" cluster: + byod: + runtime_env: + - RAY_worker_killing_policy=retriable_lifo + post_build_script: byod_dask_test.sh cluster_env: dask_on_ray/large_scale_dask_on_ray_app_config.yaml cluster_compute: dask_on_ray/dask_on_ray_stress_compute.yaml @@ -5269,6 +5301,10 @@ team: core python: "3.8" cluster: + byod: + runtime_env: + - RAY_memory_usage_threshold=0.7 + - RAY_task_oom_retries=-1 cluster_env: oom/stress_tests_tune_air_oom_app_config.yaml cluster_compute: oom/stress_tests_tune_air_oom_compute.yaml @@ -5283,7 +5319,12 @@ frequency: nightly-3x team: core + python: "3.8" cluster: + byod: + runtime_env: + - RAY_worker_killing_policy=retriable_lifo + post_build_script: byod_dask_test.sh cluster_env: dask_on_ray/dask_on_ray_app_config.yaml cluster_compute: dask_on_ray/1tb_sort_compute.yaml @@ -6428,7 +6469,12 @@ frequency: nightly team: data + python: "3.8" cluster: + byod: + runtime_env: + - RAY_lineage_pinning_enabled=1 + post_build_script: byod_dask_test.sh cluster_env: chaos_test/dask_on_ray_app_config_reconstruction.yaml cluster_compute: dask_on_ray/chaos_dask_on_ray_stress_compute.yaml @@ -6455,7 +6501,12 @@ frequency: nightly team: data + python: "3.8" cluster: + byod: + runtime_env: + - RAY_lineage_pinning_enabled=1 + post_build_script: byod_dask_test.sh cluster_env: chaos_test/dask_on_ray_app_config_reconstruction.yaml cluster_compute: dask_on_ray/dask_on_ray_stress_compute.yaml @@ -6699,11 +6750,10 @@ group: cluster-launcher-test working_dir: ../python/ray/autoscaler/ - stable: true - frequency: nightly team: core cluster: + byod: {} cluster_env: aws/tests/aws_config.yaml cluster_compute: aws/tests/aws_compute.yaml