forked from ray-project/ray
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
[AIR] Add a sanity checking release test for Alpa and ray nightly. (ray-project#32995)
Signed-off-by: Jun Gong <jungong@anyscale.com>
- Loading branch information
1 parent
fbe79eb
commit 45e6496
Showing
5 changed files
with
617 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,22 @@ | ||
cloud_id: {{env["ANYSCALE_CLOUD_ID"]}} | ||
region: us-west-2 | ||
|
||
max_workers: 2 | ||
|
||
head_node_type: | ||
name: head_node | ||
instance_type: g4dn.12xlarge | ||
|
||
worker_node_types: | ||
- name: worker_node | ||
instance_type: g4dn.12xlarge | ||
min_workers: 1 | ||
max_workers: 1 | ||
use_spot: false | ||
|
||
aws: | ||
BlockDeviceMappings: | ||
- DeviceName: /dev/sda1 | ||
Ebs: | ||
DeleteOnTermination: true | ||
VolumeSize: 500 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,34 @@ | ||
base_image: {{ env["RAY_IMAGE_ML_NIGHTLY_GPU"] | default("anyscale/ray-ml:nightly-py37-gpu") }} | ||
env_vars: {} | ||
debian_packages: | ||
- curl | ||
|
||
python: | ||
pip_packages: | ||
- pytest | ||
- awscli | ||
- cupy-cuda113 | ||
- numpy==1.21.0 | ||
- protobuf==3.20.0 | ||
conda_packages: [] | ||
|
||
post_build_cmds: | ||
# Upgrade pip before installing any packages. | ||
- pip3 install --upgrade pip | ||
# Install Alpa from source for now. | ||
# TODO(jungong): pip install alpa after the next release. | ||
- git clone https://github.com/alpa-projects/alpa.git | ||
- pip3 install -e alpa | ||
# Install custom built jaxlib. | ||
- pip install jaxlib==0.3.22+cuda113.cudnn820 -f https://alpa-projects.github.io/wheels.html | ||
# Install nvidia dependencies. | ||
- pip3 install --no-cache-dir nvidia-pyindex | ||
- pip3 install --no-cache-dir nvidia-tensorrt==7.2.3.4 | ||
# Huggingface transformers. | ||
- pip3 install -U transformers | ||
# Install testing wheel after Alpa dependencies, since Alpa's setup.py requires | ||
# Ray 2.1.0 right now, and would have overridden the installed version if this | ||
# order is reversed. | ||
- pip3 uninstall ray -y || true && pip3 install -U {{ env["RAY_WHEELS"] | default("ray") }} | ||
# Sanity check. | ||
- {{ env["RAY_WHEELS_SANITY_CHECK"] | default("echo No Ray wheels sanity check") }} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,26 @@ | ||
#!/bin/bash

# Integration test for Alpa and Ray.
#
# Downloads the OPT-2.7B weights and tokenizer from S3 into a local
# directory, then runs a short training job (100 training samples) with
# train_opt_2_7b_minimum.py, which is looked up relative to the current
# working directory.

# Trace each command, exit as soon as any test command fails, and make a
# pipeline fail when any of its stages fails. Note that pipefail has to be
# passed through `-o`; a bare word after the flags would only be assigned
# to the positional parameters.
set -x -e -o pipefail

TRAIN_FILE=https://air-example-data-2.s3.us-west-2.amazonaws.com/alpa/alllines.txt
S3_MODEL_DIR=s3://air-example-data-2/alpa/opt/models/models--facebook--opt-2.7b/
LOCAL_MODEL_DIR=/tmp/opt-2.7b/
OUTPUT_DIR=/tmp/alpa_outputs/

mkdir -p "$LOCAL_MODEL_DIR"
mkdir -p "$OUTPUT_DIR"

# Download weights and tokenizer.
aws s3 sync "$S3_MODEL_DIR" "$LOCAL_MODEL_DIR"

# Run training.
python train_opt_2_7b_minimum.py \
  --operator_parallel 1 \
  --pipeline_parallel 4 \
  --model_name_or_path "$LOCAL_MODEL_DIR" \
  --output_dir "$OUTPUT_DIR" \
  --train_file "$TRAIN_FILE" \
  --max_train_samples 100
Oops, something went wrong.