Skip to content

Commit

Permalink
[AIR] Add a sanity checking release test for Alpa and ray nightly. (r…
Browse files Browse the repository at this point in the history
…ay-project#32995)

Signed-off-by: Jun Gong <jungong@anyscale.com>
  • Loading branch information
Jun Gong authored and peytondmurray committed Mar 22, 2023
1 parent fbe79eb commit 45e6496
Show file tree
Hide file tree
Showing 5 changed files with 617 additions and 0 deletions.
22 changes: 22 additions & 0 deletions release/alpa_tests/2_g4dn_12xlarge.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
# Compute config for the Alpa release test: one g4dn.12xlarge head node plus
# exactly one g4dn.12xlarge worker node.
#
# NOTE(review): the {{ ... }} expression below is a template placeholder that
# reads ANYSCALE_CLOUD_ID from the environment; presumably the file is rendered
# before it is parsed as YAML -- confirm against the release tooling.
cloud_id: {{env["ANYSCALE_CLOUD_ID"]}}
region: us-west-2

max_workers: 2

head_node_type:
  name: head_node
  instance_type: g4dn.12xlarge

worker_node_types:
  - name: worker_node
    instance_type: g4dn.12xlarge
    # min_workers == max_workers pins this node type to exactly one worker.
    min_workers: 1
    max_workers: 1
    use_spot: false

aws:
  BlockDeviceMappings:
    - DeviceName: /dev/sda1
      Ebs:
        DeleteOnTermination: true
        # Size of the /dev/sda1 volume (GiB per the EC2 API). The test script
        # downloads model weights to local disk.
        VolumeSize: 500
34 changes: 34 additions & 0 deletions release/alpa_tests/app_config.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
# Cluster environment for the Alpa release test.
#
# NOTE(review): the {{ ... }} expressions are template placeholders that read
# environment variables and fall back to the given default(); presumably the
# file is rendered before it is parsed as YAML -- confirm against the release
# tooling.
base_image: {{ env["RAY_IMAGE_ML_NIGHTLY_GPU"] | default("anyscale/ray-ml:nightly-py37-gpu") }}
env_vars: {}
debian_packages:
  - curl

python:
  pip_packages:
    - pytest
    - awscli
    - cupy-cuda113
    - numpy==1.21.0
    - protobuf==3.20.0
  conda_packages: []

# Commands listed in order; the ordering matters (see the note on the Ray
# wheel install near the end).
post_build_cmds:
  # Upgrade pip itself before installing anything else.
  - pip3 install --upgrade pip
  # Install Alpa from source for now.
  # TODO(jungong) : pip install alpa after next release.
  - git clone https://github.com/alpa-projects/alpa.git
  - pip3 install -e alpa
  # Install custom built jaxlib.
  - pip install jaxlib==0.3.22+cuda113.cudnn820 -f https://alpa-projects.github.io/wheels.html
  # Install nvidia dependencies.
  - pip3 install --no-cache-dir nvidia-pyindex
  - pip3 install --no-cache-dir nvidia-tensorrt==7.2.3.4
  # Huggingface transformers.
  - pip3 install -U transformers
  # Install testing wheel after Alpa dependencies, since Alpa's setup.py requires
  # Ray 2.1.0 right now, and would have overridden the installed version if this
  # order is reversed.
  # "|| true" keeps the step going when the uninstall command fails.
  - pip3 uninstall ray -y || true && pip3 install -U {{ env["RAY_WHEELS"] | default("ray") }}
  # Sanity check.
  - {{ env["RAY_WHEELS_SANITY_CHECK"] | default("echo No Ray wheels sanity check") }}
26 changes: 26 additions & 0 deletions release/alpa_tests/run.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
#!/bin/bash

# Integration test for Alpa and Ray.
#
# Syncs the OPT-2.7B model directory from S3 to local disk, then runs
# train_opt_2_7b_minimum.py against it on a capped number of training samples.

# Trace every command (-x), exit if any of the test commands fail (-e), and
# make a pipeline fail when any stage in it fails (-o pipefail).
# A bare word after the flags (e.g. "set -x -e pipeline") would not enable
# pipefail; it would only overwrite positional parameter $1.
set -x -e -o pipefail

TRAIN_FILE=https://air-example-data-2.s3.us-west-2.amazonaws.com/alpa/alllines.txt
S3_MODEL_DIR=s3://air-example-data-2/alpa/opt/models/models--facebook--opt-2.7b/
LOCAL_MODEL_DIR=/tmp/opt-2.7b/
OUTPUT_DIR=/tmp/alpa_outputs/

mkdir -p "$LOCAL_MODEL_DIR"
mkdir -p "$OUTPUT_DIR"

# Download weights and tokenizer.
aws s3 sync "$S3_MODEL_DIR" "$LOCAL_MODEL_DIR"

# Run training.
python train_opt_2_7b_minimum.py \
    --operator_parallel 1 \
    --pipeline_parallel 4 \
    --model_name_or_path "$LOCAL_MODEL_DIR" \
    --output_dir "$OUTPUT_DIR" \
    --train_file "$TRAIN_FILE" \
    --max_train_samples 100
Loading

0 comments on commit 45e6496

Please sign in to comment.