[AIR] Add a sanity checking release test for Alpa and ray nightly. (ray-project#32995)

Signed-off-by: Jun Gong <jungong@anyscale.com>
peytondmurray · Mar 22, 2023 · 45e6496 · 45e6496
1 parent fbe79eb
commit 45e6496
Show file tree

Hide file tree

Showing 5 changed files with 617 additions and 0 deletions.
diff --git a/release/alpa_tests/2_g4dn_12xlarge.yaml b/release/alpa_tests/2_g4dn_12xlarge.yaml
@@ -0,0 +1,22 @@
+cloud_id: {{env["ANYSCALE_CLOUD_ID"]}}
+region: us-west-2
+
+max_workers: 2
+
+head_node_type:
+  name: head_node
+  instance_type: g4dn.12xlarge
+
+worker_node_types:
+  - name: worker_node
+    instance_type: g4dn.12xlarge  # same instance type as the head node
+    min_workers: 1
+    max_workers: 1  # equal to min_workers; presumably fixes this group at one node
+    use_spot: false
+
+aws:
+  BlockDeviceMappings:
+    - DeviceName: /dev/sda1
+      Ebs:
+        DeleteOnTermination: true
+        VolumeSize: 500  # presumably GiB (EC2 block device mapping) - confirm
diff --git a/release/alpa_tests/app_config.yaml b/release/alpa_tests/app_config.yaml
@@ -0,0 +1,34 @@
+base_image: {{ env["RAY_IMAGE_ML_NIGHTLY_GPU"] | default("anyscale/ray-ml:nightly-py37-gpu") }}
+env_vars: {}
+debian_packages:
+  - curl
+
+python:
+  pip_packages:
+    - pytest
+    - awscli
+    - cupy-cuda113
+    - numpy==1.21.0
+    - protobuf==3.20.0
+  conda_packages: []
+
+post_build_cmds:
+  # Upgrade pip itself before running any of the installs below.
+  - pip3 install --upgrade pip
+  # Install Alpa from source for now.
+  # TODO(jungong) : pip install alpa after next release.
+  - git clone https://github.com/alpa-projects/alpa.git
+  - pip3 install -e alpa
+  # Install custom built jaxlib (via pip3, like every other install step here).
+  - pip3 install jaxlib==0.3.22+cuda113.cudnn820 -f https://alpa-projects.github.io/wheels.html
+  # Install nvidia dependencies.
+  - pip3 install --no-cache-dir nvidia-pyindex
+  - pip3 install --no-cache-dir nvidia-tensorrt==7.2.3.4
+  # Huggingface transformers.
+  - pip3 install -U transformers
+  # Install testing wheel after Alpa dependencies, since Alpa's setup.py requires
+  # Ray 2.1.0 right now, and would have overridden the installed version if this
+  # order is reversed.
+  - pip3 uninstall ray -y || true && pip3 install -U {{ env["RAY_WHEELS"] | default("ray") }}
+  # Sanity check.
+  - {{ env["RAY_WHEELS_SANITY_CHECK"] | default("echo No Ray wheels sanity check") }}
diff --git a/release/alpa_tests/run.sh b/release/alpa_tests/run.sh
@@ -0,0 +1,26 @@
+#!/bin/bash
+
+# Integration test for Alpa and Ray.
+
+# Exit if any of the test commands fail.
+set -x -e pipeline
+
+TRAIN_FILE=https://air-example-data-2.s3.us-west-2.amazonaws.com/alpa/alllines.txt
+S3_MODEL_DIR=s3://air-example-data-2/alpa/opt/models/models--facebook--opt-2.7b/
+LOCAL_MODEL_DIR=/tmp/opt-2.7b/
+OUTPUT_DIR=/tmp/alpa_outputs/
+
+mkdir -p $LOCAL_MODEL_DIR
+mkdir -p $OUTPUT_DIR
+
+# Download weights and tokenizer.
+aws s3 sync $S3_MODEL_DIR $LOCAL_MODEL_DIR
+
+# Run training.
+python train_opt_2_7b_minimum.py \
+    --operator_parallel 1 \
+    --pipeline_parallel 4 \
+    --model_name_or_path $LOCAL_MODEL_DIR \
+    --output_dir $OUTPUT_DIR \
+    --train_file $TRAIN_FILE \
+    --max_train_samples 100