Skip to content

Integration tests #1320

Integration tests

Integration tests #1320

Workflow file for this run

# Workflow header: display name, triggers (manual dispatch and reusable
# workflow call) and the workflow-wide environment.
name: Integration tests

on:
  # Manual trigger.
  workflow_dispatch:
    inputs:
      djl-version:
        required: false
        default: ''
        description: 'The released version of DJL'
      tag-suffix:
        type: string
        required: false
        default: 'nightly'
        description: 'Run tests on the specific tags suffix i.e. arch-{suffix}'
  # Entry point used when another workflow calls this one.
  workflow_call:
    inputs:
      djl-version:
        type: string
        required: false
        default: ''
        description: 'The released version of DJL'
      tag-suffix:
        type: string
        required: false
        default: 'nightly'
        description: 'Run tests on the specific tags suffix i.e. arch-{suffix}'
    outputs:
      # Re-exports the `images_to_push` output of the `test_results` job
      # to the calling workflow.
      images-to-push:
        value: ${{ jobs.test_results.outputs.images_to_push }}

env:
  # ECR repository; exposed to the tests as IMAGE_REPO and used as the
  # docker login registry.
  AWS_ECR_REPO: "185921645874.dkr.ecr.us-east-1.amazonaws.com/djl-ci-temp"
jobs:
# Starts the self-hosted runner instances (4x G6, 1x Graviton, 2x Inf2) from
# the scheduler host. Each step requests a fresh runner registration token
# from the GitHub API and hands it to start_instance.sh.
create-runners:
  runs-on: [self-hosted, scheduler]
  defaults:
    run:
      # Naming bash explicitly makes GitHub run every step with
      # `-eo pipefail`. Without it a failed `curl --fail` is hidden by the
      # pipeline and start_instance.sh is invoked with an empty token.
      shell: bash
  steps:
    - name: Create new G6 instance (1 of 4)
      id: create_gpu
      run: |
        cd /home/ubuntu/djl_benchmark_script/scripts
        token=$( curl -X POST -H "Authorization: token ${{ secrets.ACTION_RUNNER_PERSONAL_TOKEN }}" \
          https://api.github.com/repos/deepjavalibrary/djl-serving/actions/runners/registration-token \
          --fail \
          | jq -r '.token' )
        ./start_instance.sh action_g6 "$token" djl-serving
    - name: Create new G6 instance (2 of 4)
      id: create_gpu2
      run: |
        cd /home/ubuntu/djl_benchmark_script/scripts
        token=$( curl -X POST -H "Authorization: token ${{ secrets.ACTION_RUNNER_PERSONAL_TOKEN }}" \
          https://api.github.com/repos/deepjavalibrary/djl-serving/actions/runners/registration-token \
          --fail \
          | jq -r '.token' )
        ./start_instance.sh action_g6 "$token" djl-serving
    - name: Create new G6 instance (3 of 4)
      id: create_gpu3
      run: |
        cd /home/ubuntu/djl_benchmark_script/scripts
        token=$( curl -X POST -H "Authorization: token ${{ secrets.ACTION_RUNNER_PERSONAL_TOKEN }}" \
          https://api.github.com/repos/deepjavalibrary/djl-serving/actions/runners/registration-token \
          --fail \
          | jq -r '.token' )
        ./start_instance.sh action_g6 "$token" djl-serving
    - name: Create new G6 instance (4 of 4)
      id: create_gpu4
      run: |
        cd /home/ubuntu/djl_benchmark_script/scripts
        token=$( curl -X POST -H "Authorization: token ${{ secrets.ACTION_RUNNER_PERSONAL_TOKEN }}" \
          https://api.github.com/repos/deepjavalibrary/djl-serving/actions/runners/registration-token \
          --fail \
          | jq -r '.token' )
        ./start_instance.sh action_g6 "$token" djl-serving
    - name: Create new Graviton instance
      id: create_aarch64
      run: |
        cd /home/ubuntu/djl_benchmark_script/scripts
        token=$( curl -X POST -H "Authorization: token ${{ secrets.ACTION_RUNNER_PERSONAL_TOKEN }}" \
          https://api.github.com/repos/deepjavalibrary/djl-serving/actions/runners/registration-token \
          --fail \
          | jq -r '.token' )
        ./start_instance.sh action_graviton "$token" djl-serving
    - name: Create new Inf2.24xl instance (1 of 2)
      id: create_inf2
      run: |
        cd /home/ubuntu/djl_benchmark_script/scripts
        token=$( curl -X POST -H "Authorization: token ${{ secrets.ACTION_RUNNER_PERSONAL_TOKEN }}" \
          https://api.github.com/repos/deepjavalibrary/djl-serving/actions/runners/registration-token \
          --fail \
          | jq -r '.token' )
        ./start_instance.sh action_inf2 "$token" djl-serving
    - name: Create new Inf2.24xl instance (2 of 2)
      id: create_inf2_2
      run: |
        cd /home/ubuntu/djl_benchmark_script/scripts
        token=$( curl -X POST -H "Authorization: token ${{ secrets.ACTION_RUNNER_PERSONAL_TOKEN }}" \
          https://api.github.com/repos/deepjavalibrary/djl-serving/actions/runners/registration-token \
          --fail \
          | jq -r '.token' )
        ./start_instance.sh action_inf2 "$token" djl-serving
  # Instance ids consumed by the stop-runners job.
  # NOTE(review): the `*_instance_id` step outputs are presumably written by
  # start_instance.sh, which is not visible here - confirm.
  outputs:
    gpu_instance_id_1: ${{ steps.create_gpu.outputs.action_g6_instance_id }}
    gpu_instance_id_2: ${{ steps.create_gpu2.outputs.action_g6_instance_id }}
    gpu_instance_id_3: ${{ steps.create_gpu3.outputs.action_g6_instance_id }}
    gpu_instance_id_4: ${{ steps.create_gpu4.outputs.action_g6_instance_id }}
    aarch64_instance_id: ${{ steps.create_aarch64.outputs.action_graviton_instance_id }}
    inf2_instance_id_1: ${{ steps.create_inf2.outputs.action_inf2_instance_id }}
    inf2_instance_id_2: ${{ steps.create_inf2_2.outputs.action_inf2_instance_id }}
# Runs one pytest integration suite per matrix entry and publishes a
# per-suite pass/fail flag as a job output.
test:
  # For GitHub-hosted legs (gh-runner: true) every label collapses to the
  # instance name (e.g. ubuntu-latest). Otherwise the labels select a
  # self-hosted runner tagged with this run's id, number, commit SHA and the
  # requested instance type.
  runs-on:
    - ${{ matrix.test.gh-runner && matrix.test.instance || 'self-hosted' }}
    - ${{ matrix.test.gh-runner && matrix.test.instance || format('RUN_ID-{0}', github.run_id) }}
    - ${{ matrix.test.gh-runner && matrix.test.instance || format('RUN_NUMBER-{0}', github.run_number) }}
    - ${{ matrix.test.gh-runner && matrix.test.instance || format('SHA-{0}', github.sha) }}
    - ${{ matrix.test.instance }}
  timeout-minutes: 90
  needs: create-runners
  strategy:
    # Let the remaining suites finish even if one of them fails.
    fail-fast: false
    matrix:
      test:
        - test: TestCpuFull
          instance: ubuntu-latest
          gh-runner: true
        - test: TestCpuBoth
          instance: ubuntu-latest
          gh-runner: true
        # - test: TestGpu
        #   instance: g6
        - test: TestAarch64
          instance: aarch64
        - test: TestHfHandler
          instance: g6
        # - test: TestTrtLlmHandler1
        #   instance: g6
        # - test: TestTrtLlmHandler2
        #   instance: g6
        # - test: TestSchedulerSingleGPU
        #   instance: g6
        # - test: TestSchedulerMultiGPU
        #   instance: g6
        # - test: TestLmiDist1
        #   instance: g6
        # - test: TestLmiDist2
        #   instance: g6
        # - test: TestVllm1
        #   instance: g6
        # - test: TestVllmLora
        #   instance: g6
        # - test: TestLmiDistLora
        #   instance: g6
        # - test: TestNeuronx1
        #   instance: inf2
        # - test: TestNeuronx2
        #   instance: inf2
        # - test: TestNeuronxRollingBatch
        #   instance: inf2
        # - test: TestMultiModal
        #   instance: g6
        # - test: TestTextEmbedding
        #   instance: g6
        # - test: TestLmiDistPipelineParallel
        #   instance: g6
  steps:
    - uses: actions/checkout@v4
    - name: Clean env
      run: |
        yes | docker system prune -a --volumes
        sudo rm -rf /home/ubuntu/actions-runner/_work/_tool/Java_Corretto_jdk/
        echo "wait dpkg lock..."
        while sudo fuser /var/{lib/{dpkg,apt/lists},cache/apt/archives}/lock >/dev/null 2>&1; do sleep 5; done
    - name: install awscli
      run: |
        sudo apt-get update
        sudo apt-get install awscli -y
    - name: Set up Python3
      if: ${{ matrix.test.instance != 'aarch64' }}
      uses: actions/setup-python@v5
      with:
        python-version: '3.10.x'
    - name: Set up Python3 (aarch64)
      if: ${{ matrix.test.instance == 'aarch64' }}
      run: |
        # Using an alternate installation because of an incompatible combination
        # of aarch64 with ubuntu-20.04 not supported by the actions/setup-python
        sudo apt-get install python3 python-is-python3 python3-pip -y
    - name: Install pip dependencies
      run: pip3 install pytest requests "numpy<2" pillow huggingface_hub
    - name: Install torch
      # Use torch to get cuda capability of current device to selectively run tests
      # Torch version doesn't really matter that much
      run: |
        pip3 install torch==2.3.0
    - name: Install awscurl
      working-directory: tests/integration
      run: |
        wget https://publish.djl.ai/awscurl/awscurl
        chmod +x awscurl
        mkdir outputs
    - name: Configure AWS Credentials
      uses: aws-actions/configure-aws-credentials@v4
      with:
        role-to-assume: arn:aws:iam::185921645874:role/github-actions-djl-serving
        aws-region: us-east-1
    - name: Test
      working-directory: tests/integration
      env:
        TEST_DJL_VERSION: ${{ inputs.djl-version }}
        OVERRIDE_IMAGE_TAG_SUFFIX: ${{ inputs.tag-suffix }}
        IMAGE_REPO: ${{ env.AWS_ECR_REPO }}
      run: |
        # The ECR region is the 4th dot-separated field of the registry host.
        ECR_REGION=$(echo "${{ env.AWS_ECR_REPO }}" | awk -F. '{print $4}')
        aws ecr get-login-password --region $ECR_REGION | docker login --username AWS --password-stdin ${{env.AWS_ECR_REPO}}
        python -m pytest -s -k ${{ matrix.test.test }} tests.py
    # Runs only when every previous step succeeded (default step condition).
    - name: Cleanup
      working-directory: tests/integration
      run: |
        rm -rf outputs
        rm awscurl
    # Dumps the collected output files to the log, then cleans up.
    - name: On Failure
      if: ${{ failure() }}
      working-directory: tests/integration
      run: |
        for file in outputs/*; do if [ -f "$file" ]; then echo "Contents of $file:"; cat "$file"; echo; fi; done
        sudo rm -rf outputs && sudo rm -rf models
        rm awscurl
        ./remove_container.sh
    - name: Upload test logs
      if: ${{ always() }}
      uses: actions/upload-artifact@v4
      with:
        name: test-${{ matrix.test.test }}-logs
        path: tests/integration/all_logs/
    # Publishes this leg's result under a key named after its test.
    - name: Set test status
      if: ${{ always() }}
      id: test_status
      run: |
        echo ${{ job.status }}
        # The redirect target must be the expanded $GITHUB_OUTPUT path; an
        # escaped `\$GITHUB_OUTPUT` writes to a literal file of that name and
        # the step output is never set.
        if [[ ${{ job.status }} == "success" ]]; then
          echo "test_result_${{ matrix.test.test }}=true" >> "$GITHUB_OUTPUT"
        else
          echo "test_result_${{ matrix.test.test }}=false" >> "$GITHUB_OUTPUT"
        fi
  # Each matrix leg only sets the step output named after its own test, so
  # entries for suites that are commented out of the matrix stay empty.
  outputs:
    test_result_TestCpuFull: ${{ steps.test_status.outputs.test_result_TestCpuFull }}
    test_result_TestCpuBoth: ${{ steps.test_status.outputs.test_result_TestCpuBoth }}
    test_result_TestGpu: ${{ steps.test_status.outputs.test_result_TestGpu }}
    test_result_TestAarch64: ${{ steps.test_status.outputs.test_result_TestAarch64 }}
    test_result_TestHfHandler: ${{ steps.test_status.outputs.test_result_TestHfHandler }}
    test_result_TestTrtLlmHandler1: ${{ steps.test_status.outputs.test_result_TestTrtLlmHandler1 }}
    test_result_TestTrtLlmHandler2: ${{ steps.test_status.outputs.test_result_TestTrtLlmHandler2 }}
    test_result_TestSchedulerSingleGPU: ${{ steps.test_status.outputs.test_result_TestSchedulerSingleGPU }}
    test_result_TestSchedulerMultiGPU: ${{ steps.test_status.outputs.test_result_TestSchedulerMultiGPU }}
    test_result_TestLmiDist1: ${{ steps.test_status.outputs.test_result_TestLmiDist1 }}
    test_result_TestLmiDist2: ${{ steps.test_status.outputs.test_result_TestLmiDist2 }}
    test_result_TestVllm1: ${{ steps.test_status.outputs.test_result_TestVllm1 }}
    test_result_TestVllmLora: ${{ steps.test_status.outputs.test_result_TestVllmLora }}
    test_result_TestLmiDistLora: ${{ steps.test_status.outputs.test_result_TestLmiDistLora }}
    test_result_TestNeuronx1: ${{ steps.test_status.outputs.test_result_TestNeuronx1 }}
    test_result_TestNeuronx2: ${{ steps.test_status.outputs.test_result_TestNeuronx2 }}
    test_result_TestNeuronxRollingBatch: ${{ steps.test_status.outputs.test_result_TestNeuronxRollingBatch }}
    test_result_TestMultiModal: ${{ steps.test_status.outputs.test_result_TestMultiModal }}
    test_result_TestTextEmbedding: ${{ steps.test_status.outputs.test_result_TestTextEmbedding }}
    test_result_TestLmiDistPipelineParallel: ${{ steps.test_status.outputs.test_result_TestLmiDistPipelineParallel }}
    # NOTE(review): maps to a step output that is never written; looks like a
    # debugging probe. It is still referenced by the test_results job.
    test_result_sheteng_not_Exist: ${{ steps.test_status.outputs.not_existing }}
# Collects the per-suite results of the `test` job and publishes, as a JSON
# array, the list of images whose suites all passed.
test_results:
  # Run even when upstream jobs failed, but not when the run was cancelled.
  if: ${{ !cancelled() }}
  runs-on: ubuntu-latest
  timeout-minutes: 5
  needs: [ test, transformers-neuronx-container-unit-tests ]
  steps:
    - name: summarize
      id: summarize_passing_image
      run: |
        declare -a image_list=()
        # Debug output for an output key that is never set upstream.
        echo needs.test.outputs.test_result_sheteng_not_Exist
        echo ${{needs.test.outputs.test_result_sheteng_not_Exist}}
        if [ "${{ needs.test.outputs.test_result_TestCpuBoth }}" = "true" ] && \
           [ "${{ needs.test.outputs.test_result_TestCpuFull }}" = "true" ]; then
          image_list+=("cpu")
        fi
        # Both Neuron suites must pass (the second check previously repeated
        # TestNeuronx1).
        # NOTE(review): the flag below is not read anywhere in this step.
        if [ "${{ needs.test.outputs.test_result_TestNeuronx1 }}" = "true" ] && \
           [ "${{ needs.test.outputs.test_result_TestNeuronx2 }}" = "true" ]; then
          push_neuron_image=true
        fi
        # Emit compact single-line JSON: $GITHUB_OUTPUT takes one key=value
        # record per line, and an empty list must serialize as [] not [""].
        json_array=$(jq -cn '$ARGS.positional' --args "${image_list[@]}")
        echo "image_list=$json_array" >> "$GITHUB_OUTPUT"
  outputs:
    images_to_push: ${{ steps.summarize_passing_image.outputs.image_list }}
# Builds the djl_python wheel and runs the neuron test scripts inside the
# pytorch-inf2 container on a self-hosted inf2 runner created for this run.
transformers-neuronx-container-unit-tests:
  runs-on:
    - self-hosted
    - inf2
    - RUN_ID-${{ github.run_id }}
    - RUN_NUMBER-${{ github.run_number }}
    - SHA-${{ github.sha }}
  timeout-minutes: 15
  needs: create-runners
  steps:
    - uses: actions/checkout@v4
    - name: Clean env
      run: |
        yes | docker system prune -a --volumes
        sudo rm -rf /home/ubuntu/actions-runner/_work/_tool/Java_Corretto_jdk/
        echo "wait dpkg lock..."
        while sudo fuser /var/{lib/{dpkg,apt/lists},cache/apt/archives}/lock >/dev/null 2>&1; do sleep 5; done
    - name: Set up Python3
      uses: actions/setup-python@v5
      with:
        python-version: '3.10.x'
    - name: Install pip dependencies
      run: pip3 install requests numpy pillow wheel
    - name: Configure AWS Credentials
      uses: aws-actions/configure-aws-credentials@v4
      with:
        role-to-assume: arn:aws:iam::185921645874:role/github-actions-djl-serving
        aws-region: us-east-1
    # `inputs.*` is used instead of `github.event.inputs.*` so the value is
    # also available when this workflow is invoked through workflow_call.
    - name: Build container name
      run: |
        ./serving/docker/scripts/docker_name_builder.sh pytorch-inf2 ${{ inputs.djl-version }}
    - name: Download models and dockers
      run: |
        # The operand is quoted so an empty djl-version does not break `[`.
        if [ "${{ inputs.djl-version }}" == "temp" ]; then
          DOCKER_IMAGE_URI="185921645874.dkr.ecr.us-east-1.amazonaws.com/djl-ci-temp:pytorch-inf2-${GITHUB_SHA}-${GITHUB_RUN_ID}"
        else
          # NOTE(review): DJLSERVING_DOCKER_TAG is presumably exported by
          # docker_name_builder.sh in the previous step - confirm.
          DOCKER_IMAGE_URI="deepjavalibrary/djl-serving:$DJLSERVING_DOCKER_TAG"
        fi
        echo "DOCKER_IMAGE_URI=$DOCKER_IMAGE_URI" >>$GITHUB_ENV
        aws ecr get-login-password --region us-east-1 | docker login --username AWS --password-stdin ${{env.AWS_ECR_REPO}}
        echo $DOCKER_IMAGE_URI
        docker pull $DOCKER_IMAGE_URI
    - name: Run djl_python unit/integration tests on container
      working-directory: engines/python/setup
      run: |
        # Setup
        pip install setuptools
        python3 -m setup bdist_wheel
        mkdir logs
        # NOTE(review): the nested quoting of the container command is kept
        # exactly as-is; how it is parsed depends on the image entrypoint,
        # which is not visible from this workflow.
        docker run -t --rm --network="host" \
        --name neuron-test \
        -v $PWD/:/opt/ml/model/ \
        -w /opt/ml/model \
        --device=/dev/neuron0:/dev/neuron0 \
        $DOCKER_IMAGE_URI \
        /bin/bash -c "'pip install /opt/ml/model/dist/*.whl pytest' && \
        pytest djl_python/tests/neuron_test_scripts/ | tee logs/results.log"
        # Cleanup
        sudo rm -rf TinyLlama .pytest_cache djl_python
        # Fail on failed tests
        if grep -F "failed" logs/results.log &>/dev/null; then exit 1; fi
    - name: On fail step
      if: ${{ failure() }}
      working-directory: engines/python/setup
      run: |
        cat logs/results.log
    # Upload even when the tests failed; that is when the logs matter most.
    # NOTE(review): this job defines no matrix, so `matrix.arch` expands to
    # an empty string. The expression is left untouched to keep the artifact
    # name stable.
    - name: Upload test logs
      if: ${{ always() }}
      uses: actions/upload-artifact@v4
      with:
        name: transformers-neuronx-${{ matrix.arch }}-logs
        path: engines/python/setup/logs/
    - name: Set test status
      if: ${{ always() }}
      id: test_status
      run: |
        echo ${{ job.status }}
        # Write to the expanded $GITHUB_OUTPUT path, and use the same key in
        # both branches so the job output below is set on failure too.
        if [[ ${{ job.status }} == "success" ]]; then
          echo "test_result_Transformer_Neuron_UnitTest=true" >> "$GITHUB_OUTPUT"
        else
          echo "test_result_Transformer_Neuron_UnitTest=false" >> "$GITHUB_OUTPUT"
        fi
  outputs:
    test_result_Transformer_Neuron_UnitTest: ${{ steps.test_status.outputs.test_result_Transformer_Neuron_UnitTest }}
# Stops every instance started by create-runners. Runs regardless of the
# outcome of the test jobs.
stop-runners:
  if: always()
  runs-on: [ self-hosted, scheduler ]
  needs: [ create-runners, test, transformers-neuronx-container-unit-tests]
  steps:
    - name: Stop all instances
      run: |
        cd /home/ubuntu/djl_benchmark_script/scripts
        # Attempt every instance even if one stop fails: the step shell runs
        # with `-e`, so a bare failing command would abort the script and
        # leave the remaining instances running.
        exit_code=0
        for instance_id in \
          "${{ needs.create-runners.outputs.gpu_instance_id_1 }}" \
          "${{ needs.create-runners.outputs.gpu_instance_id_2 }}" \
          "${{ needs.create-runners.outputs.gpu_instance_id_3 }}" \
          "${{ needs.create-runners.outputs.gpu_instance_id_4 }}" \
          "${{ needs.create-runners.outputs.aarch64_instance_id }}" \
          "${{ needs.create-runners.outputs.inf2_instance_id_1 }}" \
          "${{ needs.create-runners.outputs.inf2_instance_id_2 }}"
        do
          # An empty id means create-runners produced no output for it.
          if [ -z "$instance_id" ]; then
            echo "Skipping empty instance id"
            continue
          fi
          ./stop_instance.sh "$instance_id" || exit_code=1
        done
        # Still report a failure if any instance could not be stopped.
        exit "$exit_code"