Integration tests #1320

Workflow file for this run

.github/workflows/integration.yml at 66466fd

	# Integration test workflow. It can be started manually (workflow_dispatch)
	# or invoked from another workflow (workflow_call); both triggers accept the
	# same two optional inputs.
	name: Integration tests

	on:
	  workflow_dispatch:
	    inputs:
	      djl-version:
	        description: 'The released version of DJL'
	        required: false
	        # Declared explicitly so this input matches the other input definitions.
	        type: string
	        default: ''
	      tag-suffix:
	        description: 'Run tests on the specific tags suffix i.e. arch-{suffix}'
	        required: false
	        type: string
	        default: 'nightly'
	  workflow_call:
	    inputs:
	      djl-version:
	        description: 'The released version of DJL'
	        required: false
	        type: string
	        default: ''
	      tag-suffix:
	        description: 'Run tests on the specific tags suffix i.e. arch-{suffix}'
	        required: false
	        type: string
	        default: 'nightly'
	    outputs:
	      images-to-push:
	        # Forwarded from the images_to_push output of the test_results job.
	        value: ${{ jobs.test_results.outputs.images_to_push }}

	env:
	  # ECR repository used for the docker logins and handed to the tests as
	  # IMAGE_REPO.
	  AWS_ECR_REPO: "185921645874.dkr.ecr.us-east-1.amazonaws.com/djl-ci-temp"

	jobs:
	# Requests a runner registration token from the GitHub API and calls
	# start_instance.sh once per instance: four action_g6, one action_graviton
	# and two action_inf2. The instance ids are exposed as job outputs.
	create-runners:
	  runs-on: [self-hosted, scheduler]
	  defaults:
	    run:
	      # `shell: bash` runs each script with `-eo pipefail`, so a failed
	      # `curl --fail` fails the step instead of silently producing an empty
	      # token that would then be passed on to start_instance.sh.
	      shell: bash
	  steps:
	    - name: Create new G6 instance
	      id: create_gpu
	      run: |
	        cd /home/ubuntu/djl_benchmark_script/scripts
	        token=$( curl -X POST -H "Authorization: token ${{ secrets.ACTION_RUNNER_PERSONAL_TOKEN }}" \
	          https://api.github.com/repos/deepjavalibrary/djl-serving/actions/runners/registration-token \
	          --fail \
	          | jq -r '.token' )
	        # Quoted so an unexpected empty token cannot shift the arguments.
	        ./start_instance.sh action_g6 "$token" djl-serving
	    - name: Create new G6 instance
	      id: create_gpu2
	      run: |
	        cd /home/ubuntu/djl_benchmark_script/scripts
	        token=$( curl -X POST -H "Authorization: token ${{ secrets.ACTION_RUNNER_PERSONAL_TOKEN }}" \
	          https://api.github.com/repos/deepjavalibrary/djl-serving/actions/runners/registration-token \
	          --fail \
	          | jq -r '.token' )
	        ./start_instance.sh action_g6 "$token" djl-serving
	    - name: Create new G6 instance
	      id: create_gpu3
	      run: |
	        cd /home/ubuntu/djl_benchmark_script/scripts
	        token=$( curl -X POST -H "Authorization: token ${{ secrets.ACTION_RUNNER_PERSONAL_TOKEN }}" \
	          https://api.github.com/repos/deepjavalibrary/djl-serving/actions/runners/registration-token \
	          --fail \
	          | jq -r '.token' )
	        ./start_instance.sh action_g6 "$token" djl-serving
	    - name: Create new G6 instance
	      id: create_gpu4
	      run: |
	        cd /home/ubuntu/djl_benchmark_script/scripts
	        token=$( curl -X POST -H "Authorization: token ${{ secrets.ACTION_RUNNER_PERSONAL_TOKEN }}" \
	          https://api.github.com/repos/deepjavalibrary/djl-serving/actions/runners/registration-token \
	          --fail \
	          | jq -r '.token' )
	        ./start_instance.sh action_g6 "$token" djl-serving
	    - name: Create new Graviton instance
	      id: create_aarch64
	      run: |
	        cd /home/ubuntu/djl_benchmark_script/scripts
	        token=$( curl -X POST -H "Authorization: token ${{ secrets.ACTION_RUNNER_PERSONAL_TOKEN }}" \
	          https://api.github.com/repos/deepjavalibrary/djl-serving/actions/runners/registration-token \
	          --fail \
	          | jq -r '.token' )
	        ./start_instance.sh action_graviton "$token" djl-serving
	    - name: Create new Inf2.24xl instance
	      id: create_inf2
	      run: |
	        cd /home/ubuntu/djl_benchmark_script/scripts
	        token=$( curl -X POST -H "Authorization: token ${{ secrets.ACTION_RUNNER_PERSONAL_TOKEN }}" \
	          https://api.github.com/repos/deepjavalibrary/djl-serving/actions/runners/registration-token \
	          --fail \
	          | jq -r '.token' )
	        ./start_instance.sh action_inf2 "$token" djl-serving
	    - name: Create new Inf2.24xl instance
	      id: create_inf2_2
	      run: |
	        cd /home/ubuntu/djl_benchmark_script/scripts
	        token=$( curl -X POST -H "Authorization: token ${{ secrets.ACTION_RUNNER_PERSONAL_TOKEN }}" \
	          https://api.github.com/repos/deepjavalibrary/djl-serving/actions/runners/registration-token \
	          --fail \
	          | jq -r '.token' )
	        ./start_instance.sh action_inf2 "$token" djl-serving
	  # NOTE(review): the *_instance_id step outputs are presumably written by
	  # start_instance.sh; that script is not part of this file - confirm.
	  outputs:
	    gpu_instance_id_1: ${{ steps.create_gpu.outputs.action_g6_instance_id }}
	    gpu_instance_id_2: ${{ steps.create_gpu2.outputs.action_g6_instance_id }}
	    gpu_instance_id_3: ${{ steps.create_gpu3.outputs.action_g6_instance_id }}
	    gpu_instance_id_4: ${{ steps.create_gpu4.outputs.action_g6_instance_id }}
	    aarch64_instance_id: ${{ steps.create_aarch64.outputs.action_graviton_instance_id }}
	    inf2_instance_id_1: ${{ steps.create_inf2.outputs.action_inf2_instance_id }}
	    inf2_instance_id_2: ${{ steps.create_inf2_2.outputs.action_inf2_instance_id }}


	# Runs one pytest selection per matrix entry and records whether it passed.
	test:
	  # Entries with `gh-runner: true` resolve every label to
	  # matrix.test.instance. All other entries require the self-hosted label,
	  # the RUN_ID / RUN_NUMBER / SHA labels of this run and the instance label.
	  runs-on:
	    - ${{ matrix.test.gh-runner && matrix.test.instance || 'self-hosted' }}
	    - ${{ matrix.test.gh-runner && matrix.test.instance || format('RUN_ID-{0}', github.run_id) }}
	    - ${{ matrix.test.gh-runner && matrix.test.instance || format('RUN_NUMBER-{0}', github.run_number) }}
	    - ${{ matrix.test.gh-runner && matrix.test.instance || format('SHA-{0}', github.sha) }}
	    - ${{ matrix.test.instance }}
	  timeout-minutes: 90
	  needs: create-runners
	  strategy:
	    fail-fast: false
	    matrix:
	      test:
	        - test: TestCpuFull
	          instance: ubuntu-latest
	          gh-runner: true
	        - test: TestCpuBoth
	          instance: ubuntu-latest
	          gh-runner: true
	        # - test: TestGpu
	        #   instance: g6
	        - test: TestAarch64
	          instance: aarch64
	        - test: TestHfHandler
	          instance: g6
	        # - test: TestTrtLlmHandler1
	        #   instance: g6
	        # - test: TestTrtLlmHandler2
	        #   instance: g6
	        # - test: TestSchedulerSingleGPU
	        #   instance: g6
	        # - test: TestSchedulerMultiGPU
	        #   instance: g6
	        # - test: TestLmiDist1
	        #   instance: g6
	        # - test: TestLmiDist2
	        #   instance: g6
	        # - test: TestVllm1
	        #   instance: g6
	        # - test: TestVllmLora
	        #   instance: g6
	        # - test: TestLmiDistLora
	        #   instance: g6
	        # - test: TestNeuronx1
	        #   instance: inf2
	        # - test: TestNeuronx2
	        #   instance: inf2
	        # - test: TestNeuronxRollingBatch
	        #   instance: inf2
	        # - test: TestMultiModal
	        #   instance: g6
	        # - test: TestTextEmbedding
	        #   instance: g6
	        # - test: TestLmiDistPipelineParallel
	        #   instance: g6
	  steps:
	    - uses: actions/checkout@v4
	    - name: Clean env
	      run: |
	        yes | docker system prune -a --volumes
	        sudo rm -rf /home/ubuntu/actions-runner/_work/_tool/Java_Corretto_jdk/
	        echo "wait dpkg lock..."
	        while sudo fuser /var/{lib/{dpkg,apt/lists},cache/apt/archives}/lock >/dev/null 2>&1; do sleep 5; done
	    - name: install awscli
	      run: |
	        sudo apt-get update
	        sudo apt-get install awscli -y
	    - name: Set up Python3
	      if: ${{ matrix.test.instance != 'aarch64' }}
	      uses: actions/setup-python@v5
	      with:
	        python-version: '3.10.x'
	    - name: Set up Python3 (aarch64)
	      if: ${{ matrix.test.instance == 'aarch64' }}
	      run: |
	        # Using an alternate installation because of an incompatible combination
	        # of aarch64 with ubuntu-20.04 not supported by the actions/setup-python
	        sudo apt-get install python3 python-is-python3 python3-pip -y
	    - name: Install pip dependencies
	      run: pip3 install pytest requests "numpy<2" pillow huggingface_hub
	    - name: Install torch
	      # Use torch to get cuda capability of current device to selectively run tests
	      # Torch version doesn't really matter that much
	      run: |
	        pip3 install torch==2.3.0
	    - name: Install awscurl
	      working-directory: tests/integration
	      run: |
	        wget https://publish.djl.ai/awscurl/awscurl
	        chmod +x awscurl
	        mkdir outputs
	    - name: Configure AWS Credentials
	      uses: aws-actions/configure-aws-credentials@v4
	      with:
	        role-to-assume: arn:aws:iam::185921645874:role/github-actions-djl-serving
	        aws-region: us-east-1
	    - name: Test
	      working-directory: tests/integration
	      env:
	        TEST_DJL_VERSION: ${{ inputs.djl-version }}
	        OVERRIDE_IMAGE_TAG_SUFFIX: ${{ inputs.tag-suffix }}
	        IMAGE_REPO: ${{ env.AWS_ECR_REPO }}
	      run: |
	        # The region is the 4th dot-separated field of the ECR repository host.
	        ECR_REGION=$(echo "${{ env.AWS_ECR_REPO }}" | awk -F. '{print $4}')
	        aws ecr get-login-password --region $ECR_REGION | docker login --username AWS --password-stdin ${{env.AWS_ECR_REPO}}
	        python -m pytest -s -k ${{ matrix.test.test }} tests.py
	    # Runs only when the previous steps succeeded (default step condition).
	    - name: Cleanup
	      working-directory: tests/integration
	      run: |
	        rm -rf outputs
	        rm awscurl
	    - name: On Failure
	      if: ${{ failure() }}
	      working-directory: tests/integration
	      run: |
	        for file in outputs/*; do if [ -f "$file" ]; then echo "Contents of $file:"; cat "$file"; echo; fi; done
	        sudo rm -rf outputs && sudo rm -rf models
	        # -f: awscurl may be missing if the failure happened before it was
	        # downloaded; the container cleanup below must still run.
	        rm -f awscurl
	        ./remove_container.sh
	    - name: Upload test logs
	      if: ${{ always() }}
	      uses: actions/upload-artifact@v4
	      with:
	        name: test-${{ matrix.test.test }}-logs
	        path: tests/integration/all_logs/
	    - name: Set test status
	      if: ${{ always() }}
	      id: test_status
	      run: |
	        echo ${{ job.status }}
	        # $GITHUB_OUTPUT must be expanded by the shell; an escaped `\$` would
	        # append to a file literally named "$GITHUB_OUTPUT" and the step
	        # output would never be set.
	        if [[ ${{ job.status }} == "success" ]]; then
	          echo "test_result_${{ matrix.test.test }}=true" >> "$GITHUB_OUTPUT"
	        else
	          echo "test_result_${{ matrix.test.test }}=false" >> "$GITHUB_OUTPUT"
	        fi
	  # Each matrix entry writes only its own test_result_<test> key, so the
	  # keys of tests that are commented out of the matrix stay empty.
	  outputs:
	    test_result_TestCpuFull: ${{ steps.test_status.outputs.test_result_TestCpuFull }}
	    test_result_TestCpuBoth: ${{ steps.test_status.outputs.test_result_TestCpuBoth }}
	    test_result_TestGpu: ${{ steps.test_status.outputs.test_result_TestGpu }}
	    test_result_TestAarch64: ${{ steps.test_status.outputs.test_result_TestAarch64 }}
	    test_result_TestHfHandler: ${{ steps.test_status.outputs.test_result_TestHfHandler }}
	    test_result_TestTrtLlmHandler1: ${{ steps.test_status.outputs.test_result_TestTrtLlmHandler1 }}
	    test_result_TestTrtLlmHandler2: ${{ steps.test_status.outputs.test_result_TestTrtLlmHandler2 }}
	    test_result_TestSchedulerSingleGPU: ${{ steps.test_status.outputs.test_result_TestSchedulerSingleGPU }}
	    test_result_TestSchedulerMultiGPU: ${{ steps.test_status.outputs.test_result_TestSchedulerMultiGPU }}
	    test_result_TestLmiDist1: ${{ steps.test_status.outputs.test_result_TestLmiDist1 }}
	    test_result_TestLmiDist2: ${{ steps.test_status.outputs.test_result_TestLmiDist2 }}
	    test_result_TestVllm1: ${{ steps.test_status.outputs.test_result_TestVllm1 }}
	    test_result_TestVllmLora: ${{ steps.test_status.outputs.test_result_TestVllmLora }}
	    test_result_TestLmiDistLora: ${{ steps.test_status.outputs.test_result_TestLmiDistLora }}
	    test_result_TestNeuronx1: ${{ steps.test_status.outputs.test_result_TestNeuronx1 }}
	    test_result_TestNeuronx2: ${{ steps.test_status.outputs.test_result_TestNeuronx2 }}
	    test_result_TestNeuronxRollingBatch: ${{ steps.test_status.outputs.test_result_TestNeuronxRollingBatch }}
	    test_result_TestMultiModal: ${{ steps.test_status.outputs.test_result_TestMultiModal }}
	    test_result_TestTextEmbedding: ${{ steps.test_status.outputs.test_result_TestTextEmbedding }}
	    test_result_TestLmiDistPipelineParallel: ${{ steps.test_status.outputs.test_result_TestLmiDistPipelineParallel }}
	    # NOTE(review): no step writes a `not_existing` output, so this is always
	    # empty; it looks like a debugging placeholder - confirm it can be dropped.
	    test_result_sheteng_not_Exist: ${{ steps.test_status.outputs.not_existing }}

	# Collects the per-test results of the `test` job and publishes the list of
	# images whose tests passed as a JSON array in the images_to_push output.
	test_results:
	  if: ${{ !cancelled() }}
	  runs-on: ubuntu-latest
	  timeout-minutes: 5
	  needs: [ test, transformers-neuronx-container-unit-tests ]
	  steps:
	    - name: summarize
	      id: summarize_passing_image
	      run: |
	        declare -a image_list=()

	        if [ "${{ needs.test.outputs.test_result_TestCpuBoth }}" = "true" ] && \
	           [ "${{ needs.test.outputs.test_result_TestCpuFull }}" = "true" ]; then
	          image_list+=("cpu")
	        fi

	        # NOTE(review): push_neuron_image is not consumed anywhere in this
	        # step, so the neuron image never reaches image_list - confirm the
	        # intended image name before wiring it in. The second operand
	        # previously repeated TestNeuronx1; TestNeuronx2 is presumably what
	        # was meant.
	        if [ "${{ needs.test.outputs.test_result_TestNeuronx1 }}" = "true" ] && \
	           [ "${{ needs.test.outputs.test_result_TestNeuronx2 }}" = "true" ]; then
	          push_neuron_image=true
	        fi

	        # Emit compact, single-line JSON: a `name=value` record in
	        # $GITHUB_OUTPUT must fit on one line. Building the array from
	        # positional arguments also yields [] (not [""]) for an empty list.
	        json_array=$(jq -cn '$ARGS.positional' --args "${image_list[@]}")
	        echo "image_list=$json_array" >> "$GITHUB_OUTPUT"
	  outputs:
	    images_to_push: ${{ steps.summarize_passing_image.outputs.image_list }}

	# Builds the djl_python wheel, runs the neuron test scripts inside the
	# pytorch-inf2 container on an inf2 runner of this run, and records the
	# result as a job output.
	transformers-neuronx-container-unit-tests:
	  runs-on:
	    - self-hosted
	    - inf2
	    - RUN_ID-${{ github.run_id }}
	    - RUN_NUMBER-${{ github.run_number }}
	    - SHA-${{ github.sha }}
	  timeout-minutes: 15
	  needs: create-runners
	  steps:
	    - uses: actions/checkout@v4
	    - name: Clean env
	      run: |
	        yes | docker system prune -a --volumes
	        sudo rm -rf /home/ubuntu/actions-runner/_work/_tool/Java_Corretto_jdk/
	        echo "wait dpkg lock..."
	        while sudo fuser /var/{lib/{dpkg,apt/lists},cache/apt/archives}/lock >/dev/null 2>&1; do sleep 5; done
	    - name: Set up Python3
	      uses: actions/setup-python@v5
	      with:
	        python-version: '3.10.x'
	    - name: Install pip dependencies
	      run: pip3 install requests numpy pillow wheel
	    - name: Configure AWS Credentials
	      uses: aws-actions/configure-aws-credentials@v4
	      with:
	        role-to-assume: arn:aws:iam::185921645874:role/github-actions-djl-serving
	        aws-region: us-east-1
	    - name: Build container name
	      # The `inputs` context is populated for both workflow_dispatch and
	      # workflow_call, unlike github.event.inputs.
	      run: |
	        ./serving/docker/scripts/docker_name_builder.sh pytorch-inf2 ${{ inputs.djl-version }}
	    - name: Download models and dockers
	      run: |
	        # Quoted so an empty djl-version does not break the test expression.
	        if [ "${{ inputs.djl-version }}" == "temp" ]; then
	          DOCKER_IMAGE_URI="185921645874.dkr.ecr.us-east-1.amazonaws.com/djl-ci-temp:pytorch-inf2-${GITHUB_SHA}-${GITHUB_RUN_ID}"
	        else
	          # NOTE(review): DJLSERVING_DOCKER_TAG is presumably exported by
	          # docker_name_builder.sh in the previous step - confirm.
	          DOCKER_IMAGE_URI="deepjavalibrary/djl-serving:$DJLSERVING_DOCKER_TAG"
	        fi
	        echo "DOCKER_IMAGE_URI=$DOCKER_IMAGE_URI" >>$GITHUB_ENV
	        aws ecr get-login-password --region us-east-1 | docker login --username AWS --password-stdin ${{env.AWS_ECR_REPO}}
	        echo $DOCKER_IMAGE_URI
	        docker pull $DOCKER_IMAGE_URI
	    - name: Run djl_python unit/integration tests on container
	      working-directory: engines/python/setup
	      run: |
	        # Setup
	        pip install setuptools
	        python3 -m setup bdist_wheel
	        mkdir logs
	        # NOTE(review): the nested quoting of the pip command is unusual and
	        # presumably relies on how the image entrypoint re-evaluates its
	        # arguments - confirm before changing it.
	        docker run -t --rm --network="host" \
	          --name neuron-test \
	          -v $PWD/:/opt/ml/model/ \
	          -w /opt/ml/model \
	          --device=/dev/neuron0:/dev/neuron0 \
	          $DOCKER_IMAGE_URI \
	          /bin/bash -c "'pip install /opt/ml/model/dist/*.whl pytest' && \
	          pytest djl_python/tests/neuron_test_scripts/ | tee logs/results.log"

	        # Cleanup
	        sudo rm -rf TinyLlama .pytest_cache djl_python

	        # Fail on failed tests
	        if grep -F "failed" logs/results.log &>/dev/null; then exit 1; fi
	    - name: On fail step
	      if: ${{ failure() }}
	      working-directory: engines/python/setup
	      run: |
	        cat logs/results.log
	    - name: Upload test logs
	      # Upload even when an earlier step failed; that is when the logs are
	      # needed most.
	      if: ${{ always() }}
	      uses: actions/upload-artifact@v4
	      with:
	        # NOTE(review): this job defines no matrix, so matrix.arch renders as
	        # an empty string; the name is left unchanged in case something
	        # downloads the artifact by its current name.
	        name: transformers-neuronx-${{ matrix.arch }}-logs
	        path: engines/python/setup/logs/
	    - name: Set test status
	      if: ${{ always() }}
	      id: test_status
	      run: |
	        echo ${{ job.status }}
	        # Both branches write the key that the job output below reads, and
	        # $GITHUB_OUTPUT is expanded by the shell rather than escaped.
	        if [[ ${{ job.status }} == "success" ]]; then
	          echo "test_result_Transformer_Neuron_UnitTest=true" >> "$GITHUB_OUTPUT"
	        else
	          echo "test_result_Transformer_Neuron_UnitTest=false" >> "$GITHUB_OUTPUT"
	        fi
	  outputs:
	    test_result_Transformer_Neuron_UnitTest: ${{ steps.test_status.outputs.test_result_Transformer_Neuron_UnitTest }}

	# Stops every instance reported by create-runners. Runs even when earlier
	# jobs failed or were cancelled (`if: always()`).
	stop-runners:
	  if: always()
	  runs-on: [ self-hosted, scheduler ]
	  needs: [ create-runners, test, transformers-neuronx-container-unit-tests]
	  steps:
	    - name: Stop all instances
	      run: |
	        cd /home/ubuntu/djl_benchmark_script/scripts
	        # Try every instance even if one stop fails, so a single error (or an
	        # id that was never produced) cannot leave the others running.
	        status=0
	        for instance_id in \
	          "${{ needs.create-runners.outputs.gpu_instance_id_1 }}" \
	          "${{ needs.create-runners.outputs.gpu_instance_id_2 }}" \
	          "${{ needs.create-runners.outputs.gpu_instance_id_3 }}" \
	          "${{ needs.create-runners.outputs.gpu_instance_id_4 }}" \
	          "${{ needs.create-runners.outputs.aarch64_instance_id }}" \
	          "${{ needs.create-runners.outputs.inf2_instance_id_1 }}" \
	          "${{ needs.create-runners.outputs.inf2_instance_id_2 }}"; do
	          if [ -z "$instance_id" ]; then
	            echo "Skipping an instance id that was not reported by create-runners"
	            continue
	          fi
	          ./stop_instance.sh "$instance_id" || status=1
	        done
	        # Report the failure only after all instances have been attempted.
	        exit "$status"

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Integration tests #1320

Workflow file

Integration tests #1320

Jobs

Run details

Workflow file for this run