Fix openai protocols and pass top_k, min_p #3111

Workflow file for this run

.github/workflows/pr-test.yml at 59d8031

	name: PR Test

	on:
	push:
	branches: [ main ]
	paths:
	- "python/sglang/**"
	- "test/**"
	pull_request:
	branches: [ main ]
	paths:
	- "python/sglang/**"
	- "test/**"
	workflow_dispatch:
	inputs:
	version:
	description: "FlashInfer version"
	required: true
	type: choice
	default: 'release'
	options:
	- 'release'
	- 'nightly'

	concurrency:
	group: pr-test-${{ github.ref }}
	cancel-in-progress: true

	jobs:

	unit-test-frontend:
	if: github.repository == 'sgl-project/sglang' \|\| github.event_name == 'pull_request'
	runs-on: 1-gpu-runner
	steps:
	- name: Checkout code
	uses: actions/checkout@v3

	- name: Install dependencies
	env:
	FLASHINFER_REPO: ${{ inputs.version == 'nightly' && 'https://flashinfer.ai/whl/nightly/cu121/torch2.4/flashinfer' \|\| 'https://flashinfer.ai/whl/cu121/torch2.4/flashinfer' }}
	run: \|
	bash scripts/ci_install_dependency.sh

	- name: Run test
	timeout-minutes: 10
	run: \|
	cd test/lang
	python3 run_suite.py --suite minimal

	unit-test-backend-1-gpu:
	if: github.repository == 'sgl-project/sglang' \|\| github.event_name == 'pull_request'
	runs-on: 1-gpu-runner
	strategy:
	matrix:
	range: [0-6, 6-15, 15-23, 23-30, 30-100]
	steps:
	- name: Checkout code
	uses: actions/checkout@v3

	- name: Install dependencies
	env:
	FLASHINFER_REPO: ${{ inputs.version == 'nightly' && 'https://flashinfer.ai/whl/nightly/cu121/torch2.4/flashinfer' \|\| 'https://flashinfer.ai/whl/cu121/torch2.4/flashinfer' }}
	run: \|
	bash scripts/ci_install_dependency.sh

	- name: Run test
	timeout-minutes: 25
	run: \|
	cd test/srt
	RANGE=${{ matrix.range }}
	range_begin=${RANGE%-*}
	range_end=${RANGE#*-}
	python3 run_suite.py --suite minimal --range-begin ${range_begin} --range-end ${range_end}

	unit-test-backend-2-gpu:
	if: github.repository == 'sgl-project/sglang' \|\| github.event_name == 'pull_request'
	runs-on: 2-gpu-runner
	steps:
	- name: Checkout code
	uses: actions/checkout@v3

	- name: Install dependencies
	env:
	FLASHINFER_REPO: ${{ inputs.version == 'nightly' && 'https://flashinfer.ai/whl/nightly/cu121/torch2.4/flashinfer' \|\| 'https://flashinfer.ai/whl/cu121/torch2.4/flashinfer' }}
	run: \|
	bash scripts/ci_install_dependency.sh

	- name: Evaluate data parallelism accuracy (DP=2)
	timeout-minutes: 10
	run: \|
	cd test/srt
	python3 test_data_parallelism.py

	- name: Evaluate MLA accuracy (TP=2)
	timeout-minutes: 10
	run: \|
	cd test/srt
	python3 test_mla.py
	python3 test_mla_fp8.py
	python3 test_dp_attention.py

	- name: Test update weights from distributed
	timeout-minutes: 10
	run: \|
	cd test/srt
	python3 test_update_weights_from_distributed.py

	- name: Evaluate MoE EP accuracy (TP=2)
	timeout-minutes: 10
	run: \|
	cd test/srt
	python3 test_moe_ep.py

	performance-test-1-gpu-part-1:
	if: github.repository == 'sgl-project/sglang' \|\| github.event_name == 'pull_request'
	runs-on: 1-gpu-runner
	steps:
	- name: Checkout code
	uses: actions/checkout@v3

	- name: Install dependencies
	env:
	FLASHINFER_REPO: ${{ inputs.version == 'nightly' && 'https://flashinfer.ai/whl/nightly/cu121/torch2.4/flashinfer' \|\| 'https://flashinfer.ai/whl/cu121/torch2.4/flashinfer' }}
	run: \|
	bash scripts/ci_install_dependency.sh

	- name: Benchmark single latency
	timeout-minutes: 10
	run: \|
	cd test/srt
	python3 -m unittest test_bench_one_batch.TestBenchOneBatch.test_default

	- name: Benchmark online latency
	timeout-minutes: 10
	run: \|
	cd test/srt
	python3 -m unittest test_bench_serving.TestBenchServing.test_online_latency_default

	- name: Benchmark offline throughput
	timeout-minutes: 10
	run: \|
	cd test/srt
	python3 -m unittest test_bench_serving.TestBenchServing.test_offline_throughput_default

	- name: Benchmark offline throughput (Non-streaming, small batch size)
	timeout-minutes: 10
	run: \|
	cd test/srt
	python3 -m unittest test_bench_serving.TestBenchServing.test_offline_throughput_non_stream_small_batch_size

	performance-test-1-gpu-part-2:
	if: github.repository == 'sgl-project/sglang' \|\| github.event_name == 'pull_request'
	runs-on: 1-gpu-runner
	steps:
	- name: Checkout code
	uses: actions/checkout@v3

	- name: Install dependencies
	env:
	FLASHINFER_REPO: ${{ inputs.version == 'nightly' && 'https://flashinfer.ai/whl/nightly/cu121/torch2.4/flashinfer' \|\| 'https://flashinfer.ai/whl/cu121/torch2.4/flashinfer' }}
	run: \|
	bash scripts/ci_install_dependency.sh

	- name: Benchmark offline throughput (w/o RadixAttention)
	timeout-minutes: 10
	run: \|
	cd test/srt
	python3 -m unittest test_bench_serving.TestBenchServing.test_offline_throughput_without_radix_cache

	- name: Benchmark offline throughput (w/ Triton)
	timeout-minutes: 10
	run: \|
	cd test/srt
	python3 -m unittest test_bench_serving.TestBenchServing.test_offline_throughput_with_triton_attention_backend

	- name: Benchmark offline throughput (w/ FP8)
	timeout-minutes: 10
	run: \|
	cd test/srt
	python3 -m unittest test_bench_serving.TestBenchServing.test_offline_throughput_default_fp8

	performance-test-2-gpu:
	if: github.repository == 'sgl-project/sglang' \|\| github.event_name == 'pull_request'
	runs-on: 2-gpu-runner
	steps:
	- name: Checkout code
	uses: actions/checkout@v3

	- name: Install dependencies
	env:
	FLASHINFER_REPO: ${{ inputs.version == 'nightly' && 'https://flashinfer.ai/whl/nightly/cu121/torch2.4/flashinfer' \|\| 'https://flashinfer.ai/whl/cu121/torch2.4/flashinfer' }}
	run: \|
	bash scripts/ci_install_dependency.sh

	- name: Benchmark single latency (TP=2)
	timeout-minutes: 10
	run: \|
	cd test/srt
	python3 -m unittest test_bench_one_batch.TestBenchOneBatch.test_moe_default

	- name: Benchmark offline throughput (TP=2)
	timeout-minutes: 10
	run: \|
	cd test/srt
	python3 -m unittest test_bench_serving.TestBenchServing.test_moe_offline_throughput_default

	- name: Benchmark offline throughput (w/o RadixAttention) (TP=2)
	timeout-minutes: 10
	run: \|
	cd test/srt
	python3 -m unittest test_bench_serving.TestBenchServing.test_moe_offline_throughput_without_radix_cache

	accuracy-test-1-gpu:
	if: github.repository == 'sgl-project/sglang' \|\| github.event_name == 'pull_request'
	runs-on: 1-gpu-runner
	steps:
	- name: Checkout code
	uses: actions/checkout@v3

	- name: Install dependencies
	env:
	FLASHINFER_REPO: ${{ inputs.version == 'nightly' && 'https://flashinfer.ai/whl/nightly/cu121/torch2.4/flashinfer' \|\| 'https://flashinfer.ai/whl/cu121/torch2.4/flashinfer' }}
	run: \|
	bash scripts/ci_install_dependency.sh

	git clone https://github.com/merrymercy/human-eval.git
	cd human-eval
	pip install -e .

	- name: Evaluate accuracy
	timeout-minutes: 20
	run: \|
	cd test/srt
	python3 test_eval_accuracy_large.py


	accuracy-test-2-gpu:
	if: github.repository == 'sgl-project/sglang' \|\| github.event_name == 'pull_request'
	runs-on: 2-gpu-runner
	steps:
	- name: Checkout code
	uses: actions/checkout@v3

	- name: Install dependencies
	env:
	FLASHINFER_REPO: ${{ inputs.version == 'nightly' && 'https://flashinfer.ai/whl/nightly/cu121/torch2.4/flashinfer' \|\| 'https://flashinfer.ai/whl/cu121/torch2.4/flashinfer' }}
	run: \|
	bash scripts/ci_install_dependency.sh

	git clone https://github.com/merrymercy/human-eval.git
	cd human-eval
	pip install -e .

	- name: Evaluate accuracy (TP=2)
	timeout-minutes: 20
	run: \|
	cd test/srt
	python3 test_moe_eval_accuracy_large.py


	finish:
	needs: [
	unit-test-frontend, unit-test-backend-1-gpu, unit-test-backend-2-gpu,
	performance-test-1-gpu-part-1, performance-test-1-gpu-part-2, performance-test-2-gpu,
	accuracy-test-1-gpu, accuracy-test-2-gpu
	]
	runs-on: ubuntu-latest
	steps:
	- name: Finish
	run: echo "This is an empty step to ensure that all jobs are completed."

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Fix openai protocols and pass top_k, min_p #3111

Workflow file

Fix openai protocols and pass top_k, min_p #3111

Jobs

Run details

Workflow file for this run