Skip to content

Commit

Permalink
Build DML in Windows GPU CI pipeline (microsoft#22869)
Browse files Browse the repository at this point in the history
### Description
Add a new stage to build cuda and dml in Windows GPU CI pipeline (PR
checks) to prevent regressions introduced by new cuda tests.
Update all tests in cuda/testcases name prefix to CudaEp for skipping
them easily

### Motivation and Context
1. CudaNhwcEP is added by default when using cuda ep
2. if onnxruntime_ENABLE_CUDA_EP_INTERNAL_TES is enable, the tests in
tests/provider/cuda/testcases is added too.

### To do
add enable_pybind in the new stage.
Now, --enable_pybind will trigger some python test, like
onnxruntime_test_python.py.
It uses the API of get_avaible_providers() .
More discussions are needed to decide how to make it works
  • Loading branch information
mszhanyi authored Nov 25, 2024
1 parent a2ba3cb commit 85751e7
Show file tree
Hide file tree
Showing 15 changed files with 81 additions and 38 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -27,13 +27,15 @@
import java.util.HashMap;
import java.util.Map;
import org.junit.jupiter.api.Test;
import org.junit.jupiter.api.condition.DisabledIfSystemProperty;
import org.junit.jupiter.api.condition.EnabledIfSystemProperty;

public class ProviderOptionsTest {
private static final OrtEnvironment env = TestHelpers.getOrtEnvironment();

@Test
@EnabledIfSystemProperty(named = "USE_CUDA", matches = "1")
@DisabledIfSystemProperty(named = "NO_CUDA_TEST", matches = "1")
public void testCUDAOptions() throws OrtException {
// Test standard options
OrtCUDAProviderOptions cudaOpts = new OrtCUDAProviderOptions(0);
Expand Down Expand Up @@ -61,6 +63,7 @@ public void testCUDAOptions() throws OrtException {

@Test
@EnabledIfSystemProperty(named = "USE_TENSORRT", matches = "1")
@DisabledIfSystemProperty(named = "NO_CUDA_TEST", matches = "1")
public void testTensorRT() throws OrtException {
// Test standard options
OrtTensorRTProviderOptions rtOpts = new OrtTensorRTProviderOptions(0);
Expand Down
2 changes: 1 addition & 1 deletion onnxruntime/test/providers/cuda/cuda_provider_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ ProviderInfo_CUDA& GetProviderInfo_CUDA_Test();

namespace test {
namespace cuda {
TEST(CUDA_EP_Unittest, All) {
TEST(CudaEpUnittest, All) {
onnxruntime::ProviderInfo_CUDA& ep = onnxruntime::GetProviderInfo_CUDA_Test();
ep.TestAll();
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
namespace onnxruntime {
namespace test {

TEST(AllocatorTest, CUDAAllocatorTest) {
TEST(CudaEpAllocatorTest, CUDAAllocatorTest) {
OrtDevice::DeviceId cuda_device_id = 0;

// ensure CUDA device is available.
Expand Down Expand Up @@ -77,7 +77,7 @@ TEST(AllocatorTest, CUDAAllocatorTest) {
}

// test that we fallback to smaller allocations if the growth of the arena exceeds the available memory
TEST(AllocatorTest, CUDAAllocatorFallbackTest) {
TEST(CudaEpAllocatorTest, CUDAAllocatorFallbackTest) {
OrtDevice::DeviceId cuda_device_id = 0;

size_t free = 0;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ using onnxruntime::contrib::attention::AttentionBackend;
namespace onnxruntime {
namespace test {

TEST(AttentionKernelOptionsTest, NonZeroValue) {
TEST(CudaEpAttentionKernelOptionsTest, NonZeroValue) {
{
AttentionKernelOptions options;
int value = static_cast<int>(AttentionBackend::FLASH_ATTENTION) | static_cast<int>(AttentionBackend::EFFICIENT_ATTENTION);
Expand Down Expand Up @@ -156,7 +156,7 @@ TEST(AttentionKernelOptionsTest, NonZeroValue) {
}

// Test all environment variables take effect when option value is 0.
TEST(AttentionKernelOptionsTest, DefaultOptionWithEnvVar) {
TEST(CudaEpAttentionKernelOptionsTest, DefaultOptionWithEnvVar) {
constexpr int value = 0;
ScopedEnvironmentVariables scoped_env_vars{
EnvVarMap{
Expand Down Expand Up @@ -186,7 +186,7 @@ TEST(AttentionKernelOptionsTest, DefaultOptionWithEnvVar) {
}

// Test default min sequence lengths when environment variables are not set.
TEST(AttentionKernelOptionsTest, DefaultMinSeqLens) {
TEST(CudaEpAttentionKernelOptionsTest, DefaultMinSeqLens) {
constexpr int value = 0;
ScopedEnvironmentVariables scoped_env_vars{
EnvVarMap{
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -68,7 +68,7 @@ void ComputeTopKReference(const std::vector<float>& values,
}
}

TEST(TestBeamSearch, TopK) {
TEST(CudaEpTestBeamSearch, TopK) {
int32_t batch_size = 4;
int32_t beam_size = 4;
int32_t vocab_size = 50257;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -230,7 +230,7 @@ void testPrepack(int rows, int columns) {
}

// TODO: code runs on CPU, but this is for sm80 only, maybe enable only when test on sm80
TEST(BlkQ4_GEMM, PrepackSm80Test) {
TEST(CudaEpBlkQ4_GEMM, PrepackSm80Test) {
Status status = onnxruntime::cuda::test::sm80_supported();
if (!status.IsOK()) {
// skip the test if sm80 is not supported
Expand Down Expand Up @@ -263,7 +263,7 @@ TEST(BlkQ4_GEMM, PrepackSm80Test) {
testPrepack<true, false>(256, 256);
}

TEST(BlkQ4_GEMM, Sm80RowBlockingTest) {
TEST(CudaEpBlkQ4_GEMM, Sm80RowBlockingTest) {
Status status = onnxruntime::cuda::test::sm80_supported();
if (!status.IsOK()) {
// skip the test if sm80 is not supported
Expand Down Expand Up @@ -292,7 +292,7 @@ TEST(BlkQ4_GEMM, Sm80RowBlockingTest) {
onnxruntime::cuda::test::run_blkq4_gemm<64, false, false, true>(256, 1024, 576);
}

TEST(BlkQ4_GEMM, Sm80ColBlockingTest) {
TEST(CudaEpBlkQ4_GEMM, Sm80ColBlockingTest) {
Status status = onnxruntime::cuda::test::sm80_supported();
if (!status.IsOK()) {
// skip the test if sm80 is not supported
Expand All @@ -305,7 +305,7 @@ TEST(BlkQ4_GEMM, Sm80ColBlockingTest) {
onnxruntime::cuda::test::run_blkq4_gemm<64, true, false, true>(256, 1024, 576);
}

TEST(BlkQ4_GEMM, Sm80SmallMTest) {
TEST(CudaEpBlkQ4_GEMM, Sm80SmallMTest) {
Status status = onnxruntime::cuda::test::sm80_supported();
if (!status.IsOK()) {
// skip the test if sm80 is not supported
Expand All @@ -326,7 +326,7 @@ TEST(BlkQ4_GEMM, Sm80SmallMTest) {
onnxruntime::cuda::test::run_blkq4_gemm<64, true, true, true>(16, 1024, 576);
}

TEST(BlkQ4_GEMM, Sm80SmallTileKernelTest) {
TEST(CudaEpBlkQ4_GEMM, Sm80SmallTileKernelTest) {
Status status = onnxruntime::cuda::test::sm80_supported();
if (!status.IsOK()) {
// skip the test if sm80 is not supported
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ namespace cuda {
namespace test {
// TODO: Since the "DeferredRelease" has been migrated to CudaStream class,
// we should migrate this test from CudaEP unit test to CudaStream unit test.
TEST(TestDeferredRelease, WithArena) {
TEST(CudaEpTestDeferredRelease, WithArena) {
// Create CUDA EP.
CUDAExecutionProviderInfo info;
CUDAExecutionProvider ep(info);
Expand Down Expand Up @@ -52,7 +52,7 @@ TEST(TestDeferredRelease, WithArena) {
ORT_THROW_IF_ERROR(ep.OnRunEnd(true, run_opts));
}

TEST(TestDeferredRelease, WithoutArena) {
TEST(CudaEpTestDeferredRelease, WithoutArena) {
// Create CUDA EP.
CUDAExecutionProviderInfo info;
CUDAExecutionProvider ep(info);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,7 @@ void TestFillCorrectness(size_t num_elements, TElement value) {
}
} // namespace

TEST(CudaUtilsTest, FillCorrectness) {
TEST(CudaEpUnittest, FillCorrectness) {
TestFillCorrectness<int8_t>(1 << 20, 1);
TestFillCorrectness<int16_t>(1 << 20, 2);
TestFillCorrectness<int32_t>(1 << 20, 3);
Expand Down
12 changes: 6 additions & 6 deletions onnxruntime/test/providers/cuda/test_cases/gemm_options_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ namespace onnxruntime {
namespace cuda {
namespace test {

TEST(CudaGemmOptions, TestDefaultOptions) {
TEST(CudaEpGemmOptions, TestDefaultOptions) {
HalfGemmOptions gemm_options;
ASSERT_FALSE(gemm_options.IsCompute16F());
#if defined(USE_CUDA)
Expand All @@ -22,7 +22,7 @@ TEST(CudaGemmOptions, TestDefaultOptions) {
#endif
}

TEST(CudaGemmOptions, TestCompute16F) {
TEST(CudaEpGemmOptions, TestCompute16F) {
HalfGemmOptions gemm_options;
gemm_options.Initialize(1);
ASSERT_TRUE(gemm_options.IsCompute16F());
Expand All @@ -35,7 +35,7 @@ TEST(CudaGemmOptions, TestCompute16F) {
#endif
}

TEST(CudaGemmOptions, NoReducedPrecision) {
TEST(CudaEpGemmOptions, NoReducedPrecision) {
HalfGemmOptions gemm_options;
gemm_options.Initialize(2);
ASSERT_FALSE(gemm_options.IsCompute16F());
Expand All @@ -48,7 +48,7 @@ TEST(CudaGemmOptions, NoReducedPrecision) {
#endif
}

TEST(CudaGemmOptions, Pedantic) {
TEST(CudaEpGemmOptions, Pedantic) {
HalfGemmOptions gemm_options;
gemm_options.Initialize(4);
ASSERT_FALSE(gemm_options.IsCompute16F());
Expand All @@ -61,7 +61,7 @@ TEST(CudaGemmOptions, Pedantic) {
#endif
}

TEST(CudaGemmOptions, Compute16F_Pedantic) {
TEST(CudaEpGemmOptions, Compute16F_Pedantic) {
HalfGemmOptions gemm_options;
gemm_options.Initialize(5);
ASSERT_TRUE(gemm_options.IsCompute16F());
Expand All @@ -74,7 +74,7 @@ TEST(CudaGemmOptions, Compute16F_Pedantic) {
#endif
}

TEST(CudaGemmOptions, Compute16F_NoReducedPrecision) {
TEST(CudaEpGemmOptions, Compute16F_NoReducedPrecision) {
HalfGemmOptions gemm_options;
gemm_options.Initialize(3);
ASSERT_TRUE(gemm_options.IsCompute16F());
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@ void ComputeTop1Reference(const std::vector<float>& values,
}
}

TEST(TestGreedySearch, TopOne) {
TEST(CudaEpTestGreedySearch, TopOne) {
int32_t batch_size = 4;
int32_t vocab_size = 50257;
int32_t batch_x_vocab = batch_size * vocab_size;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -179,7 +179,7 @@ void TestReduceColumnsToColumn(int m, int n, float relative_error_tolerance = 1e
}
} // namespace

TEST(ReductionFunctionsTest, ReduceRowToScalar) {
TEST(CudaEpReductionFunctionsTest, ReduceRowToScalar) {
TestReduceRowToScalarApis(3);
TestReduceRowToScalarApis(19);
TestReduceRowToScalarApis(123);
Expand All @@ -188,7 +188,7 @@ TEST(ReductionFunctionsTest, ReduceRowToScalar) {
TestReduceRowToScalarApis(941736, 2e-4f);
}

TEST(ReductionFunctionsTest, ReduceRowsToRow) {
TEST(CudaEpReductionFunctionsTest, ReduceRowsToRow) {
for (int m : {3, 193, 2945}) {
for (int n : {3, 193, 2945}) {
TestReduceRowsToRow(m, n, true);
Expand All @@ -197,15 +197,15 @@ TEST(ReductionFunctionsTest, ReduceRowsToRow) {
}
}

TEST(ReductionFunctionsTest, ReduceColumnsToColumn) {
TEST(CudaEpReductionFunctionsTest, ReduceColumnsToColumn) {
for (int m : {3, 193, 2945}) {
for (int n : {3, 193, 2945}) {
TestReduceColumnsToColumn(m, n);
}
}
}

TEST(ReductionFunctionsTest, BufferOffsets) {
TEST(CudaEpReductionFunctionsTest, BufferOffsets) {
const int m = 2048;
const int n = 1024;
const TensorShape shape{m, n};
Expand Down Expand Up @@ -240,7 +240,7 @@ TEST(ReductionFunctionsTest, BufferOffsets) {
}
}

TEST(ReductionFunctionsTest, InvalidBufferSize) {
TEST(CudaEpReductionFunctionsTest, InvalidBufferSize) {
const int m = 2048;
const int n = 1024;
const TensorShape shape{m, n};
Expand All @@ -262,7 +262,7 @@ TEST(ReductionFunctionsTest, InvalidBufferSize) {
ASSERT_FALSE(status.IsOK());
}

TEST(ReductionFunctionsTest, GetApplicableMatrixReduction) {
TEST(CudaEpReductionFunctionsTest, GetApplicableMatrixReduction) {
auto test_get_applicable_matrix_reduction =
[](cudnnReduceTensorOp_t cudnn_op,
const std::vector<int64_t>& dims, const std::vector<int64_t>& axes,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -218,16 +218,32 @@ jobs:
- powershell: |
python3 -m pip uninstall -y onnxruntime onnxruntime-gpu onnxruntime-training onnxruntime-directml -qq
Get-ChildItem -Path dist/*.whl | foreach {pip --disable-pip-version-check install --upgrade $_.fullname}
workingDirectory: '$(Build.BinariesDirectory)\${{ parameters.BuildConfig }}\${{ parameters.BuildConfig }}'
displayName: 'Install onnxruntime wheel'
- ${{ if eq(parameters.RunOnnxRuntimeTests, true) }}:
- powershell: |
python $(Build.SourcesDirectory)\tools\ci_build\build.py --config ${{ parameters.BuildConfig }} --build_dir $(Build.BinariesDirectory) --skip_submodule_sync --build_shared_lib --test --cmake_generator "Visual Studio 17 2022" --build_shared_lib --enable_onnx_tests ${{ parameters.additionalBuildFlags }}
workingDirectory: '$(Build.BinariesDirectory)\${{ parameters.BuildConfig }}\${{ parameters.BuildConfig }}'
displayName: 'Run tests'
- ${{ if and(contains(parameters.additionalBuildFlags, 'use_cuda'), contains(parameters.additionalBuildFlags, 'use_dml')) }}:
- powershell: |
python $(Build.SourcesDirectory)\tools\ci_build\build.py --config ${{ parameters.BuildConfig }} --build_dir $(Build.BinariesDirectory) --skip_submodule_sync --build_shared_lib --test --cmake_generator "Visual Studio 17 2022" --enable_onnx_tests ${{ parameters.additionalBuildFlags }}
workingDirectory: '$(Build.BinariesDirectory)\${{ parameters.BuildConfig }}\${{ parameters.BuildConfig }}'
displayName: 'Run tests excluding CUDA tests'
env:
NO_CUDA_TEST: '1'
GTEST_FILTER: '-CudaEp*:CudaNhwcTypedTest*:*cpu_*models*' # Exclude CUDA EP tests under providers/cuda/ and cpu models test
PATH: '$(Build.BinariesDirectory)\${{ parameters.BuildConfig }}\${{ parameters.BuildConfig }};$(PATH)' # For onnxruntime4j_test to find dependent dlls
- powershell: |
python $(Build.SourcesDirectory)\tools\ci_build\build.py --config ${{ parameters.BuildConfig }} --build_dir $(Build.BinariesDirectory) --skip_submodule_sync --build_shared_lib --test --cmake_generator "Visual Studio 17 2022" --enable_onnx_tests ${{ parameters.additionalBuildFlags }}
workingDirectory: '$(Build.BinariesDirectory)\${{ parameters.BuildConfig }}\${{ parameters.BuildConfig }}'
displayName: 'Run tests excluding DML tests'
env:
NO_DML_TEST: '1'
GTEST_FILTER: '-*cpu_*models*'
PATH: '$(Build.BinariesDirectory)\${{ parameters.BuildConfig }}\${{ parameters.BuildConfig }};$(PATH)'
- ${{ else }}:
- powershell: |
python $(Build.SourcesDirectory)\tools\ci_build\build.py --config ${{ parameters.BuildConfig }} --build_dir $(Build.BinariesDirectory) --skip_submodule_sync --build_shared_lib --test --cmake_generator "Visual Studio 17 2022" --enable_onnx_tests ${{ parameters.additionalBuildFlags }}
workingDirectory: '$(Build.BinariesDirectory)\${{ parameters.BuildConfig }}\${{ parameters.BuildConfig }}'
displayName: 'Run tests'
- ${{ if eq(parameters.GenerateDocumentation, true) }}:
- task: PythonScript@0
Expand Down
2 changes: 1 addition & 1 deletion tools/ci_build/github/azure-pipelines/templates/win-ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -413,7 +413,7 @@ stages:
workingDirectory: '$(Build.BinariesDirectory)'
env:
NO_CUDA_TEST: '1'
GTEST_FILTER: -*CudaNhwcTypedTest*
GTEST_FILTER: '-CudaEp*:CudaNhwcTypedTest*' # Exclude CUDA EP tests under providers/cuda/
- task: PythonScript@0
displayName: 'test excludes DML'
condition: and(succeeded(), eq('${{ parameters.runTests}}', true))
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -62,4 +62,28 @@ stages:
RunOnnxRuntimeTests: ${{ parameters.RunOnnxRuntimeTests }}
ORT_EP_NAME: CUDA
WITH_CACHE: true
MachinePool: onnxruntime-Win2022-GPU-A10
MachinePool: onnxruntime-Win2022-GPU-A10

- stage: cuda_dml
dependsOn: []
jobs:
- template: templates/jobs/win-ci-vs-2022-job.yml
parameters:
BuildConfig: 'RelWithDebInfo'
EnvSetupScript: setup_env_cuda.bat
buildArch: x64
additionalBuildFlags: >-
--build_java --build_nodejs --use_cuda --cuda_home="$(Agent.TempDirectory)\v${{ parameters.CudaVersion }}"
--enable_cuda_profiling --enable_transformers_tool_test
--use_dml
--cmake_extra_defines CMAKE_CUDA_ARCHITECTURES=86
--cmake_extra_defines onnxruntime_BUILD_UNIT_TESTS=ON
--cmake_extra_defines onnxruntime_ENABLE_CUDA_EP_INTERNAL_TESTS=ON
msbuildPlatform: x64
isX86: false
job_name_suffix: x64_RelWithDebInfo
RunOnnxRuntimeTests: ${{ parameters.RunOnnxRuntimeTests }}
ORT_EP_NAME: CUDA
EnablePython: false
WITH_CACHE: true
MachinePool: onnxruntime-Win2022-GPU-A10
Original file line number Diff line number Diff line change
Expand Up @@ -43,11 +43,11 @@ stages:
BuildConfig: 'RelWithDebInfo'
EnvSetupScript: setup_env.bat
buildArch: x64
additionalBuildFlags: --enable_pybind --use_dml --enable_wcos --use_winml
additionalBuildFlags: --enable_pybind --use_dml --enable_wcos --use_winml
msbuildPlatform: x64
isX86: false
job_name_suffix: x64_RelWithDebInfo
RunOnnxRuntimeTests: ${{ parameters.RunOnnxRuntimeTests }}
ORT_EP_NAME: DML
WITH_CACHE: false
MachinePool: onnxruntime-Win2022-GPU-dml-A10
MachinePool: onnxruntime-Win2022-GPU-dml-A10

0 comments on commit 85751e7

Please sign in to comment.