From f67b624791cf01f3c7d16f724d1e9ade8f0f40bd Mon Sep 17 00:00:00 2001 From: Rong Ou Date: Thu, 9 Feb 2023 14:28:30 -0800 Subject: [PATCH 1/6] Fix quantile tests running on multi-gpus --- tests/cpp/common/test_quantile.cu | 39 +++++++++++++++++-------------- 1 file changed, 21 insertions(+), 18 deletions(-) diff --git a/tests/cpp/common/test_quantile.cu b/tests/cpp/common/test_quantile.cu index d3f7dbed0f4b..bd699e8e99b6 100644 --- a/tests/cpp/common/test_quantile.cu +++ b/tests/cpp/common/test_quantile.cu @@ -349,22 +349,24 @@ void TestAllReduceBasic(int32_t n_gpus) { constexpr size_t kRows = 1000, kCols = 100; RunWithSeedsAndBins(kRows, [=](int32_t seed, size_t n_bins, MetaInfo const& info) { + auto const device = collective::GetRank(); + // Set up single node version; - HostDeviceVector ft; - SketchContainer sketch_on_single_node(ft, n_bins, kCols, kRows, 0); + HostDeviceVector ft({}, device); + SketchContainer sketch_on_single_node(ft, n_bins, kCols, kRows, device); size_t intermediate_num_cuts = std::min( kRows * world, static_cast(n_bins * WQSketch::kFactor)); std::vector containers; for (auto rank = 0; rank < world; ++rank) { - HostDeviceVector storage; + HostDeviceVector storage({}, device); std::string interface_str = RandomDataGenerator{kRows, kCols, 0} - .Device(0) + .Device(device) .Seed(rank + seed) .GenerateArrayInterface(&storage); data::CupyAdapter adapter(interface_str); - HostDeviceVector ft; - containers.emplace_back(ft, n_bins, kCols, kRows, 0); + HostDeviceVector ft({}, device); + containers.emplace_back(ft, n_bins, kCols, kRows, device); AdapterDeviceSketch(adapter.Value(), n_bins, info, std::numeric_limits::quiet_NaN(), &containers.back()); @@ -375,16 +377,16 @@ void TestAllReduceBasic(int32_t n_gpus) { sketch_on_single_node.FixError(); } sketch_on_single_node.Unique(); - TestQuantileElemRank(0, sketch_on_single_node.Data(), + TestQuantileElemRank(device, sketch_on_single_node.Data(), sketch_on_single_node.ColumnsPtr(), true); // Set up distributed version. We rely on using rank as seed to generate // the exact same copy of data. 
auto rank = collective::GetRank(); - SketchContainer sketch_distributed(ft, n_bins, kCols, kRows, 0); - HostDeviceVector storage; + SketchContainer sketch_distributed(ft, n_bins, kCols, kRows, device); + HostDeviceVector storage({}, device); std::string interface_str = RandomDataGenerator{kRows, kCols, 0} - .Device(0) + .Device(device) .Seed(rank + seed) .GenerateArrayInterface(&storage); data::CupyAdapter adapter(interface_str); @@ -399,7 +401,7 @@ void TestAllReduceBasic(int32_t n_gpus) { ASSERT_EQ(sketch_distributed.Data().size(), sketch_on_single_node.Data().size()); - TestQuantileElemRank(0, sketch_distributed.Data(), + TestQuantileElemRank(device, sketch_distributed.Data(), sketch_distributed.ColumnsPtr(), true); std::vector single_node_data( @@ -437,12 +439,13 @@ void TestSameOnAllWorkers(int32_t n_gpus) { constexpr size_t kRows = 1000, kCols = 100; RunWithSeedsAndBins(kRows, [=](int32_t seed, size_t n_bins, MetaInfo const &info) { - auto rank = collective::GetRank(); - HostDeviceVector ft; - SketchContainer sketch_distributed(ft, n_bins, kCols, kRows, 0); - HostDeviceVector storage; + auto const rank = collective::GetRank(); + auto const device = rank; + HostDeviceVector ft({}, device); + SketchContainer sketch_distributed(ft, n_bins, kCols, kRows, device); + HostDeviceVector storage({}, device); std::string interface_str = RandomDataGenerator{kRows, kCols, 0} - .Device(0) + .Device(device) .Seed(rank + seed) .GenerateArrayInterface(&storage); data::CupyAdapter adapter(interface_str); @@ -451,7 +454,7 @@ void TestSameOnAllWorkers(int32_t n_gpus) { &sketch_distributed); sketch_distributed.AllReduce(); sketch_distributed.Unique(); - TestQuantileElemRank(0, sketch_distributed.Data(), sketch_distributed.ColumnsPtr(), true); + TestQuantileElemRank(device, sketch_distributed.Data(), sketch_distributed.ColumnsPtr(), true); // Test for all workers having the same sketch. 
size_t n_data = sketch_distributed.Data().size(); @@ -468,7 +471,7 @@ void TestSameOnAllWorkers(int32_t n_gpus) { thrust::copy(thrust::device, local_data.data(), local_data.data() + local_data.size(), all_workers.begin() + local_data.size() * rank); - collective::DeviceCommunicator* communicator = collective::Communicator::GetDevice(0); + collective::DeviceCommunicator* communicator = collective::Communicator::GetDevice(device); communicator->AllReduceSum(all_workers.data().get(), all_workers.size()); communicator->Synchronize(); From 960f09e6c3c7af3d61e7e215e9f0076a86e1664d Mon Sep 17 00:00:00 2001 From: Hyunsu Philip Cho Date: Thu, 9 Feb 2023 22:35:03 -0800 Subject: [PATCH 2/6] Run some gtests with multiple GPUs --- tests/buildkite/pipeline-mgpu.yml | 5 +++++ tests/buildkite/test-cpp-gpu.sh | 2 +- tests/buildkite/test-cpp-mgpu.sh | 13 +++++++++++++ tests/cpp/common/test_quantile.cu | 4 ++-- 4 files changed, 21 insertions(+), 3 deletions(-) create mode 100755 tests/buildkite/test-cpp-mgpu.sh diff --git a/tests/buildkite/pipeline-mgpu.yml b/tests/buildkite/pipeline-mgpu.yml index 75d7855b6dc9..316e78ad8033 100644 --- a/tests/buildkite/pipeline-mgpu.yml +++ b/tests/buildkite/pipeline-mgpu.yml @@ -36,6 +36,11 @@ steps: queue: linux-amd64-mgpu - wait #### -------- TEST -------- + - label: ":console: Run Google Tests" + command: "tests/buildkite/test-cpp-mgpu.sh" + key: test-cpp-mgpu + agents: + queue: linux-amd64-mgpu - label: ":console: Test Python package, 4 GPUs" command: "tests/buildkite/test-python-gpu.sh mgpu" key: test-python-mgpu diff --git a/tests/buildkite/test-cpp-gpu.sh b/tests/buildkite/test-cpp-gpu.sh index f1ddf9d5f5e0..9bfed6864126 100755 --- a/tests/buildkite/test-cpp-gpu.sh +++ b/tests/buildkite/test-cpp-gpu.sh @@ -10,7 +10,7 @@ chmod +x build/testxgboost tests/ci_build/ci_build.sh gpu nvidia-docker \ --build-arg CUDA_VERSION_ARG=$CUDA_VERSION \ --build-arg RAPIDS_VERSION_ARG=$RAPIDS_VERSION \ - build/testxgboost + build/testxgboost --gtest_filter=-*MGPU* # Disabled until https://github.com/dmlc/xgboost/issues/8619 is resolved # echo "--- Run Google Tests with CUDA, using a GPU, RMM enabled" diff --git a/tests/buildkite/test-cpp-mgpu.sh b/tests/buildkite/test-cpp-mgpu.sh new file mode 100755 index 000000000000..d607e16ca306 --- /dev/null +++ b/tests/buildkite/test-cpp-mgpu.sh @@ -0,0 +1,13 @@ +#!/bin/bash + +set -euo pipefail + +source tests/buildkite/conftest.sh + +echo "--- Run Google Tests with CUDA, using multiple GPUs" +buildkite-agent artifact download "build/testxgboost" . 
--step build-cuda +chmod +x build/testxgboost +tests/ci_build/ci_build.sh gpu nvidia-docker \ + --build-arg CUDA_VERSION_ARG=$CUDA_VERSION \ + --build-arg RAPIDS_VERSION_ARG=$RAPIDS_VERSION \ + build/testxgboost --gtest_filter=*MGPU* diff --git a/tests/cpp/common/test_quantile.cu b/tests/cpp/common/test_quantile.cu index bd699e8e99b6..4a71a6a5bf3b 100644 --- a/tests/cpp/common/test_quantile.cu +++ b/tests/cpp/common/test_quantile.cu @@ -422,7 +422,7 @@ void TestAllReduceBasic(int32_t n_gpus) { } } // anonymous namespace -TEST(GPUQuantile, AllReduceBasic) { +TEST(GPUQuantile, MGPU_AllReduceBasic) { auto const n_gpus = AllVisibleGPUs(); RunWithInMemoryCommunicator(n_gpus, TestAllReduceBasic, n_gpus); } @@ -495,7 +495,7 @@ void TestSameOnAllWorkers(int32_t n_gpus) { } } // anonymous namespace -TEST(GPUQuantile, SameOnAllWorkers) { +TEST(GPUQuantile, MGPU_SameOnAllWorkers) { auto const n_gpus = AllVisibleGPUs(); RunWithInMemoryCommunicator(n_gpus, TestSameOnAllWorkers, n_gpus); } From 757c304cdd7cfa4b1a18740b46a1a9cb0456f2cf Mon Sep 17 00:00:00 2001 From: Rong Ou Date: Thu, 9 Feb 2023 23:41:20 -0800 Subject: [PATCH 3/6] fix mgpu test naming --- tests/cpp/common/test_quantile.cu | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/cpp/common/test_quantile.cu b/tests/cpp/common/test_quantile.cu index 4a71a6a5bf3b..13a4e08d5a18 100644 --- a/tests/cpp/common/test_quantile.cu +++ b/tests/cpp/common/test_quantile.cu @@ -422,7 +422,7 @@ void TestAllReduceBasic(int32_t n_gpus) { } } // anonymous namespace -TEST(GPUQuantile, MGPU_AllReduceBasic) { +TEST(GPUQuantile, MGPUAllReduceBasic) { auto const n_gpus = AllVisibleGPUs(); RunWithInMemoryCommunicator(n_gpus, TestAllReduceBasic, n_gpus); } @@ -495,7 +495,7 @@ void TestSameOnAllWorkers(int32_t n_gpus) { } } // anonymous namespace -TEST(GPUQuantile, MGPU_SameOnAllWorkers) { +TEST(GPUQuantile, MGPUSameOnAllWorkers) { auto const n_gpus = AllVisibleGPUs(); RunWithInMemoryCommunicator(n_gpus, TestSameOnAllWorkers, n_gpus); } From 05739ab169debca4deeb5c75fd79df91a100833e Mon Sep 17 00:00:00 2001 From: Hyunsu Philip Cho Date: Fri, 10 Feb 2023 10:47:54 -0800 Subject: [PATCH 4/6] Instruct NCCL to print extra logs --- tests/buildkite/test-cpp-mgpu.sh | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tests/buildkite/test-cpp-mgpu.sh b/tests/buildkite/test-cpp-mgpu.sh index d607e16ca306..86ea0e42635e 100755 --- a/tests/buildkite/test-cpp-mgpu.sh +++ b/tests/buildkite/test-cpp-mgpu.sh @@ -4,6 +4,8 @@ set -euo pipefail source tests/buildkite/conftest.sh +export CI_DOCKER_EXTRA_PARAMS_INIT='-e NCCL_DEBUG=INFO' + echo "--- Run Google Tests with CUDA, using multiple GPUs" buildkite-agent artifact download "build/testxgboost" . 
--step build-cuda chmod +x build/testxgboost From 4e9e6c517182f857582a6cc9b8720381ef0b575c Mon Sep 17 00:00:00 2001 From: Hyunsu Philip Cho Date: Sat, 11 Feb 2023 22:23:57 -0800 Subject: [PATCH 5/6] Allocate extra space in /dev/shm to enable NCCL --- tests/buildkite/test-cpp-mgpu.sh | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tests/buildkite/test-cpp-mgpu.sh b/tests/buildkite/test-cpp-mgpu.sh index 86ea0e42635e..935a301a66f1 100755 --- a/tests/buildkite/test-cpp-mgpu.sh +++ b/tests/buildkite/test-cpp-mgpu.sh @@ -4,7 +4,8 @@ set -euo pipefail source tests/buildkite/conftest.sh -export CI_DOCKER_EXTRA_PARAMS_INIT='-e NCCL_DEBUG=INFO' +# Allocate extra space in /dev/shm to enable NCCL +export CI_DOCKER_EXTRA_PARAMS_INIT='--shm-size=4g' echo "--- Run Google Tests with CUDA, using multiple GPUs" buildkite-agent artifact download "build/testxgboost" . --step build-cuda From 26697707044c306bf4d1cb1dbb859224febc5f14 Mon Sep 17 00:00:00 2001 From: Rong Ou Date: Sun, 12 Feb 2023 16:09:06 -0800 Subject: [PATCH 6/6] use gtest_skip to skip mgpu tests --- tests/buildkite/test-cpp-gpu.sh | 2 +- tests/cpp/common/test_quantile.cu | 18 ++++++------------ 2 files changed, 7 insertions(+), 13 deletions(-) diff --git a/tests/buildkite/test-cpp-gpu.sh b/tests/buildkite/test-cpp-gpu.sh index 9bfed6864126..f1ddf9d5f5e0 100755 --- a/tests/buildkite/test-cpp-gpu.sh +++ b/tests/buildkite/test-cpp-gpu.sh @@ -10,7 +10,7 @@ chmod +x build/testxgboost tests/ci_build/ci_build.sh gpu nvidia-docker \ --build-arg CUDA_VERSION_ARG=$CUDA_VERSION \ --build-arg RAPIDS_VERSION_ARG=$RAPIDS_VERSION \ - build/testxgboost --gtest_filter=-*MGPU* + build/testxgboost # Disabled until https://github.com/dmlc/xgboost/issues/8619 is resolved # echo "--- Run Google Tests with CUDA, using a GPU, RMM enabled" diff --git a/tests/cpp/common/test_quantile.cu b/tests/cpp/common/test_quantile.cu index 13a4e08d5a18..cb24f8bb4140 100644 --- a/tests/cpp/common/test_quantile.cu +++ b/tests/cpp/common/test_quantile.cu @@ -341,12 +341,6 @@ TEST(GPUQuantile, MultiMerge) { namespace { void TestAllReduceBasic(int32_t n_gpus) { auto const world = collective::GetWorldSize(); - if (world != 1) { - ASSERT_EQ(world, n_gpus); - } else { - return; - } - constexpr size_t kRows = 1000, kCols = 100; RunWithSeedsAndBins(kRows, [=](int32_t seed, size_t n_bins, MetaInfo const& info) { auto const device = collective::GetRank(); @@ -424,18 +418,15 @@ void TestAllReduceBasic(int32_t n_gpus) { TEST(GPUQuantile, MGPUAllReduceBasic) { auto const n_gpus = AllVisibleGPUs(); + if (n_gpus <= 1) { + GTEST_SKIP() << "Skipping MGPUAllReduceBasic test with # GPUs = " << n_gpus; + } RunWithInMemoryCommunicator(n_gpus, TestAllReduceBasic, n_gpus); } namespace { void TestSameOnAllWorkers(int32_t n_gpus) { auto world = collective::GetWorldSize(); - if (world != 1) { - ASSERT_EQ(world, n_gpus); - } else { - return; - } - constexpr size_t kRows = 1000, kCols = 100; RunWithSeedsAndBins(kRows, [=](int32_t seed, size_t n_bins, MetaInfo const &info) { @@ -497,6 +488,9 @@ void TestSameOnAllWorkers(int32_t n_gpus) { TEST(GPUQuantile, MGPUSameOnAllWorkers) { auto const n_gpus = AllVisibleGPUs(); + if (n_gpus <= 1) { + GTEST_SKIP() << "Skipping MGPUSameOnAllWorkers test with # GPUs = " << n_gpus; + } RunWithInMemoryCommunicator(n_gpus, TestSameOnAllWorkers, n_gpus); }
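
For a quick local check outside the Buildkite containers, the multi-GPU tests touched by this series can be exercised directly. This is only a sketch, assuming the gtest binary has already been built at build/testxgboost and that at least two GPUs are visible; with the GTEST_SKIP guard from the last patch in place the filter is optional, since the tests now skip themselves on a single-GPU machine.

# Optional: extra NCCL logging, as enabled temporarily in patch 4
export NCCL_DEBUG=INFO
# Run only the multi-GPU quantile tests; they skip themselves when fewer than two GPUs are available
./build/testxgboost --gtest_filter='GPUQuantile.MGPU*'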