From f67b624791cf01f3c7d16f724d1e9ade8f0f40bd Mon Sep 17 00:00:00 2001 From: Rong Ou Date: Thu, 9 Feb 2023 14:28:30 -0800 Subject: [PATCH 1/6] Fix quantile tests running on multi-gpus --- tests/cpp/common/test_quantile.cu | 39 +++++++++++++++++-------------- 1 file changed, 21 insertions(+), 18 deletions(-) diff --git a/tests/cpp/common/test_quantile.cu b/tests/cpp/common/test_quantile.cu index d3f7dbed0f4b..bd699e8e99b6 100644 --- a/tests/cpp/common/test_quantile.cu +++ b/tests/cpp/common/test_quantile.cu @@ -349,22 +349,24 @@ void TestAllReduceBasic(int32_t n_gpus) { constexpr size_t kRows = 1000, kCols = 100; RunWithSeedsAndBins(kRows, [=](int32_t seed, size_t n_bins, MetaInfo const& info) { + auto const device = collective::GetRank(); + // Set up single node version; - HostDeviceVector ft; - SketchContainer sketch_on_single_node(ft, n_bins, kCols, kRows, 0); + HostDeviceVector ft({}, device); + SketchContainer sketch_on_single_node(ft, n_bins, kCols, kRows, device); size_t intermediate_num_cuts = std::min( kRows * world, static_cast(n_bins * WQSketch::kFactor)); std::vector containers; for (auto rank = 0; rank < world; ++rank) { - HostDeviceVector storage; + HostDeviceVector storage({}, device); std::string interface_str = RandomDataGenerator{kRows, kCols, 0} - .Device(0) + .Device(device) .Seed(rank + seed) .GenerateArrayInterface(&storage); data::CupyAdapter adapter(interface_str); - HostDeviceVector ft; - containers.emplace_back(ft, n_bins, kCols, kRows, 0); + HostDeviceVector ft({}, device); + containers.emplace_back(ft, n_bins, kCols, kRows, device); AdapterDeviceSketch(adapter.Value(), n_bins, info, std::numeric_limits::quiet_NaN(), &containers.back()); @@ -375,16 +377,16 @@ void TestAllReduceBasic(int32_t n_gpus) { sketch_on_single_node.FixError(); } sketch_on_single_node.Unique(); - TestQuantileElemRank(0, sketch_on_single_node.Data(), + TestQuantileElemRank(device, sketch_on_single_node.Data(), sketch_on_single_node.ColumnsPtr(), true); // Set up distributed version. We rely on using rank as seed to generate // the exact same copy of data. 
auto rank = collective::GetRank(); - SketchContainer sketch_distributed(ft, n_bins, kCols, kRows, 0); - HostDeviceVector storage; + SketchContainer sketch_distributed(ft, n_bins, kCols, kRows, device); + HostDeviceVector storage({}, device); std::string interface_str = RandomDataGenerator{kRows, kCols, 0} - .Device(0) + .Device(device) .Seed(rank + seed) .GenerateArrayInterface(&storage); data::CupyAdapter adapter(interface_str); @@ -399,7 +401,7 @@ void TestAllReduceBasic(int32_t n_gpus) { ASSERT_EQ(sketch_distributed.Data().size(), sketch_on_single_node.Data().size()); - TestQuantileElemRank(0, sketch_distributed.Data(), + TestQuantileElemRank(device, sketch_distributed.Data(), sketch_distributed.ColumnsPtr(), true); std::vector single_node_data( @@ -437,12 +439,13 @@ void TestSameOnAllWorkers(int32_t n_gpus) { constexpr size_t kRows = 1000, kCols = 100; RunWithSeedsAndBins(kRows, [=](int32_t seed, size_t n_bins, MetaInfo const &info) { - auto rank = collective::GetRank(); - HostDeviceVector ft; - SketchContainer sketch_distributed(ft, n_bins, kCols, kRows, 0); - HostDeviceVector storage; + auto const rank = collective::GetRank(); + auto const device = rank; + HostDeviceVector ft({}, device); + SketchContainer sketch_distributed(ft, n_bins, kCols, kRows, device); + HostDeviceVector storage({}, device); std::string interface_str = RandomDataGenerator{kRows, kCols, 0} - .Device(0) + .Device(device) .Seed(rank + seed) .GenerateArrayInterface(&storage); data::CupyAdapter adapter(interface_str); @@ -451,7 +454,7 @@ void TestSameOnAllWorkers(int32_t n_gpus) { &sketch_distributed); sketch_distributed.AllReduce(); sketch_distributed.Unique(); - TestQuantileElemRank(0, sketch_distributed.Data(), sketch_distributed.ColumnsPtr(), true); + TestQuantileElemRank(device, sketch_distributed.Data(), sketch_distributed.ColumnsPtr(), true); // Test for all workers having the same sketch. 
size_t n_data = sketch_distributed.Data().size(); @@ -468,7 +471,7 @@ void TestSameOnAllWorkers(int32_t n_gpus) { thrust::copy(thrust::device, local_data.data(), local_data.data() + local_data.size(), all_workers.begin() + local_data.size() * rank); - collective::DeviceCommunicator* communicator = collective::Communicator::GetDevice(0); + collective::DeviceCommunicator* communicator = collective::Communicator::GetDevice(device); communicator->AllReduceSum(all_workers.data().get(), all_workers.size()); communicator->Synchronize(); From 960f09e6c3c7af3d61e7e215e9f0076a86e1664d Mon Sep 17 00:00:00 2001 From: Hyunsu Philip Cho Date: Thu, 9 Feb 2023 22:35:03 -0800 Subject: [PATCH 2/6] Run some gtests with multiple GPUs --- tests/buildkite/pipeline-mgpu.yml | 5 +++++ tests/buildkite/test-cpp-gpu.sh | 2 +- tests/buildkite/test-cpp-mgpu.sh | 13 +++++++++++++ tests/cpp/common/test_quantile.cu | 4 ++-- 4 files changed, 21 insertions(+), 3 deletions(-) create mode 100755 tests/buildkite/test-cpp-mgpu.sh diff --git a/tests/buildkite/pipeline-mgpu.yml b/tests/buildkite/pipeline-mgpu.yml index 75d7855b6dc9..316e78ad8033 100644 --- a/tests/buildkite/pipeline-mgpu.yml +++ b/tests/buildkite/pipeline-mgpu.yml @@ -36,6 +36,11 @@ steps: queue: linux-amd64-mgpu - wait #### -------- TEST -------- + - label: ":console: Run Google Tests" + command: "tests/buildkite/test-cpp-mgpu.sh" + key: test-cpp-mgpu + agents: + queue: linux-amd64-mgpu - label: ":console: Test Python package, 4 GPUs" command: "tests/buildkite/test-python-gpu.sh mgpu" key: test-python-mgpu diff --git a/tests/buildkite/test-cpp-gpu.sh b/tests/buildkite/test-cpp-gpu.sh index f1ddf9d5f5e0..9bfed6864126 100755 --- a/tests/buildkite/test-cpp-gpu.sh +++ b/tests/buildkite/test-cpp-gpu.sh @@ -10,7 +10,7 @@ chmod +x build/testxgboost tests/ci_build/ci_build.sh gpu nvidia-docker \ --build-arg CUDA_VERSION_ARG=$CUDA_VERSION \ --build-arg RAPIDS_VERSION_ARG=$RAPIDS_VERSION \ - build/testxgboost + build/testxgboost --gtest_filter=-*MGPU* # Disabled until https://github.com/dmlc/xgboost/issues/8619 is resolved # echo "--- Run Google Tests with CUDA, using a GPU, RMM enabled" diff --git a/tests/buildkite/test-cpp-mgpu.sh b/tests/buildkite/test-cpp-mgpu.sh new file mode 100755 index 000000000000..d607e16ca306 --- /dev/null +++ b/tests/buildkite/test-cpp-mgpu.sh @@ -0,0 +1,13 @@ +#!/bin/bash + +set -euo pipefail + +source tests/buildkite/conftest.sh + +echo "--- Run Google Tests with CUDA, using multiple GPUs" +buildkite-agent artifact download "build/testxgboost" . 
--step build-cuda +chmod +x build/testxgboost +tests/ci_build/ci_build.sh gpu nvidia-docker \ + --build-arg CUDA_VERSION_ARG=$CUDA_VERSION \ + --build-arg RAPIDS_VERSION_ARG=$RAPIDS_VERSION \ + build/testxgboost --gtest_filter=*MGPU* diff --git a/tests/cpp/common/test_quantile.cu b/tests/cpp/common/test_quantile.cu index bd699e8e99b6..4a71a6a5bf3b 100644 --- a/tests/cpp/common/test_quantile.cu +++ b/tests/cpp/common/test_quantile.cu @@ -422,7 +422,7 @@ void TestAllReduceBasic(int32_t n_gpus) { } } // anonymous namespace -TEST(GPUQuantile, AllReduceBasic) { +TEST(GPUQuantile, MGPU_AllReduceBasic) { auto const n_gpus = AllVisibleGPUs(); RunWithInMemoryCommunicator(n_gpus, TestAllReduceBasic, n_gpus); } @@ -495,7 +495,7 @@ void TestSameOnAllWorkers(int32_t n_gpus) { } } // anonymous namespace -TEST(GPUQuantile, SameOnAllWorkers) { +TEST(GPUQuantile, MGPU_SameOnAllWorkers) { auto const n_gpus = AllVisibleGPUs(); RunWithInMemoryCommunicator(n_gpus, TestSameOnAllWorkers, n_gpus); } From 757c304cdd7cfa4b1a18740b46a1a9cb0456f2cf Mon Sep 17 00:00:00 2001 From: Rong Ou Date: Thu, 9 Feb 2023 23:41:20 -0800 Subject: [PATCH 3/6] fix mgpu test naming --- tests/cpp/common/test_quantile.cu | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/cpp/common/test_quantile.cu b/tests/cpp/common/test_quantile.cu index 4a71a6a5bf3b..13a4e08d5a18 100644 --- a/tests/cpp/common/test_quantile.cu +++ b/tests/cpp/common/test_quantile.cu @@ -422,7 +422,7 @@ void TestAllReduceBasic(int32_t n_gpus) { } } // anonymous namespace -TEST(GPUQuantile, MGPU_AllReduceBasic) { +TEST(GPUQuantile, MGPUAllReduceBasic) { auto const n_gpus = AllVisibleGPUs(); RunWithInMemoryCommunicator(n_gpus, TestAllReduceBasic, n_gpus); } @@ -495,7 +495,7 @@ void TestSameOnAllWorkers(int32_t n_gpus) { } } // anonymous namespace -TEST(GPUQuantile, MGPU_SameOnAllWorkers) { +TEST(GPUQuantile, MGPUSameOnAllWorkers) { auto const n_gpus = AllVisibleGPUs(); RunWithInMemoryCommunicator(n_gpus, TestSameOnAllWorkers, n_gpus); } From 05739ab169debca4deeb5c75fd79df91a100833e Mon Sep 17 00:00:00 2001 From: Hyunsu Philip Cho Date: Fri, 10 Feb 2023 10:47:54 -0800 Subject: [PATCH 4/6] Instruct NCCL to print extra logs --- tests/buildkite/test-cpp-mgpu.sh | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tests/buildkite/test-cpp-mgpu.sh b/tests/buildkite/test-cpp-mgpu.sh index d607e16ca306..86ea0e42635e 100755 --- a/tests/buildkite/test-cpp-mgpu.sh +++ b/tests/buildkite/test-cpp-mgpu.sh @@ -4,6 +4,8 @@ set -euo pipefail source tests/buildkite/conftest.sh +export CI_DOCKER_EXTRA_PARAMS_INIT='-e NCCL_DEBUG=INFO' + echo "--- Run Google Tests with CUDA, using multiple GPUs" buildkite-agent artifact download "build/testxgboost" . 
--step build-cuda chmod +x build/testxgboost From 4e9e6c517182f857582a6cc9b8720381ef0b575c Mon Sep 17 00:00:00 2001 From: Hyunsu Philip Cho Date: Sat, 11 Feb 2023 22:23:57 -0800 Subject: [PATCH 5/6] Allocate extra space in /dev/shm to enable NCCL --- tests/buildkite/test-cpp-mgpu.sh | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tests/buildkite/test-cpp-mgpu.sh b/tests/buildkite/test-cpp-mgpu.sh index 86ea0e42635e..935a301a66f1 100755 --- a/tests/buildkite/test-cpp-mgpu.sh +++ b/tests/buildkite/test-cpp-mgpu.sh @@ -4,7 +4,8 @@ set -euo pipefail source tests/buildkite/conftest.sh -export CI_DOCKER_EXTRA_PARAMS_INIT='-e NCCL_DEBUG=INFO' +# Allocate extra space in /dev/shm to enable NCCL +export CI_DOCKER_EXTRA_PARAMS_INIT='--shm-size=4g' echo "--- Run Google Tests with CUDA, using multiple GPUs" buildkite-agent artifact download "build/testxgboost" . --step build-cuda From 26697707044c306bf4d1cb1dbb859224febc5f14 Mon Sep 17 00:00:00 2001 From: Rong Ou Date: Sun, 12 Feb 2023 16:09:06 -0800 Subject: [PATCH 6/6] use gtest_skip to skip mgpu tests --- tests/buildkite/test-cpp-gpu.sh | 2 +- tests/cpp/common/test_quantile.cu | 18 ++++++------------ 2 files changed, 7 insertions(+), 13 deletions(-) diff --git a/tests/buildkite/test-cpp-gpu.sh b/tests/buildkite/test-cpp-gpu.sh index 9bfed6864126..f1ddf9d5f5e0 100755 --- a/tests/buildkite/test-cpp-gpu.sh +++ b/tests/buildkite/test-cpp-gpu.sh @@ -10,7 +10,7 @@ chmod +x build/testxgboost tests/ci_build/ci_build.sh gpu nvidia-docker \ --build-arg CUDA_VERSION_ARG=$CUDA_VERSION \ --build-arg RAPIDS_VERSION_ARG=$RAPIDS_VERSION \ - build/testxgboost --gtest_filter=-*MGPU* + build/testxgboost # Disabled until https://github.com/dmlc/xgboost/issues/8619 is resolved # echo "--- Run Google Tests with CUDA, using a GPU, RMM enabled" diff --git a/tests/cpp/common/test_quantile.cu b/tests/cpp/common/test_quantile.cu index 13a4e08d5a18..cb24f8bb4140 100644 --- a/tests/cpp/common/test_quantile.cu +++ b/tests/cpp/common/test_quantile.cu @@ -341,12 +341,6 @@ TEST(GPUQuantile, MultiMerge) { namespace { void TestAllReduceBasic(int32_t n_gpus) { auto const world = collective::GetWorldSize(); - if (world != 1) { - ASSERT_EQ(world, n_gpus); - } else { - return; - } - constexpr size_t kRows = 1000, kCols = 100; RunWithSeedsAndBins(kRows, [=](int32_t seed, size_t n_bins, MetaInfo const& info) { auto const device = collective::GetRank(); @@ -424,18 +418,15 @@ void TestAllReduceBasic(int32_t n_gpus) { TEST(GPUQuantile, MGPUAllReduceBasic) { auto const n_gpus = AllVisibleGPUs(); + if (n_gpus <= 1) { + GTEST_SKIP() << "Skipping MGPUAllReduceBasic test with # GPUs = " << n_gpus; + } RunWithInMemoryCommunicator(n_gpus, TestAllReduceBasic, n_gpus); } namespace { void TestSameOnAllWorkers(int32_t n_gpus) { auto world = collective::GetWorldSize(); - if (world != 1) { - ASSERT_EQ(world, n_gpus); - } else { - return; - } - constexpr size_t kRows = 1000, kCols = 100; RunWithSeedsAndBins(kRows, [=](int32_t seed, size_t n_bins, MetaInfo const &info) { @@ -497,6 +488,9 @@ void TestSameOnAllWorkers(int32_t n_gpus) { TEST(GPUQuantile, MGPUSameOnAllWorkers) { auto const n_gpus = AllVisibleGPUs(); + if (n_gpus <= 1) { + GTEST_SKIP() << "Skipping MGPUSameOnAllWorkers test with # GPUs = " << n_gpus; + } RunWithInMemoryCommunicator(n_gpus, TestSameOnAllWorkers, n_gpus); }
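
For a quick local check outside the Buildkite containers, the multi-GPU tests touched by this series can be exercised directly. This is only a sketch, assuming the gtest binary has already been built at build/testxgboost and that at least two GPUs are visible; with the GTEST_SKIP guard from the last patch in place the filter is optional, since the tests now skip themselves on a single-GPU machine.

# Optional: extra NCCL logging, as enabled temporarily in patch 4
export NCCL_DEBUG=INFO
# Run only the multi-GPU quantile tests; they skip themselves when fewer than two GPUs are available
./build/testxgboost --gtest_filter='GPUQuantile.MGPU*'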