ROCm · saurabhAMD · Sep 4, 2024 · Sep 4, 2024 · corey-derochie-amd · Sep 13, 2024
@@ -193,7 +193,7 @@ namespace RcclUnitTesting
     for (int isMultiProcess : testBed.ev.GetIsMultiProcessList())
     {
       int const numProcesses = isMultiProcess ? totalRanks : 1;
-      testBed.InitComms(TestBed::GetDeviceIdsList(numProcesses, totalRanks));
+      testBed.InitComms(testBed.GetDeviceIdsList(numProcesses, totalRanks));
 
       for (int dataIdx = 0; dataIdx < dataTypes.size() && isCorrect; ++dataIdx)
       {

@@ -73,7 +73,7 @@ namespace RcclUnitTesting
     for (int isMultiProcess : testBed.ev.GetIsMultiProcessList())
     {
       int const numProcesses = isMultiProcess ? totalRanks : 1;
-      testBed.InitComms(TestBed::GetDeviceIdsList(numProcesses, totalRanks));
+      testBed.InitComms(testBed.GetDeviceIdsList(numProcesses, totalRanks));
 
       // Prepare AllToAllV options
       std::vector<size_t> numInputElements;
@@ -130,7 +130,7 @@ namespace RcclUnitTesting
     for (int isMultiProcess : testBed.ev.GetIsMultiProcessList())
     {
       int const numProcesses = isMultiProcess ? totalRanks : 1;
-      testBed.InitComms(TestBed::GetDeviceIdsList(numProcesses, totalRanks));
+      testBed.InitComms(testBed.GetDeviceIdsList(numProcesses, totalRanks));
 
       // Prepare AllToAllV options
       std::vector<size_t> numInputElements;

@@ -28,7 +28,7 @@ namespace RcclUnitTesting
     {
       // Test either single process all GPUs, or 1 process per GPU
       int const numProcesses = isMultiProcess ? totalRanks : 1;
-      testBed.InitComms(TestBed::GetDeviceIdsList(numProcesses, totalRanks), numCollPerGroup);
+      testBed.InitComms(testBed.GetDeviceIdsList(numProcesses, totalRanks), numCollPerGroup);
 
       if (testBed.ev.showNames)
         INFO("%s %d-ranks GroupCall Identical\n", isMultiProcess ? "MP" : "SP", totalRanks);
@@ -84,7 +84,7 @@ namespace RcclUnitTesting
     {
       // Test either single process all GPUs, or 1 process per GPU
       int const numProcesses = isMultiProcess ? totalRanks : 1;
-      testBed.InitComms(TestBed::GetDeviceIdsList(numProcesses, totalRanks), numCollPerGroup);
+      testBed.InitComms(testBed.GetDeviceIdsList(numProcesses, totalRanks), numCollPerGroup);
 
       if (testBed.ev.showNames)
         INFO("%s %d-ranks GroupCall Different\n", isMultiProcess ? "MP" : "SP", totalRanks);
@@ -139,7 +139,7 @@ namespace RcclUnitTesting
     {
       // Test either single process all GPUs, or 1 process per GPU
       int const numProcesses = isMultiProcess ? totalRanks : 1;
-      testBed.InitComms(TestBed::GetDeviceIdsList(numProcesses, totalRanks), numCollPerGroup);
+      testBed.InitComms(testBed.GetDeviceIdsList(numProcesses, totalRanks), numCollPerGroup);
 
       if (testBed.ev.showNames)
         INFO("%s %d-ranks GroupCall MixedDayaType\n", isMultiProcess ? "MP" : "SP", totalRanks);
@@ -194,7 +194,7 @@ namespace RcclUnitTesting
             INFO("%s %d-ranks Multistream %d-Group Calls across %d streams\n",
                  isMultiProcess ? "MP" : "SP", totalRanks, numCollPerGroup, numStreamsPerGroup);
 
-          testBed.InitComms(TestBed::GetDeviceIdsList(numProcesses, totalRanks),
+          testBed.InitComms(testBed.GetDeviceIdsList(numProcesses, totalRanks),
                             numCollPerGroup, numStreamsPerGroup);
 
           // Set up each collective in group in different stream (modulo numStreamsPerGroup)
@@ -244,7 +244,7 @@ namespace RcclUnitTesting
       int const numProcesses     = isMultiProcess ? totalRanks : 1;
 
       // Initialize comms by specifying the # of group calls
-      testBed.InitComms(TestBed::GetDeviceIdsList(numProcesses, totalRanks), numCollsPerGroup, numStreamsPerGroup, numGroupCalls, useBlocking);
+      testBed.InitComms(testBed.GetDeviceIdsList(numProcesses, totalRanks), numCollsPerGroup, numStreamsPerGroup, numGroupCalls, useBlocking);
 
       if (testBed.ev.showNames)
         INFO("%s %d-ranks GroupCall MultiGroupCall\n", isMultiProcess ? "MP" : "SP", totalRanks);

@@ -34,7 +34,7 @@ namespace RcclUnitTesting
     {
       int const numProcesses = isMultiProcess ? totalRanks : 1;
       // Initialize communicators in non-blocking mode
-      testBed.InitComms(TestBed::GetDeviceIdsList(numProcesses, totalRanks), 1, 1, 1, useBlocking);
+      testBed.InitComms(testBed.GetDeviceIdsList(numProcesses, totalRanks), 1, 1, 1, useBlocking);
 
       // Loop over various collective functions
       for (auto funcType : funcTypes)

@@ -28,7 +28,7 @@ namespace RcclUnitTesting
       int ranksPerGpu = rpg == 0 ? 1 : testBed.ev.maxRanksPerGpu;
       int totalRanks = numGpus * ranksPerGpu;
       int const numProcesses = isMultiProcess ? numGpus : 1;
-      testBed.InitComms(TestBed::GetDeviceIdsList(numProcesses, numGpus, ranksPerGpu), 1);
+      testBed.InitComms(testBed.GetDeviceIdsList(numProcesses, numGpus, ranksPerGpu), 1);
 
       for (int dataIdx = 0; dataIdx < dataTypes.size() && isCorrect; ++dataIdx)
       for (int numIdx = 0; numIdx < numElements.size() && isCorrect; ++numIdx)
@@ -106,7 +106,7 @@ namespace RcclUnitTesting
       int ranksPerGpu = rpg == 0 ? 1 : testBed.ev.maxRanksPerGpu;
       int totalRanks = numGpus * ranksPerGpu;
       int const numProcesses = isMultiProcess ? numGpus : 1;
-      testBed.InitComms(TestBed::GetDeviceIdsList(numProcesses, numGpus, ranksPerGpu), 1);
+      testBed.InitComms(testBed.GetDeviceIdsList(numProcesses, numGpus, ranksPerGpu), 1);
 
       for (int dataIdx = 0; dataIdx < dataTypes.size() && isCorrect; ++dataIdx)
       for (int numIdx = 0; numIdx < numElements.size() && isCorrect; ++numIdx)

@@ -9,6 +9,8 @@
 #include <cstdlib>
 #include <unistd.h>
 #include <sys/wait.h>
+#include <iostream>
+#include <unordered_map>
 
 namespace RcclUnitTesting
 {
@@ -88,6 +90,117 @@ namespace RcclUnitTesting
     return TEST_SUCCESS;
   }
 
+
+  // Runs `cmd` via popen() in read mode and returns everything read from the
+  // resulting stream as a single string.
+  // Throws std::runtime_error when popen() fails. The stream is closed with
+  // pclose() on every exit path; the command's exit status is not inspected.
+  std::string execCommand(const char* cmd) {
+    std::unique_ptr<FILE, decltype(&pclose)> stream(popen(cmd, "r"), pclose);
+    if (stream == nullptr) {
+        throw std::runtime_error("popen() failed!");
+    }
+
+    std::string output;
+    std::array<char, 128> chunk;
+    for (;;) {
+        // fgets() NUL-terminates each chunk, so it can be appended as a C string
+        if (fgets(chunk.data(), chunk.size(), stream.get()) == nullptr) break;
+        output.append(chunk.data());
+    }
+    return output;
+  }
+
+
+  // Computes a GPU ordering in which GPUs sharing the same "Unique ID" (as printed by
+  // `rocm-smi --showuniqueid`) are listed together, with IDs shared by more GPUs first.
+  // IDs with equal GPU counts are ordered by their lowest GPU index so the result is
+  // deterministic.
+  //
+  // The rocm-smi query and parsing run in a forked child process, which sends the
+  // resulting indices back to the parent through a pipe.
+  //
+  // gpuPriorityOrder: must be pre-sized by the caller to the number of GPUs expected.
+  //                   It is overwritten only when the child delivers exactly that many
+  //                   indices and they form a permutation of [0, size); on any failure
+  //                   the caller's existing contents are left untouched.
+  // Returns TEST_SUCCESS on success, TEST_FAIL otherwise.
+  int getDevicePriority (std::vector<int> *gpuPriorityOrder){
+    if (gpuPriorityOrder == nullptr) {
+      ERROR("No output vector provided for getting the device priority vector.\n");
+      return TEST_FAIL;
+    }
+    size_t const numExpected = gpuPriorityOrder->size();
+    size_t const numBytes    = numExpected * sizeof(int);
+
+    // Prepare the pipe the child uses to send its result back to the parent
+    int pipefd[2];
+    if (pipe(pipefd) == -1) {
+      ERROR("Unable to create parent->child pipe for getting the device priority vector.\n");
+      return TEST_FAIL;
+    }
+    pid_t pid = fork();
+    if (pid < 0) {
+      ERROR("Unable to fork child process for getting the device priority vector.\n");
+      close(pipefd[0]);
+      close(pipefd[1]);
+      return TEST_FAIL;
+    }
+
+    if (0 == pid) {
+      // Child process. It must never return into the caller (that would leave two
+      // processes executing the rest of the program), so every path ends in _exit().
+      // _exit() is used instead of exit() so the child does not run exit handlers or
+      // flush stdio buffers inherited from the parent.
+      close(pipefd[0]);
+      std::vector<int> result;
+      try {
+          std::string log = execCommand("rocm-smi --showuniqueid");
+          std::unordered_map<std::string, std::vector<int>> uniqueIdToGpuIndexes;
+          std::string::size_type pos = 0;
+
+          // Pair every "GPU[<index>" occurrence with the next "Unique ID:" value after it
+          while ((pos = log.find("GPU[", pos)) != std::string::npos) {
+              int gpuIndex = std::stoi(log.substr(pos + 4));
+              std::string::size_type idPos = log.find("Unique ID:", pos);
+              if (idPos == std::string::npos) break;
+              // Skip over "Unique ID:" plus the following separator character (11 chars)
+              std::string::size_type idEnd = log.find_first_of(" \n", idPos + 11);
+              std::string uniqueId = log.substr(idPos + 11, idEnd - (idPos + 11));
+              uniqueIdToGpuIndexes[uniqueId].push_back(gpuIndex);
+              pos = log.find('\n', pos);
+          }
+
+          // Sort unique IDs by the number of associated GPUs (most first). Ties are broken
+          // by the lowest GPU index, because unordered_map iteration order is unspecified.
+          std::vector<std::pair<std::string, std::vector<int>>> sortedIds(uniqueIdToGpuIndexes.begin(), uniqueIdToGpuIndexes.end());
+          std::sort(sortedIds.begin(), sortedIds.end(), [](const auto& a, const auto& b) {
+              if (a.second.size() != b.second.size()) return a.second.size() > b.second.size();
+              return a.second.front() < b.second.front();
+          });
+
+          for (const auto& pair : sortedIds) {
+              result.insert(result.end(), pair.second.begin(), pair.second.end());
+          }
+      } catch (const std::exception& e) {
+          std::cerr << "Error: " << e.what() << std::endl;
+          _exit(EXIT_FAILURE);
+      }
+
+      // Only send the result when it has exactly the number of entries the parent
+      // expects; writing numBytes from a shorter vector would read out of bounds.
+      bool const sent = (result.size() == numExpected) &&
+                        (write(pipefd[1], result.data(), numBytes) == static_cast<ssize_t>(numBytes));
+      close(pipefd[1]);
+      _exit(sent ? EXIT_SUCCESS : EXIT_FAILURE);
+    }
+
+    // Parent process. Close the write end first so that read() reports end-of-file
+    // (instead of blocking forever) if the child exits without sending anything.
+    close(pipefd[1]);
+
+    // Read into a scratch buffer so a failed or partial transfer cannot clobber the caller's vector
+    std::vector<int> received(numExpected);
+    char* const dst = reinterpret_cast<char*>(received.data());
+    size_t bytesRead = 0;
+    while (bytesRead < numBytes) {
+      ssize_t const n = read(pipefd[0], dst + bytesRead, numBytes - bytesRead);
+      if (n <= 0) break;
+      bytesRead += static_cast<size_t>(n);
+    }
+    close(pipefd[0]);
+
+    // Always reap the child, even if the read failed
+    int status = 0;
+    bool const childOk = (waitpid(pid, &status, 0) == pid) && WIFEXITED(status) && (WEXITSTATUS(status) == 0);
+    if (!childOk || bytesRead != numBytes) {
+      ERROR("Unable to retrieve the device priority vector from child process.\n");
+      return TEST_FAIL;
+    }
+
+    // The values are used as device indices, so accept them only if they form a permutation of [0, numExpected)
+    std::vector<bool> seen(numExpected, false);
+    for (int const gpuIndex : received) {
+      if (gpuIndex < 0 || static_cast<size_t>(gpuIndex) >= numExpected || seen[gpuIndex]) {
+        ERROR("Device priority vector reported by child process is not a valid GPU ordering.\n");
+        return TEST_FAIL;
+      }
+      seen[gpuIndex] = true;
+    }
+
+    gpuPriorityOrder->swap(received);
+    return TEST_SUCCESS;
+  }
+
+
+  // Reports through *cpxMode whether the output of `rocm-smi --showcomputepartition`
+  // contains the text "CPX".
+  //
+  // The rocm-smi query runs in a forked child process, which sends the boolean back to
+  // the parent through a pipe.
+  //
+  // cpxMode: output flag; written only on success, left untouched on any failure.
+  // Returns TEST_SUCCESS on success, TEST_FAIL otherwise.
+  int getDeviceMode (bool *cpxMode){
+    if (cpxMode == nullptr)
+    {
+      ERROR("No output flag provided for getting the device mode\n");
+      return TEST_FAIL;
+    }
+
+    // Prepare the pipe the child uses to send its result back to the parent
+    int pipefd[2];
+    if (pipe(pipefd) == -1)
+    {
+      ERROR("Unable to create parent->child pipe for getting the device mode\n");
+      return TEST_FAIL;
+    }
+    pid_t pid = fork();
+    if (pid < 0)
+    {
+      ERROR("Unable to fork child process for getting the device mode\n");
+      close(pipefd[0]);
+      close(pipefd[1]);
+      return TEST_FAIL;
+    }
+
+    if (0 == pid)
+    {
+      // Child process. It must never return into the caller (that would leave two
+      // processes executing the rest of the program), so every path ends in _exit().
+      // _exit() is used instead of exit() so the child does not run exit handlers or
+      // flush stdio buffers inherited from the parent.
+      close(pipefd[0]);
+      bool iscpxMode = false;
+      try {
+          std::string log = execCommand("rocm-smi --showcomputepartition");
+          iscpxMode = log.find("CPX") != std::string::npos;
+      } catch (const std::exception& e) {
+          std::cerr << "Error: " << e.what() << std::endl;
+          _exit(EXIT_FAILURE);
+      }
+      bool const sent = write(pipefd[1], &iscpxMode, sizeof(iscpxMode)) == static_cast<ssize_t>(sizeof(iscpxMode));
+      close(pipefd[1]);
+      _exit(sent ? EXIT_SUCCESS : EXIT_FAILURE);
+    }
+
+    // Parent process. Close the write end first so that read() reports end-of-file
+    // (instead of blocking forever) if the child exits without sending anything.
+    close(pipefd[1]);
+    bool received = false;
+    bool const gotValue = read(pipefd[0], &received, sizeof(received)) == static_cast<ssize_t>(sizeof(received));
+    close(pipefd[0]);
+
+    // Always reap the child, even if the read failed
+    int status = 0;
+    bool const childOk = (waitpid(pid, &status, 0) == pid) && WIFEXITED(status) && (WEXITSTATUS(status) == 0);
+    if (!gotValue || !childOk)
+    {
+      ERROR("Unable to retrieve the device mode from child process\n");
+      return TEST_FAIL;
+    }
+
+    *cpxMode = received;
+    return TEST_SUCCESS;
+  }
+
+
   EnvVars::EnvVars()
   {
     // Collect number of GPUs available
@@ -115,6 +228,18 @@ namespace RcclUnitTesting
     // Total number of reduction ops
     int numOps = ncclNumOps;
 
+    gpuPriorityOrder.resize(numDetectedGpus);
+    for(int i=0;i<numDetectedGpus;i++){
+      gpuPriorityOrder[i] = i;
+    }
+    if(isGfx94) {
+      bool cpxMode = false;
+      getDeviceMode(&cpxMode);
+      if(cpxMode) {
+        onlyPow2Gpus = true;
+        getDevicePriority(&gpuPriorityOrder);
+      }
+    }
     std::vector<std::string> redOpStrings = GetEnvVarsList("UT_REDOPS");
     for (auto s : redOpStrings)
     {

@@ -18,20 +18,21 @@ namespace RcclUnitTesting
   class EnvVars
   {
   public:
-    bool showNames;      // List test case names during run        [UT_SHOW_NAMES]
-    int  minGpus;        // Set the minimum number of GPUs to use  [UT_MIN_GPUS]
-    int  maxGpus;        // Set the maximum number of GPUs to use  [UT_MAX_GPUS]
-    bool onlyPow2Gpus;   // Only allow power-of-2 # of GPUs        [UT_POW2_GPUS]
-    int  processMask;    // Filter single/multi process            [UT_PROCESS_MASK]
-    bool verbose;        // Show verbose TestBed output for debug  [UT_VERBOSE]
-    int  printValues;    // Print out input/output/expected arrays [UT_PRINT_VALUES]
-    int  maxRanksPerGpu; // Number of ranks using the same GPU     [UT_MAX_RANKS_PER_GPU]
-    bool showTiming;     // Show timing per case at end            [UT_SHOW_TIMING]
-    bool useInteractive; // Run in interactive mode                [UT_INTERACTIVE]
-    int  timeoutUs;      // Set timeout for child in microseconds  [UT_TIMEOUT_US]
-    bool useMultithreading; // Multi-thread single-process ranks   [UT_MULTITHREAD]
-    bool isGfx94;        // Detects if architecture is gfx94
-    bool isGfx12;        // Detects if architecture is gfx12
+    bool showNames;                     // List test case names during run        [UT_SHOW_NAMES]
+    int  minGpus;                       // Set the minimum number of GPUs to use  [UT_MIN_GPUS]
+    int  maxGpus;                       // Set the maximum number of GPUs to use  [UT_MAX_GPUS]
+    bool onlyPow2Gpus;                  // Only allow power-of-2 # of GPUs        [UT_POW2_GPUS]
+    int  processMask;                   // Filter single/multi process            [UT_PROCESS_MASK]
+    bool verbose;                       // Show verbose TestBed output for debug  [UT_VERBOSE]
+    int  printValues;                   // Print out input/output/expected arrays [UT_PRINT_VALUES]
+    int  maxRanksPerGpu;                // Number of ranks using the same GPU     [UT_MAX_RANKS_PER_GPU]
+    bool showTiming;                    // Show timing per case at end            [UT_SHOW_TIMING]
+    bool useInteractive;                // Run in interactive mode                [UT_INTERACTIVE]
+    int  timeoutUs;                     // Set timeout for child in microseconds  [UT_TIMEOUT_US]
+    bool useMultithreading;             // Multi-thread single-process ranks   [UT_MULTITHREAD]
+    bool isGfx94;                       // Detects if architecture is gfx94
+    bool isGfx12;                       // Detects if architecture is gfx12
+    std::vector<int> gpuPriorityOrder;  // Orders the gpus based on the associativity of them with OAM with higher gpus linked.
 
     // Constructor that parses and collects environment variables
     EnvVars();

@@ -193,7 +193,7 @@ namespace RcclUnitTesting
 
   void TestBed::InitComms(int const numGpus, int const numCollectivesInGroup, int const numStreamsPerGroup, int const numGroupCalls, bool const useBlocking)
   {
-    InitComms(TestBed::GetDeviceIdsList(1, numGpus), TestBed::GetNumCollsPerGroup(numCollectivesInGroup, numGroupCalls), TestBed::GetNumStreamsPerGroup(numStreamsPerGroup, numGroupCalls), numGroupCalls, useBlocking);
+    InitComms(GetDeviceIdsList(1, numGpus), TestBed::GetNumCollsPerGroup(numCollectivesInGroup, numGroupCalls), TestBed::GetNumStreamsPerGroup(numStreamsPerGroup, numGroupCalls), numGroupCalls, useBlocking);
   }
 
   void TestBed::SetCollectiveArgs(ncclFunc_t      const funcType,
@@ -576,7 +576,7 @@ namespace RcclUnitTesting
     int k=0;
     for (int i = 0; i < numProcesses; i++)
       for (int j = 0; j < ntasks * ranksPerGpu; j++) {
-        result[i].push_back(k%numGpus);
+        result[i].push_back(ev.gpuPriorityOrder[k%numGpus]);
         k++;
       }
     return result;
@@ -668,7 +668,7 @@ namespace RcclUnitTesting
       if(enableSweep == false && (numGpus < 8 || numRanks < 8)) {
         continue;
       }
-      this->InitComms(TestBed::GetDeviceIdsList(numChildren, numGpus, ranksPerGpu));
+      this->InitComms(this->GetDeviceIdsList(numChildren, numGpus, ranksPerGpu));
       if (testing::Test::HasFailure())
       {
         isCorrect = false;

@@ -134,10 +134,10 @@ namespace RcclUnitTesting
                                                   int const numGroupCalls);
 
     // Helper function that splits up GPUs to the given number of processes
-    static std::vector<std::vector<int>> GetDeviceIdsList(int const numProcesses,
+    std::vector<std::vector<int>> GetDeviceIdsList(int const numProcesses,
                                                           int const numGpus,
                                                           int const ranksPerGpu);
-    static std::vector<std::vector<int>> GetDeviceIdsList(int const numProcesses,
+    std::vector<std::vector<int>> GetDeviceIdsList(int const numProcesses,
                                                           int const numGpus);
 
     // Generate a test case name