Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[ON HOLD] Unit Tests for CPX mode #1319

Open
wants to merge 2 commits into
base: develop
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion test/AllReduceTests.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -193,7 +193,7 @@ namespace RcclUnitTesting
for (int isMultiProcess : testBed.ev.GetIsMultiProcessList())
{
int const numProcesses = isMultiProcess ? totalRanks : 1;
testBed.InitComms(TestBed::GetDeviceIdsList(numProcesses, totalRanks));
testBed.InitComms(testBed.GetDeviceIdsList(numProcesses, totalRanks));

for (int dataIdx = 0; dataIdx < dataTypes.size() && isCorrect; ++dataIdx)
{
Expand Down
4 changes: 2 additions & 2 deletions test/AllToAllVTests.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -73,7 +73,7 @@ namespace RcclUnitTesting
for (int isMultiProcess : testBed.ev.GetIsMultiProcessList())
{
int const numProcesses = isMultiProcess ? totalRanks : 1;
testBed.InitComms(TestBed::GetDeviceIdsList(numProcesses, totalRanks));
testBed.InitComms(testBed.GetDeviceIdsList(numProcesses, totalRanks));

// Prepare AllToAllV options
std::vector<size_t> numInputElements;
Expand Down Expand Up @@ -130,7 +130,7 @@ namespace RcclUnitTesting
for (int isMultiProcess : testBed.ev.GetIsMultiProcessList())
{
int const numProcesses = isMultiProcess ? totalRanks : 1;
testBed.InitComms(TestBed::GetDeviceIdsList(numProcesses, totalRanks));
testBed.InitComms(testBed.GetDeviceIdsList(numProcesses, totalRanks));

// Prepare AllToAllV options
std::vector<size_t> numInputElements;
Expand Down
10 changes: 5 additions & 5 deletions test/GroupCallTests.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ namespace RcclUnitTesting
{
// Test either single process all GPUs, or 1 process per GPU
int const numProcesses = isMultiProcess ? totalRanks : 1;
testBed.InitComms(TestBed::GetDeviceIdsList(numProcesses, totalRanks), numCollPerGroup);
testBed.InitComms(testBed.GetDeviceIdsList(numProcesses, totalRanks), numCollPerGroup);

if (testBed.ev.showNames)
INFO("%s %d-ranks GroupCall Identical\n", isMultiProcess ? "MP" : "SP", totalRanks);
Expand Down Expand Up @@ -84,7 +84,7 @@ namespace RcclUnitTesting
{
// Test either single process all GPUs, or 1 process per GPU
int const numProcesses = isMultiProcess ? totalRanks : 1;
testBed.InitComms(TestBed::GetDeviceIdsList(numProcesses, totalRanks), numCollPerGroup);
testBed.InitComms(testBed.GetDeviceIdsList(numProcesses, totalRanks), numCollPerGroup);

if (testBed.ev.showNames)
INFO("%s %d-ranks GroupCall Different\n", isMultiProcess ? "MP" : "SP", totalRanks);
Expand Down Expand Up @@ -139,7 +139,7 @@ namespace RcclUnitTesting
{
// Test either single process all GPUs, or 1 process per GPU
int const numProcesses = isMultiProcess ? totalRanks : 1;
testBed.InitComms(TestBed::GetDeviceIdsList(numProcesses, totalRanks), numCollPerGroup);
testBed.InitComms(testBed.GetDeviceIdsList(numProcesses, totalRanks), numCollPerGroup);

if (testBed.ev.showNames)
INFO("%s %d-ranks GroupCall MixedDayaType\n", isMultiProcess ? "MP" : "SP", totalRanks);
Expand Down Expand Up @@ -194,7 +194,7 @@ namespace RcclUnitTesting
INFO("%s %d-ranks Multistream %d-Group Calls across %d streams\n",
isMultiProcess ? "MP" : "SP", totalRanks, numCollPerGroup, numStreamsPerGroup);

testBed.InitComms(TestBed::GetDeviceIdsList(numProcesses, totalRanks),
testBed.InitComms(testBed.GetDeviceIdsList(numProcesses, totalRanks),
numCollPerGroup, numStreamsPerGroup);

// Set up each collective in group in different stream (modulo numStreamsPerGroup)
Expand Down Expand Up @@ -244,7 +244,7 @@ namespace RcclUnitTesting
int const numProcesses = isMultiProcess ? totalRanks : 1;

// Initialize comms by specifying the # of group calls
testBed.InitComms(TestBed::GetDeviceIdsList(numProcesses, totalRanks), numCollsPerGroup, numStreamsPerGroup, numGroupCalls, useBlocking);
testBed.InitComms(testBed.GetDeviceIdsList(numProcesses, totalRanks), numCollsPerGroup, numStreamsPerGroup, numGroupCalls, useBlocking);

if (testBed.ev.showNames)
INFO("%s %d-ranks GroupCall MultiGroupCall\n", isMultiProcess ? "MP" : "SP", totalRanks);
Expand Down
2 changes: 1 addition & 1 deletion test/NonBlockingTests.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@ namespace RcclUnitTesting
{
int const numProcesses = isMultiProcess ? totalRanks : 1;
// Initialize communicators in non-blocking mode
testBed.InitComms(TestBed::GetDeviceIdsList(numProcesses, totalRanks), 1, 1, 1, useBlocking);
testBed.InitComms(testBed.GetDeviceIdsList(numProcesses, totalRanks), 1, 1, 1, useBlocking);

// Loop over various collective functions
for (auto funcType : funcTypes)
Expand Down
4 changes: 2 additions & 2 deletions test/SendRecvTests.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ namespace RcclUnitTesting
int ranksPerGpu = rpg == 0 ? 1 : testBed.ev.maxRanksPerGpu;
int totalRanks = numGpus * ranksPerGpu;
int const numProcesses = isMultiProcess ? numGpus : 1;
testBed.InitComms(TestBed::GetDeviceIdsList(numProcesses, numGpus, ranksPerGpu), 1);
testBed.InitComms(testBed.GetDeviceIdsList(numProcesses, numGpus, ranksPerGpu), 1);

for (int dataIdx = 0; dataIdx < dataTypes.size() && isCorrect; ++dataIdx)
for (int numIdx = 0; numIdx < numElements.size() && isCorrect; ++numIdx)
Expand Down Expand Up @@ -106,7 +106,7 @@ namespace RcclUnitTesting
int ranksPerGpu = rpg == 0 ? 1 : testBed.ev.maxRanksPerGpu;
int totalRanks = numGpus * ranksPerGpu;
int const numProcesses = isMultiProcess ? numGpus : 1;
testBed.InitComms(TestBed::GetDeviceIdsList(numProcesses, numGpus, ranksPerGpu), 1);
testBed.InitComms(testBed.GetDeviceIdsList(numProcesses, numGpus, ranksPerGpu), 1);

for (int dataIdx = 0; dataIdx < dataTypes.size() && isCorrect; ++dataIdx)
for (int numIdx = 0; numIdx < numElements.size() && isCorrect; ++numIdx)
Expand Down
125 changes: 125 additions & 0 deletions test/common/EnvVars.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,8 @@
#include <cstdlib>
#include <unistd.h>
#include <sys/wait.h>
#include <iostream>
#include <unordered_map>

namespace RcclUnitTesting
{
Expand Down Expand Up @@ -88,6 +90,117 @@ namespace RcclUnitTesting
return TEST_SUCCESS;
}


// Runs `cmd` through popen() in read mode and returns everything the command
// wrote to its standard output as a single string.
// Throws std::runtime_error if popen() fails to open the stream.
std::string execCommand(const char* cmd) {
  // The unique_ptr closes the stream with pclose() on every exit path.
  std::unique_ptr<FILE, decltype(&pclose)> stream(popen(cmd, "r"), pclose);
  if (stream == nullptr) {
    throw std::runtime_error("popen() failed!");
  }

  std::string output;
  std::array<char, 128> chunk;
  for (;;) {
    // fgets() NUL-terminates each chunk; a null return means end of output (or a read error).
    char const* const got = fgets(chunk.data(), chunk.size(), stream.get());
    if (got == nullptr) {
      break;
    }
    output.append(chunk.data());
  }
  return output;
}


int getDevicePriority (std::vector<int> *gpuPriorityOrder){
// Prepare parent->child pipe
int pipefd[2];
if (pipe(pipefd) == -1) {
ERROR("Unable to create parent->child pipe for getting the device priority vector.\n");
return TEST_FAIL;
}
pid_t pid = fork();
if (0 == pid) {
std::vector<int> result;
try {
std::string log = execCommand("rocm-smi --showuniqueid");
std::unordered_map<std::string, std::vector<int>> uniqueIdToGpuIndexes;
std::string::size_type pos = 0;

while ((pos = log.find("GPU[", pos)) != std::string::npos) {
int gpuIndex = std::stoi(log.substr(pos + 4));
std::string::size_type idPos = log.find("Unique ID:", pos);
if (idPos == std::string::npos) break;
std::string::size_type idEnd = log.find_first_of(" \n", idPos + 11);
std::string uniqueId = log.substr(idPos + 11, idEnd - (idPos + 11));
uniqueIdToGpuIndexes[uniqueId].push_back(gpuIndex);
pos = log.find('\n', pos);
}

// Create a vector of pairs for sorting unique IDs based on the number of associated GPUs
std::vector<std::pair<std::string, std::vector<int>>> sortedIds(uniqueIdToGpuIndexes.begin(), uniqueIdToGpuIndexes.end());
std::sort(sortedIds.begin(), sortedIds.end(), [](const auto& a, const auto& b) {
return a.second.size() > b.second.size();
});

for (const auto& pair : sortedIds) {
result.insert(result.end(), pair.second.begin(), pair.second.end());
}
} catch (const std::exception& e) {
std::cerr << "Error: " << e.what() << std::endl;
return 1;
}
if (write(pipefd[1], result.data(), gpuPriorityOrder->size() * sizeof(int)) != gpuPriorityOrder->size() * sizeof(int)) return TEST_FAIL;
close(pipefd[0]);
close(pipefd[1]);
exit(EXIT_SUCCESS);
}
else {
int status;
if (read(pipefd[0], gpuPriorityOrder->data(), gpuPriorityOrder->size() * sizeof(int)) != gpuPriorityOrder->size() * sizeof(int)) return TEST_FAIL;
waitpid(pid, &status, 0);
assert(!status);
close(pipefd[0]);
close(pipefd[1]);
}
return TEST_SUCCESS;
return 0;
}


int getDeviceMode (bool *cpxMode){
// Prepare parent->child pipe
int pipefd[2];
if (pipe(pipefd) == -1)
{
ERROR("Unable to create parent->child pipe for getting the device mode\n");
return TEST_FAIL;
}
pid_t pid = fork();
if (0 == pid)
{
bool iscpxMode = false;
try {
std::string log = execCommand("rocm-smi --showcomputepartition");
bool foundCPX = log.find("CPX") != std::string::npos;
if (foundCPX) {
iscpxMode = true;
}
} catch (const std::exception& e) {
std::cerr << "Error: " << e.what() << std::endl;
return 1;
}
if (write(pipefd[1], &iscpxMode, sizeof(iscpxMode)) != sizeof(iscpxMode)) return TEST_FAIL;
close(pipefd[0]);
close(pipefd[1]);
exit(EXIT_SUCCESS);
}
else {
int status;
if (read(pipefd[0], cpxMode, sizeof(*cpxMode)) != sizeof(*cpxMode)) return TEST_FAIL;
waitpid(pid, &status, 0);
assert(!status);
close(pipefd[0]);
close(pipefd[1]);
}
return TEST_SUCCESS;
return 0;
}
Comment on lines +164 to +201
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm curious: why is rocm-smi run in a forked process?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Because we cannot make HIP calls prior to launching unless they are made inside a separate child process. You can refer to the comment at line 207. The same procedure is used for other features, such as finding the architecture of the system at line 211.

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

In your case, I think popen already forks and creates a pipe internally, so you may not need your own fork. If so, getDeviceMode and getDevicePriority can be simplified.



EnvVars::EnvVars()
{
// Collect number of GPUs available
Expand Down Expand Up @@ -115,6 +228,18 @@ namespace RcclUnitTesting
// Total number of reduction ops
int numOps = ncclNumOps;

gpuPriorityOrder.resize(numDetectedGpus);
for(int i=0;i<numDetectedGpus;i++){
gpuPriorityOrder[i] = i;
}
if(isGfx94) {
bool cpxMode = false;
getDeviceMode(&cpxMode);
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The return value of getDeviceMode is never used, so if it can be simplified as I've suggested, you could rename it to isDeviceCpxMode and return bool.

if(cpxMode) {
onlyPow2Gpus = true;
getDevicePriority(&gpuPriorityOrder);
}
}
std::vector<std::string> redOpStrings = GetEnvVarsList("UT_REDOPS");
for (auto s : redOpStrings)
{
Expand Down
29 changes: 15 additions & 14 deletions test/common/EnvVars.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -18,20 +18,21 @@ namespace RcclUnitTesting
class EnvVars
{
public:
bool showNames; // List test case names during run [UT_SHOW_NAMES]
int minGpus; // Set the minimum number of GPUs to use [UT_MIN_GPUS]
int maxGpus; // Set the maximum number of GPUs to use [UT_MAX_GPUS]
bool onlyPow2Gpus; // Only allow power-of-2 # of GPUs [UT_POW2_GPUS]
int processMask; // Filter single/multi process [UT_PROCESS_MASK]
bool verbose; // Show verbose TestBed output for debug [UT_VERBOSE]
int printValues; // Print out input/output/expected arrays [UT_PRINT_VALUES]
int maxRanksPerGpu; // Number of ranks using the same GPU [UT_MAX_RANKS_PER_GPU]
bool showTiming; // Show timing per case at end [UT_SHOW_TIMING]
bool useInteractive; // Run in interactive mode [UT_INTERACTIVE]
int timeoutUs; // Set timeout for child in microseconds [UT_TIMEOUT_US]
bool useMultithreading; // Multi-thread single-process ranks [UT_MULTITHREAD]
bool isGfx94; // Detects if architecture is gfx94
bool isGfx12; // Detects if architecture is gfx12
bool showNames; // List test case names during run [UT_SHOW_NAMES]
int minGpus; // Set the minimum number of GPUs to use [UT_MIN_GPUS]
int maxGpus; // Set the maximum number of GPUs to use [UT_MAX_GPUS]
bool onlyPow2Gpus; // Only allow power-of-2 # of GPUs [UT_POW2_GPUS]
int processMask; // Filter single/multi process [UT_PROCESS_MASK]
bool verbose; // Show verbose TestBed output for debug [UT_VERBOSE]
int printValues; // Print out input/output/expected arrays [UT_PRINT_VALUES]
int maxRanksPerGpu; // Number of ranks using the same GPU [UT_MAX_RANKS_PER_GPU]
bool showTiming; // Show timing per case at end [UT_SHOW_TIMING]
bool useInteractive; // Run in interactive mode [UT_INTERACTIVE]
int timeoutUs; // Set timeout for child in microseconds [UT_TIMEOUT_US]
bool useMultithreading; // Multi-thread single-process ranks [UT_MULTITHREAD]
bool isGfx94; // Detects if architecture is gfx94
bool isGfx12; // Detects if architecture is gfx12
std::vector<int> gpuPriorityOrder; // Orders the gpus based on the associativity of them with OAM with higher gpus linked.

// Constructor that parses and collects environment variables
EnvVars();
Expand Down
6 changes: 3 additions & 3 deletions test/common/TestBed.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -193,7 +193,7 @@ namespace RcclUnitTesting

void TestBed::InitComms(int const numGpus, int const numCollectivesInGroup, int const numStreamsPerGroup, int const numGroupCalls, bool const useBlocking)
{
InitComms(TestBed::GetDeviceIdsList(1, numGpus), TestBed::GetNumCollsPerGroup(numCollectivesInGroup, numGroupCalls), TestBed::GetNumStreamsPerGroup(numStreamsPerGroup, numGroupCalls), numGroupCalls, useBlocking);
InitComms(GetDeviceIdsList(1, numGpus), TestBed::GetNumCollsPerGroup(numCollectivesInGroup, numGroupCalls), TestBed::GetNumStreamsPerGroup(numStreamsPerGroup, numGroupCalls), numGroupCalls, useBlocking);
}

void TestBed::SetCollectiveArgs(ncclFunc_t const funcType,
Expand Down Expand Up @@ -576,7 +576,7 @@ namespace RcclUnitTesting
int k=0;
for (int i = 0; i < numProcesses; i++)
for (int j = 0; j < ntasks * ranksPerGpu; j++) {
result[i].push_back(k%numGpus);
result[i].push_back(ev.gpuPriorityOrder[k%numGpus]);
k++;
}
return result;
Expand Down Expand Up @@ -668,7 +668,7 @@ namespace RcclUnitTesting
if(enableSweep == false && (numGpus < 8 || numRanks < 8)) {
continue;
}
this->InitComms(TestBed::GetDeviceIdsList(numChildren, numGpus, ranksPerGpu));
this->InitComms(this->GetDeviceIdsList(numChildren, numGpus, ranksPerGpu));
if (testing::Test::HasFailure())
{
isCorrect = false;
Expand Down
4 changes: 2 additions & 2 deletions test/common/TestBed.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -134,10 +134,10 @@ namespace RcclUnitTesting
int const numGroupCalls);

// Helper function that splits up GPUs to the given number of processes
static std::vector<std::vector<int>> GetDeviceIdsList(int const numProcesses,
std::vector<std::vector<int>> GetDeviceIdsList(int const numProcesses,
int const numGpus,
int const ranksPerGpu);
static std::vector<std::vector<int>> GetDeviceIdsList(int const numProcesses,
std::vector<std::vector<int>> GetDeviceIdsList(int const numProcesses,
int const numGpus);

// Generate a test case name
Expand Down