Skip to content

Commit

Permalink
Introduce RCCL_NET_HDP_FLUSH and RCCL_NET_GDR_FLUSH
Browse files Browse the repository at this point in the history
Both are on by default. Turning both off skips all flushes and will likely
result in data errors.
  • Loading branch information
wenkaidu committed Dec 11, 2024
1 parent 41d8249 commit cda8fa9
Show file tree
Hide file tree
Showing 2 changed files with 14 additions and 3 deletions.
4 changes: 2 additions & 2 deletions src/graph/paths.cc
Original file line number Diff line number Diff line change
Expand Up @@ -450,15 +450,15 @@ ncclResult_t ncclTopoCheckGdr(struct ncclTopoSystem* system, int64_t busId, int6
}

// Set to 0 to disable the flush on Hopper when using GDR
NCCL_PARAM(NetForceFlush, "NET_FORCE_FLUSH", 1);
NCCL_PARAM(NetForceFlush, "NET_FORCE_FLUSH", 0);

// Determine whether we need to flush the GDR recv buffers
ncclResult_t ncclTopoNeedFlush(struct ncclTopoSystem* system, int64_t busId, int* flush) {
int g;
NCCLCHECK(ncclTopoIdToIndex(system, GPU, busId, &g));
struct ncclTopoNode* gpu = system->nodes[GPU].nodes+g;
#if defined(__HIP_PLATFORM_AMD__) || defined(__HIPCC__)
*flush = ncclParamNetForceFlush();
*flush = 1;
#else
// Flush is required on Ampere and earlier
*flush = gpu->gpu.cudaCompCap < 90 ? 1 : ncclParamNetForceFlush();
Expand Down
13 changes: 12 additions & 1 deletion src/transport/net.cc
Original file line number Diff line number Diff line change
Expand Up @@ -1355,6 +1355,7 @@ static ncclResult_t sendProxyProgress(struct ncclProxyState* proxyState, struct
}

// Runtime switches for the receive-side flushes performed in recvProxyProgress.
// Both are declared with a default of 1 (flush enabled).
// NET_HDP_FLUSH: when non-zero and resources->curr_hdp_reg is set, the proxy
// writes 0x1 to that HDP register and issues a full memory barrier.
RCCL_PARAM(NetHdpFlush, "NET_HDP_FLUSH", 1);
// NET_GDR_FLUSH: when non-zero and resources->gdcFlush is not in use, the proxy
// issues ncclNet->iflush() on the received buffers. Setting it to 0 skips that call.
RCCL_PARAM(NetGdrFlush, "NET_GDR_FLUSH", 1);

static ncclResult_t recvProxyProgress(struct ncclProxyState* proxyState, struct ncclProxyArgs* args) {
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_NET_COLLECT_POLL_CNT)
Expand Down Expand Up @@ -1546,8 +1547,13 @@ static ncclResult_t recvProxyProgress(struct ncclProxyState* proxyState, struct
// GDRCOPY support
struct recvNetResources* resources = (struct recvNetResources*) (subGroup->connection->transportResources);
if (rcclParamNetHdpFlush() && resources->curr_hdp_reg) {
static bool once = true;
*resources->curr_hdp_reg = 0x1;
__sync_synchronize();
if (once) {
once = false;
INFO(NCCL_INIT, "%s: flushed HDP %p", __func__, resources->curr_hdp_reg);
}
}
if (resources->gdcFlush) {
#if defined (__x86_64__)
Expand All @@ -1557,8 +1563,9 @@ static ncclResult_t recvProxyProgress(struct ncclProxyState* proxyState, struct
WARN("NET: GDR Flush only supported on x86_64");
return ncclInternalError;
#endif
} else {
} else if (rcclParamNetGdrFlush()) {
int subCount = 0;
static bool once = true;
for (int i=0; i<subGroup->groupSize; i++) {
struct ncclProxySubArgs* sub = subGroup + i;
if (step < sub->nsteps) {
Expand All @@ -1575,6 +1582,10 @@ static ncclResult_t recvProxyProgress(struct ncclProxyState* proxyState, struct
}
struct recvNetResources* resources = (struct recvNetResources*) (subGroup->connection->transportResources);
NCCLCHECK(proxyState->ncclNet->iflush(resources->netRecvComm, subCount, ptrs, sizes, mhandles, subGroup->requests+(step%NCCL_STEPS)));
if (once) {
once = false;
INFO(NCCL_INIT, "%s: issued GDR flush", __func__);
}
}
}
args->idle = 0;
Expand Down

0 comments on commit cda8fa9

Please sign in to comment.