Skip to content

Commit

Permalink
Control flushing by NCCL_NET_FORCE_FLUSH and RCCL_NET_HDP_FLUSH
Browse files (browse the repository at this point in the history)
  • Loading branch information
wenkaidu committed Dec 11, 2024
1 parent e42661d commit 41d8249
Show file tree
Hide file tree
Showing 2 changed files with 8 additions and 4 deletions.
4 changes: 2 additions & 2 deletions src/graph/paths.cc
Original file line number Diff line number Diff line change
Expand Up @@ -450,15 +450,15 @@ ncclResult_t ncclTopoCheckGdr(struct ncclTopoSystem* system, int64_t busId, int6
}

// Set to 0 to disable the flush on Hopper when using GDR
NCCL_PARAM(NetForceFlush, "NET_FORCE_FLUSH", 0);
NCCL_PARAM(NetForceFlush, "NET_FORCE_FLUSH", 1);

// Determine whether we need to flush the GDR recv buffers
ncclResult_t ncclTopoNeedFlush(struct ncclTopoSystem* system, int64_t busId, int* flush) {
int g;
NCCLCHECK(ncclTopoIdToIndex(system, GPU, busId, &g));
struct ncclTopoNode* gpu = system->nodes[GPU].nodes+g;
#if defined(__HIP_PLATFORM_AMD__) || defined(__HIPCC__)
*flush = 1;
*flush = ncclParamNetForceFlush();
#else
// Flush is required on Ampere and earlier
*flush = gpu->gpu.cudaCompCap < 90 ? 1 : ncclParamNetForceFlush();
Expand Down
8 changes: 6 additions & 2 deletions src/transport/net.cc
Original file line number Diff line number Diff line change
Expand Up @@ -1354,6 +1354,8 @@ static ncclResult_t sendProxyProgress(struct ncclProxyState* proxyState, struct
return ncclSuccess;
}

RCCL_PARAM(NetHdpFlush, "NET_HDP_FLUSH", 1);

static ncclResult_t recvProxyProgress(struct ncclProxyState* proxyState, struct ncclProxyArgs* args) {
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_NET_COLLECT_POLL_CNT)
g_npkit_net_poll_cnt++;
Expand Down Expand Up @@ -1543,8 +1545,10 @@ static ncclResult_t recvProxyProgress(struct ncclProxyState* proxyState, struct
if (totalSize > 0 && p == NCCL_PROTO_SIMPLE && needFlush) {
// GDRCOPY support
struct recvNetResources* resources = (struct recvNetResources*) (subGroup->connection->transportResources);
if (resources->curr_hdp_reg) *resources->curr_hdp_reg = 0x1;
__sync_synchronize();
if (rcclParamNetHdpFlush() && resources->curr_hdp_reg) {
*resources->curr_hdp_reg = 0x1;
__sync_synchronize();
}
if (resources->gdcFlush) {
#if defined (__x86_64__)
// Force a PCI-E read from GPU memory
Expand Down

0 comments on commit 41d8249

Please sign in to comment.