diff --git a/src/transport/net.cc b/src/transport/net.cc index d139f60f5..21312c4fa 100644 --- a/src/transport/net.cc +++ b/src/transport/net.cc @@ -249,7 +249,11 @@ static ncclResult_t recvSetup(struct ncclComm* comm, struct ncclTopoGraph* graph NCCLCHECK(ncclTopoCheckGdr(comm->topo, myInfo->busId, netId, 0, &req.useGdr)); // Determine whether we need to flush the GDR buffer on recv or not - if (req.useGdr) NCCLCHECK(ncclTopoNeedFlush(comm->topo, myInfo->busId, &req.needFlush)); + if (req.useGdr) { + NCCLCHECK(ncclTopoNeedFlush(comm->topo, myInfo->busId, &req.needFlush)); + CUDACHECK(hipDeviceGetAttribute((int*)&req.curr_hdp_reg, hipDeviceAttributeHdpMemFlushCntl, myInfo->cudaDev)); + recv->conn.curr_hdp_reg = req.curr_hdp_reg; + } // We don't support PXN on receive yet tpProxyRank = comm->topParentRanks[myInfo->rank]; @@ -1586,8 +1590,9 @@ static ncclResult_t recvProxyProgress(struct ncclProxyState* proxyState, struct sub->transmitted += args->sliceSteps; for (uint64_t step=sub->transmitted-args->sliceSteps; step<sub->transmitted; step++) ncclProfilingRecord(args, s+i, step, ncclProxyProfileRecvGPUWait); if (step < sub->nsteps) { - __sync_synchronize(); struct recvNetResources* resources = (struct recvNetResources*) (sub->connection->transportResources); + *resources->curr_hdp_reg = 0x1; + __sync_synchronize(); volatile uint64_t* recvTail = resources->gdcSync ? resources->gdcSync : &resources->recvMem->tail; if (sub->reg) { // We may have added more net steps, but reg operations only have a single step w.r.t. the GPU.