Skip to content

Commit

Permalink
Switched calls to cudaMemcpyAsync to be cudaMemcpy in `ncclTransp…
Browse files Browse the repository at this point in the history
…ortP2pSetup` to avoid race condition with `cudaIpcOpenMemHandle` inside p2p `connect`. See `ncclP2pImportShareableBuffer`.
  • Loading branch information
corey-derochie-amd committed Dec 4, 2024
1 parent a05329b commit 0b707d6
Showing 1 changed file with 2 additions and 2 deletions.
4 changes: 2 additions & 2 deletions src/transport.cc
Original file line number Diff line number Diff line change
Expand Up @@ -205,7 +205,7 @@ ncclResult_t ncclTransportP2pSetup(struct ncclComm* comm, struct ncclTopoGraph*
if (ret == ncclSuccess) {
conn->connected = 1;
/* comm->channels[c].devPeers[sendPeer]->send[connIndex] is a device memory access. */
CUDACHECKGOTO(cudaMemcpyAsync(&comm->channels[c].devPeersHostPtr[sendPeer]->send[connIndex], &conn->conn, sizeof(struct ncclConnInfo), cudaMemcpyHostToDevice, comm->sharedRes->hostStream.cudaStream), ret, fail);
CUDACHECKGOTO(cudaMemcpy(&comm->channels[c].devPeersHostPtr[sendPeer]->send[connIndex], &conn->conn, sizeof(struct ncclConnInfo), cudaMemcpyHostToDevice, comm->sharedRes->hostStream.cudaStream), ret, fail);
} else if (ret == ncclInProgress) {
allChannelsConnected = false;
}
Expand All @@ -224,7 +224,7 @@ ncclResult_t ncclTransportP2pSetup(struct ncclComm* comm, struct ncclTopoGraph*
if (ret == ncclSuccess) {
conn->connected = 1;
/* comm->channels[c].devPeers[recvPeer]->recv[connIndex] is a device memory access. */
CUDACHECKGOTO(cudaMemcpyAsync(&comm->channels[c].devPeersHostPtr[recvPeer]->recv[connIndex], &conn->conn, sizeof(struct ncclConnInfo), cudaMemcpyHostToDevice, comm->sharedRes->hostStream.cudaStream), ret, fail);
CUDACHECKGOTO(cudaMemcpy(&comm->channels[c].devPeersHostPtr[recvPeer]->recv[connIndex], &conn->conn, sizeof(struct ncclConnInfo), cudaMemcpyHostToDevice, comm->sharedRes->hostStream.cudaStream), ret, fail);
} else if (ret == ncclInProgress) {
allChannelsConnected = false;
}
Expand Down

0 comments on commit 0b707d6

Please sign in to comment.