From 033d3af5447b063404867d80ef7f654a64cf28f4 Mon Sep 17 00:00:00 2001
From: Tim Besard
Date: Thu, 22 Feb 2024 02:30:44 -0500
Subject: [PATCH 1/6] Don't pirate CUDA.CuDevice.

---
 src/communicator.jl | 6 +++---
 test/runtests.jl    | 2 +-
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/src/communicator.jl b/src/communicator.jl
index b342a23..901b645 100644
--- a/src/communicator.jl
+++ b/src/communicator.jl
@@ -64,14 +64,14 @@ function Communicators(devices)
 end
 
 """
-    CuDevice(comm::Communicator) :: CuDevice
+    NCCL.device(comm::Communicator) :: CuDevice
 
 The device of the communicator
 
 # External Links
 - [`ncclCommCuDevice`](https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/api/comms.html#ncclcommcudevice)
 """
-function CUDA.CuDevice(comm::Communicator)
+function device(comm::Communicator)
     dev_ref = Ref{Cint}(C_NULL)
     ncclCommCuDevice(comm, dev_ref)
     return CuDevice(dev_ref[])
@@ -128,7 +128,7 @@ Get the default stream for device `devid`, or the device corresponding to
 communicator `comm`.
 """
 function default_device_stream(comm::Communicator)
-    dev = CuDevice(comm)
+    dev = device(comm)
     device!(dev) do
         stream()
     end
diff --git a/test/runtests.jl b/test/runtests.jl
index 1ae382b..403e92a 100644
--- a/test/runtests.jl
+++ b/test/runtests.jl
@@ -12,7 +12,7 @@ using NCCL
     comms = NCCL.Communicators(CUDA.devices())
     for (i,dev) in enumerate(CUDA.devices())
         @test NCCL.rank(comms[i]) == i-1
-        @test CuDevice(comms[i]) == dev
+        @test NCCL.device(comms[i]) == dev
         @test NCCL.size(comms[i]) == length(CUDA.devices())
     end
     id = NCCL.UniqueID()

From 7af04cb9e188e0b5a62154a28be9074a99b02a99 Mon Sep 17 00:00:00 2001
From: Tim Besard
Date: Thu, 22 Feb 2024 02:31:25 -0500
Subject: [PATCH 2/6] Revert version bump.

---
 Project.toml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Project.toml b/Project.toml
index a83a708..ae61437 100644
--- a/Project.toml
+++ b/Project.toml
@@ -1,6 +1,6 @@
 name = "NCCL"
 uuid = "3fe64909-d7a1-4096-9b7d-7a0f12cf0f6b"
-version = "0.2.0"
+version = "0.1.0"
 
 [deps]
 CEnum = "fa961155-64e5-5f13-b03f-caf6b980ea82"
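After patch 1, the device behind a communicator is queried with `NCCL.device` instead of a pirated `CUDA.CuDevice` method; the return value is still a `CuDevice`. A minimal sketch of the resulting usage (illustrative, not part of the patches), assuming a host with at least one CUDA device:

```
using CUDA, NCCL

# one communicator per local device (single-process clique)
comms = NCCL.Communicators(CUDA.devices())

# the communicator's device is now queried through the library's own accessor
dev = NCCL.device(comms[1])
@assert dev isa CuDevice
```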
From 9589d5551bfc3b5f6a1d1a39376d7e8e3ed98a2b Mon Sep 17 00:00:00 2001
From: Tim Besard
Date: Thu, 22 Feb 2024 02:34:55 -0500
Subject: [PATCH 3/6] Format.

---
 src/base.jl         |  3 ---
 src/collective.jl   | 27 +++++++++++++++++++--------
 src/communicator.jl |  2 --
 src/pointtopoint.jl | 10 ++++++----
 4 files changed, 25 insertions(+), 17 deletions(-)

diff --git a/src/base.jl b/src/base.jl
index 2a02e5f..c8a58bb 100644
--- a/src/base.jl
+++ b/src/base.jl
@@ -17,9 +17,6 @@ function version()
     ncclGetVersion(ver_r)
     ver = ver_r[]
 
-    # nccl.h defines this as:
-    #define NCCL_VERSION(X,Y,Z) (((X) <= 2 && (Y) <= 8) ? (X) * 1000 + (Y) * 100 + (Z) : (X) * 10000 + (Y) * 100 + (Z))
-
     if ver < 2900
         major, ver = divrem(ver, 1000)
         minor, patch = divrem(ver, 100)
diff --git a/src/collective.jl b/src/collective.jl
index 3893523..054fa3a 100644
--- a/src/collective.jl
+++ b/src/collective.jl
@@ -9,7 +9,8 @@ or [`NCCL.avg`](@ref)), writing the result to `recvbuf` to all ranks.
 # External links
 - [`ncclAllReduce`](https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/api/colls.html#ncclallreduce)
 """
-function Allreduce!(sendbuf, recvbuf, op, comm::Communicator; stream::CuStream=default_device_stream(comm))
+function Allreduce!(sendbuf, recvbuf, op, comm::Communicator;
+                    stream::CuStream=default_device_stream(comm))
     count = length(recvbuf)
     @assert length(sendbuf) == count
     data_type = ncclDataType_t(eltype(recvbuf))
@@ -27,8 +28,10 @@
 Reduce the array `sendrecvbuf` using `op` (one of `+`, `*`, `min`, `max`,
 or [`NCCL.avg`](@ref)), writing the result in-place to all ranks.
 """
-Allreduce!(sendrecvbuf, op, comm::Communicator; stream::CuStream=default_device_stream(comm) ) =
+function Allreduce!(sendrecvbuf, op, comm::Communicator;
+                    stream::CuStream=default_device_stream(comm))
     Allreduce!(sendrecvbuf, sendrecvbuf, op, comm; stream)
+end
@@ -41,14 +44,17 @@
 """
     NCCL.Broadcast!(
         sendbuf, recvbuf, comm::Communicator;
         root::Integer = 0, stream::CuStream = default_device_stream(comm))
 
 Copies the array `sendbuf` on rank `root` to `recvbuf` on all ranks.
 
 # External links
 - [`ncclBroadcast`](https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/api/colls.html#ncclbroadcast)
 """
-function Broadcast!(sendbuf, recvbuf, comm::Communicator; root::Integer=0, stream::CuStream=default_device_stream(comm))
+function Broadcast!(sendbuf, recvbuf, comm::Communicator; root::Integer=0,
+                    stream::CuStream=default_device_stream(comm))
     data_type = ncclDataType_t(eltype(recvbuf))
     count = length(recvbuf)
     ncclBroadcast(sendbuf, recvbuf, count, data_type, root, comm, stream)
     return recvbuf
 end
-Broadcast!(sendrecvbuf, comm::Communicator; root::Integer=0, stream::CuStream=default_device_stream(comm)) =
+function Broadcast!(sendrecvbuf, comm::Communicator; root::Integer=0,
+                    stream::CuStream=default_device_stream(comm))
     Broadcast!(sendrecvbuf, sendrecvbuf, comm; root, stream)
+end
@@ -63,15 +69,18 @@
 """
     NCCL.Reduce!(
         sendbuf, recvbuf, op, comm::Communicator;
         root::Integer = 0, stream::CuStream = default_device_stream(comm))
 
 Reduce the array `sendbuf` using `op` (one of `+`, `*`, `min`, `max`,
 or [`NCCL.avg`](@ref)), writing the result to `recvbuf` on rank `root`.
 
 # External links
 - [`ncclReduce`](https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/api/colls.html#ncclreduce)
 """
-function Reduce!(sendbuf, recvbuf, op, comm::Communicator; root::Integer=0, stream::CuStream=default_device_stream(comm))
+function Reduce!(sendbuf, recvbuf, op, comm::Communicator; root::Integer=0,
+                 stream::CuStream=default_device_stream(comm))
     data_type = ncclDataType_t(eltype(recvbuf))
     count = length(recvbuf)
     _op = ncclRedOp_t(op)
     ncclReduce(sendbuf, recvbuf, count, data_type, _op, root, comm, stream)
     return recvbuf
 end
-Reduce!(sendrecvbuf, op, comm::Communicator; root::Integer=0, stream::CuStream=default_device_stream(comm)) =
+function Reduce!(sendrecvbuf, op, comm::Communicator; root::Integer=0,
+                 stream::CuStream=default_device_stream(comm))
     Reduce!(sendrecvbuf, sendrecvbuf, op, comm; root, stream)
+end
@@ -84,7 +93,8 @@
 """
     NCCL.Allgather!(
        sendbuf, recvbuf, comm::Communicator;
        stream::CuStream = default_device_stream(comm))
 
 Concatenate `sendbuf` from each rank into `recvbuf` on all ranks.
 
 # External links
 - [`ncclAllGather`](https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/api/colls.html#ncclallgather)
 """
-function Allgather!(sendbuf, recvbuf, comm::Communicator; stream::CuStream=default_device_stream(comm))
+function Allgather!(sendbuf, recvbuf, comm::Communicator;
+                    stream::CuStream=default_device_stream(comm))
     data_type = ncclDataType_t(eltype(recvbuf))
     sendcount = length(sendbuf)
     @assert length(recvbuf) == sendcount * size(comm)
@@ -105,7 +115,8 @@ scattered over the devices such that `recvbuf` on each rank will contain the
 # External links
 - [`ncclReduceScatter`](https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/api/colls.html#ncclreducescatter)
 """
-function ReduceScatter!(sendbuf, recvbuf, op, comm::Communicator; stream::CuStream=default_device_stream(comm) )
+function ReduceScatter!(sendbuf, recvbuf, op, comm::Communicator;
+                        stream::CuStream=default_device_stream(comm))
     recvcount = length(recvbuf)
     @assert length(sendbuf) == recvcount * size(comm)
     data_type = ncclDataType_t(eltype(recvbuf))
diff --git a/src/communicator.jl b/src/communicator.jl
index 901b645..7168fd5 100644
--- a/src/communicator.jl
+++ b/src/communicator.jl
@@ -77,7 +77,6 @@ function device(comm::Communicator)
     return CuDevice(dev_ref[])
 end
 
-
 """
     NCCL.size(comm::Communicator) :: Int
 
@@ -120,7 +119,6 @@ function abort(comm::Communicator)
     return
 end
 
-
 """
     NCCL.default_device_stream(comm::Communicator) :: CuStream
 
diff --git a/src/pointtopoint.jl b/src/pointtopoint.jl
index fdd24da..c0d9544 100644
--- a/src/pointtopoint.jl
+++ b/src/pointtopoint.jl
@@ -2,7 +2,7 @@
     NCCL.Send(
         sendbuf, comm::Communicator;
         dest::Integer,
-        stream::CuStream = default_device_stream(comm)) )
+        stream::CuStream = default_device_stream(comm))
 
 Send data from `sendbuf` to rank `dest`. A matching [`Recv!`](@ref) must also be
 called.
 
 # External links
 - [`ncclSend`](https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/api/p2p.html#ncclsend)
 """
-function Send(sendbuf, comm::Communicator; dest::Integer, stream::CuStream=default_device_stream(comm))
+function Send(sendbuf, comm::Communicator; dest::Integer,
+              stream::CuStream=default_device_stream(comm))
     count = length(sendbuf)
     datatype = ncclDataType_t(eltype(sendbuf))
     ncclSend(sendbuf, count, datatype, dest, comm, stream)
@@ -22,7 +23,7 @@
     NCCL.Recv!(
         recvbuf, comm::Communicator;
         source::Integer,
-        stream::CuStream = default_device_stream(comm)) )
+        stream::CuStream = default_device_stream(comm))
 
 Write the data from a matching [`Send`](@ref) on rank `source` into `recvbuf`.
 
 # External links
 - [`ncclRecv`](https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/api/p2p.html#ncclrecv)
 """
-function Recv!(recvbuf, comm::Communicator; source::Integer, stream::CuStream=default_device_stream(comm))
+function Recv!(recvbuf, comm::Communicator; source::Integer,
+               stream::CuStream=default_device_stream(comm))
     count = length(recvbuf)
     datatype = ncclDataType_t(eltype(recvbuf))
     ncclRecv(recvbuf, count, datatype, source, comm, stream)
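The signatures reformatted above all take the target `stream` as a keyword argument defaulting to `default_device_stream(comm)`. The following sketch passes explicit streams to `Allreduce!`; the buffers, the dedicated streams, and the `NCCL.groupStart`/`NCCL.groupEnd` pairing (needed when a single thread drives several ranks) are illustrative assumptions, not part of the patch:

```
using CUDA, NCCL

comms = NCCL.Communicators(CUDA.devices())

# one buffer and one dedicated stream per device
bufs = map(comms) do comm
    device!(NCCL.device(comm)) do
        CUDA.ones(Float32, 1024)
    end
end
streams = map(comms) do comm
    device!(NCCL.device(comm)) do
        CuStream()
    end
end

# issue the per-rank calls inside one group so they cannot block each other
NCCL.groupStart()
for (comm, buf, stream) in zip(comms, bufs, streams)
    NCCL.Allreduce!(buf, +, comm; stream)
end
NCCL.groupEnd()
```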
From 080655621f2e899b380c57bd4144e70faafdc20c Mon Sep 17 00:00:00 2001
From: Tim Besard
Date: Thu, 22 Feb 2024 03:46:54 -0500
Subject: [PATCH 4/6] Fix and test Communicator construction.

---
 src/communicator.jl | 29 +++++++++++++++++++++++------
 test/runtests.jl    |  9 +++++----
 2 files changed, 28 insertions(+), 10 deletions(-)

diff --git a/src/communicator.jl b/src/communicator.jl
index 7168fd5..36fcf8f 100644
--- a/src/communicator.jl
+++ b/src/communicator.jl
@@ -22,25 +22,42 @@ function destroy(comm::Communicator)
 end
 Base.unsafe_convert(::Type{LibNCCL.ncclComm_t}, comm::Communicator) = comm.handle
 
-# creates a new communicator (multi thread/process version)
-function Communicator(nranks::Integer, comm_id::UniqueID, rank::Integer)
+"""
+    NCCL.Communicator(nranks, rank; [unique_id]) :: Communicator
+
+Create a single communicator for use in a multi-threaded or multi-process
+environment. `nranks` is the number of ranks in the communicator, and `rank`
+is the 0-based index of the current rank. `unique_id` is an optional unique
+identifier for the communicator.
+
+# Examples
+```
+comm = Communicator(length(CUDA.devices()), myid() - 1; unique_id=id)
+# this blocks until all other ranks have connected
+```
+
+# External links
+- [`ncclCommInitRank`](https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/api/comms.html#ncclCommInitRank)
+"""
+function Communicator(nranks::Integer, rank::Integer;
+                      unique_id::UniqueID=UniqueID())
+    0 <= rank < nranks || throw(ArgumentError("rank must be in [0, nranks)"))
     handle_ref = Ref{ncclComm_t}(C_NULL)
-    ncclCommInitRank(handle_ref, nranks, comm_id, rank)
+    ncclCommInitRank(handle_ref, nranks, unique_id, rank)
     c = Communicator(handle_ref[])
     return finalizer(destroy, c)
 end
 
-# creates a clique of communicators (single process version)
 """
     NCCL.Communicators(devices) :: Vector{Communicator}
 
-Construct and initialize a clique of NCCL Communicators.
+Construct and initialize a clique of NCCL Communicators over the devices
+on a single host.
+
 `devices` can either be a collection of identifiers, or `CuDevice`s.
 
 # Examples
 ```
-# initialize a clique over all devices on the host
 comms = NCCL.Communicators(CUDA.devices())
 ```
diff --git a/test/runtests.jl b/test/runtests.jl
index 403e92a..a96158b 100644
--- a/test/runtests.jl
+++ b/test/runtests.jl
@@ -9,16 +9,17 @@ using NCCL
 
 @testset "NCCL" begin
     @testset "Communicator" begin
+        # clique of communicators
         comms = NCCL.Communicators(CUDA.devices())
         for (i,dev) in enumerate(CUDA.devices())
             @test NCCL.rank(comms[i]) == i-1
             @test NCCL.device(comms[i]) == dev
             @test NCCL.size(comms[i]) == length(CUDA.devices())
         end
-        id = NCCL.UniqueID()
-        #=num_devs = length(CUDA.devices())
-        comm = Communicator(num_devs, id, 0)
-        @test device(comm) == 0=#
+
+        # single communicator (with nranks=1 or this would block)
+        comm = Communicator(1, 0)
+        @test NCCL.device(comm) == CuDevice(0)
     end
 
     @testset "Allreduce!" begin
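The new constructor blocks until all `nranks` ranks have called it with the same `unique_id`, which is why the test above sticks to `nranks == 1`. A sketch of both shapes; the transport that distributes `id` (MPI, Distributed, sockets, ...) is a hypothetical placeholder, not something the patch provides:

```
using CUDA, NCCL

# single-rank communicator: nothing to wait for
comm = NCCL.Communicator(1, 0)
@assert NCCL.device(comm) == CuDevice(0)

# multi-rank shape: rank 0 generates the id, and every rank passes the same one
# id   = NCCL.UniqueID()                  # on rank 0 only
# id   = fetch_id_from_rank0()            # hypothetical transport on other ranks
# comm = NCCL.Communicator(nranks, myrank; unique_id=id)
```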
---
 src/communicator.jl | 29 +++++++++++++++++++++++------
 test/runtests.jl    |  9 +++++----
 2 files changed, 28 insertions(+), 10 deletions(-)

From c139c127e8f1e015dd1d5af021e371a5c6d0509e Mon Sep 17 00:00:00 2001
From: Tim Besard
Date: Thu, 22 Feb 2024 03:49:50 -0500
Subject: [PATCH 5/6] Bump Julia requirement.

---
 Project.toml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Project.toml b/Project.toml
index ae61437..758744a 100644
--- a/Project.toml
+++ b/Project.toml
@@ -9,4 +9,4 @@ NCCL_jll = "4d6d38e4-5b87-5e63-912a-873ff2d649b7"
 
 [compat]
 CEnum = "0.2, 0.3, 0.4, 0.5"
-julia = "1.6"
+julia = "1.8"

From 08fddada65a4e21817f6803c17841ed7e9bbad99 Mon Sep 17 00:00:00 2001
From: Tim Besard
Date: Thu, 22 Feb 2024 03:57:59 -0500
Subject: [PATCH 6/6] Docs.
---
 docs/src/index.md | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/docs/src/index.md b/docs/src/index.md
index 14cbfe3..8dd833b 100644
--- a/docs/src/index.md
+++ b/docs/src/index.md
@@ -1,6 +1,6 @@
 # NCCL.jl
 
- A Julia wrapper for the [NVIDIA Collective Communications Library (NCCL)](https://developer.nvidia.com/nccl). 
+ A Julia wrapper for the [NVIDIA Collective Communications Library (NCCL)](https://developer.nvidia.com/nccl).
 
 # API
 
@@ -18,8 +18,9 @@
 version()
 ```
 
 ## Communicators
 
 ```@docs
+Communicator
 Communicators
-CUDA.CuDevice(comm::Communicator)
+device(comm::Communicator)
 size(comm::Communicator)
 rank(comm::Communicator)
 abort(comm::Communicator)
@@ -50,4 +51,4 @@
 avg
 ```
 
 ```@docs
 Send
 Recv!
-```
\ No newline at end of file
+```
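Applied in order, the series leaves the documented surface as `Communicator`/`Communicators`, `NCCL.device`, the collectives, and `Send`/`Recv!`. A closing sketch of the point-to-point calls on a single host; the ring layout, buffer sizes, and the `NCCL.groupStart`/`NCCL.groupEnd` pairing are illustrative assumptions rather than anything the patches prescribe:

```
using CUDA, NCCL

comms = NCCL.Communicators(CUDA.devices())
n = length(comms)

# per-rank send and receive buffers for a one-hop ring exchange
sendbufs = map(enumerate(comms)) do (i, comm)
    device!(NCCL.device(comm)) do
        CUDA.fill(Float32(i), 4)
    end
end
recvbufs = map(comms) do comm
    device!(NCCL.device(comm)) do
        CUDA.zeros(Float32, 4)
    end
end

# matching Send/Recv! pairs must be issued inside one group, or the
# single driving thread would deadlock waiting on itself
NCCL.groupStart()
for (i, comm) in enumerate(comms)
    rank = NCCL.rank(comm)
    NCCL.Send(sendbufs[i], comm; dest=mod(rank + 1, n))
    NCCL.Recv!(recvbufs[i], comm; source=mod(rank - 1, n))
end
NCCL.groupEnd()
```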