Prepare release #51

Merged · 6 commits · Feb 22, 2024
Changes from 5 commits
4 changes: 2 additions & 2 deletions Project.toml
@@ -1,6 +1,6 @@
name = "NCCL"
uuid = "3fe64909-d7a1-4096-9b7d-7a0f12cf0f6b"
version = "0.2.0"
version = "0.1.0"

[deps]
CEnum = "fa961155-64e5-5f13-b03f-caf6b980ea82"
@@ -9,4 +9,4 @@ NCCL_jll = "4d6d38e4-5b87-5e63-912a-873ff2d649b7"

[compat]
CEnum = "0.2, 0.3, 0.4, 0.5"
julia = "1.6"
julia = "1.8"
3 changes: 0 additions & 3 deletions src/base.jl
@@ -17,9 +17,6 @@ function version()
ncclGetVersion(ver_r)
ver = ver_r[]

# nccl.h defines this as:
#define NCCL_VERSION(X,Y,Z) (((X) <= 2 && (Y) <= 8) ? (X) * 1000 + (Y) * 100 + (Z) : (X) * 10000 + (Y) * 100 + (Z))

if ver < 2900
major, ver = divrem(ver, 1000)
minor, patch = divrem(ver, 100)
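The deleted comment documented how `nccl.h` packs version numbers: releases before 2.9 use `X*1000 + Y*100 + Z`, later ones `X*10000 + Y*100 + Z`, which is why the code above branches on `ver < 2900`. A self-contained sketch of that decoding, mirroring the logic shown in the hunk rather than the package's exported API:

```julia
# Decode an integer NCCL version code into a VersionNumber.
# Codes below 2900 use the pre-2.9 packing, as in the diff above.
function decode_nccl_version(ver::Integer)
    if ver < 2900
        major, ver = divrem(ver, 1000)    # e.g. 2804  -> (2, 804)
        minor, patch = divrem(ver, 100)   #      804   -> (8, 4)
    else
        major, ver = divrem(ver, 10000)   # e.g. 21903 -> (2, 1903)
        minor, patch = divrem(ver, 100)   #      1903  -> (19, 3)
    end
    return VersionNumber(major, minor, patch)
end

decode_nccl_version(2804)   # v"2.8.4"
decode_nccl_version(21903)  # v"2.19.3"
```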
27 changes: 19 additions & 8 deletions src/collective.jl
@@ -9,7 +9,8 @@ or [`NCCL.avg`](@ref)), writing the result to `recvbuf` on all ranks.
# External links
- [`ncclAllReduce`](https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/api/colls.html#ncclallreduce)
"""
function Allreduce!(sendbuf, recvbuf, op, comm::Communicator; stream::CuStream=default_device_stream(comm))
function Allreduce!(sendbuf, recvbuf, op, comm::Communicator;
stream::CuStream=default_device_stream(comm))
count = length(recvbuf)
@assert length(sendbuf) == count
data_type = ncclDataType_t(eltype(recvbuf))
@@ -27,8 +28,10 @@ end
Reduce the array `sendrecvbuf` using `op` (one of `+`, `*`, `min`, `max`,
or [`NCCL.avg`](@ref)), writing the result in-place on all ranks.
"""
Allreduce!(sendrecvbuf, op, comm::Communicator; stream::CuStream=default_device_stream(comm) ) =
function Allreduce!(sendrecvbuf, op, comm::Communicator;
stream::CuStream=default_device_stream(comm))
Allreduce!(sendrecvbuf, sendrecvbuf, op, comm; stream)
end
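As a usage illustration, here is a sketch of driving `Allreduce!` over a single-host clique. It assumes the `NCCL.Communicators` constructor from this PR and NCCL.jl's `groupStart`/`groupEnd` wrappers around `ncclGroupStart`/`ncclGroupEnd`; buffer sizes are arbitrary:

```julia
using CUDA, NCCL

devs  = collect(CUDA.devices())
comms = NCCL.Communicators(devs)

# one pair of buffers per device
sendbufs = [(device!(devs[i]); CUDA.fill(Float32(i), 1024)) for i in eachindex(devs)]
recvbufs = [(device!(devs[i]); CUDA.zeros(Float32, 1024))   for i in eachindex(devs)]

# issue one call per rank inside a group so the collective can match up
NCCL.groupStart()
for i in eachindex(devs)
    device!(devs[i])
    NCCL.Allreduce!(sendbufs[i], recvbufs[i], +, comms[i])
end
NCCL.groupEnd()
# every recvbufs[i] now holds the elementwise sum 1 + 2 + ... + length(devs)
```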

"""
NCCL.Broadcast!(
@@ -41,14 +44,17 @@ Copies the array `sendbuf` on rank `root` to `recvbuf` on all ranks.
# External links
- [`ncclBroadcast`](https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/api/colls.html#ncclbroadcast)
"""
function Broadcast!(sendbuf, recvbuf, comm::Communicator; root::Integer=0, stream::CuStream=default_device_stream(comm))
function Broadcast!(sendbuf, recvbuf, comm::Communicator; root::Integer=0,
stream::CuStream=default_device_stream(comm))
data_type = ncclDataType_t(eltype(recvbuf))
count = length(recvbuf)
ncclBroadcast(sendbuf, recvbuf, count, data_type, root, comm, stream)
return recvbuf
end
Broadcast!(sendrecvbuf, comm::Communicator; root::Integer=0, stream::CuStream=default_device_stream(comm)) =
function Broadcast!(sendrecvbuf, comm::Communicator; root::Integer=0,
stream::CuStream=default_device_stream(comm))
Broadcast!(sendrecvbuf, sendrecvbuf, comm; root, stream)
end
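Under the same assumed clique setup as the `Allreduce!` sketch above, the in-place `Broadcast!` variant with the default `root = 0` looks like this:

```julia
bufs = [(device!(devs[i]); CUDA.zeros(Float32, 8)) for i in eachindex(devs)]
device!(devs[1])
bufs[1] .= 42f0                          # rank 0 is the root; only its data matters

NCCL.groupStart()
for i in eachindex(devs)
    device!(devs[i])
    NCCL.Broadcast!(bufs[i], comms[i])   # sendbuf == recvbuf, root defaults to 0
end
NCCL.groupEnd()
# every bufs[i] now holds 42f0
```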


"""
@@ -63,15 +69,18 @@ or [`NCCL.avg`](@ref)), writing the result to `recvbuf` on rank `root`.
# External links
- [`ncclReduce`](https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/api/colls.html#ncclreduce)
"""
function Reduce!(sendbuf, recvbuf, op, comm::Communicator; root::Integer=0, stream::CuStream=default_device_stream(comm))
function Reduce!(sendbuf, recvbuf, op, comm::Communicator; root::Integer=0,
stream::CuStream=default_device_stream(comm))
data_type = ncclDataType_t(eltype(recvbuf))
count = length(recvbuf)
_op = ncclRedOp_t(op)
ncclReduce(sendbuf, recvbuf, count, data_type, _op, root, comm, stream)
return recvbuf
end
Reduce!(sendrecvbuf, op, comm::Communicator; root::Integer=0, stream::CuStream=default_device_stream(comm)) =
function Reduce!(sendrecvbuf, op, comm::Communicator; root::Integer=0,
stream::CuStream=default_device_stream(comm))
Reduce!(sendrecvbuf, sendrecvbuf, op, comm; root, stream)
end
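Likewise, a sketch of a rooted reduction using the `NCCL.avg` op mentioned in the docstring; only rank 0's receive buffer gets the result (same assumed clique setup as above):

```julia
sendbufs = [(device!(devs[i]); CUDA.fill(Float32(i), 8)) for i in eachindex(devs)]
recvbufs = [(device!(devs[i]); CUDA.zeros(Float32, 8))   for i in eachindex(devs)]

NCCL.groupStart()
for i in eachindex(devs)
    device!(devs[i])
    NCCL.Reduce!(sendbufs[i], recvbufs[i], NCCL.avg, comms[i])  # root defaults to 0
end
NCCL.groupEnd()
# recvbufs[1] holds the mean of 1:length(devs); the other receive buffers are untouched
```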

"""
NCCL.Allgather!(
@@ -84,7 +93,8 @@ Concatenate `sendbuf` from each rank into `recvbuf` on all ranks.
# External links
- [`ncclAllGather`](https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/api/colls.html#ncclallgather)
"""
function Allgather!(sendbuf, recvbuf, comm::Communicator; stream::CuStream=default_device_stream(comm))
function Allgather!(sendbuf, recvbuf, comm::Communicator;
stream::CuStream=default_device_stream(comm))
data_type = ncclDataType_t(eltype(recvbuf))
sendcount = length(sendbuf)
@assert length(recvbuf) == sendcount * size(comm)
@@ -105,7 +115,8 @@ scattered over the devices such that `recvbuf` on each rank will contain the
# External links
- [`ncclReduceScatter`](https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/api/colls.html#ncclreducescatter)
"""
function ReduceScatter!(sendbuf, recvbuf, op, comm::Communicator; stream::CuStream=default_device_stream(comm) )
function ReduceScatter!(sendbuf, recvbuf, op, comm::Communicator;
stream::CuStream=default_device_stream(comm))
recvcount = length(recvbuf)
@assert length(sendbuf) == recvcount * size(comm)
data_type = ncclDataType_t(eltype(recvbuf))
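The asserts above encode the size contract: with `k = NCCL.size(comm)`, `Allgather!` expects `length(recvbuf) == k * length(sendbuf)`, and `ReduceScatter!` expects the inverse. A sketch for `Allgather!` under the same assumed clique setup:

```julia
chunk    = 4
k        = length(devs)
sendbufs = [(device!(devs[i]); CUDA.fill(Float32(i), chunk))   for i in 1:k]
recvbufs = [(device!(devs[i]); CUDA.zeros(Float32, chunk * k)) for i in 1:k]  # one chunk per rank

NCCL.groupStart()
for i in 1:k
    device!(devs[i])
    NCCL.Allgather!(sendbufs[i], recvbufs[i], comms[i])
end
NCCL.groupEnd()
# recvbufs[i] == [1,1,1,1, 2,2,2,2, ..., k,k,k,k] on every rank;
# ReduceScatter! goes the other way, leaving `chunk` reduced elements per rank
```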
37 changes: 26 additions & 11 deletions src/communicator.jl
@@ -22,25 +22,42 @@ function destroy(comm::Communicator)
end
Base.unsafe_convert(::Type{LibNCCL.ncclComm_t}, comm::Communicator) = comm.handle

# creates a new communicator (multi thread/process version)
function Communicator(nranks::Integer, comm_id::UniqueID, rank::Integer)
"""
NCCL.Communicator(nranks, rank; [unique_id]) :: Communicator

Create a single communicator for use in a multi-threaded or multi-process
environment. `nranks` is the number of ranks in the communicator, and `rank`
is the 0-based index of the current rank. `unique_id` is an optional identifier
for the communicator; every rank of a communicator must be constructed with the
same ID, so for multiple processes it should be generated once and shared.

# Examples
```
id = NCCL.UniqueID()    # generate on one rank and share it with the others
comm = Communicator(length(CUDA.devices()), myid(); unique_id=id)
# this blocks until all other ranks have connected
```

# External links
- [`ncclCommInitRank`](https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/api/comms.html#ncclCommInitRank)
"""
function Communicator(nranks::Integer, rank::Integer;
unique_id::UniqueID=UniqueID())
0 <= rank < nranks || throw(ArgumentError("rank must be in [0, nranks)"))
handle_ref = Ref{ncclComm_t}(C_NULL)
ncclCommInitRank(handle_ref, nranks, comm_id, rank)
ncclCommInitRank(handle_ref, nranks, unique_id, rank)
c = Communicator(handle_ref[])
return finalizer(destroy, c)
end
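For a multi-process run, the unique ID has to be created once and handed to every rank before the constructor is called. A hypothetical sketch using Distributed; the process-to-GPU mapping and the remote-call plumbing are assumptions for illustration, not part of this PR:

```julia
using Distributed
addprocs(2)                      # say, one worker process per GPU
@everywhere using CUDA, NCCL

id     = NCCL.UniqueID()         # generated once on the driver process
nranks = nworkers()

@sync for (rank, pid) in enumerate(workers())
    @async remotecall_wait(pid, rank - 1, id) do r, uid
        CUDA.device!(r)          # assumes rank r uses GPU ordinal r
        comm = NCCL.Communicator(nranks, r; unique_id = uid)
        # ... collectives for this rank go here ...
    end
end
```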

# creates a clique of communicators (single process version)
"""
NCCL.Communicators(devices) :: Vector{Communicator}

Construct and initialize a clique of NCCL Communicators.
Construct and initialize a clique of NCCL Communicators over the devices
on a single host.

`devices` can be either a collection of device identifiers or of `CuDevice`s.

# Examples
```
# initialize a clique over all devices on the host
comms = NCCL.Communicators(CUDA.devices())
```

@@ -64,20 +81,19 @@ function Communicators(devices)
end

"""
CuDevice(comm::Communicator) :: CuDevice
NCCL.device(comm::Communicator) :: CuDevice

The device associated with the communicator.

# External Links
- [`ncclCommCuDevice`](https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/api/comms.html#ncclcommcudevice)
"""
function CUDA.CuDevice(comm::Communicator)
function device(comm::Communicator)
Comment on lines -74 to +91
@maleadt (Member, Author) commented on Feb 22, 2024:
cc @simonbyrne, it seemed questionable to map this onto the CuDevice constructor, while CUDA.jl itself uses device()-like functions for e.g. querying the current device.

dev_ref = Ref{Cint}(C_NULL)
ncclCommCuDevice(comm, dev_ref)
return CuDevice(dev_ref[])
end


"""
NCCL.size(comm::Communicator) :: Int

@@ -120,15 +136,14 @@ function abort(comm::Communicator)
return
end


"""
NCCL.default_device_stream(comm::Communicator) :: CuStream

Get the default stream for the device corresponding to communicator `comm`.
"""
function default_device_stream(comm::Communicator)
dev = CuDevice(comm)
dev = device(comm)
device!(dev) do
stream()
end
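All of the collectives above default their `stream` keyword to this function, i.e. to the CUDA.jl default stream of the communicator's device. A caller can pass a dedicated stream instead; a minimal sketch, assuming `sendbuf`, `recvbuf` and `comm` from the earlier examples:

```julia
s = CuStream()                                      # a dedicated stream on the current device
NCCL.Allreduce!(sendbuf, recvbuf, +, comm; stream = s)
synchronize(s)                                      # wait for the collective to finish
```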
10 changes: 6 additions & 4 deletions src/pointtopoint.jl
@@ -2,7 +2,7 @@
NCCL.Send(
sendbuf, comm::Communicator;
dest::Integer,
stream::CuStream = default_device_stream(comm))
stream::CuStream = default_device_stream(comm))
)

Send data from `sendbuf` to rank `dest`. A matching [`Recv!`](@ref) must also be
@@ -11,7 +11,8 @@ called.
# External links
- [`ncclSend`](https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/api/p2p.html#ncclsend)
"""
function Send(sendbuf, comm::Communicator; dest::Integer, stream::CuStream=default_device_stream(comm))
function Send(sendbuf, comm::Communicator; dest::Integer,
stream::CuStream=default_device_stream(comm))
count = length(sendbuf)
datatype = ncclDataType_t(eltype(sendbuf))
ncclSend(sendbuf, count, datatype, dest, comm, stream)
@@ -22,7 +23,7 @@ end
NCCL.Recv!(
recvbuf, comm::Communicator;
source::Integer,
stream::CuStream = default_device_stream(comm))
stream::CuStream = default_device_stream(comm))
)

Write the data from a matching [`Send`](@ref) on rank `source` into `recvbuf`.
Expand All @@ -31,7 +32,8 @@ Write the data from a matching [`Send`](@ref) on rank `source` into `recvbuf`.
- [`ncclRecv`](https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/api/p2p.html#ncclrecv)

"""
function Recv!(recvbuf, comm::Communicator; source::Integer, stream::CuStream=default_device_stream(comm))
function Recv!(recvbuf, comm::Communicator; source::Integer,
stream::CuStream=default_device_stream(comm))
count = length(recvbuf)
datatype = ncclDataType_t(eltype(recvbuf))
ncclRecv(recvbuf, count, datatype, source, comm, stream)
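As an illustration of matching `Send`/`Recv!` pairs, here is a ring-exchange sketch over the same assumed single-host clique; issuing both calls per rank inside a group avoids the deadlock that sequential point-to-point calls from one process would cause:

```julia
n        = length(devs)
sendbufs = [(device!(devs[i]); CUDA.fill(Float32(i), 16)) for i in 1:n]
recvbufs = [(device!(devs[i]); CUDA.zeros(Float32, 16))   for i in 1:n]

# rank r sends to rank (r+1) % n and receives from rank (r-1) % n
NCCL.groupStart()
for i in 1:n
    device!(devs[i])                                           # rank of comms[i] is i - 1
    NCCL.Send(sendbufs[i],  comms[i]; dest   = mod(i, n))
    NCCL.Recv!(recvbufs[i], comms[i]; source = mod(i - 2, n))
end
NCCL.groupEnd()
# recvbufs[i] now holds the data sent by the previous rank in the ring
```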
11 changes: 6 additions & 5 deletions test/runtests.jl
@@ -9,16 +9,17 @@ using NCCL
@testset "NCCL" begin

@testset "Communicator" begin
# clique of communicators
comms = NCCL.Communicators(CUDA.devices())
for (i,dev) in enumerate(CUDA.devices())
@test NCCL.rank(comms[i]) == i-1
@test CuDevice(comms[i]) == dev
@test NCCL.device(comms[i]) == dev
@test NCCL.size(comms[i]) == length(CUDA.devices())
end
id = NCCL.UniqueID()
#=num_devs = length(CUDA.devices())
comm = Communicator(num_devs, id, 0)
@test device(comm) == 0=#

# single communicator (with nranks=1 or this would block)
comm = Communicator(1, 0)
@test NCCL.device(comm) == CuDevice(0)
end

@testset "Allreduce!" begin