Merge pull request #51 from JuliaGPU/tb/release
Prepare for initial release
maleadt authored Feb 22, 2024
2 parents 039bc01 + 08fddad commit d004e29
Showing 7 changed files with 63 additions and 36 deletions.
4 changes: 2 additions & 2 deletions Project.toml
@@ -1,6 +1,6 @@
name = "NCCL"
uuid = "3fe64909-d7a1-4096-9b7d-7a0f12cf0f6b"
version = "0.2.0"
version = "0.1.0"

[deps]
CEnum = "fa961155-64e5-5f13-b03f-caf6b980ea82"
@@ -9,4 +9,4 @@ NCCL_jll = "4d6d38e4-5b87-5e63-912a-873ff2d649b7"

[compat]
CEnum = "0.2, 0.3, 0.4, 0.5"
julia = "1.6"
julia = "1.8"
7 changes: 4 additions & 3 deletions docs/src/index.md
@@ -1,6 +1,6 @@
# NCCL.jl

A Julia wrapper for the [NVIDIA Collective Communications Library (NCCL)](https://developer.nvidia.com/nccl).
A Julia wrapper for the [NVIDIA Collective Communications Library (NCCL)](https://developer.nvidia.com/nccl).

# API

@@ -18,8 +18,9 @@ version()
## Communicators

```@docs
Communicator
Communicators
CUDA.CuDevice(comm::Communicator)
device(comm::Communicator)
size(comm::Communicator)
rank(comm::Communicator)
abort(comm::Communicator)
@@ -50,4 +51,4 @@ avg
```@docs
Send
Recv!
```
```
3 changes: 0 additions & 3 deletions src/base.jl
@@ -17,9 +17,6 @@ function version()
ncclGetVersion(ver_r)
ver = ver_r[]

# nccl.h defines this as:
#define NCCL_VERSION(X,Y,Z) (((X) <= 2 && (Y) <= 8) ? (X) * 1000 + (Y) * 100 + (Z) : (X) * 10000 + (Y) * 100 + (Z))

if ver < 2900
major, ver = divrem(ver, 1000)
minor, patch = divrem(ver, 100)
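The comment removed here documented how nccl.h packs the version number: `NCCL_VERSION(X,Y,Z)` is `X*1000 + Y*100 + Z` for versions up to 2.8 and `X*10000 + Y*100 + Z` afterwards. A standalone sketch of the round trip (the `else` branch below is inferred from the macro; the corresponding lines of `version()` fall outside the hunk shown):

```julia
# Encode as nccl.h's NCCL_VERSION macro does, then decode as version() does.
encode(x, y, z) = (x <= 2 && y <= 8) ? x*1000 + y*100 + z : x*10000 + y*100 + z

function decode(ver)
    if ver < 2900
        major, ver = divrem(ver, 1000)
        minor, patch = divrem(ver, 100)
    else
        # assumed: newer versions use a 10000 factor, per the macro above
        major, ver = divrem(ver, 10000)
        minor, patch = divrem(ver, 100)
    end
    VersionNumber(major, minor, patch)
end

decode(encode(2, 8, 4))   # v"2.8.4"
decode(encode(2, 19, 3))  # v"2.19.3"
```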
27 changes: 19 additions & 8 deletions src/collective.jl
@@ -9,7 +9,8 @@ or [`NCCL.avg`](@ref)), writing the result to `recvbuf` on all ranks.
# External links
- [`ncclAllReduce`](https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/api/colls.html#ncclallreduce)
"""
function Allreduce!(sendbuf, recvbuf, op, comm::Communicator; stream::CuStream=default_device_stream(comm))
function Allreduce!(sendbuf, recvbuf, op, comm::Communicator;
stream::CuStream=default_device_stream(comm))
count = length(recvbuf)
@assert length(sendbuf) == count
data_type = ncclDataType_t(eltype(recvbuf))
@@ -27,8 +28,10 @@ end
Reduce the array `sendrecvbuf` using `op` (one of `+`, `*`, `min`, `max`,
or [`NCCL.avg`](@ref)), writing the result in place on all ranks.
"""
Allreduce!(sendrecvbuf, op, comm::Communicator; stream::CuStream=default_device_stream(comm) ) =
function Allreduce!(sendrecvbuf, op, comm::Communicator;
stream::CuStream=default_device_stream(comm))
Allreduce!(sendrecvbuf, sendrecvbuf, op, comm; stream)
end
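For illustration (not part of this diff), a minimal usage sketch of the out-of-place `Allreduce!`: a single-rank communicator is used so the call completes without peer processes, mirroring the pattern in test/runtests.jl, and the buffer size is arbitrary.

```julia
using CUDA, NCCL

comm    = NCCL.Communicator(1, 0)          # nranks = 1, rank = 0: no peers to wait for
sendbuf = CUDA.ones(Float32, 1024)
recvbuf = CUDA.zeros(Float32, 1024)
NCCL.Allreduce!(sendbuf, recvbuf, +, comm) # runs on the communicator's default stream
CUDA.synchronize()                         # the collective executes asynchronously
```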

"""
NCCL.Broadcast!(
@@ -41,14 +44,17 @@ Copies the array `sendbuf` on rank `root` to `recvbuf` on all ranks.
# External links
- [`ncclBroadcast`](https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/api/colls.html#ncclbroadcast)
"""
function Broadcast!(sendbuf, recvbuf, comm::Communicator; root::Integer=0, stream::CuStream=default_device_stream(comm))
function Broadcast!(sendbuf, recvbuf, comm::Communicator; root::Integer=0,
stream::CuStream=default_device_stream(comm))
data_type = ncclDataType_t(eltype(recvbuf))
count = length(recvbuf)
ncclBroadcast(sendbuf, recvbuf, count, data_type, root, comm, stream)
return recvbuf
end
Broadcast!(sendrecvbuf, comm::Communicator; root::Integer=0, stream::CuStream=default_device_stream(comm)) =
function Broadcast!(sendrecvbuf, comm::Communicator; root::Integer=0,
stream::CuStream=default_device_stream(comm))
Broadcast!(sendrecvbuf, sendrecvbuf, comm; root, stream)
end


"""
@@ -63,15 +69,18 @@ or [`NCCL.avg`](@ref)), writing the result to `recvbuf` on rank `root`.
# External links
- [`ncclReduce`](https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/api/colls.html#ncclreduce)
"""
function Reduce!(sendbuf, recvbuf, op, comm::Communicator; root::Integer=0, stream::CuStream=default_device_stream(comm))
function Reduce!(sendbuf, recvbuf, op, comm::Communicator; root::Integer=0,
stream::CuStream=default_device_stream(comm))
data_type = ncclDataType_t(eltype(recvbuf))
count = length(recvbuf)
_op = ncclRedOp_t(op)
ncclReduce(sendbuf, recvbuf, count, data_type, _op, root, comm, stream)
return recvbuf
end
Reduce!(sendrecvbuf, op, comm::Communicator; root::Integer=0, stream::CuStream=default_device_stream(comm)) =
function Reduce!(sendrecvbuf, op, comm::Communicator; root::Integer=0,
stream::CuStream=default_device_stream(comm))
Reduce!(sendrecvbuf, sendrecvbuf, op, comm; root, stream)
end
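`Broadcast!` and `Reduce!` above both take the root rank as a keyword (default 0). A minimal rooted sketch, again with a single-rank communicator so rank 0 is both root and only participant; sizes are arbitrary:

```julia
using CUDA, NCCL

comm = NCCL.Communicator(1, 0)
buf  = CUDA.rand(Float32, 64)
out  = CUDA.zeros(Float32, 64)
NCCL.Broadcast!(buf, out, comm; root = 0)  # copy root's buffer to every rank
NCCL.Reduce!(buf, out, +, comm; root = 0)  # combine every rank's buffer onto the root
CUDA.synchronize()
```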

"""
NCCL.Allgather!(
@@ -84,7 +93,8 @@ Concatenate `sendbuf` from each rank into `recvbuf` on all ranks.
# External links
- [`ncclAllGather`](https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/api/colls.html#ncclallgather)
"""
function Allgather!(sendbuf, recvbuf, comm::Communicator; stream::CuStream=default_device_stream(comm))
function Allgather!(sendbuf, recvbuf, comm::Communicator;
stream::CuStream=default_device_stream(comm))
data_type = ncclDataType_t(eltype(recvbuf))
sendcount = length(sendbuf)
@assert length(recvbuf) == sendcount * size(comm)
@@ -105,7 +115,8 @@ scattered over the devices such that `recvbuf` on each rank will contain the
# External links
- [`ncclReduceScatter`](https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/api/colls.html#ncclreducescatter)
"""
function ReduceScatter!(sendbuf, recvbuf, op, comm::Communicator; stream::CuStream=default_device_stream(comm) )
function ReduceScatter!(sendbuf, recvbuf, op, comm::Communicator;
stream::CuStream=default_device_stream(comm))
recvcount = length(recvbuf)
@assert length(sendbuf) == recvcount * size(comm)
data_type = ncclDataType_t(eltype(recvbuf))
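`Allgather!` and `ReduceScatter!` come with a sizing contract, enforced by the assertions in their bodies: `length(recvbuf) == length(sendbuf) * size(comm)` for the former, `length(sendbuf) == length(recvbuf) * size(comm)` for the latter. A sketch of allocating matching buffers (single-rank communicator, arbitrary chunk size; not from this diff):

```julia
using CUDA, NCCL

comm   = NCCL.Communicator(1, 0)
n      = 256                    # elements contributed per rank
nranks = NCCL.size(comm)

# Allgather!: each rank sends n elements and receives n * nranks elements.
sendbuf = CUDA.rand(Float32, n)
recvbuf = CUDA.zeros(Float32, n * nranks)
NCCL.Allgather!(sendbuf, recvbuf, comm)

# ReduceScatter!: each rank sends n * nranks elements and receives n elements.
big   = CUDA.rand(Float32, n * nranks)
small = CUDA.zeros(Float32, n)
NCCL.ReduceScatter!(big, small, +, comm)
CUDA.synchronize()
```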
37 changes: 26 additions & 11 deletions src/communicator.jl
@@ -22,25 +22,42 @@ function destroy(comm::Communicator)
end
Base.unsafe_convert(::Type{LibNCCL.ncclComm_t}, comm::Communicator) = comm.handle

# creates a new communicator (multi thread/process version)
function Communicator(nranks::Integer, comm_id::UniqueID, rank::Integer)
"""
NCCL.Communicator(nranks, rank; [unique_id]) :: Communicator
Create a single communicator for use in a multi-threaded or multi-process
environment. `nranks` is the number of ranks in the communicator, and `rank`
is the 0-based index of the current rank. `unique_id` is an optional unique
identifier for the communicator.
# Examples
```
# `id` is a NCCL.UniqueID() shared with all ranks ahead of time
comm = Communicator(length(CUDA.devices()), myid() - 1; unique_id = id)
# this call blocks until all other ranks have connected
```
# External links
- [`ncclCommInitRank`](https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/api/comms.html#ncclCommInitRank)
"""
function Communicator(nranks::Integer, rank::Integer;
unique_id::UniqueID=UniqueID())
0 <= rank < nranks || throw(ArgumentError("rank must be in [0, nranks)"))
handle_ref = Ref{ncclComm_t}(C_NULL)
ncclCommInitRank(handle_ref, nranks, comm_id, rank)
ncclCommInitRank(handle_ref, nranks, unique_id, rank)
c = Communicator(handle_ref[])
return finalizer(destroy, c)
end
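How the `unique_id` reaches the other ranks is up to the caller: rank 0 typically creates it and distributes it out of band. The sketch below uses MPI.jl as that transport purely as an assumption for illustration (this package does not depend on MPI), and assumes the `UniqueID` object can be serialized by `MPI.bcast`:

```julia
using MPI, CUDA, NCCL

MPI.Init()
mpicomm = MPI.COMM_WORLD
myrank  = MPI.Comm_rank(mpicomm)   # 0-based, matching NCCL's rank convention
nranks  = MPI.Comm_size(mpicomm)

# rank 0 creates the identifier; every other rank receives a copy of it
id = myrank == 0 ? NCCL.UniqueID() : nothing
id = MPI.bcast(id, mpicomm; root = 0)

CUDA.device!(myrank % length(CUDA.devices()))
comm = NCCL.Communicator(nranks, myrank; unique_id = id)  # blocks until all ranks join
```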

# creates a clique of communicators (single process version)
"""
NCCL.Communicators(devices) :: Vector{Communicator}
Construct and initialize a clique of NCCL Communicators.
Construct and initialize a clique of NCCL Communicators over the devices
on a single host.
`devices` can either be a collection of identifiers, or `CuDevice`s.
# Examples
```
# initialize a clique over all devices on the host
comms = NCCL.Communicators(CUDA.devices())
```
@@ -64,20 +81,19 @@ function Communicators(devices)
end

"""
CuDevice(comm::Communicator) :: CuDevice
NCCL.device(comm::Communicator) :: CuDevice
The device of the communicator
# External Links
- [`ncclCommCuDevice`](https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/api/comms.html#ncclcommcudevice)
"""
function CUDA.CuDevice(comm::Communicator)
function device(comm::Communicator)
dev_ref = Ref{Cint}(C_NULL)
ncclCommCuDevice(comm, dev_ref)
return CuDevice(dev_ref[])
end


"""
NCCL.size(comm::Communicator) :: Int
@@ -120,15 +136,14 @@ function abort(comm::Communicator)
return
end


"""
NCCL.default_device_stream(comm::Communicator) :: CuStream
Get the default stream for the device associated with communicator `comm`.
"""
function default_device_stream(comm::Communicator)
dev = CuDevice(comm)
dev = device(comm)
device!(dev) do
stream()
end
10 changes: 6 additions & 4 deletions src/pointtopoint.jl
@@ -2,7 +2,7 @@
NCCL.Send(
sendbuf, comm::Communicator;
dest::Integer,
stream::CuStream = default_device_stream(comm))
stream::CuStream = default_device_stream(comm))
)
Send data from `sendbuf` to rank `dest`. A matching [`Recv!`](@ref) must also be
@@ -11,7 +11,8 @@ called.
# External links
- [`ncclSend`](https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/api/p2p.html#ncclsend)
"""
function Send(sendbuf, comm::Communicator; dest::Integer, stream::CuStream=default_device_stream(comm))
function Send(sendbuf, comm::Communicator; dest::Integer,
stream::CuStream=default_device_stream(comm))
count = length(sendbuf)
datatype = ncclDataType_t(eltype(sendbuf))
ncclSend(sendbuf, count, datatype, dest, comm, stream)
@@ -22,7 +23,7 @@ end
NCCL.Recv!(
recvbuf, comm::Communicator;
source::Integer,
stream::CuStream = default_device_stream(comm))
stream::CuStream = default_device_stream(comm))
)
Write the data from a matching [`Send`](@ref) on rank `source` into `recvbuf`.
@@ -31,7 +32,8 @@ Write the data from a matching [`Send`](@ref) on rank `source` into `recvbuf`.
- [`ncclRecv`](https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/api/p2p.html#ncclrecv)
"""
function Recv!(recvbuf, comm::Communicator; source::Integer, stream::CuStream=default_device_stream(comm))
function Recv!(recvbuf, comm::Communicator; source::Integer,
stream::CuStream=default_device_stream(comm))
count = length(recvbuf)
datatype = ncclDataType_t(eltype(recvbuf))
ncclRecv(recvbuf, count, datatype, source, comm, stream)
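A sketch of a matched `Send`/`Recv!` pair on a single host, driving two devices from one process (requires at least two GPUs; not taken from this diff). Grouping the two calls is assumed to go through `NCCL.groupStart()`/`NCCL.groupEnd()` wrappers over `ncclGroupStart`/`ncclGroupEnd`, since one thread issuing both sides of a point-to-point exchange must batch them to avoid a deadlock:

```julia
using CUDA, NCCL

devs  = collect(CUDA.devices())[1:2]
comms = NCCL.Communicators(devs)            # rank i-1 lives on devs[i]

bufs = map(enumerate(comms)) do (i, comm)
    CUDA.device!(NCCL.device(comm)) do
        i == 1 ? CUDA.ones(Float32, 16) : CUDA.zeros(Float32, 16)
    end
end

NCCL.groupStart()                           # assumed wrapper over ncclGroupStart
NCCL.Send(bufs[1], comms[1]; dest = 1)      # rank 0 sends ...
NCCL.Recv!(bufs[2], comms[2]; source = 0)   # ... rank 1 receives
NCCL.groupEnd()                             # assumed wrapper over ncclGroupEnd
# synchronize each device's stream before reading the results back
```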
11 changes: 6 additions & 5 deletions test/runtests.jl
@@ -9,16 +9,17 @@ using NCCL
@testset "NCCL" begin

@testset "Communicator" begin
# clique of communicators
comms = NCCL.Communicators(CUDA.devices())
for (i,dev) in enumerate(CUDA.devices())
@test NCCL.rank(comms[i]) == i-1
@test CuDevice(comms[i]) == dev
@test NCCL.device(comms[i]) == dev
@test NCCL.size(comms[i]) == length(CUDA.devices())
end
id = NCCL.UniqueID()
#=num_devs = length(CUDA.devices())
comm = Communicator(num_devs, id, 0)
@test device(comm) == 0=#

# single communicator (with nranks=1 or this would block)
comm = Communicator(1, 0)
@test NCCL.device(comm) == CuDevice(0)
end

@testset "Allreduce!" begin
