From 033d3af5447b063404867d80ef7f654a64cf28f4 Mon Sep 17 00:00:00 2001
From: Tim Besard
Date: Thu, 22 Feb 2024 02:30:44 -0500
Subject: [PATCH 1/6] Don't pirate CUDA.CuDevice.

---
 src/communicator.jl | 6 +++---
 test/runtests.jl    | 2 +-
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/src/communicator.jl b/src/communicator.jl
index b342a23..901b645 100644
--- a/src/communicator.jl
+++ b/src/communicator.jl
@@ -64,14 +64,14 @@ function Communicators(devices)
 end
 
 """
-    CuDevice(comm::Communicator) :: CuDevice
+    NCCL.device(comm::Communicator) :: CuDevice
 
 The device of the communicator
 
 # External Links
 - [`ncclCommCuDevice`](https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/api/comms.html#ncclcommcudevice)
 """
-function CUDA.CuDevice(comm::Communicator)
+function device(comm::Communicator)
     dev_ref = Ref{Cint}(C_NULL)
     ncclCommCuDevice(comm, dev_ref)
     return CuDevice(dev_ref[])
@@ -128,7 +128,7 @@ Get the default stream for device `devid`, or the device corresponding to
 communicator `comm`.
 """
 function default_device_stream(comm::Communicator)
-    dev = CuDevice(comm)
+    dev = device(comm)
     device!(dev) do
         stream()
     end
diff --git a/test/runtests.jl b/test/runtests.jl
index 1ae382b..403e92a 100644
--- a/test/runtests.jl
+++ b/test/runtests.jl
@@ -12,7 +12,7 @@ using NCCL
     comms = NCCL.Communicators(CUDA.devices())
     for (i,dev) in enumerate(CUDA.devices())
         @test NCCL.rank(comms[i]) == i-1
-        @test CuDevice(comms[i]) == dev
+        @test NCCL.device(comms[i]) == dev
         @test NCCL.size(comms[i]) == length(CUDA.devices())
     end
     id = NCCL.UniqueID()

From 7af04cb9e188e0b5a62154a28be9074a99b02a99 Mon Sep 17 00:00:00 2001
From: Tim Besard
Date: Thu, 22 Feb 2024 02:31:25 -0500
Subject: [PATCH 2/6] Revert version bump.

---
 Project.toml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Project.toml b/Project.toml
index a83a708..ae61437 100644
--- a/Project.toml
+++ b/Project.toml
@@ -1,6 +1,6 @@
 name = "NCCL"
 uuid = "3fe64909-d7a1-4096-9b7d-7a0f12cf0f6b"
-version = "0.2.0"
+version = "0.1.0"
 
 [deps]
 CEnum = "fa961155-64e5-5f13-b03f-caf6b980ea82"
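After patch 1, the device behind a communicator is queried with `NCCL.device` instead of a pirated `CUDA.CuDevice` method; the return value is still a `CuDevice`. A minimal sketch of the resulting usage (illustrative, not part of the patches), assuming a host with at least one CUDA device:

```
using CUDA, NCCL

# one communicator per local device (single-process clique)
comms = NCCL.Communicators(CUDA.devices())

# the communicator's device is now queried through the library's own accessor
dev = NCCL.device(comms[1])
@assert dev isa CuDevice
```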
From 9589d5551bfc3b5f6a1d1a39376d7e8e3ed98a2b Mon Sep 17 00:00:00 2001
From: Tim Besard
Date: Thu, 22 Feb 2024 02:34:55 -0500
Subject: [PATCH 3/6] Format.

---
 src/base.jl         |  3 ---
 src/collective.jl   | 27 +++++++++++++++++++--------
 src/communicator.jl |  2 --
 src/pointtopoint.jl | 10 ++++++----
 4 files changed, 25 insertions(+), 17 deletions(-)

diff --git a/src/base.jl b/src/base.jl
index 2a02e5f..c8a58bb 100644
--- a/src/base.jl
+++ b/src/base.jl
@@ -17,9 +17,6 @@ function version()
     ncclGetVersion(ver_r)
     ver = ver_r[]
 
-    # nccl.h defines this as:
-    #define NCCL_VERSION(X,Y,Z) (((X) <= 2 && (Y) <= 8) ? (X) * 1000 + (Y) * 100 + (Z) : (X) * 10000 + (Y) * 100 + (Z))
-
     if ver < 2900
         major, ver = divrem(ver, 1000)
         minor, patch = divrem(ver, 100)
diff --git a/src/collective.jl b/src/collective.jl
index 3893523..054fa3a 100644
--- a/src/collective.jl
+++ b/src/collective.jl
@@ -9,7 +9,8 @@ or [`NCCL.avg`](@ref)), writing the result to `recvbuf` to all ranks.
 # External links
 - [`ncclAllReduce`](https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/api/colls.html#ncclallreduce)
 """
-function Allreduce!(sendbuf, recvbuf, op, comm::Communicator; stream::CuStream=default_device_stream(comm))
+function Allreduce!(sendbuf, recvbuf, op, comm::Communicator;
+                    stream::CuStream=default_device_stream(comm))
     count = length(recvbuf)
     @assert length(sendbuf) == count
     data_type = ncclDataType_t(eltype(recvbuf))
@@ -27,8 +28,10 @@
 Reduce the array `sendrecvbuf` using `op` (one of `+`, `*`, `min`, `max`,
 or [`NCCL.avg`](@ref)), writing the result in-place to all ranks.
 """
-Allreduce!(sendrecvbuf, op, comm::Communicator; stream::CuStream=default_device_stream(comm) ) =
+function Allreduce!(sendrecvbuf, op, comm::Communicator;
+                    stream::CuStream=default_device_stream(comm))
     Allreduce!(sendrecvbuf, sendrecvbuf, op, comm; stream)
+end
@@ -41,14 +44,17 @@
 """
     NCCL.Broadcast!(
         sendbuf, recvbuf, comm::Communicator;
         root::Integer = 0, stream::CuStream = default_device_stream(comm))
 
 Copies the array `sendbuf` on rank `root` to `recvbuf` on all ranks.
 
 # External links
 - [`ncclBroadcast`](https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/api/colls.html#ncclbroadcast)
 """
-function Broadcast!(sendbuf, recvbuf, comm::Communicator; root::Integer=0, stream::CuStream=default_device_stream(comm))
+function Broadcast!(sendbuf, recvbuf, comm::Communicator; root::Integer=0,
+                    stream::CuStream=default_device_stream(comm))
     data_type = ncclDataType_t(eltype(recvbuf))
     count = length(recvbuf)
     ncclBroadcast(sendbuf, recvbuf, count, data_type, root, comm, stream)
     return recvbuf
 end
-Broadcast!(sendrecvbuf, comm::Communicator; root::Integer=0, stream::CuStream=default_device_stream(comm)) =
+function Broadcast!(sendrecvbuf, comm::Communicator; root::Integer=0,
+                    stream::CuStream=default_device_stream(comm))
     Broadcast!(sendrecvbuf, sendrecvbuf, comm; root, stream)
+end
@@ -63,15 +69,18 @@
 """
     NCCL.Reduce!(
         sendbuf, recvbuf, op, comm::Communicator;
         root::Integer = 0, stream::CuStream = default_device_stream(comm))
 
 Reduce the array `sendbuf` using `op` (one of `+`, `*`, `min`, `max`,
 or [`NCCL.avg`](@ref)), writing the result to `recvbuf` on rank `root`.
 
 # External links
 - [`ncclReduce`](https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/api/colls.html#ncclreduce)
 """
-function Reduce!(sendbuf, recvbuf, op, comm::Communicator; root::Integer=0, stream::CuStream=default_device_stream(comm))
+function Reduce!(sendbuf, recvbuf, op, comm::Communicator; root::Integer=0,
+                 stream::CuStream=default_device_stream(comm))
     data_type = ncclDataType_t(eltype(recvbuf))
     count = length(recvbuf)
     _op = ncclRedOp_t(op)
     ncclReduce(sendbuf, recvbuf, count, data_type, _op, root, comm, stream)
     return recvbuf
 end
-Reduce!(sendrecvbuf, op, comm::Communicator; root::Integer=0, stream::CuStream=default_device_stream(comm)) =
+function Reduce!(sendrecvbuf, op, comm::Communicator; root::Integer=0,
+                 stream::CuStream=default_device_stream(comm))
     Reduce!(sendrecvbuf, sendrecvbuf, op, comm; root, stream)
+end
@@ -84,7 +93,8 @@
 """
     NCCL.Allgather!(
        sendbuf, recvbuf, comm::Communicator;
        stream::CuStream = default_device_stream(comm))
 
 Concatenate `sendbuf` from each rank into `recvbuf` on all ranks.
 
 # External links
 - [`ncclAllGather`](https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/api/colls.html#ncclallgather)
 """
-function Allgather!(sendbuf, recvbuf, comm::Communicator; stream::CuStream=default_device_stream(comm))
+function Allgather!(sendbuf, recvbuf, comm::Communicator;
+                    stream::CuStream=default_device_stream(comm))
     data_type = ncclDataType_t(eltype(recvbuf))
     sendcount = length(sendbuf)
     @assert length(recvbuf) == sendcount * size(comm)
@@ -105,7 +115,8 @@ scattered over the devices such that `recvbuf` on each rank will contain the
 # External links
 - [`ncclReduceScatter`](https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/api/colls.html#ncclreducescatter)
 """
-function ReduceScatter!(sendbuf, recvbuf, op, comm::Communicator; stream::CuStream=default_device_stream(comm) )
+function ReduceScatter!(sendbuf, recvbuf, op, comm::Communicator;
+                        stream::CuStream=default_device_stream(comm))
     recvcount = length(recvbuf)
     @assert length(sendbuf) == recvcount * size(comm)
     data_type = ncclDataType_t(eltype(recvbuf))
diff --git a/src/communicator.jl b/src/communicator.jl
index 901b645..7168fd5 100644
--- a/src/communicator.jl
+++ b/src/communicator.jl
@@ -77,7 +77,6 @@ function device(comm::Communicator)
     return CuDevice(dev_ref[])
 end
 
-
 """
     NCCL.size(comm::Communicator) :: Int
 
@@ -120,7 +119,6 @@ function abort(comm::Communicator)
     return
 end
 
-
 """
     NCCL.default_device_stream(comm::Communicator) :: CuStream
 
diff --git a/src/pointtopoint.jl b/src/pointtopoint.jl
index fdd24da..c0d9544 100644
--- a/src/pointtopoint.jl
+++ b/src/pointtopoint.jl
@@ -2,7 +2,7 @@
     NCCL.Send(
         sendbuf, comm::Communicator;
         dest::Integer,
-        stream::CuStream = default_device_stream(comm)) )
+        stream::CuStream = default_device_stream(comm))
 
 Send data from `sendbuf` to rank `dest`. A matching [`Recv!`](@ref) must also be
 called.
 
 # External links
 - [`ncclSend`](https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/api/p2p.html#ncclsend)
 """
-function Send(sendbuf, comm::Communicator; dest::Integer, stream::CuStream=default_device_stream(comm))
+function Send(sendbuf, comm::Communicator; dest::Integer,
+              stream::CuStream=default_device_stream(comm))
     count = length(sendbuf)
     datatype = ncclDataType_t(eltype(sendbuf))
     ncclSend(sendbuf, count, datatype, dest, comm, stream)
@@ -22,7 +23,7 @@
     NCCL.Recv!(
         recvbuf, comm::Communicator;
         source::Integer,
-        stream::CuStream = default_device_stream(comm)) )
+        stream::CuStream = default_device_stream(comm))
 
 Write the data from a matching [`Send`](@ref) on rank `source` into `recvbuf`.
 
 # External links
 - [`ncclRecv`](https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/api/p2p.html#ncclrecv)
 """
-function Recv!(recvbuf, comm::Communicator; source::Integer, stream::CuStream=default_device_stream(comm))
+function Recv!(recvbuf, comm::Communicator; source::Integer,
+               stream::CuStream=default_device_stream(comm))
     count = length(recvbuf)
     datatype = ncclDataType_t(eltype(recvbuf))
     ncclRecv(recvbuf, count, datatype, source, comm, stream)
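The signatures reformatted above all take the target `stream` as a keyword argument defaulting to `default_device_stream(comm)`. The following sketch passes explicit streams to `Allreduce!`; the buffers, the dedicated streams, and the `NCCL.groupStart`/`NCCL.groupEnd` pairing (needed when a single thread drives several ranks) are illustrative assumptions, not part of the patch:

```
using CUDA, NCCL

comms = NCCL.Communicators(CUDA.devices())

# one buffer and one dedicated stream per device
bufs = map(comms) do comm
    device!(NCCL.device(comm)) do
        CUDA.ones(Float32, 1024)
    end
end
streams = map(comms) do comm
    device!(NCCL.device(comm)) do
        CuStream()
    end
end

# issue the per-rank calls inside one group so they cannot block each other
NCCL.groupStart()
for (comm, buf, stream) in zip(comms, bufs, streams)
    NCCL.Allreduce!(buf, +, comm; stream)
end
NCCL.groupEnd()
```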
From 080655621f2e899b380c57bd4144e70faafdc20c Mon Sep 17 00:00:00 2001
From: Tim Besard
Date: Thu, 22 Feb 2024 03:46:54 -0500
Subject: [PATCH 4/6] Fix and test Communicator construction.

---
 src/communicator.jl | 29 +++++++++++++++++++++++------
 test/runtests.jl    |  9 +++++----
 2 files changed, 28 insertions(+), 10 deletions(-)

diff --git a/src/communicator.jl b/src/communicator.jl
index 7168fd5..36fcf8f 100644
--- a/src/communicator.jl
+++ b/src/communicator.jl
@@ -22,25 +22,42 @@ function destroy(comm::Communicator)
 end
 Base.unsafe_convert(::Type{LibNCCL.ncclComm_t}, comm::Communicator) = comm.handle
 
-# creates a new communicator (multi thread/process version)
-function Communicator(nranks::Integer, comm_id::UniqueID, rank::Integer)
+"""
+    NCCL.Communicator(nranks, rank; [unique_id]) :: Communicator
+
+Create a single communicator for use in a multi-threaded or multi-process
+environment. `nranks` is the number of ranks in the communicator, and `rank`
+is the 0-based index of the current rank. `unique_id` is an optional unique
+identifier for the communicator.
+
+# Examples
+```
+comm = Communicator(length(CUDA.devices()), myid() - 1; unique_id=id)
+# this blocks until all other ranks have connected
+```
+
+# External links
+- [`ncclCommInitRank`](https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/api/comms.html#ncclCommInitRank)
+"""
+function Communicator(nranks::Integer, rank::Integer;
+                      unique_id::UniqueID=UniqueID())
+    0 <= rank < nranks || throw(ArgumentError("rank must be in [0, nranks)"))
     handle_ref = Ref{ncclComm_t}(C_NULL)
-    ncclCommInitRank(handle_ref, nranks, comm_id, rank)
+    ncclCommInitRank(handle_ref, nranks, unique_id, rank)
     c = Communicator(handle_ref[])
     return finalizer(destroy, c)
 end
 
-# creates a clique of communicators (single process version)
 """
     NCCL.Communicators(devices) :: Vector{Communicator}
 
-Construct and initialize a clique of NCCL Communicators.
+Construct and initialize a clique of NCCL Communicators over the devices
+on a single host.
+
 `devices` can either be a collection of identifiers, or `CuDevice`s.
 
 # Examples
 ```
-# initialize a clique over all devices on the host
 comms = NCCL.Communicators(CUDA.devices())
 ```
diff --git a/test/runtests.jl b/test/runtests.jl
index 403e92a..a96158b 100644
--- a/test/runtests.jl
+++ b/test/runtests.jl
@@ -9,16 +9,17 @@ using NCCL
 
 @testset "NCCL" begin
     @testset "Communicator" begin
+        # clique of communicators
         comms = NCCL.Communicators(CUDA.devices())
         for (i,dev) in enumerate(CUDA.devices())
             @test NCCL.rank(comms[i]) == i-1
             @test NCCL.device(comms[i]) == dev
             @test NCCL.size(comms[i]) == length(CUDA.devices())
         end
-        id = NCCL.UniqueID()
-        #=num_devs = length(CUDA.devices())
-        comm = Communicator(num_devs, id, 0)
-        @test device(comm) == 0=#
+
+        # single communicator (with nranks=1 or this would block)
+        comm = Communicator(1, 0)
+        @test NCCL.device(comm) == CuDevice(0)
     end
 
     @testset "Allreduce!" begin
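The new constructor blocks until all `nranks` ranks have called it with the same `unique_id`, which is why the test above sticks to `nranks == 1`. A sketch of both shapes; the transport that distributes `id` (MPI, Distributed, sockets, ...) is a hypothetical placeholder, not something the patch provides:

```
using CUDA, NCCL

# single-rank communicator: nothing to wait for
comm = NCCL.Communicator(1, 0)
@assert NCCL.device(comm) == CuDevice(0)

# multi-rank shape: rank 0 generates the id, and every rank passes the same one
# id   = NCCL.UniqueID()                  # on rank 0 only
# id   = fetch_id_from_rank0()            # hypothetical transport on other ranks
# comm = NCCL.Communicator(nranks, myrank; unique_id=id)
```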
---
 src/communicator.jl | 29 +++++++++++++++++++++++------
 test/runtests.jl    |  9 +++++----
 2 files changed, 28 insertions(+), 10 deletions(-)

From c139c127e8f1e015dd1d5af021e371a5c6d0509e Mon Sep 17 00:00:00 2001
From: Tim Besard
Date: Thu, 22 Feb 2024 03:49:50 -0500
Subject: [PATCH 5/6] Bump Julia requirement.

---
 Project.toml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Project.toml b/Project.toml
index ae61437..758744a 100644
--- a/Project.toml
+++ b/Project.toml
@@ -9,4 +9,4 @@ NCCL_jll = "4d6d38e4-5b87-5e63-912a-873ff2d649b7"
 
 [compat]
 CEnum = "0.2, 0.3, 0.4, 0.5"
-julia = "1.6"
+julia = "1.8"

From 08fddada65a4e21817f6803c17841ed7e9bbad99 Mon Sep 17 00:00:00 2001
From: Tim Besard
Date: Thu, 22 Feb 2024 03:57:59 -0500
Subject: [PATCH 6/6] Docs.
---
 docs/src/index.md | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/docs/src/index.md b/docs/src/index.md
index 14cbfe3..8dd833b 100644
--- a/docs/src/index.md
+++ b/docs/src/index.md
@@ -1,6 +1,6 @@
 # NCCL.jl
 
- A Julia wrapper for the [NVIDIA Collective Communications Library (NCCL)](https://developer.nvidia.com/nccl). 
+ A Julia wrapper for the [NVIDIA Collective Communications Library (NCCL)](https://developer.nvidia.com/nccl).
 
 # API
 
@@ -18,8 +18,9 @@
 version()
 ```
 
 ## Communicators
 
 ```@docs
+Communicator
 Communicators
-CUDA.CuDevice(comm::Communicator)
+device(comm::Communicator)
 size(comm::Communicator)
 rank(comm::Communicator)
 abort(comm::Communicator)
@@ -50,4 +51,4 @@
 avg
 ```
 
 ```@docs
 Send
 Recv!
-```
\ No newline at end of file
+```
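Applied in order, the series leaves the documented surface as `Communicator`/`Communicators`, `NCCL.device`, the collectives, and `Send`/`Recv!`. A closing sketch of the point-to-point calls on a single host; the ring layout, buffer sizes, and the `NCCL.groupStart`/`NCCL.groupEnd` pairing are illustrative assumptions rather than anything the patches prescribe:

```
using CUDA, NCCL

comms = NCCL.Communicators(CUDA.devices())
n = length(comms)

# per-rank send and receive buffers for a one-hop ring exchange
sendbufs = map(enumerate(comms)) do (i, comm)
    device!(NCCL.device(comm)) do
        CUDA.fill(Float32(i), 4)
    end
end
recvbufs = map(comms) do comm
    device!(NCCL.device(comm)) do
        CUDA.zeros(Float32, 4)
    end
end

# matching Send/Recv! pairs must be issued inside one group, or the
# single driving thread would deadlock waiting on itself
NCCL.groupStart()
for (i, comm) in enumerate(comms)
    rank = NCCL.rank(comm)
    NCCL.Send(sendbufs[i], comm; dest=mod(rank + 1, n))
    NCCL.Recv!(recvbufs[i], comm; source=mod(rank - 1, n))
end
NCCL.groupEnd()
```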