diff --git a/examples/WORKSPACE.bazel b/examples/WORKSPACE.bazel
index f79eaaf7..7b6b5cb3 100644
--- a/examples/WORKSPACE.bazel
+++ b/examples/WORKSPACE.bazel
@@ -17,3 +17,31 @@ load("@rules_cuda//cuda:repositories.bzl", "register_detected_cuda_toolchains",
 rules_cuda_dependencies()
 
 register_detected_cuda_toolchains()
+
+#################################
+# Dependencies for nccl example #
+#################################
+load("@bazel_tools//tools/build_defs/repo:http.bzl", "http_archive")
+
+http_archive(
+    name = "nccl",
+    add_prefix = "nccl",
+    build_file = "@rules_cuda_examples//nccl:nccl.BUILD",
+    sha256 = "83b299cfc2dfe63887dadf3590b3ac2b8b2fd68ec5515b6878774eda39a697d2",
+    strip_prefix = "nccl-9814c75eea18fc7374cde884592233b6b7dc055b",
+    urls = ["https://github.com/nvidia/nccl/archive/9814c75eea18fc7374cde884592233b6b7dc055b.tar.gz"],
+)
+
+http_archive(
+    name = "nccl-tests",
+    add_prefix = "nccl-tests",
+    build_file = "@rules_cuda_examples//nccl:nccl-tests.BUILD",
+    patch_args = [
+        "--directory=nccl-tests",
+        "-p1",
+    ],
+    patches = ["@rules_cuda_examples//nccl:nccl-tests-clang.patch"],
+    sha256 = "946adb84f63aec66aea7aab9739d41df81c24f783e85fba6328ba243cfc057e0",
+    strip_prefix = "nccl-tests-1a5f551ffd6e3271982b03a9d5653a3f6ba545fa",
+    urls = ["https://github.com/nvidia/nccl-tests/archive/1a5f551ffd6e3271982b03a9d5653a3f6ba545fa.tar.gz"],
+)
diff --git a/examples/nccl/BUILD.bazel b/examples/nccl/BUILD.bazel
new file mode 100644
index 00000000..b5446b9a
--- /dev/null
+++ b/examples/nccl/BUILD.bazel
@@ -0,0 +1,22 @@
+filegroup(
+    name = "nccl_shared",
+    srcs = [
+        "@nccl//:nccl_shared",
+    ],
+)
+
+filegroup(
+    name = "perf_binaries",
+    srcs = [
+        "@nccl-tests//:all_gather_perf",
+        "@nccl-tests//:all_reduce_perf",
+        "@nccl-tests//:alltoall_perf",
+        "@nccl-tests//:broadcast_perf",
+        "@nccl-tests//:gather_perf",
+        "@nccl-tests//:hypercube_perf",
+        "@nccl-tests//:reduce_perf",
+        "@nccl-tests//:reduce_scatter_perf",
+        "@nccl-tests//:scatter_perf",
+        "@nccl-tests//:sendrecv_perf",
+    ],
+)
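
Note: the two filegroups above only bundle the external targets so the whole example can be built with one label, e.g. `bazel build //nccl:perf_binaries` from the examples workspace. A downstream wrapper could also pick the benchmarks up as runtime data; the target below is a hypothetical sketch and is not part of this change (both the `run_perf` name and the `run_all.sh` script are assumptions):

# Hypothetical consumer of the filegroups above, for illustration only.
sh_binary(
    name = "run_perf",        # assumed name, not added by this change
    srcs = ["run_all.sh"],    # assumed wrapper script
    data = [
        ":nccl_shared",       # libnccl.so, needed at runtime
        ":perf_binaries",     # all *_perf benchmark binaries
    ],
)
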
diff --git a/examples/nccl/nccl-tests-clang.patch b/examples/nccl/nccl-tests-clang.patch
new file mode 100644
index 00000000..9d6d60d6
--- /dev/null
+++ b/examples/nccl/nccl-tests-clang.patch
@@ -0,0 +1,172 @@
+diff --git a/src/all_gather.cu b/src/all_gather.cu
+index 0831207..941ec1b 100644
+--- a/src/all_gather.cu
++++ b/src/all_gather.cu
+@@ -85,9 +85,7 @@ testResult_t AllGatherRunTest(struct threadArgs* args, int root, ncclDataType_t
+   return testSuccess;
+ }
+ 
+-struct testEngine allGatherEngine = {
++struct testEngine ncclTestEngine = {
+   AllGatherGetBuffSize,
+   AllGatherRunTest
+ };
+-
+-#pragma weak ncclTestEngine=allGatherEngine
+diff --git a/src/all_reduce.cu b/src/all_reduce.cu
+index a38eabe..acb66a8 100644
+--- a/src/all_reduce.cu
++++ b/src/all_reduce.cu
+@@ -93,9 +93,7 @@ testResult_t AllReduceRunTest(struct threadArgs* args, int root, ncclDataType_t
+   return testSuccess;
+ }
+ 
+-struct testEngine allReduceEngine = {
++struct testEngine ncclTestEngine = {
+   AllReduceGetBuffSize,
+   AllReduceRunTest
+ };
+-
+-#pragma weak ncclTestEngine=allReduceEngine
+diff --git a/src/alltoall.cu b/src/alltoall.cu
+index 41c7c4a..712e664 100644
+--- a/src/alltoall.cu
++++ b/src/alltoall.cu
+@@ -99,9 +99,7 @@ testResult_t AlltoAllRunTest(struct threadArgs* args, int root, ncclDataType_t t
+   return testSuccess;
+ }
+ 
+-struct testEngine alltoAllEngine = {
++struct testEngine ncclTestEngine = {
+   AlltoAllGetBuffSize,
+   AlltoAllRunTest
+ };
+-
+-#pragma weak ncclTestEngine=alltoAllEngine
+diff --git a/src/broadcast.cu b/src/broadcast.cu
+index 903066a..778c664 100644
+--- a/src/broadcast.cu
++++ b/src/broadcast.cu
+@@ -99,9 +99,7 @@ testResult_t BroadcastRunTest(struct threadArgs* args, int root, ncclDataType_t
+   return testSuccess;
+ }
+ 
+-struct testEngine broadcastEngine = {
++struct testEngine ncclTestEngine = {
+   BroadcastGetBuffSize,
+   BroadcastRunTest
+ };
+-
+-#pragma weak ncclTestEngine=broadcastEngine
+diff --git a/src/common.cu b/src/common.cu
+index 48a629c..d888edc 100644
+--- a/src/common.cu
++++ b/src/common.cu
+@@ -330,7 +330,7 @@ testResult_t startColl(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t
+   size_t count = args->nbytes / wordSize(type);
+ 
+   // Try to change offset for each iteration so that we avoid cache effects and catch race conditions in ptrExchange
+-  size_t totalnbytes = max(args->sendBytes, args->expectedBytes);
++  size_t totalnbytes = std::max(args->sendBytes, args->expectedBytes);
+   size_t steps = totalnbytes ? args->maxbytes / totalnbytes : 1;
+   size_t shift = totalnbytes * (iter % steps);
+ 
+@@ -597,7 +597,7 @@ testResult_t TimeTest(struct threadArgs* args, ncclDataType_t type, const char*
+       setupArgs(size, type, args);
+       char rootName[100];
+       sprintf(rootName, "%6i", root);
+-      PRINT("%12li %12li %8s %6s %6s", max(args->sendBytes, args->expectedBytes), args->nbytes / wordSize(type), typeName, opName, rootName);
++      PRINT("%12li %12li %8s %6s %6s", std::max(args->sendBytes, args->expectedBytes), args->nbytes / wordSize(type), typeName, opName, rootName);
+       TESTCHECK(BenchTime(args, type, op, root, 0));
+       TESTCHECK(BenchTime(args, type, op, root, 1));
+       PRINT("\n");
+diff --git a/src/gather.cu b/src/gather.cu
+index 03ef4d9..242a298 100644
+--- a/src/gather.cu
++++ b/src/gather.cu
+@@ -108,9 +108,7 @@ testResult_t GatherRunTest(struct threadArgs* args, int root, ncclDataType_t typ
+   return testSuccess;
+ }
+ 
+-struct testEngine gatherEngine = {
++struct testEngine ncclTestEngine = {
+   GatherGetBuffSize,
+   GatherRunTest
+ };
+-
+-#pragma weak ncclTestEngine=gatherEngine
+diff --git a/src/hypercube.cu b/src/hypercube.cu
+index 5c1456f..9aadfc5 100644
+--- a/src/hypercube.cu
++++ b/src/hypercube.cu
+@@ -110,9 +110,7 @@ testResult_t HyperCubeRunTest(struct threadArgs* args, int root, ncclDataType_t
+   return testSuccess;
+ }
+ 
+-struct testEngine hyperCubeEngine = {
++struct testEngine ncclTestEngine = {
+   HyperCubeGetBuffSize,
+   HyperCubeRunTest
+ };
+-
+-#pragma weak ncclTestEngine=hyperCubeEngine
+diff --git a/src/reduce.cu b/src/reduce.cu
+index f2fa80d..80aadc5 100644
+--- a/src/reduce.cu
++++ b/src/reduce.cu
+@@ -102,9 +102,7 @@ testResult_t ReduceRunTest(struct threadArgs* args, int root, ncclDataType_t typ
+   return testSuccess;
+ }
+ 
+-struct testEngine reduceEngine = {
++struct testEngine ncclTestEngine = {
+   ReduceGetBuffSize,
+   ReduceRunTest
+ };
+-
+-#pragma weak ncclTestEngine=reduceEngine
+diff --git a/src/reduce_scatter.cu b/src/reduce_scatter.cu
+index ed372e3..212a6f0 100644
+--- a/src/reduce_scatter.cu
++++ b/src/reduce_scatter.cu
+@@ -97,9 +97,7 @@ testResult_t ReduceScatterRunTest(struct threadArgs* args, int root, ncclDataTyp
+   return testSuccess;
+ }
+ 
+-struct testEngine reduceScatterEngine = {
++struct testEngine ncclTestEngine = {
+   ReduceScatterGetBuffSize,
+   ReduceScatterRunTest
+ };
+-
+-#pragma weak ncclTestEngine=reduceScatterEngine
+diff --git a/src/scatter.cu b/src/scatter.cu
+index 49d20e1..56f5ede 100644
+--- a/src/scatter.cu
++++ b/src/scatter.cu
+@@ -104,9 +104,7 @@ testResult_t ScatterRunTest(struct threadArgs* args, int root, ncclDataType_t ty
+   return testSuccess;
+ }
+ 
+-struct testEngine scatterEngine = {
++struct testEngine ncclTestEngine = {
+   ScatterGetBuffSize,
+   ScatterRunTest
+ };
+-
+-#pragma weak ncclTestEngine=scatterEngine
+diff --git a/src/sendrecv.cu b/src/sendrecv.cu
+index c9eb5bb..316a449 100644
+--- a/src/sendrecv.cu
++++ b/src/sendrecv.cu
+@@ -106,9 +106,7 @@ testResult_t SendRecvRunTest(struct threadArgs* args, int root, ncclDataType_t t
+   return testSuccess;
+ }
+ 
+-struct testEngine sendRecvEngine = {
++struct testEngine ncclTestEngine = {
+   SendRecvGetBuffSize,
+   SendRecvRunTest
+ };
+-
+-#pragma weak ncclTestEngine=sendRecvEngine
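
For context, nccl-tests-clang.patch makes two kinds of changes so that nccl-tests compiles with clang as the CUDA compiler: each test source now defines `struct testEngine ncclTestEngine` directly instead of declaring a per-test engine and aliasing it via `#pragma weak ncclTestEngine=...`, a weak-alias pragma that clang apparently does not resolve for these CUDA sources, and the unqualified `max` calls in common.cu become `std::max`, presumably because the global `max` overloads injected by nvcc's headers are not available under clang. The patch is applied by the `nccl-tests` http_archive above; `--directory=nccl-tests` is needed because `add_prefix = "nccl-tests"` places the sources under that directory before the `-p1` patch (written against the upstream repo root) is applied.
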
diff --git a/examples/nccl/nccl-tests.BUILD b/examples/nccl/nccl-tests.BUILD
new file mode 100644
index 00000000..f482e6db
--- /dev/null
+++ b/examples/nccl/nccl-tests.BUILD
@@ -0,0 +1,50 @@
+load("@rules_cuda//cuda:defs.bzl", "cuda_library")
+load("@rules_cuda_examples//nccl:nccl-tests.bzl", "nccl_tests_binary")
+
+cc_library(
+    name = "nccl_tests_include",
+    hdrs = glob(["nccl-tests/src/*.h"]),
+    includes = ["nccl-tests/src"],
+)
+
+cuda_library(
+    name = "common_cuda",
+    srcs = [
+        "nccl-tests/src/common.cu",
+        "nccl-tests/verifiable/verifiable.cu",
+    ] + glob([
+        "nccl-tests/**/*.h",
+    ]),
+    deps = [
+        ":nccl_tests_include",
+        "@nccl",
+    ],
+)
+
+cc_library(
+    name = "common_cc",
+    srcs = ["nccl-tests/src/timer.cc"],
+    hdrs = ["nccl-tests/src/timer.h"],
+    alwayslink = 1,
+)
+
+# :common_cuda, :common_cc, and @nccl//:nccl_shared are hard-coded as dependencies inside `nccl_tests_binary`.
+nccl_tests_binary(name = "all_reduce")
+
+nccl_tests_binary(name = "all_gather")
+
+nccl_tests_binary(name = "broadcast")
+
+nccl_tests_binary(name = "reduce_scatter")
+
+nccl_tests_binary(name = "reduce")
+
+nccl_tests_binary(name = "alltoall")
+
+nccl_tests_binary(name = "scatter")
+
+nccl_tests_binary(name = "gather")
+
+nccl_tests_binary(name = "sendrecv")
+
+nccl_tests_binary(name = "hypercube")
diff --git a/examples/nccl/nccl-tests.bzl b/examples/nccl/nccl-tests.bzl
new file mode 100644
index 00000000..48229031
--- /dev/null
+++ b/examples/nccl/nccl-tests.bzl
@@ -0,0 +1,19 @@
+load("@rules_cuda//cuda:defs.bzl", "cuda_library")
+
+def nccl_tests_binary(name, cc_deps = [], cuda_deps = []):
+    cuda_library(
+        name = name,
+        srcs = ["nccl-tests/src/{}.cu".format(name)],
+        deps = [
+            "@nccl//:nccl_shared",
+            ":common_cuda",
+        ],
+        alwayslink = 1,
+    )
+
+    bin_name = name + "_perf"
+    native.cc_binary(
+        name = bin_name,
+        deps = [":common_cc", ":" + name],
+        visibility = ["//visibility:public"],
+    )
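
For reference, a call such as `nccl_tests_binary(name = "all_reduce")` in nccl-tests.BUILD expands to roughly the two targets below. This is only a sketch of what the macro above generates; the `cc_deps`/`cuda_deps` parameters are currently unused by the macro.

# Approximate expansion of nccl_tests_binary(name = "all_reduce").
cuda_library(
    name = "all_reduce",
    srcs = ["nccl-tests/src/all_reduce.cu"],
    deps = [
        "@nccl//:nccl_shared",  # imported libnccl.so
        ":common_cuda",         # shared test harness (common.cu, verifiable.cu)
    ],
    alwayslink = 1,
)

cc_binary(
    name = "all_reduce_perf",
    deps = [":common_cc", ":all_reduce"],
    visibility = ["//visibility:public"],
)
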
diff --git a/examples/nccl/nccl.BUILD b/examples/nccl/nccl.BUILD
new file mode 100644
index 00000000..98f36117
--- /dev/null
+++ b/examples/nccl/nccl.BUILD
@@ -0,0 +1,163 @@
+load("@bazel_skylib//rules:expand_template.bzl", "expand_template")
+load("@rules_cuda//cuda:defs.bzl", "cuda_library", "cuda_objects")
+load("@rules_cuda_examples//nccl:nccl.bzl", "if_cuda_clang", "if_cuda_nvcc", "nccl_primitive")
+
+expand_template(
+    name = "nccl_h",
+    out = "nccl/src/include/nccl.h",
+    substitutions = {
+        "${nccl:Major}": "2",
+        "${nccl:Minor}": "18",
+        "${nccl:Patch}": "3",
+        "${nccl:Suffix}": "",
+        # NCCL_VERSION(X,Y,Z) ((X) * 10000 + (Y) * 100 + (Z))
+        "${nccl:Version}": "21803",
+    },
+    template = "nccl/src/nccl.h.in",
+)
+
+cc_library(
+    name = "nccl_include",
+    hdrs = [
+        ":nccl_h",
+    ] + glob([
+        "nccl/src/include/**/*.h",
+        "nccl/src/include/**/*.hpp",
+    ]),
+    includes = [
+        # This adds both nccl/src/include (in the source tree) and
+        # bazel-out//bin/nccl/src/include (in the output tree) to the include paths,
+        # so the nccl.h generated by the expand_template above is found there as well.
+        "nccl/src/include",
+    ],
+)
+
+cuda_objects(
+    name = "nccl_device_common",
+    srcs = [
+        "nccl/src/collectives/device/functions.cu",
+        "nccl/src/collectives/device/onerank_reduce.cu",
+    ] + glob([
+        "nccl/src/collectives/device/**/*.h",
+    ]),
+    copts = if_cuda_nvcc(["--extended-lambda"]),
+    ptxasopts = ["-maxrregcount=96"],
+    deps = [":nccl_include"],
+)
+
+# Must be disabled manually if the CUDA version is lower than 11.
+USE_BF16 = True
+
+filegroup(
+    name = "collective_dev_hdrs",
+    srcs = [
+        "nccl/src/collectives/device/all_gather.h",
+        "nccl/src/collectives/device/all_reduce.h",
+        "nccl/src/collectives/device/broadcast.h",
+        "nccl/src/collectives/device/common.h",
+        "nccl/src/collectives/device/common_kernel.h",
+        "nccl/src/collectives/device/gen_rules.sh",
+        "nccl/src/collectives/device/op128.h",
+        "nccl/src/collectives/device/primitives.h",
+        "nccl/src/collectives/device/prims_ll.h",
+        "nccl/src/collectives/device/prims_ll128.h",
+        "nccl/src/collectives/device/prims_simple.h",
+        "nccl/src/collectives/device/reduce.h",
+        "nccl/src/collectives/device/reduce_kernel.h",
+        "nccl/src/collectives/device/reduce_scatter.h",
+        "nccl/src/collectives/device/sendrecv.h",
+    ],
+)
+
+# cuda_objects for each type of primitive
+nccl_primitive(
+    name = "all_gather",
+    hdrs = ["collective_dev_hdrs"],
+    use_bf16 = USE_BF16,
+    deps = [":nccl_device_common"],
+)
+
+nccl_primitive(
+    name = "all_reduce",
+    hdrs = ["collective_dev_hdrs"],
+    use_bf16 = USE_BF16,
+    deps = [":nccl_device_common"],
+)
+
+nccl_primitive(
+    name = "broadcast",
+    hdrs = ["collective_dev_hdrs"],
+    use_bf16 = USE_BF16,
+    deps = [":nccl_device_common"],
+)
+
+nccl_primitive(
+    name = "reduce",
+    hdrs = ["collective_dev_hdrs"],
+    use_bf16 = USE_BF16,
+    deps = [":nccl_device_common"],
+)
+
+nccl_primitive(
+    name = "reduce_scatter",
+    hdrs = ["collective_dev_hdrs"],
+    use_bf16 = USE_BF16,
+    deps = [":nccl_device_common"],
+)
+
+nccl_primitive(
+    name = "sendrecv",
+    hdrs = ["collective_dev_hdrs"],
+    use_bf16 = USE_BF16,
+    deps = [":nccl_device_common"],
+)
+
+# device link
+cuda_library(
+    name = "collectives",
+    rdc = 1,
+    deps = [
+        ":all_gather",
+        ":all_reduce",
+        ":broadcast",
+        ":reduce",
+        ":reduce_scatter",
+        ":sendrecv",
+    ],
+    alwayslink = 1,
+)
+
+cc_binary(
+    name = "nccl",
+    srcs = glob(
+        [
+            "nccl/src/*.cc",
+            "nccl/src/collectives/*.cc",
+            "nccl/src/graph/*.cc",
+            "nccl/src/graph/*.h",
+            "nccl/src/misc/*.cc",
+            "nccl/src/transport/*.cc",
+        ],
+        exclude = [
+            # https://github.com/NVIDIA/nccl/issues/658
+            "nccl/src/enhcompat.cc",
+        ],
+    ),
+    copts = if_cuda_clang(["-xcu"]),
+    linkshared = 1,
+    linkstatic = 1,
+    visibility = ["//visibility:public"],
+    deps = [
+        ":collectives",
+        ":nccl_include",
+        "@rules_cuda//cuda:runtime",
+    ],
+)
+
+# To allow downstream targets to link with the nccl shared library, we need to `cc_import` it again.
+# See https://groups.google.com/g/bazel-discuss/c/RtbidPdVFyU/m/TsUDOVHIAwAJ
+cc_import(
+    name = "nccl_shared",
+    shared_library = ":nccl",
+    visibility = ["//visibility:public"],
+)
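
As the cc_import comment above explains, other packages depend on `@nccl//:nccl_shared` rather than on the `:nccl` cc_binary itself, which is exactly what `nccl_tests_binary` does. A minimal downstream consumer might look like the sketch below; the `my_nccl_app` target and its source file are hypothetical, and the NCCL headers would additionally have to be exposed (for example by making `:nccl_include` publicly visible), which this change does not do.

# Hypothetical downstream target, not part of this change.
cc_binary(
    name = "my_nccl_app",              # assumed name
    srcs = ["my_nccl_app.cc"],         # assumed source
    deps = ["@nccl//:nccl_shared"],    # links against the imported libnccl.so
)
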
diff --git a/examples/nccl/nccl.bzl b/examples/nccl/nccl.bzl
new file mode 100644
index 00000000..e2758f27
--- /dev/null
+++ b/examples/nccl/nccl.bzl
@@ -0,0 +1,43 @@
+load("@bazel_skylib//rules:copy_file.bzl", "copy_file")
+load("@rules_cuda//cuda:defs.bzl", "cuda_library", "cuda_objects")
+
+def if_cuda_nvcc(if_true, if_false = []):
+    return select({
+        "@rules_cuda//cuda:compiler_is_nvcc": if_true,
+        "//conditions:default": if_false,
+    })
+
+def if_cuda_clang(if_true, if_false = []):
+    return select({
+        "@rules_cuda//cuda:compiler_is_clang": if_true,
+        "//conditions:default": if_false,
+    })
+
+def nccl_primitive(name, hdrs = [], deps = [], use_bf16 = True):
+    ops = ["sum", "prod", "min", "max", "premulsum", "sumpostdiv"]
+    datatypes = ["i8", "u8", "i32", "u32", "i64", "u64", "f16", "f32", "f64"]
+    if use_bf16:
+        datatypes.append("bf16")
+
+    intermediate_targets = []
+    for opn, op in enumerate(ops):
+        for dtn, dt in enumerate(datatypes):
+            name_op_dt = "{}_{}_{}".format(name, op, dt)
+            copy_file(
+                name = name_op_dt + "_rename",
+                src = "nccl/src/collectives/device/{}.cu".format(name),
+                out = "nccl/src/collectives/device/{}.cu".format(name_op_dt),
+            )
+
+            cuda_objects(
+                name = name_op_dt,
+                srcs = [":{}_rename".format(name_op_dt)],
+                hdrs = hdrs,
+                deps = deps,
+                ptxasopts = ["-maxrregcount=96"],
+                defines = ["NCCL_OP={}".format(opn), "NCCL_TYPE={}".format(dtn)],
+                includes = ["nccl/src/collectives/device"],
+            )
+            intermediate_targets.append(":" + name_op_dt)
+
+    cuda_objects(name = name, deps = intermediate_targets)
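
To make the macro above concrete: for `nccl_primitive(name = "all_reduce", ...)`, the iteration with op "sum" (index 0 in `ops`) and type "f32" (index 7 in `datatypes`) generates roughly the pair of targets sketched below, and the final `cuda_objects(name = "all_reduce", ...)` then aggregates the 60 per-op, per-type objects when bf16 is enabled (54 otherwise).

# Approximate expansion of one (op, dtype) iteration inside
# nccl_primitive(name = "all_reduce", ...): "sum" is NCCL_OP=0, "f32" is NCCL_TYPE=7.
copy_file(
    name = "all_reduce_sum_f32_rename",
    src = "nccl/src/collectives/device/all_reduce.cu",
    out = "nccl/src/collectives/device/all_reduce_sum_f32.cu",
)

cuda_objects(
    name = "all_reduce_sum_f32",
    srcs = [":all_reduce_sum_f32_rename"],
    hdrs = ["collective_dev_hdrs"],      # as passed in from nccl.BUILD
    deps = [":nccl_device_common"],      # as passed in from nccl.BUILD
    ptxasopts = ["-maxrregcount=96"],
    defines = ["NCCL_OP=0", "NCCL_TYPE=7"],
    includes = ["nccl/src/collectives/device"],
)

The per-combination copy_file rename is what allows every object to be compiled from the same all_reduce.cu source while carrying its own NCCL_OP/NCCL_TYPE defines.
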