diff --git a/examples/WORKSPACE.bazel b/examples/WORKSPACE.bazel
index f79eaaf7..7b6b5cb3 100644
--- a/examples/WORKSPACE.bazel
+++ b/examples/WORKSPACE.bazel
@@ -17,3 +17,31 @@ load("@rules_cuda//cuda:repositories.bzl", "register_detected_cuda_toolchains",
 rules_cuda_dependencies()
 
 register_detected_cuda_toolchains()
+
+#################################
+# Dependencies for nccl example #
+#################################
+load("@bazel_tools//tools/build_defs/repo:http.bzl", "http_archive")
+
+http_archive(
+    name = "nccl",
+    add_prefix = "nccl",
+    build_file = "@rules_cuda_examples//nccl:nccl.BUILD",
+    sha256 = "83b299cfc2dfe63887dadf3590b3ac2b8b2fd68ec5515b6878774eda39a697d2",
+    strip_prefix = "nccl-9814c75eea18fc7374cde884592233b6b7dc055b",
+    urls = ["https://github.com/nvidia/nccl/archive/9814c75eea18fc7374cde884592233b6b7dc055b.tar.gz"],
+)
+
+http_archive(
+    name = "nccl-tests",
+    add_prefix = "nccl-tests",
+    build_file = "@rules_cuda_examples//nccl:nccl-tests.BUILD",
+    patch_args = [
+        "--directory=nccl-tests",
+        "-p1",
+    ],
+    patches = ["@rules_cuda_examples//nccl:nccl-tests-clang.patch"],
+    sha256 = "946adb84f63aec66aea7aab9739d41df81c24f783e85fba6328ba243cfc057e0",
+    strip_prefix = "nccl-tests-1a5f551ffd6e3271982b03a9d5653a3f6ba545fa",
+    urls = ["https://github.com/nvidia/nccl-tests/archive/1a5f551ffd6e3271982b03a9d5653a3f6ba545fa.tar.gz"],
+)
diff --git a/examples/nccl/BUILD.bazel b/examples/nccl/BUILD.bazel
new file mode 100644
index 00000000..b5446b9a
--- /dev/null
+++ b/examples/nccl/BUILD.bazel
@@ -0,0 +1,22 @@
+filegroup(
+    name = "nccl_shared",
+    srcs = [
+        "@nccl//:nccl_shared",
+    ],
+)
+
+filegroup(
+    name = "perf_binaries",
+    srcs = [
+        "@nccl-tests//:all_gather_perf",
+        "@nccl-tests//:all_reduce_perf",
+        "@nccl-tests//:alltoall_perf",
+        "@nccl-tests//:broadcast_perf",
+        "@nccl-tests//:gather_perf",
+        "@nccl-tests//:hypercube_perf",
+        "@nccl-tests//:reduce_perf",
+        "@nccl-tests//:reduce_scatter_perf",
+        "@nccl-tests//:scatter_perf",
+        "@nccl-tests//:sendrecv_perf",
+    ],
+)
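
Note: the two filegroups above only bundle the external targets so the whole example can be built with one label, e.g. `bazel build //nccl:perf_binaries` from the examples workspace. A downstream wrapper could also pick the benchmarks up as runtime data; the target below is a hypothetical sketch and is not part of this change (both the `run_perf` name and the `run_all.sh` script are assumptions):

# Hypothetical consumer of the filegroups above, for illustration only.
sh_binary(
    name = "run_perf",        # assumed name, not added by this change
    srcs = ["run_all.sh"],    # assumed wrapper script
    data = [
        ":nccl_shared",       # libnccl.so, needed at runtime
        ":perf_binaries",     # all *_perf benchmark binaries
    ],
)
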
diff --git a/examples/nccl/nccl-tests-clang.patch b/examples/nccl/nccl-tests-clang.patch
new file mode 100644
index 00000000..9d6d60d6
--- /dev/null
+++ b/examples/nccl/nccl-tests-clang.patch
@@ -0,0 +1,172 @@
+diff --git a/src/all_gather.cu b/src/all_gather.cu
+index 0831207..941ec1b 100644
+--- a/src/all_gather.cu
++++ b/src/all_gather.cu
+@@ -85,9 +85,7 @@ testResult_t AllGatherRunTest(struct threadArgs* args, int root, ncclDataType_t
+   return testSuccess;
+ }
+ 
+-struct testEngine allGatherEngine = {
++struct testEngine ncclTestEngine = {
+   AllGatherGetBuffSize,
+   AllGatherRunTest
+ };
+-
+-#pragma weak ncclTestEngine=allGatherEngine
+diff --git a/src/all_reduce.cu b/src/all_reduce.cu
+index a38eabe..acb66a8 100644
+--- a/src/all_reduce.cu
++++ b/src/all_reduce.cu
+@@ -93,9 +93,7 @@ testResult_t AllReduceRunTest(struct threadArgs* args, int root, ncclDataType_t
+   return testSuccess;
+ }
+ 
+-struct testEngine allReduceEngine = {
++struct testEngine ncclTestEngine = {
+   AllReduceGetBuffSize,
+   AllReduceRunTest
+ };
+-
+-#pragma weak ncclTestEngine=allReduceEngine
+diff --git a/src/alltoall.cu b/src/alltoall.cu
+index 41c7c4a..712e664 100644
+--- a/src/alltoall.cu
++++ b/src/alltoall.cu
+@@ -99,9 +99,7 @@ testResult_t AlltoAllRunTest(struct threadArgs* args, int root, ncclDataType_t t
+   return testSuccess;
+ }
+ 
+-struct testEngine alltoAllEngine = {
++struct testEngine ncclTestEngine = {
+   AlltoAllGetBuffSize,
+   AlltoAllRunTest
+ };
+-
+-#pragma weak ncclTestEngine=alltoAllEngine
+diff --git a/src/broadcast.cu b/src/broadcast.cu
+index 903066a..778c664 100644
+--- a/src/broadcast.cu
++++ b/src/broadcast.cu
+@@ -99,9 +99,7 @@ testResult_t BroadcastRunTest(struct threadArgs* args, int root, ncclDataType_t
+   return testSuccess;
+ }
+ 
+-struct testEngine broadcastEngine = {
++struct testEngine ncclTestEngine = {
+   BroadcastGetBuffSize,
+   BroadcastRunTest
+ };
+-
+-#pragma weak ncclTestEngine=broadcastEngine
+diff --git a/src/common.cu b/src/common.cu
+index 48a629c..d888edc 100644
+--- a/src/common.cu
++++ b/src/common.cu
+@@ -330,7 +330,7 @@ testResult_t startColl(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t
+   size_t count = args->nbytes / wordSize(type);
+ 
+   // Try to change offset for each iteration so that we avoid cache effects and catch race conditions in ptrExchange
+-  size_t totalnbytes = max(args->sendBytes, args->expectedBytes);
++  size_t totalnbytes = std::max(args->sendBytes, args->expectedBytes);
+   size_t steps = totalnbytes ? args->maxbytes / totalnbytes : 1;
+   size_t shift = totalnbytes * (iter % steps);
+ 
+@@ -597,7 +597,7 @@ testResult_t TimeTest(struct threadArgs* args, ncclDataType_t type, const char*
+       setupArgs(size, type, args);
+       char rootName[100];
+       sprintf(rootName, "%6i", root);
+-      PRINT("%12li %12li %8s %6s %6s", max(args->sendBytes, args->expectedBytes), args->nbytes / wordSize(type), typeName, opName, rootName);
++      PRINT("%12li %12li %8s %6s %6s", std::max(args->sendBytes, args->expectedBytes), args->nbytes / wordSize(type), typeName, opName, rootName);
+       TESTCHECK(BenchTime(args, type, op, root, 0));
+       TESTCHECK(BenchTime(args, type, op, root, 1));
+       PRINT("\n");
+diff --git a/src/gather.cu b/src/gather.cu
+index 03ef4d9..242a298 100644
+--- a/src/gather.cu
++++ b/src/gather.cu
+@@ -108,9 +108,7 @@ testResult_t GatherRunTest(struct threadArgs* args, int root, ncclDataType_t typ
+   return testSuccess;
+ }
+ 
+-struct testEngine gatherEngine = {
++struct testEngine ncclTestEngine = {
+   GatherGetBuffSize,
+   GatherRunTest
+ };
+-
+-#pragma weak ncclTestEngine=gatherEngine
+diff --git a/src/hypercube.cu b/src/hypercube.cu
+index 5c1456f..9aadfc5 100644
+--- a/src/hypercube.cu
++++ b/src/hypercube.cu
+@@ -110,9 +110,7 @@ testResult_t HyperCubeRunTest(struct threadArgs* args, int root, ncclDataType_t
+   return testSuccess;
+ }
+ 
+-struct testEngine hyperCubeEngine = {
++struct testEngine ncclTestEngine = {
+   HyperCubeGetBuffSize,
+   HyperCubeRunTest
+ };
+-
+-#pragma weak ncclTestEngine=hyperCubeEngine
+diff --git a/src/reduce.cu b/src/reduce.cu
+index f2fa80d..80aadc5 100644
+--- a/src/reduce.cu
++++ b/src/reduce.cu
+@@ -102,9 +102,7 @@ testResult_t ReduceRunTest(struct threadArgs* args, int root, ncclDataType_t typ
+   return testSuccess;
+ }
+ 
+-struct testEngine reduceEngine = {
++struct testEngine ncclTestEngine = {
+   ReduceGetBuffSize,
+   ReduceRunTest
+ };
+-
+-#pragma weak ncclTestEngine=reduceEngine
+diff --git a/src/reduce_scatter.cu b/src/reduce_scatter.cu
+index ed372e3..212a6f0 100644
+--- a/src/reduce_scatter.cu
++++ b/src/reduce_scatter.cu
+@@ -97,9 +97,7 @@ testResult_t ReduceScatterRunTest(struct threadArgs* args, int root, ncclDataTyp
+   return testSuccess;
+ }
+ 
+-struct testEngine reduceScatterEngine = {
++struct testEngine ncclTestEngine = {
+   ReduceScatterGetBuffSize,
+   ReduceScatterRunTest
+ };
+-
+-#pragma weak ncclTestEngine=reduceScatterEngine
+diff --git a/src/scatter.cu b/src/scatter.cu
+index 49d20e1..56f5ede 100644
+--- a/src/scatter.cu
++++ b/src/scatter.cu
+@@ -104,9 +104,7 @@ testResult_t ScatterRunTest(struct threadArgs* args, int root, ncclDataType_t ty
+   return testSuccess;
+ }
+ 
+-struct testEngine scatterEngine = {
++struct testEngine ncclTestEngine = {
+   ScatterGetBuffSize,
+   ScatterRunTest
+ };
+-
+-#pragma weak ncclTestEngine=scatterEngine
+diff --git a/src/sendrecv.cu b/src/sendrecv.cu
+index c9eb5bb..316a449 100644
+--- a/src/sendrecv.cu
++++ b/src/sendrecv.cu
+@@ -106,9 +106,7 @@ testResult_t SendRecvRunTest(struct threadArgs* args, int root, ncclDataType_t t
+   return testSuccess;
+ }
+ 
+-struct testEngine sendRecvEngine = {
++struct testEngine ncclTestEngine = {
+   SendRecvGetBuffSize,
+   SendRecvRunTest
+ };
+-
+-#pragma weak ncclTestEngine=sendRecvEngine
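
For context, nccl-tests-clang.patch makes two kinds of changes so that nccl-tests compiles with clang as the CUDA compiler: each test source now defines `struct testEngine ncclTestEngine` directly instead of declaring a per-test engine and aliasing it via `#pragma weak ncclTestEngine=...`, a weak-alias pragma that clang apparently does not resolve for these CUDA sources, and the unqualified `max` calls in common.cu become `std::max`, presumably because the global `max` overloads injected by nvcc's headers are not available under clang. The patch is applied by the `nccl-tests` http_archive above; `--directory=nccl-tests` is needed because `add_prefix = "nccl-tests"` places the sources under that directory before the `-p1` patch (written against the upstream repo root) is applied.
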
diff --git a/examples/nccl/nccl-tests.BUILD b/examples/nccl/nccl-tests.BUILD
new file mode 100644
index 00000000..f482e6db
--- /dev/null
+++ b/examples/nccl/nccl-tests.BUILD
@@ -0,0 +1,50 @@
+load("@rules_cuda//cuda:defs.bzl", "cuda_library")
+load("@rules_cuda_examples//nccl:nccl-tests.bzl", "nccl_tests_binary")
+
+cc_library(
+    name = "nccl_tests_include",
+    hdrs = glob(["nccl-tests/src/*.h"]),
+    includes = ["nccl-tests/src"],
+)
+
+cuda_library(
+    name = "common_cuda",
+    srcs = [
+        "nccl-tests/src/common.cu",
+        "nccl-tests/verifiable/verifiable.cu",
+    ] + glob([
+        "nccl-tests/**/*.h",
+    ]),
+    deps = [
+        ":nccl_tests_include",
+        "@nccl",
+    ],
+)
+
+cc_library(
+    name = "common_cc",
+    srcs = ["nccl-tests/src/timer.cc"],
+    hdrs = ["nccl-tests/src/timer.h"],
+    alwayslink = 1,
+)
+
+# :common_cuda, :common_cc, and @nccl//:nccl_shared are hard-coded as dependencies inside `nccl_tests_binary`.
+nccl_tests_binary(name = "all_reduce")
+
+nccl_tests_binary(name = "all_gather")
+
+nccl_tests_binary(name = "broadcast")
+
+nccl_tests_binary(name = "reduce_scatter")
+
+nccl_tests_binary(name = "reduce")
+
+nccl_tests_binary(name = "alltoall")
+
+nccl_tests_binary(name = "scatter")
+
+nccl_tests_binary(name = "gather")
+
+nccl_tests_binary(name = "sendrecv")
+
+nccl_tests_binary(name = "hypercube")
diff --git a/examples/nccl/nccl-tests.bzl b/examples/nccl/nccl-tests.bzl
new file mode 100644
index 00000000..48229031
--- /dev/null
+++ b/examples/nccl/nccl-tests.bzl
@@ -0,0 +1,19 @@
+load("@rules_cuda//cuda:defs.bzl", "cuda_library")
+
+def nccl_tests_binary(name, cc_deps = [], cuda_deps = []):
+    cuda_library(
+        name = name,
+        srcs = ["nccl-tests/src/{}.cu".format(name)],
+        deps = [
+            "@nccl//:nccl_shared",
+            ":common_cuda",
+        ],
+        alwayslink = 1,
+    )
+
+    bin_name = name + "_perf"
+    native.cc_binary(
+        name = bin_name,
+        deps = [":common_cc", ":" + name],
+        visibility = ["//visibility:public"],
+    )
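
For reference, a call such as `nccl_tests_binary(name = "all_reduce")` in nccl-tests.BUILD expands to roughly the two targets below. This is only a sketch of what the macro above generates; the `cc_deps`/`cuda_deps` parameters are currently unused by the macro.

# Approximate expansion of nccl_tests_binary(name = "all_reduce").
cuda_library(
    name = "all_reduce",
    srcs = ["nccl-tests/src/all_reduce.cu"],
    deps = [
        "@nccl//:nccl_shared",  # imported libnccl.so
        ":common_cuda",         # shared test harness (common.cu, verifiable.cu)
    ],
    alwayslink = 1,
)

cc_binary(
    name = "all_reduce_perf",
    deps = [":common_cc", ":all_reduce"],
    visibility = ["//visibility:public"],
)
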
diff --git a/examples/nccl/nccl.BUILD b/examples/nccl/nccl.BUILD
new file mode 100644
index 00000000..98f36117
--- /dev/null
+++ b/examples/nccl/nccl.BUILD
@@ -0,0 +1,163 @@
+load("@bazel_skylib//rules:expand_template.bzl", "expand_template")
+load("@rules_cuda//cuda:defs.bzl", "cuda_library", "cuda_objects")
+load("@rules_cuda_examples//nccl:nccl.bzl", "if_cuda_clang", "if_cuda_nvcc", "nccl_primitive")
+
+expand_template(
+    name = "nccl_h",
+    out = "nccl/src/include/nccl.h",
+    substitutions = {
+        "${nccl:Major}": "2",
+        "${nccl:Minor}": "18",
+        "${nccl:Patch}": "3",
+        "${nccl:Suffix}": "",
+        # NCCL_VERSION(X,Y,Z) ((X) * 10000 + (Y) * 100 + (Z))
+        "${nccl:Version}": "21803",
+    },
+    template = "nccl/src/nccl.h.in",
+)
+
+cc_library(
+    name = "nccl_include",
+    hdrs = [
+        ":nccl_h",
+    ] + glob([
+        "nccl/src/include/**/*.h",
+        "nccl/src/include/**/*.hpp",
+    ]),
+    includes = [
+        # This adds both nccl/src/include (in the source tree) and
+        # bazel-out//bin/nccl/src/include (in the output tree) to the include paths,
+        # so the nccl.h generated by the expand_template above is found there as well.
+        "nccl/src/include",
+    ],
+)
+
+cuda_objects(
+    name = "nccl_device_common",
+    srcs = [
+        "nccl/src/collectives/device/functions.cu",
+        "nccl/src/collectives/device/onerank_reduce.cu",
+    ] + glob([
+        "nccl/src/collectives/device/**/*.h",
+    ]),
+    copts = if_cuda_nvcc(["--extended-lambda"]),
+    ptxasopts = ["-maxrregcount=96"],
+    deps = [":nccl_include"],
+)
+
+# Must be disabled manually if the CUDA version is lower than 11.
+USE_BF16 = True
+
+filegroup(
+    name = "collective_dev_hdrs",
+    srcs = [
+        "nccl/src/collectives/device/all_gather.h",
+        "nccl/src/collectives/device/all_reduce.h",
+        "nccl/src/collectives/device/broadcast.h",
+        "nccl/src/collectives/device/common.h",
+        "nccl/src/collectives/device/common_kernel.h",
+        "nccl/src/collectives/device/gen_rules.sh",
+        "nccl/src/collectives/device/op128.h",
+        "nccl/src/collectives/device/primitives.h",
+        "nccl/src/collectives/device/prims_ll.h",
+        "nccl/src/collectives/device/prims_ll128.h",
+        "nccl/src/collectives/device/prims_simple.h",
+        "nccl/src/collectives/device/reduce.h",
+        "nccl/src/collectives/device/reduce_kernel.h",
+        "nccl/src/collectives/device/reduce_scatter.h",
+        "nccl/src/collectives/device/sendrecv.h",
+    ],
+)
+
+# cuda_objects for each type of primitive
+nccl_primitive(
+    name = "all_gather",
+    hdrs = ["collective_dev_hdrs"],
+    use_bf16 = USE_BF16,
+    deps = [":nccl_device_common"],
+)
+
+nccl_primitive(
+    name = "all_reduce",
+    hdrs = ["collective_dev_hdrs"],
+    use_bf16 = USE_BF16,
+    deps = [":nccl_device_common"],
+)
+
+nccl_primitive(
+    name = "broadcast",
+    hdrs = ["collective_dev_hdrs"],
+    use_bf16 = USE_BF16,
+    deps = [":nccl_device_common"],
+)
+
+nccl_primitive(
+    name = "reduce",
+    hdrs = ["collective_dev_hdrs"],
+    use_bf16 = USE_BF16,
+    deps = [":nccl_device_common"],
+)
+
+nccl_primitive(
+    name = "reduce_scatter",
+    hdrs = ["collective_dev_hdrs"],
+    use_bf16 = USE_BF16,
+    deps = [":nccl_device_common"],
+)
+
+nccl_primitive(
+    name = "sendrecv",
+    hdrs = ["collective_dev_hdrs"],
+    use_bf16 = USE_BF16,
+    deps = [":nccl_device_common"],
+)
+
+# device link
+cuda_library(
+    name = "collectives",
+    rdc = 1,
+    deps = [
+        ":all_gather",
+        ":all_reduce",
+        ":broadcast",
+        ":reduce",
+        ":reduce_scatter",
+        ":sendrecv",
+    ],
+    alwayslink = 1,
+)
+
+cc_binary(
+    name = "nccl",
+    srcs = glob(
+        [
+            "nccl/src/*.cc",
+            "nccl/src/collectives/*.cc",
+            "nccl/src/graph/*.cc",
+            "nccl/src/graph/*.h",
+            "nccl/src/misc/*.cc",
+            "nccl/src/transport/*.cc",
+        ],
+        exclude = [
+            # https://github.com/NVIDIA/nccl/issues/658
+            "nccl/src/enhcompat.cc",
+        ],
+    ),
+    copts = if_cuda_clang(["-xcu"]),
+    linkshared = 1,
+    linkstatic = 1,
+    visibility = ["//visibility:public"],
+    deps = [
+        ":collectives",
+        ":nccl_include",
+        "@rules_cuda//cuda:runtime",
+    ],
+)
+
+# To allow downstream targets to link with the nccl shared library, we need to `cc_import` it again.
+# See https://groups.google.com/g/bazel-discuss/c/RtbidPdVFyU/m/TsUDOVHIAwAJ
+cc_import(
+    name = "nccl_shared",
+    shared_library = ":nccl",
+    visibility = ["//visibility:public"],
+)
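
As the cc_import comment above explains, other packages depend on `@nccl//:nccl_shared` rather than on the `:nccl` cc_binary itself, which is exactly what `nccl_tests_binary` does. A minimal downstream consumer might look like the sketch below; the `my_nccl_app` target and its source file are hypothetical, and the NCCL headers would additionally have to be exposed (for example by making `:nccl_include` publicly visible), which this change does not do.

# Hypothetical downstream target, not part of this change.
cc_binary(
    name = "my_nccl_app",              # assumed name
    srcs = ["my_nccl_app.cc"],         # assumed source
    deps = ["@nccl//:nccl_shared"],    # links against the imported libnccl.so
)
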
diff --git a/examples/nccl/nccl.bzl b/examples/nccl/nccl.bzl
new file mode 100644
index 00000000..e2758f27
--- /dev/null
+++ b/examples/nccl/nccl.bzl
@@ -0,0 +1,43 @@
+load("@bazel_skylib//rules:copy_file.bzl", "copy_file")
+load("@rules_cuda//cuda:defs.bzl", "cuda_library", "cuda_objects")
+
+def if_cuda_nvcc(if_true, if_false = []):
+    return select({
+        "@rules_cuda//cuda:compiler_is_nvcc": if_true,
+        "//conditions:default": if_false,
+    })
+
+def if_cuda_clang(if_true, if_false = []):
+    return select({
+        "@rules_cuda//cuda:compiler_is_clang": if_true,
+        "//conditions:default": if_false,
+    })
+
+def nccl_primitive(name, hdrs = [], deps = [], use_bf16 = True):
+    ops = ["sum", "prod", "min", "max", "premulsum", "sumpostdiv"]
+    datatypes = ["i8", "u8", "i32", "u32", "i64", "u64", "f16", "f32", "f64"]
+    if use_bf16:
+        datatypes.append("bf16")
+
+    intermediate_targets = []
+    for opn, op in enumerate(ops):
+        for dtn, dt in enumerate(datatypes):
+            name_op_dt = "{}_{}_{}".format(name, op, dt)
+            copy_file(
+                name = name_op_dt + "_rename",
+                src = "nccl/src/collectives/device/{}.cu".format(name),
+                out = "nccl/src/collectives/device/{}.cu".format(name_op_dt),
+            )
+
+            cuda_objects(
+                name = name_op_dt,
+                srcs = [":{}_rename".format(name_op_dt)],
+                hdrs = hdrs,
+                deps = deps,
+                ptxasopts = ["-maxrregcount=96"],
+                defines = ["NCCL_OP={}".format(opn), "NCCL_TYPE={}".format(dtn)],
+                includes = ["nccl/src/collectives/device"],
+            )
+            intermediate_targets.append(":" + name_op_dt)
+
+    cuda_objects(name = name, deps = intermediate_targets)
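
To make the macro above concrete: for `nccl_primitive(name = "all_reduce", ...)`, the iteration with op "sum" (index 0 in `ops`) and type "f32" (index 7 in `datatypes`) generates roughly the pair of targets sketched below, and the final `cuda_objects(name = "all_reduce", ...)` then aggregates the 60 per-op, per-type objects when bf16 is enabled (54 otherwise).

# Approximate expansion of one (op, dtype) iteration inside
# nccl_primitive(name = "all_reduce", ...): "sum" is NCCL_OP=0, "f32" is NCCL_TYPE=7.
copy_file(
    name = "all_reduce_sum_f32_rename",
    src = "nccl/src/collectives/device/all_reduce.cu",
    out = "nccl/src/collectives/device/all_reduce_sum_f32.cu",
)

cuda_objects(
    name = "all_reduce_sum_f32",
    srcs = [":all_reduce_sum_f32_rename"],
    hdrs = ["collective_dev_hdrs"],      # as passed in from nccl.BUILD
    deps = [":nccl_device_common"],      # as passed in from nccl.BUILD
    ptxasopts = ["-maxrregcount=96"],
    defines = ["NCCL_OP=0", "NCCL_TYPE=7"],
    includes = ["nccl/src/collectives/device"],
)

The per-combination copy_file rename is what allows every object to be compiled from the same all_reduce.cu source while carrying its own NCCL_OP/NCCL_TYPE defines.
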