From f26dcfc86b3f6582821ace236e9dc56705296904 Mon Sep 17 00:00:00 2001
From: artemry-nv
Date: Thu, 18 Feb 2021 02:28:57 +0300
Subject: [PATCH 1/8] Implemented Torch-UCC CI (build part)

Signed-off-by: artemry-nv
---
 .ci/Dockerfile.ubuntu20.04 | 58 ++++++++++++++++++++++++++++
 .ci/Jenkinsfile.shlib      |  9 +++++
 .ci/job_matrix.yaml        | 79 ++++++++++++++++++++++++++++++++++++++
 .ci/proj_jjb.yaml          | 79 ++++++++++++++++++++++++++++++++++++++
 .gitignore                 |  3 ++
 .gitmodules                |  9 +++++
 6 files changed, 237 insertions(+)
 create mode 100644 .ci/Dockerfile.ubuntu20.04
 create mode 100644 .ci/Jenkinsfile.shlib
 create mode 100644 .ci/job_matrix.yaml
 create mode 100644 .ci/proj_jjb.yaml
 create mode 100644 .gitmodules

diff --git a/.ci/Dockerfile.ubuntu20.04 b/.ci/Dockerfile.ubuntu20.04
new file mode 100644
index 0000000..6343cfb
--- /dev/null
+++ b/.ci/Dockerfile.ubuntu20.04
@@ -0,0 +1,58 @@
+FROM nvidia/cuda:11.2.1-devel-ubuntu20.04
+
+ARG DEBIAN_FRONTEND=noninteractive
+
+RUN apt update && \
+    apt install -y \
+    apt-utils \
+    autoconf \
+    build-essential \
+    cmake \
+    curl \
+    git \
+    ibverbs-providers \
+    ibverbs-utils \
+    libnuma-dev \
+    libtool-bin \
+    vim \
+    && \
+    rm -rf /var/lib/apt/lists/*
+
+# Install conda
+RUN curl -LO http://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh && \
+    bash Miniconda3-latest-Linux-x86_64.sh -p /opt/conda -b && \
+    rm -f Miniconda3-latest-Linux-x86_64.sh
+ENV PATH /opt/conda/bin:${PATH}
+
+# Install conda python
+RUN conda update -y conda && \
+    conda install -c anaconda -y \
+    python \
+    pip && \
+    pip install --no-cache-dir python-hostlist
+
+RUN ln -s /opt/conda/bin/python /usr/bin/python
+
+WORKDIR "/workspace"
+
+# Install PyTorch
+RUN git clone https://github.com/pytorch/pytorch.git && \
+    cd pytorch && \
+    git submodule sync --recursive && \
+    git submodule update --init --recursive && \
+    pip install -r requirements.txt && \
+    TORCH_CUDA_ARCH_LIST="7.0 8.0+PTX" \
+    USE_GLOO=1 \
+    USE_DISTRIBUTED=1 \
+    USE_OPENCV=0 \
+    USE_CUDA=1 \
+    USE_NCCL=0 \
+    USE_MKLDNN=0 \
+    BUILD_TEST=0 \
+    USE_FBGEMM=0 \
+    USE_NNPACK=0 \
+    USE_QNNPACK=0 \
+    USE_XNNPACK=0 \
+    USE_KINETO=1 \
+    python setup.py install && \
+    rm -rf /workspace/pytorch
diff --git a/.ci/Jenkinsfile.shlib b/.ci/Jenkinsfile.shlib
new file mode 100644
index 0000000..10bde70
--- /dev/null
+++ b/.ci/Jenkinsfile.shlib
@@ -0,0 +1,9 @@
+#!/usr/bin/groovy
+
+// load pipeline functions
+// Requires pipeline-github-lib plugin to load library from github
+@Library('github.com/Mellanox/ci-demo@master')
+def matrix = new com.mellanox.cicd.Matrix()
+
+matrix.main()
+
diff --git a/.ci/job_matrix.yaml b/.ci/job_matrix.yaml
new file mode 100644
index 0000000..ffc339c
--- /dev/null
+++ b/.ci/job_matrix.yaml
@@ -0,0 +1,79 @@
+---
+job: 'torch-ucc'
+
+registry_host: 'harbor.mellanox.com'
+# TODO change
+registry_path: '/swx-infra/torch-ucc'
+registry_auth: '1daaea28-800e-425f-a91f-3bd3e9136eea'
+
+kubernetes:
+  cloud: 'swx-k8s'
+
+volumes:
+  - { mountPath: '/hpc/local', hostPath: '/hpc/local' }
+  - { mountPath: '/auto/sw_tools', hostPath: '/auto/sw_tools' }
+  - { mountPath: '/.autodirect/mtrswgwork', hostPath: '/.autodirect/mtrswgwork' }
+  - { mountPath: '/.autodirect/sw/release', hostPath: '/.autodirect/sw/release' }
+
+env:
+  CUDA_HOME: '/usr/local/cuda'
+  UCX_BRANCH: 'v1.10.x'
+  UCX_SRC_DIR: '${WORKSPACE}/ucx'
+  UCX_BUILD_DIR: '${UCX_SRC_DIR}/build'
+  UCX_INSTALL_DIR: '${UCX_BUILD_DIR}/_install'
+  UCC_SRC_DIR: '${WORKSPACE}/ucc'
+  XCCL_SRC_DIR: '${WORKSPACE}/xccl'
+  XCCL_BUILD_DIR: '${XCCL_SRC_DIR}/build'
+  XCCL_INSTALL_DIR: '${XCCL_BUILD_DIR}/_install'
+
+runs_on_dockers:
+  - { file: '.ci/Dockerfile.ubuntu20.04', name: 'ubuntu20.04', tag: 'latest', arch: 'x86_64' }
+
+steps:
+  #============================================================================
+  - name: Check Env
+    shell: '#!/bin/bash -eEx'
+    run: |
+      echo "INFO: check environment"
+      find .
+      printenv
+      cat /proc/1/cgroup
+      cat /etc/*release*
+      id
+  #============================================================================
+  - name: Build UCX
+    shell: '#!/bin/bash -eEx'
+    run: |
+      echo "INFO: Build UCX"
+      cd ${UCX_SRC_DIR}
+      git checkout ${UCX_BRANCH}
+      ${UCX_SRC_DIR}/autogen.sh
+      mkdir -p ${UCX_BUILD_DIR}
+      cd ${UCX_BUILD_DIR}
+      ${UCX_SRC_DIR}/configure --enable-mt --with-cuda=${CUDA_HOME} --prefix=${UCX_INSTALL_DIR}
+      make -j install
+  #============================================================================
+  - name: Build XCCL
+    shell: '#!/bin/bash -eEx'
+    run: |
+      echo "INFO: Build XCCL"
+      cd ${XCCL_SRC_DIR}
+      # TODO tmp W/A
+      sed -i 's|NVCCFLAGS = .*|NVCCFLAGS = "${UCS_CPPFLAGS} -I${XCCL_TOP_SRCDIR}/src -I${XCCL_TOP_SRCDIR}/src/core" --compiler-options -fno-rtti,-fno-exceptions|g' ${XCCL_SRC_DIR}/src/utils/cuda/kernels/Makefile.am
+      ${XCCL_SRC_DIR}/autogen.sh
+      mkdir -p ${XCCL_BUILD_DIR}
+      cd ${XCCL_BUILD_DIR}
+      ${XCCL_SRC_DIR}/configure --with-cuda=${CUDA_HOME} --with-ucx=${UCX_INSTALL_DIR} \
+      --prefix=${XCCL_INSTALL_DIR}
+      sleep 7200
+      make -j install
+  #============================================================================
+  - name: Install Torch-UCC
+    run: |
+      echo "INFO: Install Torch-UCC"
+      cd ${WORKSPACE}
+      UCX_HOME=${UCX_INSTALL_DIR}
+      XCCL_HOME=${XCCL_INSTALL_DIR}
+      WITH_CUDA=${CUDA_HOME}
+      python setup.py install
+  #============================================================================
diff --git a/.ci/proj_jjb.yaml b/.ci/proj_jjb.yaml
new file mode 100644
index 0000000..0760821
--- /dev/null
+++ b/.ci/proj_jjb.yaml
@@ -0,0 +1,79 @@
+- job-template:
+    name: "{jjb_proj}"
+    project-type: pipeline
+    properties:
+      - github:
+          url: "{jjb_git}"
+      - build-discarder:
+          days-to-keep: 50
+          num-to-keep: 20
+      - inject:
+          keep-system-variables: true
+          properties-content: |
+            jjb_proj={jjb_proj}
+    description: Do NOT edit this job through the Web GUI !
+    concurrent: true
+    sandbox: true
+    parameters:
+      - string:
+          name: "sha1"
+          default: "master"
+          description: "Commit to be checked, set by PR"
+      - bool:
+          name: "build_dockers"
+          default: false
+          description: "Rebuild docker containers"
+      - string:
+          name: "conf_file"
+          default: ".ci/job_matrix.yaml"
+          description: "Regex to select job config file"
+      - bool:
+          name: "do_release"
+          default: false
+          description: "Release rpm"
+      - string:
+          name: "release_dir"
+          default: "/.autodirect/sw/release/sw_acceleration/{jjb_proj}"
+          description: "Location to release rpm to"
+      - string:
+          name: "script"
+          default: "{jjb_jenkinsfile}"
+          description: "Jenkinsfile to load on trigger"
+      - string:
+          name: "DEBUG"
+          default: 0
+          description: "Enable debug prints and traces, valid values are 0-9"
+    # triggers:
+    #   - github-pull-request:
+    #       cron: 'H/5 * * * *'
+    #       trigger-phrase: '.*\bbot:retest\b.*'
+    #       status-add-test-results: true
+    #       auth-id: '549927eb-7f38-4a8f-997a-81dd63605782'
+    #       org-list: ["Mellanox"]
+    #       white-list: ["swx-jenkins","swx-jenkins2","swx-jenkins3","mike-dubman","mellanox-github"]
+    #       allow-whitelist-orgs-as-admins: true
+    pipeline-scm:
+      scm:
+        - git:
+            url: "{jjb_git}"
+            credentials-id: '549927eb-7f38-4a8f-997a-81dd63605782'
+            branches: [ '$sha1' ]
+            shallow-clone: true
+            depth: 10
+            refspec: "+refs/heads/*:refs/remotes/origin/* +refs/pull/*:refs/remotes/origin/pr/*"
+            browser: githubweb
+            browser-url: "{jjb_git}"
+      script-path: "$script"
+
+- project:
+    name: proj_name
+    # TODO
+    jjb_email: 'TODO'
+    jjb_proj: 'torch-ucc'
+    # TODO tmp
+    jjb_git: 'git@github.com:artemry-nv/torch-ucc.git'
+    # TODO
+    jjb_owner: 'TODO'
+    jjb_jenkinsfile: '.ci/Jenkinsfile.shlib'
+    jobs:
+      - "{jjb_proj}"
diff --git a/.gitignore b/.gitignore
index 6c7df1c..e7050cd 100644
--- a/.gitignore
+++ b/.gitignore
@@ -60,3 +60,6 @@ dist/
 # vscode
 *.code-workspace
 .vscode
+
+# IDEs
+.idea/
diff --git a/.gitmodules b/.gitmodules
new file mode 100644
index 0000000..80bb6e4
--- /dev/null
+++ b/.gitmodules
@@ -0,0 +1,9 @@
+[submodule "xccl"]
+	path = xccl
+	url = https://github.com/openucx/xccl.git
+[submodule "ucc"]
+	path = ucc
+	url = https://github.com/openucx/ucc.git
+[submodule "ucx"]
+	path = ucx
+	url = https://github.com/openucx/ucx.git

From b7ecf69abe4ec4d6cf499b963faf3f630d8a7452 Mon Sep 17 00:00:00 2001
From: artemry-nv
Date: Thu, 18 Feb 2021 02:34:26 +0300
Subject: [PATCH 2/8] Updated submodules

---
 .gitmodules | 12 ++++++------
 ucc         |  1 +
 ucx         |  1 +
 xccl        |  1 +
 4 files changed, 9 insertions(+), 6 deletions(-)
 create mode 160000 ucc
 create mode 160000 ucx
 create mode 160000 xccl

diff --git a/.gitmodules b/.gitmodules
index 80bb6e4..1350374 100644
--- a/.gitmodules
+++ b/.gitmodules
@@ -1,9 +1,9 @@
 [submodule "xccl"]
-	path = xccl
-	url = https://github.com/openucx/xccl.git
+    path = xccl
+    url = https://github.com/openucx/xccl.git
 [submodule "ucc"]
-	path = ucc
-	url = https://github.com/openucx/ucc.git
+    path = ucc
+    url = https://github.com/openucx/ucc.git
 [submodule "ucx"]
-	path = ucx
-	url = https://github.com/openucx/ucx.git
+    path = ucx
+    url = https://github.com/openucx/ucx.git
diff --git a/ucc b/ucc
new file mode 160000
index 0000000..ea2ecb5
--- /dev/null
+++ b/ucc
@@ -0,0 +1 @@
+Subproject commit ea2ecb55332d933673f4fc1d5e44ffa5d73a8ad7
diff --git a/ucx b/ucx
new file mode 160000
index 0000000..c19d6a5
--- /dev/null
+++ b/ucx
@@ -0,0 +1 @@
+Subproject commit c19d6a5af233f98a463b834e12cd8a4ab7e403cf
diff --git a/xccl b/xccl
new file mode 160000
index 0000000..f4f2082
--- /dev/null
+++ b/xccl
@@ -0,0 +1 @@
+Subproject commit f4f2082dbbedbf795c1cb376e94e811328704191

From 099ca8bcb545e6ce337a56de19c45f965c8ffd27 Mon Sep 17 00:00:00 2001
From: artemry-nv
Date: Thu, 18 Feb 2021 02:40:01 +0300
Subject: [PATCH 3/8] Removed some debug stuff

Signed-off-by: artemry-nv
---
 .ci/job_matrix.yaml | 1 -
 1 file changed, 1 deletion(-)

diff --git a/.ci/job_matrix.yaml b/.ci/job_matrix.yaml
index ffc339c..761eea9 100644
--- a/.ci/job_matrix.yaml
+++ b/.ci/job_matrix.yaml
@@ -65,7 +65,6 @@ steps:
       cd ${XCCL_BUILD_DIR}
       ${XCCL_SRC_DIR}/configure --with-cuda=${CUDA_HOME} --with-ucx=${UCX_INSTALL_DIR} \
       --prefix=${XCCL_INSTALL_DIR}
-      sleep 7200
       make -j install
   #============================================================================
   - name: Install Torch-UCC

From 2f29428dafdeebbb79b025cd7982884df24f8bd8 Mon Sep 17 00:00:00 2001
From: artemry-nv
Date: Wed, 24 Feb 2021 14:36:59 +0300
Subject: [PATCH 4/8] Reworked building/packaging, added docker image
 preparation, added tests

---
 .ci/Dockerfile.ubuntu20.04         | 83 ++++++++++++++++++++++++++----
 .ci/job_matrix.yaml                | 68 +++++++++---------------
 .ci/scripts/run_tests_torch_ucc.sh | 18 +++++++
 .ci/scripts/run_tests_xccl.sh      | 23 +++++++++
 ucc                                |  2 +-
 ucx                                |  2 +-
 xccl                               |  2 +-
 7 files changed, 142 insertions(+), 56 deletions(-)
 create mode 100755 .ci/scripts/run_tests_torch_ucc.sh
 create mode 100755 .ci/scripts/run_tests_xccl.sh

diff --git a/.ci/Dockerfile.ubuntu20.04 b/.ci/Dockerfile.ubuntu20.04
index 6343cfb..ce765d9 100644
--- a/.ci/Dockerfile.ubuntu20.04
+++ b/.ci/Dockerfile.ubuntu20.04
@@ -1,7 +1,24 @@
-FROM nvidia/cuda:11.2.1-devel-ubuntu20.04
+ARG CUDA_VER='11.2.1'
+FROM nvidia/cuda:${CUDA_VER}-devel-ubuntu20.04
+#==============================================================================
+ARG TORCH_UCC_ROOT_DIR=/opt/nvidia/torch-ucc
+ENV TORCH_UCC_SRC_DIR=${TORCH_UCC_ROOT_DIR}/src
+ENV TORCH_UCC_PKG_DIR=${TORCH_UCC_ROOT_DIR}/pkg
+ENV TORCH_UCC_BIN_DIR=${TORCH_UCC_ROOT_DIR}/bin
+ENV CUDA_HOME=/usr/local/cuda
+ENV UCX_BRANCH=v1.10.x
+ENV UCX_BUILD_TYPE=release-mt
+ENV UCX_INSTALL_DIR=${TORCH_UCC_BIN_DIR}/ucx/build-${UCX_BUILD_TYPE}
+ENV XCCL_BUILD_TYPE=debug
+ENV XCCL_INSTALL_DIR=${TORCH_UCC_BIN_DIR}/xccl/build-${XCCL_BUILD_TYPE}
+#==============================================================================
+RUN mkdir -p ${TORCH_UCC_SRC_DIR} && \
+    mkdir -p ${TORCH_UCC_PKG_DIR} && \
+    mkdir -p ${TORCH_UCC_BIN_DIR}
+COPY . ${TORCH_UCC_SRC_DIR}
+#==============================================================================
 ARG DEBIAN_FRONTEND=noninteractive
-
 RUN apt update && \
     apt install -y \
     apt-utils \
@@ -14,16 +31,48 @@ RUN apt update && \
     ibverbs-utils \
     libnuma-dev \
     libtool-bin \
+    ninja-build \
+    openmpi-bin \
     vim \
     && \
     rm -rf /var/lib/apt/lists/*
-
+#==============================================================================
+# Build UCX
+RUN echo "INFO: Build UCX" && \
+    cd ${TORCH_UCC_SRC_DIR}/ucx && \
+    git checkout ${UCX_BRANCH} && \
+    ${TORCH_UCC_SRC_DIR}/ucx/autogen.sh && \
+    mkdir -p ${TORCH_UCC_SRC_DIR}/ucx/build-${UCX_BUILD_TYPE} && \
+    cd ${TORCH_UCC_SRC_DIR}/ucx/build-${UCX_BUILD_TYPE} && \
+    ${TORCH_UCC_SRC_DIR}/ucx/contrib/configure-release-mt --with-cuda=${CUDA_HOME} --prefix=${UCX_INSTALL_DIR} && \
+    make -j install && \
+    echo "${UCX_INSTALL_DIR}/lib" > /etc/ld.so.conf.d/ucx.conf && \
+    ldconfig && \
+    ldconfig -p | grep -i ucx && \
+    cd ${UCX_INSTALL_DIR} && tar cfz ${TORCH_UCC_PKG_DIR}/ucx-${UCX_BUILD_TYPE}.tgz --owner=0 --group=0 .
+ENV PATH=${UCX_INSTALL_DIR}/bin:${PATH}
+#==============================================================================
+# Build XCCL
+RUN echo "INFO: Build XCCL" && \
+    cd ${TORCH_UCC_SRC_DIR}/xccl && \
+    ${TORCH_UCC_SRC_DIR}/xccl/autogen.sh && \
+    mkdir -p ${TORCH_UCC_SRC_DIR}/xccl/build-${XCCL_BUILD_TYPE} && \
+    cd ${TORCH_UCC_SRC_DIR}/xccl/build-${XCCL_BUILD_TYPE} && \
+    ${TORCH_UCC_SRC_DIR}/xccl/configure --with-ucx=${UCX_INSTALL_DIR} \
+    --prefix=${XCCL_INSTALL_DIR} --enable-debug && \
+    make -j install && \
+    echo "${XCCL_INSTALL_DIR}/lib" > /etc/ld.so.conf.d/xccl.conf && \
+    ldconfig && \
+    ldconfig -p | grep -i xccl && \
+    make -C test && \
+    cd ${XCCL_INSTALL_DIR} && tar cfz ${TORCH_UCC_PKG_DIR}/xccl-${XCCL_BUILD_TYPE}.tgz --owner=0 --group=0 .
+#==============================================================================
 # Install conda
 RUN curl -LO http://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh && \
     bash Miniconda3-latest-Linux-x86_64.sh -p /opt/conda -b && \
     rm -f Miniconda3-latest-Linux-x86_64.sh
 ENV PATH /opt/conda/bin:${PATH}
-
+#==============================================================================
 # Install conda python
 RUN conda update -y conda && \
     conda install -c anaconda -y \
     python \
     pip && \
     pip install --no-cache-dir python-hostlist
 
 RUN ln -s /opt/conda/bin/python /usr/bin/python
-
-WORKDIR "/workspace"
-
-# Install PyTorch
-RUN git clone https://github.com/pytorch/pytorch.git && \
+RUN python3 -m pip install --user --upgrade setuptools wheel auditwheel check-wheel-contents
+#==============================================================================
+# Build and Install PyTorch
+RUN cd /tmp && \
+    git clone https://github.com/pytorch/pytorch.git && \
     cd pytorch && \
     git submodule sync --recursive && \
     git submodule update --init --recursive && \
     pip install -r requirements.txt && \
     TORCH_CUDA_ARCH_LIST="7.0 8.0+PTX" \
     USE_GLOO=1 \
     USE_DISTRIBUTED=1 \
     USE_OPENCV=0 \
     USE_CUDA=1 \
     USE_NCCL=0 \
     USE_MKLDNN=0 \
     BUILD_TEST=0 \
     USE_FBGEMM=0 \
     USE_NNPACK=0 \
     USE_QNNPACK=0 \
     USE_XNNPACK=0 \
     USE_KINETO=1 \
+    MAX_JOBS=$(($(nproc)-1)) \
     python setup.py install && \
-    rm -rf /workspace/pytorch
+    rm -rf /tmp/pytorch
+#==============================================================================
+# Install torch_ucc python module and build a wheel package
+RUN echo "INFO: Install Torch-UCC" && \
+    cd ${TORCH_UCC_SRC_DIR} && \
+    env \
+    UCX_HOME=${UCX_INSTALL_DIR} \
+    XCCL_HOME=${XCCL_INSTALL_DIR} \
+    WITH_CUDA=${CUDA_HOME} \
+    python setup.py install bdist_wheel && \
+    pip3 list | grep torch && \
+    python -c 'import torch, torch_ucc' && \
+    cp ${TORCH_UCC_SRC_DIR}/dist/*.whl ${TORCH_UCC_PKG_DIR}
+#==============================================================================
diff --git a/.ci/job_matrix.yaml b/.ci/job_matrix.yaml
index 761eea9..fbeb2c5 100644
--- a/.ci/job_matrix.yaml
+++ b/.ci/job_matrix.yaml
@@ -4,6 +4,7 @@ job: 'torch-ucc'
 registry_host: 'harbor.mellanox.com'
 # TODO change
 registry_path: '/swx-infra/torch-ucc'
+#registry_path: '/torch-ucc'
 registry_auth: '1daaea28-800e-425f-a91f-3bd3e9136eea'
 
 kubernetes:
@@ -16,63 +17,44 @@ volumes:
   - { mountPath: '/.autodirect/sw/release', hostPath: '/.autodirect/sw/release' }
 
 env:
-  CUDA_HOME: '/usr/local/cuda'
-  UCX_BRANCH: 'v1.10.x'
-  UCX_SRC_DIR: '${WORKSPACE}/ucx'
-  UCX_BUILD_DIR: '${UCX_SRC_DIR}/build'
-  UCX_INSTALL_DIR: '${UCX_BUILD_DIR}/_install'
-  UCC_SRC_DIR: '${WORKSPACE}/ucc'
-  XCCL_SRC_DIR: '${WORKSPACE}/xccl'
-  XCCL_BUILD_DIR: '${XCCL_SRC_DIR}/build'
-  XCCL_INSTALL_DIR: '${XCCL_BUILD_DIR}/_install'
+  CUDA_VER: '11.2.1'
+  TORCH_UCC_URI_SUFFIX: '${TORCH_UCC_VERSION}/x86_64/ubuntu20.04/cuda${CUDA_VER}'
+  TORCH_UCC_ROOT_DIR: '/opt/nvidia/torch-ucc'
+  TORCH_UCC_SRC_DIR: '${TORCH_UCC_ROOT_DIR}/src'
 
 runs_on_dockers:
-  - { file: '.ci/Dockerfile.ubuntu20.04', name: 'ubuntu20.04', tag: 'latest', arch: 'x86_64' }
+  - {
+    file: '.ci/Dockerfile.ubuntu20.04',
+    name: 'ubuntu20.04',
+    tag: 'latest',
+    arch: 'x86_64',
+    uri: '${TORCH_UCC_URI_SUFFIX}',
+    build_args: '--rm --no-cache --build-arg CUDA_VER=${CUDA_VER} --build-arg TORCH_UCC_ROOT_DIR=${TORCH_UCC_ROOT_DIR}',
+  }
+
+# TODO debug
+timeout_minutes: '180'
 
 steps:
   #============================================================================
   - name: Check Env
-    shell: '#!/bin/bash -eEx'
     run: |
       echo "INFO: check environment"
-      find .
       printenv
       cat /proc/1/cgroup
       cat /etc/*release*
       id
+      find /opt/nvidia
   #============================================================================
-  - name: Build UCX
-    shell: '#!/bin/bash -eEx'
+  - name: Run XCCL tests
     run: |
-      echo "INFO: Build UCX"
-      cd ${UCX_SRC_DIR}
-      git checkout ${UCX_BRANCH}
-      ${UCX_SRC_DIR}/autogen.sh
-      mkdir -p ${UCX_BUILD_DIR}
-      cd ${UCX_BUILD_DIR}
-      ${UCX_SRC_DIR}/configure --enable-mt --with-cuda=${CUDA_HOME} --prefix=${UCX_INSTALL_DIR}
-      make -j install
-  #============================================================================
-  - name: Build XCCL
-    shell: '#!/bin/bash -eEx'
-    run: |
-      echo "INFO: Build XCCL"
-      cd ${XCCL_SRC_DIR}
-      # TODO tmp W/A
-      sed -i 's|NVCCFLAGS = .*|NVCCFLAGS = "${UCS_CPPFLAGS} -I${XCCL_TOP_SRCDIR}/src -I${XCCL_TOP_SRCDIR}/src/core" --compiler-options -fno-rtti,-fno-exceptions|g' ${XCCL_SRC_DIR}/src/utils/cuda/kernels/Makefile.am
-      ${XCCL_SRC_DIR}/autogen.sh
-      mkdir -p ${XCCL_BUILD_DIR}
-      cd ${XCCL_BUILD_DIR}
-      ${XCCL_SRC_DIR}/configure --with-cuda=${CUDA_HOME} --with-ucx=${UCX_INSTALL_DIR} \
-      --prefix=${XCCL_INSTALL_DIR}
-      make -j install
+      echo "INFO: Run XCCL tests"
+      ${TORCH_UCC_SRC_DIR}/.ci/scripts/run_tests_xccl.sh
+      #sleep 10000
   #============================================================================
-  - name: Install Torch-UCC
+  - name: Run Torch-UCC tests
     run: |
-      echo "INFO: Install Torch-UCC"
-      cd ${WORKSPACE}
-      UCX_HOME=${UCX_INSTALL_DIR}
-      XCCL_HOME=${XCCL_INSTALL_DIR}
-      WITH_CUDA=${CUDA_HOME}
-      python setup.py install
+      echo "INFO: Run Torch-UCC tests"
+      ${TORCH_UCC_SRC_DIR}/.ci/scripts/run_tests_torch_ucc.sh
+      #sleep 20000
   #============================================================================
diff --git a/.ci/scripts/run_tests_torch_ucc.sh b/.ci/scripts/run_tests_torch_ucc.sh
new file mode 100755
index 0000000..5f74a15
--- /dev/null
+++ b/.ci/scripts/run_tests_torch_ucc.sh
@@ -0,0 +1,18 @@
+#!/bin/sh -eEx
+
+command -v mpirun
+export TORCH_UCC_XCCL_TLS=ucx
+ucx_info -e -u t
+export UCX_LOG_LEVEL=info
+#echo "XCCL allreduce"
+#/bin/bash ${XCCL_SRC_DIR}/test/start_test.sh ${XCCL_SRC_DIR}/test/torch_allreduce_test.py --backend=gloo
+#echo "XCCL alltoall"
+#/bin/bash ${XCCL_SRC_DIR}/test/start_test.sh ${XCCL_SRC_DIR}/test/torch_alltoall_test.py --backend=gloo
+#echo "XCCL alltoallv"
+#/bin/bash ${XCCL_SRC_DIR}/test/start_test.sh ${XCCL_SRC_DIR}/test/torch_alltoallv_test.py --backend=gloo
+#echo "XCCL barrier"
+#/bin/bash ${XCCL_SRC_DIR}/test/start_test.sh ${XCCL_SRC_DIR}/test/torch_barrier_test.py --backend=gloo
+#echo "XCCL allgather"
+#/bin/bash ${XCCL_SRC_DIR}/test/start_test.sh ${XCCL_SRC_DIR}/test/torch_allgather_test.py --backend=gloo
+#echo "XCCL broadcast"
+#/bin/bash ${XCCL_SRC_DIR}/test/start_test.sh ${XCCL_SRC_DIR}/test/torch_bcast_test.py --backend=gloo
diff --git a/.ci/scripts/run_tests_xccl.sh b/.ci/scripts/run_tests_xccl.sh
new file mode 100755
index 0000000..5506f38
--- /dev/null
+++ b/.ci/scripts/run_tests_xccl.sh
@@ -0,0 +1,23 @@
+#!/bin/bash -eEx
+
+command -v mpirun
+export UCX_SOCKADDR_CM_ENABLE=n
+#MPI_ARGS_COMMON="--allow-run-as-root --oversubscribe -np 8 -H localhost:8 --bind-to none -mca coll ^hcoll"
+#mpirun ${MPI_ARGS_COMMON} -x XCCL_TEST_TLS=hier ${XCCL_BUILD_DIR}/test/test_mpi_allreduce
+#mpirun ${MPI_ARGS_COMMON} -x XCCL_TEST_TLS=hier ${XCCL_BUILD_DIR}/test/test_mpi_bcast
+#mpirun ${MPI_ARGS_COMMON} -x XCCL_TEST_TLS=hier ${XCCL_BUILD_DIR}/test/test_mpi_barrier
+#
+#mpirun ${MPI_ARGS_COMMON} -x XCCL_TEAM_HIER_NODE_LEADER_RANK_ID=3 -x XCCL_TEST_TLS=hier ${XCCL_BUILD_DIR}/test/test_mpi_allreduce
+#mpirun ${MPI_ARGS_COMMON} -x XCCL_TEAM_HIER_NODE_LEADER_RANK_ID=4 -x XCCL_TEST_TLS=hier ${XCCL_BUILD_DIR}/test/test_mpi_bcast
+#mpirun ${MPI_ARGS_COMMON} -x XCCL_TEAM_HIER_NODE_LEADER_RANK_ID=5 -x XCCL_TEST_TLS=hier ${XCCL_BUILD_DIR}/test/test_mpi_barrier
+#
+#mpirun ${MPI_ARGS_COMMON} -x XCCL_TEAM_UCX_ALLREDUCE_ALG_ID=0 -x XCCL_TEST_TLS=ucx ${XCCL_BUILD_DIR}/test/test_mpi_allreduce
+#mpirun ${MPI_ARGS_COMMON} -x XCCL_TEAM_UCX_ALLREDUCE_ALG_ID=1 -x XCCL_TEST_TLS=ucx ${XCCL_BUILD_DIR}/test/test_mpi_allreduce
+#mpirun ${MPI_ARGS_COMMON} -x XCCL_TEST_TLS=ucx ${XCCL_BUILD_DIR}/test/test_mpi_bcast
+#mpirun ${MPI_ARGS_COMMON} -x XCCL_TEST_TLS=ucx ${XCCL_BUILD_DIR}/test/test_mpi_barrier
+#mpirun ${MPI_ARGS_COMMON} -x XCCL_TEST_TLS=ucx ${XCCL_BUILD_DIR}/test/test_mpi_alltoall
+#mpirun ${MPI_ARGS_COMMON} -x XCCL_TEST_TLS=ucx ${XCCL_BUILD_DIR}/test/test_mpi_alltoallv
+#mpirun ${MPI_ARGS_COMMON} -x XCCL_TEST_TLS=ucx ${XCCL_BUILD_DIR}/test/test_mpi_allgather
+#mpirun -x XCCL_TEAM_UCX_ALLTOALL_PAIRWISE_CHUNK=0 ${MPI_ARGS_COMMON} -x XCCL_TEST_TLS=ucx ${XCCL_BUILD_DIR}/test/test_mpi_alltoall
+#mpirun -x XCCL_TEAM_UCX_ALLTOALL_PAIRWISE_CHUNK=0 ${MPI_ARGS_COMMON} -x XCCL_TEST_TLS=ucx ${XCCL_BUILD_DIR}/test/test_mpi_alltoallv
+#mpirun ${MPI_ARGS_COMMON} -x XCCL_TEST_TLS=hier -x XCCL_TEST_ITERS=500 -x XCCL_TEST_NTHREADS=4 -x XCCL_TEST_CHECK=1 ${XCCL_BUILD_DIR}/test/test_mpi_mt
diff --git a/ucc b/ucc
index ea2ecb5..92608a3 160000
--- a/ucc
+++ b/ucc
@@ -1 +1 @@
-Subproject commit ea2ecb55332d933673f4fc1d5e44ffa5d73a8ad7
+Subproject commit 92608a35c360a29198c7a2a7ca496fc23bd43997
diff --git a/ucx b/ucx
index c19d6a5..ecaaf60 160000
--- a/ucx
+++ b/ucx
@@ -1 +1 @@
-Subproject commit c19d6a5af233f98a463b834e12cd8a4ab7e403cf
+Subproject commit ecaaf60418e3a00020f714be16ecc1d70ff120a5
diff --git a/xccl b/xccl
index f4f2082..3cd39e9 160000
--- a/xccl
+++ b/xccl
@@ -1 +1 @@
-Subproject commit f4f2082dbbedbf795c1cb376e94e811328704191
+Subproject commit 3cd39e982ef32e4d3562c58ab11c03ee3ba05514
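Patch 4 bakes the whole toolchain (UCX, XCCL, PyTorch, torch_ucc) into the image, so the build stage can also be reproduced outside Jenkins with a plain docker build. A minimal local sketch — the image tag is illustrative, and the CUDA_VER/TORCH_UCC_ROOT_DIR values mirror the build_args from .ci/job_matrix.yaml above:

# Check out the submodules first: the Dockerfile COPYs the whole tree
# and then builds ucx/ and xccl/ from it.
git submodule update --init --recursive
docker build --rm --no-cache \
    --build-arg CUDA_VER=11.2.1 \
    --build-arg TORCH_UCC_ROOT_DIR=/opt/nvidia/torch-ucc \
    -f .ci/Dockerfile.ubuntu20.04 -t torch-ucc:local .

In CI the same build is driven by the ci-demo Matrix library, which pushes the result to ${registry_host}${registry_path} instead of a local tag.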
From e5553dfd4f082ec7d7c712cb948a6ff08825f266 Mon Sep 17 00:00:00 2001
From: artemry-nv
Date: Mon, 15 Mar 2021 09:39:58 +0300
Subject: [PATCH 5/8] Reworked for CentOS 8

Signed-off-by: artemry-nv
---
 .ci/Dockerfile.centos8                        |  71 +++++++++
 .ci/Dockerfile.ubuntu20.04                    | 138 ++++++------------
 .ci/configs/swx-clx01/hostfile.txt            |   2 +
 .ci/configs/swx-clx02/hostfile.txt            |   2 +
 .ci/job_matrix.yaml                           |  75 +++++++---
 .../dlrm/0001-Added-torch_ucc-support.patch   |  30 ++++
 .ci/scripts/build_ucx.sh                      |  17 +++
 .ci/scripts/build_xccl.sh                     |  18 +++
 .ci/scripts/configure_python.sh               |   5 +
 .ci/scripts/env.sh                            |  25 ++++
 .ci/scripts/install_torch.sh                  |  37 +++++
 .ci/scripts/install_torch_ucc.sh              |  14 ++
 .ci/scripts/run_fb_dlrm.sh                    |  77 ++++++++++
 .ci/scripts/run_fb_dlrm_docker.sh             |  60 ++++++++
 .ci/scripts/run_param_benchmarks.sh           |  46 ++++++
 .ci/scripts/run_tests_torch_ucc.sh            |  29 ++--
 .ci/scripts/run_tests_xccl.sh                 |  40 ++---
 .gitmodules                                   |  12 +-
 18 files changed, 549 insertions(+), 149 deletions(-)
 create mode 100644 .ci/Dockerfile.centos8
 create mode 100644 .ci/configs/swx-clx01/hostfile.txt
 create mode 100644 .ci/configs/swx-clx02/hostfile.txt
 create mode 100644 .ci/patches/dlrm/0001-Added-torch_ucc-support.patch
 create mode 100755 .ci/scripts/build_ucx.sh
 create mode 100755 .ci/scripts/build_xccl.sh
 create mode 100755 .ci/scripts/configure_python.sh
 create mode 100755 .ci/scripts/env.sh
 create mode 100755 .ci/scripts/install_torch.sh
 create mode 100755 .ci/scripts/install_torch_ucc.sh
 create mode 100755 .ci/scripts/run_fb_dlrm.sh
 create mode 100755 .ci/scripts/run_fb_dlrm_docker.sh
 create mode 100755 .ci/scripts/run_param_benchmarks.sh

diff --git a/.ci/Dockerfile.centos8 b/.ci/Dockerfile.centos8
new file mode 100644
index 0000000..a3b7b78
--- /dev/null
+++ b/.ci/Dockerfile.centos8
@@ -0,0 +1,71 @@
+ARG CUDA_VER='11.2.1'
+FROM nvidia/cuda:${CUDA_VER}-devel-centos8
+#==============================================================================
+ARG TORCH_UCC_ROOT_DIR=/opt/nvidia/torch-ucc
+ENV TORCH_UCC_SRC_DIR=${TORCH_UCC_ROOT_DIR}/src
+ENV TORCH_UCC_PKG_DIR=${TORCH_UCC_ROOT_DIR}/pkg
+ENV TORCH_UCC_BIN_DIR=${TORCH_UCC_ROOT_DIR}/bin
+ENV TORCH_UCC_WORKLOADS_DIR=${TORCH_UCC_ROOT_DIR}/workloads
+ENV CUDA_HOME=/usr/local/cuda
+ENV UCX_BRANCH=v1.10.x
+ENV UCX_BUILD_TYPE=release-mt
+ENV UCX_INSTALL_DIR=${TORCH_UCC_BIN_DIR}/ucx/build-${UCX_BUILD_TYPE}
+ENV XCCL_BUILD_TYPE=debug
+ENV XCCL_INSTALL_DIR=${TORCH_UCC_BIN_DIR}/xccl/build-${XCCL_BUILD_TYPE}
+#==============================================================================
+RUN mkdir -p ${TORCH_UCC_SRC_DIR} && \
+    mkdir -p ${TORCH_UCC_PKG_DIR} && \
+    mkdir -p ${TORCH_UCC_BIN_DIR} && \
+    mkdir -p ${TORCH_UCC_WORKLOADS_DIR}
+
+COPY . ${TORCH_UCC_SRC_DIR}
+#==============================================================================
+RUN yum groupinstall -y \
+    'Development Tools' \
+    'Infiniband Support'
+RUN yum config-manager --set-enabled powertools && yum install -y \
+    cmake \
+    numactl \
+    numactl-devel \
+    openmpi \
+    openmpi-devel \
+    openssh-server \
+    protobuf-compiler \
+    protobuf-devel \
+    python36-devel \
+    vim
+# Remove old UCX
+RUN rpm -e --nodeps ucx
+ENV PATH=/usr/lib64/openmpi/bin:${PATH}
+#==============================================================================
+# Configure SSH
+RUN mkdir -p /var/run/sshd && \
+    cat /etc/ssh/ssh_config | grep -v StrictHostKeyChecking > /etc/ssh/ssh_config.new && \
+    echo "    StrictHostKeyChecking no" >> /etc/ssh/ssh_config.new && \
+    mv /etc/ssh/ssh_config.new /etc/ssh/ssh_config
+#==============================================================================
+# Build UCX
+RUN ${TORCH_UCC_SRC_DIR}/.ci/scripts/build_ucx.sh
+ENV PATH=${UCX_INSTALL_DIR}/bin:${PATH}
+#==============================================================================
+# Configure Python
+RUN ${TORCH_UCC_SRC_DIR}/.ci/scripts/configure_python.sh
+#==============================================================================
+# Build XCCL
+RUN ${TORCH_UCC_SRC_DIR}/.ci/scripts/build_xccl.sh
+#==============================================================================
+# Install PyTorch
+RUN ${TORCH_UCC_SRC_DIR}/.ci/scripts/install_torch.sh
+#==============================================================================
+# Install torch_ucc python module and build a wheel package
+RUN ${TORCH_UCC_SRC_DIR}/.ci/scripts/install_torch_ucc.sh
+#==============================================================================
+# Install workloads
+WORKDIR ${TORCH_UCC_WORKLOADS_DIR}
+RUN git clone https://github.com/facebookresearch/dlrm.git && \
+    cd ${TORCH_UCC_WORKLOADS_DIR}/dlrm && \
+    git apply ${TORCH_UCC_SRC_DIR}/.ci/patches/dlrm/0001-Added-torch_ucc-support.patch && \
+    pip3 install -r ${TORCH_UCC_WORKLOADS_DIR}/dlrm/requirements.txt && \
+    pip3 install tensorboard
+RUN git clone https://github.com/facebookresearch/param.git && \
+    pip3 install -r ${TORCH_UCC_WORKLOADS_DIR}/param/requirements.txt
\ No newline at end of file
diff --git a/.ci/Dockerfile.ubuntu20.04 b/.ci/Dockerfile.ubuntu20.04
index ce765d9..d6ecd18 100644
--- a/.ci/Dockerfile.ubuntu20.04
+++ b/.ci/Dockerfile.ubuntu20.04
@@ -1,10 +1,12 @@
-ARG CUDA_VER='11.2.1'
+#ARG CUDA_VER='11.2.1'
+ARG CUDA_VER='11.1.1'
 FROM nvidia/cuda:${CUDA_VER}-devel-ubuntu20.04
 #==============================================================================
 ARG TORCH_UCC_ROOT_DIR=/opt/nvidia/torch-ucc
 ENV TORCH_UCC_SRC_DIR=${TORCH_UCC_ROOT_DIR}/src
 ENV TORCH_UCC_PKG_DIR=${TORCH_UCC_ROOT_DIR}/pkg
 ENV TORCH_UCC_BIN_DIR=${TORCH_UCC_ROOT_DIR}/bin
+ENV TORCH_UCC_WORKLOADS_DIR=${TORCH_UCC_ROOT_DIR}/workloads
 ENV CUDA_HOME=/usr/local/cuda
 ENV UCX_BRANCH=v1.10.x
 ENV UCX_BUILD_TYPE=release-mt
@@ -13,109 +15,61 @@ ENV XCCL_BUILD_TYPE=debug
 ENV XCCL_INSTALL_DIR=${TORCH_UCC_BIN_DIR}/xccl/build-${XCCL_BUILD_TYPE}
 #==============================================================================
 RUN mkdir -p ${TORCH_UCC_SRC_DIR} && \
-    mkdir -p ${TORCH_UCC_PKG_DIR} && \
-    mkdir -p ${TORCH_UCC_BIN_DIR}
+mkdir -p ${TORCH_UCC_PKG_DIR} && \
+mkdir -p ${TORCH_UCC_BIN_DIR} && \
+mkdir -p ${TORCH_UCC_WORKLOADS_DIR}
 COPY . ${TORCH_UCC_SRC_DIR}
 #==============================================================================
 ARG DEBIAN_FRONTEND=noninteractive
 RUN apt update && \
-    apt install -y \
-    apt-utils \
-    autoconf \
-    build-essential \
-    cmake \
-    curl \
-    git \
-    ibverbs-providers \
-    ibverbs-utils \
-    libnuma-dev \
-    libtool-bin \
-    ninja-build \
-    openmpi-bin \
-    vim \
-    && \
-    rm -rf /var/lib/apt/lists/*
+apt install -y \
+apt-utils \
+autoconf \
+build-essential \
+cmake \
+curl \
+git \
+ibverbs-providers \
+ibverbs-utils \
+libnuma-dev \
+libtool-bin \
+ninja-build \
+openmpi-bin \
+openssh-server \
+vim \
+&& \
+rm -rf /var/lib/apt/lists/*
+#==============================================================================
+# Configure SSH
+RUN mkdir -p /var/run/sshd && \
+cat /etc/ssh/ssh_config | grep -v StrictHostKeyChecking > /etc/ssh/ssh_config.new && \
+echo "    StrictHostKeyChecking no" >> /etc/ssh/ssh_config.new && \
+mv /etc/ssh/ssh_config.new /etc/ssh/ssh_config
 #==============================================================================
 # Build UCX
-RUN echo "INFO: Build UCX" && \
-    cd ${TORCH_UCC_SRC_DIR}/ucx && \
-    git checkout ${UCX_BRANCH} && \
-    ${TORCH_UCC_SRC_DIR}/ucx/autogen.sh && \
-    mkdir -p ${TORCH_UCC_SRC_DIR}/ucx/build-${UCX_BUILD_TYPE} && \
-    cd ${TORCH_UCC_SRC_DIR}/ucx/build-${UCX_BUILD_TYPE} && \
-    ${TORCH_UCC_SRC_DIR}/ucx/contrib/configure-release-mt --with-cuda=${CUDA_HOME} --prefix=${UCX_INSTALL_DIR} && \
-    make -j install && \
-    echo "${UCX_INSTALL_DIR}/lib" > /etc/ld.so.conf.d/ucx.conf && \
-    ldconfig && \
-    ldconfig -p | grep -i ucx && \
-    cd ${UCX_INSTALL_DIR} && tar cfz ${TORCH_UCC_PKG_DIR}/ucx-${UCX_BUILD_TYPE}.tgz --owner=0 --group=0 .
+RUN ${TORCH_UCC_SRC_DIR}/.ci/scripts/build_ucx.sh
 ENV PATH=${UCX_INSTALL_DIR}/bin:${PATH}
 #==============================================================================
-# Build XCCL
-RUN echo "INFO: Build XCCL" && \
-    cd ${TORCH_UCC_SRC_DIR}/xccl && \
-    ${TORCH_UCC_SRC_DIR}/xccl/autogen.sh && \
-    mkdir -p ${TORCH_UCC_SRC_DIR}/xccl/build-${XCCL_BUILD_TYPE} && \
-    cd ${TORCH_UCC_SRC_DIR}/xccl/build-${XCCL_BUILD_TYPE} && \
-    ${TORCH_UCC_SRC_DIR}/xccl/configure --with-ucx=${UCX_INSTALL_DIR} \
-    --prefix=${XCCL_INSTALL_DIR} --enable-debug && \
-    make -j install && \
-    echo "${XCCL_INSTALL_DIR}/lib" > /etc/ld.so.conf.d/xccl.conf && \
-    ldconfig && \
-    ldconfig -p | grep -i xccl && \
-    make -C test && \
-    cd ${XCCL_INSTALL_DIR} && tar cfz ${TORCH_UCC_PKG_DIR}/xccl-${XCCL_BUILD_TYPE}.tgz --owner=0 --group=0 .
-#==============================================================================
-# Install conda
-RUN curl -LO http://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh && \
-    bash Miniconda3-latest-Linux-x86_64.sh -p /opt/conda -b && \
-    rm -f Miniconda3-latest-Linux-x86_64.sh
+# Configure Python
+RUN ${TORCH_UCC_SRC_DIR}/.ci/scripts/configure_python.sh
 ENV PATH /opt/conda/bin:${PATH}
 #==============================================================================
-# Install conda python
-RUN conda update -y conda && \
-    conda install -c anaconda -y \
-    python \
-    pip && \
-    pip install --no-cache-dir python-hostlist
-
-RUN ln -s /opt/conda/bin/python /usr/bin/python
-RUN python3 -m pip install --user --upgrade setuptools wheel auditwheel check-wheel-contents
+# Build XCCL
+RUN ${TORCH_UCC_SRC_DIR}/.ci/scripts/build_xccl.sh
 #==============================================================================
-# Build and Install PyTorch
-RUN cd /tmp && \
-    git clone https://github.com/pytorch/pytorch.git && \
-    cd pytorch && \
-    git submodule sync --recursive && \
-    git submodule update --init --recursive && \
-    pip install -r requirements.txt && \
-    TORCH_CUDA_ARCH_LIST="7.0 8.0+PTX" \
-    USE_GLOO=1 \
-    USE_DISTRIBUTED=1 \
-    USE_OPENCV=0 \
-    USE_CUDA=1 \
-    USE_NCCL=0 \
-    USE_MKLDNN=0 \
-    BUILD_TEST=0 \
-    USE_FBGEMM=0 \
-    USE_NNPACK=0 \
-    USE_QNNPACK=0 \
-    USE_XNNPACK=0 \
-    USE_KINETO=1 \
-    MAX_JOBS=$(($(nproc)-1)) \
-    python setup.py install && \
-    rm -rf /tmp/pytorch
+# Install PyTorch
+RUN ${TORCH_UCC_SRC_DIR}/.ci/scripts/install_torch.sh
 #==============================================================================
 # Install torch_ucc python module and build a wheel package
-RUN echo "INFO: Install Torch-UCC" && \
-    cd ${TORCH_UCC_SRC_DIR} && \
-    env \
-    UCX_HOME=${UCX_INSTALL_DIR} \
-    XCCL_HOME=${XCCL_INSTALL_DIR} \
-    WITH_CUDA=${CUDA_HOME} \
-    python setup.py install bdist_wheel && \
-    pip3 list | grep torch && \
-    python -c 'import torch, torch_ucc' && \
-    cp ${TORCH_UCC_SRC_DIR}/dist/*.whl ${TORCH_UCC_PKG_DIR}
+RUN ${TORCH_UCC_SRC_DIR}/.ci/scripts/install_torch_ucc.sh
 #==============================================================================
+# Install workloads
+WORKDIR ${TORCH_UCC_WORKLOADS_DIR}
+RUN git clone https://github.com/facebookresearch/dlrm.git && \
+cd ${TORCH_UCC_WORKLOADS_DIR}/dlrm && \
+git apply ${TORCH_UCC_SRC_DIR}/.ci/patches/dlrm/0001-Added-torch_ucc-support.patch && \
+pip install -r ${TORCH_UCC_WORKLOADS_DIR}/dlrm/requirements.txt && \
+pip install tensorboard
+RUN git clone https://github.com/facebookresearch/param.git && \
+pip install -r ${TORCH_UCC_WORKLOADS_DIR}/param/requirements.txt
\ No newline at end of file
diff --git a/.ci/configs/swx-clx01/hostfile.txt b/.ci/configs/swx-clx01/hostfile.txt
new file mode 100644
index 0000000..d813292
--- /dev/null
+++ b/.ci/configs/swx-clx01/hostfile.txt
@@ -0,0 +1,2 @@
+swx-clx01
+swx-clx02
diff --git a/.ci/configs/swx-clx02/hostfile.txt b/.ci/configs/swx-clx02/hostfile.txt
new file mode 100644
index 0000000..fbddaa1
--- /dev/null
+++ b/.ci/configs/swx-clx02/hostfile.txt
@@ -0,0 +1,2 @@
+swx-clx02
+swx-clx01
diff --git a/.ci/job_matrix.yaml b/.ci/job_matrix.yaml
index fbeb2c5..7610198 100644
--- a/.ci/job_matrix.yaml
+++ b/.ci/job_matrix.yaml
@@ -2,13 +2,11 @@
 job: 'torch-ucc'
 
 registry_host: 'harbor.mellanox.com'
-# TODO change
-registry_path: '/swx-infra/torch-ucc'
-#registry_path: '/torch-ucc'
-registry_auth: '1daaea28-800e-425f-a91f-3bd3e9136eea'
+registry_path: '/torch-ucc'
+registry_auth: '05d98651-e11c-4a57-9cc6-52df79014b89'
 
-kubernetes:
-  cloud: 'swx-k8s'
+#kubernetes:
+#  cloud: 'swx-k8s'
 
 volumes:
   - { mountPath: '/hpc/local', hostPath: '/hpc/local' }
@@ -17,44 +15,85 @@ volumes:
   - { mountPath: '/.autodirect/sw/release', hostPath: '/.autodirect/sw/release' }
 
 env:
-  CUDA_VER: '11.2.1'
-  TORCH_UCC_URI_SUFFIX: '${TORCH_UCC_VERSION}/x86_64/ubuntu20.04/cuda${CUDA_VER}'
-  TORCH_UCC_ROOT_DIR: '/opt/nvidia/torch-ucc'
-  TORCH_UCC_SRC_DIR: '${TORCH_UCC_ROOT_DIR}/src'
+CUDA_VER: '11.1.1'
+# CUDA_VER: '11.2.1'
+TORCH_UCC_URI_SUFFIX: '${TORCH_UCC_VERSION}/x86_64/centos8/cuda${CUDA_VER}'
+TORCH_UCC_DOCKER_IMAGE_NAME: '${registry_host}${registry_path}/${TORCH_UCC_URI_SUFFIX}'
+TORCH_UCC_ROOT_DIR: '/opt/nvidia/torch-ucc'
+TORCH_UCC_SRC_DIR: '${TORCH_UCC_ROOT_DIR}/src'
+XCCL_BUILD_TYPE: 'debug'
+
+docker_opt: '--pull always --network=host --uts=host --ipc=host --ulimit stack=67108864 --ulimit memlock=-1 --security-opt seccomp=unconfined --cap-add=SYS_ADMIN --device=/dev/infiniband/ --gpus all --user root'
 
 runs_on_dockers:
   - {
-    file: '.ci/Dockerfile.ubuntu20.04',
-    name: 'ubuntu20.04',
-    tag: 'latest',
+    file: '.ci/Dockerfile.centos8',
+    name: 'centos8',
+    tag: '${BUILD_NUMBER}',
     arch: 'x86_64',
     uri: '${TORCH_UCC_URI_SUFFIX}',
    build_args: '--rm --no-cache --build-arg CUDA_VER=${CUDA_VER} --build-arg TORCH_UCC_ROOT_DIR=${TORCH_UCC_ROOT_DIR}',
+    cloud: 'swx-k8s',
+    nodeLabel: 'swx-clx01 || swx-clx02',
   }
 
+# bare metal
+#runs_on_agents:
+#  - nodeLabel: 'swx-clx01 || swx-clx02'
+
 # TODO debug
-timeout_minutes: '180'
+timeout_minutes: '400'
 
 steps:
   #============================================================================
   - name: Check Env
+    #agentSelector: "{nodeLabel: 'swx-clx01 || swx-clx02'}"
     run: |
       echo "INFO: check environment"
+      hostname
       printenv
       cat /proc/1/cgroup
       cat /etc/*release*
       id
       find /opt/nvidia
+      ibv_devinfo
+      nvidia-smi
+      nvidia-smi topo -m
   #============================================================================
   - name: Run XCCL tests
+    #agentSelector: "{nodeLabel: 'swx-clx01 || swx-clx02'}"
     run: |
       echo "INFO: Run XCCL tests"
+      hostname
+      cat /proc/1/cgroup
       ${TORCH_UCC_SRC_DIR}/.ci/scripts/run_tests_xccl.sh
-      #sleep 10000
-  #============================================================================
+  #============================================================================
   - name: Run Torch-UCC tests
+    #agentSelector: "{nodeLabel: 'swx-clx01 || swx-clx02'}"
     run: |
       echo "INFO: Run Torch-UCC tests"
+      hostname
+      cat /proc/1/cgroup
       ${TORCH_UCC_SRC_DIR}/.ci/scripts/run_tests_torch_ucc.sh
-      #sleep 20000
-  #============================================================================
+#============================================================================
+#  - name: Run FB DLRM tests
+#    #agentSelector: "{nodeLabel: 'swx-clx01 || swx-clx02'}"
+#    run: |
+#      echo "INFO: Run FB DLRM tests"
+#      printenv
+#      cat /proc/1/cgroup
+#      cat /etc/*release*
+#      id
+#      find /opt/nvidia
+#      #ibv_devinfo
+#      #nvidia-smi
+#      #${WORKSPACE}/.ci/scripts/run_fb_dlrm_docker.sh
+#============================================================================
+#  - name: Run PARAM benchmarks
+#    agentSelector: "{nodeLabel: 'swx-clx01 || swx-clx02'}"
+#    run: |
+#      echo "INFO: Run PARAM benchmarks"
+#      hostname
+#      cat /proc/1/cgroup
+#      #${TORCH_UCC_SRC_DIR}/.ci/scripts/run_param_benchmarks.sh
+#============================================================================
\ No newline at end of file
diff --git a/.ci/patches/dlrm/0001-Added-torch_ucc-support.patch b/.ci/patches/dlrm/0001-Added-torch_ucc-support.patch
new file mode 100644
index 0000000..2620579
--- /dev/null
+++ b/.ci/patches/dlrm/0001-Added-torch_ucc-support.patch
@@ -0,0 +1,30 @@
+From bcd8fc065ef04a0ea8f06e61a5e2581a308719fd Mon Sep 17 00:00:00 2001
+From: artemry-nv
+Date: Tue, 9 Mar 2021 00:41:16 +0300
+Subject: [PATCH] Added torch_ucc support
+
+Signed-off-by: artemry-nv
+---
+ extend_distributed.py | 6 ++++++
+ 1 file changed, 6 insertions(+)
+
+diff --git a/extend_distributed.py b/extend_distributed.py
+index adcb60b..1f2c8a5 100644
+--- a/extend_distributed.py
++++ b/extend_distributed.py
+@@ -20,6 +20,12 @@ except ImportError as e:
+     # print(e)
+     torch_ccl = False
+ 
++try:
++    import torch_ucc
++except ImportError as e:
++    torch_ucc = False
++
++
+ my_rank = -1
+ my_size = -1
+ my_local_rank = -1
+-- 
+2.24.3 (Apple Git-128)
+
diff --git a/.ci/scripts/build_ucx.sh b/.ci/scripts/build_ucx.sh
new file mode 100755
index 0000000..8df3411
--- /dev/null
+++ b/.ci/scripts/build_ucx.sh
@@ -0,0 +1,17 @@
+#!/bin/bash -eEx
+set -o pipefail
+
+echo "INFO: Build UCX"
+cd "${TORCH_UCC_SRC_DIR}/ucx"
+git checkout "${UCX_BRANCH}"
+"${TORCH_UCC_SRC_DIR}/ucx/autogen.sh"
+mkdir -p "${TORCH_UCC_SRC_DIR}/ucx/build-${UCX_BUILD_TYPE}"
+cd "${TORCH_UCC_SRC_DIR}/ucx/build-${UCX_BUILD_TYPE}"
+# TODO debug
+"${TORCH_UCC_SRC_DIR}/ucx/contrib/configure-release-mt" --with-cuda="${CUDA_HOME}" --prefix="${UCX_INSTALL_DIR}"
+#"${TORCH_UCC_SRC_DIR}/ucx/contrib/configure-release-mt" --prefix="${UCX_INSTALL_DIR}"
+make -j install
+echo "${UCX_INSTALL_DIR}/lib" > /etc/ld.so.conf.d/ucx.conf
+ldconfig
+ldconfig -p | grep -i ucx
+cd "${UCX_INSTALL_DIR}" && tar cfz "${TORCH_UCC_PKG_DIR}/ucx-${UCX_BUILD_TYPE}.tgz" --owner=0 --group=0 .
diff --git a/.ci/scripts/build_xccl.sh b/.ci/scripts/build_xccl.sh
new file mode 100755
index 0000000..409dcb7
--- /dev/null
+++ b/.ci/scripts/build_xccl.sh
@@ -0,0 +1,18 @@
+#!/bin/bash -eEx
+set -o pipefail
+
+echo "INFO: Build XCCL"
+cd "${TORCH_UCC_SRC_DIR}/xccl"
+"${TORCH_UCC_SRC_DIR}/xccl/autogen.sh"
+mkdir -p "${TORCH_UCC_SRC_DIR}/xccl/build-${XCCL_BUILD_TYPE}"
+cd "${TORCH_UCC_SRC_DIR}/xccl/build-${XCCL_BUILD_TYPE}"
+# TODO enable CUDA (compilation failed)
+#"${TORCH_UCC_SRC_DIR}/xccl/configure" --with-ucx="${UCX_INSTALL_DIR}" --prefix="${XCCL_INSTALL_DIR}" --enable-debug
+"${TORCH_UCC_SRC_DIR}/xccl/configure" --with-cuda="${CUDA_HOME}" --with-ucx="${UCX_INSTALL_DIR}" \
+    --prefix="${XCCL_INSTALL_DIR}" --enable-debug
+make -j install
+echo "${XCCL_INSTALL_DIR}/lib" > /etc/ld.so.conf.d/xccl.conf
+ldconfig
+ldconfig -p | grep -i xccl
+make -C test
+cd "${XCCL_INSTALL_DIR}" && tar cfz "${TORCH_UCC_PKG_DIR}/xccl-${XCCL_BUILD_TYPE}.tgz" --owner=0 --group=0 .
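Both build scripts finish by archiving the freshly built install tree into ${TORCH_UCC_PKG_DIR}. A sketch of consuming one of those tarballs on another machine — the target prefix here is an assumption, not something the series defines:

# Unpack the UCX tarball produced by build_ucx.sh into a local prefix and
# register it with the dynamic loader, mirroring the in-image ldconfig steps.
PKG=/opt/nvidia/torch-ucc/pkg/ucx-release-mt.tgz   # TORCH_UCC_PKG_DIR + UCX_BUILD_TYPE
PREFIX=/usr/local/ucx                              # assumed install location
mkdir -p "${PREFIX}" && tar xfz "${PKG}" -C "${PREFIX}"
echo "${PREFIX}/lib" > /etc/ld.so.conf.d/ucx.conf && ldconfig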
diff --git a/.ci/scripts/configure_python.sh b/.ci/scripts/configure_python.sh
new file mode 100755
index 0000000..fddaf10
--- /dev/null
+++ b/.ci/scripts/configure_python.sh
@@ -0,0 +1,5 @@
+#!/bin/bash -eEx
+set -o pipefail
+
+alternatives --set python /usr/bin/python3
+pip3 install --user --upgrade setuptools wheel
diff --git a/.ci/scripts/env.sh b/.ci/scripts/env.sh
new file mode 100755
index 0000000..7096f69
--- /dev/null
+++ b/.ci/scripts/env.sh
@@ -0,0 +1,25 @@
+#!/bin/bash -eEx
+
+SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd -P)"
+
+# shellcheck disable=SC2034
+DLRM_MODEL="small"
+
+HOSTNAME=$(hostname -s)
+export HOSTNAME
+SRC_ROOT_DIR=$(cd "${SCRIPT_DIR}/../../" && pwd -P)
+CONFIGS_DIR="${SRC_ROOT_DIR}/.ci/configs"
+export HOSTFILE=${CONFIGS_DIR}/$HOSTNAME/hostfile.txt
+
+# Fail early, before the hostfile is read below
+if [ ! -f "${HOSTFILE}" ]; then
+    echo "ERROR: ${HOSTFILE} does not exist"
+    exit 1
+fi
+
+# shellcheck disable=SC2002
+HOSTS=$(cat "$HOSTFILE" | xargs | tr ' ' ',')
+export HOSTS
+NP=$(wc --lines "$HOSTFILE" | awk '{print $1}')
+export NP
+
+export MASTER_ADDR=$HOSTNAME
+export MASTER_PORT=4242
diff --git a/.ci/scripts/install_torch.sh b/.ci/scripts/install_torch.sh
new file mode 100755
index 0000000..504177e
--- /dev/null
+++ b/.ci/scripts/install_torch.sh
@@ -0,0 +1,37 @@
+#!/bin/bash -eEx
+set -o pipefail
+
+# TODO debug
+#cd /tmp
+#git clone https://github.com/pytorch/pytorch.git
+#cd /tmp/pytorch
+#git submodule sync --recursive
+#git submodule update --init --recursive
+#pip install -r requirements.txt
+#export TORCH_CUDA_ARCH_LIST="7.0 8.0+PTX"
+#export USE_GLOO=1
+#export USE_DISTRIBUTED=1
+#export USE_OPENCV=0
+## TODO debug
+#export USE_CUDA=1
+##export USE_CUDA=0
+#export USE_NCCL=0
+#export USE_MKLDNN=0
+#export BUILD_TEST=0
+#export USE_FBGEMM=0
+#export USE_NNPACK=0
+#export USE_QNNPACK=0
+#export USE_XNNPACK=0
+#export USE_KINETO=1
+#export MAX_JOBS=$(($(nproc)-1))
+#python setup.py install
+#rm -rf /tmp/pytorch
+
+# TODO debug
+#conda install -y pytorch torchvision cpuonly -c pytorch-nightly
+#conda install pytorch torchvision torchaudio cudatoolkit=11.0 -c pytorch-nightly
+#conda uninstall -y pytorch torchvision
+#conda install pytorch torchvision cudatoolkit=11.0 -c pytorch-nightly
+#conda install pytorch cudatoolkit=11.0 -c pytorch-nightly
+pip3 install --default-timeout=900 numpy
+pip3 install --default-timeout=900 --pre torch -f https://download.pytorch.org/whl/nightly/cu110/torch_nightly.html
diff --git a/.ci/scripts/install_torch_ucc.sh b/.ci/scripts/install_torch_ucc.sh
new file mode 100755
index 0000000..8782d2d
--- /dev/null
+++ b/.ci/scripts/install_torch_ucc.sh
@@ -0,0 +1,14 @@
+#!/bin/bash -eEx
+set -o pipefail
+
+echo "INFO: Install Torch-UCC"
+cd "${TORCH_UCC_SRC_DIR}"
+export UCX_HOME=${UCX_INSTALL_DIR}
+export XCCL_HOME=${XCCL_INSTALL_DIR}
+# TODO debug
+export WITH_CUDA=${CUDA_HOME}
+#export WITH_CUDA=no
+python setup.py install bdist_wheel
+pip3 list | grep torch
+python -c 'import torch, torch_ucc'
+cp "${TORCH_UCC_SRC_DIR}/dist/"*.whl "${TORCH_UCC_PKG_DIR}"
diff --git a/.ci/scripts/run_fb_dlrm.sh b/.ci/scripts/run_fb_dlrm.sh
new file mode 100755
index 0000000..92c3828
--- /dev/null
+++ b/.ci/scripts/run_fb_dlrm.sh
@@ -0,0 +1,77 @@
+#!/bin/bash -eEx
+set -o pipefail
+
+# TODO debug
+exit 0
+
+SCRIPT_DIR="$(
+    cd "$(dirname "$0")"
+    pwd -P
+)"
+cd "${SCRIPT_DIR}"
+. "${SCRIPT_DIR}/env.sh"
+
+case ${DLRM_MODEL} in
+"big")
+    emb_size="1000-1000-1000-1000-1000-1000-1000-1000-1000-1000-1000-1000-1000-1000-1000-1000-1000-1000-1000-1000-1000-1000-1000-1000-1000-1000-1000-1000-1000-1000-1000-1000-1000-1000-1000-1000-1000-1000-1000-1000-1000-1000-1000-1000-1000-1000-1000-1000-1000-1000-1000-1000-1000-1000-1000-1000-1000-1000-1000-1000-1000-1000-1000-1000"
+    emb_dim="256"
+    emb_lookup="100"
+    bot_mlp="512-512-256"
+    top_mlp="1024-1024-1024-1"
+    loss_func="mse"
+    round_targets="False"
+    lr="0.01"
+    #mb_size="2048"
+    emb_lookup_fixed="0"
+    ;;
+"small")
+    emb_size="1000-1000-1000-1000-1000-1000-1000-1000"
+    emb_dim="64"
+    emb_lookup="100"
+    bot_mlp="512-512-64"
+    top_mlp="1024-1024-1024-1"
+    loss_func="mse"
+    round_targets="False"
+    lr="0.01"
+    #mb_size="2048"
+    emb_lookup_fixed="0"
+    ;;
+*)
+    echo "ERROR: unsupported or empty DLRM_MODEL (${DLRM_MODEL})"
+    exit 1
+    ;;
+esac
+
+cd "${TORCH_UCC_ROOT_DIR}/workloads/dlrm"
+
+MPIRUN_OPTIONS="\
+    -np $NP \
+    -H $HOSTS \
+    --map-by node \
+    -x LD_LIBRARY_PATH \
+    --allow-run-as-root \
+"
+
+# shellcheck disable=SC2086
+mpirun ${MPIRUN_OPTIONS} hostname
+#mpirun ${MPIRUN_OPTIONS} python dlrm_s_pytorch.py \
+#    --mini-batch-size=2048 \
+#    --test-mini-batch-size=16384 \
+#    --test-num-workers=0 \
+#    --num-batches=100 \
+#    --data-generation=random \
+#    --arch-mlp-bot=$bot_mlp \
+#    --arch-mlp-top=$top_mlp \
+#    --arch-sparse-feature-size=$emb_dim \
+#    --arch-embedding-size=$emb_size \
+#    --num-indices-per-lookup=$emb_lookup \
+#    --num-indices-per-lookup-fixed=$emb_lookup_fixed \
+#    --arch-interaction-op=dot \
+#    --numpy-rand-seed=727 \
+#    --print-freq=1 \
+#    --loss-function=$loss_func \
+#    --round-targets=$round_targets \
+#    --learning-rate=$lr \
+#    --print-time \
+#    --dist-backend=ucc \
+#    --use-gpu
diff --git a/.ci/scripts/run_fb_dlrm_docker.sh b/.ci/scripts/run_fb_dlrm_docker.sh
new file mode 100755
index 0000000..44d58af
--- /dev/null
+++ b/.ci/scripts/run_fb_dlrm_docker.sh
@@ -0,0 +1,60 @@
+#!/bin/bash -eEx
+set -o pipefail
+
+SCRIPT_DIR="$(
+    cd "$(dirname "$0")"
+    pwd -P
+)"
+cd "${SCRIPT_DIR}"
+. "${SCRIPT_DIR}/env.sh"
+
+# Define these before DOCKER_RUN_ARGS so that --name expands correctly
+DOCKER_SSH_PORT="12345"
+DOCKER_CONTAINER_NAME="torch_ucc_ci"
+DOCKER_IMAGE_NAME="${TORCH_UCC_DOCKER_IMAGE_NAME}:${BUILD_ID}"
+
+DOCKER_RUN_ARGS="\
+--pull always \
+--network=host \
+--uts=host \
+--ipc=host \
+--ulimit stack=67108864 \
+--ulimit memlock=-1 \
+--security-opt seccomp=unconfined \
+--cap-add=SYS_ADMIN \
+--device=/dev/infiniband/ \
+--gpus all \
+--user root \
+-it \
+-d \
+--rm \
+--name=${DOCKER_CONTAINER_NAME} \
+"
+
+while read -r HOST; do
+    echo "INFO: HOST = $HOST"
+    STALE_DOCKER_CONTAINER_LIST=$(sudo ssh "$HOST" "docker ps -a -q -f name=${DOCKER_CONTAINER_NAME}")
+    if [ -n "${STALE_DOCKER_CONTAINER_LIST}" ]; then
+        echo "WARNING: stale docker container (name: ${DOCKER_CONTAINER_NAME}) is detected on ${HOST} (to be stopped)"
+        echo "INFO: Stopping stale docker container (name: ${DOCKER_CONTAINER_NAME}) on ${HOST}..."
+        sudo ssh "${HOST}" docker stop ${DOCKER_CONTAINER_NAME}
+        echo "INFO: Stopping stale docker container (name: ${DOCKER_CONTAINER_NAME}) on ${HOST}... DONE"
+    fi
+
+    echo "INFO: start docker container on $HOST ..."
+    # Inner quotes escaped so the sshd command survives the outer quoting
+    sudo ssh "$HOST" "docker run \
+        ${DOCKER_RUN_ARGS} \
+        ${DOCKER_IMAGE_NAME} \
+        bash -c \"/usr/sbin/sshd -p ${DOCKER_SSH_PORT}; sleep infinity\""
+    echo "INFO: start docker container on $HOST ... DONE"
+
+    echo "INFO: verify docker container on $HOST ..."
+    sudo ssh "$HOST" -p ${DOCKER_SSH_PORT} hostname
+    echo "INFO: verify docker container on $HOST ... DONE"
+done <"$HOSTFILE"
+
+sleep 20000
+
+while read -r HOST; do
+    echo "INFO: stop docker container on $HOST ..."
+    sudo ssh "${HOST}" docker stop ${DOCKER_CONTAINER_NAME}
+    echo "INFO: stop docker container on $HOST ... DONE"
+done <"$HOSTFILE"
\ No newline at end of file
diff --git a/.ci/scripts/run_param_benchmarks.sh b/.ci/scripts/run_param_benchmarks.sh
new file mode 100755
index 0000000..5c35e31
--- /dev/null
+++ b/.ci/scripts/run_param_benchmarks.sh
@@ -0,0 +1,46 @@
+#!/bin/bash -eEx
+set -o pipefail
+
+# TODO debug
+exit 0
+
+source /workspace/set-env-dist.sh
+index=$LOCAL_RANK
+export OMPI_COMM_WORLD_SIZE=$WORLD_SIZE
+export OMPI_COMM_WORLD_LOCAL_SIZE=$LOCAL_SIZE
+export OMPI_COMM_WORLD_RANK=$RANK
+export OMPI_COMM_WORLD_LOCAL_RANK=$LOCAL_RANK
+
+if (( $index == 0 )); then
+    export UCX_NET_DEVICES=mlx5_0:1
+    NUMA="numactl --physcpubind=48-63 --membind=3 "
+elif (( $index == 1 )); then
+    export UCX_NET_DEVICES=mlx5_1:1
+    NUMA="numactl --physcpubind=48-63 --membind=3 "
+elif (( $index == 2 )); then
+    export UCX_NET_DEVICES=mlx5_2:1
+    NUMA="numactl --physcpubind=16-31 --membind=1 "
+elif (( $index == 3 )); then
+    export UCX_NET_DEVICES=mlx5_3:1
+    NUMA="numactl --physcpubind=16-31 --membind=1 "
+elif (( $index == 4 )); then
+    export UCX_NET_DEVICES=mlx5_6:1
+    NUMA="numactl --physcpubind=112-127 --membind=7 "
+elif (( $index == 5 )); then
+    export UCX_NET_DEVICES=mlx5_7:1
+    NUMA="numactl --physcpubind=112-127 --membind=7 "
+elif (( $index == 6 )); then
+    export UCX_NET_DEVICES=mlx5_8:1
+    NUMA="numactl --physcpubind=80-95 --membind=5 "
+elif (( $index == 7 )); then
+    export UCX_NET_DEVICES=mlx5_9:1
+    NUMA="numactl --physcpubind=80-95 --membind=5 "
+fi
+
+export XCCL_TEAM_UCX_NET_DEVICES=$UCX_NET_DEVICES
+export XCCL_TEAM_HIER_NET_DEVICES=$UCX_NET_DEVICES
+
+EXE="$NUMA python /workspace/param/train/comms/pt/comms.py \
+    --master-ip $MASTER_ADDR \
+    --master-port $MASTER_PORT $@"
+$EXE
diff --git a/.ci/scripts/run_tests_torch_ucc.sh b/.ci/scripts/run_tests_torch_ucc.sh
index 5f74a15..31e2e3e 100755
--- a/.ci/scripts/run_tests_torch_ucc.sh
+++ b/.ci/scripts/run_tests_torch_ucc.sh
@@ -1,18 +1,19 @@
-#!/bin/sh -eEx
+#!/bin/bash -eEx
+set -o pipefail
 
 command -v mpirun
 export TORCH_UCC_XCCL_TLS=ucx
+export UCX_WARN_UNUSED_ENV_VARS=n
 ucx_info -e -u t
-export UCX_LOG_LEVEL=info
-#echo "XCCL allreduce"
-#/bin/bash ${XCCL_SRC_DIR}/test/start_test.sh ${XCCL_SRC_DIR}/test/torch_allreduce_test.py --backend=gloo
-#echo "XCCL alltoall"
-#/bin/bash ${XCCL_SRC_DIR}/test/start_test.sh ${XCCL_SRC_DIR}/test/torch_alltoall_test.py --backend=gloo
-#echo "XCCL alltoallv"
-#/bin/bash ${XCCL_SRC_DIR}/test/start_test.sh ${XCCL_SRC_DIR}/test/torch_alltoallv_test.py --backend=gloo
-#echo "XCCL barrier"
-#/bin/bash ${XCCL_SRC_DIR}/test/start_test.sh ${XCCL_SRC_DIR}/test/torch_barrier_test.py --backend=gloo
-#echo "XCCL allgather"
-#/bin/bash ${XCCL_SRC_DIR}/test/start_test.sh ${XCCL_SRC_DIR}/test/torch_allgather_test.py --backend=gloo
-#echo "XCCL broadcast"
-#/bin/bash ${XCCL_SRC_DIR}/test/start_test.sh ${XCCL_SRC_DIR}/test/torch_bcast_test.py --backend=gloo
+echo "XCCL allreduce"
+/bin/bash ${TORCH_UCC_SRC_DIR}/test/start_test.sh ${TORCH_UCC_SRC_DIR}/test/torch_allreduce_test.py --backend=gloo
+echo "XCCL alltoall"
+/bin/bash ${TORCH_UCC_SRC_DIR}/test/start_test.sh ${TORCH_UCC_SRC_DIR}/test/torch_alltoall_test.py --backend=gloo
+echo "XCCL alltoallv"
+/bin/bash ${TORCH_UCC_SRC_DIR}/test/start_test.sh ${TORCH_UCC_SRC_DIR}/test/torch_alltoallv_test.py --backend=gloo
+echo "XCCL barrier"
+/bin/bash ${TORCH_UCC_SRC_DIR}/test/start_test.sh ${TORCH_UCC_SRC_DIR}/test/torch_barrier_test.py --backend=gloo
+echo "XCCL allgather"
+/bin/bash ${TORCH_UCC_SRC_DIR}/test/start_test.sh ${TORCH_UCC_SRC_DIR}/test/torch_allgather_test.py --backend=gloo
+echo "XCCL broadcast"
+/bin/bash ${TORCH_UCC_SRC_DIR}/test/start_test.sh ${TORCH_UCC_SRC_DIR}/test/torch_bcast_test.py --backend=gloo
diff --git a/.ci/scripts/run_tests_xccl.sh b/.ci/scripts/run_tests_xccl.sh
index 5506f38..c3929ca 100755
--- a/.ci/scripts/run_tests_xccl.sh
+++ b/.ci/scripts/run_tests_xccl.sh
@@ -1,23 +1,25 @@
 #!/bin/bash -eEx
+set -o pipefail
 
 command -v mpirun
 export UCX_SOCKADDR_CM_ENABLE=n
-#MPI_ARGS_COMMON="--allow-run-as-root --oversubscribe -np 8 -H localhost:8 --bind-to none -mca coll ^hcoll"
-#mpirun ${MPI_ARGS_COMMON} -x XCCL_TEST_TLS=hier ${XCCL_BUILD_DIR}/test/test_mpi_allreduce
-#mpirun ${MPI_ARGS_COMMON} -x XCCL_TEST_TLS=hier ${XCCL_BUILD_DIR}/test/test_mpi_bcast
-#mpirun ${MPI_ARGS_COMMON} -x XCCL_TEST_TLS=hier ${XCCL_BUILD_DIR}/test/test_mpi_barrier
-#
-#mpirun ${MPI_ARGS_COMMON} -x XCCL_TEAM_HIER_NODE_LEADER_RANK_ID=3 -x XCCL_TEST_TLS=hier ${XCCL_BUILD_DIR}/test/test_mpi_allreduce
-#mpirun ${MPI_ARGS_COMMON} -x XCCL_TEAM_HIER_NODE_LEADER_RANK_ID=4 -x XCCL_TEST_TLS=hier ${XCCL_BUILD_DIR}/test/test_mpi_bcast
-#mpirun ${MPI_ARGS_COMMON} -x XCCL_TEAM_HIER_NODE_LEADER_RANK_ID=5 -x XCCL_TEST_TLS=hier ${XCCL_BUILD_DIR}/test/test_mpi_barrier
-#
-#mpirun ${MPI_ARGS_COMMON} -x XCCL_TEAM_UCX_ALLREDUCE_ALG_ID=0 -x XCCL_TEST_TLS=ucx ${XCCL_BUILD_DIR}/test/test_mpi_allreduce
-#mpirun ${MPI_ARGS_COMMON} -x XCCL_TEAM_UCX_ALLREDUCE_ALG_ID=1 -x XCCL_TEST_TLS=ucx ${XCCL_BUILD_DIR}/test/test_mpi_allreduce
-#mpirun ${MPI_ARGS_COMMON} -x XCCL_TEST_TLS=ucx ${XCCL_BUILD_DIR}/test/test_mpi_bcast
-#mpirun ${MPI_ARGS_COMMON} -x XCCL_TEST_TLS=ucx ${XCCL_BUILD_DIR}/test/test_mpi_barrier
-#mpirun ${MPI_ARGS_COMMON} -x XCCL_TEST_TLS=ucx ${XCCL_BUILD_DIR}/test/test_mpi_alltoall
-#mpirun ${MPI_ARGS_COMMON} -x XCCL_TEST_TLS=ucx ${XCCL_BUILD_DIR}/test/test_mpi_alltoallv
-#mpirun ${MPI_ARGS_COMMON} -x XCCL_TEST_TLS=ucx ${XCCL_BUILD_DIR}/test/test_mpi_allgather
-#mpirun -x XCCL_TEAM_UCX_ALLTOALL_PAIRWISE_CHUNK=0 ${MPI_ARGS_COMMON} -x XCCL_TEST_TLS=ucx ${XCCL_BUILD_DIR}/test/test_mpi_alltoall
-#mpirun -x XCCL_TEAM_UCX_ALLTOALL_PAIRWISE_CHUNK=0 ${MPI_ARGS_COMMON} -x XCCL_TEST_TLS=ucx ${XCCL_BUILD_DIR}/test/test_mpi_alltoallv
-#mpirun ${MPI_ARGS_COMMON} -x XCCL_TEST_TLS=hier -x XCCL_TEST_ITERS=500 -x XCCL_TEST_NTHREADS=4 -x XCCL_TEST_CHECK=1 ${XCCL_BUILD_DIR}/test/test_mpi_mt
+export UCX_WARN_UNUSED_ENV_VARS=n
+MPI_ARGS_COMMON="--allow-run-as-root --oversubscribe -np 8 -H localhost:8 --bind-to none --mca coll ^hcoll --mca btl ^openib --mca mtl ^ofi"
+mpirun ${MPI_ARGS_COMMON} -x XCCL_TEST_TLS=hier ${TORCH_UCC_SRC_DIR}/xccl/build-${XCCL_BUILD_TYPE}/test/test_mpi_allreduce
+mpirun ${MPI_ARGS_COMMON} -x XCCL_TEST_TLS=hier ${TORCH_UCC_SRC_DIR}/xccl/build-${XCCL_BUILD_TYPE}/test/test_mpi_bcast
+mpirun ${MPI_ARGS_COMMON} -x XCCL_TEST_TLS=hier ${TORCH_UCC_SRC_DIR}/xccl/build-${XCCL_BUILD_TYPE}/test/test_mpi_barrier
+
+mpirun ${MPI_ARGS_COMMON} -x XCCL_TEAM_HIER_NODE_LEADER_RANK_ID=3 -x XCCL_TEST_TLS=hier ${TORCH_UCC_SRC_DIR}/xccl/build-${XCCL_BUILD_TYPE}/test/test_mpi_allreduce
+mpirun ${MPI_ARGS_COMMON} -x XCCL_TEAM_HIER_NODE_LEADER_RANK_ID=4 -x XCCL_TEST_TLS=hier ${TORCH_UCC_SRC_DIR}/xccl/build-${XCCL_BUILD_TYPE}/test/test_mpi_bcast
+mpirun ${MPI_ARGS_COMMON} -x XCCL_TEAM_HIER_NODE_LEADER_RANK_ID=5 -x XCCL_TEST_TLS=hier ${TORCH_UCC_SRC_DIR}/xccl/build-${XCCL_BUILD_TYPE}/test/test_mpi_barrier
+
+mpirun ${MPI_ARGS_COMMON} -x XCCL_TEAM_UCX_ALLREDUCE_ALG_ID=0 -x XCCL_TEST_TLS=ucx ${TORCH_UCC_SRC_DIR}/xccl/build-${XCCL_BUILD_TYPE}/test/test_mpi_allreduce
+mpirun ${MPI_ARGS_COMMON} -x XCCL_TEAM_UCX_ALLREDUCE_ALG_ID=1 -x XCCL_TEST_TLS=ucx ${TORCH_UCC_SRC_DIR}/xccl/build-${XCCL_BUILD_TYPE}/test/test_mpi_allreduce
+mpirun ${MPI_ARGS_COMMON} -x XCCL_TEST_TLS=ucx ${TORCH_UCC_SRC_DIR}/xccl/build-${XCCL_BUILD_TYPE}/test/test_mpi_bcast
+mpirun ${MPI_ARGS_COMMON} -x XCCL_TEST_TLS=ucx ${TORCH_UCC_SRC_DIR}/xccl/build-${XCCL_BUILD_TYPE}/test/test_mpi_barrier
+mpirun ${MPI_ARGS_COMMON} -x XCCL_TEST_TLS=ucx ${TORCH_UCC_SRC_DIR}/xccl/build-${XCCL_BUILD_TYPE}/test/test_mpi_alltoall
+mpirun ${MPI_ARGS_COMMON} -x XCCL_TEST_TLS=ucx ${TORCH_UCC_SRC_DIR}/xccl/build-${XCCL_BUILD_TYPE}/test/test_mpi_alltoallv
+mpirun ${MPI_ARGS_COMMON} -x XCCL_TEST_TLS=ucx ${TORCH_UCC_SRC_DIR}/xccl/build-${XCCL_BUILD_TYPE}/test/test_mpi_allgather
+mpirun -x XCCL_TEAM_UCX_ALLTOALL_PAIRWISE_CHUNK=0 ${MPI_ARGS_COMMON} -x XCCL_TEST_TLS=ucx ${TORCH_UCC_SRC_DIR}/xccl/build-${XCCL_BUILD_TYPE}/test/test_mpi_alltoall
+mpirun -x XCCL_TEAM_UCX_ALLTOALL_PAIRWISE_CHUNK=0 ${MPI_ARGS_COMMON} -x XCCL_TEST_TLS=ucx ${TORCH_UCC_SRC_DIR}/xccl/build-${XCCL_BUILD_TYPE}/test/test_mpi_alltoallv
+mpirun ${MPI_ARGS_COMMON} -x XCCL_TEST_TLS=hier -x XCCL_TEST_ITERS=500 -x XCCL_TEST_NTHREADS=4 -x XCCL_TEST_CHECK=1 ${TORCH_UCC_SRC_DIR}/xccl/build-${XCCL_BUILD_TYPE}/test/test_mpi_mt
diff --git a/.gitmodules b/.gitmodules
index 1350374..562a232 100644
--- a/.gitmodules
+++ b/.gitmodules
@@ -1,9 +1,9 @@
 [submodule "xccl"]
-    path = xccl
-    url = https://github.com/openucx/xccl.git
+path = xccl
+url = https://github.com/openucx/xccl.git
 [submodule "ucc"]
-    path = ucc
-    url = https://github.com/openucx/ucc.git
+path = ucc
+url = https://github.com/openucx/ucc.git
 [submodule "ucx"]
-    path = ucx
-    url = https://github.com/openucx/ucx.git
+path = ucx
+url = https://github.com/openucx/ucx.git

From 2dfe78c8b3736260894bc859540515da3a25275e Mon Sep 17 00:00:00 2001
From: artemry-nv
Date: Mon, 15 Mar 2021 09:40:58 +0300
Subject: [PATCH 6/8] Synced

---
 ucc  | 2 +-
 ucx  | 2 +-
 xccl | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/ucc b/ucc
index 92608a3..45cdefa 160000
--- a/ucc
+++ b/ucc
@@ -1 +1 @@
-Subproject commit 92608a35c360a29198c7a2a7ca496fc23bd43997
+Subproject commit 45cdefa40010efe3b1634df0c881cb720ccf293d
diff --git a/ucx b/ucx
index ecaaf60..9184ba2 160000
--- a/ucx
+++ b/ucx
@@ -1 +1 @@
-Subproject commit ecaaf60418e3a00020f714be16ecc1d70ff120a5
+Subproject commit 9184ba2ad7346985b14a56cce4b451059b649953
diff --git a/xccl b/xccl
index 3cd39e9..b046913 160000
--- a/xccl
+++ b/xccl
@@ -1 +1 @@
-Subproject commit 3cd39e982ef32e4d3562c58ab11c03ee3ba05514
+Subproject commit b04691392586477dd83bdf6de75f440540cb688c
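Patch 6 only advances the three submodule pointers. For reference, a sync commit of this shape is typically produced along these lines (a sketch; --remote follows the submodules' default upstream branches, which this series does not pin in .gitmodules):

# Fetch the latest upstream commits and record the new pointers
git submodule update --remote ucc ucx xccl
git add ucc ucx xccl
git commit -m "Synced"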
From a11486587bbd208f04640da18b2b70a6ad30be69 Mon Sep 17 00:00:00 2001
From: artemry-nv
Date: Mon, 29 Mar 2021 22:36:00 +0300
Subject: [PATCH 7/8] Added multi node DLRM tests

---
 .ci/Dockerfile.centos8                     |  77 +-
 .ci/Jenkinsfile.shlib                      |   2 +-
 .ci/job_matrix.yaml                        | 165 ++--
 .ci/patches/dlrm/0002-Fixed-arg-list.patch |  25 +
 .ci/scripts/build_ucc.sh                   |  16 +
 .ci/scripts/build_xccl.sh                  |  17 +-
 .ci/scripts/configure_python.sh            |  24 +
 .ci/scripts/env.sh                         |  19 +-
 .ci/scripts/install_torch.sh               |  47 +-
 .ci/scripts/install_torch_ucc.sh           |  14 +-
 .ci/scripts/install_torch_xccl.sh          |  21 +
 .ci/scripts/run_dlrm.sh                    |  71 ++
 .ci/scripts/run_dlrm_docker.sh             | 100 +++
 .ci/scripts/run_dlrm_s_pytorch.sh          |  81 ++
 .ci/scripts/run_tests_torch_ucc.sh         |  31 +-
 .ci/scripts/run_tests_torch_xccl.sh        |  24 +
 .ci/scripts/run_tests_ucc.sh               |   8 +
 .github/workflows/main.yaml                |  33 +-
 include/torch_ucc.hpp                      | 374 +++++++--
 include/torch_ucc_ops.hpp                  | 168 ----
 include/torch_ucc_sendrecv.hpp             | 271 ------
 include/torch_xccl.hpp                     |  41 -
 setup.py                                   |  24 +-
 src/torch_ucc.cpp                          | 911 +++++++++++++--------
 src/torch_ucc_sendrecv.cpp                 | 188 -----
 src/torch_xccl.cpp                         | 646 ---------------
 test/torch_alltoallv_test.py               |  12 +-
 test/torch_init_test.py                    |  27 +
 test/torch_sendrecv_test.py                |  27 +
 test/torch_ucc_test_setup.py               |   2 +-
 ucc                                        |   2 +-
 ucx                                        |   2 +-
 32 files changed, 1531 insertions(+), 1939 deletions(-)
 create mode 100644 .ci/patches/dlrm/0002-Fixed-arg-list.patch
 create mode 100755 .ci/scripts/build_ucc.sh
 create mode 100755 .ci/scripts/install_torch_xccl.sh
 create mode 100755 .ci/scripts/run_dlrm.sh
 create mode 100755 .ci/scripts/run_dlrm_docker.sh
 create mode 100755 .ci/scripts/run_dlrm_s_pytorch.sh
 create mode 100755 .ci/scripts/run_tests_torch_xccl.sh
 create mode 100755 .ci/scripts/run_tests_ucc.sh
 delete mode 100644 include/torch_ucc_ops.hpp
 delete mode 100644 include/torch_ucc_sendrecv.hpp
 delete mode 100644 include/torch_xccl.hpp
 delete mode 100644 src/torch_ucc_sendrecv.cpp
 delete mode 100644 src/torch_xccl.cpp
 create mode 100644 test/torch_init_test.py
 create mode 100644 test/torch_sendrecv_test.py

diff --git a/.ci/Dockerfile.centos8 b/.ci/Dockerfile.centos8
index a3b7b78..3586b3d 100644
--- a/.ci/Dockerfile.centos8
+++ b/.ci/Dockerfile.centos8
@@ -12,37 +12,45 @@ ENV UCX_BUILD_TYPE=release-mt
 ENV UCX_INSTALL_DIR=${TORCH_UCC_BIN_DIR}/ucx/build-${UCX_BUILD_TYPE}
 ENV XCCL_BUILD_TYPE=debug
 ENV XCCL_INSTALL_DIR=${TORCH_UCC_BIN_DIR}/xccl/build-${XCCL_BUILD_TYPE}
+ENV UCC_INSTALL_DIR=${TORCH_UCC_BIN_DIR}/ucc/build
+ENV TORCH_UCC_PYTHON_VENV_DIR=${TORCH_UCC_BIN_DIR}/python/venv
 #==============================================================================
 RUN mkdir -p ${TORCH_UCC_SRC_DIR} && \
-    mkdir -p ${TORCH_UCC_PKG_DIR} && \
-    mkdir -p ${TORCH_UCC_BIN_DIR} && \
-    mkdir -p ${TORCH_UCC_WORKLOADS_DIR}
+mkdir -p ${TORCH_UCC_PKG_DIR} && \
+mkdir -p ${TORCH_UCC_BIN_DIR} && \
+mkdir -p ${TORCH_UCC_WORKLOADS_DIR} && \
+mkdir -p ${TORCH_UCC_PYTHON_VENV_DIR}
 
 COPY . ${TORCH_UCC_SRC_DIR}
 #==============================================================================
 RUN yum groupinstall -y \
-    'Development Tools' \
-    'Infiniband Support'
+'Development Tools' \
+'Infiniband Support'
 RUN yum config-manager --set-enabled powertools && yum install -y \
-    cmake \
-    numactl \
-    numactl-devel \
-    openmpi \
-    openmpi-devel \
-    openssh-server \
-    protobuf-compiler \
-    protobuf-devel \
-    python36-devel \
-    vim
+cmake \
+numactl \
+numactl-devel \
+openmpi \
+openmpi-devel \
+openssh-server \
+protobuf-compiler \
+protobuf-devel \
+python36-devel \
+rdma-core-devel \
+vim
 
 # Remove old UCX
 RUN rpm -e --nodeps ucx
 
-ENV PATH=/usr/lib64/openmpi/bin:${PATH}
+ENV PATH=/usr/lib64/openmpi/bin:$PATH
+RUN echo "export PATH=\"/usr/lib64/openmpi/bin:\$PATH\"" >> /etc/bashrc && \
+echo "export LD_LIBRARY_PATH=\"/usr/lib64/openmpi/lib:\${LD_LIBRARY_PATH}\"" >> /etc/bashrc
 #==============================================================================
 # Configure SSH
 RUN mkdir -p /var/run/sshd && \
-    cat /etc/ssh/ssh_config | grep -v StrictHostKeyChecking > /etc/ssh/ssh_config.new && \
-    echo "    StrictHostKeyChecking no" >> /etc/ssh/ssh_config.new && \
-    mv /etc/ssh/ssh_config.new /etc/ssh/ssh_config
+cat /etc/ssh/ssh_config | grep -v StrictHostKeyChecking > /etc/ssh/ssh_config.new && \
+echo "    StrictHostKeyChecking no" >> /etc/ssh/ssh_config.new && \
+mv /etc/ssh/ssh_config.new /etc/ssh/ssh_config && \
+ssh-keygen -A && \
+rm -f /run/nologin
 #==============================================================================
 # Build UCX
 RUN ${TORCH_UCC_SRC_DIR}/.ci/scripts/build_ucx.sh
@@ -54,18 +62,33 @@ RUN ${TORCH_UCC_SRC_DIR}/.ci/scripts/configure_python.sh
 # Build XCCL
 RUN ${TORCH_UCC_SRC_DIR}/.ci/scripts/build_xccl.sh
 #==============================================================================
+# Build UCC
+RUN ${TORCH_UCC_SRC_DIR}/.ci/scripts/build_ucc.sh
+#==============================================================================
 # Install PyTorch
 RUN ${TORCH_UCC_SRC_DIR}/.ci/scripts/install_torch.sh
 #==============================================================================
-# Install torch_ucc python module and build a wheel package
-RUN ${TORCH_UCC_SRC_DIR}/.ci/scripts/install_torch_ucc.sh
-#==============================================================================
 # Install workloads
+# TODO upstream the patches (if needed)
 WORKDIR ${TORCH_UCC_WORKLOADS_DIR}
 RUN git clone https://github.com/facebookresearch/dlrm.git && \
-    cd ${TORCH_UCC_WORKLOADS_DIR}/dlrm && \
-    git apply ${TORCH_UCC_SRC_DIR}/.ci/patches/dlrm/0001-Added-torch_ucc-support.patch && \
-    pip3 install -r ${TORCH_UCC_WORKLOADS_DIR}/dlrm/requirements.txt && \
-    pip3 install tensorboard
+cd ${TORCH_UCC_WORKLOADS_DIR}/dlrm && \
+git apply ${TORCH_UCC_SRC_DIR}/.ci/patches/dlrm/0001-Added-torch_ucc-support.patch && \
+git apply ${TORCH_UCC_SRC_DIR}/.ci/patches/dlrm/0002-Fixed-arg-list.patch && \
+pip3 install -r ${TORCH_UCC_WORKLOADS_DIR}/dlrm/requirements.txt && \
+pip3 install tensorboard
 RUN git clone https://github.com/facebookresearch/param.git && \
-    pip3 install -r ${TORCH_UCC_WORKLOADS_DIR}/param/requirements.txt
\ No newline at end of file
+pip3 install -r ${TORCH_UCC_WORKLOADS_DIR}/param/requirements.txt
+#==============================================================================
+# Install torch_ucc (XCCL version) python module and build a wheel package
+RUN ${TORCH_UCC_SRC_DIR}/.ci/scripts/install_torch_xccl.sh
+#==============================================================================
+# Install torch_ucc (UCC version) python module and build a wheel package
+RUN ${TORCH_UCC_SRC_DIR}/.ci/scripts/install_torch_ucc.sh
+#==============================================================================
+RUN groupadd -g 11429 swx-jenkins
+RUN adduser --uid 6213 --gid 11429 --home /home/swx-jenkins swx-jenkins
+
+RUN groupadd -g 30 dip
+RUN adduser --no-create-home --uid 50009 --gid 30 --home /labhome/artemry artemry
+#==============================================================================
diff --git a/.ci/Jenkinsfile.shlib b/.ci/Jenkinsfile.shlib
index 10bde70..2486083 100644
--- a/.ci/Jenkinsfile.shlib
+++ b/.ci/Jenkinsfile.shlib
@@ -2,7 +2,7 @@
 
 // load pipeline functions
 // Requires pipeline-github-lib plugin to load library from github
-@Library('github.com/Mellanox/ci-demo@master')
+@Library('github.com/Mellanox/ci-demo@stable')
 def matrix = new com.mellanox.cicd.Matrix()
 
 matrix.main()
diff --git a/.ci/job_matrix.yaml b/.ci/job_matrix.yaml
index 7610198..baaa9a1 100644
--- a/.ci/job_matrix.yaml
+++ b/.ci/job_matrix.yaml
@@ -9,85 +9,134 @@ registry_auth: '05d98651-e11c-4a57-9cc6-52df79014b89'
 #  cloud: 'swx-k8s'
 
 volumes:
-  - { mountPath: '/hpc/local', hostPath: '/hpc/local' }
-  - { mountPath: '/auto/sw_tools', hostPath: '/auto/sw_tools' }
-  - { mountPath: '/.autodirect/mtrswgwork', hostPath: '/.autodirect/mtrswgwork' }
-  - { mountPath: '/.autodirect/sw/release', hostPath: '/.autodirect/sw/release' }
+- { mountPath: '/hpc/local', hostPath: '/hpc/local' }
+- { mountPath: '/auto/sw_tools', hostPath: '/auto/sw_tools' }
+- { mountPath: '/.autodirect/mtrswgwork', hostPath: '/.autodirect/mtrswgwork' }
+- { mountPath: '/.autodirect/sw/release', hostPath: '/.autodirect/sw/release' }
 
 env:
-  CUDA_VER: '11.1.1'
-  # CUDA_VER: '11.2.1'
+  CUDA_VER: '11.2.1'
   TORCH_UCC_URI_SUFFIX: '${TORCH_UCC_VERSION}/x86_64/centos8/cuda${CUDA_VER}'
   TORCH_UCC_DOCKER_IMAGE_NAME: '${registry_host}${registry_path}/${TORCH_UCC_URI_SUFFIX}'
   TORCH_UCC_ROOT_DIR: '/opt/nvidia/torch-ucc'
   TORCH_UCC_SRC_DIR: '${TORCH_UCC_ROOT_DIR}/src'
+  TORCH_UCC_BIN_DIR: '${TORCH_UCC_ROOT_DIR}/bin'
+  TORCH_UCC_PYTHON_VENV_DIR: '${TORCH_UCC_BIN_DIR}/python/venv'
   XCCL_BUILD_TYPE: 'debug'
 
 docker_opt: '--pull always --network=host --uts=host --ipc=host --ulimit stack=67108864 --ulimit memlock=-1 --security-opt seccomp=unconfined --cap-add=SYS_ADMIN --device=/dev/infiniband/ --gpus all --user root'
 
 runs_on_dockers:
-  - {
-      file: '.ci/Dockerfile.centos8',
-      name: 'centos8',
-      tag: '${BUILD_NUMBER}',
-      arch: 'x86_64',
-      uri: '${TORCH_UCC_URI_SUFFIX}',
-      build_args: '--rm --no-cache --build-arg CUDA_VER=${CUDA_VER} --build-arg TORCH_UCC_ROOT_DIR=${TORCH_UCC_ROOT_DIR}',
-      cloud: 'swx-k8s',
-      nodeLabel: 'swx-clx01 || swx-clx02',
-    }
+- {
+    file: '.ci/Dockerfile.centos8',
+    name: 'centos8',
+    tag: '${BUILD_NUMBER}',
+    arch: 'x86_64',
+    uri: '${TORCH_UCC_URI_SUFFIX}',
+    build_args: '--rm --no-cache --build-arg CUDA_VER=${CUDA_VER} --build-arg TORCH_UCC_ROOT_DIR=${TORCH_UCC_ROOT_DIR}',
+    cloud: 'swx-k8s',
+    nodeLabel: 'swx-clx01 || swx-clx02',
+  }
 
 # bare metal
-#runs_on_agents:
-#  - nodeLabel: 'swx-clx01 || swx-clx02'
+runs_on_agents:
+- nodeLabel: 'swx-clx01 || swx-clx02'
 
 # TODO debug
 timeout_minutes: '400'
 
 steps:
-  #============================================================================
-  - name: Check Env
-    #agentSelector: "{nodeLabel: 'swx-clx01 || swx-clx02'}"
-    run: |
-      echo "INFO: check environment"
-      hostname
-      printenv
-      cat /proc/1/cgroup
-      cat /etc/*release*
-      id
-      find /opt/nvidia
-      ibv_devinfo
-      nvidia-smi
-      nvidia-smi topo -m
-  #============================================================================
-  - name: Run XCCL tests
-    #agentSelector: "{nodeLabel: 'swx-clx01 || swx-clx02'}"
-    run: |
-      echo "INFO: Run XCCL tests"
-      hostname
-      cat /proc/1/cgroup
-      ${TORCH_UCC_SRC_DIR}/.ci/scripts/run_tests_xccl.sh
-  #============================================================================
-  - name: Run Torch-UCC tests
-    #agentSelector: "{nodeLabel: 'swx-clx01 || swx-clx02'}"
-    run: |
-      echo "INFO: Run Torch-UCC tests"
-      hostname
-      cat /proc/1/cgroup
-      ${TORCH_UCC_SRC_DIR}/.ci/scripts/run_tests_torch_ucc.sh
 #============================================================================
-# - name: Run FB DLRM tests
-#   #agentSelector: "{nodeLabel: 'swx-clx01 || swx-clx02'}"
-#   run: |
-#     echo "INFO: Run FB DLRM tests"
-#     printenv
-#     cat /proc/1/cgroup
-#     cat /etc/*release*
-#     id
-#     find /opt/nvidia
-#     #ibv_devinfo
-#     #nvidia-smi
-#     #${WORKSPACE}/.ci/scripts/run_fb_dlrm_docker.sh
+- name: Check Env
+  agentSelector: "{nodeLabel: 'swx-clx01 || swx-clx02'}"
+  containerSelector: "{name:'centos8'}"
+  run: |
+    echo "INFO: check environment"
+    hostname
+    printenv
+    cat /proc/1/cgroup
+    cat /etc/*release*
+    id
+    #find /opt/nvidia
+    #ibv_devinfo
+    #nvidia-smi
+    #nvidia-smi topo -m
+#============================================================================
+- name: Run XCCL tests
+  #agentSelector: "{nodeLabel: 'swx-clx01 || swx-clx02'}"
+  containerSelector: "{name:'centos8'}"
+  run: |
+    echo "INFO: Run XCCL tests"
+    . "${TORCH_UCC_PYTHON_VENV_DIR}/xccl/bin/activate"
+    hostname
+    cat /proc/1/cgroup
+    pip3 list | grep torch
+    ${TORCH_UCC_SRC_DIR}/.ci/scripts/run_tests_xccl.sh
+    deactivate
+#============================================================================
+# - name: Run UCC tests
+#   #agentSelector: "{nodeLabel: 'swx-clx01 || swx-clx02'}"
+#   containerSelector: "{name:'centos8'}"
+#   run: |
+#     echo "INFO: Run UCC tests"
+#     . "${TORCH_UCC_PYTHON_VENV_DIR}/ucc/bin/activate"
+#     hostname
+#     cat /proc/1/cgroup
+#     pip3 list | grep torch
+#     ${TORCH_UCC_SRC_DIR}/.ci/scripts/run_tests_ucc.sh
+#     deactivate
+#============================================================================
+- name: Run Torch-UCC tests (XCCL)
+  #agentSelector: "{nodeLabel: 'swx-clx01 || swx-clx02'}"
+  containerSelector: "{name:'centos8'}"
+  run: |
+    echo "INFO: Run Torch-UCC tests (XCCL)"
+    . "${TORCH_UCC_PYTHON_VENV_DIR}/xccl/bin/activate"
+    hostname
+    cat /proc/1/cgroup
+    pip3 list | grep torch
+    ${TORCH_UCC_SRC_DIR}/.ci/scripts/run_tests_torch_xccl.sh
+    deactivate
+#============================================================================
+- name: Run Torch-UCC tests (UCC)
+  #agentSelector: "{nodeLabel: 'swx-clx01 || swx-clx02'}"
+  containerSelector: "{name:'centos8'}"
+  run: |
+    echo "INFO: Run Torch-UCC tests (UCC)"
+    . "${TORCH_UCC_PYTHON_VENV_DIR}/ucc/bin/activate"
+    hostname
+    cat /proc/1/cgroup
+    pip3 list | grep torch
+    ${TORCH_UCC_SRC_DIR}/.ci/scripts/run_tests_torch_ucc.sh
+    deactivate
+#============================================================================
+- name: Run DLRM tests (XCCL/GPU)
+  agentSelector: "{nodeLabel: 'swx-clx01 || swx-clx02'}"
+  run: |
+    echo "INFO: Run DLRM tests (XCCL/GPU)"
+    hostname
+    printenv
+    cat /proc/1/cgroup
+    cat /etc/*release*
+    id
+    find /opt/nvidia
+    ibv_devinfo
+    nvidia-smi
+    ${WORKSPACE}/.ci/scripts/run_dlrm_docker.sh xccl
+#============================================================================
+- name: Run DLRM tests (UCC/GPU)
+  agentSelector: "{nodeLabel: 'swx-clx01 || swx-clx02'}"
+  run: |
+    echo "INFO: Run DLRM tests (UCC/GPU)"
+    hostname
+    printenv
+    cat /proc/1/cgroup
+    cat /etc/*release*
+    id
+    find /opt/nvidia
+    ibv_devinfo
+    nvidia-smi
+    ${WORKSPACE}/.ci/scripts/run_dlrm_docker.sh ucc
 #============================================================================
 # - name: Run PARAM benchmarks
 #   agentSelector: "{nodeLabel: 'swx-clx01 || swx-clx02'}"
@@ -96,4 +145,4 @@ steps:
 #     hostname
 #     cat /proc/1/cgroup
 #     #${TORCH_UCC_SRC_DIR}/.ci/scripts/run_param_benchmarks.sh
-#============================================================================
\ No newline at end of file
+#============================================================================
diff --git a/.ci/patches/dlrm/0002-Fixed-arg-list.patch b/.ci/patches/dlrm/0002-Fixed-arg-list.patch
new file mode 100644
index 0000000..4f6b1ce
--- /dev/null
+++ b/.ci/patches/dlrm/0002-Fixed-arg-list.patch
@@ -0,0 +1,25 @@
+From 481fd6aef896aa8ff15a161b7e88b2ea01ae673a Mon Sep 17 00:00:00 2001
+From: artemry-nv
+Date: Mon, 29 Mar 2021 01:56:08 +0300
+Subject: [PATCH] Fixed arg list
+
+---
+ dlrm_s_pytorch.py | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+diff --git a/dlrm_s_pytorch.py b/dlrm_s_pytorch.py
+index 71a0414..36dab9b 100644
+--- a/dlrm_s_pytorch.py
++++ b/dlrm_s_pytorch.py
+@@ -1477,7 +1477,7 @@ def run():
+ 
+     ext_dist.barrier()
+     with torch.autograd.profiler.profile(
+-        args.enable_profiling, use_gpu, record_shapes=True
++        args.enable_profiling, use_cuda=use_gpu, record_shapes=True
+     ) as prof:
+         if not args.inference_only:
+             k = 0
+-- 
+2.24.3 (Apple Git-128)
+
diff --git a/.ci/scripts/build_ucc.sh b/.ci/scripts/build_ucc.sh
new file mode 100755
index 0000000..988d570
--- /dev/null
+++ b/.ci/scripts/build_ucc.sh
@@ -0,0 +1,16 @@
+#!/bin/bash -eEx
+set -o pipefail
+
+echo "INFO: Build UCC"
+UCC_SRC_DIR="${TORCH_UCC_SRC_DIR}/ucc"
+cd "${UCC_SRC_DIR}"
+"${UCC_SRC_DIR}/autogen.sh"
+mkdir -p "${UCC_SRC_DIR}/build"
+cd "${UCC_SRC_DIR}/build"
+"${UCC_SRC_DIR}/configure" --with-ucx="${UCX_INSTALL_DIR}" --with-cuda="${CUDA_HOME}" \
+    --prefix="${UCC_INSTALL_DIR}" --enable-gtest
+make -j install
+echo "${UCC_INSTALL_DIR}/lib" > /etc/ld.so.conf.d/ucc.conf
+ldconfig
+ldconfig -p | grep -i libucc
+cd "${UCC_INSTALL_DIR}" && tar cfz "${TORCH_UCC_PKG_DIR}/ucc.tgz" --owner=0 --group=0 .
diff --git a/.ci/scripts/build_xccl.sh b/.ci/scripts/build_xccl.sh
index 409dcb7..45cebb9 100755
--- a/.ci/scripts/build_xccl.sh
+++ b/.ci/scripts/build_xccl.sh
@@ -2,17 +2,18 @@
 set -o pipefail
 
 echo "INFO: Build XCCL"
-cd "${TORCH_UCC_SRC_DIR}/xccl"
-"${TORCH_UCC_SRC_DIR}/xccl/autogen.sh"
-mkdir -p "${TORCH_UCC_SRC_DIR}/xccl/build-${XCCL_BUILD_TYPE}"
-cd "${TORCH_UCC_SRC_DIR}/xccl/build-${XCCL_BUILD_TYPE}"
+XCCL_SRC_DIR="${TORCH_UCC_SRC_DIR}/xccl"
+cd "${XCCL_SRC_DIR}"
+"${XCCL_SRC_DIR}/autogen.sh"
+mkdir -p "${XCCL_SRC_DIR}/build-${XCCL_BUILD_TYPE}"
+cd "${XCCL_SRC_DIR}/build-${XCCL_BUILD_TYPE}"
 # TODO enable CUDA (compilation failed)
-#"${TORCH_UCC_SRC_DIR}/xccl/configure" --with-ucx="${UCX_INSTALL_DIR}" --prefix="${XCCL_INSTALL_DIR}" --enable-debug
-"${TORCH_UCC_SRC_DIR}/xccl/configure" --with-cuda="${CUDA_HOME}" --with-ucx="${UCX_INSTALL_DIR}" \
-    --prefix="${XCCL_INSTALL_DIR}" --enable-debug
+#"${XCCL_SRC_DIR}/configure" --with-ucx="${UCX_INSTALL_DIR}" --prefix="${XCCL_INSTALL_DIR}" --enable-debug
+"${XCCL_SRC_DIR}/configure" --with-cuda="${CUDA_HOME}" --with-ucx="${UCX_INSTALL_DIR}" \
+--prefix="${XCCL_INSTALL_DIR}" --enable-debug
 make -j install
 echo "${XCCL_INSTALL_DIR}/lib" > /etc/ld.so.conf.d/xccl.conf
 ldconfig
-ldconfig -p | grep -i xccl
+ldconfig -p | grep -i libxccl
 make -C test
 cd "${XCCL_INSTALL_DIR}" && tar cfz "${TORCH_UCC_PKG_DIR}/xccl-${XCCL_BUILD_TYPE}.tgz" --owner=0 --group=0 .
diff --git a/.ci/scripts/configure_python.sh b/.ci/scripts/configure_python.sh
index fddaf10..45dbc94 100755
--- a/.ci/scripts/configure_python.sh
+++ b/.ci/scripts/configure_python.sh
@@ -1,5 +1,29 @@
 #!/bin/bash -eEx
 set -o pipefail
 
+# Install conda
+#cd /tmp
+#curl -LO http://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh
+#bash Miniconda3-latest-Linux-x86_64.sh -p /opt/conda -b
+#rm -f Miniconda3-latest-Linux-x86_64.sh
+#export PATH /opt/conda/bin:${PATH}
+
+# Install conda python
+#conda update -y conda
+#conda install -c anaconda -y \
+#    python \
+#    pip \
+#    scikit-learn
+#pip3 install --no-cache-dir python-hostlist
+
+#alternatives --set python /opt/conda/bin/python3
 alternatives --set python /usr/bin/python3
 
 pip3 install --user --upgrade setuptools wheel
+
+command -v python
+python --version
+
+command -v python3
+python3 --version
+
+pip3 list
diff --git a/.ci/scripts/env.sh b/.ci/scripts/env.sh
index 7096f69..649acaa 100755
--- a/.ci/scripts/env.sh
+++ b/.ci/scripts/env.sh
@@ -3,23 +3,14 @@
 SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd -P)"
 
 # shellcheck disable=SC2034
+#DLRM_MODEL="big"
 DLRM_MODEL="small"
 
 HOSTNAME=$(hostname -s)
 export HOSTNAME
 SRC_ROOT_DIR=$(cd "${SCRIPT_DIR}/../../" && pwd -P)
-CONFIGS_DIR="${SRC_ROOT_DIR}/.ci/configs"
-export HOSTFILE=${CONFIGS_DIR}/$HOSTNAME/hostfile.txt
-# shellcheck disable=SC2002
-HOSTS=$(cat "$HOSTFILE" | xargs | tr ' ' ',')
-export HOSTS
-NP=$(wc --lines "$HOSTFILE" | awk '{print $1}')
-export NP
+export CONFIGS_DIR="${SRC_ROOT_DIR}/.ci/configs"
 
-if [ ! -f "${HOSTFILE}" ]; then
-    echo "ERROR: ${HOSTFILE} does not exist"
-    exit 1
-fi
-
-export MASTER_ADDR=$HOSTNAME
-export MASTER_PORT=4242
+# DLRM MASTER_PORT
+export MASTER_PORT="12346"
+export DOCKER_SSH_PORT="12345"
diff --git a/.ci/scripts/install_torch.sh b/.ci/scripts/install_torch.sh
index 504177e..649acaa 100755
--- a/.ci/scripts/install_torch.sh
+++ b/.ci/scripts/install_torch.sh
@@ -1,37 +1,16 @@
 #!/bin/bash -eEx
-set -o pipefail
 
-# TODO debug
-#cd /tmp
-#git clone https://github.com/pytorch/pytorch.git
-#cd /tmp/pytorch
-#git submodule sync --recursive
-#git submodule update --init --recursive
-#pip install -r requirements.txt
-#export TORCH_CUDA_ARCH_LIST="7.0 8.0+PTX"
-#export USE_GLOO=1
-#export USE_DISTRIBUTED=1
-#export USE_OPENCV=0
-## TODO debug
-#export USE_CUDA=1
-##export USE_CUDA=0
-#export USE_NCCL=0
-#export USE_MKLDNN=0
-#export BUILD_TEST=0
-#export USE_FBGEMM=0
-#export USE_NNPACK=0
-#export USE_QNNPACK=0
-#export USE_XNNPACK=0
-#export USE_KINETO=1
-#export MAX_JOBS=$(($(nproc)-1))
-#python setup.py install
-#rm -rf /tmp/pytorch
+SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd -P)"
 
-# TODO debug
-#conda install -y pytorch torchvision cpuonly -c pytorch-nightly
-#conda install pytorch torchvision torchaudio cudatoolkit=11.0 -c pytorch-nightly
-#conda uninstall -y pytorch torchvision
-#conda install pytorch torchvision cudatoolkit=11.0 -c pytorch-nightly
-#conda install pytorch cudatoolkit=11.0 -c pytorch-nightly
-pip3 install --default-timeout=900 numpy
-pip3 install --default-timeout=900 --pre torch -f https://download.pytorch.org/whl/nightly/cu110/torch_nightly.html
+# shellcheck disable=SC2034
+#DLRM_MODEL="big"
+DLRM_MODEL="small"
+
+HOSTNAME=$(hostname -s)
+export HOSTNAME
+SRC_ROOT_DIR=$(cd "${SCRIPT_DIR}/../../" && pwd -P)
+export CONFIGS_DIR="${SRC_ROOT_DIR}/.ci/configs"
+
+# DLRM MASTER_PORT
+export MASTER_PORT="12346"
+export DOCKER_SSH_PORT="12345"
diff --git a/.ci/scripts/install_torch_ucc.sh b/.ci/scripts/install_torch_ucc.sh
index 8782d2d..a0d2527 100755
--- a/.ci/scripts/install_torch_ucc.sh
+++ b/.ci/scripts/install_torch_ucc.sh
@@ -1,14 +1,18 @@
 #!/bin/bash -eEx
 set -o pipefail
 
-echo "INFO: Install Torch-UCC"
-cd "${TORCH_UCC_SRC_DIR}"
+# UCC
+echo "INFO: Install Torch-UCC (UCC version)"
+cd "${TORCH_UCC_PYTHON_VENV_DIR}"
+python3 -m venv --system-site-packages ucc
+. "${TORCH_UCC_PYTHON_VENV_DIR}/ucc/bin/activate"
 export UCX_HOME=${UCX_INSTALL_DIR}
-export XCCL_HOME=${XCCL_INSTALL_DIR}
-# TODO debug
+export UCC_HOME=${UCC_INSTALL_DIR}
 export WITH_CUDA=${CUDA_HOME}
-#export WITH_CUDA=no
+cd "${TORCH_UCC_SRC_DIR}"
+git clean -ffdx
 python setup.py install bdist_wheel
 pip3 list | grep torch
 python -c 'import torch, torch_ucc'
 cp "${TORCH_UCC_SRC_DIR}/dist/"*.whl "${TORCH_UCC_PKG_DIR}"
+deactivate
diff --git a/.ci/scripts/install_torch_xccl.sh b/.ci/scripts/install_torch_xccl.sh
new file mode 100755
index 0000000..92231b4
--- /dev/null
+++ b/.ci/scripts/install_torch_xccl.sh
@@ -0,0 +1,21 @@
+#!/bin/bash -eEx
+set -o pipefail
+
+# XCCL
+echo "INFO: Install Torch-UCC (XCCL version)"
+cd "${TORCH_UCC_PYTHON_VENV_DIR}"
+python3 -m venv --system-site-packages xccl
+. "${TORCH_UCC_PYTHON_VENV_DIR}/xccl/bin/activate"
+export UCX_HOME=${UCX_INSTALL_DIR}
+export XCCL_HOME=${XCCL_INSTALL_DIR}
+export WITH_CUDA=${CUDA_HOME}
+TORCH_UCC_SRC_DIR_WITH_XCCL_SUPPORT="${TORCH_UCC_SRC_DIR}_xccl"
+mkdir -p "${TORCH_UCC_SRC_DIR_WITH_XCCL_SUPPORT}"
+git clone https://github.com/openucx/torch-ucc.git "${TORCH_UCC_SRC_DIR_WITH_XCCL_SUPPORT}"
+cd "${TORCH_UCC_SRC_DIR_WITH_XCCL_SUPPORT}"
+git clean -ffdx
+python setup.py install bdist_wheel
+pip3 list | grep torch
+python -c 'import torch, torch_ucc'
+cp "${TORCH_UCC_SRC_DIR_WITH_XCCL_SUPPORT}/dist/"*.whl "${TORCH_UCC_PKG_DIR}"
+deactivate
diff --git a/.ci/scripts/run_dlrm.sh b/.ci/scripts/run_dlrm.sh
new file mode 100755
index 0000000..026870c
--- /dev/null
+++ b/.ci/scripts/run_dlrm.sh
@@ -0,0 +1,71 @@
+#!/bin/bash -eEx
+set -o pipefail
+
+SCRIPT_DIR="$(
+    cd "$(dirname "$0")"
+    pwd -P
+)"
+cd "${SCRIPT_DIR}"
+. "${SCRIPT_DIR}/env.sh"
+
+TORCH_UCC_MODE="$1"
+CPU_GPU_MODE="$2"
+HOSTFILE="$3"
+
+if [ "${TORCH_UCC_MODE}" != "ucc" ] && [ "${TORCH_UCC_MODE}" != "xccl" ]; then
+    echo "ERROR: unsupported or empty TORCH_UCC_MODE (${TORCH_UCC_MODE}), supported values: ucc, xccl"
+    exit 1
+fi
+
+export TORCH_UCC_MODE
+export CPU_GPU_MODE
+
+if [ -z "$HOSTFILE" ]; then
+    echo "ERROR: HOSTFILE is not specified"
+    exit 1
+fi
+
+export PATH="/usr/lib64/openmpi/bin:$PATH"
+export LD_LIBRARY_PATH="/usr/lib64/openmpi/lib:${LD_LIBRARY_PATH}"
+
+HEAD_NODE=$(head -1 "$HOSTFILE")
+export HEAD_NODE
+export MASTER_ADDR=${HEAD_NODE}
+
+NP=$(wc --lines "$HOSTFILE" | awk '{print $1}')
+
+# shellcheck disable=SC2086
+mpirun \
+    -np $NP \
+    --hostfile ${HOSTFILE} \
+    --map-by node \
+    --allow-run-as-root \
+    --mca plm_rsh_args '-p 12345' \
+    -x PATH \
+    -x LD_LIBRARY_PATH \
+    hostname
+
+# shellcheck disable=SC2086
+mpirun \
+    -np $NP \
+    --hostfile ${HOSTFILE} \
+    --map-by node \
+    --allow-run-as-root \
+    --mca plm_rsh_args '-p 12345' \
+    -x PATH \
+    -x LD_LIBRARY_PATH \
+    cat /proc/1/cgroup
+
+# shellcheck disable=SC2086
+mpirun \
+    -np $NP \
+    --hostfile ${HOSTFILE} \
+    --map-by node \
+    --allow-run-as-root \
+    --mca plm_rsh_args '-p 12345' \
+    -x PATH \
+    -x LD_LIBRARY_PATH \
+    -x MASTER_ADDR \
+    -x TORCH_UCC_MODE \
+    -x CPU_GPU_MODE \
+    /opt/nvidia/torch-ucc/src/.ci/scripts/run_dlrm_s_pytorch.sh
-f "${HOSTFILE}" ]; then + echo "ERROR: ${HOSTFILE} does not exist" + exit 1 +fi + +# shellcheck disable=SC2002 +HOSTS=$(cat "$HOSTFILE" | xargs | tr ' ' ',') +export HOSTS +HEAD_NODE=$(head -1 "$HOSTFILE") +export HEAD_NODE + +DOCKER_CONTAINER_NAME="torch_ucc" +# TODO debug +DOCKER_IMAGE_NAME="${TORCH_UCC_DOCKER_IMAGE_NAME}:${BUILD_ID}" +#DOCKER_IMAGE_NAME="harbor.mellanox.com/torch-ucc/1.0.0/x86_64/centos8/cuda11.2.1:205" + +DOCKER_RUN_ARGS="\ +--pull always \ +--network=host \ +--uts=host \ +--ipc=host \ +--ulimit stack=67108864 \ +--ulimit memlock=-1 \ +--security-opt seccomp=unconfined \ +--cap-add=SYS_ADMIN \ +--device=/dev/infiniband/ \ +--gpus all \ +--user root \ +-it \ +-d \ +--rm \ +--name=${DOCKER_CONTAINER_NAME} \ +-v /labhome:/labhome \ +-v /root/.ssh:/root/.ssh \ +" + +# shellcheck disable=SC2013 +for HOST in $(cat "$HOSTFILE"); do + echo "INFO: HOST = $HOST" + + STALE_DOCKER_CONTAINER_LIST=$(sudo ssh -n "$HOST" "docker ps -a -q -f name=${DOCKER_CONTAINER_NAME}") + if [ -n "${STALE_DOCKER_CONTAINER_LIST}" ]; then + echo "WARNING: stale docker container (name: ${DOCKER_CONTAINER_NAME}) is detected on ${HOST} (to be stopped)" + echo "INFO: Stopping stale docker container (name: ${DOCKER_CONTAINER_NAME}) on ${HOST}..." + sudo ssh "${HOST}" docker stop ${DOCKER_CONTAINER_NAME} + echo "INFO: Stopping stale docker container (name: ${DOCKER_CONTAINER_NAME}) on ${HOST}... DONE" + fi + + echo "INFO: start docker container on $HOST ..." + # shellcheck disable=SC2029 + sudo ssh "$HOST" "docker run \ + ${DOCKER_RUN_ARGS} \ + ${DOCKER_IMAGE_NAME} \ + bash -c '/usr/sbin/sshd -p ${DOCKER_SSH_PORT}; sleep infinity'" + echo "INFO: start docker container on $HOST ... DONE" + + sleep 5 + + echo "INFO: verify docker container on $HOST ..." + sudo ssh -p "${DOCKER_SSH_PORT}" "$HOST" hostname + sudo ssh -p "${DOCKER_SSH_PORT}" "$HOST" cat /proc/1/cgroup + echo "INFO: verify docker container on $HOST ... DONE" +done + +# TODO remove sudo +sudo ssh -p "${DOCKER_SSH_PORT}" "${HEAD_NODE}" /opt/nvidia/torch-ucc/src/.ci/scripts/run_dlrm.sh ${TORCH_UCC_MODE} cpu /opt/nvidia/torch-ucc/src/.ci/configs/$HOSTNAME/hostfile.txt +sudo ssh -p "${DOCKER_SSH_PORT}" "${HEAD_NODE}" /opt/nvidia/torch-ucc/src/.ci/scripts/run_dlrm.sh ${TORCH_UCC_MODE} gpu /opt/nvidia/torch-ucc/src/.ci/configs/$HOSTNAME/hostfile.txt + +# TODO debug +# shellcheck disable=SC2013 +#for HOST in $(cat "$HOSTFILE"); do +# echo "INFO: stop docker container on $HOST ..." +# ssh "${HOST}" docker stop ${DOCKER_CONTAINER_NAME} +# echo "INFO: stop docker container on $HOST ... DONE" +#done diff --git a/.ci/scripts/run_dlrm_s_pytorch.sh b/.ci/scripts/run_dlrm_s_pytorch.sh new file mode 100755 index 0000000..82b2f1b --- /dev/null +++ b/.ci/scripts/run_dlrm_s_pytorch.sh @@ -0,0 +1,81 @@ +#!/bin/bash -eEx +set -o pipefail + +SCRIPT_DIR="$( + cd "$(dirname "$0")" + pwd -P +)" +cd "${SCRIPT_DIR}" +. "${SCRIPT_DIR}/env.sh" + +if [ "${TORCH_UCC_MODE}" != "ucc" ] && [ "${TORCH_UCC_MODE}" != "xccl" ]; then + echo "ERROR: unsupported or empty TORCH_UCC_MODE (${TORCH_UCC_MODE}), supported values: ucc, xccl" + exit 1 +fi + +# shellcheck disable=SC1090 +. 
"/opt/nvidia/torch-ucc/bin/python/venv/${TORCH_UCC_MODE}/bin/activate" +pip3 list | grep torch +python -c 'import torch, torch_ucc' + +case ${DLRM_MODEL} in +"big") + emb_size="1000-1000-1000-1000-1000-1000-1000-1000-1000-1000-1000-1000-1000-1000-1000-1000-1000-1000-1000-1000-1000-1000-1000-1000-1000-1000-1000-1000-1000-1000-1000-1000-1000-1000-1000-1000-1000-1000-1000-1000-1000-1000-1000-1000-1000-1000-1000-1000-1000-1000-1000-1000-1000-1000-1000-1000-1000-1000-1000-1000-1000-1000-1000-1000" + emb_dim="256" + emb_lookup="100" + bot_mlp="512-512-256" + top_mlp="1024-1024-1024-1" + loss_func="mse" + round_targets="False" + lr="0.01" + #mb_size="2048" + emb_lookup_fixed="0" + ;; +"small") + emb_size="1000-1000-1000-1000-1000-1000-1000-1000" + emb_dim="64" + emb_lookup="100" + bot_mlp="512-512-64" + top_mlp="1024-1024-1024-1" + loss_func="mse" + round_targets="False" + lr="0.01" + #mb_size="2048" + emb_lookup_fixed="0" + ;; +*) + echo "ERROR: unsupported or empty DLRM_MODEL (${DLRM_MODEL})" + exit 1 + ;; +esac + +export UCX_NET_DEVICES="mlx5_0:1" + +if [ "${CPU_GPU_MODE}" = "gpu" ]; then + DLRM_S_PYTORCH_EXTRA_ARGS="--use-gpu" +fi + +# shellcheck disable=SC2086 +python /opt/nvidia/torch-ucc/workloads/dlrm/dlrm_s_pytorch.py \ + --mini-batch-size=2048 \ + --test-mini-batch-size=16384 \ + --test-num-workers=0 \ + --num-batches=10 \ + --data-generation=random \ + --arch-mlp-bot=$bot_mlp \ + --arch-mlp-top=$top_mlp \ + --arch-sparse-feature-size=$emb_dim \ + --arch-embedding-size=$emb_size \ + --num-indices-per-lookup=$emb_lookup \ + --num-indices-per-lookup-fixed=$emb_lookup_fixed \ + --arch-interaction-op=dot \ + --numpy-rand-seed=727 \ + --print-freq=1 \ + --loss-function=$loss_func \ + --round-targets=$round_targets \ + --learning-rate=$lr \ + --print-time \ + --dist-backend=ucc \ + ${DLRM_S_PYTORCH_EXTRA_ARGS} + +deactivate diff --git a/.ci/scripts/run_tests_torch_ucc.sh b/.ci/scripts/run_tests_torch_ucc.sh index 31e2e3e..7ffcca1 100755 --- a/.ci/scripts/run_tests_torch_ucc.sh +++ b/.ci/scripts/run_tests_torch_ucc.sh @@ -5,15 +5,22 @@ command -v mpirun export TORCH_UCC_XCCL_TLS=ucx export UCX_WARN_UNUSED_ENV_VARS=n ucx_info -e -u t -echo "XCCL allreduce" -/bin/bash ${TORCH_UCC_SRC_DIR}/test/start_test.sh ${TORCH_UCC_SRC_DIR}/test/torch_allreduce_test.py --backend=gloo -echo "XCCL alltoall" -/bin/bash ${TORCH_UCC_SRC_DIR}/test/start_test.sh ${TORCH_UCC_SRC_DIR}/test/torch_alltoall_test.py --backend=gloo -echo "XCCL alltoallv" -/bin/bash ${TORCH_UCC_SRC_DIR}/test/start_test.sh ${TORCH_UCC_SRC_DIR}/test/torch_alltoallv_test.py --backend=gloo -echo "XCCL barrier" -/bin/bash ${TORCH_UCC_SRC_DIR}/test/start_test.sh ${TORCH_UCC_SRC_DIR}/test/torch_barrier_test.py --backend=gloo -echo "XCCL allgather" -/bin/bash ${TORCH_UCC_SRC_DIR}/test/start_test.sh ${TORCH_UCC_SRC_DIR}/test/torch_allgather_test.py --backend=gloo -echo "XCCL broadcast" -/bin/bash ${TORCH_UCC_SRC_DIR}/test/start_test.sh ${TORCH_UCC_SRC_DIR}/test/torch_bcast_test.py --backend=gloo +TORCH_UCC_SRC_DIR_WITH_XCCL_SUPPORT="${TORCH_UCC_SRC_DIR}_xccl" + +echo "UCC barrier" +/bin/bash ${TORCH_UCC_SRC_DIR_WITH_XCCL_SUPPORT}/test/start_test.sh ${TORCH_UCC_SRC_DIR_WITH_XCCL_SUPPORT}/test/torch_barrier_test.py --backend=gloo + +echo "UCC alltoall" +/bin/bash ${TORCH_UCC_SRC_DIR_WITH_XCCL_SUPPORT}/test/start_test.sh ${TORCH_UCC_SRC_DIR_WITH_XCCL_SUPPORT}/test/torch_alltoall_test.py --backend=gloo + +echo "UCC alltoallv" +/bin/bash ${TORCH_UCC_SRC_DIR_WITH_XCCL_SUPPORT}/test/start_test.sh 
${TORCH_UCC_SRC_DIR_WITH_XCCL_SUPPORT}/test/torch_alltoallv_test.py --backend=gloo + +echo "UCC allgather" +/bin/bash ${TORCH_UCC_SRC_DIR_WITH_XCCL_SUPPORT}/test/start_test.sh ${TORCH_UCC_SRC_DIR_WITH_XCCL_SUPPORT}/test/torch_allgather_test.py --backend=gloo + +echo "UCC allreduce" +/bin/bash ${TORCH_UCC_SRC_DIR_WITH_XCCL_SUPPORT}/test/start_test.sh ${TORCH_UCC_SRC_DIR_WITH_XCCL_SUPPORT}/test/torch_allreduce_test.py --backend=gloo + +echo "UCC broadcast" +/bin/bash ${TORCH_UCC_SRC_DIR_WITH_XCCL_SUPPORT}/test/start_test.sh ${TORCH_UCC_SRC_DIR_WITH_XCCL_SUPPORT}/test/torch_bcast_test.py --backend=gloo diff --git a/.ci/scripts/run_tests_torch_xccl.sh b/.ci/scripts/run_tests_torch_xccl.sh new file mode 100755 index 0000000..b817470 --- /dev/null +++ b/.ci/scripts/run_tests_torch_xccl.sh @@ -0,0 +1,24 @@ +#!/bin/bash -eEx +set -o pipefail + +command -v mpirun +export UCX_WARN_UNUSED_ENV_VARS=n +ucx_info -e -u t + +echo "XCCL allreduce" +/bin/bash ${TORCH_UCC_SRC_DIR}/test/start_test.sh ${TORCH_UCC_SRC_DIR}/test/torch_allreduce_test.py --backend=gloo + +echo "XCCL alltoall" +/bin/bash ${TORCH_UCC_SRC_DIR}/test/start_test.sh ${TORCH_UCC_SRC_DIR}/test/torch_alltoall_test.py --backend=gloo + +echo "XCCL alltoallv" +/bin/bash ${TORCH_UCC_SRC_DIR}/test/start_test.sh ${TORCH_UCC_SRC_DIR}/test/torch_alltoallv_test.py --backend=gloo + +echo "XCCL barrier" +/bin/bash ${TORCH_UCC_SRC_DIR}/test/start_test.sh ${TORCH_UCC_SRC_DIR}/test/torch_barrier_test.py --backend=gloo + +echo "XCCL allgather" +/bin/bash ${TORCH_UCC_SRC_DIR}/test/start_test.sh ${TORCH_UCC_SRC_DIR}/test/torch_allgather_test.py --backend=gloo + +echo "XCCL broadcast" +/bin/bash ${TORCH_UCC_SRC_DIR}/test/start_test.sh ${TORCH_UCC_SRC_DIR}/test/torch_bcast_test.py --backend=gloo diff --git a/.ci/scripts/run_tests_ucc.sh b/.ci/scripts/run_tests_ucc.sh new file mode 100755 index 0000000..bed2dd5 --- /dev/null +++ b/.ci/scripts/run_tests_ucc.sh @@ -0,0 +1,8 @@ +#!/bin/bash -eEx +set -o pipefail + +UCC_SRC_DIR="${TORCH_UCC_SRC_DIR}/ucc" +cd "${UCC_SRC_DIR}/build" + +export UCX_WARN_UNUSED_ENV_VARS=n +make gtest diff --git a/.github/workflows/main.yaml b/.github/workflows/main.yaml index 8e37481..f1b10d6 100644 --- a/.github/workflows/main.yaml +++ b/.github/workflows/main.yaml @@ -4,7 +4,7 @@ on: [push, pull_request] env: OPENUCX_LINK: https://github.com/openucx/ucx.git - XCCL_LINK: https://github.com/openucx/xccl.git + UCC_LINK: https://github.com/openucx/ucc.git jobs: tests: @@ -25,32 +25,31 @@ jobs: ./autogen.sh ./contrib/configure-release-mt --without-java --disable-numa --prefix=/opt/ucx make -j install - - name: Get XCCL + - name: Get UCC run: | - git clone ${XCCL_LINK} /tmp/xccl - cd /tmp/xccl + git clone ${UCC_LINK} /tmp/ucc + cd /tmp/ucc ./autogen.sh - ./configure --with-ucx=/opt/ucx --prefix=/opt/xccl + ./configure --with-ucx=/opt/ucx --prefix=/opt/ucc make -j install - uses: actions/checkout@v1 - - name: Build with UCX and XCCL + - name: Build with UCX and UCC run: | - UCX_HOME=/opt/ucx/ XCCL_HOME=/opt/xccl/ WITH_CUDA=no python setup.py install + UCX_HOME=/opt/ucx/ UCC_HOME=/opt/ucc/ WITH_CUDA=no python setup.py install - name: Tests run: | - export LD_LIBRARY_PATH=/opt/ucx/lib:/opt/xccl/lib:$LD_LIBRARY_PATH - export TORCH_UCC_XCCL_TLS=ucx + export LD_LIBRARY_PATH=/opt/ucx/lib:/opt/ucc/lib:$LD_LIBRARY_PATH /opt/ucx/bin/ucx_info -e -u t export UCX_LOG_LEVEL=info - echo "XCCL allreduce" - /bin/bash ./test/start_test.sh ./test/torch_allreduce_test.py --backend=gloo - echo "XCCL alltoall" + echo "UCC barrier" + /bin/bash 
diff --git a/.github/workflows/main.yaml b/.github/workflows/main.yaml
index 8e37481..f1b10d6 100644
--- a/.github/workflows/main.yaml
+++ b/.github/workflows/main.yaml
@@ -4,7 +4,7 @@ on: [push, pull_request]
 
 env:
   OPENUCX_LINK: https://github.com/openucx/ucx.git
-  XCCL_LINK: https://github.com/openucx/xccl.git
+  UCC_LINK: https://github.com/openucx/ucc.git
 
 jobs:
   tests:
@@ -25,32 +25,31 @@
         ./autogen.sh
         ./contrib/configure-release-mt --without-java --disable-numa --prefix=/opt/ucx
         make -j install
-    - name: Get XCCL
+    - name: Get UCC
       run: |
-        git clone ${XCCL_LINK} /tmp/xccl
-        cd /tmp/xccl
+        git clone ${UCC_LINK} /tmp/ucc
+        cd /tmp/ucc
         ./autogen.sh
-        ./configure --with-ucx=/opt/ucx --prefix=/opt/xccl
+        ./configure --with-ucx=/opt/ucx --prefix=/opt/ucc
         make -j install
     - uses: actions/checkout@v1
-    - name: Build with UCX and XCCL
+    - name: Build with UCX and UCC
       run: |
-        UCX_HOME=/opt/ucx/ XCCL_HOME=/opt/xccl/ WITH_CUDA=no python setup.py install
+        UCX_HOME=/opt/ucx/ UCC_HOME=/opt/ucc/ WITH_CUDA=no python setup.py install
     - name: Tests
       run: |
-        export LD_LIBRARY_PATH=/opt/ucx/lib:/opt/xccl/lib:$LD_LIBRARY_PATH
-        export TORCH_UCC_XCCL_TLS=ucx
+        export LD_LIBRARY_PATH=/opt/ucx/lib:/opt/ucc/lib:$LD_LIBRARY_PATH
        /opt/ucx/bin/ucx_info -e -u t
        export UCX_LOG_LEVEL=info
-        echo "XCCL allreduce"
-        /bin/bash ./test/start_test.sh ./test/torch_allreduce_test.py --backend=gloo
-        echo "XCCL alltoall"
+        echo "UCC barrier"
+        /bin/bash ./test/start_test.sh ./test/torch_barrier_test.py --backend=gloo
+        echo "UCC alltoall"
         /bin/bash ./test/start_test.sh ./test/torch_alltoall_test.py --backend=gloo
-        echo "XCCL alltoallv"
+        echo "UCC alltoallv"
         /bin/bash ./test/start_test.sh ./test/torch_alltoallv_test.py --backend=gloo
-        echo "XCCL barrier"
-        /bin/bash ./test/start_test.sh ./test/torch_barrier_test.py --backend=gloo
-        echo "XCCL allgather"
+        echo "UCC allgather"
         /bin/bash ./test/start_test.sh ./test/torch_allgather_test.py --backend=gloo
-        echo "XCCL broadcast"
+        echo "UCC allreduce"
+        /bin/bash ./test/start_test.sh ./test/torch_allreduce_test.py --backend=gloo
+        echo "UCC broadcast"
         /bin/bash ./test/start_test.sh ./test/torch_bcast_test.py --backend=gloo
diff --git a/include/torch_ucc.hpp b/include/torch_ucc.hpp
index fafda36..4b6ad8c 100644
--- a/include/torch_ucc.hpp
+++ b/include/torch_ucc.hpp
@@ -1,18 +1,14 @@
 /**
- * * Copyright (C) Mellanox Technologies Ltd. 2001-2020. ALL RIGHTS RESERVED.
- * *
- * * See file LICENSE for terms.
- * */
+ * Copyright (C) Mellanox Technologies Ltd. 2020-2021. ALL RIGHTS RESERVED.
+ * See file LICENSE for terms.
+ */
 
 #pragma once
 
 #include
-#include
 #include
 #include
-#include
-#include
 #include
 #include
@@ -21,57 +17,143 @@
 #include
 #include
 #include
-
-#include
-#include
+#ifdef USE_CUDA
+#include
+#include
+#endif
+#include
+#include
 
 namespace c10d {
 
+#define TORCH_UCC_DEVICE_NOT_SET -2
+
+#define TORCH_UCX_COMM_BITS 15
+#define TORCH_UCX_RANK_BITS 16
+#define TORCH_UCX_TAG_BITS 32
+#define TORCH_UCX_OOB_BITS 1
+
+#define TORCH_UCX_COMM_BITS_OFFSET 0
+#define TORCH_UCX_RANK_BITS_OFFSET TORCH_UCX_COMM_BITS
+#define TORCH_UCX_TAG_BITS_OFFSET (TORCH_UCX_COMM_BITS + TORCH_UCX_RANK_BITS)
+#define TORCH_UCX_OOB_BITS_OFFSET \
+  (TORCH_UCX_COMM_BITS + TORCH_UCX_RANK_BITS + TORCH_UCX_TAG_BITS)
+
+#define TORCH_UCX_MAX_COMM ((((uint64_t)1) << TORCH_UCX_COMM_BITS) - 1)
+#define TORCH_UCX_MAX_RANK ((((uint64_t)1) << TORCH_UCX_RANK_BITS) - 1)
+#define TORCH_UCX_MAX_TAG ((((uint64_t)1) << TORCH_UCX_TAG_BITS) - 1)
+#define TORCH_UCX_MAX_OOB ((((uint64_t)1) << TORCH_UCX_OOB_BITS) - 1)
+
+#define TORCH_UCX_COMM_MASK (TORCH_UCX_MAX_COMM << TORCH_UCX_COMM_BITS_OFFSET)
+#define TORCH_UCX_RANK_MASK (TORCH_UCX_MAX_RANK << TORCH_UCX_RANK_BITS_OFFSET)
+#define TORCH_UCX_TAG_MASK (TORCH_UCX_MAX_TAG << TORCH_UCX_TAG_BITS_OFFSET)
+#define TORCH_UCX_OOB_MASK (TORCH_UCX_MAX_OOB << TORCH_UCX_OOB_BITS_OFFSET)
+
+#define TORCH_UCX_MAKE_P2P_TAG(_tag, _rank, _comm) \
+  ((((uint64_t)(_tag)) << TORCH_UCX_TAG_BITS_OFFSET) | \
+   (((uint64_t)(_rank)) << TORCH_UCX_RANK_BITS_OFFSET) | \
+   (((uint64_t)(_comm)) << TORCH_UCX_COMM_BITS_OFFSET))
+
+#define TORCH_UCX_MAKE_OOB_TAG(_tag, _rank, _comm) \
+  ((((uint64_t)(_tag)) << TORCH_UCX_OOB_BITS_OFFSET) | \
+   (((uint64_t)(_rank)) << TORCH_UCX_RANK_BITS_OFFSET) | \
+   (((uint64_t)(_rank)) << TORCH_UCX_COMM_BITS_OFFSET))
+
+#define TORCH_UCX_MAKE_SEND_TAG(_ucp_tag, _tag, _rank, _comm) \
+  do { \
+    (_ucp_tag) = TORCH_UCX_MAKE_P2P_TAG((_tag), (_rank), (_comm)); \
+  } while (0)
+
+#define TORCH_UCX_ANY_SOURCE (TORCH_UCX_MAX_RANK - 1)
+#define TORCH_UCX_ANY_SOURCE_MASK (~TORCH_UCX_RANK_MASK)
+#define TORCH_UCX_SPECIFIC_SOURCE_MASK ((uint64_t)-1)
+
+#define TORCH_UCX_MAKE_RECV_TAG(_ucp_tag, _ucp_tag_mask, _tag, _rank, _comm) \
+  do { \
+    (_ucp_tag) = TORCH_UCX_MAKE_P2P_TAG((_tag), (_rank), (_comm)); \
+    if ((_rank) == TORCH_UCX_ANY_SOURCE) { \
+      (_ucp_tag_mask) = TORCH_UCX_ANY_SOURCE_MASK; \
+    } else { \
+      (_ucp_tag_mask) = TORCH_UCX_SPECIFIC_SOURCE_MASK; \
+    } \
+  } while (0)
+
+#define TORCH_UCX_MAKE_OOB_SEND_TAG(_ucp_tag, _tag, _rank, _comm) \
+  do { \
+    (_ucp_tag) = TORCH_UCX_MAKE_OOB_TAG((_tag), (_rank), (_comm)); \
+  } while (0)
+
+#define TORCH_UCX_MAKE_OOB_RECV_TAG( \
+    _ucp_tag, _ucp_tag_mask, _tag, _rank, _comm) \
+  do { \
+    (_ucp_tag) = TORCH_UCX_MAKE_OOB_TAG((_tag), (_rank), (_comm)); \
+    (_ucp_tag_mask) = (uint64_t)-1; \
+  } while (0)
+
+enum torch_ucx_tag_type_t { TORCH_UCX_P2P_TAG, TORCH_UCX_OOB_TAG };
+
+class CommPG;
+
+class CommBase {
+ public:
+  CommBase() {}
+  virtual void progress() = 0;
+  virtual ~CommBase() {}
+};
+
 class ProcessGroupUCC : public ProcessGroup {
  public:
-  class WorkUCX : public ProcessGroup::Work {
+  class WorkData {
    public:
-    WorkUCX(torch_ucx_request_t* request, torch_ucx_comm_t* ucx_comm)
-        : req(request), comm(ucx_comm) {}
-    ~WorkUCX() override;
-    bool isCompleted() override;
-    bool isSuccess() const override;
-    bool wait(std::chrono::milliseconds timeout = kUnsetTimeout) override;
+    WorkData() {}
+    virtual ~WorkData() {}
+  };
+  class AlltoallWorkData : public WorkData {
+   public:
+    AlltoallWorkData(int size)
+        : send_lengths(size),
+          send_offsets(size),
+          recv_lengths(size),
+          recv_offsets(size) {}
+    std::vector<uint32_t> send_lengths;
+    std::vector<uint32_t> send_offsets;
+    std::vector<uint32_t> recv_lengths;
+    std::vector<uint32_t> recv_offsets;
+  };
 
-   protected:
-    torch_ucx_request_t* req;
-    torch_ucx_comm_t* comm;
-    friend class ProcessGroupUCC;
+  class AllgatherWorkData : public WorkData {
+   public:
+    AllgatherWorkData(int size)
+        : recv_lengths(size),
+          recv_offsets(size) {}
+    std::vector<uint32_t> recv_lengths;
+    std::vector<uint32_t> recv_offsets;
   };
 
-  class WorkColl : public ProcessGroup::Work {
+  class WorkUCC : public ProcessGroup::Work {
+    friend class ProcessGroupUCC;
+    friend class CommPG;
+
    public:
-    WorkColl(
-        torch_ucc_coll_ops_t ops,
-        std::list>& list)
-        : coll_ops(ops),
-          work_list(list),
-          external_progress(false),
-          blocking_wait(false),
-          scratch(nullptr) {}
-
-    ~WorkColl() override;
+    WorkUCC(
+        OpType opType,
+        ucc_status_t status,
+        ucc_coll_req_h request,
+        CommBase* comm)
+        : ProcessGroup::Work(-1, opType),
+          status_(status),
+          request_(request),
+          comm_(comm) {}
+    ~WorkUCC();
     bool isCompleted() override;
    bool isSuccess() const override;
    bool wait(std::chrono::milliseconds timeout = kUnsetTimeout) override;
+    void finalize();
+    std::unique_ptr<WorkData> data;
 
   protected:
-    torch_ucc_coll_ops_t coll_ops;
-    std::list>& work_list;
-    std::list>::iterator work_list_entry;
-    bool external_progress;
-    bool blocking_wait;
-    char* scratch;
-    std::vector src;
-    std::vector dst;
-    torch_ucc_coll_request_t* coll_req{};
-
-    friend class ProcessGroupUCC;
+    ucc_status_t status_;
+    ucc_coll_req_h request_;
+    CommBase* comm_;
  };
 
  explicit ProcessGroupUCC(
@@ -79,6 +161,8 @@ class ProcessGroupUCC : public ProcessGroup {
      int rank = -1,
      int size = -1);
 
+  void initComm(c10::Device dev);
+
  ~ProcessGroupUCC() override;
 
  c10::intrusive_ptr<ProcessGroup::Work> broadcast(
@@ -167,31 +251,193 @@ class ProcessGroupUCC : public ProcessGroup {
 protected:
  c10::intrusive_ptr<Store> store_;
-  torch_ucx_comm_t* ucx_comm{};
-  torch_ucc_coll_comm_t* coll_comm;
-  torch_ucc_coll_ops_t coll_ops{};
-  std::mutex pg_mutex;
+  std::shared_ptr<CommPG> comm;
+  uint32_t comm_id;
+  std::vector<ucp_ep_h> eps;
+  ucc_team_h team;
+};
+
+class CommUCX : public CommBase {
+ public:
+  ucp_context_h context;
+  ucp_worker_h worker;
+
+ public:
+  void progress() override;
+  CommUCX(int comm_size);
+  ~CommUCX();
+};
+
+class CommUCC : public CommBase {
+ public:
+  ucc_lib_h lib;
+  ucc_context_h context;
+
+ public:
+  void progress() override;
+  CommUCC(int comm_size);
+  ~CommUCC();
+};
+
+class CommPG {
+  CommUCX ucx_comm;
+  CommUCC ucc_comm;
+  c10::DeviceIndex device_index;
+  std::mutex mutex;
  std::thread progress_thread;
-  bool stop_progress_loop;
-  std::list> progress_list;
  std::condition_variable queue_produce_cv;
  std::condition_variable queue_consume_cv;
+  std::deque<c10::intrusive_ptr<ProcessGroupUCC::WorkUCC>> progress_queue;
+  bool stop_progress_loop;
+
+ public:
+  c10::DeviceIndex cuda_device_index;
+  CommPG(int comm_size, c10::Device dev) :
+      ucx_comm(comm_size),
+      ucc_comm(comm_size),
+      cuda_device_index(TORCH_UCC_DEVICE_NOT_SET) {
+    if (dev.is_cuda()) {
+      cuda_device_index = dev.index();
+    }
+    stop_progress_loop = false;
+    progress_thread = std::thread(&CommPG::progress_loop, this);
+    pthread_setname_np(progress_thread.native_handle(), "ucc-progress");
+  }
+  ~CommPG() {
+    std::unique_lock<std::mutex> lock(mutex);
+    queue_consume_cv.wait(lock, [&] { return progress_queue.empty(); });
+    stop_progress_loop = true;
+    lock.unlock();
+    queue_produce_cv.notify_all();
+    progress_thread.join();
+  }
+
+  void ucx_connect_eps(
+      std::vector<ucp_ep_h>& eps,
+      int rank,
+      int size,
+      const c10::intrusive_ptr<Store>& store);
+
+  void ucx_disconnect_eps(
+      std::vector<ucp_ep_h>& eps,
+      const c10::intrusive_ptr<Store>& store);
+
+  void ucc_create_team(
+      ucc_team_h& team,
+      int rank,
+      int size,
+      const c10::intrusive_ptr<Store>& store);
+
+  void ucc_destroy_team(ucc_team_h& team);
+
+  c10::intrusive_ptr<ProcessGroupUCC::WorkUCC> enqueue_p2p(
+      OpType opType,
+      ucc_coll_req_h request) {
+    if (request == nullptr) {
+      // p2p request completed immediately; don't save it to the progress queue
+      return c10::make_intrusive<ProcessGroupUCC::WorkUCC>(
+          opType, UCC_OK, request, &ucx_comm);
+    }
+    std::unique_lock<std::mutex> lock(mutex);
+    auto work = c10::make_intrusive<ProcessGroupUCC::WorkUCC>(
+        opType, UCC_INPROGRESS, request, &ucx_comm);
+    progress_queue.push_back(work);
+    lock.unlock();
+    queue_produce_cv.notify_one();
+    return work;
+  }
+
+  c10::intrusive_ptr<ProcessGroupUCC::WorkUCC> enqueue_collective(
+      OpType opType,
+      ucc_coll_args_t& coll,
+      std::unique_ptr<ProcessGroupUCC::WorkData> data,
+      ucc_team_h& team) {
+    std::unique_lock<std::mutex> lock(mutex);
+    ucc_coll_req_h request;
+    ucc_status_t st;
+    st = ucc_collective_init(&coll, &request, team);
+    if (st != UCC_OK) {
+      LOG(ERROR) << "failed to init collective: " << ucc_status_string(st);
+      throw std::runtime_error(ucc_status_string(st));
+    }
+    st = ucc_collective_post(request);
+    if (st != UCC_OK) {
+      LOG(ERROR) << "failed to post collective: " << ucc_status_string(st);
+      throw std::runtime_error(ucc_status_string(st));
+    }
+    auto work = c10::make_intrusive<ProcessGroupUCC::WorkUCC>(
+        opType, UCC_INPROGRESS, request, &ucc_comm);
+    work->data = std::move(data);
+    progress_queue.push_back(work);
+    lock.unlock();
+    queue_produce_cv.notify_one();
+    return work;
+  }
 
-  void progress_loop(c10::DeviceIndex default_dev_idx);
-  c10::intrusive_ptr enqueue_request(
-      torch_ucc_coll_request_t* req,
-      void* scratch);
-  torch_ucc_coll_comm_t* get_coll_comm();
-
- private:
-  struct ucc_config {
-    bool enable_progress_thread;
-    bool blocking_wait;
-    bool high_priority_stream;
-  } config{};
-
-  void read_config();
-  void check_tensor(const std::vector& tensors);
+  static std::shared_ptr<CommPG> get_comm(uint32_t& id, c10::Device dev, int comm_size) {
+    static std::mutex m;
+    static std::weak_ptr<CommPG> comm;
+    static uint32_t comm_id;
+
+    std::lock_guard<std::mutex> lock(m);
+    id = (comm_id++ % TORCH_UCX_COMM_BITS);
+    std::shared_ptr<CommPG> shared_comm = comm.lock();
+    if (!shared_comm) {
+      shared_comm = std::make_shared<CommPG>(comm_size, dev);
+      comm = shared_comm;
+    } else {
+      if (dev.is_cuda()) {
+        if ((shared_comm->cuda_device_index != TORCH_UCC_DEVICE_NOT_SET) &&
+            (shared_comm->cuda_device_index != dev.index())) {
+          LOG(ERROR)
+              << "ucc communicator was initialized with different cuda device, "
+              << "multi device is not supported";
+          throw std::runtime_error(ucc_status_string(UCC_ERR_NOT_SUPPORTED));
+        }
+        shared_comm->cuda_device_index = dev.index();
+      }
+    }
+    return shared_comm;
+  }
+
+  void progress_loop() {
+    std::unique_lock<std::mutex> lock(mutex);
+#ifdef USE_CUDA
+    bool device_set = false;
+#endif
+    while (!stop_progress_loop) {
+      if (progress_queue.empty()) {
+        queue_produce_cv.wait(lock);
+        continue;
+      }
+      auto work = progress_queue.front();
+      progress_queue.pop_front();
+      lock.unlock();
+      queue_consume_cv.notify_one();
+#ifdef USE_CUDA
+      if ((!device_set) && (cuda_device_index != TORCH_UCC_DEVICE_NOT_SET)) {
+        c10::cuda::set_device(cuda_device_index);
+        device_set = true;
+      }
+#endif
+      while (work->request_->status == UCC_INPROGRESS) {
+        work->comm_->progress();
+      }
+      lock.lock();
+      work->finalize();
+    }
+  }
+
+  ucc_coll_req_h send_nb(
+      ucp_ep_h ep,
+      void* data,
+      ucs_memory_type_t mtype,
+      size_t size,
+      ucp_tag_t ucp_tag);
+
+  ucc_coll_req_h recv_nb(
+      void* data,
+      ucs_memory_type_t mtype,
+      size_t size,
+      ucp_tag_t ucp_tag,
+      ucp_tag_t ucp_tag_mask);
+};
 
 } // namespace c10d
diff --git a/include/torch_ucc_ops.hpp b/include/torch_ucc_ops.hpp
deleted file mode 100644
index 30ef1b0..0000000
--- a/include/torch_ucc_ops.hpp
+++ /dev/null
@@ -1,168 +0,0 @@
-/**
- * * Copyright (C) Mellanox Technologies Ltd. 2020-2021. ALL RIGHTS RESERVED.
- * *
- * * See file LICENSE for terms.
- * */
-
-#pragma once
-
-#include
-#include
-#include
-#include
-#include
-#ifdef USE_CUDA
-#include
-#include
-#endif
-namespace c10d {
-
-enum torch_ucc_status_t {
-  TORCH_UCC_OK = 0,
-  TORCH_UCC_INPROGRESS = 1,
-  TORCH_UCC_OPERATION_INITIALIZED = 2,
-  TORCH_UCC_ERROR = -1,
-};
-
-struct torch_ucc_coll_config_t {
-  bool blocking_wait;
-  bool high_priority_stream;
-};
-
-struct torch_ucc_coll_comm_t {
-#ifdef USE_CUDA
-  std::unique_ptr stream;
-  std::queue> event_pool;
-  std::mutex event_pool_mutex;
-#endif
-  torch_ucc_coll_config_t config;
-};
-
-struct torch_ucc_coll_request_t {
-  torch_ucc_coll_comm_t *coll_comm;
-  c10::Device device;
-  std::vector src;
-  std::vector dst;
-#ifdef USE_CUDA
-  std::unique_ptr tnsr_ready;
-  std::unique_ptr coll_finished;
-#endif
-  torch_ucc_coll_request_t(): device(c10::DeviceType::CPU) {}
-  ~torch_ucc_coll_request_t() {
-#ifdef USE_CUDA
-    if (device.is_cuda()) {
-      std::lock_guard lock(coll_comm->event_pool_mutex);
-      coll_comm->event_pool.push(std::move(tnsr_ready));
-      coll_comm->event_pool.push(std::move(coll_finished));
-    }
-#endif
-  }
-};
-
-struct torch_ucc_coll_ops_t {
-  torch_ucc_status_t (*coll_comm_init)(
-      torch_ucx_comm_t* p2p_comm,
-      torch_ucc_coll_config_t* coll_config,
-      torch_ucc_coll_comm_t** coll_comm);
-
-  torch_ucc_status_t (*allgather)(
-      torch_ucc_coll_comm_t* coll_comm,
-      std::vector& input_tensor,
-      std::vector& output_tensors,
-      torch_ucc_coll_request_t** request);
-
-  torch_ucc_status_t (*alltoall)(
-      torch_ucc_coll_comm_t* coll_comm,
-      at::Tensor& input_tensor,
-      at::Tensor& output_tensor,
-      torch_ucc_coll_request_t** request);
-
-  torch_ucc_status_t (*alltoallv)(
-      torch_ucc_coll_comm_t* coll_comm,
-      at::Tensor& input_tensor,
-      uint32_t* send_lengths,
-      uint32_t* send_offsets,
-      at::Tensor& output_tensor,
-      uint32_t* recv_lengths,
-      uint32_t* recv_offsets,
-      torch_ucc_coll_request_t** request);
-
-  torch_ucc_status_t (*allreduce)(
-      torch_ucc_coll_comm_t* coll_comm,
-      std::vector& tensors,
-      const AllreduceOptions& opts,
-      torch_ucc_coll_request_t** request);
-
-  torch_ucc_status_t (*barrier)(
-      torch_ucc_coll_comm_t* coll_comm,
-      torch_ucc_coll_request_t** request);
-
-  torch_ucc_status_t (*broadcast)(
-      torch_ucc_coll_comm_t* coll_comm,
-      std::vector& tensors,
-      int root,
-      torch_ucc_coll_request_t** request);
-
-  torch_ucc_status_t (*coll_progress)(torch_ucc_coll_request_t* request);
-
-  torch_ucc_status_t (*coll_test)(torch_ucc_coll_request_t* request);
-
-  torch_ucc_status_t (*coll_fence)(torch_ucc_coll_request_t* request);
-
-  torch_ucc_status_t (*coll_finalize)(torch_ucc_coll_request_t* request);
-
-  torch_ucc_status_t (*coll_comm_close)(torch_ucc_coll_comm_t* coll_comm);
-};
-
-extern torch_ucc_coll_ops_t xccl_coll_ops;
-
-inline void torch_ucc_coll_request_init(
-    torch_ucc_coll_comm_t* coll_comm,
-    torch_ucc_coll_request_t* request,
-    std::vector* srcPtr,
-    std::vector* dstPtr) {
-  request->coll_comm = coll_comm;
-  if (srcPtr) {
-    request->src = *srcPtr;
-    request->device = request->src[0].device();
-#ifdef USE_CUDA
-    request->tnsr_ready = nullptr;
-    request->coll_finished = nullptr;
-    if (request->device.is_cuda()) {
-      std::lock_guard lock(coll_comm->event_pool_mutex);
-      if (coll_comm->stream == nullptr) {
-        coll_comm->stream = std::make_unique(
-            at::cuda::getStreamFromPool(coll_comm->config.high_priority_stream,
-                                        request->device.index()));
-      }
-      if (coll_comm->event_pool.empty()) {
-        request->tnsr_ready = std::make_unique();
-        request->coll_finished = std::make_unique();
-      } else {
-        request->tnsr_ready = std::move(coll_comm->event_pool.front());
-        coll_comm->event_pool.pop();
-        request->coll_finished = std::move(coll_comm->event_pool.front());
-        coll_comm->event_pool.pop();
-      }
-      request->tnsr_ready->record(
-          at::cuda::getCurrentCUDAStream(request->device.index()));
-      request->tnsr_ready->block(*coll_comm->stream);
-
-    }
-#endif
-  }
-  if (dstPtr) {
-    request->dst = *dstPtr;
-    if (request->src[0].device() != request->dst[0].device()) {
-      fprintf(stderr, "ProcessGroupUCC: multidevice is not supported\n");
-    }
-  }
-}
-
-inline torch_ucc_status_t torch_ucc_coll_ops_init(
-    torch_ucc_coll_ops_t* coll_ops) {
-  *coll_ops = xccl_coll_ops;
-  return TORCH_UCC_OK;
-}
-
-}; // namespace c10d
diff --git a/include/torch_ucc_sendrecv.hpp b/include/torch_ucc_sendrecv.hpp
deleted file mode 100644
index 2e75ed4..0000000
--- a/include/torch_ucc_sendrecv.hpp
+++ /dev/null
@@ -1,271 +0,0 @@
-/**
- * * Copyright (C) Mellanox Technologies Ltd. 2001-2020. ALL RIGHTS RESERVED.
- * *
- * * See file LICENSE for terms.
- * */
-
-#pragma once
-
-#include
-#include
-#include
-#include
-
-#include
-#include
-
-namespace c10d {
-
-#define TORCH_UCX_COMM_BITS 15
-#define TORCH_UCX_RANK_BITS 16
-#define TORCH_UCX_TAG_BITS 32
-#define TORCH_UCX_OOB_BITS 1
-
-#define TORCH_UCX_COMM_BITS_OFFSET 0
-#define TORCH_UCX_RANK_BITS_OFFSET TORCH_UCX_COMM_BITS
-#define TORCH_UCX_TAG_BITS_OFFSET (TORCH_UCX_COMM_BITS + TORCH_UCX_RANK_BITS)
-#define TORCH_UCX_OOB_BITS_OFFSET \
-  (TORCH_UCX_COMM_BITS + TORCH_UCX_RANK_BITS + TORCH_UCX_TAG_BITS)
-
-#define TORCH_UCX_MAX_COMM ((((uint64_t)1) << TORCH_UCX_COMM_BITS) - 1)
-#define TORCH_UCX_MAX_RANK ((((uint64_t)1) << TORCH_UCX_RANK_BITS) - 1)
-#define TORCH_UCX_MAX_TAG ((((uint64_t)1) << TORCH_UCX_TAG_BITS) - 1)
-#define TORCH_UCX_MAX_OOB ((((uint64_t)1) << TORCH_UCX_OOB_BITS) - 1)
-
-#define TORCH_UCX_COMM_MASK (TORCH_UCX_MAX_COMM << TORCH_UCX_COMM_BITS_OFFSET)
-#define TORCH_UCX_RANK_MASK (TORCH_UCX_MAX_RANK << TORCH_UCX_RANK_BITS_OFFSET)
-#define TORCH_UCX_TAG_MASK (TORCH_UCX_MAX_TAG << TORCH_UCX_TAG_BITS_OFFSET)
-#define TORCH_UCX_OOB_MASK (TORCH_UCX_MAX_OOB << TORCH_UCX_OOB_BITS_OFFSET)
-
-#define TORCH_UCX_MAKE_P2P_TAG(_tag, _rank, _comm) \
-  ((((uint64_t)(_tag)) << TORCH_UCX_TAG_BITS_OFFSET) | \
-   (((uint64_t)(_rank)) << TORCH_UCX_RANK_BITS_OFFSET) | \
-   (((uint64_t)(_comm)) << TORCH_UCX_COMM_BITS_OFFSET))
-
-#define TORCH_UCX_MAKE_OOB_TAG(_tag, _rank, _comm) \
-  ((((uint64_t)(_tag)) << TORCH_UCX_OOB_BITS_OFFSET) | \
-   (((uint64_t)(_rank)) << TORCH_UCX_RANK_BITS_OFFSET) | \
-   (((uint64_t)(_rank)) << TORCH_UCX_COMM_BITS_OFFSET))
-
-#define TORCH_UCX_MAKE_SEND_TAG(_ucp_tag, _tag, _rank, _comm) \
-  do { \
-    (_ucp_tag) = TORCH_UCX_MAKE_P2P_TAG((_tag), (_rank), (_comm)); \
-  } while (0)
-
-#define TORCH_UCX_ANY_SOURCE (TORCH_UCX_MAX_RANK - 1)
-#define TORCH_UCX_ANY_SOURCE_MASK (~TORCH_UCX_RANK_MASK)
-#define TORCH_UCX_SPECIFIC_SOURCE_MASK ((uint64_t)-1)
-
-#define TORCH_UCX_MAKE_RECV_TAG(_ucp_tag, _ucp_tag_mask, _tag, _rank, _comm) \
-  do { \
-    (_ucp_tag) = TORCH_UCX_MAKE_P2P_TAG((_tag), (_rank), (_comm)); \
-    if ((_rank) == TORCH_UCX_ANY_SOURCE) { \
-      (_ucp_tag_mask) = TORCH_UCX_ANY_SOURCE_MASK; \
-    } else { \
-      (_ucp_tag_mask) = TORCH_UCX_SPECIFIC_SOURCE_MASK; \
-    } \
-  } while (0)
-
-#define TORCH_UCX_MAKE_OOB_SEND_TAG(_ucp_tag, _tag, _rank, _comm) \
-  do { \
-    (_ucp_tag) = TORCH_UCX_MAKE_OOB_TAG((_tag), (_rank), (_comm)); \
-  } while (0)
-
-#define TORCH_UCX_MAKE_OOB_RECV_TAG( \
-    _ucp_tag, _ucp_tag_mask, _tag, _rank, _comm) \
-  do { \
-    (_ucp_tag) = TORCH_UCX_MAKE_OOB_TAG((_tag), (_rank), (_comm)); \
-    (_ucp_tag_mask) = (uint64_t)-1; \
-  } while (0)
-
-enum torch_ucx_status_t {
-  TORCH_UCX_OK = 0,
-  TORCH_UCX_INPROGRESS = 1,
-  TORCH_UCX_ERROR = -1,
-};
-
-enum torch_ucx_tag_type_t {
-  TORCH_UCX_P2P_TAG,
-  TORCH_UCX_OOB_TAG
-};
-
-enum torch_ucx_request_status_t {
-  TORCH_UCX_REQUEST_ACTIVE,
-  TORCH_UCX_REQUEST_DONE,
-};
-
-struct torch_ucx_request_t {
-  torch_ucx_request_status_t status;
-};
-
-struct torch_ucx_comm_t {
-  int size;
-  int rank;
-  ucp_context_h ctx;
-  ucp_ep_h* eps;
-  ucp_worker_h worker;
-};
-
-static inline void torch_ucx_request_free(torch_ucx_request_t* request) {
-  request->status = TORCH_UCX_REQUEST_ACTIVE;
-  ucp_request_free(request);
-}
-
-static inline torch_ucx_status_t torch_ucx_check_req(ucs_status_ptr_t st) {
-  if (UCS_PTR_IS_ERR(st)) {
-    fprintf(
-        stderr, "ProcessGroupUCC: %s\n", ucs_status_string(UCS_PTR_STATUS(st)));
-    return TORCH_UCX_ERROR;
-  }
-
-  return TORCH_UCX_OK;
-}
-
-void torch_ucx_send_cmpl_cb(
-    void* request,
-    ucs_status_t status,
-    void* user_data);
-
-void torch_ucx_recv_cmpl_cb(
-    void* request,
-    ucs_status_t status,
-    const ucp_tag_recv_info_t* info,
-    void* user_data);
-
-torch_ucx_status_t torch_ucx_comm_init(
-    torch_ucx_comm_t** comm,
-    int size,
-    int rank,
-    const c10::intrusive_ptr& store);
-
-void torch_ucx_comm_close(
-    torch_ucx_comm_t* comm,
-    const c10::intrusive_ptr& store);
-
-static inline torch_ucx_status_t torch_ucx_send_nb(
-    torch_ucx_comm_t* comm,
-    void* data,
-    ucs_memory_type_t mtype,
-    size_t size,
-    int dst_rank,
-    uint32_t tag,
-    torch_ucx_request_t** req,
-    torch_ucx_tag_type_t type) {
-  ucp_tag_t ucp_tag;
-  ucs_status_ptr_t st;
-  ucp_request_param_t params;
-
-  switch (type) {
-    case TORCH_UCX_P2P_TAG:
-      TORCH_UCX_MAKE_SEND_TAG(ucp_tag, tag, comm->rank, 0);
-      break;
-    case TORCH_UCX_OOB_TAG:
-      TORCH_UCX_MAKE_OOB_SEND_TAG(ucp_tag, tag, comm->rank, 0);
-      break;
-    default:
-      return TORCH_UCX_ERROR;
-  };
-  // fprintf(stderr, "rank %d send tag %" PRIu64 "(%d) shift %d\n", comm->rank,
-  //         ucp_tag, tag, TORCH_UCX_OOB_TAG_BITS_OFFSET);
-
-  params.op_attr_mask = UCP_OP_ATTR_FIELD_CALLBACK |
-      UCP_OP_ATTR_FIELD_DATATYPE | UCP_OP_ATTR_FIELD_MEMORY_TYPE;
-  params.datatype = ucp_dt_make_contig(size);
-  params.memory_type = mtype;
-  params.cb.send = torch_ucx_send_cmpl_cb;
-  st = ucp_tag_send_nbx(comm->eps[dst_rank], data, 1, ucp_tag, &params);
-  if (torch_ucx_check_req(st) != TORCH_UCX_OK) {
-    return TORCH_UCX_ERROR;
-  };
-  *req = reinterpret_cast(st);
-
-  return TORCH_UCX_OK;
-}
-
-static inline torch_ucx_status_t torch_ucx_recv_nb(
-    torch_ucx_comm_t* comm,
-    void* data,
-    ucs_memory_type_t mtype,
-    size_t size,
-    int src_rank,
-    uint32_t tag,
-    torch_ucx_request_t** req,
-    torch_ucx_tag_type_t type) {
-  ucp_tag_t ucp_tag, ucp_tag_mask;
-  ucs_status_ptr_t st;
-  ucp_request_param_t params;
-
-  switch (type) {
-    case TORCH_UCX_P2P_TAG:
-      TORCH_UCX_MAKE_RECV_TAG(ucp_tag, ucp_tag_mask, tag, src_rank, 0);
-      break;
-    case TORCH_UCX_OOB_TAG:
-      TORCH_UCX_MAKE_OOB_RECV_TAG(ucp_tag, ucp_tag_mask, tag, src_rank, 0);
-      break;
-    default:
-      return TORCH_UCX_ERROR;
-  };
-
-  // fprintf(stderr, "rank %d recv tag %" PRIu64 " (%d) mask %" PRIu64 "\n",
-  //         comm->rank, ucp_tag, tag, ucp_tag_mask );
-  params.op_attr_mask = UCP_OP_ATTR_FIELD_CALLBACK |
-      UCP_OP_ATTR_FIELD_DATATYPE | UCP_OP_ATTR_FIELD_MEMORY_TYPE;
-  params.datatype = ucp_dt_make_contig(size);
-  params.cb.recv = torch_ucx_recv_cmpl_cb;
-  params.memory_type = mtype;
-  st = ucp_tag_recv_nbx(comm->worker, data, 1, ucp_tag, ucp_tag_mask, &params);
-  if (torch_ucx_check_req(st) != TORCH_UCX_OK) {
-    return TORCH_UCX_ERROR;
-  };
-  *req = reinterpret_cast(st);
-  /*TODO: check request*/
-
-  return TORCH_UCX_OK;
-}
-
-static inline unsigned torch_ucx_comm_progress(torch_ucx_comm_t* comm) {
-  return ucp_worker_progress(comm->worker);
-}
-
-static inline torch_ucx_status_t torch_ucx_req_test(
-    torch_ucx_comm_t* comm,
-    torch_ucx_request_t** reqs,
-    int n_reqs,
-    int* completed_idx,
-    int poll_count,
-    int n_completions_required) {
-  int n_polls = 0;
-  int n_completed;
-
-  if (n_completions_required == 0) {
-    return TORCH_UCX_OK;
-  }
-
-  while (poll_count < 0 || n_polls++ < poll_count) {
-    n_completed = 0;
-    for (int i = 0; i < n_reqs; i++) {
-      if (reqs[i] == nullptr) {
-        if (completed_idx) {
-          *completed_idx = i;
-        }
-        n_completed++;
-      } else {
-        if (reqs[i]->status != TORCH_UCX_REQUEST_DONE) {
-          torch_ucx_comm_progress(comm);
-        } else {
-          torch_ucx_request_free(reqs[i]);
-          reqs[i] = nullptr;
-          if (completed_idx) {
-            *completed_idx = i;
-          }
-          n_completed++;
-        }
-      }
-      if (n_completed == n_completions_required) {
-        return TORCH_UCX_OK;
-      }
-    }
-  }
-  return TORCH_UCX_INPROGRESS;
-}
-
-} // namespace c10d
diff --git a/include/torch_xccl.hpp b/include/torch_xccl.hpp
deleted file mode 100644
index 7c3bf31..0000000
--- a/include/torch_xccl.hpp
+++ /dev/null
@@ -1,41 +0,0 @@
-/**
- * * Copyright (C) Mellanox Technologies Ltd. 2020-2021. ALL RIGHTS RESERVED.
- * *
- * * See file LICENSE for terms.
- * */
-
-#pragma once
-
-#include
-#include
-#include
-
-namespace c10d {
-
-struct torch_xccl_comm_t {
-  torch_ucc_coll_comm_t super;
-  torch_ucx_comm_t* p2p_comm{};
-  xccl_lib_h xccl_lib{};
-  xccl_context_h xccl_ctx{};
-  xccl_team_h xccl_team{};
-};
-
-struct torch_xccl_request_t {
-  torch_ucc_coll_request_t super;
-  xccl_coll_req_h request{};
-  xccl_collective_type_t coll_type;
-  torch_ucc_status_t status;
-  at::Tensor flat_tensor;
-#ifdef USE_CUDA
-  cudaStream_t stream;
-#endif
-};
-
-torch_ucc_status_t torch_xccl_comm_init(
-    torch_ucx_comm_t* p2p_comm,
-    torch_ucc_coll_config_t* coll_config,
-    torch_ucc_coll_comm_t** comm);
-
-torch_ucc_status_t torch_xccl_comm_close(torch_ucc_coll_comm_t* comm);
-
-} // namespace c10d
diff --git a/setup.py b/setup.py
index 5063a9c..57fe8bb 100644
--- a/setup.py
+++ b/setup.py
@@ -1,5 +1,6 @@
 #
-# Copyright (C) Mellanox Technologies Ltd. 2001-2020. ALL RIGHTS RESERVED.
+# Copyright (C) Mellanox Technologies Ltd. 2020-2021. ALL RIGHTS RESERVED.
+# See file LICENSE for terms.
 #
 
 import os
@@ -9,15 +10,13 @@
 ucc_plugin_dir = os.path.dirname(os.path.abspath(__file__))
 
 ucx_home = os.environ.get("UCX_HOME")
-if ucx_home is None:
-    ucx_home = os.environ.get("HPCX_UCX_DIR")
 if ucx_home is None:
     print("Couldn't find UCX install dir, please set UCX_HOME env variable")
     sys.exit(1)
 
-xccl_home = os.environ.get("XCCL_HOME")
-if xccl_home is None:
-    print("Couldn't find XCCL install dir, please set XCCL_HOME env variable")
+ucc_home = os.environ.get("UCC_HOME")
+if ucc_home is None:
+    print("Couldn't find UCC install dir, please set UCC_HOME env variable")
     sys.exit(1)
 
 plugin_compile_args = []
@@ -28,15 +27,13 @@
     print("Debug build")
     plugin_compile_args.extend(["-g", "-O0"])
 
-plugin_sources = ["src/torch_ucc.cpp",
-                  "src/torch_ucc_sendrecv.cpp",
-                  "src/torch_xccl.cpp"]
+plugin_sources = ["src/torch_ucc.cpp"]
 plugin_include_dirs = ["{}/include/".format(ucc_plugin_dir),
                        "{}/include/".format(ucx_home),
-                       "{}/include/".format(xccl_home)]
+                       "{}/include/".format(ucc_home)]
 plugin_library_dirs = ["{}/lib/".format(ucx_home),
-                       "{}/lib/".format(xccl_home)]
-plugin_libraries = ["ucp", "uct", "ucm", "ucs", "xccl"]
+                       "{}/lib/".format(ucc_home)]
+plugin_libraries = ["ucp", "uct", "ucm", "ucs", "ucc"]
 
 with_cuda = os.environ.get("WITH_CUDA")
 if with_cuda is None or with_cuda == "no":
@@ -60,10 +57,9 @@
     libraries = plugin_libraries,
     extra_compile_args=plugin_compile_args
 )
-
 setup(
     name = "torch-ucc",
-    version = "0.1.0",
+    version = "1.0.0",
     ext_modules = [module],
     cmdclass={'build_ext': cpp_extension.BuildExtension}
 )
diff --git a/src/torch_ucc.cpp b/src/torch_ucc.cpp
index a2697c5..5251688 100644
--- a/src/torch_ucc.cpp
+++ b/src/torch_ucc.cpp
@@ -1,16 +1,10 @@
 /**
- * * Copyright (C) Mellanox Technologies Ltd. 2020-2021. ALL RIGHTS RESERVED.
- * *
- * * See file LICENSE for terms.
- * */
-
-#include
-#include
-#include
-#ifdef USE_CUDA
-#include
-#endif
-#include
+ * Copyright (C) Mellanox Technologies Ltd. 2020-2021. ALL RIGHTS RESERVED.
+ * See file LICENSE for terms.
+ */
+
+#include "torch_ucc.hpp"
+#include
 
 namespace c10d {
 
@@ -25,7 +19,44 @@ const std::map ucs_mtype_map = {
   {c10::kMetal, UCS_MEMORY_TYPE_UNKNOWN},
 };
 
-void ProcessGroupUCC::check_tensor(const std::vector& tensors) {
+const std::map<c10::DeviceType, ucc_memory_type_t> ucc_mtype_map = {
+    {c10::kCPU, UCC_MEMORY_TYPE_HOST},
+    {c10::kCUDA, UCC_MEMORY_TYPE_CUDA},
+    {c10::kHIP, UCC_MEMORY_TYPE_ROCM},
+    {c10::kFPGA, UCC_MEMORY_TYPE_UNKNOWN},
+    {c10::kMSNPU, UCC_MEMORY_TYPE_UNKNOWN},
+    {c10::kXLA, UCC_MEMORY_TYPE_UNKNOWN},
+    {c10::kVulkan, UCC_MEMORY_TYPE_UNKNOWN},
+    {c10::kMetal, UCC_MEMORY_TYPE_UNKNOWN},
+};
+
+const std::map<at::ScalarType, ucc_datatype_t> ucc_dtype_map = {
+    {at::kByte, UCC_DT_UINT8},
+    {at::kChar, UCC_DT_INT8},
+    {at::kHalf, UCC_DT_FLOAT16},
+    {at::kDouble, UCC_DT_FLOAT64},
+    {at::kFloat, UCC_DT_FLOAT32},
+    {at::kInt, UCC_DT_INT32},
+    {at::kLong, UCC_DT_INT64},
+};
+
+const std::map<ReduceOp, ucc_reduction_op_t> ucc_op_map = {
+    {ReduceOp::SUM, UCC_OP_SUM},
+    {ReduceOp::PRODUCT, UCC_OP_PROD},
+    {ReduceOp::MIN, UCC_OP_MIN},
+    {ReduceOp::MAX, UCC_OP_MAX},
+    {ReduceOp::BAND, UCC_OP_BAND},
+    {ReduceOp::BOR, UCC_OP_BOR},
+    {ReduceOp::BXOR, UCC_OP_BXOR},
+};
+
+void check_device(c10::Device dev1, c10::Device dev2) {
+  if (dev1.is_cuda() && dev2.is_cuda() && dev1 != dev2) {
+    throw std::runtime_error("ProcessGroupUCC multidevice is not supported");
+  }
+}
+
+void check_tensor(const std::vector<at::Tensor>& tensors) {
   if (tensors.size() != 1) {
     throw std::runtime_error("ProcessGroupUCC takes 1 tensor");
   }
@@ -39,277 +70,437 @@ void ProcessGroupUCC::check_tensor(const std::vector& tensors) {
   // TODO: check cuda case
 }
 
-static torch_ucc_status_t compute_lengths_offsets(
-    int group_size,
-    const std::vector& split_sizes,
-    const at::Tensor& tensor,
-    uint32_t* lengths,
-    uint32_t* offsets) {
-  bool equal_splits = false;
-  size_t dim0_size = tensor.size(0);
-  size_t row_size = (dim0_size ? tensor.numel() / dim0_size : 1);
-  size_t split_size = 0;
-  size_t offset = 0;
-
-  if (split_sizes.size() == 0) {
-    equal_splits = true;
-    split_size = tensor.size(0) / group_size;
-  }
+ProcessGroupUCC::WorkUCC::~WorkUCC() {
+  TORCH_CHECK(request_ == nullptr, "TorchUCC, request wasn't finalized");
+}
 
-  for (int i = 0; i < group_size; i++) {
-    size_t length = row_size * (equal_splits ?
split_size : split_sizes[i]); - if ((length > INT_MAX) || (offset > INT_MAX)) { - return TORCH_UCC_ERROR; - } - lengths[i] = length; - offsets[i] = offset; - offset += length; - } +bool ProcessGroupUCC::WorkUCC::isCompleted() { + return (status_ != UCC_INPROGRESS); +} + +bool ProcessGroupUCC::WorkUCC::isSuccess() const { + return (status_ >= 0); +} - return TORCH_UCC_OK; +bool ProcessGroupUCC::WorkUCC::wait(std::chrono::milliseconds /* unused */) { + while (!isCompleted()) + ; + return true; } -ProcessGroupUCC::WorkUCX::~WorkUCX() { - if (req != nullptr) { - torch_ucx_request_free(req); +void ProcessGroupUCC::WorkUCC::finalize() { + if (request_ != nullptr) { + if (isP2POp(opType_)) { + request_->status = UCC_INPROGRESS; + ucp_request_free(request_); + } else { + ucc_collective_finalize(request_); + } + status_ = UCC_OK; + request_ = nullptr; } } -bool ProcessGroupUCC::WorkUCX::isCompleted() { - torch_ucx_status_t st; +CommUCX::CommUCX(int comm_size) { + ucp_params_t params; + ucp_config_t* config; + ucs_status_t st; + ucp_worker_params_t worker_params; - st = torch_ucx_req_test(comm, &req, 1, nullptr, 1, 1); - return (st != TORCH_UCX_INPROGRESS); + st = ucp_config_read("TORCH", nullptr, &config); + if (st != UCS_OK) { + LOG(ERROR) << "failed to read UCP config: " << ucs_status_string(st); + throw std::runtime_error(ucs_status_string(st)); + } + memset(¶ms, 0, sizeof(ucp_params_t)); + params.field_mask = UCP_PARAM_FIELD_FEATURES | UCP_PARAM_FIELD_REQUEST_SIZE | + UCP_PARAM_FIELD_ESTIMATED_NUM_EPS | UCP_PARAM_FIELD_TAG_SENDER_MASK | + UCP_PARAM_FIELD_REQUEST_INIT | UCP_PARAM_FIELD_REQUEST_CLEANUP; + params.request_size = sizeof(ucc_coll_req_t); + params.features = UCP_FEATURE_TAG; + params.estimated_num_eps = comm_size; + params.tag_sender_mask = TORCH_UCX_RANK_MASK; + params.request_init = [](void* request) { + static_cast(request)->status = UCC_INPROGRESS; + }; + params.request_cleanup = [](void*) {}; + st = ucp_init(¶ms, config, &context); + ucp_config_release(config); + if (st != UCS_OK) { + LOG(ERROR) << "failed to init UCP context: " << ucs_status_string(st); + throw std::runtime_error(ucs_status_string(st)); + } + memset(&worker_params, 0, sizeof(ucp_worker_params_t)); + worker_params.field_mask = UCP_WORKER_PARAM_FIELD_THREAD_MODE; + worker_params.thread_mode = UCS_THREAD_MODE_MULTI; + st = ucp_worker_create(context, &worker_params, &worker); + if (st != UCS_OK) { + LOG(ERROR) << "failed to create UCP worker: " << ucs_status_string(st); + ucp_cleanup(context); + throw std::runtime_error(ucs_status_string(st)); + } } -bool ProcessGroupUCC::WorkUCX::isSuccess() const { - // TODO - return true; +void CommUCX::progress() { + ucp_worker_progress(worker); } -bool ProcessGroupUCC::WorkUCX::wait( - std::chrono::milliseconds /* unused */) { - torch_ucx_req_test(comm, &req, 1, nullptr, -1, 1); - return true; +CommUCX::~CommUCX() { + ucp_worker_destroy(worker); + ucp_cleanup(context); } -ProcessGroupUCC::WorkColl::~WorkColl() { - if (coll_req != nullptr) { - if (coll_ops.coll_test(coll_req) != TORCH_UCC_OK) { - fprintf( - stderr, - "ProcessGroupUCC: warn removing request before collective finish\n"); - } - coll_ops.coll_finalize(coll_req); +void CommPG::ucx_connect_eps( + std::vector& eps, + int rank, + int size, + const c10::intrusive_ptr& store) { + ucs_status_t st; + ucp_address_t* local_addr; + size_t local_addr_len; + + st = ucp_worker_get_address(ucx_comm.worker, &local_addr, &local_addr_len); + if (st != UCS_OK) { + throw std::runtime_error(ucs_status_string(st)); } - - if (scratch != 
nullptr) { - delete[] scratch; + auto key = "wa" + std::to_string(rank); + std::vector val = std::vector( + reinterpret_cast(local_addr), + reinterpret_cast(local_addr) + local_addr_len); + store->set(key, val); + ucp_worker_release_address(ucx_comm.worker, local_addr); + eps.resize(size); + for (int i = 0; i < size; i++) { + std::vector peer_addr = store->get("wa" + std::to_string(i)); + ucp_ep_params_t ep_params; + ep_params.field_mask = UCP_EP_PARAM_FIELD_REMOTE_ADDRESS; + ep_params.address = reinterpret_cast(peer_addr.data()); + st = ucp_ep_create(ucx_comm.worker, &ep_params, &(eps[i])); + if (st != UCS_OK) { + throw std::runtime_error(ucs_status_string(st)); + } } } -bool ProcessGroupUCC::WorkColl::isCompleted() { - torch_ucc_status_t st; +void CommPG::ucx_disconnect_eps( + std::vector& eps, + const c10::intrusive_ptr& store) { + ucs_status_t st; - if (!external_progress) { - coll_ops.coll_progress(coll_req); - st = coll_ops.coll_test(coll_req); - if (st != TORCH_UCC_INPROGRESS) { - work_list.erase(work_list_entry); + for (ucp_ep_h& ep : eps) { + ucs_status_ptr_t close_req = ucp_ep_close_nb(ep, UCP_EP_CLOSE_MODE_FLUSH); + if (UCS_PTR_IS_ERR(close_req)) { + LOG(ERROR) << "failed to close endpoint"; + return; + } + if (UCS_PTR_IS_PTR(close_req)) { + do { + ucp_worker_progress(ucx_comm.worker); + st = ucp_request_check_status(close_req); + } while (st != UCS_OK); + ucp_request_free(close_req); } + } + if ((size_t)store->add("epclosed", 1) == eps.size()) { + store->add("finished", 1); } else { - st = coll_ops.coll_test(coll_req); + store->wait({"finished"}); } - return (st != TORCH_UCC_INPROGRESS); } -bool ProcessGroupUCC::WorkColl::isSuccess() const { - // TODO - return true; -} - -bool ProcessGroupUCC::WorkColl::wait( - std::chrono::milliseconds /* unused */) { - if (blocking_wait || !coll_req->device.is_cuda()) { - while (!isCompleted()) { - }; - } else { - coll_ops.coll_fence(coll_req); +ucc_coll_req_h CommPG::send_nb( + ucp_ep_h ep, + void* data, + ucs_memory_type_t mtype, + size_t size, + ucp_tag_t ucp_tag) { + ucs_status_ptr_t st; + ucp_request_param_t params; + params.op_attr_mask = UCP_OP_ATTR_FIELD_CALLBACK | + UCP_OP_ATTR_FIELD_DATATYPE | UCP_OP_ATTR_FIELD_MEMORY_TYPE; + params.datatype = ucp_dt_make_contig(size); + params.memory_type = mtype; + params.cb.send = [](void* request, ucs_status_t status, void* user_data) { + static_cast(request)->status = UCC_OK; + }; + st = ucp_tag_send_nbx(ep, data, 1, ucp_tag, ¶ms); + if (UCS_PTR_IS_ERR(st)) { + LOG(ERROR) << "failed to send message: " + << ucs_status_string(UCS_PTR_STATUS(st)); + throw std::runtime_error(ucs_status_string(UCS_PTR_STATUS(st))); } - return true; + return reinterpret_cast(st); +} + +ucc_coll_req_h CommPG::recv_nb( + void* data, + ucs_memory_type_t mtype, + size_t size, + ucp_tag_t ucp_tag, + ucp_tag_t ucp_tag_mask) { + ucs_status_ptr_t st; + ucp_request_param_t params; + params.op_attr_mask = UCP_OP_ATTR_FIELD_CALLBACK | + UCP_OP_ATTR_FIELD_DATATYPE | UCP_OP_ATTR_FIELD_MEMORY_TYPE; + params.datatype = ucp_dt_make_contig(size); + params.cb.recv = [](void* request, + ucs_status_t status, + const ucp_tag_recv_info_t* info, + void* user_data) { + static_cast(request)->status = UCC_OK; + }; + params.memory_type = mtype; + st = ucp_tag_recv_nbx( + ucx_comm.worker, data, 1, ucp_tag, ucp_tag_mask, ¶ms); + if (UCS_PTR_IS_ERR(st)) { + LOG(ERROR) << "failed to recv message: " + << ucs_status_string(UCS_PTR_STATUS(st)); + throw std::runtime_error(ucs_status_string(UCS_PTR_STATUS(st))); + } + return reinterpret_cast(st); } 
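Illustrative sketch (not part of the patch; the helper name is hypothetical): the requests returned by CommPG::send_nb/recv_nb above report completion through ucc_coll_req_t::status, which the UCP callbacks flip from UCC_INPROGRESS to UCC_OK. A caller holding the worker the request was posted on could drain it as below; resetting status before ucp_request_free mirrors WorkUCC::finalize, since UCP recycles request slots and request_init runs only once per slot.

// Sketch only, assuming `req` was posted on `worker`.
static void wait_and_release(ucp_worker_h worker, ucc_coll_req_t* req) {
  while (req->status == UCC_INPROGRESS) {
    ucp_worker_progress(worker); // drives the send/recv completion callbacks
  }
  req->status = UCC_INPROGRESS; // re-arm the recycled UCP request slot
  ucp_request_free(req);
}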
-void ProcessGroupUCC::read_config() {
-  char* env;
+CommUCC::CommUCC(int comm_size) {
+  ucc_lib_config_h lib_config;
+  ucc_context_config_h context_config;
+  ucc_lib_params_t lib_params;
+  ucc_context_params_t context_params;
+  ucc_status_t st;
 
-  config.enable_progress_thread = true;
-  env = std::getenv("TORCH_UCC_THREAD_ENABLE");
-  if (env) {
-    config.enable_progress_thread = std::atoi(env);
+  st = ucc_lib_config_read("TORCH", nullptr, &lib_config);
+  if (st != UCC_OK) {
+    LOG(ERROR) << "failed to read UCC lib config: " << ucc_status_string(st);
+    throw std::runtime_error(ucc_status_string(st));
   }
-  config.blocking_wait = true;
-  env = std::getenv("TORCH_UCC_BLOCKING_WAIT");
-  if (env) {
-    config.blocking_wait = std::atoi(env);
+  memset(&lib_params, 0, sizeof(ucc_lib_params_t));
+  lib_params.mask = UCC_LIB_PARAM_FIELD_THREAD_MODE;
+  lib_params.thread_mode = UCC_THREAD_SINGLE;
+  st = ucc_init(&lib_params, lib_config, &lib);
+  ucc_lib_config_release(lib_config);
+  if (st != UCC_OK) {
+    LOG(ERROR) << "failed to init UCC lib: " << ucc_status_string(st);
+    throw std::runtime_error(ucc_status_string(st));
   }
-  config.high_priority_stream = false;
-  env = std::getenv("TORCH_UCC_HIGH_PRIORITY_STREAM");
-  if (env) {
-    config.high_priority_stream = std::atoi(env);
+  st = ucc_context_config_read(lib, NULL, &context_config);
+  if (st != UCC_OK) {
+    ucc_finalize(lib);
+    LOG(ERROR) << "failed to read UCC context config: "
+               << ucc_status_string(st);
+    throw std::runtime_error(ucc_status_string(st));
+  }
+  st = ucc_context_config_modify(context_config, NULL, "ESTIMATED_NUM_EPS",
+                                 std::to_string(comm_size).c_str());
+  if (st != UCC_OK) {
+    ucc_context_config_release(context_config);
+    ucc_finalize(lib);
+    LOG(ERROR) << "failed to modify UCC context config: "
+               << ucc_status_string(st);
+    throw std::runtime_error(ucc_status_string(st));
+  }
+  memset(&context_params, 0, sizeof(ucc_context_params_t));
+  context_params.mask = UCC_CONTEXT_PARAM_FIELD_TYPE;
+  context_params.ctx_type = UCC_CONTEXT_SHARED;
+  st = ucc_context_create(lib, &context_params, context_config, &context);
+  ucc_context_config_release(context_config);
+  if (st != UCC_OK) {
+    ucc_finalize(lib);
+    LOG(ERROR) << "failed to create UCC context: " << ucc_status_string(st);
+    throw std::runtime_error(ucc_status_string(st));
   }
 }
 
-ProcessGroupUCC::ProcessGroupUCC(
-    const c10::intrusive_ptr<Store>& store,
-    int rank,
-    int size)
-    : ProcessGroup(rank, size), store_(store), stop_progress_loop(false) {
-  torch_ucx_status_t st;
-  torch_ucc_status_t st_ucc;
-
-  read_config();
-  st = torch_ucx_comm_init(&ucx_comm, size, rank, store_);
-  if (st != TORCH_UCX_OK) {
-    throw std::runtime_error("ProcessGroupUCC init failed");
-  }
+void CommUCC::progress() {
+  ucc_context_progress(context);
+}
 
-  st_ucc = torch_ucc_coll_ops_init(&coll_ops);
-  if (st_ucc != TORCH_UCC_OK) {
-    throw std::runtime_error("ProcessGroupUCC failed to init collops");
-  }
-  coll_comm = nullptr;
-
-  if (config.enable_progress_thread) {
-    c10::DeviceIndex dev_idx = 0;
-#ifdef USE_CUDA
-    dev_idx = c10::cuda::current_device();
-#endif
-    progress_thread = std::thread(&ProcessGroupUCC::progress_loop, this, dev_idx);
-  }
+CommUCC::~CommUCC() {
+  ucc_context_destroy(context);
+  ucc_finalize(lib);
 }
 
-torch_ucc_coll_comm_t* ProcessGroupUCC::get_coll_comm() {
-  if (coll_comm == nullptr) {
-    torch_ucc_status_t st_ucc;
-    torch_ucc_coll_config_t cfg;
+struct torch_ucc_oob_coll_info_t {
+  const c10::intrusive_ptr<Store>* store;
+  int rank;
+  int size;
+  void* rbuf;
+  size_t msglen;
+};
 
-    cfg.blocking_wait =
config.blocking_wait; - cfg.high_priority_stream = config.high_priority_stream; - st_ucc = coll_ops.coll_comm_init(ucx_comm, &cfg, &coll_comm); - if (st_ucc != TORCH_UCC_OK) { - throw std::runtime_error( - "ProcessGroupUCC failed to init collective comm"); +static ucc_status_t oob_allgather( + void* sbuf, + void* rbuf, + size_t msglen, + void* coll_info, + void** req) { + torch_ucc_oob_coll_info_t* info = + reinterpret_cast(coll_info); + std::vector val = std::vector( + reinterpret_cast(sbuf), + reinterpret_cast(sbuf) + msglen); + (*info->store)->set("teamr" + std::to_string(info->rank), val); + info->rbuf = rbuf; + info->msglen = msglen; + *req = coll_info; + return UCC_OK; +} + +static ucc_status_t oob_allgather_test(void* req) { + torch_ucc_oob_coll_info_t* info = + reinterpret_cast(req); + + for (int r = 0; r < info->size; r++) { + if (!((*info->store)->check({"teamr" + std::to_string(r)}))) { + return UCC_INPROGRESS; } } + for (int r = 0; r < info->size; r++) { + std::vector data = + (*info->store)->get("teamr" + std::to_string(r)); + memcpy( + (void*)((ptrdiff_t)info->rbuf + info->msglen * r), + data.data(), + info->msglen); + } + return UCC_OK; +} + +static ucc_status_t oob_allgather_free(void* req) { + torch_ucc_oob_coll_info_t* info = + reinterpret_cast(req); + int num_done = (*info->store)->add({"team_ag_done"}, 1); + if (num_done == info->size) { + (*info->store)->deleteKey("team_ag_done"); + for (int r = 0; r < info->size; r++) { + if (r != info->rank) { + (*info->store)->add({"team_ag_finished" + std::to_string(r)}, 1); + } + } + } else { + (*info->store)->wait({"team_ag_finished" + std::to_string(info->rank)}); + } + (*info->store)->deleteKey("teamr" + std::to_string(info->rank)); + (*info->store)->deleteKey("team_ag_finished" + std::to_string(info->rank)); - return coll_comm; + return UCC_OK; } -void ProcessGroupUCC::progress_loop(c10::DeviceIndex default_dev_idx) { - std::unique_lock lock(pg_mutex); - torch_ucc_status_t st; -#ifdef USE_CUDA - at::cuda::OptionalCUDAGuard guard(default_dev_idx); - if (default_dev_idx == 0) { - c10::cuda::set_device(default_dev_idx); +void CommPG::ucc_create_team( + ucc_team_h& team, + int rank, + int size, + const c10::intrusive_ptr& store) { + ucc_status_t st; + ucc_team_params_t team_params; + torch_ucc_oob_coll_info_t* coll_info = new torch_ucc_oob_coll_info_t; + + coll_info->rank = rank; + coll_info->size = size; + coll_info->store = &store; + team_params.mask = UCC_TEAM_PARAM_FIELD_EP | UCC_TEAM_PARAM_FIELD_EP_RANGE | + UCC_TEAM_PARAM_FIELD_OOB; + team_params.oob.allgather = oob_allgather; + team_params.oob.req_test = oob_allgather_test; + team_params.oob.req_free = oob_allgather_free; + team_params.oob.coll_info = coll_info; + team_params.oob.participants = size; + team_params.ep = rank; + team_params.ep_range = UCC_COLLECTIVE_EP_RANGE_CONTIG; + st = ucc_team_create_post(&ucc_comm.context, 1, &team_params, &team); + if (st != UCC_OK) { + delete coll_info; + LOG(ERROR) << "failed to post team create: " << ucc_status_string(st); + throw std::runtime_error(ucc_status_string(st)); } -#endif - while (!stop_progress_loop) { - if (progress_list.empty()) { - queue_produce_cv.wait(lock); - continue; - } - auto work_coll = progress_list.front(); - progress_list.pop_front(); - lock.unlock(); - queue_consume_cv.notify_one(); -#ifdef USE_CUDA - if (work_coll->coll_req->device.is_cuda()) { - guard.set_device(work_coll->coll_req->device); - } -#endif - do { - st = coll_ops.coll_progress(work_coll->coll_req); - } while ( - 
(coll_ops.coll_test(work_coll->coll_req) == TORCH_UCC_INPROGRESS) && - (st == TORCH_UCC_OK)); - if (st != TORCH_UCC_OK) { - fprintf(stderr, "ProcessGroupUCC: coll progress failed\n"); - } - lock.lock(); + do { + st = ucc_team_create_test(team); + } while (st == UCC_INPROGRESS); + if (st != UCC_OK) { + delete coll_info; + LOG(ERROR) << "failed to create UCC team: " << ucc_status_string(st); + throw std::runtime_error(ucc_status_string(st)); } + // TODO: don't delete + delete coll_info; } -c10::intrusive_ptr ProcessGroupUCC::enqueue_request( - torch_ucc_coll_request_t* req, - void* scratch) { - std::unique_lock lock(pg_mutex); +void CommPG::ucc_destroy_team(ucc_team_h& team) { + ucc_team_destroy(team); +} - auto iter = progress_list.emplace( - progress_list.end(), - c10::make_intrusive(coll_ops, progress_list)); - (*iter)->work_list_entry = iter; - (*iter)->coll_req = req; - (*iter)->blocking_wait = config.blocking_wait; - (*iter)->external_progress = config.enable_progress_thread; - (*iter)->scratch = (char*)scratch; - auto workreq = (*iter); - lock.unlock(); - queue_produce_cv.notify_one(); - return workreq; +ProcessGroupUCC::ProcessGroupUCC( + const c10::intrusive_ptr& store, + int rank, + int size) + : ProcessGroup(rank, size), store_(store) { + comm = nullptr; } ProcessGroupUCC::~ProcessGroupUCC() { - if (config.enable_progress_thread) { - std::unique_lock lock(pg_mutex); - queue_consume_cv.wait(lock, [&] { return progress_list.empty(); }); - stop_progress_loop = true; - lock.unlock(); - queue_produce_cv.notify_all(); - progress_thread.join(); - } - if (progress_list.size() != 0) { - fprintf(stderr, "ProcessGroupUCC: warnning progress list is not empty\n"); - } - if (coll_comm != nullptr) { - coll_ops.coll_comm_close(coll_comm); - } - torch_ucx_comm_close(ucx_comm, store_); + comm->ucc_destroy_team(team); + comm->ucx_disconnect_eps(eps, store_); } -c10::intrusive_ptr ProcessGroupUCC::broadcast( - std::vector& tensors, - const BroadcastOptions& opts) { - torch_ucc_coll_comm_t* ucc_comm; - torch_ucc_coll_request_t* coll_req; - torch_ucc_status_t st; - - check_tensor(tensors); - c10::DeviceGuard guard(tensors[0].device()); - ucc_comm = get_coll_comm(); - st = coll_ops.broadcast(ucc_comm, tensors, opts.rootRank, &coll_req); - if (st != TORCH_UCC_OK) { - throw std::runtime_error("ProcessGroupUCC: broadcast failed"); +c10::intrusive_ptr ProcessGroupUCC::allgather( + std::vector>& outputTensors, + std::vector& inputTensors, + const AllgatherOptions& /* unused */) { + auto& tensor = inputTensors[0]; + check_device(tensor.device(), outputTensors[0][0].device()); + initComm(tensor.device()); + + AllgatherWorkData* data = new AllgatherWorkData(size_); + for (int i = 0; i < size_; i++) { + data->recv_lengths[i] = tensor.element_size() * tensor.numel(); + data->recv_offsets[i] = (uint64_t)outputTensors[0][i].data_ptr(); } - return enqueue_request(coll_req, nullptr); + ucc_coll_args_t coll; + coll.mask = UCC_COLL_ARGS_FIELD_FLAGS; + coll.flags = + UCC_COLL_ARGS_FLAG_COUNT_64BIT | UCC_COLL_ARGS_FLAG_DISPLACEMENTS_64BIT; + coll.coll_type = UCC_COLL_TYPE_ALLGATHERV; + coll.src.info.buffer = tensor.data_ptr(); + coll.src.info.count = tensor.element_size() * tensor.numel(); + coll.src.info.datatype = UCC_DT_UINT8; + coll.src.info.mem_type = ucc_mtype_map.at(tensor.device().type()); + coll.dst.info_v.buffer = nullptr; + coll.dst.info_v.counts = (ucc_count_t*)data->recv_lengths.data(); + coll.dst.info_v.displacements = (ucc_aint_t*)data->recv_offsets.data(); + coll.dst.info_v.datatype = UCC_DT_UINT8; + 
coll.dst.info_v.mem_type = + ucc_mtype_map.at(outputTensors[0][0].device().type()); + return comm->enqueue_collective( + OpType::ALLGATHER, coll, std::unique_ptr(data), team); +} + +c10::intrusive_ptr ProcessGroupUCC::allgather_base( + at::Tensor& /* unused */, + at::Tensor& /* unused */, + const AllgatherOptions& /* unused */) { + throw std::runtime_error("ProcessGroupUCC does not support allgather_base"); } c10::intrusive_ptr ProcessGroupUCC::allreduce( std::vector& tensors, const AllreduceOptions& opts) { - torch_ucc_coll_comm_t* ucc_comm; - torch_ucc_coll_request_t* coll_req; - torch_ucc_status_t st; - check_tensor(tensors); - c10::DeviceGuard guard(tensors[0].device()); - ucc_comm = get_coll_comm(); - st = coll_ops.allreduce(ucc_comm, tensors, opts, &coll_req); - if (st != TORCH_UCC_OK) { - throw std::runtime_error("ProcessGroupUCC: allreduce failed"); - } - - return enqueue_request(coll_req, nullptr); + auto& tensor = tensors[0]; + initComm(tensor.device()); + + ucc_coll_args_t coll; + coll.mask = UCC_COLL_ARGS_FIELD_PREDEFINED_REDUCTIONS | + UCC_COLL_ARGS_FIELD_FLAGS; + coll.flags = UCC_COLL_ARGS_FLAG_IN_PLACE; + coll.coll_type = UCC_COLL_TYPE_ALLREDUCE; + coll.reduce.predefined_op = ucc_op_map.at(opts.reduceOp); + coll.src.info.buffer = nullptr; + coll.src.info.count = tensor.numel(); + coll.src.info.datatype = ucc_dtype_map.at(tensor.scalar_type()); + coll.src.info.mem_type = ucc_mtype_map.at(tensor.device().type()); + coll.dst.info.buffer = tensor.data_ptr(); + coll.dst.info.count = tensor.numel(); + coll.dst.info.datatype = ucc_dtype_map.at(tensor.scalar_type()); + coll.dst.info.mem_type = ucc_mtype_map.at(tensor.device().type()); + return comm->enqueue_collective(OpType::ALLREDUCE, coll, nullptr, team); } c10::intrusive_ptr ProcessGroupUCC::allreduce_coalesced( @@ -319,49 +510,96 @@ c10::intrusive_ptr ProcessGroupUCC::allreduce_coalesced( "ProcessGroupUCC does not support allreduce_coalesced"); } -c10::intrusive_ptr ProcessGroupUCC::reduce( +c10::intrusive_ptr ProcessGroupUCC::alltoall( std::vector& /* unused */, - const ReduceOptions& /* unused */) { - throw std::runtime_error("ProcessGroupUCC does not support reduce"); + std::vector& /* unused */, + const AllToAllOptions& /* unused */) { + throw std::runtime_error("ProcessGroupUCC does not support alltoall"); } -c10::intrusive_ptr ProcessGroupUCC::allgather( - std::vector>& outputTensors, - std::vector& inputTensors, - const AllgatherOptions& /* unused */) { - torch_ucc_coll_comm_t* ucc_comm; - torch_ucc_coll_request_t* coll_req; - torch_ucc_status_t st; - - check_tensor(inputTensors); - c10::DeviceGuard guard(inputTensors[0].device()); - ucc_comm = get_coll_comm(); - st = coll_ops.allgather(ucc_comm, inputTensors, outputTensors[0], &coll_req); - if (st != TORCH_UCC_OK) { - throw std::runtime_error("ProcessGroupUCC: allgather failed"); - } - return enqueue_request(coll_req, nullptr); -} +c10::intrusive_ptr ProcessGroupUCC::alltoall_base( + at::Tensor& outputTensor, + at::Tensor& inputTensor, + std::vector& outputSplitSizes, + std::vector& inputSplitSizes, + const AllToAllOptions& /* unused */) { + check_device(inputTensor.device(), outputTensor.device()); + initComm(inputTensor.device()); + ucc_coll_args_t coll; + AlltoallWorkData* data; -c10::intrusive_ptr ProcessGroupUCC::allgather_base( - at::Tensor& /* unused */, - at::Tensor& /* unused */, - const AllgatherOptions& /* unused */) { - throw std::runtime_error("ProcessGroupUCC does not support allgather_base"); + if ((outputSplitSizes.size() == 0) && (inputSplitSizes.size() 
== 0)) {
+    data = nullptr;
+    TORCH_CHECK(
+        (outputTensor.size(0) % size_ == 0) &&
+            (inputTensor.size(0) % size_ == 0),
+        "Tensor's dim 0 does not divide equally across group size");
+    coll.mask = 0;
+    coll.coll_type = UCC_COLL_TYPE_ALLTOALL;
+    coll.src.info.buffer = inputTensor.data_ptr();
+    coll.src.info.count =
+        inputTensor.element_size() * inputTensor.numel() / size_;
+    coll.src.info.datatype = UCC_DT_UINT8;
+    coll.src.info.mem_type = ucc_mtype_map.at(inputTensor.device().type());
+    coll.dst.info.buffer = outputTensor.data_ptr();
+    coll.dst.info.count =
+        outputTensor.element_size() * outputTensor.numel() / size_;
+    coll.dst.info.datatype = UCC_DT_UINT8;
+    coll.dst.info.mem_type = ucc_mtype_map.at(outputTensor.device().type());
+  } else {
+    data = new AlltoallWorkData(size_);
+    c10d::checkSplitSizes(inputSplitSizes, inputTensor, size_);
+    c10d::checkSplitSizes(outputSplitSizes, outputTensor, size_);
+    computeLengthsAndOffsets(
+        outputSplitSizes,
+        outputTensor,
+        &data->recv_lengths,
+        &data->recv_offsets);
+    computeLengthsAndOffsets(
+        inputSplitSizes, inputTensor, &data->send_lengths, &data->send_offsets);
+    coll.mask = 0;
+    coll.coll_type = UCC_COLL_TYPE_ALLTOALLV;
+    coll.src.info_v.buffer = inputTensor.data_ptr();
+    coll.src.info_v.counts = (ucc_count_t*)data->send_lengths.data();
+    coll.src.info_v.displacements = (ucc_aint_t*)data->send_offsets.data();
+    coll.src.info_v.datatype = ucc_dtype_map.at(inputTensor.scalar_type());
+    coll.src.info_v.mem_type = ucc_mtype_map.at(inputTensor.device().type());
+    coll.dst.info_v.buffer = outputTensor.data_ptr();
+    coll.dst.info_v.counts = (ucc_count_t*)data->recv_lengths.data();
+    coll.dst.info_v.displacements = (ucc_aint_t*)data->recv_offsets.data();
+    coll.dst.info_v.datatype = ucc_dtype_map.at(outputTensor.scalar_type());
+    coll.dst.info_v.mem_type = ucc_mtype_map.at(outputTensor.device().type());
+  }
+  return comm->enqueue_collective(
+      OpType::ALLTOALL_BASE, coll, std::unique_ptr<WorkData>(data), team);
 }
 
 c10::intrusive_ptr<ProcessGroup::Work> ProcessGroupUCC::barrier(
     const BarrierOptions& /* unused */) {
-  torch_ucc_coll_comm_t* ucc_comm;
-  torch_ucc_coll_request_t* coll_req;
-  torch_ucc_status_t st;
-
-  ucc_comm = get_coll_comm();
-  st = coll_ops.barrier(ucc_comm, &coll_req);
-  if (st != TORCH_UCC_OK) {
-    throw std::runtime_error("ProcessGroupUCC: barrier failed");
-  }
-  return enqueue_request(coll_req, nullptr);
+  initComm(c10::DeviceType::CPU);
+
+  ucc_coll_args_t coll;
+  coll.mask = 0;
+  coll.coll_type = UCC_COLL_TYPE_BARRIER;
+  return comm->enqueue_collective(OpType::BARRIER, coll, nullptr, team);
+}
+
+c10::intrusive_ptr<ProcessGroup::Work> ProcessGroupUCC::broadcast(
+    std::vector<at::Tensor>& tensors,
+    const BroadcastOptions& opts) {
+  check_tensor(tensors);
+  auto& tensor = tensors[0];
+  initComm(tensor.device());
+
+  ucc_coll_args_t coll;
+  coll.mask = 0;
+  coll.coll_type = UCC_COLL_TYPE_BCAST;
+  coll.src.info.buffer = tensor.data_ptr();
+  coll.src.info.count = tensor.numel();
+  coll.src.info.datatype = ucc_dtype_map.at(tensor.scalar_type());
+  coll.src.info.mem_type = ucc_mtype_map.at(tensor.device().type());
+  coll.root = opts.rootRank;
+  return comm->enqueue_collective(OpType::BROADCAST, coll, nullptr, team);
 }
 
 c10::intrusive_ptr<ProcessGroup::Work> ProcessGroupUCC::gather(
@@ -371,11 +609,10 @@ c10::intrusive_ptr<ProcessGroup::Work> ProcessGroupUCC::gather(
     throw std::runtime_error("ProcessGroupUCC does not support gather");
 }
 
-c10::intrusive_ptr<ProcessGroup::Work> ProcessGroupUCC::scatter(
+c10::intrusive_ptr<ProcessGroup::Work> ProcessGroupUCC::reduce(
     std::vector<at::Tensor>& /* unused */,
-    std::vector<std::vector<at::Tensor>>& /* unused */,
-    const ScatterOptions& /*
unused */) { - throw std::runtime_error("ProcessGroupUCC does not support scatter"); + const ReduceOptions& /* unused */) { + throw std::runtime_error("ProcessGroupUCC does not support reduce"); } c10::intrusive_ptr ProcessGroupUCC::reduce_scatter( @@ -385,60 +622,11 @@ c10::intrusive_ptr ProcessGroupUCC::reduce_scatter( throw std::runtime_error("ProcessGroupUCC does not support reduce_scatter"); } -c10::intrusive_ptr ProcessGroupUCC::alltoall_base( - at::Tensor& outputTensor, - at::Tensor& inputTensor, - std::vector& outputSplitSizes, - std::vector& inputSplitSizes, - const AllToAllOptions& /* unused */) { - torch_ucc_coll_comm_t* ucc_comm; - torch_ucc_coll_request_t* coll_req; - torch_ucc_status_t st; - uint32_t* scratch; - - c10::DeviceGuard guard(inputTensor.device()); - ucc_comm = get_coll_comm(); - if ((outputSplitSizes.size() == 0) && (inputSplitSizes.size() == 0)) { - scratch = nullptr; - st = coll_ops.alltoall(ucc_comm, inputTensor, outputTensor, &coll_req); - if (st != TORCH_UCC_OK) { - throw std::runtime_error("ProcessGroupUCC: alltoall_base failed"); - } - } else { - scratch = new uint32_t[4 * size_]; - uint32_t* send_lengths = scratch; - uint32_t* recv_lengths = scratch + 1 * size_; - uint32_t* send_offsets = scratch + 2 * size_; - uint32_t* recv_offsets = scratch + 3 * size_; - st = compute_lengths_offsets( - size_, outputSplitSizes, outputTensor, recv_lengths, recv_offsets); - if (st != TORCH_UCC_OK) { - throw std::runtime_error("ProcessGroupUCC: alltoallv failed"); - } - st = compute_lengths_offsets( - size_, inputSplitSizes, inputTensor, send_lengths, send_offsets); - if (st != TORCH_UCC_OK) { - throw std::runtime_error("ProcessGroupUCC: alltoallv failed"); - } - - coll_ops.alltoallv( - ucc_comm, - inputTensor, - send_lengths, - send_offsets, - outputTensor, - recv_lengths, - recv_offsets, - &coll_req); - } - return enqueue_request(coll_req, scratch); -} - -c10::intrusive_ptr ProcessGroupUCC::alltoall( - std::vector& /* unused */, +c10::intrusive_ptr ProcessGroupUCC::scatter( std::vector& /* unused */, - const AllToAllOptions& /* unused */) { - throw std::runtime_error("ProcessGroupUCC does not support alltoall"); + std::vector>& /* unused */, + const ScatterOptions& /* unused */) { + throw std::runtime_error("ProcessGroupUCC does not support scatter"); } c10::intrusive_ptr ProcessGroupUCC::send( @@ -447,24 +635,17 @@ c10::intrusive_ptr ProcessGroupUCC::send( int tag) { check_tensor(tensors); auto& tensor = tensors[0]; - size_t size = tensor.numel() * tensor.element_size(); - torch_ucx_request_t* req; - torch_ucx_status_t st; + initComm(tensor.device()); - st = torch_ucx_send_nb( - ucx_comm, + ucp_tag_t ucp_tag; + TORCH_UCX_MAKE_SEND_TAG(ucp_tag, tag, rank_, comm_id); + ucc_coll_req_h request = comm->send_nb( + eps[dstRank], tensor.data_ptr(), ucs_mtype_map.at(tensor.device().type()), - size, - dstRank, - tag, - &req, - TORCH_UCX_P2P_TAG); - if (st < 0) { - throw std::runtime_error("ProcessGroupUCC: failed to send msg"); - } - - return c10::make_intrusive(req, ucx_comm); + tensor.numel() * tensor.element_size(), + ucp_tag); + return comm->enqueue_p2p(OpType::SEND, request); } c10::intrusive_ptr ProcessGroupUCC::recv( @@ -473,24 +654,17 @@ c10::intrusive_ptr ProcessGroupUCC::recv( int tag) { check_tensor(tensors); auto& tensor = tensors[0]; - size_t size = tensor.numel() * tensor.element_size(); - torch_ucx_request_t* req; - torch_ucx_status_t st; + initComm(tensor.device()); - st = torch_ucx_recv_nb( - ucx_comm, + ucp_tag_t ucp_tag, ucp_tag_mask; + 
TORCH_UCX_MAKE_RECV_TAG(ucp_tag, ucp_tag_mask, tag, srcRank, comm_id); + ucc_coll_req_h request = comm->recv_nb( tensor.data_ptr(), ucs_mtype_map.at(tensor.device().type()), - size, - srcRank, - tag, - &req, - TORCH_UCX_P2P_TAG); - if (st < 0) { - throw std::runtime_error("ProcessGroupUCC: failed to recv msg"); - } - - return c10::make_intrusive(req, ucx_comm); + tensor.numel() * tensor.element_size(), + ucp_tag, + ucp_tag_mask); + return comm->enqueue_p2p(OpType::RECV, request); } c10::intrusive_ptr ProcessGroupUCC::recvAnysource( @@ -498,24 +672,18 @@ c10::intrusive_ptr ProcessGroupUCC::recvAnysource( int tag) { check_tensor(tensors); auto& tensor = tensors[0]; - size_t size = tensor.numel() * tensor.element_size(); - torch_ucx_request_t* req; - torch_ucx_status_t st; + initComm(tensor.device()); - st = torch_ucx_recv_nb( - ucx_comm, + ucp_tag_t ucp_tag, ucp_tag_mask; + TORCH_UCX_MAKE_RECV_TAG( + ucp_tag, ucp_tag_mask, tag, TORCH_UCX_ANY_SOURCE, comm_id); + ucc_coll_req_h request = comm->recv_nb( tensor.data_ptr(), ucs_mtype_map.at(tensor.device().type()), - size, - TORCH_UCX_ANY_SOURCE, - tag, - &req, - TORCH_UCX_P2P_TAG); - if (st < 0) { - throw std::runtime_error("ProcessGroupUCC: failed to recv msg"); - } - - return c10::make_intrusive(req, ucx_comm); + tensor.numel() * tensor.element_size(), + ucp_tag, + ucp_tag_mask); + return comm->enqueue_p2p(OpType::RECVANYSOURCE, request); } c10::intrusive_ptr ProcessGroupUCC::createProcessGroupUCC( @@ -526,6 +694,25 @@ c10::intrusive_ptr ProcessGroupUCC::createProcessGroupUCC( return c10::make_intrusive(store, rank, size); } +void ProcessGroupUCC::initComm(c10::Device dev) { + if (!comm) { + comm = CommPG::get_comm(comm_id, dev, size_); + comm->ucx_connect_eps(eps, rank_, size_, store_); + comm->ucc_create_team(team, rank_, size_, store_); + } else { + if (dev.is_cuda()) { + if ((comm->cuda_device_index != TORCH_UCC_DEVICE_NOT_SET) && + (comm->cuda_device_index != dev.index())) { + LOG(ERROR) + << "ucc communicator was initialized with different cuda device," + << "multi device is not supported"; + throw std::runtime_error(ucc_status_string(UCC_ERR_NOT_SUPPORTED)); + } + comm->cuda_device_index = dev.index(); + } + } +} + PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { m.def("createProcessGroupUCC", &ProcessGroupUCC::createProcessGroupUCC); } diff --git a/src/torch_ucc_sendrecv.cpp b/src/torch_ucc_sendrecv.cpp deleted file mode 100644 index 0b9ee0b..0000000 --- a/src/torch_ucc_sendrecv.cpp +++ /dev/null @@ -1,188 +0,0 @@ -/** - * * Copyright (C) Mellanox Technologies Ltd. 2001-2020. ALL RIGHTS RESERVED. - * * - * * See file LICENSE for terms. 
- * */ - -#include "torch_ucc_sendrecv.hpp" - -namespace c10d { - -static void torch_ucx_req_init(void* request) { - torch_ucx_request_t* req = static_cast(request); - req->status = TORCH_UCX_REQUEST_ACTIVE; -} - -static void torch_ucx_req_cleanup(void* request) {} - -torch_ucx_status_t torch_ucx_comm_init( - torch_ucx_comm_t** ucx_comm, - int size, - int rank, - const c10::intrusive_ptr& store) { - ucp_params_t params; - ucp_config_t* config; - ucs_status_t st; - torch_ucx_comm_t* comm; - ucp_worker_params_t worker_params; - ucp_address_t* local_addr; - size_t local_addr_len; - std::string key; - std::vector val; - ucp_worker_attr_t worker_attr; - - comm = new torch_ucx_comm_t; - comm->rank = rank; - comm->size = size; - - st = ucp_config_read("TORCH", nullptr, &config); - if (st != UCS_OK) { - fprintf( - stderr, - "TorchUCC: failed to read ucp config %s\n", - ucs_status_string(st)); - goto free_comm; - } - - memset(¶ms, 0, sizeof(ucp_params_t)); - params.field_mask = UCP_PARAM_FIELD_FEATURES | UCP_PARAM_FIELD_REQUEST_SIZE | - UCP_PARAM_FIELD_ESTIMATED_NUM_EPS | UCP_PARAM_FIELD_TAG_SENDER_MASK | - UCP_PARAM_FIELD_REQUEST_INIT | UCP_PARAM_FIELD_REQUEST_CLEANUP; - params.request_size = sizeof(torch_ucx_request_t); - params.features = UCP_FEATURE_TAG; - params.estimated_num_eps = size; - params.request_init = torch_ucx_req_init; - params.request_cleanup = torch_ucx_req_cleanup; - params.tag_sender_mask = TORCH_UCX_RANK_MASK; - st = ucp_init(¶ms, config, &comm->ctx); - ucp_config_release(config); - if (st != UCS_OK) { - fprintf( - stderr, - "TorchUCC: failed to init ucp context %s\n", - ucs_status_string(st)); - goto free_comm; - } - - memset(&worker_params, 0, sizeof(ucp_worker_params_t)); - worker_params.field_mask = UCP_WORKER_PARAM_FIELD_THREAD_MODE; - worker_params.thread_mode = UCS_THREAD_MODE_MULTI; - st = ucp_worker_create(comm->ctx, &worker_params, &comm->worker); - if (st != UCS_OK) { - fprintf( - stderr, - "TorchUCC: failed to init ucp worker %s\n", - ucs_status_string(st)); - goto close_ctx; - } - - worker_attr.field_mask = UCP_WORKER_ATTR_FIELD_THREAD_MODE; - ucp_worker_query(comm->worker, &worker_attr); - if (worker_attr.thread_mode != UCS_THREAD_MODE_MULTI) { - fprintf(stderr, "TorchUCC: Thread mode multi is not supported\n"); - } - - st = ucp_worker_get_address(comm->worker, &local_addr, &local_addr_len); - if (st != UCS_OK) { - fprintf( - stderr, - "TorchUCC: failed to get ucp worker address %s\n", - ucs_status_string(st)); - goto close_worker; - } - - key = "wa" + std::to_string(rank); - val = std::vector( - reinterpret_cast(local_addr), - reinterpret_cast(local_addr) + local_addr_len); - store->set(key, val); - ucp_worker_release_address(comm->worker, local_addr); - comm->eps = new ucp_ep_h[size]; - for (int i = 0; i < size; i++) { - std::vector peer_addr = store->get("wa" + std::to_string(i)); - ucp_ep_params_t ep_params; - - ep_params.field_mask = UCP_EP_PARAM_FIELD_REMOTE_ADDRESS; - ep_params.address = reinterpret_cast(peer_addr.data()); - st = ucp_ep_create(comm->worker, &ep_params, &(comm->eps[i])); - if (st != UCS_OK) { - fprintf( - stderr, - "TorchUCC: failed to create ucp ep %s\n", - ucs_status_string(st)); - goto close_ep; - } - } - - *ucx_comm = comm; - return TORCH_UCX_OK; - -close_ep: - delete[] comm->eps; -close_worker: - ucp_worker_destroy(comm->worker); -close_ctx: - ucp_cleanup(comm->ctx); -free_comm: - delete comm; - *ucx_comm = nullptr; - return TORCH_UCX_ERROR; -} - -void torch_ucx_comm_close( - torch_ucx_comm_t* comm, - const c10::intrusive_ptr& store) { 
- ucs_status_ptr_t close_req; - ucs_status_t st; - - if (!comm) { - return; - } - - for (int i = 0; i < comm->size; i++) { - close_req = ucp_ep_close_nb(comm->eps[i], UCP_EP_CLOSE_MODE_FLUSH); - if (UCS_PTR_IS_ERR(close_req)) { - return; - } - if (UCS_PTR_IS_PTR(close_req)) { - do { - ucp_worker_progress(comm->worker); - st = ucp_request_check_status(close_req); - } while (st != UCS_OK); - ucp_request_free(close_req); - } - } - - auto key_ep_closed = "epclosed"; - auto num_closed_ep = store->add(key_ep_closed, 1); - - std::vector key_finished{"finished"}; - if (num_closed_ep == comm->size) { - store->add(key_finished[0], 1); - } else { - store->wait(key_finished); - } - delete[] comm->eps; - ucp_worker_destroy(comm->worker); - ucp_cleanup(comm->ctx); - delete comm; -} - -void torch_ucx_send_cmpl_cb( - void* request, - ucs_status_t status, - void* user_data) { - torch_ucx_request_t* req = static_cast(request); - req->status = TORCH_UCX_REQUEST_DONE; -} - -void torch_ucx_recv_cmpl_cb( - void* request, - ucs_status_t status, - const ucp_tag_recv_info_t* info, - void* user_data) { - torch_ucx_request_t* req = static_cast(request); - req->status = TORCH_UCX_REQUEST_DONE; -} - -} // namespace c10d diff --git a/src/torch_xccl.cpp b/src/torch_xccl.cpp deleted file mode 100644 index 75a817e..0000000 --- a/src/torch_xccl.cpp +++ /dev/null @@ -1,646 +0,0 @@ -/** - * * Copyright (C) Mellanox Technologies Ltd. 2020-2021. ALL RIGHTS RESERVED. - * * - * * See file LICENSE for terms. - * */ - -#include -#include -#include - -namespace c10d { - -struct xccl_oob_allgather_req_t { - xccl_ep_range_t range; - void* sbuf; - void* rbuf; - void* oob_coll_ctx; - int my_rank; - size_t msglen; - int iter; - int num_active_reqs; - torch_ucx_request_t* reqs[2]; - int done; -}; - -static xccl_status_t oob_allgather_test(void* req) { - xccl_oob_allgather_req_t* oob_req = - static_cast(req); - int rank, size, sendto, recvfrom, recvdatafrom, senddatafrom; - torch_ucx_comm_t* oob_ctx = - static_cast(oob_req->oob_coll_ctx); - char *tmpsend = nullptr, *tmprecv = nullptr; - size_t msglen = oob_req->msglen; - torch_ucx_status_t st; - - if (oob_req->done) { - return XCCL_OK; - } - - if (oob_req->range.type == XCCL_EP_RANGE_UNDEFINED) { - size = oob_ctx->size; - rank = oob_ctx->rank; - } else { - size = oob_req->range.ep_num; - rank = oob_req->my_rank; - } - - if (oob_req->iter == 0) { - tmprecv = (char*)oob_req->rbuf + (ptrdiff_t)(rank * msglen); - memcpy(tmprecv, oob_req->sbuf, msglen); - } - sendto = (rank + 1) % size; - recvfrom = (rank - 1 + size) % size; - if (oob_req->range.type != XCCL_EP_RANGE_UNDEFINED) { - sendto = xccl_range_to_rank(oob_req->range, sendto); - recvfrom = xccl_range_to_rank(oob_req->range, recvfrom); - } - for (; oob_req->iter < size - 1; oob_req->iter++) { - if (oob_req->iter > 0) { - st = torch_ucx_req_test( - oob_ctx, - oob_req->reqs, - oob_req->num_active_reqs, - nullptr, - 1, - oob_req->num_active_reqs); - if (st == TORCH_UCX_INPROGRESS) { - return XCCL_INPROGRESS; - } - oob_req->num_active_reqs = 0; - } - recvdatafrom = (rank - oob_req->iter - 1 + size) % size; - senddatafrom = (rank - oob_req->iter + size) % size; - tmprecv = (char*)oob_req->rbuf + (ptrdiff_t)(recvdatafrom * msglen); - tmpsend = (char*)oob_req->rbuf + (ptrdiff_t)(senddatafrom * msglen); - - torch_ucx_send_nb( - oob_ctx, - tmpsend, - UCS_MEMORY_TYPE_HOST, - msglen, - sendto, - 1, - &oob_req->reqs[0], - TORCH_UCX_OOB_TAG); - - torch_ucx_recv_nb( - oob_ctx, - tmprecv, - UCS_MEMORY_TYPE_HOST, - msglen, - recvfrom, - 1, - 
&oob_req->reqs[1], - TORCH_UCX_OOB_TAG); - oob_req->num_active_reqs += 2; - } - - st = torch_ucx_req_test( - oob_ctx, - oob_req->reqs, - oob_req->num_active_reqs, - nullptr, - 1, - oob_req->num_active_reqs); - if (st == TORCH_UCX_INPROGRESS) { - return XCCL_INPROGRESS; - } - - oob_req->done = 1; - return XCCL_OK; -} - -static xccl_status_t oob_allgather_free(void* req) { - xccl_oob_allgather_req_t* request = - static_cast(req); - delete request; - - return XCCL_OK; -} - -static int oob_allgather( - void* sbuf, - void* rbuf, - size_t msglen, - int my_rank, - xccl_ep_range_t range, - void* oob_coll_ctx, - void** req) { - xccl_oob_allgather_req_t* oob_req = new (xccl_oob_allgather_req_t); - - oob_req->sbuf = sbuf; - oob_req->rbuf = rbuf; - oob_req->msglen = msglen; - oob_req->range = range; - oob_req->oob_coll_ctx = oob_coll_ctx; - oob_req->my_rank = my_rank; - oob_req->iter = 0; - oob_req->num_active_reqs = 0; - oob_req->done = 0; - - *req = oob_req; - - return oob_allgather_test(oob_req); -} - -static inline xccl_tl_id_t xccl_tls_str_to_bitmap(const char* tls_str) { - uint64_t tls = 0; - - if (!tls_str) { - return (xccl_tl_id_t)tls; - } - - for (uint64_t i = 1; i < (uint64_t)XCCL_TL_LAST; i = i << 1) { - if (strstr(tls_str, xccl_tl_str((xccl_tl_id_t)i))) { - tls = tls | i; - } - } - - return (xccl_tl_id_t)tls; -} - -torch_ucc_status_t torch_xccl_comm_init( - torch_ucx_comm_t* p2p_comm, - torch_ucc_coll_config_t* coll_config, - torch_ucc_coll_comm_t** comm) { - torch_xccl_comm_t* xccl_comm; - xccl_lib_params_t lib_params; - xccl_lib_config_t* cfg; - xccl_status_t st; - char* tls_str; - - xccl_comm = new torch_xccl_comm_t; - xccl_comm->p2p_comm = p2p_comm; - memset(&lib_params, 0, sizeof(lib_params)); - lib_params.field_mask = - XCCL_LIB_PARAM_FIELD_TEAM_USAGE | XCCL_LIB_PARAM_FIELD_COLL_TYPES; - - lib_params.team_usage = XCCL_LIB_PARAMS_TEAM_USAGE_SW_COLLECTIVES | - XCCL_LIB_PARAMS_TEAM_USAGE_HW_COLLECTIVES; - - lib_params.coll_types = XCCL_COLL_CAP_BCAST | XCCL_COLL_CAP_ALLREDUCE | - XCCL_COLL_CAP_ALLTOALL | XCCL_COLL_CAP_ALLTOALLV; - - cfg = nullptr; - st = xccl_lib_init(&lib_params, cfg, &xccl_comm->xccl_lib); - if (st != XCCL_OK) { - fprintf(stderr, "TorchUCC: failed to init XCCL lib\n"); - goto free_comm; - } - - xccl_context_config_t* ctx_config; - uint64_t tls; - tls_str = getenv("TORCH_UCC_TLS"); - if (tls_str) { - tls = xccl_tls_str_to_bitmap(tls_str); - } else { - tls = XCCL_TL_ALL; - } - - st = - xccl_context_config_read(xccl_comm->xccl_lib, "TORCH", nullptr, &ctx_config); - if (st != XCCL_OK) { - fprintf(stderr, "TorchUCC: failed to read XCCL context config\n"); - goto free_lib; - } - if (tls & XCCL_TL_UCX) { - xccl_tl_id_t tl = XCCL_TL_UCX; - if (coll_config->blocking_wait) { - xccl_context_config_modify(&tl, ctx_config, "BLOCK_STREAM", "no"); - } else { - xccl_context_config_modify(&tl, ctx_config, "BLOCK_STREAM", "yes"); - } - } - - xccl_context_params_t ctx_params; - - ctx_params.field_mask = XCCL_CONTEXT_PARAM_FIELD_THREAD_MODE | - XCCL_CONTEXT_PARAM_FIELD_OOB | - XCCL_CONTEXT_PARAM_FIELD_TEAM_COMPLETION_TYPE | - XCCL_CONTEXT_PARAM_FIELD_TLS; - ctx_params.thread_mode = XCCL_THREAD_MODE_MULTIPLE; - ctx_params.completion_type = XCCL_TEAM_COMPLETION_TYPE_BLOCKING; - ctx_params.tls = tls; - ctx_params.oob.allgather = oob_allgather; - ctx_params.oob.req_test = oob_allgather_test; - ctx_params.oob.req_free = oob_allgather_free; - ctx_params.oob.coll_context = static_cast(p2p_comm); - ctx_params.oob.rank = p2p_comm->rank; - ctx_params.oob.size = p2p_comm->size; - - st = 
xccl_context_create( - xccl_comm->xccl_lib, &ctx_params, ctx_config, &xccl_comm->xccl_ctx); - xccl_context_config_release(ctx_config); - if (st != XCCL_OK) { - fprintf(stderr, "TorchUCC: failed to create XCCL context\n"); - goto free_lib; - } - - xccl_team_params_t team_params; - - team_params.field_mask = - XCCL_TEAM_PARAM_FIELD_EP_RANGE | XCCL_TEAM_PARAM_FIELD_OOB; - - team_params.range.type = XCCL_EP_RANGE_STRIDED; - team_params.range.strided.start = 0; - team_params.range.strided.stride = 1; - team_params.oob.allgather = oob_allgather; - team_params.oob.req_test = oob_allgather_test; - team_params.oob.req_free = oob_allgather_free; - team_params.oob.coll_context = static_cast(p2p_comm); - team_params.oob.rank = p2p_comm->rank; - team_params.oob.size = p2p_comm->size; - - st = xccl_team_create_post( - xccl_comm->xccl_ctx, &team_params, &xccl_comm->xccl_team); - if (st != XCCL_OK) { - fprintf(stderr, "TorchUCC: failed to create XCCL team\n"); - goto free_context; - } - while (XCCL_INPROGRESS == xccl_team_create_test(xccl_comm->xccl_team)) { - }; -#ifdef USE_CUDA - xccl_comm->super.stream = nullptr; -#endif - xccl_comm->super.config = *coll_config; - if (p2p_comm->rank == 0) { - //TODO: add TLS configuration print - LOG(INFO) << "ProcessGroupUCC initialized with following options:" - << "\nTORCH_UCC_BLOCKING_WAIT: " << - xccl_comm->super.config.blocking_wait << - "\nTORCH_UCC_HIGH_PRIORITY_STREAM: " << - xccl_comm->super.config.high_priority_stream; - } - *comm = (torch_ucc_coll_comm_t*)xccl_comm; - - return TORCH_UCC_OK; -free_context: - xccl_context_destroy(xccl_comm->xccl_ctx); -free_lib: - xccl_lib_cleanup(xccl_comm->xccl_lib); -free_comm: - delete xccl_comm; - *comm = nullptr; - return TORCH_UCC_ERROR; -} - -torch_ucc_status_t torch_xccl_comm_close(torch_ucc_coll_comm_t* comm) { - torch_xccl_comm_t* xccl_comm = (torch_xccl_comm_t*)comm; - - xccl_team_destroy(xccl_comm->xccl_team); - xccl_context_destroy(xccl_comm->xccl_ctx); - xccl_lib_cleanup(xccl_comm->xccl_lib); - delete xccl_comm; - - return TORCH_UCC_OK; -} - -const std::map xccl_op_map = { - {ReduceOp::MIN, XCCL_OP_MIN}, - {ReduceOp::MAX, XCCL_OP_MAX}, - {ReduceOp::SUM, XCCL_OP_SUM}, - {ReduceOp::PRODUCT, XCCL_OP_PROD}, -}; - -const std::map xccl_type_map = { - {at::kByte, XCCL_DT_UINT8}, - {at::kChar, XCCL_DT_INT8}, - {at::kHalf, XCCL_DT_FLOAT16}, - {at::kDouble, XCCL_DT_FLOAT64}, - {at::kFloat, XCCL_DT_FLOAT32}, - {at::kInt, XCCL_DT_INT32}, - {at::kLong, XCCL_DT_INT64}, -}; - -std::map xccl_collective_name = { - {XCCL_BARRIER, "Barrier"}, - {XCCL_BCAST, "Broadcast"}, - {XCCL_ALLREDUCE, "Allreduce"}, - {XCCL_ALLTOALL, "Alltoall"}, - {XCCL_ALLTOALLV, "Alltoallv"}, - {XCCL_ALLGATHER, "Allgather"}, -}; - -static void coll_args_init_with_stream(xccl_coll_op_args_t *coll_args, - torch_xccl_comm_t* xccl_comm, - torch_xccl_request_t* coll_req) { -#ifdef USE_CUDA - if (!coll_req->super.device.is_cuda()) { - return; - } - coll_req->stream = xccl_comm->super.stream->stream(); - coll_args->field_mask |= XCCL_COLL_OP_ARGS_FIELD_STREAM; - coll_args->stream.type = XCCL_STREAM_TYPE_CUDA; - coll_args->stream.stream = (void*)(&coll_req->stream); -#endif -} - -static torch_ucc_status_t xccl_init_and_post(xccl_coll_op_args_t *args, - xccl_team_h team, - torch_xccl_request_t *req) -{ - xccl_status_t st; - - st = xccl_collective_init(args, &req->request, team); - if (st != XCCL_OK) { - fprintf(stderr, "TorchUCC: XCCL %s init failed (%d)\n", - xccl_collective_name[args->coll_type], st); - return TORCH_UCC_ERROR; - } - st = 
xccl_collective_post(req->request);
-  if (st != XCCL_OK) {
-    fprintf(stderr, "TorchUCC: XCCL %s post failed (%d)\n",
-            xccl_collective_name[args->coll_type], st);
-    xccl_collective_finalize(req->request);
-    return TORCH_UCC_ERROR;
-  }
-  req->status = TORCH_UCC_INPROGRESS;
-#ifdef USE_CUDA
-  /* Record event that later can be used for fence */
-  if ((req->super.device.is_cuda()) &&
-      (!req->super.coll_comm->config.blocking_wait)) {
-    req->super.coll_finished->record(*req->super.coll_comm->stream);
-  }
-#endif
-  return TORCH_UCC_OK;
-}
-
-torch_ucc_status_t torch_xccl_allgather(
-    torch_ucc_coll_comm_t* coll_comm,
-    std::vector<at::Tensor>& input_tensors,
-    std::vector<at::Tensor>& output_tensors,
-    torch_ucc_coll_request_t** request) {
-  torch_xccl_comm_t* xccl_comm = (torch_xccl_comm_t*)coll_comm;
-  xccl_coll_op_args_t coll_args;
-  torch_xccl_request_t* coll_req;
-  size_t buf_len;
-
-  coll_req = new torch_xccl_request_t;
-  torch_ucc_coll_request_init(
-      coll_comm,
-      (torch_ucc_coll_request_t*)coll_req,
-      &input_tensors,
-      &output_tensors);
-  coll_req->coll_type = XCCL_ALLGATHER;
-  coll_req->flat_tensor = newLikeFlat(output_tensors);
-
-  buf_len = input_tensors[0].element_size() * input_tensors[0].numel() *
-      xccl_comm->p2p_comm->size;
-  coll_args.field_mask = 0;
-  coll_args.coll_type = XCCL_ALLGATHER;
-  coll_args.buffer_info.src_buffer = input_tensors[0].data_ptr();
-  coll_args.buffer_info.dst_buffer = coll_req->flat_tensor.data_ptr();
-  coll_args.buffer_info.len = buf_len;
-  coll_args.alg.set_by_user = 0;
-  coll_args_init_with_stream(&coll_args, xccl_comm, coll_req);
-  if (xccl_init_and_post(&coll_args, xccl_comm->xccl_team, coll_req) != TORCH_UCC_OK)
-  {
-    delete coll_req;
-    return TORCH_UCC_ERROR;
-  }
-  *request = (torch_ucc_coll_request_t*)coll_req;
-  return TORCH_UCC_OK;
-}
-
-torch_ucc_status_t torch_xccl_alltoall(
-    torch_ucc_coll_comm_t* coll_comm,
-    at::Tensor& input_tensor,
-    at::Tensor& output_tensor,
-    torch_ucc_coll_request_t** request) {
-  torch_xccl_comm_t* xccl_comm = (torch_xccl_comm_t*)coll_comm;
-  xccl_coll_op_args_t coll_args;
-  torch_xccl_request_t* coll_req;
-  size_t buf_len;
-
-  coll_req = new torch_xccl_request_t;
-  std::vector<at::Tensor> input_tensors = {input_tensor};
-  std::vector<at::Tensor> output_tensors = {output_tensor};
-  torch_ucc_coll_request_init(
-      coll_comm,
-      (torch_ucc_coll_request_t*)coll_req,
-      &input_tensors,
-      &output_tensors);
-  coll_req->coll_type = XCCL_ALLTOALL;
-  coll_req->status = TORCH_UCC_OPERATION_INITIALIZED;
-
-  buf_len = input_tensor.element_size() * input_tensor.numel() /
-      xccl_comm->p2p_comm->size;
-  coll_args.field_mask = 0;
-  coll_args.coll_type = XCCL_ALLTOALL;
-  coll_args.buffer_info.src_buffer = input_tensor.data_ptr();
-  coll_args.buffer_info.dst_buffer = output_tensor.data_ptr();
-  coll_args.buffer_info.len = buf_len;
-  coll_args.alg.set_by_user = 0;
-  coll_args_init_with_stream(&coll_args, xccl_comm, coll_req);
-  if (xccl_init_and_post(&coll_args, xccl_comm->xccl_team, coll_req) != TORCH_UCC_OK)
-  {
-    delete coll_req;
-    return TORCH_UCC_ERROR;
-  }
-  *request = (torch_ucc_coll_request_t*)coll_req;
-  return TORCH_UCC_OK;
-}
-
-torch_ucc_status_t torch_xccl_alltoallv(
-    torch_ucc_coll_comm_t* coll_comm,
-    at::Tensor& input_tensor,
-    uint32_t* send_lengths,
-    uint32_t* send_offsets,
-    at::Tensor& output_tensor,
-    uint32_t* recv_lengths,
-    uint32_t* recv_offsets,
-    torch_ucc_coll_request_t** request) {
-  torch_xccl_comm_t* xccl_comm = (torch_xccl_comm_t*)coll_comm;
-  xccl_coll_op_args_t coll_args;
-  torch_xccl_request_t* coll_req;
-
-  coll_req = new torch_xccl_request_t;
-  std::vector<at::Tensor> input_tensors = {input_tensor};
-  std::vector<at::Tensor> output_tensors = {output_tensor};
-  torch_ucc_coll_request_init(
-      coll_comm,
-      (torch_ucc_coll_request_t*)coll_req,
-      &input_tensors,
-      &output_tensors);
-  coll_req->coll_type = XCCL_ALLTOALLV;
-  coll_req->status = TORCH_UCC_OPERATION_INITIALIZED;
-
-  coll_args.field_mask = 0;
-  coll_args.coll_type = XCCL_ALLTOALLV;
-  coll_args.buffer_info.src_buffer = input_tensor.data_ptr();
-  coll_args.buffer_info.src_displacements = send_offsets;
-  coll_args.buffer_info.src_counts = send_lengths;
-  coll_args.buffer_info.src_datatype =
-      xccl_type_map.at(input_tensor.scalar_type());
-  coll_args.buffer_info.dst_buffer = output_tensor.data_ptr();
-  coll_args.buffer_info.dst_displacements = recv_offsets;
-  coll_args.buffer_info.dst_counts = recv_lengths;
-  coll_args.buffer_info.dst_datatype =
-      xccl_type_map.at(output_tensor.scalar_type());
-  coll_args.alg.set_by_user = 0;
-  coll_args_init_with_stream(&coll_args, xccl_comm, coll_req);
-  if (xccl_init_and_post(&coll_args, xccl_comm->xccl_team, coll_req) != TORCH_UCC_OK)
-  {
-    delete coll_req;
-    return TORCH_UCC_ERROR;
-  }
-  *request = (torch_ucc_coll_request_t*)coll_req;
-  return TORCH_UCC_OK;
-}
-
-torch_ucc_status_t torch_xccl_allreduce(
-    torch_ucc_coll_comm_t* coll_comm,
-    std::vector<at::Tensor>& tensors,
-    const AllreduceOptions& opts,
-    torch_ucc_coll_request_t** request) {
-  torch_xccl_comm_t* xccl_comm = (torch_xccl_comm_t*)coll_comm;
-  xccl_coll_op_args_t coll_args;
-  torch_xccl_request_t* coll_req;
-
-  coll_req = new torch_xccl_request_t;
-  torch_ucc_coll_request_init(
-      coll_comm, (torch_ucc_coll_request_t*)coll_req, &tensors, nullptr);
-  coll_req->coll_type = XCCL_ALLREDUCE;
-  coll_req->status = TORCH_UCC_OPERATION_INITIALIZED;
-
-  coll_args.field_mask = 0;
-  coll_args.coll_type = XCCL_ALLREDUCE;
-  coll_args.buffer_info.src_buffer = tensors[0].data_ptr();
-  coll_args.buffer_info.dst_buffer = tensors[0].data_ptr();
-  coll_args.buffer_info.len = tensors[0].numel() * tensors[0].element_size();
-  coll_args.reduce_info.dt = xccl_type_map.at(tensors[0].scalar_type());
-  coll_args.reduce_info.op = xccl_op_map.at(opts.reduceOp);
-  coll_args.reduce_info.count = tensors[0].numel();
-  coll_args.alg.set_by_user = 0;
-  coll_args_init_with_stream(&coll_args, xccl_comm, coll_req);
-  if (xccl_init_and_post(&coll_args, xccl_comm->xccl_team, coll_req) != TORCH_UCC_OK)
-  {
-    delete coll_req;
-    return TORCH_UCC_ERROR;
-  }
-  *request = (torch_ucc_coll_request_t*)coll_req;
-  return TORCH_UCC_OK;
-}
-
-torch_ucc_status_t torch_xccl_barrier(
-    torch_ucc_coll_comm_t* coll_comm,
-    torch_ucc_coll_request_t** request) {
-  torch_xccl_comm_t* xccl_comm = (torch_xccl_comm_t*)coll_comm;
-  xccl_coll_op_args_t coll_args;
-  torch_xccl_request_t* coll_req;
-
-  coll_req = new torch_xccl_request_t;
-  torch_ucc_coll_request_init(
-      coll_comm, (torch_ucc_coll_request_t*)coll_req, nullptr, nullptr);
-  coll_req->coll_type = XCCL_BARRIER;
-  coll_req->status = TORCH_UCC_OPERATION_INITIALIZED;
-
-  coll_args.field_mask = 0;
-  coll_args.coll_type = XCCL_BARRIER;
-  coll_args.alg.set_by_user = 0;
-  if (xccl_init_and_post(&coll_args, xccl_comm->xccl_team, coll_req) != TORCH_UCC_OK)
-  {
-    delete coll_req;
-    return TORCH_UCC_ERROR;
-  }
-  *request = (torch_ucc_coll_request_t*)coll_req;
-  return TORCH_UCC_OK;
-}
-
-torch_ucc_status_t torch_xccl_broadcast(
-    torch_ucc_coll_comm_t* coll_comm,
-    std::vector<at::Tensor>& tensors,
-    int root,
-    torch_ucc_coll_request_t** request) {
-  torch_xccl_comm_t* xccl_comm = (torch_xccl_comm_t*)coll_comm;
-  xccl_coll_op_args_t coll_args;
-  torch_xccl_request_t* coll_req;
-
-  coll_req = new torch_xccl_request_t;
-  torch_ucc_coll_request_init(
-      coll_comm, (torch_ucc_coll_request_t*)coll_req, &tensors, nullptr);
-  coll_req->coll_type = XCCL_BCAST;
-  coll_req->status = TORCH_UCC_OPERATION_INITIALIZED;
-
-  coll_args.field_mask = 0;
-  coll_args.coll_type = XCCL_BCAST;
-  coll_args.buffer_info.src_buffer = tensors[0].data_ptr();
-  coll_args.buffer_info.dst_buffer = tensors[0].data_ptr();
-  coll_args.buffer_info.len = tensors[0].numel() * tensors[0].element_size();
-  coll_args.root = root;
-  coll_args.alg.set_by_user = 0;
-  coll_args_init_with_stream(&coll_args, xccl_comm, coll_req);
-  if (xccl_init_and_post(&coll_args, xccl_comm->xccl_team, coll_req) != TORCH_UCC_OK)
-  {
-    delete coll_req;
-    return TORCH_UCC_ERROR;
-  }
-  *request = (torch_ucc_coll_request_t*)coll_req;
-  return TORCH_UCC_OK;
-}
-
-torch_ucc_status_t torch_xccl_progress(torch_ucc_coll_request_t* request) {
-  torch_xccl_request_t* req = (torch_xccl_request_t*)request;
-  torch_xccl_comm_t* xccl_comm = (torch_xccl_comm_t*)request->coll_comm;
-  xccl_status_t st;
-
-  xccl_context_progress(xccl_comm->xccl_ctx);
-  st = xccl_collective_test(req->request);
-  if (st != XCCL_INPROGRESS) {
-    if (st != XCCL_OK) {
-      fprintf(stderr, "TorchUCC: context progress failed (%d)\n", st);
-      req->status = TORCH_UCC_ERROR;
-      return TORCH_UCC_ERROR;
-    }
-    if (req->coll_type == XCCL_ALLGATHER) {
-      int comm_size = xccl_comm->p2p_comm->size;
-      std::vector<at::Tensor>& output_vec = req->super.dst;
-      for (int i = 0; i < comm_size; ++i) {
-        output_vec[i].copy_(req->flat_tensor[i]);
-      }
-    }
-    xccl_collective_finalize(req->request);
-    req->status = TORCH_UCC_OK;
-  }
-
-  return TORCH_UCC_OK;
-}
-
-torch_ucc_status_t torch_xccl_test(torch_ucc_coll_request_t* request) {
-  torch_xccl_request_t* req = (torch_xccl_request_t*)request;
-
-  return req->status;
-}
-
-torch_ucc_status_t torch_xccl_fence(torch_ucc_coll_request_t* request) {
-#ifdef USE_CUDA
-  torch_xccl_request_t* req = (torch_xccl_request_t*)request;
-
-  if (req->status == TORCH_UCC_INPROGRESS) {
-    auto current_stream = at::cuda::getCurrentCUDAStream(req->super.device.index());
-    req->super.coll_finished->block(current_stream);
-  }
-#endif
-  return TORCH_UCC_OK;
-}
-
-torch_ucc_status_t torch_xccl_free(torch_ucc_coll_request_t* request) {
-  torch_xccl_request_t* req = (torch_xccl_request_t*)request;
-  delete req;
-  return TORCH_UCC_OK;
-}
-
-torch_ucc_coll_ops_t xccl_coll_ops{torch_xccl_comm_init,
-                                   torch_xccl_allgather,
-                                   torch_xccl_alltoall,
-                                   torch_xccl_alltoallv,
-                                   torch_xccl_allreduce,
-                                   torch_xccl_barrier,
-                                   torch_xccl_broadcast,
-                                   torch_xccl_progress,
-                                   torch_xccl_test,
-                                   torch_xccl_fence,
-                                   torch_xccl_free,
-                                   torch_xccl_comm_close};
-
-} // namespace c10d
diff --git a/test/torch_alltoallv_test.py b/test/torch_alltoallv_test.py
index 548bef2..58b1bc6 100644
--- a/test/torch_alltoallv_test.py
+++ b/test/torch_alltoallv_test.py
@@ -3,6 +3,7 @@
 #
 
 from torch_ucc_test_setup import *
+import torch.autograd.profiler as profiler
 import numpy as np
 
 args = parse_test_args()
@@ -25,15 +26,14 @@
     recv_tensor = get_tensor(output_size[comm_rank], args.use_cuda)
     recv_tensor_test = get_tensor(output_size[comm_rank], args.use_cuda)
     dist.all_to_all_single(recv_tensor, send_tensor,
-                          split[:, comm_rank],
-                          split[comm_rank, :])
+                           split[:, comm_rank],
+                           split[comm_rank, :])
     dist.all_to_all_single(recv_tensor_test, send_tensor,
-                          split[:, comm_rank],
-                          split[comm_rank, :],
-                          group=pg)
+                           split[:, comm_rank],
+                           split[comm_rank, :],
+                           group=pg)
     status = check_tensor_equal(recv_tensor, recv_tensor_test)
     dist.all_reduce(status, group=pg)
     print_test_result(status, "{}({})".format(count, input_size[comm_rank]), comm_rank, comm_size)
-
 if comm_rank == 0:
     print("Test alltoallv: succeeded")
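Note: the hunk above only imports torch.autograd.profiler, and the torch_ucc_test_setup.py change further below adds an --enable-prof switch; the series itself does not show how the two are wired together. A minimal sketch of what that wiring could look like, with the helper name assumed rather than taken from this patch:

    import torch.autograd.profiler as profiler

    def maybe_profile(fn, enable_prof):
        # Hypothetical helper: run a collective under the autograd profiler
        # only when the test was started with --enable-prof.
        if enable_prof:
            with profiler.profile() as prof:
                fn()
            print(prof.key_averages().table(sort_by="cpu_time_total"))
        else:
            fn()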
diff --git a/test/torch_init_test.py b/test/torch_init_test.py
new file mode 100644
index 0000000..3f8e482
--- /dev/null
+++ b/test/torch_init_test.py
@@ -0,0 +1,28 @@
+import os
+import random
+import torch
+import torch.distributed as dist
+import torch_ucc
+import time
+import sys
+
+comm_size = int(os.environ['OMPI_COMM_WORLD_SIZE'])
+comm_rank = int(os.environ['OMPI_COMM_WORLD_RANK'])
+
+os.environ['MASTER_PORT'] = '32167'
+os.environ['MASTER_ADDR'] = 'localhost'
+os.environ['RANK'] = str(comm_rank)
+os.environ['WORLD_SIZE'] = str(comm_size)
+dist.init_process_group('ucc', rank=comm_rank, world_size=comm_size)
+#dist.new_group(ranks=[0, 1], backend='ucc')
+# Each iteration: rank i announces itself after a random delay, then all ranks
+# synchronize on a barrier.
+for i in range(comm_size):
+    rand_sleep = random.randint(1, 1000)
+    time.sleep(rand_sleep/1000)
+    if i == comm_rank:
+        print("rank {} checks in".format(comm_rank))
+        sys.stdout.flush()
+    dist.barrier()
+dist.barrier()
+if comm_rank == 0:
+    print("Test barrier: succeeded")
diff --git a/test/torch_sendrecv_test.py b/test/torch_sendrecv_test.py
new file mode 100644
index 0000000..b783144
--- /dev/null
+++ b/test/torch_sendrecv_test.py
@@ -0,0 +1,28 @@
+import os
+import sys
+import torch
+import torch.distributed as dist
+import torch_ucc
+
+comm_size = int(os.environ['OMPI_COMM_WORLD_SIZE'])
+comm_rank = int(os.environ['OMPI_COMM_WORLD_RANK'])
+
+if comm_size != 2:
+    print("sendrecv test requires exactly 2 ranks")
+    sys.exit(0)
+
+os.environ['MASTER_PORT'] = '32167'
+os.environ['MASTER_ADDR'] = 'localhost'
+os.environ['RANK'] = str(comm_rank)
+os.environ['WORLD_SIZE'] = str(comm_size)
+dist.init_process_group('ucc', rank=comm_rank, world_size=comm_size)
+
+if comm_rank == 0:
+    t = torch.full([16], comm_rank + 1)
+    print("send: ", t)
+    dist.send(t, 1, tag=128)
+if comm_rank == 1:
+    t = torch.full([16], 0)
+    print("recv before: ", t)
+    dist.recv(t, 0, tag=128)
+    print("recv after: ", t)
\ No newline at end of file
diff --git a/test/torch_ucc_test_setup.py b/test/torch_ucc_test_setup.py
index 0dc1a64..61ee520 100644
--- a/test/torch_ucc_test_setup.py
+++ b/test/torch_ucc_test_setup.py
@@ -9,11 +9,11 @@
 import sys
 import os
 
-
 def parse_test_args():
     parser = argparse.ArgumentParser(description="PG UCC Test")
     parser.add_argument("--backend", type=str, default='mpi')
     parser.add_argument("--use-cuda", default=False, action='store_true')
+    parser.add_argument("--enable-prof", default=False, action='store_true')
 
     args = parser.parse_args()
     if args.use_cuda and not torch.cuda.is_available():
diff --git a/ucc b/ucc
index 45cdefa..7f75fad 160000
--- a/ucc
+++ b/ucc
@@ -1 +1 @@
-Subproject commit 45cdefa40010efe3b1634df0c881cb720ccf293d
+Subproject commit 7f75fad3f7a72e1053cbb4246a1cc7e62c75d4b3
diff --git a/ucx b/ucx
index 9184ba2..737b5c4 160000
--- a/ucx
+++ b/ucx
@@ -1 +1 @@
-Subproject commit 9184ba2ad7346985b14a56cce4b451059b649953
+Subproject commit 737b5c4edface2e33c2321ac88e83320cad598eb
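Note: the new tests read their rank and world size from OMPI_COMM_WORLD_RANK and OMPI_COMM_WORLD_SIZE, which Open MPI's mpirun exports to each process, so they are meant to be launched through mpirun; for example (rank count chosen to satisfy the sendrecv test's two-rank requirement, exact flags are illustrative):

    mpirun -np 2 python test/torch_sendrecv_test.py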
From d78acd421ebbd1176ec3ba1ed4c7cf175d7d67c4 Mon Sep 17 00:00:00 2001
From: artemry-nv
Date: Mon, 29 Mar 2021 22:42:15 +0300
Subject: [PATCH 8/8] Fixed merge issues

Signed-off-by: artemry-nv
---
 .ci/Dockerfile.centos8            |  58 ++++-----
 .ci/Dockerfile.ubuntu20.04        |  56 ++++----
 .ci/job_matrix.yaml               | 210 +++++++++++++++---------------
 .ci/scripts/build_xccl.sh         |   2 +-
 .ci/scripts/install_torch.sh      |  48 +++++--
 .ci/scripts/run_fb_dlrm.sh        |  77 -----------
 .ci/scripts/run_fb_dlrm_docker.sh |  60 ---------
 7 files changed, 199 insertions(+), 312 deletions(-)
 delete mode 100755 .ci/scripts/run_fb_dlrm.sh
 delete mode 100755 .ci/scripts/run_fb_dlrm_docker.sh

diff --git a/.ci/Dockerfile.centos8 b/.ci/Dockerfile.centos8
index 3586b3d..1a0fdf9 100644
--- a/.ci/Dockerfile.centos8
+++ b/.ci/Dockerfile.centos8
@@ -16,41 +16,41 @@ ENV UCC_INSTALL_DIR=${TORCH_UCC_BIN_DIR}/ucc/build
 ENV TORCH_UCC_PYTHON_VENV_DIR=${TORCH_UCC_BIN_DIR}/python/venv
 #==============================================================================
 RUN mkdir -p ${TORCH_UCC_SRC_DIR} && \
-mkdir -p ${TORCH_UCC_PKG_DIR} && \
-mkdir -p ${TORCH_UCC_BIN_DIR} && \
-mkdir -p ${TORCH_UCC_WORKLOADS_DIR} && \
-mkdir -p ${TORCH_UCC_PYTHON_VENV_DIR}
+    mkdir -p ${TORCH_UCC_PKG_DIR} && \
+    mkdir -p ${TORCH_UCC_BIN_DIR} && \
+    mkdir -p ${TORCH_UCC_WORKLOADS_DIR} && \
+    mkdir -p ${TORCH_UCC_PYTHON_VENV_DIR}
 COPY . ${TORCH_UCC_SRC_DIR}
 #==============================================================================
 RUN yum groupinstall -y \
-'Development Tools' \
-'Infiniband Support'
+    'Development Tools' \
+    'Infiniband Support'
 RUN yum config-manager --set-enabled powertools && yum install -y \
-cmake \
-numactl \
-numactl-devel \
-openmpi \
-openmpi-devel \
-openssh-server \
-protobuf-compiler \
-protobuf-devel \
-python36-devel \
-rdma-core-devel \
-vim
+    cmake \
+    numactl \
+    numactl-devel \
+    openmpi \
+    openmpi-devel \
+    openssh-server \
+    protobuf-compiler \
+    protobuf-devel \
+    python36-devel \
+    rdma-core-devel \
+    vim
 # Remove old UCX
 RUN rpm -e --nodeps ucx
 ENV PATH=/usr/lib64/openmpi/bin:$PATH
 RUN echo "export PATH=\"/usr/lib64/openmpi/bin:\$PATH\"" >> /etc/bashrc && \
-export LD_LIBRARY_PATH=\"/usr/lib64/openmpi/lib:\${LD_LIBRARY_PATH}\" >> /etc/bashrc
+    echo "export LD_LIBRARY_PATH=\"/usr/lib64/openmpi/lib:\${LD_LIBRARY_PATH}\"" >> /etc/bashrc
 #==============================================================================
 # Configure SSH
 RUN mkdir -p /var/run/sshd && \
-cat /etc/ssh/ssh_config | grep -v StrictHostKeyChecking > /etc/ssh/ssh_config.new && \
-echo " StrictHostKeyChecking no" >> /etc/ssh/ssh_config.new && \
-mv /etc/ssh/ssh_config.new /etc/ssh/ssh_config && \
-ssh-keygen -A && \
-rm -f /run/nologin
+    cat /etc/ssh/ssh_config | grep -v StrictHostKeyChecking > /etc/ssh/ssh_config.new && \
+    echo " StrictHostKeyChecking no" >> /etc/ssh/ssh_config.new && \
+    mv /etc/ssh/ssh_config.new /etc/ssh/ssh_config && \
+    ssh-keygen -A && \
+    rm -f /run/nologin
 #==============================================================================
 # Build UCX
 RUN ${TORCH_UCC_SRC_DIR}/.ci/scripts/build_ucx.sh
@@ -72,13 +72,13 @@ RUN ${TORCH_UCC_SRC_DIR}/.ci/scripts/install_torch.sh
 # TODO upstream the patches (if needed)
 WORKDIR ${TORCH_UCC_WORKLOADS_DIR}
 RUN git clone https://github.com/facebookresearch/dlrm.git && \
-cd ${TORCH_UCC_WORKLOADS_DIR}/dlrm && \
-git apply ${TORCH_UCC_SRC_DIR}/.ci/patches/dlrm/0001-Added-torch_ucc-support.patch && \
-git apply ${TORCH_UCC_SRC_DIR}/.ci/patches/dlrm/0002-Fixed-arg-list.patch && \
-pip3 install -r ${TORCH_UCC_WORKLOADS_DIR}/dlrm/requirements.txt && \
-pip3 install tensorboard
+    cd ${TORCH_UCC_WORKLOADS_DIR}/dlrm && \
+    git apply ${TORCH_UCC_SRC_DIR}/.ci/patches/dlrm/0001-Added-torch_ucc-support.patch && \
+    git apply ${TORCH_UCC_SRC_DIR}/.ci/patches/dlrm/0002-Fixed-arg-list.patch && \
+    pip3 install -r ${TORCH_UCC_WORKLOADS_DIR}/dlrm/requirements.txt && \
+    pip3 install tensorboard
 RUN git clone https://github.com/facebookresearch/param.git && \
-pip3 install -r ${TORCH_UCC_WORKLOADS_DIR}/param/requirements.txt
+    pip3 install -r ${TORCH_UCC_WORKLOADS_DIR}/param/requirements.txt
 #==============================================================================
 # Install torch_ucc (XCCL version) python module and build a wheel package
 RUN ${TORCH_UCC_SRC_DIR}/.ci/scripts/install_torch_xccl.sh
diff --git a/.ci/Dockerfile.ubuntu20.04 b/.ci/Dockerfile.ubuntu20.04
index d6ecd18..dcff626 100644
--- a/.ci/Dockerfile.ubuntu20.04
+++ b/.ci/Dockerfile.ubuntu20.04
@@ -15,37 +15,37 @@ ENV XCCL_BUILD_TYPE=debug
 ENV XCCL_INSTALL_DIR=${TORCH_UCC_BIN_DIR}/xccl/build-${XCCL_BUILD_TYPE}
 #==============================================================================
 RUN mkdir -p ${TORCH_UCC_SRC_DIR} && \
-mkdir -p ${TORCH_UCC_PKG_DIR} && \
-mkdir -p ${TORCH_UCC_BIN_DIR} && \
-mkdir -p ${TORCH_UCC_WORKLOADS_DIR}
+    mkdir -p ${TORCH_UCC_PKG_DIR} && \
+    mkdir -p ${TORCH_UCC_BIN_DIR} && \
+    mkdir -p ${TORCH_UCC_WORKLOADS_DIR}
 COPY . ${TORCH_UCC_SRC_DIR}
 #==============================================================================
 ARG DEBIAN_FRONTEND=noninteractive
 RUN apt update && \
-apt install -y \
-apt-utils \
-autoconf \
-build-essential \
-cmake \
-curl \
-git \
-ibverbs-providers \
-ibverbs-utils \
-libnuma-dev \
-libtool-bin \
-ninja-build \
-openmpi-bin \
-openssh-server \
-vim \
-&& \
-rm -rf /var/lib/apt/lists/*
+    apt install -y \
+    apt-utils \
+    autoconf \
+    build-essential \
+    cmake \
+    curl \
+    git \
+    ibverbs-providers \
+    ibverbs-utils \
+    libnuma-dev \
+    libtool-bin \
+    ninja-build \
+    openmpi-bin \
+    openssh-server \
+    vim \
+    && \
+    rm -rf /var/lib/apt/lists/*
 #==============================================================================
 # Configure SSH
 RUN mkdir -p /var/run/sshd && \
-cat /etc/ssh/ssh_config | grep -v StrictHostKeyChecking > /etc/ssh/ssh_config.new && \
-echo " StrictHostKeyChecking no" >> /etc/ssh/ssh_config.new && \
-mv /etc/ssh/ssh_config.new /etc/ssh/ssh_config
+    cat /etc/ssh/ssh_config | grep -v StrictHostKeyChecking > /etc/ssh/ssh_config.new && \
+    echo " StrictHostKeyChecking no" >> /etc/ssh/ssh_config.new && \
+    mv /etc/ssh/ssh_config.new /etc/ssh/ssh_config
 #==============================================================================
 # Build UCX
 RUN ${TORCH_UCC_SRC_DIR}/.ci/scripts/build_ucx.sh
@@ -67,9 +67,9 @@ RUN ${TORCH_UCC_SRC_DIR}/.ci/scripts/install_torch_ucc.sh
 # Install workloads
 WORKDIR ${TORCH_UCC_WORKLOADS_DIR}
 RUN git clone https://github.com/facebookresearch/dlrm.git && \
-cd ${TORCH_UCC_WORKLOADS_DIR}/dlrm && \
-git apply ${TORCH_UCC_SRC_DIR}/.ci/patches/dlrm/0001-Added-torch_ucc-support.patch && \
-pip install -r ${TORCH_UCC_WORKLOADS_DIR}/dlrm/requirements.txt && \
-pip install tensorboard
+    cd ${TORCH_UCC_WORKLOADS_DIR}/dlrm && \
+    git apply ${TORCH_UCC_SRC_DIR}/.ci/patches/dlrm/0001-Added-torch_ucc-support.patch && \
+    pip install -r ${TORCH_UCC_WORKLOADS_DIR}/dlrm/requirements.txt && \
+    pip install tensorboard
 RUN git clone https://github.com/facebookresearch/param.git && \
-pip install -r ${TORCH_UCC_WORKLOADS_DIR}/param/requirements.txt
\ No newline at end of file
+    pip install -r ${TORCH_UCC_WORKLOADS_DIR}/param/requirements.txt
\ No newline at end of file
diff --git a/.ci/job_matrix.yaml b/.ci/job_matrix.yaml
index baaa9a1..5fb88a2 100644
--- a/.ci/job_matrix.yaml
+++ b/.ci/job_matrix.yaml
@@ -9,71 +9,71 @@ registry_auth: '05d98651-e11c-4a57-9cc6-52df79014b89'
 #  cloud: 'swx-k8s'
 
 volumes:
-- { mountPath: '/hpc/local', hostPath: '/hpc/local' }
-- { mountPath: '/auto/sw_tools', hostPath: '/auto/sw_tools' }
-- { mountPath: '/.autodirect/mtrswgwork', hostPath: '/.autodirect/mtrswgwork' }
-- { mountPath: '/.autodirect/sw/release', hostPath: '/.autodirect/sw/release' }
+  - { mountPath: '/hpc/local', hostPath: '/hpc/local' }
+  - { mountPath: '/auto/sw_tools', hostPath: '/auto/sw_tools' }
+  - { mountPath: '/.autodirect/mtrswgwork', hostPath: '/.autodirect/mtrswgwork' }
+  - { mountPath: '/.autodirect/sw/release', hostPath: '/.autodirect/sw/release' }
 
 env:
-CUDA_VER: '11.2.1'
-TORCH_UCC_URI_SUFFIX: '${TORCH_UCC_VERSION}/x86_64/centos8/cuda${CUDA_VER}'
-TORCH_UCC_DOCKER_IMAGE_NAME: '${registry_host}${registry_path}/${TORCH_UCC_URI_SUFFIX}'
-TORCH_UCC_ROOT_DIR: '/opt/nvidia/torch-ucc'
-TORCH_UCC_SRC_DIR: '${TORCH_UCC_ROOT_DIR}/src'
-TORCH_UCC_BIN_DIR: '${TORCH_UCC_ROOT_DIR}/bin'
-TORCH_UCC_PYTHON_VENV_DIR: '${TORCH_UCC_BIN_DIR}/python/venv'
-XCCL_BUILD_TYPE: 'debug'
+  CUDA_VER: '11.2.1'
+  TORCH_UCC_URI_SUFFIX: '${TORCH_UCC_VERSION}/x86_64/centos8/cuda${CUDA_VER}'
+  TORCH_UCC_DOCKER_IMAGE_NAME: '${registry_host}${registry_path}/${TORCH_UCC_URI_SUFFIX}'
+  TORCH_UCC_ROOT_DIR: '/opt/nvidia/torch-ucc'
+  TORCH_UCC_SRC_DIR: '${TORCH_UCC_ROOT_DIR}/src'
+  TORCH_UCC_BIN_DIR: '${TORCH_UCC_ROOT_DIR}/bin'
+  TORCH_UCC_PYTHON_VENV_DIR: '${TORCH_UCC_BIN_DIR}/python/venv'
+  XCCL_BUILD_TYPE: 'debug'
 
 docker_opt: '--pull always --network=host --uts=host --ipc=host --ulimit stack=67108864 --ulimit memlock=-1 --security-opt seccomp=unconfined --cap-add=SYS_ADMIN --device=/dev/infiniband/ --gpus all --user root'
 
 runs_on_dockers:
-- {
-file: '.ci/Dockerfile.centos8',
-name: 'centos8',
-tag: '${BUILD_NUMBER}',
-arch: 'x86_64',
-uri: '${TORCH_UCC_URI_SUFFIX}',
-build_args: '--rm --no-cache --build-arg CUDA_VER=${CUDA_VER} --build-arg TORCH_UCC_ROOT_DIR=${TORCH_UCC_ROOT_DIR}',
-cloud: 'swx-k8s',
-nodeLabel: 'swx-clx01 || swx-clx02',
-}
+  - {
+    file: '.ci/Dockerfile.centos8',
+    name: 'centos8',
+    tag: '${BUILD_NUMBER}',
+    arch: 'x86_64',
+    uri: '${TORCH_UCC_URI_SUFFIX}',
+    build_args: '--rm --no-cache --build-arg CUDA_VER=${CUDA_VER} --build-arg TORCH_UCC_ROOT_DIR=${TORCH_UCC_ROOT_DIR}',
+    cloud: 'swx-k8s',
+    nodeLabel: 'swx-clx01 || swx-clx02',
+  }
 
 # bare metal
 runs_on_agents:
-- nodeLabel: 'swx-clx01 || swx-clx02'
+  - nodeLabel: 'swx-clx01 || swx-clx02'
 
 # TODO debug
 timeout_minutes: '400'
 
 steps:
-#============================================================================
-- name: Check Env
-agentSelector: "{nodeLabel: 'swx-clx01 || swx-clx02'}"
-containerSelector: "{name:'centos8'}"
-run: |
-echo "INFO: check environment"
-hostname
-printenv
-cat /proc/1/cgroup
-cat /etc/*release*
-id
-#find /opt/nvidia
-#ibv_devinfo
-#nvidia-smi
-#nvidia-smi topo -m
-#============================================================================
-- name: Run XCCL tests
-#agentSelector: "{nodeLabel: 'swx-clx01 || swx-clx02'}"
-containerSelector: "{name:'centos8'}"
-run: |
-echo "INFO: Run XCCL tests"
-. "${TORCH_UCC_PYTHON_VENV_DIR}/xccl/bin/activate"
-hostname
-cat /proc/1/cgroup
-pip3 list | grep torch
-${TORCH_UCC_SRC_DIR}/.ci/scripts/run_tests_xccl.sh
-deactivate
-#============================================================================
+  #============================================================================
+  - name: Check Env
+    agentSelector: "{nodeLabel: 'swx-clx01 || swx-clx02'}"
+    containerSelector: "{name:'centos8'}"
+    run: |
+      echo "INFO: check environment"
+      hostname
+      printenv
+      cat /proc/1/cgroup
+      cat /etc/*release*
+      id
+      #find /opt/nvidia
+      #ibv_devinfo
+      #nvidia-smi
+      #nvidia-smi topo -m
+  #============================================================================
+  - name: Run XCCL tests
+    #agentSelector: "{nodeLabel: 'swx-clx01 || swx-clx02'}"
+    containerSelector: "{name:'centos8'}"
+    run: |
+      echo "INFO: Run XCCL tests"
+      . "${TORCH_UCC_PYTHON_VENV_DIR}/xccl/bin/activate"
+      hostname
+      cat /proc/1/cgroup
+      pip3 list | grep torch
+      ${TORCH_UCC_SRC_DIR}/.ci/scripts/run_tests_xccl.sh
+      deactivate
+  #============================================================================
 # - name: Run UCC tests
 #   #agentSelector: "{nodeLabel: 'swx-clx01 || swx-clx02'}"
 #   containerSelector: "{name:'centos8'}"
@@ -85,59 +85,59 @@
 #     pip3 list | grep torch
 #     ${TORCH_UCC_SRC_DIR}/.ci/scripts/run_tests_ucc.sh
 #     deactivate
-#============================================================================
-- name: Run Torch-UCC tests (XCCL)
-#agentSelector: "{nodeLabel: 'swx-clx01 || swx-clx02'}"
-containerSelector: "{name:'centos8'}"
-run: |
-echo "INFO: Run Torch-UCC tests (XCCL)"
-. "${TORCH_UCC_PYTHON_VENV_DIR}/xccl/bin/activate"
-hostname
-cat /proc/1/cgroup
-pip3 list | grep torch
-${TORCH_UCC_SRC_DIR}/.ci/scripts/run_tests_torch_xccl.sh
-deactivate
-#============================================================================
-- name: Run Torch-UCC tests (UCC)
-#agentSelector: "{nodeLabel: 'swx-clx01 || swx-clx02'}"
-containerSelector: "{name:'centos8'}"
-run: |
-echo "INFO: Run Torch-UCC tests (UCC)"
-. "${TORCH_UCC_PYTHON_VENV_DIR}/ucc/bin/activate"
-hostname
-cat /proc/1/cgroup
-pip3 list | grep torch
-${TORCH_UCC_SRC_DIR}/.ci/scripts/run_tests_torch_ucc.sh
-deactivate
-#============================================================================
-- name: Run DLRM tests (XCCL/GPU)
-agentSelector: "{nodeLabel: 'swx-clx01 || swx-clx02'}"
-run: |
-echo "INFO: Run DLRM tests (XCCL/GPU)"
-hostname
-printenv
-cat /proc/1/cgroup
-cat /etc/*release*
-id
-find /opt/nvidia
-ibv_devinfo
-nvidia-smi
-${WORKSPACE}/.ci/scripts/run_dlrm_docker.sh xccl
-#============================================================================
-- name: Run DLRM tests (UCC/GPU)
-agentSelector: "{nodeLabel: 'swx-clx01 || swx-clx02'}"
-run: |
-echo "INFO: Run DLRM tests (UCC/GPU)"
-hostname
-printenv
-cat /proc/1/cgroup
-cat /etc/*release*
-id
-find /opt/nvidia
-ibv_devinfo
-nvidia-smi
-${WORKSPACE}/.ci/scripts/run_dlrm_docker.sh ucc
-#============================================================================
+  #============================================================================
+  - name: Run Torch-UCC tests (XCCL)
+    #agentSelector: "{nodeLabel: 'swx-clx01 || swx-clx02'}"
+    containerSelector: "{name:'centos8'}"
+    run: |
+      echo "INFO: Run Torch-UCC tests (XCCL)"
+      . "${TORCH_UCC_PYTHON_VENV_DIR}/xccl/bin/activate"
+      hostname
+      cat /proc/1/cgroup
+      pip3 list | grep torch
+      ${TORCH_UCC_SRC_DIR}/.ci/scripts/run_tests_torch_xccl.sh
+      deactivate
+  #============================================================================
+  - name: Run Torch-UCC tests (UCC)
+    #agentSelector: "{nodeLabel: 'swx-clx01 || swx-clx02'}"
+    containerSelector: "{name:'centos8'}"
+    run: |
+      echo "INFO: Run Torch-UCC tests (UCC)"
+      . "${TORCH_UCC_PYTHON_VENV_DIR}/ucc/bin/activate"
+      hostname
+      cat /proc/1/cgroup
+      pip3 list | grep torch
+      ${TORCH_UCC_SRC_DIR}/.ci/scripts/run_tests_torch_ucc.sh
+      deactivate
+  #============================================================================
+  - name: Run DLRM tests (XCCL/GPU)
+    agentSelector: "{nodeLabel: 'swx-clx01 || swx-clx02'}"
+    run: |
+      echo "INFO: Run DLRM tests (XCCL/GPU)"
+      hostname
+      printenv
+      cat /proc/1/cgroup
+      cat /etc/*release*
+      id
+      find /opt/nvidia
+      ibv_devinfo
+      nvidia-smi
+      ${WORKSPACE}/.ci/scripts/run_dlrm_docker.sh xccl
+  #============================================================================
+  - name: Run DLRM tests (UCC/GPU)
+    agentSelector: "{nodeLabel: 'swx-clx01 || swx-clx02'}"
+    run: |
+      echo "INFO: Run DLRM tests (UCC/GPU)"
+      hostname
+      printenv
+      cat /proc/1/cgroup
+      cat /etc/*release*
+      id
+      find /opt/nvidia
+      ibv_devinfo
+      nvidia-smi
+      ${WORKSPACE}/.ci/scripts/run_dlrm_docker.sh ucc
+  #============================================================================
 # - name: Run PARAM benchmarks
 #   agentSelector: "{nodeLabel: 'swx-clx01 || swx-clx02'}"
 #   run: |
@@ -145,4 +145,4 @@
 #     hostname
 #     cat /proc/1/cgroup
 #     #${TORCH_UCC_SRC_DIR}/.ci/scripts/run_param_benchmarks.sh
-#============================================================================
+  #============================================================================
"${TORCH_UCC_PYTHON_VENV_DIR}/xccl/bin/activate" + hostname + cat /proc/1/cgroup + pip3 list | grep torch + ${TORCH_UCC_SRC_DIR}/.ci/scripts/run_tests_torch_xccl.sh + deactivate + #============================================================================ + - name: Run Torch-UCC tests (UCC) + #agentSelector: "{nodeLabel: 'swx-clx01 || swx-clx02'}" + containerSelector: "{name:'centos8'}" + run: | + echo "INFO: Run Torch-UCC tests (UCC)" + . "${TORCH_UCC_PYTHON_VENV_DIR}/ucc/bin/activate" + hostname + cat /proc/1/cgroup + pip3 list | grep torch + ${TORCH_UCC_SRC_DIR}/.ci/scripts/run_tests_torch_ucc.sh + deactivate + #============================================================================ + - name: Run DLRM tests (XCCL/GPU) + agentSelector: "{nodeLabel: 'swx-clx01 || swx-clx02'}" + run: | + echo "INFO: Run DLRM tests (XCCL/GPU)" + hostname + printenv + cat /proc/1/cgroup + cat /etc/*release* + id + find /opt/nvidia + ibv_devinfo + nvidia-smi + ${WORKSPACE}/.ci/scripts/run_dlrm_docker.sh xccl + #============================================================================ + - name: Run DLRM tests (UCC/GPU) + agentSelector: "{nodeLabel: 'swx-clx01 || swx-clx02'}" + run: | + echo "INFO: Run DLRM tests (UCC/GPU)" + hostname + printenv + cat /proc/1/cgroup + cat /etc/*release* + id + find /opt/nvidia + ibv_devinfo + nvidia-smi + ${WORKSPACE}/.ci/scripts/run_dlrm_docker.sh ucc + #============================================================================ # - name: Run PARAM benchmarks # agentSelector: "{nodeLabel: 'swx-clx01 || swx-clx02'}" # run: | @@ -145,4 +145,4 @@ ${WORKSPACE}/.ci/scripts/run_dlrm_docker.sh ucc # hostname # cat /proc/1/cgroup # #${TORCH_UCC_SRC_DIR}/.ci/scripts/run_param_benchmarks.sh -#============================================================================ + #============================================================================ diff --git a/.ci/scripts/build_xccl.sh b/.ci/scripts/build_xccl.sh index 45cebb9..9571c71 100755 --- a/.ci/scripts/build_xccl.sh +++ b/.ci/scripts/build_xccl.sh @@ -10,7 +10,7 @@ cd "${XCCL_SRC_DIR}/build-${XCCL_BUILD_TYPE}" # TODO enable CUDA (compilation failed) #"${XCCL_SRC_DIR}/configure" --with-ucx="${UCX_INSTALL_DIR}" --prefix="${XCCL_INSTALL_DIR}" --enable-debug "${XCCL_SRC_DIR}/configure" --with-cuda="${CUDA_HOME}" --with-ucx="${UCX_INSTALL_DIR}" \ ---prefix="${XCCL_INSTALL_DIR}" --enable-debug + --prefix="${XCCL_INSTALL_DIR}" --enable-debug make -j install echo "${XCCL_INSTALL_DIR}/lib" > /etc/ld.so.conf.d/xccl.conf ldconfig diff --git a/.ci/scripts/install_torch.sh b/.ci/scripts/install_torch.sh index 649acaa..dce6bd6 100755 --- a/.ci/scripts/install_torch.sh +++ b/.ci/scripts/install_torch.sh @@ -1,16 +1,40 @@ #!/bin/bash -eEx +set -o pipefail -SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd -P)" +# TODO debug +#cd /tmp +#git clone https://github.com/pytorch/pytorch.git +#cd /tmp/pytorch +#git submodule sync --recursive +#git submodule update --init --recursive +#pip3 install -r requirements.txt +#export TORCH_CUDA_ARCH_LIST="7.0 8.0+PTX" +#export USE_GLOO=1 +#export USE_DISTRIBUTED=1 +#export USE_OPENCV=0 +## TODO debug +#export USE_CUDA=1 +##export USE_CUDA=0 +#export USE_NCCL=0 +#export USE_MKLDNN=0 +#export BUILD_TEST=0 +#export USE_FBGEMM=0 +#export USE_NNPACK=0 +#export USE_QNNPACK=0 +#export USE_XNNPACK=0 +#export USE_KINETO=1 +#export MAX_JOBS=$(($(nproc)-1)) +#python setup.py install +#cd - +#rm -rf /tmp/pytorch -# shellcheck disable=SC2034 -#DLRM_MODEL="big" -DLRM_MODEL="small" +# TODO debug +#conda install -y pytorch 
diff --git a/.ci/scripts/run_fb_dlrm.sh b/.ci/scripts/run_fb_dlrm.sh
deleted file mode 100755
index 92c3828..0000000
--- a/.ci/scripts/run_fb_dlrm.sh
+++ /dev/null
@@ -1,77 +0,0 @@
-#!/bin/bash -eEx
-set -o pipefail
-
-# TODO debug
-exit 0
-
-SCRIPT_DIR="$(
-    cd "$(dirname "$0")"
-    pwd -P
-)"
-cd "${SCRIPT_DIR}"
-. "${SCRIPT_DIR}/env.sh"
-
-case ${DLRM_MODEL} in
-"big")
-    emb_size="1000-1000-1000-1000-1000-1000-1000-1000-1000-1000-1000-1000-1000-1000-1000-1000-1000-1000-1000-1000-1000-1000-1000-1000-1000-1000-1000-1000-1000-1000-1000-1000-1000-1000-1000-1000-1000-1000-1000-1000-1000-1000-1000-1000-1000-1000-1000-1000-1000-1000-1000-1000-1000-1000-1000-1000-1000-1000-1000-1000-1000-1000-1000-1000"
-    emb_dim="256"
-    emb_lookup="100"
-    bot_mlp="512-512-256"
-    top_mlp="1024-1024-1024-1"
-    loss_func="mse"
-    round_targets="False"
-    lr="0.01"
-    #mb_size="2048"
-    emb_lookup_fixed="0"
-    ;;
-"small")
-    emb_size="1000-1000-1000-1000-1000-1000-1000-1000"
-    emb_dim="64"
-    emb_lookup="100"
-    bot_mlp="512-512-64"
-    top_mlp="1024-1024-1024-1"
-    loss_func="mse"
-    round_targets="False"
-    lr="0.01"
-    #mb_size="2048"
-    emb_lookup_fixed="0"
-    ;;
-*)
-    echo "ERROR: unsupported or empty DLRM_MODEL (${DLRM_MODEL})"
-    exit 1
-    ;;
-esac
-
-cd "${TORCH_UCC_ROOT_DIR}/workloads/dlrm"
-
-MPIRUN_OPTIONS="\
-    -np $NP \
-    -H $HOSTS \
-    --map-by node \
-    -x LD_LIBRARY_PATH \
-    --allow-run-as-root \
-"
-
-# shellcheck disable=SC2086
-mpirun ${MPIRUN_OPTIONS} hostname
-#mpirun ${MPIRUN_OPTIONS} python dlrm_s_pytorch.py \
-#    --mini-batch-size=2048 \
-#    --test-mini-batch-size=16384 \
-#    --test-num-workers=0 \
-#    --num-batches=100 \
-#    --data-generation=random \
-#    --arch-mlp-bot=$bot_mlp \
-#    --arch-mlp-top=$top_mlp \
-#    --arch-sparse-feature-size=$emb_dim \
-#    --arch-embedding-size=$emb_size \
-#    --num-indices-per-lookup=$emb_lookup \
-#    --num-indices-per-lookup-fixed=$emb_lookup_fixed \
-#    --arch-interaction-op=dot \
-#    --numpy-rand-seed=727 \
-#    --print-freq=1 \
-#    --loss-function=$loss_func \
-#    --round-targets=$round_targets \
-#    --learning-rate=$lr \
-#    --print-time \
-#    --dist-backend=ucc \
-#    --use-gpu
diff --git a/.ci/scripts/run_fb_dlrm_docker.sh b/.ci/scripts/run_fb_dlrm_docker.sh
deleted file mode 100755
index 44d58af..0000000
--- a/.ci/scripts/run_fb_dlrm_docker.sh
+++ /dev/null
@@ -1,60 +0,0 @@
-#!/bin/bash -eEx
-set -o pipefail
-
-SCRIPT_DIR="$(
-    cd "$(dirname "$0")"
-    pwd -P
-)"
-cd "${SCRIPT_DIR}"
-. "${SCRIPT_DIR}/env.sh"
-
-DOCKER_RUN_ARGS="\
---pull always \
---network=host \
---uts=host \
---ipc=host \
---ulimit stack=67108864 \
---ulimit memlock=-1 \
---security-opt seccomp=unconfined \
---cap-add=SYS_ADMIN \
---device=/dev/infiniband/ \
---gpus all \
---user root \
--it \
--d \
---rm \
---name=${DOCKER_CONTAINER_NAME} \
-"
-DOCKER_SSH_PORT="12345"
-DOCKER_CONTAINER_NAME="torch_ucc_ci"
-DOCKER_IMAGE_NAME="${TORCH_UCC_DOCKER_IMAGE_NAME}:${BUILD_ID}"
-
-while read -r HOST; do
-    echo "INFO: HOST = $HOST"
-    STALE_DOCKER_CONTAINER_LIST=$(sudo ssh "$HOST" "docker ps -a -q -f name=${DOCKER_CONTAINER_NAME}")
-    if [ -n "${STALE_DOCKER_CONTAINER_LIST}" ]; then
-        echo "WARNING: stale docker container (name: ${DOCKER_CONTAINER_NAME}) is detected on ${HOST} (to be stopped)"
-        echo "INFO: Stopping stale docker container (name: ${DOCKER_CONTAINER_NAME}) on ${HOST}..."
-        sudo ssh "${HOST}" docker stop ${DOCKER_CONTAINER_NAME}
-        echo "INFO: Stopping stale docker container (name: ${DOCKER_CONTAINER_NAME}) on ${HOST}... DONE"
-    fi
-
-    echo "INFO: start docker container on $HOST ..."
-    sudo ssh "$HOST" "docker run \
-        ${DOCKER_RUN_ARGS} \
-        ${DOCKER_IMAGE_NAME} \
-        bash -c "/usr/sbin/sshd -p ${DOCKER_SSH_PORT}; sleep infinity""
-    echo "INFO: start docker container on $HOST ... DONE"
-
-    echo "INFO: verify docker container on $HOST ..."
-    sudo ssh "$HOST" -p ${DOCKER_SSH_PORT} hostname
-    echo "INFO: verify docker container on $HOST ... DONE"
-done <"$HOSTFILE"
-
-sleep 20000
-
-while read -r HOST; do
-    echo "INFO: stop docker container on $HOST ..."
-    sudo ssh "${HOST}" docker stop ${DOCKER_CONTAINER_NAME}
-    echo "INFO: stop docker container on $HOST ... DONE"
-done <"$HOSTFILE"
\ No newline at end of file