diff --git a/.ci/Dockerfile.centos8 b/.ci/Dockerfile.centos8
new file mode 100644
index 0000000..1a0fdf9
--- /dev/null
+++ b/.ci/Dockerfile.centos8
@@ -0,0 +1,94 @@
+ARG CUDA_VER='11.2.1'
+FROM nvidia/cuda:${CUDA_VER}-devel-centos8
+#==============================================================================
+ARG TORCH_UCC_ROOT_DIR=/opt/nvidia/torch-ucc
+ENV TORCH_UCC_SRC_DIR=${TORCH_UCC_ROOT_DIR}/src
+ENV TORCH_UCC_PKG_DIR=${TORCH_UCC_ROOT_DIR}/pkg
+ENV TORCH_UCC_BIN_DIR=${TORCH_UCC_ROOT_DIR}/bin
+ENV TORCH_UCC_WORKLOADS_DIR=${TORCH_UCC_ROOT_DIR}/workloads
+ENV CUDA_HOME=/usr/local/cuda
+ENV UCX_BRANCH=v1.10.x
+ENV UCX_BUILD_TYPE=release-mt
+ENV UCX_INSTALL_DIR=${TORCH_UCC_BIN_DIR}/ucx/build-${UCX_BUILD_TYPE}
+ENV XCCL_BUILD_TYPE=debug
+ENV XCCL_INSTALL_DIR=${TORCH_UCC_BIN_DIR}/xccl/build-${XCCL_BUILD_TYPE}
+ENV UCC_INSTALL_DIR=${TORCH_UCC_BIN_DIR}/ucc/build
+ENV TORCH_UCC_PYTHON_VENV_DIR=${TORCH_UCC_BIN_DIR}/python/venv
+#==============================================================================
+RUN mkdir -p ${TORCH_UCC_SRC_DIR} && \
+    mkdir -p ${TORCH_UCC_PKG_DIR} && \
+    mkdir -p ${TORCH_UCC_BIN_DIR} && \
+    mkdir -p ${TORCH_UCC_WORKLOADS_DIR} && \
+    mkdir -p ${TORCH_UCC_PYTHON_VENV_DIR}
+
+COPY . ${TORCH_UCC_SRC_DIR}
+#==============================================================================
+RUN yum groupinstall -y \
+    'Development Tools' \
+    'Infiniband Support'
+RUN yum config-manager --set-enabled powertools && yum install -y \
+    cmake \
+    numactl \
+    numactl-devel \
+    openmpi \
+    openmpi-devel \
+    openssh-server \
+    protobuf-compiler \
+    protobuf-devel \
+    python36-devel \
+    rdma-core-devel \
+    vim
+# Remove old UCX
+RUN rpm -e --nodeps ucx
+ENV PATH=/usr/lib64/openmpi/bin:$PATH
+RUN echo "export PATH=\"/usr/lib64/openmpi/bin:\$PATH\"" >> /etc/bashrc && \
+    echo "export LD_LIBRARY_PATH=\"/usr/lib64/openmpi/lib:\${LD_LIBRARY_PATH}\"" >> /etc/bashrc
+#==============================================================================
+# Configure SSH
+RUN mkdir -p /var/run/sshd && \
+    cat /etc/ssh/ssh_config | grep -v StrictHostKeyChecking > /etc/ssh/ssh_config.new && \
+    echo " StrictHostKeyChecking no" >> /etc/ssh/ssh_config.new && \
+    mv /etc/ssh/ssh_config.new /etc/ssh/ssh_config && \
+    ssh-keygen -A && \
+    rm -f /run/nologin
+#==============================================================================
+# Build UCX
+RUN ${TORCH_UCC_SRC_DIR}/.ci/scripts/build_ucx.sh
+ENV PATH=${UCX_INSTALL_DIR}/bin:${PATH}
+#==============================================================================
+# Configure Python
+RUN ${TORCH_UCC_SRC_DIR}/.ci/scripts/configure_python.sh
+#==============================================================================
+# Build XCCL
+RUN ${TORCH_UCC_SRC_DIR}/.ci/scripts/build_xccl.sh
+#==============================================================================
+# Build UCC
+RUN ${TORCH_UCC_SRC_DIR}/.ci/scripts/build_ucc.sh
+#==============================================================================
+# Install PyTorch
+RUN ${TORCH_UCC_SRC_DIR}/.ci/scripts/install_torch.sh
+#==============================================================================
+# Install workloads
+# TODO upstream the patches (if needed)
+WORKDIR ${TORCH_UCC_WORKLOADS_DIR}
+RUN git clone https://github.com/facebookresearch/dlrm.git && \
+    cd ${TORCH_UCC_WORKLOADS_DIR}/dlrm && \
+    git apply ${TORCH_UCC_SRC_DIR}/.ci/patches/dlrm/0001-Added-torch_ucc-support.patch && \
+    git apply ${TORCH_UCC_SRC_DIR}/.ci/patches/dlrm/0002-Fixed-arg-list.patch && \
+    pip3 install -r ${TORCH_UCC_WORKLOADS_DIR}/dlrm/requirements.txt && \
+    pip3 install tensorboard
+RUN git clone https://github.com/facebookresearch/param.git && \
+    pip3 install -r ${TORCH_UCC_WORKLOADS_DIR}/param/requirements.txt
+#==============================================================================
+# Install torch_ucc (XCCL version) python module and build a wheel package
+RUN ${TORCH_UCC_SRC_DIR}/.ci/scripts/install_torch_xccl.sh
+#==============================================================================
+# Install torch_ucc (UCC version) python module and build a wheel package
+RUN ${TORCH_UCC_SRC_DIR}/.ci/scripts/install_torch_ucc.sh
+#==============================================================================
+RUN groupadd -g 11429 swx-jenkins
+RUN adduser --uid 6213 --gid 11429 --home /home/swx-jenkins swx-jenkins
+
+RUN groupadd -g 30 dip
+RUN adduser --no-create-home --uid 50009 --gid 30 --home /labhome/artemry artemry
+#==============================================================================
diff --git a/.ci/Dockerfile.ubuntu20.04 b/.ci/Dockerfile.ubuntu20.04
new file mode 100644
index 0000000..dcff626
--- /dev/null
+++ b/.ci/Dockerfile.ubuntu20.04
@@ -0,0 +1,75 @@
+#ARG CUDA_VER='11.2.1'
+ARG CUDA_VER='11.1.1'
+FROM nvidia/cuda:${CUDA_VER}-devel-ubuntu20.04
+#==============================================================================
+ARG TORCH_UCC_ROOT_DIR=/opt/nvidia/torch-ucc
+ENV TORCH_UCC_SRC_DIR=${TORCH_UCC_ROOT_DIR}/src
+ENV TORCH_UCC_PKG_DIR=${TORCH_UCC_ROOT_DIR}/pkg
+ENV TORCH_UCC_BIN_DIR=${TORCH_UCC_ROOT_DIR}/bin
+ENV TORCH_UCC_WORKLOADS_DIR=${TORCH_UCC_ROOT_DIR}/workloads
+ENV CUDA_HOME=/usr/local/cuda
+ENV UCX_BRANCH=v1.10.x
+ENV UCX_BUILD_TYPE=release-mt
+ENV UCX_INSTALL_DIR=${TORCH_UCC_BIN_DIR}/ucx/build-${UCX_BUILD_TYPE}
+ENV XCCL_BUILD_TYPE=debug
+ENV XCCL_INSTALL_DIR=${TORCH_UCC_BIN_DIR}/xccl/build-${XCCL_BUILD_TYPE}
+#==============================================================================
+RUN mkdir -p ${TORCH_UCC_SRC_DIR} && \
+    mkdir -p ${TORCH_UCC_PKG_DIR} && \
+    mkdir -p ${TORCH_UCC_BIN_DIR} && \
+    mkdir -p ${TORCH_UCC_WORKLOADS_DIR}
+
+COPY . ${TORCH_UCC_SRC_DIR}
+#==============================================================================
+ARG DEBIAN_FRONTEND=noninteractive
+RUN apt update && \
+    apt install -y \
+    apt-utils \
+    autoconf \
+    build-essential \
+    cmake \
+    curl \
+    git \
+    ibverbs-providers \
+    ibverbs-utils \
+    libnuma-dev \
+    libtool-bin \
+    ninja-build \
+    openmpi-bin \
+    openssh-server \
+    vim \
+    && \
+    rm -rf /var/lib/apt/lists/*
+#==============================================================================
+# Configure SSH
+RUN mkdir -p /var/run/sshd && \
+    cat /etc/ssh/ssh_config | grep -v StrictHostKeyChecking > /etc/ssh/ssh_config.new && \
+    echo " StrictHostKeyChecking no" >> /etc/ssh/ssh_config.new && \
+    mv /etc/ssh/ssh_config.new /etc/ssh/ssh_config
+#==============================================================================
+# Build UCX
+RUN ${TORCH_UCC_SRC_DIR}/.ci/scripts/build_ucx.sh
+ENV PATH=${UCX_INSTALL_DIR}/bin:${PATH}
+#==============================================================================
+# Configure Python
+RUN ${TORCH_UCC_SRC_DIR}/.ci/scripts/configure_python.sh
+ENV PATH /opt/conda/bin:${PATH}
+#==============================================================================
+# Build XCCL
+RUN ${TORCH_UCC_SRC_DIR}/.ci/scripts/build_xccl.sh
+#==============================================================================
+# Install PyTorch
+RUN ${TORCH_UCC_SRC_DIR}/.ci/scripts/install_torch.sh
+#==============================================================================
+# Install torch_ucc python module and build a wheel package
+RUN ${TORCH_UCC_SRC_DIR}/.ci/scripts/install_torch_ucc.sh
+#==============================================================================
+# Install workloads
+WORKDIR ${TORCH_UCC_WORKLOADS_DIR}
+RUN git clone https://github.com/facebookresearch/dlrm.git && \
+    cd ${TORCH_UCC_WORKLOADS_DIR}/dlrm && \
+    git apply ${TORCH_UCC_SRC_DIR}/.ci/patches/dlrm/0001-Added-torch_ucc-support.patch && \
+    pip install -r ${TORCH_UCC_WORKLOADS_DIR}/dlrm/requirements.txt && \
+    pip install tensorboard
+RUN git clone https://github.com/facebookresearch/param.git && \
+    pip install -r ${TORCH_UCC_WORKLOADS_DIR}/param/requirements.txt
\ No newline at end of file
diff --git a/.ci/Jenkinsfile.shlib b/.ci/Jenkinsfile.shlib
new file mode 100644
index 0000000..2486083
--- /dev/null
+++ b/.ci/Jenkinsfile.shlib
@@ -0,0 +1,9 @@
+#!/usr/bin/groovy
+
+// load pipeline functions
+// Requires pipeline-github-lib plugin to load library from github
+@Library('github.com/Mellanox/ci-demo@stable')
+def matrix = new com.mellanox.cicd.Matrix()
+
+matrix.main()
+
diff --git a/.ci/configs/swx-clx01/hostfile.txt b/.ci/configs/swx-clx01/hostfile.txt
new file mode 100644
index 0000000..d813292
--- /dev/null
+++ b/.ci/configs/swx-clx01/hostfile.txt
@@ -0,0 +1,2 @@
+swx-clx01
+swx-clx02
diff --git a/.ci/configs/swx-clx02/hostfile.txt b/.ci/configs/swx-clx02/hostfile.txt
new file mode 100644
index 0000000..fbddaa1
--- /dev/null
+++ b/.ci/configs/swx-clx02/hostfile.txt
@@ -0,0 +1,2 @@
+swx-clx02
+swx-clx01
diff --git a/.ci/job_matrix.yaml b/.ci/job_matrix.yaml
new file mode 100644
index 0000000..5fb88a2
--- /dev/null
+++ b/.ci/job_matrix.yaml
@@ -0,0 +1,148 @@
+---
+job: 'torch-ucc'
+
+registry_host: 'harbor.mellanox.com'
+registry_path: '/torch-ucc'
+registry_auth: '05d98651-e11c-4a57-9cc6-52df79014b89'
+
+#kubernetes:
+#  cloud: 'swx-k8s'
+
+volumes:
+  - { mountPath: '/hpc/local', hostPath: '/hpc/local' }
+  - { mountPath: '/auto/sw_tools', hostPath: '/auto/sw_tools' }
+  - { mountPath: '/.autodirect/mtrswgwork', hostPath: '/.autodirect/mtrswgwork' }
+  - { mountPath: '/.autodirect/sw/release', hostPath: '/.autodirect/sw/release' }
+
+env:
+  CUDA_VER: '11.2.1'
+  TORCH_UCC_URI_SUFFIX: '${TORCH_UCC_VERSION}/x86_64/centos8/cuda${CUDA_VER}'
+  TORCH_UCC_DOCKER_IMAGE_NAME: '${registry_host}${registry_path}/${TORCH_UCC_URI_SUFFIX}'
+  TORCH_UCC_ROOT_DIR: '/opt/nvidia/torch-ucc'
+  TORCH_UCC_SRC_DIR: '${TORCH_UCC_ROOT_DIR}/src'
+  TORCH_UCC_BIN_DIR: '${TORCH_UCC_ROOT_DIR}/bin'
+  TORCH_UCC_PYTHON_VENV_DIR: '${TORCH_UCC_BIN_DIR}/python/venv'
+  XCCL_BUILD_TYPE: 'debug'
+
+docker_opt: '--pull always --network=host --uts=host --ipc=host --ulimit stack=67108864 --ulimit memlock=-1 --security-opt seccomp=unconfined --cap-add=SYS_ADMIN --device=/dev/infiniband/ --gpus all --user root'
+
+runs_on_dockers:
+  - {
+      file: '.ci/Dockerfile.centos8',
+      name: 'centos8',
+      tag: '${BUILD_NUMBER}',
+      arch: 'x86_64',
+      uri: '${TORCH_UCC_URI_SUFFIX}',
+      build_args: '--rm --no-cache --build-arg CUDA_VER=${CUDA_VER} --build-arg TORCH_UCC_ROOT_DIR=${TORCH_UCC_ROOT_DIR}',
+      cloud: 'swx-k8s',
+      nodeLabel: 'swx-clx01 || swx-clx02',
+    }
+
+# bare metal
+runs_on_agents:
+  - nodeLabel: 'swx-clx01 || swx-clx02'
+
+# TODO debug
+timeout_minutes: '400'
+
+steps:
+  #============================================================================
+  - name: Check Env
+    agentSelector: "{nodeLabel: 'swx-clx01 || swx-clx02'}"
+    containerSelector: "{name:'centos8'}"
+    run: |
+      echo "INFO: check environment"
+      hostname
+      printenv
+      cat /proc/1/cgroup
+      cat /etc/*release*
+      id
+      #find /opt/nvidia
+      #ibv_devinfo
+      #nvidia-smi
+      #nvidia-smi topo -m
+  #============================================================================
+  - name: Run XCCL tests
+    #agentSelector: "{nodeLabel: 'swx-clx01 || swx-clx02'}"
+    containerSelector: "{name:'centos8'}"
+    run: |
+      echo "INFO: Run XCCL tests"
+      . "${TORCH_UCC_PYTHON_VENV_DIR}/xccl/bin/activate"
+      hostname
+      cat /proc/1/cgroup
+      pip3 list | grep torch
+      ${TORCH_UCC_SRC_DIR}/.ci/scripts/run_tests_xccl.sh
+      deactivate
+  #============================================================================
+#  - name: Run UCC tests
+#    #agentSelector: "{nodeLabel: 'swx-clx01 || swx-clx02'}"
+#    containerSelector: "{name:'centos8'}"
+#    run: |
+#      echo "INFO: Run UCC tests"
+#      . "${TORCH_UCC_PYTHON_VENV_DIR}/ucc/bin/activate"
+#      hostname
+#      cat /proc/1/cgroup
+#      pip3 list | grep torch
+#      ${TORCH_UCC_SRC_DIR}/.ci/scripts/run_tests_ucc.sh
+#      deactivate
+  #============================================================================
+  - name: Run Torch-UCC tests (XCCL)
+    #agentSelector: "{nodeLabel: 'swx-clx01 || swx-clx02'}"
+    containerSelector: "{name:'centos8'}"
+    run: |
+      echo "INFO: Run Torch-UCC tests (XCCL)"
+      . "${TORCH_UCC_PYTHON_VENV_DIR}/xccl/bin/activate"
+      hostname
+      cat /proc/1/cgroup
+      pip3 list | grep torch
+      ${TORCH_UCC_SRC_DIR}/.ci/scripts/run_tests_torch_xccl.sh
+      deactivate
+  #============================================================================
+  - name: Run Torch-UCC tests (UCC)
+    #agentSelector: "{nodeLabel: 'swx-clx01 || swx-clx02'}"
+    containerSelector: "{name:'centos8'}"
+    run: |
+      echo "INFO: Run Torch-UCC tests (UCC)"
+      . "${TORCH_UCC_PYTHON_VENV_DIR}/ucc/bin/activate"
+      hostname
+      cat /proc/1/cgroup
+      pip3 list | grep torch
+      ${TORCH_UCC_SRC_DIR}/.ci/scripts/run_tests_torch_ucc.sh
+      deactivate
+  #============================================================================
+  - name: Run DLRM tests (XCCL/GPU)
+    agentSelector: "{nodeLabel: 'swx-clx01 || swx-clx02'}"
+    run: |
+      echo "INFO: Run DLRM tests (XCCL/GPU)"
+      hostname
+      printenv
+      cat /proc/1/cgroup
+      cat /etc/*release*
+      id
+      find /opt/nvidia
+      ibv_devinfo
+      nvidia-smi
+      ${WORKSPACE}/.ci/scripts/run_dlrm_docker.sh xccl
+  #============================================================================
+  - name: Run DLRM tests (UCC/GPU)
+    agentSelector: "{nodeLabel: 'swx-clx01 || swx-clx02'}"
+    run: |
+      echo "INFO: Run DLRM tests (UCC/GPU)"
+      hostname
+      printenv
+      cat /proc/1/cgroup
+      cat /etc/*release*
+      id
+      find /opt/nvidia
+      ibv_devinfo
+      nvidia-smi
+      ${WORKSPACE}/.ci/scripts/run_dlrm_docker.sh ucc
+  #============================================================================
+#  - name: Run PARAM benchmarks
+#    agentSelector: "{nodeLabel: 'swx-clx01 || swx-clx02'}"
+#    run: |
+#      echo "INFO: Run PARAM benchmarks"
+#      hostname
+#      cat /proc/1/cgroup
+#      #${TORCH_UCC_SRC_DIR}/.ci/scripts/run_param_benchmarks.sh
+  #============================================================================
diff --git a/.ci/patches/dlrm/0001-Added-torch_ucc-support.patch b/.ci/patches/dlrm/0001-Added-torch_ucc-support.patch
new file mode 100644
index 0000000..2620579
--- /dev/null
+++ b/.ci/patches/dlrm/0001-Added-torch_ucc-support.patch
@@ -0,0 +1,30 @@
+From bcd8fc065ef04a0ea8f06e61a5e2581a308719fd Mon Sep 17 00:00:00 2001
+From: artemry-nv
+Date: Tue, 9 Mar 2021 00:41:16 +0300
+Subject: [PATCH] Added torch_ucc support
+
+Signed-off-by: artemry-nv
+---
+ extend_distributed.py | 6 ++++++
+ 1 file changed, 6 insertions(+)
+
+diff --git a/extend_distributed.py b/extend_distributed.py
+index adcb60b..1f2c8a5 100644
+--- a/extend_distributed.py
++++ b/extend_distributed.py
+@@ -20,6 +20,12 @@ except ImportError as e:
+     # print(e)
+     torch_ccl = False
+ 
++try:
++    import torch_ucc
++except ImportError as e:
++    torch_ucc = False
++
++
+ my_rank = -1
+ my_size = -1
+ my_local_rank = -1
+-- 
+2.24.3 (Apple Git-128)
+
diff --git a/.ci/patches/dlrm/0002-Fixed-arg-list.patch b/.ci/patches/dlrm/0002-Fixed-arg-list.patch
new file mode 100644
index 0000000..4f6b1ce
--- /dev/null
+++ b/.ci/patches/dlrm/0002-Fixed-arg-list.patch
@@ -0,0 +1,25 @@
+From 481fd6aef896aa8ff15a161b7e88b2ea01ae673a Mon Sep 17 00:00:00 2001
+From: artemry-nv
+Date: Mon, 29 Mar 2021 01:56:08 +0300
+Subject: [PATCH] Fixed arg list
+
+---
+ dlrm_s_pytorch.py | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+diff --git a/dlrm_s_pytorch.py b/dlrm_s_pytorch.py
+index 71a0414..36dab9b 100644
+--- a/dlrm_s_pytorch.py
++++ b/dlrm_s_pytorch.py
+@@ -1477,7 +1477,7 @@ def run():
+ 
+     ext_dist.barrier()
+     with torch.autograd.profiler.profile(
+-        args.enable_profiling, use_gpu, record_shapes=True
++        args.enable_profiling, use_cuda=use_gpu, record_shapes=True
+     ) as prof:
+         if not args.inference_only:
+             k = 0
+-- 
+2.24.3 (Apple Git-128)
+
diff --git a/.ci/proj_jjb.yaml b/.ci/proj_jjb.yaml
new file mode 100644
index 0000000..0760821
--- /dev/null
+++ b/.ci/proj_jjb.yaml
@@ -0,0 +1,79 @@
+- job-template:
+    name: "{jjb_proj}"
+    project-type: pipeline
+    properties:
+      - github:
+          url: "{jjb_git}"
+      - build-discarder:
+          days-to-keep: 50
+          num-to-keep: 20
+      - inject:
+          keep-system-variables: true
+          properties-content: |
+            jjb_proj={jjb_proj}
+    description: Do NOT edit this job through the Web GUI !
+    concurrent: true
+    sandbox: true
+    parameters:
+      - string:
+          name: "sha1"
+          default: "master"
+          description: "Commit to be checked, set by PR"
+      - bool:
+          name: "build_dockers"
+          default: false
+          description: "Rebuild docker containers"
+      - string:
+          name: "conf_file"
+          default: ".ci/job_matrix.yaml"
+          description: "Regex to select job config file"
+      - bool:
+          name: "do_release"
+          default: false
+          description: "Release rpm"
+      - string:
+          name: "release_dir"
+          default: "/.autodirect/sw/release/sw_acceleration/{jjb_proj}"
+          description: "Location to release rpm to"
+      - string:
+          name: "script"
+          default: "{jjb_jenkinsfile}"
+          description: "Jenkinsfile to load on trigger"
+      - string:
+          name: "DEBUG"
+          default: 0
+          description: "Enable debug prints and traces, valid values are 0-9"
+#    triggers:
+#      - github-pull-request:
+#          cron: 'H/5 * * * *'
+#          trigger-phrase: '.*\bbot:retest\b.*'
+#          status-add-test-results: true
+#          auth-id: '549927eb-7f38-4a8f-997a-81dd63605782'
+#          org-list: ["Mellanox"]
+#          white-list: ["swx-jenkins","swx-jenkins2","swx-jenkins3","mike-dubman","mellanox-github"]
+#          allow-whitelist-orgs-as-admins: true
+    pipeline-scm:
+      scm:
+        - git:
+            url: "{jjb_git}"
+            credentials-id: '549927eb-7f38-4a8f-997a-81dd63605782'
+            branches: [ '$sha1' ]
+            shallow-clone: true
+            depth: 10
+            refspec: "+refs/heads/*:refs/remotes/origin/* +refs/pull/*:refs/remotes/origin/pr/*"
+            browser: githubweb
+            browser-url: "{jjb_git}"
+      script-path: "$script"
+
+- project:
+    name: proj_name
+    # TODO
+    jjb_email: 'TODO'
+    jjb_proj: 'torch-ucc'
+    # TODO tmp
+    jjb_git: 'git@github.com:artemry-nv/torch-ucc.git'
+    # TODO
+    jjb_owner: 'TODO'
+    jjb_jenkinsfile: '.ci/Jenkinsfile.shlib'
+    jobs:
+      - "{jjb_proj}"
diff --git a/.ci/scripts/build_ucc.sh b/.ci/scripts/build_ucc.sh
new file mode 100755
index 0000000..988d570
--- /dev/null
+++ b/.ci/scripts/build_ucc.sh
@@ -0,0 +1,16 @@
+#!/bin/bash -eEx
+set -o pipefail
+
+echo "INFO: Build UCC"
+UCC_SRC_DIR="${TORCH_UCC_SRC_DIR}/ucc"
+cd "${UCC_SRC_DIR}"
+"${UCC_SRC_DIR}/autogen.sh"
+mkdir -p "${UCC_SRC_DIR}/build"
+cd "${UCC_SRC_DIR}/build"
+"${UCC_SRC_DIR}/configure" --with-ucx="${UCX_INSTALL_DIR}" --with-cuda="${CUDA_HOME}" \
+    --prefix="${UCC_INSTALL_DIR}" --enable-gtest
+make -j install
+echo "${UCC_INSTALL_DIR}/lib" > /etc/ld.so.conf.d/ucc.conf
+ldconfig
+ldconfig -p | grep -i libucc
+cd "${UCC_INSTALL_DIR}" && tar cfz "${TORCH_UCC_PKG_DIR}/ucc.tgz" --owner=0 --group=0 .
diff --git a/.ci/scripts/build_ucx.sh b/.ci/scripts/build_ucx.sh
new file mode 100755
index 0000000..8df3411
--- /dev/null
+++ b/.ci/scripts/build_ucx.sh
@@ -0,0 +1,17 @@
+#!/bin/bash -eEx
+set -o pipefail
+
+echo "INFO: Build UCX"
+cd "${TORCH_UCC_SRC_DIR}/ucx"
+git checkout "${UCX_BRANCH}"
+"${TORCH_UCC_SRC_DIR}/ucx/autogen.sh"
+mkdir -p "${TORCH_UCC_SRC_DIR}/ucx/build-${UCX_BUILD_TYPE}"
+cd "${TORCH_UCC_SRC_DIR}/ucx/build-${UCX_BUILD_TYPE}"
+# TODO debug
+"${TORCH_UCC_SRC_DIR}/ucx/contrib/configure-release-mt" --with-cuda="${CUDA_HOME}" --prefix="${UCX_INSTALL_DIR}"
+#"${TORCH_UCC_SRC_DIR}/ucx/contrib/configure-release-mt" --prefix="${UCX_INSTALL_DIR}"
+make -j install
+echo "${UCX_INSTALL_DIR}/lib" > /etc/ld.so.conf.d/ucx.conf
+ldconfig
+ldconfig -p | grep -i ucx
+cd "${UCX_INSTALL_DIR}" && tar cfz "${TORCH_UCC_PKG_DIR}/ucx-${UCX_BUILD_TYPE}.tgz" --owner=0 --group=0 .
diff --git a/.ci/scripts/build_xccl.sh b/.ci/scripts/build_xccl.sh
new file mode 100755
index 0000000..9571c71
--- /dev/null
+++ b/.ci/scripts/build_xccl.sh
@@ -0,0 +1,19 @@
+#!/bin/bash -eEx
+set -o pipefail
+
+echo "INFO: Build XCCL"
+XCCL_SRC_DIR="${TORCH_UCC_SRC_DIR}/xccl"
+cd "${XCCL_SRC_DIR}"
+"${XCCL_SRC_DIR}/autogen.sh"
+mkdir -p "${XCCL_SRC_DIR}/build-${XCCL_BUILD_TYPE}"
+cd "${XCCL_SRC_DIR}/build-${XCCL_BUILD_TYPE}"
+# TODO enable CUDA (compilation failed)
+#"${XCCL_SRC_DIR}/configure" --with-ucx="${UCX_INSTALL_DIR}" --prefix="${XCCL_INSTALL_DIR}" --enable-debug
+"${XCCL_SRC_DIR}/configure" --with-cuda="${CUDA_HOME}" --with-ucx="${UCX_INSTALL_DIR}" \
+    --prefix="${XCCL_INSTALL_DIR}" --enable-debug
+make -j install
+echo "${XCCL_INSTALL_DIR}/lib" > /etc/ld.so.conf.d/xccl.conf
+ldconfig
+ldconfig -p | grep -i libxccl
+make -C test
+cd "${XCCL_INSTALL_DIR}" && tar cfz "${TORCH_UCC_PKG_DIR}/xccl-${XCCL_BUILD_TYPE}.tgz" --owner=0 --group=0 .
diff --git a/.ci/scripts/configure_python.sh b/.ci/scripts/configure_python.sh
new file mode 100755
index 0000000..45dbc94
--- /dev/null
+++ b/.ci/scripts/configure_python.sh
@@ -0,0 +1,29 @@
+#!/bin/bash -eEx
+set -o pipefail
+
+# Install conda
+#cd /tmp
+#curl -LO http://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh
+#bash Miniconda3-latest-Linux-x86_64.sh -p /opt/conda -b
+#rm -f Miniconda3-latest-Linux-x86_64.sh
+#export PATH /opt/conda/bin:${PATH}
+
+# Install conda python
+#conda update -y conda
+#conda install -c anaconda -y \
+#    python \
+#    pip \
+#    scikit-learn
+#pip3 install --no-cache-dir python-hostlist
+
+#alternatives --set python /opt/conda/bin/python3
+alternatives --set python /usr/bin/python3
+pip3 install --user --upgrade setuptools wheel
+
+command -v python
+python --version
+
+command -v python3
+python3 --version
+
+pip3 list
diff --git a/.ci/scripts/env.sh b/.ci/scripts/env.sh
new file mode 100755
index 0000000..649acaa
--- /dev/null
+++ b/.ci/scripts/env.sh
@@ -0,0 +1,16 @@
+#!/bin/bash -eEx
+
+SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd -P)"
+
+# shellcheck disable=SC2034
+#DLRM_MODEL="big"
+DLRM_MODEL="small"
+
+HOSTNAME=$(hostname -s)
+export HOSTNAME
+SRC_ROOT_DIR=$(cd "${SCRIPT_DIR}/../../" && pwd -P)
+export CONFIGS_DIR="${SRC_ROOT_DIR}/.ci/configs"
+
+# DLRM MASTER_PORT
+export MASTER_PORT="12346"
+export DOCKER_SSH_PORT="12345"
diff --git a/.ci/scripts/install_torch.sh b/.ci/scripts/install_torch.sh
new file mode 100755
index 0000000..dce6bd6
--- /dev/null
+++ b/.ci/scripts/install_torch.sh
@@ -0,0 +1,40 @@
+#!/bin/bash -eEx
+set -o pipefail
+
+# TODO debug
+#cd /tmp
+#git clone https://github.com/pytorch/pytorch.git
+#cd /tmp/pytorch
+#git submodule sync --recursive
+#git submodule update --init --recursive
+#pip3 install -r requirements.txt
+#export TORCH_CUDA_ARCH_LIST="7.0 8.0+PTX"
+#export USE_GLOO=1
+#export USE_DISTRIBUTED=1
+#export USE_OPENCV=0
+## TODO debug
+#export USE_CUDA=1
+##export USE_CUDA=0
+#export USE_NCCL=0
+#export USE_MKLDNN=0
+#export BUILD_TEST=0
+#export USE_FBGEMM=0
+#export USE_NNPACK=0
+#export USE_QNNPACK=0
+#export USE_XNNPACK=0
+#export USE_KINETO=1
+#export MAX_JOBS=$(($(nproc)-1))
+#python setup.py install
+#cd -
+#rm -rf /tmp/pytorch
+
+# TODO debug
+#conda install -y pytorch torchvision cpuonly -c pytorch-nightly
+#conda install pytorch torchvision torchaudio cudatoolkit=11.0 -c pytorch-nightly
+#conda uninstall -y pytorch torchvision
+#conda install pytorch torchvision cudatoolkit=11.0 -c pytorch-nightly
+#conda install pytorch cudatoolkit=11.0 -c pytorch-nightly
+
+pip3 install --default-timeout=900 numpy
+pip3 install --default-timeout=900 --pre torch -f https://download.pytorch.org/whl/nightly/cu110/torch_nightly.html
+pip3 install "git+https://github.com/mlperf/logging.git@0.7.1"
diff --git a/.ci/scripts/install_torch_ucc.sh b/.ci/scripts/install_torch_ucc.sh
new file mode 100755
index 0000000..a0d2527
--- /dev/null
+++ b/.ci/scripts/install_torch_ucc.sh
@@ -0,0 +1,18 @@
+#!/bin/bash -eEx
+set -o pipefail
+
+# UCC
+echo "INFO: Install Torch-UCC (UCC version)"
+cd "${TORCH_UCC_PYTHON_VENV_DIR}"
+python3 -m venv --system-site-packages ucc
+. "${TORCH_UCC_PYTHON_VENV_DIR}/ucc/bin/activate"
+export UCX_HOME=${UCX_INSTALL_DIR}
+export UCC_HOME=${UCC_INSTALL_DIR}
+export WITH_CUDA=${CUDA_HOME}
+cd "${TORCH_UCC_SRC_DIR}"
+git clean -ffdx
+python setup.py install bdist_wheel
+pip3 list | grep torch
+python -c 'import torch, torch_ucc'
+cp "${TORCH_UCC_SRC_DIR}/dist/"*.whl "${TORCH_UCC_PKG_DIR}"
+deactivate
diff --git a/.ci/scripts/install_torch_xccl.sh b/.ci/scripts/install_torch_xccl.sh
new file mode 100755
index 0000000..92231b4
--- /dev/null
+++ b/.ci/scripts/install_torch_xccl.sh
@@ -0,0 +1,21 @@
+#!/bin/bash -eEx
+set -o pipefail
+
+# XCCL
+echo "INFO: Install Torch-UCC (XCCL version)"
+cd "${TORCH_UCC_PYTHON_VENV_DIR}"
+python3 -m venv --system-site-packages xccl
+. "${TORCH_UCC_PYTHON_VENV_DIR}/xccl/bin/activate"
+export UCX_HOME=${UCX_INSTALL_DIR}
+export XCCL_HOME=${XCCL_INSTALL_DIR}
+export WITH_CUDA=${CUDA_HOME}
+TORCH_UCC_SRC_DIR_WITH_XCCL_SUPPORT="${TORCH_UCC_SRC_DIR}_xccl"
+mkdir -p "${TORCH_UCC_SRC_DIR_WITH_XCCL_SUPPORT}"
+git clone https://github.com/openucx/torch-ucc.git "${TORCH_UCC_SRC_DIR_WITH_XCCL_SUPPORT}"
+cd "${TORCH_UCC_SRC_DIR_WITH_XCCL_SUPPORT}"
+git clean -ffdx
+python setup.py install bdist_wheel
+pip3 list | grep torch
+python -c 'import torch, torch_ucc'
+cp "${TORCH_UCC_SRC_DIR_WITH_XCCL_SUPPORT}/dist/"*.whl "${TORCH_UCC_PKG_DIR}"
+deactivate
diff --git a/.ci/scripts/run_dlrm.sh b/.ci/scripts/run_dlrm.sh
new file mode 100755
index 0000000..026870c
--- /dev/null
+++ b/.ci/scripts/run_dlrm.sh
@@ -0,0 +1,71 @@
+#!/bin/bash -eEx
+set -o pipefail
+
+SCRIPT_DIR="$(
+    cd "$(dirname "$0")"
+    pwd -P
+)"
+cd "${SCRIPT_DIR}"
+. "${SCRIPT_DIR}/env.sh"
+
+TORCH_UCC_MODE="$1"
+CPU_GPU_MODE="$2"
+HOSTFILE="$3"
+
+if [ "${TORCH_UCC_MODE}" != "ucc" ] && [ "${TORCH_UCC_MODE}" != "xccl" ]; then
+    echo "ERROR: unsupported or empty TORCH_UCC_MODE (${TORCH_UCC_MODE}), supported values: ucc, xccl"
+    exit 1
+fi
+
+export TORCH_UCC_MODE
+export CPU_GPU_MODE
+
+if [ -z "$HOSTFILE" ]; then
+    echo "ERROR: HOSTFILE is not specified"
+    exit 1
+fi
+
+export PATH="/usr/lib64/openmpi/bin:$PATH"
+export LD_LIBRARY_PATH="/usr/lib64/openmpi/lib:${LD_LIBRARY_PATH}"
+
+HEAD_NODE=$(head -1 "$HOSTFILE")
+export HEAD_NODE
+export MASTER_ADDR=${HEAD_NODE}
+
+NP=$(wc --lines "$HOSTFILE" | awk '{print $1}')
+
+# shellcheck disable=SC2086
+mpirun \
+    -np $NP \
+    --hostfile ${HOSTFILE} \
+    --map-by node \
+    --allow-run-as-root \
+    --mca plm_rsh_args '-p 12345' \
+    -x PATH \
+    -x LD_LIBRARY_PATH \
+    hostname
+
+# shellcheck disable=SC2086
+mpirun \
+    -np $NP \
+    --hostfile ${HOSTFILE} \
+    --map-by node \
+    --allow-run-as-root \
+    --mca plm_rsh_args '-p 12345' \
+    -x PATH \
+    -x LD_LIBRARY_PATH \
+    cat /proc/1/cgroup
+
+# shellcheck disable=SC2086
+mpirun \
+    -np $NP \
+    --hostfile ${HOSTFILE} \
+    --map-by node \
+    --allow-run-as-root \
+    --mca plm_rsh_args '-p 12345' \
+    -x PATH \
+    -x LD_LIBRARY_PATH \
+    -x MASTER_ADDR \
+    -x TORCH_UCC_MODE \
+    -x CPU_GPU_MODE \
+    /opt/nvidia/torch-ucc/src/.ci/scripts/run_dlrm_s_pytorch.sh
diff --git a/.ci/scripts/run_dlrm_docker.sh b/.ci/scripts/run_dlrm_docker.sh
new file mode 100755
index 0000000..5ea307b
--- /dev/null
+++ b/.ci/scripts/run_dlrm_docker.sh
@@ -0,0 +1,100 @@
+#!/bin/bash -eEx
+set -o pipefail
+
+function err_report () {
+    echo "Exited with ERROR in line $1"
+    exit 1
+}
+trap 'err_report $LINENO' ERR
+
+SCRIPT_DIR="$(
+    cd "$(dirname "$0")"
+    pwd -P
+)"
+cd "${SCRIPT_DIR}"
+. "${SCRIPT_DIR}/env.sh"
+
+TORCH_UCC_MODE="$1"
+
+if [ "${TORCH_UCC_MODE}" != "ucc" ] && [ "${TORCH_UCC_MODE}" != "xccl" ]; then
+    echo "ERROR: unsupported or empty TORCH_UCC_MODE (${TORCH_UCC_MODE}), supported values: ucc, xccl"
+    exit 1
+fi
+
+export HOSTFILE=${HOSTFILE:-${CONFIGS_DIR}/$HOSTNAME/hostfile.txt}
+
+if [ ! -f "${HOSTFILE}" ]; then
+    echo "ERROR: ${HOSTFILE} does not exist"
+    exit 1
+fi
+
+# shellcheck disable=SC2002
+HOSTS=$(cat "$HOSTFILE" | xargs | tr ' ' ',')
+export HOSTS
+HEAD_NODE=$(head -1 "$HOSTFILE")
+export HEAD_NODE
+
+DOCKER_CONTAINER_NAME="torch_ucc"
+# TODO debug
+DOCKER_IMAGE_NAME="${TORCH_UCC_DOCKER_IMAGE_NAME}:${BUILD_ID}"
+#DOCKER_IMAGE_NAME="harbor.mellanox.com/torch-ucc/1.0.0/x86_64/centos8/cuda11.2.1:205"
+
+DOCKER_RUN_ARGS="\
+--pull always \
+--network=host \
+--uts=host \
+--ipc=host \
+--ulimit stack=67108864 \
+--ulimit memlock=-1 \
+--security-opt seccomp=unconfined \
+--cap-add=SYS_ADMIN \
+--device=/dev/infiniband/ \
+--gpus all \
+--user root \
+-it \
+-d \
+--rm \
+--name=${DOCKER_CONTAINER_NAME} \
+-v /labhome:/labhome \
+-v /root/.ssh:/root/.ssh \
+"
+
+# shellcheck disable=SC2013
+for HOST in $(cat "$HOSTFILE"); do
+    echo "INFO: HOST = $HOST"
+
+    STALE_DOCKER_CONTAINER_LIST=$(sudo ssh -n "$HOST" "docker ps -a -q -f name=${DOCKER_CONTAINER_NAME}")
+    if [ -n "${STALE_DOCKER_CONTAINER_LIST}" ]; then
+        echo "WARNING: stale docker container (name: ${DOCKER_CONTAINER_NAME}) is detected on ${HOST} (to be stopped)"
+        echo "INFO: Stopping stale docker container (name: ${DOCKER_CONTAINER_NAME}) on ${HOST}..."
+        sudo ssh "${HOST}" docker stop ${DOCKER_CONTAINER_NAME}
+        echo "INFO: Stopping stale docker container (name: ${DOCKER_CONTAINER_NAME}) on ${HOST}... DONE"
+    fi
+
+    echo "INFO: start docker container on $HOST ..."
+    # shellcheck disable=SC2029
+    sudo ssh "$HOST" "docker run \
+        ${DOCKER_RUN_ARGS} \
+        ${DOCKER_IMAGE_NAME} \
+        bash -c '/usr/sbin/sshd -p ${DOCKER_SSH_PORT}; sleep infinity'"
+    echo "INFO: start docker container on $HOST ... DONE"
+
+    sleep 5
+
+    echo "INFO: verify docker container on $HOST ..."
+    sudo ssh -p "${DOCKER_SSH_PORT}" "$HOST" hostname
+    sudo ssh -p "${DOCKER_SSH_PORT}" "$HOST" cat /proc/1/cgroup
+    echo "INFO: verify docker container on $HOST ... DONE"
+done
+
+# TODO remove sudo
+sudo ssh -p "${DOCKER_SSH_PORT}" "${HEAD_NODE}" /opt/nvidia/torch-ucc/src/.ci/scripts/run_dlrm.sh ${TORCH_UCC_MODE} cpu /opt/nvidia/torch-ucc/src/.ci/configs/$HOSTNAME/hostfile.txt
+sudo ssh -p "${DOCKER_SSH_PORT}" "${HEAD_NODE}" /opt/nvidia/torch-ucc/src/.ci/scripts/run_dlrm.sh ${TORCH_UCC_MODE} gpu /opt/nvidia/torch-ucc/src/.ci/configs/$HOSTNAME/hostfile.txt
+
+# TODO debug
+# shellcheck disable=SC2013
+#for HOST in $(cat "$HOSTFILE"); do
+#    echo "INFO: stop docker container on $HOST ..."
+#    ssh "${HOST}" docker stop ${DOCKER_CONTAINER_NAME}
+#    echo "INFO: stop docker container on $HOST ... DONE"
+#done
diff --git a/.ci/scripts/run_dlrm_s_pytorch.sh b/.ci/scripts/run_dlrm_s_pytorch.sh
new file mode 100755
index 0000000..82b2f1b
--- /dev/null
+++ b/.ci/scripts/run_dlrm_s_pytorch.sh
@@ -0,0 +1,81 @@
+#!/bin/bash -eEx
+set -o pipefail
+
+SCRIPT_DIR="$(
+    cd "$(dirname "$0")"
+    pwd -P
+)"
+cd "${SCRIPT_DIR}"
+. "${SCRIPT_DIR}/env.sh"
+
+if [ "${TORCH_UCC_MODE}" != "ucc" ] && [ "${TORCH_UCC_MODE}" != "xccl" ]; then
+    echo "ERROR: unsupported or empty TORCH_UCC_MODE (${TORCH_UCC_MODE}), supported values: ucc, xccl"
+    exit 1
+fi
+
+# shellcheck disable=SC1090
+. "/opt/nvidia/torch-ucc/bin/python/venv/${TORCH_UCC_MODE}/bin/activate"
+pip3 list | grep torch
+python -c 'import torch, torch_ucc'
+
+case ${DLRM_MODEL} in
+"big")
+    emb_size="1000-1000-1000-1000-1000-1000-1000-1000-1000-1000-1000-1000-1000-1000-1000-1000-1000-1000-1000-1000-1000-1000-1000-1000-1000-1000-1000-1000-1000-1000-1000-1000-1000-1000-1000-1000-1000-1000-1000-1000-1000-1000-1000-1000-1000-1000-1000-1000-1000-1000-1000-1000-1000-1000-1000-1000-1000-1000-1000-1000-1000-1000-1000-1000"
+    emb_dim="256"
+    emb_lookup="100"
+    bot_mlp="512-512-256"
+    top_mlp="1024-1024-1024-1"
+    loss_func="mse"
+    round_targets="False"
+    lr="0.01"
+    #mb_size="2048"
+    emb_lookup_fixed="0"
+    ;;
+"small")
+    emb_size="1000-1000-1000-1000-1000-1000-1000-1000"
+    emb_dim="64"
+    emb_lookup="100"
+    bot_mlp="512-512-64"
+    top_mlp="1024-1024-1024-1"
+    loss_func="mse"
+    round_targets="False"
+    lr="0.01"
+    #mb_size="2048"
+    emb_lookup_fixed="0"
+    ;;
+*)
+    echo "ERROR: unsupported or empty DLRM_MODEL (${DLRM_MODEL})"
+    exit 1
+    ;;
+esac
+
+export UCX_NET_DEVICES="mlx5_0:1"
+
+if [ "${CPU_GPU_MODE}" = "gpu" ]; then
+    DLRM_S_PYTORCH_EXTRA_ARGS="--use-gpu"
+fi
+
+# shellcheck disable=SC2086
+python /opt/nvidia/torch-ucc/workloads/dlrm/dlrm_s_pytorch.py \
+    --mini-batch-size=2048 \
+    --test-mini-batch-size=16384 \
+    --test-num-workers=0 \
+    --num-batches=10 \
+    --data-generation=random \
+    --arch-mlp-bot=$bot_mlp \
+    --arch-mlp-top=$top_mlp \
+    --arch-sparse-feature-size=$emb_dim \
+    --arch-embedding-size=$emb_size \
+    --num-indices-per-lookup=$emb_lookup \
+    --num-indices-per-lookup-fixed=$emb_lookup_fixed \
+    --arch-interaction-op=dot \
+    --numpy-rand-seed=727 \
+    --print-freq=1 \
+    --loss-function=$loss_func \
+    --round-targets=$round_targets \
+    --learning-rate=$lr \
+    --print-time \
+    --dist-backend=ucc \
+    ${DLRM_S_PYTORCH_EXTRA_ARGS}
+
+deactivate
diff --git a/.ci/scripts/run_param_benchmarks.sh b/.ci/scripts/run_param_benchmarks.sh
new file mode 100755
index 0000000..5c35e31
--- /dev/null
+++ b/.ci/scripts/run_param_benchmarks.sh
@@ -0,0 +1,46 @@
+#!/bin/bash -eEx
+set -o pipefail
+
+# TODO debug
+exit 0
+
+source /workspace/set-env-dist.sh
+index=$LOCAL_RANK
+export OMPI_COMM_WORLD_SIZE=$WORLD_SIZE
+export OMPI_COMM_WORLD_LOCAL_SIZE=$LOCAL_SIZE
+export OMPI_COMM_WORLD_RANK=$RANK
+export OMPI_COMM_WORLD_LOCAL_RANK=$LOCAL_RANK
+
+if (( $index == 0 )); then
+    export UCX_NET_DEVICES=mlx5_0:1
+    NUMA="numactl --physcpubind=48-63 --membind=3 "
+elif (( $index == 1 )); then
+    export UCX_NET_DEVICES=mlx5_1:1
+    NUMA="numactl --physcpubind=48-63 --membind=3 "
+elif (( $index == 2 )); then
+    export UCX_NET_DEVICES=mlx5_2:1
+    NUMA="numactl --physcpubind=16-31 --membind=1 "
+elif (( $index == 3 )); then
+    export UCX_NET_DEVICES=mlx5_3:1
+    NUMA="numactl --physcpubind=16-31 --membind=1 "
+elif (( $index == 4 )); then
+    export UCX_NET_DEVICES=mlx5_6:1
+    NUMA="numactl --physcpubind=112-127 --membind=7 "
+elif (( $index == 5 )); then
+    export UCX_NET_DEVICES=mlx5_7:1
+    NUMA="numactl --physcpubind=112-127 --membind=7 "
+elif (( $index == 6 )); then
+    export UCX_NET_DEVICES=mlx5_8:1
+    NUMA="numactl --physcpubind=80-95 --membind=5 "
+elif (( $index == 7 )); then
+    export UCX_NET_DEVICES=mlx5_9:1
+    NUMA="numactl --physcpubind=80-95 --membind=5 "
+fi
+
+export XCCL_TEAM_UCX_NET_DEVICES=$UCX_NET_DEVICES
+export XCCL_TEAM_HIER_NET_DEVICES=$UCX_NET_DEVICES
+
+EXE="$NUMA python /workspace/param/train/comms/pt/comms.py \
+    --master-ip $MASTER_ADDR \
+    --master-port $MASTER_PORT $@"
+$EXE
diff --git a/.ci/scripts/run_tests_torch_ucc.sh b/.ci/scripts/run_tests_torch_ucc.sh
new file mode 100755
index 0000000..7ffcca1
--- /dev/null
+++ b/.ci/scripts/run_tests_torch_ucc.sh
@@ -0,0 +1,26 @@
+#!/bin/bash -eEx
+set -o pipefail
+
+command -v mpirun
+export TORCH_UCC_XCCL_TLS=ucx
+export UCX_WARN_UNUSED_ENV_VARS=n
+ucx_info -e -u t
+TORCH_UCC_SRC_DIR_WITH_XCCL_SUPPORT="${TORCH_UCC_SRC_DIR}_xccl"
+
+echo "UCC barrier"
+/bin/bash ${TORCH_UCC_SRC_DIR_WITH_XCCL_SUPPORT}/test/start_test.sh ${TORCH_UCC_SRC_DIR_WITH_XCCL_SUPPORT}/test/torch_barrier_test.py --backend=gloo
+
+echo "UCC alltoall"
+/bin/bash ${TORCH_UCC_SRC_DIR_WITH_XCCL_SUPPORT}/test/start_test.sh ${TORCH_UCC_SRC_DIR_WITH_XCCL_SUPPORT}/test/torch_alltoall_test.py --backend=gloo
+
+echo "UCC alltoallv"
+/bin/bash ${TORCH_UCC_SRC_DIR_WITH_XCCL_SUPPORT}/test/start_test.sh ${TORCH_UCC_SRC_DIR_WITH_XCCL_SUPPORT}/test/torch_alltoallv_test.py --backend=gloo
+
+echo "UCC allgather"
+/bin/bash ${TORCH_UCC_SRC_DIR_WITH_XCCL_SUPPORT}/test/start_test.sh ${TORCH_UCC_SRC_DIR_WITH_XCCL_SUPPORT}/test/torch_allgather_test.py --backend=gloo
+
+echo "UCC allreduce"
+/bin/bash ${TORCH_UCC_SRC_DIR_WITH_XCCL_SUPPORT}/test/start_test.sh ${TORCH_UCC_SRC_DIR_WITH_XCCL_SUPPORT}/test/torch_allreduce_test.py --backend=gloo
+
+echo "UCC broadcast"
+/bin/bash ${TORCH_UCC_SRC_DIR_WITH_XCCL_SUPPORT}/test/start_test.sh ${TORCH_UCC_SRC_DIR_WITH_XCCL_SUPPORT}/test/torch_bcast_test.py --backend=gloo
diff --git a/.ci/scripts/run_tests_torch_xccl.sh b/.ci/scripts/run_tests_torch_xccl.sh
new file mode 100755
index 0000000..b817470
--- /dev/null
+++ b/.ci/scripts/run_tests_torch_xccl.sh
@@ -0,0 +1,24 @@
+#!/bin/bash -eEx
+set -o pipefail
+
+command -v mpirun
+export UCX_WARN_UNUSED_ENV_VARS=n
+ucx_info -e -u t
+
+echo "XCCL allreduce"
+/bin/bash ${TORCH_UCC_SRC_DIR}/test/start_test.sh ${TORCH_UCC_SRC_DIR}/test/torch_allreduce_test.py --backend=gloo
+
+echo "XCCL alltoall"
+/bin/bash ${TORCH_UCC_SRC_DIR}/test/start_test.sh ${TORCH_UCC_SRC_DIR}/test/torch_alltoall_test.py --backend=gloo
+
+echo "XCCL alltoallv"
+/bin/bash ${TORCH_UCC_SRC_DIR}/test/start_test.sh ${TORCH_UCC_SRC_DIR}/test/torch_alltoallv_test.py --backend=gloo
+
+echo "XCCL barrier"
+/bin/bash ${TORCH_UCC_SRC_DIR}/test/start_test.sh ${TORCH_UCC_SRC_DIR}/test/torch_barrier_test.py --backend=gloo
+
+echo "XCCL allgather"
+/bin/bash ${TORCH_UCC_SRC_DIR}/test/start_test.sh ${TORCH_UCC_SRC_DIR}/test/torch_allgather_test.py --backend=gloo
+
+echo "XCCL broadcast"
+/bin/bash ${TORCH_UCC_SRC_DIR}/test/start_test.sh ${TORCH_UCC_SRC_DIR}/test/torch_bcast_test.py --backend=gloo
diff --git a/.ci/scripts/run_tests_ucc.sh b/.ci/scripts/run_tests_ucc.sh
new file mode 100755
index 0000000..bed2dd5
--- /dev/null
+++ b/.ci/scripts/run_tests_ucc.sh
@@ -0,0 +1,8 @@
+#!/bin/bash -eEx
+set -o pipefail
+
+UCC_SRC_DIR="${TORCH_UCC_SRC_DIR}/ucc"
+cd "${UCC_SRC_DIR}/build"
+
+export UCX_WARN_UNUSED_ENV_VARS=n
+make gtest
diff --git a/.ci/scripts/run_tests_xccl.sh b/.ci/scripts/run_tests_xccl.sh
new file mode 100755
index 0000000..c3929ca
--- /dev/null
+++ b/.ci/scripts/run_tests_xccl.sh
@@ -0,0 +1,25 @@
+#!/bin/bash -eEx
+set -o pipefail
+
+command -v mpirun
+export UCX_SOCKADDR_CM_ENABLE=n
+export UCX_WARN_UNUSED_ENV_VARS=n
+MPI_ARGS_COMMON="--allow-run-as-root --oversubscribe -np 8 -H localhost:8 --bind-to none --mca coll ^hcoll --mca btl ^openib --mca mtl ^ofi"
+mpirun ${MPI_ARGS_COMMON} -x XCCL_TEST_TLS=hier ${TORCH_UCC_SRC_DIR}/xccl/build-${XCCL_BUILD_TYPE}/test/test_mpi_allreduce
+mpirun ${MPI_ARGS_COMMON} -x XCCL_TEST_TLS=hier ${TORCH_UCC_SRC_DIR}/xccl/build-${XCCL_BUILD_TYPE}/test/test_mpi_bcast
+mpirun ${MPI_ARGS_COMMON} -x XCCL_TEST_TLS=hier ${TORCH_UCC_SRC_DIR}/xccl/build-${XCCL_BUILD_TYPE}/test/test_mpi_barrier
+
+mpirun ${MPI_ARGS_COMMON} -x XCCL_TEAM_HIER_NODE_LEADER_RANK_ID=3 -x XCCL_TEST_TLS=hier ${TORCH_UCC_SRC_DIR}/xccl/build-${XCCL_BUILD_TYPE}/test/test_mpi_allreduce
+mpirun ${MPI_ARGS_COMMON} -x XCCL_TEAM_HIER_NODE_LEADER_RANK_ID=4 -x XCCL_TEST_TLS=hier ${TORCH_UCC_SRC_DIR}/xccl/build-${XCCL_BUILD_TYPE}/test/test_mpi_bcast
+mpirun ${MPI_ARGS_COMMON} -x XCCL_TEAM_HIER_NODE_LEADER_RANK_ID=5 -x XCCL_TEST_TLS=hier ${TORCH_UCC_SRC_DIR}/xccl/build-${XCCL_BUILD_TYPE}/test/test_mpi_barrier
+
+mpirun ${MPI_ARGS_COMMON} -x XCCL_TEAM_UCX_ALLREDUCE_ALG_ID=0 -x XCCL_TEST_TLS=ucx ${TORCH_UCC_SRC_DIR}/xccl/build-${XCCL_BUILD_TYPE}/test/test_mpi_allreduce
+mpirun ${MPI_ARGS_COMMON} -x XCCL_TEAM_UCX_ALLREDUCE_ALG_ID=1 -x XCCL_TEST_TLS=ucx ${TORCH_UCC_SRC_DIR}/xccl/build-${XCCL_BUILD_TYPE}/test/test_mpi_allreduce
+mpirun ${MPI_ARGS_COMMON} -x XCCL_TEST_TLS=ucx ${TORCH_UCC_SRC_DIR}/xccl/build-${XCCL_BUILD_TYPE}/test/test_mpi_bcast
+mpirun ${MPI_ARGS_COMMON} -x XCCL_TEST_TLS=ucx ${TORCH_UCC_SRC_DIR}/xccl/build-${XCCL_BUILD_TYPE}/test/test_mpi_barrier
+mpirun ${MPI_ARGS_COMMON} -x XCCL_TEST_TLS=ucx ${TORCH_UCC_SRC_DIR}/xccl/build-${XCCL_BUILD_TYPE}/test/test_mpi_alltoall
+mpirun ${MPI_ARGS_COMMON} -x XCCL_TEST_TLS=ucx ${TORCH_UCC_SRC_DIR}/xccl/build-${XCCL_BUILD_TYPE}/test/test_mpi_alltoallv
+mpirun ${MPI_ARGS_COMMON} -x XCCL_TEST_TLS=ucx ${TORCH_UCC_SRC_DIR}/xccl/build-${XCCL_BUILD_TYPE}/test/test_mpi_allgather
+mpirun -x XCCL_TEAM_UCX_ALLTOALL_PAIRWISE_CHUNK=0 ${MPI_ARGS_COMMON} -x XCCL_TEST_TLS=ucx ${TORCH_UCC_SRC_DIR}/xccl/build-${XCCL_BUILD_TYPE}/test/test_mpi_alltoall
+mpirun -x XCCL_TEAM_UCX_ALLTOALL_PAIRWISE_CHUNK=0 ${MPI_ARGS_COMMON} -x XCCL_TEST_TLS=ucx ${TORCH_UCC_SRC_DIR}/xccl/build-${XCCL_BUILD_TYPE}/test/test_mpi_alltoallv
+mpirun ${MPI_ARGS_COMMON} -x XCCL_TEST_TLS=hier -x XCCL_TEST_ITERS=500 -x XCCL_TEST_NTHREADS=4 -x XCCL_TEST_CHECK=1 ${TORCH_UCC_SRC_DIR}/xccl/build-${XCCL_BUILD_TYPE}/test/test_mpi_mt
diff --git a/.gitignore b/.gitignore
index 6c7df1c..e7050cd 100644
--- a/.gitignore
+++ b/.gitignore
@@ -60,3 +60,6 @@ dist/
 # vscode
 *.code-workspace
 .vscode
+
+# IDEs
+.idea/
diff --git a/.gitmodules b/.gitmodules
new file mode 100644
index 0000000..562a232
--- /dev/null
+++ b/.gitmodules
@@ -0,0 +1,9 @@
+[submodule "xccl"]
+path = xccl
+url = https://github.com/openucx/xccl.git
+[submodule "ucc"]
+path = ucc
+url = https://github.com/openucx/ucc.git
+[submodule "ucx"]
+path = ucx
+url = https://github.com/openucx/ucx.git
diff --git a/ucc b/ucc
new file mode 160000
index 0000000..7f75fad
--- /dev/null
+++ b/ucc
@@ -0,0 +1 @@
+Subproject commit 7f75fad3f7a72e1053cbb4246a1cc7e62c75d4b3
diff --git a/ucx b/ucx
new file mode 160000
index 0000000..737b5c4
--- /dev/null
+++ b/ucx
@@ -0,0 +1 @@
+Subproject commit 737b5c4edface2e33c2321ac88e83320cad598eb
diff --git a/xccl b/xccl
new file mode 160000
index 0000000..b046913
--- /dev/null
+++ b/xccl
@@ -0,0 +1 @@
+Subproject commit b04691392586477dd83bdf6de75f440540cb688c