diff --git a/.ci/Dockerfile.centos8 b/.ci/Dockerfile.centos8
new file mode 100644
index 0000000..1a0fdf9
--- /dev/null
+++ b/.ci/Dockerfile.centos8
@@ -0,0 +1,94 @@
+ARG CUDA_VER='11.2.1'
+FROM nvidia/cuda:${CUDA_VER}-devel-centos8
+#==============================================================================
+ARG TORCH_UCC_ROOT_DIR=/opt/nvidia/torch-ucc
+ENV TORCH_UCC_SRC_DIR=${TORCH_UCC_ROOT_DIR}/src
+ENV TORCH_UCC_PKG_DIR=${TORCH_UCC_ROOT_DIR}/pkg
+ENV TORCH_UCC_BIN_DIR=${TORCH_UCC_ROOT_DIR}/bin
+ENV TORCH_UCC_WORKLOADS_DIR=${TORCH_UCC_ROOT_DIR}/workloads
+ENV CUDA_HOME=/usr/local/cuda
+ENV UCX_BRANCH=v1.10.x
+ENV UCX_BUILD_TYPE=release-mt
+ENV UCX_INSTALL_DIR=${TORCH_UCC_BIN_DIR}/ucx/build-${UCX_BUILD_TYPE}
+ENV XCCL_BUILD_TYPE=debug
+ENV XCCL_INSTALL_DIR=${TORCH_UCC_BIN_DIR}/xccl/build-${XCCL_BUILD_TYPE}
+ENV UCC_INSTALL_DIR=${TORCH_UCC_BIN_DIR}/ucc/build
+ENV TORCH_UCC_PYTHON_VENV_DIR=${TORCH_UCC_BIN_DIR}/python/venv
+#==============================================================================
+RUN mkdir -p ${TORCH_UCC_SRC_DIR} && \
+    mkdir -p ${TORCH_UCC_PKG_DIR} && \
+    mkdir -p ${TORCH_UCC_BIN_DIR} && \
+    mkdir -p ${TORCH_UCC_WORKLOADS_DIR} && \
+    mkdir -p ${TORCH_UCC_PYTHON_VENV_DIR}
+
+COPY . ${TORCH_UCC_SRC_DIR}
+#==============================================================================
+RUN yum groupinstall -y \
+    'Development Tools' \
+    'Infiniband Support'
+RUN yum config-manager --set-enabled powertools && yum install -y \
+    cmake \
+    numactl \
+    numactl-devel \
+    openmpi \
+    openmpi-devel \
+    openssh-server \
+    protobuf-compiler \
+    protobuf-devel \
+    python36-devel \
+    rdma-core-devel \
+    vim
+# Remove old UCX
+RUN rpm -e --nodeps ucx
+ENV PATH=/usr/lib64/openmpi/bin:$PATH
+RUN echo "export PATH=\"/usr/lib64/openmpi/bin:\$PATH\"" >> /etc/bashrc && \
+    echo "export LD_LIBRARY_PATH=\"/usr/lib64/openmpi/lib:\${LD_LIBRARY_PATH}\"" >> /etc/bashrc
+#==============================================================================
+# Configure SSH
+RUN mkdir -p /var/run/sshd && \
+    cat /etc/ssh/ssh_config | grep -v StrictHostKeyChecking > /etc/ssh/ssh_config.new && \
+    echo " StrictHostKeyChecking no" >> /etc/ssh/ssh_config.new && \
+    mv /etc/ssh/ssh_config.new /etc/ssh/ssh_config && \
+    ssh-keygen -A && \
+    rm -f /run/nologin
+#==============================================================================
+# Build UCX
+RUN ${TORCH_UCC_SRC_DIR}/.ci/scripts/build_ucx.sh
+ENV PATH=${UCX_INSTALL_DIR}/bin:${PATH}
+#==============================================================================
+# Configure Python
+RUN ${TORCH_UCC_SRC_DIR}/.ci/scripts/configure_python.sh
+#==============================================================================
+# Build XCCL
+RUN ${TORCH_UCC_SRC_DIR}/.ci/scripts/build_xccl.sh
+#==============================================================================
+# Build UCC
+RUN ${TORCH_UCC_SRC_DIR}/.ci/scripts/build_ucc.sh
+#==============================================================================
+# Install PyTorch
+RUN ${TORCH_UCC_SRC_DIR}/.ci/scripts/install_torch.sh
+#==============================================================================
+# Install workloads
+# TODO upstream the patches (if needed)
+WORKDIR ${TORCH_UCC_WORKLOADS_DIR}
+RUN git clone https://github.com/facebookresearch/dlrm.git && \
+    cd ${TORCH_UCC_WORKLOADS_DIR}/dlrm && \
+    git apply ${TORCH_UCC_SRC_DIR}/.ci/patches/dlrm/0001-Added-torch_ucc-support.patch && \
+    git apply ${TORCH_UCC_SRC_DIR}/.ci/patches/dlrm/0002-Fixed-arg-list.patch && \
+    pip3 install -r ${TORCH_UCC_WORKLOADS_DIR}/dlrm/requirements.txt && \
+    pip3 install tensorboard
+RUN git clone https://github.com/facebookresearch/param.git && \
+    pip3 install -r ${TORCH_UCC_WORKLOADS_DIR}/param/requirements.txt
+#==============================================================================
+# Install torch_ucc (XCCL version) python module and build a wheel package
+RUN ${TORCH_UCC_SRC_DIR}/.ci/scripts/install_torch_xccl.sh
+#==============================================================================
+# Install torch_ucc (UCC version) python module and build a wheel package
+RUN ${TORCH_UCC_SRC_DIR}/.ci/scripts/install_torch_ucc.sh
+#==============================================================================
+RUN groupadd -g 11429 swx-jenkins
+RUN adduser --uid 6213 --gid 11429 --home /home/swx-jenkins swx-jenkins
+
+RUN groupadd -g 30 dip
+RUN adduser --no-create-home --uid 50009 --gid 30 --home /labhome/artemry artemry
+#==============================================================================
diff --git a/.ci/Dockerfile.ubuntu20.04 b/.ci/Dockerfile.ubuntu20.04
new file mode 100644
index 0000000..dcff626
--- /dev/null
+++ b/.ci/Dockerfile.ubuntu20.04
@@ -0,0 +1,75 @@
+#ARG CUDA_VER='11.2.1'
+ARG CUDA_VER='11.1.1'
+FROM nvidia/cuda:${CUDA_VER}-devel-ubuntu20.04
+#==============================================================================
+ARG TORCH_UCC_ROOT_DIR=/opt/nvidia/torch-ucc
+ENV TORCH_UCC_SRC_DIR=${TORCH_UCC_ROOT_DIR}/src
+ENV TORCH_UCC_PKG_DIR=${TORCH_UCC_ROOT_DIR}/pkg
+ENV TORCH_UCC_BIN_DIR=${TORCH_UCC_ROOT_DIR}/bin
+ENV TORCH_UCC_WORKLOADS_DIR=${TORCH_UCC_ROOT_DIR}/workloads
+ENV CUDA_HOME=/usr/local/cuda
+ENV UCX_BRANCH=v1.10.x
+ENV UCX_BUILD_TYPE=release-mt
+ENV UCX_INSTALL_DIR=${TORCH_UCC_BIN_DIR}/ucx/build-${UCX_BUILD_TYPE}
+ENV XCCL_BUILD_TYPE=debug
+ENV XCCL_INSTALL_DIR=${TORCH_UCC_BIN_DIR}/xccl/build-${XCCL_BUILD_TYPE}
+#==============================================================================
+RUN mkdir -p ${TORCH_UCC_SRC_DIR} && \
+    mkdir -p ${TORCH_UCC_PKG_DIR} && \
+    mkdir -p ${TORCH_UCC_BIN_DIR} && \
+    mkdir -p ${TORCH_UCC_WORKLOADS_DIR}
+
+COPY . ${TORCH_UCC_SRC_DIR}
+#==============================================================================
+ARG DEBIAN_FRONTEND=noninteractive
+RUN apt update && \
+    apt install -y \
+    apt-utils \
+    autoconf \
+    build-essential \
+    cmake \
+    curl \
+    git \
+    ibverbs-providers \
+    ibverbs-utils \
+    libnuma-dev \
+    libtool-bin \
+    ninja-build \
+    openmpi-bin \
+    openssh-server \
+    vim \
+    && \
+    rm -rf /var/lib/apt/lists/*
+#==============================================================================
+# Configure SSH
+RUN mkdir -p /var/run/sshd && \
+    cat /etc/ssh/ssh_config | grep -v StrictHostKeyChecking > /etc/ssh/ssh_config.new && \
+    echo " StrictHostKeyChecking no" >> /etc/ssh/ssh_config.new && \
+    mv /etc/ssh/ssh_config.new /etc/ssh/ssh_config
+#==============================================================================
+# Build UCX
+RUN ${TORCH_UCC_SRC_DIR}/.ci/scripts/build_ucx.sh
+ENV PATH=${UCX_INSTALL_DIR}/bin:${PATH}
+#==============================================================================
+# Configure Python
+RUN ${TORCH_UCC_SRC_DIR}/.ci/scripts/configure_python.sh
+ENV PATH /opt/conda/bin:${PATH}
+#==============================================================================
+# Build XCCL
+RUN ${TORCH_UCC_SRC_DIR}/.ci/scripts/build_xccl.sh
+#==============================================================================
+# Install PyTorch
+RUN ${TORCH_UCC_SRC_DIR}/.ci/scripts/install_torch.sh
+#==============================================================================
+# Install torch_ucc python module and build a wheel package
+RUN ${TORCH_UCC_SRC_DIR}/.ci/scripts/install_torch_ucc.sh
+#==============================================================================
+# Install workloads
+WORKDIR ${TORCH_UCC_WORKLOADS_DIR}
+RUN git clone https://github.com/facebookresearch/dlrm.git && \
+    cd ${TORCH_UCC_WORKLOADS_DIR}/dlrm && \
+    git apply ${TORCH_UCC_SRC_DIR}/.ci/patches/dlrm/0001-Added-torch_ucc-support.patch && \
+    pip install -r ${TORCH_UCC_WORKLOADS_DIR}/dlrm/requirements.txt && \
+    pip install tensorboard
+RUN git clone https://github.com/facebookresearch/param.git && \
+    pip install -r ${TORCH_UCC_WORKLOADS_DIR}/param/requirements.txt
\ No newline at end of file
diff --git a/.ci/Jenkinsfile.shlib b/.ci/Jenkinsfile.shlib
new file mode 100644
index 0000000..2486083
--- /dev/null
+++ b/.ci/Jenkinsfile.shlib
@@ -0,0 +1,9 @@
+#!/usr/bin/groovy
+
+// load pipeline functions
+// Requires pipeline-github-lib plugin to load library from github
+@Library('github.com/Mellanox/ci-demo@stable')
+def matrix = new com.mellanox.cicd.Matrix()
+
+matrix.main()
+
diff --git a/.ci/configs/swx-clx01/hostfile.txt b/.ci/configs/swx-clx01/hostfile.txt
new file mode 100644
index 0000000..d813292
--- /dev/null
+++ b/.ci/configs/swx-clx01/hostfile.txt
@@ -0,0 +1,2 @@
+swx-clx01
+swx-clx02
diff --git a/.ci/configs/swx-clx02/hostfile.txt b/.ci/configs/swx-clx02/hostfile.txt
new file mode 100644
index 0000000..fbddaa1
--- /dev/null
+++ b/.ci/configs/swx-clx02/hostfile.txt
@@ -0,0 +1,2 @@
+swx-clx02
+swx-clx01
diff --git a/.ci/job_matrix.yaml b/.ci/job_matrix.yaml
new file mode 100644
index 0000000..5fb88a2
--- /dev/null
+++ b/.ci/job_matrix.yaml
@@ -0,0 +1,148 @@
+---
+job: 'torch-ucc'
+
+registry_host: 'harbor.mellanox.com'
+registry_path: '/torch-ucc'
+registry_auth: '05d98651-e11c-4a57-9cc6-52df79014b89'
+
+#kubernetes:
+#  cloud: 'swx-k8s'
+
+volumes:
+  - { mountPath: '/hpc/local', hostPath: '/hpc/local' }
+  - { mountPath: '/auto/sw_tools', hostPath: '/auto/sw_tools' }
+  - { mountPath: '/.autodirect/mtrswgwork', hostPath: '/.autodirect/mtrswgwork' }
+  - { mountPath: '/.autodirect/sw/release', hostPath: '/.autodirect/sw/release' }
+
+env:
+  CUDA_VER: '11.2.1'
+  TORCH_UCC_URI_SUFFIX: '${TORCH_UCC_VERSION}/x86_64/centos8/cuda${CUDA_VER}'
+  TORCH_UCC_DOCKER_IMAGE_NAME: '${registry_host}${registry_path}/${TORCH_UCC_URI_SUFFIX}'
+  TORCH_UCC_ROOT_DIR: '/opt/nvidia/torch-ucc'
+  TORCH_UCC_SRC_DIR: '${TORCH_UCC_ROOT_DIR}/src'
+  TORCH_UCC_BIN_DIR: '${TORCH_UCC_ROOT_DIR}/bin'
+  TORCH_UCC_PYTHON_VENV_DIR: '${TORCH_UCC_BIN_DIR}/python/venv'
+  XCCL_BUILD_TYPE: 'debug'
+
+docker_opt: '--pull always --network=host --uts=host --ipc=host --ulimit stack=67108864 --ulimit memlock=-1 --security-opt seccomp=unconfined --cap-add=SYS_ADMIN --device=/dev/infiniband/ --gpus all --user root'
+
+runs_on_dockers:
+  - {
+      file: '.ci/Dockerfile.centos8',
+      name: 'centos8',
+      tag: '${BUILD_NUMBER}',
+      arch: 'x86_64',
+      uri: '${TORCH_UCC_URI_SUFFIX}',
+      build_args: '--rm --no-cache --build-arg CUDA_VER=${CUDA_VER} --build-arg TORCH_UCC_ROOT_DIR=${TORCH_UCC_ROOT_DIR}',
+      cloud: 'swx-k8s',
+      nodeLabel: 'swx-clx01 || swx-clx02',
+    }
+
+# bare metal
+runs_on_agents:
+  - nodeLabel: 'swx-clx01 || swx-clx02'
+
+# TODO debug
+timeout_minutes: '400'
+
+steps:
+  #============================================================================
+  - name: Check Env
+    agentSelector: "{nodeLabel: 'swx-clx01 || swx-clx02'}"
+    containerSelector: "{name:'centos8'}"
+    run: |
+      echo "INFO: check environment"
+      hostname
+      printenv
+      cat /proc/1/cgroup
+      cat /etc/*release*
+      id
+      #find /opt/nvidia
+      #ibv_devinfo
+      #nvidia-smi
+      #nvidia-smi topo -m
+  #============================================================================
+  - name: Run XCCL tests
+    #agentSelector: "{nodeLabel: 'swx-clx01 || swx-clx02'}"
+    containerSelector: "{name:'centos8'}"
+    run: |
+      echo "INFO: Run XCCL tests"
+      . "${TORCH_UCC_PYTHON_VENV_DIR}/xccl/bin/activate"
+      hostname
+      cat /proc/1/cgroup
+      pip3 list | grep torch
+      ${TORCH_UCC_SRC_DIR}/.ci/scripts/run_tests_xccl.sh
+      deactivate
+  #============================================================================
+#  - name: Run UCC tests
+#    #agentSelector: "{nodeLabel: 'swx-clx01 || swx-clx02'}"
+#    containerSelector: "{name:'centos8'}"
+#    run: |
+#      echo "INFO: Run UCC tests"
+#      . "${TORCH_UCC_PYTHON_VENV_DIR}/ucc/bin/activate"
+#      hostname
+#      cat /proc/1/cgroup
+#      pip3 list | grep torch
+#      ${TORCH_UCC_SRC_DIR}/.ci/scripts/run_tests_ucc.sh
+#      deactivate
+  #============================================================================
+  - name: Run Torch-UCC tests (XCCL)
+    #agentSelector: "{nodeLabel: 'swx-clx01 || swx-clx02'}"
+    containerSelector: "{name:'centos8'}"
+    run: |
+      echo "INFO: Run Torch-UCC tests (XCCL)"
+      . "${TORCH_UCC_PYTHON_VENV_DIR}/xccl/bin/activate"
+      hostname
+      cat /proc/1/cgroup
+      pip3 list | grep torch
+      ${TORCH_UCC_SRC_DIR}/.ci/scripts/run_tests_torch_xccl.sh
+      deactivate
+  #============================================================================
+  - name: Run Torch-UCC tests (UCC)
+    #agentSelector: "{nodeLabel: 'swx-clx01 || swx-clx02'}"
+    containerSelector: "{name:'centos8'}"
+    run: |
+      echo "INFO: Run Torch-UCC tests (UCC)"
+      . "${TORCH_UCC_PYTHON_VENV_DIR}/ucc/bin/activate"
+      hostname
+      cat /proc/1/cgroup
+      pip3 list | grep torch
+      ${TORCH_UCC_SRC_DIR}/.ci/scripts/run_tests_torch_ucc.sh
+      deactivate
+  #============================================================================
+  - name: Run DLRM tests (XCCL/GPU)
+    agentSelector: "{nodeLabel: 'swx-clx01 || swx-clx02'}"
+    run: |
+      echo "INFO: Run DLRM tests (XCCL/GPU)"
+      hostname
+      printenv
+      cat /proc/1/cgroup
+      cat /etc/*release*
+      id
+      find /opt/nvidia
+      ibv_devinfo
+      nvidia-smi
+      ${WORKSPACE}/.ci/scripts/run_dlrm_docker.sh xccl
+  #============================================================================
+  - name: Run DLRM tests (UCC/GPU)
+    agentSelector: "{nodeLabel: 'swx-clx01 || swx-clx02'}"
+    run: |
+      echo "INFO: Run DLRM tests (UCC/GPU)"
+      hostname
+      printenv
+      cat /proc/1/cgroup
+      cat /etc/*release*
+      id
+      find /opt/nvidia
+      ibv_devinfo
+      nvidia-smi
+      ${WORKSPACE}/.ci/scripts/run_dlrm_docker.sh ucc
+  #============================================================================
+#  - name: Run PARAM benchmarks
+#    agentSelector: "{nodeLabel: 'swx-clx01 || swx-clx02'}"
+#    run: |
+#      echo "INFO: Run PARAM benchmarks"
+#      hostname
+#      cat /proc/1/cgroup
+#      #${TORCH_UCC_SRC_DIR}/.ci/scripts/run_param_benchmarks.sh
+  #============================================================================
diff --git a/.ci/patches/dlrm/0001-Added-torch_ucc-support.patch b/.ci/patches/dlrm/0001-Added-torch_ucc-support.patch
new file mode 100644
index 0000000..2620579
--- /dev/null
+++ b/.ci/patches/dlrm/0001-Added-torch_ucc-support.patch
@@ -0,0 +1,30 @@
+From bcd8fc065ef04a0ea8f06e61a5e2581a308719fd Mon Sep 17 00:00:00 2001
+From: artemry-nv
+Date: Tue, 9 Mar 2021 00:41:16 +0300
+Subject: [PATCH] Added torch_ucc support
+
+Signed-off-by: artemry-nv
+---
+ extend_distributed.py | 6 ++++++
+ 1 file changed, 6 insertions(+)
+
+diff --git a/extend_distributed.py b/extend_distributed.py
+index adcb60b..1f2c8a5 100644
+--- a/extend_distributed.py
++++ b/extend_distributed.py
+@@ -20,6 +20,12 @@ except ImportError as e:
+     # print(e)
+     torch_ccl = False
+ 
++try:
++    import torch_ucc
++except ImportError as e:
++    torch_ucc = False
++
++
+ my_rank = -1
+ my_size = -1
+ my_local_rank = -1
+-- 
+2.24.3 (Apple Git-128)
+
diff --git a/.ci/patches/dlrm/0002-Fixed-arg-list.patch b/.ci/patches/dlrm/0002-Fixed-arg-list.patch
new file mode 100644
index 0000000..4f6b1ce
--- /dev/null
+++ b/.ci/patches/dlrm/0002-Fixed-arg-list.patch
@@ -0,0 +1,25 @@
+From 481fd6aef896aa8ff15a161b7e88b2ea01ae673a Mon Sep 17 00:00:00 2001
+From: artemry-nv
+Date: Mon, 29 Mar 2021 01:56:08 +0300
+Subject: [PATCH] Fixed arg list
+
+---
+ dlrm_s_pytorch.py | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+diff --git a/dlrm_s_pytorch.py b/dlrm_s_pytorch.py
+index 71a0414..36dab9b 100644
+--- a/dlrm_s_pytorch.py
++++ b/dlrm_s_pytorch.py
+@@ -1477,7 +1477,7 @@ def run():
+ 
+     ext_dist.barrier()
+     with torch.autograd.profiler.profile(
+-        args.enable_profiling, use_gpu, record_shapes=True
++        args.enable_profiling, use_cuda=use_gpu, record_shapes=True
+     ) as prof:
+         if not args.inference_only:
+             k = 0
+-- 
+2.24.3 (Apple Git-128)
+
diff --git a/.ci/proj_jjb.yaml b/.ci/proj_jjb.yaml
new file mode 100644
index 0000000..0760821
--- /dev/null
+++ b/.ci/proj_jjb.yaml
@@ -0,0 +1,79 @@
+- job-template:
+    name: "{jjb_proj}"
+    project-type: pipeline
+    properties:
+      - github:
+          url: "{jjb_git}"
+      - build-discarder:
+          days-to-keep: 50
+          num-to-keep: 20
+      - inject:
+          keep-system-variables: true
+          properties-content: |
+            jjb_proj={jjb_proj}
+    description: Do NOT edit this job through the Web GUI !
+    concurrent: true
+    sandbox: true
+    parameters:
+      - string:
+          name: "sha1"
+          default: "master"
+          description: "Commit to be checked, set by PR"
+      - bool:
+          name: "build_dockers"
+          default: false
+          description: "Rebuild docker containers"
+      - string:
+          name: "conf_file"
+          default: ".ci/job_matrix.yaml"
+          description: "Regex to select job config file"
+      - bool:
+          name: "do_release"
+          default: false
+          description: "Release rpm"
+      - string:
+          name: "release_dir"
+          default: "/.autodirect/sw/release/sw_acceleration/{jjb_proj}"
+          description: "Location to release rpm to"
+      - string:
+          name: "script"
+          default: "{jjb_jenkinsfile}"
+          description: "Jenkinsfile to load on trigger"
+      - string:
+          name: "DEBUG"
+          default: 0
+          description: "Enable debug prints and traces, valid values are 0-9"
+#    triggers:
+#      - github-pull-request:
+#          cron: 'H/5 * * * *'
+#          trigger-phrase: '.*\bbot:retest\b.*'
+#          status-add-test-results: true
+#          auth-id: '549927eb-7f38-4a8f-997a-81dd63605782'
+#          org-list: ["Mellanox"]
+#          white-list: ["swx-jenkins","swx-jenkins2","swx-jenkins3","mike-dubman","mellanox-github"]
+#          allow-whitelist-orgs-as-admins: true
+    pipeline-scm:
+      scm:
+        - git:
+            url: "{jjb_git}"
+            credentials-id: '549927eb-7f38-4a8f-997a-81dd63605782'
+            branches: [ '$sha1' ]
+            shallow-clone: true
+            depth: 10
+            refspec: "+refs/heads/*:refs/remotes/origin/* +refs/pull/*:refs/remotes/origin/pr/*"
+            browser: githubweb
+            browser-url: "{jjb_git}"
+      script-path: "$script"
+
+- project:
+    name: proj_name
+    # TODO
+    jjb_email: 'TODO'
+    jjb_proj: 'torch-ucc'
+    # TODO tmp
+    jjb_git: 'git@github.com:artemry-nv/torch-ucc.git'
+    # TODO
+    jjb_owner: 'TODO'
+    jjb_jenkinsfile: '.ci/Jenkinsfile.shlib'
+    jobs:
+      - "{jjb_proj}"
diff --git a/.ci/scripts/build_ucc.sh b/.ci/scripts/build_ucc.sh
new file mode 100755
index 0000000..988d570
--- /dev/null
+++ b/.ci/scripts/build_ucc.sh
@@ -0,0 +1,16 @@
+#!/bin/bash -eEx
+set -o pipefail
+
+echo "INFO: Build UCC"
+UCC_SRC_DIR="${TORCH_UCC_SRC_DIR}/ucc"
+cd "${UCC_SRC_DIR}"
+"${UCC_SRC_DIR}/autogen.sh"
+mkdir -p "${UCC_SRC_DIR}/build"
+cd "${UCC_SRC_DIR}/build"
+"${UCC_SRC_DIR}/configure" --with-ucx="${UCX_INSTALL_DIR}" --with-cuda="${CUDA_HOME}" \
+    --prefix="${UCC_INSTALL_DIR}" --enable-gtest
+make -j install
+echo "${UCC_INSTALL_DIR}/lib" > /etc/ld.so.conf.d/ucc.conf
+ldconfig
+ldconfig -p | grep -i libucc
+cd "${UCC_INSTALL_DIR}" && tar cfz "${TORCH_UCC_PKG_DIR}/ucc.tgz" --owner=0 --group=0 .
diff --git a/.ci/scripts/build_ucx.sh b/.ci/scripts/build_ucx.sh
new file mode 100755
index 0000000..8df3411
--- /dev/null
+++ b/.ci/scripts/build_ucx.sh
@@ -0,0 +1,17 @@
+#!/bin/bash -eEx
+set -o pipefail
+
+echo "INFO: Build UCX"
+cd "${TORCH_UCC_SRC_DIR}/ucx"
+git checkout "${UCX_BRANCH}"
+"${TORCH_UCC_SRC_DIR}/ucx/autogen.sh"
+mkdir -p "${TORCH_UCC_SRC_DIR}/ucx/build-${UCX_BUILD_TYPE}"
+cd "${TORCH_UCC_SRC_DIR}/ucx/build-${UCX_BUILD_TYPE}"
+# TODO debug
+"${TORCH_UCC_SRC_DIR}/ucx/contrib/configure-release-mt" --with-cuda="${CUDA_HOME}" --prefix="${UCX_INSTALL_DIR}"
+#"${TORCH_UCC_SRC_DIR}/ucx/contrib/configure-release-mt" --prefix="${UCX_INSTALL_DIR}"
+make -j install
+echo "${UCX_INSTALL_DIR}/lib" > /etc/ld.so.conf.d/ucx.conf
+ldconfig
+ldconfig -p | grep -i ucx
+cd "${UCX_INSTALL_DIR}" && tar cfz "${TORCH_UCC_PKG_DIR}/ucx-${UCX_BUILD_TYPE}.tgz" --owner=0 --group=0 .
diff --git a/.ci/scripts/build_xccl.sh b/.ci/scripts/build_xccl.sh
new file mode 100755
index 0000000..9571c71
--- /dev/null
+++ b/.ci/scripts/build_xccl.sh
@@ -0,0 +1,19 @@
+#!/bin/bash -eEx
+set -o pipefail
+
+echo "INFO: Build XCCL"
+XCCL_SRC_DIR="${TORCH_UCC_SRC_DIR}/xccl"
+cd "${XCCL_SRC_DIR}"
+"${XCCL_SRC_DIR}/autogen.sh"
+mkdir -p "${XCCL_SRC_DIR}/build-${XCCL_BUILD_TYPE}"
+cd "${XCCL_SRC_DIR}/build-${XCCL_BUILD_TYPE}"
+# TODO enable CUDA (compilation failed)
+#"${XCCL_SRC_DIR}/configure" --with-ucx="${UCX_INSTALL_DIR}" --prefix="${XCCL_INSTALL_DIR}" --enable-debug
+"${XCCL_SRC_DIR}/configure" --with-cuda="${CUDA_HOME}" --with-ucx="${UCX_INSTALL_DIR}" \
+    --prefix="${XCCL_INSTALL_DIR}" --enable-debug
+make -j install
+echo "${XCCL_INSTALL_DIR}/lib" > /etc/ld.so.conf.d/xccl.conf
+ldconfig
+ldconfig -p | grep -i libxccl
+make -C test
+cd "${XCCL_INSTALL_DIR}" && tar cfz "${TORCH_UCC_PKG_DIR}/xccl-${XCCL_BUILD_TYPE}.tgz" --owner=0 --group=0 .
diff --git a/.ci/scripts/configure_python.sh b/.ci/scripts/configure_python.sh
new file mode 100755
index 0000000..45dbc94
--- /dev/null
+++ b/.ci/scripts/configure_python.sh
@@ -0,0 +1,29 @@
+#!/bin/bash -eEx
+set -o pipefail
+
+# Install conda
+#cd /tmp
+#curl -LO http://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh
+#bash Miniconda3-latest-Linux-x86_64.sh -p /opt/conda -b
+#rm -f Miniconda3-latest-Linux-x86_64.sh
+#export PATH /opt/conda/bin:${PATH}
+
+# Install conda python
+#conda update -y conda
+#conda install -c anaconda -y \
+#    python \
+#    pip \
+#    scikit-learn
+#pip3 install --no-cache-dir python-hostlist
+
+#alternatives --set python /opt/conda/bin/python3
+alternatives --set python /usr/bin/python3
+pip3 install --user --upgrade setuptools wheel
+
+command -v python
+python --version
+
+command -v python3
+python3 --version
+
+pip3 list
diff --git a/.ci/scripts/env.sh b/.ci/scripts/env.sh
new file mode 100755
index 0000000..649acaa
--- /dev/null
+++ b/.ci/scripts/env.sh
@@ -0,0 +1,16 @@
+#!/bin/bash -eEx
+
+SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd -P)"
+
+# shellcheck disable=SC2034
+#DLRM_MODEL="big"
+DLRM_MODEL="small"
+
+HOSTNAME=$(hostname -s)
+export HOSTNAME
+SRC_ROOT_DIR=$(cd "${SCRIPT_DIR}/../../" && pwd -P)
+export CONFIGS_DIR="${SRC_ROOT_DIR}/.ci/configs"
+
+# DLRM MASTER_PORT
+export MASTER_PORT="12346"
+export DOCKER_SSH_PORT="12345"
diff --git a/.ci/scripts/install_torch.sh b/.ci/scripts/install_torch.sh
new file mode 100755
index 0000000..dce6bd6
--- /dev/null
+++ b/.ci/scripts/install_torch.sh
@@ -0,0 +1,40 @@
+#!/bin/bash -eEx
+set -o pipefail
+
+# TODO debug
+#cd /tmp
+#git clone https://github.com/pytorch/pytorch.git
+#cd /tmp/pytorch
+#git submodule sync --recursive
+#git submodule update --init --recursive
+#pip3 install -r requirements.txt
+#export TORCH_CUDA_ARCH_LIST="7.0 8.0+PTX"
+#export USE_GLOO=1
+#export USE_DISTRIBUTED=1
+#export USE_OPENCV=0
+## TODO debug
+#export USE_CUDA=1
+##export USE_CUDA=0
+#export USE_NCCL=0
+#export USE_MKLDNN=0
+#export BUILD_TEST=0
+#export USE_FBGEMM=0
+#export USE_NNPACK=0
+#export USE_QNNPACK=0
+#export USE_XNNPACK=0
+#export USE_KINETO=1
+#export MAX_JOBS=$(($(nproc)-1))
+#python setup.py install
+#cd -
+#rm -rf /tmp/pytorch
+
+# TODO debug
+#conda install -y pytorch torchvision cpuonly -c pytorch-nightly
+#conda install pytorch torchvision torchaudio cudatoolkit=11.0 -c pytorch-nightly
+#conda uninstall -y pytorch torchvision
+#conda install pytorch torchvision cudatoolkit=11.0 -c pytorch-nightly
+#conda install pytorch cudatoolkit=11.0 -c pytorch-nightly
+
+pip3 install --default-timeout=900 numpy
+pip3 install --default-timeout=900 --pre torch -f https://download.pytorch.org/whl/nightly/cu110/torch_nightly.html
+pip3 install "git+https://github.com/mlperf/logging.git@0.7.1"
diff --git a/.ci/scripts/install_torch_ucc.sh b/.ci/scripts/install_torch_ucc.sh
new file mode 100755
index 0000000..a0d2527
--- /dev/null
+++ b/.ci/scripts/install_torch_ucc.sh
@@ -0,0 +1,18 @@
+#!/bin/bash -eEx
+set -o pipefail
+
+# UCC
+echo "INFO: Install Torch-UCC (UCC version)"
+cd "${TORCH_UCC_PYTHON_VENV_DIR}"
+python3 -m venv --system-site-packages ucc
+. "${TORCH_UCC_PYTHON_VENV_DIR}/ucc/bin/activate"
+export UCX_HOME=${UCX_INSTALL_DIR}
+export UCC_HOME=${UCC_INSTALL_DIR}
+export WITH_CUDA=${CUDA_HOME}
+cd "${TORCH_UCC_SRC_DIR}"
+git clean -ffdx
+python setup.py install bdist_wheel
+pip3 list | grep torch
+python -c 'import torch, torch_ucc'
+cp "${TORCH_UCC_SRC_DIR}/dist/"*.whl "${TORCH_UCC_PKG_DIR}"
+deactivate
diff --git a/.ci/scripts/install_torch_xccl.sh b/.ci/scripts/install_torch_xccl.sh
new file mode 100755
index 0000000..92231b4
--- /dev/null
+++ b/.ci/scripts/install_torch_xccl.sh
@@ -0,0 +1,21 @@
+#!/bin/bash -eEx
+set -o pipefail
+
+# XCCL
+echo "INFO: Install Torch-UCC (XCCL version)"
+cd "${TORCH_UCC_PYTHON_VENV_DIR}"
+python3 -m venv --system-site-packages xccl
+. "${TORCH_UCC_PYTHON_VENV_DIR}/xccl/bin/activate"
+export UCX_HOME=${UCX_INSTALL_DIR}
+export XCCL_HOME=${XCCL_INSTALL_DIR}
+export WITH_CUDA=${CUDA_HOME}
+TORCH_UCC_SRC_DIR_WITH_XCCL_SUPPORT="${TORCH_UCC_SRC_DIR}_xccl"
+mkdir -p "${TORCH_UCC_SRC_DIR_WITH_XCCL_SUPPORT}"
+git clone https://github.com/openucx/torch-ucc.git "${TORCH_UCC_SRC_DIR_WITH_XCCL_SUPPORT}"
+cd "${TORCH_UCC_SRC_DIR_WITH_XCCL_SUPPORT}"
+git clean -ffdx
+python setup.py install bdist_wheel
+pip3 list | grep torch
+python -c 'import torch, torch_ucc'
+cp "${TORCH_UCC_SRC_DIR_WITH_XCCL_SUPPORT}/dist/"*.whl "${TORCH_UCC_PKG_DIR}"
+deactivate
diff --git a/.ci/scripts/run_dlrm.sh b/.ci/scripts/run_dlrm.sh
new file mode 100755
index 0000000..026870c
--- /dev/null
+++ b/.ci/scripts/run_dlrm.sh
@@ -0,0 +1,71 @@
+#!/bin/bash -eEx
+set -o pipefail
+
+SCRIPT_DIR="$(
+    cd "$(dirname "$0")"
+    pwd -P
+)"
+cd "${SCRIPT_DIR}"
+. "${SCRIPT_DIR}/env.sh"
+
+TORCH_UCC_MODE="$1"
+CPU_GPU_MODE="$2"
+HOSTFILE="$3"
+
+if [ "${TORCH_UCC_MODE}" != "ucc" ] && [ "${TORCH_UCC_MODE}" != "xccl" ]; then
+    echo "ERROR: unsupported or empty TORCH_UCC_MODE (${TORCH_UCC_MODE}), supported values: ucc, xccl"
+    exit 1
+fi
+
+export TORCH_UCC_MODE
+export CPU_GPU_MODE
+
+if [ -z "$HOSTFILE" ]; then
+    echo "ERROR: HOSTFILE is not specified"
+    exit 1
+fi
+
+export PATH="/usr/lib64/openmpi/bin:$PATH"
+export LD_LIBRARY_PATH="/usr/lib64/openmpi/lib:${LD_LIBRARY_PATH}"
+
+HEAD_NODE=$(head -1 "$HOSTFILE")
+export HEAD_NODE
+export MASTER_ADDR=${HEAD_NODE}
+
+NP=$(wc --lines "$HOSTFILE" | awk '{print $1}')
+
+# shellcheck disable=SC2086
+mpirun \
+    -np $NP \
+    --hostfile ${HOSTFILE} \
+    --map-by node \
+    --allow-run-as-root \
+    --mca plm_rsh_args '-p 12345' \
+    -x PATH \
+    -x LD_LIBRARY_PATH \
+    hostname
+
+# shellcheck disable=SC2086
+mpirun \
+    -np $NP \
+    --hostfile ${HOSTFILE} \
+    --map-by node \
+    --allow-run-as-root \
+    --mca plm_rsh_args '-p 12345' \
+    -x PATH \
+    -x LD_LIBRARY_PATH \
+    cat /proc/1/cgroup
+
+# shellcheck disable=SC2086
+mpirun \
+    -np $NP \
+    --hostfile ${HOSTFILE} \
+    --map-by node \
+    --allow-run-as-root \
+    --mca plm_rsh_args '-p 12345' \
+    -x PATH \
+    -x LD_LIBRARY_PATH \
+    -x MASTER_ADDR \
+    -x TORCH_UCC_MODE \
+    -x CPU_GPU_MODE \
+    /opt/nvidia/torch-ucc/src/.ci/scripts/run_dlrm_s_pytorch.sh
diff --git a/.ci/scripts/run_dlrm_docker.sh b/.ci/scripts/run_dlrm_docker.sh
new file mode 100755
index 0000000..5ea307b
--- /dev/null
+++ b/.ci/scripts/run_dlrm_docker.sh
@@ -0,0 +1,100 @@
+#!/bin/bash -eEx
+set -o pipefail
+
+function err_report () {
+    echo "Exited with ERROR in line $1"
+    exit 1
+}
+trap 'err_report $LINENO' ERR
+
+SCRIPT_DIR="$(
+    cd "$(dirname "$0")"
+    pwd -P
+)"
+cd "${SCRIPT_DIR}"
+. "${SCRIPT_DIR}/env.sh"
+
+TORCH_UCC_MODE="$1"
+
+if [ "${TORCH_UCC_MODE}" != "ucc" ] && [ "${TORCH_UCC_MODE}" != "xccl" ]; then
+    echo "ERROR: unsupported or empty TORCH_UCC_MODE (${TORCH_UCC_MODE}), supported values: ucc, xccl"
+    exit 1
+fi
+
+export HOSTFILE=${HOSTFILE:-${CONFIGS_DIR}/$HOSTNAME/hostfile.txt}
+
+if [ ! -f "${HOSTFILE}" ]; then
+    echo "ERROR: ${HOSTFILE} does not exist"
+    exit 1
+fi
+
+# shellcheck disable=SC2002
+HOSTS=$(cat "$HOSTFILE" | xargs | tr ' ' ',')
+export HOSTS
+HEAD_NODE=$(head -1 "$HOSTFILE")
+export HEAD_NODE
+
+DOCKER_CONTAINER_NAME="torch_ucc"
+# TODO debug
+DOCKER_IMAGE_NAME="${TORCH_UCC_DOCKER_IMAGE_NAME}:${BUILD_ID}"
+#DOCKER_IMAGE_NAME="harbor.mellanox.com/torch-ucc/1.0.0/x86_64/centos8/cuda11.2.1:205"
+
+DOCKER_RUN_ARGS="\
+--pull always \
+--network=host \
+--uts=host \
+--ipc=host \
+--ulimit stack=67108864 \
+--ulimit memlock=-1 \
+--security-opt seccomp=unconfined \
+--cap-add=SYS_ADMIN \
+--device=/dev/infiniband/ \
+--gpus all \
+--user root \
+-it \
+-d \
+--rm \
+--name=${DOCKER_CONTAINER_NAME} \
+-v /labhome:/labhome \
+-v /root/.ssh:/root/.ssh \
+"
+
+# shellcheck disable=SC2013
+for HOST in $(cat "$HOSTFILE"); do
+    echo "INFO: HOST = $HOST"
+
+    STALE_DOCKER_CONTAINER_LIST=$(sudo ssh -n "$HOST" "docker ps -a -q -f name=${DOCKER_CONTAINER_NAME}")
+    if [ -n "${STALE_DOCKER_CONTAINER_LIST}" ]; then
+        echo "WARNING: stale docker container (name: ${DOCKER_CONTAINER_NAME}) is detected on ${HOST} (to be stopped)"
+        echo "INFO: Stopping stale docker container (name: ${DOCKER_CONTAINER_NAME}) on ${HOST}..."
+        sudo ssh "${HOST}" docker stop ${DOCKER_CONTAINER_NAME}
+        echo "INFO: Stopping stale docker container (name: ${DOCKER_CONTAINER_NAME}) on ${HOST}... DONE"
+    fi
+
+    echo "INFO: start docker container on $HOST ..."
+    # shellcheck disable=SC2029
+    sudo ssh "$HOST" "docker run \
+        ${DOCKER_RUN_ARGS} \
+        ${DOCKER_IMAGE_NAME} \
+        bash -c '/usr/sbin/sshd -p ${DOCKER_SSH_PORT}; sleep infinity'"
+    echo "INFO: start docker container on $HOST ... DONE"
+
+    sleep 5
+
+    echo "INFO: verify docker container on $HOST ..."
+    sudo ssh -p "${DOCKER_SSH_PORT}" "$HOST" hostname
+    sudo ssh -p "${DOCKER_SSH_PORT}" "$HOST" cat /proc/1/cgroup
+    echo "INFO: verify docker container on $HOST ... DONE"
+done
+
+# TODO remove sudo
+sudo ssh -p "${DOCKER_SSH_PORT}" "${HEAD_NODE}" /opt/nvidia/torch-ucc/src/.ci/scripts/run_dlrm.sh ${TORCH_UCC_MODE} cpu /opt/nvidia/torch-ucc/src/.ci/configs/$HOSTNAME/hostfile.txt
+sudo ssh -p "${DOCKER_SSH_PORT}" "${HEAD_NODE}" /opt/nvidia/torch-ucc/src/.ci/scripts/run_dlrm.sh ${TORCH_UCC_MODE} gpu /opt/nvidia/torch-ucc/src/.ci/configs/$HOSTNAME/hostfile.txt
+
+# TODO debug
+# shellcheck disable=SC2013
+#for HOST in $(cat "$HOSTFILE"); do
+#    echo "INFO: stop docker container on $HOST ..."
+#    ssh "${HOST}" docker stop ${DOCKER_CONTAINER_NAME}
+#    echo "INFO: stop docker container on $HOST ... DONE"
+#done
diff --git a/.ci/scripts/run_dlrm_s_pytorch.sh b/.ci/scripts/run_dlrm_s_pytorch.sh
new file mode 100755
index 0000000..82b2f1b
--- /dev/null
+++ b/.ci/scripts/run_dlrm_s_pytorch.sh
@@ -0,0 +1,81 @@
+#!/bin/bash -eEx
+set -o pipefail
+
+SCRIPT_DIR="$(
+    cd "$(dirname "$0")"
+    pwd -P
+)"
+cd "${SCRIPT_DIR}"
+. "${SCRIPT_DIR}/env.sh"
+
+if [ "${TORCH_UCC_MODE}" != "ucc" ] && [ "${TORCH_UCC_MODE}" != "xccl" ]; then
+    echo "ERROR: unsupported or empty TORCH_UCC_MODE (${TORCH_UCC_MODE}), supported values: ucc, xccl"
+    exit 1
+fi
+
+# shellcheck disable=SC1090
+. "/opt/nvidia/torch-ucc/bin/python/venv/${TORCH_UCC_MODE}/bin/activate"
+pip3 list | grep torch
+python -c 'import torch, torch_ucc'
+
+case ${DLRM_MODEL} in
+"big")
+    emb_size="1000-1000-1000-1000-1000-1000-1000-1000-1000-1000-1000-1000-1000-1000-1000-1000-1000-1000-1000-1000-1000-1000-1000-1000-1000-1000-1000-1000-1000-1000-1000-1000-1000-1000-1000-1000-1000-1000-1000-1000-1000-1000-1000-1000-1000-1000-1000-1000-1000-1000-1000-1000-1000-1000-1000-1000-1000-1000-1000-1000-1000-1000-1000-1000"
+    emb_dim="256"
+    emb_lookup="100"
+    bot_mlp="512-512-256"
+    top_mlp="1024-1024-1024-1"
+    loss_func="mse"
+    round_targets="False"
+    lr="0.01"
+    #mb_size="2048"
+    emb_lookup_fixed="0"
+    ;;
+"small")
+    emb_size="1000-1000-1000-1000-1000-1000-1000-1000"
+    emb_dim="64"
+    emb_lookup="100"
+    bot_mlp="512-512-64"
+    top_mlp="1024-1024-1024-1"
+    loss_func="mse"
+    round_targets="False"
+    lr="0.01"
+    #mb_size="2048"
+    emb_lookup_fixed="0"
+    ;;
+*)
+    echo "ERROR: unsupported or empty DLRM_MODEL (${DLRM_MODEL})"
+    exit 1
+    ;;
+esac
+
+export UCX_NET_DEVICES="mlx5_0:1"
+
+if [ "${CPU_GPU_MODE}" = "gpu" ]; then
+    DLRM_S_PYTORCH_EXTRA_ARGS="--use-gpu"
+fi
+
+# shellcheck disable=SC2086
+python /opt/nvidia/torch-ucc/workloads/dlrm/dlrm_s_pytorch.py \
+    --mini-batch-size=2048 \
+    --test-mini-batch-size=16384 \
+    --test-num-workers=0 \
+    --num-batches=10 \
+    --data-generation=random \
+    --arch-mlp-bot=$bot_mlp \
+    --arch-mlp-top=$top_mlp \
+    --arch-sparse-feature-size=$emb_dim \
+    --arch-embedding-size=$emb_size \
+    --num-indices-per-lookup=$emb_lookup \
+    --num-indices-per-lookup-fixed=$emb_lookup_fixed \
+    --arch-interaction-op=dot \
+    --numpy-rand-seed=727 \
+    --print-freq=1 \
+    --loss-function=$loss_func \
+    --round-targets=$round_targets \
+    --learning-rate=$lr \
+    --print-time \
+    --dist-backend=ucc \
+    ${DLRM_S_PYTORCH_EXTRA_ARGS}
+
+deactivate
diff --git a/.ci/scripts/run_param_benchmarks.sh b/.ci/scripts/run_param_benchmarks.sh
new file mode 100755
index 0000000..5c35e31
--- /dev/null
+++ b/.ci/scripts/run_param_benchmarks.sh
@@ -0,0 +1,46 @@
+#!/bin/bash -eEx
+set -o pipefail
+
+# TODO debug
+exit 0
+
+source /workspace/set-env-dist.sh
+index=$LOCAL_RANK
+export OMPI_COMM_WORLD_SIZE=$WORLD_SIZE
+export OMPI_COMM_WORLD_LOCAL_SIZE=$LOCAL_SIZE
+export OMPI_COMM_WORLD_RANK=$RANK
+export OMPI_COMM_WORLD_LOCAL_RANK=$LOCAL_RANK
+
+if (( $index == 0 )); then
+    export UCX_NET_DEVICES=mlx5_0:1
+    NUMA="numactl --physcpubind=48-63 --membind=3 "
+elif (( $index == 1 )); then
+    export UCX_NET_DEVICES=mlx5_1:1
+    NUMA="numactl --physcpubind=48-63 --membind=3 "
+elif (( $index == 2 )); then
+    export UCX_NET_DEVICES=mlx5_2:1
+    NUMA="numactl --physcpubind=16-31 --membind=1 "
+elif (( $index == 3 )); then
+    export UCX_NET_DEVICES=mlx5_3:1
+    NUMA="numactl --physcpubind=16-31 --membind=1 "
+elif (( $index == 4 )); then
+    export UCX_NET_DEVICES=mlx5_6:1
+    NUMA="numactl --physcpubind=112-127 --membind=7 "
+elif (( $index == 5 )); then
+    export UCX_NET_DEVICES=mlx5_7:1
+    NUMA="numactl --physcpubind=112-127 --membind=7 "
+elif (( $index == 6 )); then
+    export UCX_NET_DEVICES=mlx5_8:1
+    NUMA="numactl --physcpubind=80-95 --membind=5 "
+elif (( $index == 7 )); then
+    export UCX_NET_DEVICES=mlx5_9:1
+    NUMA="numactl --physcpubind=80-95 --membind=5 "
+fi
+
+export XCCL_TEAM_UCX_NET_DEVICES=$UCX_NET_DEVICES
+export XCCL_TEAM_HIER_NET_DEVICES=$UCX_NET_DEVICES
+
+EXE="$NUMA python /workspace/param/train/comms/pt/comms.py \
+    --master-ip $MASTER_ADDR \
+    --master-port $MASTER_PORT $@"
+$EXE
diff --git a/.ci/scripts/run_tests_torch_ucc.sh b/.ci/scripts/run_tests_torch_ucc.sh
new file mode 100755
index 0000000..7ffcca1
--- /dev/null
+++ b/.ci/scripts/run_tests_torch_ucc.sh
@@ -0,0 +1,26 @@
+#!/bin/bash -eEx
+set -o pipefail
+
+command -v mpirun
+export TORCH_UCC_XCCL_TLS=ucx
+export UCX_WARN_UNUSED_ENV_VARS=n
+ucx_info -e -u t
+TORCH_UCC_SRC_DIR_WITH_XCCL_SUPPORT="${TORCH_UCC_SRC_DIR}_xccl"
+
+echo "UCC barrier"
+/bin/bash ${TORCH_UCC_SRC_DIR_WITH_XCCL_SUPPORT}/test/start_test.sh ${TORCH_UCC_SRC_DIR_WITH_XCCL_SUPPORT}/test/torch_barrier_test.py --backend=gloo
+
+echo "UCC alltoall"
+/bin/bash ${TORCH_UCC_SRC_DIR_WITH_XCCL_SUPPORT}/test/start_test.sh ${TORCH_UCC_SRC_DIR_WITH_XCCL_SUPPORT}/test/torch_alltoall_test.py --backend=gloo
+
+echo "UCC alltoallv"
+/bin/bash ${TORCH_UCC_SRC_DIR_WITH_XCCL_SUPPORT}/test/start_test.sh ${TORCH_UCC_SRC_DIR_WITH_XCCL_SUPPORT}/test/torch_alltoallv_test.py --backend=gloo
+
+echo "UCC allgather"
+/bin/bash ${TORCH_UCC_SRC_DIR_WITH_XCCL_SUPPORT}/test/start_test.sh ${TORCH_UCC_SRC_DIR_WITH_XCCL_SUPPORT}/test/torch_allgather_test.py --backend=gloo
+
+echo "UCC allreduce"
+/bin/bash ${TORCH_UCC_SRC_DIR_WITH_XCCL_SUPPORT}/test/start_test.sh ${TORCH_UCC_SRC_DIR_WITH_XCCL_SUPPORT}/test/torch_allreduce_test.py --backend=gloo
+
+echo "UCC broadcast"
+/bin/bash ${TORCH_UCC_SRC_DIR_WITH_XCCL_SUPPORT}/test/start_test.sh ${TORCH_UCC_SRC_DIR_WITH_XCCL_SUPPORT}/test/torch_bcast_test.py --backend=gloo
diff --git a/.ci/scripts/run_tests_torch_xccl.sh b/.ci/scripts/run_tests_torch_xccl.sh
new file mode 100755
index 0000000..b817470
--- /dev/null
+++ b/.ci/scripts/run_tests_torch_xccl.sh
@@ -0,0 +1,24 @@
+#!/bin/bash -eEx
+set -o pipefail
+
+command -v mpirun
+export UCX_WARN_UNUSED_ENV_VARS=n
+ucx_info -e -u t
+
+echo "XCCL allreduce"
+/bin/bash ${TORCH_UCC_SRC_DIR}/test/start_test.sh ${TORCH_UCC_SRC_DIR}/test/torch_allreduce_test.py --backend=gloo
+
+echo "XCCL alltoall"
+/bin/bash ${TORCH_UCC_SRC_DIR}/test/start_test.sh ${TORCH_UCC_SRC_DIR}/test/torch_alltoall_test.py --backend=gloo
+
+echo "XCCL alltoallv"
+/bin/bash ${TORCH_UCC_SRC_DIR}/test/start_test.sh ${TORCH_UCC_SRC_DIR}/test/torch_alltoallv_test.py --backend=gloo
+
+echo "XCCL barrier"
+/bin/bash ${TORCH_UCC_SRC_DIR}/test/start_test.sh ${TORCH_UCC_SRC_DIR}/test/torch_barrier_test.py --backend=gloo
+
+echo "XCCL allgather"
+/bin/bash ${TORCH_UCC_SRC_DIR}/test/start_test.sh ${TORCH_UCC_SRC_DIR}/test/torch_allgather_test.py --backend=gloo
+
+echo "XCCL broadcast"
+/bin/bash ${TORCH_UCC_SRC_DIR}/test/start_test.sh ${TORCH_UCC_SRC_DIR}/test/torch_bcast_test.py --backend=gloo
diff --git a/.ci/scripts/run_tests_ucc.sh b/.ci/scripts/run_tests_ucc.sh
new file mode 100755
index 0000000..bed2dd5
--- /dev/null
+++ b/.ci/scripts/run_tests_ucc.sh
@@ -0,0 +1,8 @@
+#!/bin/bash -eEx
+set -o pipefail
+
+UCC_SRC_DIR="${TORCH_UCC_SRC_DIR}/ucc"
+cd "${UCC_SRC_DIR}/build"
+
+export UCX_WARN_UNUSED_ENV_VARS=n
+make gtest
diff --git a/.ci/scripts/run_tests_xccl.sh b/.ci/scripts/run_tests_xccl.sh
new file mode 100755
index 0000000..c3929ca
--- /dev/null
+++ b/.ci/scripts/run_tests_xccl.sh
@@ -0,0 +1,25 @@
+#!/bin/bash -eEx
+set -o pipefail
+
+command -v mpirun
+export UCX_SOCKADDR_CM_ENABLE=n
+export UCX_WARN_UNUSED_ENV_VARS=n
+MPI_ARGS_COMMON="--allow-run-as-root --oversubscribe -np 8 -H localhost:8 --bind-to none --mca coll ^hcoll --mca btl ^openib --mca mtl ^ofi"
+mpirun ${MPI_ARGS_COMMON} -x XCCL_TEST_TLS=hier ${TORCH_UCC_SRC_DIR}/xccl/build-${XCCL_BUILD_TYPE}/test/test_mpi_allreduce
+mpirun ${MPI_ARGS_COMMON} -x XCCL_TEST_TLS=hier ${TORCH_UCC_SRC_DIR}/xccl/build-${XCCL_BUILD_TYPE}/test/test_mpi_bcast
+mpirun ${MPI_ARGS_COMMON} -x XCCL_TEST_TLS=hier ${TORCH_UCC_SRC_DIR}/xccl/build-${XCCL_BUILD_TYPE}/test/test_mpi_barrier
+
+mpirun ${MPI_ARGS_COMMON} -x XCCL_TEAM_HIER_NODE_LEADER_RANK_ID=3 -x XCCL_TEST_TLS=hier ${TORCH_UCC_SRC_DIR}/xccl/build-${XCCL_BUILD_TYPE}/test/test_mpi_allreduce
+mpirun ${MPI_ARGS_COMMON} -x XCCL_TEAM_HIER_NODE_LEADER_RANK_ID=4 -x XCCL_TEST_TLS=hier ${TORCH_UCC_SRC_DIR}/xccl/build-${XCCL_BUILD_TYPE}/test/test_mpi_bcast
+mpirun ${MPI_ARGS_COMMON} -x XCCL_TEAM_HIER_NODE_LEADER_RANK_ID=5 -x XCCL_TEST_TLS=hier ${TORCH_UCC_SRC_DIR}/xccl/build-${XCCL_BUILD_TYPE}/test/test_mpi_barrier
+
+mpirun ${MPI_ARGS_COMMON} -x XCCL_TEAM_UCX_ALLREDUCE_ALG_ID=0 -x XCCL_TEST_TLS=ucx ${TORCH_UCC_SRC_DIR}/xccl/build-${XCCL_BUILD_TYPE}/test/test_mpi_allreduce
+mpirun ${MPI_ARGS_COMMON} -x XCCL_TEAM_UCX_ALLREDUCE_ALG_ID=1 -x XCCL_TEST_TLS=ucx ${TORCH_UCC_SRC_DIR}/xccl/build-${XCCL_BUILD_TYPE}/test/test_mpi_allreduce
+mpirun ${MPI_ARGS_COMMON} -x XCCL_TEST_TLS=ucx ${TORCH_UCC_SRC_DIR}/xccl/build-${XCCL_BUILD_TYPE}/test/test_mpi_bcast
+mpirun ${MPI_ARGS_COMMON} -x XCCL_TEST_TLS=ucx ${TORCH_UCC_SRC_DIR}/xccl/build-${XCCL_BUILD_TYPE}/test/test_mpi_barrier
+mpirun ${MPI_ARGS_COMMON} -x XCCL_TEST_TLS=ucx ${TORCH_UCC_SRC_DIR}/xccl/build-${XCCL_BUILD_TYPE}/test/test_mpi_alltoall
+mpirun ${MPI_ARGS_COMMON} -x XCCL_TEST_TLS=ucx ${TORCH_UCC_SRC_DIR}/xccl/build-${XCCL_BUILD_TYPE}/test/test_mpi_alltoallv
+mpirun ${MPI_ARGS_COMMON} -x XCCL_TEST_TLS=ucx ${TORCH_UCC_SRC_DIR}/xccl/build-${XCCL_BUILD_TYPE}/test/test_mpi_allgather
+mpirun -x XCCL_TEAM_UCX_ALLTOALL_PAIRWISE_CHUNK=0 ${MPI_ARGS_COMMON} -x XCCL_TEST_TLS=ucx ${TORCH_UCC_SRC_DIR}/xccl/build-${XCCL_BUILD_TYPE}/test/test_mpi_alltoall
+mpirun -x XCCL_TEAM_UCX_ALLTOALL_PAIRWISE_CHUNK=0 ${MPI_ARGS_COMMON} -x XCCL_TEST_TLS=ucx ${TORCH_UCC_SRC_DIR}/xccl/build-${XCCL_BUILD_TYPE}/test/test_mpi_alltoallv
+mpirun ${MPI_ARGS_COMMON} -x XCCL_TEST_TLS=hier -x XCCL_TEST_ITERS=500 -x XCCL_TEST_NTHREADS=4 -x XCCL_TEST_CHECK=1 ${TORCH_UCC_SRC_DIR}/xccl/build-${XCCL_BUILD_TYPE}/test/test_mpi_mt
diff --git a/.gitignore b/.gitignore
index 6c7df1c..e7050cd 100644
--- a/.gitignore
+++ b/.gitignore
@@ -60,3 +60,6 @@ dist/
 # vscode
 *.code-workspace
 .vscode
+
+# IDEs
+.idea/
diff --git a/.gitmodules b/.gitmodules
new file mode 100644
index 0000000..562a232
--- /dev/null
+++ b/.gitmodules
@@ -0,0 +1,9 @@
+[submodule "xccl"]
+path = xccl
+url = https://github.com/openucx/xccl.git
+[submodule "ucc"]
+path = ucc
+url = https://github.com/openucx/ucc.git
+[submodule "ucx"]
+path = ucx
+url = https://github.com/openucx/ucx.git
diff --git a/ucc b/ucc
new file mode 160000
index 0000000..7f75fad
--- /dev/null
+++ b/ucc
@@ -0,0 +1 @@
+Subproject commit 7f75fad3f7a72e1053cbb4246a1cc7e62c75d4b3
diff --git a/ucx b/ucx
new file mode 160000
index 0000000..737b5c4
--- /dev/null
+++ b/ucx
@@ -0,0 +1 @@
+Subproject commit 737b5c4edface2e33c2321ac88e83320cad598eb
diff --git a/xccl b/xccl
new file mode 160000
index 0000000..b046913
--- /dev/null
+++ b/xccl
@@ -0,0 +1 @@
+Subproject commit b04691392586477dd83bdf6de75f440540cb688c