Skip to content
This repository has been archived by the owner on May 17, 2022. It is now read-only.

Implemented Torch-UCC CI (WIP) #33

Open
wants to merge 9 commits into
base: ucc
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
94 changes: 94 additions & 0 deletions .ci/Dockerfile.centos8
Original file line number Diff line number Diff line change
@@ -0,0 +1,94 @@
# CUDA version of the base image; override with --build-arg CUDA_VER=<version>.
ARG CUDA_VER="11.2.1"
FROM nvidia/cuda:${CUDA_VER}-devel-centos8
#==============================================================================
# Root directory for everything this image installs (build-time overridable).
ARG TORCH_UCC_ROOT_DIR=/opt/nvidia/torch-ucc
# Directory layout and build settings that do not depend on other ENV values.
ENV TORCH_UCC_SRC_DIR=${TORCH_UCC_ROOT_DIR}/src \
    TORCH_UCC_PKG_DIR=${TORCH_UCC_ROOT_DIR}/pkg \
    TORCH_UCC_BIN_DIR=${TORCH_UCC_ROOT_DIR}/bin \
    TORCH_UCC_WORKLOADS_DIR=${TORCH_UCC_ROOT_DIR}/workloads \
    CUDA_HOME=/usr/local/cuda \
    UCX_BRANCH=v1.10.x \
    UCX_BUILD_TYPE=release-mt \
    XCCL_BUILD_TYPE=debug
# Derived paths live in a second ENV instruction: a single ENV cannot see the
# values it sets itself, only those defined by earlier instructions.
ENV UCX_INSTALL_DIR=${TORCH_UCC_BIN_DIR}/ucx/build-${UCX_BUILD_TYPE} \
    XCCL_INSTALL_DIR=${TORCH_UCC_BIN_DIR}/xccl/build-${XCCL_BUILD_TYPE} \
    UCC_INSTALL_DIR=${TORCH_UCC_BIN_DIR}/ucc/build \
    TORCH_UCC_PYTHON_VENV_DIR=${TORCH_UCC_BIN_DIR}/python/venv
#==============================================================================
# Create the whole directory skeleton with one mkdir call, then copy the build
# context into the source directory (later steps run scripts from its .ci/).
RUN mkdir -p \
        ${TORCH_UCC_SRC_DIR} \
        ${TORCH_UCC_PKG_DIR} \
        ${TORCH_UCC_BIN_DIR} \
        ${TORCH_UCC_WORKLOADS_DIR} \
        ${TORCH_UCC_PYTHON_VENV_DIR}

COPY . ${TORCH_UCC_SRC_DIR}
#==============================================================================
# Install the OS toolchain and dependencies in a single layer and drop the yum
# caches in that same layer, so package metadata and downloaded RPMs do not
# persist in the image.
# The powertools repo is enabled before the package install step.
RUN yum groupinstall -y \
        'Development Tools' \
        'Infiniband Support' && \
    yum config-manager --set-enabled powertools && \
    yum install -y \
        cmake \
        numactl \
        numactl-devel \
        openmpi \
        openmpi-devel \
        openssh-server \
        protobuf-compiler \
        protobuf-devel \
        python36-devel \
        rdma-core-devel \
        vim && \
    yum clean all
# Remove old UCX
# --nodeps skips the dependency check, so only the ucx package itself is
# erased (presumably it was pulled in by the package groups above -- confirm).
RUN rpm -e --nodeps ucx
# Expose the OpenMPI binaries to the remaining build steps and to containers.
ENV PATH=/usr/lib64/openmpi/bin:$PATH
# Persist the OpenMPI paths for shells that source /etc/bashrc. Both lines are
# written with echo; single quotes keep $PATH / ${LD_LIBRARY_PATH} literal so
# they are expanded when the file is sourced, not at image build time.
RUN echo 'export PATH="/usr/lib64/openmpi/bin:$PATH"' >> /etc/bashrc && \
    echo 'export LD_LIBRARY_PATH="/usr/lib64/openmpi/lib:${LD_LIBRARY_PATH}"' >> /etc/bashrc
#==============================================================================
# Configure SSH
# Prepare the sshd runtime directory, generate host keys, remove /run/nologin,
# and rewrite the client config so StrictHostKeyChecking is set to "no"
# (every existing line mentioning the option is dropped first).
RUN mkdir -p /var/run/sshd && \
    ssh-keygen -A && \
    rm -f /run/nologin && \
    grep -v StrictHostKeyChecking /etc/ssh/ssh_config > /etc/ssh/ssh_config.new && \
    echo " StrictHostKeyChecking no" >> /etc/ssh/ssh_config.new && \
    mv /etc/ssh/ssh_config.new /etc/ssh/ssh_config
#==============================================================================
# Build UCX
# Runs build_ucx.sh from the copied source tree. The script is not part of
# this file; presumably it consumes UCX_BRANCH, UCX_BUILD_TYPE and
# UCX_INSTALL_DIR defined above -- TODO confirm.
RUN ${TORCH_UCC_SRC_DIR}/.ci/scripts/build_ucx.sh
# Put the UCX install's bin directory first on PATH for all later steps.
ENV PATH=${UCX_INSTALL_DIR}/bin:${PATH}
#==============================================================================
# Configure Python
# Runs configure_python.sh; presumably it populates TORCH_UCC_PYTHON_VENV_DIR
# with the virtual environments used by the CI steps -- TODO confirm.
RUN ${TORCH_UCC_SRC_DIR}/.ci/scripts/configure_python.sh
#==============================================================================
# Build XCCL
# Runs build_xccl.sh (after the UCX build and Python configuration above).
RUN ${TORCH_UCC_SRC_DIR}/.ci/scripts/build_xccl.sh
#==============================================================================
# Build UCC
# Runs build_ucc.sh (after the UCX build above).
RUN ${TORCH_UCC_SRC_DIR}/.ci/scripts/build_ucc.sh
#==============================================================================
# Install PyTorch
# Runs install_torch.sh; where PyTorch is installed is decided by the script.
RUN ${TORCH_UCC_SRC_DIR}/.ci/scripts/install_torch.sh
#==============================================================================
# Install workloads
# TODO upstream the patches (if needed)
# Both repositories are cloned into TORCH_UCC_WORKLOADS_DIR (the WORKDIR).
# DLRM gets the two local patches from .ci/patches/dlrm applied before its
# requirements are installed. pip runs with --no-cache-dir so the download
# cache is not stored in the image layer.
WORKDIR ${TORCH_UCC_WORKLOADS_DIR}
RUN git clone https://github.com/facebookresearch/dlrm.git && \
    cd ${TORCH_UCC_WORKLOADS_DIR}/dlrm && \
    git apply ${TORCH_UCC_SRC_DIR}/.ci/patches/dlrm/0001-Added-torch_ucc-support.patch && \
    git apply ${TORCH_UCC_SRC_DIR}/.ci/patches/dlrm/0002-Fixed-arg-list.patch && \
    pip3 install --no-cache-dir -r ${TORCH_UCC_WORKLOADS_DIR}/dlrm/requirements.txt && \
    pip3 install --no-cache-dir tensorboard
RUN git clone https://github.com/facebookresearch/param.git && \
    pip3 install --no-cache-dir -r ${TORCH_UCC_WORKLOADS_DIR}/param/requirements.txt
#==============================================================================
# Install torch_ucc (XCCL version) python module and build a wheel package
# Delegates to install_torch_xccl.sh from the copied source tree; the install
# target and the wheel location are decided by that script (not shown here).
RUN ${TORCH_UCC_SRC_DIR}/.ci/scripts/install_torch_xccl.sh
#==============================================================================
# Install torch_ucc (UCC version) python module and build a wheel package
# Delegates to install_torch_ucc.sh; runs after the XCCL variant above.
RUN ${TORCH_UCC_SRC_DIR}/.ci/scripts/install_torch_ucc.sh
#==============================================================================
# CI account with fixed numeric IDs (presumably matching the IDs used on the
# CI hosts so mounted volumes keep consistent ownership -- confirm).
RUN groupadd --gid 11429 swx-jenkins && \
    adduser -u 6213 -g 11429 -d /home/swx-jenkins swx-jenkins

# Additional account; -M means its home directory is not created in the image.
RUN groupadd --gid 30 dip && \
    adduser -M -u 50009 -g 30 -d /labhome/artemry artemry
#==============================================================================
75 changes: 75 additions & 0 deletions .ci/Dockerfile.ubuntu20.04
Original file line number Diff line number Diff line change
@@ -0,0 +1,75 @@
# CUDA version of the base image; override with --build-arg CUDA_VER=<version>.
#ARG CUDA_VER='11.2.1'
ARG CUDA_VER="11.1.1"
FROM nvidia/cuda:${CUDA_VER}-devel-ubuntu20.04
#==============================================================================
# Root directory for everything this image installs (build-time overridable).
ARG TORCH_UCC_ROOT_DIR=/opt/nvidia/torch-ucc
# Directory layout and build settings that do not depend on other ENV values.
ENV TORCH_UCC_SRC_DIR=${TORCH_UCC_ROOT_DIR}/src \
    TORCH_UCC_PKG_DIR=${TORCH_UCC_ROOT_DIR}/pkg \
    TORCH_UCC_BIN_DIR=${TORCH_UCC_ROOT_DIR}/bin \
    TORCH_UCC_WORKLOADS_DIR=${TORCH_UCC_ROOT_DIR}/workloads \
    CUDA_HOME=/usr/local/cuda \
    UCX_BRANCH=v1.10.x \
    UCX_BUILD_TYPE=release-mt \
    XCCL_BUILD_TYPE=debug
# Derived paths live in a second ENV instruction: a single ENV cannot see the
# values it sets itself, only those defined by earlier instructions.
ENV UCX_INSTALL_DIR=${TORCH_UCC_BIN_DIR}/ucx/build-${UCX_BUILD_TYPE} \
    XCCL_INSTALL_DIR=${TORCH_UCC_BIN_DIR}/xccl/build-${XCCL_BUILD_TYPE}
#==============================================================================
# Create the directory skeleton with one mkdir call, then copy the build
# context into the source directory (later steps run scripts from its .ci/).
RUN mkdir -p \
        ${TORCH_UCC_SRC_DIR} \
        ${TORCH_UCC_PKG_DIR} \
        ${TORCH_UCC_BIN_DIR} \
        ${TORCH_UCC_WORKLOADS_DIR}

COPY . ${TORCH_UCC_SRC_DIR}
#==============================================================================
# Build-time only: keeps package installation from prompting. Declared as ARG
# so it does not leak into the runtime environment of the image.
ARG DEBIAN_FRONTEND=noninteractive
# apt-get is used instead of apt because apt's command-line interface is not
# stable for scripted use. The package index is refreshed, used and removed
# within the same layer.
RUN apt-get update && \
    apt-get install -y \
        apt-utils \
        autoconf \
        build-essential \
        cmake \
        curl \
        git \
        ibverbs-providers \
        ibverbs-utils \
        libnuma-dev \
        libtool-bin \
        ninja-build \
        openmpi-bin \
        openssh-server \
        vim \
    && \
    rm -rf /var/lib/apt/lists/*
#==============================================================================
# Configure SSH
# Prepare the sshd runtime directory and rewrite the client config so that
# StrictHostKeyChecking is set to "no" (every existing line mentioning the
# option is dropped first).
RUN mkdir -p /var/run/sshd && \
    grep -v StrictHostKeyChecking /etc/ssh/ssh_config > /etc/ssh/ssh_config.new && \
    echo " StrictHostKeyChecking no" >> /etc/ssh/ssh_config.new && \
    mv /etc/ssh/ssh_config.new /etc/ssh/ssh_config
#==============================================================================
# Build UCX
# Runs build_ucx.sh from the copied source tree. The script is not part of
# this file; presumably it consumes UCX_BRANCH, UCX_BUILD_TYPE and
# UCX_INSTALL_DIR defined above -- TODO confirm.
RUN ${TORCH_UCC_SRC_DIR}/.ci/scripts/build_ucx.sh
# Put the UCX install's bin directory first on PATH for all later steps.
ENV PATH=${UCX_INSTALL_DIR}/bin:${PATH}
#==============================================================================
# Configure Python
# Runs configure_python.sh, then puts /opt/conda/bin first on PATH
# (presumably the script installs conda there -- TODO confirm).
RUN ${TORCH_UCC_SRC_DIR}/.ci/scripts/configure_python.sh
ENV PATH=/opt/conda/bin:${PATH}
#==============================================================================
# Build XCCL
# Runs build_xccl.sh (after the UCX build and Python configuration above).
RUN ${TORCH_UCC_SRC_DIR}/.ci/scripts/build_xccl.sh
#==============================================================================
# Install PyTorch
# Runs install_torch.sh; where PyTorch is installed is decided by the script.
RUN ${TORCH_UCC_SRC_DIR}/.ci/scripts/install_torch.sh
#==============================================================================
# Install torch_ucc python module and build a wheel package
# Delegates to install_torch_ucc.sh (not shown here).
RUN ${TORCH_UCC_SRC_DIR}/.ci/scripts/install_torch_ucc.sh
#==============================================================================
# Install workloads
# Both repositories are cloned into TORCH_UCC_WORKLOADS_DIR (the WORKDIR).
# DLRM gets both local patches from .ci/patches/dlrm applied (torch_ucc import
# support and the profiler argument-list fix) before its requirements are
# installed. pip runs with --no-cache-dir so the download cache is not stored
# in the image layer.
WORKDIR ${TORCH_UCC_WORKLOADS_DIR}
RUN git clone https://github.com/facebookresearch/dlrm.git && \
    cd ${TORCH_UCC_WORKLOADS_DIR}/dlrm && \
    git apply ${TORCH_UCC_SRC_DIR}/.ci/patches/dlrm/0001-Added-torch_ucc-support.patch && \
    git apply ${TORCH_UCC_SRC_DIR}/.ci/patches/dlrm/0002-Fixed-arg-list.patch && \
    pip install --no-cache-dir -r ${TORCH_UCC_WORKLOADS_DIR}/dlrm/requirements.txt && \
    pip install --no-cache-dir tensorboard
RUN git clone https://github.com/facebookresearch/param.git && \
    pip install --no-cache-dir -r ${TORCH_UCC_WORKLOADS_DIR}/param/requirements.txt
9 changes: 9 additions & 0 deletions .ci/Jenkinsfile.shlib
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
#!/usr/bin/groovy

// Pull the shared pipeline library straight from GitHub (this needs the
// pipeline-github-lib plugin). The trailing `_` gives the annotation a
// target so it does not have to sit on a variable declaration.
@Library('github.com/Mellanox/ci-demo@stable') _

// Instantiate the matrix driver from the library and hand control to it.
def ciMatrix = new com.mellanox.cicd.Matrix()
ciMatrix.main()

2 changes: 2 additions & 0 deletions .ci/configs/swx-clx01/hostfile.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
swx-clx01
swx-clx02
2 changes: 2 additions & 0 deletions .ci/configs/swx-clx02/hostfile.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
swx-clx02
swx-clx01
148 changes: 148 additions & 0 deletions .ci/job_matrix.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,148 @@
---
# Name of the CI job.
job: "torch-ucc"

# Container registry the image is addressed under (see the image name built
# from these values in `env`). registry_auth is presumably a Jenkins
# credentials ID rather than a secret value -- confirm.
registry_host: "harbor.mellanox.com"
registry_path: "/torch-ucc"
registry_auth: "05d98651-e11c-4a57-9cc6-52df79014b89"

#kubernetes:
#  cloud: 'swx-k8s'

# Host directories mounted at the same path inside the containers.
volumes:
  - mountPath: "/hpc/local"
    hostPath: "/hpc/local"
  - mountPath: "/auto/sw_tools"
    hostPath: "/auto/sw_tools"
  - mountPath: "/.autodirect/mtrswgwork"
    hostPath: "/.autodirect/mtrswgwork"
  - mountPath: "/.autodirect/sw/release"
    hostPath: "/.autodirect/sw/release"

# Variables shared by the image build and the steps. Directory values mirror
# the layout defined in .ci/Dockerfile.centos8. TORCH_UCC_VERSION is not
# defined in this file (presumably supplied by the job environment -- confirm).
env:
  CUDA_VER: "11.2.1"
  TORCH_UCC_URI_SUFFIX: "${TORCH_UCC_VERSION}/x86_64/centos8/cuda${CUDA_VER}"
  TORCH_UCC_DOCKER_IMAGE_NAME: "${registry_host}${registry_path}/${TORCH_UCC_URI_SUFFIX}"
  TORCH_UCC_ROOT_DIR: "/opt/nvidia/torch-ucc"
  TORCH_UCC_SRC_DIR: "${TORCH_UCC_ROOT_DIR}/src"
  TORCH_UCC_BIN_DIR: "${TORCH_UCC_ROOT_DIR}/bin"
  TORCH_UCC_PYTHON_VENV_DIR: "${TORCH_UCC_BIN_DIR}/python/venv"
  XCCL_BUILD_TYPE: "debug"

# Options for running the containers. Written as a folded scalar: the lines
# below are joined with single spaces into one string, with no trailing newline.
docker_opt: >-
  --pull always
  --network=host
  --uts=host
  --ipc=host
  --ulimit stack=67108864
  --ulimit memlock=-1
  --security-opt seccomp=unconfined
  --cap-add=SYS_ADMIN
  --device=/dev/infiniband/
  --gpus all
  --user root

# Container images used by the job. The single entry is built from
# .ci/Dockerfile.centos8; build_args forwards CUDA_VER and TORCH_UCC_ROOT_DIR
# to the Dockerfile's ARGs and disables the build cache.
runs_on_dockers:
  - file: ".ci/Dockerfile.centos8"
    name: "centos8"
    tag: "${BUILD_NUMBER}"
    arch: "x86_64"
    uri: "${TORCH_UCC_URI_SUFFIX}"
    build_args: "--rm --no-cache --build-arg CUDA_VER=${CUDA_VER} --build-arg TORCH_UCC_ROOT_DIR=${TORCH_UCC_ROOT_DIR}"
    cloud: "swx-k8s"
    nodeLabel: "swx-clx01 || swx-clx02"

# bare metal
# Agents (selected by node label) that run the steps using agentSelector.
runs_on_agents:
  - { nodeLabel: "swx-clx01 || swx-clx02" }

# TODO debug
# Kept as a quoted string, exactly as in the original.
timeout_minutes: "400"

steps:
#============================================================================
# Diagnostic step: prints host, environment, cgroup, OS release and user info.
# Selected for both the listed agents and the centos8 container.
- name: "Check Env"
  containerSelector: "{name:'centos8'}"
  agentSelector: "{nodeLabel: 'swx-clx01 || swx-clx02'}"
  run: |
    echo "INFO: check environment"
    hostname
    printenv
    cat /proc/1/cgroup
    cat /etc/*release*
    id
    #find /opt/nvidia
    #ibv_devinfo
    #nvidia-smi
    #nvidia-smi topo -m
#============================================================================
# Runs the XCCL test script inside the centos8 container, using the "xccl"
# Python virtual environment under TORCH_UCC_PYTHON_VENV_DIR.
- name: Run XCCL tests
  #agentSelector: "{nodeLabel: 'swx-clx01 || swx-clx02'}"
  containerSelector: "{name:'centos8'}"
  run: |
    echo "INFO: Run XCCL tests"
    . "${TORCH_UCC_PYTHON_VENV_DIR}/xccl/bin/activate"
    hostname
    cat /proc/1/cgroup
    pip3 list | grep torch
    ${TORCH_UCC_SRC_DIR}/.ci/scripts/run_tests_xccl.sh
    deactivate
#============================================================================
# - name: Run UCC tests
# #agentSelector: "{nodeLabel: 'swx-clx01 || swx-clx02'}"
# containerSelector: "{name:'centos8'}"
# run: |
# echo "INFO: Run UCC tests"
# . "${TORCH_UCC_PYTHON_VENV_DIR}/ucc/bin/activate"
# hostname
# cat /proc/1/cgroup
# pip3 list | grep torch
# ${TORCH_UCC_SRC_DIR}/.ci/scripts/run_tests_ucc.sh
# deactivate
#============================================================================
# Runs the Torch-UCC (XCCL backend) test script inside the centos8 container,
# using the "xccl" Python virtual environment.
- name: "Run Torch-UCC tests (XCCL)"
  containerSelector: "{name:'centos8'}"
  #agentSelector: "{nodeLabel: 'swx-clx01 || swx-clx02'}"
  run: |
    echo "INFO: Run Torch-UCC tests (XCCL)"
    . "${TORCH_UCC_PYTHON_VENV_DIR}/xccl/bin/activate"
    hostname
    cat /proc/1/cgroup
    pip3 list | grep torch
    ${TORCH_UCC_SRC_DIR}/.ci/scripts/run_tests_torch_xccl.sh
    deactivate
#============================================================================
# Runs the Torch-UCC (UCC backend) test script inside the centos8 container,
# using the "ucc" Python virtual environment.
- name: "Run Torch-UCC tests (UCC)"
  containerSelector: "{name:'centos8'}"
  #agentSelector: "{nodeLabel: 'swx-clx01 || swx-clx02'}"
  run: |
    echo "INFO: Run Torch-UCC tests (UCC)"
    . "${TORCH_UCC_PYTHON_VENV_DIR}/ucc/bin/activate"
    hostname
    cat /proc/1/cgroup
    pip3 list | grep torch
    ${TORCH_UCC_SRC_DIR}/.ci/scripts/run_tests_torch_ucc.sh
    deactivate
#============================================================================
# Runs on an agent (no containerSelector): prints diagnostics, then calls
# run_dlrm_docker.sh from the job workspace with the "xccl" argument.
- name: "Run DLRM tests (XCCL/GPU)"
  agentSelector: "{nodeLabel: 'swx-clx01 || swx-clx02'}"
  run: |
    echo "INFO: Run DLRM tests (XCCL/GPU)"
    hostname
    printenv
    cat /proc/1/cgroup
    cat /etc/*release*
    id
    find /opt/nvidia
    ibv_devinfo
    nvidia-smi
    ${WORKSPACE}/.ci/scripts/run_dlrm_docker.sh xccl
#============================================================================
# Runs on an agent (no containerSelector): prints diagnostics, then calls
# run_dlrm_docker.sh from the job workspace with the "ucc" argument.
- name: "Run DLRM tests (UCC/GPU)"
  agentSelector: "{nodeLabel: 'swx-clx01 || swx-clx02'}"
  run: |
    echo "INFO: Run DLRM tests (UCC/GPU)"
    hostname
    printenv
    cat /proc/1/cgroup
    cat /etc/*release*
    id
    find /opt/nvidia
    ibv_devinfo
    nvidia-smi
    ${WORKSPACE}/.ci/scripts/run_dlrm_docker.sh ucc
#============================================================================
# - name: Run PARAM benchmarks
# agentSelector: "{nodeLabel: 'swx-clx01 || swx-clx02'}"
# run: |
# echo "INFO: Run PARAM benchmarks"
# hostname
# cat /proc/1/cgroup
# #${TORCH_UCC_SRC_DIR}/.ci/scripts/run_param_benchmarks.sh
#============================================================================
30 changes: 30 additions & 0 deletions .ci/patches/dlrm/0001-Added-torch_ucc-support.patch
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
From bcd8fc065ef04a0ea8f06e61a5e2581a308719fd Mon Sep 17 00:00:00 2001
From: artemry-nv <artemry@nvidia.com>
Date: Tue, 9 Mar 2021 00:41:16 +0300
Subject: [PATCH] Added torch_ucc support

Signed-off-by: artemry-nv <artemry@nvidia.com>
---
extend_distributed.py | 6 ++++++
1 file changed, 6 insertions(+)

diff --git a/extend_distributed.py b/extend_distributed.py
index adcb60b..1f2c8a5 100644
--- a/extend_distributed.py
+++ b/extend_distributed.py
@@ -20,6 +20,12 @@ except ImportError as e:
# print(e)
torch_ccl = False

+try:
+ import torch_ucc
+except ImportError as e:
+ torch_ucc = False
+
+
my_rank = -1
my_size = -1
my_local_rank = -1
--
2.24.3 (Apple Git-128)

25 changes: 25 additions & 0 deletions .ci/patches/dlrm/0002-Fixed-arg-list.patch
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
From 481fd6aef896aa8ff15a161b7e88b2ea01ae673a Mon Sep 17 00:00:00 2001
From: artemry-nv <artemry@nvidia.com>
Date: Mon, 29 Mar 2021 01:56:08 +0300
Subject: [PATCH] Fixed arg list

---
dlrm_s_pytorch.py | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/dlrm_s_pytorch.py b/dlrm_s_pytorch.py
index 71a0414..36dab9b 100644
--- a/dlrm_s_pytorch.py
+++ b/dlrm_s_pytorch.py
@@ -1477,7 +1477,7 @@ def run():

ext_dist.barrier()
with torch.autograd.profiler.profile(
- args.enable_profiling, use_gpu, record_shapes=True
+ args.enable_profiling, use_cuda=use_gpu, record_shapes=True
) as prof:
if not args.inference_only:
k = 0
--
2.24.3 (Apple Git-128)

Loading