Skip to content
This repository has been archived by the owner on Nov 17, 2023. It is now read-only.

Commit

Permalink
[CI][1.x] Cherrypick: Upgrade unix gpu toolchain (#18186) (#18785)
Browse files: browse the repository at this point in the history
* Update unix gpu toolchain (#18186)

* update nvidiadocker command & remove cuda compat

* replace cu101 with cuda since the CUDA compat library is no longer used

* skip flaky tests

* get rid of ubuntu_build_cuda and point ubuntu_cu101 to base gpu instead of cuda compat

* Revert "skip flaky tests"

This reverts commit 1c720fa.

* revert removal of ubuntu_build_cuda

* add linux gpu g4 node to all steps using g3 in unix-gpu pipeline

* remove docker compose files

* add back the caffe test, since caffe is deprecated for MXNet 2.0 but not for 1.x

* drop nvidia-docker requirement since Docker 19.03+ supports GPUs natively via `docker run --gpus`

:q

* remove compat from dockerfile

* Cherry-pick #18635 to v1.7.x (#18935)

* Remove mention of nightly in pypi (#18635)

* update bert dev.tsv link

Co-authored-by: Sheng Zha <szha@users.noreply.github.com>

* disable tvm in CI functions that rely on libcuda compat

* tvm off for ubuntu_gpu_cmake build

* drop tvm from all unix-gpu builds

Co-authored-by: Carin Meier <cmeier@gigasquidsoftware.com>
Co-authored-by: Sheng Zha <szha@users.noreply.github.com>
  • Loading branch information
3 people committed Aug 18, 2020
1 parent 6ae469a commit 9981e84
Show file tree
Hide file tree
Showing 6 changed files with 35 additions and 129 deletions.
1 change: 1 addition & 0 deletions ci/Jenkinsfile_utils.groovy
Original file line number Diff line number Diff line change
Expand Up @@ -255,6 +255,7 @@ def assign_node_labels(args) {
// knowing about the limitations.
NODE_LINUX_CPU = args.linux_cpu
NODE_LINUX_GPU = args.linux_gpu
NODE_LINUX_GPU_G4 = args.linux_gpu_g4
NODE_LINUX_GPU_P3 = args.linux_gpu_p3
NODE_WINDOWS_CPU = args.windows_cpu
NODE_WINDOWS_GPU = args.windows_gpu
Expand Down
25 changes: 10 additions & 15 deletions ci/build.py
Original file line number Diff line number Diff line change
Expand Up @@ -66,23 +66,18 @@ def get_dockerfile(platform: str, path=get_dockerfiles_path()) -> str:
return os.path.join(path, "Dockerfile.{0}".format(platform))


def get_docker_binary(use_nvidia_docker: bool) -> str:
return "nvidia-docker" if use_nvidia_docker else "docker"


def build_docker(platform: str, docker_binary: str, registry: str, num_retries: int, no_cache: bool,
def build_docker(platform: str, registry: str, num_retries: int, no_cache: bool,
cache_intermediate: bool) -> str:
"""
Build a container for the given platform
:param platform: Platform
:param docker_binary: docker binary to use (docker/nvidia-docker)
:param registry: Dockerhub registry name
:param num_retries: Number of retries to build the docker image
:param no_cache: pass no-cache to docker to rebuild the images
:return: Id of the top level image
"""
tag = get_docker_tag(platform=platform, registry=registry)
logging.info("Building docker container tagged '%s' with %s", tag, docker_binary)
logging.info("Building docker container tagged '%s'", tag)
#
# We add a user with the same group as the executing non-root user so files created in the
# container match permissions of the local user. Same for the group.
Expand All @@ -99,7 +94,7 @@ def build_docker(platform: str, docker_binary: str, registry: str, num_retries:
#
# This doesn't work with multi head docker files.
#
cmd = [docker_binary, "build",
cmd = ["docker", "build",
"-f", get_dockerfile(platform),
"--build-arg", "USER_ID={}".format(os.getuid()),
"--build-arg", "GROUP_ID={}".format(os.getgid())]
Expand All @@ -119,19 +114,19 @@ def run_cmd():
run_cmd()
# Get image id by reading the tag. It's guaranteed (except race condition) that the tag exists. Otherwise, the
# check_call would have failed
image_id = _get_local_image_id(docker_binary=docker_binary, docker_tag=tag)
image_id = _get_local_image_id(docker_tag=tag)
if not image_id:
raise FileNotFoundError('Unable to find docker image id matching with {}'.format(tag))
return image_id


def _get_local_image_id(docker_binary, docker_tag):
def _get_local_image_id(docker_tag):
"""
Get the image id of the local docker layer with the passed tag
:param docker_tag: docker tag
:return: Image id as string or None if tag does not exist
"""
cmd = [docker_binary, "images", "-q", docker_tag]
cmd = ["docker", "images", "-q", docker_tag]
image_id_b = check_output(cmd)
image_id = image_id_b.decode('utf-8').strip()
if not image_id:
Expand Down Expand Up @@ -196,8 +191,9 @@ def container_run(docker_client: SafeDockerClient,

# Equivalent command
docker_cmd_list = [
get_docker_binary(nvidia_runtime),
"docker",
'run',
"--gpus all" if nvidia_runtime else "",
"--cap-add",
"SYS_PTRACE", # Required by ASAN
'--rm',
Expand Down Expand Up @@ -352,7 +348,6 @@ def main() -> int:
args = parser.parse_args()

command = list(chain(*args.command))
docker_binary = get_docker_binary(args.nvidiadocker)
docker_client = SafeDockerClient()

environment = dict([(e.split('=')[:2] if '=' in e else (e, os.environ[e]))
Expand All @@ -366,7 +361,7 @@ def main() -> int:
if args.docker_registry:
load_docker_cache(tag=tag, docker_registry=args.docker_registry)
if not args.run_only:
build_docker(platform=platform, docker_binary=docker_binary, registry=args.docker_registry,
build_docker(platform=platform, registry=args.docker_registry,
num_retries=args.docker_build_retries, no_cache=args.no_cache,
cache_intermediate=args.cache_intermediate)
else:
Expand Down Expand Up @@ -410,7 +405,7 @@ def main() -> int:
for platform in platforms:
tag = get_docker_tag(platform=platform, registry=args.docker_registry)
load_docker_cache(tag=tag, docker_registry=args.docker_registry)
build_docker(platform, docker_binary=docker_binary, registry=args.docker_registry,
build_docker(platform, registry=args.docker_registry,
num_retries=args.docker_build_retries, no_cache=args.no_cache)
if args.build_only:
continue
Expand Down
1 change: 0 additions & 1 deletion ci/docker/Dockerfile.build.ubuntu_gpu_cu101
Original file line number Diff line number Diff line change
Expand Up @@ -79,4 +79,3 @@ RUN /work/ubuntu_adduser.sh
COPY runtime_functions.sh /work/

WORKDIR /work/mxnet
ENV LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:/usr/local/cuda/compat
57 changes: 7 additions & 50 deletions ci/docker/runtime_functions.sh
Original file line number Diff line number Diff line change
Expand Up @@ -767,7 +767,7 @@ build_ubuntu_gpu_mkldnn() {
USE_CUDA=1 \
USE_CUDA_PATH=/usr/local/cuda \
USE_CUDNN=1 \
USE_TVM_OP=1 \
USE_TVM_OP=0 \
CUDA_ARCH="$CI_CUDA_COMPUTE_CAPABILITIES" \
USE_SIGNAL_HANDLER=1 \
-j$(nproc)
Expand All @@ -784,7 +784,7 @@ build_ubuntu_gpu_mkldnn_nocudnn() {
USE_CUDA=1 \
USE_CUDA_PATH=/usr/local/cuda \
USE_CUDNN=0 \
USE_TVM_OP=1 \
USE_TVM_OP=0 \
CUDA_ARCH="$CI_CUDA_COMPUTE_CAPABILITIES" \
USE_SIGNAL_HANDLER=1 \
-j$(nproc)
Expand All @@ -799,7 +799,7 @@ build_ubuntu_gpu_cuda101_cudnn7() {
USE_CUDA=1 \
USE_CUDA_PATH=/usr/local/cuda \
USE_CUDNN=1 \
USE_TVM_OP=1 \
USE_TVM_OP=0 \
USE_CPP_PACKAGE=1 \
USE_DIST_KVSTORE=1 \
CUDA_ARCH="$CI_CUDA_COMPUTE_CAPABILITIES" \
Expand Down Expand Up @@ -827,26 +827,6 @@ build_ubuntu_gpu_cuda101_cudnn7_mkldnn_cpp_test() {
make cython PYTHON=python3
}

build_ubuntu_gpu_cuda101_cudnn7_no_tvm_op() {
set -ex
build_ccache_wrappers
make \
DEV=1 \
USE_BLAS=openblas \
USE_MKLDNN=0 \
USE_CUDA=1 \
USE_CUDA_PATH=/usr/local/cuda \
USE_CUDNN=1 \
USE_TVM_OP=0 \
USE_CPP_PACKAGE=1 \
USE_DIST_KVSTORE=1 \
CUDA_ARCH="$CI_CUDA_COMPUTE_CAPABILITIES" \
USE_SIGNAL_HANDLER=1 \
-j$(nproc)

make cython PYTHON=python3
}

build_ubuntu_amalgamation() {
set -ex
# Amalgamation can not be run with -j nproc
Expand Down Expand Up @@ -874,7 +854,7 @@ build_ubuntu_gpu_cmake_mkldnn() {
-DUSE_SIGNAL_HANDLER=ON \
-DUSE_CUDA=1 \
-DUSE_CUDNN=1 \
-DUSE_TVM_OP=1 \
-DUSE_TVM_OP=0 \
-DPython3_EXECUTABLE=/usr/bin/python3 \
-DUSE_MKLML_MKL=1 \
-DCMAKE_BUILD_TYPE=Release \
Expand All @@ -893,7 +873,7 @@ build_ubuntu_gpu_cmake() {
-DUSE_SIGNAL_HANDLER=ON \
-DUSE_CUDA=ON \
-DUSE_CUDNN=ON \
-DUSE_TVM_OP=ON \
-DUSE_TVM_OP=OFF \
-DPython3_EXECUTABLE=/usr/bin/python3 \
-DUSE_MKL_IF_AVAILABLE=OFF \
-DUSE_MKLML_MKL=OFF \
Expand All @@ -916,7 +896,7 @@ build_ubuntu_gpu_cmake_no_rtc() {
-DUSE_SIGNAL_HANDLER=ON \
-DUSE_CUDA=ON \
-DUSE_CUDNN=ON \
-DUSE_TVM_OP=ON \
-DUSE_TVM_OP=OFF \
-DPython3_EXECUTABLE=/usr/bin/python3 \
-DUSE_MKL_IF_AVAILABLE=OFF \
-DUSE_MKLML_MKL=OFF \
Expand All @@ -932,29 +912,6 @@ build_ubuntu_gpu_cmake_no_rtc() {
ninja
}

build_ubuntu_gpu_cmake_no_tvm_op() {
set -ex
cd /work/build
build_ccache_wrappers
cmake \
-DUSE_SIGNAL_HANDLER=ON \
-DUSE_CUDA=ON \
-DUSE_CUDNN=ON \
-DUSE_TVM_OP=OFF \
-DPython3_EXECUTABLE=/usr/bin/python3 \
-DUSE_MKL_IF_AVAILABLE=OFF \
-DUSE_MKLML_MKL=OFF \
-DUSE_MKLDNN=OFF \
-DUSE_DIST_KVSTORE=ON \
-DCMAKE_BUILD_TYPE=Release \
-DMXNET_CUDA_ARCH="$CI_CMAKE_CUDA_ARCH" \
-DBUILD_CYTHON_MODULES=1 \
-G Ninja \
/work/mxnet

ninja
}

build_ubuntu_cpu_large_tensor() {
set -ex
cd /work/build
Expand All @@ -980,7 +937,7 @@ build_ubuntu_gpu_large_tensor() {
-DUSE_SIGNAL_HANDLER=ON \
-DUSE_CUDA=ON \
-DUSE_CUDNN=ON \
-DUSE_TVM_OP=ON \
-DUSE_TVM_OP=OFF \
-DPython3_EXECUTABLE=/usr/bin/python3 \
-DUSE_MKL_IF_AVAILABLE=OFF \
-DUSE_MKLML_MKL=OFF \
Expand Down
Loading

0 comments on commit 9981e84

Please sign in to comment.