Use RTC for elementwise and broadcast ops (#18622)
* Reapplying PR #17767

* Making RTC required

* Move cuda utils to src/common/cuda and refactor RTC part

* Unary ops via RTC

* Support binary_scalar forward

Remove elemwise_scatter_op.*

Fix BinaryScalar usage in NumPy

* Backward of binary scalar

* Binary forward

* Fix for binary_scalar

* Moving all binary forward to RTC

Reorganization

* Backward of binary ops

* Support broadcast

Add RTC to NumPy ops

* RTC for elementwise sum

Fixes

* RTC for backward (UseNone) of broadcast

* RTC for broadcast backward (UseIn)

* Remove non-RTC vectorization support

* Remove template from ReduceWorkspaceSize

* Fixes from rebase

* Guarding RTC usage behind MXNET_USE_CUDA

* More guards

* C++17 for CUDA code

* MixedUnaryBackwardInOut as RTC

* Removing unused variable

* Revert "C++17 for CUDA code"

This reverts commit b09090c.

* Get rid of CI tests without RTC
Get rid of if constexpr as CUDA 10 does not support it

* Fix lint

* Change a few more elemwise functions
Fix for too long value

* Fix large tensor build

* Another try with DBL_MAX

* Fix Windows compilation

* Fix the large int test

* Add the printing of error code value to CUDA_DRIVER_CALL

* Fix

* Fix binary scalar

* Get more information when cuLaunchKernel fails

* Going easy on Windows compiler

* Fix lint

* Reorganization to split strings due to Windows compilation problems

* Fix error with uninitialized value

* Fix handling of different types for backward of binary scalar

* Decreasing RTC overhead

* Fix lint and remove rest of mentions of ENABLE_RTC

* Jetson with RTC

* Fix the aws s3 command

* Debugging Windows failure

* More debugging of Windows failure

* Debug

* Fix the issue on Windows (long -> long long for 8B)

* libcuda.so for Jetson

* Enable debug information for RTC kernels and cleaning debug ptx dump

* Fix lint

* Try without linking the stub of libcuda.so to a different place on Jetson

* Add docstring

* Answering review comments

* Unifying vectorization

* Fix

* Fixes for reduce ops

* Fix M=1 case

* Fixes from rebase
Fixes for mixed type gradient functions
Set the launch bounds on RTC kernels

* Fix

* Fix tests

* Adding tutorial for RTC

* Fixes after merge

* Fixes from review

* Change env var doc and undo the change to toctree
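
The commits above all build on one mechanism: instead of pre-compiling every elementwise and broadcast kernel ahead of time, the kernel source is assembled as a string, compiled at runtime with NVRTC, and launched through the CUDA driver API. Below is a minimal sketch of that flow for a hypothetical `vec_add` kernel; the names and structure are illustrative only, not MXNet's actual helpers (those live in src/common/cuda and differ in detail).

```cpp
#include <cuda.h>
#include <nvrtc.h>
#include <cstdio>
#include <vector>

// Kernel source assembled at runtime (illustrative).
const char* kSource = R"(
extern "C" __global__ void vec_add(const float* a, const float* b,
                                   float* out, int n) {
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n) out[i] = a[i] + b[i];
}
)";

int main() {
  // 1. Compile the source string to PTX with NVRTC.
  nvrtcProgram prog;
  nvrtcCreateProgram(&prog, kSource, "vec_add.cu", 0, nullptr, nullptr);
  const char* opts[] = {"--gpu-architecture=compute_70"};  // match your GPU
  if (nvrtcCompileProgram(prog, 1, opts) != NVRTC_SUCCESS) {
    size_t log_size;
    nvrtcGetProgramLogSize(prog, &log_size);
    std::vector<char> log(log_size);
    nvrtcGetProgramLog(prog, log.data());
    std::fprintf(stderr, "NVRTC error:\n%s\n", log.data());
    return 1;
  }
  size_t ptx_size;
  nvrtcGetPTXSize(prog, &ptx_size);
  std::vector<char> ptx(ptx_size);
  nvrtcGetPTX(prog, ptx.data());
  nvrtcDestroyProgram(&prog);

  // 2. Load the PTX and launch through the CUDA driver API.
  cuInit(0);
  CUdevice dev;
  cuDeviceGet(&dev, 0);
  CUcontext ctx;
  cuCtxCreate(&ctx, 0, dev);
  CUmodule mod;
  cuModuleLoadData(&mod, ptx.data());
  CUfunction fn;
  cuModuleGetFunction(&fn, mod, "vec_add");

  int n = 1 << 20;
  CUdeviceptr a, b, out;
  cuMemAlloc(&a, n * sizeof(float));
  cuMemAlloc(&b, n * sizeof(float));
  cuMemAlloc(&out, n * sizeof(float));
  void* args[] = {&a, &b, &out, &n};
  cuLaunchKernel(fn, (n + 255) / 256, 1, 1,   // grid
                 256, 1, 1,                   // block
                 0, nullptr, args, nullptr);  // shmem, stream, params, extra
  cuCtxSynchronize();

  cuMemFree(a); cuMemFree(b); cuMemFree(out);
  cuModuleUnload(mod);
  cuCtxDestroy(ctx);
  return 0;
}
```

In a real implementation the compiled module is cached and reused across launches, so the compilation cost is paid once per kernel/type combination rather than once per call.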
ptrendx authored Aug 20, 2020
1 parent bbc39fa commit 29d6f27
Showing 141 changed files with 7,274 additions and 3,548 deletions.
48 changes: 0 additions & 48 deletions 3rdparty/mshadow/mshadow/base.h
@@ -272,7 +272,6 @@ extern "C" {
 }
 
 #include "./half.h"
-#include "./half2.h"
 #include "./bfloat.h"
 #define MSHADOW_HALF_BF_OPERATOR(RTYPE, OP) \
   MSHADOW_XINLINE RTYPE operator OP(mshadow::half::half_t a, mshadow::bfloat::bf16_t b) { \
@@ -387,11 +386,6 @@ struct DataType<half::half_t> {
 #endif
 };
-template<>
-struct DataType<half::half2_t> {
-  static const int kFlag = kFloat16;
-  static const int kLanes = 2;
-};
 template<>
 struct DataType<bfloat::bf16_t> {
   static const int kFlag = kBfloat16;
   static const int kLanes = 1;
@@ -1144,48 +1138,6 @@ struct minimum {
 }
 #endif
 
-#define MSHADOW_TYPE_SWITCH_WITH_HALF2(type, DType, ...) \
-  switch (type) { \
-  case mshadow::kFloat32: \
-    { \
-      typedef float DType; \
-      {__VA_ARGS__} \
-    } \
-    break; \
-  case mshadow::kFloat64: \
-    { \
-      typedef double DType; \
-      {__VA_ARGS__} \
-    } \
-    break; \
-  case mshadow::kFloat16: \
-    { \
-      typedef mshadow::half::half2_t DType; \
-      {__VA_ARGS__} \
-    } \
-    break; \
-  case mshadow::kUint8: \
-    { \
-      typedef uint8_t DType; \
-      {__VA_ARGS__} \
-    } \
-    break; \
-  case mshadow::kInt32: \
-    { \
-      typedef int32_t DType; \
-      {__VA_ARGS__} \
-    } \
-    break; \
-  case mshadow::kInt64: \
-    { \
-      typedef int64_t DType; \
-      {__VA_ARGS__} \
-    } \
-    break; \
-  default: \
-    LOG(FATAL) << "Unknown type enum " << type; \
-  }
-
 #define MSHADOW_SGL_DBL_TYPE_SWITCH(type, DType, ...) \
   switch (type) { \
   case mshadow::kFloat32: \
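
For context, the macro deleted above follows mshadow's usual type-switch pattern: a runtime dtype flag is switched into a typedef so the body can be instantiated once per concrete type. A self-contained toy version of the pattern (illustrative, not mshadow's real code):

```cpp
#include <cstdint>
#include <iostream>

// Toy version of mshadow's type-switch pattern (illustrative only).
enum TypeFlag { kFloat32, kFloat64, kInt32 };

#define TYPE_SWITCH(type, DType, ...)                                \
  switch (type) {                                                    \
    case kFloat32: { typedef float DType;   {__VA_ARGS__} } break;   \
    case kFloat64: { typedef double DType;  {__VA_ARGS__} } break;   \
    case kInt32:   { typedef int32_t DType; {__VA_ARGS__} } break;   \
    default: std::cerr << "Unknown type enum " << type << "\n";      \
  }

template <typename DType>
void fill(DType* data, int n, double value) {
  for (int i = 0; i < n; ++i) data[i] = static_cast<DType>(value);
}

int main() {
  double storage[4];    // large enough for any branch
  int type = kFloat32;  // dtype known only at runtime
  TYPE_SWITCH(type, DType, {
    DType* data = reinterpret_cast<DType*>(storage);
    fill(data, 4, 1.5);            // body sees the concrete DType
    std::cout << data[0] << "\n";  // prints 1.5
  });
}
```

The _WITH_HALF2 variant existed solely to map kFloat16 to the two-lane half2_t type; once the GPU elementwise path generates its own vectorized code via RTC, the type switch no longer needs it.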
162 changes: 0 additions & 162 deletions 3rdparty/mshadow/mshadow/half2.h

This file was deleted.
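
half2.h defined half2_t, a two-lane fp16 type used to vectorize half-precision math ahead of time. The RTC path emits its own vectorized loads and arithmetic in the generated source, so the host-side type became dead weight. For reference, the kind of packed fp16 arithmetic it enabled looks like this in plain CUDA (illustrative sketch, requires sm_53+):

```cuda
#include <cuda_fp16.h>

// Two fp16 lanes processed per thread via the __half2 vector type.
__global__ void add_half2(const __half2* a, const __half2* b,
                          __half2* out, int n2) {  // n2 = n / 2
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n2) {
    out[i] = __hadd2(a[i], b[i]);  // adds both lanes in one instruction
  }
}
```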

16 changes: 4 additions & 12 deletions CMakeLists.txt
@@ -79,7 +79,6 @@ option(USE_MXNET_LIB_NAMING "Use MXNet library naming conventions." ON)
 option(USE_GPROF "Compile with gprof (profiling) flag" OFF)
 option(USE_VTUNE "Enable use of Intel Amplifier XE (VTune)" OFF) # one could set VTUNE_ROOT for search path
 option(USE_TVM_OP "Enable use of TVM operator build system." OFF)
-option(ENABLE_CUDA_RTC "Build with CUDA runtime compilation support" ON)
 option(BUILD_CPP_EXAMPLES "Build cpp examples" ON)
 option(INSTALL_EXAMPLES "Install the example source files." OFF)
 option(USE_SIGNAL_HANDLER "Print stack traces on segfaults." ON)
@@ -547,18 +546,11 @@ if(USE_CUDA)
 
   string(REPLACE ";" " " CUDA_ARCH_FLAGS_SPACES "${CUDA_ARCH_FLAGS}")
 
-  find_package(CUDAToolkit REQUIRED cublas cufft cusolver curand
-               OPTIONAL_COMPONENTS nvToolsExt nvrtc)
+  find_package(CUDAToolkit REQUIRED cublas cufft cusolver curand nvrtc cuda_driver
+               OPTIONAL_COMPONENTS nvToolsExt)
 
-  list(APPEND mxnet_LINKER_LIBS CUDA::cudart CUDA::cublas CUDA::cufft CUDA::cusolver CUDA::curand)
-  if(ENABLE_CUDA_RTC)
-    if(CUDA_nvrtc_LIBRARY)
-      list(APPEND mxnet_LINKER_LIBS CUDA::nvrtc cuda)
-      add_definitions(-DMXNET_ENABLE_CUDA_RTC=1)
-    else()
-      message(FATAL_ERROR "ENABLE_CUDA_RTC=ON, but failed to find NVRTC. CMake will exit." )
-    endif()
-  endif()
+  list(APPEND mxnet_LINKER_LIBS CUDA::cudart CUDA::cublas CUDA::cufft CUDA::cusolver CUDA::curand
+              CUDA::nvrtc CUDA::cuda_driver)
   list(APPEND SOURCE ${CUDA})
   add_definitions(-DMXNET_USE_CUDA=1)
 
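
Note that nvrtc and the driver library move from optional to required components here: the RTC path can no longer be disabled, so CUDA::nvrtc and CUDA::cuda_driver are always linked. Since every launch now goes through driver-API calls such as cuLaunchKernel, the "Add the printing of error code value to CUDA_DRIVER_CALL" commit matters for debugging. A sketch of what such a checking macro typically looks like (illustrative, not MXNet's exact definition):

```cpp
#include <cuda.h>
#include <cstdio>
#include <cstdlib>

// Illustrative error-checking wrapper for CUDA driver API calls.
// Printing the numeric code alongside the message helps when the
// code is too new for cuGetErrorString to recognize.
#define CUDA_DRIVER_CALL(func)                                     \
  do {                                                             \
    CUresult e = (func);                                           \
    if (e != CUDA_SUCCESS) {                                       \
      const char* msg = nullptr;                                   \
      cuGetErrorString(e, &msg);  /* sets msg to NULL if unknown */\
      std::fprintf(stderr, "CUDA driver error %d: %s\n",           \
                   static_cast<int>(e), msg ? msg : "unknown");    \
      std::abort();                                                \
    }                                                              \
  } while (0)

// Usage: CUDA_DRIVER_CALL(cuLaunchKernel(fn, gx, 1, 1, bx, 1, 1,
//                                        0, nullptr, args, nullptr));
```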
6 changes: 0 additions & 6 deletions ci/build_windows.py
@@ -61,7 +61,6 @@ class BuildFlavour(Enum):
 '-DCMAKE_CXX_COMPILER=cl '
 '-DUSE_CUDA=OFF '
 '-DUSE_CUDNN=OFF '
-'-DENABLE_CUDA_RTC=OFF '
 '-DUSE_OPENCV=ON '
 '-DUSE_OPENMP=ON '
 '-DUSE_BLAS=open '
@@ -76,7 +75,6 @@ class BuildFlavour(Enum):
 '-DCMAKE_CXX_COMPILER=cl '
 '-DUSE_CUDA=OFF '
 '-DUSE_CUDNN=OFF '
-'-DENABLE_CUDA_RTC=OFF '
 '-DUSE_OPENCV=ON '
 '-DUSE_OPENMP=ON '
 '-DUSE_BLAS=open '
@@ -91,7 +89,6 @@ class BuildFlavour(Enum):
 '-DCMAKE_CXX_COMPILER=cl '
 '-DUSE_CUDA=OFF '
 '-DUSE_CUDNN=OFF '
-'-DENABLE_CUDA_RTC=OFF '
 '-DUSE_OPENCV=ON '
 '-DUSE_OPENMP=ON '
 '-DUSE_BLAS=mkl '
@@ -106,7 +103,6 @@ class BuildFlavour(Enum):
 '-DCMAKE_CXX_COMPILER=cl '
 '-DUSE_CUDA=OFF '
 '-DUSE_CUDNN=OFF '
-'-DENABLE_CUDA_RTC=OFF '
 '-DUSE_OPENCV=ON '
 '-DUSE_OPENMP=ON '
 '-DUSE_BLAS=mkl '
@@ -121,7 +117,6 @@ class BuildFlavour(Enum):
 '-DCMAKE_CXX_COMPILER=cl '
 '-DUSE_CUDA=ON '
 '-DUSE_CUDNN=ON '
-'-DENABLE_CUDA_RTC=ON '
 '-DUSE_OPENCV=ON '
 '-DUSE_OPENMP=ON '
 '-DUSE_BLAS=open '
@@ -136,7 +131,6 @@ class BuildFlavour(Enum):
 '-DCMAKE_CXX_COMPILER=cl '
 '-DUSE_CUDA=ON '
 '-DUSE_CUDNN=ON '
-'-DENABLE_CUDA_RTC=ON '
 '-DUSE_OPENCV=ON '
 '-DUSE_OPENMP=ON '
 '-DUSE_BLAS=open '
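
Aside: several commits above chase Windows-only failures, and the "long -> long long for 8B" fix is a data-model issue worth spelling out. Windows is LLP64, so long is 4 bytes even in 64-bit builds, while LP64 Linux makes it 8; type names emitted into RTC-generated source must therefore say long long when they mean an 8-byte integer. A quick illustration in standard C++:

```cpp
#include <cstdio>

int main() {
  // LP64 Linux/macOS: sizeof(long) == 8.  LLP64 Windows: sizeof(long) == 4.
  // sizeof(long long) == 8 on both, which is why generated kernel code
  // spells 8-byte integers as "long long".
  std::printf("sizeof(long)      = %zu\n", sizeof(long));
  std::printf("sizeof(long long) = %zu\n", sizeof(long long));
}
```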
22 changes: 0 additions & 22 deletions ci/docker/runtime_functions.sh
@@ -142,7 +142,6 @@ build_jetson() {
         -DCMAKE_TOOLCHAIN_FILE=${CMAKE_TOOLCHAIN_FILE} \
         -DUSE_CUDA=ON \
         -DMXNET_CUDA_ARCH="5.2" \
-        -DENABLE_CUDA_RTC=OFF \
         -DUSE_OPENCV=OFF \
         -DUSE_OPENMP=ON \
         -DUSE_LAPACK=OFF \
@@ -670,27 +669,6 @@ build_ubuntu_gpu_cmake() {
     ninja
 }
 
-build_ubuntu_gpu_cmake_no_rtc() {
-    set -ex
-    cd /work/build
-    CC=gcc-7 CXX=g++-7 cmake \
-        -DUSE_SIGNAL_HANDLER=ON \
-        -DUSE_CUDA=ON \
-        -DUSE_CUDNN=ON \
-        -DUSE_MKL_IF_AVAILABLE=OFF \
-        -DUSE_MKLML_MKL=OFF \
-        -DUSE_MKLDNN=ON \
-        -DUSE_DIST_KVSTORE=ON \
-        -DCMAKE_BUILD_TYPE=Release \
-        -DMXNET_CUDA_ARCH="$CI_CMAKE_CUDA_ARCH" \
-        -DBUILD_CYTHON_MODULES=1 \
-        -DENABLE_CUDA_RTC=OFF \
-        -G Ninja \
-        /work/mxnet
-
-    ninja
-}
-
 build_ubuntu_cpu_large_tensor() {
     set -ex
     cd /work/build
14 changes: 0 additions & 14 deletions ci/jenkins/Jenkins_steps.groovy
@@ -258,20 +258,6 @@ def compile_unix_cmake_gpu(lib_name) {
   }]
 }
 
-def compile_unix_cmake_gpu_no_rtc(lib_name) {
-  return ['GPU: CMake CUDA RTC OFF': {
-    node(NODE_LINUX_CPU) {
-      ws('workspace/build-cmake-gpu-no-rtc') {
-        timeout(time: max_time, unit: 'MINUTES') {
-          utils.init_git()
-          utils.docker_run('ubuntu_gpu_cu101', 'build_ubuntu_gpu_cmake_no_rtc', false)
-          utils.pack_lib(lib_name, mx_cmake_lib)
-        }
-      }
-    }
-  }]
-}
-
 def compile_unix_tensorrt_gpu(lib_name) {
   return ['TensorRT': {
     node(NODE_LINUX_CPU) {
1 change: 0 additions & 1 deletion ci/jenkins/Jenkinsfile_unix_gpu
@@ -41,7 +41,6 @@ core_logic: {
     custom_steps.compile_unix_cmake_gpu('cmake_gpu'),
     custom_steps.compile_unix_tensorrt_gpu('tensorrt'),
     custom_steps.compile_unix_int64_gpu('gpu_int64'),
-    custom_steps.compile_unix_cmake_gpu_no_rtc('gpu_no_rtc'),
   ])
 
   utils.parallel_stage('Tests', [
1 change: 0 additions & 1 deletion config/darwin.cmake
@@ -126,5 +126,4 @@ set(USE_INT64_TENSOR_SIZE OFF CACHE BOOL "Use int64_t to represent the total num
 # Other GPU features
 set(USE_NCCL "Use NVidia NCCL with CUDA" OFF)
 set(NCCL_ROOT "" CACHE BOOL "NCCL install path. Supports autodetection.")
-set(ENABLE_CUDA_RTC ON CACHE BOOL "Build with CUDA runtime compilation support")
 set(USE_NVTX ON CACHE BOOL "Build with NVTX support")
1 change: 0 additions & 1 deletion config/linux.cmake
@@ -125,5 +125,4 @@ set(USE_INT64_TENSOR_SIZE OFF CACHE BOOL "Use int64_t to represent the total num
 # Other GPU features
 set(USE_NCCL "Use NVidia NCCL with CUDA" OFF)
 set(NCCL_ROOT "" CACHE BOOL "NCCL install path. Supports autodetection.")
-set(ENABLE_CUDA_RTC ON CACHE BOOL "Build with CUDA runtime compilation support")
 set(USE_NVTX ON CACHE BOOL "Build with NVTX support")
1 change: 0 additions & 1 deletion config/linux_gpu.cmake
@@ -125,5 +125,4 @@ set(USE_INT64_TENSOR_SIZE OFF CACHE BOOL "Use int64_t to represent the total num
 # Other GPU features
 set(USE_NCCL "Use NVidia NCCL with CUDA" OFF)
 set(NCCL_ROOT "" CACHE BOOL "NCCL install path. Supports autodetection.")
-set(ENABLE_CUDA_RTC ON CACHE BOOL "Build with CUDA runtime compilation support")
 set(USE_NVTX ON CACHE BOOL "Build with NVTX support")
(Diffs for the remaining changed files are omitted.)