From fd21d7c1ac458388afc077a0a021d08a3c3def05 Mon Sep 17 00:00:00 2001 From: Leonard Lausen Date: Mon, 30 Dec 2019 17:37:43 +0800 Subject: [PATCH] Switch to modern CMake CUDA handling (#17031) Introduce unified MXNET_CUDA_ARCH option to specify cuda architectures. Previously cuda architecture setting was partially broken and different options were applied to different parts of the build (CUDA_ARCH_NAME CUDA_ARCH_BIN CUDA_ARCH_PTX and CUDA_ARCH_LIST). Include FindCUDAToolkit from CMake 3.17, which replaces the deprecated FindCUDA functionality for finding the cuda toolkit include directories and libraries. Workaround for DLL size limitation on Windows (#16980) * change windows build system. add gen_warp cpp version add add_custom_command to run warp_gen add download cmake add option change option add dynamic read mxnet dll --- 3rdparty/mshadow/cmake/Cuda.cmake | 324 --------- 3rdparty/mshadow/cmake/Utils.cmake | 398 ----------- 3rdparty/mshadow/cmake/mshadow.cmake | 91 --- 3rdparty/mshadow/cmake/mshadowUtils.cmake | 2 - CMakeLists.txt | 422 +++++------ ci/build_windows.py | 8 +- ci/docker/install/ubuntu_core.sh | 2 +- ci/docker/runtime_functions.sh | 17 +- cmake/BuildTVM.cmake | 26 +- cmake/FirstClassLangCuda.cmake | 277 ------- cmake/Modules/FindCUDAToolkit.cmake | 833 ++++++++++++++++++++++ contrib/tvmop/compile.py | 5 + tools/windowsbuild/README.md | 19 + tools/windowsbuild/gen_warp.cpp | 209 ++++++ tools/windowsbuild/warp_dll.cpp | 151 ++++ 15 files changed, 1463 insertions(+), 1321 deletions(-) delete mode 100644 3rdparty/mshadow/cmake/Cuda.cmake delete mode 100644 3rdparty/mshadow/cmake/Utils.cmake delete mode 100644 3rdparty/mshadow/cmake/mshadow.cmake delete mode 100644 3rdparty/mshadow/cmake/mshadowUtils.cmake delete mode 100644 cmake/FirstClassLangCuda.cmake create mode 100644 cmake/Modules/FindCUDAToolkit.cmake create mode 100644 tools/windowsbuild/README.md create mode 100644 tools/windowsbuild/gen_warp.cpp create mode 100644 tools/windowsbuild/warp_dll.cpp diff --git a/3rdparty/mshadow/cmake/Cuda.cmake b/3rdparty/mshadow/cmake/Cuda.cmake deleted file mode 100644 index bc09a3905076..000000000000 --- a/3rdparty/mshadow/cmake/Cuda.cmake +++ /dev/null @@ -1,324 +0,0 @@ -if(NOT USE_CUDA) - return() -endif() - -include(CheckCXXCompilerFlag) -check_cxx_compiler_flag("-std=c++11" SUPPORT_CXX11) - -################################################################################################ -# A function for automatic detection of GPUs installed (if autodetection is enabled) -# Usage: -# mshadow_detect_installed_gpus(out_variable) -function(mshadow_detect_installed_gpus out_variable) -set(CUDA_gpu_detect_output "") - if(NOT CUDA_gpu_detect_output) - message(STATUS "Running GPU architecture autodetection") - set(__cufile ${PROJECT_BINARY_DIR}/detect_cuda_archs.cu) - - file(WRITE ${__cufile} "" - "#include \n" - "#include \n" - "using namespace std;\n" - "int main()\n" - "{\n" - " int count = 0;\n" - " if (cudaSuccess != cudaGetDeviceCount(&count)) { return -1; }\n" - " if (count == 0) { cerr << \"No cuda devices detected\" << endl; return -1; }\n" - " for (int device = 0; device < count; ++device)\n" - " {\n" - " cudaDeviceProp prop;\n" - " if (cudaSuccess == cudaGetDeviceProperties(&prop, device))\n" - " std::printf(\"%d.%d \", prop.major, prop.minor);\n" - " }\n" - " return 0;\n" - "}\n") - if(MSVC) - #find vcvarsall.bat and run it building msvc environment - get_filename_component(MY_COMPILER_DIR ${CMAKE_CXX_COMPILER} DIRECTORY) - find_file(MY_VCVARSALL_BAT vcvarsall.bat 
"${MY_COMPILER_DIR}/.." "${MY_COMPILER_DIR}/../..") - execute_process(COMMAND ${MY_VCVARSALL_BAT} && ${CUDA_NVCC_EXECUTABLE} -arch sm_30 --run ${__cufile} - WORKING_DIRECTORY "${PROJECT_BINARY_DIR}/CMakeFiles/" - RESULT_VARIABLE __nvcc_res OUTPUT_VARIABLE __nvcc_out - OUTPUT_STRIP_TRAILING_WHITESPACE) - else() - if(CUDA_LIBRARY_PATH) - set(CUDA_LINK_LIBRARY_PATH "-L${CUDA_LIBRARY_PATH}") - endif() - execute_process(COMMAND ${CUDA_NVCC_EXECUTABLE} -arch sm_30 --run ${__cufile} ${CUDA_LINK_LIBRARY_PATH} - WORKING_DIRECTORY "${PROJECT_BINARY_DIR}/CMakeFiles/" - RESULT_VARIABLE __nvcc_res OUTPUT_VARIABLE __nvcc_out - OUTPUT_STRIP_TRAILING_WHITESPACE) - endif() - if(__nvcc_res EQUAL 0) - # nvcc outputs text containing line breaks when building with MSVC. - # The line below prevents CMake from inserting a variable with line - # breaks in the cache - message(STATUS "Found CUDA arch ${__nvcc_out}") - string(REGEX MATCH "([1-9].[0-9])" __nvcc_out "${__nvcc_out}") - string(REPLACE "2.1" "2.1(2.0)" __nvcc_out "${__nvcc_out}") - set(CUDA_gpu_detect_output ${__nvcc_out} CACHE INTERNAL "Returned GPU architetures from mshadow_detect_gpus tool" FORCE) - else() - message(WARNING "Running GPU detection script with nvcc failed: ${__nvcc_out}") - endif() - endif() - - if(NOT CUDA_gpu_detect_output) - message(WARNING "Automatic GPU detection failed. Building for all known architectures (${mshadow_known_gpu_archs}).") - set(${out_variable} ${mshadow_known_gpu_archs} PARENT_SCOPE) - else() - set(${out_variable} ${CUDA_gpu_detect_output} PARENT_SCOPE) - endif() -endfunction() - - -################################################################################################ -# Function for selecting GPU arch flags for nvcc based on CUDA_ARCH_NAME -# Usage: -# mshadow_select_nvcc_arch_flags(out_variable) -function(mshadow_select_nvcc_arch_flags out_variable) - # List of arch names - set(__archs_names "Fermi" "Kepler" "Maxwell" "Pascal" "Volta" "All" "Manual") - set(__archs_name_default "All") - if(NOT CMAKE_CROSSCOMPILING) - list(APPEND __archs_names "Auto") - set(__archs_name_default "Auto") - endif() - - # set CUDA_ARCH_NAME strings (so it will be seen as dropbox in CMake-Gui) - set(CUDA_ARCH_NAME ${__archs_name_default} CACHE STRING "Select target NVIDIA GPU achitecture.") - set_property( CACHE CUDA_ARCH_NAME PROPERTY STRINGS "" ${__archs_names} ) - mark_as_advanced(CUDA_ARCH_NAME) - - # verify CUDA_ARCH_NAME value - if(NOT ";${__archs_names};" MATCHES ";${CUDA_ARCH_NAME};") - string(REPLACE ";" ", " __archs_names "${__archs_names}") - message(FATAL_ERROR "Only ${__archs_names} architeture names are supported.") - endif() - - if(${CUDA_ARCH_NAME} STREQUAL "Manual") - set(CUDA_ARCH_BIN ${mshadow_known_gpu_archs} CACHE STRING "Specify 'real' GPU architectures to build binaries for, BIN(PTX) format is supported") - set(CUDA_ARCH_PTX "50" CACHE STRING "Specify 'virtual' PTX architectures to build PTX intermediate code for") - mark_as_advanced(CUDA_ARCH_BIN CUDA_ARCH_PTX) - else() - unset(CUDA_ARCH_BIN CACHE) - unset(CUDA_ARCH_PTX CACHE) - endif() - - if(${CUDA_ARCH_NAME} STREQUAL "Fermi") - set(__cuda_arch_bin "20 21(20)") - elseif(${CUDA_ARCH_NAME} STREQUAL "Kepler") - set(__cuda_arch_bin "30 35") - elseif(${CUDA_ARCH_NAME} STREQUAL "Maxwell") - set(__cuda_arch_bin "50") - elseif(${CUDA_ARCH_NAME} STREQUAL "Pascal") - set(__cuda_arch_bin "60 61") - elseif(${CUDA_ARCH_NAME} STREQUAL "Volta") - set(__cuda_arch_bin "70") - elseif(${CUDA_ARCH_NAME} STREQUAL "All") - set(__cuda_arch_bin ${mshadow_known_gpu_archs}) - 
elseif(${CUDA_ARCH_NAME} STREQUAL "Auto") - mshadow_detect_installed_gpus(__cuda_arch_bin) - else() # (${CUDA_ARCH_NAME} STREQUAL "Manual") - set(__cuda_arch_bin ${CUDA_ARCH_BIN}) - endif() - - # remove dots and convert to lists - string(REGEX REPLACE "\\." "" __cuda_arch_bin "${__cuda_arch_bin}") - string(REGEX REPLACE "\\." "" __cuda_arch_ptx "${CUDA_ARCH_PTX}") - string(REGEX MATCHALL "[0-9()]+" __cuda_arch_bin "${__cuda_arch_bin}") - string(REGEX MATCHALL "[0-9]+" __cuda_arch_ptx "${__cuda_arch_ptx}") - mshadow_list_unique(__cuda_arch_bin __cuda_arch_ptx) - - set(__nvcc_flags "") - set(__nvcc_archs_readable "") - - # Tell NVCC to add binaries for the specified GPUs - foreach(__arch ${__cuda_arch_bin}) - if(__arch MATCHES "([0-9]+)\\(([0-9]+)\\)") - # User explicitly specified PTX for the concrete BIN - list(APPEND __nvcc_flags -gencode arch=compute_${CMAKE_MATCH_2},code=sm_${CMAKE_MATCH_1}) - list(APPEND __nvcc_archs_readable sm_${CMAKE_MATCH_1}) - else() - # User didn't explicitly specify PTX for the concrete BIN, we assume PTX=BIN - list(APPEND __nvcc_flags -gencode arch=compute_${__arch},code=sm_${__arch}) - list(APPEND __nvcc_archs_readable sm_${__arch}) - endif() - endforeach() - - # Tell NVCC to add PTX intermediate code for the specified architectures - foreach(__arch ${__cuda_arch_ptx}) - list(APPEND __nvcc_flags -gencode arch=compute_${__arch},code=compute_${__arch}) - list(APPEND __nvcc_archs_readable compute_${__arch}) - endforeach() - - string(REPLACE ";" " " __nvcc_archs_readable "${__nvcc_archs_readable}") - set(${out_variable} ${__nvcc_flags} PARENT_SCOPE) - set(${out_variable}_readable ${__nvcc_archs_readable} PARENT_SCOPE) -endfunction() - -################################################################################################ -# Short command for cuda comnpilation -# Usage: -# mshadow_cuda_compile( ) -macro(mshadow_cuda_compile objlist_variable) - foreach(var CMAKE_CXX_FLAGS CMAKE_CXX_FLAGS_RELEASE CMAKE_CXX_FLAGS_DEBUG) - set(${var}_backup_in_cuda_compile_ "${${var}}") - - # we remove /EHa as it generates warnings under windows - string(REPLACE "/EHa" "" ${var} "${${var}}") - - endforeach() - if(UNIX OR APPLE) - list(APPEND CUDA_NVCC_FLAGS -Xcompiler -fPIC) - endif() - - if(APPLE) - list(APPEND CUDA_NVCC_FLAGS -Xcompiler -Wno-unused-function) - endif() - - set(CUDA_NVCC_FLAGS_DEBUG "${CUDA_NVCC_FLAGS_DEBUG} -G") - - if(MSVC) - # disable noisy warnings: - # 4819: The file contains a character that cannot be represented in the current code page (number). 
- list(APPEND CUDA_NVCC_FLAGS -Xcompiler "/wd4819") - foreach(flag_var - CMAKE_CXX_FLAGS CMAKE_CXX_FLAGS_DEBUG CMAKE_CXX_FLAGS_RELEASE - CMAKE_CXX_FLAGS_MINSIZEREL CMAKE_CXX_FLAGS_RELWITHDEBINFO) - if(${flag_var} MATCHES "/MD") - string(REGEX REPLACE "/MD" "/MT" ${flag_var} "${${flag_var}}") - endif(${flag_var} MATCHES "/MD") - endforeach(flag_var) - endif() - - # If the build system is a container, make sure the nvcc intermediate files - # go into the build output area rather than in /tmp, which may run out of space - if(IS_CONTAINER_BUILD) - set(CUDA_NVCC_INTERMEDIATE_DIR "${CMAKE_CURRENT_BINARY_DIR}") - message(STATUS "Container build enabled, so nvcc intermediate files in: ${CUDA_NVCC_INTERMEDIATE_DIR}") - list(APPEND CUDA_NVCC_FLAGS "--keep --keep-dir ${CUDA_NVCC_INTERMEDIATE_DIR}") - endif() - - cuda_compile(cuda_objcs ${ARGN}) - - foreach(var CMAKE_CXX_FLAGS CMAKE_CXX_FLAGS_RELEASE CMAKE_CXX_FLAGS_DEBUG) - set(${var} "${${var}_backup_in_cuda_compile_}") - unset(${var}_backup_in_cuda_compile_) - endforeach() - - set(${objlist_variable} ${cuda_objcs}) -endmacro() - -################################################################################################ -# Short command for cuDNN detection. Believe it soon will be a part of CUDA toolkit distribution. -# That's why not FindcuDNN.cmake file, but just the macro -# Usage: -# detect_cuDNN() -function(detect_cuDNN) - set(CUDNN_ROOT "" CACHE PATH "CUDNN root folder") - - find_path(CUDNN_INCLUDE cudnn.h - PATHS ${CUDNN_ROOT} $ENV{CUDNN_ROOT} ${CUDA_TOOLKIT_INCLUDE} - DOC "Path to cuDNN include directory." ) - - get_filename_component(__libpath_hist ${CUDA_CUDART_LIBRARY} PATH) - find_library(CUDNN_LIBRARY NAMES libcudnn.so cudnn.lib # libcudnn_static.a - PATHS ${CUDNN_ROOT} $ENV{CUDNN_ROOT} ${CUDNN_INCLUDE} ${__libpath_hist} - DOC "Path to cuDNN library.") - - if(CUDNN_INCLUDE AND CUDNN_LIBRARY) - set(HAVE_CUDNN TRUE PARENT_SCOPE) - set(CUDNN_FOUND TRUE PARENT_SCOPE) - - mark_as_advanced(CUDNN_INCLUDE CUDNN_LIBRARY CUDNN_ROOT) - message(STATUS "Found cuDNN (include: ${CUDNN_INCLUDE}, library: ${CUDNN_LIBRARY})") - endif() -endfunction() - - -################################################################################################ -### Non macro section -################################################################################################ - -# Try to prime CUDA_TOOLKIT_ROOT_DIR by looking for libcudart.so -if(NOT CUDA_TOOLKIT_ROOT_DIR) - find_library(CUDA_LIBRARY_PATH libcudart.so PATHS ENV LD_LIBRARY_PATH PATH_SUFFIXES lib lib64) - if(CUDA_LIBRARY_PATH) - get_filename_component(CUDA_LIBRARY_PATH ${CUDA_LIBRARY_PATH} DIRECTORY) - set(CUDA_TOOLKIT_ROOT_DIR "${CUDA_LIBRARY_PATH}/..") - endif() -endif() - -find_package(CUDA 5.5 QUIET REQUIRED) -find_cuda_helper_libs(curand) # cmake 2.8.7 compartibility which doesn't search for curand - -if(NOT CUDA_FOUND) - return() -endif() - -set(HAVE_CUDA TRUE) -message(STATUS "CUDA detected: " ${CUDA_VERSION}) -include_directories(SYSTEM ${CUDA_INCLUDE_DIRS}) -list(APPEND mshadow_LINKER_LIBS ${CUDA_CUDART_LIBRARY} - ${CUDA_curand_LIBRARY} ${CUDA_CUBLAS_LIBRARIES}) - -# Known NVIDIA GPU achitectures mshadow can be compiled for. 
-# This list will be used for CUDA_ARCH_NAME = All option -if(CUDA_ARCH_ALL) - set(mshadow_known_gpu_archs "${CUDA_ARCH_ALL}") -else() - if(${CUDA_VERSION} EQUAL 9.0 OR ${CUDA_VERSION} GREATER 9.0) - set(mshadow_known_gpu_archs "30 35 50 52 60 61 70") - elseif(${CUDA_VERSION} EQUAL 8.0 OR ${CUDA_VERSION} GREATER 8.0) - set(mshadow_known_gpu_archs "30 35 50 52 60 61") - else() - set(mshadow_known_gpu_archs "30 35 50 52") - endif() -endif() - -# cudnn detection -if(USE_CUDNN) - detect_cuDNN() - if(HAVE_CUDNN) - add_definitions(-DUSE_CUDNN) - include_directories(SYSTEM ${CUDNN_INCLUDE}) - list(APPEND mshadow_LINKER_LIBS ${CUDNN_LIBRARY}) - endif() -endif() - -# setting nvcc arch flags -mshadow_select_nvcc_arch_flags(NVCC_FLAGS_EXTRA) -list(APPEND CUDA_NVCC_FLAGS ${NVCC_FLAGS_EXTRA}) -message(STATUS "Added CUDA NVCC flags for: ${NVCC_FLAGS_EXTRA_readable}") - -# Boost 1.55 workaround, see https://svn.boost.org/trac/boost/ticket/9392 or -# https://github.com/ComputationalRadiationPhysics/picongpu/blob/master/src/picongpu/CMakeLists.txt -if(Boost_VERSION EQUAL 105500) - message(STATUS "Cuda + Boost 1.55: Applying noinline work around") - # avoid warning for CMake >= 2.8.12 - set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} \"-DBOOST_NOINLINE=__attribute__((noinline))\" ") -endif() - -# disable some nvcc diagnostic that apears in boost, glog, glags, opencv, etc. -foreach(diag cc_clobber_ignored integer_sign_change useless_using_declaration set_but_not_used) - list(APPEND CUDA_NVCC_FLAGS -Xcudafe --diag_suppress=${diag}) -endforeach() - -# setting default testing device -if(NOT CUDA_TEST_DEVICE) - set(CUDA_TEST_DEVICE -1) -endif() - -mark_as_advanced(CUDA_BUILD_CUBIN CUDA_BUILD_EMULATION CUDA_VERBOSE_BUILD) -mark_as_advanced(CUDA_SDK_ROOT_DIR CUDA_SEPARABLE_COMPILATION) - -# Handle clang/libc++ issue -if(APPLE) - mshadow_detect_darwin_version(OSX_VERSION) - - # OSX 10.9 and higher uses clang/libc++ by default which is incompartible with old CUDA toolkits - if(OSX_VERSION VERSION_GREATER 10.8) - # enabled by default if and only if CUDA version is less than 7.0 - mshadow_option(USE_libstdcpp "Use libstdc++ instead of libc++" (CUDA_VERSION VERSION_LESS 7.0)) - endif() -endif() diff --git a/3rdparty/mshadow/cmake/Utils.cmake b/3rdparty/mshadow/cmake/Utils.cmake deleted file mode 100644 index dc464f0092f5..000000000000 --- a/3rdparty/mshadow/cmake/Utils.cmake +++ /dev/null @@ -1,398 +0,0 @@ -################################################################################################ -# Command alias for debugging messages -# Usage: -# dmsg() -function(dmsg) - message(STATUS ${ARGN}) -endfunction() - -################################################################################################ -# Removes duplicates from list(s) -# Usage: -# mshadow_list_unique( [] [...]) -macro(mshadow_list_unique) - foreach(__lst ${ARGN}) - if(${__lst}) - list(REMOVE_DUPLICATES ${__lst}) - endif() - endforeach() -endmacro() - -################################################################################################ -# Clears variables from list -# Usage: -# mshadow_clear_vars() -macro(mshadow_clear_vars) - foreach(_var ${ARGN}) - unset(${_var}) - endforeach() -endmacro() - -################################################################################################ -# Removes duplicates from string -# Usage: -# mshadow_string_unique() -function(mshadow_string_unique __string) - if(${__string}) - set(__list ${${__string}}) - separate_arguments(__list) - list(REMOVE_DUPLICATES __list) - foreach(__e ${__list}) - 
set(__str "${__str} ${__e}") - endforeach() - set(${__string} ${__str} PARENT_SCOPE) - endif() -endfunction() - -################################################################################################ -# Prints list element per line -# Usage: -# mshadow_print_list() -function(mshadow_print_list) - foreach(e ${ARGN}) - message(STATUS ${e}) - endforeach() -endfunction() - -################################################################################################ -# Function merging lists of compiler flags to single string. -# Usage: -# mshadow_merge_flag_lists(out_variable [] [] ...) -function(mshadow_merge_flag_lists out_var) - set(__result "") - foreach(__list ${ARGN}) - foreach(__flag ${${__list}}) - string(STRIP ${__flag} __flag) - set(__result "${__result} ${__flag}") - endforeach() - endforeach() - string(STRIP ${__result} __result) - set(${out_var} ${__result} PARENT_SCOPE) -endfunction() - -################################################################################################ -# Converts all paths in list to absolute -# Usage: -# mshadow_convert_absolute_paths() -function(mshadow_convert_absolute_paths variable) - set(__dlist "") - foreach(__s ${${variable}}) - get_filename_component(__abspath ${__s} ABSOLUTE) - list(APPEND __list ${__abspath}) - endforeach() - set(${variable} ${__list} PARENT_SCOPE) -endfunction() - -################################################################################################ -# Reads set of version defines from the header file -# Usage: -# mshadow_parse_header( ..) -macro(mshadow_parse_header FILENAME FILE_VAR) - set(vars_regex "") - set(__parnet_scope OFF) - set(__add_cache OFF) - foreach(name ${ARGN}) - if("${name}" STREQUAL "PARENT_SCOPE") - set(__parnet_scope ON) - elseif("${name}" STREQUAL "CACHE") - set(__add_cache ON) - elseif(vars_regex) - set(vars_regex "${vars_regex}|${name}") - else() - set(vars_regex "${name}") - endif() - endforeach() - if(EXISTS "${FILENAME}") - file(STRINGS "${FILENAME}" ${FILE_VAR} REGEX "#define[ \t]+(${vars_regex})[ \t]+[0-9]+" ) - else() - unset(${FILE_VAR}) - endif() - foreach(name ${ARGN}) - if(NOT "${name}" STREQUAL "PARENT_SCOPE" AND NOT "${name}" STREQUAL "CACHE") - if(${FILE_VAR}) - if(${FILE_VAR} MATCHES ".+[ \t]${name}[ \t]+([0-9]+).*") - string(REGEX REPLACE ".+[ \t]${name}[ \t]+([0-9]+).*" "\\1" ${name} "${${FILE_VAR}}") - else() - set(${name} "") - endif() - if(__add_cache) - set(${name} ${${name}} CACHE INTERNAL "${name} parsed from ${FILENAME}" FORCE) - elseif(__parnet_scope) - set(${name} "${${name}}" PARENT_SCOPE) - endif() - else() - unset(${name} CACHE) - endif() - endif() - endforeach() -endmacro() - -################################################################################################ -# Reads single version define from the header file and parses it -# Usage: -# mshadow_parse_header_single_define( ) -function(mshadow_parse_header_single_define LIBNAME HDR_PATH VARNAME) - set(${LIBNAME}_H "") - if(EXISTS "${HDR_PATH}") - file(STRINGS "${HDR_PATH}" ${LIBNAME}_H REGEX "^#define[ \t]+${VARNAME}[ \t]+\"[^\"]*\".*$" LIMIT_COUNT 1) - endif() - - if(${LIBNAME}_H) - string(REGEX REPLACE "^.*[ \t]${VARNAME}[ \t]+\"([0-9]+).*$" "\\1" ${LIBNAME}_VERSION_MAJOR "${${LIBNAME}_H}") - string(REGEX REPLACE "^.*[ \t]${VARNAME}[ \t]+\"[0-9]+\\.([0-9]+).*$" "\\1" ${LIBNAME}_VERSION_MINOR "${${LIBNAME}_H}") - string(REGEX REPLACE "^.*[ \t]${VARNAME}[ \t]+\"[0-9]+\\.[0-9]+\\.([0-9]+).*$" "\\1" ${LIBNAME}_VERSION_PATCH "${${LIBNAME}_H}") - set(${LIBNAME}_VERSION_MAJOR 
${${LIBNAME}_VERSION_MAJOR} ${ARGN} PARENT_SCOPE) - set(${LIBNAME}_VERSION_MINOR ${${LIBNAME}_VERSION_MINOR} ${ARGN} PARENT_SCOPE) - set(${LIBNAME}_VERSION_PATCH ${${LIBNAME}_VERSION_PATCH} ${ARGN} PARENT_SCOPE) - set(${LIBNAME}_VERSION_STRING "${${LIBNAME}_VERSION_MAJOR}.${${LIBNAME}_VERSION_MINOR}.${${LIBNAME}_VERSION_PATCH}" PARENT_SCOPE) - - # append a TWEAK version if it exists: - set(${LIBNAME}_VERSION_TWEAK "") - if("${${LIBNAME}_H}" MATCHES "^.*[ \t]${VARNAME}[ \t]+\"[0-9]+\\.[0-9]+\\.[0-9]+\\.([0-9]+).*$") - set(${LIBNAME}_VERSION_TWEAK "${CMAKE_MATCH_1}" ${ARGN} PARENT_SCOPE) - endif() - if(${LIBNAME}_VERSION_TWEAK) - set(${LIBNAME}_VERSION_STRING "${${LIBNAME}_VERSION_STRING}.${${LIBNAME}_VERSION_TWEAK}" ${ARGN} PARENT_SCOPE) - else() - set(${LIBNAME}_VERSION_STRING "${${LIBNAME}_VERSION_STRING}" ${ARGN} PARENT_SCOPE) - endif() - endif() -endfunction() - -######################################################################################################## -# An option that the user can select. Can accept condition to control when option is available for user. -# Usage: -# mshadow_option( "doc string" [IF ]) -function(mshadow_option variable description value) - set(__value ${value}) - set(__condition "") - set(__varname "__value") - foreach(arg ${ARGN}) - if(arg STREQUAL "IF" OR arg STREQUAL "if") - set(__varname "__condition") - else() - list(APPEND ${__varname} ${arg}) - endif() - endforeach() - unset(__varname) - if("${__condition}" STREQUAL "") - set(__condition 2 GREATER 1) - endif() - - if(${__condition}) - if("${__value}" MATCHES ";") - if(${__value}) - option(${variable} "${description}" ON) - else() - option(${variable} "${description}" OFF) - endif() - elseif(DEFINED ${__value}) - if(${__value}) - option(${variable} "${description}" ON) - else() - option(${variable} "${description}" OFF) - endif() - else() - option(${variable} "${description}" ${__value}) - endif() - else() - unset(${variable} CACHE) - endif() -endfunction() - -################################################################################################ -# Utility macro for comparing two lists. Used for CMake debugging purposes -# Usage: -# mshadow_compare_lists( [description]) -function(mshadow_compare_lists list1 list2 desc) - set(__list1 ${${list1}}) - set(__list2 ${${list2}}) - list(SORT __list1) - list(SORT __list2) - list(LENGTH __list1 __len1) - list(LENGTH __list2 __len2) - - if(NOT ${__len1} EQUAL ${__len2}) - message(FATAL_ERROR "Lists are not equal. ${__len1} != ${__len2}. ${desc}") - endif() - - foreach(__i RANGE 1 ${__len1}) - math(EXPR __index "${__i}- 1") - list(GET __list1 ${__index} __item1) - list(GET __list2 ${__index} __item2) - if(NOT ${__item1} STREQUAL ${__item2}) - message(FATAL_ERROR "Lists are not equal. Differ at element ${__index}. 
${desc}") - endif() - endforeach() -endfunction() - -################################################################################################ -# Command for disabling warnings for different platforms (see below for gcc and VisualStudio) -# Usage: -# mshadow_warnings_disable( -Wshadow /wd4996 ..,) -macro(mshadow_warnings_disable) - set(_flag_vars "") - set(_msvc_warnings "") - set(_gxx_warnings "") - - foreach(arg ${ARGN}) - if(arg MATCHES "^CMAKE_") - list(APPEND _flag_vars ${arg}) - elseif(arg MATCHES "^/wd") - list(APPEND _msvc_warnings ${arg}) - elseif(arg MATCHES "^-W") - list(APPEND _gxx_warnings ${arg}) - endif() - endforeach() - - if(NOT _flag_vars) - set(_flag_vars CMAKE_C_FLAGS CMAKE_CXX_FLAGS) - endif() - - if(MSVC AND _msvc_warnings) - foreach(var ${_flag_vars}) - foreach(warning ${_msvc_warnings}) - set(${var} "${${var}} ${warning}") - endforeach() - endforeach() - elseif((CMAKE_COMPILER_IS_GNUCXX OR CMAKE_COMPILER_IS_CLANGXX) AND _gxx_warnings) - foreach(var ${_flag_vars}) - foreach(warning ${_gxx_warnings}) - if(NOT warning MATCHES "^-Wno-") - string(REPLACE "${warning}" "" ${var} "${${var}}") - string(REPLACE "-W" "-Wno-" warning "${warning}") - endif() - set(${var} "${${var}} ${warning}") - endforeach() - endforeach() - endif() - mshadow_clear_vars(_flag_vars _msvc_warnings _gxx_warnings) -endmacro() - -################################################################################################ -# Helper function get current definitions -# Usage: -# mshadow_get_current_definitions() -function(mshadow_get_current_definitions definitions_var) - get_property(current_definitions DIRECTORY PROPERTY COMPILE_DEFINITIONS) - set(result "") - - foreach(d ${current_definitions}) - list(APPEND result -D${d}) - endforeach() - - mshadow_list_unique(result) - set(${definitions_var} ${result} PARENT_SCOPE) -endfunction() - -################################################################################################ -# Helper function get current includes/definitions -# Usage: -# mshadow_get_current_cflags() -function(mshadow_get_current_cflags cflags_var) - get_property(current_includes DIRECTORY PROPERTY INCLUDE_DIRECTORIES) - mshadow_convert_absolute_paths(current_includes) - mshadow_get_current_definitions(cflags) - - foreach(i ${current_includes}) - list(APPEND cflags "-I${i}") - endforeach() - - mshadow_list_unique(cflags) - set(${cflags_var} ${cflags} PARENT_SCOPE) -endfunction() - -################################################################################################ -# Helper function to parse current linker libs into link directories, libflags and osx frameworks -# Usage: -# mshadow_parse_linker_libs( ) -function(mshadow_parse_linker_libs mshadow_LINKER_LIBS_variable folders_var flags_var frameworks_var) - - set(__unspec "") - set(__debug "") - set(__optimized "") - set(__framework "") - set(__varname "__unspec") - - # split libs into debug, optimized, unspecified and frameworks - foreach(list_elem ${${mshadow_LINKER_LIBS_variable}}) - if(list_elem STREQUAL "debug") - set(__varname "__debug") - elseif(list_elem STREQUAL "optimized") - set(__varname "__optimized") - elseif(list_elem MATCHES "^-framework[ \t]+([^ \t].*)") - list(APPEND __framework -framework ${CMAKE_MATCH_1}) - else() - list(APPEND ${__varname} ${list_elem}) - set(__varname "__unspec") - endif() - endforeach() - - # attach debug or optimized libs to unspecified according to current configuration - if(CMAKE_BUILD_TYPE MATCHES "Debug") - set(__libs ${__unspec} ${__debug}) - else() - 
set(__libs ${__unspec} ${__optimized}) - endif() - - set(libflags "") - set(folders "") - - # convert linker libraries list to link flags - foreach(lib ${__libs}) - if(TARGET ${lib}) - list(APPEND folders $) - list(APPEND libflags -l${lib}) - elseif(lib MATCHES "^-l.*") - list(APPEND libflags ${lib}) - elseif(IS_ABSOLUTE ${lib}) - get_filename_component(name_we ${lib} NAME_WE) - get_filename_component(folder ${lib} PATH) - - string(REGEX MATCH "^lib(.*)" __match ${name_we}) - list(APPEND libflags -l${CMAKE_MATCH_1}) - list(APPEND folders ${folder}) - else() - message(FATAL_ERROR "Logic error. Need to update cmake script") - endif() - endforeach() - - mshadow_list_unique(libflags folders) - - set(${folders_var} ${folders} PARENT_SCOPE) - set(${flags_var} ${libflags} PARENT_SCOPE) - set(${frameworks_var} ${__framework} PARENT_SCOPE) -endfunction() - -################################################################################################ -# Helper function to detect Darwin version, i.e. 10.8, 10.9, 10.10, .... -# Usage: -# mshadow_detect_darwin_version() -function(mshadow_detect_darwin_version output_var) - if(APPLE) - execute_process(COMMAND /usr/bin/sw_vers -productVersion - RESULT_VARIABLE __sw_vers OUTPUT_VARIABLE __sw_vers_out - ERROR_QUIET OUTPUT_STRIP_TRAILING_WHITESPACE) - - set(${output_var} ${__sw_vers_out} PARENT_SCOPE) - else() - set(${output_var} "" PARENT_SCOPE) - endif() -endfunction() - -################################################################################################ -# Convenient command to setup source group for IDEs that support this feature (VS, XCode) -# Usage: -# caffe_source_group( GLOB[_RECURSE] ) -function(mshadow_source_group group) - cmake_parse_arguments(CAFFE_SOURCE_GROUP "" "" "GLOB;GLOB_RECURSE" ${ARGN}) - if(CAFFE_SOURCE_GROUP_GLOB) - file(GLOB srcs1 ${CAFFE_SOURCE_GROUP_GLOB}) - source_group(${group} FILES ${srcs1}) - endif() - - if(CAFFE_SOURCE_GROUP_GLOB_RECURSE) - file(GLOB_RECURSE srcs2 ${CAFFE_SOURCE_GROUP_GLOB_RECURSE}) - source_group(${group} FILES ${srcs2}) - endif() -endfunction() \ No newline at end of file diff --git a/3rdparty/mshadow/cmake/mshadow.cmake b/3rdparty/mshadow/cmake/mshadow.cmake deleted file mode 100644 index 1ef76988d8d0..000000000000 --- a/3rdparty/mshadow/cmake/mshadow.cmake +++ /dev/null @@ -1,91 +0,0 @@ -set(mshadow_LINKER_LIBS "") - -set(BLAS "Open" CACHE STRING "Selected BLAS library") -set_property(CACHE BLAS PROPERTY STRINGS "Atlas;Open;MKL") - -if(DEFINED USE_BLAS) - set(BLAS "${USE_BLAS}") -else() - if(USE_MKL_IF_AVAILABLE) - if(NOT MKL_FOUND) - find_package(MKL) - endif() - if(MKL_FOUND) - set(BLAS "MKL") - endif() - endif() -endif() - -if(BLAS STREQUAL "Atlas" OR BLAS STREQUAL "atlas") - find_package(Atlas REQUIRED) - include_directories(SYSTEM ${Atlas_INCLUDE_DIR}) - list(APPEND mshadow_LINKER_LIBS ${Atlas_LIBRARIES}) - add_definitions(-DMSHADOW_USE_CBLAS=1) - add_definitions(-DMSHADOW_USE_MKL=0) -elseif(BLAS STREQUAL "Open" OR BLAS STREQUAL "open") - find_package(OpenBLAS REQUIRED) - include_directories(SYSTEM ${OpenBLAS_INCLUDE_DIR}) - list(APPEND mshadow_LINKER_LIBS ${OpenBLAS_LIB}) - add_definitions(-DMSHADOW_USE_CBLAS=1) - add_definitions(-DMSHADOW_USE_MKL=0) -elseif(BLAS STREQUAL "MKL" OR BLAS STREQUAL "mkl") - find_package(MKL REQUIRED) - include_directories(SYSTEM ${MKL_INCLUDE_DIR}) - list(APPEND mshadow_LINKER_LIBS ${MKL_LIBRARIES}) - add_definitions(-DMSHADOW_USE_CBLAS=0) - add_definitions(-DMSHADOW_USE_MKL=1) -elseif(BLAS STREQUAL "apple") - find_package(Accelerate REQUIRED) - 
include_directories(SYSTEM ${Accelerate_INCLUDE_DIR}) - list(APPEND mshadow_LINKER_LIBS ${Accelerate_LIBRARIES}) - add_definitions(-DMSHADOW_USE_MKL=0) - add_definitions(-DMSHADOW_USE_CBLAS=1) -endif() - -if(SUPPORT_MSSE2) - add_definitions(-DMSHADOW_USE_SSE=1) -else() - add_definitions(-DMSHADOW_USE_SSE=0) -endif() - -if(NOT DEFINED SUPPORT_F16C AND NOT MSVC) - check_cxx_compiler_flag("-mf16c" COMPILER_SUPPORT_MF16C) - if(CMAKE_SYSTEM_NAME STREQUAL "Linux") - execute_process(COMMAND cat /proc/cpuinfo - COMMAND grep flags - COMMAND grep f16c - OUTPUT_VARIABLE CPU_SUPPORT_F16C) - elseif(CMAKE_SYSTEM_NAME STREQUAL "Darwin") - execute_process(COMMAND sysctl -a - COMMAND grep machdep.cpu.features - COMMAND grep F16C - OUTPUT_VARIABLE CPU_SUPPORT_F16C) - endif() - if(NOT CPU_SUPPORT_F16C) - message("CPU does not support F16C instructions") - endif() - if(CPU_SUPPORT_F16C AND COMPILER_SUPPORT_MF16C) - set(SUPPORT_F16C TRUE) - endif() -endif() - -if(SUPPORT_F16C) - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mf16c") -else() - add_definitions(-DMSHADOW_USE_F16C=0) -endif() - -if(USE_CUDA) - find_package(CUDA 5.5 QUIET) - find_cuda_helper_libs(curand) - if(NOT CUDA_FOUND) - message(FATAL_ERROR "-- CUDA is disabled.") - endif() - add_definitions(-DMSHADOW_USE_CUDA=1) - add_definitions(-DMSHADOW_FORCE_STREAM) - include_directories(SYSTEM ${CUDA_INCLUDE_DIRS}) - list(APPEND mshadow_LINKER_LIBS ${CUDA_CUDART_LIBRARY} - ${CUDA_curand_LIBRARY} ${CUDA_CUBLAS_LIBRARIES}) -else() - add_definitions(-DMSHADOW_USE_CUDA=0) -endif() diff --git a/3rdparty/mshadow/cmake/mshadowUtils.cmake b/3rdparty/mshadow/cmake/mshadowUtils.cmake deleted file mode 100644 index d4b8bfc89b7a..000000000000 --- a/3rdparty/mshadow/cmake/mshadowUtils.cmake +++ /dev/null @@ -1,2 +0,0 @@ -include("${CMAKE_CURRENT_LIST_DIR}/Utils.cmake") - diff --git a/CMakeLists.txt b/CMakeLists.txt index 6d329f5f1079..9e4ef25e5af4 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,4 +1,4 @@ -cmake_minimum_required(VERSION 3.0.2) +cmake_minimum_required(VERSION 3.13) # workaround to store CMAKE_CROSSCOMPILING because is getting reset by the project command if(CMAKE_CROSSCOMPILING) @@ -18,39 +18,57 @@ endif() include(${CMAKE_CURRENT_SOURCE_DIR}/cmake/Utils.cmake) +include(CMakeDependentOption) #Some things have order. 
This must be put in front alone -mxnet_option(USE_CUDA "Build with CUDA support" ON) -mxnet_option(USE_OLDCMAKECUDA "Build with old cmake cuda" OFF) -mxnet_option(USE_NCCL "Use NVidia NCCL with CUDA" OFF) -mxnet_option(USE_OPENCV "Build with OpenCV support" ON) -mxnet_option(USE_OPENMP "Build with Openmp support" ON) -mxnet_option(USE_CUDNN "Build with cudnn support" ON) # one could set CUDNN_ROOT for search path -mxnet_option(USE_SSE "Build with x86 SSE instruction support" ON IF NOT ARM) -mxnet_option(USE_F16C "Build with x86 F16C instruction support" ON) # autodetects support if ON -mxnet_option(USE_LAPACK "Build with lapack support" ON) -mxnet_option(USE_MKL_IF_AVAILABLE "Use MKL if found" ON) -mxnet_option(USE_MKLDNN "Build with MKL-DNN support" ON IF USE_MKL_IF_AVAILABLE AND (NOT APPLE) AND (NOT MSVC) AND (CMAKE_HOST_SYSTEM_PROCESSOR STREQUAL "x86_64") AND (NOT CMAKE_CROSSCOMPILING)) -mxnet_option(USE_OPERATOR_TUNING "Enable auto-tuning of operators" ON IF NOT MSVC) -mxnet_option(USE_GPERFTOOLS "Build with GPerfTools support" OFF) -mxnet_option(USE_JEMALLOC "Build with Jemalloc support" ON) -mxnet_option(USE_DIST_KVSTORE "Build with DIST_KVSTORE support" OFF) -mxnet_option(USE_PLUGINS_WARPCTC "Use WARPCTC Plugins" OFF) -mxnet_option(USE_PLUGIN_CAFFE "Use Caffe Plugin" OFF) -mxnet_option(USE_CPP_PACKAGE "Build C++ Package" OFF) -mxnet_option(USE_MXNET_LIB_NAMING "Use MXNet library naming conventions." ON) -mxnet_option(USE_GPROF "Compile with gprof (profiling) flag" OFF) -mxnet_option(USE_CXX14_IF_AVAILABLE "Build with C++14 if the compiler supports it" OFF) -mxnet_option(USE_VTUNE "Enable use of Intel Amplifier XE (VTune)" OFF) # one could set VTUNE_ROOT for search path -mxnet_option(USE_TVM_OP "Enable use of TVM operator build system." OFF) -mxnet_option(ENABLE_CUDA_RTC "Build with CUDA runtime compilation support" ON) -mxnet_option(BUILD_CPP_EXAMPLES "Build cpp examples" ON) -mxnet_option(INSTALL_EXAMPLES "Install the example source files." OFF) -mxnet_option(USE_SIGNAL_HANDLER "Print stack traces on segfaults." ON) -mxnet_option(USE_TENSORRT "Enable inference optimization with TensorRT." OFF) -mxnet_option(USE_ASAN "Enable Clang/GCC ASAN sanitizers." OFF) -mxnet_option(ENABLE_TESTCOVERAGE "Enable compilation with test coverage metric output" OFF) -mxnet_option(USE_INT64_TENSOR_SIZE "Use int64_t to represent the total number of elements in a tensor" OFF) -mxnet_option(BUILD_CYTHON_MODULES "Build cython modules." OFF) +option(USE_CUDA "Build with CUDA support" ON) +set(MXNET_CUDA_ARCH "Auto" CACHE STRING "Target NVIDIA GPU achitecture. +Format: Auto | Common | All | LIST(ARCH_AND_PTX ...) +- \"Auto\" detects local machine GPU compute arch at runtime. +- \"Common\" and \"All\" cover common and entire subsets of architectures +- ARCH_AND_PTX : NAME | NUM.NUM | NUM.NUM(NUM.NUM) | NUM.NUM+PTX +- NAME: Fermi Kepler Maxwell Kepler+Tegra Kepler+Tesla Maxwell+Tegra Pascal Volta Turing +- NUM: Any number. 
Only those pairs are currently accepted by NVCC though: + 2.0 2.1 3.0 3.2 3.5 3.7 5.0 5.2 5.3 6.0 6.2 7.0 7.2 7.5") +option(USE_NCCL "Use NVidia NCCL with CUDA" OFF) +option(USE_OPENCV "Build with OpenCV support" ON) +option(USE_OPENMP "Build with Openmp support" ON) +cmake_dependent_option(USE_CUDNN "Build with cudnn support" ON "USE_CUDA" OFF) # one could set CUDNN_ROOT for search path +cmake_dependent_option(USE_SSE "Build with x86 SSE instruction support" ON "NOT ARM" OFF) +option(USE_F16C "Build with x86 F16C instruction support" ON) # autodetects support if ON +option(USE_LAPACK "Build with lapack support" ON) +option(USE_MKL_IF_AVAILABLE "Use MKL if found" ON) +if(USE_MKL_IF_AVAILABLE AND (NOT APPLE) AND (NOT MSVC) AND (CMAKE_HOST_SYSTEM_PROCESSOR STREQUAL "x86_64") AND (NOT CMAKE_CROSSCOMPILING)) + option(USE_MKLDNN "Build with MKL-DNN support" ON) +else() + option(USE_MKLDNN "Build with MKL-DNN support" OFF) +endif() +if(NOT MSVC) + option(USE_OPERATOR_TUNING "Enable auto-tuning of operators" ON) +else() + option(USE_OPERATOR_TUNING "Enable auto-tuning of operators" OFF) +endif() +option(USE_GPERFTOOLS "Build with GPerfTools support" OFF) +option(USE_JEMALLOC "Build with Jemalloc support" ON) +option(USE_DIST_KVSTORE "Build with DIST_KVSTORE support" OFF) +option(USE_PLUGINS_WARPCTC "Use WARPCTC Plugins" OFF) +option(USE_PLUGIN_CAFFE "Use Caffe Plugin" OFF) +option(USE_CPP_PACKAGE "Build C++ Package" OFF) +option(USE_MXNET_LIB_NAMING "Use MXNet library naming conventions." ON) +option(USE_GPROF "Compile with gprof (profiling) flag" OFF) +option(USE_CXX14_IF_AVAILABLE "Build with C++14 if the compiler supports it" OFF) +option(USE_VTUNE "Enable use of Intel Amplifier XE (VTune)" OFF) # one could set VTUNE_ROOT for search path +option(USE_TVM_OP "Enable use of TVM operator build system." OFF) +option(ENABLE_CUDA_RTC "Build with CUDA runtime compilation support" ON) +option(BUILD_CPP_EXAMPLES "Build cpp examples" ON) +option(INSTALL_EXAMPLES "Install the example source files." OFF) +option(USE_SIGNAL_HANDLER "Print stack traces on segfaults." ON) +option(USE_TENSORRT "Enable inference optimization with TensorRT." OFF) +option(USE_ASAN "Enable Clang/GCC ASAN sanitizers." OFF) +option(ENABLE_TESTCOVERAGE "Enable compilation with test coverage metric output" OFF) +option(USE_INT64_TENSOR_SIZE "Use int64_t to represent the total number of elements in a tensor" OFF) +option(BUILD_CYTHON_MODULES "Build cython modules." OFF) +cmake_dependent_option(USE_SPLIT_ARCH_DLL "Build a separate DLL for each Cuda arch (Windows only)." 
ON "MSVC" OFF) + message(STATUS "CMAKE_CROSSCOMPILING ${CMAKE_CROSSCOMPILING}") message(STATUS "CMAKE_HOST_SYSTEM_PROCESSOR ${CMAKE_HOST_SYSTEM_PROCESSOR}") @@ -62,31 +80,29 @@ if(USE_TVM_OP) add_definitions(-DMXNET_USE_TVM_OP=1) endif() -if(USE_CUDA AND NOT USE_OLDCMAKECUDA) - message(STATUS "CMake version '${CMAKE_VERSION}' using generator '${CMAKE_GENERATOR}'") - if( - ( - (${CMAKE_GENERATOR} MATCHES "Visual Studio.*") - OR (${CMAKE_GENERATOR} MATCHES "Xcode.*") - OR (${CMAKE_GENERATOR} STREQUAL "Unix Makefiles") - ) AND ( - (${CMAKE_VERSION} VERSION_GREATER "3.9.0") OR (${CMAKE_VERSION} VERSION_EQUAL "3.9.0") - ) - ) - set(FIRST_CUDA TRUE) - project(mxnet C CXX CUDA) - else() - set(FIRST_CUDA FALSE) - set(USE_OLDCMAKECUDA TRUE) - project(mxnet C CXX) +message(STATUS "CMake version '${CMAKE_VERSION}' using generator '${CMAKE_GENERATOR}'") +project(mxnet C CXX) +if(USE_CUDA) + cmake_minimum_required(VERSION 3.13.2) # CUDA 10 (Turing) detection available starting 3.13.2 + enable_language(CUDA) + set(CMAKE_CUDA_STANDARD 11) + include(CheckCXXCompilerFlag) + if(USE_CXX14_IF_AVAILABLE) + check_cxx_compiler_flag("-std=c++14" SUPPORT_CXX14) + if (SUPPORT_CXX14) + set(CMAKE_CUDA_STANDARD 14) + endif() endif() -else() - project(mxnet C CXX) + set(CMAKE_CUDA_STANDARD_REQUIRED ON) endif() +if(UNIX) + set(CMAKE_POSITION_INDEPENDENT_CODE ON) +endif() if(MSVC) set(SYSTEM_ARCHITECTURE x86_64) + enable_language(ASM_MASM) else() execute_process(COMMAND uname -m COMMAND tr -d '\n' OUTPUT_VARIABLE SYSTEM_ARCHITECTURE) endif() @@ -102,7 +118,8 @@ endif() #Switch off modern thread local for dmlc-core, please see: https://github.com/dmlc/dmlc-core/issues/571#issuecomment-543467484 add_definitions(-DDMLC_MODERN_THREAD_LOCAL=0) - +# disable stack trace in exception by default. +add_definitions(-DDMLC_LOG_STACK_TRACE_SIZE=0) if(MSVC) add_definitions(-DWIN32_LEAN_AND_MEAN) @@ -119,7 +136,7 @@ if(MSVC) endif() set(CMAKE_C_FLAGS "/MP") set(CMAKE_CXX_FLAGS "${CMAKE_C_FLAGS} /bigobj") -else(MSVC) +else() include(CheckCXXCompilerFlag) if(USE_CXX14_IF_AVAILABLE) check_cxx_compiler_flag("-std=c++14" SUPPORT_CXX14) @@ -132,6 +149,7 @@ else(MSVC) check_cxx_compiler_flag("-msse3" SUPPORT_MSSE3) check_cxx_compiler_flag("-msse2" SUPPORT_MSSE2) else() + set(SUPPORT_MSSE3 FALSE) set(SUPPORT_MSSE2 FALSE) endif() # For cross complication, turn off flag if target device does not support it @@ -148,7 +166,6 @@ else(MSVC) else() add_definitions(-DMSHADOW_USE_F16C=0) endif() - set(CMAKE_POSITION_INDEPENDENT_CODE ON) set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -Wall -Wno-unknown-pragmas -Wno-sign-compare") if ("${CMAKE_CXX_COMPILER_ID}" MATCHES ".*Clang$") set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -Wno-braced-scalar-init") @@ -166,8 +183,12 @@ else(MSVC) endif() if(SUPPORT_MSSE3) set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -msse3") + add_definitions(-DMSHADOW_USE_SSE=1) elseif(SUPPORT_MSSE2) set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -msse2") + add_definitions(-DMSHADOW_USE_SSE=1) + else() + add_definitions(-DMSHADOW_USE_SSE=0) endif() set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${CMAKE_C_FLAGS}") if(SUPPORT_CXX14) @@ -240,7 +261,7 @@ if(USE_TENSORRT) endif() # please note that when you enable this, you might run into an linker not being able to work properly due to large code injection. -# you can find more information here https://github.com/apache/incubator-mxnet/issues/15971 +# you can find more information here https://github.com/apache/incubator-mxnet/issues/15971 if(ENABLE_TESTCOVERAGE) message(STATUS "Compiling with test coverage support enabled. 
This will result in additional files being written to your source directory!") find_program( GCOV_PATH gcov ) @@ -258,67 +279,28 @@ endif() if(USE_MKLDNN) # CPU architecture (e.g., C5) can't run on another architecture (e.g., g3). if(MSVC) - set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} /EHsc") - set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} /EHsc /Gy") + set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} /EHsc /MT") + set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} /EHsc /Gy /MT") endif() set(MKLDNN_BUILD_TESTS OFF CACHE INTERNAL "" FORCE) set(MKLDNN_BUILD_EXAMPLES OFF CACHE INTERNAL "" FORCE) set(MKLDNN_ARCH_OPT_FLAGS "" CACHE INTERNAL "" FORCE) - set(MKLDNN_USE_MKL NONE CACHE INTERNAL "" FORCE) set(MKLDNN_ENABLE_JIT_PROFILING OFF CACHE INTERNAL "" FORCE) + set(MKLDNN_LIBRARY_TYPE STATIC CACHE INTERNAL "" FORCE) add_subdirectory(3rdparty/mkldnn) include_directories(3rdparty/mkldnn/include) include_directories(${PROJECT_BINARY_DIR}/3rdparty/mkldnn/include) add_definitions(-DMXNET_USE_MKLDNN=1) - list(APPEND mxnet_LINKER_LIBS mkldnn) + list(APPEND mxnet_LINKER_LIBS dnnl) endif() # Allow Cuda compiles outside of src tree to find things in 'src' and 'include' include_directories(${CMAKE_CURRENT_SOURCE_DIR}/include) include_directories(${CMAKE_CURRENT_SOURCE_DIR}/src) -if(USE_CUDA) - find_package(CUDA REQUIRED) - add_definitions(-DMSHADOW_USE_CUDA=1) - if(FIRST_CUDA AND (NOT USE_OLDCMAKECUDA)) - if(NOT CUDA_TOOLSET) - set(CUDA_TOOLSET "${CUDA_VERSION_STRING}") - endif() - else() - set(FIRST_CUDA FALSE) - endif() - if(USE_NCCL) - find_package(NCCL) - if(NCCL_FOUND) - include_directories(${NCCL_INCLUDE_DIRS}) - list(APPEND mxnet_LINKER_LIBS ${NCCL_LIBRARIES}) - else() - message(WARNING "Could not find NCCL libraries") - endif() - endif() - if(UNIX) - find_package(NVTX) - if(NVTX_FOUND) - include_directories(${NVTX_INCLUDE_DIRS}) - list(APPEND mxnet_LINKER_LIBS ${NVTX_LIBRARIES}) - add_definitions(-DMXNET_USE_NVTX=1) - else() - message(WARNING "Could not find NVTX libraries") - endif() - endif() -else() - add_definitions(-DMSHADOW_USE_CUDA=0) -endif() - -if(NCCL_FOUND) - add_definitions(-DMXNET_USE_NCCL=1) -else() - add_definitions(-DMXNET_USE_NCCL=0) -endif() - if (USE_INT64_TENSOR_SIZE) message(STATUS "Using 64-bit integer for tensor size") add_definitions(-DMSHADOW_INT64_TENSOR_SIZE=1) @@ -327,21 +309,6 @@ else() endif() include(cmake/ChooseBlas.cmake) -if(USE_CUDA AND FIRST_CUDA) - include(3rdparty/mshadow/cmake/Utils.cmake) - include(cmake/FirstClassLangCuda.cmake) - include_directories(${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES}) -else() - if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/3rdparty/mshadow/cmake) - include(3rdparty/mshadow/cmake/mshadow.cmake) - include(3rdparty/mshadow/cmake/Utils.cmake) - include(3rdparty/mshadow/cmake/Cuda.cmake) - else() - include(mshadowUtils) - include(Cuda) - include(mshadow) - endif() -endif() if(USE_ASAN) set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -fno-omit-frame-pointer -fsanitize=address") @@ -436,6 +403,16 @@ endif() # ---[ OpenMP if(USE_OPENMP) + + function(load_omp) + # Intel/llvm OpenMP: https://github.com/llvm-mirror/openmp + set(OPENMP_STANDALONE_BUILD TRUE) + set(LIBOMP_ENABLE_SHARED TRUE) + set(CMAKE_BUILD_TYPE Release) + set(OPENMP_ENABLE_LIBOMPTARGET OFF CACHE BOOL "LLVM OpenMP offloading support") # Requires CMP0077 CMake 3.13 + add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/3rdparty/openmp) + endfunction() + find_package(OpenMP REQUIRED) # This should build on Windows, but there's some problem and I don't have a Windows box, so # 
could a Windows user please fix? @@ -443,11 +420,7 @@ if(USE_OPENMP) AND SYSTEM_ARCHITECTURE STREQUAL "x86_64" AND NOT MSVC AND NOT CMAKE_CROSSCOMPILING) - - # Intel/llvm OpenMP: https://github.com/llvm-mirror/openmp - set(OPENMP_STANDALONE_BUILD TRUE) - set(LIBOMP_ENABLE_SHARED TRUE) - add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/3rdparty/openmp) + load_omp() list(REMOVE_ITEM mxnet_LINKER_LIBS iomp5) list(APPEND mxnet_LINKER_LIBS omp) if(UNIX) @@ -502,13 +475,15 @@ add_subdirectory(${GTEST_ROOT}) find_package(GTest REQUIRED) # cudnn detection -if(USE_CUDNN AND USE_CUDA) - detect_cuDNN() - if(HAVE_CUDNN) +if(USE_CUDNN) + find_package(CUDNN) + if(CUDNN_FOUND) add_definitions(-DUSE_CUDNN) include_directories(SYSTEM ${CUDNN_INCLUDE}) list(APPEND mxnet_LINKER_LIBS ${CUDNN_LIBRARY}) - add_definitions(-DMSHADOW_USE_CUDNN=1) + add_definitions(-DMSHADOW_USE_CUDNN=1) + else() + set(USE_CUDNN OFF) endif() endif() @@ -516,9 +491,7 @@ if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/3rdparty/dmlc-core/cmake) add_subdirectory("3rdparty/dmlc-core") endif() -if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/3rdparty/mshadow/cmake) - add_subdirectory("3rdparty/mshadow") -endif() +add_subdirectory("3rdparty/mshadow") FILE(GLOB_RECURSE SOURCE "src/*.cc" "src/*.h" "include/*.h") FILE(GLOB_RECURSE CUDA "src/*.cu" "src/*.cuh") @@ -618,61 +591,63 @@ if(MSVC) endif() if(USE_CUDA) - if(FIRST_CUDA) - mshadow_select_nvcc_arch_flags(NVCC_FLAGS_ARCH) - string(REPLACE ";" " " NVCC_FLAGS_ARCH "${NVCC_FLAGS_ARCH}") - set(CMAKE_CUDA_FLAGS "${NVCC_FLAGS_ARCH}") - list(APPEND mxnet_LINKER_LIBS cublas cufft cusolver curand) - if(ENABLE_CUDA_RTC) - list(APPEND mxnet_LINKER_LIBS nvrtc cuda) - add_definitions(-DMXNET_ENABLE_CUDA_RTC=1) + # CUDA_SELECT_NVCC_ARCH_FLAGS is not deprecated, though part of deprecated + # FindCUDA https://gitlab.kitware.com/cmake/cmake/issues/19199 + include(${CMAKE_ROOT}/Modules/FindCUDA/select_compute_arch.cmake) + CUDA_SELECT_NVCC_ARCH_FLAGS(CUDA_ARCH_FLAGS ${MXNET_CUDA_ARCH}) + message("-- CUDA: Using the following NVCC architecture flags ${CUDA_ARCH_FLAGS}") + set(arch_code_list) + foreach(arch_str ${CUDA_ARCH_FLAGS}) + if((arch_str MATCHES ".*sm_[0-9]+")) + string( REGEX REPLACE ".*sm_([0-9]+)" "\\1" arch_code ${arch_str} ) + list(APPEND arch_code_list ${arch_code}) endif() - list(APPEND SOURCE ${CUDA}) - add_definitions(-DMXNET_USE_CUDA=1) - link_directories(${CUDA_TOOLKIT_ROOT_DIR}/lib64) - else() - list(APPEND CUDA_INCLUDE_DIRS ${INCLUDE_DIRECTORIES}) - # define preprocessor macro so that we will not include the generated forcelink header - if(ENABLE_CUDA_RTC) + endforeach() + + string(REPLACE ";" " " CUDA_ARCH_FLAGS_SPACES "${CUDA_ARCH_FLAGS}") + + + find_package(CUDAToolkit REQUIRED cublas cufft cusolver curand + OPTIONAL_COMPONENTS nvToolsExt nvrtc) + + list(APPEND mxnet_LINKER_LIBS CUDA::cudart CUDA::cublas CUDA::cufft CUDA::cusolver CUDA::curand) + if(ENABLE_CUDA_RTC) + if(CUDA_nvrtc_LIBRARY) + list(APPEND mxnet_LINKER_LIBS CUDA::nvrtc cuda) add_definitions(-DMXNET_ENABLE_CUDA_RTC=1) + else() + message(FATAL_ERROR "ENABLE_CUDA_RTC=ON, but failed to find NVRTC. CMake will exit." 
) endif() - # Create '.cmake' files for cuda compiles given definitions added thus far - mshadow_cuda_compile(cuda_objs ${CUDA}) - if(MSVC) - if(ENABLE_CUDA_RTC) - FIND_LIBRARY(CUDA_nvrtc_LIBRARY nvrtc "${CUDA_TOOLKIT_ROOT_DIR}/lib/x64" "${CUDA_TOOLKIT_ROOT_DIR}/lib/win32") - list(APPEND mxnet_LINKER_LIBS ${CUDA_nvrtc_LIBRARY}) - set(CUDA_cuda_LIBRARY "${CUDA_nvrtc_LIBRARY}/../cuda.lib") - list(APPEND mxnet_LINKER_LIBS ${CUDA_cuda_LIBRARY}) - endif() - FIND_LIBRARY(CUDA_cufft_LIBRARY nvrtc "${CUDA_TOOLKIT_ROOT_DIR}/lib/x64" "${CUDA_TOOLKIT_ROOT_DIR}/lib/win32") - list(APPEND mxnet_LINKER_LIBS "${CUDA_cufft_LIBRARY}/../cufft.lib") # For fft operator - FIND_LIBRARY(CUDA_cusolver_LIBRARY nvrtc "${CUDA_TOOLKIT_ROOT_DIR}/lib/x64" "${CUDA_TOOLKIT_ROOT_DIR}/lib/win32") - list(APPEND mxnet_LINKER_LIBS "${CUDA_cusolver_LIBRARY}/../cusolver.lib") # For cusolver - link_directories(${CUDA_TOOLKIT_ROOT_DIR}/lib/win32) - link_directories(${CUDA_TOOLKIT_ROOT_DIR}/lib/x64) - else(MSVC) - list(APPEND mxnet_LINKER_LIBS cufft cusolver) - if(ENABLE_CUDA_RTC) - list(APPEND mxnet_LINKER_LIBS nvrtc cuda) - endif() - link_directories("${CUDA_TOOLKIT_ROOT_DIR}/lib64") + endif() + list(APPEND SOURCE ${CUDA}) + add_definitions(-DMXNET_USE_CUDA=1) + add_definitions(-DMSHADOW_USE_CUDA=1) + add_definitions(-DMSHADOW_FORCE_STREAM) + + if(USE_NCCL) + find_package(NCCL) + if(NCCL_FOUND) + include_directories(${NCCL_INCLUDE_DIRS}) + list(APPEND mxnet_LINKER_LIBS ${NCCL_LIBRARIES}) + add_definitions(-DMXNET_USE_NCCL=1) + else() + add_definitions(-DMXNET_USE_NCCL=0) + message(WARNING "Could not find NCCL libraries") endif() - list(APPEND SOURCE ${cuda_objs} ${CUDA}) - add_definitions(-DMXNET_USE_CUDA=1) - if(CUDA_LIBRARY_PATH) - if(IS_CONTAINER_BUILD) - # In case of building on a production-like build container which may not have Cuda installed - if(NOT CMAKE_SYSTEM_HAS_CUDA) - # Assuming building in a container that doesn't have CUDA installed (ie CPU-only build machine) - # so use the stub cuda driver shared library - if(EXISTS ${CUDA_LIBRARY_PATH}/stubs/libcuda.so) - link_directories(${CUDA_LIBRARY_PATH}/stubs) - endif() - endif() - endif() + endif() + if(UNIX) + if(CUDA_nvToolsExt_LIBRARY) + list(APPEND mxnet_LINKER_LIBS CUDA::nvToolsExt) + add_definitions(-DMXNET_USE_NVTX=1) + else() + message("Building without NVTX support.") endif() - endif() + endif() + + include_directories(${CUDAToolkit_INCLUDE_DIRS}) + link_directories(${CUDAToolkit_LIBRARY_DIR}) +else() + add_definitions(-DMSHADOW_USE_CUDA=0) endif() # unsupported: if caffe is a subdirectory of mxnet, load its CMakeLists.txt as well @@ -699,10 +674,11 @@ else() endif() -add_library(sample_lib SHARED ${CMAKE_CURRENT_SOURCE_DIR}/example/lib_api/mylib.cc) +add_library(sample_lib SHARED ${CMAKE_CURRENT_SOURCE_DIR}/example/extensions/lib_custom_op/gemm_lib.cc) target_include_directories(sample_lib PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/include/mxnet) set(MXNET_INSTALL_TARGETS mxnet) if(UNIX) + string(APPEND CMAKE_CUDA_FLAGS "${CUDA_ARCH_FLAGS_SPACES}") # Create dummy file since we want an empty shared library before linking set(DUMMY_SOURCE ${CMAKE_BINARY_DIR}/dummy.c) file(WRITE ${DUMMY_SOURCE} "") @@ -714,32 +690,66 @@ if(UNIX) target_link_libraries(mxnet_static PUBLIC ${CMAKE_DL_LIBS}) target_compile_options(sample_lib PUBLIC -shared) set_target_properties(mxnet_static PROPERTIES OUTPUT_NAME mxnet) -else() - add_library(mxnet SHARED ${SOURCE}) +elseif(MSVC) target_compile_options(sample_lib PUBLIC /LD) set_target_properties(sample_lib PROPERTIES PREFIX "lib") -endif() 
-if(USE_CUDA) - if(FIRST_CUDA AND MSVC) - target_compile_options(mxnet PUBLIC "$<$:-Xcompiler=-MTd -Gy>") - target_compile_options(mxnet PUBLIC "$<$:-Xcompiler=-MT -Gy>") + if(USE_CUDA) + if(MSVC) + if(USE_SPLIT_ARCH_DLL) + add_executable(gen_warp tools/windowsbuild/gen_warp.cpp) + add_library(mxnet SHARED tools/windowsbuild/warp_dll.cpp ${CMAKE_BINARY_DIR}/warp_gen_cpp.cpp + ${CMAKE_BINARY_DIR}/warp_gen.asm) + target_link_libraries(mxnet PRIVATE cudart Shlwapi) + list(GET arch_code_list 0 mxnet_first_arch) + foreach(arch ${arch_code_list}) + add_library(mxnet_${arch} SHARED ${SOURCE}) + target_compile_options( + mxnet_${arch} + PRIVATE + "$<$:--gpu-architecture=compute_${arch}>" + ) + target_compile_options( + mxnet_${arch} + PRIVATE + "$<$:--gpu-code=sm_${arch},compute_${arch}>" + ) + target_compile_options( + mxnet_${arch} + PRIVATE "$<$,$>:-Xcompiler=-MTd -Gy /bigobj>") + target_compile_options( + mxnet_${arch} + PRIVATE "$<$,$>:-Xcompiler=-MT -Gy /bigobj>") + endforeach() + + add_custom_command( + OUTPUT ${CMAKE_BINARY_DIR}/warp_gen_cpp.cpp ${CMAKE_BINARY_DIR}/warp_gen.asm + COMMAND gen_warp $ WORKING_DIRECTORY ${CMAKE_BINARY_DIR}/ DEPENDS $) + else(USE_SPLIT_ARCH_DLL) + string(REPLACE ";" " " NVCC_FLAGS_ARCH "${NVCC_FLAGS_ARCH}") + set(CMAKE_CUDA_FLAGS "${CUDA_ARCH_FLAGS_SPACES}") + add_library(mxnet SHARED ${SOURCE}) + target_compile_options( + mxnet + PRIVATE "$<$,$>:-Xcompiler=-MTd -Gy /bigobj>") + target_compile_options( + mxnet + PRIVATE "$<$,$>:-Xcompiler=-MT -Gy /bigobj>") + + endif(USE_SPLIT_ARCH_DLL) + else() + add_library(mxnet SHARED ${SOURCE}) + endif() + else() + add_library(mxnet SHARED ${SOURCE}) endif() + endif() + if(USE_DIST_KVSTORE) if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/3rdparty/ps-lite/CMakeLists.txt) add_subdirectory("3rdparty/ps-lite") list(APPEND pslite_LINKER_LIBS pslite protobuf) - target_link_libraries(mxnet PUBLIC debug ${pslite_LINKER_LIBS_DEBUG}) - target_link_libraries(mxnet PUBLIC optimized ${pslite_LINKER_LIBS_RELEASE}) - if(CMAKE_BUILD_TYPE STREQUAL "Debug") - list(APPEND mxnet_LINKER_LIBS ${pslite_LINKER_LIBS_DEBUG}) - else() - list(APPEND mxnet_LINKER_LIBS ${pslite_LINKER_LIBS_RELEASE}) - endif() - target_link_libraries(mxnet PUBLIC debug ${pslite_LINKER_LIBS_DEBUG}) - target_link_libraries(mxnet PUBLIC optimized ${pslite_LINKER_LIBS_RELEASE}) - else() set(pslite_LINKER_LIBS protobuf zmq-static) endif() @@ -762,8 +772,8 @@ if(USE_TVM_OP) endif() set(TVM_OP_COMPILE_OPTIONS "-o${CMAKE_CURRENT_BINARY_DIR}/libtvmop.so" "--config" "${CMAKE_CURRENT_BINARY_DIR}/tvmop.conf") - if(CUDA_ARCH_BIN) - set(TVM_OP_COMPILE_OPTIONS "${TVM_OP_COMPILE_OPTIONS}" "--cuda-arch" "${CUDA_ARCH_BIN}") + if(USE_CUDA) + set(TVM_OP_COMPILE_OPTIONS "${TVM_OP_COMPILE_OPTIONS}" "--cuda-arch" "\"${CUDA_ARCH_FLAGS}\"") endif() add_custom_command(TARGET mxnet POST_BUILD COMMAND ${CMAKE_COMMAND} -E env @@ -773,13 +783,24 @@ if(USE_TVM_OP) ) endif() -target_link_libraries(mxnet PUBLIC ${mxnet_LINKER_LIBS}) - if(USE_PLUGINS_WARPCTC) - target_link_libraries(mxnet PUBLIC debug ${WARPCTC_LIB_DEBUG}) - target_link_libraries(mxnet PUBLIC optimized ${WARPCTC_LIB_RELEASE}) + list(APPEND mxnet_LINKER_LIBS ${WARPCTC_LIB}) endif() +if(MSVC) + if(USE_SPLIT_ARCH_DLL AND USE_CUDA) + foreach(arch ${arch_code_list}) + target_link_libraries(mxnet_${arch} PUBLIC ${mxnet_LINKER_LIBS}) + target_link_libraries(mxnet_${arch} PUBLIC dmlc) + endforeach() + else() + target_link_libraries(mxnet PUBLIC ${mxnet_LINKER_LIBS}) + target_link_libraries(mxnet PUBLIC dmlc) + endif() +else() + 
target_link_libraries(mxnet PUBLIC ${mxnet_LINKER_LIBS}) + target_link_libraries(mxnet PUBLIC dmlc) +endif() if(USE_OPENCV AND OpenCV_VERSION_MAJOR GREATER 2) add_executable(im2rec "tools/im2rec.cc") @@ -799,7 +820,6 @@ else() is required for im2rec, im2rec will not be available") endif() -target_link_libraries(mxnet PUBLIC dmlc) if(MSVC AND USE_MXNET_LIB_NAMING) set_target_properties(mxnet PROPERTIES OUTPUT_NAME "libmxnet") diff --git a/ci/build_windows.py b/ci/build_windows.py index ce77c316ab20..9af616d2331a 100755 --- a/ci/build_windows.py +++ b/ci/build_windows.py @@ -112,9 +112,7 @@ class BuildFlavour(Enum): '-DUSE_BLAS=open ' '-DUSE_LAPACK=ON ' '-DUSE_DIST_KVSTORE=OFF ' - '-DCUDA_ARCH_NAME=Manual ' - '-DCUDA_ARCH_BIN=52 ' - '-DCUDA_ARCH_PTX=52 ' + '-DMXNET_CUDA_ARCH="5.2" ' '-DCMAKE_CXX_FLAGS="/FS /MD /O2 /Ob2" ' '-DUSE_MKL_IF_AVAILABLE=OFF ' '-DCMAKE_BUILD_TYPE=Release') @@ -128,9 +126,7 @@ class BuildFlavour(Enum): '-DUSE_BLAS=open ' '-DUSE_LAPACK=ON ' '-DUSE_DIST_KVSTORE=OFF ' - '-DCUDA_ARCH_NAME=Manual ' - '-DCUDA_ARCH_BIN=52 ' - '-DCUDA_ARCH_PTX=52 ' + '-DMXNET_CUDA_ARCH="5.2" ' '-DUSE_MKLDNN=ON ' '-DCMAKE_CXX_FLAGS="/FS /MD /O2 /Ob2" ' '-DCMAKE_BUILD_TYPE=Release') diff --git a/ci/docker/install/ubuntu_core.sh b/ci/docker/install/ubuntu_core.sh index 3cb806e0aadd..77c1fe2fb59d 100755 --- a/ci/docker/install/ubuntu_core.sh +++ b/ci/docker/install/ubuntu_core.sh @@ -49,7 +49,7 @@ apt-get install -y \ wget # Use libturbojpeg package as it is correctly compiled with -fPIC flag -# https://github.com/HaxeFoundation/hashlink/issues/147 +# https://github.com/HaxeFoundation/hashlink/issues/147 ln -s /usr/lib/x86_64-linux-gnu/libturbojpeg.so.0.1.0 /usr/lib/x86_64-linux-gnu/libturbojpeg.so diff --git a/ci/docker/runtime_functions.sh b/ci/docker/runtime_functions.sh index 581bb2fd5280..745214af2eea 100755 --- a/ci/docker/runtime_functions.sh +++ b/ci/docker/runtime_functions.sh @@ -25,7 +25,7 @@ set -ex NOSE_COVERAGE_ARGUMENTS="--with-coverage --cover-inclusive --cover-xml --cover-branches --cover-package=mxnet" NOSE_TIMER_ARGUMENTS="--with-timer --timer-ok 1 --timer-warning 15 --timer-filter warning,error" CI_CUDA_COMPUTE_CAPABILITIES="-gencode=arch=compute_52,code=sm_52 -gencode=arch=compute_70,code=sm_70" -CI_CMAKE_CUDA_ARCH_BIN="52,70" +CI_CMAKE_CUDA_ARCH="5.2 7.0" clean_repo() { set -ex @@ -753,8 +753,7 @@ build_ubuntu_gpu_tensorrt() { -DUSE_OPENMP=0 \ -DUSE_MKLDNN=0 \ -DUSE_MKL_IF_AVAILABLE=OFF \ - -DCUDA_ARCH_NAME=Manual \ - -DCUDA_ARCH_BIN=$CI_CMAKE_CUDA_ARCH_BIN \ + -DMXNET_CUDA_ARCH="$CI_CMAKE_CUDA_ARCH" \ -G Ninja \ /work/mxnet @@ -872,8 +871,7 @@ build_ubuntu_gpu_cmake_mkldnn() { -DPython3_EXECUTABLE=/usr/bin/python3 \ -DUSE_MKLML_MKL=1 \ -DCMAKE_BUILD_TYPE=Release \ - -DCUDA_ARCH_NAME=Manual \ - -DCUDA_ARCH_BIN=$CI_CMAKE_CUDA_ARCH_BIN \ + -DMXNET_CUDA_ARCH="$CI_CMAKE_CUDA_ARCH" \ -G Ninja \ /work/mxnet @@ -901,8 +899,7 @@ build_ubuntu_gpu_cmake() { -DUSE_MKLDNN=OFF \ -DUSE_DIST_KVSTORE=ON \ -DCMAKE_BUILD_TYPE=Release \ - -DCUDA_ARCH_NAME=Manual \ - -DCUDA_ARCH_BIN=$CI_CMAKE_CUDA_ARCH_BIN \ + -DMXNET_CUDA_ARCH="$CI_CMAKE_CUDA_ARCH" \ -DBUILD_CYTHON_MODULES=1 \ -G Ninja \ /work/mxnet @@ -928,8 +925,7 @@ build_ubuntu_gpu_cmake_no_tvm_op() { -DUSE_MKLDNN=OFF \ -DUSE_DIST_KVSTORE=ON \ -DCMAKE_BUILD_TYPE=Release \ - -DCUDA_ARCH_NAME=Manual \ - -DCUDA_ARCH_BIN=$CI_CMAKE_CUDA_ARCH_BIN \ + -DMXNET_CUDA_ARCH="$CI_CMAKE_CUDA_ARCH" \ -DBUILD_CYTHON_MODULES=1 \ -G Ninja \ /work/mxnet @@ -975,8 +971,7 @@ build_ubuntu_gpu_large_tensor() { -DUSE_MKLDNN=OFF \ -DUSE_DIST_KVSTORE=ON \ 
-DCMAKE_BUILD_TYPE=Release \ - -DCUDA_ARCH_NAME=Manual \ - -DCUDA_ARCH_BIN=$CI_CMAKE_CUDA_ARCH_BIN \ + -DMXNET_CUDA_ARCH="$CI_CMAKE_CUDA_ARCH" \ -DUSE_INT64_TENSOR_SIZE=ON \ -G Ninja \ /work/mxnet diff --git a/cmake/BuildTVM.cmake b/cmake/BuildTVM.cmake index db8b33b84596..2c2f573cddbd 100644 --- a/cmake/BuildTVM.cmake +++ b/cmake/BuildTVM.cmake @@ -98,16 +98,19 @@ set(USE_RANDOM OFF) # Whether use NNPack set(USE_NNPACK OFF) -# Whether use CuDNN -if(USE_CUDNN AND USE_CUDA) - detect_cuDNN() - if(HAVE_CUDNN) - set(USE_CUDNN ON) - else() - set(USE_CUDNN OFF) - endif() -else() - set(USE_CUDNN OFF) +# First-class Cuda in modern CMake provides us with CMAKE_CUDA_COMPILER But TVM +# uses the deprecated findCUDA functionality which requires +# CUDA_TOOLKIT_ROOT_DIR We follow the FindCUDAToolkit.cmake logic to compute +# CUDA_TOOLKIT_ROOT_DIR for TVM https://gitlab.kitware.com/cmake/cmake/merge_requests/4093/ +if(USE_CUDA) + get_filename_component(cuda_dir "${CMAKE_CUDA_COMPILER}" DIRECTORY) + set(CUDA_BIN_DIR "${cuda_dir}" CACHE PATH "" FORCE) + unset(cuda_dir) + get_filename_component(CUDA_TOOLKIT_ROOT_DIR ${CUDA_BIN_DIR} DIRECTORY ABSOLUTE) + + message("CMAKE_CUDA_COMPILER: ${CMAKE_CUDA_COMPILER}") + message("Inferred CUDA_TOOLKIT_ROOT_DIR for TVM as: ${CUDA_TOOLKIT_ROOT_DIR}") + set(USE_CUDA ${CUDA_TOOLKIT_ROOT_DIR}) endif() # Whether use cuBLAS @@ -133,3 +136,6 @@ set(USE_VTA_TSIM OFF) # Whether use Relay debug mode set(USE_RELAY_DEBUG OFF) + +# Use OPENMP thread pool to be compatible with MXNet +set(USE_OPENMP ON) diff --git a/cmake/FirstClassLangCuda.cmake b/cmake/FirstClassLangCuda.cmake deleted file mode 100644 index 8d79c2b63ad9..000000000000 --- a/cmake/FirstClassLangCuda.cmake +++ /dev/null @@ -1,277 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -#this file is CUDA help function with CMAKE first class CUDA - -include(CheckCXXCompilerFlag) -check_cxx_compiler_flag("-std=c++11" SUPPORT_CXX11) -if(USE_CXX14_IF_AVAILABLE) - check_cxx_compiler_flag("-std=c++14" SUPPORT_CXX14) -endif() - -################################################################################################ -# Short command for cuDNN detection. Believe it soon will be a part of CUDA toolkit distribution. -# That's why not FindcuDNN.cmake file, but just the macro -# Usage: -# detect_cuDNN() -function(detect_cuDNN) - set(CUDNN_ROOT "" CACHE PATH "CUDNN root folder") - - find_path(CUDNN_INCLUDE cudnn.h - PATHS ${CUDNN_ROOT} $ENV{CUDNN_ROOT} - DOC "Path to cuDNN include directory." 
) - - - find_library(CUDNN_LIBRARY NAMES libcudnn.so cudnn.lib # libcudnn_static.a - PATHS ${CUDNN_ROOT} $ENV{CUDNN_ROOT} ${CUDNN_INCLUDE} - PATH_SUFFIXES lib lib/x64 - DOC "Path to cuDNN library.") - - if(CUDNN_INCLUDE AND CUDNN_LIBRARY) - set(HAVE_CUDNN TRUE PARENT_SCOPE) - set(CUDNN_FOUND TRUE PARENT_SCOPE) - - mark_as_advanced(CUDNN_INCLUDE CUDNN_LIBRARY CUDNN_ROOT) - message(STATUS "Found cuDNN (include: ${CUDNN_INCLUDE}, library: ${CUDNN_LIBRARY})") - endif() -endfunction() - - - -################################################################################################ -# A function for automatic detection of GPUs installed (if autodetection is enabled) -# Usage: -# mshadow_detect_installed_gpus(out_variable) -function(mshadow_detect_installed_gpus out_variable) - if(NOT CUDA_gpu_detect_output) - set(__cufile ${PROJECT_BINARY_DIR}/detect_cuda_archs.cu) - - file(WRITE ${__cufile} "" - "#include \n" - "int main()\n" - "{\n" - " int count = 0;\n" - " if (cudaSuccess != cudaGetDeviceCount(&count)) return -1;\n" - " if (count == 0) return -1;\n" - " for (int device = 0; device < count; ++device)\n" - " {\n" - " cudaDeviceProp prop;\n" - " if (cudaSuccess == cudaGetDeviceProperties(&prop, device))\n" - " std::printf(\"%d.%d \", prop.major, prop.minor);\n" - " }\n" - " return 0;\n" - "}\n") - enable_language(CUDA) - - try_run(__nvcc_res __compile_result ${PROJECT_BINARY_DIR} ${__cufile} - COMPILE_OUTPUT_VARIABLE __compile_out - RUN_OUTPUT_VARIABLE __nvcc_out) - - if(__nvcc_res EQUAL 0 AND __compile_result) - # nvcc outputs text containing line breaks when building with MSVC. - # The line below prevents CMake from inserting a variable with line - # breaks in the cache - string(REGEX MATCH "([1-9].[0-9])" __nvcc_out "${__nvcc_out}") - string(REPLACE "2.1" "2.1(2.0)" __nvcc_out "${__nvcc_out}") - set(CUDA_gpu_detect_output ${__nvcc_out}) - else() - message(WARNING "Running GPU detection script with nvcc failed: ${__nvcc_out} ${__compile_out}") - endif() - endif() - - if(NOT CUDA_gpu_detect_output) - message(WARNING "Automatic GPU detection failed. 
Building for all known architectures (${mxnet_known_gpu_archs}).") - set(${out_variable} ${mxnet_known_gpu_archs} PARENT_SCOPE) - else() - set(${out_variable} ${CUDA_gpu_detect_output} PARENT_SCOPE) - endif() -endfunction() - - -# This list will be used for CUDA_ARCH_NAME = All option -set(CUDA_KNOWN_GPU_ARCHITECTURES "Kepler" "Maxwell") - -# This list will be used for CUDA_ARCH_NAME = Common option (enabled by default) -set(CUDA_COMMON_GPU_ARCHITECTURES "3.0" "3.5" "5.0") - -if (CUDA_TOOLSET VERSION_GREATER "6.5") - list(APPEND CUDA_KNOWN_GPU_ARCHITECTURES "Kepler+Tegra" "Kepler+Tesla" "Maxwell+Tegra") - list(APPEND CUDA_COMMON_GPU_ARCHITECTURES "5.2" "3.7") -endif () - -if (CUDA_TOOLSET VERSION_GREATER "7.5") - list(APPEND CUDA_KNOWN_GPU_ARCHITECTURES "Pascal") - list(APPEND CUDA_COMMON_GPU_ARCHITECTURES "6.0" "6.1" "6.1+PTX") -else() - list(APPEND CUDA_COMMON_GPU_ARCHITECTURES "5.2+PTX") -endif () - -if (CUDA_TOOLSET VERSION_GREATER "9.0") - list(APPEND CUDA_KNOWN_GPU_ARCHITECTURES "Volta") - list(APPEND CUDA_COMMON_GPU_ARCHITECTURES "7.0") -endif() - -if (CUDA_TOOLSET VERSION_GREATER "10.0") - list(APPEND CUDA_KNOWN_GPU_ARCHITECTURES "Turing") - list(APPEND CUDA_COMMON_GPU_ARCHITECTURES "7.5") -endif() - -################################################################################################ -# Function for selecting GPU arch flags for nvcc based on CUDA_ARCH_NAME -# Usage: -# mshadow_select_nvcc_arch_flags(out_variable) -function(mshadow_select_nvcc_arch_flags out_variable) - - set(CUDA_ARCH_LIST "Auto" CACHE STRING "Select target NVIDIA GPU achitecture.") - set_property( CACHE CUDA_ARCH_LIST PROPERTY STRINGS "" "Auto" "All" "Common" ${CUDA_KNOWN_GPU_ARCHITECTURES} ) - mark_as_advanced(CUDA_ARCH_NAME) - - - if("X${CUDA_ARCH_LIST}" STREQUAL "X" ) - set(CUDA_ARCH_LIST "All") - endif() - - set(cuda_arch_bin) - set(cuda_arch_ptx) - - message(STATUS " CUDA_ARCH_LIST: ${CUDA_ARCH_LIST}") - if("${CUDA_ARCH_LIST}" STREQUAL "All") - set(CUDA_ARCH_LIST ${CUDA_KNOWN_GPU_ARCHITECTURES}) - elseif("${CUDA_ARCH_LIST}" STREQUAL "Common") - set(CUDA_ARCH_LIST ${CUDA_COMMON_GPU_ARCHITECTURES}) - elseif("${CUDA_ARCH_LIST}" STREQUAL "Auto" OR "${CUDA_ARCH_LIST}" STREQUAL "") - set(mxnet_known_gpu_archs ${CUDA_COMMON_GPU_ARCHITECTURES}) - mshadow_detect_installed_gpus(CUDA_ARCH_LIST) - message(STATUS "Autodetected CUDA architecture(s): ${CUDA_ARCH_LIST}") - endif() - - # Now process the list and look for names - string(REGEX REPLACE "[ \t]+" ";" CUDA_ARCH_LIST "${CUDA_ARCH_LIST}") - list(REMOVE_DUPLICATES CUDA_ARCH_LIST) - foreach(arch_name ${CUDA_ARCH_LIST}) - set(arch_bin) - set(arch_ptx) - set(add_ptx FALSE) - # Check to see if we are compiling PTX - if(arch_name MATCHES "(.*)\\+PTX$") - set(add_ptx TRUE) - set(arch_name ${CMAKE_MATCH_1}) - endif() - if(arch_name MATCHES "^([0-9]\\.[0-9](\\([0-9]\\.[0-9]\\))?)$") - set(arch_bin ${CMAKE_MATCH_1}) - set(arch_ptx ${arch_bin}) - else() - # Look for it in our list of known architectures - if(${arch_name} STREQUAL "Fermi") - if (CUDA_TOOLSET VERSION_LESS "8.0") - set(arch_bin 2.0 "2.1(2.0)") - endif() - elseif(${arch_name} STREQUAL "Kepler+Tegra") - set(arch_bin 3.2) - elseif(${arch_name} STREQUAL "Kepler+Tesla") - set(arch_bin 3.7) - elseif(${arch_name} STREQUAL "Kepler") - set(arch_bin 3.0 3.5) - set(arch_ptx 3.5) - elseif(${arch_name} STREQUAL "Maxwell+Tegra") - set(arch_bin 5.3) - elseif(${arch_name} STREQUAL "Maxwell") - set(arch_bin 5.0 5.2) - set(arch_ptx 5.2) - elseif(${arch_name} STREQUAL "Pascal") - set(arch_bin 6.0 6.1) - set(arch_ptx 
6.1) - elseif(${arch_name} STREQUAL "Volta") - set(arch_bin 7.0) - set(arch_ptx 7.0) - elseif(${arch_name} STREQUAL "Turing") - set(arch_bin 7.5) - set(arch_ptx 7.5) - else() - message(SEND_ERROR "Unknown CUDA Architecture Name ${arch_name} in CUDA_SELECT_NVCC_ARCH_FLAGS") - endif() - endif() - list(APPEND cuda_arch_bin ${arch_bin}) - if(add_ptx) - if (NOT arch_ptx) - set(arch_ptx ${arch_bin}) - endif() - list(APPEND cuda_arch_ptx ${arch_ptx}) - endif() - endforeach() - - # remove dots and convert to lists - string(REGEX REPLACE "\\." "" cuda_arch_bin "${cuda_arch_bin}") - string(REGEX REPLACE "\\." "" cuda_arch_ptx "${cuda_arch_ptx}") - string(REGEX MATCHALL "[0-9()]+" cuda_arch_bin "${cuda_arch_bin}") - string(REGEX MATCHALL "[0-9]+" cuda_arch_ptx "${cuda_arch_ptx}") - - if(cuda_arch_bin) - list(REMOVE_DUPLICATES cuda_arch_bin) - endif() - if(cuda_arch_ptx) - list(REMOVE_DUPLICATES cuda_arch_ptx) - endif() - - message(STATUS "cuda arch bin: ${cuda_arch_bin}") - message(STATUS "cuda arch ptx: ${cuda_arch_ptx}") - set(nvcc_flags "") - set(nvcc_archs_readable "") - - # Tell NVCC to add binaries for the specified GPUs - foreach(arch ${cuda_arch_bin}) - if(arch MATCHES "([0-9]+)\\(([0-9]+)\\)") - # User explicitly specified ARCH for the concrete CODE - list(APPEND nvcc_flags -gencode arch=compute_${CMAKE_MATCH_2},code=sm_${CMAKE_MATCH_1}) - list(APPEND nvcc_archs_readable sm_${CMAKE_MATCH_1}) - else() - # User didn't explicitly specify ARCH for the concrete CODE, we assume ARCH=CODE - list(APPEND nvcc_flags -gencode arch=compute_${arch},code=sm_${arch}) - list(APPEND nvcc_archs_readable sm_${arch}) - endif() - endforeach() - - # Tell NVCC to add PTX intermediate code for the specified architectures - foreach(arch ${cuda_arch_ptx}) - list(APPEND nvcc_flags -gencode arch=compute_${arch},code=compute_${arch}) - list(APPEND nvcc_archs_readable compute_${arch}) - endforeach() - - if(NOT MSVC) - if(SUPPORT_CXX14) - list(APPEND nvcc_flags "-std=c++14") - elseif(SUPPORT_CXX11) - list(APPEND nvcc_flags "-std=c++11") - endif() - endif() - - string (REPLACE " " ";" CMAKE_CXX_FLAGS_STR "${CMAKE_CXX_FLAGS}") - foreach(_flag ${CMAKE_CXX_FLAGS_STR}) - # Remove -std=c++XX flags - if(NOT "${_flag}" MATCHES "-std=.+") - # Remove link flags - if(NOT "${_flag}" MATCHES "-Wl,.+") - list(APPEND nvcc_flags "-Xcompiler ${_flag}") - endif() - endif() - endforeach() - - string(REPLACE ";" " " nvcc_archs_readable "${nvcc_archs_readable}") - set(${out_variable} ${nvcc_flags} PARENT_SCOPE) - set(${out_variable}_readable ${nvcc_archs_readable} PARENT_SCOPE) -endfunction() - diff --git a/cmake/Modules/FindCUDAToolkit.cmake b/cmake/Modules/FindCUDAToolkit.cmake new file mode 100644 index 000000000000..1d9af2f548d0 --- /dev/null +++ b/cmake/Modules/FindCUDAToolkit.cmake @@ -0,0 +1,833 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. 
See the License for the +# specific language governing permissions and limitations +# under the License. + +# Original license notice, prior to modification by MXNet Contributors: +# +# Copyright 2000-2019 Kitware, Inc. and Contributors +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# +# * Neither the name of Kitware, Inc. nor the names of Contributors +# may be used to endorse or promote products derived from this +# software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +#[=======================================================================[.rst: +FindCUDAToolkit +--------------- + +This script locates the NVIDIA CUDA toolkit and the associated libraries, but +does not require the ``CUDA`` language be enabled for a given project. This +module does not search for the NVIDIA CUDA Samples. + +Search Behavior +^^^^^^^^^^^^^^^ + +Finding the CUDA Toolkit requires finding the ``nvcc`` executable, which is +searched for in the following order: + +1. If the ``CUDA`` language has been enabled we will use the directory + containing the compiler as the first search location for ``nvcc``. + +2. If the ``CUDAToolkit_ROOT`` cmake configuration variable (e.g., + ``-DCUDAToolkit_ROOT=/some/path``) *or* environment variable is defined, it + will be searched. If both an environment variable **and** a + configuration variable are specified, the *configuration* variable takes + precedence. + + The directory specified here must be such that the executable ``nvcc`` can be + found underneath the directory specified by ``CUDAToolkit_ROOT``. If + ``CUDAToolkit_ROOT`` is specified, but no ``nvcc`` is found underneath, this + package is marked as **not** found. No subsequent search attempts are + performed. + +3. If the CUDA_PATH environment variable is defined, it will be searched. + +4. The user's path is searched for ``nvcc`` using :command:`find_program`. If + this is found, no subsequent search attempts are performed. Users are + responsible for ensuring that the first ``nvcc`` to show up in the path is + the desired path in the event that multiple CUDA Toolkits are installed. + +5. On Unix systems, if the symbolic link ``/usr/local/cuda`` exists, this is + used. No subsequent search attempts are performed. 
No default symbolic link + location exists for the Windows platform. + +6. The platform specific default install locations are searched. If exactly one + candidate is found, this is used. The default CUDA Toolkit install locations + searched are: + + +-------------+-------------------------------------------------------------+ + | Platform | Search Pattern | + +=============+=============================================================+ + | macOS | ``/Developer/NVIDIA/CUDA-X.Y`` | + +-------------+-------------------------------------------------------------+ + | Other Unix | ``/usr/local/cuda-X.Y`` | + +-------------+-------------------------------------------------------------+ + | Windows | ``C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\vX.Y`` | + +-------------+-------------------------------------------------------------+ + + Where ``X.Y`` would be a specific version of the CUDA Toolkit, such as + ``/usr/local/cuda-9.0`` or + ``C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v9.0`` + + .. note:: + + When multiple CUDA Toolkits are installed in the default location of a + system (e.g., both ``/usr/local/cuda-9.0`` and ``/usr/local/cuda-10.0`` + exist but the ``/usr/local/cuda`` symbolic link does **not** exist), this + package is marked as **not** found. + + There are too many factors involved in making an automatic decision in + the presence of multiple CUDA Toolkits being installed. In this + situation, users are encouraged to either (1) set ``CUDAToolkit_ROOT`` or + (2) ensure that the correct ``nvcc`` executable shows up in ``$PATH`` for + :command:`find_program` to find. + +Options +^^^^^^^ + +``VERSION`` + If specified, describes the version of the CUDA Toolkit to search for. + +``REQUIRED`` + If specified, configuration will error if a suitable CUDA Toolkit is not + found. + +``QUIET`` + If specified, the search for a suitable CUDA Toolkit will not produce any + messages. + +``EXACT`` + If specified, the CUDA Toolkit is considered found only if the exact + ``VERSION`` specified is recovered. + +Imported targets +^^^^^^^^^^^^^^^^ + +An :ref:`imported target ` named ``CUDA::toolkit`` is provided. + +This module defines :prop_tgt:`IMPORTED` targets for each +of the following libraries that are part of the CUDAToolkit: + +- :ref:`CUDA Runtime Library` +- :ref:`CUDA Driver Library` +- :ref:`cuBLAS` +- :ref:`cuFFT` +- :ref:`cuRAND` +- :ref:`cuSOLVER` +- :ref:`cuSPARSE` +- :ref:`NPP` +- :ref:`nvBLAS` +- :ref:`nvGRAPH` +- :ref:`nvJPEG` +- :ref:`nvidia-ML` +- :ref:`nvRTC` +- :ref:`nvToolsExt` +- :ref:`OpenCL` +- :ref:`cuLIBOS` + +.. _`cuda_toolkit_rt_lib`: + +CUDA Runtime Library +"""""""""""""""""""" + +The CUDA Runtime library (cudart) are what most applications will typically +need to link against to make any calls such as `cudaMalloc`, and `cudaFree`. +They are an explicit dependency of almost every library. + +Targets Created: + +- ``CUDA::cudart`` +- ``CUDA::cudart_static`` + +.. _`cuda_toolkit_driver_lib`: + +CUDA Driver Library +"""""""""""""""""""" + +The CUDA Driver library (cuda) are used by applications that use calls +such as `cuMemAlloc`, and `cuMemFree`. This is generally used by advanced + + +Targets Created: + +- ``CUDA::cuda_driver`` +- ``CUDA::cuda_driver`` + +.. _`cuda_toolkit_cuBLAS`: + +cuBLAS +"""""" + +The `cuBLAS `_ library. + +Targets Created: + +- ``CUDA::cublas`` +- ``CUDA::cublas_static`` + +.. _`cuda_toolkit_cuFFT`: + +cuFFT +""""" + +The `cuFFT `_ library. 
+ +Targets Created: + +- ``CUDA::cufft`` +- ``CUDA::cufftw`` +- ``CUDA::cufft_static`` +- ``CUDA::cufftw_static`` + +cuRAND +"""""" + +The `cuRAND `_ library. + +Targets Created: + +- ``CUDA::curand`` +- ``CUDA::curand_static`` + +.. _`cuda_toolkit_cuSOLVER`: + +cuSOLVER +"""""""" + +The `cuSOLVER `_ library. + +Targets Created: + +- ``CUDA::cusolver`` +- ``CUDA::cusolver_static`` + +.. _`cuda_toolkit_cuSPARSE`: + +cuSPARSE +"""""""" + +The `cuSPARSE `_ library. + +Targets Created: + +- ``CUDA::cusparse`` +- ``CUDA::cusparse_static`` + +.. _`cuda_toolkit_NPP`: + +NPP +""" + +The `NPP `_ libraries. + +Targets Created: + +- `nppc`: + + - ``CUDA::nppc`` + - ``CUDA::nppc_static`` + +- `nppial`: Arithmetic and logical operation functions in `nppi_arithmetic_and_logical_operations.h` + + - ``CUDA::nppial`` + - ``CUDA::nppial_static`` + +- `nppicc`: Color conversion and sampling functions in `nppi_color_conversion.h` + + - ``CUDA::nppicc`` + - ``CUDA::nppicc_static`` + +- `nppicom`: JPEG compression and decompression functions in `nppi_compression_functions.h` + + - ``CUDA::nppicom`` + - ``CUDA::nppicom_static`` + +- `nppidei`: Data exchange and initialization functions in `nppi_data_exchange_and_initialization.h` + + - ``CUDA::nppidei`` + - ``CUDA::nppidei_static`` + +- `nppif`: Filtering and computer vision functions in `nppi_filter_functions.h` + + - ``CUDA::nppif`` + - ``CUDA::nppif_static`` + +- `nppig`: Geometry transformation functions found in `nppi_geometry_transforms.h` + + - ``CUDA::nppig`` + - ``CUDA::nppig_static`` + +- `nppim`: Morphological operation functions found in `nppi_morphological_operations.h` + + - ``CUDA::nppim`` + - ``CUDA::nppim_static`` + +- `nppist`: Statistics and linear transform in `nppi_statistics_functions.h` and `nppi_linear_transforms.h` + + - ``CUDA::nppist`` + - ``CUDA::nppist_static`` + +- `nppisu`: Memory support functions in `nppi_support_functions.h` + + - ``CUDA::nppisu`` + - ``CUDA::nppisu_static`` + +- `nppitc`: Threshold and compare operation functions in `nppi_threshold_and_compare_operations.h` + + - ``CUDA::nppitc`` + - ``CUDA::nppitc_static`` + +- `npps`: + + - ``CUDA::npps`` + - ``CUDA::npps_static`` + +.. _`cuda_toolkit_nvBLAS`: + +nvBLAS +"""""" + +The `nvBLAS `_ libraries. +This is a shared library only. + +Targets Created: + +- ``CUDA::nvblas`` + +.. _`cuda_toolkit_nvGRAPH`: + +nvGRAPH +""""""" + +The `nvGRAPH `_ library. + +Targets Created: + +- ``CUDA::nvgraph`` +- ``CUDA::nvgraph_static`` + + +.. _`cuda_toolkit_nvJPEG`: + +nvJPEG +"""""" + +The `nvJPEG `_ library. +Introduced in CUDA 10. + +Targets Created: + +- ``CUDA::nvjpeg`` +- ``CUDA::nvjpeg_static`` + +.. _`cuda_toolkit_nvRTC`: + +nvRTC +""""" + +The `nvRTC `_ (Runtime Compilation) library. +This is a shared library only. + +Targets Created: + +- ``CUDA::nvrtc`` + +.. _`cuda_toolkit_nvml`: + +nvidia-ML +""""""""" + +The `NVIDIA Management Library `_. +This is a shared library only. + +Targets Created: + +- ``CUDA::nvml`` + +.. _`cuda_toolkit_opencl`: + +.. _`cuda_toolkit_nvToolsExt`: + +nvToolsExt +"""""""""" + +The `NVIDIA Tools Extension `_. +This is a shared library only. + +Targets Created: + +- ``CUDA::nvToolsExt`` + +OpenCL +"""""" + +The `NVIDIA OpenCL Library `_. +This is a shared library only. + +Targets Created: + +- ``CUDA::OpenCL`` + +.. _`cuda_toolkit_cuLIBOS`: + +cuLIBOS +""""""" + +The cuLIBOS library is a backend thread abstraction layer library which is +static only. 
The ``CUDA::cublas_static``, ``CUDA::cusparse_static``, +``CUDA::cufft_static``, ``CUDA::curand_static``, and (when implemented) NPP +libraries all automatically have this dependency linked. + +Target Created: + +- ``CUDA::culibos`` + +**Note**: direct usage of this target by consumers should not be necessary. + +.. _`cuda_toolkit_cuRAND`: + + + +Result variables +^^^^^^^^^^^^^^^^ + +``CUDAToolkit_FOUND`` + A boolean specifying whether or not the CUDA Toolkit was found. + +``CUDAToolkit_VERSION`` + The exact version of the CUDA Toolkit found (as reported by + ``nvcc --version``). + +``CUDAToolkit_VERSION_MAJOR`` + The major version of the CUDA Toolkit. + +``CUDAToolkit_VERSION_MAJOR`` + The minor version of the CUDA Toolkit. + +``CUDAToolkit_VERSION_PATCH`` + The patch version of the CUDA Toolkit. + +``CUDAToolkit_BIN_DIR`` + The path to the CUDA Toolkit library directory that contains the CUDA + executable ``nvcc``. + +``CUDAToolkit_INCLUDE_DIRS`` + The path to the CUDA Toolkit ``include`` folder containing the header files + required to compile a project linking against CUDA. + +``CUDAToolkit_LIBRARY_DIR`` + The path to the CUDA Toolkit library directory that contains the CUDA + Runtime library ``cudart``. + +``CUDAToolkit_NVCC_EXECUTABLE`` + The path to the NVIDIA CUDA compiler ``nvcc``. Note that this path may + **not** be the same as + :variable:`CMAKE_CUDA_COMPILER _COMPILER>`. ``nvcc`` must be + found to determine the CUDA Toolkit version as well as determining other + features of the Toolkit. This variable is set for the convenience of + modules that depend on this one. + + +#]=======================================================================] + +# NOTE: much of this was simply extracted from FindCUDA.cmake. + +# James Bigler, NVIDIA Corp (nvidia.com - jbigler) +# Abe Stephens, SCI Institute -- http://www.sci.utah.edu/~abe/FindCuda.html +# +# Copyright (c) 2008 - 2009 NVIDIA Corporation. All rights reserved. +# +# Copyright (c) 2007-2009 +# Scientific Computing and Imaging Institute, University of Utah +# +# This code is licensed under the MIT License. See the FindCUDA.cmake script +# for the text of the license. + +# The MIT License +# +# License for the specific language governing rights and limitations under +# Permission is hereby granted, free of charge, to any person obtaining a +# copy of this software and associated documentation files (the "Software"), +# to deal in the Software without restriction, including without limitation +# the rights to use, copy, modify, merge, publish, distribute, sublicense, +# and/or sell copies of the Software, and to permit persons to whom the +# Software is furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included +# in all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS +# OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +# DEALINGS IN THE SOFTWARE. 
+# +############################################################################### + +if(CMAKE_CUDA_COMPILER_LOADED AND NOT CUDAToolkit_BIN_DIR) + get_filename_component(cuda_dir "${CMAKE_CUDA_COMPILER}" DIRECTORY) + # use the already detected cuda compiler + set(CUDAToolkit_BIN_DIR "${cuda_dir}" CACHE PATH "") + unset(cuda_dir) +endif() + +# Try language- or user-provided path first. +if(CUDAToolkit_BIN_DIR) + find_program(CUDAToolkit_NVCC_EXECUTABLE + NAMES nvcc nvcc.exe + PATHS ${CUDAToolkit_BIN_DIR} + NO_DEFAULT_PATH + ) +endif() + +# Search using CUDAToolkit_ROOT +find_program(CUDAToolkit_NVCC_EXECUTABLE + NAMES nvcc nvcc.exe + PATHS ENV CUDA_PATH + PATH_SUFFIXES bin +) + +# If the user specified CUDAToolkit_ROOT but nvcc could not be found, this is an error. +if (NOT CUDAToolkit_NVCC_EXECUTABLE AND (DEFINED CUDAToolkit_ROOT OR DEFINED ENV{CUDAToolkit_ROOT})) + # Declare error messages now, print later depending on find_package args. + set(fail_base "Could not find nvcc executable in path specified by") + set(cuda_root_fail "${fail_base} CUDAToolkit_ROOT=${CUDAToolkit_ROOT}") + set(env_cuda_root_fail "${fail_base} environment variable CUDAToolkit_ROOT=$ENV{CUDAToolkit_ROOT}") + + if (CUDAToolkit_FIND_REQUIRED) + if (DEFINED CUDAToolkit_ROOT) + message(FATAL_ERROR ${cuda_root_fail}) + elseif (DEFINED ENV{CUDAToolkit_ROOT}) + message(FATAL_ERROR ${env_cuda_root_fail}) + endif() + else() + if (NOT CUDAToolkit_FIND_QUIETLY) + if (DEFINED CUDAToolkit_ROOT) + message(STATUS ${cuda_root_fail}) + elseif (DEFINED ENV{CUDAToolkit_ROOT}) + message(STATUS ${env_cuda_root_fail}) + endif() + endif() + set(CUDAToolkit_FOUND FALSE) + unset(fail_base) + unset(cuda_root_fail) + unset(env_cuda_root_fail) + return() + endif() +endif() + +# CUDAToolkit_ROOT cmake / env variable not specified, try platform defaults. +# +# - Linux: /usr/local/cuda-X.Y +# - macOS: /Developer/NVIDIA/CUDA-X.Y +# - Windows: C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\vX.Y +# +# We will also search the default symlink location /usr/local/cuda first since +# if CUDAToolkit_ROOT is not specified, it is assumed that the symlinked +# directory is the desired location. +if (NOT CUDAToolkit_NVCC_EXECUTABLE) + if (UNIX) + if (NOT APPLE) + set(platform_base "/usr/local/cuda-") + else() + set(platform_base "/Developer/NVIDIA/CUDA-") + endif() + else() + set(platform_base "C:\\Program Files\\NVIDIA GPU Computing Toolkit\\CUDA\\v") + endif() + + # Build out a descending list of possible cuda installations, e.g. + file(GLOB possible_paths "${platform_base}*") + # Iterate the glob results and create a descending list. + set(possible_versions) + foreach (p ${possible_paths}) + # Extract version number from end of string + string(REGEX MATCH "[0-9][0-9]?\\.[0-9]$" p_version ${p}) + if (IS_DIRECTORY ${p} AND p_version) + list(APPEND possible_versions ${p_version}) + endif() + endforeach() + + # Cannot use list(SORT) because that is alphabetical, we need numerical. + # NOTE: this is not an efficient sorting strategy. But even if a user had + # every possible version of CUDA installed, this wouldn't create any + # significant overhead. + set(versions) + foreach (v ${possible_versions}) + list(LENGTH versions num_versions) + # First version, nothing to compare with so just append. + if (num_versions EQUAL 0) + list(APPEND versions ${v}) + else() + # Loop through list. Insert at an index when comparison is + # VERSION_GREATER since we want a descending list. Duplicates will not + # happen since this came from a glob list of directories. 
+ set(i 0) + set(early_terminate FALSE) + while (i LESS num_versions) + list(GET versions ${i} curr) + if (v VERSION_GREATER curr) + list(INSERT versions ${i} ${v}) + set(early_terminate TRUE) + break() + endif() + math(EXPR i "${i} + 1") + endwhile() + # If it did not get inserted, place it at the end. + if (NOT early_terminate) + list(APPEND versions ${v}) + endif() + endif() + endforeach() + + # With a descending list of versions, populate possible paths to search. + set(search_paths) + foreach (v ${versions}) + list(APPEND search_paths "${platform_base}${v}") + endforeach() + + # Force the global default /usr/local/cuda to the front on Unix. + if (UNIX) + list(INSERT search_paths 0 "/usr/local/cuda") + endif() + + # Now search for nvcc again using the platform default search paths. + find_program(CUDAToolkit_NVCC_EXECUTABLE + NAMES nvcc nvcc.exe + PATHS ${search_paths} + PATH_SUFFIXES bin + ) + + # We are done with these variables now, cleanup for caller. + unset(platform_base) + unset(possible_paths) + unset(possible_versions) + unset(versions) + unset(i) + unset(early_terminate) + unset(search_paths) + + if (NOT CUDAToolkit_NVCC_EXECUTABLE) + if (CUDAToolkit_FIND_REQUIRED) + message(FATAL_ERROR "Could not find nvcc, please set CUDAToolkit_ROOT.") + elseif(NOT CUDAToolkit_FIND_QUIETLY) + message(STATUS "Could not find nvcc, please set CUDAToolkit_ROOT.") + endif() + + set(CUDAToolkit_FOUND FALSE) + return() + endif() +endif() + +if(NOT CUDAToolkit_BIN_DIR AND CUDAToolkit_NVCC_EXECUTABLE) + get_filename_component(cuda_dir "${CUDAToolkit_NVCC_EXECUTABLE}" DIRECTORY) + set(CUDAToolkit_BIN_DIR "${cuda_dir}" CACHE PATH "" FORCE) + unset(cuda_dir) +endif() + +if(CUDAToolkit_NVCC_EXECUTABLE AND + CUDAToolkit_NVCC_EXECUTABLE STREQUAL CMAKE_CUDA_COMPILER) + # Need to set these based off the already computed CMAKE_CUDA_COMPILER_VERSION value + # This if statement will always match, but is used to provide variables for MATCH 1,2,3... + if(CMAKE_CUDA_COMPILER_VERSION MATCHES [=[([0-9]+)\.([0-9]+)\.([0-9]+)]=]) + set(CUDAToolkit_VERSION_MAJOR "${CMAKE_MATCH_1}") + set(CUDAToolkit_VERSION_MINOR "${CMAKE_MATCH_2}") + set(CUDAToolkit_VERSION_PATCH "${CMAKE_MATCH_3}") + set(CUDAToolkit_VERSION "${CMAKE_CUDA_COMPILER_VERSION}") + endif() +else() + # Compute the version by invoking nvcc + execute_process (COMMAND ${CUDAToolkit_NVCC_EXECUTABLE} "--version" OUTPUT_VARIABLE NVCC_OUT) + if(NVCC_OUT MATCHES [=[ V([0-9]+)\.([0-9]+)\.([0-9]+)]=]) + set(CUDAToolkit_VERSION_MAJOR "${CMAKE_MATCH_1}") + set(CUDAToolkit_VERSION_MINOR "${CMAKE_MATCH_2}") + set(CUDAToolkit_VERSION_PATCH "${CMAKE_MATCH_3}") + set(CUDAToolkit_VERSION "${CMAKE_MATCH_1}.${CMAKE_MATCH_2}.${CMAKE_MATCH_3}") + endif() + unset(NVCC_OUT) +endif() + + +get_filename_component(CUDAToolkit_ROOT_DIR ${CUDAToolkit_BIN_DIR} DIRECTORY ABSOLUTE) + +# Now that we have the real ROOT_DIR, find components inside it. +list(APPEND CMAKE_PREFIX_PATH ${CUDAToolkit_ROOT_DIR}) + +# Find the include/ directory +find_path(CUDAToolkit_INCLUDE_DIR + NAMES cuda_runtime.h +) + +# And find the CUDA Runtime Library libcudart +find_library(CUDA_CUDART + NAMES cudart + PATH_SUFFIXES lib64 lib/x64 +) +if (NOT CUDA_CUDART AND NOT CUDAToolkit_FIND_QUIETLY) + message(STATUS "Unable to find cudart library.") +endif() + +unset(CUDAToolkit_ROOT_DIR) +list(REMOVE_AT CMAKE_PREFIX_PATH -1) + +#----------------------------------------------------------------------------- +# Perform version comparison and validate all required variables are set. 
+# MXNET NOTE: This differs from CMake source by ${CMAKE_CURRENT_LIST_DIR} +# replaced with ${CMAKE_ROOT}/Modules +include(${CMAKE_ROOT}/Modules/FindPackageHandleStandardArgs.cmake) +find_package_handle_standard_args(CUDAToolkit + REQUIRED_VARS + CUDAToolkit_INCLUDE_DIR + CUDA_CUDART + CUDAToolkit_NVCC_EXECUTABLE + VERSION_VAR + CUDAToolkit_VERSION +) + +#----------------------------------------------------------------------------- +# Construct result variables +if(CUDAToolkit_FOUND) + set(CUDAToolkit_INCLUDE_DIRS ${CUDAToolkit_INCLUDE_DIR}) + get_filename_component(CUDAToolkit_LIBRARY_DIR ${CUDA_CUDART} DIRECTORY ABSOLUTE) +endif() + +#----------------------------------------------------------------------------- +# Construct import targets +if(CUDAToolkit_FOUND) + + function(find_and_add_cuda_import_lib lib_name) + + if(ARGC GREATER 1) + set(search_names ${ARGN}) + else() + set(search_names ${lib_name}) + endif() + + find_library(CUDA_${lib_name}_LIBRARY + NAMES ${search_names} + PATHS ${CUDAToolkit_LIBRARY_DIR} + ENV CUDA_PATH + PATH_SUFFIXES nvidia/current lib64 lib/x64 lib + ) + + if (NOT CUDA::${lib_name} AND CUDA_${lib_name}_LIBRARY) + add_library(CUDA::${lib_name} IMPORTED INTERFACE) + target_include_directories(CUDA::${lib_name} SYSTEM INTERFACE "${CUDAToolkit_INCLUDE_DIRS}") + target_link_libraries(CUDA::${lib_name} INTERFACE "${CUDA_${lib_name}_LIBRARY}") + endif() + endfunction() + + function(add_cuda_link_dependency lib_name) + foreach(dependency IN LISTS ${ARGN}) + target_link_libraries(CUDA::${lib_name} INTERFACE CUDA::${dependency}) + endforeach() + endfunction() + + add_library(CUDA::toolkit IMPORTED INTERFACE) + target_include_directories(CUDA::toolkit SYSTEM INTERFACE "${CUDAToolkit_INCLUDE_DIRS}") + target_link_directories(CUDA::toolkit INTERFACE "${CUDAToolkit_LIBRARY_DIR}") + + + find_and_add_cuda_import_lib(cuda_driver cuda) + + find_and_add_cuda_import_lib(cudart) + find_and_add_cuda_import_lib(cudart_static) + + foreach (cuda_lib cublas cufft cufftw curand cusolver cusparse nvgraph nvjpeg) + find_and_add_cuda_import_lib(${cuda_lib}) + add_cuda_link_dependency(${cuda_lib} cudart) + + find_and_add_cuda_import_lib(${cuda_lib}_static) + add_cuda_link_dependency(${cuda_lib}_static cudart_static) + endforeach() + + # cuSOLVER depends on cuBLAS, and cuSPARSE + add_cuda_link_dependency(cusolver cublas cusparse) + add_cuda_link_dependency(cusolver_static cublas_static cusparse) + + # nvGRAPH depends on cuRAND, and cuSOLVER. + add_cuda_link_dependency(nvgraph curand cusolver) + add_cuda_link_dependency(nvgraph_static curand_static cusolver_static) + + find_and_add_cuda_import_lib(nppc) + find_and_add_cuda_import_lib(nppc_static) + + add_cuda_link_dependency(nppc cudart) + add_cuda_link_dependency(nppc_static cudart_static culibos) + + # Process the majority of the NPP libraries. 
+ foreach (cuda_lib nppial nppicc nppidei nppif nppig nppim nppist nppitc npps nppicom nppisu) + find_and_add_cuda_import_lib(${cuda_lib}) + find_and_add_cuda_import_lib(${cuda_lib}_static) + add_cuda_link_dependency(${cuda_lib} nppc) + add_cuda_link_dependency(${cuda_lib}_static nppc_static) + endforeach() + + find_and_add_cuda_import_lib(nvrtc) + add_cuda_link_dependency(nvrtc cuda_driver) + + find_and_add_cuda_import_lib(nvml nvidia-ml nvml) + + if(WIN32) + # nvtools can be installed outside the CUDA toolkit directory + # so prefer the NVTOOLSEXT_PATH windows only environment variable + # In addition on windows the most common name is nvToolsExt64_1 + find_library(CUDA_nvToolsExt_LIBRARY + NAMES nvToolsExt64_1 nvToolsExt64 nvToolsExt + PATHS ENV NVTOOLSEXT_PATH + ENV CUDA_PATH + PATH_SUFFIXES lib/x64 lib + ) + endif() + find_and_add_cuda_import_lib(nvToolsExt nvToolsExt nvToolsExt64) + + add_cuda_link_dependency(nvToolsExt cudart) + + find_and_add_cuda_import_lib(OpenCL) + + find_and_add_cuda_import_lib(culibos) + if(TARGET CUDA::culibos) + foreach (cuda_lib cublas cufft cusparse curand nvjpeg) + add_cuda_link_dependency(${cuda_lib}_static culibos) + endforeach() + endif() + +endif() diff --git a/contrib/tvmop/compile.py b/contrib/tvmop/compile.py index b0254218077a..43657f274348 100644 --- a/contrib/tvmop/compile.py +++ b/contrib/tvmop/compile.py @@ -50,6 +50,11 @@ def get_cuda_arch(arch): if len(arch) == 0: return None + # the arch string is of format '-gencode;arch=compute_XX,code=sm_XX' + # this format is computed by CMake CUDA_SELECT_NVCC_ARCH_FLAGS + if arch.startswith('-gencode;'): + return arch.split(';') + # the arch string contains '-arch=sm_xx' flags = arch.split() for flag in flags: diff --git a/tools/windowsbuild/README.md b/tools/windowsbuild/README.md new file mode 100644 index 000000000000..7d8e7cf331cf --- /dev/null +++ b/tools/windowsbuild/README.md @@ -0,0 +1,19 @@ + + + + + + + + + + + + + + + + + +Due to dll size limitation under windows. Split dll into different dlls according to arch +Reference https://github.com/apache/incubator-mxnet/pull/16980 \ No newline at end of file diff --git a/tools/windowsbuild/gen_warp.cpp b/tools/windowsbuild/gen_warp.cpp new file mode 100644 index 000000000000..2d90eaf364f3 --- /dev/null +++ b/tools/windowsbuild/gen_warp.cpp @@ -0,0 +1,209 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define IMAGE_SIZEOF_SIGNATURE 4 + + +DWORD rva_to_foa(IN DWORD RVA, IN PIMAGE_SECTION_HEADER section_header) +{ + + size_t count = 0; + for (count = 1; RVA > (section_header->VirtualAddress + section_header->Misc.VirtualSize); count++, section_header++); + + DWORD FOA = RVA - section_header->VirtualAddress + section_header->PointerToRawData; + + return FOA; +} + +std::string format(const char* format, ...) +{ + va_list args; + va_start(args, format); +#ifndef _MSC_VER + size_t size = std::snprintf(nullptr, 0, format, args) + 1; // Extra space for '\0' + std::unique_ptr buf(new char[size]); + std::vsnprintf(buf.get(), size, format, args); + return std::string(buf.get(), buf.get() + size - 1); // We don't want the '\0' inside +#else + int size = _vscprintf(format, args) +1; + std::unique_ptr buf(new char[size]); + vsnprintf_s(buf.get(), size, _TRUNCATE, format, args); + return std::string(buf.get()); +#endif + va_end(args); +} + +int main(int argc, char* argv[]) +{ + + if (argc != 2) + { + return 0; + } + + //open file + const HANDLE h_file = CreateFile( + argv[1], + GENERIC_READ , + FILE_SHARE_READ , + nullptr, + OPEN_EXISTING, + FILE_ATTRIBUTE_NORMAL, + nullptr); + + + DWORD size_high; + const DWORD size_low = GetFileSize(h_file, &size_high); + + uint64_t dll_size = ((uint64_t(size_high)) << 32) + (uint64_t)size_low; + + // Create File Mapping + const HANDLE h_map_file = CreateFileMapping( + h_file, + nullptr, + PAGE_READONLY, + size_high, + size_low, + nullptr); + if (h_map_file == INVALID_HANDLE_VALUE || h_map_file == nullptr) + { + std::cout << "error"; + CloseHandle(h_file); + return 0; + } + + //Map File to memory + void* pv_file = MapViewOfFile( + h_map_file, + FILE_MAP_READ, + 0, + 0, + 0); + + if (pv_file == nullptr) + { + std::cout << "error"; + CloseHandle(h_file); + return 0; + } + + uint8_t* p = static_cast(pv_file); + + + PIMAGE_DOS_HEADER dos_header = reinterpret_cast(p); + + const PIMAGE_NT_HEADERS nt_headers = reinterpret_cast(p + dos_header->e_lfanew); + + const PIMAGE_FILE_HEADER file_header = &nt_headers->FileHeader; + + PIMAGE_OPTIONAL_HEADER optional_header = (PIMAGE_OPTIONAL_HEADER)(&nt_headers->OptionalHeader); + + const DWORD file_alignment = optional_header->FileAlignment; + + + PIMAGE_SECTION_HEADER section_table = + reinterpret_cast(p + dos_header->e_lfanew + + IMAGE_SIZEOF_SIGNATURE + + IMAGE_SIZEOF_FILE_HEADER + + file_header->SizeOfOptionalHeader); + + DWORD export_foa = rva_to_foa(optional_header->DataDirectory[0].VirtualAddress, section_table); + + PIMAGE_EXPORT_DIRECTORY export_directory = (PIMAGE_EXPORT_DIRECTORY)(p + export_foa); + + + DWORD name_list_foa = rva_to_foa(export_directory->AddressOfNames, section_table); + + PDWORD name_list = (PDWORD)(p + name_list_foa); + + + + + std::vector func_list; + + for (size_t i = 0; i < export_directory->NumberOfNames; i++, name_list++) + { + + DWORD name_foa = rva_to_foa(* name_list, section_table); + char* name = (char*)(p + name_foa); + func_list.emplace_back(name); + + } + + + UnmapViewOfFile(pv_file); + CloseHandle(h_map_file); + CloseHandle(h_file); + + + std::ofstream gen_cpp_obj; + gen_cpp_obj.open("warp_gen_cpp.cpp", std::ios::out | std::ios::trunc); + gen_cpp_obj << "#include \n"; + gen_cpp_obj << "extern \"C\" \n{\n"; + + + for (size_t i = 0; i < func_list.size(); ++i) + { + auto fun = func_list[i]; + gen_cpp_obj << format("void * warp_point_%d;\n", i); + gen_cpp_obj << format("#pragma comment(linker, 
\"/export:%s=warp_func_%d\")\n", fun.c_str(), i); + gen_cpp_obj << format("void warp_func_%d();\n", i); + gen_cpp_obj << ("\n"); + } + gen_cpp_obj << ("}\n"); + + + gen_cpp_obj << ("void load_function(HMODULE hm)\n{\n"); + for (size_t i = 0; i < func_list.size(); ++i) + { + auto fun = func_list[i]; + gen_cpp_obj << format("warp_point_%d = (void*)GetProcAddress(hm, \"%s\");\n", i, fun.c_str()); + } + gen_cpp_obj << ("}\n"); + + gen_cpp_obj.close(); + + + + std::ofstream gen_asm_obj; + gen_asm_obj.open("warp_gen.asm", std::ios::out | std::ios::trunc); + for (size_t i = 0; i < func_list.size(); ++i) + { + auto fun = func_list[i]; + gen_asm_obj << format("EXTERN warp_point_%d:QWORD;\n", i); + } + gen_asm_obj << ".CODE\n"; + for (size_t i = 0; i < func_list.size(); ++i) + { + auto fun = func_list[i]; + gen_asm_obj << format("warp_func_%d PROC\njmp warp_point_%d;\nwarp_func_%d ENDP\n", i,i,i); + } + gen_asm_obj << "END\n"; + gen_asm_obj.close(); +} diff --git a/tools/windowsbuild/warp_dll.cpp b/tools/windowsbuild/warp_dll.cpp new file mode 100644 index 000000000000..6a89a4e189de --- /dev/null +++ b/tools/windowsbuild/warp_dll.cpp @@ -0,0 +1,151 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +extern "C" IMAGE_DOS_HEADER __ImageBase; + + +std::vector find_mxnet_dll() +{ + std::vector version; + intptr_t handle; + + _wfinddata_t findData{}; + std::wregex reg(L".*?mxnet_([0-9]+)\\.dll"); + + HMODULE hModule = reinterpret_cast(&__ImageBase); + WCHAR szPathBuffer[MAX_PATH] = { 0 }; + GetModuleFileNameW(hModule, szPathBuffer, MAX_PATH); + + PathRemoveFileSpecW(szPathBuffer); + wcscat_s(szPathBuffer, L"\\mxnet_*.dll"); + + handle = _wfindfirst(szPathBuffer, &findData); + if (handle == -1) + { + return version; + } + + do + { + if (!(findData.attrib & _A_SUBDIR) || wcscmp(findData.name, L".") != 0 || wcscmp(findData.name, L"..") != 0) + { + std::wstring str(findData.name); + std::wsmatch base_match; + if(std::regex_match(str, base_match, reg)) + { + if (base_match.size() == 2) { + std::wssub_match base_sub_match = base_match[1]; + std::wstring base = base_sub_match.str(); + version.push_back(std::stoi(base)) ; + } + } + } + } while (_wfindnext(handle, &findData) == 0); + + _findclose(handle); + std::sort(version.begin(), version.end()); + return version; +} + +int find_version() +{ + std::vector known_sm = find_mxnet_dll(); + int count = 0; + int version = 75; + if (cudaSuccess != cudaGetDeviceCount(&count)) + { + return 30; + } + if (count == 0) + { + return 30; + } + + + for (int device = 0; device < count; ++device) + { + cudaDeviceProp prop{}; + if (cudaSuccess == cudaGetDeviceProperties(&prop, device)) + { + version = std::min(version, prop.major * 10 + prop.minor); + } + } + + for (int i = known_sm.size() -1 ; i >=0; --i) + { + if(known_sm[i]<= version) + { + return known_sm[i]; + } + } + + return version; +} + +void load_function(HMODULE hm); + +void mxnet_init() +{ + int version = find_version(); + WCHAR dll_name[MAX_PATH]; + wsprintfW(dll_name, L"mxnet_%d.dll", version); + HMODULE hm = LoadLibraryW(dll_name); + load_function(hm); +} + + +extern "C" BOOL WINAPI DllMain( + HINSTANCE const instance, // handle to DLL module + DWORD const reason, // reason for calling function + LPVOID const reserved) // reserved +{ + // Perform actions based on the reason for calling. + switch (reason) + { + case DLL_PROCESS_ATTACH: + mxnet_init(); + // Initialize once for each new process. + // Return FALSE to fail DLL load. + break; + + case DLL_THREAD_ATTACH: + // Do thread-specific initialization. + break; + + case DLL_THREAD_DETACH: + // Do thread-specific cleanup. + break; + + case DLL_PROCESS_DETACH: + // Perform any necessary cleanup. + break; + } + return TRUE; // Successful DLL_PROCESS_ATTACH. +} \ No newline at end of file
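
In the split-DLL hunk of CMakeLists.txt near the top of this section, the conditions inside the generator expressions are elided. Below is a minimal sketch of the per-architecture compile-option pattern, assuming the elided conditions gate the flags by COMPILE_LANGUAGE (CUDA) and CONFIG (Debug/Release), which is how the surrounding -MTd/-MT runtime flags are normally scoped; treat the exact conditions as an assumption, not the patch's literal text.

    # Sketch of the per-architecture targets; the generator-expression
    # conditions below are assumptions (they are elided in the hunk above).
    foreach(arch IN LISTS arch_code_list)   # arch_code_list is defined in CMakeLists.txt
      add_library(mxnet_${arch} SHARED ${SOURCE})
      target_compile_options(mxnet_${arch} PRIVATE
        "$<$<COMPILE_LANGUAGE:CUDA>:--gpu-architecture=compute_${arch}>"
        "$<$<COMPILE_LANGUAGE:CUDA>:--gpu-code=sm_${arch},compute_${arch}>"
        # assumed: static MSVC runtime selected per configuration
        "$<$<AND:$<COMPILE_LANGUAGE:CUDA>,$<CONFIG:Debug>>:-Xcompiler=-MTd -Gy /bigobj>"
        "$<$<AND:$<COMPILE_LANGUAGE:CUDA>,$<CONFIG:Release>>:-Xcompiler=-MT -Gy /bigobj>")
    endforeach()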
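
The CI scripts above pass the new unified option as a space-separated list of dotted compute capabilities, for example -DMXNET_CUDA_ARCH="5.2 7.0". The following is an illustrative sketch of how such a list can be expanded into per-architecture -gencode flags; the actual CMakeLists.txt derives CUDA_ARCH_FLAGS elsewhere and may differ in detail.

    # Illustrative only: expand an arch list such as "5.2 7.0" into nvcc -gencode flags.
    set(MXNET_CUDA_ARCH "5.2 7.0" CACHE STRING "Target CUDA architectures")
    string(REPLACE " " ";" arch_list "${MXNET_CUDA_ARCH}")
    set(gencode_flags "")
    foreach(arch IN LISTS arch_list)
      string(REPLACE "." "" arch_code "${arch}")   # "5.2" -> "52"
      string(APPEND gencode_flags
             " -gencode arch=compute_${arch_code},code=sm_${arch_code}")
    endforeach()
    # Append to the flags used by CMake's first-class CUDA language support.
    set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS}${gencode_flags}")
    message(STATUS "CUDA gencode flags:${gencode_flags}")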
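
The vendored cmake/Modules/FindCUDAToolkit.cmake defines imported targets such as CUDA::toolkit, CUDA::cudart and CUDA::cublas. A minimal consumer sketch (the target name my_gpu_tool and the source file main.cu are placeholders) showing how a project can pick up the module and link against those targets:

    # Consumer sketch; "my_gpu_tool" and main.cu are illustrative placeholders.
    cmake_minimum_required(VERSION 3.13)
    project(cuda_consumer LANGUAGES CXX CUDA)

    # Make the vendored module visible to find_package().
    list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/cmake/Modules")
    find_package(CUDAToolkit REQUIRED)

    add_executable(my_gpu_tool main.cu)
    # The imported targets carry include directories and library locations,
    # so no manual CUDA_INCLUDE_DIRS / CUDA_LIBRARIES bookkeeping is needed.
    target_link_libraries(my_gpu_tool PRIVATE CUDA::cudart CUDA::cublas)

Linking the imported targets propagates the toolkit include directories and resolved library paths found by the module, which is what lets the main CMakeLists.txt drop the deprecated FindCUDA variables.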