Skip to content

Commit

Permalink
enable windows cuda unit tests (#4421)
Browse files Browse the repository at this point in the history
* enable cuda for unittest

* add one check

* rename

* minor fix

* add more check

* add cuda envs

* add more cuda envs

* add driver update

* update vs integration

* get cuda version

* merge

* add envs in install.sh

* reduce some change and add comments

* minor improve

* fix typo

* exit if driver update failed

* fix lint

* fix lint

* Avoid catching exception to show error message

* try 11.1

* try 11.1

* "try 11.1"

* set the downloaded gpu driver dlls only for 10.2

* check torch.cuda.is_available()

* add display.driver

* Revert "Avoid catching exception to show error message"

This reverts commit 1ce58c5.

* Update .circleci/unittest/windows/scripts/set_cuda_envs.sh

* Update .circleci/unittest/windows/scripts/set_cuda_envs.sh

* Update .circleci/unittest/windows/scripts/set_cuda_envs.sh

Co-authored-by: Nicolas Hug <nicolashug@fb.com>
Co-authored-by: Vasilis Vryniotis <datumbox@users.noreply.github.com>
Co-authored-by: Nikita Shulga <nikita.shulga@gmail.com>
Co-authored-by: Nikita Shulga <nshulga@fb.com>
  • Loading branch information
5 people authored Sep 29, 2021
1 parent eb2fb25 commit 932ca5a
Show file tree
Hide file tree
Showing 7 changed files with 114 additions and 41 deletions.
6 changes: 6 additions & 0 deletions .circleci/config.yml

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

6 changes: 6 additions & 0 deletions .circleci/config.yml.in
Original file line number Diff line number Diff line change
Expand Up @@ -763,6 +763,12 @@ jobs:
paths:
- conda
- env
- run:
name: Install CUDA
command: packaging/windows/internal/cuda_install.bat
- run:
name: Update CUDA driver
command: packaging/windows/internal/driver_update.bat
- run:
name: Install torchvision
command: .circleci/unittest/windows/scripts/install.sh
Expand Down
14 changes: 13 additions & 1 deletion .circleci/unittest/windows/scripts/install.sh
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ unset PYTORCH_VERSION
# so no need to set PYTORCH_VERSION.
# In fact, keeping PYTORCH_VERSION forces us to hardcode PyTorch version in config.

set -e
set -ex

this_dir="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )"

Expand Down Expand Up @@ -35,5 +35,17 @@ if [ $PYTHON_VERSION == "3.6" ]; then
pip install pillow>=5.3.0
fi

torch_cuda=$(python -c "import torch; print(torch.cuda.is_available())")
echo torch.cuda.is_available is $torch_cuda

if [ ! -z "${CUDA_VERSION:-}" ] ; then
if [ "$torch_cuda" == "False" ]; then
echo "torch with cuda installed but torch.cuda.is_available() is False"
exit 1
fi
fi

source "$this_dir/set_cuda_envs.sh"

printf "* Installing torchvision\n"
"$this_dir/vc_env_helper.bat" python setup.py develop
3 changes: 3 additions & 0 deletions .circleci/unittest/windows/scripts/run_test.sh
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,9 @@ set -e
eval "$(./conda/Scripts/conda.exe 'shell.bash' 'hook')"
conda activate ./env

this_dir="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )"
source "$this_dir/set_cuda_envs.sh"

export PYTORCH_TEST_WITH_SLOW='1'
python -m torch.utils.collect_env
pytest --cov=torchvision --junitxml=test-results/junit.xml -v --durations 20 test --ignore=test/test_datasets_download.py
65 changes: 39 additions & 26 deletions .circleci/unittest/windows/scripts/set_cuda_envs.sh
Original file line number Diff line number Diff line change
@@ -1,35 +1,48 @@
#!/usr/bin/env bash
set -ex

if [ "${CU_VERSION:-}" == "cpu" ] ; then
exit 0
fi
echo CU_VERSION is "${CU_VERSION}"
echo CUDA_VERSION is "${CUDA_VERSION}"

if [[ ${#CU_VERSION} -eq 5 ]]; then
CUDA_VERSION="${CU_VERSION:2:2}.${CU_VERSION:4:1}"
# Currenly, CU_VERSION and CUDA_VERSION are not consistent.
# to understand this code, see https://github.com/pytorch/vision/issues/4443
version="cpu"
if [[ ! -z "${CUDA_VERSION}" ]] ; then
version="$CUDA_VERSION"
else
if [[ ${#CU_VERSION} -eq 5 ]]; then
version="${CU_VERSION:2:2}.${CU_VERSION:4:1}"
fi
fi

# It's a log to see if CU_VERSION exists, if not, we use environment CUDA_VERSION directly
# in unittest_windows_gpu, there's no CU_VERSION, but CUDA_VERSION.
echo "Using CUDA $CUDA_VERSION, CU_VERSION is $CU_VERSION now"
# Don't use if [[ "$version" == "cpu" ]]; then exit 0 fi.
# It would exit the shell. One result is cpu tests would not run if the shell exit.
# Unless there's an error, Don't exit.
if [[ "$version" != "cpu" ]]; then
# set cuda envs
export PATH="/c/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v${version}/bin:/c/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v${version}/libnvvp:$PATH"
export CUDA_PATH_V${version/./_}="C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v${version}"
export CUDA_PATH="C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v${version}"

version=$CUDA_VERSION
if [ ! -d "$CUDA_PATH" ]; then
echo "$CUDA_PATH" does not exist
exit 1
fi

# set cuda envs
export PATH="/c/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v${version}/bin:/c/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v${version}/libnvvp:$PATH"
export CUDA_PATH_V${version/./_}="C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v${version}"
export CUDA_PATH="C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v${version}"
if [ ! -f "${CUDA_PATH}\include\nvjpeg.h" ]; then
echo "nvjpeg does not exist"
exit 1
fi

if [ ! -d "$CUDA_PATH" ]
then
echo "$CUDA_PATH" does not exist
exit 1
fi
# check cuda driver version
for path in '/c/Program Files/NVIDIA Corporation/NVSMI/nvidia-smi.exe' /c/Windows/System32/nvidia-smi.exe; do
if [[ -x "$path" ]]; then
"$path" || echo "true";
break
fi
done

# check cuda driver version
for path in '/c/Program Files/NVIDIA Corporation/NVSMI/nvidia-smi.exe' /c/Windows/System32/nvidia-smi.exe; do
if [[ -x "$path" ]]; then
"$path" || echo "true";
break
fi
done
which nvcc
which nvcc
nvcc --version
env | grep CUDA
fi
36 changes: 22 additions & 14 deletions packaging/windows/internal/cuda_install.bat
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,13 @@ set SRC_DIR=%~dp0\..

if not exist "%SRC_DIR%\temp_build" mkdir "%SRC_DIR%\temp_build"

rem in unit test workflow, we get CUDA_VERSION, for example 11.1
if defined CUDA_VERSION (
set CUDA_VER=%CUDA_VERSION:.=%
) else (
set CUDA_VER=%CU_VERSION:cu=%
)

set /a CUDA_VER=%CU_VERSION:cu=%
set CUDA_VER_MAJOR=%CUDA_VER:~0,-1%
set CUDA_VER_MINOR=%CUDA_VER:~-1,1%
Expand Down Expand Up @@ -65,7 +72,7 @@ if not exist "%SRC_DIR%\temp_build\cuda_10.1.243_426.00_win10.exe" (
curl -k -L https://ossci-windows.s3.amazonaws.com/cuda_10.1.243_426.00_win10.exe --output "%SRC_DIR%\temp_build\cuda_10.1.243_426.00_win10.exe"
if errorlevel 1 exit /b 1
set "CUDA_SETUP_FILE=%SRC_DIR%\temp_build\cuda_10.1.243_426.00_win10.exe"
set "ARGS=nvcc_10.1 cuobjdump_10.1 nvprune_10.1 cupti_10.1 cublas_10.1 cublas_dev_10.1 cudart_10.1 cufft_10.1 cufft_dev_10.1 curand_10.1 curand_dev_10.1 cusolver_10.1 cusolver_dev_10.1 cusparse_10.1 cusparse_dev_10.1 nvgraph_10.1 nvgraph_dev_10.1 npp_10.1 npp_dev_10.1 nvrtc_10.1 nvrtc_dev_10.1 nvml_dev_10.1"
set "ARGS=nvcc_10.1 cuobjdump_10.1 nvprune_10.1 cupti_10.1 cublas_10.1 cublas_dev_10.1 cudart_10.1 cufft_10.1 cufft_dev_10.1 curand_10.1 curand_dev_10.1 cusolver_10.1 cusolver_dev_10.1 cusparse_10.1 cusparse_dev_10.1 nvgraph_10.1 nvgraph_dev_10.1 npp_10.1 npp_dev_10.1 nvjpeg_10.1 nvjpeg_dev_10.1 nvrtc_10.1 nvrtc_dev_10.1 nvml_dev_10.1"
)

if not exist "%SRC_DIR%\temp_build\cudnn-10.1-windows10-x64-v7.6.4.38.zip" (
Expand All @@ -82,7 +89,7 @@ if not exist "%SRC_DIR%\temp_build\cuda_10.2.89_441.22_win10.exe" (
curl -k -L https://ossci-windows.s3.amazonaws.com/cuda_10.2.89_441.22_win10.exe --output "%SRC_DIR%\temp_build\cuda_10.2.89_441.22_win10.exe"
if errorlevel 1 exit /b 1
set "CUDA_SETUP_FILE=%SRC_DIR%\temp_build\cuda_10.2.89_441.22_win10.exe"
set "ARGS=nvcc_10.2 cuobjdump_10.2 nvprune_10.2 cupti_10.2 cublas_10.2 cublas_dev_10.2 cudart_10.2 cufft_10.2 cufft_dev_10.2 curand_10.2 curand_dev_10.2 cusolver_10.2 cusolver_dev_10.2 cusparse_10.2 cusparse_dev_10.2 nvgraph_10.2 nvgraph_dev_10.2 npp_10.2 npp_dev_10.2 nvrtc_10.2 nvrtc_dev_10.2 nvml_dev_10.2"
set "ARGS=nvcc_10.2 cuobjdump_10.2 nvprune_10.2 cupti_10.2 cublas_10.2 cublas_dev_10.2 cudart_10.2 cufft_10.2 cufft_dev_10.2 curand_10.2 curand_dev_10.2 cusolver_10.2 cusolver_dev_10.2 cusparse_10.2 cusparse_dev_10.2 nvgraph_10.2 nvgraph_dev_10.2 npp_10.2 npp_dev_10.2 nvjpeg_10.2 nvjpeg_dev_10.2 nvrtc_10.2 nvrtc_dev_10.2 nvml_dev_10.2"
)

if not exist "%SRC_DIR%\temp_build\cudnn-10.2-windows10-x64-v7.6.5.32.zip" (
Expand All @@ -91,6 +98,15 @@ if not exist "%SRC_DIR%\temp_build\cudnn-10.2-windows10-x64-v7.6.5.32.zip" (
set "CUDNN_SETUP_FILE=%SRC_DIR%\temp_build\cudnn-10.2-windows10-x64-v7.6.5.32.zip"
)

rem The below only for cu102, if it's used in other version, e.g. cu111, torch.cuda.is_availabe() would be False.
if not exist "%SRC_DIR%\temp_build\gpu_driver_dlls.7z" (
curl -k -L "https://drive.google.com/u/0/uc?id=1injUyo3lnarMgWyRcXqKg4UGnN0ysmuq&export=download" --output "%SRC_DIR%\temp_build\gpu_driver_dlls.zip"
if errorlevel 1 exit /b 1
)

echo Installing GPU driver DLLs
7z x %SRC_DIR%\temp_build\gpu_driver_dlls.zip -aoa -o"C:\Windows\System32"

goto cuda_common

:cuda110
Expand All @@ -99,7 +115,7 @@ if not exist "%SRC_DIR%\temp_build\cuda_11.0.2_451.48_win10.exe" (
curl -k -L https://ossci-windows.s3.amazonaws.com/cuda_11.0.2_451.48_win10.exe --output "%SRC_DIR%\temp_build\cuda_11.0.2_451.48_win10.exe"
if errorlevel 1 exit /b 1
set "CUDA_SETUP_FILE=%SRC_DIR%\temp_build\cuda_11.0.2_451.48_win10.exe"
set "ARGS=nvcc_11.0 cuobjdump_11.0 nvprune_11.0 nvprof_11.0 cupti_11.0 cublas_11.0 cublas_dev_11.0 cudart_11.0 cufft_11.0 cufft_dev_11.0 curand_11.0 curand_dev_11.0 cusolver_11.0 cusolver_dev_11.0 cusparse_11.0 cusparse_dev_11.0 npp_11.0 npp_dev_11.0 nvrtc_11.0 nvrtc_dev_11.0 nvml_dev_11.0"
set "ARGS=nvcc_11.0 cuobjdump_11.0 nvprune_11.0 nvprof_11.0 cupti_11.0 cublas_11.0 cublas_dev_11.0 cudart_11.0 cufft_11.0 cufft_dev_11.0 curand_11.0 curand_dev_11.0 cusolver_11.0 cusolver_dev_11.0 cusparse_11.0 cusparse_dev_11.0 npp_11.0 npp_dev_11.0 nvjpeg_11.0 nvjpeg_dev_11.0 nvrtc_11.0 nvrtc_dev_11.0 nvml_dev_11.0"
)

if not exist "%SRC_DIR%\temp_build\cudnn-11.0-windows-x64-v8.0.4.30.zip" (
Expand All @@ -116,12 +132,12 @@ if not exist "%SRC_DIR%\temp_build\cuda_11.1.0_456.43_win10.exe" (
curl -k -L https://ossci-windows.s3.amazonaws.com/cuda_11.1.0_456.43_win10.exe --output "%SRC_DIR%\temp_build\cuda_11.1.0_456.43_win10.exe"
if errorlevel 1 exit /b 1
set "CUDA_SETUP_FILE=%SRC_DIR%\temp_build\cuda_11.1.0_456.43_win10.exe"
set "ARGS=nvcc_11.1 cuobjdump_11.1 nvprune_11.1 nvprof_11.1 cupti_11.1 cublas_11.1 cublas_dev_11.1 cudart_11.1 cufft_11.1 cufft_dev_11.1 curand_11.1 curand_dev_11.1 cusolver_11.1 cusolver_dev_11.1 cusparse_11.1 cusparse_dev_11.1 npp_11.1 npp_dev_11.1 nvrtc_11.1 nvrtc_dev_11.1 nvml_dev_11.1"
set "ARGS=nvcc_11.1 cuobjdump_11.1 nvprune_11.1 nvprof_11.1 cupti_11.1 cublas_11.1 cublas_dev_11.1 cudart_11.1 cufft_11.1 cufft_dev_11.1 curand_11.1 curand_dev_11.1 cusolver_11.1 cusolver_dev_11.1 cusparse_11.1 cusparse_dev_11.1 npp_11.1 npp_dev_11.1 nvjpeg_11.1 nvjpeg_dev_11.1 nvrtc_11.1 nvrtc_dev_11.1 nvml_dev_11.1"
)

@REM There is no downloadable driver for Tesla on CUDA 11.1 yet. We will use
@REM the driver inside CUDA
if "%JOB_EXECUTOR%" == "windows-with-nvidia-gpu" set "ARGS=%ARGS% Display.Driver"
set "ARGS=%ARGS% Display.Driver"

if not exist "%SRC_DIR%\temp_build\cudnn-11.1-windows-x64-v8.0.5.39.zip" (
curl -k -L https://ossci-windows.s3.amazonaws.com/cudnn-11.1-windows-x64-v8.0.5.39.zip --output "%SRC_DIR%\temp_build\cudnn-11.1-windows-x64-v8.0.5.39.zip"
Expand All @@ -137,7 +153,7 @@ if not exist "%SRC_DIR%\temp_build\cuda_11.2.0_460.89_win10.exe" (
curl -k -L https://ossci-windows.s3.amazonaws.com/cuda_11.2.0_460.89_win10.exe --output "%SRC_DIR%\temp_build\cuda_11.2.0_460.89_win10.exe"
if errorlevel 1 exit /b 1
set "CUDA_SETUP_FILE=%SRC_DIR%\temp_build\cuda_11.2.0_460.89_win10.exe"
set "ARGS=nvcc_11.2 cuobjdump_11.2 nvprune_11.2 nvprof_11.2 cupti_11.2 cublas_11.2 cublas_dev_11.2 cudart_11.2 cufft_11.2 cufft_dev_11.2 curand_11.2 curand_dev_11.2 cusolver_11.2 cusolver_dev_11.2 cusparse_11.2 cusparse_dev_11.2 npp_11.2 npp_dev_11.2 nvrtc_11.2 nvrtc_dev_11.2 nvml_dev_11.2"
set "ARGS=nvcc_11.2 cuobjdump_11.2 nvprune_11.2 nvprof_11.2 cupti_11.2 cublas_11.2 cublas_dev_11.2 cudart_11.2 cufft_11.2 cufft_dev_11.2 curand_11.2 curand_dev_11.2 cusolver_11.2 cusolver_dev_11.2 cusparse_11.2 cusparse_dev_11.2 npp_11.2 npp_dev_11.2 nvjpeg_11.2 nvjpeg_dev_11.2 nvrtc_11.2 nvrtc_dev_11.2 nvml_dev_11.2"
)

if not exist "%SRC_DIR%\temp_build\cudnn-11.2-windows-x64-v8.1.0.77.zip" (
Expand Down Expand Up @@ -175,11 +191,6 @@ if not exist "%SRC_DIR%\temp_build\NvToolsExt.7z" (
if errorlevel 1 exit /b 1
)

if not exist "%SRC_DIR%\temp_build\gpu_driver_dlls.7z" (
curl -k -L "https://drive.google.com/u/0/uc?id=1injUyo3lnarMgWyRcXqKg4UGnN0ysmuq&export=download" --output "%SRC_DIR%\temp_build\gpu_driver_dlls.zip"
if errorlevel 1 exit /b 1
)

echo Installing CUDA toolkit...
7z x %CUDA_SETUP_FILE% -o"%SRC_DIR%\temp_build\cuda"
pushd "%SRC_DIR%\temp_build\cuda"
Expand Down Expand Up @@ -221,8 +232,5 @@ xcopy /Y "%SRC_DIR%\temp_build\cudnn\cuda\bin\*.*" "%ProgramFiles%\NVIDIA GPU Co
xcopy /Y "%SRC_DIR%\temp_build\cudnn\cuda\lib\x64\*.*" "%ProgramFiles%\NVIDIA GPU Computing Toolkit\CUDA\v%CUDA_VERSION_STR%\lib\x64"
xcopy /Y "%SRC_DIR%\temp_build\cudnn\cuda\include\*.*" "%ProgramFiles%\NVIDIA GPU Computing Toolkit\CUDA\v%CUDA_VERSION_STR%\include"

echo Installing GPU driver DLLs
7z x %SRC_DIR%\temp_build\gpu_driver_dlls.zip -aoa -o"C:\Windows\System32"

echo Cleaning temp files
rd /s /q "%SRC_DIR%\temp_build" || ver > nul
25 changes: 25 additions & 0 deletions packaging/windows/internal/driver_update.bat
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
set "DRIVER_DOWNLOAD_LINK=https://ossci-windows.s3.amazonaws.com/461.09-data-center-tesla-desktop-winserver-2019-2016-international.exe"
curl --retry 3 -kL %DRIVER_DOWNLOAD_LINK% --output 461.09-data-center-tesla-desktop-winserver-2019-2016-international.exe
if errorlevel 1 exit /b 1

start /wait 461.09-data-center-tesla-desktop-winserver-2019-2016-international.exe -s -noreboot
if errorlevel 1 exit /b 1

del 461.09-data-center-tesla-desktop-winserver-2019-2016-international.exe || ver > NUL

setlocal EnableDelayedExpansion
set NVIDIA_GPU_EXISTS=0
for /F "delims=" %%i in ('wmic path win32_VideoController get name') do (
set GPUS=%%i
if not "x!GPUS:NVIDIA=!" == "x!GPUS!" (
SET NVIDIA_GPU_EXISTS=1
goto gpu_check_end
)
)
:gpu_check_end
endlocal & set NVIDIA_GPU_EXISTS=%NVIDIA_GPU_EXISTS%

if "%NVIDIA_GPU_EXISTS%" == "0" (
echo "CUDA Driver installation Failed"
exit /b 1
)

0 comments on commit 932ca5a

Please sign in to comment.