Merge pull request #50 from NVlabs/sm_37
Add support for sm_37 architecture
Tom94 authored Jan 18, 2022
2 parents 3cf5fba + efb45de commit 6185718
Showing 5 changed files with 37 additions and 20 deletions.
11 changes: 9 additions & 2 deletions .github/workflows/main.yml
@@ -26,7 +26,10 @@ jobs:
       arch: 70
     - os: ubuntu-18.04
       cuda: "10.2"
-      arch: 60
+      arch: 61
+    - os: ubuntu-18.04
+      cuda: "10.2"
+      arch: 37
   env:
     build_dir: "build"
     config: "Release"
@@ -70,7 +73,11 @@ jobs:
     - os: windows-2019
       visual_studio: "Visual Studio 16 2019"
       cuda: "11.5.1"
-      arch: 60
+      arch: 61
+    - os: windows-2019
+      visual_studio: "Visual Studio 16 2019"
+      cuda: "11.5.1"
+      arch: 37
   env:
     build_dir: "build"
     config: "Release"
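The arch values in the CI matrix are CUDA compute-capability targets: 37 is Kepler (e.g. the Tesla K80 this commit adds support for), 61 is Pascal (GTX 10-series), 70 is Volta. A small probe, illustrative only and not part of this commit, showing how the number chosen at build time surfaces as __CUDA_ARCH__ inside device code:

// arch_probe.cu -- illustrative sketch, not part of this commit.
// Build for one of the matrix targets, e.g.:
//   nvcc -gencode arch=compute_37,code=sm_37 arch_probe.cu
#include <cuda_runtime.h>
#include <cstdio>

__global__ void report_arch() {
#ifdef __CUDA_ARCH__
	// __CUDA_ARCH__ is only defined while compiling device code:
	// 370 for sm_37 (Kepler), 610 for sm_61 (Pascal), 700 for sm_70 (Volta).
	if (threadIdx.x == 0 && blockIdx.x == 0) {
		printf("Compiled for __CUDA_ARCH__ = %d\n", __CUDA_ARCH__);
	}
#endif
}

int main() {
	report_arch<<<1, 1>>>();
	cudaDeviceSynchronize();
	return 0;
}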
15 changes: 9 additions & 6 deletions include/neural-graphics-primitives/common_device.cuh
@@ -122,16 +122,19 @@ __device__ void deposit_image_gradient(const Eigen::Matrix<float, N_DIMS, 1>& va
 	pos.x() = std::max(std::min(pos.x(), resolution.x()-1), 0);
 	pos.y() = std::max(std::min(pos.y(), resolution.y()-1), 0);
 
-	if (std::is_same<T, float>::value) {
-		for (uint32_t c = 0; c < N_DIMS; ++c) {
-			atomicAdd(&gradient[(pos.x() + pos.y() * resolution.x()) * N_DIMS + c], (T)value[c] * weight);
-			atomicAdd(&gradient_weight[(pos.x() + pos.y() * resolution.x()) * N_DIMS + c], weight);
-		}
-	} else if (std::is_same<T, __half>::value) {
+#if TCNN_MIN_GPU_ARCH >= 60 // atomicAdd(__half2) is only supported with compute capability 60 and above
+	if (std::is_same<T, __half>::value) {
 		for (uint32_t c = 0; c < N_DIMS; c += 2) {
 			atomicAdd((__half2*)&gradient[(pos.x() + pos.y() * resolution.x()) * N_DIMS + c], {(T)value[c] * weight, (T)value[c+1] * weight});
 			atomicAdd((__half2*)&gradient_weight[(pos.x() + pos.y() * resolution.x()) * N_DIMS + c], {weight, weight});
 		}
+	} else
+#endif
+	{
+		for (uint32_t c = 0; c < N_DIMS; ++c) {
+			atomicAdd(&gradient[(pos.x() + pos.y() * resolution.x()) * N_DIMS + c], (T)value[c] * weight);
+			atomicAdd(&gradient_weight[(pos.x() + pos.y() * resolution.x()) * N_DIMS + c], weight);
+		}
 	}
 };
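The change above, and the analogous ones in the two files below, wraps the vectorized __half2 accumulation in a compile-time guard, because atomicAdd on __half2 only exists on compute capability 6.0 and newer; builds that must also run on sm_37 take the per-component path instead. A self-contained sketch of the same guard pattern follows; the kernel and buffer names are hypothetical, and MIN_GPU_ARCH stands in for tiny-cuda-nn's TCNN_MIN_GPU_ARCH (the lowest architecture the build targets):

// half2_deposit.cuh -- minimal sketch of the guard pattern, hypothetical names.
#include <cuda_fp16.h>
#include <cstdint>
#include <type_traits>

#ifndef MIN_GPU_ARCH
#define MIN_GPU_ARCH 37 // e.g. a build that must also run on a Tesla K80
#endif

// Atomically deposits value * weight into an N_DIMS-channel gradient buffer.
// Builds targeting architectures below 6.0 are expected to instantiate this
// with T = float, since the half-precision atomics are unavailable there.
template <typename T, uint32_t N_DIMS>
__device__ void deposit(T* __restrict__ gradient, const float* __restrict__ value, float weight, uint32_t pixel) {
#if MIN_GPU_ARCH >= 60 // atomicAdd(__half2) needs compute capability 6.0+
	if constexpr (std::is_same<T, __half>::value && N_DIMS % 2 == 0) {
		// Fast path: one atomic per pair of half-precision channels.
		// Assumes gradient is 4-byte aligned so the __half2 cast is valid.
		for (uint32_t c = 0; c < N_DIMS; c += 2) {
			atomicAdd((__half2*)&gradient[pixel * N_DIMS + c],
			          {(T)(value[c] * weight), (T)(value[c + 1] * weight)});
		}
	} else
#endif
	{
		// Fallback: one atomic per channel.
		for (uint32_t c = 0; c < N_DIMS; ++c) {
			atomicAdd(&gradient[pixel * N_DIMS + c], (T)(value[c] * weight));
		}
	}
}

The #if / } else / #endif shape keeps a single fallback block that serves both the non-half case and pre-Pascal builds, which is the structure all three diffs in this commit adopt.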
14 changes: 9 additions & 5 deletions include/neural-graphics-primitives/envmap.cuh
@@ -82,14 +82,18 @@ __device__ void deposit_envmap_gradient(const tcnn::vector_t<T, 4>& value, T* __
 	pos.y() = std::max(std::min(pos.y(), envmap_resolution.y()-1), 0);
 
 	Eigen::Array4f result;
-	if (std::is_same<T, float>::value) {
-		for (uint32_t c = 0; c < 4; ++c) {
-			atomicAdd(&envmap_gradient[(pos.x() + pos.y() * envmap_resolution.x()) * 4 + c], value[c] * weight);
-		}
-	} else if (std::is_same<T, __half>::value) {
+
+#if TCNN_MIN_GPU_ARCH >= 60 // atomicAdd(__half2) is only supported with compute capability 60 and above
+	if (std::is_same<T, __half>::value) {
 		for (uint32_t c = 0; c < 4; c += 2) {
 			atomicAdd((__half2*)&envmap_gradient[(pos.x() + pos.y() * envmap_resolution.x()) * 4 + c], {value[c] * weight, value[c+1] * weight});
 		}
+	} else
+#endif
+	{
+		for (uint32_t c = 0; c < 4; ++c) {
+			atomicAdd(&envmap_gradient[(pos.x() + pos.y() * envmap_resolution.x()) * 4 + c], value[c] * weight);
+		}
 	}
 };
15 changes: 9 additions & 6 deletions include/neural-graphics-primitives/takikawa_encoding.cuh
@@ -240,17 +240,20 @@ __global__ void kernel_takikawa_backward(
 
 			int param_idx = node.vertices[idx] * N_FEATURES_PER_LEVEL;
 
-			if (N_FEATURES_PER_LEVEL == 1 || !std::is_same<T, __half>::value) {
-				#pragma unroll
-				for (uint32_t f = 0; f < N_FEATURES_PER_LEVEL; ++f) {
-					atomicAdd(&grid_gradient[param_idx], (T)((float)grad[f] * weight));
-				}
-			} else {
+#if TCNN_MIN_GPU_ARCH >= 60 // atomicAdd(__half2) is only supported with compute capability 60 and above
+			if (N_FEATURES_PER_LEVEL > 1 && std::is_same<T, __half>::value) {
 				#pragma unroll
 				for (uint32_t feature = 0; feature < N_FEATURES_PER_LEVEL; feature += 2) {
 					__half2 v = {(__half)((float)grad[feature] * weight), (__half)((float)grad[feature+1] * weight)};
 					atomicAdd((__half2*)&grid_gradient[param_idx + feature], v);
 				}
+			} else
+#endif
+			{
+				#pragma unroll
+				for (uint32_t f = 0; f < N_FEATURES_PER_LEVEL; ++f) {
+					atomicAdd(&grid_gradient[param_idx], (T)((float)grad[f] * weight));
+				}
 			}
 		}
 	}
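To check at runtime which of the two paths a given GPU can actually take, the device's compute capability can be queried from the CUDA runtime; a small standalone sketch (illustrative, not from this repository):

// cc_check.cu -- illustrative sketch, not part of this commit.
// Reports whether the active device reaches the compute capability 6.0
// required for the vectorized __half2 atomicAdd path.
#include <cuda_runtime.h>
#include <cstdio>

int main() {
	int device = 0;
	cudaDeviceProp prop{};
	if (cudaGetDeviceProperties(&prop, device) != cudaSuccess) {
		std::fprintf(stderr, "failed to query device %d\n", device);
		return 1;
	}

	const int cc = prop.major * 10 + prop.minor; // 37 on a Tesla K80, 61 on a GTX 1080, ...
	std::printf("%s: compute capability %d.%d\n", prop.name, prop.major, prop.minor);
	std::printf("__half2 atomicAdd available: %s\n", cc >= 60 ? "yes" : "no");
	return 0;
}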
