From efb45deab1e4c5cf0043832115de0216a9a8898e Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Thomas=20M=C3=BCller?=
Date: Tue, 18 Jan 2022 10:40:27 +0100
Subject: [PATCH] Add support for sm_37 architecture

---
 .github/workflows/main.yml                        | 11 +++++++++--
 dependencies/tiny-cuda-nn                         |  2 +-
 .../neural-graphics-primitives/common_device.cuh  | 15 +++++++++------
 include/neural-graphics-primitives/envmap.cuh     | 14 +++++++++-----
 .../takikawa_encoding.cuh                         | 15 +++++++++------
 5 files changed, 37 insertions(+), 20 deletions(-)

diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml
index 7cd6e20a1..35b134c46 100644
--- a/.github/workflows/main.yml
+++ b/.github/workflows/main.yml
@@ -26,7 +26,10 @@ jobs:
             arch: 70
           - os: ubuntu-18.04
             cuda: "10.2"
-            arch: 60
+            arch: 61
+          - os: ubuntu-18.04
+            cuda: "10.2"
+            arch: 37
     env:
       build_dir: "build"
       config: "Release"
@@ -70,7 +73,11 @@ jobs:
           - os: windows-2019
             visual_studio: "Visual Studio 16 2019"
             cuda: "11.5.1"
-            arch: 60
+            arch: 61
+          - os: windows-2019
+            visual_studio: "Visual Studio 16 2019"
+            cuda: "11.5.1"
+            arch: 37
     env:
       build_dir: "build"
       config: "Release"

diff --git a/dependencies/tiny-cuda-nn b/dependencies/tiny-cuda-nn
index 6041eadf3..3eb4cd571 160000
--- a/dependencies/tiny-cuda-nn
+++ b/dependencies/tiny-cuda-nn
@@ -1 +1 @@
-Subproject commit 6041eadf3fe5f2d7bb8e53b8328c7325a047ac42
+Subproject commit 3eb4cd571f9a2e1276c556e9d8b8bc34d44b84d3

diff --git a/include/neural-graphics-primitives/common_device.cuh b/include/neural-graphics-primitives/common_device.cuh
index 022b2b48a..b210075c9 100644
--- a/include/neural-graphics-primitives/common_device.cuh
+++ b/include/neural-graphics-primitives/common_device.cuh
@@ -122,16 +122,19 @@ __device__ void deposit_image_gradient(const Eigen::Matrix<float, N_DIMS, 1>& va
 	pos.x() = std::max(std::min(pos.x(), resolution.x()-1), 0);
 	pos.y() = std::max(std::min(pos.y(), resolution.y()-1), 0);
 
-	if (std::is_same<T, float>::value) {
-		for (uint32_t c = 0; c < N_DIMS; ++c) {
-			atomicAdd(&gradient[(pos.x() + pos.y() * resolution.x()) * N_DIMS + c], (T)value[c] * weight);
-			atomicAdd(&gradient_weight[(pos.x() + pos.y() * resolution.x()) * N_DIMS + c], weight);
-		}
-	} else if (std::is_same<T, __half>::value) {
+#if TCNN_MIN_GPU_ARCH >= 60 // atomicAdd(__half2) is only supported with compute capability 60 and above
+	if (std::is_same<T, __half>::value) {
 		for (uint32_t c = 0; c < N_DIMS; c += 2) {
 			atomicAdd((__half2*)&gradient[(pos.x() + pos.y() * resolution.x()) * N_DIMS + c], {(T)value[c] * weight, (T)value[c+1] * weight});
 			atomicAdd((__half2*)&gradient_weight[(pos.x() + pos.y() * resolution.x()) * N_DIMS + c], {weight, weight});
 		}
+	} else
+#endif
+	{
+		for (uint32_t c = 0; c < N_DIMS; ++c) {
+			atomicAdd(&gradient[(pos.x() + pos.y() * resolution.x()) * N_DIMS + c], (T)value[c] * weight);
+			atomicAdd(&gradient_weight[(pos.x() + pos.y() * resolution.x()) * N_DIMS + c], weight);
+		}
 	}
 };
 
diff --git a/include/neural-graphics-primitives/envmap.cuh b/include/neural-graphics-primitives/envmap.cuh
index 4185f1dd5..273f66100 100644
--- a/include/neural-graphics-primitives/envmap.cuh
+++ b/include/neural-graphics-primitives/envmap.cuh
@@ -82,14 +82,18 @@ __device__ void deposit_envmap_gradient(const tcnn::vector_t<T, 4>& value, T* __
 	pos.y() = std::max(std::min(pos.y(), envmap_resolution.y()-1), 0);
 
 	Eigen::Array4f result;
-	if (std::is_same<T, float>::value) {
-		for (uint32_t c = 0; c < 4; ++c) {
-			atomicAdd(&envmap_gradient[(pos.x() + pos.y() * envmap_resolution.x()) * 4 + c], value[c] * weight);
-		}
-	} else if (std::is_same<T, __half>::value) {
+
+#if TCNN_MIN_GPU_ARCH >= 60 // atomicAdd(__half2) is only supported with compute capability 60 and above
+	if (std::is_same<T, __half>::value) {
 		for (uint32_t c = 0; c < 4; c += 2) {
 			atomicAdd((__half2*)&envmap_gradient[(pos.x() + pos.y() * envmap_resolution.x()) * 4 + c], {value[c] * weight, value[c+1] * weight});
 		}
+	} else
+#endif
+	{
+		for (uint32_t c = 0; c < 4; ++c) {
+			atomicAdd(&envmap_gradient[(pos.x() + pos.y() * envmap_resolution.x()) * 4 + c], value[c] * weight);
+		}
 	}
 };
 
diff --git a/include/neural-graphics-primitives/takikawa_encoding.cuh b/include/neural-graphics-primitives/takikawa_encoding.cuh
index ad0cb9634..555dcc440 100644
--- a/include/neural-graphics-primitives/takikawa_encoding.cuh
+++ b/include/neural-graphics-primitives/takikawa_encoding.cuh
@@ -240,17 +240,20 @@ __global__ void kernel_takikawa_backward(
 
 			int param_idx = node.vertices[idx] * N_FEATURES_PER_LEVEL;
 
-			if (N_FEATURES_PER_LEVEL == 1 || !std::is_same<T, __half>::value) {
-				#pragma unroll
-				for (uint32_t f = 0; f < N_FEATURES_PER_LEVEL; ++f) {
-					atomicAdd(&grid_gradient[param_idx], (T)((float)grad[f] * weight));
-				}
-			} else {
+#if TCNN_MIN_GPU_ARCH >= 60 // atomicAdd(__half2) is only supported with compute capability 60 and above
+			if (N_FEATURES_PER_LEVEL > 1 && std::is_same<T, __half>::value) {
 				#pragma unroll
 				for (uint32_t feature = 0; feature < N_FEATURES_PER_LEVEL; feature += 2) {
 					__half2 v = {(__half)((float)grad[feature] * weight), (__half)((float)grad[feature+1] * weight)};
 					atomicAdd((__half2*)&grid_gradient[param_idx + feature], v);
 				}
+			} else
+#endif
+			{
+				#pragma unroll
+				for (uint32_t f = 0; f < N_FEATURES_PER_LEVEL; ++f) {
+					atomicAdd(&grid_gradient[param_idx], (T)((float)grad[f] * weight));
+				}
 			}
 		}
 	}
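
Note (not part of the patch): all three hunks apply the same guard pattern, so a condensed standalone sketch of it follows for reference. The names MIN_GPU_ARCH and deposit_two_gradients are illustrative stand-ins rather than identifiers from this repository; the real code uses tiny-cuda-nn's TCNN_MIN_GPU_ARCH and the deposit helpers above. The sketch assumes what the patch relies on: vectorized atomicAdd on __half2 exists only from compute capability 6.0 upward, while scalar float atomics work on every targeted architecture, so the __half overload is compiled only when the minimum architecture allows it.

#include <cuda_fp16.h>

// Stand-in for a per-build macro such as tiny-cuda-nn's TCNN_MIN_GPU_ARCH:
// the lowest compute capability the binary must run on.
#ifndef MIN_GPU_ARCH
#define MIN_GPU_ARCH 60
#endif

// Scalar fallback: float atomics exist on every targeted architecture,
// including sm_37, so this overload is always available.
__device__ inline void deposit_two_gradients(float* dst, float a, float b) {
	atomicAdd(dst + 0, a);
	atomicAdd(dst + 1, b);
}

#if MIN_GPU_ARCH >= 60 // atomicAdd(__half2) requires compute capability 6.0+
// Fast path: two packed half values deposited with one vectorized atomic.
// dst is assumed to sit at an even half index so the __half2 access is
// 4-byte aligned, matching the paired (c += 2) loops in the patch.
__device__ inline void deposit_two_gradients(__half* dst, float a, float b) {
	atomicAdd((__half2*)dst, __halves2half2(__float2half(a), __float2half(b)));
}
#endif

Built with, for example, -DMIN_GPU_ARCH=37 for a Kepler-only target, only the scalar overload is compiled, which mirrors how the new sm_37 CI entries avoid the __half2 code path.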