amdadvtech · takahiroharada · Feb 6, 2023 · Feb 4, 2023 · Feb 4, 2023 · Feb 6, 2023
diff --git a/ParallelPrimitives/RadixSortKernels.h b/ParallelPrimitives/RadixSortKernels.h
@@ -13,9 +13,6 @@ typedef unsigned long long u64;
 
 extern "C" __global__ void CountKernelReference( int* gSrc, int* gDst, int gN, int gNItemsPerWI, const int START_BIT, const int N_WGS_EXECUTED )
 {
-
-	const int gIdx = blockIdx.x * blockDim.x + threadIdx.x;
-
 	const int offset = blockIdx.x * blockDim.x * gNItemsPerWI;
 
 	int table[BIN_SIZE] = { 0 };
@@ -552,7 +549,6 @@ __device__ void SortImpl( int* gSrcKey, int* gSrcVal, int* gDstKey, int* gDstVal
 
 		for( int i = 0; i < upperBound; ++i )
 		{
-			const int idx = offset + threadIdx.x * SORT_N_ITEMS_PER_WI + i;
 			const int tableIdx = ( keys[i] >> START_BIT ) & RADIX_MASK;
 			const int dstIdx = localOffsets[tableIdx] + ( threadIdx.x * SORT_N_ITEMS_PER_WI + i ) - lds.histogram[0][tableIdx];
 			gDstKey[dstIdx] = keys[i];

diff --git a/bitcodes/generate_bitcode.sh b/bitcodes/generate_bitcode.sh
diff --git a/bitcodes/generate_bitcode_nvidia.sh b/bitcodes/generate_bitcode_nvidia.sh
diff --git a/bitcodes/oro_compiled_kernels.cpp b/bitcodes/oro_compiled_kernels.cpp
diff --git a/bitcodes/oro_compiled_kernels.fatbin b/bitcodes/oro_compiled_kernels.fatbin
diff --git a/bitcodes/oro_compiled_kernels.hipfb b/bitcodes/oro_compiled_kernels.hipfb
diff --git a/scripts/amdGpuList.json b/scripts/amdGpuList.json
@@ -0,0 +1,30 @@
+{
+	"amd": [
+		"gfx1100",
+		"gfx1101",
+		"gfx1102",
+		"gfx1103",
+
+		"gfx1030",
+		"gfx1031",
+		"gfx1032",
+		"gfx1033",
+		"gfx1034",
+		"gfx1035",
+		"gfx1036",
+
+		"gfx1010",
+		"gfx1011",
+		"gfx1012",
+		"gfx1013",
+
+		"gfx900",
+		"gfx902",
+		"gfx904",
+		"gfx906",
+		"gfx908",
+		"gfx909",
+		"gfx90a",
+		"gfx90c"
+	]
+}
diff --git a/scripts/kernelCompile.py b/scripts/kernelCompile.py
@@ -0,0 +1,43 @@
+import json
+import os
+import subprocess
+
+
+def getGpuList():
+	"""Load the GPU architecture table from amdGpuList.json.
+
+	The file name is relative, so it is resolved against the current working
+	directory of the process.
+
+	Returns the object produced by json.load for that document.
+	"""
+	# The with-block closes the file even when json.load raises on malformed JSON.
+	with open("amdGpuList.json") as f:
+		return json.load(f)
+
+# Popen handles of the compiler processes started by compile().
+ps = []
+def compile( index ):
+	"""Start one kernel compiler as a child process without waiting for it.
+
+	index == 0 builds a hipcc command line with one --offload-arch flag per entry
+	under the 'amd' key of getGpuList(), writing ../bitcodes/oro_compiled_kernels.hipfb.
+	Any other index builds an nvcc command line writing
+	../bitcodes/oro_compiled_kernels.fatbin.
+
+	The command line is printed, the process is started, and its Popen handle is
+	appended to the module-level list ps. All paths are relative to the current
+	working directory. Note that this function shadows the built-in compile().
+	"""
+	# Built with os.path.join so the separators match the host OS; spelling the
+	# path as a plain literal with single backslashes would rely on invalid
+	# escape sequences (\P, \R).
+	kernelSource = os.path.join( "..", "ParallelPrimitives", "RadixSortKernels.h" )
+
+	if index == 0 :
+		command = [
+			os.path.join( "..", "..", "hipsdk", "bin", "hipcc" ),
+			"-x", "hip", kernelSource, "-O3", "-std=c++17", "-ffast-math", "--cuda-device-only", "--genco", "-I../", "-include", "hip/hip_runtime.h", "-parallel-jobs=15"]
+		# Single-architecture alternative to the loop below:
+		#command.append( "--offload-arch=gfx1100" )
+		for i in getGpuList()['amd']:
+			command.append( "--offload-arch=" + i )
+		command.append( "-o" )
+		command.append( "../bitcodes/oro_compiled_kernels.hipfb" )
+	else:
+		command = [
+			'nvcc', '-x','cu',kernelSource,'-O3', '-std=c++17', '--use_fast_math', '-fatbin', '-arch=all',
+			'-I../', '-include', 'cuda_runtime.h' ]
+		command.append( '-o' )
+		command.append('../bitcodes/oro_compiled_kernels.fatbin')
+
+	print( " ".join( command ) )
+
+	# On Windows (os.name == 'nt') the command is run through the shell;
+	# elsewhere it is executed directly.
+	ps.append( subprocess.Popen( command, shell=( os.name == 'nt' ) ) )
+
+# Start both compilers before waiting on either, so they run side by side.
+compile( 0 )
+compile( 1 )
+
+for job, p in enumerate( ps ):
+	# Popen.wait() returns the child's exit code; report failures instead of
+	# discarding them so a broken build is visible in the output.
+	returnCode = p.wait()
+	if returnCode != 0:
+		print( "compile job " + str( job ) + " exited with code " + str( returnCode ) )
+
+print( "compile done." )