Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[ORO-0] Clean up. Added python script kernelCompile.py for compilation. #46

Merged
merged 3 commits into from
Feb 6, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 0 additions & 4 deletions ParallelPrimitives/RadixSortKernels.h
Original file line number Diff line number Diff line change
Expand Up @@ -13,9 +13,6 @@ typedef unsigned long long u64;

extern "C" __global__ void CountKernelReference( int* gSrc, int* gDst, int gN, int gNItemsPerWI, const int START_BIT, const int N_WGS_EXECUTED )
{

const int gIdx = blockIdx.x * blockDim.x + threadIdx.x;

const int offset = blockIdx.x * blockDim.x * gNItemsPerWI;

int table[BIN_SIZE] = { 0 };
Expand Down Expand Up @@ -552,7 +549,6 @@ __device__ void SortImpl( int* gSrcKey, int* gSrcVal, int* gDstKey, int* gDstVal

for( int i = 0; i < upperBound; ++i )
{
const int idx = offset + threadIdx.x * SORT_N_ITEMS_PER_WI + i;
const int tableIdx = ( keys[i] >> START_BIT ) & RADIX_MASK;
const int dstIdx = localOffsets[tableIdx] + ( threadIdx.x * SORT_N_ITEMS_PER_WI + i ) - lds.histogram[0][tableIdx];
gDstKey[dstIdx] = keys[i];
Expand Down
1 change: 0 additions & 1 deletion bitcodes/generate_bitcode.sh

This file was deleted.

1 change: 0 additions & 1 deletion bitcodes/generate_bitcode_nvidia.sh

This file was deleted.

7 changes: 0 additions & 7 deletions bitcodes/oro_compiled_kernels.cpp

This file was deleted.

Binary file modified bitcodes/oro_compiled_kernels.fatbin
Binary file not shown.
Binary file modified bitcodes/oro_compiled_kernels.hipfb
Binary file not shown.
30 changes: 30 additions & 0 deletions scripts/amdGpuList.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
{
"amd": [
"gfx1100",
"gfx1101",
"gfx1102",
"gfx1103",

"gfx1030",
"gfx1031",
"gfx1032",
"gfx1033",
"gfx1034",
"gfx1035",
"gfx1036",

"gfx1010",
"gfx1011",
"gfx1012",
"gfx1013",

"gfx900",
"gfx902",
"gfx904",
"gfx906",
"gfx908",
"gfx909",
"gfx90a",
"gfx90c"
]
}
43 changes: 43 additions & 0 deletions scripts/kernelCompile.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
import json
import os
import subprocess


def getGpuList():
    """Return the parsed contents of amdGpuList.json.

    The file is looked up relative to the current working directory.
    The parsed JSON value is returned as-is; the caller in this script
    reads the list of architecture names stored under the 'amd' key.

    Raises:
        OSError: if the file cannot be opened.
        json.JSONDecodeError: if the file is not valid JSON.
    """
    # The with-block closes the file even when json.load raises.
    with open( "amdGpuList.json", encoding="utf-8" ) as f:
        return json.load( f )

# Handles of the compiler processes started by compile(); the script waits on them afterwards.
ps = []
def compile( index ):
    """Start one kernel compilation in the background.

    index == 0 runs hipcc and writes ../bitcodes/oro_compiled_kernels.hipfb,
    passing one --offload-arch option per entry of getGpuList()['amd'].
    Any other index runs nvcc and writes ../bitcodes/oro_compiled_kernels.fatbin.

    The command line is printed, the process is started without waiting for
    it, and its Popen handle is appended to the module-level list ps.

    NOTE(review): every path is relative, so this presumably has to be run
    from the scripts/ directory - confirm against the build instructions.
    """
    # Forward slashes avoid the invalid "\P" / "\R" escape sequences of the
    # previous literal and are also valid on the non-Windows code path.
    source = "../ParallelPrimitives/RadixSortKernels.h"

    if index == 0:
        command = [
            "..\\..\\hipsdk\\bin\\hipcc",
            "-x", "hip", source, "-O3", "-std=c++17", "-ffast-math", "--cuda-device-only", "--genco", "-I../", "-include", "hip/hip_runtime.h", "-parallel-jobs=15" ]
        # One --offload-arch per listed GPU (a single target would be e.g. "--offload-arch=gfx1100").
        command.extend( "--offload-arch=" + arch for arch in getGpuList()['amd'] )
        command += [ "-o", "../bitcodes/oro_compiled_kernels.hipfb" ]
    else:
        command = [
            'nvcc', '-x', 'cu', source, '-O3', '-std=c++17', '--use_fast_math', '-fatbin', '-arch=all',
            '-I../', '-include', 'cuda_runtime.h' ]
        command += [ '-o', '../bitcodes/oro_compiled_kernels.fatbin' ]

    print( " ".join( command ) )

    # As before: the command goes through the shell on Windows only.
    ps.append( subprocess.Popen( command, shell=( os.name == 'nt' ) ) )

# Launch both compilations first so they run concurrently.
compile( 0 )
compile( 1 )

# Wait for every compiler process and keep the ones that exited with a non-zero code,
# so a failed build is not silently reported as a success.
failed = [ p for p in ps if p.wait() != 0 ]
for p in failed:
    print( "compile failed (exit code " + str( p.returncode ) + "): " + " ".join( p.args ) )

print( "compile done." )