-
Notifications
You must be signed in to change notification settings - Fork 6
Reduce device RTL memory footprint #139
base: amd-stg-openmp
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -1449,10 +1449,35 @@ void CGOpenMPRuntimeNVPTX::emitNonSPMDEntryHeader(CodeGenFunction &CGF, | |
CGF.EmitRuntimeCall( | ||
createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_kernel_init), Args); | ||
|
||
StringRef DataSharingMemorySlotName = "openmp.data.sharing.memory.slot"; | ||
size_t WarpSlotSize = | ||
CGF.getTarget().getGridValue(llvm::omp::GV_Warp_Slot_Size); | ||
size_t DataSharingMemorySlotSize = WarpSlotSize * 64; | ||
|
||
// creating a global array which will be used for data sharing slots | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Why construct this in codegen instead of as an array in the deviceRTL? There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Because the deviceRTL would not know the number of kernels present in the device image. |
||
// This will be optimized in clang-build-select-link | ||
llvm::Type *Ty = | ||
llvm::ArrayType::get(CGF.CGM.Int8Ty, | ||
DataSharingMemorySlotSize); | ||
llvm::GlobalVariable *DataSharingMemorySlot = new llvm::GlobalVariable( | ||
CGF.CGM.getModule(), Ty, | ||
false, llvm::GlobalValue::ExternalLinkage, nullptr, | ||
DataSharingMemorySlotName, | ||
nullptr, llvm::GlobalValue::NotThreadLocal, | ||
CGF.CGM.getContext().getTargetAddressSpace(LangAS::cuda_device)); | ||
|
||
DataSharingMemorySlot->setExternallyInitialized(true); | ||
DataSharingMemorySlot->setUnnamedAddr(llvm::GlobalValue::UnnamedAddr::Local); | ||
DataSharingMemorySlot->setInitializer(llvm::UndefValue::get(Ty)); | ||
llvm::Value *DataSharingMemoryAddr = Bld.CreatePointerBitCastOrAddrSpaceCast( | ||
DataSharingMemorySlot, CGF.CGM.Int8PtrTy); | ||
|
||
llvm::Value *ArgsForInitStack[] = { DataSharingMemoryAddr, | ||
CGF.Builder.getInt64(DataSharingMemorySlotSize) }; | ||
// For data sharing, we need to initialize the stack. | ||
CGF.EmitRuntimeCall( | ||
createNVPTXRuntimeFunction( | ||
OMPRTL_NVPTX__kmpc_data_sharing_init_stack)); | ||
OMPRTL_NVPTX__kmpc_data_sharing_init_stack), ArgsForInitStack); | ||
|
||
emitGenericVarsProlog(CGF, WST.Loc); | ||
} | ||
|
@@ -1567,8 +1592,33 @@ void CGOpenMPRuntimeNVPTX::emitSPMDEntryHeader( | |
|
||
if (RequiresFullRuntime) { | ||
// For data sharing, we need to initialize the stack. | ||
StringRef DataSharingMemorySlotName = "openmp.data.sharing.memory.slot.spmd"; | ||
size_t WarpSlotSize = | ||
CGF.getTarget().getGridValue(llvm::omp::GV_Warp_Slot_Size); | ||
size_t DataSharingMemorySlotSize = WarpSlotSize * 64; | ||
|
||
// creating a global array which will be used for data sharing slots | ||
// This will be optimized in clang-build-select-link | ||
llvm::Type *Ty = | ||
llvm::ArrayType::get(CGF.CGM.Int8Ty, | ||
DataSharingMemorySlotSize); | ||
llvm::GlobalVariable *DataSharingMemorySlot = new llvm::GlobalVariable( | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Is this the same array as above? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yes |
||
CGF.CGM.getModule(), Ty, | ||
false, llvm::GlobalValue::ExternalLinkage, nullptr, | ||
DataSharingMemorySlotName, | ||
nullptr, llvm::GlobalValue::NotThreadLocal, | ||
CGF.CGM.getContext().getTargetAddressSpace(LangAS::cuda_device)); | ||
|
||
DataSharingMemorySlot->setExternallyInitialized(true); | ||
DataSharingMemorySlot->setUnnamedAddr(llvm::GlobalValue::UnnamedAddr::Local); | ||
DataSharingMemorySlot->setInitializer(llvm::UndefValue::get(Ty)); | ||
llvm::Value *DataSharingMemoryAddr = Bld.CreatePointerBitCastOrAddrSpaceCast( | ||
DataSharingMemorySlot, CGF.CGM.Int8PtrTy); | ||
llvm::Value *ArgsForInitStack[] = { DataSharingMemoryAddr, | ||
CGF.Builder.getInt64(DataSharingMemorySlotSize) }; | ||
|
||
CGF.EmitRuntimeCall(createNVPTXRuntimeFunction( | ||
OMPRTL_NVPTX__kmpc_data_sharing_init_stack_spmd)); | ||
OMPRTL_NVPTX__kmpc_data_sharing_init_stack_spmd), ArgsForInitStack); | ||
} | ||
|
||
CGF.EmitBranch(ExecuteBB); | ||
|
@@ -2009,15 +2059,19 @@ CGOpenMPRuntimeNVPTX::createNVPTXRuntimeFunction(unsigned Function) { | |
} | ||
case OMPRTL_NVPTX__kmpc_data_sharing_init_stack: { | ||
/// Build void __kmpc_data_sharing_init_stack(); | ||
llvm::Type *TypeParams[] = {CGM.Int8PtrTy, | ||
CGM.Int64Ty}; | ||
auto *FnTy = | ||
llvm::FunctionType::get(CGM.VoidTy, llvm::None, /*isVarArg*/ false); | ||
llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg*/ false); | ||
RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_data_sharing_init_stack"); | ||
break; | ||
} | ||
case OMPRTL_NVPTX__kmpc_data_sharing_init_stack_spmd: { | ||
/// Build void __kmpc_data_sharing_init_stack_spmd(); | ||
llvm::Type *TypeParams[] = {CGM.Int8PtrTy, | ||
CGM.Int64Ty}; | ||
auto *FnTy = | ||
llvm::FunctionType::get(CGM.VoidTy, llvm::None, /*isVarArg*/ false); | ||
llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg*/ false); | ||
RTLFn = | ||
CGM.CreateRuntimeFunction(FnTy, "__kmpc_data_sharing_init_stack_spmd"); | ||
break; | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -433,8 +433,8 @@ EXTERN void __kmpc_kernel_prepare_parallel(void *WorkFn); | |
EXTERN bool __kmpc_kernel_parallel(void **WorkFn); | ||
EXTERN void __kmpc_kernel_end_parallel(); | ||
|
||
EXTERN void __kmpc_data_sharing_init_stack(); | ||
EXTERN void __kmpc_data_sharing_init_stack_spmd(); | ||
EXTERN void __kmpc_data_sharing_init_stack(char *Data, size_t size); | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. We probably can't modify these prototypes without also modifying nvptx, but more functions can be added. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Will do. These changes should work with nvptx as well, but I don't know how to test that. |
||
EXTERN void __kmpc_data_sharing_init_stack_spmd(char *Data, size_t size); | ||
EXTERN void *__kmpc_data_sharing_coalesced_push_stack(size_t size, | ||
int16_t UseSharedMemory); | ||
EXTERN void *__kmpc_data_sharing_push_stack(size_t size, int16_t UseSharedMemory); | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This looks like a constant - I thought the idea was to compute this per-kernel?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This is a slightly different approach from computing it per kernel. For now, I am allocating 1MB of memory globally and uniquely per kernel, so the memory footprint is proportional to the number of non-SPMD kernels. This size can be adjusted later, during the clang-build-select-link phase.