Skip to content
This repository has been archived by the owner on Jan 26, 2024. It is now read-only.

Reduce device RTL memory footprint #139

Open
wants to merge 1 commit into
base: amd-stg-openmp
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
62 changes: 58 additions & 4 deletions clang/lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1449,10 +1449,35 @@ void CGOpenMPRuntimeNVPTX::emitNonSPMDEntryHeader(CodeGenFunction &CGF,
CGF.EmitRuntimeCall(
createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_kernel_init), Args);

StringRef DataSharingMemorySlotName = "openmp.data.sharing.memory.slot";
size_t WarpSlotSize =
CGF.getTarget().getGridValue(llvm::omp::GV_Warp_Slot_Size);
size_t DataSharingMemorySlotSize = WarpSlotSize * 64;
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This looks like a constant - I thought the idea was to compute this per-kernel?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is a bit of a different approach from computing it per kernel. For now, I am allocating 1MB of memory globally and uniquely per kernel, so the memory footprint is proportional to the number of non-SPMD kernels. This size can be adjusted later, during the clang-build-select-link phase.


// creating a global array which will be used for data sharing slots
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why construct this in codegen instead of as an array in the devicertl?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Because the deviceRTL would not know the number of kernels present in the device image.

// This will be optimized in clang-build-select-link
llvm::Type *Ty =
llvm::ArrayType::get(CGF.CGM.Int8Ty,
DataSharingMemorySlotSize);
llvm::GlobalVariable *DataSharingMemorySlot = new llvm::GlobalVariable(
CGF.CGM.getModule(), Ty,
false, llvm::GlobalValue::ExternalLinkage, nullptr,
DataSharingMemorySlotName,
nullptr, llvm::GlobalValue::NotThreadLocal,
CGF.CGM.getContext().getTargetAddressSpace(LangAS::cuda_device));

DataSharingMemorySlot->setExternallyInitialized(true);
DataSharingMemorySlot->setUnnamedAddr(llvm::GlobalValue::UnnamedAddr::Local);
DataSharingMemorySlot->setInitializer(llvm::UndefValue::get(Ty));
llvm::Value *DataSharingMemoryAddr = Bld.CreatePointerBitCastOrAddrSpaceCast(
DataSharingMemorySlot, CGF.CGM.Int8PtrTy);

llvm::Value *ArgsForInitStack[] = { DataSharingMemoryAddr,
CGF.Builder.getInt64(DataSharingMemorySlotSize) };
// For data sharing, we need to initialize the stack.
CGF.EmitRuntimeCall(
createNVPTXRuntimeFunction(
OMPRTL_NVPTX__kmpc_data_sharing_init_stack));
OMPRTL_NVPTX__kmpc_data_sharing_init_stack), ArgsForInitStack);

emitGenericVarsProlog(CGF, WST.Loc);
}
Expand Down Expand Up @@ -1567,8 +1592,33 @@ void CGOpenMPRuntimeNVPTX::emitSPMDEntryHeader(

if (RequiresFullRuntime) {
// For data sharing, we need to initialize the stack.
StringRef DataSharingMemorySlotName = "openmp.data.sharing.memory.slot.spmd";
size_t WarpSlotSize =
CGF.getTarget().getGridValue(llvm::omp::GV_Warp_Slot_Size);
size_t DataSharingMemorySlotSize = WarpSlotSize * 64;

// creating a global array which will be used for data sharing slots
// This will be optimized in clang-build-select-link
llvm::Type *Ty =
llvm::ArrayType::get(CGF.CGM.Int8Ty,
DataSharingMemorySlotSize);
llvm::GlobalVariable *DataSharingMemorySlot = new llvm::GlobalVariable(
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is this the same array as above?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes

CGF.CGM.getModule(), Ty,
false, llvm::GlobalValue::ExternalLinkage, nullptr,
DataSharingMemorySlotName,
nullptr, llvm::GlobalValue::NotThreadLocal,
CGF.CGM.getContext().getTargetAddressSpace(LangAS::cuda_device));

DataSharingMemorySlot->setExternallyInitialized(true);
DataSharingMemorySlot->setUnnamedAddr(llvm::GlobalValue::UnnamedAddr::Local);
DataSharingMemorySlot->setInitializer(llvm::UndefValue::get(Ty));
llvm::Value *DataSharingMemoryAddr = Bld.CreatePointerBitCastOrAddrSpaceCast(
DataSharingMemorySlot, CGF.CGM.Int8PtrTy);
llvm::Value *ArgsForInitStack[] = { DataSharingMemoryAddr,
CGF.Builder.getInt64(DataSharingMemorySlotSize) };

CGF.EmitRuntimeCall(createNVPTXRuntimeFunction(
OMPRTL_NVPTX__kmpc_data_sharing_init_stack_spmd));
OMPRTL_NVPTX__kmpc_data_sharing_init_stack_spmd), ArgsForInitStack);
}

CGF.EmitBranch(ExecuteBB);
Expand Down Expand Up @@ -2009,15 +2059,19 @@ CGOpenMPRuntimeNVPTX::createNVPTXRuntimeFunction(unsigned Function) {
}
case OMPRTL_NVPTX__kmpc_data_sharing_init_stack: {
/// Build void __kmpc_data_sharing_init_stack();
llvm::Type *TypeParams[] = {CGM.Int8PtrTy,
CGM.Int64Ty};
auto *FnTy =
llvm::FunctionType::get(CGM.VoidTy, llvm::None, /*isVarArg*/ false);
llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg*/ false);
RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_data_sharing_init_stack");
break;
}
case OMPRTL_NVPTX__kmpc_data_sharing_init_stack_spmd: {
/// Build void __kmpc_data_sharing_init_stack_spmd();
llvm::Type *TypeParams[] = {CGM.Int8PtrTy,
CGM.Int64Ty};
auto *FnTy =
llvm::FunctionType::get(CGM.VoidTy, llvm::None, /*isVarArg*/ false);
llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg*/ false);
RTLFn =
CGM.CreateRuntimeFunction(FnTy, "__kmpc_data_sharing_init_stack_spmd");
break;
Expand Down
12 changes: 6 additions & 6 deletions clang/test/OpenMP/nvptx_lambda_capturing.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -28,13 +28,13 @@

// CLASS: define internal void @__omp_offloading_{{.*}}_{{.*}}foo{{.*}}_l67_worker()
// CLASS: define weak void @__omp_offloading_{{.*}}_{{.*}}foo{{.*}}_l67([[S]]* {{%.+}}, [[CAP1]]* nonnull align 8 dereferenceable(8) {{%.+}})
// CLASS-NOT: getelementptr

// CLASS: br i1 %
// CLASS: call void @__omp_offloading_{{.*}}_{{.*}}foo{{.*}}_l67_worker()
// CLASS: br label %
// CLASS: br i1 %
// CLASS: call void @__kmpc_kernel_init(
// CLASS: call void @__kmpc_data_sharing_init_stack()
// CLASS: call void @__kmpc_data_sharing_init_stack
// CLASS: call void @llvm.memcpy.
// CLASS: [[L:%.+]] = load [[CAP1]]*, [[CAP1]]** [[L_ADDR:%.+]],
// CLASS: [[THIS_REF:%.+]] = getelementptr inbounds [[CAP1]], [[CAP1]]* [[L]], i32 0, i32 0
Expand All @@ -44,7 +44,7 @@
// CLASS: ret void

// CLASS: define weak void @__omp_offloading_{{.+}}foo{{.+}}_l69([[S]]* %{{.+}}, [[CAP1]]* nonnull align 8 dereferenceable(8) %{{.+}})
// CLASS-NOT: getelementptr

// CLASS: call void @llvm.memcpy.
// CLASS: [[L:%.+]] = load [[CAP1]]*, [[CAP1]]** [[L_ADDR:%.+]],
// CLASS: [[THIS_REF:%.+]] = getelementptr inbounds [[CAP1]], [[CAP1]]* [[L]], i32 0, i32 0
Expand Down Expand Up @@ -74,13 +74,13 @@ struct S {

// FUN: define internal void @__omp_offloading_{{.+}}_main_l124_worker()
// FUN: define weak void @__omp_offloading_{{.+}}_main_l124(i64 %{{.+}}, i32* nonnull align 4 dereferenceable(4) %{{.+}}, i32* nonnull align 4 dereferenceable(4) %{{.+}}, i32* %{{.+}}, i32* nonnull align 4 dereferenceable(4) %{{.+}}, [[CAP2]]* nonnull align 8 dereferenceable(40) %{{.+}})
// FUN-NOT: getelementptr

// FUN: br i1 %
// FUN: call void @__omp_offloading_{{.*}}_{{.*}}main{{.*}}_l124_worker()
// FUN: br label %
// FUN: br i1 %
// FUN: call void @__kmpc_kernel_init(
// FUN: call void @__kmpc_data_sharing_init_stack()
// FUN: call void @__kmpc_data_sharing_init_stack
// FUN: call void @llvm.memcpy.
// FUN: [[L:%.+]] = load [[CAP2]]*, [[CAP2]]** [[L_ADDR:%.+]],
// FUN: [[ARGC_CAP:%.+]] = getelementptr inbounds [[CAP2]], [[CAP2]]* [[L]], i32 0, i32 0
Expand All @@ -98,7 +98,7 @@ struct S {
// FUN: ret void

// FUN: define weak void @__omp_offloading_{{.+}}_main_l126(i32* nonnull align 4 dereferenceable(4) %{{.+}}, i32* nonnull align 4 dereferenceable(4) %{{.+}} i32* nonnull align 4 dereferenceable(4) %{{.+}}, i32* %{{.+}}, i32* nonnull align 4 dereferenceable(4) %{{.+}}, [[CAP2]]* nonnull align 8 dereferenceable(40) %{{.+}})
// FUN-NOT: getelementptr

// FUN: call void @llvm.memcpy.
// FUN: [[L:%.+]] = load [[CAP2]]*, [[CAP2]]** [[L_ADDR:%.+]],
// FUN: [[ARGC_CAP:%.+]] = getelementptr inbounds [[CAP2]], [[CAP2]]* [[L]], i32 0, i32 0
Expand Down
2 changes: 1 addition & 1 deletion clang/test/OpenMP/nvptx_parallel_for_codegen.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,7 @@ int bar(int n){
// CHECK: define weak void @__omp_offloading_{{.*}}l13(
// CHECK: call void @__omp_offloading_{{.*}}l13_worker()
// CHECK: call void @__kmpc_kernel_init(
// CHECK: call void @__kmpc_data_sharing_init_stack()
// CHECK: call void @__kmpc_data_sharing_init_stack
// SEQ: [[IS_SHARED:%.+]] = load i16, i16* [[KERNEL_SHARED]],
// SEQ: [[SIZE:%.+]] = load i{{64|32}}, i{{64|32}}* [[KERNEL_SIZE]],
// SEQ: call void @__kmpc_get_team_static_memory(i16 0, i8* addrspacecast (i8 addrspace(3)* getelementptr inbounds ([[MEM_TY]], [[MEM_TY]] addrspace(3)* [[SHARED_GLOBAL_RD]], i32 0, i32 0, i32 0) to i8*), i64 %7, i16 %6, i8** addrspacecast (i8* addrspace(3)* [[KERNEL_PTR]] to i8**))
Expand Down
2 changes: 1 addition & 1 deletion clang/test/OpenMP/nvptx_target_codegen.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@ struct TT {
// CHECK: store i32** [[PTR2_REF]], i32*** [[PTR2_REF_PTR:%.+]],
// CHECK: [[PTR2_REF:%.+]] = load i32**, i32*** [[PTR2_REF_PTR]],
// CHECK: call void @__kmpc_spmd_kernel_init(
// CHECK: call void @__kmpc_data_sharing_init_stack_spmd()
// CHECK: call void @__kmpc_data_sharing_init_stack_spmd
// CHECK: [[GTID:%.+]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @{{.+}})
// CHECK: store i32 [[GTID]], i32* [[THREADID:%.+]],
// CHECK: call void @{{.+}}(i32* [[THREADID]], i32* %{{.+}}, i32** [[PTR1_ADDR]], i32** [[PTR2_REF]])
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,7 @@ int bar(int n){
// CHECK: [[AA:%.+]] = load i16*, i16** [[AA_ADDR]], align
// CHECK: [[THREAD_LIMIT:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
// CHECK: call void @__kmpc_spmd_kernel_init(i32 [[THREAD_LIMIT]], i16 1, i16 0)
// CHECK: call void @__kmpc_data_sharing_init_stack_spmd()
// CHECK: call void @__kmpc_data_sharing_init_stack_spmd
// CHECK: [[GTID:%.+]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @{{.+}})
// CHECK: store i32 [[GTID]], i32* [[THREADID:%.+]],
// CHECK: call void [[OUTLINED:@.+]](i32* [[THREADID]], i32* %{{.+}}, i16* [[AA]])
Expand Down Expand Up @@ -78,7 +78,7 @@ int bar(int n){
// CHECK: [[B:%.+]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align
// CHECK: [[THREAD_LIMIT:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
// CHECK: call void @__kmpc_spmd_kernel_init(i32 [[THREAD_LIMIT]], i16 1, i16 0)
// CHECK: call void @__kmpc_data_sharing_init_stack_spmd()
// CHECK: call void @__kmpc_data_sharing_init_stack_spmd(
// CHECK: [[GTID:%.+]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @{{.+}})
// CHECK: store i32 [[GTID]], i32* [[THREADID:%.+]],
// CHECK: call void [[OUTLINED:@.+]](i32* [[THREADID]], i32* %{{.+}}, i32* [[A]], i16* [[AA]], [10 x i32]* [[B]])
Expand Down
2 changes: 1 addition & 1 deletion clang/test/OpenMP/nvptx_teams_reduction_codegen.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -710,7 +710,7 @@ int bar(int n){
// CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+template.+l57}}(
//
// CHECK: call void @__kmpc_spmd_kernel_init(
// CHECK: call void @__kmpc_data_sharing_init_stack_spmd()
// CHECK: call void @__kmpc_data_sharing_init_stack_spmd
// CHECK: call void @__kmpc_spmd_kernel_deinit_v2(i16 1)

// CHECK-NOT: call void @{{__kmpc_get_team_static_memory|__kmpc_data_sharing_push_stack}}
Expand Down
7 changes: 4 additions & 3 deletions openmp/libomptarget/deviceRTLs/common/omptarget.h
Original file line number Diff line number Diff line change
Expand Up @@ -91,7 +91,7 @@ struct __kmpc_data_sharing_worker_slot_static {
__kmpc_data_sharing_slot *Prev;
void *PrevSlotStackPtr;
void *DataEnd;
char Data[DS_Worker_Warp_Slot_Size];
char *Data;
};
// Additional master slot type which is initialized with the default master slot
// size of 4 bytes.
Expand Down Expand Up @@ -256,9 +256,10 @@ class omptarget_nvptx_TeamDescr {
return (__kmpc_data_sharing_slot *)&worker_rootS[wid];
}

INLINE __kmpc_data_sharing_slot *GetPreallocatedSlotAddr(int wid) {
INLINE __kmpc_data_sharing_slot *GetPreallocatedSlotAddr(int wid, char *Data, size_t size) {
worker_rootS[wid].Data = Data;
worker_rootS[wid].DataEnd =
&worker_rootS[wid].Data[0] + DS_Worker_Warp_Slot_Size;
&worker_rootS[wid].Data[0] + size;
// We currently do not have a next slot.
worker_rootS[wid].Next = 0;
worker_rootS[wid].Prev = 0;
Expand Down
14 changes: 8 additions & 6 deletions openmp/libomptarget/deviceRTLs/common/src/data_sharing.cu
Original file line number Diff line number Diff line change
Expand Up @@ -28,13 +28,15 @@ INLINE static bool IsMasterThread(bool isSPMDExecutionMode) {
// Runtime functions for trunk data sharing scheme.
////////////////////////////////////////////////////////////////////////////////

INLINE static void data_sharing_init_stack_common() {
INLINE static void data_sharing_init_stack_common(char *Data, size_t size) {
ASSERT0(LT_FUSSY, isRuntimeInitialized(), "Runtime must be initialized.");
omptarget_nvptx_TeamDescr *teamDescr =
&omptarget_nvptx_threadPrivateContext->TeamContext();

size_t PerWarp = size / DS_Max_Warp_Number;
for (int WID = 0; WID < DS_Max_Warp_Number; WID++) {
__kmpc_data_sharing_slot *RootS = teamDescr->GetPreallocatedSlotAddr(WID);
char *MyPortion = Data + WID * PerWarp;
__kmpc_data_sharing_slot *RootS = teamDescr->GetPreallocatedSlotAddr(WID, MyPortion, PerWarp);
DataSharingState.SlotPtr[WID] = RootS;
DataSharingState.StackPtr[WID] = (void *)&RootS->Data[0];
}
Expand All @@ -44,25 +46,25 @@ INLINE static void data_sharing_init_stack_common() {
// once at the beginning of a data sharing context (coincides with the kernel
// initialization). This function is called only by the MASTER thread of each
// team in non-SPMD mode.
EXTERN void __kmpc_data_sharing_init_stack() {
EXTERN void __kmpc_data_sharing_init_stack(char *Data, size_t size) {
ASSERT0(LT_FUSSY, isRuntimeInitialized(), "Runtime must be initialized.");
// This function initializes the stack pointer with the pointer to the
// statically allocated shared memory slots. The size of a shared memory
// slot is pre-determined to be 256 bytes.
data_sharing_init_stack_common();
data_sharing_init_stack_common(Data, size);
omptarget_nvptx_globalArgs.Init();
}

// Initialize data sharing data structure. This function needs to be called
// once at the beginning of a data sharing context (coincides with the kernel
// initialization). This function is called in SPMD mode only.
EXTERN void __kmpc_data_sharing_init_stack_spmd() {
EXTERN void __kmpc_data_sharing_init_stack_spmd(char *Data, size_t size) {
ASSERT0(LT_FUSSY, isRuntimeInitialized(), "Runtime must be initialized.");
// This function initializes the stack pointer with the pointer to the
// statically allocated shared memory slots. The size of a shared memory
// slot is pre-determined to be 256 bytes.
if (GetThreadIdInBlock() == 0)
data_sharing_init_stack_common();
data_sharing_init_stack_common(Data, size);

__kmpc_impl_threadfence_block();
}
Expand Down
4 changes: 2 additions & 2 deletions openmp/libomptarget/deviceRTLs/interface.h
Original file line number Diff line number Diff line change
Expand Up @@ -433,8 +433,8 @@ EXTERN void __kmpc_kernel_prepare_parallel(void *WorkFn);
EXTERN bool __kmpc_kernel_parallel(void **WorkFn);
EXTERN void __kmpc_kernel_end_parallel();

EXTERN void __kmpc_data_sharing_init_stack();
EXTERN void __kmpc_data_sharing_init_stack_spmd();
EXTERN void __kmpc_data_sharing_init_stack(char *Data, size_t size);
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We probably can't modify these prototypes without also modifying nvptx; we can add more functions instead.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Will do. These changes should work with nvptx as well, though I don't know how to test that.

EXTERN void __kmpc_data_sharing_init_stack_spmd(char *Data, size_t size);
EXTERN void *__kmpc_data_sharing_coalesced_push_stack(size_t size,
int16_t UseSharedMemory);
EXTERN void *__kmpc_data_sharing_push_stack(size_t size, int16_t UseSharedMemory);
Expand Down