From 1e6e3e73d3e8cc46856e0beb2f862b9a5f4b6d64 Mon Sep 17 00:00:00 2001 From: Pushpinder Singh Date: Mon, 17 Aug 2020 06:17:55 -0500 Subject: [PATCH] Reduce device RTL memory footprint --- clang/lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp | 62 +++++++++++++++++-- clang/test/OpenMP/nvptx_lambda_capturing.cpp | 12 ++-- .../OpenMP/nvptx_parallel_for_codegen.cpp | 2 +- clang/test/OpenMP/nvptx_target_codegen.cpp | 2 +- ...tx_target_parallel_num_threads_codegen.cpp | 4 +- .../OpenMP/nvptx_teams_reduction_codegen.cpp | 2 +- .../deviceRTLs/common/omptarget.h | 7 ++- .../deviceRTLs/common/src/data_sharing.cu | 14 +++-- openmp/libomptarget/deviceRTLs/interface.h | 4 +- 9 files changed, 83 insertions(+), 26 deletions(-) diff --git a/clang/lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp b/clang/lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp index e9d76162e714..c7129d19c812 100644 --- a/clang/lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp +++ b/clang/lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp @@ -1449,10 +1449,35 @@ void CGOpenMPRuntimeNVPTX::emitNonSPMDEntryHeader(CodeGenFunction &CGF, CGF.EmitRuntimeCall( createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_kernel_init), Args); + StringRef DataSharingMemorySlotName = "openmp.data.sharing.memory.slot"; + size_t WarpSlotSize = + CGF.getTarget().getGridValue(llvm::omp::GV_Warp_Slot_Size); + size_t DataSharingMemorySlotSize = WarpSlotSize * 64; + + // creating a global array which will be used for data sharing slots + // This will be optimized in clang-build-select-link + llvm::Type *Ty = + llvm::ArrayType::get(CGF.CGM.Int8Ty, + DataSharingMemorySlotSize); + llvm::GlobalVariable *DataSharingMemorySlot = new llvm::GlobalVariable( + CGF.CGM.getModule(), Ty, + false, llvm::GlobalValue::ExternalLinkage, nullptr, + DataSharingMemorySlotName, + nullptr, llvm::GlobalValue::NotThreadLocal, + CGF.CGM.getContext().getTargetAddressSpace(LangAS::cuda_device)); + + DataSharingMemorySlot->setExternallyInitialized(true); + DataSharingMemorySlot->setUnnamedAddr(llvm::GlobalValue::UnnamedAddr::Local); + DataSharingMemorySlot->setInitializer(llvm::UndefValue::get(Ty)); + llvm::Value *DataSharingMemoryAddr = Bld.CreatePointerBitCastOrAddrSpaceCast( + DataSharingMemorySlot, CGF.CGM.Int8PtrTy); + + llvm::Value *ArgsForInitStack[] = { DataSharingMemoryAddr, + CGF.Builder.getInt64(DataSharingMemorySlotSize) }; // For data sharing, we need to initialize the stack. CGF.EmitRuntimeCall( createNVPTXRuntimeFunction( - OMPRTL_NVPTX__kmpc_data_sharing_init_stack)); + OMPRTL_NVPTX__kmpc_data_sharing_init_stack), ArgsForInitStack); emitGenericVarsProlog(CGF, WST.Loc); } @@ -1567,8 +1592,33 @@ void CGOpenMPRuntimeNVPTX::emitSPMDEntryHeader( if (RequiresFullRuntime) { // For data sharing, we need to initialize the stack. + StringRef DataSharingMemorySlotName = "openmp.data.sharing.memory.slot.spmd"; + size_t WarpSlotSize = + CGF.getTarget().getGridValue(llvm::omp::GV_Warp_Slot_Size); + size_t DataSharingMemorySlotSize = WarpSlotSize * 64; + + // creating a global array which will be used for data sharing slots + // This will be optimized in clang-build-select-link + llvm::Type *Ty = + llvm::ArrayType::get(CGF.CGM.Int8Ty, + DataSharingMemorySlotSize); + llvm::GlobalVariable *DataSharingMemorySlot = new llvm::GlobalVariable( + CGF.CGM.getModule(), Ty, + false, llvm::GlobalValue::ExternalLinkage, nullptr, + DataSharingMemorySlotName, + nullptr, llvm::GlobalValue::NotThreadLocal, + CGF.CGM.getContext().getTargetAddressSpace(LangAS::cuda_device)); + + DataSharingMemorySlot->setExternallyInitialized(true); + DataSharingMemorySlot->setUnnamedAddr(llvm::GlobalValue::UnnamedAddr::Local); + DataSharingMemorySlot->setInitializer(llvm::UndefValue::get(Ty)); + llvm::Value *DataSharingMemoryAddr = Bld.CreatePointerBitCastOrAddrSpaceCast( + DataSharingMemorySlot, CGF.CGM.Int8PtrTy); + llvm::Value *ArgsForInitStack[] = { DataSharingMemoryAddr, + CGF.Builder.getInt64(DataSharingMemorySlotSize) }; + CGF.EmitRuntimeCall(createNVPTXRuntimeFunction( - OMPRTL_NVPTX__kmpc_data_sharing_init_stack_spmd)); + OMPRTL_NVPTX__kmpc_data_sharing_init_stack_spmd), ArgsForInitStack); } CGF.EmitBranch(ExecuteBB); @@ -2009,15 +2059,19 @@ CGOpenMPRuntimeNVPTX::createNVPTXRuntimeFunction(unsigned Function) { } case OMPRTL_NVPTX__kmpc_data_sharing_init_stack: { /// Build void __kmpc_data_sharing_init_stack(); + llvm::Type *TypeParams[] = {CGM.Int8PtrTy, + CGM.Int64Ty}; auto *FnTy = - llvm::FunctionType::get(CGM.VoidTy, llvm::None, /*isVarArg*/ false); + llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg*/ false); RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_data_sharing_init_stack"); break; } case OMPRTL_NVPTX__kmpc_data_sharing_init_stack_spmd: { /// Build void __kmpc_data_sharing_init_stack_spmd(); + llvm::Type *TypeParams[] = {CGM.Int8PtrTy, + CGM.Int64Ty}; auto *FnTy = - llvm::FunctionType::get(CGM.VoidTy, llvm::None, /*isVarArg*/ false); + llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg*/ false); RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_data_sharing_init_stack_spmd"); break; diff --git a/clang/test/OpenMP/nvptx_lambda_capturing.cpp b/clang/test/OpenMP/nvptx_lambda_capturing.cpp index 8fe918b043cf..82c00ecb2a30 100644 --- a/clang/test/OpenMP/nvptx_lambda_capturing.cpp +++ b/clang/test/OpenMP/nvptx_lambda_capturing.cpp @@ -28,13 +28,13 @@ // CLASS: define internal void @__omp_offloading_{{.*}}_{{.*}}foo{{.*}}_l67_worker() // CLASS: define weak void @__omp_offloading_{{.*}}_{{.*}}foo{{.*}}_l67([[S]]* {{%.+}}, [[CAP1]]* nonnull align 8 dereferenceable(8) {{%.+}}) -// CLASS-NOT: getelementptr + // CLASS: br i1 % // CLASS: call void @__omp_offloading_{{.*}}_{{.*}}foo{{.*}}_l67_worker() // CLASS: br label % // CLASS: br i1 % // CLASS: call void @__kmpc_kernel_init( -// CLASS: call void @__kmpc_data_sharing_init_stack() +// CLASS: call void @__kmpc_data_sharing_init_stack // CLASS: call void @llvm.memcpy. // CLASS: [[L:%.+]] = load [[CAP1]]*, [[CAP1]]** [[L_ADDR:%.+]], // CLASS: [[THIS_REF:%.+]] = getelementptr inbounds [[CAP1]], [[CAP1]]* [[L]], i32 0, i32 0 @@ -44,7 +44,7 @@ // CLASS: ret void // CLASS: define weak void @__omp_offloading_{{.+}}foo{{.+}}_l69([[S]]* %{{.+}}, [[CAP1]]* nonnull align 8 dereferenceable(8) %{{.+}}) -// CLASS-NOT: getelementptr + // CLASS: call void @llvm.memcpy. // CLASS: [[L:%.+]] = load [[CAP1]]*, [[CAP1]]** [[L_ADDR:%.+]], // CLASS: [[THIS_REF:%.+]] = getelementptr inbounds [[CAP1]], [[CAP1]]* [[L]], i32 0, i32 0 @@ -74,13 +74,13 @@ struct S { // FUN: define internal void @__omp_offloading_{{.+}}_main_l124_worker() // FUN: define weak void @__omp_offloading_{{.+}}_main_l124(i64 %{{.+}}, i32* nonnull align 4 dereferenceable(4) %{{.+}}, i32* nonnull align 4 dereferenceable(4) %{{.+}}, i32* %{{.+}}, i32* nonnull align 4 dereferenceable(4) %{{.+}}, [[CAP2]]* nonnull align 8 dereferenceable(40) %{{.+}}) -// FUN-NOT: getelementptr + // FUN: br i1 % // FUN: call void @__omp_offloading_{{.*}}_{{.*}}main{{.*}}_l124_worker() // FUN: br label % // FUN: br i1 % // FUN: call void @__kmpc_kernel_init( -// FUN: call void @__kmpc_data_sharing_init_stack() +// FUN: call void @__kmpc_data_sharing_init_stack // FUN: call void @llvm.memcpy. // FUN: [[L:%.+]] = load [[CAP2]]*, [[CAP2]]** [[L_ADDR:%.+]], // FUN: [[ARGC_CAP:%.+]] = getelementptr inbounds [[CAP2]], [[CAP2]]* [[L]], i32 0, i32 0 @@ -98,7 +98,7 @@ struct S { // FUN: ret void // FUN: define weak void @__omp_offloading_{{.+}}_main_l126(i32* nonnull align 4 dereferenceable(4) %{{.+}}, i32* nonnull align 4 dereferenceable(4) %{{.+}} i32* nonnull align 4 dereferenceable(4) %{{.+}}, i32* %{{.+}}, i32* nonnull align 4 dereferenceable(4) %{{.+}}, [[CAP2]]* nonnull align 8 dereferenceable(40) %{{.+}}) -// FUN-NOT: getelementptr + // FUN: call void @llvm.memcpy. // FUN: [[L:%.+]] = load [[CAP2]]*, [[CAP2]]** [[L_ADDR:%.+]], // FUN: [[ARGC_CAP:%.+]] = getelementptr inbounds [[CAP2]], [[CAP2]]* [[L]], i32 0, i32 0 diff --git a/clang/test/OpenMP/nvptx_parallel_for_codegen.cpp b/clang/test/OpenMP/nvptx_parallel_for_codegen.cpp index 4ef167de3b8a..d81757ce9b06 100644 --- a/clang/test/OpenMP/nvptx_parallel_for_codegen.cpp +++ b/clang/test/OpenMP/nvptx_parallel_for_codegen.cpp @@ -45,7 +45,7 @@ int bar(int n){ // CHECK: define weak void @__omp_offloading_{{.*}}l13( // CHECK: call void @__omp_offloading_{{.*}}l13_worker() // CHECK: call void @__kmpc_kernel_init( -// CHECK: call void @__kmpc_data_sharing_init_stack() +// CHECK: call void @__kmpc_data_sharing_init_stack // SEQ: [[IS_SHARED:%.+]] = load i16, i16* [[KERNEL_SHARED]], // SEQ: [[SIZE:%.+]] = load i{{64|32}}, i{{64|32}}* [[KERNEL_SIZE]], // SEQ: call void @__kmpc_get_team_static_memory(i16 0, i8* addrspacecast (i8 addrspace(3)* getelementptr inbounds ([[MEM_TY]], [[MEM_TY]] addrspace(3)* [[SHARED_GLOBAL_RD]], i32 0, i32 0, i32 0) to i8*), i64 %7, i16 %6, i8** addrspacecast (i8* addrspace(3)* [[KERNEL_PTR]] to i8**)) diff --git a/clang/test/OpenMP/nvptx_target_codegen.cpp b/clang/test/OpenMP/nvptx_target_codegen.cpp index 56f04cb01f0a..fdc1116bee1c 100644 --- a/clang/test/OpenMP/nvptx_target_codegen.cpp +++ b/clang/test/OpenMP/nvptx_target_codegen.cpp @@ -36,7 +36,7 @@ struct TT { // CHECK: store i32** [[PTR2_REF]], i32*** [[PTR2_REF_PTR:%.+]], // CHECK: [[PTR2_REF:%.+]] = load i32**, i32*** [[PTR2_REF_PTR]], // CHECK: call void @__kmpc_spmd_kernel_init( -// CHECK: call void @__kmpc_data_sharing_init_stack_spmd() +// CHECK: call void @__kmpc_data_sharing_init_stack_spmd // CHECK: [[GTID:%.+]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @{{.+}}) // CHECK: store i32 [[GTID]], i32* [[THREADID:%.+]], // CHECK: call void @{{.+}}(i32* [[THREADID]], i32* %{{.+}}, i32** [[PTR1_ADDR]], i32** [[PTR2_REF]]) diff --git a/clang/test/OpenMP/nvptx_target_parallel_num_threads_codegen.cpp b/clang/test/OpenMP/nvptx_target_parallel_num_threads_codegen.cpp index 1aac48198415..5ed0904dedf7 100644 --- a/clang/test/OpenMP/nvptx_target_parallel_num_threads_codegen.cpp +++ b/clang/test/OpenMP/nvptx_target_parallel_num_threads_codegen.cpp @@ -47,7 +47,7 @@ int bar(int n){ // CHECK: [[AA:%.+]] = load i16*, i16** [[AA_ADDR]], align // CHECK: [[THREAD_LIMIT:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() // CHECK: call void @__kmpc_spmd_kernel_init(i32 [[THREAD_LIMIT]], i16 1, i16 0) - // CHECK: call void @__kmpc_data_sharing_init_stack_spmd() + // CHECK: call void @__kmpc_data_sharing_init_stack_spmd // CHECK: [[GTID:%.+]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @{{.+}}) // CHECK: store i32 [[GTID]], i32* [[THREADID:%.+]], // CHECK: call void [[OUTLINED:@.+]](i32* [[THREADID]], i32* %{{.+}}, i16* [[AA]]) @@ -78,7 +78,7 @@ int bar(int n){ // CHECK: [[B:%.+]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align // CHECK: [[THREAD_LIMIT:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() // CHECK: call void @__kmpc_spmd_kernel_init(i32 [[THREAD_LIMIT]], i16 1, i16 0) - // CHECK: call void @__kmpc_data_sharing_init_stack_spmd() + // CHECK: call void @__kmpc_data_sharing_init_stack_spmd( // CHECK: [[GTID:%.+]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @{{.+}}) // CHECK: store i32 [[GTID]], i32* [[THREADID:%.+]], // CHECK: call void [[OUTLINED:@.+]](i32* [[THREADID]], i32* %{{.+}}, i32* [[A]], i16* [[AA]], [10 x i32]* [[B]]) diff --git a/clang/test/OpenMP/nvptx_teams_reduction_codegen.cpp b/clang/test/OpenMP/nvptx_teams_reduction_codegen.cpp index 5aa3e1978b10..b2f132c0a135 100644 --- a/clang/test/OpenMP/nvptx_teams_reduction_codegen.cpp +++ b/clang/test/OpenMP/nvptx_teams_reduction_codegen.cpp @@ -710,7 +710,7 @@ int bar(int n){ // CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+template.+l57}}( // // CHECK: call void @__kmpc_spmd_kernel_init( - // CHECK: call void @__kmpc_data_sharing_init_stack_spmd() + // CHECK: call void @__kmpc_data_sharing_init_stack_spmd // CHECK: call void @__kmpc_spmd_kernel_deinit_v2(i16 1) // CHECK-NOT: call void @{{__kmpc_get_team_static_memory|__kmpc_data_sharing_push_stack}} diff --git a/openmp/libomptarget/deviceRTLs/common/omptarget.h b/openmp/libomptarget/deviceRTLs/common/omptarget.h index 4f9f3d521135..de7d276081c5 100644 --- a/openmp/libomptarget/deviceRTLs/common/omptarget.h +++ b/openmp/libomptarget/deviceRTLs/common/omptarget.h @@ -91,7 +91,7 @@ struct __kmpc_data_sharing_worker_slot_static { __kmpc_data_sharing_slot *Prev; void *PrevSlotStackPtr; void *DataEnd; - char Data[DS_Worker_Warp_Slot_Size]; + char *Data; }; // Additional master slot type which is initialized with the default master slot // size of 4 bytes. @@ -256,9 +256,10 @@ class omptarget_nvptx_TeamDescr { return (__kmpc_data_sharing_slot *)&worker_rootS[wid]; } - INLINE __kmpc_data_sharing_slot *GetPreallocatedSlotAddr(int wid) { + INLINE __kmpc_data_sharing_slot *GetPreallocatedSlotAddr(int wid, char *Data, size_t size) { + worker_rootS[wid].Data = Data; worker_rootS[wid].DataEnd = - &worker_rootS[wid].Data[0] + DS_Worker_Warp_Slot_Size; + &worker_rootS[wid].Data[0] + size; // We currently do not have a next slot. worker_rootS[wid].Next = 0; worker_rootS[wid].Prev = 0; diff --git a/openmp/libomptarget/deviceRTLs/common/src/data_sharing.cu b/openmp/libomptarget/deviceRTLs/common/src/data_sharing.cu index 4e4ec32fd936..39a6d7802a5b 100644 --- a/openmp/libomptarget/deviceRTLs/common/src/data_sharing.cu +++ b/openmp/libomptarget/deviceRTLs/common/src/data_sharing.cu @@ -28,13 +28,15 @@ INLINE static bool IsMasterThread(bool isSPMDExecutionMode) { // Runtime functions for trunk data sharing scheme. //////////////////////////////////////////////////////////////////////////////// -INLINE static void data_sharing_init_stack_common() { +INLINE static void data_sharing_init_stack_common(char *Data, size_t size) { ASSERT0(LT_FUSSY, isRuntimeInitialized(), "Runtime must be initialized."); omptarget_nvptx_TeamDescr *teamDescr = &omptarget_nvptx_threadPrivateContext->TeamContext(); + size_t PerWarp = size / DS_Max_Warp_Number; for (int WID = 0; WID < DS_Max_Warp_Number; WID++) { - __kmpc_data_sharing_slot *RootS = teamDescr->GetPreallocatedSlotAddr(WID); + char *MyPortion = Data + WID * PerWarp; + __kmpc_data_sharing_slot *RootS = teamDescr->GetPreallocatedSlotAddr(WID, MyPortion, PerWarp); DataSharingState.SlotPtr[WID] = RootS; DataSharingState.StackPtr[WID] = (void *)&RootS->Data[0]; } @@ -44,25 +46,25 @@ INLINE static void data_sharing_init_stack_common() { // once at the beginning of a data sharing context (coincides with the kernel // initialization). This function is called only by the MASTER thread of each // team in non-SPMD mode. -EXTERN void __kmpc_data_sharing_init_stack() { +EXTERN void __kmpc_data_sharing_init_stack(char *Data, size_t size) { ASSERT0(LT_FUSSY, isRuntimeInitialized(), "Runtime must be initialized."); // This function initializes the stack pointer with the pointer to the // statically allocated shared memory slots. The size of a shared memory // slot is pre-determined to be 256 bytes. - data_sharing_init_stack_common(); + data_sharing_init_stack_common(Data, size); omptarget_nvptx_globalArgs.Init(); } // Initialize data sharing data structure. This function needs to be called // once at the beginning of a data sharing context (coincides with the kernel // initialization). This function is called in SPMD mode only. -EXTERN void __kmpc_data_sharing_init_stack_spmd() { +EXTERN void __kmpc_data_sharing_init_stack_spmd(char *Data, size_t size) { ASSERT0(LT_FUSSY, isRuntimeInitialized(), "Runtime must be initialized."); // This function initializes the stack pointer with the pointer to the // statically allocated shared memory slots. The size of a shared memory // slot is pre-determined to be 256 bytes. if (GetThreadIdInBlock() == 0) - data_sharing_init_stack_common(); + data_sharing_init_stack_common(Data, size); __kmpc_impl_threadfence_block(); } diff --git a/openmp/libomptarget/deviceRTLs/interface.h b/openmp/libomptarget/deviceRTLs/interface.h index 3a51fd5cd3ae..2a2680789b8b 100644 --- a/openmp/libomptarget/deviceRTLs/interface.h +++ b/openmp/libomptarget/deviceRTLs/interface.h @@ -433,8 +433,8 @@ EXTERN void __kmpc_kernel_prepare_parallel(void *WorkFn); EXTERN bool __kmpc_kernel_parallel(void **WorkFn); EXTERN void __kmpc_kernel_end_parallel(); -EXTERN void __kmpc_data_sharing_init_stack(); -EXTERN void __kmpc_data_sharing_init_stack_spmd(); +EXTERN void __kmpc_data_sharing_init_stack(char *Data, size_t size); +EXTERN void __kmpc_data_sharing_init_stack_spmd(char *Data, size_t size); EXTERN void *__kmpc_data_sharing_coalesced_push_stack(size_t size, int16_t UseSharedMemory); EXTERN void *__kmpc_data_sharing_push_stack(size_t size, int16_t UseSharedMemory);