Skip to content

Commit

Permalink
Merged master:4837daf8836 into amd-gfx:b603a109cd4
Browse files Browse the repository at this point in the history
Local branch amd-gfx b603a10 Merge master:03b902752e3 into amd-gfx.
Remote branch master 4837daf [DSE,MSSA] Check if Def is removable only wen we try to remove it.
  • Loading branch information
Sw authored and Sw committed Jun 25, 2020
2 parents b603a10 + 4837daf commit 4d9eea5
Show file tree
Hide file tree
Showing 24 changed files with 3,559 additions and 1,911 deletions.
1 change: 1 addition & 0 deletions clang/include/clang/Basic/LangOptions.def
Original file line number Diff line number Diff line change
Expand Up @@ -231,6 +231,7 @@ LANGOPT(OpenMPCUDANumSMs , 32, 0, "Number of SMs for CUDA devices.")
LANGOPT(OpenMPCUDABlocksPerSM , 32, 0, "Number of blocks per SM for CUDA devices.")
LANGOPT(OpenMPCUDAReductionBufNum , 32, 1024, "Number of the reduction records in the intermediate reduction buffer used for the teams reductions.")
LANGOPT(OpenMPOptimisticCollapse , 1, 0, "Use at most 32 bits to represent the collapsed loop nest counter.")
LANGOPT(OpenMPCUDATargetParallel, 1, 0, "Support parallel execution of target region on Cuda-based devices.")
LANGOPT(RenderScript , 1, 0, "RenderScript")

LANGOPT(CUDAIsDevice , 1, 0, "compiling for CUDA device")
Expand Down
6 changes: 6 additions & 0 deletions clang/include/clang/Driver/Options.td
Original file line number Diff line number Diff line change
Expand Up @@ -1687,6 +1687,12 @@ def fopenmp_optimistic_collapse : Flag<["-"], "fopenmp-optimistic-collapse">, Gr
Flags<[CC1Option, NoArgumentUnused, HelpHidden]>;
def fno_openmp_optimistic_collapse : Flag<["-"], "fno-openmp-optimistic-collapse">, Group<f_Group>,
Flags<[NoArgumentUnused, HelpHidden]>;
def fopenmp_cuda_parallel_target_regions : Flag<["-"], "fopenmp-cuda-parallel-target-regions">, Group<f_Group>,
Flags<[CC1Option, NoArgumentUnused, HelpHidden]>,
HelpText<"Support parallel execution of target regions on Cuda-based devices.">;
def fno_openmp_cuda_parallel_target_regions : Flag<["-"], "fno-openmp-cuda-parallel-target-regions">, Group<f_Group>,
Flags<[NoArgumentUnused, HelpHidden]>,
HelpText<"Support only serial execution of target regions on Cuda-based devices.">;
def static_openmp: Flag<["-"], "static-openmp">,
HelpText<"Use the static host OpenMP runtime while linking.">;
def fno_optimize_sibling_calls : Flag<["-"], "fno-optimize-sibling-calls">, Group<f_Group>;
Expand Down
28 changes: 23 additions & 5 deletions clang/lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -85,6 +85,9 @@ enum OpenMPRTLFunctionNVPTX {
/// Call to void* __kmpc_data_sharing_coalesced_push_stack(size_t size,
/// int16_t UseSharedMemory);
OMPRTL_NVPTX__kmpc_data_sharing_coalesced_push_stack,
/// Call to void* __kmpc_data_sharing_push_stack(size_t size, int16_t
/// UseSharedMemory);
OMPRTL_NVPTX__kmpc_data_sharing_push_stack,
/// Call to void __kmpc_data_sharing_pop_stack(void *a);
OMPRTL_NVPTX__kmpc_data_sharing_pop_stack,
/// Call to void __kmpc_begin_sharing_variables(void ***args,
Expand Down Expand Up @@ -1753,6 +1756,16 @@ CGOpenMPRuntimeNVPTX::createNVPTXRuntimeFunction(unsigned Function) {
FnTy, /*Name=*/"__kmpc_data_sharing_coalesced_push_stack");
break;
}
case OMPRTL_NVPTX__kmpc_data_sharing_push_stack: {
// Build void *__kmpc_data_sharing_push_stack(size_t size, int16_t
// UseSharedMemory);
llvm::Type *TypeParams[] = {CGM.SizeTy, CGM.Int16Ty};
auto *FnTy =
llvm::FunctionType::get(CGM.VoidPtrTy, TypeParams, /*isVarArg=*/false);
RTLFn = CGM.CreateRuntimeFunction(
FnTy, /*Name=*/"__kmpc_data_sharing_push_stack");
break;
}
case OMPRTL_NVPTX__kmpc_data_sharing_pop_stack: {
// Build void __kmpc_data_sharing_pop_stack(void *a);
llvm::Type *TypeParams[] = {CGM.VoidPtrTy};
Expand Down Expand Up @@ -2210,7 +2223,7 @@ void CGOpenMPRuntimeNVPTX::emitGenericVarsProlog(CodeGenFunction &CGF,
GlobalRecCastAddr = Phi;
I->getSecond().GlobalRecordAddr = Phi;
I->getSecond().IsInSPMDModeFlag = IsSPMD;
} else if (IsInTTDRegion) {
} else if (!CGM.getLangOpts().OpenMPCUDATargetParallel && IsInTTDRegion) {
assert(GlobalizedRecords.back().Records.size() < 2 &&
"Expected less than 2 globalized records: one for target and one "
"for teams.");
Expand Down Expand Up @@ -2283,12 +2296,16 @@ void CGOpenMPRuntimeNVPTX::emitGenericVarsProlog(CodeGenFunction &CGF,
} else {
// TODO: allow the usage of shared memory to be controlled by
// the user, for now, default to global.
bool UseSharedMemory =
IsInTTDRegion && GlobalRecordSize <= SharedMemorySize;
llvm::Value *GlobalRecordSizeArg[] = {
llvm::ConstantInt::get(CGM.SizeTy, GlobalRecordSize),
CGF.Builder.getInt16(/*UseSharedMemory=*/0)};
CGF.Builder.getInt16(UseSharedMemory ? 1 : 0)};
llvm::Value *GlobalRecValue = CGF.EmitRuntimeCall(
createNVPTXRuntimeFunction(
OMPRTL_NVPTX__kmpc_data_sharing_coalesced_push_stack),
IsInTTDRegion
? OMPRTL_NVPTX__kmpc_data_sharing_push_stack
: OMPRTL_NVPTX__kmpc_data_sharing_coalesced_push_stack),
GlobalRecordSizeArg);
GlobalRecCastAddr = Bld.CreatePointerBitCastOrAddrSpaceCast(
GlobalRecValue, GlobalRecPtrTy);
Expand Down Expand Up @@ -2435,7 +2452,7 @@ void CGOpenMPRuntimeNVPTX::emitGenericVarsEpilog(CodeGenFunction &CGF,
OMPRTL_NVPTX__kmpc_data_sharing_pop_stack),
CGF.EmitCastToVoidPtr(I->getSecond().GlobalRecordAddr));
CGF.EmitBlock(ExitBB);
} else if (IsInTTDRegion) {
} else if (!CGM.getLangOpts().OpenMPCUDATargetParallel && IsInTTDRegion) {
assert(GlobalizedRecords.back().RegionCounter > 0 &&
"region counter must be > 0.");
--GlobalizedRecords.back().RegionCounter;
Expand Down Expand Up @@ -5085,7 +5102,8 @@ static std::pair<unsigned, unsigned> getSMsBlocksPerSM(CodeGenModule &CGM) {
}

void CGOpenMPRuntimeNVPTX::clear() {
if (!GlobalizedRecords.empty()) {
if (!GlobalizedRecords.empty() &&
!CGM.getLangOpts().OpenMPCUDATargetParallel) {
ASTContext &C = CGM.getContext();
llvm::SmallVector<const GlobalPtrSizeRecsTy *, 4> GlobalRecs;
llvm::SmallVector<const GlobalPtrSizeRecsTy *, 4> SharedRecs;
Expand Down
7 changes: 7 additions & 0 deletions clang/lib/Driver/ToolChains/Clang.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -5257,6 +5257,13 @@ void Clang::ConstructJob(Compilation &C, const JobAction &JA,
options::OPT_fno_openmp_cuda_mode, /*Default=*/false))
CmdArgs.push_back("-fopenmp-cuda-mode");

// When in OpenMP offloading mode with NVPTX target, forward
// cuda-parallel-target-regions flag
if (Args.hasFlag(options::OPT_fopenmp_cuda_parallel_target_regions,
options::OPT_fno_openmp_cuda_parallel_target_regions,
/*Default=*/true))
CmdArgs.push_back("-fopenmp-cuda-parallel-target-regions");

// When in OpenMP offloading mode with NVPTX target, check if full runtime
// is required.
if (Args.hasFlag(options::OPT_fopenmp_cuda_force_full_runtime,
Expand Down
6 changes: 6 additions & 0 deletions clang/lib/Frontend/CompilerInvocation.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -3195,6 +3195,12 @@ static void ParseLangArgs(LangOptions &Opts, ArgList &Args, InputKind IK,
Opts.OpenMPCUDAMode = Opts.OpenMPIsDevice && (T.isNVPTX() || T.isAMDGCN()) &&
Args.hasArg(options::OPT_fopenmp_cuda_mode);

// Set CUDA support for parallel execution of target regions for OpenMP target
// NVPTX/AMDGCN if specified in options.
Opts.OpenMPCUDATargetParallel =
Opts.OpenMPIsDevice && (T.isNVPTX() || T.isAMDGCN()) &&
Args.hasArg(options::OPT_fopenmp_cuda_parallel_target_regions);

// Set CUDA mode for OpenMP target NVPTX/AMDGCN if specified in options
Opts.OpenMPCUDAForceFullRuntime =
Opts.OpenMPIsDevice && (T.isNVPTX() || T.isAMDGCN()) &&
Expand Down
9 changes: 9 additions & 0 deletions clang/lib/Sema/SemaType.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -8110,6 +8110,15 @@ static void processTypeAttrs(TypeProcessingState &state, QualType &type,
case ParsedAttr::AT_AcquireHandle: {
if (!type->isFunctionType())
return;

if (attr.getNumArgs() != 1) {
state.getSema().Diag(attr.getLoc(),
diag::err_attribute_wrong_number_arguments)
<< attr << 1;
attr.setInvalid();
return;
}

StringRef HandleType;
if (!state.getSema().checkStringLiteralArgumentAttr(attr, 0, HandleType))
return;
Expand Down
29 changes: 16 additions & 13 deletions clang/test/OpenMP/nvptx_data_sharing.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,8 @@
///==========================================================================///

// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm-bc %s -o %t-ppc-host.bc
// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - -disable-llvm-optzns | FileCheck %s --check-prefix CK1
// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - -disable-llvm-optzns | FileCheck %s --check-prefix CK1 --check-prefix SEQ
// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - -disable-llvm-optzns -fopenmp-cuda-parallel-target-regions | FileCheck %s --check-prefix CK1 --check-prefix PAR

// expected-no-diagnostics

Expand All @@ -26,11 +27,11 @@ void test_ds(){
}
}
}
// CK1: [[MEM_TY:%.+]] = type { [128 x i8] }
// CK1-DAG: [[SHARED_GLOBAL_RD:@.+]] = common addrspace(3) global [[MEM_TY]] zeroinitializer
// CK1-DAG: [[KERNEL_PTR:@.+]] = internal addrspace(3) global i8* null
// CK1-DAG: [[KERNEL_SIZE:@.+]] = internal unnamed_addr constant i64 8
// CK1-DAG: [[KERNEL_SHARED:@.+]] = internal unnamed_addr constant i16 1
// SEQ: [[MEM_TY:%.+]] = type { [128 x i8] }
// SEQ-DAG: [[SHARED_GLOBAL_RD:@.+]] = common addrspace(3) global [[MEM_TY]] zeroinitializer
// SEQ-DAG: [[KERNEL_PTR:@.+]] = internal addrspace(3) global i8* null
// SEQ-DAG: [[KERNEL_SIZE:@.+]] = internal unnamed_addr constant i64 8
// SEQ-DAG: [[KERNEL_SHARED:@.+]] = internal unnamed_addr constant i16 1

/// ========= In the worker function ========= ///
// CK1: {{.*}}define internal void @__omp_offloading{{.*}}test_ds{{.*}}_worker()
Expand All @@ -44,11 +45,12 @@ void test_ds(){
// CK1: [[SHAREDARGS2:%.+]] = alloca i8**
// CK1: call void @__kmpc_kernel_init
// CK1: call void @__kmpc_data_sharing_init_stack
// CK1: [[SHARED_MEM_FLAG:%.+]] = load i16, i16* [[KERNEL_SHARED]],
// CK1: [[SIZE:%.+]] = load i64, i64* [[KERNEL_SIZE]],
// CK1: call void @__kmpc_get_team_static_memory(i16 0, i8* addrspacecast (i8 addrspace(3)* getelementptr inbounds ([[MEM_TY]], [[MEM_TY]] addrspace(3)* [[SHARED_GLOBAL_RD]], i32 0, i32 0, i32 0) to i8*), i64 [[SIZE]], i16 [[SHARED_MEM_FLAG]], i8** addrspacecast (i8* addrspace(3)* [[KERNEL_PTR]] to i8**))
// CK1: [[KERNEL_RD:%.+]] = load i8*, i8* addrspace(3)* [[KERNEL_PTR]],
// CK1: [[GLOBALSTACK:%.+]] = getelementptr inbounds i8, i8* [[KERNEL_RD]], i64 0
// SEQ: [[SHARED_MEM_FLAG:%.+]] = load i16, i16* [[KERNEL_SHARED]],
// SEQ: [[SIZE:%.+]] = load i64, i64* [[KERNEL_SIZE]],
// SEQ: call void @__kmpc_get_team_static_memory(i16 0, i8* addrspacecast (i8 addrspace(3)* getelementptr inbounds ([[MEM_TY]], [[MEM_TY]] addrspace(3)* [[SHARED_GLOBAL_RD]], i32 0, i32 0, i32 0) to i8*), i64 [[SIZE]], i16 [[SHARED_MEM_FLAG]], i8** addrspacecast (i8* addrspace(3)* [[KERNEL_PTR]] to i8**))
// SEQ: [[KERNEL_RD:%.+]] = load i8*, i8* addrspace(3)* [[KERNEL_PTR]],
// SEQ: [[GLOBALSTACK:%.+]] = getelementptr inbounds i8, i8* [[KERNEL_RD]], i64 0
// PAR: [[GLOBALSTACK:%.+]] = call i8* @__kmpc_data_sharing_push_stack(i{{32|64}} 8, i16 1)
// CK1: [[GLOBALSTACK2:%.+]] = bitcast i8* [[GLOBALSTACK]] to %struct._globalized_locals_ty*
// CK1: [[A:%.+]] = getelementptr inbounds %struct._globalized_locals_ty, %struct._globalized_locals_ty* [[GLOBALSTACK2]], i32 0, i32 0
// CK1: [[B:%.+]] = getelementptr inbounds %struct._globalized_locals_ty, %struct._globalized_locals_ty* [[GLOBALSTACK2]], i32 0, i32 1
Expand All @@ -75,8 +77,9 @@ void test_ds(){
// CK1: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
// CK1: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
// CK1: call void @__kmpc_end_sharing_variables()
// CK1: [[SHARED_MEM_FLAG:%.+]] = load i16, i16* [[KERNEL_SHARED]],
// CK1: call void @__kmpc_restore_team_static_memory(i16 0, i16 [[SHARED_MEM_FLAG]])
// SEQ: [[SHARED_MEM_FLAG:%.+]] = load i16, i16* [[KERNEL_SHARED]],
// SEQ: call void @__kmpc_restore_team_static_memory(i16 0, i16 [[SHARED_MEM_FLAG]])
// PAR: call void @__kmpc_data_sharing_pop_stack(i8* [[GLOBALSTACK]])
// CK1: call void @__kmpc_kernel_deinit(i16 1)

/// ========= In the data sharing wrapper function ========= ///
Expand Down
Original file line number Diff line number Diff line change
@@ -1,9 +1,12 @@
// Test target codegen - host bc file has to be created first.
// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm-bc %s -o %t-ppc-host.bc
// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - -disable-llvm-optzns | FileCheck %s --check-prefix CHECK --check-prefix CHECK-64
// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - -disable-llvm-optzns | FileCheck %s --check-prefix CHECK --check-prefix CHECK-64 --check-prefix SEQ
// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - -disable-llvm-optzns -fopenmp-cuda-parallel-target-regions | FileCheck %s --check-prefix CHECK --check-prefix CHECK-64 --check-prefix PAR
// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -x c++ -triple i386-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm-bc %s -o %t-x86-host.bc
// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - -disable-llvm-optzns | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32
// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -fexceptions -fcxx-exceptions -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - -disable-llvm-optzns | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32
// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - -disable-llvm-optzns | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32 --check-prefix SEQ
// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -fexceptions -fcxx-exceptions -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - -disable-llvm-optzns | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32 --check-prefix SEQ
// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - -disable-llvm-optzns -fopenmp-cuda-parallel-target-regions | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32 --check-prefix PAR
// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -fexceptions -fcxx-exceptions -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - -disable-llvm-optzns -fopenmp-cuda-parallel-target-regions | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32 --check-prefix PAR
// expected-no-diagnostics
#ifndef HEADER
#define HEADER
Expand All @@ -21,19 +24,20 @@ int main(int argc, char **argv) {
return 0;
}

// CHECK: [[MEM_TY:%.+]] = type { [128 x i8] }
// CHECK-DAG: [[SHARED_GLOBAL_RD:@.+]] = common addrspace(3) global [[MEM_TY]] zeroinitializer
// CHECK-DAG: [[KERNEL_PTR:@.+]] = internal addrspace(3) global i8* null
// CHECK-DAG: [[KERNEL_SIZE:@.+]] = internal unnamed_addr constant i{{64|32}} 40
// CHECK-DAG: [[KERNEL_SHARED:@.+]] = internal unnamed_addr constant i16 1
// CHECK-DAG: @__omp_offloading_{{.*}}_main_l17_exec_mode = weak constant i8 0

// CHECK: define weak void @__omp_offloading_{{.*}}_main_l17([10 x i32]* nonnull align 4 dereferenceable(40) %{{.+}}, [10 x i32]* nonnull align 4 dereferenceable(40) %{{.+}}, i32* nonnull align 4 dereferenceable(4) %{{.+}}, i{{64|32}} %{{.+}}, [10 x i32]* nonnull align 4 dereferenceable(40) %{{.+}})
// CHECK: [[SHARED:%.+]] = load i16, i16* [[KERNEL_SHARED]],
// CHECK: [[SIZE:%.+]] = load i{{64|32}}, i{{64|32}}* [[KERNEL_SIZE]],
// CHECK: call void @__kmpc_get_team_static_memory(i16 1, i8* addrspacecast (i8 addrspace(3)* getelementptr inbounds ([[MEM_TY]], [[MEM_TY]] addrspace(3)* [[SHARED_GLOBAL_RD]], i32 0, i32 0, i32 0) to i8*), i{{64|32}} [[SIZE]], i16 [[SHARED]], i8** addrspacecast (i8* addrspace(3)* [[KERNEL_PTR]] to i8**))
// CHECK: [[PTR:%.+]] = load i8*, i8* addrspace(3)* [[KERNEL_PTR]],
// CHECK: [[GEP:%.+]] = getelementptr inbounds i8, i8* [[PTR]], i{{64|32}} 0
// SEQ: [[MEM_TY:%.+]] = type { [128 x i8] }
// SEQ-DAG: [[SHARED_GLOBAL_RD:@.+]] = common addrspace(3) global [[MEM_TY]] zeroinitializer
// SEQ-DAG: [[KERNEL_PTR:@.+]] = internal addrspace(3) global i8* null
// SEQ-DAG: [[KERNEL_SIZE:@.+]] = internal unnamed_addr constant i{{64|32}} 40
// SEQ-DAG: [[KERNEL_SHARED:@.+]] = internal unnamed_addr constant i16 1
// CHECK-DAG: @__omp_offloading_{{.*}}_main_l20_exec_mode = weak constant i8 0

// CHECK: define weak void @__omp_offloading_{{.*}}_main_l20([10 x i32]* nonnull align 4 dereferenceable(40) %{{.+}}, [10 x i32]* nonnull align 4 dereferenceable(40) %{{.+}}, i32* nonnull align 4 dereferenceable(4) %{{.+}}, i{{64|32}} %{{.+}}, [10 x i32]* nonnull align 4 dereferenceable(40) %{{.+}})
// SEQ: [[SHARED:%.+]] = load i16, i16* [[KERNEL_SHARED]],
// SEQ: [[SIZE:%.+]] = load i{{64|32}}, i{{64|32}}* [[KERNEL_SIZE]],
// SEQ: call void @__kmpc_get_team_static_memory(i16 1, i8* addrspacecast (i8 addrspace(3)* getelementptr inbounds ([[MEM_TY]], [[MEM_TY]] addrspace(3)* [[SHARED_GLOBAL_RD]], i32 0, i32 0, i32 0) to i8*), i{{64|32}} [[SIZE]], i16 [[SHARED]], i8** addrspacecast (i8* addrspace(3)* [[KERNEL_PTR]] to i8**))
// SEQ: [[PTR:%.+]] = load i8*, i8* addrspace(3)* [[KERNEL_PTR]],
// SEQ: [[GEP:%.+]] = getelementptr inbounds i8, i8* [[PTR]], i{{64|32}} 0
// PAR: [[GEP:%.+]] = call i8* @__kmpc_data_sharing_push_stack(i{{32|64}} 40, i16 1)
// CHECK: [[STACK:%.+]] = bitcast i8* [[GEP]] to %struct._globalized_locals_ty*
// CHECK: getelementptr inbounds %struct._globalized_locals_ty, %struct._globalized_locals_ty* [[STACK]], i{{32|64}} 0, i{{32|64}} 0
// CHECK-NOT: getelementptr inbounds %struct._globalized_locals_ty, %struct._globalized_locals_ty* [[STACK]],
Expand All @@ -43,8 +47,9 @@ int main(int argc, char **argv) {

// CHECK: call void @__kmpc_for_static_fini(%struct.ident_t* @

// CHECK: [[SHARED:%.+]] = load i16, i16* [[KERNEL_SHARED]],
// CHECK: call void @__kmpc_restore_team_static_memory(i16 1, i16 [[SHARED]])
// SEQ: [[SHARED:%.+]] = load i16, i16* [[KERNEL_SHARED]],
// SEQ: call void @__kmpc_restore_team_static_memory(i16 1, i16 [[SHARED]])
// PAR: call void @__kmpc_data_sharing_pop_stack(i8* [[GEP]])

// CHECK: define internal void [[PARALLEL]](
// CHECK-NOT: call i8* @__kmpc_data_sharing_push_stack(
Expand Down
Loading

0 comments on commit 4d9eea5

Please sign in to comment.