From 494db3816b0ece5b6722054f75cc2622ae1b840a Mon Sep 17 00:00:00 2001 From: Florian Hahn Date: Wed, 6 Jan 2021 11:51:52 +0000 Subject: [PATCH 01/12] [LoopDeletion] Also consider loops with subloops for deletion. Currently, LoopDeletion does skip loops that have sub-loops, but this means we currently fail to remove some no-op loops. One example is inner loops with live-out values. Those cannot be removed by themselves. But the containing loop may itself be a no-op and the whole loop-nest can be deleted. The legality checks do not seem to rely on analyzing inner-loops only for correctness. With LoopDeletion being a LoopPass, the change means that we now unfortunately need to do some extra work in parent loops, by checking some conditions we already checked. But there appears to be no noticeable compile time impact: http://llvm-compile-time-tracker.com/compare.php?from=02d11f3cda2ab5b8bf4fc02639fd1f4b8c45963e&to=843201e9cf3b6871e18c52aede5897a22994c36c&stat=instructions This patch leads to ~10 more loops being deleted on MultiSource, SPEC2000, SPEC2006 with -O3 & LTO. This patch is also required (together with a few others) to eliminate a no-op loop in omnetpp as discussed on llvm-dev 'LoopDeletion / removal of empty loops.' (http://lists.llvm.org/pipermail/llvm-dev/2020-December/147462.html) This change becomes relevant after removing potentially infinite loops is made possible in 'must-progress' loops (D86844). Note that I added a function call with side-effects to an outer loop in `llvm/test/Transforms/LoopDeletion/update-scev.ll` to preserve the original spirit of the test. 
Reviewed By: reames Differential Revision: https://reviews.llvm.org/D93716 --- llvm/lib/Transforms/Scalar/LoopDeletion.cpp | 7 ------ .../LoopDeletion/noop-loops-with-subloops.ll | 19 +------------- .../LoopDeletion/unreachable-loops.ll | 25 +++++++------------ .../Transforms/LoopDeletion/update-scev.ll | 3 +++ 4 files changed, 13 insertions(+), 41 deletions(-) diff --git a/llvm/lib/Transforms/Scalar/LoopDeletion.cpp b/llvm/lib/Transforms/Scalar/LoopDeletion.cpp index 814cfc7ac6a9bc..a94676eadeabac 100644 --- a/llvm/lib/Transforms/Scalar/LoopDeletion.cpp +++ b/llvm/lib/Transforms/Scalar/LoopDeletion.cpp @@ -156,13 +156,6 @@ static LoopDeletionResult deleteLoopIfDead(Loop *L, DominatorTree &DT, << "Deletion requires Loop with preheader and dedicated exits.\n"); return LoopDeletionResult::Unmodified; } - // We can't remove loops that contain subloops. If the subloops were dead, - // they would already have been removed in earlier executions of this pass. - if (L->begin() != L->end()) { - LLVM_DEBUG(dbgs() << "Loop contains subloops.\n"); - return LoopDeletionResult::Unmodified; - } - BasicBlock *ExitBlock = L->getUniqueExitBlock(); diff --git a/llvm/test/Transforms/LoopDeletion/noop-loops-with-subloops.ll b/llvm/test/Transforms/LoopDeletion/noop-loops-with-subloops.ll index 464c12f453a701..b7a921a8dd513a 100644 --- a/llvm/test/Transforms/LoopDeletion/noop-loops-with-subloops.ll +++ b/llvm/test/Transforms/LoopDeletion/noop-loops-with-subloops.ll @@ -10,24 +10,7 @@ target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f3 define void @test1(i64 %N, i64 %M, %pair_t* %ptr) willreturn { ; CHECK-LABEL: @test1( ; CHECK-NEXT: entry: -; CHECK-NEXT: br label [[OUTER_HEADER:%.*]] -; CHECK: outer.header: -; CHECK-NEXT: [[OUTER_IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[OUTER_IV_NEXT:%.*]], [[OUTER_LATCH:%.*]] ] -; CHECK-NEXT: br label [[INNER:%.*]] -; CHECK: inner: -; CHECK-NEXT: [[INNER_IV:%.*]] = phi i64 [ 0, [[OUTER_HEADER]] ], [ 
[[INNER_IV_NEXT:%.*]], [[INNER]] ] -; CHECK-NEXT: [[GEP:%.*]] = getelementptr [[PAIR_T:%.*]], %pair_t* [[PTR:%.*]], i64 [[INNER_IV]] -; CHECK-NEXT: [[P:%.*]] = load [[PAIR_T]], %pair_t* [[GEP]], align 4 -; CHECK-NEXT: [[V_0:%.*]] = extractvalue [[PAIR_T]] [[P]], 0 -; CHECK-NEXT: [[V_1:%.*]] = extractvalue [[PAIR_T]] [[P]], 1 -; CHECK-NEXT: [[INNER_EC:%.*]] = icmp ult i64 [[V_0]], [[V_1]] -; CHECK-NEXT: [[INNER_IV_NEXT]] = add i64 [[INNER_IV]], 1 -; CHECK-NEXT: br i1 [[INNER_EC]], label [[OUTER_LATCH]], label [[INNER]] -; CHECK: outer.latch: -; CHECK-NEXT: [[LCSSA:%.*]] = phi i64 [ [[V_1]], [[INNER]] ] -; CHECK-NEXT: [[OUTER_EC:%.*]] = icmp ult i64 [[OUTER_IV]], [[LCSSA]] -; CHECK-NEXT: [[OUTER_IV_NEXT]] = add i64 [[OUTER_IV]], 1 -; CHECK-NEXT: br i1 [[OUTER_EC]], label [[EXIT:%.*]], label [[OUTER_HEADER]] +; CHECK-NEXT: br label [[EXIT:%.*]] ; CHECK: exit: ; CHECK-NEXT: ret void ; diff --git a/llvm/test/Transforms/LoopDeletion/unreachable-loops.ll b/llvm/test/Transforms/LoopDeletion/unreachable-loops.ll index a74ddf99285e23..c9e178fbf586f8 100644 --- a/llvm/test/Transforms/LoopDeletion/unreachable-loops.ll +++ b/llvm/test/Transforms/LoopDeletion/unreachable-loops.ll @@ -244,22 +244,15 @@ exit: ; Delete a loop (L2) which has subloop (L3). ; Here we delete loop L2, but leave L3 as is. -; FIXME: Can delete L3 as well, by iteratively going backward through the single -; predecessor of L3 until we reach L1's block that guarantees L3 is never -; executed. 
define void @test9(i64 %n) { ; CHECK-LABEL: test9 -; CHECK-LABEL: L2.preheader: -; CHECK-NEXT: br label %L3.preheader -; CHECK-NOT: L2: -; CHECK-LABEL: L3.preheader: -; CHECK-NEXT: %y.L2.lcssa = phi i64 [ undef, %L2.preheader ] -; CHECK-NEXT: br label %L3 -; CHECK-LABEL: L3: -; CHECK: br i1 %cond2, label %L3, label %L1.loopexit +; CHECK-LABEL: entry: +; CHECK-NEXT: br label %exit +; CHECK-LABEL: exit: +; CHECK-NEXT: ret void ; REMARKS-LABEL: Function: test9 ; REMARKS: Loop deleted because it never executes -entry: +entry: br label %L1 L1: @@ -283,12 +276,12 @@ exit: ; We cannot delete L3 because of call within it. ; Since L3 is not deleted, and entirely contained within L2, L2 is also not ; deleted. -; FIXME: We can delete unexecutable loops having -; subloops contained entirely within them. define void @test10(i64 %n) { ; CHECK-LABEL: test10 -; CHECK: L2: -; CHECK: L3: +; CHECK-LABEL: entry: +; CHECK-NEXT: br label %exit +; CHECK-LABEL: exit: +; CHECK-NEXT: ret void entry: br label %L1 diff --git a/llvm/test/Transforms/LoopDeletion/update-scev.ll b/llvm/test/Transforms/LoopDeletion/update-scev.ll index 44d23aa4060d1c..16a5530219177d 100644 --- a/llvm/test/Transforms/LoopDeletion/update-scev.ll +++ b/llvm/test/Transforms/LoopDeletion/update-scev.ll @@ -48,6 +48,7 @@ for.body6: ; preds = %for.body6, %for.bod for.inc11: ; preds = %for.body6 %and.lcssa = phi i32 [ %and, %for.body6 ] + call void @sideeffect(i32 %and.lcssa) %inc12 = add nsw i32 %val, 1 %tobool = icmp eq i32 %inc12, 0 br i1 %tobool, label %for.cond14, label %for.body @@ -56,6 +57,8 @@ for.cond14: ; preds = %for.cond14, %for.in br i1 undef, label %for.cond, label %for.cond14 } +declare void @sideeffect(i32) + ; LoopDeletion removes the loop %for.body7.1. Make sure %inc.lcssa.1 in the loop ; exit block is correctly invalidated. 
From 0c41b1c9f93c09966b87126820d3cf41d8eebbf9 Mon Sep 17 00:00:00 2001 From: Yvan Roux Date: Wed, 6 Jan 2021 16:01:38 +0100 Subject: [PATCH 02/12] [Driver][MachineOutliner] Support outlining option with LTO This patch propagates the -moutline flag when LTO is enabled and avoids passing it explicitly to the linker plugin. Differential Revision: https://reviews.llvm.org/D93385 --- clang/lib/Driver/ToolChains/Clang.cpp | 21 +------------ clang/lib/Driver/ToolChains/CommonArgs.cpp | 36 ++++++++++++++++++++++ clang/lib/Driver/ToolChains/CommonArgs.h | 4 +++ clang/test/Driver/arm-machine-outliner.c | 9 ++++++ 4 files changed, 50 insertions(+), 20 deletions(-) create mode 100644 clang/test/Driver/arm-machine-outliner.c diff --git a/clang/lib/Driver/ToolChains/Clang.cpp b/clang/lib/Driver/ToolChains/Clang.cpp index f8b9bf25373e88..917601836c0a8e 100644 --- a/clang/lib/Driver/ToolChains/Clang.cpp +++ b/clang/lib/Driver/ToolChains/Clang.cpp @@ -6396,26 +6396,7 @@ void Clang::ConstructJob(Compilation &C, const JobAction &JA, options::OPT_fno_cxx_static_destructors, true)) CmdArgs.push_back("-fno-c++-static-destructors"); - if (Arg *A = Args.getLastArg(options::OPT_moutline, - options::OPT_mno_outline)) { - if (A->getOption().matches(options::OPT_moutline)) { - // We only support -moutline in AArch64 and ARM targets right now. If - // we're not compiling for these, emit a warning and ignore the flag. - // Otherwise, add the proper mllvm flags. - if (!(Triple.isARM() || Triple.isThumb() || - Triple.getArch() == llvm::Triple::aarch64 || - Triple.getArch() == llvm::Triple::aarch64_32)) { - D.Diag(diag::warn_drv_moutline_unsupported_opt) << Triple.getArchName(); - } else { - CmdArgs.push_back("-mllvm"); - CmdArgs.push_back("-enable-machine-outliner"); - } - } else { - // Disable all outlining behaviour. 
- CmdArgs.push_back("-mllvm"); - CmdArgs.push_back("-enable-machine-outliner=never"); - } - } + addMachineOutlinerArgs(D, Args, CmdArgs, Triple, /*IsLTO=*/false); if (Arg *A = Args.getLastArg(options::OPT_moutline_atomics, options::OPT_mno_outline_atomics)) { diff --git a/clang/lib/Driver/ToolChains/CommonArgs.cpp b/clang/lib/Driver/ToolChains/CommonArgs.cpp index fe5e7536d38054..6a95aa5ec62878 100644 --- a/clang/lib/Driver/ToolChains/CommonArgs.cpp +++ b/clang/lib/Driver/ToolChains/CommonArgs.cpp @@ -624,6 +624,9 @@ void tools::addLTOOptions(const ToolChain &ToolChain, const ArgList &Args, // Handle remarks hotness/threshold related options. renderRemarksHotnessOptions(Args, CmdArgs); + + addMachineOutlinerArgs(D, Args, CmdArgs, ToolChain.getEffectiveTriple(), + /*IsLTO=*/true); } void tools::addArchSpecificRPath(const ToolChain &TC, const ArgList &Args, @@ -1586,3 +1589,36 @@ unsigned tools::getOrCheckAMDGPUCodeObjectVersion( } return CodeObjVer; } + +void tools::addMachineOutlinerArgs(const Driver &D, + const llvm::opt::ArgList &Args, + llvm::opt::ArgStringList &CmdArgs, + const llvm::Triple &Triple, bool IsLTO) { + auto addArg = [&, IsLTO](const Twine &Arg) { + if (IsLTO) { + CmdArgs.push_back(Args.MakeArgString("-plugin-opt=" + Arg)); + } else { + CmdArgs.push_back("-mllvm"); + CmdArgs.push_back(Args.MakeArgString(Arg)); + } + }; + + if (Arg *A = Args.getLastArg(options::OPT_moutline, + options::OPT_mno_outline)) { + if (A->getOption().matches(options::OPT_moutline)) { + // We only support -moutline in AArch64 and ARM targets right now. If + // we're not compiling for these, emit a warning and ignore the flag. + // Otherwise, add the proper mllvm flags. 
+ if (!(Triple.isARM() || Triple.isThumb() || + Triple.getArch() == llvm::Triple::aarch64 || + Triple.getArch() == llvm::Triple::aarch64_32)) { + D.Diag(diag::warn_drv_moutline_unsupported_opt) << Triple.getArchName(); + } else { + addArg(Twine("-enable-machine-outliner")); + } + } else { + // Disable all outlining behaviour. + addArg(Twine("-enable-machine-outliner=never")); + } + } +} diff --git a/clang/lib/Driver/ToolChains/CommonArgs.h b/clang/lib/Driver/ToolChains/CommonArgs.h index 9a365f3760228d..187c340d1c3c30 100644 --- a/clang/lib/Driver/ToolChains/CommonArgs.h +++ b/clang/lib/Driver/ToolChains/CommonArgs.h @@ -141,6 +141,10 @@ void addX86AlignBranchArgs(const Driver &D, const llvm::opt::ArgList &Args, unsigned getOrCheckAMDGPUCodeObjectVersion(const Driver &D, const llvm::opt::ArgList &Args, bool Diagnose = false); + +void addMachineOutlinerArgs(const Driver &D, const llvm::opt::ArgList &Args, + llvm::opt::ArgStringList &CmdArgs, + const llvm::Triple &Triple, bool IsLTO); } // end namespace tools } // end namespace driver } // end namespace clang diff --git a/clang/test/Driver/arm-machine-outliner.c b/clang/test/Driver/arm-machine-outliner.c new file mode 100644 index 00000000000000..6ac82838323a96 --- /dev/null +++ b/clang/test/Driver/arm-machine-outliner.c @@ -0,0 +1,9 @@ +// REQUIRES: arm-registered-target +// RUN: %clang -target armv7-linux-gnueabihf -moutline -c %s -### 2>&1 | FileCheck %s -check-prefix=ON +// ON: "-mllvm" "-enable-machine-outliner" +// RUN: %clang -target armv7-linux-gnueabihf -flto -moutline %s -### 2>&1 | FileCheck %s -check-prefix=ON-LTO +// ON-LTO: "-plugin-opt=-enable-machine-outliner" +// RUN: %clang -target armv7-linux-gnueabihf -moutline -mno-outline -c %s -### 2>&1 | FileCheck %s -check-prefix=OFF +// OFF: "-mllvm" "-enable-machine-outliner=never" +// RUN: %clang -target armv7-linux-gnueabihf -flto -moutline -mno-outline %s -### 2>&1 | FileCheck %s -check-prefix=OFF-LTO +// OFF-LTO: 
"-plugin-opt=-enable-machine-outliner=never" From 3fa6cedb6be809092f8a8b27e63bd4f6dc526a08 Mon Sep 17 00:00:00 2001 From: Erich Keane Date: Wed, 6 Jan 2021 06:46:01 -0800 Subject: [PATCH 03/12] Fix MaterializeTemporaryExpr's type when its an incomplete array. Like the VarDecl that gets its type updated based on an init-list, this patch corrects the MaterializeTemporaryExpr's type to make sure it isn't creating an incomplete type, which leads to a handful of CodeGen crashes (see PR 47636). Based on @rsmith 's comments on D88236 Differential Revision: https://reviews.llvm.org/D88298 --- clang/lib/Sema/SemaInit.cpp | 14 +++++++++++++- clang/test/AST/pr47636.cpp | 26 ++++++++++++++++++++++++++ clang/test/CodeGenCXX/pr47636.cpp | 12 ++++++++++++ 3 files changed, 51 insertions(+), 1 deletion(-) create mode 100644 clang/test/AST/pr47636.cpp diff --git a/clang/lib/Sema/SemaInit.cpp b/clang/lib/Sema/SemaInit.cpp index b5f31bf403d449..38f6a5975ea3ce 100644 --- a/clang/lib/Sema/SemaInit.cpp +++ b/clang/lib/Sema/SemaInit.cpp @@ -8200,9 +8200,21 @@ ExprResult InitializationSequence::Perform(Sema &S, if (S.CheckExceptionSpecCompatibility(CurInit.get(), DestType)) return ExprError(); + QualType MTETy = Step->Type; + + // When this is an incomplete array type (such as when this is + // initializing an array of unknown bounds from an init list), use THAT + // type instead so that we propogate the array bounds. + if (MTETy->isIncompleteArrayType() && + !CurInit.get()->getType()->isIncompleteArrayType() && + S.Context.hasSameType( + MTETy->getPointeeOrArrayElementType(), + CurInit.get()->getType()->getPointeeOrArrayElementType())) + MTETy = CurInit.get()->getType(); + // Materialize the temporary into memory. 
MaterializeTemporaryExpr *MTE = S.CreateMaterializeTemporaryExpr( - Step->Type, CurInit.get(), Entity.getType()->isLValueReferenceType()); + MTETy, CurInit.get(), Entity.getType()->isLValueReferenceType()); CurInit = MTE; // If we're extending this temporary to automatic storage duration -- we diff --git a/clang/test/AST/pr47636.cpp b/clang/test/AST/pr47636.cpp new file mode 100644 index 00000000000000..29d2a0a06e7916 --- /dev/null +++ b/clang/test/AST/pr47636.cpp @@ -0,0 +1,26 @@ +// RUN: %clang_cc1 -fsyntax-only %s -ast-dump | FileCheck %s + +int(&&intu_rvref)[] {1,2,3,4}; +// CHECK: VarDecl 0x[[GLOB_ADDR:[0-9a-f]+]] {{.*}} intu_rvref 'int (&&)[4]' listinit +// CHECK-NEXT: ExprWithCleanups {{.*}} 'int [4]' xvalue +// CHECK-NEXT: MaterializeTemporaryExpr {{.*}} 'int [4]' xvalue extended by Var 0x[[GLOB_ADDR]] 'intu_rvref' 'int (&&)[4]' +// CHECK-NEXT: InitListExpr {{.*}} 'int [4]' + +// CHECK: FunctionDecl {{.*}} static_const +void static_const() { + static const int(&&intu_rvref)[] {1,2,3,4}; + // CHECK: VarDecl 0x[[STATIC_ADDR:[0-9a-f]+]] {{.*}} intu_rvref 'const int (&&)[4]' static listinit + // CHECK-NEXT: ExprWithCleanups {{.*}} 'const int [4]' xvalue + // CHECK-NEXT: MaterializeTemporaryExpr {{.*}} 'const int [4]' xvalue extended by Var 0x[[STATIC_ADDR]] 'intu_rvref' 'const int (&&)[4]' + // CHECK-NEXT: InitListExpr {{.*}} 'const int [4]' +} + +// CHECK: FunctionDecl {{.*}} const_expr +constexpr int const_expr() { + int(&&intu_rvref)[]{1, 2, 3, 4}; + // CHECK: VarDecl 0x[[CE_ADDR:[0-9a-f]+]] {{.*}} intu_rvref 'int (&&)[4]' listinit + // CHECK-NEXT: ExprWithCleanups {{.*}} 'int [4]' xvalue + // CHECK-NEXT: MaterializeTemporaryExpr {{.*}} 'int [4]' xvalue extended by Var 0x[[CE_ADDR]] 'intu_rvref' 'int (&&)[4]' + // CHECK-NEXT: InitListExpr {{.*}} 'int [4]' + return intu_rvref[0]; +} diff --git a/clang/test/CodeGenCXX/pr47636.cpp b/clang/test/CodeGenCXX/pr47636.cpp index 64fb44114bd2bb..b6b31d62376103 100644 --- a/clang/test/CodeGenCXX/pr47636.cpp +++ 
b/clang/test/CodeGenCXX/pr47636.cpp @@ -8,3 +8,15 @@ void foo() { // CHECK: @_ZZ3foovE10intu_rvref = internal constant [4 x i32]* @_ZGRZ3foovE10intu_rvref_ // CHECK: @_ZGRZ3foovE10intu_rvref_ = internal constant [4 x i32] [i32 1, i32 2, i32 3, i32 4] } + +// Example given on review, ensure this doesn't crash as well. +constexpr int f() { + // CHECK: i32 @_Z1fv() + int(&&intu_rvref)[]{1, 2, 3, 4}; + // CHECK: %{{.*}} = alloca [4 x i32]* + return intu_rvref[2]; +} + +void use_f() { + int i = f(); +} From 25c78de6d2a50d6f90fd6cd3f0010eb3df157a6c Mon Sep 17 00:00:00 2001 From: Lei Zhang Date: Wed, 6 Jan 2021 10:28:01 -0500 Subject: [PATCH 04/12] [mlir][spirv] Update pass docs Reviewed By: hanchung Differential Revision: https://reviews.llvm.org/D94174 --- mlir/include/mlir/Conversion/Passes.td | 38 +++++++++++++++++++++++--- 1 file changed, 34 insertions(+), 4 deletions(-) diff --git a/mlir/include/mlir/Conversion/Passes.td b/mlir/include/mlir/Conversion/Passes.td index 2dc438534a4482..6a6ba6bbb3717b 100644 --- a/mlir/include/mlir/Conversion/Passes.td +++ b/mlir/include/mlir/Conversion/Passes.td @@ -144,6 +144,18 @@ def ConvertGpuOpsToROCDLOps : Pass<"convert-gpu-to-rocdl", "gpu::GPUModuleOp"> { def ConvertGPUToSPIRV : Pass<"convert-gpu-to-spirv", "ModuleOp"> { let summary = "Convert GPU dialect to SPIR-V dialect"; + let description = [{ + This pass converts supported GPU device ops to SPIR-V ops. It does not + handle GPU host ops. + + A `gpu.func` op can have parameters to pass in resources. But in SPIR-V + entry functions cannot take parameters; they use descriptors to access + resources. By default, parameters to a `gpu.func` op will be converted to + global variables. These global variables will be assigned sequential binding + numbers following their order in the original `gpu.func` op, starting from + 0, in set 0. One can attach `spv.interface_var_abi` to those parameters + to control the set and binding if wanted. 
+ }]; let constructor = "mlir::createConvertGPUToSPIRVPass()"; let dependentDialects = ["spirv::SPIRVDialect"]; } @@ -155,6 +167,9 @@ def ConvertGPUToSPIRV : Pass<"convert-gpu-to-spirv", "ModuleOp"> { def ConvertGpuLaunchFuncToVulkanLaunchFunc : Pass<"convert-gpu-launch-to-vulkan-launch", "ModuleOp"> { let summary = "Convert gpu.launch_func to vulkanLaunch external call"; + let description = [{ + This pass is only intended for the mlir-vulkan-runner. + }]; let constructor = "mlir::createConvertGpuLaunchFuncToVulkanLaunchFuncPass()"; let dependentDialects = ["spirv::SPIRVDialect"]; } @@ -163,6 +178,9 @@ def ConvertVulkanLaunchFuncToVulkanCalls : Pass<"launch-func-to-vulkan", "ModuleOp"> { let summary = "Convert vulkanLaunch external call to Vulkan runtime external " "calls"; + let description = [{ + This pass is only intended for the mlir-vulkan-runner. + }]; let constructor = "mlir::createConvertVulkanLaunchFuncToVulkanCallsPass()"; let dependentDialects = ["LLVM::LLVMDialect"]; } @@ -194,7 +212,11 @@ def ConvertLinalgToStandard : Pass<"convert-linalg-to-std", "ModuleOp"> { //===----------------------------------------------------------------------===// def ConvertLinalgToSPIRV : Pass<"convert-linalg-to-spirv", "ModuleOp"> { - let summary = "Convert Linalg ops to SPIR-V ops"; + let summary = "Convert Linalg dialect to SPIR-V dialect"; + let description = [{ + This pass converts supported Linalg ops to SPIR-V ops. It's quite + experimental and are expected to migrate to other proper conversions. + }]; let constructor = "mlir::createLinalgToSPIRVPass()"; let dependentDialects = ["spirv::SPIRVDialect"]; } @@ -312,6 +334,10 @@ def ConvertShapeConstraints: Pass<"convert-shape-constraints", "FuncOp"> { def ConvertSPIRVToLLVM : Pass<"convert-spirv-to-llvm", "ModuleOp"> { let summary = "Convert SPIR-V dialect to LLVM dialect"; + let description = [{ + See https://mlir.llvm.org/docs/SPIRVToLLVMDialectConversion/ + for more details. 
+ }]; let constructor = "mlir::createConvertSPIRVToLLVMPass()"; let dependentDialects = ["LLVM::LLVMDialect"]; } @@ -375,12 +401,17 @@ def ConvertStandardToLLVM : Pass<"convert-std-to-llvm", "ModuleOp"> { def LegalizeStandardForSPIRV : Pass<"legalize-std-for-spirv"> { let summary = "Legalize standard ops for SPIR-V lowering"; + let description = [{ + The pass contains certain intra standard op conversions that are meant for + lowering to SPIR-V ops, e.g., folding subviews loads/stores to the original + loads/stores from/to the original memref. + }]; let constructor = "mlir::createLegalizeStdOpsForSPIRVLoweringPass()"; let dependentDialects = ["spirv::SPIRVDialect"]; } def ConvertStandardToSPIRV : Pass<"convert-std-to-spirv", "ModuleOp"> { - let summary = "Convert Standard Ops to SPIR-V dialect"; + let summary = "Convert Standard dialect to SPIR-V dialect"; let constructor = "mlir::createConvertStandardToSPIRVPass()"; let dependentDialects = ["spirv::SPIRVDialect"]; } @@ -459,8 +490,7 @@ def ConvertVectorToROCDL : Pass<"convert-vector-to-rocdl", "ModuleOp"> { //===----------------------------------------------------------------------===// def ConvertVectorToSPIRV : Pass<"convert-vector-to-spirv", "ModuleOp"> { - let summary = "Lower the operations from the vector dialect into the SPIR-V " - "dialect"; + let summary = "Convert Vector dialect to SPIR-V dialect"; let constructor = "mlir::createConvertVectorToSPIRVPass()"; let dependentDialects = ["spirv::SPIRVDialect"]; } From 46975b5b29e2ecbf97eb7be2b124d94f0ce4b45e Mon Sep 17 00:00:00 2001 From: Krzysztof Parzyszek Date: Wed, 6 Jan 2021 09:11:04 -0600 Subject: [PATCH 05/12] [Hexagon] Wrap functions only used in asserts in ifndef NDEBUG --- llvm/lib/Target/Hexagon/HexagonVectorCombine.cpp | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/llvm/lib/Target/Hexagon/HexagonVectorCombine.cpp b/llvm/lib/Target/Hexagon/HexagonVectorCombine.cpp index 05269d37f812b2..01fd8a9ef9ce16 100644 --- 
a/llvm/lib/Target/Hexagon/HexagonVectorCombine.cpp +++ b/llvm/lib/Target/Hexagon/HexagonVectorCombine.cpp @@ -117,8 +117,11 @@ class HexagonVectorCombine { const HexagonSubtarget &HST; private: +#ifndef NDEBUG + // These two functions are only used for assertions at the moment. bool isByteVecTy(Type *Ty) const; - bool isSectorTy(Type *Ty) const LLVM_ATTRIBUTE_UNUSED; + bool isSectorTy(Type *Ty) const; +#endif Value *getElementRange(IRBuilder<> &Builder, Value *Lo, Value *Hi, int Start, int Length) const; }; @@ -1406,6 +1409,7 @@ auto HexagonVectorCombine::isSafeToMoveBeforeInBB(const Instruction &In, return true; } +#ifndef NDEBUG auto HexagonVectorCombine::isByteVecTy(Type *Ty) const -> bool { if (auto *VecTy = dyn_cast(Ty)) return VecTy->getElementType() == getByteTy(); @@ -1420,6 +1424,7 @@ auto HexagonVectorCombine::isSectorTy(Type *Ty) const -> bool { return Size == static_cast(HST.getVectorLength()); return Size == 4 || Size == 8; } +#endif auto HexagonVectorCombine::getElementRange(IRBuilder<> &Builder, Value *Lo, Value *Hi, int Start, From b69fe6a85db43df27ebb260716d41a3e1b0d7534 Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Wed, 6 Jan 2021 15:44:36 +0000 Subject: [PATCH 06/12] [X86] Add icmp ne/eq (srl (ctlz x), log2(bw)) test coverage. Add vector coverage as well (which isn't currently supported). 
--- llvm/test/CodeGen/X86/lzcnt-cmp.ll | 258 +++++++++++++++++++++++++++++ 1 file changed, 258 insertions(+) create mode 100644 llvm/test/CodeGen/X86/lzcnt-cmp.ll diff --git a/llvm/test/CodeGen/X86/lzcnt-cmp.ll b/llvm/test/CodeGen/X86/lzcnt-cmp.ll new file mode 100644 index 00000000000000..435b09dd5d088b --- /dev/null +++ b/llvm/test/CodeGen/X86/lzcnt-cmp.ll @@ -0,0 +1,258 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=i686-- -mattr=+lzcnt | FileCheck %s --check-prefixes=X86 +; RUN: llc < %s -mtriple=x86_64-- -mattr=+lzcnt | FileCheck %s --check-prefix=X64 + +define i1 @lshr_ctlz_cmpeq_one_i64(i64 %in) { +; X86-LABEL: lshr_ctlz_cmpeq_one_i64: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: orl {{[0-9]+}}(%esp), %eax +; X86-NEXT: sete %al +; X86-NEXT: retl +; +; X64-LABEL: lshr_ctlz_cmpeq_one_i64: +; X64: # %bb.0: +; X64-NEXT: testq %rdi, %rdi +; X64-NEXT: sete %al +; X64-NEXT: retq + %ctlz = call i64 @llvm.ctlz.i64(i64 %in, i1 0) + %lshr = lshr i64 %ctlz, 6 + %icmp = icmp eq i64 %lshr, 1 + ret i1 %icmp +} + +define i1 @lshr_ctlz_undef_cmpeq_one_i64(i64 %in) { +; X86-LABEL: lshr_ctlz_undef_cmpeq_one_i64: +; X86: # %bb.0: +; X86-NEXT: xorl %eax, %eax +; X86-NEXT: cmpl $0, {{[0-9]+}}(%esp) +; X86-NEXT: jne .LBB1_2 +; X86-NEXT: # %bb.1: +; X86-NEXT: lzcntl {{[0-9]+}}(%esp), %eax +; X86-NEXT: addl $32, %eax +; X86-NEXT: .LBB1_2: +; X86-NEXT: testb $64, %al +; X86-NEXT: setne %al +; X86-NEXT: retl +; +; X64-LABEL: lshr_ctlz_undef_cmpeq_one_i64: +; X64: # %bb.0: +; X64-NEXT: lzcntq %rdi, %rax +; X64-NEXT: shrq $6, %rax +; X64-NEXT: cmpl $1, %eax +; X64-NEXT: sete %al +; X64-NEXT: retq + %ctlz = call i64 @llvm.ctlz.i64(i64 %in, i1 -1) + %lshr = lshr i64 %ctlz, 6 + %icmp = icmp eq i64 %lshr, 1 + ret i1 %icmp +} + +define i1 @lshr_ctlz_cmpne_zero_i64(i64 %in) { +; X86-LABEL: lshr_ctlz_cmpne_zero_i64: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: orl {{[0-9]+}}(%esp), 
%eax +; X86-NEXT: sete %al +; X86-NEXT: retl +; +; X64-LABEL: lshr_ctlz_cmpne_zero_i64: +; X64: # %bb.0: +; X64-NEXT: testq %rdi, %rdi +; X64-NEXT: sete %al +; X64-NEXT: retq + %ctlz = call i64 @llvm.ctlz.i64(i64 %in, i1 0) + %lshr = lshr i64 %ctlz, 6 + %icmp = icmp ne i64 %lshr, 0 + ret i1 %icmp +} + +define i1 @lshr_ctlz_undef_cmpne_zero_i64(i64 %in) { +; X86-LABEL: lshr_ctlz_undef_cmpne_zero_i64: +; X86: # %bb.0: +; X86-NEXT: xorl %eax, %eax +; X86-NEXT: cmpl $0, {{[0-9]+}}(%esp) +; X86-NEXT: jne .LBB3_2 +; X86-NEXT: # %bb.1: +; X86-NEXT: lzcntl {{[0-9]+}}(%esp), %eax +; X86-NEXT: addl $32, %eax +; X86-NEXT: .LBB3_2: +; X86-NEXT: testb $64, %al +; X86-NEXT: setne %al +; X86-NEXT: retl +; +; X64-LABEL: lshr_ctlz_undef_cmpne_zero_i64: +; X64: # %bb.0: +; X64-NEXT: lzcntq %rdi, %rax +; X64-NEXT: testb $64, %al +; X64-NEXT: setne %al +; X64-NEXT: retq + %ctlz = call i64 @llvm.ctlz.i64(i64 %in, i1 -1) + %lshr = lshr i64 %ctlz, 6 + %icmp = icmp ne i64 %lshr, 0 + ret i1 %icmp +} + +define <2 x i64> @lshr_ctlz_cmpeq_zero_v2i64(<2 x i64> %in) { +; X86-LABEL: lshr_ctlz_cmpeq_zero_v2i64: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: xorl %edx, %edx +; X86-NEXT: cmpl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, %ecx +; X86-NEXT: jne .LBB4_2 +; X86-NEXT: # %bb.1: +; X86-NEXT: lzcntl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: addl $32, %ecx +; X86-NEXT: .LBB4_2: +; X86-NEXT: cmpl $0, {{[0-9]+}}(%esp) +; X86-NEXT: jne .LBB4_4 +; X86-NEXT: # %bb.3: +; X86-NEXT: lzcntl {{[0-9]+}}(%esp), %edx +; X86-NEXT: addl $32, %edx +; X86-NEXT: .LBB4_4: +; X86-NEXT: andl $-64, %edx +; X86-NEXT: cmpl $1, %edx +; X86-NEXT: sbbl %edx, %edx +; X86-NEXT: andl $-64, %ecx +; X86-NEXT: cmpl $1, %ecx +; X86-NEXT: sbbl %ecx, %ecx +; X86-NEXT: movl %ecx, 12(%eax) +; X86-NEXT: movl %ecx, 8(%eax) +; X86-NEXT: movl %edx, 4(%eax) +; X86-NEXT: movl %edx, (%eax) +; X86-NEXT: retl $4 +; +; X64-LABEL: lshr_ctlz_cmpeq_zero_v2i64: +; X64: # %bb.0: +; X64-NEXT: movdqa %xmm0, %xmm1 +; X64-NEXT: 
psrlq $1, %xmm1 +; X64-NEXT: por %xmm0, %xmm1 +; X64-NEXT: movdqa %xmm1, %xmm0 +; X64-NEXT: psrlq $2, %xmm0 +; X64-NEXT: por %xmm1, %xmm0 +; X64-NEXT: movdqa %xmm0, %xmm1 +; X64-NEXT: psrlq $4, %xmm1 +; X64-NEXT: por %xmm0, %xmm1 +; X64-NEXT: movdqa %xmm1, %xmm0 +; X64-NEXT: psrlq $8, %xmm0 +; X64-NEXT: por %xmm1, %xmm0 +; X64-NEXT: movdqa %xmm0, %xmm1 +; X64-NEXT: psrlq $16, %xmm1 +; X64-NEXT: por %xmm0, %xmm1 +; X64-NEXT: movdqa %xmm1, %xmm0 +; X64-NEXT: psrlq $32, %xmm0 +; X64-NEXT: por %xmm1, %xmm0 +; X64-NEXT: pcmpeqd %xmm1, %xmm1 +; X64-NEXT: pxor %xmm0, %xmm1 +; X64-NEXT: movdqa %xmm1, %xmm0 +; X64-NEXT: psrlw $1, %xmm0 +; X64-NEXT: pand {{.*}}(%rip), %xmm0 +; X64-NEXT: psubb %xmm0, %xmm1 +; X64-NEXT: movdqa {{.*#+}} xmm0 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51] +; X64-NEXT: movdqa %xmm1, %xmm2 +; X64-NEXT: pand %xmm0, %xmm2 +; X64-NEXT: psrlw $2, %xmm1 +; X64-NEXT: pand %xmm0, %xmm1 +; X64-NEXT: paddb %xmm2, %xmm1 +; X64-NEXT: movdqa %xmm1, %xmm2 +; X64-NEXT: psrlw $4, %xmm2 +; X64-NEXT: paddb %xmm1, %xmm2 +; X64-NEXT: pand {{.*}}(%rip), %xmm2 +; X64-NEXT: pxor %xmm0, %xmm0 +; X64-NEXT: psadbw %xmm0, %xmm2 +; X64-NEXT: psrlq $6, %xmm2 +; X64-NEXT: pcmpeqd %xmm0, %xmm2 +; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,0,3,2] +; X64-NEXT: pand %xmm2, %xmm0 +; X64-NEXT: retq + %ctlz = call <2 x i64> @llvm.ctlz.v2i64(<2 x i64> %in, i1 0) + %lshr = lshr <2 x i64> %ctlz, + %icmp = icmp eq <2 x i64> %lshr, zeroinitializer + %sext = sext <2 x i1> %icmp to <2 x i64> + ret <2 x i64> %sext +} + +define <2 x i64> @lshr_ctlz_cmpne_zero_v2i64(<2 x i64> %in) { +; X86-LABEL: lshr_ctlz_cmpne_zero_v2i64: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: xorl %edx, %edx +; X86-NEXT: cmpl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, %ecx +; X86-NEXT: jne .LBB5_2 +; X86-NEXT: # %bb.1: +; X86-NEXT: lzcntl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: addl $32, %ecx +; X86-NEXT: .LBB5_2: +; X86-NEXT: cmpl $0, {{[0-9]+}}(%esp) +; X86-NEXT: jne .LBB5_4 +; X86-NEXT: # 
%bb.3: +; X86-NEXT: lzcntl {{[0-9]+}}(%esp), %edx +; X86-NEXT: addl $32, %edx +; X86-NEXT: .LBB5_4: +; X86-NEXT: andl $-64, %edx +; X86-NEXT: negl %edx +; X86-NEXT: sbbl %edx, %edx +; X86-NEXT: andl $-64, %ecx +; X86-NEXT: negl %ecx +; X86-NEXT: sbbl %ecx, %ecx +; X86-NEXT: movl %ecx, 12(%eax) +; X86-NEXT: movl %ecx, 8(%eax) +; X86-NEXT: movl %edx, 4(%eax) +; X86-NEXT: movl %edx, (%eax) +; X86-NEXT: retl $4 +; +; X64-LABEL: lshr_ctlz_cmpne_zero_v2i64: +; X64: # %bb.0: +; X64-NEXT: movdqa %xmm0, %xmm1 +; X64-NEXT: psrlq $1, %xmm1 +; X64-NEXT: por %xmm0, %xmm1 +; X64-NEXT: movdqa %xmm1, %xmm0 +; X64-NEXT: psrlq $2, %xmm0 +; X64-NEXT: por %xmm1, %xmm0 +; X64-NEXT: movdqa %xmm0, %xmm1 +; X64-NEXT: psrlq $4, %xmm1 +; X64-NEXT: por %xmm0, %xmm1 +; X64-NEXT: movdqa %xmm1, %xmm0 +; X64-NEXT: psrlq $8, %xmm0 +; X64-NEXT: por %xmm1, %xmm0 +; X64-NEXT: movdqa %xmm0, %xmm1 +; X64-NEXT: psrlq $16, %xmm1 +; X64-NEXT: por %xmm0, %xmm1 +; X64-NEXT: movdqa %xmm1, %xmm2 +; X64-NEXT: psrlq $32, %xmm2 +; X64-NEXT: por %xmm1, %xmm2 +; X64-NEXT: pcmpeqd %xmm1, %xmm1 +; X64-NEXT: pxor %xmm1, %xmm2 +; X64-NEXT: movdqa %xmm2, %xmm0 +; X64-NEXT: psrlw $1, %xmm0 +; X64-NEXT: pand {{.*}}(%rip), %xmm0 +; X64-NEXT: psubb %xmm0, %xmm2 +; X64-NEXT: movdqa {{.*#+}} xmm0 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51] +; X64-NEXT: movdqa %xmm2, %xmm3 +; X64-NEXT: pand %xmm0, %xmm3 +; X64-NEXT: psrlw $2, %xmm2 +; X64-NEXT: pand %xmm0, %xmm2 +; X64-NEXT: paddb %xmm3, %xmm2 +; X64-NEXT: movdqa %xmm2, %xmm0 +; X64-NEXT: psrlw $4, %xmm0 +; X64-NEXT: paddb %xmm2, %xmm0 +; X64-NEXT: pand {{.*}}(%rip), %xmm0 +; X64-NEXT: pxor %xmm2, %xmm2 +; X64-NEXT: psadbw %xmm2, %xmm0 +; X64-NEXT: psrlq $6, %xmm0 +; X64-NEXT: pcmpeqd %xmm2, %xmm0 +; X64-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,0,3,2] +; X64-NEXT: pand %xmm2, %xmm0 +; X64-NEXT: pxor %xmm1, %xmm0 +; X64-NEXT: retq + %ctlz = call <2 x i64> @llvm.ctlz.v2i64(<2 x i64> %in, i1 0) + %lshr = lshr <2 x i64> %ctlz, + %icmp = icmp ne <2 x i64> %lshr, 
zeroinitializer + %sext = sext <2 x i1> %icmp to <2 x i64> + ret <2 x i64> %sext +} + +declare i64 @llvm.ctlz.i64(i64, i1) +declare <2 x i64> @llvm.ctlz.v2i64(<2 x i64>, i1) From 500864f928c272e8ebfd6493cb749083124bfd8b Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Wed, 6 Jan 2021 15:50:11 +0000 Subject: [PATCH 07/12] Remove some unused includes. NFCI. <vector> (unlike many other c++ headers) is relatively clean, so if the file doesn't use std::vector then it shouldn't need the header. --- llvm/include/llvm/Analysis/InlineAdvisor.h | 6 ++---- llvm/include/llvm/CodeGen/CodeGenPassBuilder.h | 1 - llvm/include/llvm/ExecutionEngine/JITEventListener.h | 1 - 3 files changed, 2 insertions(+), 6 deletions(-) diff --git a/llvm/include/llvm/Analysis/InlineAdvisor.h b/llvm/include/llvm/Analysis/InlineAdvisor.h index 4dbd5786ac7dd5..f051706dca16c5 100644 --- a/llvm/include/llvm/Analysis/InlineAdvisor.h +++ b/llvm/include/llvm/Analysis/InlineAdvisor.h @@ -9,13 +9,11 @@ #ifndef LLVM_INLINEADVISOR_H_ #define LLVM_INLINEADVISOR_H_ -#include -#include -#include - #include "llvm/Analysis/InlineCost.h" #include "llvm/Config/llvm-config.h" #include "llvm/IR/PassManager.h" +#include +#include namespace llvm { class BasicBlock; diff --git a/llvm/include/llvm/CodeGen/CodeGenPassBuilder.h b/llvm/include/llvm/CodeGen/CodeGenPassBuilder.h index b47aaa53eb89ff..893bc6e013f403 100644 --- a/llvm/include/llvm/CodeGen/CodeGenPassBuilder.h +++ b/llvm/include/llvm/CodeGen/CodeGenPassBuilder.h @@ -57,7 +57,6 @@ #include #include #include -#include namespace llvm { diff --git a/llvm/include/llvm/ExecutionEngine/JITEventListener.h b/llvm/include/llvm/ExecutionEngine/JITEventListener.h index 606b6f7cc12841..4eefd993de2be4 100644 --- a/llvm/include/llvm/ExecutionEngine/JITEventListener.h +++ b/llvm/include/llvm/ExecutionEngine/JITEventListener.h @@ -20,7 +20,6 @@ #include "llvm/IR/DebugLoc.h" #include "llvm/Support/CBindingWrapping.h" #include -#include namespace llvm { From 
350247a93c07906300b79955ff882004a92ae368 Mon Sep 17 00:00:00 2001 From: Nicholas Guy Date: Wed, 2 Dec 2020 15:15:23 +0000 Subject: [PATCH 08/12] [AArch64] Rearrange mul(dup(sext/zext)) to mul(sext/zext(dup)) Performing this rearrangement allows for existing patterns to match cases where the vector may be built after an extend, instead of before. Differential Revision: https://reviews.llvm.org/D91255 --- .../Target/AArch64/AArch64ISelLowering.cpp | 143 ++++++++ .../AArch64/aarch64-dup-ext-scalable.ll | 327 ++++++++++++++++++ llvm/test/CodeGen/AArch64/aarch64-dup-ext.ll | 185 ++++++++++ 3 files changed, 655 insertions(+) create mode 100644 llvm/test/CodeGen/AArch64/aarch64-dup-ext-scalable.ll create mode 100644 llvm/test/CodeGen/AArch64/aarch64-dup-ext.ll diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index 41dc285a368d17..40435c12ca3b42 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -11705,9 +11705,152 @@ static bool IsSVECntIntrinsic(SDValue S) { return false; } +/// Calculates what the pre-extend type is, based on the extension +/// operation node provided by \p Extend. +/// +/// In the case that \p Extend is a SIGN_EXTEND or a ZERO_EXTEND, the +/// pre-extend type is pulled directly from the operand, while other extend +/// operations need a bit more inspection to get this information. 
+/// +/// \param Extend The SDNode from the DAG that represents the extend operation +/// \param DAG The SelectionDAG hosting the \p Extend node +/// +/// \returns The type representing the \p Extend source type, or \p MVT::Other +/// if no valid type can be determined +static EVT calculatePreExtendType(SDValue Extend, SelectionDAG &DAG) { + switch (Extend.getOpcode()) { + case ISD::SIGN_EXTEND: + case ISD::ZERO_EXTEND: + return Extend.getOperand(0).getValueType(); + case ISD::AssertSext: + case ISD::AssertZext: + case ISD::SIGN_EXTEND_INREG: { + VTSDNode *TypeNode = dyn_cast(Extend.getOperand(1)); + if (!TypeNode) + return MVT::Other; + return TypeNode->getVT(); + } + case ISD::AND: { + ConstantSDNode *Constant = + dyn_cast(Extend.getOperand(1).getNode()); + if (!Constant) + return MVT::Other; + + uint32_t Mask = Constant->getZExtValue(); + + if (Mask == UCHAR_MAX) + return MVT::i8; + else if (Mask == USHRT_MAX) + return MVT::i16; + else if (Mask == UINT_MAX) + return MVT::i32; + + return MVT::Other; + } + default: + return MVT::Other; + } + + llvm_unreachable("Code path unhandled in calculatePreExtendType!"); +} + +/// Combines a dup(sext/zext) node pattern into sext/zext(dup) +/// making use of the vector SExt/ZExt rather than the scalar SExt/ZExt +static SDValue performCommonVectorExtendCombine(SDValue VectorShuffle, + SelectionDAG &DAG) { + + ShuffleVectorSDNode *ShuffleNode = + dyn_cast(VectorShuffle.getNode()); + if (!ShuffleNode) + return SDValue(); + + // Ensuring the mask is zero before continuing + if (!ShuffleNode->isSplat() || ShuffleNode->getSplatIndex() != 0) + return SDValue(); + + SDValue InsertVectorElt = VectorShuffle.getOperand(0); + + if (InsertVectorElt.getOpcode() != ISD::INSERT_VECTOR_ELT) + return SDValue(); + + SDValue InsertLane = InsertVectorElt.getOperand(2); + ConstantSDNode *Constant = dyn_cast(InsertLane.getNode()); + // Ensures the insert is inserting into lane 0 + if (!Constant || Constant->getZExtValue() != 0) + return SDValue(); 
+ + SDValue Extend = InsertVectorElt.getOperand(1); + unsigned ExtendOpcode = Extend.getOpcode(); + + bool IsSExt = ExtendOpcode == ISD::SIGN_EXTEND || + ExtendOpcode == ISD::SIGN_EXTEND_INREG || + ExtendOpcode == ISD::AssertSext; + if (!IsSExt && ExtendOpcode != ISD::ZERO_EXTEND && + ExtendOpcode != ISD::AssertZext && ExtendOpcode != ISD::AND) + return SDValue(); + + EVT TargetType = VectorShuffle.getValueType(); + EVT PreExtendType = calculatePreExtendType(Extend, DAG); + + if ((TargetType != MVT::v8i16 && TargetType != MVT::v4i32 && + TargetType != MVT::v2i64) || + (PreExtendType == MVT::Other)) + return SDValue(); + + EVT PreExtendVT = TargetType.changeVectorElementType(PreExtendType); + + if (PreExtendVT.getVectorElementCount() != TargetType.getVectorElementCount()) + return SDValue(); + + if (TargetType.getScalarSizeInBits() != PreExtendVT.getScalarSizeInBits() * 2) + return SDValue(); + + SDLoc DL(VectorShuffle); + + SDValue InsertVectorNode = DAG.getNode( + InsertVectorElt.getOpcode(), DL, PreExtendVT, DAG.getUNDEF(PreExtendVT), + Extend.getOperand(0), DAG.getConstant(0, DL, MVT::i64)); + + std::vector ShuffleMask(TargetType.getVectorElementCount().getValue()); + + SDValue VectorShuffleNode = + DAG.getVectorShuffle(PreExtendVT, DL, InsertVectorNode, + DAG.getUNDEF(PreExtendVT), ShuffleMask); + + SDValue ExtendNode = + DAG.getNode(IsSExt ? 
ISD::SIGN_EXTEND : ISD::ZERO_EXTEND, DL, TargetType, + VectorShuffleNode, DAG.getValueType(TargetType)); + + return ExtendNode; +} + +/// Combines a mul(dup(sext/zext)) node pattern into mul(sext/zext(dup)) +/// making use of the vector SExt/ZExt rather than the scalar SExt/ZExt +static SDValue performMulVectorExtendCombine(SDNode *Mul, SelectionDAG &DAG) { + // If the value type isn't a vector, none of the operands are going to be dups + if (!Mul->getValueType(0).isVector()) + return SDValue(); + + SDValue Op0 = performCommonVectorExtendCombine(Mul->getOperand(0), DAG); + SDValue Op1 = performCommonVectorExtendCombine(Mul->getOperand(1), DAG); + + // Neither operands have been changed, don't make any further changes + if (!Op0 && !Op1) + return SDValue(); + + SDLoc DL(Mul); + return DAG.getNode(Mul->getOpcode(), DL, Mul->getValueType(0), + Op0 ? Op0 : Mul->getOperand(0), + Op1 ? Op1 : Mul->getOperand(1)); +} + static SDValue performMulCombine(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const AArch64Subtarget *Subtarget) { + + if (SDValue Ext = performMulVectorExtendCombine(N, DAG)) + return Ext; + if (DCI.isBeforeLegalizeOps()) return SDValue(); diff --git a/llvm/test/CodeGen/AArch64/aarch64-dup-ext-scalable.ll b/llvm/test/CodeGen/AArch64/aarch64-dup-ext-scalable.ll new file mode 100644 index 00000000000000..082e2db5270536 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/aarch64-dup-ext-scalable.ll @@ -0,0 +1,327 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple aarch64-none-linux-gnu -mattr=+sve | FileCheck %s + +define @dupsext_v2i8_v2i16(i8 %src, %b) { +; CHECK-LABEL: dupsext_v2i8_v2i16: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: sxtb w8, w0 +; CHECK-NEXT: mov z1.d, x8 +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: mul z0.d, p0/m, z0.d, z1.d +; CHECK-NEXT: ret +entry: + %in = sext i8 %src to i16 + %broadcast.splatinsert = insertelement undef, i16 %in, i16 0 + %broadcast.splat = 
shufflevector %broadcast.splatinsert, undef, zeroinitializer + %out = mul nsw %broadcast.splat, %b + ret %out +} + +define @dupsext_v4i8_v4i16(i8 %src, %b) { +; CHECK-LABEL: dupsext_v4i8_v4i16: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: sxtb w8, w0 +; CHECK-NEXT: mov z1.s, w8 +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: mul z0.s, p0/m, z0.s, z1.s +; CHECK-NEXT: ret +entry: + %in = sext i8 %src to i16 + %broadcast.splatinsert = insertelement undef, i16 %in, i16 0 + %broadcast.splat = shufflevector %broadcast.splatinsert, undef, zeroinitializer + %out = mul nsw %broadcast.splat, %b + ret %out +} + +define @dupsext_v8i8_v8i16(i8 %src, %b) { +; CHECK-LABEL: dupsext_v8i8_v8i16: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: sxtb w8, w0 +; CHECK-NEXT: mov z1.h, w8 +; CHECK-NEXT: ptrue p0.h +; CHECK-NEXT: mul z0.h, p0/m, z0.h, z1.h +; CHECK-NEXT: ret +entry: + %in = sext i8 %src to i16 + %broadcast.splatinsert = insertelement undef, i16 %in, i16 0 + %broadcast.splat = shufflevector %broadcast.splatinsert, undef, zeroinitializer + %out = mul nsw %broadcast.splat, %b + ret %out +} + +define @dupsext_v2i8_v2i32(i8 %src, %b) { +; CHECK-LABEL: dupsext_v2i8_v2i32: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: sxtb w8, w0 +; CHECK-NEXT: mov z1.d, x8 +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: mul z0.d, p0/m, z0.d, z1.d +; CHECK-NEXT: ret +entry: + %in = sext i8 %src to i32 + %broadcast.splatinsert = insertelement undef, i32 %in, i32 0 + %broadcast.splat = shufflevector %broadcast.splatinsert, undef, zeroinitializer + %out = mul nsw %broadcast.splat, %b + ret %out +} + +define @dupsext_v4i8_v4i32(i8 %src, %b) { +; CHECK-LABEL: dupsext_v4i8_v4i32: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: sxtb w8, w0 +; CHECK-NEXT: mov z1.s, w8 +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: mul z0.s, p0/m, z0.s, z1.s +; CHECK-NEXT: ret +entry: + %in = sext i8 %src to i32 + %broadcast.splatinsert = insertelement undef, i32 %in, i32 0 + %broadcast.splat = shufflevector %broadcast.splatinsert, undef, 
zeroinitializer + %out = mul nsw %broadcast.splat, %b + ret %out +} + +define @dupsext_v2i8_v2i64(i8 %src, %b) { +; CHECK-LABEL: dupsext_v2i8_v2i64: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: // kill: def $w0 killed $w0 def $x0 +; CHECK-NEXT: sxtb x8, w0 +; CHECK-NEXT: mov z1.d, x8 +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: mul z0.d, p0/m, z0.d, z1.d +; CHECK-NEXT: ret +entry: + %in = sext i8 %src to i64 + %broadcast.splatinsert = insertelement undef, i64 %in, i64 0 + %broadcast.splat = shufflevector %broadcast.splatinsert, undef, zeroinitializer + %out = mul nsw %broadcast.splat, %b + ret %out +} + +define @dupsext_v2i16_v2i32(i16 %src, %b) { +; CHECK-LABEL: dupsext_v2i16_v2i32: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: sxth w8, w0 +; CHECK-NEXT: mov z1.d, x8 +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: mul z0.d, p0/m, z0.d, z1.d +; CHECK-NEXT: ret +entry: + %in = sext i16 %src to i32 + %broadcast.splatinsert = insertelement undef, i32 %in, i32 0 + %broadcast.splat = shufflevector %broadcast.splatinsert, undef, zeroinitializer + %out = mul nsw %broadcast.splat, %b + ret %out +} + +define @dupsext_v4i16_v4i32(i16 %src, %b) { +; CHECK-LABEL: dupsext_v4i16_v4i32: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: sxth w8, w0 +; CHECK-NEXT: mov z1.s, w8 +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: mul z0.s, p0/m, z0.s, z1.s +; CHECK-NEXT: ret +entry: + %in = sext i16 %src to i32 + %broadcast.splatinsert = insertelement undef, i32 %in, i32 0 + %broadcast.splat = shufflevector %broadcast.splatinsert, undef, zeroinitializer + %out = mul nsw %broadcast.splat, %b + ret %out +} + +define @dupsext_v2i16_v2i64(i16 %src, %b) { +; CHECK-LABEL: dupsext_v2i16_v2i64: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: // kill: def $w0 killed $w0 def $x0 +; CHECK-NEXT: sxth x8, w0 +; CHECK-NEXT: mov z1.d, x8 +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: mul z0.d, p0/m, z0.d, z1.d +; CHECK-NEXT: ret +entry: + %in = sext i16 %src to i64 + %broadcast.splatinsert = insertelement undef, i64 %in, i64 0 + 
%broadcast.splat = shufflevector %broadcast.splatinsert, undef, zeroinitializer + %out = mul nsw %broadcast.splat, %b + ret %out +} + +define @dupsext_v2i32_v2i64(i32 %src, %b) { +; CHECK-LABEL: dupsext_v2i32_v2i64: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: // kill: def $w0 killed $w0 def $x0 +; CHECK-NEXT: sxtw x8, w0 +; CHECK-NEXT: mov z1.d, x8 +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: mul z0.d, p0/m, z0.d, z1.d +; CHECK-NEXT: ret +entry: + %in = sext i32 %src to i64 + %broadcast.splatinsert = insertelement undef, i64 %in, i64 0 + %broadcast.splat = shufflevector %broadcast.splatinsert, undef, zeroinitializer + %out = mul nsw %broadcast.splat, %b + ret %out +} + +define @dupzext_v2i8_v2i16(i8 %src, %b) { +; CHECK-LABEL: dupzext_v2i8_v2i16: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: and w8, w0, #0xff +; CHECK-NEXT: mov z1.d, x8 +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: mul z0.d, p0/m, z0.d, z1.d +; CHECK-NEXT: ret +entry: + %in = zext i8 %src to i16 + %broadcast.splatinsert = insertelement undef, i16 %in, i16 0 + %broadcast.splat = shufflevector %broadcast.splatinsert, undef, zeroinitializer + %out = mul nuw %broadcast.splat, %b + ret %out +} + +define @dupzext_v4i8_v4i16(i8 %src, %b) { +; CHECK-LABEL: dupzext_v4i8_v4i16: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: and w8, w0, #0xff +; CHECK-NEXT: mov z1.s, w8 +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: mul z0.s, p0/m, z0.s, z1.s +; CHECK-NEXT: ret +entry: + %in = zext i8 %src to i16 + %broadcast.splatinsert = insertelement undef, i16 %in, i16 0 + %broadcast.splat = shufflevector %broadcast.splatinsert, undef, zeroinitializer + %out = mul nuw %broadcast.splat, %b + ret %out +} + +define @dupzext_v8i8_v8i16(i8 %src, %b) { +; CHECK-LABEL: dupzext_v8i8_v8i16: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: and w8, w0, #0xff +; CHECK-NEXT: mov z1.h, w8 +; CHECK-NEXT: ptrue p0.h +; CHECK-NEXT: mul z0.h, p0/m, z0.h, z1.h +; CHECK-NEXT: ret +entry: + %in = zext i8 %src to i16 + %broadcast.splatinsert = insertelement undef, 
i16 %in, i16 0 + %broadcast.splat = shufflevector %broadcast.splatinsert, undef, zeroinitializer + %out = mul nuw %broadcast.splat, %b + ret %out +} + +define @dupzext_v2i8_v2i32(i8 %src, %b) { +; CHECK-LABEL: dupzext_v2i8_v2i32: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: and w8, w0, #0xff +; CHECK-NEXT: mov z1.d, x8 +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: mul z0.d, p0/m, z0.d, z1.d +; CHECK-NEXT: ret +entry: + %in = zext i8 %src to i32 + %broadcast.splatinsert = insertelement undef, i32 %in, i32 0 + %broadcast.splat = shufflevector %broadcast.splatinsert, undef, zeroinitializer + %out = mul nuw %broadcast.splat, %b + ret %out +} + +define @dupzext_v4i8_v4i32(i8 %src, %b) { +; CHECK-LABEL: dupzext_v4i8_v4i32: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: and w8, w0, #0xff +; CHECK-NEXT: mov z1.s, w8 +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: mul z0.s, p0/m, z0.s, z1.s +; CHECK-NEXT: ret +entry: + %in = zext i8 %src to i32 + %broadcast.splatinsert = insertelement undef, i32 %in, i32 0 + %broadcast.splat = shufflevector %broadcast.splatinsert, undef, zeroinitializer + %out = mul nuw %broadcast.splat, %b + ret %out +} + +define @dupzext_v2i8_v2i64(i8 %src, %b) { +; CHECK-LABEL: dupzext_v2i8_v2i64: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: // kill: def $w0 killed $w0 def $x0 +; CHECK-NEXT: and x8, x0, #0xff +; CHECK-NEXT: mov z1.d, x8 +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: mul z0.d, p0/m, z0.d, z1.d +; CHECK-NEXT: ret +entry: + %in = zext i8 %src to i64 + %broadcast.splatinsert = insertelement undef, i64 %in, i64 0 + %broadcast.splat = shufflevector %broadcast.splatinsert, undef, zeroinitializer + %out = mul nuw %broadcast.splat, %b + ret %out +} + +define @dupzext_v2i16_v2i32(i16 %src, %b) { +; CHECK-LABEL: dupzext_v2i16_v2i32: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: and w8, w0, #0xffff +; CHECK-NEXT: mov z1.d, x8 +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: mul z0.d, p0/m, z0.d, z1.d +; CHECK-NEXT: ret +entry: + %in = zext i16 %src to i32 + 
%broadcast.splatinsert = insertelement undef, i32 %in, i32 0 + %broadcast.splat = shufflevector %broadcast.splatinsert, undef, zeroinitializer + %out = mul nuw %broadcast.splat, %b + ret %out +} + +define @dupzext_v4i16_v4i32(i16 %src, %b) { +; CHECK-LABEL: dupzext_v4i16_v4i32: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: and w8, w0, #0xffff +; CHECK-NEXT: mov z1.s, w8 +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: mul z0.s, p0/m, z0.s, z1.s +; CHECK-NEXT: ret +entry: + %in = zext i16 %src to i32 + %broadcast.splatinsert = insertelement undef, i32 %in, i32 0 + %broadcast.splat = shufflevector %broadcast.splatinsert, undef, zeroinitializer + %out = mul nuw %broadcast.splat, %b + ret %out +} + +define @dupzext_v2i16_v2i64(i16 %src, %b) { +; CHECK-LABEL: dupzext_v2i16_v2i64: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: // kill: def $w0 killed $w0 def $x0 +; CHECK-NEXT: and x8, x0, #0xffff +; CHECK-NEXT: mov z1.d, x8 +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: mul z0.d, p0/m, z0.d, z1.d +; CHECK-NEXT: ret +entry: + %in = zext i16 %src to i64 + %broadcast.splatinsert = insertelement undef, i64 %in, i64 0 + %broadcast.splat = shufflevector %broadcast.splatinsert, undef, zeroinitializer + %out = mul nuw %broadcast.splat, %b + ret %out +} + +define @dupzext_v2i32_v2i64(i32 %src, %b) { +; CHECK-LABEL: dupzext_v2i32_v2i64: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: mov w8, w0 +; CHECK-NEXT: mov z1.d, x8 +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: mul z0.d, p0/m, z0.d, z1.d +; CHECK-NEXT: ret +entry: + %in = zext i32 %src to i64 + %broadcast.splatinsert = insertelement undef, i64 %in, i64 0 + %broadcast.splat = shufflevector %broadcast.splatinsert, undef, zeroinitializer + %out = mul nuw %broadcast.splat, %b + ret %out +} diff --git a/llvm/test/CodeGen/AArch64/aarch64-dup-ext.ll b/llvm/test/CodeGen/AArch64/aarch64-dup-ext.ll new file mode 100644 index 00000000000000..07ac3c87d14304 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/aarch64-dup-ext.ll @@ -0,0 +1,185 @@ +; NOTE: Assertions 
have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple aarch64-none-linux-gnu | FileCheck %s + +; Supported combines + +define <8 x i16> @dupsext_v8i8_v8i16(i8 %src, <8 x i8> %b) { +; CHECK-LABEL: dupsext_v8i8_v8i16: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: dup v1.8b, w0 +; CHECK-NEXT: smull v0.8h, v1.8b, v0.8b +; CHECK-NEXT: ret +entry: + %in = sext i8 %src to i16 + %ext.b = sext <8 x i8> %b to <8 x i16> + %broadcast.splatinsert = insertelement <8 x i16> undef, i16 %in, i16 0 + %broadcast.splat = shufflevector <8 x i16> %broadcast.splatinsert, <8 x i16> undef, <8 x i32> zeroinitializer + %out = mul nsw <8 x i16> %broadcast.splat, %ext.b + ret <8 x i16> %out +} + +define <8 x i16> @dupzext_v8i8_v8i16(i8 %src, <8 x i8> %b) { +; CHECK-LABEL: dupzext_v8i8_v8i16: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: dup v1.8b, w0 +; CHECK-NEXT: umull v0.8h, v1.8b, v0.8b +; CHECK-NEXT: ret +entry: + %in = zext i8 %src to i16 + %ext.b = zext <8 x i8> %b to <8 x i16> + %broadcast.splatinsert = insertelement <8 x i16> undef, i16 %in, i16 0 + %broadcast.splat = shufflevector <8 x i16> %broadcast.splatinsert, <8 x i16> undef, <8 x i32> zeroinitializer + %out = mul nuw <8 x i16> %broadcast.splat, %ext.b + ret <8 x i16> %out +} + +define <4 x i32> @dupsext_v4i16_v4i32(i16 %src, <4 x i16> %b) { +; CHECK-LABEL: dupsext_v4i16_v4i32: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: dup v1.4h, w0 +; CHECK-NEXT: smull v0.4s, v1.4h, v0.4h +; CHECK-NEXT: ret +entry: + %in = sext i16 %src to i32 + %ext.b = sext <4 x i16> %b to <4 x i32> + %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %in, i32 0 + %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer + %out = mul nsw <4 x i32> %broadcast.splat, %ext.b + ret <4 x i32> %out +} + +define <4 x i32> @dupzext_v4i16_v4i32(i16 %src, <4 x i16> %b) { +; CHECK-LABEL: dupzext_v4i16_v4i32: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: dup v1.4h, w0 +; 
CHECK-NEXT: umull v0.4s, v1.4h, v0.4h +; CHECK-NEXT: ret +entry: + %in = zext i16 %src to i32 + %ext.b = zext <4 x i16> %b to <4 x i32> + %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %in, i32 0 + %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer + %out = mul nuw <4 x i32> %broadcast.splat, %ext.b + ret <4 x i32> %out +} + +define <2 x i64> @dupsext_v2i32_v2i64(i32 %src, <2 x i32> %b) { +; CHECK-LABEL: dupsext_v2i32_v2i64: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: dup v1.2s, w0 +; CHECK-NEXT: smull v0.2d, v1.2s, v0.2s +; CHECK-NEXT: ret +entry: + %in = sext i32 %src to i64 + %ext.b = sext <2 x i32> %b to <2 x i64> + %broadcast.splatinsert = insertelement <2 x i64> undef, i64 %in, i64 0 + %broadcast.splat = shufflevector <2 x i64> %broadcast.splatinsert, <2 x i64> undef, <2 x i32> zeroinitializer + %out = mul nsw <2 x i64> %broadcast.splat, %ext.b + ret <2 x i64> %out +} + +define <2 x i64> @dupzext_v2i32_v2i64(i32 %src, <2 x i32> %b) { +; CHECK-LABEL: dupzext_v2i32_v2i64: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: dup v1.2s, w0 +; CHECK-NEXT: umull v0.2d, v1.2s, v0.2s +; CHECK-NEXT: ret +entry: + %in = zext i32 %src to i64 + %ext.b = zext <2 x i32> %b to <2 x i64> + %broadcast.splatinsert = insertelement <2 x i64> undef, i64 %in, i64 0 + %broadcast.splat = shufflevector <2 x i64> %broadcast.splatinsert, <2 x i64> undef, <2 x i32> zeroinitializer + %out = mul nuw <2 x i64> %broadcast.splat, %ext.b + ret <2 x i64> %out +} + +; Unsupported combines + +define <2 x i16> @dupsext_v2i8_v2i16(i8 %src, <2 x i8> %b) { +; CHECK-LABEL: dupsext_v2i8_v2i16: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: sxtb w8, w0 +; CHECK-NEXT: shl v0.2s, v0.2s, #24 +; CHECK-NEXT: sshr v0.2s, v0.2s, #24 +; CHECK-NEXT: dup v1.2s, w8 +; CHECK-NEXT: mul v0.2s, v1.2s, v0.2s +; CHECK-NEXT: ret +entry: + %in = sext i8 %src to i16 + %ext.b = sext <2 x i8> %b to <2 x i16> + %broadcast.splatinsert = insertelement <2 x i16> 
undef, i16 %in, i16 0 + %broadcast.splat = shufflevector <2 x i16> %broadcast.splatinsert, <2 x i16> undef, <2 x i32> zeroinitializer + %out = mul nsw <2 x i16> %broadcast.splat, %ext.b + ret <2 x i16> %out +} + +define <2 x i64> @dupzext_v2i16_v2i64(i16 %src, <2 x i16> %b) { +; CHECK-LABEL: dupzext_v2i16_v2i64: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: movi d1, #0x00ffff0000ffff +; CHECK-NEXT: and v0.8b, v0.8b, v1.8b +; CHECK-NEXT: ushll v0.2d, v0.2s, #0 +; CHECK-NEXT: // kill: def $w0 killed $w0 def $x0 +; CHECK-NEXT: and x8, x0, #0xffff +; CHECK-NEXT: fmov x10, d0 +; CHECK-NEXT: mov x9, v0.d[1] +; CHECK-NEXT: mul x10, x8, x10 +; CHECK-NEXT: mul x8, x8, x9 +; CHECK-NEXT: fmov d0, x10 +; CHECK-NEXT: mov v0.d[1], x8 +; CHECK-NEXT: ret +entry: + %in = zext i16 %src to i64 + %ext.b = zext <2 x i16> %b to <2 x i64> + %broadcast.splatinsert = insertelement <2 x i64> undef, i64 %in, i64 0 + %broadcast.splat = shufflevector <2 x i64> %broadcast.splatinsert, <2 x i64> undef, <2 x i32> zeroinitializer + %out = mul nuw <2 x i64> %broadcast.splat, %ext.b + ret <2 x i64> %out +} + +; dupsext_v4i8_v4i16 +; dupsext_v2i8_v2i32 +; dupsext_v4i8_v4i32 +; dupsext_v2i8_v2i64 +; dupsext_v2i16_v2i32 +; dupsext_v2i16_v2i64 +; dupzext_v2i8_v2i16 +; dupzext_v4i8_v4i16 +; dupzext_v2i8_v2i32 +; dupzext_v4i8_v4i32 +; dupzext_v2i8_v2i64 +; dupzext_v2i16_v2i32 +; dupzext_v2i16_v2i64 + +; Unsupported states + +define <8 x i16> @nonsplat_shuffleinsert(i8 %src, <8 x i8> %b) { +; CHECK-LABEL: nonsplat_shuffleinsert: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: sxtb w8, w0 +; CHECK-NEXT: sshll v0.8h, v0.8b, #0 +; CHECK-NEXT: dup v1.8h, w8 +; CHECK-NEXT: mul v0.8h, v1.8h, v0.8h +; CHECK-NEXT: ret +entry: + %in = sext i8 %src to i16 + %ext.b = sext <8 x i8> %b to <8 x i16> + %broadcast.splatinsert = insertelement <8 x i16> undef, i16 %in, i16 1 + %broadcast.splat = shufflevector <8 x i16> %broadcast.splatinsert, <8 x i16> undef, <8 x i32> + %out = mul nsw <8 x i16> %broadcast.splat, %ext.b + ret 
<8 x i16> %out +} + +define <8 x i16> @missing_insert(<8 x i8> %b) { +; CHECK-LABEL: missing_insert: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: sshll v0.8h, v0.8b, #0 +; CHECK-NEXT: ext v1.16b, v0.16b, v0.16b, #4 +; CHECK-NEXT: mul v0.8h, v1.8h, v0.8h +; CHECK-NEXT: ret +entry: + %ext.b = sext <8 x i8> %b to <8 x i16> + %broadcast.splat = shufflevector <8 x i16> %ext.b, <8 x i16> undef, <8 x i32> + %out = mul nsw <8 x i16> %broadcast.splat, %ext.b + ret <8 x i16> %out +} From 1307e3f6c46cc3a6e6ad9cd46fc67efafcac939e Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Wed, 6 Jan 2021 16:13:36 +0000 Subject: [PATCH 09/12] [TargetLowering] Add icmp ne/eq (srl (ctlz x), log2(bw)) vector support. --- .../CodeGen/SelectionDAG/TargetLowering.cpp | 41 +++-- llvm/test/CodeGen/X86/lzcnt-cmp.ll | 171 +++++------------- 2 files changed, 66 insertions(+), 146 deletions(-) diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp index d895a53e5a83af..f5abb2c513fbe9 100644 --- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp @@ -3486,35 +3486,36 @@ SDValue TargetLowering::SimplifySetCC(EVT VT, SDValue N0, SDValue N1, // Optimize some CTPOP cases. if (SDValue V = simplifySetCCWithCTPOP(*this, VT, N0, C1, Cond, dl, DAG)) return V; - } - - // FIXME: Support vectors. - if (auto *N1C = dyn_cast(N1.getNode())) { - const APInt &C1 = N1C->getAPIntValue(); // If the LHS is '(srl (ctlz x), 5)', the RHS is 0/1, and this is an // equality comparison, then we're just comparing whether X itself is // zero. 
if (N0.getOpcode() == ISD::SRL && (C1.isNullValue() || C1.isOneValue()) && N0.getOperand(0).getOpcode() == ISD::CTLZ && - N0.getOperand(1).getOpcode() == ISD::Constant) { - const APInt &ShAmt = N0.getConstantOperandAPInt(1); - if ((Cond == ISD::SETEQ || Cond == ISD::SETNE) && - ShAmt == Log2_32(N0.getValueSizeInBits())) { - if ((C1 == 0) == (Cond == ISD::SETEQ)) { - // (srl (ctlz x), 5) == 0 -> X != 0 - // (srl (ctlz x), 5) != 1 -> X != 0 - Cond = ISD::SETNE; - } else { - // (srl (ctlz x), 5) != 0 -> X == 0 - // (srl (ctlz x), 5) == 1 -> X == 0 - Cond = ISD::SETEQ; + isPowerOf2_32(N0.getScalarValueSizeInBits())) { + if (ConstantSDNode *ShAmt = isConstOrConstSplat(N0.getOperand(1))) { + if ((Cond == ISD::SETEQ || Cond == ISD::SETNE) && + ShAmt->getAPIntValue() == Log2_32(N0.getScalarValueSizeInBits())) { + if ((C1 == 0) == (Cond == ISD::SETEQ)) { + // (srl (ctlz x), 5) == 0 -> X != 0 + // (srl (ctlz x), 5) != 1 -> X != 0 + Cond = ISD::SETNE; + } else { + // (srl (ctlz x), 5) != 0 -> X == 0 + // (srl (ctlz x), 5) == 1 -> X == 0 + Cond = ISD::SETEQ; + } + SDValue Zero = DAG.getConstant(0, dl, N0.getValueType()); + return DAG.getSetCC(dl, VT, N0.getOperand(0).getOperand(0), Zero, + Cond); } - SDValue Zero = DAG.getConstant(0, dl, N0.getValueType()); - return DAG.getSetCC(dl, VT, N0.getOperand(0).getOperand(0), - Zero, Cond); } } + } + + // FIXME: Support vectors. 
+ if (auto *N1C = dyn_cast(N1.getNode())) { + const APInt &C1 = N1C->getAPIntValue(); // (zext x) == C --> x == (trunc C) // (sext x) == C --> x == (trunc C) diff --git a/llvm/test/CodeGen/X86/lzcnt-cmp.ll b/llvm/test/CodeGen/X86/lzcnt-cmp.ll index 435b09dd5d088b..3823524f552a25 100644 --- a/llvm/test/CodeGen/X86/lzcnt-cmp.ll +++ b/llvm/test/CodeGen/X86/lzcnt-cmp.ll @@ -96,75 +96,36 @@ define i1 @lshr_ctlz_undef_cmpne_zero_i64(i64 %in) { define <2 x i64> @lshr_ctlz_cmpeq_zero_v2i64(<2 x i64> %in) { ; X86-LABEL: lshr_ctlz_cmpeq_zero_v2i64: ; X86: # %bb.0: +; X86-NEXT: pushl %esi +; X86-NEXT: .cfi_def_cfa_offset 8 +; X86-NEXT: .cfi_offset %esi, -8 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: xorl %ecx, %ecx +; X86-NEXT: orl {{[0-9]+}}(%esp), %edx +; X86-NEXT: setne %cl +; X86-NEXT: negl %ecx ; X86-NEXT: xorl %edx, %edx -; X86-NEXT: cmpl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, %ecx -; X86-NEXT: jne .LBB4_2 -; X86-NEXT: # %bb.1: -; X86-NEXT: lzcntl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: addl $32, %ecx -; X86-NEXT: .LBB4_2: -; X86-NEXT: cmpl $0, {{[0-9]+}}(%esp) -; X86-NEXT: jne .LBB4_4 -; X86-NEXT: # %bb.3: -; X86-NEXT: lzcntl {{[0-9]+}}(%esp), %edx -; X86-NEXT: addl $32, %edx -; X86-NEXT: .LBB4_4: -; X86-NEXT: andl $-64, %edx -; X86-NEXT: cmpl $1, %edx -; X86-NEXT: sbbl %edx, %edx -; X86-NEXT: andl $-64, %ecx -; X86-NEXT: cmpl $1, %ecx -; X86-NEXT: sbbl %ecx, %ecx -; X86-NEXT: movl %ecx, 12(%eax) -; X86-NEXT: movl %ecx, 8(%eax) -; X86-NEXT: movl %edx, 4(%eax) -; X86-NEXT: movl %edx, (%eax) +; X86-NEXT: orl {{[0-9]+}}(%esp), %esi +; X86-NEXT: setne %dl +; X86-NEXT: negl %edx +; X86-NEXT: movl %edx, 12(%eax) +; X86-NEXT: movl %edx, 8(%eax) +; X86-NEXT: movl %ecx, 4(%eax) +; X86-NEXT: movl %ecx, (%eax) +; X86-NEXT: popl %esi +; X86-NEXT: .cfi_def_cfa_offset 4 ; X86-NEXT: retl $4 ; ; X64-LABEL: lshr_ctlz_cmpeq_zero_v2i64: ; X64: # %bb.0: -; X64-NEXT: movdqa %xmm0, %xmm1 -; X64-NEXT: 
psrlq $1, %xmm1 -; X64-NEXT: por %xmm0, %xmm1 -; X64-NEXT: movdqa %xmm1, %xmm0 -; X64-NEXT: psrlq $2, %xmm0 -; X64-NEXT: por %xmm1, %xmm0 -; X64-NEXT: movdqa %xmm0, %xmm1 -; X64-NEXT: psrlq $4, %xmm1 -; X64-NEXT: por %xmm0, %xmm1 -; X64-NEXT: movdqa %xmm1, %xmm0 -; X64-NEXT: psrlq $8, %xmm0 -; X64-NEXT: por %xmm1, %xmm0 -; X64-NEXT: movdqa %xmm0, %xmm1 -; X64-NEXT: psrlq $16, %xmm1 -; X64-NEXT: por %xmm0, %xmm1 -; X64-NEXT: movdqa %xmm1, %xmm0 -; X64-NEXT: psrlq $32, %xmm0 -; X64-NEXT: por %xmm1, %xmm0 -; X64-NEXT: pcmpeqd %xmm1, %xmm1 -; X64-NEXT: pxor %xmm0, %xmm1 -; X64-NEXT: movdqa %xmm1, %xmm0 -; X64-NEXT: psrlw $1, %xmm0 -; X64-NEXT: pand {{.*}}(%rip), %xmm0 -; X64-NEXT: psubb %xmm0, %xmm1 -; X64-NEXT: movdqa {{.*#+}} xmm0 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51] -; X64-NEXT: movdqa %xmm1, %xmm2 -; X64-NEXT: pand %xmm0, %xmm2 -; X64-NEXT: psrlw $2, %xmm1 -; X64-NEXT: pand %xmm0, %xmm1 -; X64-NEXT: paddb %xmm2, %xmm1 -; X64-NEXT: movdqa %xmm1, %xmm2 -; X64-NEXT: psrlw $4, %xmm2 -; X64-NEXT: paddb %xmm1, %xmm2 -; X64-NEXT: pand {{.*}}(%rip), %xmm2 -; X64-NEXT: pxor %xmm0, %xmm0 -; X64-NEXT: psadbw %xmm0, %xmm2 -; X64-NEXT: psrlq $6, %xmm2 -; X64-NEXT: pcmpeqd %xmm0, %xmm2 -; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,0,3,2] -; X64-NEXT: pand %xmm2, %xmm0 +; X64-NEXT: pxor %xmm1, %xmm1 +; X64-NEXT: pcmpeqd %xmm0, %xmm1 +; X64-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,0,3,2] +; X64-NEXT: pand %xmm1, %xmm2 +; X64-NEXT: pcmpeqd %xmm0, %xmm0 +; X64-NEXT: pxor %xmm2, %xmm0 ; X64-NEXT: retq %ctlz = call <2 x i64> @llvm.ctlz.v2i64(<2 x i64> %in, i1 0) %lshr = lshr <2 x i64> %ctlz, @@ -176,76 +137,34 @@ define <2 x i64> @lshr_ctlz_cmpeq_zero_v2i64(<2 x i64> %in) { define <2 x i64> @lshr_ctlz_cmpne_zero_v2i64(<2 x i64> %in) { ; X86-LABEL: lshr_ctlz_cmpne_zero_v2i64: ; X86: # %bb.0: +; X86-NEXT: pushl %esi +; X86-NEXT: .cfi_def_cfa_offset 8 +; X86-NEXT: .cfi_offset %esi, -8 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: 
movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: xorl %ecx, %ecx +; X86-NEXT: orl {{[0-9]+}}(%esp), %edx +; X86-NEXT: sete %cl +; X86-NEXT: negl %ecx ; X86-NEXT: xorl %edx, %edx -; X86-NEXT: cmpl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, %ecx -; X86-NEXT: jne .LBB5_2 -; X86-NEXT: # %bb.1: -; X86-NEXT: lzcntl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: addl $32, %ecx -; X86-NEXT: .LBB5_2: -; X86-NEXT: cmpl $0, {{[0-9]+}}(%esp) -; X86-NEXT: jne .LBB5_4 -; X86-NEXT: # %bb.3: -; X86-NEXT: lzcntl {{[0-9]+}}(%esp), %edx -; X86-NEXT: addl $32, %edx -; X86-NEXT: .LBB5_4: -; X86-NEXT: andl $-64, %edx +; X86-NEXT: orl {{[0-9]+}}(%esp), %esi +; X86-NEXT: sete %dl ; X86-NEXT: negl %edx -; X86-NEXT: sbbl %edx, %edx -; X86-NEXT: andl $-64, %ecx -; X86-NEXT: negl %ecx -; X86-NEXT: sbbl %ecx, %ecx -; X86-NEXT: movl %ecx, 12(%eax) -; X86-NEXT: movl %ecx, 8(%eax) -; X86-NEXT: movl %edx, 4(%eax) -; X86-NEXT: movl %edx, (%eax) +; X86-NEXT: movl %edx, 12(%eax) +; X86-NEXT: movl %edx, 8(%eax) +; X86-NEXT: movl %ecx, 4(%eax) +; X86-NEXT: movl %ecx, (%eax) +; X86-NEXT: popl %esi +; X86-NEXT: .cfi_def_cfa_offset 4 ; X86-NEXT: retl $4 ; ; X64-LABEL: lshr_ctlz_cmpne_zero_v2i64: ; X64: # %bb.0: -; X64-NEXT: movdqa %xmm0, %xmm1 -; X64-NEXT: psrlq $1, %xmm1 -; X64-NEXT: por %xmm0, %xmm1 -; X64-NEXT: movdqa %xmm1, %xmm0 -; X64-NEXT: psrlq $2, %xmm0 -; X64-NEXT: por %xmm1, %xmm0 -; X64-NEXT: movdqa %xmm0, %xmm1 -; X64-NEXT: psrlq $4, %xmm1 -; X64-NEXT: por %xmm0, %xmm1 -; X64-NEXT: movdqa %xmm1, %xmm0 -; X64-NEXT: psrlq $8, %xmm0 -; X64-NEXT: por %xmm1, %xmm0 -; X64-NEXT: movdqa %xmm0, %xmm1 -; X64-NEXT: psrlq $16, %xmm1 -; X64-NEXT: por %xmm0, %xmm1 -; X64-NEXT: movdqa %xmm1, %xmm2 -; X64-NEXT: psrlq $32, %xmm2 -; X64-NEXT: por %xmm1, %xmm2 -; X64-NEXT: pcmpeqd %xmm1, %xmm1 -; X64-NEXT: pxor %xmm1, %xmm2 -; X64-NEXT: movdqa %xmm2, %xmm0 -; X64-NEXT: psrlw $1, %xmm0 -; X64-NEXT: pand {{.*}}(%rip), %xmm0 -; X64-NEXT: psubb %xmm0, %xmm2 -; X64-NEXT: movdqa {{.*#+}} xmm0 = 
[51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51] -; X64-NEXT: movdqa %xmm2, %xmm3 -; X64-NEXT: pand %xmm0, %xmm3 -; X64-NEXT: psrlw $2, %xmm2 -; X64-NEXT: pand %xmm0, %xmm2 -; X64-NEXT: paddb %xmm3, %xmm2 -; X64-NEXT: movdqa %xmm2, %xmm0 -; X64-NEXT: psrlw $4, %xmm0 -; X64-NEXT: paddb %xmm2, %xmm0 -; X64-NEXT: pand {{.*}}(%rip), %xmm0 -; X64-NEXT: pxor %xmm2, %xmm2 -; X64-NEXT: psadbw %xmm2, %xmm0 -; X64-NEXT: psrlq $6, %xmm0 -; X64-NEXT: pcmpeqd %xmm2, %xmm0 -; X64-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,0,3,2] -; X64-NEXT: pand %xmm2, %xmm0 -; X64-NEXT: pxor %xmm1, %xmm0 +; X64-NEXT: pxor %xmm1, %xmm1 +; X64-NEXT: pcmpeqd %xmm0, %xmm1 +; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,0,3,2] +; X64-NEXT: pand %xmm1, %xmm0 ; X64-NEXT: retq %ctlz = call <2 x i64> @llvm.ctlz.v2i64(<2 x i64> %in, i1 0) %lshr = lshr <2 x i64> %ctlz, From 7809fa20400000fd40b4a4b56696c7fbcd0f0fa9 Mon Sep 17 00:00:00 2001 From: Faris Rehman Date: Wed, 6 Jan 2021 15:42:24 +0000 Subject: [PATCH 10/12] [flang][driver] Add support for `-D`, `-U` Add support for options -D and -U in the new Flang driver. Summary of changes: - Create PreprocessorOptions, to be used by the driver then translated into Fortran::parser::Options - Create CompilerInvocation::setFortranOpts to pass preprocessor options into the parser options - Add a dedicated method, Flang::AddPreprocessingOptions, to extract preprocessing options from the driver arguments into the preprocessor command arguments Macros specified like -DName will default to definition 1. When defining macros, the new driver will drop anything after an end-of-line character. This is consistent with gfortran and clang, but different to what currently f18 does. However, flang (which is a bash wrapper for f18), also drops everything after an end-of-line character. So gfortran-like behaviour felt like the natural choice. Test is added to demonstrate this behaviour. 
Reviewed By: awarzynski Differential Revision: https://reviews.llvm.org/D93401 --- clang/include/clang/Driver/Options.td | 4 +- clang/lib/Driver/ToolChains/Flang.cpp | 14 +++- clang/lib/Driver/ToolChains/Flang.h | 9 +++ .../include/flang/Frontend/CompilerInstance.h | 8 +++ .../flang/Frontend/CompilerInvocation.h | 13 ++++ .../flang/Frontend/PreprocessorOptions.h | 42 +++++++++++ flang/lib/Frontend/CompilerInstance.cpp | 2 + flang/lib/Frontend/CompilerInvocation.cpp | 69 ++++++++++++++++++- .../test/Flang-Driver/driver-help-hidden.f90 | 2 + flang/test/Flang-Driver/driver-help.f90 | 12 ++-- flang/test/Flang-Driver/macro_def_undef.f90 | 38 ++++++++++ flang/test/Flang-Driver/macro_multiline.f90 | 22 ++++++ 12 files changed, 226 insertions(+), 9 deletions(-) create mode 100644 flang/include/flang/Frontend/PreprocessorOptions.h create mode 100644 flang/test/Flang-Driver/macro_def_undef.f90 create mode 100644 flang/test/Flang-Driver/macro_multiline.f90 diff --git a/clang/include/clang/Driver/Options.td b/clang/include/clang/Driver/Options.td index 3c2a9f307c659c..428c14a7d9bbed 100644 --- a/clang/include/clang/Driver/Options.td +++ b/clang/include/clang/Driver/Options.td @@ -631,7 +631,7 @@ def C : Flag<["-"], "C">, Flags<[CC1Option]>, Group, HelpText<"Include comments in preprocessed output">, MarshallingInfoFlag<"PreprocessorOutputOpts.ShowComments">; def D : JoinedOrSeparate<["-"], "D">, Group, - Flags<[CC1Option]>, MetaVarName<"=">, + Flags<[CC1Option, FlangOption, FC1Option]>, MetaVarName<"=">, HelpText<"Define to (or 1 if omitted)">; def E : Flag<["-"], "E">, Flags<[NoXarchOption,CC1Option, FlangOption, FC1Option]>, Group, HelpText<"Only run the preprocessor">; @@ -730,7 +730,7 @@ def Ttext : JoinedOrSeparate<["-"], "Ttext">, Group, def T : JoinedOrSeparate<["-"], "T">, Group, MetaVarName<"