From 70995a1a3379ed3c21b1c5da6723f04166cb0ae6 Mon Sep 17 00:00:00 2001
From: Krzysztof Drewniak
Date: Fri, 16 Aug 2024 16:24:25 -0500
Subject: [PATCH] [ScalarizeMaskedMemIntr] Optimize splat non-constant masks
 (#104537)

In cases (like the ones added in the tests) where the condition of a
masked load or store is a splat but not a constant (that is, a masked
operation is being used to implement patterns like "load if the current
lane is in-bounds, otherwise return 0"), optimize the 'scalarized' code
to perform an aligned vector load/store if the splatted value is true.

Additionally, while I'm here, take a few steps to preserve aliasing
information and value names when nothing is scalarized.

As motivation, some LLVM IR users will generate masked loads/stores in
cases that map to this kind of predicated operation (where either the
vector is loaded/stored or it isn't) in order to take advantage of
hardware primitives, but on AMDGPU, where we don't have a masked load
or store, this pass would scalarize a load or store that was intended
to be - and can be - vectorized, while also introducing expensive
branches.

Fixes #104520

Pre-commit tests at #104527
---
 .../Scalar/ScalarizeMaskedMemIntrin.cpp       |  64 +-
 llvm/test/CodeGen/X86/bfloat.ll               | 586 +-----------------
 llvm/test/CodeGen/X86/shuffle-half.ll         | 298 +--------
 .../X86/expand-masked-load.ll                 |  34 +-
 .../X86/expand-masked-store.ll                |  25 +-
 5 files changed, 90 insertions(+), 917 deletions(-)

diff --git a/llvm/lib/Transforms/Scalar/ScalarizeMaskedMemIntrin.cpp b/llvm/lib/Transforms/Scalar/ScalarizeMaskedMemIntrin.cpp
index 8eadf8900020d..9cb7bad94c20b 100644
--- a/llvm/lib/Transforms/Scalar/ScalarizeMaskedMemIntrin.cpp
+++ b/llvm/lib/Transforms/Scalar/ScalarizeMaskedMemIntrin.cpp
@@ -17,6 +17,7 @@
 #include "llvm/ADT/Twine.h"
 #include "llvm/Analysis/DomTreeUpdater.h"
 #include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/Analysis/VectorUtils.h"
 #include "llvm/IR/BasicBlock.h"
 #include "llvm/IR/Constant.h"
 #include "llvm/IR/Constants.h"
@@ -161,7 +162,9 @@ static void scalarizeMaskedLoad(const DataLayout &DL, CallInst *CI,
   // Short-cut if the mask is all-true.
   if (isa<Constant>(Mask) && cast<Constant>(Mask)->isAllOnesValue()) {
-    Value *NewI = Builder.CreateAlignedLoad(VecType, Ptr, AlignVal);
+    LoadInst *NewI = Builder.CreateAlignedLoad(VecType, Ptr, AlignVal);
+    NewI->copyMetadata(*CI);
+    NewI->takeName(CI);
     CI->replaceAllUsesWith(NewI);
     CI->eraseFromParent();
     return;
@@ -188,8 +191,39 @@ static void scalarizeMaskedLoad(const DataLayout &DL, CallInst *CI,
     return;
   }
 
+  // Optimize the case where the "masked load" is a predicated load - that is,
+  // where the mask is the splat of a non-constant scalar boolean. In that case,
+  // use that splatted value as the guard on a conditional vector load.
+  if (isSplatValue(Mask, /*Index=*/0)) {
+    Value *Predicate = Builder.CreateExtractElement(Mask, uint64_t(0ull),
+                                                    Mask->getName() + ".first");
+    Instruction *ThenTerm =
+        SplitBlockAndInsertIfThen(Predicate, InsertPt, /*Unreachable=*/false,
+                                  /*BranchWeights=*/nullptr, DTU);
+
+    BasicBlock *CondBlock = ThenTerm->getParent();
+    CondBlock->setName("cond.load");
+    Builder.SetInsertPoint(CondBlock->getTerminator());
+    LoadInst *Load = Builder.CreateAlignedLoad(VecType, Ptr, AlignVal,
+                                               CI->getName() + ".cond.load");
+    Load->copyMetadata(*CI);
+
+    BasicBlock *PostLoad = ThenTerm->getSuccessor(0);
+    Builder.SetInsertPoint(PostLoad, PostLoad->begin());
+    PHINode *Phi = Builder.CreatePHI(VecType, /*NumReservedValues=*/2);
+    Phi->addIncoming(Load, CondBlock);
+    Phi->addIncoming(Src0, IfBlock);
+    Phi->takeName(CI);
+
+    CI->replaceAllUsesWith(Phi);
+    CI->eraseFromParent();
+    ModifiedDT = true;
+    return;
+  }
   // If the mask is not v1i1, use scalar bit test operations. This generates
   // better results on X86 at least.
+  // Note: this produces worse code on AMDGPU, where the "i1" is implicitly SIMD
+  // - what's a good way to detect this?
   Value *SclrMask;
   if (VectorWidth != 1) {
     Type *SclrMaskTy = Builder.getIntNTy(VectorWidth);
@@ -297,7 +331,9 @@ static void scalarizeMaskedStore(const DataLayout &DL, CallInst *CI,
   // Short-cut if the mask is all-true.
   if (isa<Constant>(Mask) && cast<Constant>(Mask)->isAllOnesValue()) {
-    Builder.CreateAlignedStore(Src, Ptr, AlignVal);
+    StoreInst *Store = Builder.CreateAlignedStore(Src, Ptr, AlignVal);
+    Store->takeName(CI);
+    Store->copyMetadata(*CI);
     CI->eraseFromParent();
     return;
   }
@@ -319,8 +355,31 @@ static void scalarizeMaskedStore(const DataLayout &DL, CallInst *CI,
     return;
   }
 
+  // Optimize the case where the "masked store" is a predicated store - that is,
+  // when the mask is the splat of a non-constant scalar boolean. In that case,
+  // optimize to a conditional store.
+  if (isSplatValue(Mask, /*Index=*/0)) {
+    Value *Predicate = Builder.CreateExtractElement(Mask, uint64_t(0ull),
+                                                    Mask->getName() + ".first");
+    Instruction *ThenTerm =
+        SplitBlockAndInsertIfThen(Predicate, InsertPt, /*Unreachable=*/false,
+                                  /*BranchWeights=*/nullptr, DTU);
+    BasicBlock *CondBlock = ThenTerm->getParent();
+    CondBlock->setName("cond.store");
+    Builder.SetInsertPoint(CondBlock->getTerminator());
+
+    StoreInst *Store = Builder.CreateAlignedStore(Src, Ptr, AlignVal);
+    Store->takeName(CI);
+    Store->copyMetadata(*CI);
+
+    CI->eraseFromParent();
+    ModifiedDT = true;
+    return;
+  }
+
   // If the mask is not v1i1, use scalar bit test operations. This generates
   // better results on X86 at least.
+ Value *SclrMask; if (VectorWidth != 1) { Type *SclrMaskTy = Builder.getIntNTy(VectorWidth); @@ -997,7 +1056,6 @@ static bool optimizeCallInst(CallInst *CI, bool &ModifiedDT, any_of(II->args(), [](Value *V) { return isa(V->getType()); })) return false; - switch (II->getIntrinsicID()) { default: break; diff --git a/llvm/test/CodeGen/X86/bfloat.ll b/llvm/test/CodeGen/X86/bfloat.ll index ec76e8b05678b..3759909a2ccc8 100644 --- a/llvm/test/CodeGen/X86/bfloat.ll +++ b/llvm/test/CodeGen/X86/bfloat.ll @@ -759,347 +759,21 @@ define <32 x bfloat> @pr63017_2() nounwind { ; ; SSE2-LABEL: pr63017_2: ; SSE2: # %bb.0: -; SSE2-NEXT: pushq %r14 -; SSE2-NEXT: pushq %rbx -; SSE2-NEXT: subq $200, %rsp ; SSE2-NEXT: xorl %eax, %eax ; SSE2-NEXT: testb %al, %al ; SSE2-NEXT: jne .LBB12_1 ; SSE2-NEXT: # %bb.2: # %cond.load ; SSE2-NEXT: movzwl (%rax), %eax ; SSE2-NEXT: shll $16, %eax -; SSE2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSE2-NEXT: movd {{.*#+}} xmm0 = [-1.0E+0,0.0E+0,0.0E+0,0.0E+0] -; SSE2-NEXT: movd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill -; SSE2-NEXT: movd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill -; SSE2-NEXT: movd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill -; SSE2-NEXT: movd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill -; SSE2-NEXT: movd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill -; SSE2-NEXT: movd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill -; SSE2-NEXT: movd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill -; SSE2-NEXT: movd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill -; SSE2-NEXT: movd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill -; SSE2-NEXT: movd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill -; SSE2-NEXT: movd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill -; SSE2-NEXT: movd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill -; SSE2-NEXT: movd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill -; SSE2-NEXT: movd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill -; SSE2-NEXT: movdqa %xmm0, %xmm15 -; SSE2-NEXT: movd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill -; SSE2-NEXT: movdqa %xmm0, %xmm13 -; SSE2-NEXT: movdqa %xmm0, %xmm14 -; SSE2-NEXT: movdqa %xmm0, %xmm11 -; SSE2-NEXT: movdqa %xmm0, %xmm12 -; SSE2-NEXT: movdqa %xmm0, %xmm9 -; SSE2-NEXT: movdqa %xmm0, %xmm10 -; SSE2-NEXT: movdqa %xmm0, %xmm7 -; SSE2-NEXT: movdqa %xmm0, %xmm8 -; SSE2-NEXT: movdqa %xmm0, %xmm5 -; SSE2-NEXT: movdqa %xmm0, %xmm6 -; SSE2-NEXT: movdqa %xmm0, %xmm3 -; SSE2-NEXT: movdqa %xmm0, %xmm4 -; SSE2-NEXT: movdqa %xmm0, %xmm1 -; SSE2-NEXT: movdqa %xmm0, %xmm2 +; SSE2-NEXT: movd %eax, %xmm0 ; SSE2-NEXT: jmp .LBB12_3 ; SSE2-NEXT: .LBB12_1: -; SSE2-NEXT: movd {{.*#+}} xmm2 = [-1.0E+0,0.0E+0,0.0E+0,0.0E+0] -; SSE2-NEXT: movdqa %xmm2, %xmm0 -; SSE2-NEXT: movd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill -; SSE2-NEXT: movd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill -; SSE2-NEXT: movd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill -; SSE2-NEXT: movd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill -; SSE2-NEXT: movd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill -; SSE2-NEXT: movd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill -; SSE2-NEXT: movd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill -; SSE2-NEXT: movd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill -; SSE2-NEXT: movd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill -; SSE2-NEXT: movd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill -; SSE2-NEXT: movd %xmm2, 
{{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill -; SSE2-NEXT: movd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill -; SSE2-NEXT: movd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill -; SSE2-NEXT: movd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill -; SSE2-NEXT: movdqa %xmm2, %xmm15 -; SSE2-NEXT: movd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill -; SSE2-NEXT: movdqa %xmm2, %xmm13 -; SSE2-NEXT: movdqa %xmm2, %xmm14 -; SSE2-NEXT: movdqa %xmm2, %xmm11 -; SSE2-NEXT: movdqa %xmm2, %xmm12 -; SSE2-NEXT: movdqa %xmm2, %xmm9 -; SSE2-NEXT: movdqa %xmm2, %xmm10 -; SSE2-NEXT: movdqa %xmm2, %xmm7 -; SSE2-NEXT: movdqa %xmm2, %xmm8 -; SSE2-NEXT: movdqa %xmm2, %xmm5 -; SSE2-NEXT: movdqa %xmm2, %xmm6 -; SSE2-NEXT: movdqa %xmm2, %xmm3 -; SSE2-NEXT: movdqa %xmm2, %xmm4 -; SSE2-NEXT: movdqa %xmm2, %xmm1 -; SSE2-NEXT: movd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill -; SSE2-NEXT: .LBB12_3: # %else -; SSE2-NEXT: xorl %eax, %eax -; SSE2-NEXT: testb %al, %al -; SSE2-NEXT: jne .LBB12_5 -; SSE2-NEXT: # %bb.4: # %cond.load1 -; SSE2-NEXT: movzwl (%rax), %eax -; SSE2-NEXT: shll $16, %eax -; SSE2-NEXT: movd %eax, %xmm0 -; SSE2-NEXT: .LBB12_5: # %else2 -; SSE2-NEXT: xorl %eax, %eax -; SSE2-NEXT: testb %al, %al -; SSE2-NEXT: jne .LBB12_7 -; SSE2-NEXT: # %bb.6: # %cond.load4 -; SSE2-NEXT: movzwl (%rax), %eax -; SSE2-NEXT: shll $16, %eax -; SSE2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSE2-NEXT: .LBB12_7: # %else5 -; SSE2-NEXT: xorl %eax, %eax -; SSE2-NEXT: testb %al, %al -; SSE2-NEXT: jne .LBB12_9 -; SSE2-NEXT: # %bb.8: # %cond.load7 -; SSE2-NEXT: movzwl (%rax), %eax -; SSE2-NEXT: shll $16, %eax -; SSE2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSE2-NEXT: .LBB12_9: # %else8 -; SSE2-NEXT: xorl %eax, %eax -; SSE2-NEXT: testb %al, %al -; SSE2-NEXT: jne .LBB12_11 -; SSE2-NEXT: # %bb.10: # %cond.load10 -; SSE2-NEXT: movzwl (%rax), %eax -; SSE2-NEXT: shll $16, %eax -; SSE2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSE2-NEXT: .LBB12_11: # %else11 -; SSE2-NEXT: xorl %eax, %eax -; SSE2-NEXT: testb %al, %al -; SSE2-NEXT: jne .LBB12_13 -; SSE2-NEXT: # %bb.12: # %cond.load13 -; SSE2-NEXT: movzwl (%rax), %eax -; SSE2-NEXT: shll $16, %eax -; SSE2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSE2-NEXT: .LBB12_13: # %else14 -; SSE2-NEXT: xorl %eax, %eax -; SSE2-NEXT: testb %al, %al -; SSE2-NEXT: jne .LBB12_15 -; SSE2-NEXT: # %bb.14: # %cond.load16 -; SSE2-NEXT: movzwl (%rax), %eax -; SSE2-NEXT: shll $16, %eax -; SSE2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSE2-NEXT: .LBB12_15: # %else17 -; SSE2-NEXT: xorl %eax, %eax -; SSE2-NEXT: testb %al, %al -; SSE2-NEXT: jne .LBB12_17 -; SSE2-NEXT: # %bb.16: # %cond.load19 -; SSE2-NEXT: movzwl (%rax), %eax -; SSE2-NEXT: shll $16, %eax -; SSE2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSE2-NEXT: .LBB12_17: # %else20 -; SSE2-NEXT: xorl %eax, %eax -; SSE2-NEXT: testb %al, %al -; SSE2-NEXT: jne .LBB12_19 -; SSE2-NEXT: # %bb.18: # %cond.load22 -; SSE2-NEXT: movzwl (%rax), %eax -; SSE2-NEXT: shll $16, %eax -; SSE2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSE2-NEXT: .LBB12_19: # %else23 -; SSE2-NEXT: xorl %eax, %eax -; SSE2-NEXT: testb %al, %al -; SSE2-NEXT: jne .LBB12_21 -; SSE2-NEXT: # %bb.20: # %cond.load25 -; SSE2-NEXT: movzwl (%rax), %eax -; SSE2-NEXT: shll $16, %eax -; SSE2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSE2-NEXT: .LBB12_21: # %else26 -; SSE2-NEXT: xorl %eax, %eax -; SSE2-NEXT: testb %al, %al -; SSE2-NEXT: jne 
.LBB12_23 -; SSE2-NEXT: # %bb.22: # %cond.load28 -; SSE2-NEXT: movzwl (%rax), %eax -; SSE2-NEXT: shll $16, %eax -; SSE2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSE2-NEXT: .LBB12_23: # %else29 -; SSE2-NEXT: xorl %eax, %eax -; SSE2-NEXT: testb %al, %al -; SSE2-NEXT: jne .LBB12_25 -; SSE2-NEXT: # %bb.24: # %cond.load31 -; SSE2-NEXT: movzwl (%rax), %eax -; SSE2-NEXT: shll $16, %eax -; SSE2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSE2-NEXT: .LBB12_25: # %else32 -; SSE2-NEXT: xorl %eax, %eax -; SSE2-NEXT: testb %al, %al -; SSE2-NEXT: jne .LBB12_27 -; SSE2-NEXT: # %bb.26: # %cond.load34 -; SSE2-NEXT: movzwl (%rax), %eax -; SSE2-NEXT: shll $16, %eax -; SSE2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSE2-NEXT: .LBB12_27: # %else35 -; SSE2-NEXT: xorl %eax, %eax -; SSE2-NEXT: testb %al, %al -; SSE2-NEXT: jne .LBB12_29 -; SSE2-NEXT: # %bb.28: # %cond.load37 -; SSE2-NEXT: movzwl (%rax), %eax -; SSE2-NEXT: shll $16, %eax -; SSE2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSE2-NEXT: .LBB12_29: # %else38 -; SSE2-NEXT: xorl %eax, %eax -; SSE2-NEXT: testb %al, %al -; SSE2-NEXT: jne .LBB12_31 -; SSE2-NEXT: # %bb.30: # %cond.load40 -; SSE2-NEXT: movzwl (%rax), %eax -; SSE2-NEXT: shll $16, %eax -; SSE2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSE2-NEXT: .LBB12_31: # %else41 -; SSE2-NEXT: xorl %eax, %eax -; SSE2-NEXT: testb %al, %al -; SSE2-NEXT: jne .LBB12_33 -; SSE2-NEXT: # %bb.32: # %cond.load43 -; SSE2-NEXT: movzwl (%rax), %eax -; SSE2-NEXT: shll $16, %eax -; SSE2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSE2-NEXT: .LBB12_33: # %else44 -; SSE2-NEXT: xorl %eax, %eax -; SSE2-NEXT: testb %al, %al -; SSE2-NEXT: jne .LBB12_35 -; SSE2-NEXT: # %bb.34: # %cond.load46 -; SSE2-NEXT: movzwl (%rax), %eax -; SSE2-NEXT: shll $16, %eax -; SSE2-NEXT: movd %eax, %xmm15 -; SSE2-NEXT: .LBB12_35: # %else47 -; SSE2-NEXT: xorl %eax, %eax -; SSE2-NEXT: testb %al, %al -; SSE2-NEXT: jne .LBB12_37 -; SSE2-NEXT: # %bb.36: # %cond.load49 -; SSE2-NEXT: movzwl (%rax), %eax -; SSE2-NEXT: shll $16, %eax -; SSE2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSE2-NEXT: .LBB12_37: # %else50 -; SSE2-NEXT: xorl %eax, %eax -; SSE2-NEXT: testb %al, %al -; SSE2-NEXT: jne .LBB12_39 -; SSE2-NEXT: # %bb.38: # %cond.load52 -; SSE2-NEXT: movzwl (%rax), %eax -; SSE2-NEXT: shll $16, %eax -; SSE2-NEXT: movd %eax, %xmm13 -; SSE2-NEXT: .LBB12_39: # %else53 -; SSE2-NEXT: xorl %eax, %eax -; SSE2-NEXT: testb %al, %al -; SSE2-NEXT: jne .LBB12_41 -; SSE2-NEXT: # %bb.40: # %cond.load55 -; SSE2-NEXT: movzwl (%rax), %eax -; SSE2-NEXT: shll $16, %eax -; SSE2-NEXT: movd %eax, %xmm14 -; SSE2-NEXT: .LBB12_41: # %else56 -; SSE2-NEXT: xorl %eax, %eax -; SSE2-NEXT: testb %al, %al -; SSE2-NEXT: jne .LBB12_43 -; SSE2-NEXT: # %bb.42: # %cond.load58 -; SSE2-NEXT: movzwl (%rax), %eax -; SSE2-NEXT: shll $16, %eax -; SSE2-NEXT: movd %eax, %xmm11 -; SSE2-NEXT: .LBB12_43: # %else59 -; SSE2-NEXT: xorl %eax, %eax -; SSE2-NEXT: testb %al, %al -; SSE2-NEXT: jne .LBB12_45 -; SSE2-NEXT: # %bb.44: # %cond.load61 -; SSE2-NEXT: movzwl (%rax), %eax -; SSE2-NEXT: shll $16, %eax -; SSE2-NEXT: movd %eax, %xmm12 -; SSE2-NEXT: .LBB12_45: # %else62 -; SSE2-NEXT: xorl %eax, %eax -; SSE2-NEXT: testb %al, %al -; SSE2-NEXT: jne .LBB12_47 -; SSE2-NEXT: # %bb.46: # %cond.load64 -; SSE2-NEXT: movzwl (%rax), %eax -; SSE2-NEXT: shll $16, %eax -; SSE2-NEXT: movd %eax, %xmm9 -; SSE2-NEXT: .LBB12_47: # %else65 -; SSE2-NEXT: xorl %eax, %eax -; SSE2-NEXT: testb %al, %al -; SSE2-NEXT: 
jne .LBB12_49 -; SSE2-NEXT: # %bb.48: # %cond.load67 -; SSE2-NEXT: movzwl (%rax), %eax -; SSE2-NEXT: shll $16, %eax -; SSE2-NEXT: movd %eax, %xmm10 -; SSE2-NEXT: .LBB12_49: # %else68 -; SSE2-NEXT: xorl %eax, %eax -; SSE2-NEXT: testb %al, %al -; SSE2-NEXT: jne .LBB12_51 -; SSE2-NEXT: # %bb.50: # %cond.load70 -; SSE2-NEXT: movzwl (%rax), %eax -; SSE2-NEXT: shll $16, %eax -; SSE2-NEXT: movd %eax, %xmm7 -; SSE2-NEXT: .LBB12_51: # %else71 -; SSE2-NEXT: xorl %eax, %eax -; SSE2-NEXT: testb %al, %al -; SSE2-NEXT: jne .LBB12_53 -; SSE2-NEXT: # %bb.52: # %cond.load73 -; SSE2-NEXT: movzwl (%rax), %eax -; SSE2-NEXT: shll $16, %eax -; SSE2-NEXT: movd %eax, %xmm8 -; SSE2-NEXT: .LBB12_53: # %else74 -; SSE2-NEXT: xorl %eax, %eax -; SSE2-NEXT: testb %al, %al -; SSE2-NEXT: jne .LBB12_55 -; SSE2-NEXT: # %bb.54: # %cond.load76 -; SSE2-NEXT: movzwl (%rax), %eax -; SSE2-NEXT: shll $16, %eax -; SSE2-NEXT: movd %eax, %xmm5 -; SSE2-NEXT: .LBB12_55: # %else77 -; SSE2-NEXT: xorl %eax, %eax -; SSE2-NEXT: testb %al, %al -; SSE2-NEXT: jne .LBB12_57 -; SSE2-NEXT: # %bb.56: # %cond.load79 -; SSE2-NEXT: movzwl (%rax), %eax -; SSE2-NEXT: shll $16, %eax -; SSE2-NEXT: movd %eax, %xmm6 -; SSE2-NEXT: .LBB12_57: # %else80 -; SSE2-NEXT: xorl %eax, %eax -; SSE2-NEXT: testb %al, %al -; SSE2-NEXT: jne .LBB12_59 -; SSE2-NEXT: # %bb.58: # %cond.load82 -; SSE2-NEXT: movzwl (%rax), %eax -; SSE2-NEXT: shll $16, %eax -; SSE2-NEXT: movd %eax, %xmm3 -; SSE2-NEXT: .LBB12_59: # %else83 -; SSE2-NEXT: xorl %eax, %eax -; SSE2-NEXT: testb %al, %al -; SSE2-NEXT: jne .LBB12_61 -; SSE2-NEXT: # %bb.60: # %cond.load85 -; SSE2-NEXT: movzwl (%rax), %eax -; SSE2-NEXT: shll $16, %eax -; SSE2-NEXT: movd %eax, %xmm4 -; SSE2-NEXT: .LBB12_61: # %else86 -; SSE2-NEXT: xorl %eax, %eax -; SSE2-NEXT: testb %al, %al -; SSE2-NEXT: jne .LBB12_63 -; SSE2-NEXT: # %bb.62: # %cond.load88 -; SSE2-NEXT: movzwl (%rax), %eax -; SSE2-NEXT: shll $16, %eax -; SSE2-NEXT: movd %eax, %xmm1 -; SSE2-NEXT: .LBB12_63: # %else89 -; SSE2-NEXT: xorl %eax, %eax -; SSE2-NEXT: testb %al, %al -; SSE2-NEXT: movd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill -; SSE2-NEXT: movd %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill -; SSE2-NEXT: movd %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill -; SSE2-NEXT: movd %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill -; SSE2-NEXT: movd %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill -; SSE2-NEXT: movd %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill -; SSE2-NEXT: movd %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill -; SSE2-NEXT: movd %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill -; SSE2-NEXT: movd %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill -; SSE2-NEXT: movd %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill -; SSE2-NEXT: movd %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill -; SSE2-NEXT: movd %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill -; SSE2-NEXT: movd %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill -; SSE2-NEXT: movd %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill -; SSE2-NEXT: jne .LBB12_64 -; SSE2-NEXT: # %bb.65: # %cond.load91 -; SSE2-NEXT: movzwl (%rax), %eax -; SSE2-NEXT: shll $16, %eax -; SSE2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSE2-NEXT: jmp .LBB12_66 -; SSE2-NEXT: .LBB12_64: -; SSE2-NEXT: movd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill -; SSE2-NEXT: .LBB12_66: # %else92 +; SSE2-NEXT: movd {{.*#+}} xmm0 = [-1.0E+0,0.0E+0,0.0E+0,0.0E+0] +; SSE2-NEXT: .LBB12_3: +; SSE2-NEXT: pushq %r14 +; SSE2-NEXT: pushq %rbx +; 
SSE2-NEXT: subq $88, %rsp +; SSE2-NEXT: movd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill ; SSE2-NEXT: callq __truncsfbf2@PLT ; SSE2-NEXT: pextrw $0, %xmm0, %ebx ; SSE2-NEXT: shll $16, %ebx @@ -1316,7 +990,7 @@ define <32 x bfloat> @pr63017_2() nounwind { ; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE2-NEXT: addq $200, %rsp +; SSE2-NEXT: addq $88, %rsp ; SSE2-NEXT: popq %rbx ; SSE2-NEXT: popq %r14 ; SSE2-NEXT: retq @@ -1329,250 +1003,14 @@ define <32 x bfloat> @pr63017_2() nounwind { ; ; AVXNC-LABEL: pr63017_2: ; AVXNC: # %bb.0: -; AVXNC-NEXT: vpbroadcastw {{.*#+}} ymm0 = [49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024] +; AVXNC-NEXT: vbroadcastss {{.*#+}} ymm0 = [49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024] ; AVXNC-NEXT: xorl %eax, %eax ; AVXNC-NEXT: testb %al, %al -; AVXNC-NEXT: vmovdqa %ymm0, %ymm1 ; AVXNC-NEXT: jne .LBB12_2 ; AVXNC-NEXT: # %bb.1: # %cond.load -; AVXNC-NEXT: vpbroadcastw {{.*#+}} ymm1 = [49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024] -; AVXNC-NEXT: vpbroadcastw {{.*#+}} ymm0 = [49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024] -; AVXNC-NEXT: vpinsrw $0, (%rax), %xmm0, %xmm2 -; AVXNC-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] -; AVXNC-NEXT: .LBB12_2: # %else -; AVXNC-NEXT: xorl %eax, %eax -; AVXNC-NEXT: testb %al, %al -; AVXNC-NEXT: jne .LBB12_4 -; AVXNC-NEXT: # %bb.3: # %cond.load1 -; AVXNC-NEXT: vpinsrw $1, (%rax), %xmm0, %xmm2 -; AVXNC-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] -; AVXNC-NEXT: .LBB12_4: # %else2 -; AVXNC-NEXT: xorl %eax, %eax -; AVXNC-NEXT: testb %al, %al -; AVXNC-NEXT: jne .LBB12_6 -; AVXNC-NEXT: # %bb.5: # %cond.load4 -; AVXNC-NEXT: vpinsrw $2, (%rax), %xmm0, %xmm2 -; AVXNC-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] -; AVXNC-NEXT: .LBB12_6: # %else5 -; AVXNC-NEXT: xorl %eax, %eax -; AVXNC-NEXT: testb %al, %al -; AVXNC-NEXT: jne .LBB12_8 -; AVXNC-NEXT: # %bb.7: # %cond.load7 -; AVXNC-NEXT: vpinsrw $3, (%rax), %xmm0, %xmm2 -; AVXNC-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] -; AVXNC-NEXT: .LBB12_8: # %else8 -; AVXNC-NEXT: xorl %eax, %eax -; AVXNC-NEXT: testb %al, %al -; AVXNC-NEXT: jne .LBB12_10 -; AVXNC-NEXT: # %bb.9: # %cond.load10 -; AVXNC-NEXT: vpinsrw $4, (%rax), %xmm0, %xmm2 -; AVXNC-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] -; AVXNC-NEXT: .LBB12_10: # %else11 -; AVXNC-NEXT: xorl %eax, %eax -; AVXNC-NEXT: testb %al, %al -; AVXNC-NEXT: jne .LBB12_12 -; AVXNC-NEXT: # %bb.11: # %cond.load13 -; AVXNC-NEXT: vpinsrw $5, (%rax), %xmm0, %xmm2 -; AVXNC-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] -; AVXNC-NEXT: .LBB12_12: # %else14 -; AVXNC-NEXT: xorl %eax, %eax -; AVXNC-NEXT: testb %al, %al -; AVXNC-NEXT: jne .LBB12_14 -; AVXNC-NEXT: # %bb.13: # %cond.load16 -; AVXNC-NEXT: vpinsrw $6, (%rax), %xmm0, %xmm2 -; AVXNC-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] -; AVXNC-NEXT: .LBB12_14: # %else17 -; AVXNC-NEXT: xorl %eax, %eax -; AVXNC-NEXT: testb %al, %al -; AVXNC-NEXT: jne .LBB12_16 -; AVXNC-NEXT: # %bb.15: # %cond.load19 -; AVXNC-NEXT: vpinsrw $7, (%rax), %xmm0, %xmm2 -; AVXNC-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] -; AVXNC-NEXT: .LBB12_16: # %else20 -; 
AVXNC-NEXT: xorl %eax, %eax -; AVXNC-NEXT: testb %al, %al -; AVXNC-NEXT: jne .LBB12_18 -; AVXNC-NEXT: # %bb.17: # %cond.load22 -; AVXNC-NEXT: vpbroadcastw (%rax), %ymm2 -; AVXNC-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm0[1,2,3,4,5,6,7],ymm2[8],ymm0[9,10,11,12,13,14,15] -; AVXNC-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] -; AVXNC-NEXT: .LBB12_18: # %else23 -; AVXNC-NEXT: xorl %eax, %eax -; AVXNC-NEXT: testb %al, %al -; AVXNC-NEXT: jne .LBB12_20 -; AVXNC-NEXT: # %bb.19: # %cond.load25 -; AVXNC-NEXT: vpbroadcastw (%rax), %ymm2 -; AVXNC-NEXT: vpblendw {{.*#+}} ymm2 = ymm0[0],ymm2[1],ymm0[2,3,4,5,6,7,8],ymm2[9],ymm0[10,11,12,13,14,15] -; AVXNC-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] -; AVXNC-NEXT: .LBB12_20: # %else26 -; AVXNC-NEXT: xorl %eax, %eax -; AVXNC-NEXT: testb %al, %al -; AVXNC-NEXT: jne .LBB12_22 -; AVXNC-NEXT: # %bb.21: # %cond.load28 -; AVXNC-NEXT: vpbroadcastw (%rax), %ymm2 -; AVXNC-NEXT: vpblendw {{.*#+}} ymm2 = ymm0[0,1],ymm2[2],ymm0[3,4,5,6,7,8,9],ymm2[10],ymm0[11,12,13,14,15] -; AVXNC-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] -; AVXNC-NEXT: .LBB12_22: # %else29 -; AVXNC-NEXT: xorl %eax, %eax -; AVXNC-NEXT: testb %al, %al -; AVXNC-NEXT: jne .LBB12_24 -; AVXNC-NEXT: # %bb.23: # %cond.load31 -; AVXNC-NEXT: vpbroadcastw (%rax), %ymm2 -; AVXNC-NEXT: vpblendw {{.*#+}} ymm2 = ymm0[0,1,2],ymm2[3],ymm0[4,5,6,7,8,9,10],ymm2[11],ymm0[12,13,14,15] -; AVXNC-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] -; AVXNC-NEXT: .LBB12_24: # %else32 -; AVXNC-NEXT: xorl %eax, %eax -; AVXNC-NEXT: testb %al, %al -; AVXNC-NEXT: jne .LBB12_26 -; AVXNC-NEXT: # %bb.25: # %cond.load34 -; AVXNC-NEXT: vpbroadcastw (%rax), %ymm2 -; AVXNC-NEXT: vpblendw {{.*#+}} ymm2 = ymm0[0,1,2,3],ymm2[4],ymm0[5,6,7,8,9,10,11],ymm2[12],ymm0[13,14,15] -; AVXNC-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] -; AVXNC-NEXT: .LBB12_26: # %else35 -; AVXNC-NEXT: xorl %eax, %eax -; AVXNC-NEXT: testb %al, %al -; AVXNC-NEXT: jne .LBB12_28 -; AVXNC-NEXT: # %bb.27: # %cond.load37 -; AVXNC-NEXT: vpbroadcastw (%rax), %ymm2 -; AVXNC-NEXT: vpblendw {{.*#+}} ymm2 = ymm0[0,1,2,3,4],ymm2[5],ymm0[6,7,8,9,10,11,12],ymm2[13],ymm0[14,15] -; AVXNC-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] -; AVXNC-NEXT: .LBB12_28: # %else38 -; AVXNC-NEXT: xorl %eax, %eax -; AVXNC-NEXT: testb %al, %al -; AVXNC-NEXT: jne .LBB12_30 -; AVXNC-NEXT: # %bb.29: # %cond.load40 -; AVXNC-NEXT: vpbroadcastw (%rax), %ymm2 -; AVXNC-NEXT: vpblendw {{.*#+}} ymm2 = ymm0[0,1,2,3,4,5],ymm2[6],ymm0[7,8,9,10,11,12,13],ymm2[14],ymm0[15] -; AVXNC-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] -; AVXNC-NEXT: .LBB12_30: # %else41 -; AVXNC-NEXT: xorl %eax, %eax -; AVXNC-NEXT: testb %al, %al -; AVXNC-NEXT: jne .LBB12_32 -; AVXNC-NEXT: # %bb.31: # %cond.load43 -; AVXNC-NEXT: vpbroadcastw (%rax), %ymm2 -; AVXNC-NEXT: vpblendw {{.*#+}} ymm2 = ymm0[0,1,2,3,4,5,6],ymm2[7],ymm0[8,9,10,11,12,13,14],ymm2[15] -; AVXNC-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] -; AVXNC-NEXT: .LBB12_32: # %else44 -; AVXNC-NEXT: xorl %eax, %eax -; AVXNC-NEXT: testb %al, %al -; AVXNC-NEXT: jne .LBB12_34 -; AVXNC-NEXT: # %bb.33: # %cond.load46 -; AVXNC-NEXT: vpinsrw $0, (%rax), %xmm1, %xmm2 -; AVXNC-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVXNC-NEXT: .LBB12_34: # %else47 -; AVXNC-NEXT: xorl %eax, %eax -; AVXNC-NEXT: testb %al, %al -; AVXNC-NEXT: jne .LBB12_36 -; AVXNC-NEXT: # %bb.35: # %cond.load49 -; AVXNC-NEXT: vpinsrw $1, (%rax), %xmm1, %xmm2 -; AVXNC-NEXT: vpblendd {{.*#+}} ymm1 
= ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVXNC-NEXT: .LBB12_36: # %else50 -; AVXNC-NEXT: xorl %eax, %eax -; AVXNC-NEXT: testb %al, %al -; AVXNC-NEXT: jne .LBB12_38 -; AVXNC-NEXT: # %bb.37: # %cond.load52 -; AVXNC-NEXT: vpinsrw $2, (%rax), %xmm1, %xmm2 -; AVXNC-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVXNC-NEXT: .LBB12_38: # %else53 -; AVXNC-NEXT: xorl %eax, %eax -; AVXNC-NEXT: testb %al, %al -; AVXNC-NEXT: jne .LBB12_40 -; AVXNC-NEXT: # %bb.39: # %cond.load55 -; AVXNC-NEXT: vpinsrw $3, (%rax), %xmm1, %xmm2 -; AVXNC-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVXNC-NEXT: .LBB12_40: # %else56 -; AVXNC-NEXT: xorl %eax, %eax -; AVXNC-NEXT: testb %al, %al -; AVXNC-NEXT: jne .LBB12_42 -; AVXNC-NEXT: # %bb.41: # %cond.load58 -; AVXNC-NEXT: vpinsrw $4, (%rax), %xmm1, %xmm2 -; AVXNC-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVXNC-NEXT: .LBB12_42: # %else59 -; AVXNC-NEXT: xorl %eax, %eax -; AVXNC-NEXT: testb %al, %al -; AVXNC-NEXT: jne .LBB12_44 -; AVXNC-NEXT: # %bb.43: # %cond.load61 -; AVXNC-NEXT: vpinsrw $5, (%rax), %xmm1, %xmm2 -; AVXNC-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVXNC-NEXT: .LBB12_44: # %else62 -; AVXNC-NEXT: xorl %eax, %eax -; AVXNC-NEXT: testb %al, %al -; AVXNC-NEXT: jne .LBB12_46 -; AVXNC-NEXT: # %bb.45: # %cond.load64 -; AVXNC-NEXT: vpinsrw $6, (%rax), %xmm1, %xmm2 -; AVXNC-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVXNC-NEXT: .LBB12_46: # %else65 -; AVXNC-NEXT: xorl %eax, %eax -; AVXNC-NEXT: testb %al, %al -; AVXNC-NEXT: jne .LBB12_48 -; AVXNC-NEXT: # %bb.47: # %cond.load67 -; AVXNC-NEXT: vpinsrw $7, (%rax), %xmm1, %xmm2 -; AVXNC-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVXNC-NEXT: .LBB12_48: # %else68 -; AVXNC-NEXT: xorl %eax, %eax -; AVXNC-NEXT: testb %al, %al -; AVXNC-NEXT: jne .LBB12_50 -; AVXNC-NEXT: # %bb.49: # %cond.load70 -; AVXNC-NEXT: vpbroadcastw (%rax), %ymm2 -; AVXNC-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm1[1,2,3,4,5,6,7],ymm2[8],ymm1[9,10,11,12,13,14,15] -; AVXNC-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] -; AVXNC-NEXT: .LBB12_50: # %else71 -; AVXNC-NEXT: xorl %eax, %eax -; AVXNC-NEXT: testb %al, %al -; AVXNC-NEXT: jne .LBB12_52 -; AVXNC-NEXT: # %bb.51: # %cond.load73 -; AVXNC-NEXT: vpbroadcastw (%rax), %ymm2 -; AVXNC-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0],ymm2[1],ymm1[2,3,4,5,6,7,8],ymm2[9],ymm1[10,11,12,13,14,15] -; AVXNC-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] -; AVXNC-NEXT: .LBB12_52: # %else74 -; AVXNC-NEXT: xorl %eax, %eax -; AVXNC-NEXT: testb %al, %al -; AVXNC-NEXT: jne .LBB12_54 -; AVXNC-NEXT: # %bb.53: # %cond.load76 -; AVXNC-NEXT: vpbroadcastw (%rax), %ymm2 -; AVXNC-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0,1],ymm2[2],ymm1[3,4,5,6,7,8,9],ymm2[10],ymm1[11,12,13,14,15] -; AVXNC-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] -; AVXNC-NEXT: .LBB12_54: # %else77 -; AVXNC-NEXT: xorl %eax, %eax -; AVXNC-NEXT: testb %al, %al -; AVXNC-NEXT: jne .LBB12_56 -; AVXNC-NEXT: # %bb.55: # %cond.load79 -; AVXNC-NEXT: vpbroadcastw (%rax), %ymm2 -; AVXNC-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0,1,2],ymm2[3],ymm1[4,5,6,7,8,9,10],ymm2[11],ymm1[12,13,14,15] -; AVXNC-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] -; AVXNC-NEXT: .LBB12_56: # %else80 -; AVXNC-NEXT: xorl %eax, %eax -; AVXNC-NEXT: testb %al, %al -; AVXNC-NEXT: jne .LBB12_58 -; AVXNC-NEXT: # %bb.57: # %cond.load82 -; AVXNC-NEXT: vpbroadcastw (%rax), %ymm2 -; AVXNC-NEXT: vpblendw {{.*#+}} ymm2 = 
ymm1[0,1,2,3],ymm2[4],ymm1[5,6,7,8,9,10,11],ymm2[12],ymm1[13,14,15] -; AVXNC-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] -; AVXNC-NEXT: .LBB12_58: # %else83 -; AVXNC-NEXT: xorl %eax, %eax -; AVXNC-NEXT: testb %al, %al -; AVXNC-NEXT: jne .LBB12_60 -; AVXNC-NEXT: # %bb.59: # %cond.load85 -; AVXNC-NEXT: vpbroadcastw (%rax), %ymm2 -; AVXNC-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0,1,2,3,4],ymm2[5],ymm1[6,7,8,9,10,11,12],ymm2[13],ymm1[14,15] -; AVXNC-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] -; AVXNC-NEXT: .LBB12_60: # %else86 -; AVXNC-NEXT: xorl %eax, %eax -; AVXNC-NEXT: testb %al, %al -; AVXNC-NEXT: jne .LBB12_62 -; AVXNC-NEXT: # %bb.61: # %cond.load88 -; AVXNC-NEXT: vpbroadcastw (%rax), %ymm2 -; AVXNC-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0,1,2,3,4,5],ymm2[6],ymm1[7,8,9,10,11,12,13],ymm2[14],ymm1[15] -; AVXNC-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] -; AVXNC-NEXT: .LBB12_62: # %else89 -; AVXNC-NEXT: xorl %eax, %eax -; AVXNC-NEXT: testb %al, %al -; AVXNC-NEXT: jne .LBB12_64 -; AVXNC-NEXT: # %bb.63: # %cond.load91 -; AVXNC-NEXT: vpbroadcastw (%rax), %ymm2 -; AVXNC-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0,1,2,3,4,5,6],ymm2[7],ymm1[8,9,10,11,12,13,14],ymm2[15] -; AVXNC-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] -; AVXNC-NEXT: .LBB12_64: # %else92 +; AVXNC-NEXT: vmovups (%rax), %ymm0 +; AVXNC-NEXT: .LBB12_2: +; AVXNC-NEXT: vmovaps %ymm0, %ymm1 ; AVXNC-NEXT: retq %1 = call <32 x bfloat> @llvm.masked.load.v32bf16.p0(ptr poison, i32 2, <32 x i1> poison, <32 x bfloat> ) ret <32 x bfloat> %1 diff --git a/llvm/test/CodeGen/X86/shuffle-half.ll b/llvm/test/CodeGen/X86/shuffle-half.ll index 291fe841043ed..001db2c7cecae 100644 --- a/llvm/test/CodeGen/X86/shuffle-half.ll +++ b/llvm/test/CodeGen/X86/shuffle-half.ll @@ -4,305 +4,13 @@ define <32 x half> @dump_vec() { ; CHECK-LABEL: dump_vec: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; CHECK-NEXT: vxorps %xmm0, %xmm0, %xmm0 ; CHECK-NEXT: xorl %eax, %eax ; CHECK-NEXT: testb %al, %al ; CHECK-NEXT: jne .LBB0_2 ; CHECK-NEXT: # %bb.1: # %cond.load -; CHECK-NEXT: vpinsrw $0, (%rax), %xmm0, %xmm0 -; CHECK-NEXT: vmovd {{.*#+}} xmm1 = [65535,0,0,0] -; CHECK-NEXT: vpand %ymm0, %ymm1, %ymm0 -; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; CHECK-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 -; CHECK-NEXT: .LBB0_2: # %else -; CHECK-NEXT: xorl %eax, %eax -; CHECK-NEXT: testb %al, %al -; CHECK-NEXT: jne .LBB0_4 -; CHECK-NEXT: # %bb.3: # %cond.load1 -; CHECK-NEXT: vpbroadcastw (%rax), %xmm1 -; CHECK-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1],xmm0[2,3,4,5,6,7] -; CHECK-NEXT: vinserti32x4 $0, %xmm1, %zmm0, %zmm0 -; CHECK-NEXT: .LBB0_4: # %else2 -; CHECK-NEXT: xorl %eax, %eax -; CHECK-NEXT: testb %al, %al -; CHECK-NEXT: jne .LBB0_6 -; CHECK-NEXT: # %bb.5: # %cond.load4 -; CHECK-NEXT: vpbroadcastw (%rax), %xmm1 -; CHECK-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2],xmm0[3,4,5,6,7] -; CHECK-NEXT: vinserti32x4 $0, %xmm1, %zmm0, %zmm0 -; CHECK-NEXT: .LBB0_6: # %else5 -; CHECK-NEXT: xorl %eax, %eax -; CHECK-NEXT: testb %al, %al -; CHECK-NEXT: jne .LBB0_8 -; CHECK-NEXT: # %bb.7: # %cond.load7 -; CHECK-NEXT: vpbroadcastw (%rax), %xmm1 -; CHECK-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0,1,2],xmm1[3],xmm0[4,5,6,7] -; CHECK-NEXT: vinserti32x4 $0, %xmm1, %zmm0, %zmm0 -; CHECK-NEXT: .LBB0_8: # %else8 -; CHECK-NEXT: xorl %eax, %eax -; CHECK-NEXT: testb %al, %al -; CHECK-NEXT: jne .LBB0_10 -; CHECK-NEXT: # %bb.9: # %cond.load10 -; CHECK-NEXT: vpbroadcastw (%rax), %xmm1 -; CHECK-NEXT: vpblendw {{.*#+}} xmm1 = 
xmm0[0,1,2,3],xmm1[4],xmm0[5,6,7] -; CHECK-NEXT: vinserti32x4 $0, %xmm1, %zmm0, %zmm0 -; CHECK-NEXT: .LBB0_10: # %else11 -; CHECK-NEXT: xorl %eax, %eax -; CHECK-NEXT: testb %al, %al -; CHECK-NEXT: jne .LBB0_12 -; CHECK-NEXT: # %bb.11: # %cond.load13 -; CHECK-NEXT: vpbroadcastw (%rax), %xmm1 -; CHECK-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0,1,2,3,4],xmm1[5],xmm0[6,7] -; CHECK-NEXT: vinserti32x4 $0, %xmm1, %zmm0, %zmm0 -; CHECK-NEXT: .LBB0_12: # %else14 -; CHECK-NEXT: xorl %eax, %eax -; CHECK-NEXT: testb %al, %al -; CHECK-NEXT: jne .LBB0_14 -; CHECK-NEXT: # %bb.13: # %cond.load16 -; CHECK-NEXT: vpbroadcastw (%rax), %xmm1 -; CHECK-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0,1,2,3,4,5],xmm1[6],xmm0[7] -; CHECK-NEXT: vinserti32x4 $0, %xmm1, %zmm0, %zmm0 -; CHECK-NEXT: .LBB0_14: # %else17 -; CHECK-NEXT: xorl %eax, %eax -; CHECK-NEXT: testb %al, %al -; CHECK-NEXT: jne .LBB0_16 -; CHECK-NEXT: # %bb.15: # %cond.load19 -; CHECK-NEXT: vpbroadcastw (%rax), %xmm1 -; CHECK-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0,1,2,3,4,5,6],xmm1[7] -; CHECK-NEXT: vinserti32x4 $0, %xmm1, %zmm0, %zmm0 -; CHECK-NEXT: .LBB0_16: # %else20 -; CHECK-NEXT: xorl %eax, %eax -; CHECK-NEXT: testb %al, %al -; CHECK-NEXT: jne .LBB0_18 -; CHECK-NEXT: # %bb.17: # %cond.load22 -; CHECK-NEXT: vpbroadcastw (%rax), %ymm1 -; CHECK-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm0[1,2,3,4,5,6,7],ymm1[8],ymm0[9,10,11,12,13,14,15] -; CHECK-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; CHECK-NEXT: vshuff64x2 {{.*#+}} zmm0 = zmm1[0,1,2,3],zmm0[4,5,6,7] -; CHECK-NEXT: .LBB0_18: # %else23 -; CHECK-NEXT: xorl %eax, %eax -; CHECK-NEXT: testb %al, %al -; CHECK-NEXT: jne .LBB0_20 -; CHECK-NEXT: # %bb.19: # %cond.load25 -; CHECK-NEXT: vpbroadcastw (%rax), %ymm1 -; CHECK-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0],ymm1[1],ymm0[2,3,4,5,6,7,8],ymm1[9],ymm0[10,11,12,13,14,15] -; CHECK-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; CHECK-NEXT: vshuff64x2 {{.*#+}} zmm0 = zmm1[0,1,2,3],zmm0[4,5,6,7] -; CHECK-NEXT: .LBB0_20: # %else26 -; CHECK-NEXT: xorl %eax, %eax -; CHECK-NEXT: testb %al, %al -; CHECK-NEXT: jne .LBB0_22 -; CHECK-NEXT: # %bb.21: # %cond.load28 -; CHECK-NEXT: vpbroadcastw (%rax), %ymm1 -; CHECK-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0,1],ymm1[2],ymm0[3,4,5,6,7,8,9],ymm1[10],ymm0[11,12,13,14,15] -; CHECK-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; CHECK-NEXT: vshuff64x2 {{.*#+}} zmm0 = zmm1[0,1,2,3],zmm0[4,5,6,7] -; CHECK-NEXT: .LBB0_22: # %else29 -; CHECK-NEXT: xorl %eax, %eax -; CHECK-NEXT: testb %al, %al -; CHECK-NEXT: jne .LBB0_24 -; CHECK-NEXT: # %bb.23: # %cond.load31 -; CHECK-NEXT: vpbroadcastw (%rax), %ymm1 -; CHECK-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0,1,2],ymm1[3],ymm0[4,5,6,7,8,9,10],ymm1[11],ymm0[12,13,14,15] -; CHECK-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; CHECK-NEXT: vshuff64x2 {{.*#+}} zmm0 = zmm1[0,1,2,3],zmm0[4,5,6,7] -; CHECK-NEXT: .LBB0_24: # %else32 -; CHECK-NEXT: xorl %eax, %eax -; CHECK-NEXT: testb %al, %al -; CHECK-NEXT: jne .LBB0_26 -; CHECK-NEXT: # %bb.25: # %cond.load34 -; CHECK-NEXT: vpbroadcastw (%rax), %ymm1 -; CHECK-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4],ymm0[5,6,7,8,9,10,11],ymm1[12],ymm0[13,14,15] -; CHECK-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; CHECK-NEXT: vshuff64x2 {{.*#+}} zmm0 = zmm1[0,1,2,3],zmm0[4,5,6,7] -; CHECK-NEXT: .LBB0_26: # %else35 -; CHECK-NEXT: xorl %eax, %eax -; CHECK-NEXT: testb %al, %al -; CHECK-NEXT: jne .LBB0_28 -; CHECK-NEXT: # %bb.27: # %cond.load37 -; CHECK-NEXT: vpbroadcastw (%rax), %ymm1 -; CHECK-NEXT: 
vpblendw {{.*#+}} ymm1 = ymm0[0,1,2,3,4],ymm1[5],ymm0[6,7,8,9,10,11,12],ymm1[13],ymm0[14,15] -; CHECK-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; CHECK-NEXT: vshuff64x2 {{.*#+}} zmm0 = zmm1[0,1,2,3],zmm0[4,5,6,7] -; CHECK-NEXT: .LBB0_28: # %else38 -; CHECK-NEXT: xorl %eax, %eax -; CHECK-NEXT: testb %al, %al -; CHECK-NEXT: jne .LBB0_30 -; CHECK-NEXT: # %bb.29: # %cond.load40 -; CHECK-NEXT: vpbroadcastw (%rax), %ymm1 -; CHECK-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0,1,2,3,4,5],ymm1[6],ymm0[7,8,9,10,11,12,13],ymm1[14],ymm0[15] -; CHECK-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; CHECK-NEXT: vshuff64x2 {{.*#+}} zmm0 = zmm1[0,1,2,3],zmm0[4,5,6,7] -; CHECK-NEXT: .LBB0_30: # %else41 -; CHECK-NEXT: xorl %eax, %eax -; CHECK-NEXT: testb %al, %al -; CHECK-NEXT: jne .LBB0_32 -; CHECK-NEXT: # %bb.31: # %cond.load43 -; CHECK-NEXT: vpbroadcastw (%rax), %ymm1 -; CHECK-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0,1,2,3,4,5,6],ymm1[7],ymm0[8,9,10,11,12,13,14],ymm1[15] -; CHECK-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; CHECK-NEXT: vshuff64x2 {{.*#+}} zmm0 = zmm1[0,1,2,3],zmm0[4,5,6,7] -; CHECK-NEXT: .LBB0_32: # %else44 -; CHECK-NEXT: xorl %eax, %eax -; CHECK-NEXT: testb %al, %al -; CHECK-NEXT: jne .LBB0_34 -; CHECK-NEXT: # %bb.33: # %cond.load46 -; CHECK-NEXT: vpbroadcastw (%rax), %xmm1 -; CHECK-NEXT: vextractf64x4 $1, %zmm0, %ymm2 -; CHECK-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3,4,5,6,7] -; CHECK-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] -; CHECK-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0 -; CHECK-NEXT: .LBB0_34: # %else47 -; CHECK-NEXT: xorl %eax, %eax -; CHECK-NEXT: testb %al, %al -; CHECK-NEXT: jne .LBB0_36 -; CHECK-NEXT: # %bb.35: # %cond.load49 -; CHECK-NEXT: vpbroadcastw (%rax), %xmm1 -; CHECK-NEXT: vextractf64x4 $1, %zmm0, %ymm2 -; CHECK-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2,3,4,5,6,7] -; CHECK-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] -; CHECK-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0 -; CHECK-NEXT: .LBB0_36: # %else50 -; CHECK-NEXT: xorl %eax, %eax -; CHECK-NEXT: testb %al, %al -; CHECK-NEXT: jne .LBB0_38 -; CHECK-NEXT: # %bb.37: # %cond.load52 -; CHECK-NEXT: vpbroadcastw (%rax), %xmm1 -; CHECK-NEXT: vextractf64x4 $1, %zmm0, %ymm2 -; CHECK-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2],xmm2[3,4,5,6,7] -; CHECK-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] -; CHECK-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0 -; CHECK-NEXT: .LBB0_38: # %else53 -; CHECK-NEXT: xorl %eax, %eax -; CHECK-NEXT: testb %al, %al -; CHECK-NEXT: jne .LBB0_40 -; CHECK-NEXT: # %bb.39: # %cond.load55 -; CHECK-NEXT: vpbroadcastw (%rax), %xmm1 -; CHECK-NEXT: vextractf64x4 $1, %zmm0, %ymm2 -; CHECK-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3],xmm2[4,5,6,7] -; CHECK-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] -; CHECK-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0 -; CHECK-NEXT: .LBB0_40: # %else56 -; CHECK-NEXT: xorl %eax, %eax -; CHECK-NEXT: testb %al, %al -; CHECK-NEXT: jne .LBB0_42 -; CHECK-NEXT: # %bb.41: # %cond.load58 -; CHECK-NEXT: vpbroadcastw (%rax), %xmm1 -; CHECK-NEXT: vextractf64x4 $1, %zmm0, %ymm2 -; CHECK-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3],xmm1[4],xmm2[5,6,7] -; CHECK-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] -; CHECK-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0 -; CHECK-NEXT: .LBB0_42: # %else59 -; CHECK-NEXT: xorl %eax, %eax -; CHECK-NEXT: testb %al, %al -; CHECK-NEXT: jne .LBB0_44 -; CHECK-NEXT: # %bb.43: # %cond.load61 -; CHECK-NEXT: vpbroadcastw (%rax), 
%xmm1 -; CHECK-NEXT: vextractf64x4 $1, %zmm0, %ymm2 -; CHECK-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3,4],xmm1[5],xmm2[6,7] -; CHECK-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] -; CHECK-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0 -; CHECK-NEXT: .LBB0_44: # %else62 -; CHECK-NEXT: xorl %eax, %eax -; CHECK-NEXT: testb %al, %al -; CHECK-NEXT: jne .LBB0_46 -; CHECK-NEXT: # %bb.45: # %cond.load64 -; CHECK-NEXT: vpbroadcastw (%rax), %xmm1 -; CHECK-NEXT: vextractf64x4 $1, %zmm0, %ymm2 -; CHECK-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3,4,5],xmm1[6],xmm2[7] -; CHECK-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] -; CHECK-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0 -; CHECK-NEXT: .LBB0_46: # %else65 -; CHECK-NEXT: xorl %eax, %eax -; CHECK-NEXT: testb %al, %al -; CHECK-NEXT: jne .LBB0_48 -; CHECK-NEXT: # %bb.47: # %cond.load67 -; CHECK-NEXT: vpbroadcastw (%rax), %xmm1 -; CHECK-NEXT: vextractf64x4 $1, %zmm0, %ymm2 -; CHECK-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3,4,5,6],xmm1[7] -; CHECK-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] -; CHECK-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0 -; CHECK-NEXT: .LBB0_48: # %else68 -; CHECK-NEXT: xorl %eax, %eax -; CHECK-NEXT: testb %al, %al -; CHECK-NEXT: jne .LBB0_50 -; CHECK-NEXT: # %bb.49: # %cond.load70 -; CHECK-NEXT: vpbroadcastw (%rax), %ymm1 -; CHECK-NEXT: vextractf64x4 $1, %zmm0, %ymm2 -; CHECK-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3,4,5,6,7],ymm1[8],ymm2[9,10,11,12,13,14,15] -; CHECK-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; CHECK-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0 -; CHECK-NEXT: .LBB0_50: # %else71 -; CHECK-NEXT: xorl %eax, %eax -; CHECK-NEXT: testb %al, %al -; CHECK-NEXT: jne .LBB0_52 -; CHECK-NEXT: # %bb.51: # %cond.load73 -; CHECK-NEXT: vpbroadcastw (%rax), %ymm1 -; CHECK-NEXT: vextractf64x4 $1, %zmm0, %ymm2 -; CHECK-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3,4,5,6,7,8],ymm1[9],ymm2[10,11,12,13,14,15] -; CHECK-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; CHECK-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0 -; CHECK-NEXT: .LBB0_52: # %else74 -; CHECK-NEXT: xorl %eax, %eax -; CHECK-NEXT: testb %al, %al -; CHECK-NEXT: jne .LBB0_54 -; CHECK-NEXT: # %bb.53: # %cond.load76 -; CHECK-NEXT: vpbroadcastw (%rax), %ymm1 -; CHECK-NEXT: vextractf64x4 $1, %zmm0, %ymm2 -; CHECK-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1],ymm1[2],ymm2[3,4,5,6,7,8,9],ymm1[10],ymm2[11,12,13,14,15] -; CHECK-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; CHECK-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0 -; CHECK-NEXT: .LBB0_54: # %else77 -; CHECK-NEXT: xorl %eax, %eax -; CHECK-NEXT: testb %al, %al -; CHECK-NEXT: jne .LBB0_56 -; CHECK-NEXT: # %bb.55: # %cond.load79 -; CHECK-NEXT: vpbroadcastw (%rax), %ymm1 -; CHECK-NEXT: vextractf64x4 $1, %zmm0, %ymm2 -; CHECK-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5,6,7,8,9,10],ymm1[11],ymm2[12,13,14,15] -; CHECK-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; CHECK-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0 -; CHECK-NEXT: .LBB0_56: # %else80 -; CHECK-NEXT: xorl %eax, %eax -; CHECK-NEXT: testb %al, %al -; CHECK-NEXT: jne .LBB0_58 -; CHECK-NEXT: # %bb.57: # %cond.load82 -; CHECK-NEXT: vpbroadcastw (%rax), %ymm1 -; CHECK-NEXT: vextractf64x4 $1, %zmm0, %ymm2 -; CHECK-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4],ymm2[5,6,7,8,9,10,11],ymm1[12],ymm2[13,14,15] -; CHECK-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; CHECK-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0 -; CHECK-NEXT: .LBB0_58: # 
%else83 -; CHECK-NEXT: xorl %eax, %eax -; CHECK-NEXT: testb %al, %al -; CHECK-NEXT: jne .LBB0_60 -; CHECK-NEXT: # %bb.59: # %cond.load85 -; CHECK-NEXT: vpbroadcastw (%rax), %ymm1 -; CHECK-NEXT: vextractf64x4 $1, %zmm0, %ymm2 -; CHECK-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5],ymm2[6,7,8,9,10,11,12],ymm1[13],ymm2[14,15] -; CHECK-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; CHECK-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0 -; CHECK-NEXT: .LBB0_60: # %else86 -; CHECK-NEXT: xorl %eax, %eax -; CHECK-NEXT: testb %al, %al -; CHECK-NEXT: jne .LBB0_62 -; CHECK-NEXT: # %bb.61: # %cond.load88 -; CHECK-NEXT: vpbroadcastw (%rax), %ymm1 -; CHECK-NEXT: vextractf64x4 $1, %zmm0, %ymm2 -; CHECK-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6],ymm2[7,8,9,10,11,12,13],ymm1[14],ymm2[15] -; CHECK-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; CHECK-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0 -; CHECK-NEXT: .LBB0_62: # %else89 -; CHECK-NEXT: xorl %eax, %eax -; CHECK-NEXT: testb %al, %al -; CHECK-NEXT: jne .LBB0_64 -; CHECK-NEXT: # %bb.63: # %cond.load91 -; CHECK-NEXT: vpbroadcastw (%rax), %ymm1 -; CHECK-NEXT: vextractf64x4 $1, %zmm0, %ymm2 -; CHECK-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7],ymm2[8,9,10,11,12,13,14],ymm1[15] -; CHECK-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; CHECK-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0 -; CHECK-NEXT: .LBB0_64: # %else92 +; CHECK-NEXT: vmovups (%rax), %zmm0 +; CHECK-NEXT: .LBB0_2: ; CHECK-NEXT: retq %1 = call <32 x half> @llvm.masked.load.v32f16.p0(ptr poison, i32 2, <32 x i1> poison, <32 x half> ) ret <32 x half> %1 diff --git a/llvm/test/Transforms/ScalarizeMaskedMemIntrin/X86/expand-masked-load.ll b/llvm/test/Transforms/ScalarizeMaskedMemIntrin/X86/expand-masked-load.ll index 9b1c59829b9ff..fffb5f021e52d 100644 --- a/llvm/test/Transforms/ScalarizeMaskedMemIntrin/X86/expand-masked-load.ll +++ b/llvm/test/Transforms/ScalarizeMaskedMemIntrin/X86/expand-masked-load.ll @@ -32,8 +32,8 @@ define <2 x i64> @scalarize_v2i64(ptr %p, <2 x i1> %mask, <2 x i64> %passthru) { define <2 x i64> @scalarize_v2i64_ones_mask(ptr %p, <2 x i64> %passthru) { ; CHECK-LABEL: @scalarize_v2i64_ones_mask( -; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr [[P:%.*]], align 8 -; CHECK-NEXT: ret <2 x i64> [[TMP1]] +; CHECK-NEXT: [[RET:%.*]] = load <2 x i64>, ptr [[P:%.*]], align 8 +; CHECK-NEXT: ret <2 x i64> [[RET]] ; %ret = call <2 x i64> @llvm.masked.load.v2i64.p0(ptr %p, i32 8, <2 x i1> , <2 x i64> %passthru) ret <2 x i64> %ret @@ -58,34 +58,18 @@ define <2 x i64> @scalarize_v2i64_const_mask(ptr %p, <2 x i64> %passthru) { ret <2 x i64> %ret } -; To be fixed: If the mask is the splat/broadcast of a non-constant value, use a -; vector load define <2 x i64> @scalarize_v2i64_splat_mask(ptr %p, i1 %mask, <2 x i64> %passthrough) { ; CHECK-LABEL: @scalarize_v2i64_splat_mask( ; CHECK-NEXT: [[MASK_VEC:%.*]] = insertelement <2 x i1> poison, i1 [[MASK:%.*]], i32 0 ; CHECK-NEXT: [[MASK_SPLAT:%.*]] = shufflevector <2 x i1> [[MASK_VEC]], <2 x i1> poison, <2 x i32> zeroinitializer -; CHECK-NEXT: [[SCALAR_MASK:%.*]] = bitcast <2 x i1> [[MASK_SPLAT]] to i2 -; CHECK-NEXT: [[TMP1:%.*]] = and i2 [[SCALAR_MASK]], 1 -; CHECK-NEXT: [[TMP2:%.*]] = icmp ne i2 [[TMP1]], 0 -; CHECK-NEXT: br i1 [[TMP2]], label [[COND_LOAD:%.*]], label [[ELSE:%.*]] +; CHECK-NEXT: [[MASK_SPLAT_FIRST:%.*]] = extractelement <2 x i1> [[MASK_SPLAT]], i64 0 +; CHECK-NEXT: br i1 [[MASK_SPLAT_FIRST]], label [[COND_LOAD:%.*]], label [[TMP1:%.*]] ; CHECK: cond.load: -; 
CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[P:%.*]], i32 0 -; CHECK-NEXT: [[TMP4:%.*]] = load i64, ptr [[TMP3]], align 8 -; CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x i64> [[PASSTHROUGH:%.*]], i64 [[TMP4]], i64 0 -; CHECK-NEXT: br label [[ELSE]] -; CHECK: else: -; CHECK-NEXT: [[RES_PHI_ELSE:%.*]] = phi <2 x i64> [ [[TMP5]], [[COND_LOAD]] ], [ [[PASSTHROUGH]], [[TMP0:%.*]] ] -; CHECK-NEXT: [[TMP6:%.*]] = and i2 [[SCALAR_MASK]], -2 -; CHECK-NEXT: [[TMP7:%.*]] = icmp ne i2 [[TMP6]], 0 -; CHECK-NEXT: br i1 [[TMP7]], label [[COND_LOAD1:%.*]], label [[ELSE2:%.*]] -; CHECK: cond.load1: -; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[P]], i32 1 -; CHECK-NEXT: [[TMP9:%.*]] = load i64, ptr [[TMP8]], align 8 -; CHECK-NEXT: [[TMP10:%.*]] = insertelement <2 x i64> [[RES_PHI_ELSE]], i64 [[TMP9]], i64 1 -; CHECK-NEXT: br label [[ELSE2]] -; CHECK: else2: -; CHECK-NEXT: [[RES_PHI_ELSE3:%.*]] = phi <2 x i64> [ [[TMP10]], [[COND_LOAD1]] ], [ [[RES_PHI_ELSE]], [[ELSE]] ] -; CHECK-NEXT: ret <2 x i64> [[RES_PHI_ELSE3]] +; CHECK-NEXT: [[RET_COND_LOAD:%.*]] = load <2 x i64>, ptr [[P:%.*]], align 8 +; CHECK-NEXT: br label [[TMP1]] +; CHECK: 1: +; CHECK-NEXT: [[RET:%.*]] = phi <2 x i64> [ [[RET_COND_LOAD]], [[COND_LOAD]] ], [ [[PASSTHROUGH:%.*]], [[TMP0:%.*]] ] +; CHECK-NEXT: ret <2 x i64> [[RET]] ; %mask.vec = insertelement <2 x i1> poison, i1 %mask, i32 0 %mask.splat = shufflevector <2 x i1> %mask.vec, <2 x i1> poison, <2 x i32> zeroinitializer diff --git a/llvm/test/Transforms/ScalarizeMaskedMemIntrin/X86/expand-masked-store.ll b/llvm/test/Transforms/ScalarizeMaskedMemIntrin/X86/expand-masked-store.ll index cd2815e67e672..4e3679dc5da99 100644 --- a/llvm/test/Transforms/ScalarizeMaskedMemIntrin/X86/expand-masked-store.ll +++ b/llvm/test/Transforms/ScalarizeMaskedMemIntrin/X86/expand-masked-store.ll @@ -56,31 +56,16 @@ define void @scalarize_v2i64_const_mask(ptr %p, <2 x i64> %data) { ret void } -; To be fixed: If the mask is the splat/broadcast of a non-constant value, use a -; vector store define void @scalarize_v2i64_splat_mask(ptr %p, <2 x i64> %data, i1 %mask) { ; CHECK-LABEL: @scalarize_v2i64_splat_mask( ; CHECK-NEXT: [[MASK_VEC:%.*]] = insertelement <2 x i1> poison, i1 [[MASK:%.*]], i32 0 ; CHECK-NEXT: [[MASK_SPLAT:%.*]] = shufflevector <2 x i1> [[MASK_VEC]], <2 x i1> poison, <2 x i32> zeroinitializer -; CHECK-NEXT: [[SCALAR_MASK:%.*]] = bitcast <2 x i1> [[MASK_SPLAT]] to i2 -; CHECK-NEXT: [[TMP1:%.*]] = and i2 [[SCALAR_MASK]], 1 -; CHECK-NEXT: [[TMP2:%.*]] = icmp ne i2 [[TMP1]], 0 -; CHECK-NEXT: br i1 [[TMP2]], label [[COND_STORE:%.*]], label [[ELSE:%.*]] +; CHECK-NEXT: [[MASK_SPLAT_FIRST:%.*]] = extractelement <2 x i1> [[MASK_SPLAT]], i64 0 +; CHECK-NEXT: br i1 [[MASK_SPLAT_FIRST]], label [[COND_STORE:%.*]], label [[TMP1:%.*]] ; CHECK: cond.store: -; CHECK-NEXT: [[TMP3:%.*]] = extractelement <2 x i64> [[DATA:%.*]], i64 0 -; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[P:%.*]], i32 0 -; CHECK-NEXT: store i64 [[TMP3]], ptr [[TMP4]], align 8 -; CHECK-NEXT: br label [[ELSE]] -; CHECK: else: -; CHECK-NEXT: [[TMP5:%.*]] = and i2 [[SCALAR_MASK]], -2 -; CHECK-NEXT: [[TMP6:%.*]] = icmp ne i2 [[TMP5]], 0 -; CHECK-NEXT: br i1 [[TMP6]], label [[COND_STORE1:%.*]], label [[ELSE2:%.*]] -; CHECK: cond.store1: -; CHECK-NEXT: [[TMP7:%.*]] = extractelement <2 x i64> [[DATA]], i64 1 -; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[P]], i32 1 -; CHECK-NEXT: store i64 [[TMP7]], ptr [[TMP8]], align 8 -; CHECK-NEXT: br label [[ELSE2]] -; CHECK: else2: +; 
CHECK-NEXT: store <2 x i64> [[DATA:%.*]], ptr [[P:%.*]], align 8 +; CHECK-NEXT: br label [[TMP1]] +; CHECK: 1: ; CHECK-NEXT: ret void ; %mask.vec = insertelement <2 x i1> poison, i1 %mask, i32 0