diff --git a/llvm/lib/Target/X86/X86FixupVectorConstants.cpp b/llvm/lib/Target/X86/X86FixupVectorConstants.cpp index 9da8d7338ea36c..ca52cb384a7ee6 100644 --- a/llvm/lib/Target/X86/X86FixupVectorConstants.cpp +++ b/llvm/lib/Target/X86/X86FixupVectorConstants.cpp @@ -67,6 +67,9 @@ FunctionPass *llvm::createX86FixupVectorConstants() { static std::optional<APInt> extractConstantBits(const Constant *C) { unsigned NumBits = C->getType()->getPrimitiveSizeInBits(); + if (auto *CUndef = dyn_cast<UndefValue>(C)) + return APInt::getZero(NumBits); + if (auto *CInt = dyn_cast<ConstantInt>(C)) return CInt->getValue(); @@ -80,6 +83,18 @@ static std::optional<APInt> extractConstantBits(const Constant *C) { return APInt::getSplat(NumBits, *Bits); } } + + APInt Bits = APInt::getZero(NumBits); + for (unsigned I = 0, E = CV->getNumOperands(); I != E; ++I) { + Constant *Elt = CV->getOperand(I); + std::optional<APInt> SubBits = extractConstantBits(Elt); + if (!SubBits) + return std::nullopt; + assert(NumBits == (E * SubBits->getBitWidth()) && + "Illegal vector element size"); + Bits.insertBits(*SubBits, I * SubBits->getBitWidth()); + } + return Bits; } if (auto *CDS = dyn_cast<ConstantDataSequential>(C)) { @@ -223,6 +238,35 @@ static Constant *rebuildSplatableConstant(const Constant *C, return rebuildConstant(OriginalType->getContext(), SclTy, *Splat, NumSclBits); } +static Constant *rebuildZeroUpperConstant(const Constant *C, + unsigned ScalarBitWidth) { + Type *Ty = C->getType(); + Type *SclTy = Ty->getScalarType(); + unsigned NumBits = Ty->getPrimitiveSizeInBits(); + unsigned NumSclBits = SclTy->getPrimitiveSizeInBits(); + LLVMContext &Ctx = C->getContext(); + + if (NumBits > ScalarBitWidth) { + // Determine if the upper bits are all zero. + if (std::optional<APInt> Bits = extractConstantBits(C)) { + if (Bits->countLeadingZeros() >= (NumBits - ScalarBitWidth)) { + // If the original constant was made of smaller elements, try to retain + // those types. + if (ScalarBitWidth > NumSclBits && (ScalarBitWidth % NumSclBits) == 0) + return rebuildConstant(Ctx, SclTy, *Bits, NumSclBits); + + // Fallback to raw integer bits. + APInt RawBits = Bits->zextOrTrunc(ScalarBitWidth); + return ConstantInt::get(Ctx, RawBits); + } + } + } + + return nullptr; +} + +typedef std::function<Constant *(const Constant *, unsigned)> RebuildFn; + bool X86FixupVectorConstantsPass::processInstruction(MachineFunction &MF, MachineBasicBlock &MBB, MachineInstr &MI) { @@ -233,37 +277,45 @@ bool X86FixupVectorConstantsPass::processInstruction(MachineFunction &MF, bool HasBWI = ST->hasBWI(); bool HasVLX = ST->hasVLX(); - auto ConvertToBroadcast = [&](unsigned OpBcst256, unsigned OpBcst128, - unsigned OpBcst64, unsigned OpBcst32, - unsigned OpBcst16, unsigned OpBcst8, - unsigned OperandNo) { - assert(MI.getNumOperands() >= (OperandNo + X86::AddrNumOperands) && - "Unexpected number of operands!"); - - if (auto *C = X86::getConstantFromPool(MI, OperandNo)) { - // Attempt to detect a suitable splat from increasing splat widths. - std::pair<unsigned, unsigned> Broadcasts[] = { - {8, OpBcst8}, {16, OpBcst16}, {32, OpBcst32}, - {64, OpBcst64}, {128, OpBcst128}, {256, OpBcst256}, - }; - for (auto [BitWidth, OpBcst] : Broadcasts) { - if (OpBcst) { - // Construct a suitable splat constant and adjust the MI to - // use the new constant pool entry.
- if (Constant *NewCst = rebuildSplatableConstant(C, BitWidth)) { - unsigned NewCPI = - CP->getConstantPoolIndex(NewCst, Align(BitWidth / 8)); - MI.setDesc(TII->get(OpBcst)); - MI.getOperand(OperandNo + X86::AddrDisp).setIndex(NewCPI); - return true; + auto FixupConstant = + [&](unsigned OpBcst256, unsigned OpBcst128, unsigned OpBcst64, + unsigned OpBcst32, unsigned OpBcst16, unsigned OpBcst8, + unsigned OpUpper64, unsigned OpUpper32, unsigned OperandNo) { + assert(MI.getNumOperands() >= (OperandNo + X86::AddrNumOperands) && + "Unexpected number of operands!"); + + if (auto *C = X86::getConstantFromPool(MI, OperandNo)) { + // Attempt to detect a suitable splat/vzload from increasing constant + // bitwidths. + // Prefer vzload vs broadcast for same bitwidth to avoid domain flips. + std::tuple<unsigned, unsigned, RebuildFn> FixupLoad[] = { + {8, OpBcst8, rebuildSplatableConstant}, + {16, OpBcst16, rebuildSplatableConstant}, + {32, OpUpper32, rebuildZeroUpperConstant}, + {32, OpBcst32, rebuildSplatableConstant}, + {64, OpUpper64, rebuildZeroUpperConstant}, + {64, OpBcst64, rebuildSplatableConstant}, + {128, OpBcst128, rebuildSplatableConstant}, + {256, OpBcst256, rebuildSplatableConstant}, + }; + for (auto [BitWidth, Op, RebuildConstant] : FixupLoad) { + if (Op) { + // Construct a suitable constant and adjust the MI to use the new + // constant pool entry. + if (Constant *NewCst = RebuildConstant(C, BitWidth)) { + unsigned NewCPI = + CP->getConstantPoolIndex(NewCst, Align(BitWidth / 8)); + MI.setDesc(TII->get(Op)); + MI.getOperand(OperandNo + X86::AddrDisp).setIndex(NewCPI); + return true; + } + } } } - } - } - return false; - }; + return false; + }; - // Attempt to convert full width vector loads into broadcast loads. + // Attempt to convert full width vector loads into broadcast/vzload loads.
switch (Opc) { /* FP Loads */ case X86::MOVAPDrm: @@ -271,79 +323,82 @@ bool X86FixupVectorConstantsPass::processInstruction(MachineFunction &MF, case X86::MOVUPDrm: case X86::MOVUPSrm: // TODO: SSE3 MOVDDUP Handling - return false; + return FixupConstant(0, 0, 0, 0, 0, 0, X86::MOVSDrm, X86::MOVSSrm, 1); case X86::VMOVAPDrm: case X86::VMOVAPSrm: case X86::VMOVUPDrm: case X86::VMOVUPSrm: - return ConvertToBroadcast(0, 0, X86::VMOVDDUPrm, X86::VBROADCASTSSrm, 0, 0, - 1); + return FixupConstant(0, 0, X86::VMOVDDUPrm, X86::VBROADCASTSSrm, 0, 0, + X86::VMOVSDrm, X86::VMOVSSrm, 1); case X86::VMOVAPDYrm: case X86::VMOVAPSYrm: case X86::VMOVUPDYrm: case X86::VMOVUPSYrm: - return ConvertToBroadcast(0, X86::VBROADCASTF128rm, X86::VBROADCASTSDYrm, - X86::VBROADCASTSSYrm, 0, 0, 1); + return FixupConstant(0, X86::VBROADCASTF128rm, X86::VBROADCASTSDYrm, + X86::VBROADCASTSSYrm, 0, 0, 0, 0, 1); case X86::VMOVAPDZ128rm: case X86::VMOVAPSZ128rm: case X86::VMOVUPDZ128rm: case X86::VMOVUPSZ128rm: - return ConvertToBroadcast(0, 0, X86::VMOVDDUPZ128rm, - X86::VBROADCASTSSZ128rm, 0, 0, 1); + return FixupConstant(0, 0, X86::VMOVDDUPZ128rm, X86::VBROADCASTSSZ128rm, 0, + 0, X86::VMOVSDZrm, X86::VMOVSSZrm, 1); case X86::VMOVAPDZ256rm: case X86::VMOVAPSZ256rm: case X86::VMOVUPDZ256rm: case X86::VMOVUPSZ256rm: - return ConvertToBroadcast(0, X86::VBROADCASTF32X4Z256rm, - X86::VBROADCASTSDZ256rm, X86::VBROADCASTSSZ256rm, - 0, 0, 1); + return FixupConstant(0, X86::VBROADCASTF32X4Z256rm, X86::VBROADCASTSDZ256rm, + X86::VBROADCASTSSZ256rm, 0, 0, 0, 0, 1); case X86::VMOVAPDZrm: case X86::VMOVAPSZrm: case X86::VMOVUPDZrm: case X86::VMOVUPSZrm: - return ConvertToBroadcast(X86::VBROADCASTF64X4rm, X86::VBROADCASTF32X4rm, - X86::VBROADCASTSDZrm, X86::VBROADCASTSSZrm, 0, 0, - 1); + return FixupConstant(X86::VBROADCASTF64X4rm, X86::VBROADCASTF32X4rm, + X86::VBROADCASTSDZrm, X86::VBROADCASTSSZrm, 0, 0, 0, 0, + 1); /* Integer Loads */ + case X86::MOVDQArm: + case X86::MOVDQUrm: + return FixupConstant(0, 0, 0, 0, 0, 0, X86::MOVQI2PQIrm, X86::MOVDI2PDIrm, + 1); case X86::VMOVDQArm: case X86::VMOVDQUrm: - return ConvertToBroadcast( - 0, 0, HasAVX2 ? X86::VPBROADCASTQrm : X86::VMOVDDUPrm, - HasAVX2 ? X86::VPBROADCASTDrm : X86::VBROADCASTSSrm, - HasAVX2 ? X86::VPBROADCASTWrm : 0, HasAVX2 ? X86::VPBROADCASTBrm : 0, - 1); + return FixupConstant(0, 0, HasAVX2 ? X86::VPBROADCASTQrm : X86::VMOVDDUPrm, + HasAVX2 ? X86::VPBROADCASTDrm : X86::VBROADCASTSSrm, + HasAVX2 ? X86::VPBROADCASTWrm : 0, + HasAVX2 ? X86::VPBROADCASTBrm : 0, X86::VMOVQI2PQIrm, + X86::VMOVDI2PDIrm, 1); case X86::VMOVDQAYrm: case X86::VMOVDQUYrm: - return ConvertToBroadcast( + return FixupConstant( 0, HasAVX2 ? X86::VBROADCASTI128rm : X86::VBROADCASTF128rm, HasAVX2 ? X86::VPBROADCASTQYrm : X86::VBROADCASTSDYrm, HasAVX2 ? X86::VPBROADCASTDYrm : X86::VBROADCASTSSYrm, HasAVX2 ? X86::VPBROADCASTWYrm : 0, HasAVX2 ? X86::VPBROADCASTBYrm : 0, - 1); + 0, 0, 1); case X86::VMOVDQA32Z128rm: case X86::VMOVDQA64Z128rm: case X86::VMOVDQU32Z128rm: case X86::VMOVDQU64Z128rm: - return ConvertToBroadcast(0, 0, X86::VPBROADCASTQZ128rm, - X86::VPBROADCASTDZ128rm, - HasBWI ? X86::VPBROADCASTWZ128rm : 0, - HasBWI ? X86::VPBROADCASTBZ128rm : 0, 1); + return FixupConstant(0, 0, X86::VPBROADCASTQZ128rm, X86::VPBROADCASTDZ128rm, + HasBWI ? X86::VPBROADCASTWZ128rm : 0, + HasBWI ? 
X86::VPBROADCASTBZ128rm : 0, + X86::VMOVQI2PQIZrm, X86::VMOVDI2PDIZrm, 1); case X86::VMOVDQA32Z256rm: case X86::VMOVDQA64Z256rm: case X86::VMOVDQU32Z256rm: case X86::VMOVDQU64Z256rm: - return ConvertToBroadcast(0, X86::VBROADCASTI32X4Z256rm, - X86::VPBROADCASTQZ256rm, X86::VPBROADCASTDZ256rm, - HasBWI ? X86::VPBROADCASTWZ256rm : 0, - HasBWI ? X86::VPBROADCASTBZ256rm : 0, 1); + return FixupConstant(0, X86::VBROADCASTI32X4Z256rm, X86::VPBROADCASTQZ256rm, + X86::VPBROADCASTDZ256rm, + HasBWI ? X86::VPBROADCASTWZ256rm : 0, + HasBWI ? X86::VPBROADCASTBZ256rm : 0, 0, 0, 1); case X86::VMOVDQA32Zrm: case X86::VMOVDQA64Zrm: case X86::VMOVDQU32Zrm: case X86::VMOVDQU64Zrm: - return ConvertToBroadcast(X86::VBROADCASTI64X4rm, X86::VBROADCASTI32X4rm, - X86::VPBROADCASTQZrm, X86::VPBROADCASTDZrm, - HasBWI ? X86::VPBROADCASTWZrm : 0, - HasBWI ? X86::VPBROADCASTBZrm : 0, 1); + return FixupConstant(X86::VBROADCASTI64X4rm, X86::VBROADCASTI32X4rm, + X86::VPBROADCASTQZrm, X86::VPBROADCASTDZrm, + HasBWI ? X86::VPBROADCASTWZrm : 0, + HasBWI ? X86::VPBROADCASTBZrm : 0, 0, 0, 1); } auto ConvertToBroadcastAVX512 = [&](unsigned OpSrc32, unsigned OpSrc64) { @@ -368,7 +423,7 @@ bool X86FixupVectorConstantsPass::processInstruction(MachineFunction &MF, if (OpBcst32 || OpBcst64) { unsigned OpNo = OpBcst32 == 0 ? OpNoBcst64 : OpNoBcst32; - return ConvertToBroadcast(0, 0, OpBcst64, OpBcst32, 0, 0, OpNo); + return FixupConstant(0, 0, OpBcst64, OpBcst32, 0, 0, 0, 0, OpNo); } return false; }; diff --git a/llvm/test/CodeGen/X86/2011-20-21-zext-ui2fp.ll b/llvm/test/CodeGen/X86/2011-20-21-zext-ui2fp.ll index e4e5d51d272d61..1ae51ee9756388 100644 --- a/llvm/test/CodeGen/X86/2011-20-21-zext-ui2fp.ll +++ b/llvm/test/CodeGen/X86/2011-20-21-zext-ui2fp.ll @@ -7,7 +7,7 @@ define void @ui_to_fp_conv(ptr nocapture %aFOO, ptr nocapture %RET) nounwind { ; CHECK-LABEL: ui_to_fp_conv: ; CHECK: # %bb.0: # %allocas -; CHECK-NEXT: movaps {{.*#+}} xmm0 = [1.0E+0,1.0E+0,0.0E+0,0.0E+0] +; CHECK-NEXT: movsd {{.*#+}} xmm0 = [1.0E+0,1.0E+0,0.0E+0,0.0E+0] ; CHECK-NEXT: xorps %xmm1, %xmm1 ; CHECK-NEXT: movups %xmm1, 16(%rsi) ; CHECK-NEXT: movups %xmm0, (%rsi) diff --git a/llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast.ll b/llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast.ll index fe48059f9d0e65..e592b714a05dc8 100644 --- a/llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast.ll +++ b/llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast.ll @@ -1053,7 +1053,7 @@ define void @vec256_i8_widen_to_i16_factor2_broadcast_to_v16i16_factor16(ptr %in ; SSE42-NEXT: paddb 48(%rsi), %xmm2 ; SSE42-NEXT: paddb (%rsi), %xmm0 ; SSE42-NEXT: paddb 32(%rsi), %xmm1 -; SSE42-NEXT: movdqa {{.*#+}} xmm3 = [1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u] +; SSE42-NEXT: movq {{.*#+}} xmm3 = [1,3,5,7,9,11,13,15,0,0,0,0,0,0,0,0] ; SSE42-NEXT: pshufb %xmm3, %xmm1 ; SSE42-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; SSE42-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] @@ -1075,8 +1075,7 @@ define void @vec256_i8_widen_to_i16_factor2_broadcast_to_v16i16_factor16(ptr %in ; AVX-NEXT: vpaddb 48(%rsi), %xmm2, %xmm2 ; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX-NEXT: vpaddb 32(%rsi), %xmm1, %xmm1 -; AVX-NEXT: vmovddup {{.*#+}} xmm3 = [1,3,5,7,9,11,13,15,1,3,5,7,9,11,13,15] -; AVX-NEXT: # xmm3 = mem[0,0] +; AVX-NEXT: vmovq {{.*#+}} xmm3 = [1,3,5,7,9,11,13,15,0,0,0,0,0,0,0,0] ; AVX-NEXT: vpshufb %xmm3, %xmm1, %xmm1 ; AVX-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = 
xmm0[0,0,0,0,4,5,6,7] diff --git a/llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast_from_memory.ll b/llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast_from_memory.ll index 2f576fe6715904..d3f6bd20a0127c 100644 --- a/llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast_from_memory.ll +++ b/llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast_from_memory.ll @@ -875,7 +875,7 @@ define void @vec256_i8_widen_to_i16_factor2_broadcast_to_v16i16_factor16(ptr %in ; SSE42-NEXT: movdqa (%rdi), %xmm0 ; SSE42-NEXT: movdqa 32(%rdi), %xmm1 ; SSE42-NEXT: movdqa 48(%rdi), %xmm2 -; SSE42-NEXT: movdqa {{.*#+}} xmm3 = [1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u] +; SSE42-NEXT: movq {{.*#+}} xmm3 = [1,3,5,7,9,11,13,15,0,0,0,0,0,0,0,0] ; SSE42-NEXT: pshufb %xmm3, %xmm1 ; SSE42-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; SSE42-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] @@ -894,8 +894,7 @@ define void @vec256_i8_widen_to_i16_factor2_broadcast_to_v16i16_factor16(ptr %in ; AVX-NEXT: vmovdqa (%rdi), %xmm0 ; AVX-NEXT: vmovdqa 32(%rdi), %xmm1 ; AVX-NEXT: vmovdqa 48(%rdi), %xmm2 -; AVX-NEXT: vmovddup {{.*#+}} xmm3 = [1,3,5,7,9,11,13,15,1,3,5,7,9,11,13,15] -; AVX-NEXT: # xmm3 = mem[0,0] +; AVX-NEXT: vmovq {{.*#+}} xmm3 = [1,3,5,7,9,11,13,15,0,0,0,0,0,0,0,0] ; AVX-NEXT: vpshufb %xmm3, %xmm1, %xmm1 ; AVX-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] diff --git a/llvm/test/CodeGen/X86/avx-load-store.ll b/llvm/test/CodeGen/X86/avx-load-store.ll index 33eb704788740a..3f856d33145d86 100644 --- a/llvm/test/CodeGen/X86/avx-load-store.ll +++ b/llvm/test/CodeGen/X86/avx-load-store.ll @@ -220,7 +220,7 @@ define void @f_f() nounwind { ; CHECK-NEXT: testb %al, %al ; CHECK-NEXT: jne .LBB9_4 ; CHECK-NEXT: # %bb.3: # %cif_mixed_test_all -; CHECK-NEXT: vmovaps {{.*#+}} xmm0 = [4294967295,0,0,0] +; CHECK-NEXT: vmovss {{.*#+}} xmm0 = [4294967295,0,0,0] ; CHECK-NEXT: vmaskmovps %ymm0, %ymm0, (%rax) ; CHECK-NEXT: .LBB9_4: # %cif_mixed_test_any_check ; diff --git a/llvm/test/CodeGen/X86/avx2-arith.ll b/llvm/test/CodeGen/X86/avx2-arith.ll index 3e695811719448..d32143cf33f2fa 100644 --- a/llvm/test/CodeGen/X86/avx2-arith.ll +++ b/llvm/test/CodeGen/X86/avx2-arith.ll @@ -234,7 +234,7 @@ define <8 x i16> @mul_const8(<8 x i16> %x) { define <8 x i32> @mul_const9(<8 x i32> %x) { ; CHECK-LABEL: mul_const9: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovdqa {{.*#+}} xmm1 = [2,0,0,0] +; CHECK-NEXT: vmovd {{.*#+}} xmm1 = [2,0,0,0] ; CHECK-NEXT: vpmulld %ymm1, %ymm0, %ymm0 ; CHECK-NEXT: ret{{[l|q]}} %y = mul <8 x i32> %x, diff --git a/llvm/test/CodeGen/X86/avx512-shuffles/shuffle-chained-bf16.ll b/llvm/test/CodeGen/X86/avx512-shuffles/shuffle-chained-bf16.ll index 99d6049fc1d865..12ce721b8c5d53 100644 --- a/llvm/test/CodeGen/X86/avx512-shuffles/shuffle-chained-bf16.ll +++ b/llvm/test/CodeGen/X86/avx512-shuffles/shuffle-chained-bf16.ll @@ -13,7 +13,7 @@ define <2 x bfloat> @shuffle_chained_v32bf16_v2bf16(<32 x bfloat> %a) { ; CHECK-NEXT: .cfi_def_cfa_register %rbp ; CHECK-NEXT: andq $-64, %rsp ; CHECK-NEXT: subq $128, %rsp -; CHECK-NEXT: vpbroadcastd {{.*#+}} xmm1 = [0,16,0,16,0,16,0,16] +; CHECK-NEXT: vmovd {{.*#+}} xmm1 = [0,16,0,0,0,0,0,0] ; CHECK-NEXT: vpermw %zmm0, %zmm1, %zmm0 ; CHECK-NEXT: vmovdqa64 %zmm0, (%rsp) ; CHECK-NEXT: vmovaps (%rsp), %xmm0 diff --git a/llvm/test/CodeGen/X86/bitreverse.ll b/llvm/test/CodeGen/X86/bitreverse.ll index 2d978b5e991c91..26b1d64874e590 100644 --- 
a/llvm/test/CodeGen/X86/bitreverse.ll +++ b/llvm/test/CodeGen/X86/bitreverse.ll @@ -587,17 +587,17 @@ define <2 x i16> @fold_v2i16() { ; ; X64-LABEL: fold_v2i16: ; X64: # %bb.0: -; X64-NEXT: movaps {{.*#+}} xmm0 = [61440,240,u,u,u,u,u,u] +; X64-NEXT: movss {{.*#+}} xmm0 = [61440,240,0,0,0,0,0,0] ; X64-NEXT: retq ; ; X86XOP-LABEL: fold_v2i16: ; X86XOP: # %bb.0: -; X86XOP-NEXT: vbroadcastss {{.*#+}} xmm0 = [61440,240,61440,240,61440,240,61440,240] +; X86XOP-NEXT: vmovss {{.*#+}} xmm0 = [61440,240,0,0,0,0,0,0] ; X86XOP-NEXT: retl ; ; GFNI-LABEL: fold_v2i16: ; GFNI: # %bb.0: -; GFNI-NEXT: vbroadcastss {{.*#+}} xmm0 = [61440,240,61440,240,61440,240,61440,240] +; GFNI-NEXT: vmovss {{.*#+}} xmm0 = [61440,240,0,0,0,0,0,0] ; GFNI-NEXT: retq %b = call <2 x i16> @llvm.bitreverse.v2i16(<2 x i16> ) ret <2 x i16> %b diff --git a/llvm/test/CodeGen/X86/combine-srl.ll b/llvm/test/CodeGen/X86/combine-srl.ll index 3e0f581ea01a0a..125196a0819b61 100644 --- a/llvm/test/CodeGen/X86/combine-srl.ll +++ b/llvm/test/CodeGen/X86/combine-srl.ll @@ -356,7 +356,7 @@ define <4 x i32> @combine_vec_lshr_lzcnt_bit1(<4 x i32> %x) { ; SSE-LABEL: combine_vec_lshr_lzcnt_bit1: ; SSE: # %bb.0: ; SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; SSE-NEXT: movdqa {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; SSE-NEXT: movq {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] ; SSE-NEXT: movdqa %xmm1, %xmm2 ; SSE-NEXT: pshufb %xmm0, %xmm2 ; SSE-NEXT: psrlw $4, %xmm0 @@ -378,7 +378,7 @@ define <4 x i32> @combine_vec_lshr_lzcnt_bit1(<4 x i32> %x) { ; AVX-LABEL: combine_vec_lshr_lzcnt_bit1: ; AVX: # %bb.0: ; AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; AVX-NEXT: vmovq {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] ; AVX-NEXT: vpshufb %xmm0, %xmm1, %xmm2 ; AVX-NEXT: vpsrlw $4, %xmm0, %xmm0 ; AVX-NEXT: vpxor %xmm3, %xmm3, %xmm3 diff --git a/llvm/test/CodeGen/X86/combine-subo.ll b/llvm/test/CodeGen/X86/combine-subo.ll index 235df0a666ee9f..5e4bba6e0fd35c 100644 --- a/llvm/test/CodeGen/X86/combine-subo.ll +++ b/llvm/test/CodeGen/X86/combine-subo.ll @@ -217,13 +217,13 @@ define { <4 x i8>, <4 x i1> } @always_usub_const_vector() nounwind { define { <4 x i8>, <4 x i1> } @never_usub_const_vector() nounwind { ; SSE-LABEL: never_usub_const_vector: ; SSE: # %bb.0: -; SSE-NEXT: movaps {{.*#+}} xmm0 = [127,255,0,254,u,u,u,u,u,u,u,u,u,u,u,u] +; SSE-NEXT: movss {{.*#+}} xmm0 = [127,255,0,254,0,0,0,0,0,0,0,0,0,0,0,0] ; SSE-NEXT: xorps %xmm1, %xmm1 ; SSE-NEXT: retq ; ; AVX-LABEL: never_usub_const_vector: ; AVX: # %bb.0: -; AVX-NEXT: vbroadcastss {{.*#+}} xmm0 = [127,255,0,254,127,255,0,254,127,255,0,254,127,255,0,254] +; AVX-NEXT: vmovss {{.*#+}} xmm0 = [127,255,0,254,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; AVX-NEXT: retq %x = call { <4 x i8>, <4 x i1> } @llvm.usub.with.overflow.v4i8(<4 x i8> , <4 x i8> ) diff --git a/llvm/test/CodeGen/X86/constant-pool-sharing.ll b/llvm/test/CodeGen/X86/constant-pool-sharing.ll index db5a7810974c9d..062d87ed035fd6 100644 --- a/llvm/test/CodeGen/X86/constant-pool-sharing.ll +++ b/llvm/test/CodeGen/X86/constant-pool-sharing.ll @@ -77,7 +77,7 @@ define void @store_repeated_constants(ptr %lo, ptr %hi) { ; SSE-LINUX: # %bb.0: ; SSE-LINUX-NEXT: xorps %xmm0, %xmm0 ; SSE-LINUX-NEXT: movaps %xmm0, 48(%rdi) -; SSE-LINUX-NEXT: movaps {{.*#+}} xmm1 = [18446744073709551615,0] +; SSE-LINUX-NEXT: movsd {{.*#+}} xmm1 = [18446744073709551615,0] ; SSE-LINUX-NEXT: movaps %xmm1, 32(%rdi) ; SSE-LINUX-NEXT: 
movaps %xmm1, 16(%rdi) ; SSE-LINUX-NEXT: movaps %xmm1, (%rdi) @@ -92,7 +92,7 @@ define void @store_repeated_constants(ptr %lo, ptr %hi) { ; SSE-MSVC: # %bb.0: ; SSE-MSVC-NEXT: xorps %xmm0, %xmm0 ; SSE-MSVC-NEXT: movaps %xmm0, 48(%rcx) -; SSE-MSVC-NEXT: movaps {{.*#+}} xmm1 = [18446744073709551615,0] +; SSE-MSVC-NEXT: movsd {{.*#+}} xmm1 = [18446744073709551615,0] ; SSE-MSVC-NEXT: movaps %xmm1, 32(%rcx) ; SSE-MSVC-NEXT: movaps %xmm1, 16(%rcx) ; SSE-MSVC-NEXT: movaps %xmm1, (%rcx) diff --git a/llvm/test/CodeGen/X86/dpbusd.ll b/llvm/test/CodeGen/X86/dpbusd.ll index 06f11e6d527bde..fbea08eb1e5502 100644 --- a/llvm/test/CodeGen/X86/dpbusd.ll +++ b/llvm/test/CodeGen/X86/dpbusd.ll @@ -379,7 +379,7 @@ define i32 @vpdpbusd_2xi32(ptr%a, ptr%b, i32 %c, i32 %n) { ; AVX512VNNI-LABEL: vpdpbusd_2xi32: ; AVX512VNNI: # %bb.0: # %entry ; AVX512VNNI-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero -; AVX512VNNI-NEXT: vmovdqa {{.*#+}} xmm1 = [65535,0,0,0] +; AVX512VNNI-NEXT: vmovd {{.*#+}} xmm1 = [65535,0,0,0] ; AVX512VNNI-NEXT: vpandq %zmm1, %zmm0, %zmm0 ; AVX512VNNI-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero ; AVX512VNNI-NEXT: vpandq %zmm1, %zmm2, %zmm1 diff --git a/llvm/test/CodeGen/X86/dpbusd_const.ll b/llvm/test/CodeGen/X86/dpbusd_const.ll index 2fe9feb06dff67..5862e614265b1f 100644 --- a/llvm/test/CodeGen/X86/dpbusd_const.ll +++ b/llvm/test/CodeGen/X86/dpbusd_const.ll @@ -108,7 +108,7 @@ define i32 @mul_4xi8_cs(<4 x i8> %a, i32 %c) { ; AVXVNNI: # %bb.0: # %entry ; AVXVNNI-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVXVNNI-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7] -; AVXVNNI-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,2,255,0,0,0,0,0,0,0,0,0,0,0,0] +; AVXVNNI-NEXT: vmovd {{.*#+}} xmm2 = [0,1,2,255,0,0,0,0,0,0,0,0,0,0,0,0] ; AVXVNNI-NEXT: {vex} vpdpbusd %xmm0, %xmm2, %xmm1 ; AVXVNNI-NEXT: vmovd %xmm1, %eax ; AVXVNNI-NEXT: addl %edi, %eax @@ -118,7 +118,7 @@ define i32 @mul_4xi8_cs(<4 x i8> %a, i32 %c) { ; AVX512VNNI: # %bb.0: # %entry ; AVX512VNNI-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX512VNNI-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7] -; AVX512VNNI-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,2,255,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX512VNNI-NEXT: vmovd {{.*#+}} xmm1 = [0,1,2,255,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX512VNNI-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX512VNNI-NEXT: vpdpbusd %zmm0, %zmm1, %zmm2 ; AVX512VNNI-NEXT: vmovd %xmm2, %eax @@ -130,7 +130,7 @@ define i32 @mul_4xi8_cs(<4 x i8> %a, i32 %c) { ; AVX512VLVNNI: # %bb.0: # %entry ; AVX512VLVNNI-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX512VLVNNI-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7] -; AVX512VLVNNI-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,2,255,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX512VLVNNI-NEXT: vmovd {{.*#+}} xmm1 = [0,1,2,255,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX512VLVNNI-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX512VLVNNI-NEXT: vpdpbusd %xmm0, %xmm1, %xmm2 ; AVX512VLVNNI-NEXT: vmovd %xmm2, %eax diff --git a/llvm/test/CodeGen/X86/expand-vp-cast-intrinsics.ll b/llvm/test/CodeGen/X86/expand-vp-cast-intrinsics.ll index 3b015acb69bd2e..0a52dfff71eda4 100644 --- a/llvm/test/CodeGen/X86/expand-vp-cast-intrinsics.ll +++ b/llvm/test/CodeGen/X86/expand-vp-cast-intrinsics.ll @@ -532,7 +532,7 @@ define <2 x half> @vfptrunc_v2f16_v2f64(<2 x double> %a, <2 x i1> %m, i32 zeroex ; AVX512-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill ; AVX512-NEXT: callq __truncdfhf2@PLT ; AVX512-NEXT: vpbroadcastw %xmm0, %xmm1 -; AVX512-NEXT: vmovaps {{.*#+}} xmm0 = [4,0,0,0] +; AVX512-NEXT: vmovss {{.*#+}} xmm0 = [4,0,0,0] ; AVX512-NEXT: vpermi2ps (%rsp), %xmm1, %xmm0 # 16-byte Folded Reload ; 
AVX512-NEXT: addq $40, %rsp ; AVX512-NEXT: .cfi_def_cfa_offset 8 diff --git a/llvm/test/CodeGen/X86/fcmp-constant.ll b/llvm/test/CodeGen/X86/fcmp-constant.ll index 481a32b39dd377..335cb28213f929 100644 --- a/llvm/test/CodeGen/X86/fcmp-constant.ll +++ b/llvm/test/CodeGen/X86/fcmp-constant.ll @@ -92,7 +92,7 @@ define <2 x i64> @fcmp_ueq_v2f64_undef() { define <2 x i64> @fcmp_ueq_v2f64_undef_elt() { ; CHECK-LABEL: fcmp_ueq_v2f64_undef_elt: ; CHECK: # %bb.0: -; CHECK-NEXT: movaps {{.*#+}} xmm0 = [18446744073709551615,0] +; CHECK-NEXT: movsd {{.*#+}} xmm0 = [18446744073709551615,0] ; CHECK-NEXT: retq %1 = fcmp ueq <2 x double> , %2 = sext <2 x i1> %1 to <2 x i64> diff --git a/llvm/test/CodeGen/X86/fold-vector-sext-zext.ll b/llvm/test/CodeGen/X86/fold-vector-sext-zext.ll index 3f8bd24c380492..d31168f4078901 100644 --- a/llvm/test/CodeGen/X86/fold-vector-sext-zext.ll +++ b/llvm/test/CodeGen/X86/fold-vector-sext-zext.ll @@ -11,14 +11,12 @@ define <4 x i16> @test_sext_4i8_4i16() { ; X32-LABEL: test_sext_4i8_4i16: ; X32: # %bb.0: -; X32-NEXT: vmovddup {{.*#+}} xmm0 = [0,65535,2,65533,0,65535,2,65533] -; X32-NEXT: # xmm0 = mem[0,0] +; X32-NEXT: vmovsd {{.*#+}} xmm0 = [0,65535,2,65533,0,0,0,0] ; X32-NEXT: retl ; ; X64-LABEL: test_sext_4i8_4i16: ; X64: # %bb.0: -; X64-NEXT: vmovddup {{.*#+}} xmm0 = [0,65535,2,65533,0,65535,2,65533] -; X64-NEXT: # xmm0 = mem[0,0] +; X64-NEXT: vmovsd {{.*#+}} xmm0 = [0,65535,2,65533,0,0,0,0] ; X64-NEXT: retq %1 = insertelement <4 x i8> undef, i8 0, i32 0 %2 = insertelement <4 x i8> %1, i8 -1, i32 1 @@ -31,14 +29,12 @@ define <4 x i16> @test_sext_4i8_4i16() { define <4 x i16> @test_sext_4i8_4i16_undef() { ; X32-LABEL: test_sext_4i8_4i16_undef: ; X32: # %bb.0: -; X32-NEXT: vmovddup {{.*#+}} xmm0 = [0,65535,0,65533,0,65535,0,65533] -; X32-NEXT: # xmm0 = mem[0,0] +; X32-NEXT: vmovsd {{.*#+}} xmm0 = [0,65535,0,65533,0,0,0,0] ; X32-NEXT: retl ; ; X64-LABEL: test_sext_4i8_4i16_undef: ; X64: # %bb.0: -; X64-NEXT: vmovddup {{.*#+}} xmm0 = [0,65535,0,65533,0,65535,0,65533] -; X64-NEXT: # xmm0 = mem[0,0] +; X64-NEXT: vmovsd {{.*#+}} xmm0 = [0,65535,0,65533,0,0,0,0] ; X64-NEXT: retq %1 = insertelement <4 x i8> undef, i8 undef, i32 0 %2 = insertelement <4 x i8> %1, i8 -1, i32 1 @@ -211,14 +207,12 @@ define <8 x i32> @test_sext_8i8_8i32_undef() { define <4 x i16> @test_zext_4i8_4i16() { ; X32-LABEL: test_zext_4i8_4i16: ; X32: # %bb.0: -; X32-NEXT: vmovddup {{.*#+}} xmm0 = [0,255,2,253,0,255,2,253] -; X32-NEXT: # xmm0 = mem[0,0] +; X32-NEXT: vmovsd {{.*#+}} xmm0 = [0,255,2,253,0,0,0,0] ; X32-NEXT: retl ; ; X64-LABEL: test_zext_4i8_4i16: ; X64: # %bb.0: -; X64-NEXT: vmovddup {{.*#+}} xmm0 = [0,255,2,253,0,255,2,253] -; X64-NEXT: # xmm0 = mem[0,0] +; X64-NEXT: vmovsd {{.*#+}} xmm0 = [0,255,2,253,0,0,0,0] ; X64-NEXT: retq %1 = insertelement <4 x i8> undef, i8 0, i32 0 %2 = insertelement <4 x i8> %1, i8 -1, i32 1 @@ -267,14 +261,12 @@ define <4 x i64> @test_zext_4i8_4i64() { define <4 x i16> @test_zext_4i8_4i16_undef() { ; X32-LABEL: test_zext_4i8_4i16_undef: ; X32: # %bb.0: -; X32-NEXT: vmovddup {{.*#+}} xmm0 = [0,255,0,253,0,255,0,253] -; X32-NEXT: # xmm0 = mem[0,0] +; X32-NEXT: vmovsd {{.*#+}} xmm0 = [0,255,0,253,0,0,0,0] ; X32-NEXT: retl ; ; X64-LABEL: test_zext_4i8_4i16_undef: ; X64: # %bb.0: -; X64-NEXT: vmovddup {{.*#+}} xmm0 = [0,255,0,253,0,255,0,253] -; X64-NEXT: # xmm0 = mem[0,0] +; X64-NEXT: vmovsd {{.*#+}} xmm0 = [0,255,0,253,0,0,0,0] ; X64-NEXT: retq %1 = insertelement <4 x i8> undef, i8 undef, i32 0 %2 = insertelement <4 x i8> %1, i8 -1, i32 1 diff --git 
a/llvm/test/CodeGen/X86/half.ll b/llvm/test/CodeGen/X86/half.ll index b42f6fdea34b65..d0853fdc748d29 100644 --- a/llvm/test/CodeGen/X86/half.ll +++ b/llvm/test/CodeGen/X86/half.ll @@ -2116,7 +2116,7 @@ define void @pr63114() { ; CHECK-LIBCALL-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,2,1] ; CHECK-LIBCALL-NEXT: movdqa {{.*#+}} xmm1 = [65535,65535,65535,0,65535,65535,65535,65535] ; CHECK-LIBCALL-NEXT: pand %xmm1, %xmm0 -; CHECK-LIBCALL-NEXT: movdqa {{.*#+}} xmm2 = [0,0,0,15360,0,0,0,0] +; CHECK-LIBCALL-NEXT: movq {{.*#+}} xmm2 = [0,0,0,15360,0,0,0,0] ; CHECK-LIBCALL-NEXT: por %xmm2, %xmm0 ; CHECK-LIBCALL-NEXT: movdqa {{.*#+}} xmm3 = [65535,65535,65535,65535,65535,65535,65535,0] ; CHECK-LIBCALL-NEXT: pand %xmm3, %xmm0 @@ -2181,7 +2181,7 @@ define void @pr63114() { ; CHECK-I686-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,2,1] ; CHECK-I686-NEXT: movdqa {{.*#+}} xmm1 = [65535,65535,65535,0,65535,65535,65535,65535] ; CHECK-I686-NEXT: pand %xmm1, %xmm0 -; CHECK-I686-NEXT: movdqa {{.*#+}} xmm2 = [0,0,0,15360,0,0,0,0] +; CHECK-I686-NEXT: movq {{.*#+}} xmm2 = [0,0,0,15360,0,0,0,0] ; CHECK-I686-NEXT: por %xmm2, %xmm0 ; CHECK-I686-NEXT: movdqa {{.*#+}} xmm3 = [65535,65535,65535,65535,65535,65535,65535,0] ; CHECK-I686-NEXT: pand %xmm3, %xmm0 diff --git a/llvm/test/CodeGen/X86/icmp-pow2-mask.ll b/llvm/test/CodeGen/X86/icmp-pow2-mask.ll index e2b3a23827a018..6d2866f50c6c7c 100644 --- a/llvm/test/CodeGen/X86/icmp-pow2-mask.ll +++ b/llvm/test/CodeGen/X86/icmp-pow2-mask.ll @@ -21,7 +21,7 @@ define <8 x i16> @pow2_mask_v16i8(i8 zeroext %0) { ; SSE41-NEXT: movd %edi, %xmm0 ; SSE41-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; SSE41-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] -; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [128,64,32,16,8,4,2,1,u,u,u,u,u,u,u,u] +; SSE41-NEXT: movq {{.*#+}} xmm1 = [128,64,32,16,8,4,2,1,0,0,0,0,0,0,0,0] ; SSE41-NEXT: pand %xmm1, %xmm0 ; SSE41-NEXT: pcmpeqb %xmm1, %xmm0 ; SSE41-NEXT: pmovsxbw %xmm0, %xmm0 @@ -97,7 +97,7 @@ define i64 @pow2_mask_v8i8(i8 zeroext %0) { ; SSE-NEXT: movd %edi, %xmm0 ; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] -; SSE-NEXT: movdqa {{.*#+}} xmm1 = [128,64,32,16,8,4,2,1,u,u,u,u,u,u,u,u] +; SSE-NEXT: movq {{.*#+}} xmm1 = [128,64,32,16,8,4,2,1,0,0,0,0,0,0,0,0] ; SSE-NEXT: pand %xmm1, %xmm0 ; SSE-NEXT: pcmpeqb %xmm1, %xmm0 ; SSE-NEXT: movq %xmm0, %rax diff --git a/llvm/test/CodeGen/X86/insert-into-constant-vector.ll b/llvm/test/CodeGen/X86/insert-into-constant-vector.ll index 418d632480328f..bc977e006606e8 100644 --- a/llvm/test/CodeGen/X86/insert-into-constant-vector.ll +++ b/llvm/test/CodeGen/X86/insert-into-constant-vector.ll @@ -375,7 +375,7 @@ define <8 x i64> @elt5_v8i64(i64 %x) { ; X86-SSE-LABEL: elt5_v8i64: ; X86-SSE: # %bb.0: ; X86-SSE-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero -; X86-SSE-NEXT: movaps {{.*#+}} xmm2 = [4,0,0,0] +; X86-SSE-NEXT: movss {{.*#+}} xmm2 = [4,0,0,0] ; X86-SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0] ; X86-SSE-NEXT: movaps {{.*#+}} xmm0 = [42,0,1,0] ; X86-SSE-NEXT: movaps {{.*#+}} xmm1 = [2,0,3,0] @@ -404,7 +404,7 @@ define <8 x i64> @elt5_v8i64(i64 %x) { ; X86-AVX1-LABEL: elt5_v8i64: ; X86-AVX1: # %bb.0: ; X86-AVX1-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero -; X86-AVX1-NEXT: vmovaps {{.*#+}} xmm1 = [4,0,0,0] +; X86-AVX1-NEXT: vmovss {{.*#+}} xmm1 = [4,0,0,0] ; X86-AVX1-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0] ; X86-AVX1-NEXT: vinsertf128 $1, {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm1 ; X86-AVX1-NEXT: vmovaps {{.*#+}} 
ymm0 = [42,0,1,0,2,0,3,0] @@ -421,7 +421,7 @@ define <8 x i64> @elt5_v8i64(i64 %x) { ; X86-AVX2-LABEL: elt5_v8i64: ; X86-AVX2: # %bb.0: ; X86-AVX2-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero -; X86-AVX2-NEXT: vmovaps {{.*#+}} xmm1 = [4,0,0,0] +; X86-AVX2-NEXT: vmovss {{.*#+}} xmm1 = [4,0,0,0] ; X86-AVX2-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0] ; X86-AVX2-NEXT: vinsertf128 $1, {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm1 ; X86-AVX2-NEXT: vmovaps {{.*#+}} ymm0 = [42,0,1,0,2,0,3,0] @@ -439,7 +439,7 @@ define <8 x i64> @elt5_v8i64(i64 %x) { ; X86-AVX512F: # %bb.0: ; X86-AVX512F-NEXT: vmovaps {{.*#+}} ymm0 = [42,0,1,0,2,0,3,0] ; X86-AVX512F-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero -; X86-AVX512F-NEXT: vmovaps {{.*#+}} xmm2 = [4,0,0,0] +; X86-AVX512F-NEXT: vmovss {{.*#+}} xmm2 = [4,0,0,0] ; X86-AVX512F-NEXT: vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm1[0] ; X86-AVX512F-NEXT: vinsertf128 $1, {{\.?LCPI[0-9]+_[0-9]+}}, %ymm1, %ymm1 ; X86-AVX512F-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0 diff --git a/llvm/test/CodeGen/X86/insertelement-ones.ll b/llvm/test/CodeGen/X86/insertelement-ones.ll index ed7cbb0a8430d9..cfcb8798e0b9cc 100644 --- a/llvm/test/CodeGen/X86/insertelement-ones.ll +++ b/llvm/test/CodeGen/X86/insertelement-ones.ll @@ -280,7 +280,7 @@ define <16 x i16> @insert_v16i16_x12345x789ABCDEx(<16 x i16> %a) { ; ; AVX1-LABEL: insert_v16i16_x12345x789ABCDEx: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovaps {{.*#+}} xmm1 = [65535,0,0,0] +; AVX1-NEXT: vmovss {{.*#+}} xmm1 = [65535,0,0,0] ; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0 ; AVX1-NEXT: vorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; AVX1-NEXT: vorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 @@ -385,7 +385,7 @@ define <32 x i8> @insert_v32i8_x123456789ABCDEzGHIJKLMNOPQRSTxx(<32 x i8> %a) { ; ; AVX1-LABEL: insert_v32i8_x123456789ABCDEzGHIJKLMNOPQRSTxx: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovaps {{.*#+}} xmm1 = [255,0,0,0] +; AVX1-NEXT: vmovss {{.*#+}} xmm1 = [255,0,0,0] ; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0 ; AVX1-NEXT: vorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; AVX1-NEXT: vorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 diff --git a/llvm/test/CodeGen/X86/masked_gather_scatter_widen.ll b/llvm/test/CodeGen/X86/masked_gather_scatter_widen.ll index f63545bcd178e5..aad1b443448503 100644 --- a/llvm/test/CodeGen/X86/masked_gather_scatter_widen.ll +++ b/llvm/test/CodeGen/X86/masked_gather_scatter_widen.ll @@ -739,7 +739,7 @@ define <17 x float> @test_mgather_v17f32(ptr %base, <17 x i32> %index) ; WIDEN_AVX2-NEXT: vgatherdps %ymm5, (%rsi,%ymm1,4), %ymm6 ; WIDEN_AVX2-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; WIDEN_AVX2-NEXT: vgatherdps %ymm3, (%rsi,%ymm0,4), %ymm1 -; WIDEN_AVX2-NEXT: vmovaps {{.*#+}} xmm0 = [4294967295,0,0,0] +; WIDEN_AVX2-NEXT: vmovss {{.*#+}} xmm0 = [4294967295,0,0,0] ; WIDEN_AVX2-NEXT: vgatherdps %ymm0, (%rsi,%ymm2,4), %ymm4 ; WIDEN_AVX2-NEXT: vmovss %xmm4, 64(%rdi) ; WIDEN_AVX2-NEXT: vmovaps %ymm1, 32(%rdi) diff --git a/llvm/test/CodeGen/X86/masked_store_trunc.ll b/llvm/test/CodeGen/X86/masked_store_trunc.ll index f78646afccb345..6c21bb4d99e748 100644 --- a/llvm/test/CodeGen/X86/masked_store_trunc.ll +++ b/llvm/test/CodeGen/X86/masked_store_trunc.ll @@ -1395,7 +1395,7 @@ define void @truncstore_v4i64_v4i8(<4 x i64> %x, ptr %p, <4 x i32> %mask) { ; SSE4-LABEL: truncstore_v4i64_v4i8: ; SSE4: # %bb.0: ; SSE4-NEXT: pxor %xmm3, %xmm3 -; SSE4-NEXT: movdqa {{.*#+}} xmm4 = [0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; SSE4-NEXT: movd {{.*#+}} xmm4 = [0,8,0,0,0,0,0,0,0,0,0,0,0,0,0,0] ; SSE4-NEXT: pshufb %xmm4, %xmm1 ; SSE4-NEXT: pshufb %xmm4, %xmm0 ; SSE4-NEXT: 
punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] @@ -1435,7 +1435,7 @@ define void @truncstore_v4i64_v4i8(<4 x i64> %x, ptr %p, <4 x i32> %mask) { ; AVX1: # %bb.0: ; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 -; AVX1-NEXT: vbroadcastss {{.*#+}} xmm4 = [0,8,0,0,0,8,0,0,0,8,0,0,0,8,0,0] +; AVX1-NEXT: vmovd {{.*#+}} xmm4 = [0,8,0,0,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX1-NEXT: vpshufb %xmm4, %xmm3, %xmm3 ; AVX1-NEXT: vpshufb %xmm4, %xmm0, %xmm0 ; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3] @@ -3791,7 +3791,7 @@ define void @truncstore_v8i32_v8i8(<8 x i32> %x, ptr %p, <8 x i32> %mask) { ; AVX1-LABEL: truncstore_v8i32_v8i8: ; AVX1: # %bb.0: ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 -; AVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] +; AVX1-NEXT: vmovd {{.*#+}} xmm3 = [0,4,8,12,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX1-NEXT: vpshufb %xmm3, %xmm2, %xmm2 ; AVX1-NEXT: vpshufb %xmm3, %xmm0, %xmm0 ; AVX1-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] @@ -3865,7 +3865,7 @@ define void @truncstore_v8i32_v8i8(<8 x i32> %x, ptr %p, <8 x i32> %mask) { ; AVX2: # %bb.0: ; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm3 -; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm4 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] +; AVX2-NEXT: vmovd {{.*#+}} xmm4 = [0,4,8,12,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX2-NEXT: vpshufb %xmm4, %xmm3, %xmm3 ; AVX2-NEXT: vpshufb %xmm4, %xmm0, %xmm0 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] diff --git a/llvm/test/CodeGen/X86/memcmp-more-load-pairs.ll b/llvm/test/CodeGen/X86/memcmp-more-load-pairs.ll index 6eb02bfc1fd0c3..da46ea40655791 100644 --- a/llvm/test/CodeGen/X86/memcmp-more-load-pairs.ll +++ b/llvm/test/CodeGen/X86/memcmp-more-load-pairs.ll @@ -947,7 +947,7 @@ define i1 @length24_eq_const(ptr %X) nounwind { ; X64-MIC-AVX: # %bb.0: ; X64-MIC-AVX-NEXT: vmovdqu (%rdi), %xmm0 ; X64-MIC-AVX-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero -; X64-MIC-AVX-NEXT: vmovdqa {{.*#+}} xmm2 = [959985462,858927408,0,0] +; X64-MIC-AVX-NEXT: vmovq {{.*#+}} xmm2 = [959985462,858927408,0,0] ; X64-MIC-AVX-NEXT: vpcmpneqd %zmm2, %zmm1, %k0 ; X64-MIC-AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [858927408,926299444,825243960,892613426] ; X64-MIC-AVX-NEXT: vpcmpneqd %zmm1, %zmm0, %k1 diff --git a/llvm/test/CodeGen/X86/memcmp.ll b/llvm/test/CodeGen/X86/memcmp.ll index f5e7384362a92b..83cb0d6f973be5 100644 --- a/llvm/test/CodeGen/X86/memcmp.ll +++ b/llvm/test/CodeGen/X86/memcmp.ll @@ -1015,7 +1015,7 @@ define i1 @length24_eq_const(ptr %X) nounwind { ; X64-MIC-AVX: # %bb.0: ; X64-MIC-AVX-NEXT: vmovdqu (%rdi), %xmm0 ; X64-MIC-AVX-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero -; X64-MIC-AVX-NEXT: vmovdqa {{.*#+}} xmm2 = [959985462,858927408,0,0] +; X64-MIC-AVX-NEXT: vmovq {{.*#+}} xmm2 = [959985462,858927408,0,0] ; X64-MIC-AVX-NEXT: vpcmpneqd %zmm2, %zmm1, %k0 ; X64-MIC-AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [858927408,926299444,825243960,892613426] ; X64-MIC-AVX-NEXT: vpcmpneqd %zmm1, %zmm0, %k1 diff --git a/llvm/test/CodeGen/X86/pmaddubsw.ll b/llvm/test/CodeGen/X86/pmaddubsw.ll index ea0b4e4b21c775..e46a14673a5171 100644 --- a/llvm/test/CodeGen/X86/pmaddubsw.ll +++ b/llvm/test/CodeGen/X86/pmaddubsw.ll @@ -320,8 +320,7 @@ define <8 x i16> @pmaddubsw_bad_extend(ptr %Aptr, ptr %Bptr) { ; AVX1: # %bb.0: ; AVX1-NEXT: vmovdqa (%rdi), %xmm0 ; AVX1-NEXT: vmovdqa (%rsi), %xmm1 -; AVX1-NEXT: vmovddup {{.*#+}} xmm2 = 
[0,2,4,6,8,10,12,14,0,2,4,6,8,10,12,14] -; AVX1-NEXT: # xmm2 = mem[0,0] +; AVX1-NEXT: vmovq {{.*#+}} xmm2 = [0,2,4,6,8,10,12,14,0,0,0,0,0,0,0,0] ; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm3 ; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm2 ; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u] @@ -349,9 +348,9 @@ define <8 x i16> @pmaddubsw_bad_extend(ptr %Aptr, ptr %Bptr) { ; AVX256: # %bb.0: ; AVX256-NEXT: vmovdqa (%rdi), %xmm0 ; AVX256-NEXT: vmovdqa (%rsi), %xmm1 -; AVX256-NEXT: vpbroadcastq {{.*#+}} xmm2 = [0,2,4,6,8,10,12,14,0,2,4,6,8,10,12,14] +; AVX256-NEXT: vmovq {{.*#+}} xmm2 = [0,2,4,6,8,10,12,14,0,0,0,0,0,0,0,0] ; AVX256-NEXT: vpshufb %xmm2, %xmm0, %xmm3 -; AVX256-NEXT: vpbroadcastq {{.*#+}} xmm4 = [1,3,5,7,9,11,13,15,1,3,5,7,9,11,13,15] +; AVX256-NEXT: vmovq {{.*#+}} xmm4 = [1,3,5,7,9,11,13,15,0,0,0,0,0,0,0,0] ; AVX256-NEXT: vpshufb %xmm4, %xmm0, %xmm0 ; AVX256-NEXT: vpshufb %xmm2, %xmm1, %xmm2 ; AVX256-NEXT: vpshufb %xmm4, %xmm1, %xmm1 diff --git a/llvm/test/CodeGen/X86/pr46532.ll b/llvm/test/CodeGen/X86/pr46532.ll index ffaa131d5cbb1b..cbc677229ede61 100644 --- a/llvm/test/CodeGen/X86/pr46532.ll +++ b/llvm/test/CodeGen/X86/pr46532.ll @@ -7,7 +7,7 @@ define void @WhileWithLoopInvariantOperation.21() { ; CHECK-NEXT: movq (%rax), %rax ; CHECK-NEXT: vxorps %xmm0, %xmm0, %xmm0 ; CHECK-NEXT: vmovaps %xmm0, 32(%rax) -; CHECK-NEXT: vmovaps {{.*#+}} xmm0 = [4294967295,4294967295,0,0] +; CHECK-NEXT: vmovsd {{.*#+}} xmm0 = [4294967295,4294967295,0,0] ; CHECK-NEXT: vmaskmovps %ymm0, %ymm0, (%rax) while.1.body.preheader: %0 = load ptr, ptr undef, align 8, !invariant.load !0, !dereferenceable !1, !align !2 diff --git a/llvm/test/CodeGen/X86/pr63108.ll b/llvm/test/CodeGen/X86/pr63108.ll index 38e45595a8846e..b1576851ed0236 100644 --- a/llvm/test/CodeGen/X86/pr63108.ll +++ b/llvm/test/CodeGen/X86/pr63108.ll @@ -11,11 +11,11 @@ define i32 @PR63108() { ; SSE-NEXT: testb %al, %al ; SSE-NEXT: je .LBB0_2 ; SSE-NEXT: # %bb.1: -; SSE-NEXT: movdqa {{.*#+}} xmm0 = [251,223,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; SSE-NEXT: movd {{.*#+}} xmm0 = [57339,0,0,0] ; SSE-NEXT: jmp .LBB0_5 ; SSE-NEXT: .LBB0_2: # %vector.body.preheader ; SSE-NEXT: pxor %xmm0, %xmm0 -; SSE-NEXT: movdqa {{.*#+}} xmm1 = [57339,0,0,0] +; SSE-NEXT: movd {{.*#+}} xmm1 = [57339,0,0,0] ; SSE-NEXT: xorl %eax, %eax ; SSE-NEXT: .p2align 4, 0x90 ; SSE-NEXT: .LBB0_3: # %vector.body @@ -47,10 +47,10 @@ define i32 @PR63108() { ; AVX1-NEXT: testb %al, %al ; AVX1-NEXT: je .LBB0_2 ; AVX1-NEXT: # %bb.1: -; AVX1-NEXT: vbroadcastss {{.*#+}} xmm0 = [251,223,0,0,251,223,0,0,251,223,0,0,251,223,0,0] +; AVX1-NEXT: vmovd {{.*#+}} xmm0 = [57339,0,0,0] ; AVX1-NEXT: jmp .LBB0_5 ; AVX1-NEXT: .LBB0_2: # %vector.body.preheader -; AVX1-NEXT: vmovaps {{.*#+}} xmm0 = [57339,0,0,0] +; AVX1-NEXT: vmovss {{.*#+}} xmm0 = [57339,0,0,0] ; AVX1-NEXT: xorl %eax, %eax ; AVX1-NEXT: .p2align 4, 0x90 ; AVX1-NEXT: .LBB0_3: # %vector.body @@ -87,7 +87,7 @@ define i32 @PR63108() { ; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm0 = [251,223,251,223,251,223,251,223,251,223,251,223,251,223,251,223] ; AVX2-NEXT: jmp .LBB0_5 ; AVX2-NEXT: .LBB0_2: # %vector.body.preheader -; AVX2-NEXT: vmovdqa {{.*#+}} xmm0 = [57339,0,0,0] +; AVX2-NEXT: vmovd {{.*#+}} xmm0 = [57339,0,0,0] ; AVX2-NEXT: xorl %eax, %eax ; AVX2-NEXT: .p2align 4, 0x90 ; AVX2-NEXT: .LBB0_3: # %vector.body @@ -124,7 +124,7 @@ define i32 @PR63108() { ; AVX512-NEXT: vpbroadcastw {{.*#+}} xmm0 = [251,223,251,223,251,223,251,223,251,223,251,223,251,223,251,223] ; AVX512-NEXT: jmp .LBB0_5 ; AVX512-NEXT: .LBB0_2: # 
%vector.body.preheader -; AVX512-NEXT: vmovdqa {{.*#+}} xmm0 = [57339,0,0,0] +; AVX512-NEXT: vmovd {{.*#+}} xmm0 = [57339,0,0,0] ; AVX512-NEXT: xorl %eax, %eax ; AVX512-NEXT: .p2align 4, 0x90 ; AVX512-NEXT: .LBB0_3: # %vector.body diff --git a/llvm/test/CodeGen/X86/pr74736.ll b/llvm/test/CodeGen/X86/pr74736.ll index 3dfdbf102c953e..1c3b4bd4971c11 100644 --- a/llvm/test/CodeGen/X86/pr74736.ll +++ b/llvm/test/CodeGen/X86/pr74736.ll @@ -6,7 +6,7 @@ define void @main(<16 x i32> %0, i32 %1) { ; SSE-LABEL: main: ; SSE: # %bb.0: # %entry ; SSE-NEXT: movd %edi, %xmm4 -; SSE-NEXT: movaps {{.*#+}} xmm0 = [1,0,0,0] +; SSE-NEXT: movss {{.*#+}} xmm0 = [1,0,0,0] ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm4[1,0] ; SSE-NEXT: paddd %xmm0, %xmm0 ; SSE-NEXT: paddd %xmm1, %xmm1 diff --git a/llvm/test/CodeGen/X86/prefer-avx256-lzcnt.ll b/llvm/test/CodeGen/X86/prefer-avx256-lzcnt.ll index bbe46a99ffa414..5f13e974874351 100644 --- a/llvm/test/CodeGen/X86/prefer-avx256-lzcnt.ll +++ b/llvm/test/CodeGen/X86/prefer-avx256-lzcnt.ll @@ -38,7 +38,7 @@ define <8 x i16> @testv8i16(<8 x i16> %in) { define <16 x i8> @testv16i8(<16 x i8> %in) { ; AVX256-LABEL: testv16i8: ; AVX256: # %bb.0: -; AVX256-NEXT: vmovdqa {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; AVX256-NEXT: vmovq {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] ; AVX256-NEXT: vpshufb %xmm0, %xmm1, %xmm2 ; AVX256-NEXT: vpsrlw $4, %xmm0, %xmm0 ; AVX256-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0 diff --git a/llvm/test/CodeGen/X86/pshufb-mask-comments.ll b/llvm/test/CodeGen/X86/pshufb-mask-comments.ll index ef91e7a3f91075..b96338984d6f55 100644 --- a/llvm/test/CodeGen/X86/pshufb-mask-comments.ll +++ b/llvm/test/CodeGen/X86/pshufb-mask-comments.ll @@ -54,7 +54,7 @@ define <16 x i8> @test4(<16 x i8> %V, ptr %P) { define <16 x i8> @test5(<16 x i8> %V) { ; CHECK-LABEL: test5: ; CHECK: # %bb.0: -; CHECK-NEXT: movaps {{.*#+}} xmm1 = [1,0,0,0] +; CHECK-NEXT: movss {{.*#+}} xmm1 = [1,0,0,0] ; CHECK-NEXT: movaps %xmm1, (%rax) ; CHECK-NEXT: movaps {{.*#+}} xmm1 = [1,1] ; CHECK-NEXT: movaps %xmm1, (%rax) diff --git a/llvm/test/CodeGen/X86/ret-mmx.ll b/llvm/test/CodeGen/X86/ret-mmx.ll index 815f95e64496b0..81dd73363c1fb1 100644 --- a/llvm/test/CodeGen/X86/ret-mmx.ll +++ b/llvm/test/CodeGen/X86/ret-mmx.ll @@ -32,7 +32,7 @@ define <1 x i64> @t2() nounwind { define <2 x i32> @t3() nounwind { ; CHECK-LABEL: t3: ; CHECK: ## %bb.0: -; CHECK-NEXT: movaps {{.*#+}} xmm0 = [1,0,0,0] +; CHECK-NEXT: movss {{.*#+}} xmm0 = [1,0,0,0] ; CHECK-NEXT: retq ret <2 x i32> } diff --git a/llvm/test/CodeGen/X86/sad.ll b/llvm/test/CodeGen/X86/sad.ll index 3d3e935045475e..d043234705aa06 100644 --- a/llvm/test/CodeGen/X86/sad.ll +++ b/llvm/test/CodeGen/X86/sad.ll @@ -544,7 +544,7 @@ define dso_local i32 @sad_2i8() nounwind { ; SSE2: # %bb.0: # %entry ; SSE2-NEXT: pxor %xmm0, %xmm0 ; SSE2-NEXT: movq $-1024, %rax # imm = 0xFC00 -; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [65535,0,0,0] +; SSE2-NEXT: movd {{.*#+}} xmm1 = [65535,0,0,0] ; SSE2-NEXT: .p2align 4, 0x90 ; SSE2-NEXT: .LBB3_1: # %vector.body ; SSE2-NEXT: # =>This Inner Loop Header: Depth=1 diff --git a/llvm/test/CodeGen/X86/sext-vsetcc.ll b/llvm/test/CodeGen/X86/sext-vsetcc.ll index 839c972b23dbab..de9d47526b166b 100644 --- a/llvm/test/CodeGen/X86/sext-vsetcc.ll +++ b/llvm/test/CodeGen/X86/sext-vsetcc.ll @@ -216,7 +216,7 @@ define <4 x i32> @cmp_ult_load_const(ptr %x) nounwind { ; SSE-LABEL: cmp_ult_load_const: ; SSE: # %bb.0: ; SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; SSE-NEXT: movdqa {{.*#+}} xmm1 = 
[42,214,0,255,u,u,u,u,u,u,u,u,u,u,u,u] +; SSE-NEXT: movd {{.*#+}} xmm1 = [42,214,0,255,0,0,0,0,0,0,0,0,0,0,0,0] ; SSE-NEXT: pmaxub %xmm0, %xmm1 ; SSE-NEXT: pcmpeqb %xmm0, %xmm1 ; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] @@ -243,7 +243,7 @@ define <3 x i32> @cmp_ult_load_const_bad_type(ptr %x) nounwind { ; SSE-LABEL: cmp_ult_load_const_bad_type: ; SSE: # %bb.0: ; SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; SSE-NEXT: movdqa {{.*#+}} xmm1 = [42,214,0,u,u,u,u,u,u,u,u,u,u,u,u,u] +; SSE-NEXT: movd {{.*#+}} xmm1 = [42,214,0,0,0,0,0,0,0,0,0,0,0,0,0,0] ; SSE-NEXT: pmaxub %xmm0, %xmm1 ; SSE-NEXT: pcmpeqb %xmm0, %xmm1 ; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] diff --git a/llvm/test/CodeGen/X86/shrink_vmul.ll b/llvm/test/CodeGen/X86/shrink_vmul.ll index 8dd06e0d848d32..2610f4322c8e2b 100644 --- a/llvm/test/CodeGen/X86/shrink_vmul.ll +++ b/llvm/test/CodeGen/X86/shrink_vmul.ll @@ -1745,7 +1745,7 @@ define void @mul_2xi16_varconst1(ptr nocapture readonly %a, i64 %index) { ; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-SSE-NEXT: movl c, %edx ; X86-SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; X86-SSE-NEXT: movdqa {{.*#+}} xmm1 = [0,65535,u,u,u,u,u,u] +; X86-SSE-NEXT: movd {{.*#+}} xmm1 = [0,65535,0,0,0,0,0,0] ; X86-SSE-NEXT: movdqa %xmm0, %xmm2 ; X86-SSE-NEXT: pmulhuw %xmm1, %xmm2 ; X86-SSE-NEXT: pmullw %xmm1, %xmm0 @@ -1768,7 +1768,7 @@ define void @mul_2xi16_varconst1(ptr nocapture readonly %a, i64 %index) { ; X64-SSE: # %bb.0: # %entry ; X64-SSE-NEXT: movq c(%rip), %rax ; X64-SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; X64-SSE-NEXT: movdqa {{.*#+}} xmm1 = [0,65535,u,u,u,u,u,u] +; X64-SSE-NEXT: movd {{.*#+}} xmm1 = [0,65535,0,0,0,0,0,0] ; X64-SSE-NEXT: movdqa %xmm0, %xmm2 ; X64-SSE-NEXT: pmulhuw %xmm1, %xmm2 ; X64-SSE-NEXT: pmullw %xmm1, %xmm0 diff --git a/llvm/test/CodeGen/X86/shuffle-half.ll b/llvm/test/CodeGen/X86/shuffle-half.ll index 0d27fc38967668..291fe841043ed4 100644 --- a/llvm/test/CodeGen/X86/shuffle-half.ll +++ b/llvm/test/CodeGen/X86/shuffle-half.ll @@ -10,7 +10,7 @@ define <32 x half> @dump_vec() { ; CHECK-NEXT: jne .LBB0_2 ; CHECK-NEXT: # %bb.1: # %cond.load ; CHECK-NEXT: vpinsrw $0, (%rax), %xmm0, %xmm0 -; CHECK-NEXT: vmovdqa {{.*#+}} xmm1 = [65535,0,0,0] +; CHECK-NEXT: vmovd {{.*#+}} xmm1 = [65535,0,0,0] ; CHECK-NEXT: vpand %ymm0, %ymm1, %ymm0 ; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; CHECK-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 diff --git a/llvm/test/CodeGen/X86/shuffle-strided-with-offset-256.ll b/llvm/test/CodeGen/X86/shuffle-strided-with-offset-256.ll index 42f1bd7824909b..f632654f89e04a 100644 --- a/llvm/test/CodeGen/X86/shuffle-strided-with-offset-256.ll +++ b/llvm/test/CodeGen/X86/shuffle-strided-with-offset-256.ll @@ -12,34 +12,22 @@ ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512BWVL define void @shuffle_v32i8_to_v16i8_1(ptr %L, ptr %S) nounwind { -; AVX1-LABEL: shuffle_v32i8_to_v16i8_1: -; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa (%rdi), %xmm0 -; AVX1-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX1-NEXT: vmovddup {{.*#+}} xmm2 = [1,3,5,7,9,11,13,15,1,3,5,7,9,11,13,15] -; AVX1-NEXT: # xmm2 = mem[0,0] -; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; AVX1-NEXT: vmovdqa %xmm0, (%rsi) -; AVX1-NEXT: retq -; -; AVX2-LABEL: shuffle_v32i8_to_v16i8_1: -; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa (%rdi), 
%xmm0 -; AVX2-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm2 = [1,3,5,7,9,11,13,15,1,3,5,7,9,11,13,15] -; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; AVX2-NEXT: vmovdqa %xmm0, (%rsi) -; AVX2-NEXT: retq +; AVX-LABEL: shuffle_v32i8_to_v16i8_1: +; AVX: # %bb.0: +; AVX-NEXT: vmovdqa (%rdi), %xmm0 +; AVX-NEXT: vmovdqa 16(%rdi), %xmm1 +; AVX-NEXT: vmovq {{.*#+}} xmm2 = [1,3,5,7,9,11,13,15,0,0,0,0,0,0,0,0] +; AVX-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX-NEXT: vmovdqa %xmm0, (%rsi) +; AVX-NEXT: retq ; ; AVX512F-LABEL: shuffle_v32i8_to_v16i8_1: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512F-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX512F-NEXT: vpbroadcastq {{.*#+}} xmm2 = [1,3,5,7,9,11,13,15,1,3,5,7,9,11,13,15] +; AVX512F-NEXT: vmovq {{.*#+}} xmm2 = [1,3,5,7,9,11,13,15,0,0,0,0,0,0,0,0] ; AVX512F-NEXT: vpshufb %xmm2, %xmm1, %xmm1 ; AVX512F-NEXT: vpshufb %xmm2, %xmm0, %xmm0 ; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] @@ -50,7 +38,7 @@ define void @shuffle_v32i8_to_v16i8_1(ptr %L, ptr %S) nounwind { ; AVX512VL: # %bb.0: ; AVX512VL-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512VL-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX512VL-NEXT: vpbroadcastq {{.*#+}} xmm2 = [1,3,5,7,9,11,13,15,1,3,5,7,9,11,13,15] +; AVX512VL-NEXT: vmovq {{.*#+}} xmm2 = [1,3,5,7,9,11,13,15,0,0,0,0,0,0,0,0] ; AVX512VL-NEXT: vpshufb %xmm2, %xmm1, %xmm1 ; AVX512VL-NEXT: vpshufb %xmm2, %xmm0, %xmm0 ; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] @@ -147,27 +135,16 @@ define void @shuffle_v8i32_to_v4i32_1(ptr %L, ptr %S) nounwind { } define void @shuffle_v32i8_to_v8i8_1(ptr %L, ptr %S) nounwind { -; AVX1-LABEL: shuffle_v32i8_to_v8i8_1: -; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa (%rdi), %xmm0 -; AVX1-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13] -; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; AVX1-NEXT: vmovq %xmm0, (%rsi) -; AVX1-NEXT: retq -; -; AVX2-LABEL: shuffle_v32i8_to_v8i8_1: -; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa (%rdi), %xmm0 -; AVX2-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13] -; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; AVX2-NEXT: vmovq %xmm0, (%rsi) -; AVX2-NEXT: retq +; AVX-LABEL: shuffle_v32i8_to_v8i8_1: +; AVX: # %bb.0: +; AVX-NEXT: vmovdqa (%rdi), %xmm0 +; AVX-NEXT: vmovdqa 16(%rdi), %xmm1 +; AVX-NEXT: vmovd {{.*#+}} xmm2 = [1,5,9,13,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; AVX-NEXT: vmovq %xmm0, (%rsi) +; AVX-NEXT: retq ; ; AVX512F-LABEL: shuffle_v32i8_to_v8i8_1: ; AVX512F: # %bb.0: @@ -207,27 +184,16 @@ define void @shuffle_v32i8_to_v8i8_1(ptr %L, ptr %S) nounwind { } define void @shuffle_v32i8_to_v8i8_2(ptr %L, ptr %S) nounwind { -; AVX1-LABEL: shuffle_v32i8_to_v8i8_2: -; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa (%rdi), %xmm0 -; AVX1-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] -; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; 
AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; AVX1-NEXT: vmovq %xmm0, (%rsi) -; AVX1-NEXT: retq -; -; AVX2-LABEL: shuffle_v32i8_to_v8i8_2: -; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa (%rdi), %xmm0 -; AVX2-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] -; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; AVX2-NEXT: vmovq %xmm0, (%rsi) -; AVX2-NEXT: retq +; AVX-LABEL: shuffle_v32i8_to_v8i8_2: +; AVX: # %bb.0: +; AVX-NEXT: vmovdqa (%rdi), %xmm0 +; AVX-NEXT: vmovdqa 16(%rdi), %xmm1 +; AVX-NEXT: vmovd {{.*#+}} xmm2 = [2,6,10,14,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; AVX-NEXT: vmovq %xmm0, (%rsi) +; AVX-NEXT: retq ; ; AVX512F-LABEL: shuffle_v32i8_to_v8i8_2: ; AVX512F: # %bb.0: @@ -267,27 +233,16 @@ define void @shuffle_v32i8_to_v8i8_2(ptr %L, ptr %S) nounwind { } define void @shuffle_v32i8_to_v8i8_3(ptr %L, ptr %S) nounwind { -; AVX1-LABEL: shuffle_v32i8_to_v8i8_3: -; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa (%rdi), %xmm0 -; AVX1-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] -; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; AVX1-NEXT: vmovq %xmm0, (%rsi) -; AVX1-NEXT: retq -; -; AVX2-LABEL: shuffle_v32i8_to_v8i8_3: -; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa (%rdi), %xmm0 -; AVX2-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] -; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; AVX2-NEXT: vmovq %xmm0, (%rsi) -; AVX2-NEXT: retq +; AVX-LABEL: shuffle_v32i8_to_v8i8_3: +; AVX: # %bb.0: +; AVX-NEXT: vmovdqa (%rdi), %xmm0 +; AVX-NEXT: vmovdqa 16(%rdi), %xmm1 +; AVX-NEXT: vmovd {{.*#+}} xmm2 = [3,7,11,15,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; AVX-NEXT: vmovq %xmm0, (%rsi) +; AVX-NEXT: retq ; ; AVX512F-LABEL: shuffle_v32i8_to_v8i8_3: ; AVX512F: # %bb.0: @@ -538,7 +493,7 @@ define void @shuffle_v32i8_to_v4i8_1(ptr %L, ptr %S) nounwind { ; AVX1: # %bb.0: ; AVX1-NEXT: vmovdqa (%rdi), %xmm0 ; AVX1-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [1,9,0,0,1,9,0,0,1,9,0,0,1,9,0,0] +; AVX1-NEXT: vmovd {{.*#+}} xmm2 = [1,9,0,0,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] @@ -598,7 +553,7 @@ define void @shuffle_v32i8_to_v4i8_2(ptr %L, ptr %S) nounwind { ; AVX1: # %bb.0: ; AVX1-NEXT: vmovdqa (%rdi), %xmm0 ; AVX1-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [2,10,0,0,2,10,0,0,2,10,0,0,2,10,0,0] +; AVX1-NEXT: vmovd {{.*#+}} xmm2 = [2,10,0,0,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = 
xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] @@ -658,7 +613,7 @@ define void @shuffle_v32i8_to_v4i8_3(ptr %L, ptr %S) nounwind { ; AVX1: # %bb.0: ; AVX1-NEXT: vmovdqa (%rdi), %xmm0 ; AVX1-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [3,11,0,0,3,11,0,0,3,11,0,0,3,11,0,0] +; AVX1-NEXT: vmovd {{.*#+}} xmm2 = [3,11,0,0,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] @@ -718,7 +673,7 @@ define void @shuffle_v32i8_to_v4i8_4(ptr %L, ptr %S) nounwind { ; AVX1: # %bb.0: ; AVX1-NEXT: vmovdqa (%rdi), %xmm0 ; AVX1-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [4,12,0,0,4,12,0,0,4,12,0,0,4,12,0,0] +; AVX1-NEXT: vmovd {{.*#+}} xmm2 = [4,12,0,0,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] @@ -778,7 +733,7 @@ define void @shuffle_v32i8_to_v4i8_5(ptr %L, ptr %S) nounwind { ; AVX1: # %bb.0: ; AVX1-NEXT: vmovdqa (%rdi), %xmm0 ; AVX1-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [5,13,0,0,5,13,0,0,5,13,0,0,5,13,0,0] +; AVX1-NEXT: vmovd {{.*#+}} xmm2 = [5,13,0,0,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] @@ -838,7 +793,7 @@ define void @shuffle_v32i8_to_v4i8_6(ptr %L, ptr %S) nounwind { ; AVX1: # %bb.0: ; AVX1-NEXT: vmovdqa (%rdi), %xmm0 ; AVX1-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [6,14,0,0,6,14,0,0,6,14,0,0,6,14,0,0] +; AVX1-NEXT: vmovd {{.*#+}} xmm2 = [6,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] @@ -898,7 +853,7 @@ define void @shuffle_v32i8_to_v4i8_7(ptr %L, ptr %S) nounwind { ; AVX1: # %bb.0: ; AVX1-NEXT: vmovdqa (%rdi), %xmm0 ; AVX1-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [7,15,0,0,7,15,0,0,7,15,0,0,7,15,0,0] +; AVX1-NEXT: vmovd {{.*#+}} xmm2 = [7,15,0,0,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] diff --git a/llvm/test/CodeGen/X86/shuffle-vs-trunc-256.ll b/llvm/test/CodeGen/X86/shuffle-vs-trunc-256.ll index 0ab141f4c2022b..05dd2344d30f7b 100644 --- a/llvm/test/CodeGen/X86/shuffle-vs-trunc-256.ll +++ b/llvm/test/CodeGen/X86/shuffle-vs-trunc-256.ll @@ -382,27 +382,16 @@ define void @trunc_v4i64_to_v4i32(ptr %L, ptr %S) nounwind { } define void @shuffle_v32i8_to_v8i8(ptr %L, ptr %S) nounwind { -; AVX1-LABEL: shuffle_v32i8_to_v8i8: -; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa (%rdi), %xmm0 -; AVX1-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] -; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; AVX1-NEXT: vmovq %xmm0, (%rsi) -; AVX1-NEXT: retq -; -; AVX2-LABEL: shuffle_v32i8_to_v8i8: -; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa (%rdi), %xmm0 -; AVX2-NEXT: 
vmovdqa 16(%rdi), %xmm1 -; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] -; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; AVX2-NEXT: vmovq %xmm0, (%rsi) -; AVX2-NEXT: retq +; AVX-LABEL: shuffle_v32i8_to_v8i8: +; AVX: # %bb.0: +; AVX-NEXT: vmovdqa (%rdi), %xmm0 +; AVX-NEXT: vmovdqa 16(%rdi), %xmm1 +; AVX-NEXT: vmovd {{.*#+}} xmm2 = [0,4,8,12,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; AVX-NEXT: vmovq %xmm0, (%rsi) +; AVX-NEXT: retq ; ; AVX512F-LABEL: shuffle_v32i8_to_v8i8: ; AVX512F: # %bb.0: @@ -447,27 +436,16 @@ define void @shuffle_v32i8_to_v8i8(ptr %L, ptr %S) nounwind { } define void @trunc_v8i32_to_v8i8(ptr %L, ptr %S) nounwind { -; AVX1-LABEL: trunc_v8i32_to_v8i8: -; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa (%rdi), %xmm0 -; AVX1-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] -; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; AVX1-NEXT: vmovq %xmm0, (%rsi) -; AVX1-NEXT: retq -; -; AVX2-LABEL: trunc_v8i32_to_v8i8: -; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa (%rdi), %xmm0 -; AVX2-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] -; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; AVX2-NEXT: vmovq %xmm0, (%rsi) -; AVX2-NEXT: retq +; AVX-LABEL: trunc_v8i32_to_v8i8: +; AVX: # %bb.0: +; AVX-NEXT: vmovdqa (%rdi), %xmm0 +; AVX-NEXT: vmovdqa 16(%rdi), %xmm1 +; AVX-NEXT: vmovd {{.*#+}} xmm2 = [0,4,8,12,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; AVX-NEXT: vmovq %xmm0, (%rsi) +; AVX-NEXT: retq ; ; AVX512F-LABEL: trunc_v8i32_to_v8i8: ; AVX512F: # %bb.0: @@ -519,7 +497,7 @@ define <2 x i64> @trunc_v8i32_to_v8i8_return_v2i64(<8 x i32> %vec) nounwind { ; AVX1-LABEL: trunc_v8i32_to_v8i8_return_v2i64: ; AVX1: # %bb.0: ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] +; AVX1-NEXT: vmovd {{.*#+}} xmm2 = [0,4,8,12,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] @@ -530,7 +508,7 @@ define <2 x i64> @trunc_v8i32_to_v8i8_return_v2i64(<8 x i32> %vec) nounwind { ; AVX2-LABEL: trunc_v8i32_to_v8i8_return_v2i64: ; AVX2: # %bb.0: ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] +; AVX2-NEXT: vmovd {{.*#+}} xmm2 = [0,4,8,12,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1 ; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] @@ -692,7 +670,7 @@ define <16 x i8> @trunc_v8i32_to_v8i8_return_v16i8(<8 x i32> %vec) nounwind { ; AVX1-LABEL: trunc_v8i32_to_v8i8_return_v16i8: ; AVX1: # %bb.0: ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] +; AVX1-NEXT: vmovd 
{{.*#+}} xmm2 = [0,4,8,12,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] @@ -703,7 +681,7 @@ define <16 x i8> @trunc_v8i32_to_v8i8_return_v16i8(<8 x i32> %vec) nounwind { ; AVX2-LABEL: trunc_v8i32_to_v8i8_return_v16i8: ; AVX2: # %bb.0: ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] +; AVX2-NEXT: vmovd {{.*#+}} xmm2 = [0,4,8,12,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1 ; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] @@ -1021,7 +999,7 @@ define <16 x i8> @trunc_v4i64_to_v4i8_return_v16i8(<4 x i64> %vec) nounwind { ; AVX1-LABEL: trunc_v4i64_to_v4i8_return_v16i8: ; AVX1: # %bb.0: ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [0,8,0,0,0,8,0,0,0,8,0,0,0,8,0,0] +; AVX1-NEXT: vmovd {{.*#+}} xmm2 = [0,8,0,0,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] @@ -1194,7 +1172,7 @@ define void @shuffle_v32i8_to_v4i8(ptr %L, ptr %S) nounwind { ; AVX1: # %bb.0: ; AVX1-NEXT: vmovdqa (%rdi), %xmm0 ; AVX1-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [0,8,0,0,0,8,0,0,0,8,0,0,0,8,0,0] +; AVX1-NEXT: vmovd {{.*#+}} xmm2 = [0,8,0,0,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] @@ -1259,7 +1237,7 @@ define void @trunc_v4i64_to_v4i8(ptr %L, ptr %S) nounwind { ; AVX1: # %bb.0: ; AVX1-NEXT: vmovdqa (%rdi), %xmm0 ; AVX1-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [0,8,0,0,0,8,0,0,0,8,0,0,0,8,0,0] +; AVX1-NEXT: vmovd {{.*#+}} xmm2 = [0,8,0,0,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] diff --git a/llvm/test/CodeGen/X86/shuffle-vs-trunc-512.ll b/llvm/test/CodeGen/X86/shuffle-vs-trunc-512.ll index 6e357a5fb34f50..85e160e497172a 100644 --- a/llvm/test/CodeGen/X86/shuffle-vs-trunc-512.ll +++ b/llvm/test/CodeGen/X86/shuffle-vs-trunc-512.ll @@ -392,7 +392,7 @@ define <4 x double> @PR34175(ptr %p) { ; ; AVX512BWVL-LABEL: PR34175: ; AVX512BWVL: # %bb.0: -; AVX512BWVL-NEXT: vpbroadcastq {{.*#+}} xmm0 = [0,8,16,24,0,8,16,24] +; AVX512BWVL-NEXT: vmovq {{.*#+}} xmm0 = [0,8,16,24,0,0,0,0] ; AVX512BWVL-NEXT: vmovdqu (%rdi), %ymm1 ; AVX512BWVL-NEXT: vpermt2w 32(%rdi), %ymm0, %ymm1 ; AVX512BWVL-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero @@ -411,7 +411,7 @@ define <4 x double> @PR34175(ptr %p) { ; ; AVX512VBMIVL-LABEL: PR34175: ; AVX512VBMIVL: # %bb.0: -; AVX512VBMIVL-NEXT: vpbroadcastq {{.*#+}} xmm0 = [0,8,16,24,0,8,16,24] +; AVX512VBMIVL-NEXT: vmovq {{.*#+}} xmm0 = [0,8,16,24,0,0,0,0] ; AVX512VBMIVL-NEXT: vmovdqu (%rdi), %ymm1 ; AVX512VBMIVL-NEXT: vpermt2w 32(%rdi), %ymm0, %ymm1 ; AVX512VBMIVL-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero diff --git a/llvm/test/CodeGen/X86/sse2-intrinsics-x86.ll b/llvm/test/CodeGen/X86/sse2-intrinsics-x86.ll index 348501caf619a3..777beea886f566 100644 
--- a/llvm/test/CodeGen/X86/sse2-intrinsics-x86.ll
+++ b/llvm/test/CodeGen/X86/sse2-intrinsics-x86.ll
@@ -834,43 +834,43 @@ declare <16 x i8> @llvm.x86.sse2.packsswb.128(<8 x i16>, <8 x i16>) nounwind rea
define <16 x i8> @test_x86_sse2_packsswb_128_fold() {
; X86-SSE-LABEL: test_x86_sse2_packsswb_128_fold:
; X86-SSE: ## %bb.0:
-; X86-SSE-NEXT: movaps {{.*#+}} xmm0 = [0,127,127,255,255,128,128,128,0,0,0,0,0,0,0,0]
-; X86-SSE-NEXT: ## encoding: [0x0f,0x28,0x05,A,A,A,A]
-; X86-SSE-NEXT: ## fixup A - offset: 3, value: {{\.?LCPI[0-9]+_[0-9]+}}, kind: FK_Data_4
+; X86-SSE-NEXT: movsd {{.*#+}} xmm0 = [0,127,127,255,255,128,128,128,0,0,0,0,0,0,0,0]
+; X86-SSE-NEXT: ## encoding: [0xf2,0x0f,0x10,0x05,A,A,A,A]
+; X86-SSE-NEXT: ## fixup A - offset: 4, value: {{\.?LCPI[0-9]+_[0-9]+}}, kind: FK_Data_4
; X86-SSE-NEXT: retl ## encoding: [0xc3]
;
; X86-AVX1-LABEL: test_x86_sse2_packsswb_128_fold:
; X86-AVX1: ## %bb.0:
-; X86-AVX1-NEXT: vmovaps {{.*#+}} xmm0 = [0,127,127,255,255,128,128,128,0,0,0,0,0,0,0,0]
-; X86-AVX1-NEXT: ## encoding: [0xc5,0xf8,0x28,0x05,A,A,A,A]
+; X86-AVX1-NEXT: vmovsd {{.*#+}} xmm0 = [0,127,127,255,255,128,128,128,0,0,0,0,0,0,0,0]
+; X86-AVX1-NEXT: ## encoding: [0xc5,0xfb,0x10,0x05,A,A,A,A]
; X86-AVX1-NEXT: ## fixup A - offset: 4, value: {{\.?LCPI[0-9]+_[0-9]+}}, kind: FK_Data_4
; X86-AVX1-NEXT: retl ## encoding: [0xc3]
;
; X86-AVX512-LABEL: test_x86_sse2_packsswb_128_fold:
; X86-AVX512: ## %bb.0:
-; X86-AVX512-NEXT: vmovaps {{.*#+}} xmm0 = [0,127,127,255,255,128,128,128,0,0,0,0,0,0,0,0]
-; X86-AVX512-NEXT: ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0x05,A,A,A,A]
+; X86-AVX512-NEXT: vmovsd {{.*#+}} xmm0 = [0,127,127,255,255,128,128,128,0,0,0,0,0,0,0,0]
+; X86-AVX512-NEXT: ## EVEX TO VEX Compression encoding: [0xc5,0xfb,0x10,0x05,A,A,A,A]
; X86-AVX512-NEXT: ## fixup A - offset: 4, value: {{\.?LCPI[0-9]+_[0-9]+}}, kind: FK_Data_4
; X86-AVX512-NEXT: retl ## encoding: [0xc3]
;
; X64-SSE-LABEL: test_x86_sse2_packsswb_128_fold:
; X64-SSE: ## %bb.0:
-; X64-SSE-NEXT: movaps {{.*#+}} xmm0 = [0,127,127,255,255,128,128,128,0,0,0,0,0,0,0,0]
-; X64-SSE-NEXT: ## encoding: [0x0f,0x28,0x05,A,A,A,A]
-; X64-SSE-NEXT: ## fixup A - offset: 3, value: {{\.?LCPI[0-9]+_[0-9]+}}-4, kind: reloc_riprel_4byte
+; X64-SSE-NEXT: movsd {{.*#+}} xmm0 = [0,127,127,255,255,128,128,128,0,0,0,0,0,0,0,0]
+; X64-SSE-NEXT: ## encoding: [0xf2,0x0f,0x10,0x05,A,A,A,A]
+; X64-SSE-NEXT: ## fixup A - offset: 4, value: {{\.?LCPI[0-9]+_[0-9]+}}-4, kind: reloc_riprel_4byte
; X64-SSE-NEXT: retq ## encoding: [0xc3]
;
; X64-AVX1-LABEL: test_x86_sse2_packsswb_128_fold:
; X64-AVX1: ## %bb.0:
-; X64-AVX1-NEXT: vmovaps {{.*#+}} xmm0 = [0,127,127,255,255,128,128,128,0,0,0,0,0,0,0,0]
-; X64-AVX1-NEXT: ## encoding: [0xc5,0xf8,0x28,0x05,A,A,A,A]
+; X64-AVX1-NEXT: vmovsd {{.*#+}} xmm0 = [0,127,127,255,255,128,128,128,0,0,0,0,0,0,0,0]
+; X64-AVX1-NEXT: ## encoding: [0xc5,0xfb,0x10,0x05,A,A,A,A]
; X64-AVX1-NEXT: ## fixup A - offset: 4, value: {{\.?LCPI[0-9]+_[0-9]+}}-4, kind: reloc_riprel_4byte
; X64-AVX1-NEXT: retq ## encoding: [0xc3]
;
; X64-AVX512-LABEL: test_x86_sse2_packsswb_128_fold:
; X64-AVX512: ## %bb.0:
-; X64-AVX512-NEXT: vmovaps {{.*#+}} xmm0 = [0,127,127,255,255,128,128,128,0,0,0,0,0,0,0,0]
-; X64-AVX512-NEXT: ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0x05,A,A,A,A]
+; X64-AVX512-NEXT: vmovsd {{.*#+}} xmm0 = [0,127,127,255,255,128,128,128,0,0,0,0,0,0,0,0]
+; X64-AVX512-NEXT: ## EVEX TO VEX Compression encoding: [0xc5,0xfb,0x10,0x05,A,A,A,A]
; X64-AVX512-NEXT: ## fixup A - offset: 4, value:
{{\.?LCPI[0-9]+_[0-9]+}}-4, kind: reloc_riprel_4byte ; X64-AVX512-NEXT: retq ## encoding: [0xc3] %res = call <16 x i8> @llvm.x86.sse2.packsswb.128(<8 x i16> , <8 x i16> zeroinitializer) @@ -902,43 +902,43 @@ declare <16 x i8> @llvm.x86.sse2.packuswb.128(<8 x i16>, <8 x i16>) nounwind rea define <16 x i8> @test_x86_sse2_packuswb_128_fold() { ; X86-SSE-LABEL: test_x86_sse2_packuswb_128_fold: ; X86-SSE: ## %bb.0: -; X86-SSE-NEXT: movaps {{.*#+}} xmm0 = [0,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0] -; X86-SSE-NEXT: ## encoding: [0x0f,0x28,0x05,A,A,A,A] -; X86-SSE-NEXT: ## fixup A - offset: 3, value: {{\.?LCPI[0-9]+_[0-9]+}}, kind: FK_Data_4 +; X86-SSE-NEXT: movss {{.*#+}} xmm0 = [0,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0] +; X86-SSE-NEXT: ## encoding: [0xf3,0x0f,0x10,0x05,A,A,A,A] +; X86-SSE-NEXT: ## fixup A - offset: 4, value: {{\.?LCPI[0-9]+_[0-9]+}}, kind: FK_Data_4 ; X86-SSE-NEXT: retl ## encoding: [0xc3] ; ; X86-AVX1-LABEL: test_x86_sse2_packuswb_128_fold: ; X86-AVX1: ## %bb.0: -; X86-AVX1-NEXT: vmovaps {{.*#+}} xmm0 = [0,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0] -; X86-AVX1-NEXT: ## encoding: [0xc5,0xf8,0x28,0x05,A,A,A,A] +; X86-AVX1-NEXT: vmovss {{.*#+}} xmm0 = [0,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0] +; X86-AVX1-NEXT: ## encoding: [0xc5,0xfa,0x10,0x05,A,A,A,A] ; X86-AVX1-NEXT: ## fixup A - offset: 4, value: {{\.?LCPI[0-9]+_[0-9]+}}, kind: FK_Data_4 ; X86-AVX1-NEXT: retl ## encoding: [0xc3] ; ; X86-AVX512-LABEL: test_x86_sse2_packuswb_128_fold: ; X86-AVX512: ## %bb.0: -; X86-AVX512-NEXT: vmovaps {{.*#+}} xmm0 = [0,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0] -; X86-AVX512-NEXT: ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0x05,A,A,A,A] +; X86-AVX512-NEXT: vmovss {{.*#+}} xmm0 = [0,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0] +; X86-AVX512-NEXT: ## EVEX TO VEX Compression encoding: [0xc5,0xfa,0x10,0x05,A,A,A,A] ; X86-AVX512-NEXT: ## fixup A - offset: 4, value: {{\.?LCPI[0-9]+_[0-9]+}}, kind: FK_Data_4 ; X86-AVX512-NEXT: retl ## encoding: [0xc3] ; ; X64-SSE-LABEL: test_x86_sse2_packuswb_128_fold: ; X64-SSE: ## %bb.0: -; X64-SSE-NEXT: movaps {{.*#+}} xmm0 = [0,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0] -; X64-SSE-NEXT: ## encoding: [0x0f,0x28,0x05,A,A,A,A] -; X64-SSE-NEXT: ## fixup A - offset: 3, value: {{\.?LCPI[0-9]+_[0-9]+}}-4, kind: reloc_riprel_4byte +; X64-SSE-NEXT: movss {{.*#+}} xmm0 = [0,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0] +; X64-SSE-NEXT: ## encoding: [0xf3,0x0f,0x10,0x05,A,A,A,A] +; X64-SSE-NEXT: ## fixup A - offset: 4, value: {{\.?LCPI[0-9]+_[0-9]+}}-4, kind: reloc_riprel_4byte ; X64-SSE-NEXT: retq ## encoding: [0xc3] ; ; X64-AVX1-LABEL: test_x86_sse2_packuswb_128_fold: ; X64-AVX1: ## %bb.0: -; X64-AVX1-NEXT: vmovaps {{.*#+}} xmm0 = [0,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0] -; X64-AVX1-NEXT: ## encoding: [0xc5,0xf8,0x28,0x05,A,A,A,A] +; X64-AVX1-NEXT: vmovss {{.*#+}} xmm0 = [0,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0] +; X64-AVX1-NEXT: ## encoding: [0xc5,0xfa,0x10,0x05,A,A,A,A] ; X64-AVX1-NEXT: ## fixup A - offset: 4, value: {{\.?LCPI[0-9]+_[0-9]+}}-4, kind: reloc_riprel_4byte ; X64-AVX1-NEXT: retq ## encoding: [0xc3] ; ; X64-AVX512-LABEL: test_x86_sse2_packuswb_128_fold: ; X64-AVX512: ## %bb.0: -; X64-AVX512-NEXT: vmovaps {{.*#+}} xmm0 = [0,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0] -; X64-AVX512-NEXT: ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0x05,A,A,A,A] +; X64-AVX512-NEXT: vmovss {{.*#+}} xmm0 = [0,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0] +; X64-AVX512-NEXT: ## EVEX TO VEX Compression encoding: [0xc5,0xfa,0x10,0x05,A,A,A,A] ; X64-AVX512-NEXT: ## fixup A - offset: 4, value: {{\.?LCPI[0-9]+_[0-9]+}}-4, kind: reloc_riprel_4byte ; 
X64-AVX512-NEXT: retq ## encoding: [0xc3] %res = call <16 x i8> @llvm.x86.sse2.packuswb.128(<8 x i16> , <8 x i16> zeroinitializer) diff --git a/llvm/test/CodeGen/X86/vec_anyext.ll b/llvm/test/CodeGen/X86/vec_anyext.ll index cdd30165a99bc3..09e4a4b3a773d1 100644 --- a/llvm/test/CodeGen/X86/vec_anyext.ll +++ b/llvm/test/CodeGen/X86/vec_anyext.ll @@ -173,7 +173,7 @@ define <4 x i8> @func_8_64(ptr %a, ptr %b) nounwind { ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: vmovdqa (%ecx), %xmm0 ; X86-NEXT: vmovdqa 16(%ecx), %xmm1 -; X86-NEXT: vbroadcastss {{.*#+}} xmm2 = [0,8,0,0,0,8,0,0,0,8,0,0,0,8,0,0] +; X86-NEXT: vmovd {{.*#+}} xmm2 = [0,8,0,0,0,0,0,0,0,0,0,0,0,0,0,0] ; X86-NEXT: vpshufb %xmm2, %xmm1, %xmm1 ; X86-NEXT: vpshufb %xmm2, %xmm0, %xmm0 ; X86-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] @@ -211,8 +211,7 @@ define <4 x i8> @func_8_64(ptr %a, ptr %b) nounwind { define <4 x i16> @const_16_32() nounwind { ; CHECK-LABEL: const_16_32: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovddup {{.*#+}} xmm0 = [0,3,8,7,0,3,8,7] -; CHECK-NEXT: # xmm0 = mem[0,0] +; CHECK-NEXT: vmovsd {{.*#+}} xmm0 = [0,3,8,7,0,0,0,0] ; CHECK-NEXT: ret{{[l|q]}} %G = trunc <4 x i32> to <4 x i16> ret <4 x i16> %G @@ -221,8 +220,7 @@ define <4 x i16> @const_16_32() nounwind { define <4 x i16> @const_16_64() nounwind { ; CHECK-LABEL: const_16_64: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovddup {{.*#+}} xmm0 = [0,3,8,7,0,3,8,7] -; CHECK-NEXT: # xmm0 = mem[0,0] +; CHECK-NEXT: vmovsd {{.*#+}} xmm0 = [0,3,8,7,0,0,0,0] ; CHECK-NEXT: ret{{[l|q]}} %G = trunc <4 x i64> to <4 x i16> ret <4 x i16> %G diff --git a/llvm/test/CodeGen/X86/vec_fp_to_int.ll b/llvm/test/CodeGen/X86/vec_fp_to_int.ll index 1d81cb37730800..a0e9f33483b69c 100644 --- a/llvm/test/CodeGen/X86/vec_fp_to_int.ll +++ b/llvm/test/CodeGen/X86/vec_fp_to_int.ll @@ -1907,13 +1907,12 @@ define <2 x i64> @fptosi_2f64_to_2i64_const() { define <4 x i32> @fptosi_2f64_to_2i32_const() { ; SSE-LABEL: fptosi_2f64_to_2i32_const: ; SSE: # %bb.0: -; SSE-NEXT: movaps {{.*#+}} xmm0 = [4294967295,1,u,u] +; SSE-NEXT: movsd {{.*#+}} xmm0 = [4294967295,1,0,0] ; SSE-NEXT: retq ; ; AVX-LABEL: fptosi_2f64_to_2i32_const: ; AVX: # %bb.0: -; AVX-NEXT: vmovddup {{.*#+}} xmm0 = [4294967295,1,4294967295,1] -; AVX-NEXT: # xmm0 = mem[0,0] +; AVX-NEXT: vmovsd {{.*#+}} xmm0 = [4294967295,1,0,0] ; AVX-NEXT: retq %cvt = fptosi <2 x double> to <2 x i32> %ext = shufflevector <2 x i32> %cvt, <2 x i32> undef, <4 x i32> @@ -1966,13 +1965,12 @@ define <2 x i64> @fptoui_2f64_to_2i64_const() { define <4 x i32> @fptoui_2f64_to_2i32_const(<2 x double> %a) { ; SSE-LABEL: fptoui_2f64_to_2i32_const: ; SSE: # %bb.0: -; SSE-NEXT: movaps {{.*#+}} xmm0 = [2,4,u,u] +; SSE-NEXT: movsd {{.*#+}} xmm0 = [2,4,0,0] ; SSE-NEXT: retq ; ; AVX-LABEL: fptoui_2f64_to_2i32_const: ; AVX: # %bb.0: -; AVX-NEXT: vmovddup {{.*#+}} xmm0 = [2,4,2,4] -; AVX-NEXT: # xmm0 = mem[0,0] +; AVX-NEXT: vmovsd {{.*#+}} xmm0 = [2,4,0,0] ; AVX-NEXT: retq %cvt = fptoui <2 x double> to <2 x i32> %ext = shufflevector <2 x i32> %cvt, <2 x i32> undef, <4 x i32> diff --git a/llvm/test/CodeGen/X86/vec_set-A.ll b/llvm/test/CodeGen/X86/vec_set-A.ll index c8ff250b5bfbcd..a288579bda34ea 100644 --- a/llvm/test/CodeGen/X86/vec_set-A.ll +++ b/llvm/test/CodeGen/X86/vec_set-A.ll @@ -5,12 +5,12 @@ define <2 x i64> @test1() nounwind { ; X86-LABEL: test1: ; X86: # %bb.0: -; X86-NEXT: movaps {{.*#+}} xmm0 = [1,0,0,0] +; X86-NEXT: movss {{.*#+}} xmm0 = [1,0,0,0] ; X86-NEXT: retl ; ; X64-LABEL: test1: ; X64: # %bb.0: -; X64-NEXT: movaps 
{{.*#+}} xmm0 = [1,0,0,0] +; X64-NEXT: movss {{.*#+}} xmm0 = [1,0,0,0] ; X64-NEXT: retq ret <2 x i64> < i64 1, i64 0 > } diff --git a/llvm/test/CodeGen/X86/vector-blend.ll b/llvm/test/CodeGen/X86/vector-blend.ll index 73c2f4ca5b10bf..bd5c9363794aa1 100644 --- a/llvm/test/CodeGen/X86/vector-blend.ll +++ b/llvm/test/CodeGen/X86/vector-blend.ll @@ -79,22 +79,16 @@ define <4 x i8> @vsel_4xi8(<4 x i8> %v1, <4 x i8> %v2) { ; SSE41-LABEL: vsel_4xi8: ; SSE41: # %bb.0: # %entry ; SSE41-NEXT: movdqa %xmm0, %xmm2 -; SSE41-NEXT: movaps {{.*#+}} xmm0 = [255,255,0,255,u,u,u,u,u,u,u,u,u,u,u,u] +; SSE41-NEXT: movss {{.*#+}} xmm0 = [255,255,0,255,0,0,0,0,0,0,0,0,0,0,0,0] ; SSE41-NEXT: pblendvb %xmm0, %xmm2, %xmm1 ; SSE41-NEXT: movdqa %xmm1, %xmm0 ; SSE41-NEXT: retq ; -; AVX1-LABEL: vsel_4xi8: -; AVX1: # %bb.0: # %entry -; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255] -; AVX1-NEXT: vpblendvb %xmm2, %xmm0, %xmm1, %xmm0 -; AVX1-NEXT: retq -; -; AVX2-LABEL: vsel_4xi8: -; AVX2: # %bb.0: # %entry -; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255] -; AVX2-NEXT: vpblendvb %xmm2, %xmm0, %xmm1, %xmm0 -; AVX2-NEXT: retq +; AVX-LABEL: vsel_4xi8: +; AVX: # %bb.0: # %entry +; AVX-NEXT: vmovd {{.*#+}} xmm2 = [255,255,0,255,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX-NEXT: vpblendvb %xmm2, %xmm0, %xmm1, %xmm0 +; AVX-NEXT: retq entry: %vsel = select <4 x i1> , <4 x i8> %v1, <4 x i8> %v2 ret <4 x i8> %vsel diff --git a/llvm/test/CodeGen/X86/vector-constrained-fp-intrinsics.ll b/llvm/test/CodeGen/X86/vector-constrained-fp-intrinsics.ll index acf45fc4bbeba4..0adb9ddfc426a8 100644 --- a/llvm/test/CodeGen/X86/vector-constrained-fp-intrinsics.ll +++ b/llvm/test/CodeGen/X86/vector-constrained-fp-intrinsics.ll @@ -4379,7 +4379,7 @@ define <2 x i32> @constrained_vector_fptoui_v2i32_v2f32() #0 { ; ; AVX512-LABEL: constrained_vector_fptoui_v2i32_v2f32: ; AVX512: # %bb.0: # %entry -; AVX512-NEXT: vmovaps {{.*#+}} xmm0 = [4.2E+1,4.3E+1,0.0E+0,0.0E+0] +; AVX512-NEXT: vmovsd {{.*#+}} xmm0 = [4.2E+1,4.3E+1,0.0E+0,0.0E+0] ; AVX512-NEXT: vcvttps2udq %zmm0, %zmm0 ; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 ; AVX512-NEXT: vzeroupper diff --git a/llvm/test/CodeGen/X86/vector-fshl-128.ll b/llvm/test/CodeGen/X86/vector-fshl-128.ll index 4f100cd3e05309..550b2e06554385 100644 --- a/llvm/test/CodeGen/X86/vector-fshl-128.ll +++ b/llvm/test/CodeGen/X86/vector-fshl-128.ll @@ -1212,7 +1212,7 @@ define <4 x i32> @splatvar_funnnel_v4i32(<4 x i32> %x, <4 x i32> %y, <4 x i32> % define <8 x i16> @splatvar_funnnel_v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> %amt) nounwind { ; SSE-LABEL: splatvar_funnnel_v8i16: ; SSE: # %bb.0: -; SSE-NEXT: movdqa {{.*#+}} xmm3 = [15,0,0,0] +; SSE-NEXT: movd {{.*#+}} xmm3 = [15,0,0,0] ; SSE-NEXT: movdqa %xmm2, %xmm4 ; SSE-NEXT: pandn %xmm3, %xmm4 ; SSE-NEXT: psrlw $1, %xmm1 @@ -1224,7 +1224,7 @@ define <8 x i16> @splatvar_funnnel_v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> % ; ; AVX-LABEL: splatvar_funnnel_v8i16: ; AVX: # %bb.0: -; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [15,0,0,0] +; AVX-NEXT: vmovd {{.*#+}} xmm3 = [15,0,0,0] ; AVX-NEXT: vpandn %xmm3, %xmm2, %xmm4 ; AVX-NEXT: vpsrlw $1, %xmm1, %xmm1 ; AVX-NEXT: vpsrlw %xmm4, %xmm1, %xmm1 @@ -1235,7 +1235,7 @@ define <8 x i16> @splatvar_funnnel_v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> % ; ; AVX512F-LABEL: splatvar_funnnel_v8i16: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovdqa {{.*#+}} xmm3 = [15,0,0,0] +; AVX512F-NEXT: vmovd {{.*#+}} xmm3 = [15,0,0,0] ; AVX512F-NEXT: vpandn 
%xmm3, %xmm2, %xmm4 ; AVX512F-NEXT: vpsrlw $1, %xmm1, %xmm1 ; AVX512F-NEXT: vpsrlw %xmm4, %xmm1, %xmm1 @@ -1246,7 +1246,7 @@ define <8 x i16> @splatvar_funnnel_v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> % ; ; AVX512VL-LABEL: splatvar_funnnel_v8i16: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm3 = [15,0,0,0] +; AVX512VL-NEXT: vmovd {{.*#+}} xmm3 = [15,0,0,0] ; AVX512VL-NEXT: vpandn %xmm3, %xmm2, %xmm4 ; AVX512VL-NEXT: vpsrlw $1, %xmm1, %xmm1 ; AVX512VL-NEXT: vpsrlw %xmm4, %xmm1, %xmm1 @@ -1257,7 +1257,7 @@ define <8 x i16> @splatvar_funnnel_v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> % ; ; AVX512BW-LABEL: splatvar_funnnel_v8i16: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = [15,0,0,0] +; AVX512BW-NEXT: vmovd {{.*#+}} xmm3 = [15,0,0,0] ; AVX512BW-NEXT: vpandn %xmm3, %xmm2, %xmm4 ; AVX512BW-NEXT: vpsrlw $1, %xmm1, %xmm1 ; AVX512BW-NEXT: vpsrlw %xmm4, %xmm1, %xmm1 @@ -1278,7 +1278,7 @@ define <8 x i16> @splatvar_funnnel_v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> % ; ; AVX512VLBW-LABEL: splatvar_funnnel_v8i16: ; AVX512VLBW: # %bb.0: -; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm3 = [15,0,0,0] +; AVX512VLBW-NEXT: vmovd {{.*#+}} xmm3 = [15,0,0,0] ; AVX512VLBW-NEXT: vpandn %xmm3, %xmm2, %xmm4 ; AVX512VLBW-NEXT: vpsrlw $1, %xmm1, %xmm1 ; AVX512VLBW-NEXT: vpsrlw %xmm4, %xmm1, %xmm1 @@ -1295,7 +1295,7 @@ define <8 x i16> @splatvar_funnnel_v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> % ; ; XOP-LABEL: splatvar_funnnel_v8i16: ; XOP: # %bb.0: -; XOP-NEXT: vmovdqa {{.*#+}} xmm3 = [15,0,0,0] +; XOP-NEXT: vmovd {{.*#+}} xmm3 = [15,0,0,0] ; XOP-NEXT: vpandn %xmm3, %xmm2, %xmm4 ; XOP-NEXT: vpsrlw $1, %xmm1, %xmm1 ; XOP-NEXT: vpsrlw %xmm4, %xmm1, %xmm1 @@ -1306,7 +1306,7 @@ define <8 x i16> @splatvar_funnnel_v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> % ; ; X86-SSE2-LABEL: splatvar_funnnel_v8i16: ; X86-SSE2: # %bb.0: -; X86-SSE2-NEXT: movdqa {{.*#+}} xmm3 = [15,0,0,0] +; X86-SSE2-NEXT: movd {{.*#+}} xmm3 = [15,0,0,0] ; X86-SSE2-NEXT: movdqa %xmm2, %xmm4 ; X86-SSE2-NEXT: pandn %xmm3, %xmm4 ; X86-SSE2-NEXT: psrlw $1, %xmm1 diff --git a/llvm/test/CodeGen/X86/vector-fshl-256.ll b/llvm/test/CodeGen/X86/vector-fshl-256.ll index 05be4e1ee928e4..683fdf15cdea41 100644 --- a/llvm/test/CodeGen/X86/vector-fshl-256.ll +++ b/llvm/test/CodeGen/X86/vector-fshl-256.ll @@ -1000,7 +1000,7 @@ define <8 x i32> @splatvar_funnnel_v8i32(<8 x i32> %x, <8 x i32> %y, <8 x i32> % define <16 x i16> @splatvar_funnnel_v16i16(<16 x i16> %x, <16 x i16> %y, <16 x i16> %amt) nounwind { ; AVX1-LABEL: splatvar_funnnel_v16i16: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [15,0,0,0] +; AVX1-NEXT: vmovd {{.*#+}} xmm3 = [15,0,0,0] ; AVX1-NEXT: vpandn %xmm3, %xmm2, %xmm4 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm5 ; AVX1-NEXT: vpsrlw $1, %xmm5, %xmm5 @@ -1088,7 +1088,7 @@ define <16 x i16> @splatvar_funnnel_v16i16(<16 x i16> %x, <16 x i16> %y, <16 x i ; ; XOPAVX1-LABEL: splatvar_funnnel_v16i16: ; XOPAVX1: # %bb.0: -; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [15,0,0,0] +; XOPAVX1-NEXT: vmovd {{.*#+}} xmm3 = [15,0,0,0] ; XOPAVX1-NEXT: vpandn %xmm3, %xmm2, %xmm4 ; XOPAVX1-NEXT: vextractf128 $1, %ymm1, %xmm5 ; XOPAVX1-NEXT: vpsrlw $1, %xmm5, %xmm5 @@ -1278,7 +1278,7 @@ define void @fancierRotate2(ptr %arr, ptr %control, i32 %rot0, i32 %rot1) { ; AVX1-NEXT: vmovd %ecx, %xmm2 ; AVX1-NEXT: movq $-1024, %rax # imm = 0xFC00 ; AVX1-NEXT: vpxor %xmm0, %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [31,0,0,0] +; AVX1-NEXT: vmovd {{.*#+}} xmm3 = [31,0,0,0] ; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm1 ; AVX1-NEXT: vpand %xmm3, %xmm2, 
%xmm2 ; AVX1-NEXT: .p2align 4, 0x90 diff --git a/llvm/test/CodeGen/X86/vector-fshl-rot-128.ll b/llvm/test/CodeGen/X86/vector-fshl-rot-128.ll index 9ddd171b4db690..6a8d9d73f138b4 100644 --- a/llvm/test/CodeGen/X86/vector-fshl-rot-128.ll +++ b/llvm/test/CodeGen/X86/vector-fshl-rot-128.ll @@ -945,7 +945,7 @@ define <8 x i16> @splatvar_funnnel_v8i16(<8 x i16> %x, <8 x i16> %amt) nounwind ; ; SSE41-LABEL: splatvar_funnnel_v8i16: ; SSE41: # %bb.0: -; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [15,0,0,0] +; SSE41-NEXT: movd {{.*#+}} xmm2 = [15,0,0,0] ; SSE41-NEXT: movdqa %xmm1, %xmm3 ; SSE41-NEXT: pandn %xmm2, %xmm3 ; SSE41-NEXT: movdqa %xmm0, %xmm4 @@ -958,7 +958,7 @@ define <8 x i16> @splatvar_funnnel_v8i16(<8 x i16> %x, <8 x i16> %amt) nounwind ; ; AVX-LABEL: splatvar_funnnel_v8i16: ; AVX: # %bb.0: -; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = [15,0,0,0] +; AVX-NEXT: vmovd {{.*#+}} xmm2 = [15,0,0,0] ; AVX-NEXT: vpandn %xmm2, %xmm1, %xmm3 ; AVX-NEXT: vpsrlw $1, %xmm0, %xmm4 ; AVX-NEXT: vpsrlw %xmm3, %xmm4, %xmm3 @@ -969,7 +969,7 @@ define <8 x i16> @splatvar_funnnel_v8i16(<8 x i16> %x, <8 x i16> %amt) nounwind ; ; AVX512F-LABEL: splatvar_funnnel_v8i16: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovdqa {{.*#+}} xmm2 = [15,0,0,0] +; AVX512F-NEXT: vmovd {{.*#+}} xmm2 = [15,0,0,0] ; AVX512F-NEXT: vpandn %xmm2, %xmm1, %xmm3 ; AVX512F-NEXT: vpsrlw $1, %xmm0, %xmm4 ; AVX512F-NEXT: vpsrlw %xmm3, %xmm4, %xmm3 @@ -980,7 +980,7 @@ define <8 x i16> @splatvar_funnnel_v8i16(<8 x i16> %x, <8 x i16> %amt) nounwind ; ; AVX512VL-LABEL: splatvar_funnnel_v8i16: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm2 = [15,0,0,0] +; AVX512VL-NEXT: vmovd {{.*#+}} xmm2 = [15,0,0,0] ; AVX512VL-NEXT: vpandn %xmm2, %xmm1, %xmm3 ; AVX512VL-NEXT: vpsrlw $1, %xmm0, %xmm4 ; AVX512VL-NEXT: vpsrlw %xmm3, %xmm4, %xmm3 @@ -991,7 +991,7 @@ define <8 x i16> @splatvar_funnnel_v8i16(<8 x i16> %x, <8 x i16> %amt) nounwind ; ; AVX512BW-LABEL: splatvar_funnnel_v8i16: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm2 = [15,0,0,0] +; AVX512BW-NEXT: vmovd {{.*#+}} xmm2 = [15,0,0,0] ; AVX512BW-NEXT: vpandn %xmm2, %xmm1, %xmm3 ; AVX512BW-NEXT: vpsrlw $1, %xmm0, %xmm4 ; AVX512BW-NEXT: vpsrlw %xmm3, %xmm4, %xmm3 @@ -1002,7 +1002,7 @@ define <8 x i16> @splatvar_funnnel_v8i16(<8 x i16> %x, <8 x i16> %amt) nounwind ; ; AVX512VLBW-LABEL: splatvar_funnnel_v8i16: ; AVX512VLBW: # %bb.0: -; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm2 = [15,0,0,0] +; AVX512VLBW-NEXT: vmovd {{.*#+}} xmm2 = [15,0,0,0] ; AVX512VLBW-NEXT: vpandn %xmm2, %xmm1, %xmm3 ; AVX512VLBW-NEXT: vpsrlw $1, %xmm0, %xmm4 ; AVX512VLBW-NEXT: vpsrlw %xmm3, %xmm4, %xmm3 diff --git a/llvm/test/CodeGen/X86/vector-fshl-rot-256.ll b/llvm/test/CodeGen/X86/vector-fshl-rot-256.ll index 58719e6bd8e0c3..6fc95cc7780ff4 100644 --- a/llvm/test/CodeGen/X86/vector-fshl-rot-256.ll +++ b/llvm/test/CodeGen/X86/vector-fshl-rot-256.ll @@ -757,7 +757,7 @@ define <8 x i32> @splatvar_funnnel_v8i32(<8 x i32> %x, <8 x i32> %amt) nounwind define <16 x i16> @splatvar_funnnel_v16i16(<16 x i16> %x, <16 x i16> %amt) nounwind { ; AVX1-LABEL: splatvar_funnnel_v16i16: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [15,0,0,0] +; AVX1-NEXT: vmovd {{.*#+}} xmm2 = [15,0,0,0] ; AVX1-NEXT: vpandn %xmm2, %xmm1, %xmm3 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4 ; AVX1-NEXT: vpsrlw $1, %xmm4, %xmm5 diff --git a/llvm/test/CodeGen/X86/vector-fshl-rot-sub128.ll b/llvm/test/CodeGen/X86/vector-fshl-rot-sub128.ll index 825ca727b624ea..3452b33ada2a9a 100644 --- a/llvm/test/CodeGen/X86/vector-fshl-rot-sub128.ll +++ 
b/llvm/test/CodeGen/X86/vector-fshl-rot-sub128.ll @@ -324,7 +324,7 @@ define <2 x i32> @constant_funnnel_v2i32(<2 x i32> %x) nounwind { ; AVX512F-LABEL: constant_funnnel_v2i32: ; AVX512F: # %bb.0: ; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 -; AVX512F-NEXT: vpbroadcastq {{.*#+}} xmm1 = [4,5,4,5] +; AVX512F-NEXT: vmovq {{.*#+}} xmm1 = [4,5,0,0] ; AVX512F-NEXT: vprolvd %zmm1, %zmm0, %zmm0 ; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 ; AVX512F-NEXT: vzeroupper @@ -338,7 +338,7 @@ define <2 x i32> @constant_funnnel_v2i32(<2 x i32> %x) nounwind { ; AVX512BW-LABEL: constant_funnnel_v2i32: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 -; AVX512BW-NEXT: vpbroadcastq {{.*#+}} xmm1 = [4,5,4,5] +; AVX512BW-NEXT: vmovq {{.*#+}} xmm1 = [4,5,0,0] ; AVX512BW-NEXT: vprolvd %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 ; AVX512BW-NEXT: vzeroupper @@ -352,7 +352,7 @@ define <2 x i32> @constant_funnnel_v2i32(<2 x i32> %x) nounwind { ; AVX512VBMI2-LABEL: constant_funnnel_v2i32: ; AVX512VBMI2: # %bb.0: ; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 -; AVX512VBMI2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [4,5,4,5] +; AVX512VBMI2-NEXT: vmovq {{.*#+}} xmm1 = [4,5,0,0] ; AVX512VBMI2-NEXT: vprolvd %zmm1, %zmm0, %zmm0 ; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 ; AVX512VBMI2-NEXT: vzeroupper diff --git a/llvm/test/CodeGen/X86/vector-fshl-sub128.ll b/llvm/test/CodeGen/X86/vector-fshl-sub128.ll index 0b6361ffd4fae3..64deaf0e75966e 100644 --- a/llvm/test/CodeGen/X86/vector-fshl-sub128.ll +++ b/llvm/test/CodeGen/X86/vector-fshl-sub128.ll @@ -390,7 +390,7 @@ define <2 x i32> @constant_funnnel_v2i32(<2 x i32> %x, <2 x i32> %y) nounwind { ; AVX512VBMI2: # %bb.0: ; AVX512VBMI2-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1 ; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 -; AVX512VBMI2-NEXT: vpbroadcastq {{.*#+}} xmm2 = [4,5,4,5] +; AVX512VBMI2-NEXT: vmovq {{.*#+}} xmm2 = [4,5,0,0] ; AVX512VBMI2-NEXT: vpshldvd %zmm2, %zmm1, %zmm0 ; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 ; AVX512VBMI2-NEXT: vzeroupper diff --git a/llvm/test/CodeGen/X86/vector-fshr-128.ll b/llvm/test/CodeGen/X86/vector-fshr-128.ll index 1f51f02a197ac4..70e3a025167439 100644 --- a/llvm/test/CodeGen/X86/vector-fshr-128.ll +++ b/llvm/test/CodeGen/X86/vector-fshr-128.ll @@ -1337,7 +1337,7 @@ define <4 x i32> @splatvar_funnnel_v4i32(<4 x i32> %x, <4 x i32> %y, <4 x i32> % define <8 x i16> @splatvar_funnnel_v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> %amt) nounwind { ; SSE-LABEL: splatvar_funnnel_v8i16: ; SSE: # %bb.0: -; SSE-NEXT: movdqa {{.*#+}} xmm3 = [15,0,0,0] +; SSE-NEXT: movd {{.*#+}} xmm3 = [15,0,0,0] ; SSE-NEXT: movdqa %xmm2, %xmm4 ; SSE-NEXT: pand %xmm3, %xmm4 ; SSE-NEXT: psrlw %xmm4, %xmm1 @@ -1349,7 +1349,7 @@ define <8 x i16> @splatvar_funnnel_v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> % ; ; AVX-LABEL: splatvar_funnnel_v8i16: ; AVX: # %bb.0: -; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [15,0,0,0] +; AVX-NEXT: vmovd {{.*#+}} xmm3 = [15,0,0,0] ; AVX-NEXT: vpand %xmm3, %xmm2, %xmm4 ; AVX-NEXT: vpsrlw %xmm4, %xmm1, %xmm1 ; AVX-NEXT: vpandn %xmm3, %xmm2, %xmm2 @@ -1360,7 +1360,7 @@ define <8 x i16> @splatvar_funnnel_v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> % ; ; AVX512F-LABEL: splatvar_funnnel_v8i16: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovdqa {{.*#+}} xmm3 = [15,0,0,0] +; AVX512F-NEXT: vmovd {{.*#+}} xmm3 = [15,0,0,0] ; AVX512F-NEXT: vpand %xmm3, %xmm2, %xmm4 ; AVX512F-NEXT: vpsrlw %xmm4, %xmm1, %xmm1 ; 
AVX512F-NEXT: vpandn %xmm3, %xmm2, %xmm2 @@ -1371,7 +1371,7 @@ define <8 x i16> @splatvar_funnnel_v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> % ; ; AVX512VL-LABEL: splatvar_funnnel_v8i16: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm3 = [15,0,0,0] +; AVX512VL-NEXT: vmovd {{.*#+}} xmm3 = [15,0,0,0] ; AVX512VL-NEXT: vpand %xmm3, %xmm2, %xmm4 ; AVX512VL-NEXT: vpsrlw %xmm4, %xmm1, %xmm1 ; AVX512VL-NEXT: vpandn %xmm3, %xmm2, %xmm2 @@ -1382,7 +1382,7 @@ define <8 x i16> @splatvar_funnnel_v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> % ; ; AVX512BW-LABEL: splatvar_funnnel_v8i16: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = [15,0,0,0] +; AVX512BW-NEXT: vmovd {{.*#+}} xmm3 = [15,0,0,0] ; AVX512BW-NEXT: vpand %xmm3, %xmm2, %xmm4 ; AVX512BW-NEXT: vpsrlw %xmm4, %xmm1, %xmm1 ; AVX512BW-NEXT: vpandn %xmm3, %xmm2, %xmm2 @@ -1403,7 +1403,7 @@ define <8 x i16> @splatvar_funnnel_v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> % ; ; AVX512VLBW-LABEL: splatvar_funnnel_v8i16: ; AVX512VLBW: # %bb.0: -; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm3 = [15,0,0,0] +; AVX512VLBW-NEXT: vmovd {{.*#+}} xmm3 = [15,0,0,0] ; AVX512VLBW-NEXT: vpand %xmm3, %xmm2, %xmm4 ; AVX512VLBW-NEXT: vpsrlw %xmm4, %xmm1, %xmm1 ; AVX512VLBW-NEXT: vpandn %xmm3, %xmm2, %xmm2 @@ -1421,7 +1421,7 @@ define <8 x i16> @splatvar_funnnel_v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> % ; ; XOP-LABEL: splatvar_funnnel_v8i16: ; XOP: # %bb.0: -; XOP-NEXT: vmovdqa {{.*#+}} xmm3 = [15,0,0,0] +; XOP-NEXT: vmovd {{.*#+}} xmm3 = [15,0,0,0] ; XOP-NEXT: vpand %xmm3, %xmm2, %xmm4 ; XOP-NEXT: vpsrlw %xmm4, %xmm1, %xmm1 ; XOP-NEXT: vpandn %xmm3, %xmm2, %xmm2 @@ -1432,7 +1432,7 @@ define <8 x i16> @splatvar_funnnel_v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> % ; ; X86-SSE2-LABEL: splatvar_funnnel_v8i16: ; X86-SSE2: # %bb.0: -; X86-SSE2-NEXT: movdqa {{.*#+}} xmm3 = [15,0,0,0] +; X86-SSE2-NEXT: movd {{.*#+}} xmm3 = [15,0,0,0] ; X86-SSE2-NEXT: movdqa %xmm2, %xmm4 ; X86-SSE2-NEXT: pand %xmm3, %xmm4 ; X86-SSE2-NEXT: psrlw %xmm4, %xmm1 diff --git a/llvm/test/CodeGen/X86/vector-fshr-256.ll b/llvm/test/CodeGen/X86/vector-fshr-256.ll index 1a6ecea596563e..61aea6ad4d5955 100644 --- a/llvm/test/CodeGen/X86/vector-fshr-256.ll +++ b/llvm/test/CodeGen/X86/vector-fshr-256.ll @@ -1032,7 +1032,7 @@ define <8 x i32> @splatvar_funnnel_v8i32(<8 x i32> %x, <8 x i32> %y, <8 x i32> % define <16 x i16> @splatvar_funnnel_v16i16(<16 x i16> %x, <16 x i16> %y, <16 x i16> %amt) nounwind { ; AVX1-LABEL: splatvar_funnnel_v16i16: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [15,0,0,0] +; AVX1-NEXT: vmovd {{.*#+}} xmm3 = [15,0,0,0] ; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm4 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm5 ; AVX1-NEXT: vpsrlw %xmm4, %xmm5, %xmm5 @@ -1121,7 +1121,7 @@ define <16 x i16> @splatvar_funnnel_v16i16(<16 x i16> %x, <16 x i16> %y, <16 x i ; ; XOPAVX1-LABEL: splatvar_funnnel_v16i16: ; XOPAVX1: # %bb.0: -; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [15,0,0,0] +; XOPAVX1-NEXT: vmovd {{.*#+}} xmm3 = [15,0,0,0] ; XOPAVX1-NEXT: vpand %xmm3, %xmm2, %xmm4 ; XOPAVX1-NEXT: vextractf128 $1, %ymm1, %xmm5 ; XOPAVX1-NEXT: vpsrlw %xmm4, %xmm5, %xmm5 diff --git a/llvm/test/CodeGen/X86/vector-fshr-rot-128.ll b/llvm/test/CodeGen/X86/vector-fshr-rot-128.ll index 402eb73e18101b..3fa9994312e454 100644 --- a/llvm/test/CodeGen/X86/vector-fshr-rot-128.ll +++ b/llvm/test/CodeGen/X86/vector-fshr-rot-128.ll @@ -982,7 +982,7 @@ define <8 x i16> @splatvar_funnnel_v8i16(<8 x i16> %x, <8 x i16> %amt) nounwind ; ; SSE41-LABEL: splatvar_funnnel_v8i16: ; SSE41: # %bb.0: -; SSE41-NEXT: movdqa 
{{.*#+}} xmm2 = [15,0,0,0] +; SSE41-NEXT: movd {{.*#+}} xmm2 = [15,0,0,0] ; SSE41-NEXT: movdqa %xmm1, %xmm3 ; SSE41-NEXT: pand %xmm2, %xmm3 ; SSE41-NEXT: movdqa %xmm0, %xmm4 @@ -995,7 +995,7 @@ define <8 x i16> @splatvar_funnnel_v8i16(<8 x i16> %x, <8 x i16> %amt) nounwind ; ; AVX-LABEL: splatvar_funnnel_v8i16: ; AVX: # %bb.0: -; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = [15,0,0,0] +; AVX-NEXT: vmovd {{.*#+}} xmm2 = [15,0,0,0] ; AVX-NEXT: vpand %xmm2, %xmm1, %xmm3 ; AVX-NEXT: vpsrlw %xmm3, %xmm0, %xmm3 ; AVX-NEXT: vpandn %xmm2, %xmm1, %xmm1 @@ -1006,7 +1006,7 @@ define <8 x i16> @splatvar_funnnel_v8i16(<8 x i16> %x, <8 x i16> %amt) nounwind ; ; AVX512F-LABEL: splatvar_funnnel_v8i16: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovdqa {{.*#+}} xmm2 = [15,0,0,0] +; AVX512F-NEXT: vmovd {{.*#+}} xmm2 = [15,0,0,0] ; AVX512F-NEXT: vpand %xmm2, %xmm1, %xmm3 ; AVX512F-NEXT: vpsrlw %xmm3, %xmm0, %xmm3 ; AVX512F-NEXT: vpandn %xmm2, %xmm1, %xmm1 @@ -1017,7 +1017,7 @@ define <8 x i16> @splatvar_funnnel_v8i16(<8 x i16> %x, <8 x i16> %amt) nounwind ; ; AVX512VL-LABEL: splatvar_funnnel_v8i16: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm2 = [15,0,0,0] +; AVX512VL-NEXT: vmovd {{.*#+}} xmm2 = [15,0,0,0] ; AVX512VL-NEXT: vpand %xmm2, %xmm1, %xmm3 ; AVX512VL-NEXT: vpsrlw %xmm3, %xmm0, %xmm3 ; AVX512VL-NEXT: vpandn %xmm2, %xmm1, %xmm1 @@ -1028,7 +1028,7 @@ define <8 x i16> @splatvar_funnnel_v8i16(<8 x i16> %x, <8 x i16> %amt) nounwind ; ; AVX512BW-LABEL: splatvar_funnnel_v8i16: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm2 = [15,0,0,0] +; AVX512BW-NEXT: vmovd {{.*#+}} xmm2 = [15,0,0,0] ; AVX512BW-NEXT: vpand %xmm2, %xmm1, %xmm3 ; AVX512BW-NEXT: vpsrlw %xmm3, %xmm0, %xmm3 ; AVX512BW-NEXT: vpandn %xmm2, %xmm1, %xmm1 @@ -1039,7 +1039,7 @@ define <8 x i16> @splatvar_funnnel_v8i16(<8 x i16> %x, <8 x i16> %amt) nounwind ; ; AVX512VLBW-LABEL: splatvar_funnnel_v8i16: ; AVX512VLBW: # %bb.0: -; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm2 = [15,0,0,0] +; AVX512VLBW-NEXT: vmovd {{.*#+}} xmm2 = [15,0,0,0] ; AVX512VLBW-NEXT: vpand %xmm2, %xmm1, %xmm3 ; AVX512VLBW-NEXT: vpsrlw %xmm3, %xmm0, %xmm3 ; AVX512VLBW-NEXT: vpandn %xmm2, %xmm1, %xmm1 diff --git a/llvm/test/CodeGen/X86/vector-fshr-rot-256.ll b/llvm/test/CodeGen/X86/vector-fshr-rot-256.ll index bb311468ce913c..b2047a04f163e6 100644 --- a/llvm/test/CodeGen/X86/vector-fshr-rot-256.ll +++ b/llvm/test/CodeGen/X86/vector-fshr-rot-256.ll @@ -796,7 +796,7 @@ define <8 x i32> @splatvar_funnnel_v8i32(<8 x i32> %x, <8 x i32> %amt) nounwind define <16 x i16> @splatvar_funnnel_v16i16(<16 x i16> %x, <16 x i16> %amt) nounwind { ; AVX1-LABEL: splatvar_funnnel_v16i16: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [15,0,0,0] +; AVX1-NEXT: vmovd {{.*#+}} xmm2 = [15,0,0,0] ; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm3 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4 ; AVX1-NEXT: vpsrlw %xmm3, %xmm4, %xmm5 diff --git a/llvm/test/CodeGen/X86/vector-fshr-rot-sub128.ll b/llvm/test/CodeGen/X86/vector-fshr-rot-sub128.ll index b8c356711921d0..d78aa4e049e0a3 100644 --- a/llvm/test/CodeGen/X86/vector-fshr-rot-sub128.ll +++ b/llvm/test/CodeGen/X86/vector-fshr-rot-sub128.ll @@ -338,7 +338,7 @@ define <2 x i32> @constant_funnnel_v2i32(<2 x i32> %x) nounwind { ; AVX512F-LABEL: constant_funnnel_v2i32: ; AVX512F: # %bb.0: ; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 -; AVX512F-NEXT: vpbroadcastq {{.*#+}} xmm1 = [4,5,4,5] +; AVX512F-NEXT: vmovq {{.*#+}} xmm1 = [4,5,0,0] ; AVX512F-NEXT: vprorvd %zmm1, %zmm0, %zmm0 ; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 ; 
AVX512F-NEXT: vzeroupper @@ -352,7 +352,7 @@ define <2 x i32> @constant_funnnel_v2i32(<2 x i32> %x) nounwind { ; AVX512BW-LABEL: constant_funnnel_v2i32: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 -; AVX512BW-NEXT: vpbroadcastq {{.*#+}} xmm1 = [4,5,4,5] +; AVX512BW-NEXT: vmovq {{.*#+}} xmm1 = [4,5,0,0] ; AVX512BW-NEXT: vprorvd %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 ; AVX512BW-NEXT: vzeroupper @@ -366,7 +366,7 @@ define <2 x i32> @constant_funnnel_v2i32(<2 x i32> %x) nounwind { ; AVX512VBMI2-LABEL: constant_funnnel_v2i32: ; AVX512VBMI2: # %bb.0: ; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 -; AVX512VBMI2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [4,5,4,5] +; AVX512VBMI2-NEXT: vmovq {{.*#+}} xmm1 = [4,5,0,0] ; AVX512VBMI2-NEXT: vprorvd %zmm1, %zmm0, %zmm0 ; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 ; AVX512VBMI2-NEXT: vzeroupper diff --git a/llvm/test/CodeGen/X86/vector-fshr-sub128.ll b/llvm/test/CodeGen/X86/vector-fshr-sub128.ll index 56896927e7e5ad..1add344e3e41fb 100644 --- a/llvm/test/CodeGen/X86/vector-fshr-sub128.ll +++ b/llvm/test/CodeGen/X86/vector-fshr-sub128.ll @@ -454,7 +454,7 @@ define <2 x i32> @constant_funnnel_v2i32(<2 x i32> %x, <2 x i32> %y) nounwind { ; AVX512VBMI2: # %bb.0: ; AVX512VBMI2-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1 ; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 -; AVX512VBMI2-NEXT: vpbroadcastq {{.*#+}} xmm2 = [4,5,4,5] +; AVX512VBMI2-NEXT: vmovq {{.*#+}} xmm2 = [4,5,0,0] ; AVX512VBMI2-NEXT: vpshrdvd %zmm2, %zmm0, %zmm1 ; AVX512VBMI2-NEXT: vmovdqa %xmm1, %xmm0 ; AVX512VBMI2-NEXT: vzeroupper diff --git a/llvm/test/CodeGen/X86/vector-half-conversions.ll b/llvm/test/CodeGen/X86/vector-half-conversions.ll index 21533818bac11a..f59960f06f4a11 100644 --- a/llvm/test/CodeGen/X86/vector-half-conversions.ll +++ b/llvm/test/CodeGen/X86/vector-half-conversions.ll @@ -3173,7 +3173,7 @@ define <2 x i16> @cvt_2f64_to_2i16(<2 x double> %a0) nounwind { ; AVX512F-NEXT: callq __truncdfhf2@PLT ; AVX512F-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; AVX512F-NEXT: vmovaps {{.*#+}} xmm1 = [16,0,0,0] +; AVX512F-NEXT: vmovss {{.*#+}} xmm1 = [16,0,0,0] ; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-NEXT: vpermt2ps %zmm2, %zmm1, %zmm0 ; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 @@ -3195,7 +3195,7 @@ define <2 x i16> @cvt_2f64_to_2i16(<2 x double> %a0) nounwind { ; AVX512-FASTLANE-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill ; AVX512-FASTLANE-NEXT: callq __truncdfhf2@PLT ; AVX512-FASTLANE-NEXT: vpbroadcastw %xmm0, %xmm1 -; AVX512-FASTLANE-NEXT: vmovaps {{.*#+}} xmm0 = [4,0,0,0] +; AVX512-FASTLANE-NEXT: vmovss {{.*#+}} xmm0 = [4,0,0,0] ; AVX512-FASTLANE-NEXT: vpermi2ps (%rsp), %xmm1, %xmm0 # 16-byte Folded Reload ; AVX512-FASTLANE-NEXT: addq $40, %rsp ; AVX512-FASTLANE-NEXT: retq diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-3.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-3.ll index d2bef6b234e383..47aab9e264c19a 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-3.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-3.ll @@ -240,11 +240,11 @@ define void @load_i16_stride3_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; ; AVX512BW-SLOW-LABEL: load_i16_stride3_vf4: ; AVX512BW-SLOW: # %bb.0: -; AVX512BW-SLOW-NEXT: 
vpbroadcastq {{.*#+}} xmm0 = [0,3,6,9,0,3,6,9] +; AVX512BW-SLOW-NEXT: vmovq {{.*#+}} xmm0 = [0,3,6,9,0,0,0,0] ; AVX512BW-SLOW-NEXT: vmovdqa (%rdi), %xmm1 ; AVX512BW-SLOW-NEXT: vmovdqa 16(%rdi), %xmm2 ; AVX512BW-SLOW-NEXT: vpermi2w %xmm2, %xmm1, %xmm0 -; AVX512BW-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm3 = [1,4,7,10,1,4,7,10] +; AVX512BW-SLOW-NEXT: vmovq {{.*#+}} xmm3 = [1,4,7,10,0,0,0,0] ; AVX512BW-SLOW-NEXT: vpermi2w %xmm2, %xmm1, %xmm3 ; AVX512BW-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,3,2,3,4,5,6,7] ; AVX512BW-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,1,2,3] @@ -257,13 +257,13 @@ define void @load_i16_stride3_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; ; AVX512BW-FAST-LABEL: load_i16_stride3_vf4: ; AVX512BW-FAST: # %bb.0: -; AVX512BW-FAST-NEXT: vpbroadcastq {{.*#+}} xmm0 = [0,3,6,9,0,3,6,9] +; AVX512BW-FAST-NEXT: vmovq {{.*#+}} xmm0 = [0,3,6,9,0,0,0,0] ; AVX512BW-FAST-NEXT: vmovdqa (%rdi), %xmm1 ; AVX512BW-FAST-NEXT: vmovdqa 16(%rdi), %xmm2 ; AVX512BW-FAST-NEXT: vpermi2w %xmm2, %xmm1, %xmm0 -; AVX512BW-FAST-NEXT: vpbroadcastq {{.*#+}} xmm3 = [1,4,7,10,1,4,7,10] +; AVX512BW-FAST-NEXT: vmovq {{.*#+}} xmm3 = [1,4,7,10,0,0,0,0] ; AVX512BW-FAST-NEXT: vpermi2w %xmm2, %xmm1, %xmm3 -; AVX512BW-FAST-NEXT: vpbroadcastq {{.*#+}} xmm4 = [2,5,8,11,2,5,8,11] +; AVX512BW-FAST-NEXT: vmovq {{.*#+}} xmm4 = [2,5,8,11,0,0,0,0] ; AVX512BW-FAST-NEXT: vpermi2w %xmm2, %xmm1, %xmm4 ; AVX512BW-FAST-NEXT: vmovq %xmm0, (%rsi) ; AVX512BW-FAST-NEXT: vmovq %xmm3, (%rdx) @@ -967,8 +967,7 @@ define void @load_i16_stride3_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm11 = xmm0[0,1],xmm1[2],xmm0[3,4],xmm1[5],xmm0[6,7] ; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm10 = [4,5,4,5,4,5,4,5,10,11,0,1,6,7,12,13] ; AVX1-ONLY-NEXT: vpshufb %xmm10, %xmm11, %xmm11 -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm3 = [2,3,8,9,14,15,0,0,2,3,8,9,14,15,0,0] -; AVX1-ONLY-NEXT: # xmm3 = mem[0,0] +; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm3 = [2,3,8,9,14,15,0,0,0,0,0,0,0,0,0,0] ; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm2, %xmm15 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm15[0,1,2],xmm11[3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -1807,8 +1806,7 @@ define void @load_i16_stride3_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm8[0,1],xmm13[2],xmm8[3,4],xmm13[5],xmm8[6,7] ; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm1 = [4,5,4,5,4,5,4,5,10,11,0,1,6,7,12,13] ; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm3, %xmm14 -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm3 = [2,3,8,9,14,15,0,0,2,3,8,9,14,15,0,0] -; AVX1-ONLY-NEXT: # xmm3 = mem[0,0] +; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm3 = [2,3,8,9,14,15,0,0,0,0,0,0,0,0,0,0] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm0, %xmm13 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm13[0,1,2],xmm14[3,4,5,6,7] diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-5.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-5.ll index 904f8ddd2e7f44..0d5445dc194cec 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-5.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-5.ll @@ -409,22 +409,22 @@ define void @load_i16_stride5_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; ; AVX512BW-LABEL: load_i16_stride5_vf4: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpbroadcastq {{.*#+}} xmm0 = [1,6,11,0,1,6,11,0] +; AVX512BW-NEXT: vmovq {{.*#+}} xmm0 = [1,6,11,0,0,0,0,0] ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm1 ; 
AVX512BW-NEXT: vpermw %zmm1, %zmm0, %zmm0 -; AVX512BW-NEXT: vpbroadcastq {{.*#+}} xmm2 = [0,5,10,0,0,5,10,0] +; AVX512BW-NEXT: vmovq {{.*#+}} xmm2 = [0,5,10,0,0,0,0,0] ; AVX512BW-NEXT: vpermw %zmm1, %zmm2, %zmm1 ; AVX512BW-NEXT: vmovdqa 16(%rdi), %xmm2 ; AVX512BW-NEXT: vpextrw $7, %xmm2, %eax ; AVX512BW-NEXT: vpinsrw $3, %eax, %xmm1, %xmm1 ; AVX512BW-NEXT: vpinsrw $3, 32(%rdi), %xmm0, %xmm0 -; AVX512BW-NEXT: vpbroadcastq {{.*#+}} xmm2 = [2,7,12,17,2,7,12,17] +; AVX512BW-NEXT: vmovq {{.*#+}} xmm2 = [2,7,12,17,0,0,0,0] ; AVX512BW-NEXT: vmovdqa 32(%rdi), %ymm3 ; AVX512BW-NEXT: vmovdqa (%rdi), %ymm4 ; AVX512BW-NEXT: vpermi2w %ymm3, %ymm4, %ymm2 -; AVX512BW-NEXT: vpbroadcastq {{.*#+}} xmm5 = [3,8,13,18,3,8,13,18] +; AVX512BW-NEXT: vmovq {{.*#+}} xmm5 = [3,8,13,18,0,0,0,0] ; AVX512BW-NEXT: vpermi2w %ymm3, %ymm4, %ymm5 -; AVX512BW-NEXT: vpbroadcastq {{.*#+}} xmm6 = [4,9,14,19,4,9,14,19] +; AVX512BW-NEXT: vmovq {{.*#+}} xmm6 = [4,9,14,19,0,0,0,0] ; AVX512BW-NEXT: vpermi2w %ymm3, %ymm4, %ymm6 ; AVX512BW-NEXT: vmovq %xmm1, (%rsi) ; AVX512BW-NEXT: vmovq %xmm0, (%rdx) @@ -7449,7 +7449,7 @@ define void @load_i16_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-FAST-NEXT: vmovdqa64 %xmm31, %xmm15 ; AVX512F-FAST-NEXT: vmovdqa64 %xmm20, %xmm7 ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm15[0],xmm7[1],xmm15[2,3] -; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} xmm0 = [6,7,0,1,10,11,0,0,6,7,0,1,10,11,0,0] +; AVX512F-FAST-NEXT: vmovq {{.*#+}} xmm0 = [6,7,0,1,10,11,0,0,0,0,0,0,0,0,0,0] ; AVX512F-FAST-NEXT: vpshufb %xmm0, %xmm1, %xmm2 ; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} ymm18 = [1,3,6,0,5,u,u,u] ; AVX512F-FAST-NEXT: vmovdqa64 %ymm30, %ymm9 @@ -7482,7 +7482,7 @@ define void @load_i16_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm24 ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm15[0,1],xmm7[2],xmm15[3] ; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm9[0],ymm10[1,2],ymm9[3],ymm10[4],ymm9[5],ymm10[6,7],ymm9[8],ymm10[9,10],ymm9[11],ymm10[12],ymm9[13],ymm10[14,15] -; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} xmm2 = [8,9,2,3,12,13,0,0,8,9,2,3,12,13,0,0] +; AVX512F-FAST-NEXT: vmovq {{.*#+}} xmm2 = [8,9,2,3,12,13,0,0,0,0,0,0,0,0,0,0] ; AVX512F-FAST-NEXT: vpshufb %xmm2, %xmm0, %xmm3 ; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} ymm27 = [1,4,6,3,6,u,u,u] ; AVX512F-FAST-NEXT: vpermd %ymm1, %ymm27, %ymm1 diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-6.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-6.ll index fffe46d3cbed1d..3efae5c1150456 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-6.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-6.ll @@ -213,7 +213,7 @@ define void @load_i16_stride6_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-FAST-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[1,3,2,3,4,5,6,7] ; AVX512BW-FAST-NEXT: vpbroadcastw 4(%rdi), %xmm4 ; AVX512BW-FAST-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3] -; AVX512BW-FAST-NEXT: vpbroadcastd {{.*#+}} xmm5 = [3,9,3,9,3,9,3,9] +; AVX512BW-FAST-NEXT: vmovd {{.*#+}} xmm5 = [3,9,0,0,0,0,0,0] ; AVX512BW-FAST-NEXT: vpermi2w %xmm1, %xmm0, %xmm5 ; AVX512BW-FAST-NEXT: vpbroadcastw 20(%rdi), %xmm6 ; AVX512BW-FAST-NEXT: vpbroadcastw 8(%rdi), %xmm7 @@ -501,19 +501,19 @@ define void @load_i16_stride6_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-LABEL: load_i16_stride6_vf4: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; 
AVX512BW-NEXT: vpbroadcastq {{.*#+}} xmm0 = [0,6,12,18,0,6,12,18] +; AVX512BW-NEXT: vmovq {{.*#+}} xmm0 = [0,6,12,18,0,0,0,0] ; AVX512BW-NEXT: vmovdqa (%rdi), %ymm1 ; AVX512BW-NEXT: vmovdqa 32(%rdi), %ymm2 ; AVX512BW-NEXT: vpermi2w %ymm2, %ymm1, %ymm0 -; AVX512BW-NEXT: vpbroadcastq {{.*#+}} xmm3 = [1,7,13,19,1,7,13,19] +; AVX512BW-NEXT: vmovq {{.*#+}} xmm3 = [1,7,13,19,0,0,0,0] ; AVX512BW-NEXT: vpermi2w %ymm2, %ymm1, %ymm3 -; AVX512BW-NEXT: vpbroadcastq {{.*#+}} xmm4 = [2,8,14,20,2,8,14,20] +; AVX512BW-NEXT: vmovq {{.*#+}} xmm4 = [2,8,14,20,0,0,0,0] ; AVX512BW-NEXT: vpermi2w %ymm2, %ymm1, %ymm4 -; AVX512BW-NEXT: vpbroadcastq {{.*#+}} xmm5 = [3,9,15,21,3,9,15,21] +; AVX512BW-NEXT: vmovq {{.*#+}} xmm5 = [3,9,15,21,0,0,0,0] ; AVX512BW-NEXT: vpermi2w %ymm2, %ymm1, %ymm5 -; AVX512BW-NEXT: vpbroadcastq {{.*#+}} xmm6 = [4,10,16,22,4,10,16,22] +; AVX512BW-NEXT: vmovq {{.*#+}} xmm6 = [4,10,16,22,0,0,0,0] ; AVX512BW-NEXT: vpermi2w %ymm2, %ymm1, %ymm6 -; AVX512BW-NEXT: vpbroadcastq {{.*#+}} xmm7 = [5,11,17,23,5,11,17,23] +; AVX512BW-NEXT: vmovq {{.*#+}} xmm7 = [5,11,17,23,0,0,0,0] ; AVX512BW-NEXT: vpermi2w %ymm2, %ymm1, %ymm7 ; AVX512BW-NEXT: vmovq %xmm0, (%rsi) ; AVX512BW-NEXT: vmovq %xmm3, (%rdx) diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-7.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-7.ll index 4b32647f6fb3e7..853f94c84d1a8e 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-7.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-7.ll @@ -240,7 +240,7 @@ define void @load_i16_stride7_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-FAST-NEXT: vpsrlq $48, %xmm1, %xmm8 ; AVX512BW-FAST-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3] ; AVX512BW-FAST-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7] -; AVX512BW-FAST-NEXT: vpbroadcastd {{.*#+}} xmm8 = [6,13,6,13,6,13,6,13] +; AVX512BW-FAST-NEXT: vmovd {{.*#+}} xmm8 = [6,13,0,0,0,0,0,0] ; AVX512BW-FAST-NEXT: vpermi2w %xmm1, %xmm0, %xmm8 ; AVX512BW-FAST-NEXT: vmovd %xmm2, (%rsi) ; AVX512BW-FAST-NEXT: vmovd %xmm4, (%rdx) @@ -658,21 +658,21 @@ define void @load_i16_stride7_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX512BW-NEXT: vpbroadcastq {{.*#+}} xmm0 = [0,7,14,21,0,7,14,21] +; AVX512BW-NEXT: vmovq {{.*#+}} xmm0 = [0,7,14,21,0,0,0,0] ; AVX512BW-NEXT: vmovdqa (%rdi), %ymm1 ; AVX512BW-NEXT: vmovdqa 32(%rdi), %ymm2 ; AVX512BW-NEXT: vpermi2w %ymm2, %ymm1, %ymm0 -; AVX512BW-NEXT: vpbroadcastq {{.*#+}} xmm3 = [1,8,15,22,1,8,15,22] +; AVX512BW-NEXT: vmovq {{.*#+}} xmm3 = [1,8,15,22,0,0,0,0] ; AVX512BW-NEXT: vpermi2w %ymm2, %ymm1, %ymm3 -; AVX512BW-NEXT: vpbroadcastq {{.*#+}} xmm4 = [2,9,16,23,2,9,16,23] +; AVX512BW-NEXT: vmovq {{.*#+}} xmm4 = [2,9,16,23,0,0,0,0] ; AVX512BW-NEXT: vpermi2w %ymm2, %ymm1, %ymm4 -; AVX512BW-NEXT: vpbroadcastq {{.*#+}} xmm5 = [3,10,17,24,3,10,17,24] +; AVX512BW-NEXT: vmovq {{.*#+}} xmm5 = [3,10,17,24,0,0,0,0] ; AVX512BW-NEXT: vpermi2w %ymm2, %ymm1, %ymm5 -; AVX512BW-NEXT: vpbroadcastq {{.*#+}} xmm6 = [4,11,18,25,4,11,18,25] +; AVX512BW-NEXT: vmovq {{.*#+}} xmm6 = [4,11,18,25,0,0,0,0] ; AVX512BW-NEXT: vpermi2w %ymm2, %ymm1, %ymm6 -; AVX512BW-NEXT: vpbroadcastq {{.*#+}} xmm7 = [5,12,19,26,5,12,19,26] +; AVX512BW-NEXT: vmovq {{.*#+}} xmm7 = [5,12,19,26,0,0,0,0] ; AVX512BW-NEXT: vpermi2w %ymm2, %ymm1, %ymm7 -; AVX512BW-NEXT: 
vpbroadcastq {{.*#+}} xmm8 = [6,13,20,27,6,13,20,27] +; AVX512BW-NEXT: vmovq {{.*#+}} xmm8 = [6,13,20,27,0,0,0,0] ; AVX512BW-NEXT: vpermi2w %ymm2, %ymm1, %ymm8 ; AVX512BW-NEXT: vmovq %xmm0, (%rsi) ; AVX512BW-NEXT: vmovq %xmm3, (%rdx) @@ -1343,7 +1343,7 @@ define void @load_i16_stride7_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[u,u,u,u,u,u,u,u,0,1,14,15,u,u,10,11] ; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1,2,3,4,5],xmm2[6],xmm9[7] ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm10 = ymm5[0,1],ymm4[2],ymm5[3,4],ymm4[5],ymm5[6,7] -; AVX512F-FAST-NEXT: vpbroadcastd {{.*#+}} xmm11 = [8,9,4,5,8,9,4,5,8,9,4,5,8,9,4,5] +; AVX512F-FAST-NEXT: vmovd {{.*#+}} xmm11 = [8,9,4,5,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX512F-FAST-NEXT: vpshufb %xmm11, %xmm10, %xmm12 ; AVX512F-FAST-NEXT: vextracti128 $1, %ymm10, %xmm10 ; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} xmm10 = xmm10[3,1,2,3,4,5,6,7] @@ -2380,7 +2380,7 @@ define void @load_i16_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm12 = ymm11[0],ymm12[1,2,3,4,5,6,7],ymm11[8],ymm12[9,10,11,12,13,14,15] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm12[4,5,6,7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm14 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7] -; AVX2-FAST-NEXT: vpbroadcastd {{.*#+}} xmm12 = [8,9,4,5,8,9,4,5,8,9,4,5,8,9,4,5] +; AVX2-FAST-NEXT: vmovd {{.*#+}} xmm12 = [8,9,4,5,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX2-FAST-NEXT: vpshufb %xmm12, %xmm14, %xmm15 ; AVX2-FAST-NEXT: vextracti128 $1, %ymm14, %xmm14 ; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm14 = xmm14[3,1,2,3,4,5,6,7] @@ -2546,7 +2546,7 @@ define void @load_i16_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm10 = ymm11[0],ymm10[1,2,3,4,5,6,7],ymm11[8],ymm10[9,10,11,12,13,14,15] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm10 = ymm11[0,1,2,3],ymm10[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm13 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7] -; AVX2-FAST-PERLANE-NEXT: vpbroadcastd {{.*#+}} xmm11 = [8,9,4,5,8,9,4,5,8,9,4,5,8,9,4,5] +; AVX2-FAST-PERLANE-NEXT: vmovd {{.*#+}} xmm11 = [8,9,4,5,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm11, %xmm13, %xmm14 ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm13, %xmm13 ; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm13 = xmm13[3,1,2,3,4,5,6,7] @@ -5161,7 +5161,7 @@ define void @load_i16_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm6[0,1,2,3,4],ymm1[5,6,7],ymm6[8,9,10,11,12],ymm1[13,14,15] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm15[0,1],ymm11[2,3],ymm15[4,5],ymm11[6,7] ; AVX2-FAST-NEXT: vextracti128 $1, %ymm6, %xmm15 -; AVX2-FAST-NEXT: vpbroadcastd {{.*#+}} xmm2 = [8,9,4,5,8,9,4,5,8,9,4,5,8,9,4,5] +; AVX2-FAST-NEXT: vmovd {{.*#+}} xmm2 = [8,9,4,5,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm15, %xmm15 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[10,11,6,7,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm15[0],xmm6[1],xmm15[1],xmm6[2],xmm15[2],xmm6[3],xmm15[3] @@ -5496,7 +5496,7 @@ define void @load_i16_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm2 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm2 = ymm15[0,1],mem[2],ymm15[3,4],mem[5],ymm15[6,7] -; AVX2-FAST-PERLANE-NEXT: 
vpbroadcastd {{.*#+}} xmm1 = [8,9,4,5,8,9,4,5,8,9,4,5,8,9,4,5] +; AVX2-FAST-PERLANE-NEXT: vmovd {{.*#+}} xmm1 = [8,9,4,5,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm2, %xmm3 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm1, %xmm14 ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm2, %xmm2 @@ -5566,7 +5566,7 @@ define void @load_i16_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm8 = ymm8[0,1,2,3,4],ymm10[5,6,7],ymm8[8,9,10,11,12],ymm10[13,14,15] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm10 = ymm15[0,1],ymm12[2,3],ymm15[4,5],ymm12[6,7] ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm10, %xmm14 -; AVX2-FAST-PERLANE-NEXT: vpbroadcastd {{.*#+}} xmm1 = [8,9,4,5,8,9,4,5,8,9,4,5,8,9,4,5] +; AVX2-FAST-PERLANE-NEXT: vmovd {{.*#+}} xmm1 = [8,9,4,5,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm14, %xmm14 ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[10,11,6,7,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm10[0],xmm14[0],xmm10[1],xmm14[1],xmm10[2],xmm14[2],xmm10[3],xmm14[3] @@ -11374,7 +11374,7 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-FAST-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm1 = mem[0,1],ymm1[2],mem[3,4],ymm1[5],mem[6,7] -; AVX2-FAST-NEXT: vpbroadcastd {{.*#+}} xmm4 = [8,9,4,5,8,9,4,5,8,9,4,5,8,9,4,5] +; AVX2-FAST-NEXT: vmovd {{.*#+}} xmm4 = [8,9,4,5,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX2-FAST-NEXT: vpshufb %xmm4, %xmm1, %xmm3 ; AVX2-FAST-NEXT: vmovdqa %xmm4, %xmm10 ; AVX2-FAST-NEXT: vextracti128 $1, %ymm1, %xmm1 @@ -11431,7 +11431,7 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm0 = mem[0,1],ymm0[2],mem[3,4],ymm0[5],mem[6,7] -; AVX2-FAST-NEXT: vpbroadcastd {{.*#+}} xmm1 = [8,9,4,5,8,9,4,5,8,9,4,5,8,9,4,5] +; AVX2-FAST-NEXT: vmovd {{.*#+}} xmm1 = [8,9,4,5,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm0, %xmm3 ; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm0 ; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7] @@ -11493,7 +11493,7 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm12 = ymm2[0,1,2,3,4],ymm1[5,6,7],ymm2[8,9,10,11,12],ymm1[13,14,15] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm13[0,1],ymm7[2,3],ymm13[4,5],ymm7[6,7] ; AVX2-FAST-NEXT: vextracti128 $1, %ymm1, %xmm2 -; AVX2-FAST-NEXT: vpbroadcastd {{.*#+}} xmm4 = [8,9,4,5,8,9,4,5,8,9,4,5,8,9,4,5] +; AVX2-FAST-NEXT: vmovd {{.*#+}} xmm4 = [8,9,4,5,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX2-FAST-NEXT: vpshufb %xmm4, %xmm2, %xmm2 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[10,11,6,7,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] @@ -11522,7 +11522,7 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-NEXT: vextracti128 $1, %ymm12, %xmm13 ; AVX2-FAST-NEXT: vpshufb %xmm4, %xmm13, %xmm13 ; AVX2-FAST-NEXT: vmovdqa %xmm4, %xmm6 -; AVX2-FAST-NEXT: vpbroadcastd {{.*#+}} xmm2 = [10,11,6,7,10,11,6,7,10,11,6,7,10,11,6,7] +; AVX2-FAST-NEXT: vmovd {{.*#+}} xmm2 = 
[10,11,6,7,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm12, %xmm12 ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm12[0],xmm13[0],xmm12[1],xmm13[1],xmm12[2],xmm13[2],xmm12[3],xmm13[3] ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload @@ -12194,7 +12194,7 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm2 = mem[0,1],ymm2[2],mem[3,4],ymm2[5],mem[6,7] -; AVX2-FAST-PERLANE-NEXT: vpbroadcastd {{.*#+}} xmm6 = [8,9,4,5,8,9,4,5,8,9,4,5,8,9,4,5] +; AVX2-FAST-PERLANE-NEXT: vmovd {{.*#+}} xmm6 = [8,9,4,5,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm2, %xmm4 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm6, %xmm12 ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm2, %xmm2 @@ -12324,9 +12324,9 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-PERLANE-NEXT: vpblendd $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm2 = mem[0,1],ymm2[2,3],mem[4,5],ymm2[6,7] ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm2, %xmm4 -; AVX2-FAST-PERLANE-NEXT: vpbroadcastd {{.*#+}} xmm13 = [8,9,4,5,8,9,4,5,8,9,4,5,8,9,4,5] +; AVX2-FAST-PERLANE-NEXT: vmovd {{.*#+}} xmm13 = [8,9,4,5,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm13, %xmm4, %xmm8 -; AVX2-FAST-PERLANE-NEXT: vpbroadcastd {{.*#+}} xmm4 = [10,11,6,7,10,11,6,7,10,11,6,7,10,11,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovd {{.*#+}} xmm4 = [10,11,6,7,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm4, %xmm2, %xmm2 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm2[0],xmm8[0],xmm2[1],xmm8[1],xmm2[2],xmm8[2],xmm2[3],xmm8[3] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm2 = [u,u,u,u,u,u,u,u,2,3,0,1,14,15,12,13,26,27,26,27,26,27,26,27,18,19,16,17,30,31,28,29] diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-8.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-8.ll index 78065bc73c1d3f..054afe2dfdf2ed 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-8.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-8.ll @@ -271,23 +271,23 @@ define void @load_i16_stride8_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %r11 -; AVX512BW-NEXT: vpbroadcastq {{.*#+}} xmm0 = [0,8,16,24,0,8,16,24] +; AVX512BW-NEXT: vmovq {{.*#+}} xmm0 = [0,8,16,24,0,0,0,0] ; AVX512BW-NEXT: vmovdqa (%rdi), %ymm1 ; AVX512BW-NEXT: vmovdqa 32(%rdi), %ymm2 ; AVX512BW-NEXT: vpermi2w %ymm2, %ymm1, %ymm0 -; AVX512BW-NEXT: vpbroadcastq {{.*#+}} xmm3 = [1,9,17,25,1,9,17,25] +; AVX512BW-NEXT: vmovq {{.*#+}} xmm3 = [1,9,17,25,0,0,0,0] ; AVX512BW-NEXT: vpermi2w %ymm2, %ymm1, %ymm3 -; AVX512BW-NEXT: vpbroadcastq {{.*#+}} xmm4 = [2,10,18,26,2,10,18,26] +; AVX512BW-NEXT: vmovq {{.*#+}} xmm4 = [2,10,18,26,0,0,0,0] ; AVX512BW-NEXT: vpermi2w %ymm2, %ymm1, %ymm4 -; AVX512BW-NEXT: vpbroadcastq {{.*#+}} xmm5 = [3,11,19,27,3,11,19,27] +; AVX512BW-NEXT: vmovq {{.*#+}} xmm5 = [3,11,19,27,0,0,0,0] ; AVX512BW-NEXT: vpermi2w %ymm2, %ymm1, %ymm5 -; AVX512BW-NEXT: vpbroadcastq {{.*#+}} xmm6 = [4,12,20,28,4,12,20,28] +; AVX512BW-NEXT: vmovq {{.*#+}} xmm6 = [4,12,20,28,0,0,0,0] ; AVX512BW-NEXT: vpermi2w %ymm2, %ymm1, 
%ymm6 -; AVX512BW-NEXT: vpbroadcastq {{.*#+}} xmm7 = [5,13,21,29,5,13,21,29] +; AVX512BW-NEXT: vmovq {{.*#+}} xmm7 = [5,13,21,29,0,0,0,0] ; AVX512BW-NEXT: vpermi2w %ymm2, %ymm1, %ymm7 -; AVX512BW-NEXT: vpbroadcastq {{.*#+}} xmm8 = [6,14,22,30,6,14,22,30] +; AVX512BW-NEXT: vmovq {{.*#+}} xmm8 = [6,14,22,30,0,0,0,0] ; AVX512BW-NEXT: vpermi2w %ymm2, %ymm1, %ymm8 -; AVX512BW-NEXT: vpbroadcastq {{.*#+}} xmm9 = [7,15,23,31,7,15,23,31] +; AVX512BW-NEXT: vmovq {{.*#+}} xmm9 = [7,15,23,31,0,0,0,0] ; AVX512BW-NEXT: vpermi2w %ymm2, %ymm1, %ymm9 ; AVX512BW-NEXT: vmovq %xmm0, (%rsi) ; AVX512BW-NEXT: vmovq %xmm3, (%rdx) @@ -560,7 +560,7 @@ define void @load_i16_stride8_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm7 = xmm15[0,1],xmm7[2,3] ; AVX512F-SLOW-NEXT: vpunpckhdq {{.*#+}} xmm15 = xmm14[2],xmm13[2],xmm14[3],xmm13[3] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm8 = xmm15[0,1],xmm8[2,3] -; AVX512F-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm15 = [3,7,3,7] +; AVX512F-SLOW-NEXT: vmovq {{.*#+}} xmm15 = [3,7,0,0] ; AVX512F-SLOW-NEXT: vpermt2d %xmm13, %xmm15, %xmm14 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm9 = xmm14[0,1],xmm9[2,3] ; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] @@ -615,7 +615,7 @@ define void @load_i16_stride8_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512F-FAST-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm15[0],xmm14[0],xmm15[1],xmm14[1] ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3] ; AVX512F-FAST-NEXT: vmovdqa64 %xmm0, %xmm16 -; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} xmm9 = [1,5,1,5] +; AVX512F-FAST-NEXT: vmovq {{.*#+}} xmm9 = [1,5,0,0] ; AVX512F-FAST-NEXT: vmovdqa %xmm15, %xmm2 ; AVX512F-FAST-NEXT: vpermt2d %xmm14, %xmm9, %xmm2 ; AVX512F-FAST-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm11[0],xmm10[0],xmm11[1],xmm10[1] @@ -626,7 +626,7 @@ define void @load_i16_stride8_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512F-FAST-NEXT: vpermt2d %xmm10, %xmm1, %xmm0 ; AVX512F-FAST-NEXT: vpunpckhdq {{.*#+}} xmm10 = xmm15[2],xmm14[2],xmm15[3],xmm14[3] ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm10[0,1],xmm0[2,3] -; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} xmm10 = [3,7,3,7] +; AVX512F-FAST-NEXT: vmovq {{.*#+}} xmm10 = [3,7,0,0] ; AVX512F-FAST-NEXT: vpermt2d %xmm14, %xmm10, %xmm15 ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm11 = xmm15[0,1],xmm11[2,3] ; AVX512F-FAST-NEXT: vmovdqa64 %xmm17, %xmm14 @@ -1353,7 +1353,7 @@ define void @load_i16_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0,1],xmm6[2,3] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1,2,3],ymm1[4,5,6,7] ; AVX512F-SLOW-NEXT: vmovdqa64 %ymm1, %ymm19 -; AVX512F-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm17 = [3,7,3,7] +; AVX512F-SLOW-NEXT: vmovq {{.*#+}} xmm17 = [3,7,0,0] ; AVX512F-SLOW-NEXT: vpermt2d %xmm4, %xmm17, %xmm14 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm14[0,1],xmm11[2,3] ; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15] @@ -1490,7 +1490,7 @@ define void @load_i16_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm3[6,7] ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm15[0,1,2,3],ymm1[4,5,6,7] ; AVX512F-FAST-NEXT: vmovdqa64 %ymm1, %ymm24 -; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} xmm15 = [1,5,1,5] +; AVX512F-FAST-NEXT: vmovq {{.*#+}} xmm15 = [1,5,0,0] ; AVX512F-FAST-NEXT: vmovdqa %xmm11, %xmm1 ; AVX512F-FAST-NEXT: 
vpermt2d %xmm5, %xmm15, %xmm1 ; AVX512F-FAST-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm12[0],xmm4[0],xmm12[1],xmm4[1] @@ -1523,7 +1523,7 @@ define void @load_i16_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0,1,2,3,4,5],ymm1[6,7] ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX512F-FAST-NEXT: vmovdqa64 %ymm0, %ymm21 -; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} xmm18 = [3,7,3,7] +; AVX512F-FAST-NEXT: vmovq {{.*#+}} xmm18 = [3,7,0,0] ; AVX512F-FAST-NEXT: vpermt2d %xmm5, %xmm18, %xmm11 ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm11[0,1],xmm2[2,3] ; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15] @@ -3238,7 +3238,7 @@ define void @load_i16_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm13[0,1,2,3],ymm6[4,5,6,7] ; AVX512F-SLOW-NEXT: vinserti64x4 $0, %ymm6, %zmm0, %zmm0 ; AVX512F-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm0 = [3,7,3,7] +; AVX512F-SLOW-NEXT: vmovq {{.*#+}} xmm0 = [3,7,0,0] ; AVX512F-SLOW-NEXT: vpermt2d %xmm11, %xmm0, %xmm7 ; AVX512F-SLOW-NEXT: vmovdqa %xmm0, %xmm6 ; AVX512F-SLOW-NEXT: vmovdqa64 %xmm25, %xmm0 @@ -3409,7 +3409,7 @@ define void @load_i16_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm12 = xmm14[0,1],xmm12[2,3] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm12[0,1,2,3],ymm11[4,5,6,7] ; AVX512F-SLOW-NEXT: vinserti64x4 $0, %ymm11, %zmm1, %zmm1 -; AVX512F-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm12 = [3,7,3,7] +; AVX512F-SLOW-NEXT: vmovq {{.*#+}} xmm12 = [3,7,0,0] ; AVX512F-SLOW-NEXT: vpermt2d %xmm16, %xmm12, %xmm4 ; AVX512F-SLOW-NEXT: vmovdqa64 %xmm31, %xmm11 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0,1],xmm11[2,3] @@ -3563,7 +3563,7 @@ define void @load_i16_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm0[4,5,6,7] ; AVX512F-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm11, %zmm0 ; AVX512F-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} xmm1 = [1,5,1,5] +; AVX512F-FAST-NEXT: vmovq {{.*#+}} xmm1 = [1,5,0,0] ; AVX512F-FAST-NEXT: vmovdqa %xmm9, %xmm0 ; AVX512F-FAST-NEXT: vpermt2d %xmm20, %xmm1, %xmm0 ; AVX512F-FAST-NEXT: vmovdqa %xmm1, %xmm11 @@ -3630,7 +3630,7 @@ define void @load_i16_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm0[4,5,6,7] ; AVX512F-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm0 ; AVX512F-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} xmm27 = [3,7,3,7] +; AVX512F-FAST-NEXT: vmovq {{.*#+}} xmm27 = [3,7,0,0] ; AVX512F-FAST-NEXT: vpermt2d %xmm20, %xmm27, %xmm9 ; AVX512F-FAST-NEXT: vmovdqa64 %xmm16, %xmm0 ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm9[0,1],xmm0[2,3] @@ -3733,7 +3733,7 @@ define void @load_i16_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0,1,2,3],ymm0[4,5,6,7] ; AVX512F-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm31, %zmm31 ; AVX512F-FAST-NEXT: vmovdqa %xmm5, %xmm0 -; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} xmm4 = [1,5,1,5] +; AVX512F-FAST-NEXT: vmovq {{.*#+}} xmm4 = [1,5,0,0] ; AVX512F-FAST-NEXT: vpermt2d %xmm18, %xmm4, %xmm0 ; AVX512F-FAST-NEXT: vpunpckldq {{.*#+}} xmm1 = 
xmm12[0],xmm24[0],xmm12[1],xmm24[1] ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] @@ -7297,7 +7297,7 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] ; AVX512F-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm20, %zmm0 ; AVX512F-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm0 = [3,7,3,7] +; AVX512F-SLOW-NEXT: vmovq {{.*#+}} xmm0 = [3,7,0,0] ; AVX512F-SLOW-NEXT: vmovdqa64 %xmm31, %xmm1 ; AVX512F-SLOW-NEXT: vpermt2d %xmm16, %xmm0, %xmm1 ; AVX512F-SLOW-NEXT: vmovdqa64 %xmm0, %xmm16 @@ -7711,7 +7711,7 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] ; AVX512F-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm20, %zmm0 -; AVX512F-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm16 = [3,7,3,7] +; AVX512F-SLOW-NEXT: vmovq {{.*#+}} xmm16 = [3,7,0,0] ; AVX512F-SLOW-NEXT: vmovdqa64 %xmm28, %xmm1 ; AVX512F-SLOW-NEXT: vpermt2d {{[-0-9]+}}(%r{{[sb]}}p), %xmm16, %xmm1 # 16-byte Folded Reload ; AVX512F-SLOW-NEXT: vpblendd $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload @@ -8050,7 +8050,7 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1,2,3],ymm0[4,5,6,7] ; AVX512F-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm9, %zmm0 ; AVX512F-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} xmm9 = [1,5,1,5] +; AVX512F-FAST-NEXT: vmovq {{.*#+}} xmm9 = [1,5,0,0] ; AVX512F-FAST-NEXT: vmovdqa64 %xmm30, %xmm0 ; AVX512F-FAST-NEXT: vpermt2d %xmm17, %xmm9, %xmm0 ; AVX512F-FAST-NEXT: vmovdqa64 %xmm17, %xmm30 @@ -8225,7 +8225,7 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] ; AVX512F-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm24, %zmm0 ; AVX512F-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} xmm0 = [3,7,3,7] +; AVX512F-FAST-NEXT: vmovq {{.*#+}} xmm0 = [3,7,0,0] ; AVX512F-FAST-NEXT: vmovdqa64 %xmm31, %xmm1 ; AVX512F-FAST-NEXT: vpermt2d %xmm28, %xmm0, %xmm1 ; AVX512F-FAST-NEXT: vmovdqa64 %xmm0, %xmm16 @@ -8477,7 +8477,7 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-FAST-NEXT: vmovdqa64 %xmm23, %xmm0 ; AVX512F-FAST-NEXT: vmovdqa64 %xmm23, (%rsp) # 16-byte Spill -; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} xmm15 = [1,5,1,5] +; AVX512F-FAST-NEXT: vmovq {{.*#+}} xmm15 = [1,5,0,0] ; AVX512F-FAST-NEXT: vpermt2d %xmm19, %xmm15, %xmm0 ; AVX512F-FAST-NEXT: vmovdqa64 %xmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512F-FAST-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm9[0],xmm29[0],xmm9[1],xmm29[1] @@ -8642,7 +8642,7 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] ; AVX512F-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm21, %zmm21 -; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} xmm17 = [3,7,3,7] +; AVX512F-FAST-NEXT: vmovq {{.*#+}} xmm17 = [3,7,0,0] ; AVX512F-FAST-NEXT: 
vmovdqa (%rsp), %xmm0 # 16-byte Reload ; AVX512F-FAST-NEXT: vpermt2d {{[-0-9]+}}(%r{{[sb]}}p), %xmm17, %xmm0 # 16-byte Folded Reload ; AVX512F-FAST-NEXT: vpblendd $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm1 # 16-byte Folded Reload diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-3.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-3.ll index 4ed9a99c58c3f1..cce2340d214aa6 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-3.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-3.ll @@ -78,7 +78,7 @@ define void @load_i32_stride3_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-FAST-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512-FAST-NEXT: vmovdqa 16(%rdi), %xmm1 ; AVX512-FAST-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[0,3,2,3] -; AVX512-FAST-NEXT: vpbroadcastq {{.*#+}} xmm3 = [5,0,5,0] +; AVX512-FAST-NEXT: vmovd {{.*#+}} xmm3 = [5,0,0,0] ; AVX512-FAST-NEXT: vpermi2d %xmm0, %xmm1, %xmm3 ; AVX512-FAST-NEXT: vpbroadcastd 8(%rdi), %xmm0 ; AVX512-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3] diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-4.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-4.ll index 40355e3f7686f6..30ac7dc443182e 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-4.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-4.ll @@ -78,7 +78,7 @@ define void @load_i32_stride4_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-FAST-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512-FAST-NEXT: vmovdqa 16(%rdi), %xmm1 ; AVX512-FAST-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; AVX512-FAST-NEXT: vpbroadcastq {{.*#+}} xmm3 = [1,5,1,5] +; AVX512-FAST-NEXT: vmovq {{.*#+}} xmm3 = [1,5,0,0] ; AVX512-FAST-NEXT: vpermi2d %xmm1, %xmm0, %xmm3 ; AVX512-FAST-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; AVX512-FAST-NEXT: vmovq %xmm2, (%rsi) @@ -160,8 +160,7 @@ define void @load_i32_stride4_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm5[0,1],xmm0[2,3] ; AVX2-ONLY-NEXT: vmovaps 48(%rdi), %xmm5 ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm6 = xmm4[0],xmm5[0],xmm4[1],xmm5[1] -; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm7 = [1,5,1,5] -; AVX2-ONLY-NEXT: # xmm7 = mem[0,0] +; AVX2-ONLY-NEXT: vmovsd {{.*#+}} xmm7 = [1,5,0,0] ; AVX2-ONLY-NEXT: vmovaps (%rdi), %ymm8 ; AVX2-ONLY-NEXT: vpermps %ymm8, %ymm7, %ymm7 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm6 = xmm7[0,1],xmm6[2,3] @@ -171,8 +170,7 @@ define void @load_i32_stride4_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm2 = xmm2[2],xmm3[2],xmm2[3],xmm3[3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] ; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm2 = xmm4[2],xmm5[2],xmm4[3],xmm5[3] -; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm3 = [3,7,3,7] -; AVX2-ONLY-NEXT: # xmm3 = mem[0,0] +; AVX2-ONLY-NEXT: vmovsd {{.*#+}} xmm3 = [3,7,0,0] ; AVX2-ONLY-NEXT: vpermps %ymm8, %ymm3, %ymm3 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3] ; AVX2-ONLY-NEXT: vmovaps %xmm0, (%rsi) diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-6.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-6.ll index d3f9b1c4e15a72..317875ca0ba7a0 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-6.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-6.ll @@ -86,12 +86,10 @@ define void @load_i32_stride6_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; 
AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm5 = xmm5[2,0,2,3] ; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[3,3,3,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0],xmm3[1],xmm1[2,3] -; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm3 = [4,2,4,2] -; AVX2-ONLY-NEXT: # xmm3 = mem[0,0] +; AVX2-ONLY-NEXT: vmovsd {{.*#+}} xmm3 = [4,2,0,0] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] ; AVX2-ONLY-NEXT: vpermps %ymm0, %ymm3, %ymm3 -; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm6 = [5,3,5,3] -; AVX2-ONLY-NEXT: # xmm6 = mem[0,0] +; AVX2-ONLY-NEXT: vmovsd {{.*#+}} xmm6 = [5,3,0,0] ; AVX2-ONLY-NEXT: vpermps %ymm0, %ymm6, %ymm0 ; AVX2-ONLY-NEXT: vmovlps %xmm4, (%rsi) ; AVX2-ONLY-NEXT: vmovlps %xmm2, (%rdx) @@ -118,13 +116,11 @@ define void @load_i32_stride6_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-SLOW-NEXT: vpinsrd $1, %r10d, %xmm4, %xmm4 ; AVX512-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,3,3,3] ; AVX512-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3] -; AVX512-SLOW-NEXT: vmovddup {{.*#+}} xmm2 = [4,2,4,2] -; AVX512-SLOW-NEXT: # xmm2 = mem[0,0] +; AVX512-SLOW-NEXT: vmovsd {{.*#+}} xmm2 = [4,2,0,0] ; AVX512-SLOW-NEXT: vmovaps 32(%rdi), %ymm5 ; AVX512-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],mem[4,5,6,7] ; AVX512-SLOW-NEXT: vpermps %ymm5, %ymm2, %ymm2 -; AVX512-SLOW-NEXT: vmovddup {{.*#+}} xmm6 = [5,3,5,3] -; AVX512-SLOW-NEXT: # xmm6 = mem[0,0] +; AVX512-SLOW-NEXT: vmovsd {{.*#+}} xmm6 = [5,3,0,0] ; AVX512-SLOW-NEXT: vpermps %ymm5, %ymm6, %ymm5 ; AVX512-SLOW-NEXT: vmovq %xmm3, (%rsi) ; AVX512-SLOW-NEXT: vmovq %xmm1, (%rdx) @@ -147,15 +143,13 @@ define void @load_i32_stride6_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-FAST-NEXT: vpermi2d %xmm2, %xmm1, %xmm4 ; AVX512-FAST-NEXT: vpbroadcastq {{.*#+}} xmm2 = [2,4,2,4] ; AVX512-FAST-NEXT: vpermi2d %xmm3, %xmm1, %xmm2 -; AVX512-FAST-NEXT: vpbroadcastq {{.*#+}} xmm5 = [3,5,3,5] +; AVX512-FAST-NEXT: vmovq {{.*#+}} xmm5 = [3,5,0,0] ; AVX512-FAST-NEXT: vpermi2d %xmm3, %xmm1, %xmm5 -; AVX512-FAST-NEXT: vmovddup {{.*#+}} xmm1 = [4,2,4,2] -; AVX512-FAST-NEXT: # xmm1 = mem[0,0] +; AVX512-FAST-NEXT: vmovsd {{.*#+}} xmm1 = [4,2,0,0] ; AVX512-FAST-NEXT: vmovaps 32(%rdi), %ymm3 ; AVX512-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],mem[4,5,6,7] ; AVX512-FAST-NEXT: vpermps %ymm3, %ymm1, %ymm1 -; AVX512-FAST-NEXT: vmovddup {{.*#+}} xmm6 = [5,3,5,3] -; AVX512-FAST-NEXT: # xmm6 = mem[0,0] +; AVX512-FAST-NEXT: vmovsd {{.*#+}} xmm6 = [5,3,0,0] ; AVX512-FAST-NEXT: vpermps %ymm3, %ymm6, %ymm3 ; AVX512-FAST-NEXT: vmovq %xmm0, (%rsi) ; AVX512-FAST-NEXT: vmovq %xmm4, (%rdx) @@ -308,13 +302,13 @@ define void @load_i32_stride6_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0,1,2],xmm8[3] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm8 = xmm4[0,1],xmm7[2,3] ; AVX2-ONLY-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[0,1,0,2] -; AVX2-ONLY-NEXT: vpbroadcastq {{.*#+}} xmm9 = [4,2,4,2] +; AVX2-ONLY-NEXT: vmovq {{.*#+}} xmm9 = [4,2,0,0] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] ; AVX2-ONLY-NEXT: vpermd %ymm1, %ymm9, %ymm2 ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],xmm8[2,3] ; AVX2-ONLY-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0,1,2],xmm7[3] -; AVX2-ONLY-NEXT: vpbroadcastq {{.*#+}} xmm7 = [5,3,5,3] +; AVX2-ONLY-NEXT: vmovq {{.*#+}} xmm7 = [5,3,0,0] ; AVX2-ONLY-NEXT: vpermd %ymm1, %ymm7, %ymm1 ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm4[2,3] ; AVX2-ONLY-NEXT: vmovdqa %xmm0, 
(%rsi) diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-7.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-7.ll index e4f94b368b7fd5..abd4525a677375 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-7.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-7.ll @@ -100,8 +100,7 @@ define void @load_i32_stride7_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm6 = xmm6[0],xmm4[1],xmm6[2,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm4[0,1,2],xmm3[3] ; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm3[3,2,2,3] -; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm4 = [4,3,4,3] -; AVX2-ONLY-NEXT: # xmm4 = mem[0,0] +; AVX2-ONLY-NEXT: vmovsd {{.*#+}} xmm4 = [4,3,0,0] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vpermps %ymm7, %ymm4, %ymm4 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm1[0],ymm0[1],ymm1[2,3,4],ymm0[5],ymm1[6,7] @@ -134,7 +133,7 @@ define void @load_i32_stride7_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-SLOW-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0],xmm1[1],xmm4[2,3] ; AVX512-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3] ; AVX512-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,2,2,3] -; AVX512-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm1 = [4,11,4,11] +; AVX512-SLOW-NEXT: vmovq {{.*#+}} xmm1 = [4,11,0,0] ; AVX512-SLOW-NEXT: vmovdqa 32(%rdi), %ymm5 ; AVX512-SLOW-NEXT: vmovdqa (%rdi), %ymm6 ; AVX512-SLOW-NEXT: vpermi2d %ymm5, %ymm6, %ymm1 @@ -165,9 +164,9 @@ define void @load_i32_stride7_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-FAST-NEXT: vpermi2d %xmm1, %xmm0, %xmm3 ; AVX512-FAST-NEXT: vpbroadcastd 8(%rdi), %xmm4 ; AVX512-FAST-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0],xmm1[1],xmm4[2,3] -; AVX512-FAST-NEXT: vpbroadcastq {{.*#+}} xmm5 = [7,2,7,2] +; AVX512-FAST-NEXT: vmovq {{.*#+}} xmm5 = [7,2,0,0] ; AVX512-FAST-NEXT: vpermi2d %xmm0, %xmm1, %xmm5 -; AVX512-FAST-NEXT: vpbroadcastq {{.*#+}} xmm0 = [4,11,4,11] +; AVX512-FAST-NEXT: vmovq {{.*#+}} xmm0 = [4,11,0,0] ; AVX512-FAST-NEXT: vmovdqa 32(%rdi), %ymm1 ; AVX512-FAST-NEXT: vmovdqa (%rdi), %ymm6 ; AVX512-FAST-NEXT: vpermi2d %ymm1, %ymm6, %ymm0 @@ -355,8 +354,7 @@ define void @load_i32_stride7_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0,1],xmm9[2,3] ; AVX2-SLOW-NEXT: vbroadcastss 100(%rdi), %xmm9 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm9 = xmm8[0,1,2],xmm9[3] -; AVX2-SLOW-NEXT: vmovddup {{.*#+}} xmm10 = [4,3,4,3] -; AVX2-SLOW-NEXT: # xmm10 = mem[0,0] +; AVX2-SLOW-NEXT: vmovsd {{.*#+}} xmm10 = [4,3,0,0] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm11 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-SLOW-NEXT: vpermps %ymm11, %ymm10, %ymm10 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm9 = xmm10[0,1],xmm9[2,3] @@ -414,8 +412,7 @@ define void @load_i32_stride7_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0,1],xmm9[2,3] ; AVX2-FAST-NEXT: vbroadcastss 100(%rdi), %xmm9 ; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm9 = xmm8[0,1,2],xmm9[3] -; AVX2-FAST-NEXT: vmovddup {{.*#+}} xmm10 = [4,3,4,3] -; AVX2-FAST-NEXT: # xmm10 = mem[0,0] +; AVX2-FAST-NEXT: vmovsd {{.*#+}} xmm10 = [4,3,0,0] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm11 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FAST-NEXT: vpermps %ymm11, %ymm10, %ymm10 ; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm9 = xmm10[0,1],xmm9[2,3] @@ -472,8 +469,7 @@ define void @load_i32_stride7_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm7 
= xmm7[0,1],xmm9[2,3] ; AVX2-FAST-PERLANE-NEXT: vbroadcastss 100(%rdi), %xmm9 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm9 = xmm8[0,1,2],xmm9[3] -; AVX2-FAST-PERLANE-NEXT: vmovddup {{.*#+}} xmm10 = [4,3,4,3] -; AVX2-FAST-PERLANE-NEXT: # xmm10 = mem[0,0] +; AVX2-FAST-PERLANE-NEXT: vmovsd {{.*#+}} xmm10 = [4,3,0,0] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm11 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpermps %ymm11, %ymm10, %ymm10 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm9 = xmm10[0,1],xmm9[2,3] @@ -850,7 +846,7 @@ define void @load_i32_stride7_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-SLOW-NEXT: vpbroadcastd 100(%rdi), %xmm10 ; AVX2-SLOW-NEXT: vmovdqa 64(%rdi), %xmm11 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm10 = xmm11[0,1,2],xmm10[3] -; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm12 = [4,3,4,3] +; AVX2-SLOW-NEXT: vmovq {{.*#+}} xmm12 = [4,3,0,0] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm13 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-SLOW-NEXT: vpermd %ymm13, %ymm12, %ymm12 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm10 = xmm12[0,1],xmm10[2,3] @@ -954,7 +950,7 @@ define void @load_i32_stride7_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FAST-NEXT: vpbroadcastd 100(%rdi), %xmm10 ; AVX2-FAST-NEXT: vmovdqa 64(%rdi), %xmm11 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm10 = xmm11[0,1,2],xmm10[3] -; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} xmm12 = [4,3,4,3] +; AVX2-FAST-NEXT: vmovq {{.*#+}} xmm12 = [4,3,0,0] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm13 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FAST-NEXT: vpermd %ymm13, %ymm12, %ymm12 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm10 = xmm12[0,1],xmm10[2,3] @@ -1058,7 +1054,7 @@ define void @load_i32_stride7_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 100(%rdi), %xmm10 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%rdi), %xmm11 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm10 = xmm11[0,1,2],xmm10[3] -; AVX2-FAST-PERLANE-NEXT: vpbroadcastq {{.*#+}} xmm12 = [4,3,4,3] +; AVX2-FAST-PERLANE-NEXT: vmovq {{.*#+}} xmm12 = [4,3,0,0] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm13 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpermd %ymm13, %ymm12, %ymm12 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm10 = xmm12[0,1],xmm10[2,3] @@ -1880,7 +1876,7 @@ define void @load_i32_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-SLOW-NEXT: vpbroadcastd 100(%rdi), %xmm0 ; AVX2-SLOW-NEXT: vmovdqa 64(%rdi), %xmm3 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm3[0,1,2],xmm0[3] -; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm5 = [4,3,4,3] +; AVX2-SLOW-NEXT: vmovq {{.*#+}} xmm5 = [4,3,0,0] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm7[0,1,2,3],ymm9[4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqa %ymm7, %ymm12 ; AVX2-SLOW-NEXT: vpermd %ymm10, %ymm5, %ymm10 @@ -2104,7 +2100,7 @@ define void @load_i32_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-NEXT: vpbroadcastd 100(%rdi), %xmm0 ; AVX2-FAST-NEXT: vmovdqa 64(%rdi), %xmm1 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3] -; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} xmm3 = [4,3,4,3] +; AVX2-FAST-NEXT: vmovq {{.*#+}} xmm3 = [4,3,0,0] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm6[0,1,2,3],ymm5[4,5,6,7] ; AVX2-FAST-NEXT: vmovdqa %ymm5, %ymm11 ; AVX2-FAST-NEXT: vmovdqa %ymm6, %ymm12 @@ -2332,7 +2328,7 @@ define void @load_i32_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 100(%rdi), %xmm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%rdi), %xmm3 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} 
xmm0 = xmm3[0,1,2],xmm0[3] -; AVX2-FAST-PERLANE-NEXT: vpbroadcastq {{.*#+}} xmm5 = [4,3,4,3] +; AVX2-FAST-PERLANE-NEXT: vmovq {{.*#+}} xmm5 = [4,3,0,0] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm10 = ymm7[0,1,2,3],ymm9[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm7, %ymm12 ; AVX2-FAST-PERLANE-NEXT: vpermd %ymm10, %ymm5, %ymm10 @@ -4253,7 +4249,7 @@ define void @load_i32_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm1 = [4,3,4,3] +; AVX2-SLOW-NEXT: vmovq {{.*#+}} xmm1 = [4,3,0,0] ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-SLOW-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] @@ -4763,7 +4759,7 @@ define void @load_i32_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} xmm0 = [4,3,4,3] +; AVX2-FAST-NEXT: vmovq {{.*#+}} xmm0 = [4,3,0,0] ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-FAST-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm1 = mem[0,1,2,3],ymm1[4,5,6,7] @@ -5274,7 +5270,7 @@ define void @load_i32_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpbroadcastq {{.*#+}} xmm1 = [4,3,4,3] +; AVX2-FAST-PERLANE-NEXT: vmovq {{.*#+}} xmm1 = [4,3,0,0] ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] @@ -9154,8 +9150,7 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-SLOW-NEXT: vbroadcastss 100(%rdi), %xmm0 ; AVX2-SLOW-NEXT: vmovaps 64(%rdi), %xmm1 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3] -; AVX2-SLOW-NEXT: vmovddup {{.*#+}} xmm7 = [4,3,4,3] -; AVX2-SLOW-NEXT: # xmm7 = mem[0,0] +; AVX2-SLOW-NEXT: vmovsd {{.*#+}} xmm7 = [4,3,0,0] ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX2-SLOW-NEXT: vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm2 = mem[0,1,2,3],ymm2[4,5,6,7] @@ -10195,8 +10190,7 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-NEXT: vbroadcastss 100(%rdi), %xmm0 ; AVX2-FAST-NEXT: vmovaps 64(%rdi), %xmm9 ; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm0 = xmm9[0,1,2],xmm0[3] -; AVX2-FAST-NEXT: vmovddup {{.*#+}} xmm4 = [4,3,4,3] -; AVX2-FAST-NEXT: # xmm4 = mem[0,0] +; AVX2-FAST-NEXT: vmovsd {{.*#+}} xmm4 = [4,3,0,0] ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-FAST-NEXT: vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; 
AVX2-FAST-NEXT: # ymm1 = mem[0,1,2,3],ymm1[4,5,6,7] @@ -11245,8 +11239,7 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-PERLANE-NEXT: vbroadcastss 100(%rdi), %xmm0 ; AVX2-FAST-PERLANE-NEXT: vmovaps 64(%rdi), %xmm1 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3] -; AVX2-FAST-PERLANE-NEXT: vmovddup {{.*#+}} xmm7 = [4,3,4,3] -; AVX2-FAST-PERLANE-NEXT: # xmm7 = mem[0,0] +; AVX2-FAST-PERLANE-NEXT: vmovsd {{.*#+}} xmm7 = [4,3,0,0] ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm2 = mem[0,1,2,3],ymm2[4,5,6,7] diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-8.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-8.ll index faa1642831c155..8f0c4b9b726a5e 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-8.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-8.ll @@ -153,7 +153,7 @@ define void @load_i32_stride8_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-FAST-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512-FAST-NEXT: vmovdqa 32(%rdi), %xmm1 ; AVX512-FAST-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; AVX512-FAST-NEXT: vpbroadcastq {{.*#+}} xmm3 = [1,5,1,5] +; AVX512-FAST-NEXT: vmovq {{.*#+}} xmm3 = [1,5,0,0] ; AVX512-FAST-NEXT: vpermi2d %xmm1, %xmm0, %xmm3 ; AVX512-FAST-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; AVX512-FAST-NEXT: vmovdqa 32(%rdi), %ymm1 diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-2.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-2.ll index 88ebda3622cc9b..ec029c3f9c033b 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-2.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-2.ll @@ -182,8 +182,7 @@ define void @load_i8_stride2_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) nou ; AVX1-ONLY-NEXT: vpand %xmm0, %xmm2, %xmm3 ; AVX1-ONLY-NEXT: vpand %xmm0, %xmm1, %xmm0 ; AVX1-ONLY-NEXT: vpackuswb %xmm3, %xmm0, %xmm0 -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm3 = [1,3,5,7,9,11,13,15,1,3,5,7,9,11,13,15] -; AVX1-ONLY-NEXT: # xmm3 = mem[0,0] +; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm3 = [1,3,5,7,9,11,13,15,0,0,0,0,0,0,0,0] ; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm2, %xmm2 ; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm1, %xmm1 ; AVX1-ONLY-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0] @@ -199,7 +198,7 @@ define void @load_i8_stride2_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) nou ; AVX2-ONLY-NEXT: vpand %xmm0, %xmm2, %xmm3 ; AVX2-ONLY-NEXT: vpand %xmm0, %xmm1, %xmm0 ; AVX2-ONLY-NEXT: vpackuswb %xmm3, %xmm0, %xmm0 -; AVX2-ONLY-NEXT: vpbroadcastq {{.*#+}} xmm3 = [1,3,5,7,9,11,13,15,1,3,5,7,9,11,13,15] +; AVX2-ONLY-NEXT: vmovq {{.*#+}} xmm3 = [1,3,5,7,9,11,13,15,0,0,0,0,0,0,0,0] ; AVX2-ONLY-NEXT: vpshufb %xmm3, %xmm2, %xmm2 ; AVX2-ONLY-NEXT: vpshufb %xmm3, %xmm1, %xmm1 ; AVX2-ONLY-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0] @@ -215,7 +214,7 @@ define void @load_i8_stride2_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) nou ; AVX512F-NEXT: vpand %xmm0, %xmm2, %xmm3 ; AVX512F-NEXT: vpand %xmm0, %xmm1, %xmm0 ; AVX512F-NEXT: vpackuswb %xmm3, %xmm0, %xmm0 -; AVX512F-NEXT: vpbroadcastq {{.*#+}} xmm3 = [1,3,5,7,9,11,13,15,1,3,5,7,9,11,13,15] +; AVX512F-NEXT: vmovq {{.*#+}} xmm3 = [1,3,5,7,9,11,13,15,0,0,0,0,0,0,0,0] ; AVX512F-NEXT: vpshufb %xmm3, %xmm2, %xmm2 ; AVX512F-NEXT: vpshufb %xmm3, 
%xmm1, %xmm1 ; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0] @@ -281,8 +280,7 @@ define void @load_i8_stride2_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) nou ; AVX1-ONLY-NEXT: vpand %xmm0, %xmm2, %xmm6 ; AVX1-ONLY-NEXT: vpand %xmm0, %xmm1, %xmm0 ; AVX1-ONLY-NEXT: vpackuswb %xmm6, %xmm0, %xmm0 -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm6 = [1,3,5,7,9,11,13,15,1,3,5,7,9,11,13,15] -; AVX1-ONLY-NEXT: # xmm6 = mem[0,0] +; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm6 = [1,3,5,7,9,11,13,15,0,0,0,0,0,0,0,0] ; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm4, %xmm4 ; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm3, %xmm3 ; AVX1-ONLY-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm4[0] @@ -438,8 +436,7 @@ define void @load_i8_stride2_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) nou ; AVX1-ONLY-NEXT: vpand %xmm1, %xmm8, %xmm12 ; AVX1-ONLY-NEXT: vpand %xmm1, %xmm7, %xmm1 ; AVX1-ONLY-NEXT: vpackuswb %xmm12, %xmm1, %xmm1 -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm12 = [1,3,5,7,9,11,13,15,1,3,5,7,9,11,13,15] -; AVX1-ONLY-NEXT: # xmm12 = mem[0,0] +; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm12 = [1,3,5,7,9,11,13,15,0,0,0,0,0,0,0,0] ; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm2, %xmm2 ; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm3, %xmm3 ; AVX1-ONLY-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0] diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-4.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-4.ll index 40894a14a681c7..fb9cb361ff8631 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-4.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-4.ll @@ -214,57 +214,31 @@ define void @load_i8_stride4_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: movq %xmm1, (%r8) ; SSE-NEXT: retq ; -; AVX1-ONLY-LABEL: load_i8_stride4_vf8: -; AVX1-ONLY: # %bb.0: -; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm0 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] -; AVX1-ONLY-NEXT: vmovdqa (%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovdqa 16(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm2, %xmm3 -; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm1, %xmm0 -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] -; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm3 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13] -; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm2, %xmm4 -; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm1, %xmm3 -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] -; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm4 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] -; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm2, %xmm5 -; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm1, %xmm4 -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1] -; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm5 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] -; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm2, %xmm2 -; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm1, %xmm1 -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] -; AVX1-ONLY-NEXT: vmovq %xmm0, (%rsi) -; AVX1-ONLY-NEXT: vmovq %xmm3, (%rdx) -; AVX1-ONLY-NEXT: vmovq %xmm4, (%rcx) -; AVX1-ONLY-NEXT: vmovq %xmm1, (%r8) -; AVX1-ONLY-NEXT: retq -; -; AVX2-ONLY-LABEL: load_i8_stride4_vf8: -; AVX2-ONLY: # %bb.0: -; AVX2-ONLY-NEXT: vpbroadcastd {{.*#+}} xmm0 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] -; AVX2-ONLY-NEXT: vmovdqa (%rdi), %xmm1 -; AVX2-ONLY-NEXT: vmovdqa 16(%rdi), %xmm2 -; AVX2-ONLY-NEXT: vpshufb %xmm0, %xmm2, %xmm3 -; AVX2-ONLY-NEXT: vpshufb %xmm0, %xmm1, %xmm0 -; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] -; AVX2-ONLY-NEXT: vpbroadcastd {{.*#+}} xmm3 = 
[1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13] -; AVX2-ONLY-NEXT: vpshufb %xmm3, %xmm2, %xmm4 -; AVX2-ONLY-NEXT: vpshufb %xmm3, %xmm1, %xmm3 -; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] -; AVX2-ONLY-NEXT: vpbroadcastd {{.*#+}} xmm4 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] -; AVX2-ONLY-NEXT: vpshufb %xmm4, %xmm2, %xmm5 -; AVX2-ONLY-NEXT: vpshufb %xmm4, %xmm1, %xmm4 -; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1] -; AVX2-ONLY-NEXT: vpbroadcastd {{.*#+}} xmm5 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] -; AVX2-ONLY-NEXT: vpshufb %xmm5, %xmm2, %xmm2 -; AVX2-ONLY-NEXT: vpshufb %xmm5, %xmm1, %xmm1 -; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] -; AVX2-ONLY-NEXT: vmovq %xmm0, (%rsi) -; AVX2-ONLY-NEXT: vmovq %xmm3, (%rdx) -; AVX2-ONLY-NEXT: vmovq %xmm4, (%rcx) -; AVX2-ONLY-NEXT: vmovq %xmm1, (%r8) -; AVX2-ONLY-NEXT: retq +; AVX1-LABEL: load_i8_stride4_vf8: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovd {{.*#+}} xmm0 = [0,4,8,12,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX1-NEXT: vmovdqa (%rdi), %xmm1 +; AVX1-NEXT: vmovdqa 16(%rdi), %xmm2 +; AVX1-NEXT: vpshufb %xmm0, %xmm2, %xmm3 +; AVX1-NEXT: vpshufb %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] +; AVX1-NEXT: vmovd {{.*#+}} xmm3 = [1,5,9,13,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX1-NEXT: vpshufb %xmm3, %xmm2, %xmm4 +; AVX1-NEXT: vpshufb %xmm3, %xmm1, %xmm3 +; AVX1-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] +; AVX1-NEXT: vmovd {{.*#+}} xmm4 = [2,6,10,14,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX1-NEXT: vpshufb %xmm4, %xmm2, %xmm5 +; AVX1-NEXT: vpshufb %xmm4, %xmm1, %xmm4 +; AVX1-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1] +; AVX1-NEXT: vmovd {{.*#+}} xmm5 = [3,7,11,15,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX1-NEXT: vpshufb %xmm5, %xmm2, %xmm2 +; AVX1-NEXT: vpshufb %xmm5, %xmm1, %xmm1 +; AVX1-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; AVX1-NEXT: vmovq %xmm0, (%rsi) +; AVX1-NEXT: vmovq %xmm3, (%rdx) +; AVX1-NEXT: vmovq %xmm4, (%rcx) +; AVX1-NEXT: vmovq %xmm1, (%r8) +; AVX1-NEXT: retq ; ; AVX512-LABEL: load_i8_stride4_vf8: ; AVX512: # %bb.0: @@ -413,7 +387,7 @@ define void @load_i8_stride4_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm4, %xmm5 ; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm3, %xmm2 ; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1] -; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm5 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] +; AVX1-ONLY-NEXT: vmovd {{.*#+}} xmm5 = [0,4,8,12,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm1, %xmm6 ; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm0, %xmm5 ; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1] @@ -422,7 +396,7 @@ define void @load_i8_stride4_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm4, %xmm6 ; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm3, %xmm5 ; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1] -; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm6 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13] +; AVX1-ONLY-NEXT: vmovd {{.*#+}} xmm6 = [1,5,9,13,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm1, %xmm7 ; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm0, %xmm6 ; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1] @@ -431,7 +405,7 @@ define void @load_i8_stride4_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm4, %xmm7 ; AVX1-ONLY-NEXT: vpshufb %xmm6, 
%xmm3, %xmm6 ; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1] -; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm7 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] +; AVX1-ONLY-NEXT: vmovd {{.*#+}} xmm7 = [2,6,10,14,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm1, %xmm8 ; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm0, %xmm7 ; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1] @@ -440,7 +414,7 @@ define void @load_i8_stride4_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm4, %xmm4 ; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm3, %xmm3 ; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] -; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm4 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] +; AVX1-ONLY-NEXT: vmovd {{.*#+}} xmm4 = [3,7,11,15,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm1, %xmm1 ; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm0, %xmm0 ; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] @@ -461,7 +435,7 @@ define void @load_i8_stride4_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-ONLY-NEXT: vpshufb %xmm4, %xmm3, %xmm5 ; AVX2-ONLY-NEXT: vpshufb %xmm4, %xmm2, %xmm4 ; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1] -; AVX2-ONLY-NEXT: vpbroadcastd {{.*#+}} xmm5 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] +; AVX2-ONLY-NEXT: vmovd {{.*#+}} xmm5 = [0,4,8,12,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX2-ONLY-NEXT: vpshufb %xmm5, %xmm1, %xmm6 ; AVX2-ONLY-NEXT: vpshufb %xmm5, %xmm0, %xmm5 ; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1] @@ -470,7 +444,7 @@ define void @load_i8_stride4_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-ONLY-NEXT: vpshufb %xmm5, %xmm3, %xmm6 ; AVX2-ONLY-NEXT: vpshufb %xmm5, %xmm2, %xmm5 ; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1] -; AVX2-ONLY-NEXT: vpbroadcastd {{.*#+}} xmm6 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13] +; AVX2-ONLY-NEXT: vmovd {{.*#+}} xmm6 = [1,5,9,13,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX2-ONLY-NEXT: vpshufb %xmm6, %xmm1, %xmm7 ; AVX2-ONLY-NEXT: vpshufb %xmm6, %xmm0, %xmm6 ; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1] @@ -479,7 +453,7 @@ define void @load_i8_stride4_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-ONLY-NEXT: vpshufb %xmm6, %xmm3, %xmm7 ; AVX2-ONLY-NEXT: vpshufb %xmm6, %xmm2, %xmm6 ; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1] -; AVX2-ONLY-NEXT: vpbroadcastd {{.*#+}} xmm7 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] +; AVX2-ONLY-NEXT: vmovd {{.*#+}} xmm7 = [2,6,10,14,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX2-ONLY-NEXT: vpshufb %xmm7, %xmm1, %xmm8 ; AVX2-ONLY-NEXT: vpshufb %xmm7, %xmm0, %xmm7 ; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1] @@ -488,7 +462,7 @@ define void @load_i8_stride4_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-ONLY-NEXT: vpshufb %xmm7, %xmm3, %xmm3 ; AVX2-ONLY-NEXT: vpshufb %xmm7, %xmm2, %xmm2 ; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] -; AVX2-ONLY-NEXT: vpbroadcastd {{.*#+}} xmm3 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] +; AVX2-ONLY-NEXT: vmovd {{.*#+}} xmm3 = [3,7,11,15,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX2-ONLY-NEXT: vpshufb %xmm3, %xmm1, %xmm1 ; AVX2-ONLY-NEXT: vpshufb %xmm3, %xmm0, %xmm0 ; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] @@ -780,7 +754,7 @@ define void @load_i8_stride4_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; 
AVX1-ONLY-NEXT: vmovdqa 96(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vpshufb %xmm8, %xmm2, %xmm3 ; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] -; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm9 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] +; AVX1-ONLY-NEXT: vmovd {{.*#+}} xmm9 = [0,4,8,12,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX1-ONLY-NEXT: vmovdqa 80(%rdi), %xmm3 ; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm3, %xmm4 ; AVX1-ONLY-NEXT: vmovdqa 64(%rdi), %xmm5 @@ -804,7 +778,7 @@ define void @load_i8_stride4_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm0, %xmm10 ; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm2, %xmm11 ; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm10 = xmm11[0],xmm10[0],xmm11[1],xmm10[1] -; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm11 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13] +; AVX1-ONLY-NEXT: vmovd {{.*#+}} xmm11 = [1,5,9,13,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm3, %xmm12 ; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm5, %xmm13 ; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm12 = xmm13[0],xmm12[0],xmm13[1],xmm12[1] @@ -822,7 +796,7 @@ define void @load_i8_stride4_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX1-ONLY-NEXT: vpshufb %xmm10, %xmm0, %xmm11 ; AVX1-ONLY-NEXT: vpshufb %xmm10, %xmm2, %xmm12 ; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm11 = xmm12[0],xmm11[0],xmm12[1],xmm11[1] -; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm12 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] +; AVX1-ONLY-NEXT: vmovd {{.*#+}} xmm12 = [2,6,10,14,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm3, %xmm13 ; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm5, %xmm14 ; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm13 = xmm14[0],xmm13[0],xmm14[1],xmm13[1] @@ -840,7 +814,7 @@ define void @load_i8_stride4_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm0, %xmm0 ; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm2, %xmm2 ; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] -; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm2 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] +; AVX1-ONLY-NEXT: vmovd {{.*#+}} xmm2 = [3,7,11,15,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm3, %xmm3 ; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm5, %xmm5 ; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm5[0],xmm3[0],xmm5[1],xmm3[1] @@ -1545,7 +1519,7 @@ define void @load_i8_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm3, %xmm2 ; AVX1-ONLY-NEXT: vmovdqa %xmm3, %xmm8 ; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] -; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm3 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] +; AVX1-ONLY-NEXT: vmovd {{.*#+}} xmm3 = [0,4,8,12,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX1-ONLY-NEXT: vmovdqa 80(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm0, %xmm5 @@ -1611,7 +1585,7 @@ define void @load_i8_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX1-ONLY-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm8, %xmm2 ; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm2 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13] +; AVX1-ONLY-NEXT: vmovd {{.*#+}} xmm2 = [1,5,9,13,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm7, %xmm3 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload 
@@ -1662,7 +1636,7 @@ define void @load_i8_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm1, %xmm1 ; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm4, %xmm2 ; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm2 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] +; AVX1-ONLY-NEXT: vmovd {{.*#+}} xmm2 = [2,6,10,14,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm7, %xmm3 ; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm8, %xmm5 ; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm5[0],xmm3[0],xmm5[1],xmm3[1] @@ -1705,7 +1679,7 @@ define void @load_i8_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm2, %xmm2 ; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm2 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] +; AVX1-ONLY-NEXT: vmovd {{.*#+}} xmm2 = [3,7,11,15,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm3, %xmm3 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-5.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-5.ll index 130207fd9c2ebc..03139923ae5c46 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-5.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-5.ll @@ -150,47 +150,26 @@ define void @load_i8_stride5_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: movd %xmm3, (%r9) ; SSE-NEXT: retq ; -; AVX1-ONLY-LABEL: load_i8_stride5_vf4: -; AVX1-ONLY: # %bb.0: -; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm0 = [0,5,10,15,0,5,10,15,0,5,10,15,0,5,10,15] -; AVX1-ONLY-NEXT: vmovdqa (%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovdqa 16(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm1, %xmm3 -; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm4 = xmm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm2[0] -; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm4, %xmm4 -; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm5 = xmm1[2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1] -; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm5, %xmm5 -; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm6 = xmm1[3,4,5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1,2] -; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm6, %xmm6 -; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[4,5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1,2,3] -; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm1, %xmm0 -; AVX1-ONLY-NEXT: vmovd %xmm3, (%rsi) -; AVX1-ONLY-NEXT: vmovd %xmm4, (%rdx) -; AVX1-ONLY-NEXT: vmovd %xmm5, (%rcx) -; AVX1-ONLY-NEXT: vmovd %xmm6, (%r8) -; AVX1-ONLY-NEXT: vmovd %xmm0, (%r9) -; AVX1-ONLY-NEXT: retq -; -; AVX2-LABEL: load_i8_stride5_vf4: -; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm0 = [0,5,10,15,0,5,10,15,0,5,10,15,0,5,10,15] -; AVX2-NEXT: vmovdqa (%rdi), %xmm1 -; AVX2-NEXT: vmovdqa 16(%rdi), %xmm2 -; AVX2-NEXT: vpshufb %xmm0, %xmm1, %xmm3 -; AVX2-NEXT: vpalignr {{.*#+}} xmm4 = xmm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm2[0] -; AVX2-NEXT: vpshufb %xmm0, %xmm4, %xmm4 -; AVX2-NEXT: vpalignr {{.*#+}} xmm5 = xmm1[2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1] -; AVX2-NEXT: vpshufb %xmm0, %xmm5, %xmm5 -; AVX2-NEXT: vpalignr {{.*#+}} xmm6 = xmm1[3,4,5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1,2] -; AVX2-NEXT: vpshufb %xmm0, %xmm6, %xmm6 -; AVX2-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[4,5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1,2,3] -; 
AVX2-NEXT: vpshufb %xmm0, %xmm1, %xmm0 -; AVX2-NEXT: vmovd %xmm3, (%rsi) -; AVX2-NEXT: vmovd %xmm4, (%rdx) -; AVX2-NEXT: vmovd %xmm5, (%rcx) -; AVX2-NEXT: vmovd %xmm6, (%r8) -; AVX2-NEXT: vmovd %xmm0, (%r9) -; AVX2-NEXT: retq +; AVX-LABEL: load_i8_stride5_vf4: +; AVX: # %bb.0: +; AVX-NEXT: vmovd {{.*#+}} xmm0 = [0,5,10,15,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX-NEXT: vmovdqa (%rdi), %xmm1 +; AVX-NEXT: vmovdqa 16(%rdi), %xmm2 +; AVX-NEXT: vpshufb %xmm0, %xmm1, %xmm3 +; AVX-NEXT: vpalignr {{.*#+}} xmm4 = xmm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm2[0] +; AVX-NEXT: vpshufb %xmm0, %xmm4, %xmm4 +; AVX-NEXT: vpalignr {{.*#+}} xmm5 = xmm1[2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1] +; AVX-NEXT: vpshufb %xmm0, %xmm5, %xmm5 +; AVX-NEXT: vpalignr {{.*#+}} xmm6 = xmm1[3,4,5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1,2] +; AVX-NEXT: vpshufb %xmm0, %xmm6, %xmm6 +; AVX-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[4,5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1,2,3] +; AVX-NEXT: vpshufb %xmm0, %xmm1, %xmm0 +; AVX-NEXT: vmovd %xmm3, (%rsi) +; AVX-NEXT: vmovd %xmm4, (%rdx) +; AVX-NEXT: vmovd %xmm5, (%rcx) +; AVX-NEXT: vmovd %xmm6, (%r8) +; AVX-NEXT: vmovd %xmm0, (%r9) +; AVX-NEXT: retq %wide.vec = load <20 x i8>, ptr %in.vec, align 64 %strided.vec0 = shufflevector <20 x i8> %wide.vec, <20 x i8> poison, <4 x i32> %strided.vec1 = shufflevector <20 x i8> %wide.vec, <20 x i8> poison, <4 x i32> @@ -707,7 +686,7 @@ define void @load_i8_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm5 = xmm3[u,u,u,u,u,u,u],zero,zero,zero,xmm3[2,7,12,u,u,u] ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm6 = xmm1[u,u,u,u,u,u,u,3,8,13],zero,zero,zero,xmm1[u,u,u] ; AVX1-ONLY-NEXT: vpor %xmm5, %xmm6, %xmm5 -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm6 = [255,255,255,255,255,255,255,0,0,0,0,0,0,u,u,u] +; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm6 = [255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0] ; AVX1-ONLY-NEXT: vpblendvb %xmm6, %xmm4, %xmm5, %xmm4 ; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm7 = [0,1,2,3,4,5,6,7,8,9,10,11,12,128,128,128] ; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm4, %xmm5 @@ -1551,7 +1530,7 @@ define void @load_i8_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm11 = xmm10[u,u,u,u,u,u,u],zero,zero,zero,xmm10[2,7,12,u,u,u] ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm12 = xmm8[u,u,u,u,u,u,u,3,8,13],zero,zero,zero,xmm8[u,u,u] ; AVX1-ONLY-NEXT: vpor %xmm11, %xmm12, %xmm11 -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm13 = [255,255,255,255,255,255,255,0,0,0,0,0,0,u,u,u] +; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm13 = [255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0] ; AVX1-ONLY-NEXT: vpblendvb %xmm13, %xmm5, %xmm11, %xmm5 ; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm12 = [255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255] ; AVX1-ONLY-NEXT: vandps %ymm5, %ymm12, %ymm11 @@ -3169,7 +3148,7 @@ define void @load_i8_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX1-ONLY-NEXT: vmovdqa %xmm7, %xmm12 ; AVX1-ONLY-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpor %xmm0, %xmm6, %xmm6 -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm0 = [255,255,255,255,255,255,255,0,0,0,0,0,0,u,u,u] +; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm0 = [255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0] ; AVX1-ONLY-NEXT: vpblendvb %xmm0, %xmm1, %xmm6, %xmm1 ; AVX1-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovdqa 176(%rdi), %xmm1 @@ -3191,11 +3170,9 @@ define void @load_i8_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr 
%out.vec1, ptr ; AVX1-ONLY-NEXT: vpor %xmm3, %xmm4, %xmm3 ; AVX1-ONLY-NEXT: vpblendvb %xmm0, %xmm2, %xmm3, %xmm2 ; AVX1-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm3 = [1,6,11,128,128,128,128,0,1,6,11,128,128,128,128,0] -; AVX1-ONLY-NEXT: # xmm3 = mem[0,0] +; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm3 = [1,6,11,128,128,128,128,0,0,0,0,0,0,0,0,0] ; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm8, %xmm4 -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm5 = [128,128,128,0,5,10,15,0,128,128,128,0,5,10,15,0] -; AVX1-ONLY-NEXT: # xmm5 = mem[0,0] +; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm5 = [128,128,128,0,5,10,15,0,0,0,0,0,0,0,0,0] ; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm11, %xmm6 ; AVX1-ONLY-NEXT: vpor %xmm4, %xmm6, %xmm4 ; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm7 = [128,128,3,8,13,0,0,128,128,128,3,8,13,0,0,128] @@ -3351,8 +3328,7 @@ define void @load_i8_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX1-ONLY-NEXT: vmovdqa (%rsp), %xmm7 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm7, %xmm12 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm12 = xmm14[0,1,2],xmm12[3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm14 = [2,7,12,128,128,128,0,0,2,7,12,128,128,128,0,0] -; AVX1-ONLY-NEXT: # xmm14 = mem[0,0] +; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm14 = [2,7,12,128,128,128,0,0,0,0,0,0,0,0,0,0] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm15, %xmm0 ; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm9 = [10,15,128,128,128,0,0,5,10,15,128,128,128,0,0,5] @@ -3485,8 +3461,7 @@ define void @load_i8_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX1-ONLY-NEXT: vpor %xmm7, %xmm4, %xmm4 ; AVX1-ONLY-NEXT: vmovdqa (%rsp), %xmm6 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm10 = xmm6[u,u,u,u,u,u],zero,zero,zero,xmm6[1,6,11,u,u,u,u] -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm7 = [128,128,128,3,8,13,0,0,128,128,128,3,8,13,0,0] -; AVX1-ONLY-NEXT: # xmm7 = mem[0,0] +; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm7 = [128,128,128,3,8,13,0,0,0,0,0,0,0,0,0,0] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm12, %xmm12 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm10 = xmm12[0,1,2],xmm10[3,4,5],xmm12[6,7] @@ -3494,8 +3469,7 @@ define void @load_i8_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX1-ONLY-NEXT: # xmm12 = mem[0,0] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm13, %xmm13 -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm6 = [4,9,14,128,128,128,0,0,4,9,14,128,128,128,0,0] -; AVX1-ONLY-NEXT: # xmm6 = mem[0,0] +; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm6 = [4,9,14,128,128,128,0,0,0,0,0,0,0,0,0,0] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm15, %xmm15 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm13 = xmm15[0,1,2],xmm13[3,4,5],xmm15[6,7] diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-6.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-6.ll index 899f38951342e0..ed179d99595468 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-6.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-6.ll @@ -380,8 +380,7 @@ define void @load_i8_stride6_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm2[4,10,u,u,u,u,u,u,u,u,u,u,u] ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm6 = xmm1[2,8,14],zero,zero,xmm1[u,u,u,u,u,u,u,u,u,u,u] ; 
AVX1-ONLY-NEXT: vpor %xmm5, %xmm6, %xmm5 -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm6 = [0,1,2,3,4,128,128,128,0,1,2,3,4,128,128,128] -; AVX1-ONLY-NEXT: # xmm6 = mem[0,0] +; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm6 = [0,1,2,3,4,128,128,128,0,0,0,0,0,0,0,0] ; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm5, %xmm5 ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,zero,zero,zero,xmm0[0,6,12,u,u,u,u,u,u,u,u] ; AVX1-ONLY-NEXT: vpor %xmm7, %xmm5, %xmm5 @@ -825,7 +824,7 @@ define void @load_i8_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm11 = xmm3[u,u,u,u,u,u,u,u,4,10,u,u,u,u,u,u] ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm12 = xmm0[u,u,u,u,u,u,u,u,u,u,u,u,u,2,8,14] ; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm11 = xmm12[1],xmm11[1] -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm12 = [255,255,255,255,255,0,0,0,0,0,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm12 = [255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0] ; AVX1-ONLY-NEXT: vpblendvb %xmm12, %xmm10, %xmm11, %xmm10 ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm11 = xmm4[u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm4[2,8,14] ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm13 = xmm5[u,u,u,u,u,u,u,u,u,u,0,6,12],zero,zero,zero @@ -1728,7 +1727,7 @@ define void @load_i8_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm0 = xmm8[u,u,u,u,u,u,u,u,4,10,u,u,u,u,u,u] ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = xmm6[u,u,u,u,u,u,u,u,u,u,u,u,u,2,8,14] ; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm0[1] -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm0 = [255,255,255,255,255,0,0,0,0,0,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm0 = [255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0] ; AVX1-ONLY-NEXT: vpblendvb %xmm0, %xmm1, %xmm2, %xmm1 ; AVX1-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm9[5,11],zero,zero,zero,xmm9[u,u,u,u,u,u,u,u,u,u,u] @@ -1743,13 +1742,11 @@ define void @load_i8_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm3[1],xmm2[1] ; AVX1-ONLY-NEXT: vpblendvb %xmm0, %xmm1, %xmm2, %xmm0 ; AVX1-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm8 = [128,128,128,2,8,14,0,0,128,128,128,2,8,14,0,0] -; AVX1-ONLY-NEXT: # xmm8 = mem[0,0] +; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm8 = [128,128,128,2,8,14,0,0,0,0,0,0,0,0,0,0] ; AVX1-ONLY-NEXT: vmovdqa 112(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpshufb %xmm8, %xmm0, %xmm1 -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm2 = [0,6,12,128,128,128,0,0,0,6,12,128,128,128,0,0] -; AVX1-ONLY-NEXT: # xmm2 = mem[0,0] +; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm2 = [0,6,12,128,128,128,0,0,0,0,0,0,0,0,0,0] ; AVX1-ONLY-NEXT: vmovdqa 96(%rdi), %xmm13 ; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm13, %xmm3 ; AVX1-ONLY-NEXT: vpor %xmm1, %xmm3, %xmm1 @@ -1763,7 +1760,7 @@ define void @load_i8_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm14, %xmm10 ; AVX1-ONLY-NEXT: vpor %xmm4, %xmm10, %xmm4 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm4, %ymm1 -; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm15 = [0,0,4,10,0,0,4,10,0,0,4,10,0,0,4,10] +; AVX1-ONLY-NEXT: vmovd {{.*#+}} xmm15 = [0,0,4,10,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm6, %xmm4 ; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm0 = [0,6,12,0,0,6,12,0,0,6,12,0,0,6,12,0] ; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm5, %xmm10 @@ -1804,7 +1801,7 @@ define void 
@load_i8_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm15 = xmm14[u,u,u,u,u,u,u,u,u,u,u,3,9,15],zero,zero ; AVX1-ONLY-NEXT: vpor %xmm0, %xmm15, %xmm0 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm8 = [0,0,5,11,0,0,5,11,0,0,5,11,0,0,5,11] +; AVX1-ONLY-NEXT: vmovd {{.*#+}} xmm8 = [0,0,5,11,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufb %xmm8, %xmm1, %xmm15 ; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm1 = [1,7,13,0,1,7,13,0,1,7,13,0,1,7,13,0] @@ -3597,11 +3594,9 @@ define void @load_i8_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX1-ONLY-NEXT: vmovdqa 240(%rdi), %xmm9 ; AVX1-ONLY-NEXT: vmovdqa 208(%rdi), %xmm10 ; AVX1-ONLY-NEXT: vmovdqa 192(%rdi), %xmm11 -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm3 = [128,128,128,4,10,0,0,0,128,128,128,4,10,0,0,0] -; AVX1-ONLY-NEXT: # xmm3 = mem[0,0] -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm12 = [2,8,14,128,128,0,0,0,2,8,14,128,128,0,0,0] -; AVX1-ONLY-NEXT: # xmm12 = mem[0,0] -; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm1 = [2,8,14,0,2,8,14,0,2,8,14,0,2,8,14,0] +; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm3 = [128,128,128,4,10,0,0,0,0,0,0,0,0,0,0,0] +; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm12 = [2,8,14,128,128,0,0,0,0,0,0,0,0,0,0,0] +; AVX1-ONLY-NEXT: vmovd {{.*#+}} xmm1 = [2,8,14,0,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm6, %xmm0 ; AVX1-ONLY-NEXT: vmovdqa %xmm1, %xmm13 ; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm4 = [0,0,6,12,0,0,6,12,0,0,6,12,0,0,6,12] @@ -3625,16 +3620,14 @@ define void @load_i8_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX1-ONLY-NEXT: vpor %xmm2, %xmm3, %xmm2 ; AVX1-ONLY-NEXT: vpblendvb %xmm5, %xmm1, %xmm2, %xmm1 ; AVX1-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm2 = [3,9,15,0,3,9,15,0,3,9,15,0,3,9,15,0] +; AVX1-ONLY-NEXT: vmovd {{.*#+}} xmm2 = [3,9,15,0,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm6, %xmm1 ; AVX1-ONLY-NEXT: vmovdqa %xmm2, %xmm13 ; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm0 = [0,1,7,13,0,1,7,13,0,1,7,13,0,1,7,13] ; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm15, %xmm2 ; AVX1-ONLY-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm3 = [128,128,128,5,11,0,0,0,128,128,128,5,11,0,0,0] -; AVX1-ONLY-NEXT: # xmm3 = mem[0,0] -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm12 = [3,9,15,128,128,0,0,0,3,9,15,128,128,0,0,0] -; AVX1-ONLY-NEXT: # xmm12 = mem[0,0] +; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm3 = [128,128,128,5,11,0,0,0,0,0,0,0,0,0,0,0] +; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm12 = [3,9,15,128,128,0,0,0,0,0,0,0,0,0,0,0] ; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm7, %xmm2 ; AVX1-ONLY-NEXT: vmovdqa %xmm7, %xmm14 ; AVX1-ONLY-NEXT: vmovdqa %xmm3, %xmm15 @@ -3652,12 +3645,10 @@ define void @load_i8_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX1-ONLY-NEXT: vpor %xmm2, %xmm3, %xmm2 ; AVX1-ONLY-NEXT: vpblendvb %xmm5, %xmm1, %xmm2, %xmm0 ; AVX1-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm1 = [4,10,128,128,128,0,0,0,4,10,128,128,128,0,0,0] -; AVX1-ONLY-NEXT: # xmm1 = mem[0,0] +; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm1 = [4,10,128,128,128,0,0,0,0,0,0,0,0,0,0,0] ; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm4, %xmm0 ; AVX1-ONLY-NEXT: vmovdqa %xmm1, %xmm12 -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm2 = [128,128,0,6,12,0,0,0,128,128,0,6,12,0,0,0] -; 
AVX1-ONLY-NEXT: # xmm2 = mem[0,0] +; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm2 = [128,128,0,6,12,0,0,0,0,0,0,0,0,0,0,0] ; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm14, %xmm1 ; AVX1-ONLY-NEXT: vmovdqa %xmm2, %xmm13 ; AVX1-ONLY-NEXT: vpor %xmm0, %xmm1, %xmm1 @@ -3668,7 +3659,7 @@ define void @load_i8_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm5, %xmm2 ; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm0[1] -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm0 = [255,255,255,255,255,0,0,0,0,0,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm0 = [255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0] ; AVX1-ONLY-NEXT: vpblendvb %xmm0, %xmm1, %xmm2, %xmm1 ; AVX1-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm11, %xmm1 @@ -3682,13 +3673,11 @@ define void @load_i8_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm3[1],xmm2[1] ; AVX1-ONLY-NEXT: vpblendvb %xmm0, %xmm1, %xmm2, %xmm1 ; AVX1-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm2 = [5,11,128,128,128,0,0,0,5,11,128,128,128,0,0,0] -; AVX1-ONLY-NEXT: # xmm2 = mem[0,0] +; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm2 = [5,11,128,128,128,0,0,0,0,0,0,0,0,0,0,0] ; AVX1-ONLY-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm7, %xmm1 ; AVX1-ONLY-NEXT: vmovdqa %xmm2, %xmm8 -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm3 = [128,128,1,7,13,0,0,0,128,128,1,7,13,0,0,0] -; AVX1-ONLY-NEXT: # xmm3 = mem[0,0] +; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm3 = [128,128,1,7,13,0,0,0,0,0,0,0,0,0,0,0] ; AVX1-ONLY-NEXT: vmovdqa %xmm14, %xmm13 ; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm14, %xmm2 ; AVX1-ONLY-NEXT: vmovdqa %xmm3, %xmm14 @@ -3715,13 +3704,11 @@ define void @load_i8_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm3[1],xmm2[1] ; AVX1-ONLY-NEXT: vpblendvb %xmm0, %xmm1, %xmm2, %xmm0 ; AVX1-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm3 = [128,128,128,2,8,14,0,0,128,128,128,2,8,14,0,0] -; AVX1-ONLY-NEXT: # xmm3 = mem[0,0] +; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm3 = [128,128,128,2,8,14,0,0,0,0,0,0,0,0,0,0] ; AVX1-ONLY-NEXT: vmovdqa 112(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm0, %xmm0 -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm15 = [0,6,12,128,128,128,0,0,0,6,12,128,128,128,0,0] -; AVX1-ONLY-NEXT: # xmm15 = mem[0,0] +; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm15 = [0,6,12,128,128,128,0,0,0,0,0,0,0,0,0,0] ; AVX1-ONLY-NEXT: vmovdqa 96(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm1, %xmm1 @@ -3737,7 +3724,7 @@ define void @load_i8_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u,u,u,u,u,u,u,2,8,14],zero,zero ; AVX1-ONLY-NEXT: vpor %xmm1, %xmm2, %xmm1 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm2 -; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm14 = [0,0,4,10,0,0,4,10,0,0,4,10,0,0,4,10] +; AVX1-ONLY-NEXT: vmovd {{.*#+}} xmm14 = [0,0,4,10,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm5, %xmm4 ; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm1 = [0,6,12,0,0,6,12,0,0,6,12,0,0,6,12,0] ; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm10, %xmm5 
@@ -3825,12 +3812,10 @@ define void @load_i8_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm7, %ymm0 ; AVX1-ONLY-NEXT: vorps %ymm0, %ymm1, %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm8 = [128,128,128,3,9,15,0,0,128,128,128,3,9,15,0,0] -; AVX1-ONLY-NEXT: # xmm8 = mem[0,0] +; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm8 = [128,128,128,3,9,15,0,0,0,0,0,0,0,0,0,0] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufb %xmm8, %xmm0, %xmm0 -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm4 = [1,7,13,128,128,128,0,0,1,7,13,128,128,128,0,0] -; AVX1-ONLY-NEXT: # xmm4 = mem[0,0] +; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm4 = [1,7,13,128,128,128,0,0,0,0,0,0,0,0,0,0] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm1, %xmm1 ; AVX1-ONLY-NEXT: vpor %xmm0, %xmm1, %xmm0 @@ -3842,7 +3827,7 @@ define void @load_i8_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm5, %xmm14 ; AVX1-ONLY-NEXT: vpor %xmm1, %xmm14, %xmm1 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm14 -; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm3 = [0,0,5,11,0,0,5,11,0,0,5,11,0,0,5,11] +; AVX1-ONLY-NEXT: vmovd {{.*#+}} xmm3 = [0,0,5,11,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm0, %xmm0 ; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm7 = [1,7,13,0,1,7,13,0,1,7,13,0,1,7,13,0] @@ -3919,12 +3904,10 @@ define void @load_i8_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm15, %ymm1 ; AVX1-ONLY-NEXT: vorps %ymm1, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm9 = [128,128,128,4,10,0,0,0,128,128,128,4,10,0,0,0] -; AVX1-ONLY-NEXT: # xmm9 = mem[0,0] +; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm9 = [128,128,128,4,10,0,0,0,0,0,0,0,0,0,0,0] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm10, %xmm0 -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm13 = [2,8,14,128,128,0,0,0,2,8,14,128,128,0,0,0] -; AVX1-ONLY-NEXT: # xmm13 = mem[0,0] +; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm13 = [2,8,14,128,128,0,0,0,0,0,0,0,0,0,0,0] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufb %xmm13, %xmm1, %xmm1 ; AVX1-ONLY-NEXT: vpor %xmm0, %xmm1, %xmm1 @@ -3941,7 +3924,7 @@ define void @load_i8_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm8, %ymm1 ; AVX1-ONLY-NEXT: vandps {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm4 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: vorps %ymm1, %ymm4, %ymm4 -; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm13 = [2,8,14,0,2,8,14,0,2,8,14,0,2,8,14,0] +; AVX1-ONLY-NEXT: vmovd {{.*#+}} xmm13 = [2,8,14,0,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX1-ONLY-NEXT: vpshufb %xmm13, %xmm11, %xmm1 ; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm8 = [0,0,6,12,0,0,6,12,0,0,6,12,0,0,6,12] ; AVX1-ONLY-NEXT: vpshufb %xmm8, %xmm12, %xmm5 @@ -3987,11 +3970,9 @@ define void @load_i8_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm5, %ymm0 ; AVX1-ONLY-NEXT: vorps %ymm0, %ymm3, %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm14 = 
[128,128,128,5,11,0,0,0,128,128,128,5,11,0,0,0] -; AVX1-ONLY-NEXT: # xmm14 = mem[0,0] +; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm14 = [128,128,128,5,11,0,0,0,0,0,0,0,0,0,0,0] ; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm10, %xmm0 -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm13 = [3,9,15,128,128,0,0,0,3,9,15,128,128,0,0,0] -; AVX1-ONLY-NEXT: # xmm13 = mem[0,0] +; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm13 = [3,9,15,128,128,0,0,0,0,0,0,0,0,0,0,0] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufb %xmm13, %xmm14, %xmm3 ; AVX1-ONLY-NEXT: vpor %xmm0, %xmm3, %xmm0 @@ -4009,7 +3990,7 @@ define void @load_i8_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm9, %ymm0 ; AVX1-ONLY-NEXT: vandps {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm4 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: vorps %ymm0, %ymm4, %ymm0 -; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm8 = [3,9,15,0,3,9,15,0,3,9,15,0,3,9,15,0] +; AVX1-ONLY-NEXT: vmovd {{.*#+}} xmm8 = [3,9,15,0,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufb %xmm8, %xmm4, %xmm4 ; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm9 = [0,1,7,13,0,1,7,13,0,1,7,13,0,1,7,13] @@ -4054,11 +4035,9 @@ define void @load_i8_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm7, %ymm2 ; AVX1-ONLY-NEXT: vorps %ymm2, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm1 = [4,10,128,128,128,0,0,0,4,10,128,128,128,0,0,0] -; AVX1-ONLY-NEXT: # xmm1 = mem[0,0] +; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm1 = [4,10,128,128,128,0,0,0,0,0,0,0,0,0,0,0] ; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm14, %xmm0 -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm14 = [128,128,0,6,12,0,0,0,128,128,0,6,12,0,0,0] -; AVX1-ONLY-NEXT: # xmm14 = mem[0,0] +; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm14 = [128,128,0,6,12,0,0,0,0,0,0,0,0,0,0,0] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm13, %xmm2 ; AVX1-ONLY-NEXT: vpor %xmm0, %xmm2, %xmm0 @@ -4121,12 +4100,10 @@ define void @load_i8_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX1-ONLY-NEXT: vandnps %ymm3, %ymm0, %ymm3 ; AVX1-ONLY-NEXT: vorps %ymm3, %ymm4, %ymm2 ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm15 = [5,11,128,128,128,0,0,0,5,11,128,128,128,0,0,0] -; AVX1-ONLY-NEXT: # xmm15 = mem[0,0] +; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm15 = [5,11,128,128,128,0,0,0,0,0,0,0,0,0,0,0] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm2, %xmm3 -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm4 = [128,128,1,7,13,0,0,0,128,128,1,7,13,0,0,0] -; AVX1-ONLY-NEXT: # xmm4 = mem[0,0] +; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm4 = [128,128,1,7,13,0,0,0,0,0,0,0,0,0,0,0] ; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm13, %xmm5 ; AVX1-ONLY-NEXT: vpor %xmm3, %xmm5, %xmm3 ; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm5 = [0,0,128,128,128,3,9,15,0,0,128,128,128,3,9,15] diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-7.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-7.ll index 718d4973f97e25..a7a468d15c4021 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-7.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-7.ll @@ -225,16 +225,16 @@ define void @load_i8_stride7_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; 
AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm1[6,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm4 = xmm0[1,8,15],zero,xmm0[u,u,u,u,u,u,u,u,u,u,u,u] ; AVX1-ONLY-NEXT: vpor %xmm3, %xmm4, %xmm3 -; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm4 = [2,9,0,0,2,9,0,0,2,9,0,0,2,9,0,0] +; AVX1-ONLY-NEXT: vmovd {{.*#+}} xmm4 = [2,9,0,0,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm0, %xmm5 ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm6 = xmm1[0,7,u,u,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3] -; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm6 = [3,10,0,0,3,10,0,0,3,10,0,0,3,10,0,0] +; AVX1-ONLY-NEXT: vmovd {{.*#+}} xmm6 = [3,10,0,0,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm0, %xmm7 ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm8 = xmm1[1,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3] ; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm1, %xmm4 -; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm8 = [4,11,0,0,4,11,0,0,4,11,0,0,4,11,0,0] +; AVX1-ONLY-NEXT: vmovd {{.*#+}} xmm8 = [4,11,0,0,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX1-ONLY-NEXT: vpshufb %xmm8, %xmm0, %xmm9 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm9[0],xmm4[0],xmm9[1],xmm4[1],xmm9[2],xmm4[2],xmm9[3],xmm4[3] ; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm1, %xmm6 @@ -303,16 +303,16 @@ define void @load_i8_stride7_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512F-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm1[6,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512F-NEXT: vpshufb {{.*#+}} xmm4 = xmm0[1,8,15],zero,xmm0[u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512F-NEXT: vpor %xmm3, %xmm4, %xmm3 -; AVX512F-NEXT: vpbroadcastd {{.*#+}} xmm4 = [2,9,0,0,2,9,0,0,2,9,0,0,2,9,0,0] +; AVX512F-NEXT: vmovd {{.*#+}} xmm4 = [2,9,0,0,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX512F-NEXT: vpshufb %xmm4, %xmm0, %xmm5 ; AVX512F-NEXT: vpshufb {{.*#+}} xmm6 = xmm1[0,7,u,u,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3] -; AVX512F-NEXT: vpbroadcastd {{.*#+}} xmm6 = [3,10,0,0,3,10,0,0,3,10,0,0,3,10,0,0] +; AVX512F-NEXT: vmovd {{.*#+}} xmm6 = [3,10,0,0,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX512F-NEXT: vpshufb %xmm6, %xmm0, %xmm7 ; AVX512F-NEXT: vpshufb {{.*#+}} xmm8 = xmm1[1,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3] ; AVX512F-NEXT: vpshufb %xmm4, %xmm1, %xmm4 -; AVX512F-NEXT: vpbroadcastd {{.*#+}} xmm8 = [4,11,0,0,4,11,0,0,4,11,0,0,4,11,0,0] +; AVX512F-NEXT: vmovd {{.*#+}} xmm8 = [4,11,0,0,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX512F-NEXT: vpshufb %xmm8, %xmm0, %xmm9 ; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm9[0],xmm4[0],xmm9[1],xmm4[1],xmm9[2],xmm4[2],xmm9[3],xmm4[3] ; AVX512F-NEXT: vpshufb %xmm6, %xmm1, %xmm6 @@ -654,7 +654,7 @@ define void @load_i8_stride7_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm7 = xmm3[u,u,u,u],zero,zero,zero,xmm3[5,u,u,u,u,u,u,u,u] ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm9 = xmm2[u,u,u,u,0,7,14],zero,xmm2[u,u,u,u,u,u,u,u] ; AVX1-ONLY-NEXT: vpor %xmm7, %xmm9, %xmm7 -; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm9 = [4,11,0,0,4,11,0,0,4,11,0,0,4,11,0,0] +; AVX1-ONLY-NEXT: vmovd {{.*#+}} xmm9 = [4,11,0,0,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm0, %xmm10 ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm11 = xmm1[2,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX1-ONLY-NEXT: vpunpcklwd 
{{.*#+}} xmm10 = xmm10[0],xmm11[0],xmm10[1],xmm11[1],xmm10[2],xmm11[2],xmm10[3],xmm11[3] @@ -1403,7 +1403,7 @@ define void @load_i8_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm3[u,u,u,u,u,3,10],zero,zero,zero,xmm3[u,u,u,u,u,u] ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm5 = xmm4[u,u,u,u,u],zero,zero,xmm4[1,8,15,u,u,u,u,u,u] ; AVX1-ONLY-NEXT: vpor %xmm1, %xmm5, %xmm1 -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm5 = [255,255,255,255,255,0,0,0,0,0,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm5 = [255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0] ; AVX1-ONLY-NEXT: vpblendvb %xmm5, %xmm0, %xmm1, %xmm0 ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovdqa 80(%rdi), %xmm1 @@ -1415,7 +1415,7 @@ define void @load_i8_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm9 = xmm4[u,u,u,u,u],zero,zero,xmm4[2,9,u,u,u,u,u,u,u] ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm10 = xmm3[u,u,u,u,u,4,11],zero,zero,xmm3[u,u,u,u,u,u,u] ; AVX1-ONLY-NEXT: vpor %xmm9, %xmm10, %xmm9 -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm11 = [255,255,255,255,255,0,0,0,0,u,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm11 = [255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0] ; AVX1-ONLY-NEXT: vpblendvb %xmm11, %xmm8, %xmm9, %xmm8 ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm9 = xmm1[u,u,u,u,5,12,u,u,u,u,u,u,u,u,u,u] ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm10 = xmm5[u,u,u,u,u,0,7,14,u,u,u,u,u,u,u,u] @@ -1455,7 +1455,7 @@ define void @load_i8_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm14 = xmm6[u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,zero,xmm6[5,12] ; AVX1-ONLY-NEXT: vpor %xmm14, %xmm13, %xmm13 ; AVX1-ONLY-NEXT: vpblendvb %xmm9, %xmm11, %xmm13, %xmm11 -; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm13 = [4,11,0,0,4,11,0,0,4,11,0,0,4,11,0,0] +; AVX1-ONLY-NEXT: vmovd {{.*#+}} xmm13 = [4,11,0,0,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX1-ONLY-NEXT: vpshufb %xmm13, %xmm2, %xmm14 ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm15 = xmm7[2,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm14[0],xmm15[0],xmm14[1],xmm15[1],xmm14[2],xmm15[2],xmm14[3],xmm15[3] @@ -2935,7 +2935,7 @@ define void @load_i8_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = xmm15[u,u,u,u,u,3,10],zero,zero,zero,xmm15[u,u,u,u,u,u] ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = xmm4[u,u,u,u,u],zero,zero,xmm4[1,8,15,u,u,u,u,u,u] ; AVX1-ONLY-NEXT: vpor %xmm2, %xmm3, %xmm2 -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm3 = [255,255,255,255,255,0,0,0,0,0,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm3 = [255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0] ; AVX1-ONLY-NEXT: vpblendvb %xmm3, %xmm1, %xmm2, %xmm1 ; AVX1-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm6[u,u,u,u,u,u,u,2,9],zero,zero,zero,xmm6[u,u,u,u] @@ -2952,7 +2952,7 @@ define void @load_i8_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm0 = xmm4[u,u,u,u,u],zero,zero,xmm4[2,9,u,u,u,u,u,u,u] ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = xmm15[u,u,u,u,u,4,11],zero,zero,xmm15[u,u,u,u,u,u,u] ; AVX1-ONLY-NEXT: vpor %xmm0, %xmm2, %xmm2 -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm0 = [255,255,255,255,255,0,0,0,0,u,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm0 = [255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0] ; AVX1-ONLY-NEXT: vpblendvb %xmm0, %xmm1, %xmm2, %xmm1 ; AVX1-ONLY-NEXT: vmovdqu %ymm1, (%rsp) # 
32-byte Spill ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm8[u,u],zero,zero,zero,xmm8[5,12,u,u,u,u,u,u,u,u,u] @@ -2961,7 +2961,7 @@ define void @load_i8_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = xmm6[u,u,u,u,u,u,u,3,10],zero,zero,zero,xmm6[u,u,u,u] ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = xmm7[u,u,u,u,u,u,u],zero,zero,xmm7[1,8,15,u,u,u,u] ; AVX1-ONLY-NEXT: vpor %xmm2, %xmm3, %xmm2 -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm3 = [u,u,255,255,255,255,255,0,0,0,0,0,u,u,u,u] +; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm3 = [0,0,255,255,255,255,255,0,0,0,0,0,0,0,0,0] ; AVX1-ONLY-NEXT: vpblendvb %xmm3, %xmm1, %xmm2, %xmm1 ; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm10[2,9],zero,zero,zero,xmm10[u,u,u,u,u,u,u,u,u,u,u] @@ -2988,7 +2988,7 @@ define void @load_i8_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm0 = xmm7[u,u,u,u,u,u,u],zero,zero,xmm7[2,9,u,u,u,u,u] ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = xmm6[u,u,u,u,u,u,u,4,11],zero,zero,xmm6[u,u,u,u,u] ; AVX1-ONLY-NEXT: vpor %xmm0, %xmm2, %xmm2 -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm0 = [u,u,255,255,255,255,255,0,0,0,0,u,u,u,u,u] +; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm0 = [0,0,255,255,255,255,255,0,0,0,0,0,0,0,0,0] ; AVX1-ONLY-NEXT: vpblendvb %xmm0, %xmm1, %xmm2, %xmm2 ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm9[u,u,2,9],zero,zero,zero,xmm9[u,u,u,u,u,u,u,u,u] ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = xmm8[u,u],zero,zero,xmm8[0,7,14,u,u,u,u,u,u,u,u,u] @@ -3096,7 +3096,7 @@ define void @load_i8_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,6],xmm3[7] ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm4 = xmm8[u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,zero,xmm8[4,11] ; AVX1-ONLY-NEXT: vpor %xmm4, %xmm1, %xmm1 -; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm12 = [2,9,0,0,2,9,0,0,2,9,0,0,2,9,0,0] +; AVX1-ONLY-NEXT: vmovd {{.*#+}} xmm12 = [2,9,0,0,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm2, %xmm4 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm1, %ymm4 ; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm1 = [255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255] @@ -3115,7 +3115,7 @@ define void @load_i8_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,6],xmm3[7] ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm7 = xmm8[u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,zero,xmm8[5,12] ; AVX1-ONLY-NEXT: vpor %xmm7, %xmm4, %xmm7 -; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm4 = [3,10,0,0,3,10,0,0,3,10,0,0,3,10,0,0] +; AVX1-ONLY-NEXT: vmovd {{.*#+}} xmm4 = [3,10,0,0,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm2, %xmm10 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm7, %ymm7 ; AVX1-ONLY-NEXT: vandps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm10 # 32-byte Folded Reload @@ -3127,7 +3127,7 @@ define void @load_i8_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX1-ONLY-NEXT: vorps %ymm7, %ymm10, %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill ; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm11, %xmm7 -; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm14 = [4,11,0,0,4,11,0,0,4,11,0,0,4,11,0,0] +; AVX1-ONLY-NEXT: vmovd {{.*#+}} xmm14 = [4,11,0,0,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm5, %xmm10 ; AVX1-ONLY-NEXT: 
vpunpcklwd {{.*#+}} xmm7 = xmm10[0],xmm7[0],xmm10[1],xmm7[1],xmm10[2],xmm7[2],xmm10[3],xmm7[3] @@ -3153,7 +3153,7 @@ define void @load_i8_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX1-ONLY-NEXT: vorps %ymm7, %ymm3, %ymm3 ; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm11, %xmm4 -; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm7 = [5,12,0,0,5,12,0,0,5,12,0,0,5,12,0,0] +; AVX1-ONLY-NEXT: vmovd {{.*#+}} xmm7 = [5,12,0,0,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm5, %xmm10 ; AVX1-ONLY-NEXT: vmovdqa %xmm5, %xmm3 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm10[0],xmm4[0],xmm10[1],xmm4[1],xmm10[2],xmm4[2],xmm10[3],xmm4[3] @@ -3179,7 +3179,7 @@ define void @load_i8_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX1-ONLY-NEXT: vandnps %ymm5, %ymm13, %ymm5 ; AVX1-ONLY-NEXT: vorps %ymm5, %ymm4, %ymm4 ; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm11, %xmm5 -; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm7 = [6,13,0,0,6,13,0,0,6,13,0,0,6,13,0,0] +; AVX1-ONLY-NEXT: vmovd {{.*#+}} xmm7 = [6,13,0,0,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm3, %xmm10 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm10[0],xmm5[0],xmm10[1],xmm5[1],xmm10[2],xmm5[2],xmm10[3],xmm5[3] ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm10 = xmm15[u,u,u,u,2,9],zero,zero,zero,xmm15[u,u,u,u,u,u,u] @@ -6763,11 +6763,9 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX1-ONLY-NEXT: vmovdqa %xmm2, %xmm10 ; AVX1-ONLY-NEXT: vmovdqa 48(%rdi), %xmm8 ; AVX1-ONLY-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm2 = [128,128,128,5,12,0,0,0,128,128,128,5,12,0,0,0] -; AVX1-ONLY-NEXT: # xmm2 = mem[0,0] +; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm2 = [128,128,128,5,12,0,0,0,0,0,0,0,0,0,0,0] ; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm3, %xmm4 -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm3 = [0,7,14,128,128,0,0,0,0,7,14,128,128,0,0,0] -; AVX1-ONLY-NEXT: # xmm3 = mem[0,0] +; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm3 = [0,7,14,128,128,0,0,0,0,0,0,0,0,0,0,0] ; AVX1-ONLY-NEXT: vmovdqa (%rdi), %xmm5 ; AVX1-ONLY-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm5, %xmm5 @@ -6781,7 +6779,7 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX1-ONLY-NEXT: # xmm5 = mem[0,0] ; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm8, %xmm8 ; AVX1-ONLY-NEXT: vpor %xmm6, %xmm8, %xmm8 -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm6 = [255,255,255,255,255,0,0,0,0,0,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm6 = [255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0] ; AVX1-ONLY-NEXT: vpblendvb %xmm6, %xmm7, %xmm8, %xmm7 ; AVX1-ONLY-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovdqa 240(%rdi), %xmm7 @@ -6863,11 +6861,9 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX1-ONLY-NEXT: vpor %xmm2, %xmm3, %xmm2 ; AVX1-ONLY-NEXT: vpblendvb %xmm15, %xmm0, %xmm2, %xmm0 ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm0 = [0,0,128,128,128,5,12,0,0,0,128,128,128,5,12,0] -; AVX1-ONLY-NEXT: # xmm0 = mem[0,0] +; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm0 = [0,0,128,128,128,5,12,0,0,0,0,0,0,0,0,0] ; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm12, %xmm2 -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm12 = [0,0,0,7,14,128,128,0,0,0,0,7,14,128,128,0] -; AVX1-ONLY-NEXT: # xmm12 = mem[0,0] +; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm12 = 
[0,0,0,7,14,128,128,0,0,0,0,0,0,0,0,0] ; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm7, %xmm3 ; AVX1-ONLY-NEXT: vpor %xmm2, %xmm3, %xmm5 ; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm2 = [10,128,128,128,0,0,0,3,10,128,128,128,0,0,0,3] @@ -6877,7 +6873,7 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX1-ONLY-NEXT: # xmm3 = mem[0,0] ; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm10, %xmm13 ; AVX1-ONLY-NEXT: vpor %xmm4, %xmm13, %xmm13 -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm4 = [u,u,255,255,255,255,255,0,0,0,0,0,u,u,u,u] +; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm4 = [0,0,255,255,255,255,255,0,0,0,0,0,0,0,0,0] ; AVX1-ONLY-NEXT: vpblendvb %xmm4, %xmm5, %xmm13, %xmm5 ; AVX1-ONLY-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm8, %xmm0 @@ -6888,12 +6884,10 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX1-ONLY-NEXT: vpor %xmm1, %xmm2, %xmm1 ; AVX1-ONLY-NEXT: vpblendvb %xmm4, %xmm0, %xmm1, %xmm0 ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm14 = [128,128,128,6,13,0,0,0,128,128,128,6,13,0,0,0] -; AVX1-ONLY-NEXT: # xmm14 = mem[0,0] +; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm14 = [128,128,128,6,13,0,0,0,0,0,0,0,0,0,0,0] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm11, %xmm0 -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm2 = [1,8,15,128,128,0,0,0,1,8,15,128,128,0,0,0] -; AVX1-ONLY-NEXT: # xmm2 = mem[0,0] +; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm2 = [1,8,15,128,128,0,0,0,0,0,0,0,0,0,0,0] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm10, %xmm3 ; AVX1-ONLY-NEXT: vpor %xmm0, %xmm3, %xmm3 @@ -6904,7 +6898,7 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm7, %xmm13 ; AVX1-ONLY-NEXT: vpor %xmm0, %xmm13, %xmm13 -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm9 = [255,255,255,255,255,0,0,0,0,u,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm9 = [255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0] ; AVX1-ONLY-NEXT: vpblendvb %xmm9, %xmm3, %xmm13, %xmm0 ; AVX1-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload @@ -6920,11 +6914,9 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX1-ONLY-NEXT: vpblendvb %xmm9, %xmm14, %xmm2, %xmm2 ; AVX1-ONLY-NEXT: vmovdqa %xmm9, %xmm12 ; AVX1-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm9 = [2,9,128,128,128,0,0,0,2,9,128,128,128,0,0,0] -; AVX1-ONLY-NEXT: # xmm9 = mem[0,0] +; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm9 = [2,9,128,128,128,0,0,0,0,0,0,0,0,0,0,0] ; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm10, %xmm2 -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm3 = [128,128,0,7,14,0,0,0,128,128,0,7,14,0,0,0] -; AVX1-ONLY-NEXT: # xmm3 = mem[0,0] +; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm3 = [128,128,0,7,14,0,0,0,0,0,0,0,0,0,0,0] ; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm11, %xmm4 ; AVX1-ONLY-NEXT: vpor %xmm2, %xmm4, %xmm2 ; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm4 = [10,128,128,3,10,128,128,3,10,128,128,3,10,128,128,3] @@ -6946,11 +6938,9 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX1-ONLY-NEXT: vpblendvb %xmm14, %xmm1, %xmm2, %xmm1 ; AVX1-ONLY-NEXT: vmovdqa %xmm14, 
%xmm6 ; AVX1-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm1 = [3,10,128,128,128,0,0,0,3,10,128,128,128,0,0,0] -; AVX1-ONLY-NEXT: # xmm1 = mem[0,0] +; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm1 = [3,10,128,128,128,0,0,0,0,0,0,0,0,0,0,0] ; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm10, %xmm3 -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm2 = [128,128,1,8,15,0,0,0,128,128,1,8,15,0,0,0] -; AVX1-ONLY-NEXT: # xmm2 = mem[0,0] +; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm2 = [128,128,1,8,15,0,0,0,0,0,0,0,0,0,0,0] ; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm11, %xmm4 ; AVX1-ONLY-NEXT: vpor %xmm3, %xmm4, %xmm5 ; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm3 = [11,128,128,4,11,128,128,4,11,128,128,4,11,128,128,4] @@ -6968,12 +6958,10 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX1-ONLY-NEXT: vpor %xmm2, %xmm3, %xmm2 ; AVX1-ONLY-NEXT: vpblendvb %xmm6, %xmm1, %xmm2, %xmm0 ; AVX1-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm0 = [0,0,128,128,128,6,13,0,0,0,128,128,128,6,13,0] -; AVX1-ONLY-NEXT: # xmm0 = mem[0,0] +; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm0 = [0,0,128,128,128,6,13,0,0,0,0,0,0,0,0,0] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm11, %xmm1 -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm3 = [0,0,1,8,15,128,128,0,0,0,1,8,15,128,128,0] -; AVX1-ONLY-NEXT: # xmm3 = mem[0,0] +; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm3 = [0,0,1,8,15,128,128,0,0,0,0,0,0,0,0,0] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm14, %xmm2 ; AVX1-ONLY-NEXT: vpor %xmm1, %xmm2, %xmm1 @@ -6984,7 +6972,7 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm12, %xmm13 ; AVX1-ONLY-NEXT: vpor %xmm2, %xmm13, %xmm13 -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm2 = [u,u,255,255,255,255,255,0,0,0,0,u,u,u,u,u] +; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm2 = [0,0,255,255,255,255,255,0,0,0,0,0,0,0,0,0] ; AVX1-ONLY-NEXT: vpblendvb %xmm2, %xmm1, %xmm13, %xmm1 ; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload @@ -6999,12 +6987,10 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX1-ONLY-NEXT: vpor %xmm1, %xmm3, %xmm1 ; AVX1-ONLY-NEXT: vpblendvb %xmm2, %xmm0, %xmm1, %xmm0 ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm1 = [0,0,2,9,128,128,128,0,0,0,2,9,128,128,128,0] -; AVX1-ONLY-NEXT: # xmm1 = mem[0,0] +; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm1 = [0,0,2,9,128,128,128,0,0,0,0,0,0,0,0,0] ; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm14, %xmm0 ; AVX1-ONLY-NEXT: vmovdqa %xmm14, %xmm15 -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm3 = [0,0,128,128,0,7,14,0,0,0,128,128,0,7,14,0] -; AVX1-ONLY-NEXT: # xmm3 = mem[0,0] +; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm3 = [0,0,128,128,0,7,14,0,0,0,0,0,0,0,0,0] ; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm11, %xmm4 ; AVX1-ONLY-NEXT: vpor %xmm0, %xmm4, %xmm5 ; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm4 = [128,3,10,128,128,3,10,128,128,3,10,128,128,3,10,128] @@ -7025,11 +7011,9 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm6, %xmm0 ; AVX1-ONLY-NEXT: vpor %xmm3, %xmm0, %xmm0 ; AVX1-ONLY-NEXT: vpblendvb 
%xmm8, %xmm1, %xmm0, %xmm14 -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm1 = [0,0,3,10,128,128,128,0,0,0,3,10,128,128,128,0] -; AVX1-ONLY-NEXT: # xmm1 = mem[0,0] +; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm1 = [0,0,3,10,128,128,128,0,0,0,0,0,0,0,0,0] ; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm15, %xmm0 -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm3 = [0,0,128,128,1,8,15,0,0,0,128,128,1,8,15,0] -; AVX1-ONLY-NEXT: # xmm3 = mem[0,0] +; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm3 = [0,0,128,128,1,8,15,0,0,0,0,0,0,0,0,0] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm7, %xmm4 ; AVX1-ONLY-NEXT: vpor %xmm0, %xmm4, %xmm4 @@ -7098,10 +7082,10 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX1-ONLY-NEXT: vpor %xmm0, %xmm1, %xmm0 ; AVX1-ONLY-NEXT: vpblendvb %xmm13, %xmm2, %xmm0, %xmm0 ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm0 = [2,9,0,0,2,9,0,0,2,9,0,0,2,9,0,0] +; AVX1-ONLY-NEXT: vmovd {{.*#+}} xmm0 = [2,9,0,0,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm7, %xmm1 ; AVX1-ONLY-NEXT: vmovdqa %xmm0, %xmm14 -; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm0 = [0,0,4,11,0,0,4,11,0,0,4,11,0,0,4,11] +; AVX1-ONLY-NEXT: vmovd {{.*#+}} xmm0 = [0,0,4,11,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm2, %xmm2 ; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm7 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] @@ -7167,7 +7151,7 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm11 = [0,0,0,1,2,3,8,15,0,0,0,1,2,3,8,15] ; AVX1-ONLY-NEXT: # xmm11 = mem[0,0] ; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm7, %xmm10 -; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm0 = [0,7,14,0,0,7,14,0,0,7,14,0,0,7,14,0] +; AVX1-ONLY-NEXT: vmovd {{.*#+}} xmm0 = [0,7,14,0,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX1-ONLY-NEXT: vmovdqa 112(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm2, %xmm12 @@ -7286,7 +7270,7 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX1-ONLY-NEXT: vmovdqa %xmm4, %xmm12 ; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm4, %xmm7 ; AVX1-ONLY-NEXT: vpor %xmm7, %xmm1, %xmm1 -; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm9 = [2,9,0,0,2,9,0,0,2,9,0,0,2,9,0,0] +; AVX1-ONLY-NEXT: vmovd {{.*#+}} xmm9 = [2,9,0,0,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm3, %xmm7 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm1, %ymm7 ; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm1 = [255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255] @@ -7341,7 +7325,7 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm12, %xmm6 ; AVX1-ONLY-NEXT: vmovdqa %xmm12, %xmm11 ; AVX1-ONLY-NEXT: vpor %xmm6, %xmm2, %xmm6 -; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm8 = [3,10,0,0,3,10,0,0,3,10,0,0,3,10,0,0] +; AVX1-ONLY-NEXT: vmovd {{.*#+}} xmm8 = [3,10,0,0,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufb %xmm8, %xmm2, %xmm7 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm6, %ymm6 @@ -7371,9 +7355,9 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX1-ONLY-NEXT: vorps %ymm4, %ymm3, %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 
32-byte Spill ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm14 = [2,9,0,0,2,9,0,0,2,9,0,0,2,9,0,0] +; AVX1-ONLY-NEXT: vmovd {{.*#+}} xmm14 = [2,9,0,0,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm8, %xmm3 -; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm0 = [4,11,0,0,4,11,0,0,4,11,0,0,4,11,0,0] +; AVX1-ONLY-NEXT: vmovd {{.*#+}} xmm0 = [4,11,0,0,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm13, %xmm4 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] @@ -7442,9 +7426,9 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX1-ONLY-NEXT: vandps %ymm3, %ymm15, %ymm3 ; AVX1-ONLY-NEXT: vorps %ymm4, %ymm3, %ymm3 ; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm2 = [3,10,0,0,3,10,0,0,3,10,0,0,3,10,0,0] +; AVX1-ONLY-NEXT: vmovd {{.*#+}} xmm2 = [3,10,0,0,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm8, %xmm3 -; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm5 = [5,12,0,0,5,12,0,0,5,12,0,0,5,12,0,0] +; AVX1-ONLY-NEXT: vmovd {{.*#+}} xmm5 = [5,12,0,0,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm13, %xmm4 ; AVX1-ONLY-NEXT: vmovdqa %xmm5, %xmm8 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] @@ -7512,7 +7496,7 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm4 = xmm0[4,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm2 = [6,13,0,0,6,13,0,0,6,13,0,0,6,13,0,0] +; AVX1-ONLY-NEXT: vmovd {{.*#+}} xmm2 = [6,13,0,0,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm0, %xmm6 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm6[0],xmm4[0],xmm6[1],xmm4[1],xmm6[2],xmm4[2],xmm6[3],xmm4[3] @@ -9561,7 +9545,7 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm1 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0] ; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %ymm0, %ymm1, %ymm2 ; AVX512F-ONLY-SLOW-NEXT: vpternlogq $226, %ymm17, %ymm7, %ymm28 -; AVX512F-ONLY-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm10 = [4,11,0,0,4,11,0,0,4,11,0,0,4,11,0,0] +; AVX512F-ONLY-SLOW-NEXT: vmovd {{.*#+}} xmm10 = [4,11,0,0,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX512F-ONLY-SLOW-NEXT: vpternlogq $202, %ymm29, %ymm21, %ymm15 ; AVX512F-ONLY-SLOW-NEXT: vpternlogq $202, %ymm21, %ymm29, %ymm7 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload @@ -9977,7 +9961,7 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0] ; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %ymm2, %ymm11, %ymm3 ; AVX512F-ONLY-FAST-NEXT: vpternlogq $226, %ymm15, %ymm14, %ymm16 -; AVX512F-ONLY-FAST-NEXT: vpbroadcastd {{.*#+}} xmm8 = [4,11,0,0,4,11,0,0,4,11,0,0,4,11,0,0] +; 
AVX512F-ONLY-FAST-NEXT: vmovd {{.*#+}} xmm8 = [4,11,0,0,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX512F-ONLY-FAST-NEXT: vpternlogq $202, %ymm29, %ymm31, %ymm13 ; AVX512F-ONLY-FAST-NEXT: vpternlogq $202, %ymm31, %ymm29, %ymm14 ; AVX512F-ONLY-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -10412,7 +10396,7 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} ymm18 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0] ; AVX512DQ-SLOW-NEXT: vpternlogq $184, %ymm0, %ymm18, %ymm2 ; AVX512DQ-SLOW-NEXT: vpternlogq $226, %ymm21, %ymm7, %ymm16 -; AVX512DQ-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm10 = [4,11,0,0,4,11,0,0,4,11,0,0,4,11,0,0] +; AVX512DQ-SLOW-NEXT: vmovd {{.*#+}} xmm10 = [4,11,0,0,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX512DQ-SLOW-NEXT: vpternlogq $202, %ymm6, %ymm13, %ymm15 ; AVX512DQ-SLOW-NEXT: vpternlogq $202, %ymm13, %ymm6, %ymm7 ; AVX512DQ-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload @@ -10829,7 +10813,7 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0] ; AVX512DQ-FAST-NEXT: vpternlogq $184, %ymm2, %ymm11, %ymm8 ; AVX512DQ-FAST-NEXT: vpternlogq $226, %ymm15, %ymm14, %ymm16 -; AVX512DQ-FAST-NEXT: vpbroadcastd {{.*#+}} xmm6 = [4,11,0,0,4,11,0,0,4,11,0,0,4,11,0,0] +; AVX512DQ-FAST-NEXT: vmovd {{.*#+}} xmm6 = [4,11,0,0,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX512DQ-FAST-NEXT: vpternlogq $202, %ymm31, %ymm0, %ymm13 ; AVX512DQ-FAST-NEXT: vpternlogq $202, %ymm0, %ymm31, %ymm14 ; AVX512DQ-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-8.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-8.ll index f2133b9e42d30d..0f779732c0fd52 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-8.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-8.ll @@ -240,37 +240,37 @@ define void @load_i8_stride8_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %r11 -; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm0 = [0,8,0,0,0,8,0,0,0,8,0,0,0,8,0,0] +; AVX1-ONLY-NEXT: vmovd {{.*#+}} xmm0 = [0,8,0,0,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX1-ONLY-NEXT: vmovdqa (%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovdqa 16(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm2, %xmm3 ; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm1, %xmm0 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3] -; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm3 = [1,9,0,0,1,9,0,0,1,9,0,0,1,9,0,0] +; AVX1-ONLY-NEXT: vmovd {{.*#+}} xmm3 = [1,9,0,0,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm2, %xmm4 ; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm1, %xmm3 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] -; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm4 = [2,10,0,0,2,10,0,0,2,10,0,0,2,10,0,0] +; AVX1-ONLY-NEXT: vmovd {{.*#+}} xmm4 = [2,10,0,0,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm2, %xmm5 ; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm1, %xmm4 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3] -; 
AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm5 = [3,11,0,0,3,11,0,0,3,11,0,0,3,11,0,0] +; AVX1-ONLY-NEXT: vmovd {{.*#+}} xmm5 = [3,11,0,0,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm2, %xmm6 ; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm1, %xmm5 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3] -; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm6 = [4,12,0,0,4,12,0,0,4,12,0,0,4,12,0,0] +; AVX1-ONLY-NEXT: vmovd {{.*#+}} xmm6 = [4,12,0,0,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm2, %xmm7 ; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm1, %xmm6 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3] -; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm7 = [5,13,0,0,5,13,0,0,5,13,0,0,5,13,0,0] +; AVX1-ONLY-NEXT: vmovd {{.*#+}} xmm7 = [5,13,0,0,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm2, %xmm8 ; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm1, %xmm7 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3] -; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm8 = [6,14,0,0,6,14,0,0,6,14,0,0,6,14,0,0] +; AVX1-ONLY-NEXT: vmovd {{.*#+}} xmm8 = [6,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX1-ONLY-NEXT: vpshufb %xmm8, %xmm2, %xmm9 ; AVX1-ONLY-NEXT: vpshufb %xmm8, %xmm1, %xmm8 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm8[0],xmm9[0],xmm8[1],xmm9[1],xmm8[2],xmm9[2],xmm8[3],xmm9[3] -; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm9 = [7,15,0,0,7,15,0,0,7,15,0,0,7,15,0,0] +; AVX1-ONLY-NEXT: vmovd {{.*#+}} xmm9 = [7,15,0,0,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm2, %xmm2 ; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm1, %xmm1 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] @@ -592,7 +592,7 @@ define void @load_i8_stride8_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %r11 -; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm4 = [0,0,0,8,0,0,0,8,0,0,0,8,0,0,0,8] +; AVX1-ONLY-NEXT: vmovd {{.*#+}} xmm4 = [0,0,0,8,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX1-ONLY-NEXT: vmovdqa (%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovdqa 16(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovdqa 32(%rdi), %xmm2 @@ -600,70 +600,70 @@ define void @load_i8_stride8_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm3, %xmm5 ; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm2, %xmm4 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3] -; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm5 = [0,8,0,0,0,8,0,0,0,8,0,0,0,8,0,0] +; AVX1-ONLY-NEXT: vmovd {{.*#+}} xmm5 = [0,8,0,0,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm1, %xmm6 ; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm0, %xmm5 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3],xmm5[4,5,6,7] -; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm5 = [0,0,1,9,0,0,1,9,0,0,1,9,0,0,1,9] +; AVX1-ONLY-NEXT: vmovd {{.*#+}} xmm5 = [0,0,1,9,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm3, %xmm6 ; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm2, %xmm5 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3] -; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm6 = [1,9,0,0,1,9,0,0,1,9,0,0,1,9,0,0] +; AVX1-ONLY-NEXT: vmovd {{.*#+}} xmm6 = 
[1,9,0,0,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm1, %xmm7 ; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm0, %xmm6 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0,1],xmm5[2,3],xmm6[4,5,6,7] -; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm6 = [0,0,2,10,0,0,2,10,0,0,2,10,0,0,2,10] +; AVX1-ONLY-NEXT: vmovd {{.*#+}} xmm6 = [0,0,2,10,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm3, %xmm7 ; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm2, %xmm6 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3] -; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm7 = [2,10,0,0,2,10,0,0,2,10,0,0,2,10,0,0] +; AVX1-ONLY-NEXT: vmovd {{.*#+}} xmm7 = [2,10,0,0,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm1, %xmm8 ; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm0, %xmm7 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm7[0,1],xmm6[2,3],xmm7[4,5,6,7] -; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm7 = [0,0,3,11,0,0,3,11,0,0,3,11,0,0,3,11] +; AVX1-ONLY-NEXT: vmovd {{.*#+}} xmm7 = [0,0,3,11,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm3, %xmm8 ; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm2, %xmm7 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3] -; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm8 = [3,11,0,0,3,11,0,0,3,11,0,0,3,11,0,0] +; AVX1-ONLY-NEXT: vmovd {{.*#+}} xmm8 = [3,11,0,0,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX1-ONLY-NEXT: vpshufb %xmm8, %xmm1, %xmm9 ; AVX1-ONLY-NEXT: vpshufb %xmm8, %xmm0, %xmm8 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm8[0],xmm9[0],xmm8[1],xmm9[1],xmm8[2],xmm9[2],xmm8[3],xmm9[3] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm7 = xmm8[0,1],xmm7[2,3],xmm8[4,5,6,7] -; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm8 = [0,0,4,12,0,0,4,12,0,0,4,12,0,0,4,12] +; AVX1-ONLY-NEXT: vmovd {{.*#+}} xmm8 = [0,0,4,12,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX1-ONLY-NEXT: vpshufb %xmm8, %xmm3, %xmm9 ; AVX1-ONLY-NEXT: vpshufb %xmm8, %xmm2, %xmm8 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm8[0],xmm9[0],xmm8[1],xmm9[1],xmm8[2],xmm9[2],xmm8[3],xmm9[3] -; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm9 = [4,12,0,0,4,12,0,0,4,12,0,0,4,12,0,0] +; AVX1-ONLY-NEXT: vmovd {{.*#+}} xmm9 = [4,12,0,0,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm1, %xmm10 ; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm0, %xmm9 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm9[0],xmm10[0],xmm9[1],xmm10[1],xmm9[2],xmm10[2],xmm9[3],xmm10[3] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm8 = xmm9[0,1],xmm8[2,3],xmm9[4,5,6,7] -; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm9 = [0,0,5,13,0,0,5,13,0,0,5,13,0,0,5,13] +; AVX1-ONLY-NEXT: vmovd {{.*#+}} xmm9 = [0,0,5,13,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm3, %xmm10 ; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm2, %xmm9 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm9[0],xmm10[0],xmm9[1],xmm10[1],xmm9[2],xmm10[2],xmm9[3],xmm10[3] -; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm10 = [5,13,0,0,5,13,0,0,5,13,0,0,5,13,0,0] +; AVX1-ONLY-NEXT: vmovd {{.*#+}} xmm10 = [5,13,0,0,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX1-ONLY-NEXT: vpshufb %xmm10, %xmm1, %xmm11 ; AVX1-ONLY-NEXT: vpshufb %xmm10, %xmm0, %xmm10 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm10[0],xmm11[0],xmm10[1],xmm11[1],xmm10[2],xmm11[2],xmm10[3],xmm11[3] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm9 = xmm10[0,1],xmm9[2,3],xmm10[4,5,6,7] -; 
AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm10 = [0,0,6,14,0,0,6,14,0,0,6,14,0,0,6,14] +; AVX1-ONLY-NEXT: vmovd {{.*#+}} xmm10 = [0,0,6,14,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX1-ONLY-NEXT: vpshufb %xmm10, %xmm3, %xmm11 ; AVX1-ONLY-NEXT: vpshufb %xmm10, %xmm2, %xmm10 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm10[0],xmm11[0],xmm10[1],xmm11[1],xmm10[2],xmm11[2],xmm10[3],xmm11[3] -; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm11 = [6,14,0,0,6,14,0,0,6,14,0,0,6,14,0,0] +; AVX1-ONLY-NEXT: vmovd {{.*#+}} xmm11 = [6,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm1, %xmm12 ; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm0, %xmm11 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm11[0],xmm12[0],xmm11[1],xmm12[1],xmm11[2],xmm12[2],xmm11[3],xmm12[3] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm10 = xmm11[0,1],xmm10[2,3],xmm11[4,5,6,7] -; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm11 = [0,0,7,15,0,0,7,15,0,0,7,15,0,0,7,15] +; AVX1-ONLY-NEXT: vmovd {{.*#+}} xmm11 = [0,0,7,15,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm3, %xmm3 ; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm2, %xmm2 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] -; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm3 = [7,15,0,0,7,15,0,0,7,15,0,0,7,15,0,0] +; AVX1-ONLY-NEXT: vmovd {{.*#+}} xmm3 = [7,15,0,0,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm1, %xmm1 ; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm0, %xmm0 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] @@ -1287,7 +1287,7 @@ define void @load_i8_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm5, %xmm3 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm6[0],xmm3[1],xmm6[1],xmm3[2],xmm6[2],xmm3[3],xmm6[3] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0,1,2,3,4,5],xmm0[6,7] -; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm9 = [0,0,0,8,0,0,0,8,0,0,0,8,0,0,0,8] +; AVX1-ONLY-NEXT: vmovd {{.*#+}} xmm9 = [0,0,0,8,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX1-ONLY-NEXT: vmovdqa (%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovdqa 16(%rdi), %xmm3 ; AVX1-ONLY-NEXT: vmovdqa 32(%rdi), %xmm6 @@ -1295,7 +1295,7 @@ define void @load_i8_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm7, %xmm10 ; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm6, %xmm9 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm9[0],xmm10[0],xmm9[1],xmm10[1],xmm9[2],xmm10[2],xmm9[3],xmm10[3] -; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm10 = [0,8,0,0,0,8,0,0,0,8,0,0,0,8,0,0] +; AVX1-ONLY-NEXT: vmovd {{.*#+}} xmm10 = [0,8,0,0,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX1-ONLY-NEXT: vpshufb %xmm10, %xmm3, %xmm11 ; AVX1-ONLY-NEXT: vpshufb %xmm10, %xmm1, %xmm10 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm10[0],xmm11[0],xmm10[1],xmm11[1],xmm10[2],xmm11[2],xmm10[3],xmm11[3] @@ -1311,11 +1311,11 @@ define void @load_i8_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX1-ONLY-NEXT: vpshufb %xmm10, %xmm5, %xmm10 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm10[0],xmm11[0],xmm10[1],xmm11[1],xmm10[2],xmm11[2],xmm10[3],xmm11[3] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm9 = xmm10[0,1,2,3,4,5],xmm9[6,7] -; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm10 = [0,0,1,9,0,0,1,9,0,0,1,9,0,0,1,9] +; AVX1-ONLY-NEXT: vmovd {{.*#+}} xmm10 = [0,0,1,9,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX1-ONLY-NEXT: vpshufb %xmm10, %xmm7, %xmm11 ; AVX1-ONLY-NEXT: vpshufb %xmm10, %xmm6, %xmm10 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm10 = 
xmm10[0],xmm11[0],xmm10[1],xmm11[1],xmm10[2],xmm11[2],xmm10[3],xmm11[3] -; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm11 = [1,9,0,0,1,9,0,0,1,9,0,0,1,9,0,0] +; AVX1-ONLY-NEXT: vmovd {{.*#+}} xmm11 = [1,9,0,0,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm3, %xmm12 ; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm1, %xmm11 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm11[0],xmm12[0],xmm11[1],xmm12[1],xmm11[2],xmm12[2],xmm11[3],xmm12[3] @@ -1331,11 +1331,11 @@ define void @load_i8_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm5, %xmm11 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm11[0],xmm12[0],xmm11[1],xmm12[1],xmm11[2],xmm12[2],xmm11[3],xmm12[3] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm10 = xmm11[0,1,2,3,4,5],xmm10[6,7] -; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm11 = [0,0,2,10,0,0,2,10,0,0,2,10,0,0,2,10] +; AVX1-ONLY-NEXT: vmovd {{.*#+}} xmm11 = [0,0,2,10,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm7, %xmm12 ; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm6, %xmm11 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm11[0],xmm12[0],xmm11[1],xmm12[1],xmm11[2],xmm12[2],xmm11[3],xmm12[3] -; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm12 = [2,10,0,0,2,10,0,0,2,10,0,0,2,10,0,0] +; AVX1-ONLY-NEXT: vmovd {{.*#+}} xmm12 = [2,10,0,0,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm3, %xmm13 ; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm1, %xmm12 ; AVX1-ONLY-NEXT: vmovdqa %xmm1, %xmm9 @@ -1351,11 +1351,11 @@ define void @load_i8_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm5, %xmm12 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm12[0],xmm13[0],xmm12[1],xmm13[1],xmm12[2],xmm13[2],xmm12[3],xmm13[3] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm11 = xmm12[0,1,2,3,4,5],xmm11[6,7] -; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm12 = [0,0,3,11,0,0,3,11,0,0,3,11,0,0,3,11] +; AVX1-ONLY-NEXT: vmovd {{.*#+}} xmm12 = [0,0,3,11,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm7, %xmm13 ; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm6, %xmm12 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm12[0],xmm13[0],xmm12[1],xmm13[1],xmm12[2],xmm13[2],xmm12[3],xmm13[3] -; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm13 = [3,11,0,0,3,11,0,0,3,11,0,0,3,11,0,0] +; AVX1-ONLY-NEXT: vmovd {{.*#+}} xmm13 = [3,11,0,0,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX1-ONLY-NEXT: vpshufb %xmm13, %xmm3, %xmm14 ; AVX1-ONLY-NEXT: vpshufb %xmm13, %xmm1, %xmm13 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm13[0],xmm14[0],xmm13[1],xmm14[1],xmm13[2],xmm14[2],xmm13[3],xmm14[3] @@ -1370,11 +1370,11 @@ define void @load_i8_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX1-ONLY-NEXT: vpshufb %xmm13, %xmm5, %xmm13 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm13[0],xmm14[0],xmm13[1],xmm14[1],xmm13[2],xmm14[2],xmm13[3],xmm14[3] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm12 = xmm13[0,1,2,3,4,5],xmm12[6,7] -; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm13 = [0,0,4,12,0,0,4,12,0,0,4,12,0,0,4,12] +; AVX1-ONLY-NEXT: vmovd {{.*#+}} xmm13 = [0,0,4,12,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX1-ONLY-NEXT: vpshufb %xmm13, %xmm7, %xmm14 ; AVX1-ONLY-NEXT: vpshufb %xmm13, %xmm6, %xmm13 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm13[0],xmm14[0],xmm13[1],xmm14[1],xmm13[2],xmm14[2],xmm13[3],xmm14[3] -; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm14 = [4,12,0,0,4,12,0,0,4,12,0,0,4,12,0,0] +; AVX1-ONLY-NEXT: vmovd {{.*#+}} xmm14 = [4,12,0,0,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm3, %xmm15 ; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm1, %xmm14 ; 
AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm14[0],xmm15[0],xmm14[1],xmm15[1],xmm14[2],xmm15[2],xmm14[3],xmm15[3] @@ -1389,11 +1389,11 @@ define void @load_i8_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm5, %xmm14 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm14[0],xmm15[0],xmm14[1],xmm15[1],xmm14[2],xmm15[2],xmm14[3],xmm15[3] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm13 = xmm14[0,1,2,3,4,5],xmm13[6,7] -; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm14 = [0,0,5,13,0,0,5,13,0,0,5,13,0,0,5,13] +; AVX1-ONLY-NEXT: vmovd {{.*#+}} xmm14 = [0,0,5,13,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm7, %xmm15 ; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm6, %xmm14 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm14[0],xmm15[0],xmm14[1],xmm15[1],xmm14[2],xmm15[2],xmm14[3],xmm15[3] -; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm15 = [5,13,0,0,5,13,0,0,5,13,0,0,5,13,0,0] +; AVX1-ONLY-NEXT: vmovd {{.*#+}} xmm15 = [5,13,0,0,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm3, %xmm0 ; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm1, %xmm15 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm15[0],xmm0[0],xmm15[1],xmm0[1],xmm15[2],xmm0[2],xmm15[3],xmm0[3] @@ -1408,11 +1408,11 @@ define void @load_i8_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm5, %xmm14 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm14[0],xmm15[0],xmm14[1],xmm15[1],xmm14[2],xmm15[2],xmm14[3],xmm15[3] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm14[0,1,2,3,4,5],xmm0[6,7] -; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm14 = [0,0,6,14,0,0,6,14,0,0,6,14,0,0,6,14] +; AVX1-ONLY-NEXT: vmovd {{.*#+}} xmm14 = [0,0,6,14,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm7, %xmm15 ; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm6, %xmm14 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm14[0],xmm15[0],xmm14[1],xmm15[1],xmm14[2],xmm15[2],xmm14[3],xmm15[3] -; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm15 = [6,14,0,0,6,14,0,0,6,14,0,0,6,14,0,0] +; AVX1-ONLY-NEXT: vmovd {{.*#+}} xmm15 = [6,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm3, %xmm0 ; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm9, %xmm15 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm15[0],xmm0[0],xmm15[1],xmm0[1],xmm15[2],xmm0[2],xmm15[3],xmm0[3] @@ -1427,11 +1427,11 @@ define void @load_i8_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm5, %xmm2 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3,4,5],xmm1[6,7] -; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm2 = [0,0,7,15,0,0,7,15,0,0,7,15,0,0,7,15] +; AVX1-ONLY-NEXT: vmovd {{.*#+}} xmm2 = [0,0,7,15,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm7, %xmm4 ; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm6, %xmm2 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3] -; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm4 = [7,15,0,0,7,15,0,0,7,15,0,0,7,15,0,0] +; AVX1-ONLY-NEXT: vmovd {{.*#+}} xmm4 = [7,15,0,0,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm3, %xmm3 ; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm9, %xmm4 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] @@ -2850,7 +2850,7 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX1-ONLY-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; 
AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm6[0],xmm3[0],xmm6[1],xmm3[1],xmm6[2],xmm3[2],xmm6[3],xmm3[3] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm8 = xmm3[0,1,2,3,4,5],xmm0[6,7] -; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm14 = [0,0,0,8,0,0,0,8,0,0,0,8,0,0,0,8] +; AVX1-ONLY-NEXT: vmovd {{.*#+}} xmm14 = [0,0,0,8,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX1-ONLY-NEXT: vmovdqa (%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovdqa 16(%rdi), %xmm1 @@ -2862,7 +2862,7 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm3, %xmm9 ; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm2, %xmm11 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm11[0],xmm9[0],xmm11[1],xmm9[1],xmm11[2],xmm9[2],xmm11[3],xmm9[3] -; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm3 = [0,8,0,0,0,8,0,0,0,8,0,0,0,8,0,0] +; AVX1-ONLY-NEXT: vmovd {{.*#+}} xmm3 = [0,8,0,0,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm1, %xmm11 ; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm0, %xmm13 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm13[0],xmm11[0],xmm13[1],xmm11[1],xmm13[2],xmm11[2],xmm13[3],xmm11[3] @@ -2914,13 +2914,13 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm5, %xmm4 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1,2,3,4,5],xmm1[6,7] -; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm3 = [0,0,1,9,0,0,1,9,0,0,1,9,0,0,1,9] +; AVX1-ONLY-NEXT: vmovd {{.*#+}} xmm3 = [0,0,1,9,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm13, %xmm4 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm9, %xmm5 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] -; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm5 = [1,9,0,0,1,9,0,0,1,9,0,0,1,9,0,0] +; AVX1-ONLY-NEXT: vmovd {{.*#+}} xmm5 = [1,9,0,0,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm11, %xmm0 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload @@ -2965,11 +2965,11 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm8, %xmm4 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1,2,3,4,5],xmm1[6,7] -; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm3 = [0,0,2,10,0,0,2,10,0,0,2,10,0,0,2,10] +; AVX1-ONLY-NEXT: vmovd {{.*#+}} xmm3 = [0,0,2,10,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm13, %xmm4 ; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm9, %xmm5 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] -; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm5 = [2,10,0,0,2,10,0,0,2,10,0,0,2,10,0,0] +; AVX1-ONLY-NEXT: vmovd {{.*#+}} xmm5 = [2,10,0,0,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm11, %xmm6 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm13, %xmm7 @@ -3011,13 +3011,13 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm8, %xmm4 ; 
AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1,2,3,4,5],xmm1[6,7] -; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm3 = [0,0,3,11,0,0,3,11,0,0,3,11,0,0,3,11] +; AVX1-ONLY-NEXT: vmovd {{.*#+}} xmm3 = [0,0,3,11,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm9, %xmm4 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm5, %xmm5 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] -; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm5 = [3,11,0,0,3,11,0,0,3,11,0,0,3,11,0,0] +; AVX1-ONLY-NEXT: vmovd {{.*#+}} xmm5 = [3,11,0,0,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm6, %xmm6 ; AVX1-ONLY-NEXT: vmovdqa %xmm13, %xmm8 @@ -3062,12 +3062,12 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm13, %xmm4 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1,2,3,4,5],xmm1[6,7] -; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm3 = [0,0,4,12,0,0,4,12,0,0,4,12,0,0,4,12] +; AVX1-ONLY-NEXT: vmovd {{.*#+}} xmm3 = [0,0,4,12,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm9, %xmm4 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm5, %xmm5 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] -; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm5 = [4,12,0,0,4,12,0,0,4,12,0,0,4,12,0,0] +; AVX1-ONLY-NEXT: vmovd {{.*#+}} xmm5 = [4,12,0,0,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm6, %xmm6 ; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm8, %xmm7 @@ -3108,13 +3108,13 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm13, %xmm4 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1,2,3,4,5],xmm1[6,7] -; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm3 = [0,0,5,13,0,0,5,13,0,0,5,13,0,0,5,13] +; AVX1-ONLY-NEXT: vmovd {{.*#+}} xmm3 = [0,0,5,13,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm4, %xmm4 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm11, %xmm5 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] -; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm5 = [5,13,0,0,5,13,0,0,5,13,0,0,5,13,0,0] +; AVX1-ONLY-NEXT: vmovd {{.*#+}} xmm5 = [5,13,0,0,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm10, %xmm6 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload @@ -3160,12 +3160,12 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm4, %xmm4 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm3 = 
xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1,2,3,4,5],xmm1[6,7] -; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm3 = [0,0,6,14,0,0,6,14,0,0,6,14,0,0,6,14] +; AVX1-ONLY-NEXT: vmovd {{.*#+}} xmm3 = [0,0,6,14,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm8, %xmm4 ; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm11, %xmm5 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] -; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm5 = [6,14,0,0,6,14,0,0,6,14,0,0,6,14,0,0] +; AVX1-ONLY-NEXT: vmovd {{.*#+}} xmm5 = [6,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm10, %xmm6 ; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm14, %xmm7 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3] @@ -3208,12 +3208,12 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm4, %xmm4 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1,2,3,4,5],xmm1[6,7] -; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm3 = [0,0,7,15,0,0,7,15,0,0,7,15,0,0,7,15] +; AVX1-ONLY-NEXT: vmovd {{.*#+}} xmm3 = [0,0,7,15,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm8, %xmm4 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm5, %xmm5 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] -; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm5 = [7,15,0,0,7,15,0,0,7,15,0,0,7,15,0,0] +; AVX1-ONLY-NEXT: vmovd {{.*#+}} xmm5 = [7,15,0,0,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm6, %xmm6 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload @@ -4529,14 +4529,14 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm13[0],xmm12[0],xmm13[1],xmm12[1],xmm13[2],xmm12[2],xmm13[3],xmm12[3] ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm12[0,1,2,3,4,5,6],ymm4[7] -; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm12 = [0,0,1,9,0,0,1,9,0,0,1,9,0,0,1,9] +; AVX512F-SLOW-NEXT: vmovd {{.*#+}} xmm12 = [0,0,1,9,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX512F-SLOW-NEXT: vpshufb %xmm12, %xmm6, %xmm13 ; AVX512F-SLOW-NEXT: vpshufb %xmm12, %xmm0, %xmm12 ; AVX512F-SLOW-NEXT: vmovdqa %xmm0, %xmm10 ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm15 = xmm12[0],xmm13[0],xmm12[1],xmm13[1],xmm12[2],xmm13[2],xmm12[3],xmm13[3] ; AVX512F-SLOW-NEXT: vmovdqa 128(%rdi), %xmm8 ; AVX512F-SLOW-NEXT: vmovdqa 144(%rdi), %xmm9 -; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm1 = [1,9,0,0,1,9,0,0,1,9,0,0,1,9,0,0] +; AVX512F-SLOW-NEXT: vmovd {{.*#+}} xmm1 = [1,9,0,0,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm9, %xmm0 ; AVX512F-SLOW-NEXT: vmovdqa %xmm9, %xmm13 ; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm8, %xmm1 @@ -4577,12 +4577,12 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3,4,5,6],ymm1[7] -; 
AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm4 = [0,0,2,10,0,0,2,10,0,0,2,10,0,0,2,10] +; AVX512F-SLOW-NEXT: vmovd {{.*#+}} xmm4 = [0,0,2,10,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX512F-SLOW-NEXT: vpshufb %xmm4, %xmm6, %xmm14 ; AVX512F-SLOW-NEXT: vmovdqa %xmm6, %xmm12 ; AVX512F-SLOW-NEXT: vpshufb %xmm4, %xmm10, %xmm4 ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm14[0],xmm4[1],xmm14[1],xmm4[2],xmm14[2],xmm4[3],xmm14[3] -; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm14 = [2,10,0,0,2,10,0,0,2,10,0,0,2,10,0,0] +; AVX512F-SLOW-NEXT: vmovd {{.*#+}} xmm14 = [2,10,0,0,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX512F-SLOW-NEXT: vpshufb %xmm14, %xmm13, %xmm15 ; AVX512F-SLOW-NEXT: vmovdqa64 %xmm26, %xmm6 ; AVX512F-SLOW-NEXT: vpshufb %xmm14, %xmm6, %xmm14 @@ -4619,12 +4619,12 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3,4,5,6],ymm1[7] -; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm4 = [0,0,3,11,0,0,3,11,0,0,3,11,0,0,3,11] +; AVX512F-SLOW-NEXT: vmovd {{.*#+}} xmm4 = [0,0,3,11,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX512F-SLOW-NEXT: vpshufb %xmm4, %xmm12, %xmm14 ; AVX512F-SLOW-NEXT: vmovdqa64 %xmm12, %xmm27 ; AVX512F-SLOW-NEXT: vpshufb %xmm4, %xmm10, %xmm4 ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm14[0],xmm4[1],xmm14[1],xmm4[2],xmm14[2],xmm4[3],xmm14[3] -; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm14 = [3,11,0,0,3,11,0,0,3,11,0,0,3,11,0,0] +; AVX512F-SLOW-NEXT: vmovd {{.*#+}} xmm14 = [3,11,0,0,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX512F-SLOW-NEXT: vpshufb %xmm14, %xmm13, %xmm15 ; AVX512F-SLOW-NEXT: vmovdqa64 %xmm26, %xmm12 ; AVX512F-SLOW-NEXT: vpshufb %xmm14, %xmm12, %xmm14 @@ -4662,13 +4662,13 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3,4,5,6],ymm1[7] -; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm4 = [0,0,4,12,0,0,4,12,0,0,4,12,0,0,4,12] +; AVX512F-SLOW-NEXT: vmovd {{.*#+}} xmm4 = [0,0,4,12,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX512F-SLOW-NEXT: vmovdqa64 %xmm27, %xmm0 ; AVX512F-SLOW-NEXT: vpshufb %xmm4, %xmm0, %xmm14 ; AVX512F-SLOW-NEXT: vpshufb %xmm4, %xmm10, %xmm4 ; AVX512F-SLOW-NEXT: vmovdqa64 %xmm10, %xmm22 ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm14[0],xmm4[1],xmm14[1],xmm4[2],xmm14[2],xmm4[3],xmm14[3] -; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm14 = [4,12,0,0,4,12,0,0,4,12,0,0,4,12,0,0] +; AVX512F-SLOW-NEXT: vmovd {{.*#+}} xmm14 = [4,12,0,0,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX512F-SLOW-NEXT: vpshufb %xmm14, %xmm13, %xmm0 ; AVX512F-SLOW-NEXT: vmovdqa64 %xmm13, %xmm25 ; AVX512F-SLOW-NEXT: vpshufb %xmm14, %xmm12, %xmm14 @@ -4705,13 +4705,13 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5,6],ymm1[7] -; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm3 = [0,0,5,13,0,0,5,13,0,0,5,13,0,0,5,13] +; AVX512F-SLOW-NEXT: vmovd {{.*#+}} xmm3 = [0,0,5,13,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX512F-SLOW-NEXT: vmovdqa64 %xmm27, %xmm6 ; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm6, %xmm4 ; AVX512F-SLOW-NEXT: vmovdqa64 %xmm22, %xmm7 ; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm7, %xmm3 ; AVX512F-SLOW-NEXT: vpunpcklwd 
{{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] -; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm4 = [5,13,0,0,5,13,0,0,5,13,0,0,5,13,0,0] +; AVX512F-SLOW-NEXT: vmovd {{.*#+}} xmm4 = [5,13,0,0,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX512F-SLOW-NEXT: vmovdqa64 %xmm25, %xmm9 ; AVX512F-SLOW-NEXT: vpshufb %xmm4, %xmm9, %xmm0 ; AVX512F-SLOW-NEXT: vpshufb %xmm4, %xmm12, %xmm4 @@ -4749,11 +4749,11 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5,6],ymm1[7] -; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm3 = [0,0,6,14,0,0,6,14,0,0,6,14,0,0,6,14] +; AVX512F-SLOW-NEXT: vmovd {{.*#+}} xmm3 = [0,0,6,14,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm6, %xmm5 ; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm7, %xmm3 ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3] -; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm5 = [6,14,0,0,6,14,0,0,6,14,0,0,6,14,0,0] +; AVX512F-SLOW-NEXT: vmovd {{.*#+}} xmm5 = [6,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX512F-SLOW-NEXT: vpshufb %xmm5, %xmm9, %xmm0 ; AVX512F-SLOW-NEXT: vmovdqa %xmm9, %xmm14 ; AVX512F-SLOW-NEXT: vpshufb %xmm5, %xmm12, %xmm5 @@ -4789,11 +4789,11 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3,4,5,6],ymm2[7] -; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm4 = [0,0,7,15,0,0,7,15,0,0,7,15,0,0,7,15] +; AVX512F-SLOW-NEXT: vmovd {{.*#+}} xmm4 = [0,0,7,15,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX512F-SLOW-NEXT: vpshufb %xmm4, %xmm6, %xmm5 ; AVX512F-SLOW-NEXT: vpshufb %xmm4, %xmm7, %xmm4 ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3] -; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm5 = [7,15,0,0,7,15,0,0,7,15,0,0,7,15,0,0] +; AVX512F-SLOW-NEXT: vmovd {{.*#+}} xmm5 = [7,15,0,0,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX512F-SLOW-NEXT: vpshufb %xmm5, %xmm14, %xmm7 ; AVX512F-SLOW-NEXT: vpshufb %xmm5, %xmm12, %xmm5 ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm7[0],xmm5[1],xmm7[1],xmm5[2],xmm7[2],xmm5[3],xmm7[3] @@ -7495,7 +7495,7 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm4, %xmm4 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm2[0,1,2,3,4,5],xmm0[6,7] -; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm2 = [0,0,0,8,0,0,0,8,0,0,0,8,0,0,0,8] +; AVX1-ONLY-NEXT: vmovd {{.*#+}} xmm2 = [0,0,0,8,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX1-ONLY-NEXT: vmovdqa 304(%rdi), %xmm12 ; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm12, %xmm0 ; AVX1-ONLY-NEXT: vmovdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -7503,7 +7503,7 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX1-ONLY-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm5, %xmm5 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1],xmm5[2],xmm0[2],xmm5[3],xmm0[3] -; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm9 = [0,8,0,0,0,8,0,0,0,8,0,0,0,8,0,0] +; AVX1-ONLY-NEXT: vmovd {{.*#+}} xmm9 = 
[0,8,0,0,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX1-ONLY-NEXT: vmovdqa 272(%rdi), %xmm11 ; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm11, %xmm6 ; AVX1-ONLY-NEXT: vmovdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -7625,12 +7625,12 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm13, %xmm4 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm3[0,1,2,3,4,5],xmm0[6,7] -; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm2 = [0,0,1,9,0,0,1,9,0,0,1,9,0,0,1,9] +; AVX1-ONLY-NEXT: vmovd {{.*#+}} xmm2 = [0,0,1,9,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm12, %xmm3 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm7, %xmm5 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3] -; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm3 = [1,9,0,0,1,9,0,0,1,9,0,0,1,9,0,0] +; AVX1-ONLY-NEXT: vmovd {{.*#+}} xmm3 = [1,9,0,0,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm11, %xmm0 ; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm10, %xmm15 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm15[0],xmm0[0],xmm15[1],xmm0[1],xmm15[2],xmm0[2],xmm15[3],xmm0[3] @@ -7724,12 +7724,12 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm13, %xmm4 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm3[0,1,2,3,4,5],xmm2[6,7] -; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm2 = [0,0,2,10,0,0,2,10,0,0,2,10,0,0,2,10] +; AVX1-ONLY-NEXT: vmovd {{.*#+}} xmm2 = [0,0,2,10,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm3, %xmm3 ; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm7, %xmm5 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3] -; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm3 = [2,10,0,0,2,10,0,0,2,10,0,0,2,10,0,0] +; AVX1-ONLY-NEXT: vmovd {{.*#+}} xmm3 = [2,10,0,0,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm7, %xmm6 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload @@ -7822,13 +7822,13 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm4, %xmm4 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm3[0,1,2,3,4,5],xmm2[6,7] -; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm2 = [0,0,3,11,0,0,3,11,0,0,3,11,0,0,3,11] +; AVX1-ONLY-NEXT: vmovd {{.*#+}} xmm2 = [0,0,3,11,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm3, %xmm3 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm5, %xmm5 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3] -; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm3 = [3,11,0,0,3,11,0,0,3,11,0,0,3,11,0,0] +; AVX1-ONLY-NEXT: vmovd {{.*#+}} xmm3 = [3,11,0,0,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm7, %xmm6 ; AVX1-ONLY-NEXT: vmovdqa 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm7, %xmm14 @@ -7919,13 +7919,13 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm4, %xmm4 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm3[0,1,2,3,4,5],xmm2[6,7] -; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm2 = [0,0,4,12,0,0,4,12,0,0,4,12,0,0,4,12] +; AVX1-ONLY-NEXT: vmovd {{.*#+}} xmm2 = [0,0,4,12,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm7, %xmm3 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm5, %xmm5 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3] -; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm3 = [4,12,0,0,4,12,0,0,4,12,0,0,4,12,0,0] +; AVX1-ONLY-NEXT: vmovd {{.*#+}} xmm3 = [4,12,0,0,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm6, %xmm6 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload @@ -8019,12 +8019,12 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm4, %xmm4 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm3[0,1,2,3,4,5],xmm2[6,7] -; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm2 = [0,0,5,13,0,0,5,13,0,0,5,13,0,0,5,13] +; AVX1-ONLY-NEXT: vmovd {{.*#+}} xmm2 = [0,0,5,13,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm7, %xmm3 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm5, %xmm5 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3] -; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm3 = [5,13,0,0,5,13,0,0,5,13,0,0,5,13,0,0] +; AVX1-ONLY-NEXT: vmovd {{.*#+}} xmm3 = [5,13,0,0,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm6, %xmm6 ; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm8, %xmm14 @@ -8116,13 +8116,13 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm4, %xmm4 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm3[0,1,2,3,4,5],xmm2[6,7] -; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm2 = [0,0,6,14,0,0,6,14,0,0,6,14,0,0,6,14] +; AVX1-ONLY-NEXT: vmovd {{.*#+}} xmm2 = [0,0,6,14,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm3, %xmm3 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm5, %xmm5 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3] -; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm3 = [6,14,0,0,6,14,0,0,6,14,0,0,6,14,0,0] +; AVX1-ONLY-NEXT: vmovd {{.*#+}} xmm3 = [6,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm6, %xmm6 ; AVX1-ONLY-NEXT: vmovdqa 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload @@ -8212,13 +8212,13 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm4, %xmm4 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm1[0,1,2,3,4,5],xmm0[6,7] -; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm1 = [0,0,7,15,0,0,7,15,0,0,7,15,0,0,7,15] +; AVX1-ONLY-NEXT: vmovd {{.*#+}} xmm1 = [0,0,7,15,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm0, %xmm0 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm6, %xmm6 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm0[0],xmm6[1],xmm0[1],xmm6[2],xmm0[2],xmm6[3],xmm0[3] -; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm0 = [7,15,0,0,7,15,0,0,7,15,0,0,7,15,0,0] +; AVX1-ONLY-NEXT: vmovd {{.*#+}} xmm0 = [7,15,0,0,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm11, %xmm14 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload @@ -10910,14 +10910,14 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm3[0,1,2,3,4,5,6],ymm2[7] ; AVX512F-SLOW-NEXT: vmovdqa 416(%rdi), %xmm0 ; AVX512F-SLOW-NEXT: vmovdqa 432(%rdi), %xmm11 -; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm3 = [0,0,1,9,0,0,1,9,0,0,1,9,0,0,1,9] +; AVX512F-SLOW-NEXT: vmovd {{.*#+}} xmm3 = [0,0,1,9,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm11, %xmm2 ; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm0, %xmm5 ; AVX512F-SLOW-NEXT: vmovdqa64 %xmm0, %xmm22 ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm2[0],xmm5[1],xmm2[1],xmm5[2],xmm2[2],xmm5[3],xmm2[3] ; AVX512F-SLOW-NEXT: vmovdqa 384(%rdi), %xmm0 ; AVX512F-SLOW-NEXT: vmovdqa 400(%rdi), %xmm12 -; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm2 = [1,9,0,0,1,9,0,0,1,9,0,0,1,9,0,0] +; AVX512F-SLOW-NEXT: vmovd {{.*#+}} xmm2 = [1,9,0,0,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm12, %xmm6 ; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm0, %xmm7 ; AVX512F-SLOW-NEXT: vmovdqa64 %xmm0, %xmm31 @@ -11008,13 +11008,13 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm3[0,1,2,3,4,5,6],ymm2[7] -; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm3 = [0,0,2,10,0,0,2,10,0,0,2,10,0,0,2,10] +; AVX512F-SLOW-NEXT: vmovd {{.*#+}} xmm3 = [0,0,2,10,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm11, %xmm2 ; AVX512F-SLOW-NEXT: vmovdqa64 %xmm11, %xmm24 ; AVX512F-SLOW-NEXT: vmovdqa64 %xmm22, %xmm11 ; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm11, %xmm9 ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm9[0],xmm2[0],xmm9[1],xmm2[1],xmm9[2],xmm2[2],xmm9[3],xmm2[3] -; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm2 = [2,10,0,0,2,10,0,0,2,10,0,0,2,10,0,0] +; AVX512F-SLOW-NEXT: vmovd {{.*#+}} xmm2 = [2,10,0,0,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm12, %xmm0 ; AVX512F-SLOW-NEXT: vmovdqa64 %xmm12, %xmm30 ; AVX512F-SLOW-NEXT: vmovdqa64 %xmm31, %xmm5 @@ -11100,13 +11100,13 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512F-SLOW-NEXT: 
vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm3[0,1,2,3,4,5,6],ymm2[7] -; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm2 = [0,0,3,11,0,0,3,11,0,0,3,11,0,0,3,11] +; AVX512F-SLOW-NEXT: vmovd {{.*#+}} xmm2 = [0,0,3,11,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX512F-SLOW-NEXT: vmovdqa64 %xmm24, %xmm3 ; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm3, %xmm3 ; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm11, %xmm8 ; AVX512F-SLOW-NEXT: vmovdqa64 %xmm22, %xmm28 ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm8[0],xmm3[0],xmm8[1],xmm3[1],xmm8[2],xmm3[2],xmm8[3],xmm3[3] -; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm3 = [3,11,0,0,3,11,0,0,3,11,0,0,3,11,0,0] +; AVX512F-SLOW-NEXT: vmovd {{.*#+}} xmm3 = [3,11,0,0,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX512F-SLOW-NEXT: vmovdqa64 %xmm30, %xmm9 ; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm9, %xmm9 ; AVX512F-SLOW-NEXT: vmovdqa64 %xmm31, %xmm11 @@ -11189,13 +11189,13 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm3[0,1,2,3,4,5,6],ymm2[7] -; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm2 = [0,0,4,12,0,0,4,12,0,0,4,12,0,0,4,12] +; AVX512F-SLOW-NEXT: vmovd {{.*#+}} xmm2 = [0,0,4,12,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX512F-SLOW-NEXT: vmovdqa64 %xmm24, %xmm11 ; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm11, %xmm3 ; AVX512F-SLOW-NEXT: vmovdqa64 %xmm28, %xmm7 ; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm7, %xmm8 ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm8[0],xmm3[0],xmm8[1],xmm3[1],xmm8[2],xmm3[2],xmm8[3],xmm3[3] -; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm3 = [4,12,0,0,4,12,0,0,4,12,0,0,4,12,0,0] +; AVX512F-SLOW-NEXT: vmovd {{.*#+}} xmm3 = [4,12,0,0,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX512F-SLOW-NEXT: vmovdqa64 %xmm30, %xmm7 ; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm7, %xmm9 ; AVX512F-SLOW-NEXT: vmovdqa64 %xmm31, %xmm12 @@ -11278,13 +11278,13 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm3[0,1,2,3,4,5,6],ymm2[7] -; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm2 = [0,0,5,13,0,0,5,13,0,0,5,13,0,0,5,13] +; AVX512F-SLOW-NEXT: vmovd {{.*#+}} xmm2 = [0,0,5,13,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm11, %xmm3 ; AVX512F-SLOW-NEXT: vmovdqa64 %xmm24, %xmm27 ; AVX512F-SLOW-NEXT: vmovdqa64 %xmm28, %xmm5 ; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm5, %xmm8 ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm8[0],xmm3[0],xmm8[1],xmm3[1],xmm8[2],xmm3[2],xmm8[3],xmm3[3] -; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm3 = [5,13,0,0,5,13,0,0,5,13,0,0,5,13,0,0] +; AVX512F-SLOW-NEXT: vmovd {{.*#+}} xmm3 = [5,13,0,0,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX512F-SLOW-NEXT: vmovdqa64 %xmm30, %xmm10 ; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm10, %xmm9 ; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm12, %xmm15 @@ -11372,13 +11372,13 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm3[0,1,2,3,4,5,6],ymm2[7] -; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm2 = [0,0,6,14,0,0,6,14,0,0,6,14,0,0,6,14] +; AVX512F-SLOW-NEXT: vmovd {{.*#+}} xmm2 = 
[0,0,6,14,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX512F-SLOW-NEXT: vmovdqa64 %xmm27, %xmm3 ; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm3, %xmm3 ; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm5, %xmm8 ; AVX512F-SLOW-NEXT: vmovdqa64 %xmm5, %xmm21 ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm8[0],xmm3[0],xmm8[1],xmm3[1],xmm8[2],xmm3[2],xmm8[3],xmm3[3] -; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm3 = [6,14,0,0,6,14,0,0,6,14,0,0,6,14,0,0] +; AVX512F-SLOW-NEXT: vmovd {{.*#+}} xmm3 = [6,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm10, %xmm9 ; AVX512F-SLOW-NEXT: vmovdqa64 %xmm10, %xmm18 ; AVX512F-SLOW-NEXT: vmovdqa64 %xmm31, %xmm5 @@ -11462,13 +11462,13 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm3[0,1,2,3,4,5,6],ymm2[7] -; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm2 = [0,0,7,15,0,0,7,15,0,0,7,15,0,0,7,15] +; AVX512F-SLOW-NEXT: vmovd {{.*#+}} xmm2 = [0,0,7,15,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX512F-SLOW-NEXT: vmovdqa64 %xmm27, %xmm3 ; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm3, %xmm3 ; AVX512F-SLOW-NEXT: vmovdqa64 %xmm21, %xmm8 ; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm8, %xmm8 ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm8[0],xmm3[0],xmm8[1],xmm3[1],xmm8[2],xmm3[2],xmm8[3],xmm3[3] -; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm3 = [7,15,0,0,7,15,0,0,7,15,0,0,7,15,0,0] +; AVX512F-SLOW-NEXT: vmovd {{.*#+}} xmm3 = [7,15,0,0,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX512F-SLOW-NEXT: vmovdqa64 %xmm18, %xmm9 ; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm9, %xmm9 ; AVX512F-SLOW-NEXT: vmovdqa64 %xmm31, %xmm11 diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-5.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-5.ll index c4bf0c3630e835..9ceb3a250bc418 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-5.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-5.ll @@ -342,7 +342,7 @@ define void @store_i8_stride5_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp ; AVX2-SLOW-NEXT: shrq $48, %rax ; AVX2-SLOW-NEXT: vmovd %eax, %xmm1 ; AVX2-SLOW-NEXT: vpbroadcastw %xmm1, %xmm1 -; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm3 = [255,255,0,255,255,255,255,0,255,255,0,255,255,255,255,0] +; AVX2-SLOW-NEXT: vmovq {{.*#+}} xmm3 = [255,255,0,255,255,255,255,0,0,0,0,0,0,0,0,0] ; AVX2-SLOW-NEXT: vpblendvb %xmm3, %xmm0, %xmm1, %xmm0 ; AVX2-SLOW-NEXT: vmovq %xmm0, 32(%r9) ; AVX2-SLOW-NEXT: vmovdqa %ymm2, (%r9) @@ -374,7 +374,7 @@ define void @store_i8_stride5_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp ; AVX2-FAST-NEXT: shrq $48, %rax ; AVX2-FAST-NEXT: vmovd %eax, %xmm1 ; AVX2-FAST-NEXT: vpbroadcastw %xmm1, %xmm1 -; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} xmm3 = [255,255,0,255,255,255,255,0,255,255,0,255,255,255,255,0] +; AVX2-FAST-NEXT: vmovq {{.*#+}} xmm3 = [255,255,0,255,255,255,255,0,0,0,0,0,0,0,0,0] ; AVX2-FAST-NEXT: vpblendvb %xmm3, %xmm0, %xmm1, %xmm0 ; AVX2-FAST-NEXT: vmovq %xmm0, 32(%r9) ; AVX2-FAST-NEXT: vmovdqa %ymm2, (%r9) @@ -406,7 +406,7 @@ define void @store_i8_stride5_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp ; AVX2-FAST-PERLANE-NEXT: shrq $48, %rax ; AVX2-FAST-PERLANE-NEXT: vmovd %eax, %xmm1 ; AVX2-FAST-PERLANE-NEXT: vpbroadcastw %xmm1, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vpbroadcastq {{.*#+}} xmm3 = [255,255,0,255,255,255,255,0,255,255,0,255,255,255,255,0] +; AVX2-FAST-PERLANE-NEXT: vmovq {{.*#+}} xmm3 = 
[255,255,0,255,255,255,255,0,0,0,0,0,0,0,0,0] ; AVX2-FAST-PERLANE-NEXT: vpblendvb %xmm3, %xmm0, %xmm1, %xmm0 ; AVX2-FAST-PERLANE-NEXT: vmovq %xmm0, 32(%r9) ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm2, (%r9) diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-7.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-7.ll index 482da013d741be..3aab872fa4a91d 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-7.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-7.ll @@ -475,8 +475,7 @@ define void @store_i8_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm6 = zero,xmm0[u,u,u,u,7,15],zero,xmm0[u,u,u,u,u,u,u,u] ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm7 = xmm3[6,u,u,u,u],zero,zero,xmm3[7,u,u,u,u,u,u,u,u] ; AVX1-ONLY-NEXT: vpor %xmm6, %xmm7, %xmm6 -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm7 = [0,255,255,255,255,0,0,0,0,255,255,255,255,0,0,0] -; AVX1-ONLY-NEXT: # xmm7 = mem[0,0] +; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm7 = [0,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0] ; AVX1-ONLY-NEXT: vpblendvb %xmm7, %xmm5, %xmm6, %xmm5 ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,xmm2[0,8,u,u,u],zero,zero,xmm2[1,9,u,u,u],zero,zero ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm7 = xmm1[0,8],zero,zero,xmm1[u,u,u,1,9],zero,zero,xmm1[u,u,u,2,10] diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-8.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-8.ll index e60034f7d6b75f..dcafb526790638 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-8.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-8.ll @@ -187,13 +187,11 @@ define void @store_i8_stride8_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp ; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],mem[0],xmm2[1],mem[1] ; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm3[0],mem[0],xmm3[1],mem[1] ; AVX1-ONLY-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0] -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm2 = [2,6,10,14,3,7,11,15,2,6,10,14,3,7,11,15] -; AVX1-ONLY-NEXT: # xmm2 = mem[0,0] +; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm2 = [2,6,10,14,3,7,11,15,0,0,0,0,0,0,0,0] ; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm1, %xmm3 ; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm0, %xmm2 ; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm3 = [0,4,8,12,1,5,9,13,0,4,8,12,1,5,9,13] -; AVX1-ONLY-NEXT: # xmm3 = mem[0,0] +; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm3 = [0,4,8,12,1,5,9,13,0,0,0,0,0,0,0,0] ; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm1, %xmm1 ; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm0, %xmm0 ; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] @@ -361,47 +359,39 @@ define void @store_i8_stride8_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp ; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero ; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm4 = mem[0],zero ; AVX1-ONLY-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm4[0],xmm3[0] -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm4 = [0,0,2,10,0,0,3,11,0,0,2,10,0,0,3,11] -; AVX1-ONLY-NEXT: # xmm4 = mem[0,0] +; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm4 = [0,0,2,10,0,0,3,11,0,0,0,0,0,0,0,0] ; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm3, %xmm5 ; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm2, %xmm4 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3] -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm5 = [2,10,0,0,3,11,0,0,2,10,0,0,3,11,0,0] -; AVX1-ONLY-NEXT: # xmm5 = mem[0,0] +; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm5 = 
[2,10,0,0,3,11,0,0,0,0,0,0,0,0,0,0] ; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm1, %xmm6 ; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm0, %xmm5 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3],xmm5[4,5],xmm4[6,7] -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm5 = [0,0,0,8,0,0,1,9,0,0,0,8,0,0,1,9] -; AVX1-ONLY-NEXT: # xmm5 = mem[0,0] +; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm5 = [0,0,0,8,0,0,1,9,0,0,0,0,0,0,0,0] ; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm3, %xmm6 ; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm2, %xmm5 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3] -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm6 = [0,8,0,0,1,9,0,0,0,8,0,0,1,9,0,0] -; AVX1-ONLY-NEXT: # xmm6 = mem[0,0] +; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm6 = [0,8,0,0,1,9,0,0,0,0,0,0,0,0,0,0] ; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm1, %xmm7 ; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm0, %xmm6 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0,1],xmm5[2,3],xmm6[4,5],xmm5[6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm5, %ymm4 -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm5 = [0,0,6,14,0,0,7,15,0,0,6,14,0,0,7,15] -; AVX1-ONLY-NEXT: # xmm5 = mem[0,0] +; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm5 = [0,0,6,14,0,0,7,15,0,0,0,0,0,0,0,0] ; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm3, %xmm6 ; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm2, %xmm5 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3] -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm6 = [6,14,0,0,7,15,0,0,6,14,0,0,7,15,0,0] -; AVX1-ONLY-NEXT: # xmm6 = mem[0,0] +; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm6 = [6,14,0,0,7,15,0,0,0,0,0,0,0,0,0,0] ; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm1, %xmm7 ; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm0, %xmm6 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0,1],xmm5[2,3],xmm6[4,5],xmm5[6,7] -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm6 = [0,0,4,12,0,0,5,13,0,0,4,12,0,0,5,13] -; AVX1-ONLY-NEXT: # xmm6 = mem[0,0] +; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm6 = [0,0,4,12,0,0,5,13,0,0,0,0,0,0,0,0] ; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm3, %xmm3 ; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm2, %xmm2 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm3 = [4,12,0,0,5,13,0,0,4,12,0,0,5,13,0,0] -; AVX1-ONLY-NEXT: # xmm3 = mem[0,0] +; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm3 = [4,12,0,0,5,13,0,0,0,0,0,0,0,0,0,0] ; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm1, %xmm1 ; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm0, %xmm0 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] diff --git a/llvm/test/CodeGen/X86/vector-lzcnt-128.ll b/llvm/test/CodeGen/X86/vector-lzcnt-128.ll index 122b478577fbfe..5d02bb8b05f181 100644 --- a/llvm/test/CodeGen/X86/vector-lzcnt-128.ll +++ b/llvm/test/CodeGen/X86/vector-lzcnt-128.ll @@ -96,7 +96,7 @@ define <2 x i64> @testv2i64(<2 x i64> %in) nounwind { ; ; SSSE3-LABEL: testv2i64: ; SSSE3: # %bb.0: -; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; SSSE3-NEXT: movq {{.*#+}} xmm2 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] ; SSSE3-NEXT: movdqa %xmm2, %xmm3 ; SSSE3-NEXT: pshufb %xmm0, %xmm3 ; SSSE3-NEXT: movdqa %xmm0, %xmm1 @@ -128,7 +128,7 @@ define <2 x i64> 
@testv2i64(<2 x i64> %in) nounwind { ; ; SSE41-LABEL: testv2i64: ; SSE41: # %bb.0: -; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; SSE41-NEXT: movq {{.*#+}} xmm2 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] ; SSE41-NEXT: movdqa %xmm2, %xmm3 ; SSE41-NEXT: pshufb %xmm0, %xmm3 ; SSE41-NEXT: movdqa %xmm0, %xmm1 @@ -160,7 +160,7 @@ define <2 x i64> @testv2i64(<2 x i64> %in) nounwind { ; ; AVX1OR2-LABEL: testv2i64: ; AVX1OR2: # %bb.0: -; AVX1OR2-NEXT: vmovdqa {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; AVX1OR2-NEXT: vmovq {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] ; AVX1OR2-NEXT: vpshufb %xmm0, %xmm1, %xmm2 ; AVX1OR2-NEXT: vpsrlw $4, %xmm0, %xmm3 ; AVX1OR2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm3 @@ -188,7 +188,7 @@ define <2 x i64> @testv2i64(<2 x i64> %in) nounwind { ; ; AVX512VL-LABEL: testv2i64: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; AVX512VL-NEXT: vmovq {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] ; AVX512VL-NEXT: vpshufb %xmm0, %xmm1, %xmm2 ; AVX512VL-NEXT: vpsrlw $4, %xmm0, %xmm3 ; AVX512VL-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm3, %xmm3 @@ -216,7 +216,7 @@ define <2 x i64> @testv2i64(<2 x i64> %in) nounwind { ; ; AVX512VLBWDQ-LABEL: testv2i64: ; AVX512VLBWDQ: # %bb.0: -; AVX512VLBWDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; AVX512VLBWDQ-NEXT: vmovq {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] ; AVX512VLBWDQ-NEXT: vpshufb %xmm0, %xmm1, %xmm2 ; AVX512VLBWDQ-NEXT: vpsrlw $4, %xmm0, %xmm3 ; AVX512VLBWDQ-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm3, %xmm3 @@ -257,7 +257,7 @@ define <2 x i64> @testv2i64(<2 x i64> %in) nounwind { ; ; X86-SSE-LABEL: testv2i64: ; X86-SSE: # %bb.0: -; X86-SSE-NEXT: movdqa {{.*#+}} xmm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; X86-SSE-NEXT: movq {{.*#+}} xmm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] ; X86-SSE-NEXT: movdqa %xmm3, %xmm4 ; X86-SSE-NEXT: pshufb %xmm0, %xmm4 ; X86-SSE-NEXT: movdqa %xmm0, %xmm1 @@ -374,7 +374,7 @@ define <2 x i64> @testv2i64u(<2 x i64> %in) nounwind { ; ; SSSE3-LABEL: testv2i64u: ; SSSE3: # %bb.0: -; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; SSSE3-NEXT: movq {{.*#+}} xmm2 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] ; SSSE3-NEXT: movdqa %xmm2, %xmm3 ; SSSE3-NEXT: pshufb %xmm0, %xmm3 ; SSSE3-NEXT: movdqa %xmm0, %xmm1 @@ -406,7 +406,7 @@ define <2 x i64> @testv2i64u(<2 x i64> %in) nounwind { ; ; SSE41-LABEL: testv2i64u: ; SSE41: # %bb.0: -; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; SSE41-NEXT: movq {{.*#+}} xmm2 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] ; SSE41-NEXT: movdqa %xmm2, %xmm3 ; SSE41-NEXT: pshufb %xmm0, %xmm3 ; SSE41-NEXT: movdqa %xmm0, %xmm1 @@ -438,7 +438,7 @@ define <2 x i64> @testv2i64u(<2 x i64> %in) nounwind { ; ; AVX1OR2-LABEL: testv2i64u: ; AVX1OR2: # %bb.0: -; AVX1OR2-NEXT: vmovdqa {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; AVX1OR2-NEXT: vmovq {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] ; AVX1OR2-NEXT: vpshufb %xmm0, %xmm1, %xmm2 ; AVX1OR2-NEXT: vpsrlw $4, %xmm0, %xmm3 ; AVX1OR2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm3 @@ -466,7 +466,7 @@ define <2 x i64> @testv2i64u(<2 x i64> %in) nounwind { ; ; AVX512VL-LABEL: testv2i64u: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; AVX512VL-NEXT: vmovq {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] ; AVX512VL-NEXT: vpshufb %xmm0, %xmm1, %xmm2 ; AVX512VL-NEXT: vpsrlw $4, %xmm0, %xmm3 ; 
AVX512VL-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm3, %xmm3 @@ -494,7 +494,7 @@ define <2 x i64> @testv2i64u(<2 x i64> %in) nounwind { ; ; AVX512VLBWDQ-LABEL: testv2i64u: ; AVX512VLBWDQ: # %bb.0: -; AVX512VLBWDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; AVX512VLBWDQ-NEXT: vmovq {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] ; AVX512VLBWDQ-NEXT: vpshufb %xmm0, %xmm1, %xmm2 ; AVX512VLBWDQ-NEXT: vpsrlw $4, %xmm0, %xmm3 ; AVX512VLBWDQ-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm3, %xmm3 @@ -535,7 +535,7 @@ define <2 x i64> @testv2i64u(<2 x i64> %in) nounwind { ; ; X86-SSE-LABEL: testv2i64u: ; X86-SSE: # %bb.0: -; X86-SSE-NEXT: movdqa {{.*#+}} xmm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; X86-SSE-NEXT: movq {{.*#+}} xmm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] ; X86-SSE-NEXT: movdqa %xmm3, %xmm4 ; X86-SSE-NEXT: pshufb %xmm0, %xmm4 ; X86-SSE-NEXT: movdqa %xmm0, %xmm1 @@ -656,7 +656,7 @@ define <4 x i32> @testv4i32(<4 x i32> %in) nounwind { ; ; SSSE3-LABEL: testv4i32: ; SSSE3: # %bb.0: -; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; SSSE3-NEXT: movq {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] ; SSSE3-NEXT: movdqa %xmm1, %xmm2 ; SSSE3-NEXT: pshufb %xmm0, %xmm2 ; SSSE3-NEXT: movdqa %xmm0, %xmm3 @@ -682,7 +682,7 @@ define <4 x i32> @testv4i32(<4 x i32> %in) nounwind { ; ; SSE41-LABEL: testv4i32: ; SSE41: # %bb.0: -; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; SSE41-NEXT: movq {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] ; SSE41-NEXT: movdqa %xmm1, %xmm2 ; SSE41-NEXT: pshufb %xmm0, %xmm2 ; SSE41-NEXT: movdqa %xmm0, %xmm3 @@ -708,7 +708,7 @@ define <4 x i32> @testv4i32(<4 x i32> %in) nounwind { ; ; AVX1OR2-LABEL: testv4i32: ; AVX1OR2: # %bb.0: -; AVX1OR2-NEXT: vmovdqa {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; AVX1OR2-NEXT: vmovq {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] ; AVX1OR2-NEXT: vpshufb %xmm0, %xmm1, %xmm2 ; AVX1OR2-NEXT: vpsrlw $4, %xmm0, %xmm3 ; AVX1OR2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm3 @@ -731,7 +731,7 @@ define <4 x i32> @testv4i32(<4 x i32> %in) nounwind { ; ; AVX512VL-LABEL: testv4i32: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; AVX512VL-NEXT: vmovq {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] ; AVX512VL-NEXT: vpshufb %xmm0, %xmm1, %xmm2 ; AVX512VL-NEXT: vpsrlw $4, %xmm0, %xmm3 ; AVX512VL-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm3, %xmm3 @@ -754,7 +754,7 @@ define <4 x i32> @testv4i32(<4 x i32> %in) nounwind { ; ; AVX512VLBWDQ-LABEL: testv4i32: ; AVX512VLBWDQ: # %bb.0: -; AVX512VLBWDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; AVX512VLBWDQ-NEXT: vmovq {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] ; AVX512VLBWDQ-NEXT: vpshufb %xmm0, %xmm1, %xmm2 ; AVX512VLBWDQ-NEXT: vpsrlw $4, %xmm0, %xmm3 ; AVX512VLBWDQ-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm3, %xmm3 @@ -790,7 +790,7 @@ define <4 x i32> @testv4i32(<4 x i32> %in) nounwind { ; ; X86-SSE-LABEL: testv4i32: ; X86-SSE: # %bb.0: -; X86-SSE-NEXT: movdqa {{.*#+}} xmm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; X86-SSE-NEXT: movq {{.*#+}} xmm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] ; X86-SSE-NEXT: movdqa %xmm3, %xmm4 ; X86-SSE-NEXT: pshufb %xmm0, %xmm4 ; X86-SSE-NEXT: movdqa %xmm0, %xmm1 @@ -905,7 +905,7 @@ define <4 x i32> @testv4i32u(<4 x i32> %in) nounwind { ; ; SSSE3-LABEL: testv4i32u: ; SSSE3: # %bb.0: -; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; 
SSSE3-NEXT: movq {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] ; SSSE3-NEXT: movdqa %xmm1, %xmm2 ; SSSE3-NEXT: pshufb %xmm0, %xmm2 ; SSSE3-NEXT: movdqa %xmm0, %xmm3 @@ -931,7 +931,7 @@ define <4 x i32> @testv4i32u(<4 x i32> %in) nounwind { ; ; SSE41-LABEL: testv4i32u: ; SSE41: # %bb.0: -; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; SSE41-NEXT: movq {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] ; SSE41-NEXT: movdqa %xmm1, %xmm2 ; SSE41-NEXT: pshufb %xmm0, %xmm2 ; SSE41-NEXT: movdqa %xmm0, %xmm3 @@ -957,7 +957,7 @@ define <4 x i32> @testv4i32u(<4 x i32> %in) nounwind { ; ; AVX1OR2-LABEL: testv4i32u: ; AVX1OR2: # %bb.0: -; AVX1OR2-NEXT: vmovdqa {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; AVX1OR2-NEXT: vmovq {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] ; AVX1OR2-NEXT: vpshufb %xmm0, %xmm1, %xmm2 ; AVX1OR2-NEXT: vpsrlw $4, %xmm0, %xmm3 ; AVX1OR2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm3 @@ -980,7 +980,7 @@ define <4 x i32> @testv4i32u(<4 x i32> %in) nounwind { ; ; AVX512VL-LABEL: testv4i32u: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; AVX512VL-NEXT: vmovq {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] ; AVX512VL-NEXT: vpshufb %xmm0, %xmm1, %xmm2 ; AVX512VL-NEXT: vpsrlw $4, %xmm0, %xmm3 ; AVX512VL-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm3, %xmm3 @@ -1003,7 +1003,7 @@ define <4 x i32> @testv4i32u(<4 x i32> %in) nounwind { ; ; AVX512VLBWDQ-LABEL: testv4i32u: ; AVX512VLBWDQ: # %bb.0: -; AVX512VLBWDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; AVX512VLBWDQ-NEXT: vmovq {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] ; AVX512VLBWDQ-NEXT: vpshufb %xmm0, %xmm1, %xmm2 ; AVX512VLBWDQ-NEXT: vpsrlw $4, %xmm0, %xmm3 ; AVX512VLBWDQ-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm3, %xmm3 @@ -1039,7 +1039,7 @@ define <4 x i32> @testv4i32u(<4 x i32> %in) nounwind { ; ; X86-SSE-LABEL: testv4i32u: ; X86-SSE: # %bb.0: -; X86-SSE-NEXT: movdqa {{.*#+}} xmm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; X86-SSE-NEXT: movq {{.*#+}} xmm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] ; X86-SSE-NEXT: movdqa %xmm3, %xmm4 ; X86-SSE-NEXT: pshufb %xmm0, %xmm4 ; X86-SSE-NEXT: movdqa %xmm0, %xmm1 @@ -1142,7 +1142,7 @@ define <8 x i16> @testv8i16(<8 x i16> %in) nounwind { ; ; SSSE3-LABEL: testv8i16: ; SSSE3: # %bb.0: -; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; SSSE3-NEXT: movq {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] ; SSSE3-NEXT: movdqa %xmm1, %xmm2 ; SSSE3-NEXT: pshufb %xmm0, %xmm2 ; SSSE3-NEXT: movdqa %xmm0, %xmm3 @@ -1162,7 +1162,7 @@ define <8 x i16> @testv8i16(<8 x i16> %in) nounwind { ; ; SSE41-LABEL: testv8i16: ; SSE41: # %bb.0: -; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; SSE41-NEXT: movq {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] ; SSE41-NEXT: movdqa %xmm1, %xmm2 ; SSE41-NEXT: pshufb %xmm0, %xmm2 ; SSE41-NEXT: movdqa %xmm0, %xmm3 @@ -1182,7 +1182,7 @@ define <8 x i16> @testv8i16(<8 x i16> %in) nounwind { ; ; AVX1OR2-LABEL: testv8i16: ; AVX1OR2: # %bb.0: -; AVX1OR2-NEXT: vmovdqa {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; AVX1OR2-NEXT: vmovq {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] ; AVX1OR2-NEXT: vpshufb %xmm0, %xmm1, %xmm2 ; AVX1OR2-NEXT: vpsrlw $4, %xmm0, %xmm3 ; AVX1OR2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm3 @@ -1200,7 +1200,7 @@ define <8 x i16> @testv8i16(<8 x i16> %in) nounwind { ; ; AVX512VL-LABEL: testv8i16: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: 
vmovdqa {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; AVX512VL-NEXT: vmovq {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] ; AVX512VL-NEXT: vpshufb %xmm0, %xmm1, %xmm2 ; AVX512VL-NEXT: vpsrlw $4, %xmm0, %xmm3 ; AVX512VL-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm3, %xmm3 @@ -1218,7 +1218,7 @@ define <8 x i16> @testv8i16(<8 x i16> %in) nounwind { ; ; AVX512VLBWDQ-LABEL: testv8i16: ; AVX512VLBWDQ: # %bb.0: -; AVX512VLBWDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; AVX512VLBWDQ-NEXT: vmovq {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] ; AVX512VLBWDQ-NEXT: vpshufb %xmm0, %xmm1, %xmm2 ; AVX512VLBWDQ-NEXT: vpsrlw $4, %xmm0, %xmm3 ; AVX512VLBWDQ-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm3, %xmm3 @@ -1254,7 +1254,7 @@ define <8 x i16> @testv8i16(<8 x i16> %in) nounwind { ; ; X86-SSE-LABEL: testv8i16: ; X86-SSE: # %bb.0: -; X86-SSE-NEXT: movdqa {{.*#+}} xmm2 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; X86-SSE-NEXT: movq {{.*#+}} xmm2 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] ; X86-SSE-NEXT: movdqa %xmm2, %xmm3 ; X86-SSE-NEXT: pshufb %xmm0, %xmm3 ; X86-SSE-NEXT: movdqa %xmm0, %xmm1 @@ -1350,7 +1350,7 @@ define <8 x i16> @testv8i16u(<8 x i16> %in) nounwind { ; ; SSSE3-LABEL: testv8i16u: ; SSSE3: # %bb.0: -; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; SSSE3-NEXT: movq {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] ; SSSE3-NEXT: movdqa %xmm1, %xmm2 ; SSSE3-NEXT: pshufb %xmm0, %xmm2 ; SSSE3-NEXT: movdqa %xmm0, %xmm3 @@ -1370,7 +1370,7 @@ define <8 x i16> @testv8i16u(<8 x i16> %in) nounwind { ; ; SSE41-LABEL: testv8i16u: ; SSE41: # %bb.0: -; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; SSE41-NEXT: movq {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] ; SSE41-NEXT: movdqa %xmm1, %xmm2 ; SSE41-NEXT: pshufb %xmm0, %xmm2 ; SSE41-NEXT: movdqa %xmm0, %xmm3 @@ -1390,7 +1390,7 @@ define <8 x i16> @testv8i16u(<8 x i16> %in) nounwind { ; ; AVX1OR2-LABEL: testv8i16u: ; AVX1OR2: # %bb.0: -; AVX1OR2-NEXT: vmovdqa {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; AVX1OR2-NEXT: vmovq {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] ; AVX1OR2-NEXT: vpshufb %xmm0, %xmm1, %xmm2 ; AVX1OR2-NEXT: vpsrlw $4, %xmm0, %xmm3 ; AVX1OR2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm3 @@ -1408,7 +1408,7 @@ define <8 x i16> @testv8i16u(<8 x i16> %in) nounwind { ; ; AVX512VL-LABEL: testv8i16u: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; AVX512VL-NEXT: vmovq {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] ; AVX512VL-NEXT: vpshufb %xmm0, %xmm1, %xmm2 ; AVX512VL-NEXT: vpsrlw $4, %xmm0, %xmm3 ; AVX512VL-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm3, %xmm3 @@ -1426,7 +1426,7 @@ define <8 x i16> @testv8i16u(<8 x i16> %in) nounwind { ; ; AVX512VLBWDQ-LABEL: testv8i16u: ; AVX512VLBWDQ: # %bb.0: -; AVX512VLBWDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; AVX512VLBWDQ-NEXT: vmovq {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] ; AVX512VLBWDQ-NEXT: vpshufb %xmm0, %xmm1, %xmm2 ; AVX512VLBWDQ-NEXT: vpsrlw $4, %xmm0, %xmm3 ; AVX512VLBWDQ-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm3, %xmm3 @@ -1462,7 +1462,7 @@ define <8 x i16> @testv8i16u(<8 x i16> %in) nounwind { ; ; X86-SSE-LABEL: testv8i16u: ; X86-SSE: # %bb.0: -; X86-SSE-NEXT: movdqa {{.*#+}} xmm2 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; X86-SSE-NEXT: movq {{.*#+}} xmm2 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] ; X86-SSE-NEXT: movdqa %xmm2, %xmm3 ; X86-SSE-NEXT: pshufb %xmm0, 
%xmm3 ; X86-SSE-NEXT: movdqa %xmm0, %xmm1 @@ -1552,7 +1552,7 @@ define <16 x i8> @testv16i8(<16 x i8> %in) nounwind { ; ; SSSE3-LABEL: testv16i8: ; SSSE3: # %bb.0: -; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; SSSE3-NEXT: movq {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] ; SSSE3-NEXT: movdqa %xmm1, %xmm2 ; SSSE3-NEXT: pshufb %xmm0, %xmm2 ; SSSE3-NEXT: psrlw $4, %xmm0 @@ -1567,7 +1567,7 @@ define <16 x i8> @testv16i8(<16 x i8> %in) nounwind { ; ; SSE41-LABEL: testv16i8: ; SSE41: # %bb.0: -; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; SSE41-NEXT: movq {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] ; SSE41-NEXT: movdqa %xmm1, %xmm2 ; SSE41-NEXT: pshufb %xmm0, %xmm2 ; SSE41-NEXT: psrlw $4, %xmm0 @@ -1582,7 +1582,7 @@ define <16 x i8> @testv16i8(<16 x i8> %in) nounwind { ; ; AVX1OR2-LABEL: testv16i8: ; AVX1OR2: # %bb.0: -; AVX1OR2-NEXT: vmovdqa {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; AVX1OR2-NEXT: vmovq {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] ; AVX1OR2-NEXT: vpshufb %xmm0, %xmm1, %xmm2 ; AVX1OR2-NEXT: vpsrlw $4, %xmm0, %xmm0 ; AVX1OR2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 @@ -1595,7 +1595,7 @@ define <16 x i8> @testv16i8(<16 x i8> %in) nounwind { ; ; AVX512VL-LABEL: testv16i8: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; AVX512VL-NEXT: vmovq {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] ; AVX512VL-NEXT: vpshufb %xmm0, %xmm1, %xmm2 ; AVX512VL-NEXT: vpsrlw $4, %xmm0, %xmm0 ; AVX512VL-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0 @@ -1608,7 +1608,7 @@ define <16 x i8> @testv16i8(<16 x i8> %in) nounwind { ; ; AVX512VLBWDQ-LABEL: testv16i8: ; AVX512VLBWDQ: # %bb.0: -; AVX512VLBWDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; AVX512VLBWDQ-NEXT: vmovq {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] ; AVX512VLBWDQ-NEXT: vpshufb %xmm0, %xmm1, %xmm2 ; AVX512VLBWDQ-NEXT: vpsrlw $4, %xmm0, %xmm0 ; AVX512VLBWDQ-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0 @@ -1630,7 +1630,7 @@ define <16 x i8> @testv16i8(<16 x i8> %in) nounwind { ; ; X86-SSE-LABEL: testv16i8: ; X86-SSE: # %bb.0: -; X86-SSE-NEXT: movdqa {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; X86-SSE-NEXT: movq {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] ; X86-SSE-NEXT: movdqa %xmm1, %xmm2 ; X86-SSE-NEXT: pshufb %xmm0, %xmm2 ; X86-SSE-NEXT: psrlw $4, %xmm0 @@ -1715,7 +1715,7 @@ define <16 x i8> @testv16i8u(<16 x i8> %in) nounwind { ; ; SSSE3-LABEL: testv16i8u: ; SSSE3: # %bb.0: -; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; SSSE3-NEXT: movq {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] ; SSSE3-NEXT: movdqa %xmm1, %xmm2 ; SSSE3-NEXT: pshufb %xmm0, %xmm2 ; SSSE3-NEXT: psrlw $4, %xmm0 @@ -1730,7 +1730,7 @@ define <16 x i8> @testv16i8u(<16 x i8> %in) nounwind { ; ; SSE41-LABEL: testv16i8u: ; SSE41: # %bb.0: -; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; SSE41-NEXT: movq {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] ; SSE41-NEXT: movdqa %xmm1, %xmm2 ; SSE41-NEXT: pshufb %xmm0, %xmm2 ; SSE41-NEXT: psrlw $4, %xmm0 @@ -1745,7 +1745,7 @@ define <16 x i8> @testv16i8u(<16 x i8> %in) nounwind { ; ; AVX1OR2-LABEL: testv16i8u: ; AVX1OR2: # %bb.0: -; AVX1OR2-NEXT: vmovdqa {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; AVX1OR2-NEXT: vmovq {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] ; AVX1OR2-NEXT: vpshufb %xmm0, %xmm1, %xmm2 ; AVX1OR2-NEXT: vpsrlw $4, %xmm0, 
%xmm0 ; AVX1OR2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 @@ -1758,7 +1758,7 @@ define <16 x i8> @testv16i8u(<16 x i8> %in) nounwind { ; ; AVX512VL-LABEL: testv16i8u: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; AVX512VL-NEXT: vmovq {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] ; AVX512VL-NEXT: vpshufb %xmm0, %xmm1, %xmm2 ; AVX512VL-NEXT: vpsrlw $4, %xmm0, %xmm0 ; AVX512VL-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0 @@ -1771,7 +1771,7 @@ define <16 x i8> @testv16i8u(<16 x i8> %in) nounwind { ; ; AVX512VLBWDQ-LABEL: testv16i8u: ; AVX512VLBWDQ: # %bb.0: -; AVX512VLBWDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; AVX512VLBWDQ-NEXT: vmovq {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] ; AVX512VLBWDQ-NEXT: vpshufb %xmm0, %xmm1, %xmm2 ; AVX512VLBWDQ-NEXT: vpsrlw $4, %xmm0, %xmm0 ; AVX512VLBWDQ-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0 @@ -1793,7 +1793,7 @@ define <16 x i8> @testv16i8u(<16 x i8> %in) nounwind { ; ; X86-SSE-LABEL: testv16i8u: ; X86-SSE: # %bb.0: -; X86-SSE-NEXT: movdqa {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; X86-SSE-NEXT: movq {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] ; X86-SSE-NEXT: movdqa %xmm1, %xmm2 ; X86-SSE-NEXT: pshufb %xmm0, %xmm2 ; X86-SSE-NEXT: psrlw $4, %xmm0 @@ -1812,22 +1812,22 @@ define <16 x i8> @testv16i8u(<16 x i8> %in) nounwind { define <2 x i64> @foldv2i64() nounwind { ; SSE-LABEL: foldv2i64: ; SSE: # %bb.0: -; SSE-NEXT: movaps {{.*#+}} xmm0 = [55,0,0,0] +; SSE-NEXT: movss {{.*#+}} xmm0 = [55,0,0,0] ; SSE-NEXT: retq ; ; NOBW-LABEL: foldv2i64: ; NOBW: # %bb.0: -; NOBW-NEXT: vmovaps {{.*#+}} xmm0 = [55,0,0,0] +; NOBW-NEXT: vmovss {{.*#+}} xmm0 = [55,0,0,0] ; NOBW-NEXT: retq ; ; AVX512VLBWDQ-LABEL: foldv2i64: ; AVX512VLBWDQ: # %bb.0: -; AVX512VLBWDQ-NEXT: vmovaps {{.*#+}} xmm0 = [55,0,0,0] +; AVX512VLBWDQ-NEXT: vmovss {{.*#+}} xmm0 = [55,0,0,0] ; AVX512VLBWDQ-NEXT: retq ; ; X86-SSE-LABEL: foldv2i64: ; X86-SSE: # %bb.0: -; X86-SSE-NEXT: movaps {{.*#+}} xmm0 = [55,0,0,0] +; X86-SSE-NEXT: movss {{.*#+}} xmm0 = [55,0,0,0] ; X86-SSE-NEXT: retl %out = call <2 x i64> @llvm.ctlz.v2i64(<2 x i64> , i1 0) ret <2 x i64> %out @@ -1836,22 +1836,22 @@ define <2 x i64> @foldv2i64() nounwind { define <2 x i64> @foldv2i64u() nounwind { ; SSE-LABEL: foldv2i64u: ; SSE: # %bb.0: -; SSE-NEXT: movaps {{.*#+}} xmm0 = [55,0,0,0] +; SSE-NEXT: movss {{.*#+}} xmm0 = [55,0,0,0] ; SSE-NEXT: retq ; ; NOBW-LABEL: foldv2i64u: ; NOBW: # %bb.0: -; NOBW-NEXT: vmovaps {{.*#+}} xmm0 = [55,0,0,0] +; NOBW-NEXT: vmovss {{.*#+}} xmm0 = [55,0,0,0] ; NOBW-NEXT: retq ; ; AVX512VLBWDQ-LABEL: foldv2i64u: ; AVX512VLBWDQ: # %bb.0: -; AVX512VLBWDQ-NEXT: vmovaps {{.*#+}} xmm0 = [55,0,0,0] +; AVX512VLBWDQ-NEXT: vmovss {{.*#+}} xmm0 = [55,0,0,0] ; AVX512VLBWDQ-NEXT: retq ; ; X86-SSE-LABEL: foldv2i64u: ; X86-SSE: # %bb.0: -; X86-SSE-NEXT: movaps {{.*#+}} xmm0 = [55,0,0,0] +; X86-SSE-NEXT: movss {{.*#+}} xmm0 = [55,0,0,0] ; X86-SSE-NEXT: retl %out = call <2 x i64> @llvm.ctlz.v2i64(<2 x i64> , i1 -1) ret <2 x i64> %out diff --git a/llvm/test/CodeGen/X86/vector-lzcnt-256.ll b/llvm/test/CodeGen/X86/vector-lzcnt-256.ll index fe6836c045f3be..8a0d9a6134cea5 100644 --- a/llvm/test/CodeGen/X86/vector-lzcnt-256.ll +++ b/llvm/test/CodeGen/X86/vector-lzcnt-256.ll @@ -13,7 +13,7 @@ define <4 x i64> @testv4i64(<4 x i64> %in) nounwind { ; AVX1-LABEL: testv4i64: ; AVX1: # %bb.0: ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = 
[4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; AVX1-NEXT: vmovq {{.*#+}} xmm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] ; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm4 ; AVX1-NEXT: vpsrlw $4, %xmm2, %xmm1 ; AVX1-NEXT: vbroadcastss {{.*#+}} xmm5 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] @@ -199,7 +199,7 @@ define <4 x i64> @testv4i64u(<4 x i64> %in) nounwind { ; AVX1-LABEL: testv4i64u: ; AVX1: # %bb.0: ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; AVX1-NEXT: vmovq {{.*#+}} xmm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] ; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm4 ; AVX1-NEXT: vpsrlw $4, %xmm2, %xmm1 ; AVX1-NEXT: vbroadcastss {{.*#+}} xmm5 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] @@ -385,7 +385,7 @@ define <8 x i32> @testv8i32(<8 x i32> %in) nounwind { ; AVX1-LABEL: testv8i32: ; AVX1: # %bb.0: ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; AVX1-NEXT: vmovq {{.*#+}} xmm2 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] ; AVX1-NEXT: vpshufb %xmm1, %xmm2, %xmm3 ; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm4 ; AVX1-NEXT: vbroadcastss {{.*#+}} xmm5 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] @@ -541,7 +541,7 @@ define <8 x i32> @testv8i32u(<8 x i32> %in) nounwind { ; AVX1-LABEL: testv8i32u: ; AVX1: # %bb.0: ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; AVX1-NEXT: vmovq {{.*#+}} xmm2 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] ; AVX1-NEXT: vpshufb %xmm1, %xmm2, %xmm3 ; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm4 ; AVX1-NEXT: vbroadcastss {{.*#+}} xmm5 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] @@ -697,7 +697,7 @@ define <16 x i16> @testv16i16(<16 x i16> %in) nounwind { ; AVX1-LABEL: testv16i16: ; AVX1: # %bb.0: ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; AVX1-NEXT: vmovq {{.*#+}} xmm2 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] ; AVX1-NEXT: vpshufb %xmm1, %xmm2, %xmm3 ; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm4 ; AVX1-NEXT: vbroadcastss {{.*#+}} xmm5 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] @@ -818,7 +818,7 @@ define <16 x i16> @testv16i16u(<16 x i16> %in) nounwind { ; AVX1-LABEL: testv16i16u: ; AVX1: # %bb.0: ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; AVX1-NEXT: vmovq {{.*#+}} xmm2 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] ; AVX1-NEXT: vpshufb %xmm1, %xmm2, %xmm3 ; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm4 ; AVX1-NEXT: vbroadcastss {{.*#+}} xmm5 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] @@ -939,7 +939,7 @@ define <32 x i8> @testv32i8(<32 x i8> %in) nounwind { ; AVX1-LABEL: testv32i8: ; AVX1: # %bb.0: ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; AVX1-NEXT: vmovq {{.*#+}} xmm2 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] ; AVX1-NEXT: vpshufb %xmm1, %xmm2, %xmm3 ; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm1 ; AVX1-NEXT: vbroadcastss {{.*#+}} xmm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] @@ -1035,7 +1035,7 @@ define <32 x i8> @testv32i8u(<32 x i8> %in) nounwind { ; AVX1-LABEL: testv32i8u: ; AVX1: # %bb.0: ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; AVX1-NEXT: vmovq {{.*#+}} xmm2 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] ; AVX1-NEXT: vpshufb %xmm1, %xmm2, %xmm3 ; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm1 ; AVX1-NEXT: vbroadcastss {{.*#+}} xmm4 = 
[15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] diff --git a/llvm/test/CodeGen/X86/vector-mulfix-legalize.ll b/llvm/test/CodeGen/X86/vector-mulfix-legalize.ll index 82afa15079182f..b233855029c582 100644 --- a/llvm/test/CodeGen/X86/vector-mulfix-legalize.ll +++ b/llvm/test/CodeGen/X86/vector-mulfix-legalize.ll @@ -13,7 +13,7 @@ declare <4 x i16> @llvm.umul.fix.sat.v4i16(<4 x i16>, <4 x i16>, i32 immarg) define <4 x i16> @smulfix(<4 x i16> %a) { ; CHECK-LABEL: smulfix: ; CHECK: # %bb.0: -; CHECK-NEXT: movdqa {{.*#+}} xmm1 = [1,2,3,4,u,u,u,u] +; CHECK-NEXT: movq {{.*#+}} xmm1 = [1,2,3,4,0,0,0,0] ; CHECK-NEXT: movdqa %xmm0, %xmm2 ; CHECK-NEXT: pmullw %xmm1, %xmm2 ; CHECK-NEXT: psrlw $15, %xmm2 @@ -28,7 +28,7 @@ define <4 x i16> @smulfix(<4 x i16> %a) { define <4 x i16> @umulfix(<4 x i16> %a) { ; CHECK-LABEL: umulfix: ; CHECK: # %bb.0: -; CHECK-NEXT: movdqa {{.*#+}} xmm1 = [1,2,3,4,u,u,u,u] +; CHECK-NEXT: movq {{.*#+}} xmm1 = [1,2,3,4,0,0,0,0] ; CHECK-NEXT: movdqa %xmm0, %xmm2 ; CHECK-NEXT: pmullw %xmm1, %xmm2 ; CHECK-NEXT: psrlw $15, %xmm2 diff --git a/llvm/test/CodeGen/X86/vector-reduce-add-mask.ll b/llvm/test/CodeGen/X86/vector-reduce-add-mask.ll index dbdc3e09fcef08..8056b9a2963c31 100644 --- a/llvm/test/CodeGen/X86/vector-reduce-add-mask.ll +++ b/llvm/test/CodeGen/X86/vector-reduce-add-mask.ll @@ -864,7 +864,7 @@ define i16 @test_v4i16_v4i8(<4 x i16> %a0) { ; ; SSE41-LABEL: test_v4i16_v4i8: ; SSE41: # %bb.0: -; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [u,32768,16384,8192,u,u,u,u] +; SSE41-NEXT: movq {{.*#+}} xmm1 = [0,32768,16384,8192,0,0,0,0] ; SSE41-NEXT: pmulhuw %xmm0, %xmm1 ; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3,4,5,6,7] ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1] @@ -914,7 +914,7 @@ define i16 @test_v4i16_v4i8(<4 x i16> %a0) { ; AVX512BW-LABEL: test_v4i16_v4i8: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 -; AVX512BW-NEXT: vpbroadcastq {{.*#+}} xmm1 = [0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovq {{.*#+}} xmm1 = [0,1,2,3,0,0,0,0] ; AVX512BW-NEXT: vpsrlvw %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; AVX512BW-NEXT: vpaddw %xmm1, %xmm0, %xmm0 diff --git a/llvm/test/CodeGen/X86/vector-rotate-128.ll b/llvm/test/CodeGen/X86/vector-rotate-128.ll index 43c9be2dc6f976..ad810c092bf55e 100644 --- a/llvm/test/CodeGen/X86/vector-rotate-128.ll +++ b/llvm/test/CodeGen/X86/vector-rotate-128.ll @@ -719,7 +719,7 @@ define <2 x i64> @splatvar_rotate_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind { ; ; X86-SSE2-LABEL: splatvar_rotate_v2i64: ; X86-SSE2: # %bb.0: -; X86-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [64,0,0,0] +; X86-SSE2-NEXT: movd {{.*#+}} xmm2 = [64,0,0,0] ; X86-SSE2-NEXT: psubq %xmm1, %xmm2 ; X86-SSE2-NEXT: movdqa %xmm0, %xmm3 ; X86-SSE2-NEXT: psllq %xmm1, %xmm3 @@ -815,7 +815,7 @@ define <8 x i16> @splatvar_rotate_v8i16(<8 x i16> %a, <8 x i16> %b) nounwind { ; ; SSE41-LABEL: splatvar_rotate_v8i16: ; SSE41: # %bb.0: -; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [15,0,0,0] +; SSE41-NEXT: movd {{.*#+}} xmm2 = [15,0,0,0] ; SSE41-NEXT: movdqa %xmm1, %xmm3 ; SSE41-NEXT: pandn %xmm2, %xmm3 ; SSE41-NEXT: movdqa %xmm0, %xmm4 @@ -828,7 +828,7 @@ define <8 x i16> @splatvar_rotate_v8i16(<8 x i16> %a, <8 x i16> %b) nounwind { ; ; AVX-LABEL: splatvar_rotate_v8i16: ; AVX: # %bb.0: -; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = [15,0,0,0] +; AVX-NEXT: vmovd {{.*#+}} xmm2 = [15,0,0,0] ; AVX-NEXT: vpandn %xmm2, %xmm1, %xmm3 ; AVX-NEXT: vpsrlw $1, %xmm0, %xmm4 ; AVX-NEXT: vpsrlw %xmm3, %xmm4, %xmm3 @@ -839,7 +839,7 @@ define <8 x i16> 
@splatvar_rotate_v8i16(<8 x i16> %a, <8 x i16> %b) nounwind { ; ; AVX512F-LABEL: splatvar_rotate_v8i16: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovdqa {{.*#+}} xmm2 = [15,0,0,0] +; AVX512F-NEXT: vmovd {{.*#+}} xmm2 = [15,0,0,0] ; AVX512F-NEXT: vpandn %xmm2, %xmm1, %xmm3 ; AVX512F-NEXT: vpsrlw $1, %xmm0, %xmm4 ; AVX512F-NEXT: vpsrlw %xmm3, %xmm4, %xmm3 @@ -850,7 +850,7 @@ define <8 x i16> @splatvar_rotate_v8i16(<8 x i16> %a, <8 x i16> %b) nounwind { ; ; AVX512VL-LABEL: splatvar_rotate_v8i16: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm2 = [15,0,0,0] +; AVX512VL-NEXT: vmovd {{.*#+}} xmm2 = [15,0,0,0] ; AVX512VL-NEXT: vpandn %xmm2, %xmm1, %xmm3 ; AVX512VL-NEXT: vpsrlw $1, %xmm0, %xmm4 ; AVX512VL-NEXT: vpsrlw %xmm3, %xmm4, %xmm3 @@ -861,7 +861,7 @@ define <8 x i16> @splatvar_rotate_v8i16(<8 x i16> %a, <8 x i16> %b) nounwind { ; ; AVX512BW-LABEL: splatvar_rotate_v8i16: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm2 = [15,0,0,0] +; AVX512BW-NEXT: vmovd {{.*#+}} xmm2 = [15,0,0,0] ; AVX512BW-NEXT: vpandn %xmm2, %xmm1, %xmm3 ; AVX512BW-NEXT: vpsrlw $1, %xmm0, %xmm4 ; AVX512BW-NEXT: vpsrlw %xmm3, %xmm4, %xmm3 @@ -872,7 +872,7 @@ define <8 x i16> @splatvar_rotate_v8i16(<8 x i16> %a, <8 x i16> %b) nounwind { ; ; AVX512VLBW-LABEL: splatvar_rotate_v8i16: ; AVX512VLBW: # %bb.0: -; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm2 = [15,0,0,0] +; AVX512VLBW-NEXT: vmovd {{.*#+}} xmm2 = [15,0,0,0] ; AVX512VLBW-NEXT: vpandn %xmm2, %xmm1, %xmm3 ; AVX512VLBW-NEXT: vpsrlw $1, %xmm0, %xmm4 ; AVX512VLBW-NEXT: vpsrlw %xmm3, %xmm4, %xmm3 diff --git a/llvm/test/CodeGen/X86/vector-rotate-256.ll b/llvm/test/CodeGen/X86/vector-rotate-256.ll index c55335f8495697..dae2cf382b8205 100644 --- a/llvm/test/CodeGen/X86/vector-rotate-256.ll +++ b/llvm/test/CodeGen/X86/vector-rotate-256.ll @@ -649,7 +649,7 @@ define <8 x i32> @splatvar_rotate_v8i32(<8 x i32> %a, <8 x i32> %b) nounwind { define <16 x i16> @splatvar_rotate_v16i16(<16 x i16> %a, <16 x i16> %b) nounwind { ; AVX1-LABEL: splatvar_rotate_v16i16: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [15,0,0,0] +; AVX1-NEXT: vmovd {{.*#+}} xmm2 = [15,0,0,0] ; AVX1-NEXT: vpandn %xmm2, %xmm1, %xmm3 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4 ; AVX1-NEXT: vpsrlw $1, %xmm4, %xmm5 diff --git a/llvm/test/CodeGen/X86/vector-shift-ashr-sub128.ll b/llvm/test/CodeGen/X86/vector-shift-ashr-sub128.ll index e15be224bec8cd..ff41d883380a86 100644 --- a/llvm/test/CodeGen/X86/vector-shift-ashr-sub128.ll +++ b/llvm/test/CodeGen/X86/vector-shift-ashr-sub128.ll @@ -1786,7 +1786,7 @@ define <4 x i16> @constant_shift_v4i16(<4 x i16> %a) nounwind { ; ; SSE41-LABEL: constant_shift_v4i16: ; SSE41: # %bb.0: -; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [u,u,16384,8192,u,u,u,u] +; SSE41-NEXT: movq {{.*#+}} xmm1 = [0,0,16384,8192,0,0,0,0] ; SSE41-NEXT: pmulhw %xmm0, %xmm1 ; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3,4,5,6,7] ; SSE41-NEXT: psraw $1, %xmm0 @@ -1818,7 +1818,7 @@ define <4 x i16> @constant_shift_v4i16(<4 x i16> %a) nounwind { ; AVX512BW-LABEL: constant_shift_v4i16: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 -; AVX512BW-NEXT: vpbroadcastq {{.*#+}} xmm1 = [0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovq {{.*#+}} xmm1 = [0,1,2,3,0,0,0,0] ; AVX512BW-NEXT: vpsravw %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 ; AVX512BW-NEXT: vzeroupper @@ -1896,7 +1896,7 @@ define <2 x i16> @constant_shift_v2i16(<2 x i16> %a) nounwind { ; AVX512BW-LABEL: constant_shift_v2i16: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: # kill: 
def $xmm0 killed $xmm0 def $zmm0 -; AVX512BW-NEXT: vpbroadcastd {{.*#+}} xmm1 = [2,3,2,3,2,3,2,3] +; AVX512BW-NEXT: vmovd {{.*#+}} xmm1 = [2,3,0,0,0,0,0,0] ; AVX512BW-NEXT: vpsravw %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 ; AVX512BW-NEXT: vzeroupper diff --git a/llvm/test/CodeGen/X86/vector-shift-lshr-sub128.ll b/llvm/test/CodeGen/X86/vector-shift-lshr-sub128.ll index f340615464cfa7..71719e03c7c6dc 100644 --- a/llvm/test/CodeGen/X86/vector-shift-lshr-sub128.ll +++ b/llvm/test/CodeGen/X86/vector-shift-lshr-sub128.ll @@ -1486,7 +1486,7 @@ define <4 x i16> @constant_shift_v4i16(<4 x i16> %a) nounwind { ; ; SSE41-LABEL: constant_shift_v4i16: ; SSE41: # %bb.0: -; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [u,32768,16384,8192,u,u,u,u] +; SSE41-NEXT: movq {{.*#+}} xmm1 = [0,32768,16384,8192,0,0,0,0] ; SSE41-NEXT: pmulhuw %xmm0, %xmm1 ; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3,4,5,6,7] ; SSE41-NEXT: retq @@ -1511,7 +1511,7 @@ define <4 x i16> @constant_shift_v4i16(<4 x i16> %a) nounwind { ; AVX512BW-LABEL: constant_shift_v4i16: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 -; AVX512BW-NEXT: vpbroadcastq {{.*#+}} xmm1 = [0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovq {{.*#+}} xmm1 = [0,1,2,3,0,0,0,0] ; AVX512BW-NEXT: vpsrlvw %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 ; AVX512BW-NEXT: vzeroupper @@ -1581,7 +1581,7 @@ define <2 x i16> @constant_shift_v2i16(<2 x i16> %a) nounwind { ; AVX512BW-LABEL: constant_shift_v2i16: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 -; AVX512BW-NEXT: vpbroadcastd {{.*#+}} xmm1 = [2,3,2,3,2,3,2,3] +; AVX512BW-NEXT: vmovd {{.*#+}} xmm1 = [2,3,0,0,0,0,0,0] ; AVX512BW-NEXT: vpsrlvw %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 ; AVX512BW-NEXT: vzeroupper diff --git a/llvm/test/CodeGen/X86/vector-shift-shl-sub128.ll b/llvm/test/CodeGen/X86/vector-shift-shl-sub128.ll index 4d4642b18878eb..19645fd08c9468 100644 --- a/llvm/test/CodeGen/X86/vector-shift-shl-sub128.ll +++ b/llvm/test/CodeGen/X86/vector-shift-shl-sub128.ll @@ -1339,7 +1339,7 @@ define <4 x i16> @constant_shift_v4i16(<4 x i16> %a) nounwind { ; AVX512BW-LABEL: constant_shift_v4i16: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 -; AVX512BW-NEXT: vpbroadcastq {{.*#+}} xmm1 = [0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovq {{.*#+}} xmm1 = [0,1,2,3,0,0,0,0] ; AVX512BW-NEXT: vpsllvw %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 ; AVX512BW-NEXT: vzeroupper @@ -1399,7 +1399,7 @@ define <2 x i16> @constant_shift_v2i16(<2 x i16> %a) nounwind { ; AVX512BW-LABEL: constant_shift_v2i16: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 -; AVX512BW-NEXT: vpbroadcastd {{.*#+}} xmm1 = [2,3,2,3,2,3,2,3] +; AVX512BW-NEXT: vmovd {{.*#+}} xmm1 = [2,3,0,0,0,0,0,0] ; AVX512BW-NEXT: vpsllvw %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 ; AVX512BW-NEXT: vzeroupper diff --git a/llvm/test/CodeGen/X86/vector-shuffle-128-v16.ll b/llvm/test/CodeGen/X86/vector-shuffle-128-v16.ll index b40be9452ddd77..e298091bfb983f 100644 --- a/llvm/test/CodeGen/X86/vector-shuffle-128-v16.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-128-v16.ll @@ -1003,7 +1003,7 @@ define <16 x i8> @shuffle_v16i8_01_03_05_07_09_11_13_15_17_19_21_23_25_27_29_31( ; ; SSSE3-LABEL: shuffle_v16i8_01_03_05_07_09_11_13_15_17_19_21_23_25_27_29_31: ; SSSE3: # %bb.0: -; SSSE3-NEXT: 
movdqa {{.*#+}} xmm2 = [1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u] +; SSSE3-NEXT: movq {{.*#+}} xmm2 = [1,3,5,7,9,11,13,15,0,0,0,0,0,0,0,0] ; SSSE3-NEXT: pshufb %xmm2, %xmm1 ; SSSE3-NEXT: pshufb %xmm2, %xmm0 ; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] @@ -1011,7 +1011,7 @@ define <16 x i8> @shuffle_v16i8_01_03_05_07_09_11_13_15_17_19_21_23_25_27_29_31( ; ; SSE41-LABEL: shuffle_v16i8_01_03_05_07_09_11_13_15_17_19_21_23_25_27_29_31: ; SSE41: # %bb.0: -; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u] +; SSE41-NEXT: movq {{.*#+}} xmm2 = [1,3,5,7,9,11,13,15,0,0,0,0,0,0,0,0] ; SSE41-NEXT: pshufb %xmm2, %xmm1 ; SSE41-NEXT: pshufb %xmm2, %xmm0 ; SSE41-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] @@ -1019,8 +1019,7 @@ define <16 x i8> @shuffle_v16i8_01_03_05_07_09_11_13_15_17_19_21_23_25_27_29_31( ; ; AVX1-LABEL: shuffle_v16i8_01_03_05_07_09_11_13_15_17_19_21_23_25_27_29_31: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovddup {{.*#+}} xmm2 = [1,3,5,7,9,11,13,15,1,3,5,7,9,11,13,15] -; AVX1-NEXT: # xmm2 = mem[0,0] +; AVX1-NEXT: vmovq {{.*#+}} xmm2 = [1,3,5,7,9,11,13,15,0,0,0,0,0,0,0,0] ; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] @@ -1028,7 +1027,7 @@ define <16 x i8> @shuffle_v16i8_01_03_05_07_09_11_13_15_17_19_21_23_25_27_29_31( ; ; AVX2-LABEL: shuffle_v16i8_01_03_05_07_09_11_13_15_17_19_21_23_25_27_29_31: ; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm2 = [1,3,5,7,9,11,13,15,1,3,5,7,9,11,13,15] +; AVX2-NEXT: vmovq {{.*#+}} xmm2 = [1,3,5,7,9,11,13,15,0,0,0,0,0,0,0,0] ; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1 ; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] @@ -1036,7 +1035,7 @@ define <16 x i8> @shuffle_v16i8_01_03_05_07_09_11_13_15_17_19_21_23_25_27_29_31( ; ; AVX512VLBW-LABEL: shuffle_v16i8_01_03_05_07_09_11_13_15_17_19_21_23_25_27_29_31: ; AVX512VLBW: # %bb.0: -; AVX512VLBW-NEXT: vpbroadcastq {{.*#+}} xmm2 = [1,3,5,7,9,11,13,15,1,3,5,7,9,11,13,15] +; AVX512VLBW-NEXT: vmovq {{.*#+}} xmm2 = [1,3,5,7,9,11,13,15,0,0,0,0,0,0,0,0] ; AVX512VLBW-NEXT: vpshufb %xmm2, %xmm1, %xmm1 ; AVX512VLBW-NEXT: vpshufb %xmm2, %xmm0, %xmm0 ; AVX512VLBW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] diff --git a/llvm/test/CodeGen/X86/vector-shuffle-256-v16.ll b/llvm/test/CodeGen/X86/vector-shuffle-256-v16.ll index 5b4f15b51ec00a..bac63f2ddb5059 100644 --- a/llvm/test/CodeGen/X86/vector-shuffle-256-v16.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-256-v16.ll @@ -539,7 +539,7 @@ define <16 x i16> @shuffle_v16i16_00_00_00_12_00_00_00_00_00_00_00_00_00_00_00_0 ; ; AVX512VL-LABEL: shuffle_v16i16_00_00_00_12_00_00_00_00_00_00_00_00_00_00_00_00: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm1 = [0,0,0,12,0,0,0,0] +; AVX512VL-NEXT: vmovq {{.*#+}} xmm1 = [0,0,0,12,0,0,0,0] ; AVX512VL-NEXT: vpermw %ymm0, %ymm1, %ymm0 ; AVX512VL-NEXT: retq ; @@ -580,7 +580,7 @@ define <16 x i16> @shuffle_v16i16_00_00_13_00_00_00_00_00_00_00_00_00_00_00_00_0 ; ; AVX512VL-LABEL: shuffle_v16i16_00_00_13_00_00_00_00_00_00_00_00_00_00_00_00_00: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm1 = [0,0,13,0,0,0,0,0] +; AVX512VL-NEXT: vmovq {{.*#+}} xmm1 = [0,0,13,0,0,0,0,0] ; AVX512VL-NEXT: vpermw %ymm0, %ymm1, %ymm0 ; AVX512VL-NEXT: retq ; @@ -621,7 +621,7 @@ define <16 x i16> @shuffle_v16i16_00_14_00_00_00_00_00_00_00_00_00_00_00_00_00_0 ; ; AVX512VL-LABEL: shuffle_v16i16_00_14_00_00_00_00_00_00_00_00_00_00_00_00_00_00: ; AVX512VL: # %bb.0: -; 
AVX512VL-NEXT: vmovdqa {{.*#+}} xmm1 = [0,14,0,0,0,0,0,0] +; AVX512VL-NEXT: vmovd {{.*#+}} xmm1 = [0,14,0,0,0,0,0,0] ; AVX512VL-NEXT: vpermw %ymm0, %ymm1, %ymm0 ; AVX512VL-NEXT: retq ; @@ -662,7 +662,7 @@ define <16 x i16> @shuffle_v16i16_15_00_00_00_00_00_00_00_00_00_00_00_00_00_00_0 ; ; AVX512VL-LABEL: shuffle_v16i16_15_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm1 = [15,0,0,0] +; AVX512VL-NEXT: vmovd {{.*#+}} xmm1 = [15,0,0,0] ; AVX512VL-NEXT: vpermw %ymm0, %ymm1, %ymm0 ; AVX512VL-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/vector-shuffle-256-v32.ll b/llvm/test/CodeGen/X86/vector-shuffle-256-v32.ll index baa942794ccd8a..1cfa5e6dfdff53 100644 --- a/llvm/test/CodeGen/X86/vector-shuffle-256-v32.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-256-v32.ll @@ -1281,7 +1281,7 @@ define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_24_00_00_00_00_00_00_00_00_ ; ; AVX512VLVBMI-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_24_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00: ; AVX512VLVBMI: # %bb.0: -; AVX512VLVBMI-NEXT: vmovdqa {{.*#+}} xmm1 = [0,0,0,0,0,0,0,24,0,0,0,0,0,0,0,0] +; AVX512VLVBMI-NEXT: vmovq {{.*#+}} xmm1 = [0,0,0,0,0,0,0,24,0,0,0,0,0,0,0,0] ; AVX512VLVBMI-NEXT: vpermb %ymm0, %ymm1, %ymm0 ; AVX512VLVBMI-NEXT: retq ; @@ -1328,7 +1328,7 @@ define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_25_00_00_00_00_00_00_00_00_00_ ; ; AVX512VLVBMI-LABEL: shuffle_v32i8_00_00_00_00_00_00_25_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00: ; AVX512VLVBMI: # %bb.0: -; AVX512VLVBMI-NEXT: vmovdqa {{.*#+}} xmm1 = [0,0,0,0,0,0,25,0,0,0,0,0,0,0,0,0] +; AVX512VLVBMI-NEXT: vmovq {{.*#+}} xmm1 = [0,0,0,0,0,0,25,0,0,0,0,0,0,0,0,0] ; AVX512VLVBMI-NEXT: vpermb %ymm0, %ymm1, %ymm0 ; AVX512VLVBMI-NEXT: retq ; @@ -1375,7 +1375,7 @@ define <32 x i8> @shuffle_v32i8_00_00_00_00_00_26_00_00_00_00_00_00_00_00_00_00_ ; ; AVX512VLVBMI-LABEL: shuffle_v32i8_00_00_00_00_00_26_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00: ; AVX512VLVBMI: # %bb.0: -; AVX512VLVBMI-NEXT: vmovdqa {{.*#+}} xmm1 = [0,0,0,0,0,26,0,0,0,0,0,0,0,0,0,0] +; AVX512VLVBMI-NEXT: vmovq {{.*#+}} xmm1 = [0,0,0,0,0,26,0,0,0,0,0,0,0,0,0,0] ; AVX512VLVBMI-NEXT: vpermb %ymm0, %ymm1, %ymm0 ; AVX512VLVBMI-NEXT: retq ; @@ -1422,7 +1422,7 @@ define <32 x i8> @shuffle_v32i8_00_00_00_00_27_00_00_00_00_00_00_00_00_00_00_00_ ; ; AVX512VLVBMI-LABEL: shuffle_v32i8_00_00_00_00_27_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00: ; AVX512VLVBMI: # %bb.0: -; AVX512VLVBMI-NEXT: vmovdqa {{.*#+}} xmm1 = [0,0,0,0,27,0,0,0,0,0,0,0,0,0,0,0] +; AVX512VLVBMI-NEXT: vmovq {{.*#+}} xmm1 = [0,0,0,0,27,0,0,0,0,0,0,0,0,0,0,0] ; AVX512VLVBMI-NEXT: vpermb %ymm0, %ymm1, %ymm0 ; AVX512VLVBMI-NEXT: retq ; @@ -1469,7 +1469,7 @@ define <32 x i8> @shuffle_v32i8_00_00_00_28_00_00_00_00_00_00_00_00_00_00_00_00_ ; ; AVX512VLVBMI-LABEL: shuffle_v32i8_00_00_00_28_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00: ; AVX512VLVBMI: # %bb.0: -; AVX512VLVBMI-NEXT: vmovdqa {{.*#+}} xmm1 = [0,0,0,28,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX512VLVBMI-NEXT: vmovd {{.*#+}} xmm1 = [0,0,0,28,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX512VLVBMI-NEXT: vpermb %ymm0, %ymm1, %ymm0 ; AVX512VLVBMI-NEXT: retq ; @@ -1516,7 +1516,7 @@ define <32 x i8> @shuffle_v32i8_00_00_29_00_00_00_00_00_00_00_00_00_00_00_00_00_ ; ; AVX512VLVBMI-LABEL: 
shuffle_v32i8_00_00_29_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00: ; AVX512VLVBMI: # %bb.0: -; AVX512VLVBMI-NEXT: vmovdqa {{.*#+}} xmm1 = [0,0,29,0,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX512VLVBMI-NEXT: vmovd {{.*#+}} xmm1 = [0,0,29,0,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX512VLVBMI-NEXT: vpermb %ymm0, %ymm1, %ymm0 ; AVX512VLVBMI-NEXT: retq ; @@ -1563,7 +1563,7 @@ define <32 x i8> @shuffle_v32i8_00_30_00_00_00_00_00_00_00_00_00_00_00_00_00_00_ ; ; AVX512VLVBMI-LABEL: shuffle_v32i8_00_30_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00: ; AVX512VLVBMI: # %bb.0: -; AVX512VLVBMI-NEXT: vmovdqa {{.*#+}} xmm1 = [0,30,0,0,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX512VLVBMI-NEXT: vmovd {{.*#+}} xmm1 = [0,30,0,0,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX512VLVBMI-NEXT: vpermb %ymm0, %ymm1, %ymm0 ; AVX512VLVBMI-NEXT: retq ; @@ -1610,7 +1610,7 @@ define <32 x i8> @shuffle_v32i8_31_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_ ; ; AVX512VLVBMI-LABEL: shuffle_v32i8_31_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00: ; AVX512VLVBMI: # %bb.0: -; AVX512VLVBMI-NEXT: vmovdqa {{.*#+}} xmm1 = [31,0,0,0] +; AVX512VLVBMI-NEXT: vmovd {{.*#+}} xmm1 = [31,0,0,0] ; AVX512VLVBMI-NEXT: vpermb %ymm0, %ymm1, %ymm0 ; AVX512VLVBMI-NEXT: retq ; @@ -2565,12 +2565,10 @@ define <32 x i8> @shuffle_v32i8_39_38_37_36_35_34_33_32_15_14_13_12_11_10_09_08_ ; AVX1-LABEL: shuffle_v32i8_39_38_37_36_35_34_33_32_15_14_13_12_11_10_09_08_55_54_53_52_51_50_49_48_31_30_29_28_27_26_25_24: ; AVX1: # %bb.0: ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 -; AVX1-NEXT: vmovddup {{.*#+}} xmm3 = [15,14,13,12,11,10,9,8,15,14,13,12,11,10,9,8] -; AVX1-NEXT: # xmm3 = mem[0,0] +; AVX1-NEXT: vmovq {{.*#+}} xmm3 = [15,14,13,12,11,10,9,8,0,0,0,0,0,0,0,0] ; AVX1-NEXT: vpshufb %xmm3, %xmm2, %xmm2 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4 -; AVX1-NEXT: vmovddup {{.*#+}} xmm5 = [7,6,5,4,3,2,1,0,7,6,5,4,3,2,1,0] -; AVX1-NEXT: # xmm5 = mem[0,0] +; AVX1-NEXT: vmovq {{.*#+}} xmm5 = [7,6,5,4,3,2,1,0,0,0,0,0,0,0,0,0] ; AVX1-NEXT: vpshufb %xmm5, %xmm4, %xmm4 ; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm4[0],xmm2[0] ; AVX1-NEXT: vpshufb %xmm3, %xmm0, %xmm0 @@ -2771,7 +2769,7 @@ define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_08_00_00_00_00_00_00_00_00_ ; AVX1-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_08_00_00_00_00_00_00_00_00_16_16_16_16_16_16_16_24_16_16_16_16_16_16_16_16: ; AVX1: # %bb.0: ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,0,0,0,0,0,0,8,0,0,0,0,0,0,0,0] +; AVX1-NEXT: vmovq {{.*#+}} xmm2 = [0,0,0,0,0,0,0,8,0,0,0,0,0,0,0,0] ; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 @@ -2785,7 +2783,7 @@ define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_08_00_00_00_00_00_00_00_00_ ; XOPAVX1-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_08_00_00_00_00_00_00_00_00_16_16_16_16_16_16_16_24_16_16_16_16_16_16_16_16: ; XOPAVX1: # %bb.0: ; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,0,0,0,0,0,0,8,0,0,0,0,0,0,0,0] +; XOPAVX1-NEXT: vmovq {{.*#+}} xmm2 = [0,0,0,0,0,0,0,8,0,0,0,0,0,0,0,0] ; XOPAVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 ; XOPAVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 ; XOPAVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 @@ -2803,7 +2801,7 @@ define <32 x i8> @shuffle_v32i8_00_14_00_00_00_00_00_00_00_00_00_00_00_00_00_00_ ; AVX1-LABEL: 
shuffle_v32i8_00_14_00_00_00_00_00_00_00_00_00_00_00_00_00_00_16_30_16_16_16_16_16_16_16_16_16_16_16_16_16_16: ; AVX1: # %bb.0: ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX1-NEXT: vmovd {{.*#+}} xmm2 = [0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 @@ -2817,7 +2815,7 @@ define <32 x i8> @shuffle_v32i8_00_14_00_00_00_00_00_00_00_00_00_00_00_00_00_00_ ; XOPAVX1-LABEL: shuffle_v32i8_00_14_00_00_00_00_00_00_00_00_00_00_00_00_00_00_16_30_16_16_16_16_16_16_16_16_16_16_16_16_16_16: ; XOPAVX1: # %bb.0: ; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0] +; XOPAVX1-NEXT: vmovd {{.*#+}} xmm2 = [0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0] ; XOPAVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 ; XOPAVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 ; XOPAVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 @@ -2835,7 +2833,7 @@ define <32 x i8> @shuffle_v32i8_15_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_ ; AVX1-LABEL: shuffle_v32i8_15_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_31_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16: ; AVX1: # %bb.0: ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [15,0,0,0] +; AVX1-NEXT: vmovd {{.*#+}} xmm2 = [15,0,0,0] ; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 @@ -2849,7 +2847,7 @@ define <32 x i8> @shuffle_v32i8_15_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_ ; XOPAVX1-LABEL: shuffle_v32i8_15_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_31_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16: ; XOPAVX1: # %bb.0: ; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [15,0,0,0] +; XOPAVX1-NEXT: vmovd {{.*#+}} xmm2 = [15,0,0,0] ; XOPAVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 ; XOPAVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 ; XOPAVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 diff --git a/llvm/test/CodeGen/X86/vector-shuffle-256-v8.ll b/llvm/test/CodeGen/X86/vector-shuffle-256-v8.ll index a69a1a18f26e50..02cad49d29fdb2 100644 --- a/llvm/test/CodeGen/X86/vector-shuffle-256-v8.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-256-v8.ll @@ -208,7 +208,7 @@ define <8 x float> @shuffle_v8f32_06000000(<8 x float> %a, <8 x float> %b) { ; ; AVX2OR512VL-LABEL: shuffle_v8f32_06000000: ; AVX2OR512VL: # %bb.0: -; AVX2OR512VL-NEXT: vmovaps {{.*#+}} xmm1 = [0,6,0,0] +; AVX2OR512VL-NEXT: vmovsd {{.*#+}} xmm1 = [0,6,0,0] ; AVX2OR512VL-NEXT: vpermps %ymm0, %ymm1, %ymm0 ; AVX2OR512VL-NEXT: retq %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> @@ -225,7 +225,7 @@ define <8 x float> @shuffle_v8f32_70000000(<8 x float> %a, <8 x float> %b) { ; ; AVX2OR512VL-LABEL: shuffle_v8f32_70000000: ; AVX2OR512VL: # %bb.0: -; AVX2OR512VL-NEXT: vmovaps {{.*#+}} xmm1 = [7,0,0,0] +; AVX2OR512VL-NEXT: vmovss {{.*#+}} xmm1 = [7,0,0,0] ; AVX2OR512VL-NEXT: vpermps %ymm0, %ymm1, %ymm0 ; AVX2OR512VL-NEXT: retq %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> @@ -1808,7 +1808,7 @@ define <8 x i32> @shuffle_v8i32_06000000(<8 x i32> %a, <8 x i32> %b) { ; ; AVX2OR512VL-LABEL: shuffle_v8i32_06000000: ; AVX2OR512VL: # %bb.0: -; AVX2OR512VL-NEXT: vmovaps {{.*#+}} xmm1 = [0,6,0,0] +; AVX2OR512VL-NEXT: vmovsd {{.*#+}} xmm1 = [0,6,0,0] ; AVX2OR512VL-NEXT: vpermps %ymm0, %ymm1, %ymm0 ; AVX2OR512VL-NEXT: retq %shuffle = shufflevector <8 
x i32> %a, <8 x i32> %b, <8 x i32> @@ -1825,7 +1825,7 @@ define <8 x i32> @shuffle_v8i32_70000000(<8 x i32> %a, <8 x i32> %b) { ; ; AVX2OR512VL-LABEL: shuffle_v8i32_70000000: ; AVX2OR512VL: # %bb.0: -; AVX2OR512VL-NEXT: vmovaps {{.*#+}} xmm1 = [7,0,0,0] +; AVX2OR512VL-NEXT: vmovss {{.*#+}} xmm1 = [7,0,0,0] ; AVX2OR512VL-NEXT: vpermps %ymm0, %ymm1, %ymm0 ; AVX2OR512VL-NEXT: retq %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> diff --git a/llvm/test/CodeGen/X86/vector-shuffle-512-v32.ll b/llvm/test/CodeGen/X86/vector-shuffle-512-v32.ll index fa0ec33bf34080..24b1b42c2dc058 100644 --- a/llvm/test/CodeGen/X86/vector-shuffle-512-v32.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-512-v32.ll @@ -199,13 +199,13 @@ define <32 x i16> @shuffle_v32i16_03_00_01_02_07_04_05_06_11_08_09_10_15_12_13_1 define <32 x i16> @shuffle_v32i16_0zzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz(<32 x i16> %a) { ; KNL-LABEL: shuffle_v32i16_0zzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz: ; KNL: ## %bb.0: -; KNL-NEXT: vmovdqa {{.*#+}} xmm1 = [65535,0,0,0] +; KNL-NEXT: vmovd {{.*#+}} xmm1 = [65535,0,0,0] ; KNL-NEXT: vpandq %zmm1, %zmm0, %zmm0 ; KNL-NEXT: retq ; ; SKX-LABEL: shuffle_v32i16_0zzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz: ; SKX: ## %bb.0: -; SKX-NEXT: vmovaps {{.*#+}} xmm1 = [65535,0,0,0] +; SKX-NEXT: vmovss {{.*#+}} xmm1 = [65535,0,0,0] ; SKX-NEXT: vandps %zmm1, %zmm0, %zmm0 ; SKX-NEXT: retq %shuffle = shufflevector <32 x i16> %a, <32 x i16> zeroinitializer, <32 x i32> diff --git a/llvm/test/CodeGen/X86/vector-shuffle-512-v64.ll b/llvm/test/CodeGen/X86/vector-shuffle-512-v64.ll index 4df5307316a426..f4cc8522adec56 100644 --- a/llvm/test/CodeGen/X86/vector-shuffle-512-v64.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-512-v64.ll @@ -109,25 +109,25 @@ define <64 x i8> @shuffle_v64i8_79_00_01_02_03_04_05_06_07_08_09_10_11_12_13_14_ define <64 x i8> @shuffle_v64i8_0zzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz(<64 x i8> %a) { ; AVX512F-LABEL: shuffle_v64i8_0zzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovdqa {{.*#+}} xmm1 = [255,0,0,0] +; AVX512F-NEXT: vmovd {{.*#+}} xmm1 = [255,0,0,0] ; AVX512F-NEXT: vpandq %zmm1, %zmm0, %zmm0 ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: shuffle_v64i8_0zzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm1 = [255,0,0,0] +; AVX512BW-NEXT: vmovd {{.*#+}} xmm1 = [255,0,0,0] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: retq ; ; AVX512DQ-LABEL: shuffle_v64i8_0zzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz: ; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: vmovaps {{.*#+}} xmm1 = [255,0,0,0] +; AVX512DQ-NEXT: vmovss {{.*#+}} xmm1 = [255,0,0,0] ; AVX512DQ-NEXT: vandps %zmm1, %zmm0, %zmm0 ; AVX512DQ-NEXT: retq ; ; AVX512VBMI-LABEL: shuffle_v64i8_0zzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz: ; AVX512VBMI: # %bb.0: -; AVX512VBMI-NEXT: vmovdqa {{.*#+}} xmm1 = [255,0,0,0] +; AVX512VBMI-NEXT: vmovd {{.*#+}} xmm1 = [255,0,0,0] ; AVX512VBMI-NEXT: vpandq %zmm1, %zmm0, %zmm0 ; AVX512VBMI-NEXT: retq %shuffle = shufflevector <64 x i8> %a, <64 x i8> zeroinitializer, <64 x i32> diff --git a/llvm/test/CodeGen/X86/vector-shuffle-512-v8.ll b/llvm/test/CodeGen/X86/vector-shuffle-512-v8.ll index 9bfca824fb71a9..008593a239f869 100644 --- a/llvm/test/CodeGen/X86/vector-shuffle-512-v8.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-512-v8.ll @@ -142,7 +142,7 @@ define <8 x double> @shuffle_v8f64_06000000(<8 x double> %a, <8 x double> %b) { define <8 x double> @shuffle_v8f64_70000000(<8 x double> %a, <8 x double> %b) { ; ALL-LABEL: shuffle_v8f64_70000000: ; ALL: # %bb.0: -; ALL-NEXT: vmovaps {{.*#+}} xmm1 = 
[7,0,0,0] +; ALL-NEXT: vmovss {{.*#+}} xmm1 = [7,0,0,0] ; ALL-NEXT: vpermpd %zmm0, %zmm1, %zmm0 ; ALL-NEXT: ret{{[l|q]}} %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> @@ -960,7 +960,7 @@ define <8 x i64> @shuffle_v8i64_06000000(<8 x i64> %a, <8 x i64> %b) { define <8 x i64> @shuffle_v8i64_70000000(<8 x i64> %a, <8 x i64> %b) { ; ALL-LABEL: shuffle_v8i64_70000000: ; ALL: # %bb.0: -; ALL-NEXT: vmovaps {{.*#+}} xmm1 = [7,0,0,0] +; ALL-NEXT: vmovss {{.*#+}} xmm1 = [7,0,0,0] ; ALL-NEXT: vpermpd %zmm0, %zmm1, %zmm0 ; ALL-NEXT: ret{{[l|q]}} %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> diff --git a/llvm/test/CodeGen/X86/vector-shuffle-combining.ll b/llvm/test/CodeGen/X86/vector-shuffle-combining.ll index 1e2ee7b99a608d..d285d07e66049c 100644 --- a/llvm/test/CodeGen/X86/vector-shuffle-combining.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-combining.ll @@ -1744,26 +1744,18 @@ define <4 x i8> @combine_test1c(ptr %a, ptr %b) { ; SSE41: # %bb.0: ; SSE41-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero ; SSE41-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero -; SSE41-NEXT: movaps {{.*#+}} xmm0 = [0,255,255,255,u,u,u,u,u,u,u,u,u,u,u,u] +; SSE41-NEXT: movss {{.*#+}} xmm0 = [0,255,255,255,0,0,0,0,0,0,0,0,0,0,0,0] ; SSE41-NEXT: pblendvb %xmm0, %xmm2, %xmm1 ; SSE41-NEXT: movdqa %xmm1, %xmm0 ; SSE41-NEXT: retq ; -; AVX1-LABEL: combine_test1c: -; AVX1: # %bb.0: -; AVX1-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; AVX1-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero -; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255] -; AVX1-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: retq -; -; AVX2-LABEL: combine_test1c: -; AVX2: # %bb.0: -; AVX2-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; AVX2-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero -; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255] -; AVX2-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: retq +; AVX-LABEL: combine_test1c: +; AVX: # %bb.0: +; AVX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; AVX-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero +; AVX-NEXT: vmovd {{.*#+}} xmm2 = [0,255,255,255,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0 +; AVX-NEXT: retq %A = load <4 x i8>, ptr %a %B = load <4 x i8>, ptr %b %1 = shufflevector <4 x i8> %A, <4 x i8> %B, <4 x i32> @@ -1838,26 +1830,18 @@ define <4 x i8> @combine_test4c(ptr %a, ptr %b) { ; SSE41: # %bb.0: ; SSE41-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero ; SSE41-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero -; SSE41-NEXT: movaps {{.*#+}} xmm0 = [255,0,255,255,u,u,u,u,u,u,u,u,u,u,u,u] +; SSE41-NEXT: movss {{.*#+}} xmm0 = [255,0,255,255,0,0,0,0,0,0,0,0,0,0,0,0] ; SSE41-NEXT: pblendvb %xmm0, %xmm2, %xmm1 ; SSE41-NEXT: movdqa %xmm1, %xmm0 ; SSE41-NEXT: retq ; -; AVX1-LABEL: combine_test4c: -; AVX1: # %bb.0: -; AVX1-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; AVX1-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero -; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255] -; AVX1-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: retq -; -; AVX2-LABEL: combine_test4c: -; AVX2: # %bb.0: -; AVX2-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; AVX2-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero -; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255] -; AVX2-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, 
%xmm0 -; AVX2-NEXT: retq +; AVX-LABEL: combine_test4c: +; AVX: # %bb.0: +; AVX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; AVX-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero +; AVX-NEXT: vmovd {{.*#+}} xmm2 = [255,0,255,255,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0 +; AVX-NEXT: retq %A = load <4 x i8>, ptr %a %B = load <4 x i8>, ptr %b %1 = shufflevector <4 x i8> %A, <4 x i8> %B, <4 x i32> @@ -3187,7 +3171,7 @@ declare <8 x i16> @llvm.x86.sse2.psrli.w(<8 x i16>, i32) define void @PR43024() { ; SSE2-LABEL: PR43024: ; SSE2: # %bb.0: -; SSE2-NEXT: movaps {{.*#+}} xmm0 = [NaN,NaN,0.0E+0,0.0E+0] +; SSE2-NEXT: movsd {{.*#+}} xmm0 = [NaN,NaN,0.0E+0,0.0E+0] ; SSE2-NEXT: movaps %xmm0, (%rax) ; SSE2-NEXT: addss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; SSE2-NEXT: xorps %xmm1, %xmm1 @@ -3198,7 +3182,7 @@ define void @PR43024() { ; ; SSSE3-LABEL: PR43024: ; SSSE3: # %bb.0: -; SSSE3-NEXT: movaps {{.*#+}} xmm0 = [NaN,NaN,0.0E+0,0.0E+0] +; SSSE3-NEXT: movsd {{.*#+}} xmm0 = [NaN,NaN,0.0E+0,0.0E+0] ; SSSE3-NEXT: movaps %xmm0, (%rax) ; SSSE3-NEXT: addss %xmm0, %xmm0 ; SSSE3-NEXT: xorps %xmm1, %xmm1 @@ -3209,7 +3193,7 @@ define void @PR43024() { ; ; SSE41-LABEL: PR43024: ; SSE41: # %bb.0: -; SSE41-NEXT: movaps {{.*#+}} xmm0 = [NaN,NaN,0.0E+0,0.0E+0] +; SSE41-NEXT: movsd {{.*#+}} xmm0 = [NaN,NaN,0.0E+0,0.0E+0] ; SSE41-NEXT: movaps %xmm0, (%rax) ; SSE41-NEXT: addss %xmm0, %xmm0 ; SSE41-NEXT: xorps %xmm1, %xmm1 @@ -3220,7 +3204,7 @@ define void @PR43024() { ; ; AVX-LABEL: PR43024: ; AVX: # %bb.0: -; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [NaN,NaN,0.0E+0,0.0E+0] +; AVX-NEXT: vmovsd {{.*#+}} xmm0 = [NaN,NaN,0.0E+0,0.0E+0] ; AVX-NEXT: vmovaps %xmm0, (%rax) ; AVX-NEXT: vaddss {{\.?LCPI[0-9]+_[0-9]+}}+4(%rip), %xmm0, %xmm0 ; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1 diff --git a/llvm/test/CodeGen/X86/vector-shuffle-v1.ll b/llvm/test/CodeGen/X86/vector-shuffle-v1.ll index 95a57d1bdf3318..aca50c461a7a19 100644 --- a/llvm/test/CodeGen/X86/vector-shuffle-v1.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-v1.ll @@ -42,7 +42,7 @@ define <2 x i1> @shuf2i1_1_2(<2 x i1> %a) { ; AVX512F-NEXT: vpsllq $63, %xmm0, %xmm0 ; AVX512F-NEXT: vptestmq %zmm0, %zmm0, %k1 ; AVX512F-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} -; AVX512F-NEXT: vmovdqa {{.*#+}} xmm1 = [18446744073709551615,0] +; AVX512F-NEXT: vmovq {{.*#+}} xmm1 = [18446744073709551615,0] ; AVX512F-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] ; AVX512F-NEXT: vptestmq %zmm0, %zmm0, %k1 ; AVX512F-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} @@ -56,7 +56,7 @@ define <2 x i1> @shuf2i1_1_2(<2 x i1> %a) { ; AVX512VL-NEXT: vptestmq %xmm0, %xmm0, %k1 ; AVX512VL-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 ; AVX512VL-NEXT: vmovdqa64 %xmm0, %xmm1 {%k1} {z} -; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm2 = [18446744073709551615,0] +; AVX512VL-NEXT: vmovq {{.*#+}} xmm2 = [18446744073709551615,0] ; AVX512VL-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7] ; AVX512VL-NEXT: vptestmq %xmm1, %xmm1, %k1 ; AVX512VL-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z} @@ -67,7 +67,7 @@ define <2 x i1> @shuf2i1_1_2(<2 x i1> %a) { ; VL_BW_DQ-NEXT: vpsllq $63, %xmm0, %xmm0 ; VL_BW_DQ-NEXT: vpmovq2m %xmm0, %k0 ; VL_BW_DQ-NEXT: vpmovm2q %k0, %xmm0 -; VL_BW_DQ-NEXT: vmovdqa {{.*#+}} xmm1 = [18446744073709551615,0] +; VL_BW_DQ-NEXT: vmovq {{.*#+}} xmm1 = [18446744073709551615,0] ; VL_BW_DQ-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] ; VL_BW_DQ-NEXT: vpmovq2m %xmm0, %k0 ; 
VL_BW_DQ-NEXT: vpmovm2q %k0, %xmm0 diff --git a/llvm/test/CodeGen/X86/vector-trunc.ll b/llvm/test/CodeGen/X86/vector-trunc.ll index f8e3a7a23056fb..3294c7ffee40d2 100644 --- a/llvm/test/CodeGen/X86/vector-trunc.ll +++ b/llvm/test/CodeGen/X86/vector-trunc.ll @@ -516,7 +516,7 @@ define void @trunc8i32_8i8(<8 x i32> %a) { ; AVX1-LABEL: trunc8i32_8i8: ; AVX1: # %bb.0: # %entry ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] +; AVX1-NEXT: vmovd {{.*#+}} xmm2 = [0,4,8,12,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] @@ -527,7 +527,7 @@ define void @trunc8i32_8i8(<8 x i32> %a) { ; AVX2-LABEL: trunc8i32_8i8: ; AVX2: # %bb.0: # %entry ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] +; AVX2-NEXT: vmovd {{.*#+}} xmm2 = [0,4,8,12,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1 ; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] diff --git a/llvm/test/CodeGen/X86/vector-tzcnt-128.ll b/llvm/test/CodeGen/X86/vector-tzcnt-128.ll index 3d5947d8e59bd4..3dc43031cea9ef 100644 --- a/llvm/test/CodeGen/X86/vector-tzcnt-128.ll +++ b/llvm/test/CodeGen/X86/vector-tzcnt-128.ll @@ -1762,37 +1762,37 @@ define <16 x i8> @testv16i8u(<16 x i8> %in) nounwind { define <2 x i64> @foldv2i64() nounwind { ; SSE-LABEL: foldv2i64: ; SSE: # %bb.0: -; SSE-NEXT: movaps {{.*#+}} xmm0 = [8,0,0,0] +; SSE-NEXT: movss {{.*#+}} xmm0 = [8,0,0,0] ; SSE-NEXT: retq ; ; AVX-LABEL: foldv2i64: ; AVX: # %bb.0: -; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [8,0,0,0] +; AVX-NEXT: vmovss {{.*#+}} xmm0 = [8,0,0,0] ; AVX-NEXT: retq ; ; AVX512VPOPCNTDQ-LABEL: foldv2i64: ; AVX512VPOPCNTDQ: # %bb.0: -; AVX512VPOPCNTDQ-NEXT: vmovaps {{.*#+}} xmm0 = [8,0,0,0] +; AVX512VPOPCNTDQ-NEXT: vmovss {{.*#+}} xmm0 = [8,0,0,0] ; AVX512VPOPCNTDQ-NEXT: retq ; ; AVX512VPOPCNTDQVL-LABEL: foldv2i64: ; AVX512VPOPCNTDQVL: # %bb.0: -; AVX512VPOPCNTDQVL-NEXT: vmovaps {{.*#+}} xmm0 = [8,0,0,0] +; AVX512VPOPCNTDQVL-NEXT: vmovss {{.*#+}} xmm0 = [8,0,0,0] ; AVX512VPOPCNTDQVL-NEXT: retq ; ; BITALG_NOVLX-LABEL: foldv2i64: ; BITALG_NOVLX: # %bb.0: -; BITALG_NOVLX-NEXT: vmovaps {{.*#+}} xmm0 = [8,0,0,0] +; BITALG_NOVLX-NEXT: vmovss {{.*#+}} xmm0 = [8,0,0,0] ; BITALG_NOVLX-NEXT: retq ; ; BITALG-LABEL: foldv2i64: ; BITALG: # %bb.0: -; BITALG-NEXT: vmovaps {{.*#+}} xmm0 = [8,0,0,0] +; BITALG-NEXT: vmovss {{.*#+}} xmm0 = [8,0,0,0] ; BITALG-NEXT: retq ; ; X86-SSE-LABEL: foldv2i64: ; X86-SSE: # %bb.0: -; X86-SSE-NEXT: movaps {{.*#+}} xmm0 = [8,0,0,0] +; X86-SSE-NEXT: movss {{.*#+}} xmm0 = [8,0,0,0] ; X86-SSE-NEXT: retl %out = call <2 x i64> @llvm.cttz.v2i64(<2 x i64> , i1 0) ret <2 x i64> %out @@ -1801,37 +1801,37 @@ define <2 x i64> @foldv2i64() nounwind { define <2 x i64> @foldv2i64u() nounwind { ; SSE-LABEL: foldv2i64u: ; SSE: # %bb.0: -; SSE-NEXT: movaps {{.*#+}} xmm0 = [8,0,0,0] +; SSE-NEXT: movss {{.*#+}} xmm0 = [8,0,0,0] ; SSE-NEXT: retq ; ; AVX-LABEL: foldv2i64u: ; AVX: # %bb.0: -; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [8,0,0,0] +; AVX-NEXT: vmovss {{.*#+}} xmm0 = [8,0,0,0] ; AVX-NEXT: retq ; ; AVX512VPOPCNTDQ-LABEL: foldv2i64u: ; AVX512VPOPCNTDQ: # %bb.0: -; AVX512VPOPCNTDQ-NEXT: vmovaps {{.*#+}} xmm0 = [8,0,0,0] +; AVX512VPOPCNTDQ-NEXT: vmovss {{.*#+}} xmm0 = [8,0,0,0] ; AVX512VPOPCNTDQ-NEXT: retq ; ; AVX512VPOPCNTDQVL-LABEL: 
foldv2i64u: ; AVX512VPOPCNTDQVL: # %bb.0: -; AVX512VPOPCNTDQVL-NEXT: vmovaps {{.*#+}} xmm0 = [8,0,0,0] +; AVX512VPOPCNTDQVL-NEXT: vmovss {{.*#+}} xmm0 = [8,0,0,0] ; AVX512VPOPCNTDQVL-NEXT: retq ; ; BITALG_NOVLX-LABEL: foldv2i64u: ; BITALG_NOVLX: # %bb.0: -; BITALG_NOVLX-NEXT: vmovaps {{.*#+}} xmm0 = [8,0,0,0] +; BITALG_NOVLX-NEXT: vmovss {{.*#+}} xmm0 = [8,0,0,0] ; BITALG_NOVLX-NEXT: retq ; ; BITALG-LABEL: foldv2i64u: ; BITALG: # %bb.0: -; BITALG-NEXT: vmovaps {{.*#+}} xmm0 = [8,0,0,0] +; BITALG-NEXT: vmovss {{.*#+}} xmm0 = [8,0,0,0] ; BITALG-NEXT: retq ; ; X86-SSE-LABEL: foldv2i64u: ; X86-SSE: # %bb.0: -; X86-SSE-NEXT: movaps {{.*#+}} xmm0 = [8,0,0,0] +; X86-SSE-NEXT: movss {{.*#+}} xmm0 = [8,0,0,0] ; X86-SSE-NEXT: retl %out = call <2 x i64> @llvm.cttz.v2i64(<2 x i64> , i1 -1) ret <2 x i64> %out diff --git a/llvm/test/CodeGen/X86/vselect-constants.ll b/llvm/test/CodeGen/X86/vselect-constants.ll index 0630a40b88099e..050b3329a4abb6 100644 --- a/llvm/test/CodeGen/X86/vselect-constants.ll +++ b/llvm/test/CodeGen/X86/vselect-constants.ll @@ -281,7 +281,7 @@ define i32 @wrong_min_signbits(<2 x i16> %x) { ; SSE: # %bb.0: ; SSE-NEXT: pxor %xmm1, %xmm1 ; SSE-NEXT: pcmpeqw %xmm0, %xmm1 -; SSE-NEXT: movdqa {{.*#+}} xmm0 = [1,0,0,0] +; SSE-NEXT: movd {{.*#+}} xmm0 = [1,0,0,0] ; SSE-NEXT: pand %xmm1, %xmm0 ; SSE-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 ; SSE-NEXT: por %xmm0, %xmm1 @@ -292,7 +292,7 @@ define i32 @wrong_min_signbits(<2 x i16> %x) { ; AVX: # %bb.0: ; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX-NEXT: vpcmpeqw %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [2,0,0,0] +; AVX-NEXT: vmovd {{.*#+}} xmm1 = [2,0,0,0] ; AVX-NEXT: vpblendvb %xmm0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm0 ; AVX-NEXT: vmovd %xmm0, %eax ; AVX-NEXT: retq diff --git a/llvm/test/CodeGen/X86/vselect-post-combine.ll b/llvm/test/CodeGen/X86/vselect-post-combine.ll index e91b8d029bcb4a..474f70f78937eb 100644 --- a/llvm/test/CodeGen/X86/vselect-post-combine.ll +++ b/llvm/test/CodeGen/X86/vselect-post-combine.ll @@ -4,7 +4,7 @@ define ptr @test_mul(ptr %addr) { ; AVX2-LABEL: test_mul: ; AVX2: # %bb.0: # %entry -; AVX2-NEXT: vmovdqa {{.*#+}} xmm0 = [255,0,0,0] +; AVX2-NEXT: vmovd {{.*#+}} xmm0 = [255,0,0,0] ; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] ; AVX2-NEXT: vpblendvb %xmm0, (%rdi), %xmm1, %xmm0 ; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero diff --git a/llvm/test/CodeGen/X86/widen_bitcnt.ll b/llvm/test/CodeGen/X86/widen_bitcnt.ll index 0f121d88b3573d..18cb5e1b86ec31 100644 --- a/llvm/test/CodeGen/X86/widen_bitcnt.ll +++ b/llvm/test/CodeGen/X86/widen_bitcnt.ll @@ -345,7 +345,7 @@ define <8 x i32> @widen_ctpop_v2i32_v8i32(<2 x i32> %a0, <2 x i32> %a1, <2 x i32 define <4 x i32> @widen_ctlz_v2i32_v4i32(<2 x i32> %a0, <2 x i32> %a1) { ; SSE42-LABEL: widen_ctlz_v2i32_v4i32: ; SSE42: # %bb.0: -; SSE42-NEXT: movdqa {{.*#+}} xmm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; SSE42-NEXT: movq {{.*#+}} xmm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] ; SSE42-NEXT: movdqa %xmm3, %xmm6 ; SSE42-NEXT: pshufb %xmm0, %xmm6 ; SSE42-NEXT: movdqa %xmm0, %xmm5 @@ -394,7 +394,7 @@ define <4 x i32> @widen_ctlz_v2i32_v4i32(<2 x i32> %a0, <2 x i32> %a1) { ; ; AVX2-LABEL: widen_ctlz_v2i32_v4i32: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; AVX2-NEXT: vmovq 
{{.*#+}} xmm2 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] ; AVX2-NEXT: vpshufb %xmm0, %xmm2, %xmm3 ; AVX2-NEXT: vpsrlw $4, %xmm0, %xmm4 ; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm5 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] @@ -448,7 +448,7 @@ define <4 x i32> @widen_ctlz_v2i32_v4i32(<2 x i32> %a0, <2 x i32> %a1) { define <8 x i32> @widen_ctlz_v4i32_v8i32(<4 x i32> %a0, <4 x i32> %a1) { ; SSE42-LABEL: widen_ctlz_v4i32_v8i32: ; SSE42: # %bb.0: -; SSE42-NEXT: movdqa {{.*#+}} xmm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; SSE42-NEXT: movq {{.*#+}} xmm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] ; SSE42-NEXT: movdqa %xmm3, %xmm6 ; SSE42-NEXT: pshufb %xmm0, %xmm6 ; SSE42-NEXT: movdqa %xmm0, %xmm5 @@ -535,7 +535,7 @@ define <8 x i32> @widen_ctlz_v4i32_v8i32(<4 x i32> %a0, <4 x i32> %a1) { define <8 x i32> @widen_ctlz_v2i32_v8i32(<2 x i32> %a0, <2 x i32> %a1, <2 x i32> %a2, <2 x i32> %a3) { ; SSE42-LABEL: widen_ctlz_v2i32_v8i32: ; SSE42: # %bb.0: -; SSE42-NEXT: movdqa {{.*#+}} xmm5 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; SSE42-NEXT: movq {{.*#+}} xmm5 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] ; SSE42-NEXT: movdqa %xmm5, %xmm8 ; SSE42-NEXT: pshufb %xmm0, %xmm8 ; SSE42-NEXT: movdqa %xmm0, %xmm7 @@ -629,7 +629,7 @@ define <8 x i32> @widen_ctlz_v2i32_v8i32(<2 x i32> %a0, <2 x i32> %a1, <2 x i32> ; AVX2: # %bb.0: ; AVX2-NEXT: # kill: def $xmm1 killed $xmm1 def $ymm1 ; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm5 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; AVX2-NEXT: vmovq {{.*#+}} xmm5 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] ; AVX2-NEXT: vpshufb %xmm0, %xmm5, %xmm4 ; AVX2-NEXT: vpsrlw $4, %xmm0, %xmm6 ; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm7 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] @@ -724,7 +724,7 @@ define <8 x i32> @widen_ctlz_v2i32_v8i32(<2 x i32> %a0, <2 x i32> %a1, <2 x i32> define <4 x i32> @widen_ctlz_undef_v2i32_v4i32(<2 x i32> %a0, <2 x i32> %a1) { ; SSE42-LABEL: widen_ctlz_undef_v2i32_v4i32: ; SSE42: # %bb.0: -; SSE42-NEXT: movdqa {{.*#+}} xmm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; SSE42-NEXT: movq {{.*#+}} xmm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] ; SSE42-NEXT: movdqa %xmm3, %xmm6 ; SSE42-NEXT: pshufb %xmm0, %xmm6 ; SSE42-NEXT: movdqa %xmm0, %xmm5 @@ -773,7 +773,7 @@ define <4 x i32> @widen_ctlz_undef_v2i32_v4i32(<2 x i32> %a0, <2 x i32> %a1) { ; ; AVX2-LABEL: widen_ctlz_undef_v2i32_v4i32: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; AVX2-NEXT: vmovq {{.*#+}} xmm2 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] ; AVX2-NEXT: vpshufb %xmm0, %xmm2, %xmm3 ; AVX2-NEXT: vpsrlw $4, %xmm0, %xmm4 ; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm5 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] @@ -827,7 +827,7 @@ define <4 x i32> @widen_ctlz_undef_v2i32_v4i32(<2 x i32> %a0, <2 x i32> %a1) { define <8 x i32> @widen_ctlz_undef_v4i32_v8i32(<4 x i32> %a0, <4 x i32> %a1) { ; SSE42-LABEL: widen_ctlz_undef_v4i32_v8i32: ; SSE42: # %bb.0: -; SSE42-NEXT: movdqa {{.*#+}} xmm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; SSE42-NEXT: movq {{.*#+}} xmm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] ; SSE42-NEXT: movdqa %xmm3, %xmm6 ; SSE42-NEXT: pshufb %xmm0, %xmm6 ; SSE42-NEXT: movdqa %xmm0, %xmm5 @@ -914,7 +914,7 @@ define <8 x i32> @widen_ctlz_undef_v4i32_v8i32(<4 x i32> %a0, <4 x i32> %a1) { define <8 x i32> @widen_ctlz_undef_v2i32_v8i32(<2 x i32> %a0, <2 x i32> %a1, <2 x i32> %a2, <2 x i32> %a3) { ; SSE42-LABEL: widen_ctlz_undef_v2i32_v8i32: ; SSE42: # %bb.0: -; SSE42-NEXT: movdqa {{.*#+}} xmm5 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; SSE42-NEXT: movq {{.*#+}} xmm5 = 
[4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] ; SSE42-NEXT: movdqa %xmm5, %xmm8 ; SSE42-NEXT: pshufb %xmm0, %xmm8 ; SSE42-NEXT: movdqa %xmm0, %xmm7 @@ -1008,7 +1008,7 @@ define <8 x i32> @widen_ctlz_undef_v2i32_v8i32(<2 x i32> %a0, <2 x i32> %a1, <2 ; AVX2: # %bb.0: ; AVX2-NEXT: # kill: def $xmm1 killed $xmm1 def $ymm1 ; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm5 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; AVX2-NEXT: vmovq {{.*#+}} xmm5 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] ; AVX2-NEXT: vpshufb %xmm0, %xmm5, %xmm4 ; AVX2-NEXT: vpsrlw $4, %xmm0, %xmm6 ; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm7 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] diff --git a/llvm/test/CodeGen/X86/x86-interleaved-access.ll b/llvm/test/CodeGen/X86/x86-interleaved-access.ll index 742a3a0c66a2ac..06140b2395fca3 100644 --- a/llvm/test/CodeGen/X86/x86-interleaved-access.ll +++ b/llvm/test/CodeGen/X86/x86-interleaved-access.ll @@ -444,7 +444,7 @@ define <16 x i1> @interleaved_load_vf16_i8_stride4(ptr %ptr) nounwind { ; AVX1-NEXT: vpshufb %xmm2, %xmm4, %xmm5 ; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 ; AVX1-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1] -; AVX1-NEXT: vbroadcastss {{.*#+}} xmm5 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] +; AVX1-NEXT: vmovd {{.*#+}} xmm5 = [0,4,8,12,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX1-NEXT: vpshufb %xmm5, %xmm1, %xmm6 ; AVX1-NEXT: vpshufb %xmm5, %xmm0, %xmm5 ; AVX1-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1] @@ -453,7 +453,7 @@ define <16 x i1> @interleaved_load_vf16_i8_stride4(ptr %ptr) nounwind { ; AVX1-NEXT: vpshufb %xmm5, %xmm4, %xmm6 ; AVX1-NEXT: vpshufb %xmm5, %xmm3, %xmm5 ; AVX1-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1] -; AVX1-NEXT: vbroadcastss {{.*#+}} xmm6 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13] +; AVX1-NEXT: vmovd {{.*#+}} xmm6 = [1,5,9,13,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX1-NEXT: vpshufb %xmm6, %xmm1, %xmm7 ; AVX1-NEXT: vpshufb %xmm6, %xmm0, %xmm6 ; AVX1-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1] @@ -463,7 +463,7 @@ define <16 x i1> @interleaved_load_vf16_i8_stride4(ptr %ptr) nounwind { ; AVX1-NEXT: vpshufb %xmm5, %xmm4, %xmm6 ; AVX1-NEXT: vpshufb %xmm5, %xmm3, %xmm5 ; AVX1-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1] -; AVX1-NEXT: vbroadcastss {{.*#+}} xmm6 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] +; AVX1-NEXT: vmovd {{.*#+}} xmm6 = [2,6,10,14,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX1-NEXT: vpshufb %xmm6, %xmm1, %xmm7 ; AVX1-NEXT: vpshufb %xmm6, %xmm0, %xmm6 ; AVX1-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1] @@ -472,7 +472,7 @@ define <16 x i1> @interleaved_load_vf16_i8_stride4(ptr %ptr) nounwind { ; AVX1-NEXT: vpshufb %xmm6, %xmm4, %xmm4 ; AVX1-NEXT: vpshufb %xmm6, %xmm3, %xmm3 ; AVX1-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] -; AVX1-NEXT: vbroadcastss {{.*#+}} xmm4 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] +; AVX1-NEXT: vmovd {{.*#+}} xmm4 = [3,7,11,15,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX1-NEXT: vpshufb %xmm4, %xmm1, %xmm1 ; AVX1-NEXT: vpshufb %xmm4, %xmm0, %xmm0 ; AVX1-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] @@ -492,7 +492,7 @@ define <16 x i1> @interleaved_load_vf16_i8_stride4(ptr %ptr) nounwind { ; AVX2-NEXT: vpshufb %xmm4, %xmm3, %xmm5 ; AVX2-NEXT: vpshufb %xmm4, %xmm2, %xmm4 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1] -; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm5 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] +; AVX2-NEXT: vmovd {{.*#+}} xmm5 = [0,4,8,12,0,0,0,0,0,0,0,0,0,0,0,0] 
; AVX2-NEXT: vpshufb %xmm5, %xmm1, %xmm6 ; AVX2-NEXT: vpshufb %xmm5, %xmm0, %xmm5 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1] @@ -501,7 +501,7 @@ define <16 x i1> @interleaved_load_vf16_i8_stride4(ptr %ptr) nounwind { ; AVX2-NEXT: vpshufb %xmm5, %xmm3, %xmm6 ; AVX2-NEXT: vpshufb %xmm5, %xmm2, %xmm5 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1] -; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm6 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13] +; AVX2-NEXT: vmovd {{.*#+}} xmm6 = [1,5,9,13,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX2-NEXT: vpshufb %xmm6, %xmm1, %xmm7 ; AVX2-NEXT: vpshufb %xmm6, %xmm0, %xmm6 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1] @@ -511,7 +511,7 @@ define <16 x i1> @interleaved_load_vf16_i8_stride4(ptr %ptr) nounwind { ; AVX2-NEXT: vpshufb %xmm5, %xmm3, %xmm6 ; AVX2-NEXT: vpshufb %xmm5, %xmm2, %xmm5 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1] -; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm6 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] +; AVX2-NEXT: vmovd {{.*#+}} xmm6 = [2,6,10,14,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX2-NEXT: vpshufb %xmm6, %xmm1, %xmm7 ; AVX2-NEXT: vpshufb %xmm6, %xmm0, %xmm6 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1] @@ -520,7 +520,7 @@ define <16 x i1> @interleaved_load_vf16_i8_stride4(ptr %ptr) nounwind { ; AVX2-NEXT: vpshufb %xmm6, %xmm3, %xmm3 ; AVX2-NEXT: vpshufb %xmm6, %xmm2, %xmm2 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] -; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm3 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] +; AVX2-NEXT: vmovd {{.*#+}} xmm3 = [3,7,11,15,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX2-NEXT: vpshufb %xmm3, %xmm1, %xmm1 ; AVX2-NEXT: vpshufb %xmm3, %xmm0, %xmm0 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] @@ -571,7 +571,7 @@ define <32 x i1> @interleaved_load_vf32_i8_stride4(ptr %ptr) nounwind { ; AVX1-NEXT: vpshufb %xmm6, %xmm3, %xmm4 ; AVX1-NEXT: vpshufb %xmm6, %xmm2, %xmm5 ; AVX1-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] -; AVX1-NEXT: vbroadcastss {{.*#+}} xmm8 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] +; AVX1-NEXT: vmovd {{.*#+}} xmm8 = [0,4,8,12,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX1-NEXT: vpshufb %xmm8, %xmm1, %xmm5 ; AVX1-NEXT: vpshufb %xmm8, %xmm0, %xmm7 ; AVX1-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm7[0],xmm5[0],xmm7[1],xmm5[1] @@ -591,7 +591,7 @@ define <32 x i1> @interleaved_load_vf32_i8_stride4(ptr %ptr) nounwind { ; AVX1-NEXT: vpshufb %xmm11, %xmm3, %xmm8 ; AVX1-NEXT: vpshufb %xmm11, %xmm2, %xmm12 ; AVX1-NEXT: vpunpckldq {{.*#+}} xmm8 = xmm12[0],xmm8[0],xmm12[1],xmm8[1] -; AVX1-NEXT: vbroadcastss {{.*#+}} xmm12 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13] +; AVX1-NEXT: vmovd {{.*#+}} xmm12 = [1,5,9,13,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX1-NEXT: vpshufb %xmm12, %xmm1, %xmm13 ; AVX1-NEXT: vpshufb %xmm12, %xmm0, %xmm14 ; AVX1-NEXT: vpunpckldq {{.*#+}} xmm13 = xmm14[0],xmm13[0],xmm14[1],xmm13[1] @@ -609,7 +609,7 @@ define <32 x i1> @interleaved_load_vf32_i8_stride4(ptr %ptr) nounwind { ; AVX1-NEXT: vpshufb %xmm10, %xmm3, %xmm11 ; AVX1-NEXT: vpshufb %xmm10, %xmm2, %xmm12 ; AVX1-NEXT: vpunpckldq {{.*#+}} xmm11 = xmm12[0],xmm11[0],xmm12[1],xmm11[1] -; AVX1-NEXT: vbroadcastss {{.*#+}} xmm12 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] +; AVX1-NEXT: vmovd {{.*#+}} xmm12 = [2,6,10,14,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX1-NEXT: vpshufb %xmm12, %xmm1, %xmm13 ; AVX1-NEXT: vpshufb %xmm12, %xmm0, %xmm14 ; AVX1-NEXT: vpunpckldq {{.*#+}} xmm13 = xmm14[0],xmm13[0],xmm14[1],xmm13[1] @@ -625,7 +625,7 @@ define <32 x i1> 
@interleaved_load_vf32_i8_stride4(ptr %ptr) nounwind { ; AVX1-NEXT: vpshufb %xmm12, %xmm3, %xmm3 ; AVX1-NEXT: vpshufb %xmm12, %xmm2, %xmm2 ; AVX1-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] -; AVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] +; AVX1-NEXT: vmovd {{.*#+}} xmm3 = [3,7,11,15,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX1-NEXT: vpshufb %xmm3, %xmm1, %xmm1 ; AVX1-NEXT: vpshufb %xmm3, %xmm0, %xmm0 ; AVX1-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] diff --git a/llvm/test/CodeGen/X86/zero_extend_vector_inreg.ll b/llvm/test/CodeGen/X86/zero_extend_vector_inreg.ll index 023ac96181d3cc..8c9dc90d2a71da 100644 --- a/llvm/test/CodeGen/X86/zero_extend_vector_inreg.ll +++ b/llvm/test/CodeGen/X86/zero_extend_vector_inreg.ll @@ -1686,7 +1686,7 @@ define void @vec256_v32i8_to_v2i128_factor16(ptr %in.vec.base.ptr, ptr %in.vec.b ; SSE: # %bb.0: ; SSE-NEXT: movdqa (%rdi), %xmm0 ; SSE-NEXT: paddb (%rsi), %xmm0 -; SSE-NEXT: movdqa {{.*#+}} xmm1 = [255,0,0,0] +; SSE-NEXT: movd {{.*#+}} xmm1 = [255,0,0,0] ; SSE-NEXT: pand %xmm0, %xmm1 ; SSE-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1] ; SSE-NEXT: psrldq {{.*#+}} xmm0 = xmm0[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero @@ -1783,7 +1783,7 @@ define void @vec256_v32i8_to_v1i256_factor32(ptr %in.vec.base.ptr, ptr %in.vec.b ; AVX2: # %bb.0: ; AVX2-NEXT: vmovdqa (%rdi), %ymm0 ; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [255,0,0,0] +; AVX2-NEXT: vmovd {{.*#+}} xmm1 = [255,0,0,0] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0 ; AVX2-NEXT: vmovdqa %ymm0, (%rcx) @@ -1794,7 +1794,7 @@ define void @vec256_v32i8_to_v1i256_factor32(ptr %in.vec.base.ptr, ptr %in.vec.b ; AVX512F: # %bb.0: ; AVX512F-NEXT: vmovdqa (%rdi), %ymm0 ; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX512F-NEXT: vmovdqa {{.*#+}} xmm1 = [255,0,0,0] +; AVX512F-NEXT: vmovd {{.*#+}} xmm1 = [255,0,0,0] ; AVX512F-NEXT: vpand %ymm1, %ymm0, %ymm0 ; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0 ; AVX512F-NEXT: vmovdqa %ymm0, (%rcx) @@ -1805,7 +1805,7 @@ define void @vec256_v32i8_to_v1i256_factor32(ptr %in.vec.base.ptr, ptr %in.vec.b ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0 ; AVX512BW-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm1 = [255,0,0,0] +; AVX512BW-NEXT: vmovd {{.*#+}} xmm1 = [255,0,0,0] ; AVX512BW-NEXT: vpand %ymm1, %ymm0, %ymm0 ; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx) @@ -1997,7 +1997,7 @@ define void @vec256_v16i16_to_v2i128_factor8(ptr %in.vec.base.ptr, ptr %in.vec.b ; SSE2: # %bb.0: ; SSE2-NEXT: movdqa (%rdi), %xmm0 ; SSE2-NEXT: paddb (%rsi), %xmm0 -; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [65535,0,0,0] +; SSE2-NEXT: movd {{.*#+}} xmm1 = [65535,0,0,0] ; SSE2-NEXT: pand %xmm0, %xmm1 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] ; SSE2-NEXT: psrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero @@ -2126,7 +2126,7 @@ define void @vec256_v16i16_to_v1i256_factor16(ptr %in.vec.base.ptr, ptr %in.vec. 
; AVX2: # %bb.0: ; AVX2-NEXT: vmovdqa (%rdi), %ymm0 ; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [65535,0,0,0] +; AVX2-NEXT: vmovd {{.*#+}} xmm1 = [65535,0,0,0] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0 ; AVX2-NEXT: vmovdqa %ymm0, (%rcx) @@ -2137,7 +2137,7 @@ define void @vec256_v16i16_to_v1i256_factor16(ptr %in.vec.base.ptr, ptr %in.vec. ; AVX512F: # %bb.0: ; AVX512F-NEXT: vmovdqa (%rdi), %ymm0 ; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX512F-NEXT: vmovdqa {{.*#+}} xmm1 = [65535,0,0,0] +; AVX512F-NEXT: vmovd {{.*#+}} xmm1 = [65535,0,0,0] ; AVX512F-NEXT: vpand %ymm1, %ymm0, %ymm0 ; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0 ; AVX512F-NEXT: vmovdqa %ymm0, (%rcx) @@ -2148,7 +2148,7 @@ define void @vec256_v16i16_to_v1i256_factor16(ptr %in.vec.base.ptr, ptr %in.vec. ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0 ; AVX512BW-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm1 = [65535,0,0,0] +; AVX512BW-NEXT: vmovd {{.*#+}} xmm1 = [65535,0,0,0] ; AVX512BW-NEXT: vpand %ymm1, %ymm0, %ymm0 ; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx) @@ -3360,7 +3360,7 @@ define void @vec384_v48i8_to_v3i128_factor16(ptr %in.vec.base.ptr, ptr %in.vec.b ; SSE: # %bb.0: ; SSE-NEXT: movdqa (%rdi), %xmm0 ; SSE-NEXT: paddb (%rsi), %xmm0 -; SSE-NEXT: movdqa {{.*#+}} xmm1 = [255,0,0,0] +; SSE-NEXT: movd {{.*#+}} xmm1 = [255,0,0,0] ; SSE-NEXT: pand %xmm0, %xmm1 ; SSE-NEXT: movdqa %xmm0, %xmm2 ; SSE-NEXT: pslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2] @@ -3483,7 +3483,7 @@ define void @vec384_v48i8_to_v2i192_factor24(ptr %in.vec.base.ptr, ptr %in.vec.b ; SSE2: # %bb.0: ; SSE2-NEXT: movdqa (%rdi), %xmm0 ; SSE2-NEXT: paddb (%rsi), %xmm0 -; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [255,0,0,0] +; SSE2-NEXT: movd {{.*#+}} xmm1 = [255,0,0,0] ; SSE2-NEXT: pand %xmm0, %xmm1 ; SSE2-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1] ; SSE2-NEXT: psrldq {{.*#+}} xmm0 = xmm0[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero @@ -3609,7 +3609,7 @@ define void @vec384_v48i8_to_v1i384_factor48(ptr %in.vec.base.ptr, ptr %in.vec.b ; AVX2: # %bb.0: ; AVX2-NEXT: vmovdqa (%rdi), %ymm0 ; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [255,0,0,0] +; AVX2-NEXT: vmovd {{.*#+}} xmm1 = [255,0,0,0] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vmovaps 32(%rdx), %ymm1 ; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0 @@ -3622,7 +3622,7 @@ define void @vec384_v48i8_to_v1i384_factor48(ptr %in.vec.base.ptr, ptr %in.vec.b ; AVX512F: # %bb.0: ; AVX512F-NEXT: vmovdqa (%rdi), %ymm0 ; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX512F-NEXT: vmovdqa {{.*#+}} xmm1 = [255,0,0,0] +; AVX512F-NEXT: vmovd {{.*#+}} xmm1 = [255,0,0,0] ; AVX512F-NEXT: vpand %ymm1, %ymm0, %ymm0 ; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0 ; AVX512F-NEXT: vmovaps 32(%rdx), %ymm1 @@ -3635,7 +3635,7 @@ define void @vec384_v48i8_to_v1i384_factor48(ptr %in.vec.base.ptr, ptr %in.vec.b ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0 -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm1 = [255,0,0,0] +; AVX512BW-NEXT: vmovd {{.*#+}} xmm1 = [255,0,0,0] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx) @@ -4261,7 +4261,7 @@ define void 
@vec384_v24i16_to_v3i128_factor8(ptr %in.vec.base.ptr, ptr %in.vec.b ; SSE2: # %bb.0: ; SSE2-NEXT: movdqa (%rdi), %xmm0 ; SSE2-NEXT: paddb (%rsi), %xmm0 -; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [65535,0,0,0] +; SSE2-NEXT: movd {{.*#+}} xmm1 = [65535,0,0,0] ; SSE2-NEXT: pand %xmm0, %xmm1 ; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,0,0,0] ; SSE2-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5] @@ -4442,7 +4442,7 @@ define void @vec384_v24i16_to_v2i192_factor12(ptr %in.vec.base.ptr, ptr %in.vec. ; SSE2: # %bb.0: ; SSE2-NEXT: movdqa (%rdi), %xmm0 ; SSE2-NEXT: paddb (%rsi), %xmm0 -; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [65535,0,0,0] +; SSE2-NEXT: movd {{.*#+}} xmm1 = [65535,0,0,0] ; SSE2-NEXT: pand %xmm0, %xmm1 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] ; SSE2-NEXT: psrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero @@ -4589,7 +4589,7 @@ define void @vec384_v24i16_to_v1i384_factor24(ptr %in.vec.base.ptr, ptr %in.vec. ; AVX2: # %bb.0: ; AVX2-NEXT: vmovdqa (%rdi), %ymm0 ; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [65535,0,0,0] +; AVX2-NEXT: vmovd {{.*#+}} xmm1 = [65535,0,0,0] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vmovaps 32(%rdx), %ymm1 ; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0 @@ -4602,7 +4602,7 @@ define void @vec384_v24i16_to_v1i384_factor24(ptr %in.vec.base.ptr, ptr %in.vec. ; AVX512F: # %bb.0: ; AVX512F-NEXT: vmovdqa (%rdi), %ymm0 ; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX512F-NEXT: vmovdqa {{.*#+}} xmm1 = [65535,0,0,0] +; AVX512F-NEXT: vmovd {{.*#+}} xmm1 = [65535,0,0,0] ; AVX512F-NEXT: vpand %ymm1, %ymm0, %ymm0 ; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0 ; AVX512F-NEXT: vmovaps 32(%rdx), %ymm1 @@ -5998,7 +5998,7 @@ define void @vec512_v64i8_to_v4i128_factor16(ptr %in.vec.base.ptr, ptr %in.vec.b ; SSE: # %bb.0: ; SSE-NEXT: movdqa (%rdi), %xmm0 ; SSE-NEXT: paddb (%rsi), %xmm0 -; SSE-NEXT: movdqa {{.*#+}} xmm1 = [255,0,0,0] +; SSE-NEXT: movd {{.*#+}} xmm1 = [255,0,0,0] ; SSE-NEXT: pand %xmm0, %xmm1 ; SSE-NEXT: movdqa %xmm0, %xmm2 ; SSE-NEXT: pslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2] @@ -6182,7 +6182,7 @@ define void @vec512_v64i8_to_v2i256_factor32(ptr %in.vec.base.ptr, ptr %in.vec.b ; SSE: # %bb.0: ; SSE-NEXT: movdqa (%rdi), %xmm0 ; SSE-NEXT: paddb (%rsi), %xmm0 -; SSE-NEXT: movdqa {{.*#+}} xmm1 = [255,0,0,0] +; SSE-NEXT: movd {{.*#+}} xmm1 = [255,0,0,0] ; SSE-NEXT: pand %xmm0, %xmm1 ; SSE-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1] ; SSE-NEXT: psrldq {{.*#+}} xmm0 = xmm0[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero @@ -6217,7 +6217,7 @@ define void @vec512_v64i8_to_v2i256_factor32(ptr %in.vec.base.ptr, ptr %in.vec.b ; AVX2: # %bb.0: ; AVX2-NEXT: vmovdqa (%rdi), %ymm0 ; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [255,0,0,0] +; AVX2-NEXT: vmovd {{.*#+}} xmm1 = [255,0,0,0] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm1 ; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[1],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; AVX2-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 @@ -6231,7 +6231,7 @@ define void @vec512_v64i8_to_v2i256_factor32(ptr %in.vec.base.ptr, ptr %in.vec.b ; AVX512F: # %bb.0: ; AVX512F-NEXT: vmovdqa (%rdi), %ymm0 ; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0 
-; AVX512F-NEXT: vmovdqa {{.*#+}} xmm1 = [255,0,0,0] +; AVX512F-NEXT: vmovd {{.*#+}} xmm1 = [255,0,0,0] ; AVX512F-NEXT: vpand %ymm1, %ymm0, %ymm1 ; AVX512F-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[1],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; AVX512F-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 @@ -6245,7 +6245,7 @@ define void @vec512_v64i8_to_v2i256_factor32(ptr %in.vec.base.ptr, ptr %in.vec.b ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0 ; AVX512BW-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm1 = [255,0,0,0] +; AVX512BW-NEXT: vmovd {{.*#+}} xmm1 = [255,0,0,0] ; AVX512BW-NEXT: vpand %ymm1, %ymm0, %ymm1 ; AVX512BW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[1],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; AVX512BW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 @@ -6297,7 +6297,7 @@ define void @vec512_v64i8_to_v1i512_factor64(ptr %in.vec.base.ptr, ptr %in.vec.b ; AVX2: # %bb.0: ; AVX2-NEXT: vmovdqa (%rdi), %ymm0 ; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [255,0,0,0] +; AVX2-NEXT: vmovd {{.*#+}} xmm1 = [255,0,0,0] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vmovaps 32(%rdx), %ymm1 ; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0 @@ -6310,7 +6310,7 @@ define void @vec512_v64i8_to_v1i512_factor64(ptr %in.vec.base.ptr, ptr %in.vec.b ; AVX512F: # %bb.0: ; AVX512F-NEXT: vmovdqa (%rdi), %ymm0 ; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX512F-NEXT: vmovdqa {{.*#+}} xmm1 = [255,0,0,0] +; AVX512F-NEXT: vmovd {{.*#+}} xmm1 = [255,0,0,0] ; AVX512F-NEXT: vpand %ymm1, %ymm0, %ymm0 ; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0 ; AVX512F-NEXT: vmovaps 32(%rdx), %ymm1 @@ -6323,7 +6323,7 @@ define void @vec512_v64i8_to_v1i512_factor64(ptr %in.vec.base.ptr, ptr %in.vec.b ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0 -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm1 = [255,0,0,0] +; AVX512BW-NEXT: vmovd {{.*#+}} xmm1 = [255,0,0,0] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx) @@ -6573,7 +6573,7 @@ define void @vec512_v32i16_to_v4i128_factor8(ptr %in.vec.base.ptr, ptr %in.vec.b ; SSE2: # %bb.0: ; SSE2-NEXT: movdqa (%rdi), %xmm0 ; SSE2-NEXT: paddb (%rsi), %xmm0 -; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [65535,0,0,0] +; SSE2-NEXT: movd {{.*#+}} xmm1 = [65535,0,0,0] ; SSE2-NEXT: pand %xmm0, %xmm1 ; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,1,0,1] ; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[0,0,0,0] @@ -6755,7 +6755,7 @@ define void @vec512_v32i16_to_v2i256_factor16(ptr %in.vec.base.ptr, ptr %in.vec. ; SSE2: # %bb.0: ; SSE2-NEXT: movdqa (%rdi), %xmm0 ; SSE2-NEXT: paddb (%rsi), %xmm0 -; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [65535,0,0,0] +; SSE2-NEXT: movd {{.*#+}} xmm1 = [65535,0,0,0] ; SSE2-NEXT: pand %xmm0, %xmm1 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] ; SSE2-NEXT: psrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero @@ -6809,7 +6809,7 @@ define void @vec512_v32i16_to_v2i256_factor16(ptr %in.vec.base.ptr, ptr %in.vec. 
; AVX2: # %bb.0: ; AVX2-NEXT: vmovdqa (%rdi), %ymm0 ; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [65535,0,0,0] +; AVX2-NEXT: vmovd {{.*#+}} xmm1 = [65535,0,0,0] ; AVX2-NEXT: vpand %ymm0, %ymm1, %ymm1 ; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[2,3],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; AVX2-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 @@ -6823,7 +6823,7 @@ define void @vec512_v32i16_to_v2i256_factor16(ptr %in.vec.base.ptr, ptr %in.vec. ; AVX512F: # %bb.0: ; AVX512F-NEXT: vmovdqa (%rdi), %ymm0 ; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX512F-NEXT: vmovdqa {{.*#+}} xmm1 = [65535,0,0,0] +; AVX512F-NEXT: vmovd {{.*#+}} xmm1 = [65535,0,0,0] ; AVX512F-NEXT: vpand %ymm0, %ymm1, %ymm1 ; AVX512F-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[2,3],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; AVX512F-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 @@ -6837,7 +6837,7 @@ define void @vec512_v32i16_to_v2i256_factor16(ptr %in.vec.base.ptr, ptr %in.vec. ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0 ; AVX512BW-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm1 = [65535,0,0,0] +; AVX512BW-NEXT: vmovd {{.*#+}} xmm1 = [65535,0,0,0] ; AVX512BW-NEXT: vpand %ymm0, %ymm1, %ymm1 ; AVX512BW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[2,3],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; AVX512BW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 @@ -6908,7 +6908,7 @@ define void @vec512_v32i16_to_v1i512_factor32(ptr %in.vec.base.ptr, ptr %in.vec. ; AVX2: # %bb.0: ; AVX2-NEXT: vmovdqa (%rdi), %ymm0 ; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [65535,0,0,0] +; AVX2-NEXT: vmovd {{.*#+}} xmm1 = [65535,0,0,0] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vmovaps 32(%rdx), %ymm1 ; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0 @@ -6921,7 +6921,7 @@ define void @vec512_v32i16_to_v1i512_factor32(ptr %in.vec.base.ptr, ptr %in.vec. 
; AVX512F: # %bb.0: ; AVX512F-NEXT: vmovdqa (%rdi), %ymm0 ; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX512F-NEXT: vmovdqa {{.*#+}} xmm1 = [65535,0,0,0] +; AVX512F-NEXT: vmovd {{.*#+}} xmm1 = [65535,0,0,0] ; AVX512F-NEXT: vpand %ymm1, %ymm0, %ymm0 ; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0 ; AVX512F-NEXT: vmovaps 32(%rdx), %ymm1 diff --git a/llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast.ll b/llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast.ll index b4c79aff5cef1f..8ab53140eb9110 100644 --- a/llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast.ll +++ b/llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast.ll @@ -1053,7 +1053,7 @@ define void @vec256_i8_widen_to_i16_factor2_broadcast_to_v16i16_factor16(ptr %in ; SSE42-NEXT: paddb 48(%rsi), %xmm2 ; SSE42-NEXT: paddb (%rsi), %xmm0 ; SSE42-NEXT: paddb 32(%rsi), %xmm1 -; SSE42-NEXT: movdqa {{.*#+}} xmm3 = [1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u] +; SSE42-NEXT: movq {{.*#+}} xmm3 = [1,3,5,7,9,11,13,15,0,0,0,0,0,0,0,0] ; SSE42-NEXT: pshufb %xmm3, %xmm1 ; SSE42-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; SSE42-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] @@ -1075,8 +1075,7 @@ define void @vec256_i8_widen_to_i16_factor2_broadcast_to_v16i16_factor16(ptr %in ; AVX-NEXT: vpaddb 48(%rsi), %xmm2, %xmm2 ; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX-NEXT: vpaddb 32(%rsi), %xmm1, %xmm1 -; AVX-NEXT: vmovddup {{.*#+}} xmm3 = [1,3,5,7,9,11,13,15,1,3,5,7,9,11,13,15] -; AVX-NEXT: # xmm3 = mem[0,0] +; AVX-NEXT: vmovq {{.*#+}} xmm3 = [1,3,5,7,9,11,13,15,0,0,0,0,0,0,0,0] ; AVX-NEXT: vpshufb %xmm3, %xmm1, %xmm1 ; AVX-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] @@ -6055,7 +6054,7 @@ define void @vec512_i8_widen_to_i256_factor32_broadcast_to_v2i256_factor2(ptr %i ; AVX2: # %bb.0: ; AVX2-NEXT: vmovdqa (%rdi), %ymm0 ; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [255,0,0,0] +; AVX2-NEXT: vmovd {{.*#+}} xmm1 = [255,0,0,0] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpaddb 32(%rdx), %ymm0, %ymm1 ; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0 @@ -6068,7 +6067,7 @@ define void @vec512_i8_widen_to_i256_factor32_broadcast_to_v2i256_factor2(ptr %i ; AVX512F: # %bb.0: ; AVX512F-NEXT: vmovdqa (%rdi), %ymm0 ; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX512F-NEXT: vmovdqa {{.*#+}} xmm1 = [255,0,0,0] +; AVX512F-NEXT: vmovd {{.*#+}} xmm1 = [255,0,0,0] ; AVX512F-NEXT: vpand %ymm1, %ymm0, %ymm0 ; AVX512F-NEXT: vpaddb 32(%rdx), %ymm0, %ymm1 ; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0 @@ -6081,7 +6080,7 @@ define void @vec512_i8_widen_to_i256_factor32_broadcast_to_v2i256_factor2(ptr %i ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm0 ; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm1 = [255,0,0,0] +; AVX512DQ-NEXT: vmovd {{.*#+}} xmm1 = [255,0,0,0] ; AVX512DQ-NEXT: vpand %ymm1, %ymm0, %ymm0 ; AVX512DQ-NEXT: vpaddb 32(%rdx), %ymm0, %ymm1 ; AVX512DQ-NEXT: vpaddb (%rdx), %ymm0, %ymm0 @@ -6520,7 +6519,7 @@ define void @vec512_i16_widen_to_i256_factor16_broadcast_to_v2i256_factor2(ptr % ; AVX2: # %bb.0: ; AVX2-NEXT: vmovdqa (%rdi), %ymm0 ; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [65535,0,0,0] +; AVX2-NEXT: vmovd {{.*#+}} xmm1 = [65535,0,0,0] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpaddb 32(%rdx), %ymm0, %ymm1 ; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0 @@ -6533,7 +6532,7 @@ define void 
@vec512_i16_widen_to_i256_factor16_broadcast_to_v2i256_factor2(ptr % ; AVX512F: # %bb.0: ; AVX512F-NEXT: vmovdqa (%rdi), %ymm0 ; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX512F-NEXT: vmovdqa {{.*#+}} xmm1 = [65535,0,0,0] +; AVX512F-NEXT: vmovd {{.*#+}} xmm1 = [65535,0,0,0] ; AVX512F-NEXT: vpand %ymm1, %ymm0, %ymm0 ; AVX512F-NEXT: vpaddb 32(%rdx), %ymm0, %ymm1 ; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0 @@ -6546,7 +6545,7 @@ define void @vec512_i16_widen_to_i256_factor16_broadcast_to_v2i256_factor2(ptr % ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm0 ; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm1 = [65535,0,0,0] +; AVX512DQ-NEXT: vmovd {{.*#+}} xmm1 = [65535,0,0,0] ; AVX512DQ-NEXT: vpand %ymm1, %ymm0, %ymm0 ; AVX512DQ-NEXT: vpaddb 32(%rdx), %ymm0, %ymm1 ; AVX512DQ-NEXT: vpaddb (%rdx), %ymm0, %ymm0 diff --git a/llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast_from_memory.ll b/llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast_from_memory.ll index 23f02e9245eeda..c362bdaa3217d0 100644 --- a/llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast_from_memory.ll +++ b/llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast_from_memory.ll @@ -875,7 +875,7 @@ define void @vec256_i8_widen_to_i16_factor2_broadcast_to_v16i16_factor16(ptr %in ; SSE42-NEXT: movdqa (%rdi), %xmm0 ; SSE42-NEXT: movdqa 32(%rdi), %xmm1 ; SSE42-NEXT: movdqa 48(%rdi), %xmm2 -; SSE42-NEXT: movdqa {{.*#+}} xmm3 = [1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u] +; SSE42-NEXT: movq {{.*#+}} xmm3 = [1,3,5,7,9,11,13,15,0,0,0,0,0,0,0,0] ; SSE42-NEXT: pshufb %xmm3, %xmm1 ; SSE42-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; SSE42-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] @@ -894,8 +894,7 @@ define void @vec256_i8_widen_to_i16_factor2_broadcast_to_v16i16_factor16(ptr %in ; AVX-NEXT: vmovdqa (%rdi), %xmm0 ; AVX-NEXT: vmovdqa 32(%rdi), %xmm1 ; AVX-NEXT: vmovdqa 48(%rdi), %xmm2 -; AVX-NEXT: vmovddup {{.*#+}} xmm3 = [1,3,5,7,9,11,13,15,1,3,5,7,9,11,13,15] -; AVX-NEXT: # xmm3 = mem[0,0] +; AVX-NEXT: vmovq {{.*#+}} xmm3 = [1,3,5,7,9,11,13,15,0,0,0,0,0,0,0,0] ; AVX-NEXT: vpshufb %xmm3, %xmm1, %xmm1 ; AVX-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] @@ -4884,7 +4883,7 @@ define void @vec512_i8_widen_to_i256_factor32_broadcast_to_v2i256_factor2(ptr %i ; ; AVX2-LABEL: vec512_i8_widen_to_i256_factor32_broadcast_to_v2i256_factor2: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm0 = [255,0,0,0] +; AVX2-NEXT: vmovd {{.*#+}} xmm0 = [255,0,0,0] ; AVX2-NEXT: vpand (%rdi), %ymm0, %ymm0 ; AVX2-NEXT: vpaddb 32(%rsi), %ymm0, %ymm1 ; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0 @@ -4895,7 +4894,7 @@ define void @vec512_i8_widen_to_i256_factor32_broadcast_to_v2i256_factor2(ptr %i ; ; AVX512F-LABEL: vec512_i8_widen_to_i256_factor32_broadcast_to_v2i256_factor2: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovdqa {{.*#+}} xmm0 = [255,0,0,0] +; AVX512F-NEXT: vmovd {{.*#+}} xmm0 = [255,0,0,0] ; AVX512F-NEXT: vpand (%rdi), %ymm0, %ymm0 ; AVX512F-NEXT: vpaddb 32(%rsi), %ymm0, %ymm1 ; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0 @@ -4906,7 +4905,7 @@ define void @vec512_i8_widen_to_i256_factor32_broadcast_to_v2i256_factor2(ptr %i ; ; AVX512DQ-LABEL: vec512_i8_widen_to_i256_factor32_broadcast_to_v2i256_factor2: ; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm0 = [255,0,0,0] +; AVX512DQ-NEXT: vmovd {{.*#+}} xmm0 = [255,0,0,0] ; AVX512DQ-NEXT: vpand (%rdi), %ymm0, %ymm0 ; 
AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm0, %ymm1 ; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm0 @@ -5291,7 +5290,7 @@ define void @vec512_i16_widen_to_i256_factor16_broadcast_to_v2i256_factor2(ptr % ; ; AVX2-LABEL: vec512_i16_widen_to_i256_factor16_broadcast_to_v2i256_factor2: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm0 = [65535,0,0,0] +; AVX2-NEXT: vmovd {{.*#+}} xmm0 = [65535,0,0,0] ; AVX2-NEXT: vpand (%rdi), %ymm0, %ymm0 ; AVX2-NEXT: vpaddb 32(%rsi), %ymm0, %ymm1 ; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0 @@ -5302,7 +5301,7 @@ define void @vec512_i16_widen_to_i256_factor16_broadcast_to_v2i256_factor2(ptr % ; ; AVX512F-LABEL: vec512_i16_widen_to_i256_factor16_broadcast_to_v2i256_factor2: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovdqa {{.*#+}} xmm0 = [65535,0,0,0] +; AVX512F-NEXT: vmovd {{.*#+}} xmm0 = [65535,0,0,0] ; AVX512F-NEXT: vpand (%rdi), %ymm0, %ymm0 ; AVX512F-NEXT: vpaddb 32(%rsi), %ymm0, %ymm1 ; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0 @@ -5313,7 +5312,7 @@ define void @vec512_i16_widen_to_i256_factor16_broadcast_to_v2i256_factor2(ptr % ; ; AVX512DQ-LABEL: vec512_i16_widen_to_i256_factor16_broadcast_to_v2i256_factor2: ; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm0 = [65535,0,0,0] +; AVX512DQ-NEXT: vmovd {{.*#+}} xmm0 = [65535,0,0,0] ; AVX512DQ-NEXT: vpand (%rdi), %ymm0, %ymm0 ; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm0, %ymm1 ; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm0
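; ---------------------------------------------------------------------------
; Editorial addendum (illustrative sketch, not part of the patch above).
; A minimal reproducer for the pattern these regenerated checks reflect: a
; 256-bit AND mask whose set bits all live in the low 32 bits. The function
; name, file layout and RUN line below are hypothetical and assume an x86-64
; AVX2 target; any CHECK lines for it would have to be generated with
; utils/update_llc_test_checks.py rather than taken from here. Before this
; change such a mask was materialized with a full 16-byte vmovdqa of a
; [65535,0,0,0] pool entry; with the zero-upper fold it is expected to be
; loaded with a 4-byte zero-extending vmovd of a smaller constant-pool entry,
; matching the vmovdqa -> vmovd updates in the vec512_i16_widen_to_i256_*
; diffs above.
;
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx2
define <16 x i16> @and_low_i16_mask(ptr %p) {
  ; Load a full 256-bit vector, then AND it with a mask that is non-zero only
  ; in element 0 (all upper bits of the 256-bit constant are zero).
  %x = load <16 x i16>, ptr %p
  %r = and <16 x i16> %x, <i16 -1, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0>
  ret <16 x i16> %r
}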