[X86] X86FixupVectorConstants - shrink vector load to movsd/movss/movd/movq 'zero upper' instructions (#79000)

If we're loading a vector constant that is known to be zero in the upper elements, then attempt to shrink the constant and just scalar load the lower 32/64 bits.
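
As an illustrative sketch of the rewrite this enables (the constant, pool labels, and register below are hypothetical, not taken from the tests in this patch), a full-width load of a 128-bit constant whose upper 64 bits are zero can become a 64-bit scalar load from a smaller constant pool entry, since movsd (like movss/movd/movq) zeroes the upper lanes of the destination:

; before: full-width load of a 16-byte constant pool entry
movaps .LCPI0_0(%rip), %xmm0        # xmm0 = [1.0E+0,2.0E+0,0.0E+0,0.0E+0]
; after: 64-bit scalar load of an 8-byte entry; the upper lanes are implicitly zeroed
movsd .LCPI0_1(%rip), %xmm0         # xmm0 = [1.0E+0,2.0E+0,0.0E+0,0.0E+0]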

Always choose the vzload/broadcast with the smallest constant load, and prefer vzload over broadcast for the same bitwidth to avoid domain flips (mainly an AVX1 issue).
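
For the same-bitwidth preference, a sketch mirroring the AVX (pre-AVX2) test updates below: a 16-byte shuffle mask whose upper 8 bytes are zero could be loaded either as a 64-bit broadcast (vmovddup, FP domain) or as a 64-bit 'zero upper' load (vmovq, integer domain); preferring the vzload keeps the value in the integer domain next to the vpshufb that consumes it:

; 64-bit broadcast (FP domain); may incur a domain-crossing penalty before vpshufb
vmovddup .LCPI0_0(%rip), %xmm3      # xmm3 = [1,3,5,7,9,11,13,15,1,3,5,7,9,11,13,15]
; 64-bit zero-upper load (integer domain)
vmovq .LCPI0_0(%rip), %xmm3         # xmm3 = [1,3,5,7,9,11,13,15,0,0,0,0,0,0,0,0]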

Fixes #73783
RKSimon authored Jan 24, 2024
1 parent 182ab1c commit 8b43c1b
Showing 103 changed files with 959 additions and 1,135 deletions.
177 changes: 116 additions & 61 deletions llvm/lib/Target/X86/X86FixupVectorConstants.cpp
@@ -67,6 +67,9 @@ FunctionPass *llvm::createX86FixupVectorConstants() {
static std::optional<APInt> extractConstantBits(const Constant *C) {
unsigned NumBits = C->getType()->getPrimitiveSizeInBits();

if (auto *CUndef = dyn_cast<UndefValue>(C))
return APInt::getZero(NumBits);

if (auto *CInt = dyn_cast<ConstantInt>(C))
return CInt->getValue();

@@ -80,6 +83,18 @@ static std::optional<APInt> extractConstantBits(const Constant *C) {
return APInt::getSplat(NumBits, *Bits);
}
}

APInt Bits = APInt::getZero(NumBits);
for (unsigned I = 0, E = CV->getNumOperands(); I != E; ++I) {
Constant *Elt = CV->getOperand(I);
std::optional<APInt> SubBits = extractConstantBits(Elt);
if (!SubBits)
return std::nullopt;
assert(NumBits == (E * SubBits->getBitWidth()) &&
"Illegal vector element size");
Bits.insertBits(*SubBits, I * SubBits->getBitWidth());
}
return Bits;
}

if (auto *CDS = dyn_cast<ConstantDataSequential>(C)) {
@@ -223,6 +238,35 @@ static Constant *rebuildSplatableConstant(const Constant *C,
return rebuildConstant(OriginalType->getContext(), SclTy, *Splat, NumSclBits);
}

static Constant *rebuildZeroUpperConstant(const Constant *C,
unsigned ScalarBitWidth) {
Type *Ty = C->getType();
Type *SclTy = Ty->getScalarType();
unsigned NumBits = Ty->getPrimitiveSizeInBits();
unsigned NumSclBits = SclTy->getPrimitiveSizeInBits();
LLVMContext &Ctx = C->getContext();

if (NumBits > ScalarBitWidth) {
// Determine if the upper bits are all zero.
if (std::optional<APInt> Bits = extractConstantBits(C)) {
if (Bits->countLeadingZeros() >= (NumBits - ScalarBitWidth)) {
// If the original constant was made of smaller elements, try to retain
// those types.
if (ScalarBitWidth > NumSclBits && (ScalarBitWidth % NumSclBits) == 0)
return rebuildConstant(Ctx, SclTy, *Bits, NumSclBits);

// Fallback to raw integer bits.
APInt RawBits = Bits->zextOrTrunc(ScalarBitWidth);
return ConstantInt::get(Ctx, RawBits);
}
}
}

return nullptr;
}

typedef std::function<Constant *(const Constant *, unsigned)> RebuildFn;

bool X86FixupVectorConstantsPass::processInstruction(MachineFunction &MF,
MachineBasicBlock &MBB,
MachineInstr &MI) {
@@ -233,117 +277,128 @@ bool X86FixupVectorConstantsPass::processInstruction(MachineFunction &MF,
bool HasBWI = ST->hasBWI();
bool HasVLX = ST->hasVLX();

auto ConvertToBroadcast = [&](unsigned OpBcst256, unsigned OpBcst128,
unsigned OpBcst64, unsigned OpBcst32,
unsigned OpBcst16, unsigned OpBcst8,
unsigned OperandNo) {
assert(MI.getNumOperands() >= (OperandNo + X86::AddrNumOperands) &&
"Unexpected number of operands!");

if (auto *C = X86::getConstantFromPool(MI, OperandNo)) {
// Attempt to detect a suitable splat from increasing splat widths.
std::pair<unsigned, unsigned> Broadcasts[] = {
{8, OpBcst8}, {16, OpBcst16}, {32, OpBcst32},
{64, OpBcst64}, {128, OpBcst128}, {256, OpBcst256},
};
for (auto [BitWidth, OpBcst] : Broadcasts) {
if (OpBcst) {
// Construct a suitable splat constant and adjust the MI to
// use the new constant pool entry.
if (Constant *NewCst = rebuildSplatableConstant(C, BitWidth)) {
unsigned NewCPI =
CP->getConstantPoolIndex(NewCst, Align(BitWidth / 8));
MI.setDesc(TII->get(OpBcst));
MI.getOperand(OperandNo + X86::AddrDisp).setIndex(NewCPI);
return true;
auto FixupConstant =
[&](unsigned OpBcst256, unsigned OpBcst128, unsigned OpBcst64,
unsigned OpBcst32, unsigned OpBcst16, unsigned OpBcst8,
unsigned OpUpper64, unsigned OpUpper32, unsigned OperandNo) {
assert(MI.getNumOperands() >= (OperandNo + X86::AddrNumOperands) &&
"Unexpected number of operands!");

if (auto *C = X86::getConstantFromPool(MI, OperandNo)) {
// Attempt to detect a suitable splat/vzload from increasing constant
// bitwidths.
// Prefer vzload vs broadcast for same bitwidth to avoid domain flips.
std::tuple<unsigned, unsigned, RebuildFn> FixupLoad[] = {
{8, OpBcst8, rebuildSplatableConstant},
{16, OpBcst16, rebuildSplatableConstant},
{32, OpUpper32, rebuildZeroUpperConstant},
{32, OpBcst32, rebuildSplatableConstant},
{64, OpUpper64, rebuildZeroUpperConstant},
{64, OpBcst64, rebuildSplatableConstant},
{128, OpBcst128, rebuildSplatableConstant},
{256, OpBcst256, rebuildSplatableConstant},
};
for (auto [BitWidth, Op, RebuildConstant] : FixupLoad) {
if (Op) {
// Construct a suitable constant and adjust the MI to use the new
// constant pool entry.
if (Constant *NewCst = RebuildConstant(C, BitWidth)) {
unsigned NewCPI =
CP->getConstantPoolIndex(NewCst, Align(BitWidth / 8));
MI.setDesc(TII->get(Op));
MI.getOperand(OperandNo + X86::AddrDisp).setIndex(NewCPI);
return true;
}
}
}
}
}
}
return false;
};
return false;
};

// Attempt to convert full width vector loads into broadcast loads.
// Attempt to convert full width vector loads into broadcast/vzload loads.
switch (Opc) {
/* FP Loads */
case X86::MOVAPDrm:
case X86::MOVAPSrm:
case X86::MOVUPDrm:
case X86::MOVUPSrm:
// TODO: SSE3 MOVDDUP Handling
return false;
return FixupConstant(0, 0, 0, 0, 0, 0, X86::MOVSDrm, X86::MOVSSrm, 1);
case X86::VMOVAPDrm:
case X86::VMOVAPSrm:
case X86::VMOVUPDrm:
case X86::VMOVUPSrm:
return ConvertToBroadcast(0, 0, X86::VMOVDDUPrm, X86::VBROADCASTSSrm, 0, 0,
1);
return FixupConstant(0, 0, X86::VMOVDDUPrm, X86::VBROADCASTSSrm, 0, 0,
X86::VMOVSDrm, X86::VMOVSSrm, 1);
case X86::VMOVAPDYrm:
case X86::VMOVAPSYrm:
case X86::VMOVUPDYrm:
case X86::VMOVUPSYrm:
return ConvertToBroadcast(0, X86::VBROADCASTF128rm, X86::VBROADCASTSDYrm,
X86::VBROADCASTSSYrm, 0, 0, 1);
return FixupConstant(0, X86::VBROADCASTF128rm, X86::VBROADCASTSDYrm,
X86::VBROADCASTSSYrm, 0, 0, 0, 0, 1);
case X86::VMOVAPDZ128rm:
case X86::VMOVAPSZ128rm:
case X86::VMOVUPDZ128rm:
case X86::VMOVUPSZ128rm:
return ConvertToBroadcast(0, 0, X86::VMOVDDUPZ128rm,
X86::VBROADCASTSSZ128rm, 0, 0, 1);
return FixupConstant(0, 0, X86::VMOVDDUPZ128rm, X86::VBROADCASTSSZ128rm, 0,
0, X86::VMOVSDZrm, X86::VMOVSSZrm, 1);
case X86::VMOVAPDZ256rm:
case X86::VMOVAPSZ256rm:
case X86::VMOVUPDZ256rm:
case X86::VMOVUPSZ256rm:
return ConvertToBroadcast(0, X86::VBROADCASTF32X4Z256rm,
X86::VBROADCASTSDZ256rm, X86::VBROADCASTSSZ256rm,
0, 0, 1);
return FixupConstant(0, X86::VBROADCASTF32X4Z256rm, X86::VBROADCASTSDZ256rm,
X86::VBROADCASTSSZ256rm, 0, 0, 0, 0, 1);
case X86::VMOVAPDZrm:
case X86::VMOVAPSZrm:
case X86::VMOVUPDZrm:
case X86::VMOVUPSZrm:
return ConvertToBroadcast(X86::VBROADCASTF64X4rm, X86::VBROADCASTF32X4rm,
X86::VBROADCASTSDZrm, X86::VBROADCASTSSZrm, 0, 0,
1);
return FixupConstant(X86::VBROADCASTF64X4rm, X86::VBROADCASTF32X4rm,
X86::VBROADCASTSDZrm, X86::VBROADCASTSSZrm, 0, 0, 0, 0,
1);
/* Integer Loads */
case X86::MOVDQArm:
case X86::MOVDQUrm:
return FixupConstant(0, 0, 0, 0, 0, 0, X86::MOVQI2PQIrm, X86::MOVDI2PDIrm,
1);
case X86::VMOVDQArm:
case X86::VMOVDQUrm:
return ConvertToBroadcast(
0, 0, HasAVX2 ? X86::VPBROADCASTQrm : X86::VMOVDDUPrm,
HasAVX2 ? X86::VPBROADCASTDrm : X86::VBROADCASTSSrm,
HasAVX2 ? X86::VPBROADCASTWrm : 0, HasAVX2 ? X86::VPBROADCASTBrm : 0,
1);
return FixupConstant(0, 0, HasAVX2 ? X86::VPBROADCASTQrm : X86::VMOVDDUPrm,
HasAVX2 ? X86::VPBROADCASTDrm : X86::VBROADCASTSSrm,
HasAVX2 ? X86::VPBROADCASTWrm : 0,
HasAVX2 ? X86::VPBROADCASTBrm : 0, X86::VMOVQI2PQIrm,
X86::VMOVDI2PDIrm, 1);
case X86::VMOVDQAYrm:
case X86::VMOVDQUYrm:
return ConvertToBroadcast(
return FixupConstant(
0, HasAVX2 ? X86::VBROADCASTI128rm : X86::VBROADCASTF128rm,
HasAVX2 ? X86::VPBROADCASTQYrm : X86::VBROADCASTSDYrm,
HasAVX2 ? X86::VPBROADCASTDYrm : X86::VBROADCASTSSYrm,
HasAVX2 ? X86::VPBROADCASTWYrm : 0, HasAVX2 ? X86::VPBROADCASTBYrm : 0,
1);
0, 0, 1);
case X86::VMOVDQA32Z128rm:
case X86::VMOVDQA64Z128rm:
case X86::VMOVDQU32Z128rm:
case X86::VMOVDQU64Z128rm:
return ConvertToBroadcast(0, 0, X86::VPBROADCASTQZ128rm,
X86::VPBROADCASTDZ128rm,
HasBWI ? X86::VPBROADCASTWZ128rm : 0,
HasBWI ? X86::VPBROADCASTBZ128rm : 0, 1);
return FixupConstant(0, 0, X86::VPBROADCASTQZ128rm, X86::VPBROADCASTDZ128rm,
HasBWI ? X86::VPBROADCASTWZ128rm : 0,
HasBWI ? X86::VPBROADCASTBZ128rm : 0,
X86::VMOVQI2PQIZrm, X86::VMOVDI2PDIZrm, 1);
case X86::VMOVDQA32Z256rm:
case X86::VMOVDQA64Z256rm:
case X86::VMOVDQU32Z256rm:
case X86::VMOVDQU64Z256rm:
return ConvertToBroadcast(0, X86::VBROADCASTI32X4Z256rm,
X86::VPBROADCASTQZ256rm, X86::VPBROADCASTDZ256rm,
HasBWI ? X86::VPBROADCASTWZ256rm : 0,
HasBWI ? X86::VPBROADCASTBZ256rm : 0, 1);
return FixupConstant(0, X86::VBROADCASTI32X4Z256rm, X86::VPBROADCASTQZ256rm,
X86::VPBROADCASTDZ256rm,
HasBWI ? X86::VPBROADCASTWZ256rm : 0,
HasBWI ? X86::VPBROADCASTBZ256rm : 0, 0, 0, 1);
case X86::VMOVDQA32Zrm:
case X86::VMOVDQA64Zrm:
case X86::VMOVDQU32Zrm:
case X86::VMOVDQU64Zrm:
return ConvertToBroadcast(X86::VBROADCASTI64X4rm, X86::VBROADCASTI32X4rm,
X86::VPBROADCASTQZrm, X86::VPBROADCASTDZrm,
HasBWI ? X86::VPBROADCASTWZrm : 0,
HasBWI ? X86::VPBROADCASTBZrm : 0, 1);
return FixupConstant(X86::VBROADCASTI64X4rm, X86::VBROADCASTI32X4rm,
X86::VPBROADCASTQZrm, X86::VPBROADCASTDZrm,
HasBWI ? X86::VPBROADCASTWZrm : 0,
HasBWI ? X86::VPBROADCASTBZrm : 0, 0, 0, 1);
}

auto ConvertToBroadcastAVX512 = [&](unsigned OpSrc32, unsigned OpSrc64) {
@@ -368,7 +423,7 @@ bool X86FixupVectorConstantsPass::processInstruction(MachineFunction &MF,

if (OpBcst32 || OpBcst64) {
unsigned OpNo = OpBcst32 == 0 ? OpNoBcst64 : OpNoBcst32;
return ConvertToBroadcast(0, 0, OpBcst64, OpBcst32, 0, 0, OpNo);
return FixupConstant(0, 0, OpBcst64, OpBcst32, 0, 0, 0, 0, OpNo);
}
return false;
};
2 changes: 1 addition & 1 deletion llvm/test/CodeGen/X86/2011-20-21-zext-ui2fp.ll
@@ -7,7 +7,7 @@
define void @ui_to_fp_conv(ptr nocapture %aFOO, ptr nocapture %RET) nounwind {
; CHECK-LABEL: ui_to_fp_conv:
; CHECK: # %bb.0: # %allocas
; CHECK-NEXT: movaps {{.*#+}} xmm0 = [1.0E+0,1.0E+0,0.0E+0,0.0E+0]
; CHECK-NEXT: movsd {{.*#+}} xmm0 = [1.0E+0,1.0E+0,0.0E+0,0.0E+0]
; CHECK-NEXT: xorps %xmm1, %xmm1
; CHECK-NEXT: movups %xmm1, 16(%rsi)
; CHECK-NEXT: movups %xmm0, (%rsi)
5 changes: 2 additions & 3 deletions llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast.ll
@@ -1053,7 +1053,7 @@ define void @vec256_i8_widen_to_i16_factor2_broadcast_to_v16i16_factor16(ptr %in
; SSE42-NEXT: paddb 48(%rsi), %xmm2
; SSE42-NEXT: paddb (%rsi), %xmm0
; SSE42-NEXT: paddb 32(%rsi), %xmm1
; SSE42-NEXT: movdqa {{.*#+}} xmm3 = [1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u]
; SSE42-NEXT: movq {{.*#+}} xmm3 = [1,3,5,7,9,11,13,15,0,0,0,0,0,0,0,0]
; SSE42-NEXT: pshufb %xmm3, %xmm1
; SSE42-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE42-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
@@ -1075,8 +1075,7 @@ define void @vec256_i8_widen_to_i16_factor2_broadcast_to_v16i16_factor16(ptr %in
; AVX-NEXT: vpaddb 48(%rsi), %xmm2, %xmm2
; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0
; AVX-NEXT: vpaddb 32(%rsi), %xmm1, %xmm1
; AVX-NEXT: vmovddup {{.*#+}} xmm3 = [1,3,5,7,9,11,13,15,1,3,5,7,9,11,13,15]
; AVX-NEXT: # xmm3 = mem[0,0]
; AVX-NEXT: vmovq {{.*#+}} xmm3 = [1,3,5,7,9,11,13,15,0,0,0,0,0,0,0,0]
; AVX-NEXT: vpshufb %xmm3, %xmm1, %xmm1
; AVX-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
@@ -875,7 +875,7 @@ define void @vec256_i8_widen_to_i16_factor2_broadcast_to_v16i16_factor16(ptr %in
; SSE42-NEXT: movdqa (%rdi), %xmm0
; SSE42-NEXT: movdqa 32(%rdi), %xmm1
; SSE42-NEXT: movdqa 48(%rdi), %xmm2
; SSE42-NEXT: movdqa {{.*#+}} xmm3 = [1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u]
; SSE42-NEXT: movq {{.*#+}} xmm3 = [1,3,5,7,9,11,13,15,0,0,0,0,0,0,0,0]
; SSE42-NEXT: pshufb %xmm3, %xmm1
; SSE42-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE42-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
@@ -894,8 +894,7 @@ define void @vec256_i8_widen_to_i16_factor2_broadcast_to_v16i16_factor16(ptr %in
; AVX-NEXT: vmovdqa (%rdi), %xmm0
; AVX-NEXT: vmovdqa 32(%rdi), %xmm1
; AVX-NEXT: vmovdqa 48(%rdi), %xmm2
; AVX-NEXT: vmovddup {{.*#+}} xmm3 = [1,3,5,7,9,11,13,15,1,3,5,7,9,11,13,15]
; AVX-NEXT: # xmm3 = mem[0,0]
; AVX-NEXT: vmovq {{.*#+}} xmm3 = [1,3,5,7,9,11,13,15,0,0,0,0,0,0,0,0]
; AVX-NEXT: vpshufb %xmm3, %xmm1, %xmm1
; AVX-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
2 changes: 1 addition & 1 deletion llvm/test/CodeGen/X86/avx-load-store.ll
@@ -220,7 +220,7 @@ define void @f_f() nounwind {
; CHECK-NEXT: testb %al, %al
; CHECK-NEXT: jne .LBB9_4
; CHECK-NEXT: # %bb.3: # %cif_mixed_test_all
; CHECK-NEXT: vmovaps {{.*#+}} xmm0 = [4294967295,0,0,0]
; CHECK-NEXT: vmovss {{.*#+}} xmm0 = [4294967295,0,0,0]
; CHECK-NEXT: vmaskmovps %ymm0, %ymm0, (%rax)
; CHECK-NEXT: .LBB9_4: # %cif_mixed_test_any_check
;
2 changes: 1 addition & 1 deletion llvm/test/CodeGen/X86/avx2-arith.ll
@@ -234,7 +234,7 @@ define <8 x i16> @mul_const8(<8 x i16> %x) {
define <8 x i32> @mul_const9(<8 x i32> %x) {
; CHECK-LABEL: mul_const9:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovdqa {{.*#+}} xmm1 = [2,0,0,0]
; CHECK-NEXT: vmovd {{.*#+}} xmm1 = [2,0,0,0]
; CHECK-NEXT: vpmulld %ymm1, %ymm0, %ymm0
; CHECK-NEXT: ret{{[l|q]}}
%y = mul <8 x i32> %x, <i32 2, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
@@ -13,7 +13,7 @@ define <2 x bfloat> @shuffle_chained_v32bf16_v2bf16(<32 x bfloat> %a) {
; CHECK-NEXT: .cfi_def_cfa_register %rbp
; CHECK-NEXT: andq $-64, %rsp
; CHECK-NEXT: subq $128, %rsp
; CHECK-NEXT: vpbroadcastd {{.*#+}} xmm1 = [0,16,0,16,0,16,0,16]
; CHECK-NEXT: vmovd {{.*#+}} xmm1 = [0,16,0,0,0,0,0,0]
; CHECK-NEXT: vpermw %zmm0, %zmm1, %zmm0
; CHECK-NEXT: vmovdqa64 %zmm0, (%rsp)
; CHECK-NEXT: vmovaps (%rsp), %xmm0
6 changes: 3 additions & 3 deletions llvm/test/CodeGen/X86/bitreverse.ll
@@ -587,17 +587,17 @@ define <2 x i16> @fold_v2i16() {
;
; X64-LABEL: fold_v2i16:
; X64: # %bb.0:
; X64-NEXT: movaps {{.*#+}} xmm0 = [61440,240,u,u,u,u,u,u]
; X64-NEXT: movss {{.*#+}} xmm0 = [61440,240,0,0,0,0,0,0]
; X64-NEXT: retq
;
; X86XOP-LABEL: fold_v2i16:
; X86XOP: # %bb.0:
; X86XOP-NEXT: vbroadcastss {{.*#+}} xmm0 = [61440,240,61440,240,61440,240,61440,240]
; X86XOP-NEXT: vmovss {{.*#+}} xmm0 = [61440,240,0,0,0,0,0,0]
; X86XOP-NEXT: retl
;
; GFNI-LABEL: fold_v2i16:
; GFNI: # %bb.0:
; GFNI-NEXT: vbroadcastss {{.*#+}} xmm0 = [61440,240,61440,240,61440,240,61440,240]
; GFNI-NEXT: vmovss {{.*#+}} xmm0 = [61440,240,0,0,0,0,0,0]
; GFNI-NEXT: retq
%b = call <2 x i16> @llvm.bitreverse.v2i16(<2 x i16> <i16 15, i16 3840>)
ret <2 x i16> %b
4 changes: 2 additions & 2 deletions llvm/test/CodeGen/X86/combine-srl.ll
@@ -356,7 +356,7 @@ define <4 x i32> @combine_vec_lshr_lzcnt_bit1(<4 x i32> %x) {
; SSE-LABEL: combine_vec_lshr_lzcnt_bit1:
; SSE: # %bb.0:
; SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE-NEXT: movdqa {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
; SSE-NEXT: movq {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
; SSE-NEXT: movdqa %xmm1, %xmm2
; SSE-NEXT: pshufb %xmm0, %xmm2
; SSE-NEXT: psrlw $4, %xmm0
@@ -378,7 +378,7 @@ define <4 x i32> @combine_vec_lshr_lzcnt_bit1(<4 x i32> %x) {
; AVX-LABEL: combine_vec_lshr_lzcnt_bit1:
; AVX: # %bb.0:
; AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
; AVX-NEXT: vmovq {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
; AVX-NEXT: vpshufb %xmm0, %xmm1, %xmm2
; AVX-NEXT: vpsrlw $4, %xmm0, %xmm0
; AVX-NEXT: vpxor %xmm3, %xmm3, %xmm3
4 changes: 2 additions & 2 deletions llvm/test/CodeGen/X86/combine-subo.ll
@@ -217,13 +217,13 @@ define { <4 x i8>, <4 x i1> } @always_usub_const_vector() nounwind {
define { <4 x i8>, <4 x i1> } @never_usub_const_vector() nounwind {
; SSE-LABEL: never_usub_const_vector:
; SSE: # %bb.0:
; SSE-NEXT: movaps {{.*#+}} xmm0 = [127,255,0,254,u,u,u,u,u,u,u,u,u,u,u,u]
; SSE-NEXT: movss {{.*#+}} xmm0 = [127,255,0,254,0,0,0,0,0,0,0,0,0,0,0,0]
; SSE-NEXT: xorps %xmm1, %xmm1
; SSE-NEXT: retq
;
; AVX-LABEL: never_usub_const_vector:
; AVX: # %bb.0:
; AVX-NEXT: vbroadcastss {{.*#+}} xmm0 = [127,255,0,254,127,255,0,254,127,255,0,254,127,255,0,254]
; AVX-NEXT: vmovss {{.*#+}} xmm0 = [127,255,0,254,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
; AVX-NEXT: retq
%x = call { <4 x i8>, <4 x i1> } @llvm.usub.with.overflow.v4i8(<4 x i8> <i8 255, i8 255, i8 255, i8 255>, <4 x i8> <i8 128, i8 0, i8 255, i8 1>)