[X86] X86FixupVectorConstants - shrink vector load to movsd/movss/movd/movq 'zero upper' instructions #79000

Merged 1 commit on Jan 24, 2024
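This patch teaches the X86FixupVectorConstants pass to shrink full-width vector constant loads whose upper elements are all zero (or undef) into scalar 'zero upper' loads (movss/movsd/movd/movq), shrinking the constant-pool entry to match. A minimal before/after sketch of the effect, based on the test updates in this patch (the constant-pool label is illustrative):

    movaps .LCPI0_0(%rip), %xmm0    # xmm0 = [1.0E+0,1.0E+0,0.0E+0,0.0E+0]  (before: 16-byte pool entry)
    movsd  .LCPI0_0(%rip), %xmm0    # xmm0 = [1.0E+0,1.0E+0,0.0E+0,0.0E+0]  (after: 8-byte pool entry; upper 64 bits zeroed by the load)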
177 changes: 116 additions & 61 deletions llvm/lib/Target/X86/X86FixupVectorConstants.cpp
@@ -67,6 +67,9 @@ FunctionPass *llvm::createX86FixupVectorConstants() {
static std::optional<APInt> extractConstantBits(const Constant *C) {
unsigned NumBits = C->getType()->getPrimitiveSizeInBits();

if (auto *CUndef = dyn_cast<UndefValue>(C))
return APInt::getZero(NumBits);

if (auto *CInt = dyn_cast<ConstantInt>(C))
return CInt->getValue();

@@ -80,6 +83,18 @@ static std::optional<APInt> extractConstantBits(const Constant *C) {
return APInt::getSplat(NumBits, *Bits);
}
}

APInt Bits = APInt::getZero(NumBits);
for (unsigned I = 0, E = CV->getNumOperands(); I != E; ++I) {
Constant *Elt = CV->getOperand(I);
std::optional<APInt> SubBits = extractConstantBits(Elt);
if (!SubBits)
return std::nullopt;
assert(NumBits == (E * SubBits->getBitWidth()) &&
"Illegal vector element size");
Bits.insertBits(*SubBits, I * SubBits->getBitWidth());
}
return Bits;
}

if (auto *CDS = dyn_cast<ConstantDataSequential>(C)) {
@@ -223,6 +238,35 @@ static Constant *rebuildSplatableConstant(const Constant *C,
return rebuildConstant(OriginalType->getContext(), SclTy, *Splat, NumSclBits);
}

static Constant *rebuildZeroUpperConstant(const Constant *C,
unsigned ScalarBitWidth) {
Type *Ty = C->getType();
Type *SclTy = Ty->getScalarType();
unsigned NumBits = Ty->getPrimitiveSizeInBits();
unsigned NumSclBits = SclTy->getPrimitiveSizeInBits();
LLVMContext &Ctx = C->getContext();

if (NumBits > ScalarBitWidth) {
// Determine if the upper bits are all zero.
if (std::optional<APInt> Bits = extractConstantBits(C)) {
if (Bits->countLeadingZeros() >= (NumBits - ScalarBitWidth)) {
// If the original constant was made of smaller elements, try to retain
// those types.
if (ScalarBitWidth > NumSclBits && (ScalarBitWidth % NumSclBits) == 0)
return rebuildConstant(Ctx, SclTy, *Bits, NumSclBits);

// Fallback to raw integer bits.
APInt RawBits = Bits->zextOrTrunc(ScalarBitWidth);
return ConstantInt::get(Ctx, RawBits);
}
}
}

return nullptr;
}

typedef std::function<Constant *(const Constant *, unsigned)> RebuildFn;

bool X86FixupVectorConstantsPass::processInstruction(MachineFunction &MF,
MachineBasicBlock &MBB,
MachineInstr &MI) {
@@ -233,117 +277,128 @@ bool X86FixupVectorConstantsPass::processInstruction(MachineFunction &MF,
bool HasBWI = ST->hasBWI();
bool HasVLX = ST->hasVLX();

auto ConvertToBroadcast = [&](unsigned OpBcst256, unsigned OpBcst128,
unsigned OpBcst64, unsigned OpBcst32,
unsigned OpBcst16, unsigned OpBcst8,
unsigned OperandNo) {
assert(MI.getNumOperands() >= (OperandNo + X86::AddrNumOperands) &&
"Unexpected number of operands!");

if (auto *C = X86::getConstantFromPool(MI, OperandNo)) {
// Attempt to detect a suitable splat from increasing splat widths.
std::pair<unsigned, unsigned> Broadcasts[] = {
{8, OpBcst8}, {16, OpBcst16}, {32, OpBcst32},
{64, OpBcst64}, {128, OpBcst128}, {256, OpBcst256},
};
for (auto [BitWidth, OpBcst] : Broadcasts) {
if (OpBcst) {
// Construct a suitable splat constant and adjust the MI to
// use the new constant pool entry.
if (Constant *NewCst = rebuildSplatableConstant(C, BitWidth)) {
unsigned NewCPI =
CP->getConstantPoolIndex(NewCst, Align(BitWidth / 8));
MI.setDesc(TII->get(OpBcst));
MI.getOperand(OperandNo + X86::AddrDisp).setIndex(NewCPI);
return true;
auto FixupConstant =
[&](unsigned OpBcst256, unsigned OpBcst128, unsigned OpBcst64,
unsigned OpBcst32, unsigned OpBcst16, unsigned OpBcst8,
unsigned OpUpper64, unsigned OpUpper32, unsigned OperandNo) {
assert(MI.getNumOperands() >= (OperandNo + X86::AddrNumOperands) &&
"Unexpected number of operands!");

if (auto *C = X86::getConstantFromPool(MI, OperandNo)) {
// Attempt to detect a suitable splat/vzload from increasing constant
// bitwidths.
// Prefer vzload vs broadcast for same bitwidth to avoid domain flips.
std::tuple<unsigned, unsigned, RebuildFn> FixupLoad[] = {
{8, OpBcst8, rebuildSplatableConstant},
{16, OpBcst16, rebuildSplatableConstant},
{32, OpUpper32, rebuildZeroUpperConstant},
{32, OpBcst32, rebuildSplatableConstant},
{64, OpUpper64, rebuildZeroUpperConstant},
{64, OpBcst64, rebuildSplatableConstant},
{128, OpBcst128, rebuildSplatableConstant},
{256, OpBcst256, rebuildSplatableConstant},
};
for (auto [BitWidth, Op, RebuildConstant] : FixupLoad) {
if (Op) {
// Construct a suitable constant and adjust the MI to use the new
// constant pool entry.
if (Constant *NewCst = RebuildConstant(C, BitWidth)) {
unsigned NewCPI =
CP->getConstantPoolIndex(NewCst, Align(BitWidth / 8));
MI.setDesc(TII->get(Op));
MI.getOperand(OperandNo + X86::AddrDisp).setIndex(NewCPI);
return true;
}
}
}
}
}
}
return false;
};
return false;
};

// Attempt to convert full width vector loads into broadcast loads.
// Attempt to convert full width vector loads into broadcast/vzload loads.
switch (Opc) {
/* FP Loads */
case X86::MOVAPDrm:
case X86::MOVAPSrm:
case X86::MOVUPDrm:
case X86::MOVUPSrm:
// TODO: SSE3 MOVDDUP Handling
return false;
return FixupConstant(0, 0, 0, 0, 0, 0, X86::MOVSDrm, X86::MOVSSrm, 1);
case X86::VMOVAPDrm:
case X86::VMOVAPSrm:
case X86::VMOVUPDrm:
case X86::VMOVUPSrm:
return ConvertToBroadcast(0, 0, X86::VMOVDDUPrm, X86::VBROADCASTSSrm, 0, 0,
1);
return FixupConstant(0, 0, X86::VMOVDDUPrm, X86::VBROADCASTSSrm, 0, 0,
X86::VMOVSDrm, X86::VMOVSSrm, 1);
case X86::VMOVAPDYrm:
case X86::VMOVAPSYrm:
case X86::VMOVUPDYrm:
case X86::VMOVUPSYrm:
return ConvertToBroadcast(0, X86::VBROADCASTF128rm, X86::VBROADCASTSDYrm,
X86::VBROADCASTSSYrm, 0, 0, 1);
return FixupConstant(0, X86::VBROADCASTF128rm, X86::VBROADCASTSDYrm,
X86::VBROADCASTSSYrm, 0, 0, 0, 0, 1);
case X86::VMOVAPDZ128rm:
case X86::VMOVAPSZ128rm:
case X86::VMOVUPDZ128rm:
case X86::VMOVUPSZ128rm:
return ConvertToBroadcast(0, 0, X86::VMOVDDUPZ128rm,
X86::VBROADCASTSSZ128rm, 0, 0, 1);
return FixupConstant(0, 0, X86::VMOVDDUPZ128rm, X86::VBROADCASTSSZ128rm, 0,
0, X86::VMOVSDZrm, X86::VMOVSSZrm, 1);
case X86::VMOVAPDZ256rm:
case X86::VMOVAPSZ256rm:
case X86::VMOVUPDZ256rm:
case X86::VMOVUPSZ256rm:
return ConvertToBroadcast(0, X86::VBROADCASTF32X4Z256rm,
X86::VBROADCASTSDZ256rm, X86::VBROADCASTSSZ256rm,
0, 0, 1);
return FixupConstant(0, X86::VBROADCASTF32X4Z256rm, X86::VBROADCASTSDZ256rm,
X86::VBROADCASTSSZ256rm, 0, 0, 0, 0, 1);
case X86::VMOVAPDZrm:
case X86::VMOVAPSZrm:
case X86::VMOVUPDZrm:
case X86::VMOVUPSZrm:
return ConvertToBroadcast(X86::VBROADCASTF64X4rm, X86::VBROADCASTF32X4rm,
X86::VBROADCASTSDZrm, X86::VBROADCASTSSZrm, 0, 0,
1);
return FixupConstant(X86::VBROADCASTF64X4rm, X86::VBROADCASTF32X4rm,
X86::VBROADCASTSDZrm, X86::VBROADCASTSSZrm, 0, 0, 0, 0,
1);
/* Integer Loads */
case X86::MOVDQArm:
case X86::MOVDQUrm:
return FixupConstant(0, 0, 0, 0, 0, 0, X86::MOVQI2PQIrm, X86::MOVDI2PDIrm,
1);
case X86::VMOVDQArm:
case X86::VMOVDQUrm:
return ConvertToBroadcast(
0, 0, HasAVX2 ? X86::VPBROADCASTQrm : X86::VMOVDDUPrm,
HasAVX2 ? X86::VPBROADCASTDrm : X86::VBROADCASTSSrm,
HasAVX2 ? X86::VPBROADCASTWrm : 0, HasAVX2 ? X86::VPBROADCASTBrm : 0,
1);
return FixupConstant(0, 0, HasAVX2 ? X86::VPBROADCASTQrm : X86::VMOVDDUPrm,
HasAVX2 ? X86::VPBROADCASTDrm : X86::VBROADCASTSSrm,
HasAVX2 ? X86::VPBROADCASTWrm : 0,
HasAVX2 ? X86::VPBROADCASTBrm : 0, X86::VMOVQI2PQIrm,
X86::VMOVDI2PDIrm, 1);
case X86::VMOVDQAYrm:
case X86::VMOVDQUYrm:
return ConvertToBroadcast(
return FixupConstant(
0, HasAVX2 ? X86::VBROADCASTI128rm : X86::VBROADCASTF128rm,
HasAVX2 ? X86::VPBROADCASTQYrm : X86::VBROADCASTSDYrm,
HasAVX2 ? X86::VPBROADCASTDYrm : X86::VBROADCASTSSYrm,
HasAVX2 ? X86::VPBROADCASTWYrm : 0, HasAVX2 ? X86::VPBROADCASTBYrm : 0,
1);
0, 0, 1);
case X86::VMOVDQA32Z128rm:
case X86::VMOVDQA64Z128rm:
case X86::VMOVDQU32Z128rm:
case X86::VMOVDQU64Z128rm:
return ConvertToBroadcast(0, 0, X86::VPBROADCASTQZ128rm,
X86::VPBROADCASTDZ128rm,
HasBWI ? X86::VPBROADCASTWZ128rm : 0,
HasBWI ? X86::VPBROADCASTBZ128rm : 0, 1);
return FixupConstant(0, 0, X86::VPBROADCASTQZ128rm, X86::VPBROADCASTDZ128rm,
HasBWI ? X86::VPBROADCASTWZ128rm : 0,
HasBWI ? X86::VPBROADCASTBZ128rm : 0,
X86::VMOVQI2PQIZrm, X86::VMOVDI2PDIZrm, 1);
case X86::VMOVDQA32Z256rm:
case X86::VMOVDQA64Z256rm:
case X86::VMOVDQU32Z256rm:
case X86::VMOVDQU64Z256rm:
return ConvertToBroadcast(0, X86::VBROADCASTI32X4Z256rm,
X86::VPBROADCASTQZ256rm, X86::VPBROADCASTDZ256rm,
HasBWI ? X86::VPBROADCASTWZ256rm : 0,
HasBWI ? X86::VPBROADCASTBZ256rm : 0, 1);
return FixupConstant(0, X86::VBROADCASTI32X4Z256rm, X86::VPBROADCASTQZ256rm,
X86::VPBROADCASTDZ256rm,
HasBWI ? X86::VPBROADCASTWZ256rm : 0,
HasBWI ? X86::VPBROADCASTBZ256rm : 0, 0, 0, 1);
case X86::VMOVDQA32Zrm:
case X86::VMOVDQA64Zrm:
case X86::VMOVDQU32Zrm:
case X86::VMOVDQU64Zrm:
return ConvertToBroadcast(X86::VBROADCASTI64X4rm, X86::VBROADCASTI32X4rm,
X86::VPBROADCASTQZrm, X86::VPBROADCASTDZrm,
HasBWI ? X86::VPBROADCASTWZrm : 0,
HasBWI ? X86::VPBROADCASTBZrm : 0, 1);
return FixupConstant(X86::VBROADCASTI64X4rm, X86::VBROADCASTI32X4rm,
X86::VPBROADCASTQZrm, X86::VPBROADCASTDZrm,
HasBWI ? X86::VPBROADCASTWZrm : 0,
HasBWI ? X86::VPBROADCASTBZrm : 0, 0, 0, 1);
}

auto ConvertToBroadcastAVX512 = [&](unsigned OpSrc32, unsigned OpSrc64) {
@@ -368,7 +423,7 @@ bool X86FixupVectorConstantsPass::processInstruction(MachineFunction &MF,

if (OpBcst32 || OpBcst64) {
unsigned OpNo = OpBcst32 == 0 ? OpNoBcst64 : OpNoBcst32;
return ConvertToBroadcast(0, 0, OpBcst64, OpBcst32, 0, 0, OpNo);
return FixupConstant(0, 0, OpBcst64, OpBcst32, 0, 0, 0, 0, OpNo);
}
return false;
};
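The codegen test updates below show the effect across SSE, AVX and AVX-512. As a further sketch (labels illustrative), an integer constant with only its low element set is now loaded with a vzload instead of a full-width or broadcast load, with undef upper elements counting as zero for this purpose:

    vmovdqa .LCPI0_0(%rip), %xmm1   # xmm1 = [2,0,0,0]  (before: 16-byte pool entry)
    vmovd   .LCPI0_0(%rip), %xmm1   # xmm1 = [2,0,0,0]  (after: 4-byte pool entry; upper 96 bits zeroed by the load)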
2 changes: 1 addition & 1 deletion llvm/test/CodeGen/X86/2011-20-21-zext-ui2fp.ll
@@ -7,7 +7,7 @@
define void @ui_to_fp_conv(ptr nocapture %aFOO, ptr nocapture %RET) nounwind {
; CHECK-LABEL: ui_to_fp_conv:
; CHECK: # %bb.0: # %allocas
; CHECK-NEXT: movaps {{.*#+}} xmm0 = [1.0E+0,1.0E+0,0.0E+0,0.0E+0]
; CHECK-NEXT: movsd {{.*#+}} xmm0 = [1.0E+0,1.0E+0,0.0E+0,0.0E+0]
; CHECK-NEXT: xorps %xmm1, %xmm1
; CHECK-NEXT: movups %xmm1, 16(%rsi)
; CHECK-NEXT: movups %xmm0, (%rsi)
5 changes: 2 additions & 3 deletions llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast.ll
@@ -1053,7 +1053,7 @@ define void @vec256_i8_widen_to_i16_factor2_broadcast_to_v16i16_factor16(ptr %in
; SSE42-NEXT: paddb 48(%rsi), %xmm2
; SSE42-NEXT: paddb (%rsi), %xmm0
; SSE42-NEXT: paddb 32(%rsi), %xmm1
; SSE42-NEXT: movdqa {{.*#+}} xmm3 = [1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u]
; SSE42-NEXT: movq {{.*#+}} xmm3 = [1,3,5,7,9,11,13,15,0,0,0,0,0,0,0,0]
; SSE42-NEXT: pshufb %xmm3, %xmm1
; SSE42-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE42-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
@@ -1075,8 +1075,7 @@ define void @vec256_i8_widen_to_i16_factor2_broadcast_to_v16i16_factor16(ptr %in
; AVX-NEXT: vpaddb 48(%rsi), %xmm2, %xmm2
; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0
; AVX-NEXT: vpaddb 32(%rsi), %xmm1, %xmm1
; AVX-NEXT: vmovddup {{.*#+}} xmm3 = [1,3,5,7,9,11,13,15,1,3,5,7,9,11,13,15]
; AVX-NEXT: # xmm3 = mem[0,0]
; AVX-NEXT: vmovq {{.*#+}} xmm3 = [1,3,5,7,9,11,13,15,0,0,0,0,0,0,0,0]
; AVX-NEXT: vpshufb %xmm3, %xmm1, %xmm1
; AVX-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
@@ -875,7 +875,7 @@ define void @vec256_i8_widen_to_i16_factor2_broadcast_to_v16i16_factor16(ptr %in
; SSE42-NEXT: movdqa (%rdi), %xmm0
; SSE42-NEXT: movdqa 32(%rdi), %xmm1
; SSE42-NEXT: movdqa 48(%rdi), %xmm2
; SSE42-NEXT: movdqa {{.*#+}} xmm3 = [1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u]
; SSE42-NEXT: movq {{.*#+}} xmm3 = [1,3,5,7,9,11,13,15,0,0,0,0,0,0,0,0]
; SSE42-NEXT: pshufb %xmm3, %xmm1
; SSE42-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE42-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
@@ -894,8 +894,7 @@ define void @vec256_i8_widen_to_i16_factor2_broadcast_to_v16i16_factor16(ptr %in
; AVX-NEXT: vmovdqa (%rdi), %xmm0
; AVX-NEXT: vmovdqa 32(%rdi), %xmm1
; AVX-NEXT: vmovdqa 48(%rdi), %xmm2
; AVX-NEXT: vmovddup {{.*#+}} xmm3 = [1,3,5,7,9,11,13,15,1,3,5,7,9,11,13,15]
; AVX-NEXT: # xmm3 = mem[0,0]
; AVX-NEXT: vmovq {{.*#+}} xmm3 = [1,3,5,7,9,11,13,15,0,0,0,0,0,0,0,0]
; AVX-NEXT: vpshufb %xmm3, %xmm1, %xmm1
; AVX-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
2 changes: 1 addition & 1 deletion llvm/test/CodeGen/X86/avx-load-store.ll
@@ -220,7 +220,7 @@ define void @f_f() nounwind {
; CHECK-NEXT: testb %al, %al
; CHECK-NEXT: jne .LBB9_4
; CHECK-NEXT: # %bb.3: # %cif_mixed_test_all
; CHECK-NEXT: vmovaps {{.*#+}} xmm0 = [4294967295,0,0,0]
; CHECK-NEXT: vmovss {{.*#+}} xmm0 = [4294967295,0,0,0]
; CHECK-NEXT: vmaskmovps %ymm0, %ymm0, (%rax)
; CHECK-NEXT: .LBB9_4: # %cif_mixed_test_any_check
;
2 changes: 1 addition & 1 deletion llvm/test/CodeGen/X86/avx2-arith.ll
@@ -234,7 +234,7 @@ define <8 x i16> @mul_const8(<8 x i16> %x) {
define <8 x i32> @mul_const9(<8 x i32> %x) {
; CHECK-LABEL: mul_const9:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovdqa {{.*#+}} xmm1 = [2,0,0,0]
; CHECK-NEXT: vmovd {{.*#+}} xmm1 = [2,0,0,0]
; CHECK-NEXT: vpmulld %ymm1, %ymm0, %ymm0
; CHECK-NEXT: ret{{[l|q]}}
%y = mul <8 x i32> %x, <i32 2, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
@@ -13,7 +13,7 @@ define <2 x bfloat> @shuffle_chained_v32bf16_v2bf16(<32 x bfloat> %a) {
; CHECK-NEXT: .cfi_def_cfa_register %rbp
; CHECK-NEXT: andq $-64, %rsp
; CHECK-NEXT: subq $128, %rsp
; CHECK-NEXT: vpbroadcastd {{.*#+}} xmm1 = [0,16,0,16,0,16,0,16]
; CHECK-NEXT: vmovd {{.*#+}} xmm1 = [0,16,0,0,0,0,0,0]
; CHECK-NEXT: vpermw %zmm0, %zmm1, %zmm0
; CHECK-NEXT: vmovdqa64 %zmm0, (%rsp)
; CHECK-NEXT: vmovaps (%rsp), %xmm0
6 changes: 3 additions & 3 deletions llvm/test/CodeGen/X86/bitreverse.ll
@@ -587,17 +587,17 @@ define <2 x i16> @fold_v2i16() {
;
; X64-LABEL: fold_v2i16:
; X64: # %bb.0:
; X64-NEXT: movaps {{.*#+}} xmm0 = [61440,240,u,u,u,u,u,u]
; X64-NEXT: movss {{.*#+}} xmm0 = [61440,240,0,0,0,0,0,0]
; X64-NEXT: retq
;
; X86XOP-LABEL: fold_v2i16:
; X86XOP: # %bb.0:
; X86XOP-NEXT: vbroadcastss {{.*#+}} xmm0 = [61440,240,61440,240,61440,240,61440,240]
; X86XOP-NEXT: vmovss {{.*#+}} xmm0 = [61440,240,0,0,0,0,0,0]
; X86XOP-NEXT: retl
;
; GFNI-LABEL: fold_v2i16:
; GFNI: # %bb.0:
; GFNI-NEXT: vbroadcastss {{.*#+}} xmm0 = [61440,240,61440,240,61440,240,61440,240]
; GFNI-NEXT: vmovss {{.*#+}} xmm0 = [61440,240,0,0,0,0,0,0]
; GFNI-NEXT: retq
%b = call <2 x i16> @llvm.bitreverse.v2i16(<2 x i16> <i16 15, i16 3840>)
ret <2 x i16> %b
4 changes: 2 additions & 2 deletions llvm/test/CodeGen/X86/combine-srl.ll
@@ -356,7 +356,7 @@ define <4 x i32> @combine_vec_lshr_lzcnt_bit1(<4 x i32> %x) {
; SSE-LABEL: combine_vec_lshr_lzcnt_bit1:
; SSE: # %bb.0:
; SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE-NEXT: movdqa {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
; SSE-NEXT: movq {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
; SSE-NEXT: movdqa %xmm1, %xmm2
; SSE-NEXT: pshufb %xmm0, %xmm2
; SSE-NEXT: psrlw $4, %xmm0
@@ -378,7 +378,7 @@ define <4 x i32> @combine_vec_lshr_lzcnt_bit1(<4 x i32> %x) {
; AVX-LABEL: combine_vec_lshr_lzcnt_bit1:
; AVX: # %bb.0:
; AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
; AVX-NEXT: vmovq {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
; AVX-NEXT: vpshufb %xmm0, %xmm1, %xmm2
; AVX-NEXT: vpsrlw $4, %xmm0, %xmm0
; AVX-NEXT: vpxor %xmm3, %xmm3, %xmm3
4 changes: 2 additions & 2 deletions llvm/test/CodeGen/X86/combine-subo.ll
@@ -217,13 +217,13 @@ define { <4 x i8>, <4 x i1> } @always_usub_const_vector() nounwind {
define { <4 x i8>, <4 x i1> } @never_usub_const_vector() nounwind {
; SSE-LABEL: never_usub_const_vector:
; SSE: # %bb.0:
; SSE-NEXT: movaps {{.*#+}} xmm0 = [127,255,0,254,u,u,u,u,u,u,u,u,u,u,u,u]
; SSE-NEXT: movss {{.*#+}} xmm0 = [127,255,0,254,0,0,0,0,0,0,0,0,0,0,0,0]
; SSE-NEXT: xorps %xmm1, %xmm1
; SSE-NEXT: retq
;
; AVX-LABEL: never_usub_const_vector:
; AVX: # %bb.0:
; AVX-NEXT: vbroadcastss {{.*#+}} xmm0 = [127,255,0,254,127,255,0,254,127,255,0,254,127,255,0,254]
; AVX-NEXT: vmovss {{.*#+}} xmm0 = [127,255,0,254,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
; AVX-NEXT: retq
%x = call { <4 x i8>, <4 x i1> } @llvm.usub.with.overflow.v4i8(<4 x i8> <i8 255, i8 255, i8 255, i8 255>, <4 x i8> <i8 128, i8 0, i8 255, i8 1>)