[X86] X86FixupVectorConstants - shrink vector load to movss/movsd/movd/movq 'zero upper' instructions #79000
Conversation
@llvm/pr-subscribers-backend-x86

Author: Simon Pilgrim (RKSimon)

Changes

If we're loading a vector constant that is known to be zero in the upper elements, then attempt to shrink the constant and just scalar load the lower 32/64 bits.

Patch is 137.24 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/79000.diff

55 Files Affected:
diff --git a/llvm/lib/Target/X86/X86FixupVectorConstants.cpp b/llvm/lib/Target/X86/X86FixupVectorConstants.cpp
index 9da8d7338ea36c..8467aff3376076 100644
--- a/llvm/lib/Target/X86/X86FixupVectorConstants.cpp
+++ b/llvm/lib/Target/X86/X86FixupVectorConstants.cpp
@@ -223,6 +223,33 @@ static Constant *rebuildSplatableConstant(const Constant *C,
return rebuildConstant(OriginalType->getContext(), SclTy, *Splat, NumSclBits);
}
+static Constant *rebuildZeroUpperConstant(const Constant *C,
+ unsigned ScalarBitWidth) {
+ Type *Ty = C->getType();
+ Type *SclTy = Ty->getScalarType();
+ unsigned NumBits = Ty->getPrimitiveSizeInBits();
+ unsigned NumSclBits = SclTy->getPrimitiveSizeInBits();
+ LLVMContext &Ctx = C->getContext();
+
+ if (NumBits > ScalarBitWidth) {
+ // Determine if the upper bits are all zero.
+ if (std::optional<APInt> Bits = extractConstantBits(C)) {
+ if (Bits->countLeadingZeros() >= (NumBits - ScalarBitWidth)) {
+ // If the original constant was made of smaller elements, try to retain
+ // those types.
+ if (ScalarBitWidth > NumSclBits && (ScalarBitWidth % NumSclBits) == 0)
+ return rebuildConstant(Ctx, SclTy, *Bits, NumSclBits);
+
+ // Fallback to raw integer bits.
+ APInt RawBits = Bits->zextOrTrunc(ScalarBitWidth);
+ return ConstantInt::get(Ctx, RawBits);
+ }
+ }
+ }
+
+ return nullptr;
+}
+
bool X86FixupVectorConstantsPass::processInstruction(MachineFunction &MF,
MachineBasicBlock &MBB,
MachineInstr &MI) {
@@ -263,6 +290,34 @@ bool X86FixupVectorConstantsPass::processInstruction(MachineFunction &MF,
return false;
};
+ auto ConvertToZeroUpper = [&](unsigned OpUpper64, unsigned OpUpper32,
+ unsigned OperandNo) {
+ assert(MI.getNumOperands() >= (OperandNo + X86::AddrNumOperands) &&
+ "Unexpected number of operands!");
+
+ if (auto *C = X86::getConstantFromPool(MI, OperandNo)) {
+ // Attempt to detect a suitable splat from increasing splat widths.
+ std::pair<unsigned, unsigned> ZeroUppers[] = {
+ {32, OpUpper32},
+ {64, OpUpper64},
+ };
+ for (auto [BitWidth, OpUpper] : ZeroUppers) {
+ if (OpUpper) {
+ // Construct a suitable splat constant and adjust the MI to
+ // use the new constant pool entry.
+ if (Constant *NewCst = rebuildZeroUpperConstant(C, BitWidth)) {
+ unsigned NewCPI =
+ CP->getConstantPoolIndex(NewCst, Align(BitWidth / 8));
+ MI.setDesc(TII->get(OpUpper));
+ MI.getOperand(OperandNo + X86::AddrDisp).setIndex(NewCPI);
+ return true;
+ }
+ }
+ }
+ }
+ return false;
+ };
+
// Attempt to convert full width vector loads into broadcast loads.
switch (Opc) {
/* FP Loads */
@@ -271,13 +326,14 @@ bool X86FixupVectorConstantsPass::processInstruction(MachineFunction &MF,
case X86::MOVUPDrm:
case X86::MOVUPSrm:
// TODO: SSE3 MOVDDUP Handling
- return false;
+ return ConvertToZeroUpper(X86::MOVSDrm, X86::MOVSSrm, 1);
case X86::VMOVAPDrm:
case X86::VMOVAPSrm:
case X86::VMOVUPDrm:
case X86::VMOVUPSrm:
return ConvertToBroadcast(0, 0, X86::VMOVDDUPrm, X86::VBROADCASTSSrm, 0, 0,
- 1);
+ 1) ||
+ ConvertToZeroUpper(X86::VMOVSDrm, X86::VMOVSSrm, 1);
case X86::VMOVAPDYrm:
case X86::VMOVAPSYrm:
case X86::VMOVUPDYrm:
@@ -289,7 +345,8 @@ bool X86FixupVectorConstantsPass::processInstruction(MachineFunction &MF,
case X86::VMOVUPDZ128rm:
case X86::VMOVUPSZ128rm:
return ConvertToBroadcast(0, 0, X86::VMOVDDUPZ128rm,
- X86::VBROADCASTSSZ128rm, 0, 0, 1);
+ X86::VBROADCASTSSZ128rm, 0, 0, 1) ||
+ ConvertToZeroUpper(X86::VMOVSDZrm, X86::VMOVSSZrm, 1);
case X86::VMOVAPDZ256rm:
case X86::VMOVAPSZ256rm:
case X86::VMOVUPDZ256rm:
@@ -305,13 +362,17 @@ bool X86FixupVectorConstantsPass::processInstruction(MachineFunction &MF,
X86::VBROADCASTSDZrm, X86::VBROADCASTSSZrm, 0, 0,
1);
/* Integer Loads */
+ case X86::MOVDQArm:
+ case X86::MOVDQUrm:
+ return ConvertToZeroUpper(X86::MOVQI2PQIrm, X86::MOVDI2PDIrm, 1);
case X86::VMOVDQArm:
case X86::VMOVDQUrm:
return ConvertToBroadcast(
- 0, 0, HasAVX2 ? X86::VPBROADCASTQrm : X86::VMOVDDUPrm,
- HasAVX2 ? X86::VPBROADCASTDrm : X86::VBROADCASTSSrm,
- HasAVX2 ? X86::VPBROADCASTWrm : 0, HasAVX2 ? X86::VPBROADCASTBrm : 0,
- 1);
+ 0, 0, HasAVX2 ? X86::VPBROADCASTQrm : X86::VMOVDDUPrm,
+ HasAVX2 ? X86::VPBROADCASTDrm : X86::VBROADCASTSSrm,
+ HasAVX2 ? X86::VPBROADCASTWrm : 0,
+ HasAVX2 ? X86::VPBROADCASTBrm : 0, 1) ||
+ ConvertToZeroUpper(X86::VMOVQI2PQIrm, X86::VMOVDI2PDIrm, 1);
case X86::VMOVDQAYrm:
case X86::VMOVDQUYrm:
return ConvertToBroadcast(
@@ -327,7 +388,8 @@ bool X86FixupVectorConstantsPass::processInstruction(MachineFunction &MF,
return ConvertToBroadcast(0, 0, X86::VPBROADCASTQZ128rm,
X86::VPBROADCASTDZ128rm,
HasBWI ? X86::VPBROADCASTWZ128rm : 0,
- HasBWI ? X86::VPBROADCASTBZ128rm : 0, 1);
+ HasBWI ? X86::VPBROADCASTBZ128rm : 0, 1) ||
+ ConvertToZeroUpper(X86::VMOVQI2PQIZrm, X86::VMOVDI2PDIZrm, 1);
case X86::VMOVDQA32Z256rm:
case X86::VMOVDQA64Z256rm:
case X86::VMOVDQU32Z256rm:
diff --git a/llvm/test/CodeGen/X86/2011-20-21-zext-ui2fp.ll b/llvm/test/CodeGen/X86/2011-20-21-zext-ui2fp.ll
index e4e5d51d272d61..1ae51ee9756388 100644
--- a/llvm/test/CodeGen/X86/2011-20-21-zext-ui2fp.ll
+++ b/llvm/test/CodeGen/X86/2011-20-21-zext-ui2fp.ll
@@ -7,7 +7,7 @@
define void @ui_to_fp_conv(ptr nocapture %aFOO, ptr nocapture %RET) nounwind {
; CHECK-LABEL: ui_to_fp_conv:
; CHECK: # %bb.0: # %allocas
-; CHECK-NEXT: movaps {{.*#+}} xmm0 = [1.0E+0,1.0E+0,0.0E+0,0.0E+0]
+; CHECK-NEXT: movsd {{.*#+}} xmm0 = [1.0E+0,1.0E+0,0.0E+0,0.0E+0]
; CHECK-NEXT: xorps %xmm1, %xmm1
; CHECK-NEXT: movups %xmm1, 16(%rsi)
; CHECK-NEXT: movups %xmm0, (%rsi)
diff --git a/llvm/test/CodeGen/X86/avx-load-store.ll b/llvm/test/CodeGen/X86/avx-load-store.ll
index 33eb704788740a..3f856d33145d86 100644
--- a/llvm/test/CodeGen/X86/avx-load-store.ll
+++ b/llvm/test/CodeGen/X86/avx-load-store.ll
@@ -220,7 +220,7 @@ define void @f_f() nounwind {
; CHECK-NEXT: testb %al, %al
; CHECK-NEXT: jne .LBB9_4
; CHECK-NEXT: # %bb.3: # %cif_mixed_test_all
-; CHECK-NEXT: vmovaps {{.*#+}} xmm0 = [4294967295,0,0,0]
+; CHECK-NEXT: vmovss {{.*#+}} xmm0 = [4294967295,0,0,0]
; CHECK-NEXT: vmaskmovps %ymm0, %ymm0, (%rax)
; CHECK-NEXT: .LBB9_4: # %cif_mixed_test_any_check
;
diff --git a/llvm/test/CodeGen/X86/avx2-arith.ll b/llvm/test/CodeGen/X86/avx2-arith.ll
index 3e695811719448..d32143cf33f2fa 100644
--- a/llvm/test/CodeGen/X86/avx2-arith.ll
+++ b/llvm/test/CodeGen/X86/avx2-arith.ll
@@ -234,7 +234,7 @@ define <8 x i16> @mul_const8(<8 x i16> %x) {
define <8 x i32> @mul_const9(<8 x i32> %x) {
; CHECK-LABEL: mul_const9:
; CHECK: # %bb.0:
-; CHECK-NEXT: vmovdqa {{.*#+}} xmm1 = [2,0,0,0]
+; CHECK-NEXT: vmovd {{.*#+}} xmm1 = [2,0,0,0]
; CHECK-NEXT: vpmulld %ymm1, %ymm0, %ymm0
; CHECK-NEXT: ret{{[l|q]}}
%y = mul <8 x i32> %x, <i32 2, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
diff --git a/llvm/test/CodeGen/X86/combine-srl.ll b/llvm/test/CodeGen/X86/combine-srl.ll
index 3e0f581ea01a0a..125196a0819b61 100644
--- a/llvm/test/CodeGen/X86/combine-srl.ll
+++ b/llvm/test/CodeGen/X86/combine-srl.ll
@@ -356,7 +356,7 @@ define <4 x i32> @combine_vec_lshr_lzcnt_bit1(<4 x i32> %x) {
; SSE-LABEL: combine_vec_lshr_lzcnt_bit1:
; SSE: # %bb.0:
; SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; SSE-NEXT: movdqa {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; SSE-NEXT: movq {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
; SSE-NEXT: movdqa %xmm1, %xmm2
; SSE-NEXT: pshufb %xmm0, %xmm2
; SSE-NEXT: psrlw $4, %xmm0
@@ -378,7 +378,7 @@ define <4 x i32> @combine_vec_lshr_lzcnt_bit1(<4 x i32> %x) {
; AVX-LABEL: combine_vec_lshr_lzcnt_bit1:
; AVX: # %bb.0:
; AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; AVX-NEXT: vmovq {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
; AVX-NEXT: vpshufb %xmm0, %xmm1, %xmm2
; AVX-NEXT: vpsrlw $4, %xmm0, %xmm0
; AVX-NEXT: vpxor %xmm3, %xmm3, %xmm3
diff --git a/llvm/test/CodeGen/X86/constant-pool-sharing.ll b/llvm/test/CodeGen/X86/constant-pool-sharing.ll
index db5a7810974c9d..062d87ed035fd6 100644
--- a/llvm/test/CodeGen/X86/constant-pool-sharing.ll
+++ b/llvm/test/CodeGen/X86/constant-pool-sharing.ll
@@ -77,7 +77,7 @@ define void @store_repeated_constants(ptr %lo, ptr %hi) {
; SSE-LINUX: # %bb.0:
; SSE-LINUX-NEXT: xorps %xmm0, %xmm0
; SSE-LINUX-NEXT: movaps %xmm0, 48(%rdi)
-; SSE-LINUX-NEXT: movaps {{.*#+}} xmm1 = [18446744073709551615,0]
+; SSE-LINUX-NEXT: movsd {{.*#+}} xmm1 = [18446744073709551615,0]
; SSE-LINUX-NEXT: movaps %xmm1, 32(%rdi)
; SSE-LINUX-NEXT: movaps %xmm1, 16(%rdi)
; SSE-LINUX-NEXT: movaps %xmm1, (%rdi)
@@ -92,7 +92,7 @@ define void @store_repeated_constants(ptr %lo, ptr %hi) {
; SSE-MSVC: # %bb.0:
; SSE-MSVC-NEXT: xorps %xmm0, %xmm0
; SSE-MSVC-NEXT: movaps %xmm0, 48(%rcx)
-; SSE-MSVC-NEXT: movaps {{.*#+}} xmm1 = [18446744073709551615,0]
+; SSE-MSVC-NEXT: movsd {{.*#+}} xmm1 = [18446744073709551615,0]
; SSE-MSVC-NEXT: movaps %xmm1, 32(%rcx)
; SSE-MSVC-NEXT: movaps %xmm1, 16(%rcx)
; SSE-MSVC-NEXT: movaps %xmm1, (%rcx)
diff --git a/llvm/test/CodeGen/X86/dpbusd.ll b/llvm/test/CodeGen/X86/dpbusd.ll
index 06f11e6d527bde..fbea08eb1e5502 100644
--- a/llvm/test/CodeGen/X86/dpbusd.ll
+++ b/llvm/test/CodeGen/X86/dpbusd.ll
@@ -379,7 +379,7 @@ define i32 @vpdpbusd_2xi32(ptr%a, ptr%b, i32 %c, i32 %n) {
; AVX512VNNI-LABEL: vpdpbusd_2xi32:
; AVX512VNNI: # %bb.0: # %entry
; AVX512VNNI-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
-; AVX512VNNI-NEXT: vmovdqa {{.*#+}} xmm1 = [65535,0,0,0]
+; AVX512VNNI-NEXT: vmovd {{.*#+}} xmm1 = [65535,0,0,0]
; AVX512VNNI-NEXT: vpandq %zmm1, %zmm0, %zmm0
; AVX512VNNI-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero
; AVX512VNNI-NEXT: vpandq %zmm1, %zmm2, %zmm1
diff --git a/llvm/test/CodeGen/X86/dpbusd_const.ll b/llvm/test/CodeGen/X86/dpbusd_const.ll
index 2fe9feb06dff67..5862e614265b1f 100644
--- a/llvm/test/CodeGen/X86/dpbusd_const.ll
+++ b/llvm/test/CodeGen/X86/dpbusd_const.ll
@@ -108,7 +108,7 @@ define i32 @mul_4xi8_cs(<4 x i8> %a, i32 %c) {
; AVXVNNI: # %bb.0: # %entry
; AVXVNNI-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVXVNNI-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7]
-; AVXVNNI-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,2,255,0,0,0,0,0,0,0,0,0,0,0,0]
+; AVXVNNI-NEXT: vmovd {{.*#+}} xmm2 = [0,1,2,255,0,0,0,0,0,0,0,0,0,0,0,0]
; AVXVNNI-NEXT: {vex} vpdpbusd %xmm0, %xmm2, %xmm1
; AVXVNNI-NEXT: vmovd %xmm1, %eax
; AVXVNNI-NEXT: addl %edi, %eax
@@ -118,7 +118,7 @@ define i32 @mul_4xi8_cs(<4 x i8> %a, i32 %c) {
; AVX512VNNI: # %bb.0: # %entry
; AVX512VNNI-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX512VNNI-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7]
-; AVX512VNNI-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,2,255,0,0,0,0,0,0,0,0,0,0,0,0]
+; AVX512VNNI-NEXT: vmovd {{.*#+}} xmm1 = [0,1,2,255,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX512VNNI-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX512VNNI-NEXT: vpdpbusd %zmm0, %zmm1, %zmm2
; AVX512VNNI-NEXT: vmovd %xmm2, %eax
@@ -130,7 +130,7 @@ define i32 @mul_4xi8_cs(<4 x i8> %a, i32 %c) {
; AVX512VLVNNI: # %bb.0: # %entry
; AVX512VLVNNI-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX512VLVNNI-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7]
-; AVX512VLVNNI-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,2,255,0,0,0,0,0,0,0,0,0,0,0,0]
+; AVX512VLVNNI-NEXT: vmovd {{.*#+}} xmm1 = [0,1,2,255,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX512VLVNNI-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX512VLVNNI-NEXT: vpdpbusd %xmm0, %xmm1, %xmm2
; AVX512VLVNNI-NEXT: vmovd %xmm2, %eax
diff --git a/llvm/test/CodeGen/X86/expand-vp-cast-intrinsics.ll b/llvm/test/CodeGen/X86/expand-vp-cast-intrinsics.ll
index 3b015acb69bd2e..0a52dfff71eda4 100644
--- a/llvm/test/CodeGen/X86/expand-vp-cast-intrinsics.ll
+++ b/llvm/test/CodeGen/X86/expand-vp-cast-intrinsics.ll
@@ -532,7 +532,7 @@ define <2 x half> @vfptrunc_v2f16_v2f64(<2 x double> %a, <2 x i1> %m, i32 zeroex
; AVX512-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill
; AVX512-NEXT: callq __truncdfhf2@PLT
; AVX512-NEXT: vpbroadcastw %xmm0, %xmm1
-; AVX512-NEXT: vmovaps {{.*#+}} xmm0 = [4,0,0,0]
+; AVX512-NEXT: vmovss {{.*#+}} xmm0 = [4,0,0,0]
; AVX512-NEXT: vpermi2ps (%rsp), %xmm1, %xmm0 # 16-byte Folded Reload
; AVX512-NEXT: addq $40, %rsp
; AVX512-NEXT: .cfi_def_cfa_offset 8
diff --git a/llvm/test/CodeGen/X86/fcmp-constant.ll b/llvm/test/CodeGen/X86/fcmp-constant.ll
index 481a32b39dd377..335cb28213f929 100644
--- a/llvm/test/CodeGen/X86/fcmp-constant.ll
+++ b/llvm/test/CodeGen/X86/fcmp-constant.ll
@@ -92,7 +92,7 @@ define <2 x i64> @fcmp_ueq_v2f64_undef() {
define <2 x i64> @fcmp_ueq_v2f64_undef_elt() {
; CHECK-LABEL: fcmp_ueq_v2f64_undef_elt:
; CHECK: # %bb.0:
-; CHECK-NEXT: movaps {{.*#+}} xmm0 = [18446744073709551615,0]
+; CHECK-NEXT: movsd {{.*#+}} xmm0 = [18446744073709551615,0]
; CHECK-NEXT: retq
%1 = fcmp ueq <2 x double> <double 0x3FF0000000000000, double 0xFFEFFFFFFFFFFFFF>, <double undef, double 0x3FF0000000000000>
%2 = sext <2 x i1> %1 to <2 x i64>
diff --git a/llvm/test/CodeGen/X86/half.ll b/llvm/test/CodeGen/X86/half.ll
index b42f6fdea34b65..d0853fdc748d29 100644
--- a/llvm/test/CodeGen/X86/half.ll
+++ b/llvm/test/CodeGen/X86/half.ll
@@ -2116,7 +2116,7 @@ define void @pr63114() {
; CHECK-LIBCALL-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,2,1]
; CHECK-LIBCALL-NEXT: movdqa {{.*#+}} xmm1 = [65535,65535,65535,0,65535,65535,65535,65535]
; CHECK-LIBCALL-NEXT: pand %xmm1, %xmm0
-; CHECK-LIBCALL-NEXT: movdqa {{.*#+}} xmm2 = [0,0,0,15360,0,0,0,0]
+; CHECK-LIBCALL-NEXT: movq {{.*#+}} xmm2 = [0,0,0,15360,0,0,0,0]
; CHECK-LIBCALL-NEXT: por %xmm2, %xmm0
; CHECK-LIBCALL-NEXT: movdqa {{.*#+}} xmm3 = [65535,65535,65535,65535,65535,65535,65535,0]
; CHECK-LIBCALL-NEXT: pand %xmm3, %xmm0
@@ -2181,7 +2181,7 @@ define void @pr63114() {
; CHECK-I686-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,2,1]
; CHECK-I686-NEXT: movdqa {{.*#+}} xmm1 = [65535,65535,65535,0,65535,65535,65535,65535]
; CHECK-I686-NEXT: pand %xmm1, %xmm0
-; CHECK-I686-NEXT: movdqa {{.*#+}} xmm2 = [0,0,0,15360,0,0,0,0]
+; CHECK-I686-NEXT: movq {{.*#+}} xmm2 = [0,0,0,15360,0,0,0,0]
; CHECK-I686-NEXT: por %xmm2, %xmm0
; CHECK-I686-NEXT: movdqa {{.*#+}} xmm3 = [65535,65535,65535,65535,65535,65535,65535,0]
; CHECK-I686-NEXT: pand %xmm3, %xmm0
diff --git a/llvm/test/CodeGen/X86/insert-into-constant-vector.ll b/llvm/test/CodeGen/X86/insert-into-constant-vector.ll
index 07787f3851bd90..05f81825d18051 100644
--- a/llvm/test/CodeGen/X86/insert-into-constant-vector.ll
+++ b/llvm/test/CodeGen/X86/insert-into-constant-vector.ll
@@ -375,7 +375,7 @@ define <8 x i64> @elt5_v8i64(i64 %x) {
; X86-SSE-LABEL: elt5_v8i64:
; X86-SSE: # %bb.0:
; X86-SSE-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
-; X86-SSE-NEXT: movaps {{.*#+}} xmm2 = [4,0,0,0]
+; X86-SSE-NEXT: movss {{.*#+}} xmm2 = [4,0,0,0]
; X86-SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
; X86-SSE-NEXT: movaps {{.*#+}} xmm0 = [42,0,1,0]
; X86-SSE-NEXT: movaps {{.*#+}} xmm1 = [2,0,3,0]
@@ -404,7 +404,7 @@ define <8 x i64> @elt5_v8i64(i64 %x) {
; X86-AVX1-LABEL: elt5_v8i64:
; X86-AVX1: # %bb.0:
; X86-AVX1-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
-; X86-AVX1-NEXT: vmovaps {{.*#+}} xmm1 = [4,0,0,0]
+; X86-AVX1-NEXT: vmovss {{.*#+}} xmm1 = [4,0,0,0]
; X86-AVX1-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
; X86-AVX1-NEXT: vinsertf128 $1, {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm1
; X86-AVX1-NEXT: vmovaps {{.*#+}} ymm0 = [42,0,1,0,2,0,3,0]
@@ -421,7 +421,7 @@ define <8 x i64> @elt5_v8i64(i64 %x) {
; X86-AVX2-LABEL: elt5_v8i64:
; X86-AVX2: # %bb.0:
; X86-AVX2-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
-; X86-AVX2-NEXT: vmovaps {{.*#+}} xmm1 = [4,0,0,0]
+; X86-AVX2-NEXT: vmovss {{.*#+}} xmm1 = [4,0,0,0]
; X86-AVX2-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
; X86-AVX2-NEXT: vinsertf128 $1, {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm1
; X86-AVX2-NEXT: vmovaps {{.*#+}} ymm0 = [42,0,1,0,2,0,3,0]
@@ -439,7 +439,7 @@ define <8 x i64> @elt5_v8i64(i64 %x) {
; X86-AVX512F: # %bb.0:
; X86-AVX512F-NEXT: vmovaps {{.*#+}} ymm0 = [42,0,1,0,2,0,3,0]
; X86-AVX512F-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
-; X86-AVX512F-NEXT: vmovaps {{.*#+}} xmm2 = [4,0,0,0]
+; X86-AVX512F-NEXT: vmovss {{.*#+}} xmm2 = [4,0,0,0]
; X86-AVX512F-NEXT: vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm1[0]
; X86-AVX512F-NEXT: vinsertf128 $1, {{\.?LCPI[0-9]+_[0-9]+}}, %ymm1, %ymm1
; X86-AVX512F-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0
diff --git a/llvm/test/CodeGen/X86/insertelement-ones.ll b/llvm/test/CodeGen/X86/insertelement-ones.ll
index ed7cbb0a8430d9..cfcb8798e0b9cc 100644
--- a/llvm/test/CodeGen/X86/insertelement-ones.ll
+++ b/llvm/test/CodeGen/X86/insertelement-ones.ll
@@ -280,7 +280,7 @@ define <16 x i16> @insert_v16i16_x12345x789ABCDEx(<16 x i16> %a) {
;
; AVX1-LABEL: insert_v16i16_x12345x789ABCDEx:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovaps {{.*#+}} xmm1 = [65535,0,0,0]
+; AVX1-NEXT: vmovss {{.*#+}} xmm1 = [65535,0,0,0]
; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0
; AVX1-NEXT: vorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX1-NEXT: vorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
@@ -385,7 +385,7 @@ define <32 x i8> @insert_v32i8_x123456789ABCDEzGHIJKLMNOPQRSTxx(<32 x i8> %a) {
;
; AVX1-LABEL: insert_v32i8_x123456789ABCDEzGHIJKLMNOPQRSTxx:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovaps {{.*#+}} xmm1 = [255,0,0,0]
+; AVX1-NEXT: vmovss {{.*#+}} xmm1 = [255,0,0,0]
; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0
; AVX1-NEXT: vorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX1-NEXT: vorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
diff --git a/llvm/test/CodeGen/X86/masked_gather_scatter_widen.ll b/llvm/test/CodeGen/X86/masked_gather_scatter_widen.ll
index f63545bcd178e5..aad1b443448503 100644
--- a/llvm/test/CodeGen/X86/masked_gather_scatter_widen.ll
+++ b/llvm/test/CodeGen/X86/masked_gather_scatter_widen.ll
@@ -739,7 +739,7 @@ define <17 x float> @test_mgather_v17f32(ptr %base, <17 x i32> %index)
; WIDEN_AVX2-NEXT: vgatherdps %ymm5, (%rsi,%ymm1,4), %ymm6
; WIDEN_AVX2-NEXT: vxorps %xmm1, %xmm1, %xmm1
; WIDEN_AVX2-NEXT: vgatherdps %ymm3, (%rsi,%ymm0,4), %ymm1
-; WIDEN_AVX2-NEXT: vmovaps {{.*#+}} xmm0 = [4294967295,0,0,0]
+; WIDEN_AVX2-NEXT: vmovss {{.*#+}} xmm0 = [4294967295,0,0,0]
; WIDEN_AVX2-NEXT: vgatherdps %ymm0, (%rsi,%ymm2,4), %ymm4
; WIDEN_AVX2-NEXT: vmovss %xmm4, 64(%rdi)
; WIDEN_AVX2-NEXT: vmovaps %ymm1, 32(%rdi)
diff --git a/llvm/test/CodeGen/X86/memcmp-more-load-pairs.ll b/llvm/test/CodeGen/X86/memcmp-more-load-pairs.ll
index 6eb02bfc1fd0c3..da46ea40655791 100644
--- a/llvm/test/CodeGen/X86/memcmp-more-load-pairs.ll
+++ b/llvm/test/CodeGen/X86/memcmp-more-load-pairs.ll
@@ -947,7 +947,7 @@ define i1 @length24_eq_const(ptr %X) nounwind {
; X64-MIC-AVX: # %bb.0:
; X64-MIC-AVX-NEXT: vmovdqu (%rdi), %xmm0
; X64-MIC-AVX-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
-; X64-MIC-AVX-NEXT: vmovdqa {{.*#+}} xmm2 = [959985462,858927408,0,0]
+; X64-MIC-AVX-NEXT: vmovq {{.*#+}} xmm2 = [959985462,858927408,0,0]
; X64-MIC-AVX-NEXT: vpcmpneqd %zmm2, %zmm1, %k0
; X64-MIC-AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [858927408,926299444,825243960,892613426]
; X64-MIC-AVX-NEXT: vpcmpneqd %zmm1, %zmm0, %k1
diff --git a/llvm/test/CodeGen/X86/memcmp.ll b/llvm/test/C...
[truncated]
llvm/test/CodeGen/X86/pr63108.ll (Outdated)
@@ -50,7 +50,7 @@ define i32 @PR63108() {
; AVX1-NEXT: vbroadcastss {{.*#+}} xmm0 = [251,223,0,0,251,223,0,0,251,223,0,0,251,223,0,0]
; AVX1-NEXT: jmp .LBB0_5
; AVX1-NEXT: .LBB0_2: # %vector.body.preheader
; AVX1-NEXT: vmovaps {{.*#+}} xmm0 = [57339,0,0,0]
; AVX1-NEXT: vmovss {{.*#+}} xmm0 = [251,223,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
Why not a vmovd here?
I expect the domain pass has flipped it to float
I don't see FP domain instructions except for vbroadcastss; is it because they are using the same xmm0 register?
Then I took a further look at vbroadcastss, and found maybe we should replace it with a vmovss/vmovd too.
Force-pushed from 63343ed to fa8038a.
; AVX1-NEXT: jmp .LBB0_5
; AVX1-NEXT: .LBB0_2: # %vector.body.preheader
; AVX1-NEXT: vmovaps {{.*#+}} xmm0 = [57339,0,0,0]
; AVX1-NEXT: vmovss {{.*#+}} xmm0 = [57339,0,0,0]
the movaps/movss is due to AVX1 only having fp-ops on the 256-bit code below
@phoebewang I've added missing general VectorConstant decodes (where before we only needed support for splats/broadcasts) and now prefer vzload over broadcasts (even if we would have been able to use a smaller broadcast constant), which seems to give consistently better codegen.
@@ -563,15 +563,15 @@ define <16 x i1> @interleaved_load_vf16_i8_stride4(ptr %ptr) nounwind {
define <32 x i1> @interleaved_load_vf32_i8_stride4(ptr %ptr) nounwind {
; AVX1-LABEL: interleaved_load_vf32_i8_stride4:
; AVX1: # %bb.0:
; AVX1-NEXT: vbroadcastss {{.*#+}} xmm6 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
; AVX1-NEXT: vmovq {{.*#+}} xmm6 = [0,0,0,0,0,4,8,12,0,0,0,0,0,0,0,0]
I see what you mean by a smaller broadcast constant, but I don't get why it's better here.
I'm not convinced it is either - I'll take a look at refactoring the patch to always take the smallest constant (so the tests will flip between broadcast/vzload)
I'm fine with the current output if it's not easy to flip between broadcast/vzload in the code.
…d/movq 'zero upper' instructions

If we're loading a vector constant that is known to be zero in the upper elements, then attempt to shrink the constant and just scalar load the lower 32/64 bits. I've extended the constant fixup code to always attempt to use the smallest possible constant load, but for the same constant width prefer the scalar vzload over broadcasts (better chance of avoiding a domain clash).
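As a rough illustration of the ordering described in that commit message, the standalone C++ sketch below (no LLVM APIs; the function name, lane-array representation, and return strings are invented for this example) picks a rewrite for a 128-bit constant: smallest constant width first, and at a given width a zero-upper scalar load before a broadcast.

#include <cstdint>
#include <string>

// Decide how a 128-bit constant, given as four 32-bit lanes, could be
// rematerialized: prefer the smallest constant load, and for equal widths
// prefer a zero-upper scalar load ("vzload") over a broadcast.
std::string chooseRewrite(const uint32_t Lanes[4]) {
  bool Upper96Zero = Lanes[1] == 0 && Lanes[2] == 0 && Lanes[3] == 0;
  bool Upper64Zero = Lanes[2] == 0 && Lanes[3] == 0;
  bool SplatAll = Lanes[0] == Lanes[1] && Lanes[1] == Lanes[2] && Lanes[2] == Lanes[3];
  bool Splat64 = Lanes[0] == Lanes[2] && Lanes[1] == Lanes[3];

  if (Upper96Zero) return "32-bit vzload (movd/movss)";  // smallest constant wins
  if (SplatAll)    return "32-bit broadcast";
  if (Upper64Zero) return "64-bit vzload (movq/movsd)";  // vzload before broadcast
  if (Splat64)     return "64-bit broadcast";
  return "full 128-bit load";
}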
@phoebewang This version always prefers the smallest possible constant load (and prefers vzload vs broadcast for the same constant width). This should also work when I add support for sextload/zextload handling for #71078
LGTM.
If we're loading a vector constant that is known to be zero in the upper elements, then attempt to shrink the constant and just scalar load the lower 32/64 bits.
Fixes #73783
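For reference, here is a minimal standalone model (plain C++, no LLVM dependencies; the byte-array representation and both helper names are invented for illustration) of the check that gates this rewrite: the constant may only be shrunk to a 32/64-bit zero-extending scalar load if every bit above that width is zero.

#include <cstdint>
#include <vector>

// True if only the low ScalarBitWidth bits of a little-endian constant may
// be nonzero, i.e. a scalar load of that width plus implicit zero-extension
// reproduces the full vector value.
bool canZeroUpperLoad(const std::vector<uint8_t> &Bytes, unsigned ScalarBitWidth) {
  unsigned NumBits = 8 * static_cast<unsigned>(Bytes.size());
  if (NumBits <= ScalarBitWidth)
    return false; // nothing to shrink
  for (unsigned Bit = ScalarBitWidth; Bit < NumBits; ++Bit)
    if (Bytes[Bit / 8] & (1u << (Bit % 8)))
      return false; // a set upper bit blocks the rewrite
  return true;
}

// Keep only the low ScalarBitWidth bits (assumes canZeroUpperLoad returned
// true), e.g. a 16-byte constant becomes the 8-byte constant a movq loads.
std::vector<uint8_t> shrinkConstant(const std::vector<uint8_t> &Bytes,
                                    unsigned ScalarBitWidth) {
  return std::vector<uint8_t>(Bytes.begin(), Bytes.begin() + ScalarBitWidth / 8);
}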