-
Notifications
You must be signed in to change notification settings - Fork 12.6k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[PowerPC] Improve pwr7 codegen for v4i8 load #104507
Conversation
@llvm/pr-subscribers-backend-powerpc Author: None (RolandF77) Changes: There are no partial vector loads on pwr7, so the current v4i8 codegen is an int load, followed by a store to a vector-sized temporary and a re-load as a vector. Try to use lfiwax to load 32 bits into an FP register, and take advantage of VSX FP and vector register sharing to move the result to the right vector position. Patch is 57.85 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/104507.diff 11 Files Affected:
diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
index 8ff9f5a5a991e0..a926b226ba738e 100644
--- a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
+++ b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
@@ -11490,13 +11490,38 @@ SDValue PPCTargetLowering::LowerIS_FPCLASS(SDValue Op,
SDValue PPCTargetLowering::LowerSCALAR_TO_VECTOR(SDValue Op,
SelectionDAG &DAG) const {
SDLoc dl(Op);
+
+ MachineFunction &MF = DAG.getMachineFunction();
+ SDValue Op0 = Op.getOperand(0);
+ ReuseLoadInfo RLI;
+ if (Subtarget.hasVSX() && Op.getValueType() == MVT::v4i32 &&
+ Op0.getOpcode() == ISD::LOAD && Op0.getValueType() == MVT::i32 &&
+ Op0.hasOneUse() &&
+ canReuseLoadAddress(Op0, MVT::i32, RLI, DAG, ISD::NON_EXTLOAD)) {
+
+ MachineMemOperand *MMO =
+ MF.getMachineMemOperand(RLI.MPI, MachineMemOperand::MOLoad, 4,
+ RLI.Alignment, RLI.AAInfo, RLI.Ranges);
+ SDValue Ops[] = {RLI.Chain, RLI.Ptr};
+ SDValue Bits = DAG.getMemIntrinsicNode(PPCISD::LFIWAX, dl,
+ DAG.getVTList(MVT::f64, MVT::Other),
+ Ops, MVT::i32, MMO);
+ spliceIntoChain(RLI.ResChain, Bits.getValue(1), DAG);
+
+ SDValue ConvVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2f64, Bits);
+ SDValue Bitcast = DAG.getBitcast(MVT::v4i32, ConvVec);
+ unsigned LowIx = Subtarget.isLittleEndian() ? 3 : 1;
+ return DAG.getNode(PPCISD::XXSPLT, dl, MVT::v4i32, Bitcast,
+ DAG.getConstant(LowIx, dl, MVT::i32));
+ }
+
// Create a stack slot that is 16-byte aligned.
- MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
+ MachineFrameInfo &MFI = MF.getFrameInfo();
int FrameIdx = MFI.CreateStackObject(16, Align(16), false);
EVT PtrVT = getPointerTy(DAG.getDataLayout());
SDValue FIdx = DAG.getFrameIndex(FrameIdx, PtrVT);
- SDValue Val = Op.getOperand(0);
+ SDValue Val = Op0;
EVT ValVT = Val.getValueType();
// P10 hardware store forwarding requires that a single store contains all
// the data for the load. P10 is able to merge a pair of adjacent stores. Try
diff --git a/llvm/test/CodeGen/PowerPC/build-vector-from-load-and-zeros.ll b/llvm/test/CodeGen/PowerPC/build-vector-from-load-and-zeros.ll
index 6d35a7281de6b4..54312fb5cee330 100644
--- a/llvm/test/CodeGen/PowerPC/build-vector-from-load-and-zeros.ll
+++ b/llvm/test/CodeGen/PowerPC/build-vector-from-load-and-zeros.ll
@@ -27,20 +27,18 @@ define <2 x i64> @build_v2i64_extload_0(ptr nocapture noundef readonly %p) {
; PWR7-LE-LABEL: build_v2i64_extload_0:
; PWR7-LE: # %bb.0: # %entry
; PWR7-LE-NEXT: li 4, 0
-; PWR7-LE-NEXT: lwz 3, 0(3)
; PWR7-LE-NEXT: stw 4, -16(1)
-; PWR7-LE-NEXT: addis 4, 2, .LCPI0_0@toc@ha
-; PWR7-LE-NEXT: addi 4, 4, .LCPI0_0@toc@l
-; PWR7-LE-NEXT: stw 3, -32(1)
-; PWR7-LE-NEXT: addi 3, 1, -32
-; PWR7-LE-NEXT: lxvd2x 0, 0, 4
-; PWR7-LE-NEXT: addi 4, 1, -16
-; PWR7-LE-NEXT: lxvd2x 1, 0, 4
-; PWR7-LE-NEXT: xxswapd 34, 0
+; PWR7-LE-NEXT: lfiwax 0, 0, 3
+; PWR7-LE-NEXT: addis 3, 2, .LCPI0_0@toc@ha
+; PWR7-LE-NEXT: addi 3, 3, .LCPI0_0@toc@l
+; PWR7-LE-NEXT: xxspltd 34, 0, 0
; PWR7-LE-NEXT: lxvd2x 0, 0, 3
-; PWR7-LE-NEXT: xxswapd 35, 1
+; PWR7-LE-NEXT: addi 3, 1, -16
+; PWR7-LE-NEXT: xxswapd 35, 0
+; PWR7-LE-NEXT: lxvd2x 0, 0, 3
+; PWR7-LE-NEXT: xxspltw 34, 34, 3
; PWR7-LE-NEXT: xxswapd 36, 0
-; PWR7-LE-NEXT: vperm 2, 3, 4, 2
+; PWR7-LE-NEXT: vperm 2, 4, 2, 3
; PWR7-LE-NEXT: blr
;
; PWR8-LE-LABEL: build_v2i64_extload_0:
@@ -337,17 +335,14 @@ entry:
define <4 x i32> @build_v4i32_load_0(ptr nocapture noundef readonly %p) {
; PWR7-BE-LABEL: build_v4i32_load_0:
; PWR7-BE: # %bb.0: # %entry
-; PWR7-BE-NEXT: lwz 3, 0(3)
-; PWR7-BE-NEXT: xxlxor 36, 36, 36
-; PWR7-BE-NEXT: sldi 3, 3, 32
-; PWR7-BE-NEXT: std 3, -32(1)
-; PWR7-BE-NEXT: std 3, -24(1)
+; PWR7-BE-NEXT: lfiwax 0, 0, 3
; PWR7-BE-NEXT: addis 3, 2, .LCPI8_0@toc@ha
+; PWR7-BE-NEXT: xxlxor 36, 36, 36
; PWR7-BE-NEXT: addi 3, 3, .LCPI8_0@toc@l
-; PWR7-BE-NEXT: lxvw4x 34, 0, 3
-; PWR7-BE-NEXT: addi 3, 1, -32
; PWR7-BE-NEXT: lxvw4x 35, 0, 3
-; PWR7-BE-NEXT: vperm 2, 3, 4, 2
+; PWR7-BE-NEXT: xxlor 34, 0, 0
+; PWR7-BE-NEXT: xxspltw 34, 34, 1
+; PWR7-BE-NEXT: vperm 2, 2, 4, 3
; PWR7-BE-NEXT: blr
;
; PWR8-BE-LABEL: build_v4i32_load_0:
@@ -365,20 +360,18 @@ define <4 x i32> @build_v4i32_load_0(ptr nocapture noundef readonly %p) {
; PWR7-LE-LABEL: build_v4i32_load_0:
; PWR7-LE: # %bb.0: # %entry
; PWR7-LE-NEXT: li 4, 0
-; PWR7-LE-NEXT: lwz 3, 0(3)
; PWR7-LE-NEXT: stw 4, -16(1)
-; PWR7-LE-NEXT: addis 4, 2, .LCPI8_0@toc@ha
-; PWR7-LE-NEXT: addi 4, 4, .LCPI8_0@toc@l
-; PWR7-LE-NEXT: stw 3, -32(1)
-; PWR7-LE-NEXT: addi 3, 1, -32
-; PWR7-LE-NEXT: lxvd2x 0, 0, 4
-; PWR7-LE-NEXT: addi 4, 1, -16
-; PWR7-LE-NEXT: lxvd2x 1, 0, 4
-; PWR7-LE-NEXT: xxswapd 34, 0
+; PWR7-LE-NEXT: lfiwax 0, 0, 3
+; PWR7-LE-NEXT: addis 3, 2, .LCPI8_0@toc@ha
+; PWR7-LE-NEXT: addi 3, 3, .LCPI8_0@toc@l
+; PWR7-LE-NEXT: xxspltd 34, 0, 0
; PWR7-LE-NEXT: lxvd2x 0, 0, 3
-; PWR7-LE-NEXT: xxswapd 35, 1
+; PWR7-LE-NEXT: addi 3, 1, -16
+; PWR7-LE-NEXT: xxswapd 35, 0
+; PWR7-LE-NEXT: lxvd2x 0, 0, 3
+; PWR7-LE-NEXT: xxspltw 34, 34, 3
; PWR7-LE-NEXT: xxswapd 36, 0
-; PWR7-LE-NEXT: vperm 2, 3, 4, 2
+; PWR7-LE-NEXT: vperm 2, 4, 2, 3
; PWR7-LE-NEXT: blr
;
; PWR8-LE-LABEL: build_v4i32_load_0:
@@ -400,17 +393,14 @@ entry:
define <4 x i32> @build_v4i32_load_1(ptr nocapture noundef readonly %p) {
; PWR7-BE-LABEL: build_v4i32_load_1:
; PWR7-BE: # %bb.0: # %entry
-; PWR7-BE-NEXT: lwz 3, 0(3)
-; PWR7-BE-NEXT: xxlxor 36, 36, 36
-; PWR7-BE-NEXT: sldi 3, 3, 32
-; PWR7-BE-NEXT: std 3, -16(1)
-; PWR7-BE-NEXT: std 3, -8(1)
+; PWR7-BE-NEXT: lfiwax 0, 0, 3
; PWR7-BE-NEXT: addis 3, 2, .LCPI9_0@toc@ha
+; PWR7-BE-NEXT: xxlxor 36, 36, 36
; PWR7-BE-NEXT: addi 3, 3, .LCPI9_0@toc@l
-; PWR7-BE-NEXT: lxvw4x 34, 0, 3
-; PWR7-BE-NEXT: addi 3, 1, -16
; PWR7-BE-NEXT: lxvw4x 35, 0, 3
-; PWR7-BE-NEXT: vperm 2, 4, 3, 2
+; PWR7-BE-NEXT: xxlor 34, 0, 0
+; PWR7-BE-NEXT: xxspltw 34, 34, 1
+; PWR7-BE-NEXT: vperm 2, 4, 2, 3
; PWR7-BE-NEXT: blr
;
; PWR8-BE-LABEL: build_v4i32_load_1:
@@ -427,20 +417,18 @@ define <4 x i32> @build_v4i32_load_1(ptr nocapture noundef readonly %p) {
; PWR7-LE-LABEL: build_v4i32_load_1:
; PWR7-LE: # %bb.0: # %entry
; PWR7-LE-NEXT: li 4, 0
-; PWR7-LE-NEXT: lwz 3, 0(3)
-; PWR7-LE-NEXT: stw 4, -32(1)
-; PWR7-LE-NEXT: addis 4, 2, .LCPI9_0@toc@ha
-; PWR7-LE-NEXT: addi 4, 4, .LCPI9_0@toc@l
-; PWR7-LE-NEXT: stw 3, -16(1)
+; PWR7-LE-NEXT: stw 4, -16(1)
+; PWR7-LE-NEXT: lfiwax 0, 0, 3
+; PWR7-LE-NEXT: addis 3, 2, .LCPI9_0@toc@ha
+; PWR7-LE-NEXT: addi 3, 3, .LCPI9_0@toc@l
+; PWR7-LE-NEXT: xxspltd 34, 0, 0
+; PWR7-LE-NEXT: lxvd2x 0, 0, 3
; PWR7-LE-NEXT: addi 3, 1, -16
-; PWR7-LE-NEXT: lxvd2x 0, 0, 4
-; PWR7-LE-NEXT: addi 4, 1, -32
-; PWR7-LE-NEXT: lxvd2x 1, 0, 4
-; PWR7-LE-NEXT: xxswapd 34, 0
+; PWR7-LE-NEXT: xxswapd 35, 0
; PWR7-LE-NEXT: lxvd2x 0, 0, 3
-; PWR7-LE-NEXT: xxswapd 35, 1
+; PWR7-LE-NEXT: xxspltw 34, 34, 3
; PWR7-LE-NEXT: xxswapd 36, 0
-; PWR7-LE-NEXT: vperm 2, 4, 3, 2
+; PWR7-LE-NEXT: vperm 2, 2, 4, 3
; PWR7-LE-NEXT: blr
;
; PWR8-LE-LABEL: build_v4i32_load_1:
@@ -463,17 +451,14 @@ entry:
define <4 x i32> @build_v4i32_load_2(ptr nocapture noundef readonly %p) {
; PWR7-BE-LABEL: build_v4i32_load_2:
; PWR7-BE: # %bb.0: # %entry
-; PWR7-BE-NEXT: lwz 3, 0(3)
-; PWR7-BE-NEXT: xxlxor 36, 36, 36
-; PWR7-BE-NEXT: sldi 3, 3, 32
-; PWR7-BE-NEXT: std 3, -16(1)
-; PWR7-BE-NEXT: std 3, -8(1)
+; PWR7-BE-NEXT: lfiwax 0, 0, 3
; PWR7-BE-NEXT: addis 3, 2, .LCPI10_0@toc@ha
+; PWR7-BE-NEXT: xxlxor 36, 36, 36
; PWR7-BE-NEXT: addi 3, 3, .LCPI10_0@toc@l
-; PWR7-BE-NEXT: lxvw4x 34, 0, 3
-; PWR7-BE-NEXT: addi 3, 1, -16
; PWR7-BE-NEXT: lxvw4x 35, 0, 3
-; PWR7-BE-NEXT: vperm 2, 4, 3, 2
+; PWR7-BE-NEXT: xxlor 34, 0, 0
+; PWR7-BE-NEXT: xxspltw 34, 34, 1
+; PWR7-BE-NEXT: vperm 2, 4, 2, 3
; PWR7-BE-NEXT: blr
;
; PWR8-BE-LABEL: build_v4i32_load_2:
@@ -491,20 +476,18 @@ define <4 x i32> @build_v4i32_load_2(ptr nocapture noundef readonly %p) {
; PWR7-LE-LABEL: build_v4i32_load_2:
; PWR7-LE: # %bb.0: # %entry
; PWR7-LE-NEXT: li 4, 0
-; PWR7-LE-NEXT: lwz 3, 0(3)
-; PWR7-LE-NEXT: stw 4, -32(1)
-; PWR7-LE-NEXT: addis 4, 2, .LCPI10_0@toc@ha
-; PWR7-LE-NEXT: addi 4, 4, .LCPI10_0@toc@l
-; PWR7-LE-NEXT: stw 3, -16(1)
+; PWR7-LE-NEXT: stw 4, -16(1)
+; PWR7-LE-NEXT: lfiwax 0, 0, 3
+; PWR7-LE-NEXT: addis 3, 2, .LCPI10_0@toc@ha
+; PWR7-LE-NEXT: addi 3, 3, .LCPI10_0@toc@l
+; PWR7-LE-NEXT: xxspltd 34, 0, 0
+; PWR7-LE-NEXT: lxvd2x 0, 0, 3
; PWR7-LE-NEXT: addi 3, 1, -16
-; PWR7-LE-NEXT: lxvd2x 0, 0, 4
-; PWR7-LE-NEXT: addi 4, 1, -32
-; PWR7-LE-NEXT: lxvd2x 1, 0, 4
-; PWR7-LE-NEXT: xxswapd 34, 0
+; PWR7-LE-NEXT: xxswapd 35, 0
; PWR7-LE-NEXT: lxvd2x 0, 0, 3
-; PWR7-LE-NEXT: xxswapd 35, 1
+; PWR7-LE-NEXT: xxspltw 34, 34, 3
; PWR7-LE-NEXT: xxswapd 36, 0
-; PWR7-LE-NEXT: vperm 2, 4, 3, 2
+; PWR7-LE-NEXT: vperm 2, 2, 4, 3
; PWR7-LE-NEXT: blr
;
; PWR8-LE-LABEL: build_v4i32_load_2:
@@ -526,17 +509,14 @@ entry:
define <4 x i32> @build_v4i32_load_3(ptr nocapture noundef readonly %p) {
; PWR7-BE-LABEL: build_v4i32_load_3:
; PWR7-BE: # %bb.0: # %entry
-; PWR7-BE-NEXT: lwz 3, 0(3)
-; PWR7-BE-NEXT: xxlxor 36, 36, 36
-; PWR7-BE-NEXT: sldi 3, 3, 32
-; PWR7-BE-NEXT: std 3, -16(1)
-; PWR7-BE-NEXT: std 3, -8(1)
+; PWR7-BE-NEXT: lfiwax 0, 0, 3
; PWR7-BE-NEXT: addis 3, 2, .LCPI11_0@toc@ha
+; PWR7-BE-NEXT: xxlxor 36, 36, 36
; PWR7-BE-NEXT: addi 3, 3, .LCPI11_0@toc@l
-; PWR7-BE-NEXT: lxvw4x 34, 0, 3
-; PWR7-BE-NEXT: addi 3, 1, -16
; PWR7-BE-NEXT: lxvw4x 35, 0, 3
-; PWR7-BE-NEXT: vperm 2, 4, 3, 2
+; PWR7-BE-NEXT: xxlor 34, 0, 0
+; PWR7-BE-NEXT: xxspltw 34, 34, 1
+; PWR7-BE-NEXT: vperm 2, 4, 2, 3
; PWR7-BE-NEXT: blr
;
; PWR8-BE-LABEL: build_v4i32_load_3:
@@ -553,20 +533,18 @@ define <4 x i32> @build_v4i32_load_3(ptr nocapture noundef readonly %p) {
; PWR7-LE-LABEL: build_v4i32_load_3:
; PWR7-LE: # %bb.0: # %entry
; PWR7-LE-NEXT: li 4, 0
-; PWR7-LE-NEXT: lwz 3, 0(3)
-; PWR7-LE-NEXT: stw 4, -32(1)
-; PWR7-LE-NEXT: addis 4, 2, .LCPI11_0@toc@ha
-; PWR7-LE-NEXT: addi 4, 4, .LCPI11_0@toc@l
-; PWR7-LE-NEXT: stw 3, -16(1)
+; PWR7-LE-NEXT: stw 4, -16(1)
+; PWR7-LE-NEXT: lfiwax 0, 0, 3
+; PWR7-LE-NEXT: addis 3, 2, .LCPI11_0@toc@ha
+; PWR7-LE-NEXT: addi 3, 3, .LCPI11_0@toc@l
+; PWR7-LE-NEXT: xxspltd 34, 0, 0
+; PWR7-LE-NEXT: lxvd2x 0, 0, 3
; PWR7-LE-NEXT: addi 3, 1, -16
-; PWR7-LE-NEXT: lxvd2x 0, 0, 4
-; PWR7-LE-NEXT: addi 4, 1, -32
-; PWR7-LE-NEXT: lxvd2x 1, 0, 4
-; PWR7-LE-NEXT: xxswapd 34, 0
+; PWR7-LE-NEXT: xxswapd 35, 0
; PWR7-LE-NEXT: lxvd2x 0, 0, 3
-; PWR7-LE-NEXT: xxswapd 35, 1
+; PWR7-LE-NEXT: xxspltw 34, 34, 3
; PWR7-LE-NEXT: xxswapd 36, 0
-; PWR7-LE-NEXT: vperm 2, 4, 3, 2
+; PWR7-LE-NEXT: vperm 2, 2, 4, 3
; PWR7-LE-NEXT: blr
;
; PWR8-LE-LABEL: build_v4i32_load_3:
diff --git a/llvm/test/CodeGen/PowerPC/canonical-merge-shuffles.ll b/llvm/test/CodeGen/PowerPC/canonical-merge-shuffles.ll
index c26f98c5b0495d..5e73999c80b5ac 100644
--- a/llvm/test/CodeGen/PowerPC/canonical-merge-shuffles.ll
+++ b/llvm/test/CodeGen/PowerPC/canonical-merge-shuffles.ll
@@ -536,15 +536,12 @@ define dso_local <8 x i16> @testmrglb3(ptr nocapture readonly %a) local_unnamed_
;
; P8-AIX-32-LABEL: testmrglb3:
; P8-AIX-32: # %bb.0: # %entry
-; P8-AIX-32-NEXT: lwz r4, 4(r3)
+; P8-AIX-32-NEXT: li r4, 4
+; P8-AIX-32-NEXT: lfiwax f1, 0, r3
; P8-AIX-32-NEXT: xxlxor v3, v3, v3
-; P8-AIX-32-NEXT: stw r4, -16(r1)
-; P8-AIX-32-NEXT: lwz r3, 0(r3)
-; P8-AIX-32-NEXT: stw r3, -32(r1)
-; P8-AIX-32-NEXT: addi r3, r1, -16
-; P8-AIX-32-NEXT: lxvw4x vs0, 0, r3
-; P8-AIX-32-NEXT: addi r3, r1, -32
-; P8-AIX-32-NEXT: lxvw4x vs1, 0, r3
+; P8-AIX-32-NEXT: lfiwax f0, r3, r4
+; P8-AIX-32-NEXT: xxspltw vs1, vs1, 1
+; P8-AIX-32-NEXT: xxspltw vs0, vs0, 1
; P8-AIX-32-NEXT: xxmrghw v2, vs1, vs0
; P8-AIX-32-NEXT: vmrghb v2, v3, v2
; P8-AIX-32-NEXT: blr
@@ -852,17 +849,15 @@ define dso_local <16 x i8> @no_RAUW_in_combine_during_legalize(ptr nocapture rea
;
; P8-AIX-32-LABEL: no_RAUW_in_combine_during_legalize:
; P8-AIX-32: # %bb.0: # %entry
+; P8-AIX-32-NEXT: li r5, 0
; P8-AIX-32-NEXT: slwi r4, r4, 2
; P8-AIX-32-NEXT: xxlxor v3, v3, v3
-; P8-AIX-32-NEXT: lwzx r3, r3, r4
-; P8-AIX-32-NEXT: li r4, 0
-; P8-AIX-32-NEXT: stw r4, -32(r1)
-; P8-AIX-32-NEXT: stw r3, -16(r1)
-; P8-AIX-32-NEXT: addi r3, r1, -32
-; P8-AIX-32-NEXT: lxvw4x vs0, 0, r3
+; P8-AIX-32-NEXT: stw r5, -16(r1)
+; P8-AIX-32-NEXT: lfiwax f0, r3, r4
; P8-AIX-32-NEXT: addi r3, r1, -16
; P8-AIX-32-NEXT: lxvw4x vs1, 0, r3
-; P8-AIX-32-NEXT: xxmrghw v2, vs0, vs1
+; P8-AIX-32-NEXT: xxspltw vs0, vs0, 1
+; P8-AIX-32-NEXT: xxmrghw v2, vs1, vs0
; P8-AIX-32-NEXT: vmrghb v2, v2, v3
; P8-AIX-32-NEXT: blr
entry:
@@ -1026,14 +1021,11 @@ define dso_local <2 x i64> @testSplat8(ptr nocapture readonly %ptr) local_unname
;
; P8-AIX-32-LABEL: testSplat8:
; P8-AIX-32: # %bb.0: # %entry
-; P8-AIX-32-NEXT: lwz r4, 4(r3)
-; P8-AIX-32-NEXT: stw r4, -16(r1)
-; P8-AIX-32-NEXT: lwz r3, 0(r3)
-; P8-AIX-32-NEXT: stw r3, -32(r1)
-; P8-AIX-32-NEXT: addi r3, r1, -16
-; P8-AIX-32-NEXT: lxvw4x vs0, 0, r3
-; P8-AIX-32-NEXT: addi r3, r1, -32
-; P8-AIX-32-NEXT: lxvw4x vs1, 0, r3
+; P8-AIX-32-NEXT: li r4, 4
+; P8-AIX-32-NEXT: lfiwax f1, 0, r3
+; P8-AIX-32-NEXT: lfiwax f0, r3, r4
+; P8-AIX-32-NEXT: xxspltw vs1, vs1, 1
+; P8-AIX-32-NEXT: xxspltw vs0, vs0, 1
; P8-AIX-32-NEXT: xxmrghw vs0, vs1, vs0
; P8-AIX-32-NEXT: xxmrghd v2, vs0, vs0
; P8-AIX-32-NEXT: blr
@@ -1081,17 +1073,14 @@ define <2 x i64> @testSplati64_0(ptr nocapture readonly %ptr) #0 {
;
; P8-AIX-32-LABEL: testSplati64_0:
; P8-AIX-32: # %bb.0: # %entry
-; P8-AIX-32-NEXT: lwz r4, 0(r3)
-; P8-AIX-32-NEXT: lwz r3, 4(r3)
-; P8-AIX-32-NEXT: stw r3, -16(r1)
+; P8-AIX-32-NEXT: li r4, 4
+; P8-AIX-32-NEXT: lfiwax f0, r3, r4
+; P8-AIX-32-NEXT: xxspltw v2, vs0, 1
+; P8-AIX-32-NEXT: lfiwax f0, 0, r3
; P8-AIX-32-NEXT: lwz r3, L..C3(r2) # %const.0
-; P8-AIX-32-NEXT: stw r4, -32(r1)
-; P8-AIX-32-NEXT: lxvw4x v2, 0, r3
-; P8-AIX-32-NEXT: addi r3, r1, -16
-; P8-AIX-32-NEXT: lxvw4x v3, 0, r3
-; P8-AIX-32-NEXT: addi r3, r1, -32
; P8-AIX-32-NEXT: lxvw4x v4, 0, r3
-; P8-AIX-32-NEXT: vperm v2, v4, v3, v2
+; P8-AIX-32-NEXT: xxspltw v3, vs0, 1
+; P8-AIX-32-NEXT: vperm v2, v3, v2, v4
; P8-AIX-32-NEXT: blr
entry:
%0 = load <1 x i64>, ptr %ptr, align 8
diff --git a/llvm/test/CodeGen/PowerPC/load-and-splat.ll b/llvm/test/CodeGen/PowerPC/load-and-splat.ll
index bc68ad2a67bf5d..38f47838a42fbf 100644
--- a/llvm/test/CodeGen/PowerPC/load-and-splat.ll
+++ b/llvm/test/CodeGen/PowerPC/load-and-splat.ll
@@ -208,47 +208,45 @@ define dso_local void @test4(ptr nocapture %c, ptr nocapture readonly %a) local_
;
; P9-AIX32-LABEL: test4:
; P9-AIX32: # %bb.0: # %entry
-; P9-AIX32-NEXT: lwz r5, 24(r4)
-; P9-AIX32-NEXT: lwz r4, 28(r4)
-; P9-AIX32-NEXT: stw r4, -16(r1)
+; P9-AIX32-NEXT: li r5, 28
+; P9-AIX32-NEXT: lfiwax f0, r4, r5
+; P9-AIX32-NEXT: li r5, 24
+; P9-AIX32-NEXT: lfiwax f1, r4, r5
; P9-AIX32-NEXT: lwz r4, L..C0(r2) # %const.0
-; P9-AIX32-NEXT: stw r5, -32(r1)
-; P9-AIX32-NEXT: lxv vs1, -16(r1)
-; P9-AIX32-NEXT: lxv vs2, -32(r1)
-; P9-AIX32-NEXT: lxv vs0, 0(r4)
-; P9-AIX32-NEXT: xxperm vs1, vs2, vs0
-; P9-AIX32-NEXT: stxv vs1, 0(r3)
+; P9-AIX32-NEXT: xxspltw vs0, vs0, 1
+; P9-AIX32-NEXT: lxv vs2, 0(r4)
+; P9-AIX32-NEXT: xxspltw vs1, vs1, 1
+; P9-AIX32-NEXT: xxperm vs0, vs1, vs2
+; P9-AIX32-NEXT: stxv vs0, 0(r3)
; P9-AIX32-NEXT: blr
;
; P8-AIX32-LABEL: test4:
; P8-AIX32: # %bb.0: # %entry
-; P8-AIX32-NEXT: lwz r5, 24(r4)
-; P8-AIX32-NEXT: lwz r4, 28(r4)
-; P8-AIX32-NEXT: stw r4, -16(r1)
+; P8-AIX32-NEXT: li r5, 28
+; P8-AIX32-NEXT: lfiwax f0, r4, r5
+; P8-AIX32-NEXT: li r5, 24
+; P8-AIX32-NEXT: xxspltw v2, vs0, 1
+; P8-AIX32-NEXT: lfiwax f0, r4, r5
; P8-AIX32-NEXT: lwz r4, L..C0(r2) # %const.0
-; P8-AIX32-NEXT: stw r5, -32(r1)
-; P8-AIX32-NEXT: lxvw4x v2, 0, r4
-; P8-AIX32-NEXT: addi r4, r1, -16
-; P8-AIX32-NEXT: lxvw4x v3, 0, r4
-; P8-AIX32-NEXT: addi r4, r1, -32
; P8-AIX32-NEXT: lxvw4x v4, 0, r4
-; P8-AIX32-NEXT: vperm v2, v4, v3, v2
+; P8-AIX32-NEXT: xxspltw v3, vs0, 1
+; P8-AIX32-NEXT: vperm v2, v3, v2, v4
; P8-AIX32-NEXT: stxvw4x v2, 0, r3
; P8-AIX32-NEXT: blr
;
; P7-AIX32-LABEL: test4:
; P7-AIX32: # %bb.0: # %entry
-; P7-AIX32-NEXT: lwz r5, 24(r4)
-; P7-AIX32-NEXT: lwz r4, 28(r4)
-; P7-AIX32-NEXT: stw r4, -16(r1)
+; P7-AIX32-NEXT: li r5, 28
+; P7-AIX32-NEXT: lfiwax f0, r4, r5
+; P7-AIX32-NEXT: li r5, 24
+; P7-AIX32-NEXT: xxlor v2, f0, f0
+; P7-AIX32-NEXT: xxspltw v2, v2, 1
+; P7-AIX32-NEXT: lfiwax f0, r4, r5
; P7-AIX32-NEXT: lwz r4, L..C0(r2) # %const.0
-; P7-AIX32-NEXT: stw r5, -32(r1)
-; P7-AIX32-NEXT: lxvw4x v2, 0, r4
-; P7-AIX32-NEXT: addi r4, r1, -16
-; P7-AIX32-NEXT: lxvw4x v3, 0, r4
-; P7-AIX32-NEXT: addi r4, r1, -32
; P7-AIX32-NEXT: lxvw4x v4, 0, r4
-; P7-AIX32-NEXT: vperm v2, v4, v3, v2
+; P7-AIX32-NEXT: xxlor v3, f0, f0
+; P7-AIX32-NEXT: xxspltw v3, v3, 1
+; P7-AIX32-NEXT: vperm v2, v3, v2, v4
; P7-AIX32-NEXT: stxvw4x v2, 0, r3
; P7-AIX32-NEXT: blr
entry:
@@ -362,47 +360,43 @@ define void @test6(ptr %a, ptr %in) {
;
; P9-AIX32-LABEL: test6:
; P9-AIX32: # %bb.0: # %entry
-; P9-AIX32-NEXT: lwz r4, 0(r4)
; P9-AIX32-NEXT: li r5, 0
-; P9-AIX32-NEXT: stw r5, -32(r1)
-; P9-AIX32-NEXT: lxv vs1, -32(r1)
-; P9-AIX32-NEXT: stw r4, -16(r1)
-; P9-AIX32-NEXT: lwz r4, L..C2(r2) # %const.0
+; P9-AIX32-NEXT: stw r5, -16(r1)
+; P9-AIX32-NEXT: lwz r5, L..C2(r2) # %const.0
+; P9-AIX32-NEXT: lfiwax f1, 0, r4
; P9-AIX32-NEXT: lxv vs2, -16(r1)
-; P9-AIX32-NEXT: lxv vs0, 0(r4)
-; P9-AIX32-NEXT: xxperm vs2, vs1, vs0
-; P9-AIX32-NEXT: stxv vs2, 0(r3)
+; P9-AIX32-NEXT: lxv vs0, 0(r5)
+; P9-AIX32-NEXT: xxspltw vs1, vs1, 1
+; P9-AIX32-NEXT: xxperm vs1, vs2, vs0
+; P9-AIX32-NEXT: stxv vs1, 0(r3)
; P9-AIX32-NEXT: blr
;
; P8-AIX32-LABEL: test6:
; P8-AIX32: # %bb.0: # %entry
-; P8-AIX32-NEXT: lwz r4, 0(r4)
; P8-AIX32-NEXT: li r5, 0
-; P8-AIX32-NEXT: stw r5, -32(r1)
-; P8-AIX32-NEXT: stw r4, -16(r1)
+; P8-AIX32-NEXT: stw r5, -16(r1)
+; P8-AIX32-NEXT: lfiwax f0, 0, r4
; P8-AIX32-NEXT: lwz r4, L..C2(r2) # %const.0
-; P8-AIX32-NEXT: lxvw4x v2, 0, r4
-; P8-AIX32-NEXT: addi r4, r1, -32
; P8-AIX32-NEXT: lxvw4x v3, 0, r4
; P8-AIX32-NEXT: addi r4, r1, -16
; P8-AIX32-NEXT: lxvw4x v4, 0, r4
-; P8-AIX32-NEXT: vperm v2, v3, v4, v2
+; P8-AIX32-NEXT: xxspltw v2, vs0, 1
+; P8-AIX32-NEXT: vperm v2, v4, v2, v3
; P8-AIX32-NEXT: stxvw4x v2, 0, r3
; P8-AIX32-NEXT: blr
;
; P7-AIX32-LABEL: test6:
; P7-AIX32: # %bb.0: # %entry
-; P7-AIX32-NEXT: lwz r4, 0(r4)
; P7-AIX32-NEXT: li r5, 0
-; P7-AIX32-NEXT: stw r5, -32(r1)
-; P7-AIX32-NEXT: stw r4, -16(r1)
+; P7-AIX32-NEXT: stw r5, -16(r1)
+; P7-AIX32-NEXT: lfiwax f0, 0, r4
; P7-AIX32-NEXT: lwz r4, L..C2(r2) # %const.0
-; P7-AIX32-NEXT: lxvw4x v2, 0, r4
-; P7-AIX32-NEXT: addi r4, r1, -32
; P7-AIX32-NEXT: lxvw4x v3, 0, r4
; P7-AIX32-NEXT: addi r4, r1, -16
+; P7-AIX32-NEXT: xxlor v2, f0, f0
; P7-AIX32-NEXT: lxvw4x v4, 0, r4
-; P7-AIX32-NEXT: vperm v2, v3, v4, v2
+; P7-AIX32-NEXT: xxspltw v2, v2, 1
+; P7-AIX32-NEXT: vperm v2, v4, v2, v3
; P7-AIX32-NEXT: stxvw4x v2, 0, r3
; P7-AIX32-NEXT: blr
entry:
@@ -810,40 +804,35 @@ define <16 x i8> @unadjusted_lxvdsx(ptr %s, ptr %t) {
;
; P9-AIX32-LABEL: unadjusted_lxvd...
[truncated]
|
Ping.
MachineFunction &MF = DAG.getMachineFunction(); | ||
SDValue Op0 = Op.getOperand(0); | ||
ReuseLoadInfo RLI; | ||
if (Subtarget.hasVSX() && Op.getValueType() == MVT::v4i32 && |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
`hasLFIWAX()` looks like a more reasonable target feature to check here?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The check for VSX was actually for the use of xxspltw. I was assuming anything with vectors had lfiwax, but there's no harm in checking.
Op0.getOpcode() == ISD::LOAD && Op0.getValueType() == MVT::i32 && | ||
Op0.hasOneUse() && | ||
canReuseLoadAddress(Op0, MVT::i32, RLI, DAG, ISD::NON_EXTLOAD)) { | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This seems like it should be lowered to `PPCISD::LD_SPLAT` instead of being expanded here. `BUILD_VECTOR` is handled the same way.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Good idea.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
LGTM. Thanks very much for making the improvements.
Update cost model to reflect codegen change to use lfiwzx for 32-bit partial vector loads on pwr7 with #104507.
Update cost model to reflect codegen change to use lfiwzx for 32-bit partial vector loads on pwr7 with llvm#104507.
There are no partial vector loads on pwr7, so the current v4i8 codegen is an int load, followed by a store to a vector-sized temporary and a re-load as a vector. Try to use lfiwax to load 32 bits into an FP register, and take advantage of VSX FP and vector register sharing to move the result to the right vector position.