[AArch64] Sink vscale calls into loops for better isel #70304
Conversation
@llvm/pr-subscribers-backend-aarch64

Author: Graham Hunter (huntergr-arm)

Changes

For more recent SVE-capable CPUs it is beneficial to use the inc* instruction to increment a value by vscale (potentially shifted or multiplied) even in short loops.

This patch tells codegenprepare to sink appropriate vscale calls into blocks where they are used so that isel can match them.

Full diff: https://github.com/llvm/llvm-project/pull/70304.diff

3 Files Affected:
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 038c23b5e8d50ad..5b4f6531244c259 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -14507,6 +14507,19 @@ static bool shouldSinkVectorOfPtrs(Value *Ptrs, SmallVectorImpl<Use *> &Ops) {
return true;
}
+/// We want to sink following cases:
+/// (add|sub) A, ((mul|shl) vscale, imm); (add|sub) A, vscale
+static bool shouldSinkVScale(Value *Op, SmallVectorImpl<Use *> &Ops) {
+ if (match(Op, m_VScale()))
+ return true;
+ if (match(Op, m_Shl(m_VScale(), m_ConstantInt())) ||
+ match(Op, m_Mul(m_VScale(), m_ConstantInt()))) {
+ Ops.push_back(&cast<Instruction>(Op)->getOperandUse(0));
+ return true;
+ }
+ return false;
+}
+
/// Check if sinking \p I's operands to I's basic block is profitable, because
/// the operands can be folded into a target instruction, e.g.
/// shufflevectors extracts and/or sext/zext can be folded into (u,s)subl(2).
@@ -14623,12 +14636,29 @@ bool AArch64TargetLowering::shouldSinkOperands(
}
}
- if (!I->getType()->isVectorTy())
- return false;
-
switch (I->getOpcode()) {
case Instruction::Sub:
case Instruction::Add: {
+ // If the subtarget wants to make use of sve inc* instructions, then sink
+ // vscale intrinsic (along with any shifts or multiplies) so that the
+ // appropriate folds can be made.
+ if (Subtarget->useScalarIncVL()) {
+ bool Sink = false;
+ if (shouldSinkVScale(I->getOperand(0), Ops)) {
+ Ops.push_back(&I->getOperandUse(0));
+ Sink = true;
+ }
+
+ if (shouldSinkVScale(I->getOperand(1), Ops)) {
+ Ops.push_back(&I->getOperandUse(1));
+ Sink = true;
+ }
+
+ if (Sink)
+ return true;
+ }
+ if (!I->getType()->isVectorTy())
+ return false;
if (!areExtractExts(I->getOperand(0), I->getOperand(1)))
return false;
@@ -14647,6 +14677,8 @@ bool AArch64TargetLowering::shouldSinkOperands(
return true;
}
case Instruction::Or: {
+ if (!I->getType()->isVectorTy())
+ return false;
// Pattern: Or(And(MaskValue, A), And(Not(MaskValue), B)) ->
// bitselect(MaskValue, A, B) where Not(MaskValue) = Xor(MaskValue, -1)
if (Subtarget->hasNEON()) {
@@ -14684,6 +14716,8 @@ bool AArch64TargetLowering::shouldSinkOperands(
return false;
}
case Instruction::Mul: {
+ if (!I->getType()->isVectorTy())
+ return false;
int NumZExts = 0, NumSExts = 0;
for (auto &Op : I->operands()) {
// Make sure we are not already sinking this operand
diff --git a/llvm/test/CodeGen/AArch64/sve2-vscale-sinking-codegen.ll b/llvm/test/CodeGen/AArch64/sve2-vscale-sinking-codegen.ll
new file mode 100644
index 000000000000000..afd171aeda758d5
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/sve2-vscale-sinking-codegen.ll
@@ -0,0 +1,96 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
+; RUN: llc < %s | FileCheck %s
+
+target triple = "aarch64-unknown-linux-gnu"
+
+define void @inc_add(i32 %first, i32 %N, ptr %in1, ptr %in2, ptr %out) #0 {
+; CHECK-LABEL: inc_add:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: ptrue p0.s
+; CHECK-NEXT: mov x8, xzr
+; CHECK-NEXT: mov w9, w1
+; CHECK-NEXT: .LBB0_1: // %vector.body
+; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: ld1w { z0.s }, p0/z, [x2, x8, lsl #2]
+; CHECK-NEXT: ld1w { z1.s }, p0/z, [x3, x8, lsl #2]
+; CHECK-NEXT: fmul z0.s, z0.s, z1.s
+; CHECK-NEXT: st1w { z0.s }, p0, [x4, x8, lsl #2]
+; CHECK-NEXT: incw x8
+; CHECK-NEXT: cmp x9, x8
+; CHECK-NEXT: b.ne .LBB0_1
+; CHECK-NEXT: // %bb.2: // %for.cond.cleanup
+; CHECK-NEXT: ret
+entry:
+ %wide.trip.count = zext i32 %N to i64
+ %0 = tail call i64 @llvm.vscale.i64()
+ %1 = shl nuw nsw i64 %0, 2
+ br label %vector.body
+
+vector.body:
+ %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ]
+ %2 = getelementptr inbounds float, ptr %in1, i64 %index
+ %wide.load = load <vscale x 4 x float>, ptr %2, align 4
+ %3 = getelementptr inbounds float, ptr %in2, i64 %index
+ %wide.load16 = load <vscale x 4 x float>, ptr %3, align 4
+ %4 = fmul <vscale x 4 x float> %wide.load, %wide.load16
+ %5 = getelementptr inbounds float, ptr %out, i64 %index
+ store <vscale x 4 x float> %4, ptr %5, align 4
+ %index.next = add nuw i64 %index, %1
+ %6 = icmp eq i64 %index.next, %wide.trip.count
+ br i1 %6, label %for.cond.cleanup, label %vector.body
+
+for.cond.cleanup:
+ ret void
+}
+
+define void @dec_sub(i32 %first, i32 %N, ptr %in1, ptr %in2, ptr %out) #0 {
+; CHECK-LABEL: dec_sub:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: ptrue p0.s
+; CHECK-NEXT: rdvl x9, #-1
+; CHECK-NEXT: mov w8, w1
+; CHECK-NEXT: add x11, x9, #4
+; CHECK-NEXT: add x9, x2, x11
+; CHECK-NEXT: add x10, x3, x11
+; CHECK-NEXT: add x11, x4, x11
+; CHECK-NEXT: .LBB1_1: // %vector.body
+; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: ld1w { z0.s }, p0/z, [x9, x8, lsl #2]
+; CHECK-NEXT: ld1w { z1.s }, p0/z, [x10, x8, lsl #2]
+; CHECK-NEXT: fmul z0.s, z0.s, z1.s
+; CHECK-NEXT: st1w { z0.s }, p0, [x11, x8, lsl #2]
+; CHECK-NEXT: decw x8
+; CHECK-NEXT: cbnz x8, .LBB1_1
+; CHECK-NEXT: // %bb.2: // %for.cond.cleanup
+; CHECK-NEXT: ret
+entry:
+ %0 = zext i32 %N to i64
+ %1 = tail call i64 @llvm.vscale.i64()
+ %2 = shl nuw nsw i64 %1, 2
+ %3 = sub nsw i64 1, %2
+ %invariant.gep = getelementptr float, ptr %in1, i64 %3
+ %invariant.gep20 = getelementptr float, ptr %in2, i64 %3
+ %invariant.gep22 = getelementptr float, ptr %out, i64 %3
+ br label %vector.body
+
+vector.body:
+ %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ]
+ %offset.idx = sub i64 %0, %index
+ %gep = getelementptr float, ptr %invariant.gep, i64 %offset.idx
+ %wide.load = load <vscale x 4 x float>, ptr %gep, align 4
+ %gep21 = getelementptr float, ptr %invariant.gep20, i64 %offset.idx
+ %wide.load16 = load <vscale x 4 x float>, ptr %gep21, align 4
+ %4 = fmul <vscale x 4 x float> %wide.load, %wide.load16
+ %gep23 = getelementptr float, ptr %invariant.gep22, i64 %offset.idx
+ store <vscale x 4 x float> %4, ptr %gep23, align 4
+ %index.next = add nuw i64 %index, %2
+ %5 = icmp eq i64 %index.next, %0
+ br i1 %5, label %for.cond.cleanup, label %vector.body
+
+for.cond.cleanup:
+ ret void
+}
+
+declare i64 @llvm.vscale.i64()
+
+attributes #0 = { "target-features"="+sve2" }
diff --git a/llvm/test/CodeGen/AArch64/sve2-vscale-sinking.ll b/llvm/test/CodeGen/AArch64/sve2-vscale-sinking.ll
new file mode 100644
index 000000000000000..88d2f468d6a171c
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/sve2-vscale-sinking.ll
@@ -0,0 +1,112 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 2
+; RUN: opt -codegenprepare -S -o - %s | FileCheck %s
+
+target triple = "aarch64-unknown-linux-gnu"
+
+define void @inc_add(i32 %first, i32 %N, ptr %in1, ptr %in2, ptr %out) #0 {
+; CHECK-LABEL: define void @inc_add
+; CHECK-SAME: (i32 [[FIRST:%.*]], i32 [[N:%.*]], ptr [[IN1:%.*]], ptr [[IN2:%.*]], ptr [[OUT:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
+; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK: vector.body:
+; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds float, ptr [[IN1]], i64 [[INDEX]]
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x float>, ptr [[TMP0]], align 4
+; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds float, ptr [[IN2]], i64 [[INDEX]]
+; CHECK-NEXT: [[WIDE_LOAD16:%.*]] = load <vscale x 4 x float>, ptr [[TMP1]], align 4
+; CHECK-NEXT: [[TMP2:%.*]] = fmul <vscale x 4 x float> [[WIDE_LOAD]], [[WIDE_LOAD16]]
+; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds float, ptr [[OUT]], i64 [[INDEX]]
+; CHECK-NEXT: store <vscale x 4 x float> [[TMP2]], ptr [[TMP3]], align 4
+; CHECK-NEXT: [[TMP4:%.*]] = tail call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP5:%.*]] = shl nuw nsw i64 [[TMP4]], 2
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
+; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[WIDE_TRIP_COUNT]]
+; CHECK-NEXT: br i1 [[TMP6]], label [[FOR_COND_CLEANUP:%.*]], label [[VECTOR_BODY]]
+; CHECK: for.cond.cleanup:
+; CHECK-NEXT: ret void
+;
+entry:
+ %wide.trip.count = zext i32 %N to i64
+ %0 = tail call i64 @llvm.vscale.i64()
+ %1 = shl nuw nsw i64 %0, 2
+ br label %vector.body
+
+vector.body:
+ %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ]
+ %2 = getelementptr inbounds float, ptr %in1, i64 %index
+ %wide.load = load <vscale x 4 x float>, ptr %2, align 4
+ %3 = getelementptr inbounds float, ptr %in2, i64 %index
+ %wide.load16 = load <vscale x 4 x float>, ptr %3, align 4
+ %4 = fmul <vscale x 4 x float> %wide.load, %wide.load16
+ %5 = getelementptr inbounds float, ptr %out, i64 %index
+ store <vscale x 4 x float> %4, ptr %5, align 4
+ %index.next = add nuw i64 %index, %1
+ %6 = icmp eq i64 %index.next, %wide.trip.count
+ br i1 %6, label %for.cond.cleanup, label %vector.body
+
+for.cond.cleanup:
+ ret void
+}
+
+define void @dec_sub(i32 %first, i32 %N, ptr %in1, ptr %in2, ptr %out) #0 {
+; CHECK-LABEL: define void @dec_sub
+; CHECK-SAME: (i32 [[FIRST:%.*]], i32 [[N:%.*]], ptr [[IN1:%.*]], ptr [[IN2:%.*]], ptr [[OUT:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[TMP0:%.*]] = zext i32 [[N]] to i64
+; CHECK-NEXT: [[TMP1:%.*]] = tail call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 2
+; CHECK-NEXT: [[TMP3:%.*]] = sub nsw i64 1, [[TMP2]]
+; CHECK-NEXT: [[INVARIANT_GEP:%.*]] = getelementptr float, ptr [[IN1]], i64 [[TMP3]]
+; CHECK-NEXT: [[INVARIANT_GEP20:%.*]] = getelementptr float, ptr [[IN2]], i64 [[TMP3]]
+; CHECK-NEXT: [[INVARIANT_GEP22:%.*]] = getelementptr float, ptr [[OUT]], i64 [[TMP3]]
+; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK: vector.body:
+; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[OFFSET_IDX:%.*]] = sub i64 [[TMP0]], [[INDEX]]
+; CHECK-NEXT: [[GEP:%.*]] = getelementptr float, ptr [[INVARIANT_GEP]], i64 [[OFFSET_IDX]]
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x float>, ptr [[GEP]], align 4
+; CHECK-NEXT: [[GEP21:%.*]] = getelementptr float, ptr [[INVARIANT_GEP20]], i64 [[OFFSET_IDX]]
+; CHECK-NEXT: [[WIDE_LOAD16:%.*]] = load <vscale x 4 x float>, ptr [[GEP21]], align 4
+; CHECK-NEXT: [[TMP4:%.*]] = fmul <vscale x 4 x float> [[WIDE_LOAD]], [[WIDE_LOAD16]]
+; CHECK-NEXT: [[GEP23:%.*]] = getelementptr float, ptr [[INVARIANT_GEP22]], i64 [[OFFSET_IDX]]
+; CHECK-NEXT: store <vscale x 4 x float> [[TMP4]], ptr [[GEP23]], align 4
+; CHECK-NEXT: [[TMP5:%.*]] = tail call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP6:%.*]] = shl nuw nsw i64 [[TMP5]], 2
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP6]]
+; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[TMP0]]
+; CHECK-NEXT: br i1 [[TMP7]], label [[FOR_COND_CLEANUP:%.*]], label [[VECTOR_BODY]]
+; CHECK: for.cond.cleanup:
+; CHECK-NEXT: ret void
+;
+entry:
+ %0 = zext i32 %N to i64
+ %1 = tail call i64 @llvm.vscale.i64()
+ %2 = shl nuw nsw i64 %1, 2
+ %3 = sub nsw i64 1, %2
+ %invariant.gep = getelementptr float, ptr %in1, i64 %3
+ %invariant.gep20 = getelementptr float, ptr %in2, i64 %3
+ %invariant.gep22 = getelementptr float, ptr %out, i64 %3
+ br label %vector.body
+
+vector.body:
+ %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ]
+ %offset.idx = sub i64 %0, %index
+ %gep = getelementptr float, ptr %invariant.gep, i64 %offset.idx
+ %wide.load = load <vscale x 4 x float>, ptr %gep, align 4
+ %gep21 = getelementptr float, ptr %invariant.gep20, i64 %offset.idx
+ %wide.load16 = load <vscale x 4 x float>, ptr %gep21, align 4
+ %4 = fmul <vscale x 4 x float> %wide.load, %wide.load16
+ %gep23 = getelementptr float, ptr %invariant.gep22, i64 %offset.idx
+ store <vscale x 4 x float> %4, ptr %gep23, align 4
+ %index.next = add nuw i64 %index, %2
+ %5 = icmp eq i64 %index.next, %0
+ br i1 %5, label %for.cond.cleanup, label %vector.body
+
+for.cond.cleanup:
+ ret void
+}
+
+declare i64 @llvm.vscale.i64()
+
+attributes #0 = { "target-features"="+sve2" }
✅ With the latest revision this PR passed the C/C++ code formatter.
// If the subtarget wants to make use of sve inc* instructions, then sink
// vscale intrinsic (along with any shifts or multiplies) so that the
// appropriate folds can be made.
if (Subtarget->useScalarIncVL()) {
Is this check necessary? I'm hopeful there are reasons to lower the constant regardless of whether we'll ultimately emit inc/dec instructions. Even if that does not happen, MachineLICM should hoist the rdvl anyway. For example, if the add/sub is then splatted we can use the vector variants regardless of useScalarIncVL.
target triple = "aarch64-unknown-linux-gnu"
Do we need code generation tests for an IR transformation? We already have tests that show inc/dec construction.
bool Sink = false;
if (shouldSinkVScale(I->getOperand(0), Ops)) {
Ops.push_back(&I->getOperandUse(0));
Sink = true;
Can you just return true here? The only use case for Sink is when both operands are vscale related, and I'd expect such logic to be folded away. If you do this I recommend checking the second operand first given that's going to be the most common case.
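A minimal sketch of the simplification being suggested (hypothetical, not part of the posted patch), assuming the shouldSinkVScale helper introduced in this PR: check the second operand first and return as soon as a sinkable vscale operand is found, so the Sink flag disappears.

// Hypothetical rewrite of the guarded block in shouldSinkOperands().
// Operand 1 is checked first since `(add|sub) A, ((mul|shl) vscale, imm)`
// is the common canonical form; return immediately on the first match.
if (Subtarget->useScalarIncVL()) {
  if (shouldSinkVScale(I->getOperand(1), Ops)) {
    Ops.push_back(&I->getOperandUse(1));
    return true;
  }
  if (shouldSinkVScale(I->getOperand(0), Ops)) {
    Ops.push_back(&I->getOperandUse(0));
    return true;
  }
}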
// appropriate folds can be made.
if (Subtarget->useScalarIncVL()) {
bool Sink = false;
if (shouldSinkVScale(I->getOperand(0), Ops)) {
Is there a use case for lowering the first operand of a subtract? Is there a vscale equivalent of subf? I guess there could be other reasons to lower this but that just backs up my first comment that perhaps we shouldn't be guarding the lowering with useScalarIncVL.
@@ -14623,6 +14636,22 @@ bool AArch64TargetLowering::shouldSinkOperands(
}
}

// Sink vscales close to uses for better isel
s/close/closer/
For more recent sve capable CPUs it is beneficial to use the inc* instruction to increment a value by vscale (potentially shifted or multiplied) even in short loops. This patch tells codegenprepare to sink appropriate vscale calls into blocks where they are used so that isel can match them.
ccd604e to 46ecf8e Compare