
[AArch64] Sink vscale calls into loops for better isel #70304

Merged: 4 commits, Nov 7, 2023

Conversation

huntergr-arm (Collaborator)

For more recent SVE-capable CPUs it is beneficial to use the inc* instructions
to increment a value by vscale (potentially shifted or multiplied), even in
short loops.

This patch tells CodeGenPrepare to sink appropriate vscale calls into the
blocks where they are used so that isel can match them.
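
As a concrete illustration (not part of the patch), here is a minimal ACLE-style loop in which the induction variable steps by svcntw(), that is, vscale * 4 for 32-bit elements, which is the increment that incw implements. The function name and the assumption that n is a multiple of the vector length are invented for this sketch:

#include <arm_sve.h>
#include <cstdint>

// Multiply two float arrays. The index advances by the number of 32-bit
// lanes per vector (vscale * 4), so the loop increment is a candidate for
// the SVE incw instruction once the vscale computation sits next to the add.
void mul_arrays(const float *in1, const float *in2, float *out, uint64_t n) {
  svbool_t pg = svptrue_b32();
  // Assumes n is a multiple of svcntw(); real code would need a predicated tail.
  for (uint64_t i = 0; i < n; i += svcntw()) {
    svfloat32_t a = svld1_f32(pg, in1 + i);
    svfloat32_t b = svld1_f32(pg, in2 + i);
    svst1_f32(pg, out + i, svmul_f32_x(pg, a, b));
  }
}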

@llvmbot (Collaborator) commented Oct 26, 2023

@llvm/pr-subscribers-backend-aarch64

Author: Graham Hunter (huntergr-arm)

Changes

For more recent SVE-capable CPUs it is beneficial to use the inc* instructions
to increment a value by vscale (potentially shifted or multiplied), even in
short loops.

This patch tells CodeGenPrepare to sink appropriate vscale calls into the
blocks where they are used so that isel can match them.


Full diff: https://github.com/llvm/llvm-project/pull/70304.diff

3 Files Affected:

  • (modified) llvm/lib/Target/AArch64/AArch64ISelLowering.cpp (+37-3)
  • (added) llvm/test/CodeGen/AArch64/sve2-vscale-sinking-codegen.ll (+96)
  • (added) llvm/test/CodeGen/AArch64/sve2-vscale-sinking.ll (+112)
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 038c23b5e8d50ad..5b4f6531244c259 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -14507,6 +14507,19 @@ static bool shouldSinkVectorOfPtrs(Value *Ptrs, SmallVectorImpl<Use *> &Ops) {
   return true;
 }
 
+/// We want to sink following cases:
+/// (add|sub) A, ((mul|shl) vscale, imm); (add|sub) A, vscale
+static bool shouldSinkVScale(Value *Op, SmallVectorImpl<Use *> &Ops) {
+  if (match(Op, m_VScale()))
+    return true;
+  if (match(Op, m_Shl(m_VScale(), m_ConstantInt())) ||
+      match(Op, m_Mul(m_VScale(), m_ConstantInt()))) {
+    Ops.push_back(&cast<Instruction>(Op)->getOperandUse(0));
+    return true;
+  }
+  return false;
+}
+
 /// Check if sinking \p I's operands to I's basic block is profitable, because
 /// the operands can be folded into a target instruction, e.g.
 /// shufflevectors extracts and/or sext/zext can be folded into (u,s)subl(2).
@@ -14623,12 +14636,29 @@ bool AArch64TargetLowering::shouldSinkOperands(
     }
   }
 
-  if (!I->getType()->isVectorTy())
-    return false;
-
   switch (I->getOpcode()) {
   case Instruction::Sub:
   case Instruction::Add: {
+    // If the subtarget wants to make use of sve inc* instructions, then sink
+    // vscale intrinsic (along with any shifts or multiplies) so that the
+    // appropriate folds can be made.
+    if (Subtarget->useScalarIncVL()) {
+      bool Sink = false;
+      if (shouldSinkVScale(I->getOperand(0), Ops)) {
+        Ops.push_back(&I->getOperandUse(0));
+        Sink = true;
+      }
+      
+      if (shouldSinkVScale(I->getOperand(1), Ops)) {
+        Ops.push_back(&I->getOperandUse(1));
+        Sink = true;
+      }
+
+      if (Sink)
+        return true;
+    }
+    if (!I->getType()->isVectorTy())
+      return false;
     if (!areExtractExts(I->getOperand(0), I->getOperand(1)))
       return false;
 
@@ -14647,6 +14677,8 @@ bool AArch64TargetLowering::shouldSinkOperands(
     return true;
   }
   case Instruction::Or: {
+    if (!I->getType()->isVectorTy())
+      return false;
     // Pattern: Or(And(MaskValue, A), And(Not(MaskValue), B)) ->
     // bitselect(MaskValue, A, B) where Not(MaskValue) = Xor(MaskValue, -1)
     if (Subtarget->hasNEON()) {
@@ -14684,6 +14716,8 @@ bool AArch64TargetLowering::shouldSinkOperands(
     return false;
   }
   case Instruction::Mul: {
+    if (!I->getType()->isVectorTy())
+      return false;
     int NumZExts = 0, NumSExts = 0;
     for (auto &Op : I->operands()) {
       // Make sure we are not already sinking this operand
diff --git a/llvm/test/CodeGen/AArch64/sve2-vscale-sinking-codegen.ll b/llvm/test/CodeGen/AArch64/sve2-vscale-sinking-codegen.ll
new file mode 100644
index 000000000000000..afd171aeda758d5
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/sve2-vscale-sinking-codegen.ll
@@ -0,0 +1,96 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
+; RUN: llc < %s | FileCheck %s
+
+target triple = "aarch64-unknown-linux-gnu"
+
+define void @inc_add(i32 %first, i32 %N, ptr %in1, ptr %in2, ptr %out) #0 {
+; CHECK-LABEL: inc_add:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    mov x8, xzr
+; CHECK-NEXT:    mov w9, w1
+; CHECK-NEXT:  .LBB0_1: // %vector.body
+; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x2, x8, lsl #2]
+; CHECK-NEXT:    ld1w { z1.s }, p0/z, [x3, x8, lsl #2]
+; CHECK-NEXT:    fmul z0.s, z0.s, z1.s
+; CHECK-NEXT:    st1w { z0.s }, p0, [x4, x8, lsl #2]
+; CHECK-NEXT:    incw x8
+; CHECK-NEXT:    cmp x9, x8
+; CHECK-NEXT:    b.ne .LBB0_1
+; CHECK-NEXT:  // %bb.2: // %for.cond.cleanup
+; CHECK-NEXT:    ret
+entry:
+  %wide.trip.count = zext i32 %N to i64
+  %0 = tail call i64 @llvm.vscale.i64()
+  %1 = shl nuw nsw i64 %0, 2
+  br label %vector.body
+
+vector.body:
+  %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ]
+  %2 = getelementptr inbounds float, ptr %in1, i64 %index
+  %wide.load = load <vscale x 4 x float>, ptr %2, align 4
+  %3 = getelementptr inbounds float, ptr %in2, i64 %index
+  %wide.load16 = load <vscale x 4 x float>, ptr %3, align 4
+  %4 = fmul <vscale x 4 x float> %wide.load, %wide.load16
+  %5 = getelementptr inbounds float, ptr %out, i64 %index
+  store <vscale x 4 x float> %4, ptr %5, align 4
+  %index.next = add nuw i64 %index, %1
+  %6 = icmp eq i64 %index.next, %wide.trip.count
+  br i1 %6, label %for.cond.cleanup, label %vector.body
+
+for.cond.cleanup:
+  ret void
+}
+
+define void @dec_sub(i32 %first, i32 %N, ptr %in1, ptr %in2, ptr %out) #0 {
+; CHECK-LABEL: dec_sub:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    rdvl x9, #-1
+; CHECK-NEXT:    mov w8, w1
+; CHECK-NEXT:    add x11, x9, #4
+; CHECK-NEXT:    add x9, x2, x11
+; CHECK-NEXT:    add x10, x3, x11
+; CHECK-NEXT:    add x11, x4, x11
+; CHECK-NEXT:  .LBB1_1: // %vector.body
+; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x9, x8, lsl #2]
+; CHECK-NEXT:    ld1w { z1.s }, p0/z, [x10, x8, lsl #2]
+; CHECK-NEXT:    fmul z0.s, z0.s, z1.s
+; CHECK-NEXT:    st1w { z0.s }, p0, [x11, x8, lsl #2]
+; CHECK-NEXT:    decw x8
+; CHECK-NEXT:    cbnz x8, .LBB1_1
+; CHECK-NEXT:  // %bb.2: // %for.cond.cleanup
+; CHECK-NEXT:    ret
+entry:
+  %0 = zext i32 %N to i64
+  %1 = tail call i64 @llvm.vscale.i64()
+  %2 = shl nuw nsw i64 %1, 2
+  %3 = sub nsw i64 1, %2
+  %invariant.gep = getelementptr float, ptr %in1, i64 %3
+  %invariant.gep20 = getelementptr float, ptr %in2, i64 %3
+  %invariant.gep22 = getelementptr float, ptr %out, i64 %3
+  br label %vector.body
+
+vector.body:
+  %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ]
+  %offset.idx = sub i64 %0, %index
+  %gep = getelementptr float, ptr %invariant.gep, i64 %offset.idx
+  %wide.load = load <vscale x 4 x float>, ptr %gep, align 4
+  %gep21 = getelementptr float, ptr %invariant.gep20, i64 %offset.idx
+  %wide.load16 = load <vscale x 4 x float>, ptr %gep21, align 4
+  %4 = fmul <vscale x 4 x float> %wide.load, %wide.load16
+  %gep23 = getelementptr float, ptr %invariant.gep22, i64 %offset.idx
+  store <vscale x 4 x float> %4, ptr %gep23, align 4
+  %index.next = add nuw i64 %index, %2
+  %5 = icmp eq i64 %index.next, %0
+  br i1 %5, label %for.cond.cleanup, label %vector.body
+
+for.cond.cleanup:
+  ret void
+}
+
+declare i64 @llvm.vscale.i64()
+
+attributes #0 = { "target-features"="+sve2" }
diff --git a/llvm/test/CodeGen/AArch64/sve2-vscale-sinking.ll b/llvm/test/CodeGen/AArch64/sve2-vscale-sinking.ll
new file mode 100644
index 000000000000000..88d2f468d6a171c
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/sve2-vscale-sinking.ll
@@ -0,0 +1,112 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 2
+; RUN: opt -codegenprepare -S -o - %s | FileCheck %s
+
+target triple = "aarch64-unknown-linux-gnu"
+
+define void @inc_add(i32 %first, i32 %N, ptr %in1, ptr %in2, ptr %out) #0 {
+; CHECK-LABEL: define void @inc_add
+; CHECK-SAME: (i32 [[FIRST:%.*]], i32 [[N:%.*]], ptr [[IN1:%.*]], ptr [[IN2:%.*]], ptr [[OUT:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
+; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK:       vector.body:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr inbounds float, ptr [[IN1]], i64 [[INDEX]]
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 4 x float>, ptr [[TMP0]], align 4
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds float, ptr [[IN2]], i64 [[INDEX]]
+; CHECK-NEXT:    [[WIDE_LOAD16:%.*]] = load <vscale x 4 x float>, ptr [[TMP1]], align 4
+; CHECK-NEXT:    [[TMP2:%.*]] = fmul <vscale x 4 x float> [[WIDE_LOAD]], [[WIDE_LOAD16]]
+; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds float, ptr [[OUT]], i64 [[INDEX]]
+; CHECK-NEXT:    store <vscale x 4 x float> [[TMP2]], ptr [[TMP3]], align 4
+; CHECK-NEXT:    [[TMP4:%.*]] = tail call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP5:%.*]] = shl nuw nsw i64 [[TMP4]], 2
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
+; CHECK-NEXT:    [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[WIDE_TRIP_COUNT]]
+; CHECK-NEXT:    br i1 [[TMP6]], label [[FOR_COND_CLEANUP:%.*]], label [[VECTOR_BODY]]
+; CHECK:       for.cond.cleanup:
+; CHECK-NEXT:    ret void
+;
+entry:
+  %wide.trip.count = zext i32 %N to i64
+  %0 = tail call i64 @llvm.vscale.i64()
+  %1 = shl nuw nsw i64 %0, 2
+  br label %vector.body
+
+vector.body:
+  %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ]
+  %2 = getelementptr inbounds float, ptr %in1, i64 %index
+  %wide.load = load <vscale x 4 x float>, ptr %2, align 4
+  %3 = getelementptr inbounds float, ptr %in2, i64 %index
+  %wide.load16 = load <vscale x 4 x float>, ptr %3, align 4
+  %4 = fmul <vscale x 4 x float> %wide.load, %wide.load16
+  %5 = getelementptr inbounds float, ptr %out, i64 %index
+  store <vscale x 4 x float> %4, ptr %5, align 4
+  %index.next = add nuw i64 %index, %1
+  %6 = icmp eq i64 %index.next, %wide.trip.count
+  br i1 %6, label %for.cond.cleanup, label %vector.body
+
+for.cond.cleanup:
+  ret void
+}
+
+define void @dec_sub(i32 %first, i32 %N, ptr %in1, ptr %in2, ptr %out) #0 {
+; CHECK-LABEL: define void @dec_sub
+; CHECK-SAME: (i32 [[FIRST:%.*]], i32 [[N:%.*]], ptr [[IN1:%.*]], ptr [[IN2:%.*]], ptr [[OUT:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = zext i32 [[N]] to i64
+; CHECK-NEXT:    [[TMP1:%.*]] = tail call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 2
+; CHECK-NEXT:    [[TMP3:%.*]] = sub nsw i64 1, [[TMP2]]
+; CHECK-NEXT:    [[INVARIANT_GEP:%.*]] = getelementptr float, ptr [[IN1]], i64 [[TMP3]]
+; CHECK-NEXT:    [[INVARIANT_GEP20:%.*]] = getelementptr float, ptr [[IN2]], i64 [[TMP3]]
+; CHECK-NEXT:    [[INVARIANT_GEP22:%.*]] = getelementptr float, ptr [[OUT]], i64 [[TMP3]]
+; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK:       vector.body:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[OFFSET_IDX:%.*]] = sub i64 [[TMP0]], [[INDEX]]
+; CHECK-NEXT:    [[GEP:%.*]] = getelementptr float, ptr [[INVARIANT_GEP]], i64 [[OFFSET_IDX]]
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 4 x float>, ptr [[GEP]], align 4
+; CHECK-NEXT:    [[GEP21:%.*]] = getelementptr float, ptr [[INVARIANT_GEP20]], i64 [[OFFSET_IDX]]
+; CHECK-NEXT:    [[WIDE_LOAD16:%.*]] = load <vscale x 4 x float>, ptr [[GEP21]], align 4
+; CHECK-NEXT:    [[TMP4:%.*]] = fmul <vscale x 4 x float> [[WIDE_LOAD]], [[WIDE_LOAD16]]
+; CHECK-NEXT:    [[GEP23:%.*]] = getelementptr float, ptr [[INVARIANT_GEP22]], i64 [[OFFSET_IDX]]
+; CHECK-NEXT:    store <vscale x 4 x float> [[TMP4]], ptr [[GEP23]], align 4
+; CHECK-NEXT:    [[TMP5:%.*]] = tail call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP6:%.*]] = shl nuw nsw i64 [[TMP5]], 2
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP6]]
+; CHECK-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[TMP0]]
+; CHECK-NEXT:    br i1 [[TMP7]], label [[FOR_COND_CLEANUP:%.*]], label [[VECTOR_BODY]]
+; CHECK:       for.cond.cleanup:
+; CHECK-NEXT:    ret void
+;
+entry:
+  %0 = zext i32 %N to i64
+  %1 = tail call i64 @llvm.vscale.i64()
+  %2 = shl nuw nsw i64 %1, 2
+  %3 = sub nsw i64 1, %2
+  %invariant.gep = getelementptr float, ptr %in1, i64 %3
+  %invariant.gep20 = getelementptr float, ptr %in2, i64 %3
+  %invariant.gep22 = getelementptr float, ptr %out, i64 %3
+  br label %vector.body
+
+vector.body:
+  %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ]
+  %offset.idx = sub i64 %0, %index
+  %gep = getelementptr float, ptr %invariant.gep, i64 %offset.idx
+  %wide.load = load <vscale x 4 x float>, ptr %gep, align 4
+  %gep21 = getelementptr float, ptr %invariant.gep20, i64 %offset.idx
+  %wide.load16 = load <vscale x 4 x float>, ptr %gep21, align 4
+  %4 = fmul <vscale x 4 x float> %wide.load, %wide.load16
+  %gep23 = getelementptr float, ptr %invariant.gep22, i64 %offset.idx
+  store <vscale x 4 x float> %4, ptr %gep23, align 4
+  %index.next = add nuw i64 %index, %2
+  %5 = icmp eq i64 %index.next, %0
+  br i1 %5, label %for.cond.cleanup, label %vector.body
+
+for.cond.cleanup:
+  ret void
+}
+
+declare i64 @llvm.vscale.i64()
+
+attributes #0 = { "target-features"="+sve2" }

github-actions bot commented Oct 26, 2023

✅ With the latest revision this PR passed the C/C++ code formatter.

// If the subtarget wants to make use of sve inc* instructions, then sink
// vscale intrinsic (along with any shifts or multiplies) so that the
// appropriate folds can be made.
if (Subtarget->useScalarIncVL()) {
Collaborator

Is this check necessary? I'm hopeful there are reasons[1] to lower the constant regardless of whether we'll ultimately emit inc/dec instructions. Even if that does not happen, MachineLICM should hoist the rdvl anyway.

[1] For example, if the add/sub is then splatted, we can use the vector variants regardless of useScalarIncVL.

Comment on lines 4 to 5
target triple = "aarch64-unknown-linux-gnu"

Collaborator

Do we need code generation tests for an IR transformation? We already have tests that show inc/dec construction.

bool Sink = false;
if (shouldSinkVScale(I->getOperand(0), Ops)) {
  Ops.push_back(&I->getOperandUse(0));
  Sink = true;
Collaborator

Can you just return true here? The only use case for Sink is when both operands are vscale-related, and I'd expect such logic to be folded away. If you do this, I recommend checking the second operand first, given that's going to be the most common case.
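
A rough sketch of the suggested restructuring, reusing the patch's shouldSinkVScale helper and checking the second operand first (untested, illustrative only):

    // Sketch of the suggested early-return shape: return as soon as one
    // operand is sinkable, trying operand 1 first since `A + (vscale << C)`
    // is the common case. No separate Sink flag is needed.
    if (Subtarget->useScalarIncVL()) {
      if (shouldSinkVScale(I->getOperand(1), Ops)) {
        Ops.push_back(&I->getOperandUse(1));
        return true;
      }
      if (shouldSinkVScale(I->getOperand(0), Ops)) {
        Ops.push_back(&I->getOperandUse(0));
        return true;
      }
    }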

// appropriate folds can be made.
if (Subtarget->useScalarIncVL()) {
  bool Sink = false;
  if (shouldSinkVScale(I->getOperand(0), Ops)) {
Collaborator

Is there a use case for lowering the first operand of a subtract? Is there a vscale equivalent of subf? I guess there could be other reasons to lower this, but that just backs up my first comment that perhaps we shouldn't be guarding the lowering with useScalarIncVL.

@@ -14623,6 +14636,22 @@ bool AArch64TargetLowering::shouldSinkOperands(
    }
  }

  // Sink vscales close to uses for better isel
Collaborator

s/close/closer/
