[LSR] Recognize vscale-relative immediates #88124

huntergr-arm · 2024-04-09T13:37:12Z

Final part of the vscale-aware LSR work, see https://discourse.llvm.org/t/rfc-vscale-aware-loopstrengthreduce/77131

It's a bit messy right now; I mainly just want to know if there are any objections to the current work before I finish it up.

llvmbot · 2024-04-09T13:37:48Z

@llvm/pr-subscribers-llvm-support

@llvm/pr-subscribers-llvm-transforms

Author: Graham Hunter (huntergr-arm)

Changes

Final part of the vscale-aware LSR work, see https://discourse.llvm.org/t/rfc-vscale-aware-loopstrengthreduce/77131

It's a bit messy right now; I mainly just want to know if there are any objections to the current work before I finish it up.

Patch is 59.52 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/88124.diff

2 Files Affected:

(modified) llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp (+372-160)
(added) llvm/test/Transforms/LoopStrengthReduce/AArch64/vscale-fixups.ll (+147)

diff --git a/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp b/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp
index ec42e2d6e193a6..b5d0113bafe023 100644
--- a/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp
+++ b/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp
@@ -197,6 +197,14 @@ static cl::opt<bool> AllowDropSolutionIfLessProfitable(
     "lsr-drop-solution", cl::Hidden, cl::init(false),
     cl::desc("Attempt to drop solution if it is less profitable"));
 
+static cl::opt<bool> EnableVScaleImmediates(
+    "lsr-enable-vscale-immediates", cl::Hidden, cl::init(true),
+    cl::desc("Enable analysis of vscale-relative immediates in LSR"));
+
+static cl::opt<bool> DropScaledForVScale(
+    "lsr-drop-scaled-reg-for-vscale", cl::Hidden, cl::init(true),
+    cl::desc("Avoid using scaled registers with vscale-relative addressing"));
+
 STATISTIC(NumTermFold,
           "Number of terminating condition fold recognized and performed");
 
@@ -247,6 +255,68 @@ class RegSortData {
   void dump() const;
 };
 
+// An offset from an address that is either scalable or fixed. Used for
+// per-target optimizations of addressing modes.
+class Immediate : public details::FixedOrScalableQuantity<Immediate, int64_t> {
+  constexpr Immediate(ScalarTy MinVal, bool Scalable)
+      : FixedOrScalableQuantity(MinVal, Scalable) {}
+
+  constexpr Immediate(const FixedOrScalableQuantity<Immediate, int64_t> &V)
+      : FixedOrScalableQuantity(V) {}
+
+public:
+  constexpr Immediate() : FixedOrScalableQuantity() {}
+
+  static constexpr Immediate getFixed(ScalarTy MinVal) {
+    return Immediate(MinVal, false);
+  }
+  static constexpr Immediate getScalable(ScalarTy MinVal) {
+    return Immediate(MinVal, true);
+  }
+  static constexpr Immediate get(ScalarTy MinVal, bool Scalable) {
+    return Immediate(MinVal, Scalable);
+  }
+
+  constexpr bool isLessThanZero() const { return Quantity < 0; }
+
+  constexpr bool isGreaterThanZero() const { return Quantity > 0; }
+
+  constexpr bool isMin() const {
+    return Quantity == std::numeric_limits<ScalarTy>::min();
+  }
+
+  constexpr bool isMax() const {
+    return Quantity == std::numeric_limits<ScalarTy>::max();
+  }
+};
+
+// This is needed for the Compare type of std::map when Immediate is used
+// as a key. We don't need it to be fully correct against any value of vscale,
+// just to make sure that vscale-related terms in the map are considered against
+// each other rather than being mixed up and potentially missing opportunities.
+struct KeyOrderTargetImmediate {
+  bool operator()(const Immediate &LHS, const Immediate &RHS) const {
+    if (LHS.isScalable() && !RHS.isScalable())
+      return false;
+    if (!LHS.isScalable() && RHS.isScalable())
+      return true;
+    return LHS.getKnownMinValue() < RHS.getKnownMinValue();
+  }
+};
+
+// This would be nicer if we could be generic instead of directly using size_t,
+// but there doesn't seem to be a type trait for is_orderable or
+// is_lessthan_comparable or similar.
+struct KeyOrderSizeTAndImmediate {
+  bool operator()(const std::pair<size_t, Immediate> &LHS,
+                  const std::pair<size_t, Immediate> &RHS) const {
+    size_t LSize = LHS.first;
+    size_t RSize = RHS.first;
+    if (LSize != RSize)
+      return LSize < RSize;
+    return KeyOrderTargetImmediate()(LHS.second, RHS.second);
+  }
+};
 } // end anonymous namespace
 
 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
@@ -357,7 +427,7 @@ struct Formula {
   GlobalValue *BaseGV = nullptr;
 
   /// Base offset for complex addressing.
-  int64_t BaseOffset = 0;
+  Immediate BaseOffset;
 
   /// Whether any complex addressing has a base register.
   bool HasBaseReg = false;
@@ -388,7 +458,7 @@ struct Formula {
   /// An additional constant offset which added near the use. This requires a
   /// temporary register, but the offset itself can live in an add immediate
   /// field rather than a register.
-  int64_t UnfoldedOffset = 0;
+  Immediate UnfoldedOffset;
 
   Formula() = default;
 
@@ -628,7 +698,7 @@ void Formula::print(raw_ostream &OS) const {
     if (!First) OS << " + "; else First = false;
     BaseGV->printAsOperand(OS, /*PrintType=*/false);
   }
-  if (BaseOffset != 0) {
+  if (BaseOffset.isNonZero()) {
     if (!First) OS << " + "; else First = false;
     OS << BaseOffset;
   }
@@ -652,7 +722,7 @@ void Formula::print(raw_ostream &OS) const {
       OS << "<unknown>";
     OS << ')';
   }
-  if (UnfoldedOffset != 0) {
+  if (UnfoldedOffset.isNonZero()) {
     if (!First) OS << " + ";
     OS << "imm(" << UnfoldedOffset << ')';
   }
@@ -798,28 +868,34 @@ static const SCEV *getExactSDiv(const SCEV *LHS, const SCEV *RHS,
 
 /// If S involves the addition of a constant integer value, return that integer
 /// value, and mutate S to point to a new SCEV with that value excluded.
-static int64_t ExtractImmediate(const SCEV *&S, ScalarEvolution &SE) {
+static Immediate ExtractImmediate(const SCEV *&S, ScalarEvolution &SE) {
   if (const SCEVConstant *C = dyn_cast<SCEVConstant>(S)) {
     if (C->getAPInt().getSignificantBits() <= 64) {
       S = SE.getConstant(C->getType(), 0);
-      return C->getValue()->getSExtValue();
+      return Immediate::getFixed(C->getValue()->getSExtValue());
     }
   } else if (const SCEVAddExpr *Add = dyn_cast<SCEVAddExpr>(S)) {
     SmallVector<const SCEV *, 8> NewOps(Add->operands());
-    int64_t Result = ExtractImmediate(NewOps.front(), SE);
-    if (Result != 0)
+    Immediate Result = ExtractImmediate(NewOps.front(), SE);
+    if (Result.isNonZero())
       S = SE.getAddExpr(NewOps);
     return Result;
   } else if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(S)) {
     SmallVector<const SCEV *, 8> NewOps(AR->operands());
-    int64_t Result = ExtractImmediate(NewOps.front(), SE);
-    if (Result != 0)
+    Immediate Result = ExtractImmediate(NewOps.front(), SE);
+    if (Result.isNonZero())
       S = SE.getAddRecExpr(NewOps, AR->getLoop(),
                            // FIXME: AR->getNoWrapFlags(SCEV::FlagNW)
                            SCEV::FlagAnyWrap);
     return Result;
-  }
-  return 0;
+  } else if (EnableVScaleImmediates)
+    if (const SCEVMulExpr *M = dyn_cast<SCEVMulExpr>(S))
+      if (const SCEVConstant *C = dyn_cast<SCEVConstant>(M->getOperand(0)))
+        if (isa<SCEVVScale>(M->getOperand(1))) {
+          S = SE.getConstant(M->getType(), 0);
+          return Immediate::getScalable(C->getValue()->getSExtValue());
+        }
+  return Immediate();
 }
 
 /// If S involves the addition of a GlobalValue address, return that symbol, and
@@ -1134,7 +1210,7 @@ struct LSRFixup {
   /// A constant offset to be added to the LSRUse expression.  This allows
   /// multiple fixups to share the same LSRUse with different offsets, for
   /// example in an unrolled loop.
-  int64_t Offset = 0;
+  Immediate Offset;
 
   LSRFixup() = default;
 
@@ -1197,8 +1273,10 @@ class LSRUse {
   SmallVector<LSRFixup, 8> Fixups;
 
   /// Keep track of the min and max offsets of the fixups.
-  int64_t MinOffset = std::numeric_limits<int64_t>::max();
-  int64_t MaxOffset = std::numeric_limits<int64_t>::min();
+  Immediate MinOffset =
+      Immediate::getFixed(std::numeric_limits<int64_t>::max());
+  Immediate MaxOffset =
+      Immediate::getFixed(std::numeric_limits<int64_t>::min());
 
   /// This records whether all of the fixups using this LSRUse are outside of
   /// the loop, in which case some special-case heuristics may be used.
@@ -1234,9 +1312,9 @@ class LSRUse {
 
   void pushFixup(LSRFixup &f) {
     Fixups.push_back(f);
-    if (f.Offset > MaxOffset)
+    if (Immediate::isKnownGT(f.Offset, MaxOffset))
       MaxOffset = f.Offset;
-    if (f.Offset < MinOffset)
+    if (Immediate::isKnownLT(f.Offset, MinOffset))
       MinOffset = f.Offset;
   }
 
@@ -1254,7 +1332,7 @@ class LSRUse {
 
 static bool isAMCompletelyFolded(const TargetTransformInfo &TTI,
                                  LSRUse::KindType Kind, MemAccessTy AccessTy,
-                                 GlobalValue *BaseGV, int64_t BaseOffset,
+                                 GlobalValue *BaseGV, Immediate BaseOffset,
                                  bool HasBaseReg, int64_t Scale,
                                  Instruction *Fixup = nullptr);
 
@@ -1310,7 +1388,7 @@ void Cost::RateRegister(const Formula &F, const SCEV *Reg,
       // addressing.
       if (AMK == TTI::AMK_PreIndexed) {
         if (auto *Step = dyn_cast<SCEVConstant>(AR->getStepRecurrence(*SE)))
-          if (Step->getAPInt() == F.BaseOffset)
+          if (Step->getAPInt() == F.BaseOffset.getFixedValue())
             LoopCost = 0;
       } else if (AMK == TTI::AMK_PostIndexed) {
         const SCEV *LoopStep = AR->getStepRecurrence(*SE);
@@ -1401,24 +1479,29 @@ void Cost::RateFormula(const Formula &F,
     // allows to fold 2 registers.
     C.NumBaseAdds +=
         NumBaseParts - (1 + (F.Scale && isAMCompletelyFolded(*TTI, LU, F)));
-  C.NumBaseAdds += (F.UnfoldedOffset != 0);
+  C.NumBaseAdds += (F.UnfoldedOffset.isNonZero());
 
   // Accumulate non-free scaling amounts.
   C.ScaleCost += *getScalingFactorCost(*TTI, LU, F, *L).getValue();
 
   // Tally up the non-zero immediates.
   for (const LSRFixup &Fixup : LU.Fixups) {
-    int64_t O = Fixup.Offset;
-    int64_t Offset = (uint64_t)O + F.BaseOffset;
+    // FIXME: We probably want to noticeably increase the cost if the
+    // two offsets differ in scalability?
+    bool Scalable = Fixup.Offset.isScalable() || F.BaseOffset.isScalable();
+    int64_t O = Fixup.Offset.getKnownMinValue();
+    Immediate Offset = Immediate::get(
+        (uint64_t)(O) + F.BaseOffset.getKnownMinValue(), Scalable);
     if (F.BaseGV)
       C.ImmCost += 64; // Handle symbolic values conservatively.
                      // TODO: This should probably be the pointer size.
-    else if (Offset != 0)
-      C.ImmCost += APInt(64, Offset, true).getSignificantBits();
+    else if (Offset.isNonZero())
+      C.ImmCost +=
+          APInt(64, Offset.getKnownMinValue(), true).getSignificantBits();
 
     // Check with target if this offset with this instruction is
     // specifically not supported.
-    if (LU.Kind == LSRUse::Address && Offset != 0 &&
+    if (LU.Kind == LSRUse::Address && Offset.isNonZero() &&
         !isAMCompletelyFolded(*TTI, LSRUse::Address, LU.AccessTy, F.BaseGV,
                               Offset, F.HasBaseReg, F.Scale, Fixup.UserInst))
       C.NumBaseAdds++;
@@ -1546,7 +1629,7 @@ void LSRFixup::print(raw_ostream &OS) const {
     PIL->getHeader()->printAsOperand(OS, /*PrintType=*/false);
   }
 
-  if (Offset != 0)
+  if (Offset.isNonZero())
     OS << ", Offset=" << Offset;
 }
 
@@ -1673,14 +1756,19 @@ LLVM_DUMP_METHOD void LSRUse::dump() const {
 
 static bool isAMCompletelyFolded(const TargetTransformInfo &TTI,
                                  LSRUse::KindType Kind, MemAccessTy AccessTy,
-                                 GlobalValue *BaseGV, int64_t BaseOffset,
+                                 GlobalValue *BaseGV, Immediate BaseOffset,
                                  bool HasBaseReg, int64_t Scale,
-                                 Instruction *Fixup/*= nullptr*/) {
+                                 Instruction *Fixup /*= nullptr*/) {
   switch (Kind) {
-  case LSRUse::Address:
-    return TTI.isLegalAddressingMode(AccessTy.MemTy, BaseGV, BaseOffset,
-                                     HasBaseReg, Scale, AccessTy.AddrSpace, Fixup);
-
+  case LSRUse::Address: {
+    int64_t FixedOffset =
+        BaseOffset.isScalable() ? 0 : BaseOffset.getFixedValue();
+    int64_t ScalableOffset =
+        BaseOffset.isScalable() ? BaseOffset.getKnownMinValue() : 0;
+    return TTI.isLegalAddressingMode(AccessTy.MemTy, BaseGV, FixedOffset,
+                                     HasBaseReg, Scale, AccessTy.AddrSpace,
+                                     Fixup, ScalableOffset);
+  }
   case LSRUse::ICmpZero:
     // There's not even a target hook for querying whether it would be legal to
     // fold a GV into an ICmp.
@@ -1688,7 +1776,7 @@ static bool isAMCompletelyFolded(const TargetTransformInfo &TTI,
       return false;
 
     // ICmp only has two operands; don't allow more than two non-trivial parts.
-    if (Scale != 0 && HasBaseReg && BaseOffset != 0)
+    if (Scale != 0 && HasBaseReg && BaseOffset.isNonZero())
       return false;
 
     // ICmp only supports no scale or a -1 scale, as we can "fold" a -1 scale by
@@ -1698,7 +1786,7 @@ static bool isAMCompletelyFolded(const TargetTransformInfo &TTI,
 
     // If we have low-level target information, ask the target if it can fold an
     // integer immediate on an icmp.
-    if (BaseOffset != 0) {
+    if (BaseOffset.isNonZero()) {
       // We have one of:
       // ICmpZero     BaseReg + BaseOffset => ICmp BaseReg, -BaseOffset
       // ICmpZero -1*ScaleReg + BaseOffset => ICmp ScaleReg, BaseOffset
@@ -1706,8 +1794,8 @@ static bool isAMCompletelyFolded(const TargetTransformInfo &TTI,
       if (Scale == 0)
         // The cast does the right thing with
         // std::numeric_limits<int64_t>::min().
-        BaseOffset = -(uint64_t)BaseOffset;
-      return TTI.isLegalICmpImmediate(BaseOffset);
+        BaseOffset = BaseOffset.getFixed((uint64_t)BaseOffset.getFixedValue());
+      return TTI.isLegalICmpImmediate(BaseOffset.getFixedValue());
     }
 
     // ICmpZero BaseReg + -1*ScaleReg => ICmp BaseReg, ScaleReg
@@ -1715,30 +1803,36 @@ static bool isAMCompletelyFolded(const TargetTransformInfo &TTI,
 
   case LSRUse::Basic:
     // Only handle single-register values.
-    return !BaseGV && Scale == 0 && BaseOffset == 0;
+    return !BaseGV && Scale == 0 && BaseOffset.isZero();
 
   case LSRUse::Special:
     // Special case Basic to handle -1 scales.
-    return !BaseGV && (Scale == 0 || Scale == -1) && BaseOffset == 0;
+    return !BaseGV && (Scale == 0 || Scale == -1) && BaseOffset.isZero();
   }
 
   llvm_unreachable("Invalid LSRUse Kind!");
 }
 
 static bool isAMCompletelyFolded(const TargetTransformInfo &TTI,
-                                 int64_t MinOffset, int64_t MaxOffset,
+                                 Immediate MinOffset, Immediate MaxOffset,
                                  LSRUse::KindType Kind, MemAccessTy AccessTy,
-                                 GlobalValue *BaseGV, int64_t BaseOffset,
+                                 GlobalValue *BaseGV, Immediate BaseOffset,
                                  bool HasBaseReg, int64_t Scale) {
+  if (BaseOffset.isNonZero() &&
+      (BaseOffset.isScalable() != MinOffset.isScalable() ||
+       BaseOffset.isScalable() != MaxOffset.isScalable()))
+    return false;
+  // Check for overflow.
   // Check for overflow.
-  if (((int64_t)((uint64_t)BaseOffset + MinOffset) > BaseOffset) !=
-      (MinOffset > 0))
+  int64_t Base = BaseOffset.getKnownMinValue();
+  int64_t Min = MinOffset.getKnownMinValue();
+  int64_t Max = MaxOffset.getKnownMinValue();
+  if (((int64_t)((uint64_t)Base + Min) > Base) != (Min > 0))
     return false;
-  MinOffset = (uint64_t)BaseOffset + MinOffset;
-  if (((int64_t)((uint64_t)BaseOffset + MaxOffset) > BaseOffset) !=
-      (MaxOffset > 0))
+  MinOffset = Immediate::get((uint64_t)Base + Min, MinOffset.isScalable());
+  if (((int64_t)((uint64_t)Base + Max) > Base) != (Max > 0))
     return false;
-  MaxOffset = (uint64_t)BaseOffset + MaxOffset;
+  MaxOffset = Immediate::get((uint64_t)Base + Max, MaxOffset.isScalable());
 
   return isAMCompletelyFolded(TTI, Kind, AccessTy, BaseGV, MinOffset,
                               HasBaseReg, Scale) &&
@@ -1747,7 +1841,7 @@ static bool isAMCompletelyFolded(const TargetTransformInfo &TTI,
 }
 
 static bool isAMCompletelyFolded(const TargetTransformInfo &TTI,
-                                 int64_t MinOffset, int64_t MaxOffset,
+                                 Immediate MinOffset, Immediate MaxOffset,
                                  LSRUse::KindType Kind, MemAccessTy AccessTy,
                                  const Formula &F, const Loop &L) {
   // For the purpose of isAMCompletelyFolded either having a canonical formula
@@ -1763,10 +1857,10 @@ static bool isAMCompletelyFolded(const TargetTransformInfo &TTI,
 }
 
 /// Test whether we know how to expand the current formula.
-static bool isLegalUse(const TargetTransformInfo &TTI, int64_t MinOffset,
-                       int64_t MaxOffset, LSRUse::KindType Kind,
+static bool isLegalUse(const TargetTransformInfo &TTI, Immediate MinOffset,
+                       Immediate MaxOffset, LSRUse::KindType Kind,
                        MemAccessTy AccessTy, GlobalValue *BaseGV,
-                       int64_t BaseOffset, bool HasBaseReg, int64_t Scale) {
+                       Immediate BaseOffset, bool HasBaseReg, int64_t Scale) {
   // We know how to expand completely foldable formulae.
   return isAMCompletelyFolded(TTI, MinOffset, MaxOffset, Kind, AccessTy, BaseGV,
                               BaseOffset, HasBaseReg, Scale) ||
@@ -1777,13 +1871,21 @@ static bool isLegalUse(const TargetTransformInfo &TTI, int64_t MinOffset,
                                BaseGV, BaseOffset, true, 0));
 }
 
-static bool isLegalUse(const TargetTransformInfo &TTI, int64_t MinOffset,
-                       int64_t MaxOffset, LSRUse::KindType Kind,
+static bool isLegalUse(const TargetTransformInfo &TTI, Immediate MinOffset,
+                       Immediate MaxOffset, LSRUse::KindType Kind,
                        MemAccessTy AccessTy, const Formula &F) {
   return isLegalUse(TTI, MinOffset, MaxOffset, Kind, AccessTy, F.BaseGV,
                     F.BaseOffset, F.HasBaseReg, F.Scale);
 }
 
+static bool isLegalAddImmediate(const TargetTransformInfo &TTI,
+                                Immediate Offset) {
+  if (Offset.isScalable())
+    return TTI.isLegalAddScalableImmediate(Offset.getKnownMinValue());
+
+  return TTI.isLegalAddImmediate(Offset.getFixedValue());
+}
+
 static bool isAMCompletelyFolded(const TargetTransformInfo &TTI,
                                  const LSRUse &LU, const Formula &F) {
   // Target may want to look at the user instructions.
@@ -1817,11 +1919,13 @@ static InstructionCost getScalingFactorCost(const TargetTransformInfo &TTI,
   case LSRUse::Address: {
     // Check the scaling factor cost with both the min and max offsets.
     InstructionCost ScaleCostMinOffset = TTI.getScalingFactorCost(
-        LU.AccessTy.MemTy, F.BaseGV, F.BaseOffset + LU.MinOffset, F.HasBaseReg,
-        F.Scale, LU.AccessTy.AddrSpace);
+        LU.AccessTy.MemTy, F.BaseGV,
+        F.BaseOffset.getFixedValue() + LU.MinOffset.getFixedValue(),
+        F.HasBaseReg, F.Scale, LU.AccessTy.AddrSpace);
     InstructionCost ScaleCostMaxOffset = TTI.getScalingFactorCost(
-        LU.AccessTy.MemTy, F.BaseGV, F.BaseOffset + LU.MaxOffset, F.HasBaseReg,
-        F.Scale, LU.AccessTy.AddrSpace);
+        LU.AccessTy.MemTy, F.BaseGV,
+        F.BaseOffset.getFixedValue() + LU.MaxOffset.getFixedValue(),
+        F.HasBaseReg, F.Scale, LU.AccessTy.AddrSpace);
 
     assert(ScaleCostMinOffset.isValid() && ScaleCostMaxOffset.isValid() &&
            "Legal addressing mode has an illegal cost!");
@@ -1840,10 +1944,11 @@ static InstructionCost getScalingFactorCost(const TargetTransformInfo &TTI,
 
 static bool isAlwaysFoldable(const TargetTransformInfo &TTI,
                              LSRUse::KindType Kind, MemAccessTy AccessTy,
-                             GlobalValue *BaseGV, int64_t BaseOffset,
+                             GlobalValue *BaseGV, Immediate BaseOffset,
                              bool HasBaseReg) {
   // Fast-path: zero is always foldable.
-  if (BaseOffset == 0 && !BaseGV) return true;
+  if (BaseOffset.isZero() && !BaseGV)
+    return true;
 
   // Conservatively, create an address with an immediate and a
   // base and a scale.
@@ -1856,13 +1961,22 @@ static bool isAlwaysFoldable(const TargetTransformInfo &TTI,
     HasBaseReg = true;
   }
 
+  // FIXME: Try with + without a scale? Maybe based on TTI?
+  // I think basereg + scaledreg + immediateoffset isn't a good 'conservative'
+  // default for many architectures, not just AArch64 SVE. More investigation
+  // needed later to determine if this should be used more widely than just
+  // on scalable types.
+  if (HasBaseReg && BaseOffset.isNonZero() && Kind != LSRUse::ICmpZero &&
+      AccessTy.MemTy && AccessTy.MemTy->isScalableTy() && DropScaledForVScale)
+    Scale = 0;
+
   return isAMCompletelyFolded(TTI, Kind...
[truncated]

huntergr-arm · 2024-05-10T14:35:40Z

Rebased on top of the getScalingFactorCost patch.

github-actions · 2024-05-10T14:39:07Z

✅ With the latest revision this PR passed the C/C++ code formatter.

SamTebbs33

I like it so far. It seems like most of the changes are there to move from integers to Immediates.

SamTebbs33 · 2024-05-14T10:31:51Z

llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp

      // We have one of:
      // ICmpZero     BaseReg + BaseOffset => ICmp BaseReg, -BaseOffset
      // ICmpZero -1*ScaleReg + BaseOffset => ICmp ScaleReg, BaseOffset
      // Offs is the ICmp immediate.
      if (Scale == 0)
        // The cast does the right thing with
        // std::numeric_limits<int64_t>::min().
-        BaseOffset = -(uint64_t)BaseOffset;
-      return TTI.isLegalICmpImmediate(BaseOffset);
+        BaseOffset = BaseOffset.getFixed((uint64_t)BaseOffset.getFixedValue());


Looks like this removes the negation of the offset. Does that still have the same semantics?

SamTebbs33 · 2024-05-14T10:32:23Z

llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp

+      (BaseOffset.isScalable() != MinOffset.isScalable() ||
+       BaseOffset.isScalable() != MaxOffset.isScalable()))
+    return false;
+  // Check for overflow.


Duplicate comment here.

SamTebbs33 · 2024-05-14T12:07:50Z

llvm/test/Transforms/LoopStrengthReduce/AArch64/vscale-fixups.ll

@@ -0,0 +1,147 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 3


It would be good to see what your changes do to this test, if you wouldn't mind creating a commit with this test before the commit with your functional changes?

huntergr-arm · 2024-05-15T13:31:31Z

Separated the conversion to use Immediate from the changes to support vscale immediates into different commits.

huntergr-arm · 2024-05-15T13:43:37Z

There are still several places where getFixedValue() is called where we might see a scalable immediate. I'll try writing some tests to see if I can reach them and fix them.

huntergr-arm · 2024-05-17T15:18:22Z

Added some new test cases and converted the checks to look at llc output, since it's easier to see what the impact is. I think the cost comparison changes will also be required to get the best results here, as there's still some problems with the many_mulvl1_addressing output.

…des. LSR will generate chains of related instructions with a known increment between them. With SVE, in the case of the test case, this can include increments like 'vscale * 16 + 8'. The idea of this patch is if we have a '+8' increment already calculated in the chain, we can generate a (legal) '+ vscale*16' addressing mode from it, allowing us to use the '[x16, #1, mul vl]' addressing mode instructions. In order to do this we keep track of the known 'bases' when generating chains in GenerateIVChain, checking for each if the accumulated increment expression neatly folds into a legal addressing mode. If they do not we fall back to the existing LeftOverExpr, whether it is legal or not. This is mostly orthogonal to llvm#88124, dealing with the generation of chains as opposed to rest of LSR. The existing vscale addressing mode work has greatly helped compared to the last time I looked at this, allowing us to check that the addressing modes are indeed legal.

davemgreen · 2024-06-05T10:10:11Z

Hi - I have been looking at something that is related but mostly orthogonal to this in the generation of chains for vscale addressing modes recently. I've put the patch up at #94453. I don't think it changes a lot of the same code as this patch, which is altering a lot more. That probably turned out to mostly be about how chains are generated. It does make use of the existing isLegalAddressingMode changes which have made it a lot easier since the last time I looked into it.

My high level comment for this patch, after looking into the chains, is the same as it has been elsewhere - that it feels like it is useful to at least conceptually be able to model Base + Imm + vscale * ScalableImm with both an Offset and a ScalableOffset treated independently. Even if it is unlikely that they are supported by the assembly, being able to represent them so that it can reason about things like (Base+Imm) + vscale*ScalableImm vs (Base+vscale*ScalableImm) + imm can be useful. I obviously could be wrong though, as I haven't seen the alternative version and am not sure how difficult it would be in practice.

huntergr-arm · 2024-06-07T09:45:41Z

Rebased on top of the isLSRCostLess changes I merged.

huntergr-arm · 2024-06-07T11:10:31Z

Hi - I have been looking at something that is related but mostly orthogonal to this in the generation of chains for vscale addressing modes recently. I've put the patch up at #94453. I don't think it changes a lot of the same code as this patch, which is altering a lot more. That probably turned out to mostly be about how chains are generated. It does make use of the existing isLegalAddressingMode changes which have made it a lot easier since the last time I looked into it.

My high level comment for this patch, after looking into the chains, is the same as it has been elsewhere - that it feels like it is useful to at least conceptually be able to model Base + Imm + vscale * ScalableImm with both an Offset and a ScalableOffset treated independently. Even if it is unlikely that they are supported by the assembly, being able to represent them so that it can reason about things like (Base+Imm) + vscale*ScalableImm vs (Base+vscale*ScalableImm) + imm can be useful. I obviously could be wrong though, as I haven't seen the alternative version and am not sure how difficult it would be in practice.

I'm assuming by independently representing fixed and scalable immediates you mean we would have (potentially) two separate instances of Immediate (or whatever else we end up using)? I suppose there could be a use in terms of finding a legal add immediate followed up with a legal folded addressing mode, but I think that's outside the scope of this patch (which is just aimed at getting vscale-relative immediates off the ground).

SamTebbs33

Looks good to me.

…des (#94453) LSR will generate chains of related instructions with a known increment between them. With SVE, in the case of the test case, this can include increments like 'vscale * 16 + 8'. The idea of this patch is if we have a '+8' increment already calculated in the chain, we can generate a (legal) '+ vscale*16' addressing mode from it, allowing us to use the '[x16, #1, mul vl]' addressing mode instructions. In order to do this we keep track of the known 'bases' when generating chains in GenerateIVChain, checking for each if the accumulated increment expression from the base neatly folds into a legal addressing mode. If they do not we fall back to the existing LeftOverExpr, whether it is legal or not. This is mostly orthogonal to #88124, dealing with the generation of chains as opposed to rest of LSR. The existing vscale addressing mode work has greatly helped compared to the last time I looked at this, allowing us to check that the addressing modes are indeed legal.

…des (llvm#94453) LSR will generate chains of related instructions with a known increment between them. With SVE, in the case of the test case, this can include increments like 'vscale * 16 + 8'. The idea of this patch is if we have a '+8' increment already calculated in the chain, we can generate a (legal) '+ vscale*16' addressing mode from it, allowing us to use the '[x16, #1, mul vl]' addressing mode instructions. In order to do this we keep track of the known 'bases' when generating chains in GenerateIVChain, checking for each if the accumulated increment expression from the base neatly folds into a legal addressing mode. If they do not we fall back to the existing LeftOverExpr, whether it is legal or not. This is mostly orthogonal to llvm#88124, dealing with the generation of chains as opposed to rest of LSR. The existing vscale addressing mode work has greatly helped compared to the last time I looked at this, allowing us to check that the addressing modes are indeed legal.

paulwalker-arm

I'm not going to bang the "we should be using StackOffset everywhere" drum this time but there are certainly places within this patch where it would make things safer.

The hardest part of reviewing the patch is all the getFixedValue() calls where it's not clear why it is safe to assume a fixed offset. There are parts of the code where I think we should be able to use the operator overloading better to make the code agnostic to being fixed/scalable. Even for parts that are specific to one of those I think the code would be cleaner if it didn't look special.

Do you think it's possible to incorporate a scalable safe type without necessarily adding the scalable safe code paths? I'm not asking for a separate PR, but the genuine improvements would be easier to spot if they were in a commit separate from the refactoring.

paulwalker-arm · 2024-06-13T14:57:53Z

llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp

+          S = SE.getConstant(M->getType(), 0);
+          return Immediate::getScalable(C->getValue()->getSExtValue());
+        }
+  return Immediate();


Up to you but I prefer to be more explicit here and add getZero to Immediate. This will mirror the path taken by TypeSize.

paulwalker-arm · 2024-06-13T14:59:53Z

llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp

@@ -1134,7 +1214,7 @@ struct LSRFixup {
  /// A constant offset to be added to the LSRUse expression.  This allows
  /// multiple fixups to share the same LSRUse with different offsets, for
  /// example in an unrolled loop.
-  int64_t Offset = 0;
+  Immediate Offset;


If you follow the above then here (and the other places) we can also explicitly show the Offset is zero at this point.

paulwalker-arm · 2024-06-13T15:03:51Z

llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp

-  int64_t MinOffset = std::numeric_limits<int64_t>::max();
-  int64_t MaxOffset = std::numeric_limits<int64_t>::min();
+  Immediate MinOffset =
+      Immediate::getFixed(std::numeric_limits<int64_t>::max());


This assumes the internal representation. Given the matching isMin/isMax functions don't care about the scalable property we should implement Immediate::getMin()/Immediate::getMax()?

paulwalker-arm · 2024-06-13T15:52:01Z

llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp

@@ -1234,9 +1316,9 @@ class LSRUse {

  void pushFixup(LSRFixup &f) {
    Fixups.push_back(f);
-    if (f.Offset > MaxOffset)
+    if (Immediate::isKnownGT(f.Offset, MaxOffset))


isKnownGT doesn't require the operands to have matching scalable properties. Does this function need to worry about that? Perhaps you want separate entities for min/max scalable/fixed offsets?

paulwalker-arm · 2024-06-13T16:08:31Z

llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp

+    // FIXME: We probably want to noticeably increase the cost if the
+    // two offsets differ in scalability?
+    bool Scalable = Fixup.Offset.isScalable() || F.BaseOffset.isScalable();
+    int64_t O = Fixup.Offset.getKnownMinValue();
+    Immediate Offset = Immediate::get(
+        (uint64_t)(O) + F.BaseOffset.getKnownMinValue(), Scalable);


To me this code suggests at least the local Offset variable should be using StackOffset.

I think it's more that a wrapper for the casted arithmetic is needed. Right now we would need to cast and add the scalable and fixed parts of StackOffset independently.

paulwalker-arm · 2024-06-13T16:42:41Z

llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp

+    if (Base.BaseOffset.isScalable() != Offset.isScalable() &&
+        Base.BaseOffset.isNonZero() && Offset.isNonZero())


I talk to this above. For TypeSize we added support for zero to be both scalable and fixed to prevent this sort of thing.

Yeah, I treat them that way as well. But this is checking for scalability being equal if both are nonzero. I guess an 'isCompatible()' helper might be better (subject to bikeshedding over the name).

paulwalker-arm · 2024-06-13T16:47:12Z

llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp

-          Offset -= Step;
-          GenerateOffset(G, Offset);
+        for (Immediate Offset : Worklist) {
+          if (!Offset.isScalable()) {


I'm not familiar with the algorithm here but would a scalable offset here be bad?

It's trying to subtract an APInt derived from a fixed SCEVAddRecExpr step. We may be able to check for a scalable step to make use of this, but I'd want to come up with an appropriate test first.

paulwalker-arm · 2024-06-13T16:50:17Z

llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp

+  F.BaseOffset = Immediate::get((uint64_t)F.BaseOffset.getKnownMinValue() +
+                                    Imm.getKnownMinValue(),
+                                Imm.isScalable());


Is this necessary? Immediate inherits from FixedOrScalableQuantity which supports operator+.

It's the cast to unsigned that breaks the operators. (The internal quantity type is int64_t)

Assuming the cast is important, what about implementing something like Immediate::AddToUnsigned(Immediate&)? I'd sooner something explicit that works for both fixed and scalable rather than continually pulling apart the Immediate type.

Yeah. I did actually have wrappers in one of my early prototypes but I made a mistake somewhere (probably with the 3-term calculations) so abandoned it to confirm 1-1 against the current code in the diff.

I'll replace the simple cases with a wrapper first, then see about the more complex ones.

paulwalker-arm · 2024-06-13T17:15:34Z

llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp

+    Immediate NewBaseOffset =
+        Immediate::getFixed((uint64_t)Base.BaseOffset.getFixedValue() * Factor);


Similar to above, FixedOrScalableQuantity supports operator* with a scalar multiplier.

paulwalker-arm · 2024-06-13T17:22:05Z

llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp

+    Offset = Immediate::getFixed((uint64_t)Offset.getFixedValue() * Factor);
+    if (Offset.getFixedValue() / Factor != LU.MinOffset.getFixedValue())


Is this path fixed-offset specific, or is it the case that we've just not hit it for scalable offsets yet?

Fixed specific, it's generating terms for ICmpZero uses (for which we won't have any legal scalable offsets)

huntergr-arm · 2024-06-14T15:43:52Z

I'm not going to bang the "we should be using StackOffset everywhere" drum this time but there are certainly places within this patch where it would make things safer.

The hardest part of reviewing the patch is all the getFixedValue() calls where it's not clear why it is safe to assume a fixed offset. There are parts of the code where I think we should be able to use the operator overloading better to make the code agnostic to being fixed/scalable. Even for parts that are specific to one of those I think the code would be cleaner if it didn't look special.

Do you think it's possible to incorporate a scalable-safe type without necessarily adding the scalable-safe code paths? I'm not asking for a separate PR, but the genuine improvements would be easier to spot if they were in a commit separate from the refactoring.

I'm not strictly opposed to using StackOffset, and it may simplify some of the other parts of the code. I used a mutually-exclusive struct here because there are places where it assumes it can make a combined offset between the current known min and max that must be legal, and asserts as much. If we cannot establish ordering between scalable and fixed offsets, then the whole min/max thing breaks.

So we could maybe use StackOffset, but basically use them in the same way Immediate is now, and track scalable and fixed min/max separately.

huntergr-arm · 2024-06-14T16:13:54Z

Do you think it's possible to incorporate a scalable-safe type without necessarily adding the scalable-safe code paths? I'm not asking for a separate PR, but the genuine improvements would be easier to spot if they were in a commit separate from the refactoring.

It is in a separate commit ('Convert LSR to use possibly-scalable Immediate type', followed by 'Scalable work')

huntergr-arm · 2024-06-21T11:08:25Z

Rebased, added some convenience functions.

I didn't switch to StackOffset yet. I think handling cases with mixed offsets requires more thought, and possibly redoing how we handle initial Use generation. When we do switch to it though, we can move the SCEV generation code to ScalarEvolution since that's an existing class available across the codebase.

SamTebbs33

Still looks good to me with your latest changes.

Extends LoopStrengthReduce to recognize immediates multiplied by vscale, and query the current target for whether they are legal offsets for memory operations or adds.

huntergr-arm requested review from paulwalker-arm and sdesmalen-arm April 9, 2024 13:37

llvmbot added the llvm:transforms label Apr 9, 2024

huntergr-arm requested a review from davemgreen April 10, 2024 09:00

davemgreen mentioned this pull request May 1, 2024

[TTI] Support scalable offsets in getScalingFactorCost #88113

Merged

huntergr-arm force-pushed the lsr-vscale-immediates branch from 5559950 to 1d10189 Compare May 10, 2024 14:34

SamTebbs33 self-requested a review May 14, 2024 09:44

SamTebbs33 reviewed May 14, 2024

View reviewed changes

huntergr-arm force-pushed the lsr-vscale-immediates branch 2 times, most recently from 0169d98 to 37ae026 Compare May 15, 2024 13:30

huntergr-arm force-pushed the lsr-vscale-immediates branch from 37ae026 to 6be93a7 Compare May 17, 2024 15:15

davemgreen mentioned this pull request Jun 5, 2024

[LSR][AArch64] Optimize chain generation based on legal addressing modes #94453

Merged

huntergr-arm force-pushed the lsr-vscale-immediates branch from 6be93a7 to 04f4312 Compare June 7, 2024 09:44

SamTebbs33 approved these changes Jun 10, 2024

View reviewed changes

paulwalker-arm reviewed Jun 13, 2024

View reviewed changes

Test cases for vscale immediates

3a04cf7

huntergr-arm added 3 commits June 21, 2024 09:29

Convert LSR to use possibly-scalable Immediate type

557a097

Scalable work

6265205

Add isFixed, getZero, getScalableFixedMin/Max convenience methods

f5e8555

huntergr-arm force-pushed the lsr-vscale-immediates branch from 04f4312 to f5e8555 Compare June 21, 2024 11:04

llvmbot added the llvm:support label Jun 21, 2024

SamTebbs33 approved these changes Jun 25, 2024

View reviewed changes

huntergr-arm merged commit 4311b14 into llvm:main Jul 1, 2024
8 checks passed

huntergr-arm deleted the lsr-vscale-immediates branch July 11, 2024 14:33

		@@ -0,0 +1,147 @@
		; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 3

		if (Base.BaseOffset.isScalable() != Offset.isScalable() &&
		Base.BaseOffset.isNonZero() && Offset.isNonZero())

		Immediate NewBaseOffset =
		Immediate::getFixed((uint64_t)Base.BaseOffset.getFixedValue() * Factor);

		Offset = Immediate::getFixed((uint64_t)Offset.getFixedValue() * Factor);
		if (Offset.getFixedValue() / Factor != LU.MinOffset.getFixedValue())

[LSR] Recognize vscale-relative immediates #88124

[LSR] Recognize vscale-relative immediates #88124

Conversation

huntergr-arm commented Apr 9, 2024

llvmbot commented Apr 9, 2024 • edited Loading

huntergr-arm commented May 10, 2024

github-actions bot commented May 10, 2024 • edited Loading

SamTebbs33 left a comment

Choose a reason for hiding this comment

Choose a reason for hiding this comment

Choose a reason for hiding this comment

Choose a reason for hiding this comment

huntergr-arm commented May 15, 2024

huntergr-arm commented May 15, 2024

huntergr-arm commented May 17, 2024

davemgreen commented Jun 5, 2024

huntergr-arm commented Jun 7, 2024

huntergr-arm commented Jun 7, 2024

SamTebbs33 left a comment

Choose a reason for hiding this comment

paulwalker-arm left a comment

Choose a reason for hiding this comment

Choose a reason for hiding this comment

Choose a reason for hiding this comment

Choose a reason for hiding this comment

Choose a reason for hiding this comment

Choose a reason for hiding this comment

Choose a reason for hiding this comment

Choose a reason for hiding this comment

Choose a reason for hiding this comment

Choose a reason for hiding this comment

Choose a reason for hiding this comment

Choose a reason for hiding this comment

Choose a reason for hiding this comment

Choose a reason for hiding this comment

Choose a reason for hiding this comment

Choose a reason for hiding this comment

Choose a reason for hiding this comment

Choose a reason for hiding this comment

huntergr-arm commented Jun 14, 2024

huntergr-arm commented Jun 14, 2024

huntergr-arm commented Jun 21, 2024

SamTebbs33 left a comment

Choose a reason for hiding this comment

llvmbot commented Apr 9, 2024 •

edited

Loading

github-actions bot commented May 10, 2024 •

edited

Loading