From 518292dbdfceb496361b1c92e732e2ccf2a55548 Mon Sep 17 00:00:00 2001
From: QingShan Zhang <qshanz@cn.ibm.com>
Date: Thu, 12 Mar 2020 05:15:41 +0000
Subject: [PATCH] [PowerPC] Add the MacroFusion support for Power8

This patch is intended to implement the missing P8 MacroFusion for LLVM
according to Power8 User's Manual Section 10.1.12, Instruction Fusion.

Differential Revision: https://reviews.llvm.org/D70651
---
 llvm/lib/Target/PowerPC/CMakeLists.txt       |   1 +
 llvm/lib/Target/PowerPC/PPC.td               |  13 +-
 llvm/lib/Target/PowerPC/PPCMacroFusion.cpp   | 203 +++++++++++++++++++
 llvm/lib/Target/PowerPC/PPCMacroFusion.def   |  45 ++++
 llvm/lib/Target/PowerPC/PPCMacroFusion.h     |  22 ++
 llvm/lib/Target/PowerPC/PPCSubtarget.cpp     |   3 +
 llvm/lib/Target/PowerPC/PPCSubtarget.h       |   6 +
 llvm/lib/Target/PowerPC/PPCTargetMachine.cpp |   6 +
 llvm/test/CodeGen/PowerPC/macro-fusion.ll    |  21 ++
 9 files changed, 319 insertions(+), 1 deletion(-)
 create mode 100644 llvm/lib/Target/PowerPC/PPCMacroFusion.cpp
 create mode 100644 llvm/lib/Target/PowerPC/PPCMacroFusion.def
 create mode 100644 llvm/lib/Target/PowerPC/PPCMacroFusion.h
 create mode 100644 llvm/test/CodeGen/PowerPC/macro-fusion.ll
diff --git a/llvm/lib/Target/PowerPC/CMakeLists.txt b/llvm/lib/Target/PowerPC/CMakeLists.txt
index 1893d6e32c9ace..91021d4e584e10 100644
--- a/llvm/lib/Target/PowerPC/CMakeLists.txt
+++ b/llvm/lib/Target/PowerPC/CMakeLists.txt
@@ -33,6 +33,7 @@ add_llvm_target(PowerPCCodeGen
   PPCMCInstLower.cpp
   PPCMachineFunctionInfo.cpp
   PPCMachineScheduler.cpp
+  PPCMacroFusion.cpp
   PPCMIPeephole.cpp
   PPCRegisterInfo.cpp
   PPCQPXLoadSplat.cpp
diff --git a/llvm/lib/Target/PowerPC/PPC.td b/llvm/lib/Target/PowerPC/PPC.td
index bd6b9dd041819e..fc817631e0acee 100644
--- a/llvm/lib/Target/PowerPC/PPC.td
+++ b/llvm/lib/Target/PowerPC/PPC.td
@@ -166,6 +166,16 @@ def FeatureHTM : SubtargetFeature<"htm", "HasHTM", "true",
                                   "Enable Hardware Transactional Memory instructions">;
 def FeatureMFTB   : SubtargetFeature<"", "FeatureMFTB", "true",
                                         "Implement mftb using the mfspr instruction">;
+def FeatureFusion : SubtargetFeature<"fusion", "HasFusion", "true",
+                                     "Target supports instruction fusion">;
+def FeatureAddiLoadFusion : SubtargetFeature<"fuse-addi-load",
+                                             "HasAddiLoadFusion", "true",
+                                             "Power8 Addi-Load fusion",
+                                             [FeatureFusion]>;
+def FeatureAddisLoadFusion : SubtargetFeature<"fuse-addis-load",
+                                              "HasAddisLoadFusion", "true",
+                                              "Power8 Addis-Load fusion",
+                                              [FeatureFusion]>;
 def FeatureUnalignedFloats :
   SubtargetFeature<"allow-unaligned-fp-access", "AllowsUnalignedFPAccess",
                    "true", "CPU does not trap on unaligned FP access">;
@@ -279,7 +289,8 @@ def ProcessorFeatures {
                                                  FeatureDirectMove,
                                                  FeatureICBT,
                                                  FeaturePartwordAtomic];
-  list<SubtargetFeature> P8SpecificFeatures = [];
+  list<SubtargetFeature> P8SpecificFeatures = [FeatureAddiLoadFusion,
+                                               FeatureAddisLoadFusion];
   list<SubtargetFeature> P8InheritableFeatures =
     !listconcat(P7InheritableFeatures, P8AdditionalFeatures);
   list<SubtargetFeature> P8Features =
diff --git a/llvm/lib/Target/PowerPC/PPCMacroFusion.cpp b/llvm/lib/Target/PowerPC/PPCMacroFusion.cpp
new file mode 100644
index 00000000000000..bde3f5918a2395
--- /dev/null
+++ b/llvm/lib/Target/PowerPC/PPCMacroFusion.cpp
@@ -0,0 +1,203 @@
+//===- PPCMacroFusion.cpp - PowerPC Macro Fusion --------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file This file contains the PowerPC implementation of the DAG scheduling
+///  mutation to pair instructions back to back.
+//
+//===----------------------------------------------------------------------===//
+
+#include "PPC.h"
+#include "PPCSubtarget.h"
+#include "llvm/ADT/DenseSet.h"
+#include "llvm/CodeGen/MacroFusion.h"
+
+using namespace llvm;
+namespace {
+
+class FusionFeature {
+public:
+  typedef SmallDenseSet<unsigned> FusionOpSet;
+  // One FK_<KIND> enumerator per FUSION_FEATURE in the def file, then FK_END.
+  enum FusionKind {
+  #define FUSION_KIND(KIND) FK_##KIND
+  #define FUSION_FEATURE(KIND, HAS_FEATURE, DEP_OP_IDX, OPSET1, OPSET2) \
+    FUSION_KIND(KIND),
+  #include "PPCMacroFusion.def"
+  FUSION_KIND(END)
+  };
+private:
+  // The fusion kind of this feature. All instruction pairs of one kind share
+  // the same operand constraints (checkOpConstraints switches on this value).
+  FusionKind Kd;
+  // True if the subtarget reports this fusion feature as enabled.
+  bool Supported;
+  // Index of the operand of the second op that must be the register defined
+  // by the first op (its operand 0), e.g. rx in:
+  //   li rx, si ; load rt, ra, rx
+  // A negative value means no particular operand is required.
+  int DepOpIdx;
+  // Opcodes accepted as the first instruction of the pair.
+  FusionOpSet OpSet1;
+  // Opcodes accepted as the second instruction of the pair.
+  FusionOpSet OpSet2;
+public:
+  FusionFeature(FusionKind Kind, bool HasFeature, int Index,
+                const FusionOpSet &First, const FusionOpSet &Second) :
+    Kd(Kind), Supported(HasFeature), DepOpIdx(Index), OpSet1(First), 
+    OpSet2(Second) {}
+  // Opcode membership tests for the first and second instruction of a pair.
+  bool hasOp1(unsigned Opc) const { return OpSet1.count(Opc) != 0; }
+  bool hasOp2(unsigned Opc) const { return OpSet2.count(Opc) != 0; }
+  bool isSupported() const { return Supported; }
+  Optional<unsigned> depOpIdx() const {
+    if (DepOpIdx < 0)
+      return None;
+    return DepOpIdx;
+  }
+
+  FusionKind getKind() const { return Kd; }
+};
+
+// Return true if operand FirstMIOpIndex of FirstMI and operand SecondMIOpIndex
+// of SecondMI are both register operands that name the same register.
+static bool matchingRegOps(const MachineInstr &FirstMI, int FirstMIOpIndex,
+                           const MachineInstr &SecondMI,
+                           int SecondMIOpIndex) {
+  const MachineOperand &LHS = FirstMI.getOperand(FirstMIOpIndex);
+  const MachineOperand &RHS = SecondMI.getOperand(SecondMIOpIndex);
+  // A non-register operand on either side never matches.
+  const bool BothRegs = LHS.isReg() && RHS.isReg();
+  return BothRegs && LHS.getReg() == RHS.getReg();
+}
+
+// Return true if the operands of FirstMI and SecondMI satisfy the extra
+// constraints that the fusion specification imposes on pairs of kind Kd.
+static bool checkOpConstraints(FusionFeature::FusionKind Kd,
+                               const MachineInstr &FirstMI,
+                               const MachineInstr &SecondMI) {
+  switch (Kd) {
+  // The hardware didn't require any specific check for the fused instructions'
+  // operands. Therefore, return true to indicate that, it is fusable.
+  default: return true;
+  // [addi rt,ra,si - lxvd2x xt,ra,rb] etc.
+  case FusionFeature::FK_AddiLoad: {
+    // lxvd2x(ra) cannot be zero
+    const MachineOperand &RA = SecondMI.getOperand(1);
+    if (!RA.isReg())
+      return true;
+
+    return Register::isVirtualRegister(RA.getReg()) ||
+      (RA.getReg() != PPC::ZERO && RA.getReg() != PPC::ZERO8);
+  }
+  // [addis rt,ra,si - ld rt,ds(ra)] etc.
+  case FusionFeature::FK_AddisLoad: {
+    const MachineOperand &RT = SecondMI.getOperand(0);
+    if (!RT.isReg())
+      return true;
+
+    // Only check it for non-virtual register.
+    if (!Register::isVirtualRegister(RT.getReg()))
+      // addis(rt) = ld(ra) = ld(rt)
+      // ld(rt) cannot be zero
+      if (!matchingRegOps(SecondMI, 0, SecondMI, 2) ||
+          (RT.getReg() == PPC::ZERO || RT.getReg() == PPC::ZERO8))
+        return false;
+
+    // addis(si) first 12 bits must be all 1s or all 0s; reject anything else.
+    const MachineOperand &SI = FirstMI.getOperand(2);
+    if (!SI.isImm())
+      return true;
+    const int64_t Hi12 = SI.getImm() & 0xFFF0;
+    if (Hi12 != 0 && Hi12 != 0xFFF0)
+      return false;
+
+    // If si = 1111111111110000 and the msb of the d/ds field of the load equals
+    // 1, then fusion does not occur.
+    if (Hi12 == 0xFFF0) {
+      const MachineOperand &D = SecondMI.getOperand(1);
+      if (!D.isImm())
+        return true;
+
+      // 14 bit for DS field, while 16 bit for D field.
+      int MSB = 15;
+      if (SecondMI.getOpcode() == PPC::LD)
+        MSB = 13;
+
+      return (D.getImm() & (1ULL << MSB)) == 0;
+    }
+    return true;
+  }
+  }
+
+  llvm_unreachable("All the cases should have been handled");
+  return true;
+}
+
+/// Return true if FirstMI and SecondMI should be kept adjacent so that they
+/// can be fused. When FirstMI is null, return true if SecondMI may be the
+/// second instruction of a fused pair for any enabled fusion feature.
+static bool shouldScheduleAdjacent(const TargetInstrInfo &TII,
+                                   const TargetSubtargetInfo &TSI,
+                                   const MachineInstr *FirstMI,
+                                   const MachineInstr &SecondMI) {
+  // We use the PPC namespace to avoid the need to prefix opcodes with PPC:: in
+  // the def file.
+  using namespace PPC;
+  // NOTE(review): function-static table; ST flags are read only on first call.
+  const PPCSubtarget &ST = static_cast<const PPCSubtarget&>(TSI);
+  static const FusionFeature FusionFeatures[] = {
+  #define FUSION_FEATURE(KIND, HAS_FEATURE, DEP_OP_IDX, OPSET1, OPSET2) { \
+    FusionFeature::FUSION_KIND(KIND), ST.HAS_FEATURE(), DEP_OP_IDX, { OPSET1 },\
+    { OPSET2 } },
+   #include "PPCMacroFusion.def"
+  };
+  #undef FUSION_KIND
+
+  for (auto &Feature : FusionFeatures) {
+    // Skip if the feature is not supported.
+    if (!Feature.isSupported())
+      continue;
+
+    // Only look for a fusable FirstMI once SecondMI is known to be a valid
+    // second instruction for this feature.
+    if (Feature.hasOp2(SecondMI.getOpcode())) {
+      // If FirstMI == nullptr, that means, we're only checking whether SecondMI
+      // can be fused at all.
+      if (!FirstMI)
+        return true;
+
+      // Checking if the FirstMI is fusable with the SecondMI.
+      if (!Feature.hasOp1(FirstMI->getOpcode()))
+        continue;
+
+      auto DepOpIdx = Feature.depOpIdx();
+      if (DepOpIdx.hasValue()) {
+        // If DepOpIdx is set, operand 0 of FirstMI (its result) must be the
+        // same register as operand DepOpIdx of SecondMI; otherwise no fusion.
+        if (!matchingRegOps(*FirstMI, 0, SecondMI, *DepOpIdx))
+          return false;
+      }
+  
+      // Checking more on the instruction operands.
+      if (checkOpConstraints(Feature.getKind(), *FirstMI, SecondMI))
+        return true;
+    }
+  }
+
+  return false;
+}
+
+} // end anonymous namespace
+
+namespace llvm {
+// Build the macro-fusion DAG mutation driven by shouldScheduleAdjacent().
+std::unique_ptr<ScheduleDAGMutation> createPowerPCMacroFusionDAGMutation () {
+  return createMacroFusionDAGMutation(shouldScheduleAdjacent);
+}
+
+} // end namespace llvm
diff --git a/llvm/lib/Target/PowerPC/PPCMacroFusion.def b/llvm/lib/Target/PowerPC/PPCMacroFusion.def
new file mode 100644
index 00000000000000..c7e4e7c22e0a65
--- /dev/null
+++ b/llvm/lib/Target/PowerPC/PPCMacroFusion.def
@@ -0,0 +1,45 @@
+//=== --- PPCMacroFusion.def - PowerPC MacroFusion Candidates -v-*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains descriptions of the macro-fusion pair for PowerPC.
+//
+//===----------------------------------------------------------------------===//
+
+// NOTE: NO INCLUDE GUARD DESIRED!
+
+#ifndef FUSION_FEATURE
+
+// Each FUSION_FEATURE is assigned one TYPE, and can be enabled/disabled
+// by HAS_FEATURE. An instruction pair is fusable only when the opcode
+// of the first instruction is in OPSET1, and the second instruction opcode is
+// in OPSET2. If DEP_OP_IDX >= 0, we also check that the result of the first op
+// is the operand of the second op at operand index DEP_OP_IDX. We assume
+// that the result of the first op is its operand zero.
+#define FUSION_FEATURE(TYPE, HAS_FEATURE, DEP_OP_IDX, OPSET1, OPSET2)
+
+#endif
+
+#ifndef FUSION_OP_SET
+#define FUSION_OP_SET(...) __VA_ARGS__ 
+#endif
+
+// Power8 User Manual Section 10.1.12, Instruction Fusion
+// {addi} followed by one of these {lxvd2x, lxvw4x, lxvdsx, lvebx, lvehx,
+// lvewx, lvx, lxsdx}. DEP_OP_IDX 2: load operand 2 must be the addi result.
+FUSION_FEATURE(AddiLoad, hasAddiLoadFusion, 2, \
+               FUSION_OP_SET(ADDI, ADDI8, ADDItocL), \
+               FUSION_OP_SET(LXVD2X, LXVW4X, LXVDSX, LVEBX, LVEHX, LVEWX, \
+                             LVX, LXSDX))
+
+// {addis} followed by one of these {ld, lbz, lhz, lwz}
+FUSION_FEATURE(AddisLoad, hasAddisLoadFusion, 2, \
+               FUSION_OP_SET(ADDIS, ADDIS8, ADDIStocHA8), \
+               FUSION_OP_SET(LD, LBZ, LBZ8, LHZ, LHZ8, LWZ, LWZ8))
+
+#undef FUSION_FEATURE
+#undef FUSION_OP_SET
diff --git a/llvm/lib/Target/PowerPC/PPCMacroFusion.h b/llvm/lib/Target/PowerPC/PPCMacroFusion.h
new file mode 100644
index 00000000000000..91cbedf4558fcb
--- /dev/null
+++ b/llvm/lib/Target/PowerPC/PPCMacroFusion.h
@@ -0,0 +1,22 @@
+//===- PPCMacroFusion.h - PowerPC Macro Fusion ----------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file This file contains the PowerPC definition of the DAG scheduling
+/// mutation to pair instructions back to back.
+//
+//===----------------------------------------------------------------------===//
+#ifndef LLVM_LIB_TARGET_POWERPC_PPCMACROFUSION_H
+#define LLVM_LIB_TARGET_POWERPC_PPCMACROFUSION_H
+#include "llvm/CodeGen/MachineScheduler.h"
+namespace llvm {
+/// Note that you have to add:
+///   DAG.addMutation(createPowerPCMacroFusionDAGMutation());
+/// to PPCPassConfig::createMachineScheduler() to have an effect.
+std::unique_ptr<ScheduleDAGMutation> createPowerPCMacroFusionDAGMutation();
+} // end namespace llvm
+#endif // LLVM_LIB_TARGET_POWERPC_PPCMACROFUSION_H
diff --git a/llvm/lib/Target/PowerPC/PPCSubtarget.cpp b/llvm/lib/Target/PowerPC/PPCSubtarget.cpp
index 0549df1ec5cc95..b0c0f30a56ec6a 100644
--- a/llvm/lib/Target/PowerPC/PPCSubtarget.cpp
+++ b/llvm/lib/Target/PowerPC/PPCSubtarget.cpp
@@ -111,6 +111,9 @@ void PPCSubtarget::initializeEnvironment() {
   IsQPXStackUnaligned = false;
   HasHTM = false;
   HasFloat128 = false;
+  HasFusion = false;
+  HasAddiLoadFusion = false;
+  HasAddisLoadFusion = false;
   IsISA3_0 = false;
   UseLongCalls = false;
   SecurePlt = false;
diff --git a/llvm/lib/Target/PowerPC/PPCSubtarget.h b/llvm/lib/Target/PowerPC/PPCSubtarget.h
index a9a417106ae113..be061d9ce0a11b 100644
--- a/llvm/lib/Target/PowerPC/PPCSubtarget.h
+++ b/llvm/lib/Target/PowerPC/PPCSubtarget.h
@@ -135,6 +135,9 @@ class PPCSubtarget : public PPCGenSubtargetInfo {
   bool HasDirectMove;
   bool HasHTM;
   bool HasFloat128;
+  bool HasFusion;
+  bool HasAddiLoadFusion;
+  bool HasAddisLoadFusion;
   bool IsISA3_0;
   bool UseLongCalls;
   bool SecurePlt;
@@ -306,6 +309,9 @@ class PPCSubtarget : public PPCGenSubtargetInfo {
   bool hasFloat128() const { return HasFloat128; }
   bool isISA3_0() const { return IsISA3_0; }
   bool useLongCalls() const { return UseLongCalls; }
+  bool hasFusion() const { return HasFusion; }
+  bool hasAddiLoadFusion() const { return HasAddiLoadFusion; }
+  bool hasAddisLoadFusion() const { return HasAddisLoadFusion; }
   bool needsSwapsForVSXMemOps() const {
     return hasVSX() && isLittleEndian() && !hasP9Vector();
   }
diff --git a/llvm/lib/Target/PowerPC/PPCTargetMachine.cpp b/llvm/lib/Target/PowerPC/PPCTargetMachine.cpp
index 8aa1f1bcaef896..5e5df91fc4ab7e 100644
--- a/llvm/lib/Target/PowerPC/PPCTargetMachine.cpp
+++ b/llvm/lib/Target/PowerPC/PPCTargetMachine.cpp
@@ -14,6 +14,7 @@
 #include "MCTargetDesc/PPCMCTargetDesc.h"
 #include "PPC.h"
 #include "PPCMachineScheduler.h"
+#include "PPCMacroFusion.h"
 #include "PPCSubtarget.h"
 #include "PPCTargetObjectFile.h"
 #include "PPCTargetTransformInfo.h"
@@ -275,6 +276,9 @@ static ScheduleDAGInstrs *createPPCMachineScheduler(MachineSchedContext *C) {
                           std::make_unique<GenericScheduler>(C));
   // add DAG Mutations here.
   DAG->addMutation(createCopyConstrainDAGMutation(DAG->TII, DAG->TRI));
+  if (ST.hasFusion())
+    DAG->addMutation(createPowerPCMacroFusionDAGMutation());
+
   return DAG;
 }
 
@@ -286,6 +290,8 @@ static ScheduleDAGInstrs *createPPCPostMachineScheduler(
                       std::make_unique<PPCPostRASchedStrategy>(C) :
                       std::make_unique<PostGenericScheduler>(C), true);
   // add DAG Mutations here.
+  if (ST.hasFusion())
+    DAG->addMutation(createPowerPCMacroFusionDAGMutation());
   return DAG;
 }
 
diff --git a/llvm/test/CodeGen/PowerPC/macro-fusion.ll b/llvm/test/CodeGen/PowerPC/macro-fusion.ll
new file mode 100644
index 00000000000000..0e9ac85a186190
--- /dev/null
+++ b/llvm/test/CodeGen/PowerPC/macro-fusion.ll
@@ -0,0 +1,21 @@
+; REQUIRES: asserts
+; RUN: llc < %s -mtriple=powerpc64le-unknown-linux-gnu -mcpu=pwr8 -verify-misched -debug-only=machine-scheduler \
+; RUN:  -o - 2>&1 > /dev/null | FileCheck %s --check-prefix=CHECK-P8
+; ADDIStocHA8/LD loading @m must be macro-fused in each MI scheduling dump.
+@m = local_unnamed_addr global i64 0, align 8
+
+define i64 @fuse_addis_ld() {
+entry:
+; CHECK-P8: ********** MI Scheduling **********
+; CHECK-P8-LABEL: fuse_addis_ld:%bb.0 entry
+; CHECK-P8: Macro fuse: SU([[SU0:[0-9]+]]) - SU([[SU1:[0-9]+]]) /  ADDIStocHA8 - LD
+; CHECK-P8: SU([[SU0]]):   %[[REG3:[0-9]+]]:g8rc_and_g8rc_nox0 = ADDIStocHA8 $x2, @m
+; CHECK-P8: SU([[SU1]]):   %{{[0-9]+}}:g8rc = LD target-flags(ppc-toc-lo) @m, %[[REG3]]
+; CHECK-P8: ********** MI Scheduling **********
+; CHECK-P8-LABEL: fuse_addis_ld:%bb.0 entry
+; CHECK-P8: Macro fuse: SU([[SU0:[0-9]+]]) - SU([[SU1:[0-9]+]]) /  ADDIStocHA8 - LD
+; CHECK-P8: SU([[SU0]]):   renamable $x[[REG3:[0-9]+]] = ADDIStocHA8 $x2, @m
+; CHECK-P8: SU([[SU1]]):   renamable $x[[REG3]] = LD target-flags(ppc-toc-lo) @m, renamable $x[[REG3]]
+  %0 = load i64, i64* @m, align 8
+  ret i64 %0
+}