From 2c7870dccaf31167b7d7b422ed51d1f0b3e343d3 Mon Sep 17 00:00:00 2001
From: Arthur Eubanks <aeubanks@google.com>
Date: Thu, 19 Nov 2020 09:38:14 -0800
Subject: [PATCH 1/3] [NewPM] Add pipeline EP callback after initial frontend
 cleanup

This matches the legacy PM's EP_ModuleOptimizerEarly. Some backends use
this extension point and adding the pass somewhere else like
PipelineStartEPCallback doesn't work.

Reviewed By: ychen

Differential Revision: https://reviews.llvm.org/D91804
---
 llvm/include/llvm/Passes/PassBuilder.h    | 12 ++++++++++++
 llvm/lib/Passes/PassBuilder.cpp           |  5 +++++
 llvm/test/Other/new-pm-O0-ep-callbacks.ll |  1 +
 llvm/test/Other/new-pm-defaults.ll        |  5 +++++
 llvm/test/Other/pass-pipeline-parsing.ll  |  3 +++
 llvm/tools/opt/NewPMDriver.cpp            | 12 ++++++++++++
 6 files changed, 38 insertions(+)
diff --git a/llvm/include/llvm/Passes/PassBuilder.h b/llvm/include/llvm/Passes/PassBuilder.h
index 97e0b19ed07f0..fb1a83306f556 100644
--- a/llvm/include/llvm/Passes/PassBuilder.h
+++ b/llvm/include/llvm/Passes/PassBuilder.h
@@ -597,6 +597,15 @@ class PassBuilder {
     PipelineStartEPCallbacks.push_back(C);
   }
 
+  /// Register a callback for a default optimizer pipeline extension point.
+  ///
+  /// This extension point allows adding optimization right after passes that do
+  /// basic simplification of the input IR.
+  void registerPipelineEarlySimplificationEPCallback(
+      const std::function<void(ModulePassManager &, OptimizationLevel)> &C) {
+    PipelineEarlySimplificationEPCallbacks.push_back(C);
+  }
+
   /// Register a callback for a default optimizer pipeline extension point
   ///
   /// This extension point allows adding optimizations at the very end of the
@@ -729,6 +738,9 @@ class PassBuilder {
   // Module callbacks
   SmallVector<std::function<void(ModulePassManager &, OptimizationLevel)>, 2>
       PipelineStartEPCallbacks;
+  SmallVector<std::function<void(ModulePassManager &, OptimizationLevel)>, 2>
+      PipelineEarlySimplificationEPCallbacks;
+
   SmallVector<std::function<void(ModuleAnalysisManager &)>, 2>
       ModuleAnalysisRegistrationCallbacks;
   SmallVector<std::function<bool(StringRef, ModulePassManager &,
diff --git a/llvm/lib/Passes/PassBuilder.cpp b/llvm/lib/Passes/PassBuilder.cpp
index e5189bdb4fd99..af3eae47d2425 100644
--- a/llvm/lib/Passes/PassBuilder.cpp
+++ b/llvm/lib/Passes/PassBuilder.cpp
@@ -1032,6 +1032,9 @@ PassBuilder::buildModuleSimplificationPipeline(OptimizationLevel Level,
   if (Phase == ThinLTOPhase::PostLink)
     MPM.addPass(LowerTypeTestsPass(nullptr, nullptr, true));
 
+  for (auto &C : PipelineEarlySimplificationEPCallbacks)
+    C(MPM, Level);
+
   // Interprocedural constant propagation now that basic cleanup has occurred
   // and prior to optimizing globals.
   // FIXME: This position in the pipeline hasn't been carefully considered in
@@ -1703,6 +1706,8 @@ ModulePassManager PassBuilder::buildO0DefaultPipeline(OptimizationLevel Level,
 
   for (auto &C : PipelineStartEPCallbacks)
     C(MPM, Level);
+  for (auto &C : PipelineEarlySimplificationEPCallbacks)
+    C(MPM, Level);
 
   // Build a minimal pipeline based on the semantics required by LLVM,
   // which is just that always inlining occurs. Further, disable generating
diff --git a/llvm/test/Other/new-pm-O0-ep-callbacks.ll b/llvm/test/Other/new-pm-O0-ep-callbacks.ll
index 1645156c1b28e..8fc8af0ec277d 100644
--- a/llvm/test/Other/new-pm-O0-ep-callbacks.ll
+++ b/llvm/test/Other/new-pm-O0-ep-callbacks.ll
@@ -4,6 +4,7 @@
 ; RUN: opt -disable-output -debug-pass-manager -passes-ep-cgscc-optimizer-late=no-op-cgscc -passes='default<O0>' 2>&1 < %s | FileCheck %s
 ; RUN: opt -disable-output -debug-pass-manager -passes-ep-vectorizer-start=no-op-function -passes='default<O0>' 2>&1 < %s | FileCheck %s
 ; RUN: opt -disable-output -debug-pass-manager -passes-ep-pipeline-start=no-op-module -passes='default<O0>' 2>&1 < %s | FileCheck %s
+; RUN: opt -disable-output -debug-pass-manager -passes-ep-pipeline-early-simplification=no-op-module -passes='default<O0>' 2>&1 < %s | FileCheck %s
 ; RUN: opt -disable-output -debug-pass-manager -passes-ep-optimizer-last=no-op-function -passes='default<O0>' 2>&1 < %s | FileCheck %s
 
 ; CHECK: Running pass: NoOp
diff --git a/llvm/test/Other/new-pm-defaults.ll b/llvm/test/Other/new-pm-defaults.ll
index 1f2142c5bcd1c..9e27486e981bc 100644
--- a/llvm/test/Other/new-pm-defaults.ll
+++ b/llvm/test/Other/new-pm-defaults.ll
@@ -55,6 +55,10 @@
 ; RUN:     -passes='default<O3>' -S  %s 2>&1 \
 ; RUN:     | FileCheck %s --check-prefixes=CHECK-O,CHECK-DEFAULT,CHECK-O3,%llvmcheckext,CHECK-EP-PIPELINE-START,CHECK-O23SZ
 ; RUN: opt -disable-verify -debug-pass-manager \
+; RUN:     -passes-ep-pipeline-early-simplification='no-op-module' \
+; RUN:     -passes='default<O3>' -S  %s 2>&1 \
+; RUN:     | FileCheck %s --check-prefixes=CHECK-O,CHECK-DEFAULT,CHECK-O3,%llvmcheckext,CHECK-EP-PIPELINE-EARLY-SIMPLIFICATION,CHECK-O23SZ
+; RUN: opt -disable-verify -debug-pass-manager \
 ; RUN:     -passes-ep-pipeline-start='no-op-module' \
 ; RUN:     -passes='lto-pre-link<O3>' -S  %s 2>&1 \
 ; RUN:     | FileCheck %s --check-prefixes=CHECK-O,CHECK-LTO,CHECK-O3,%llvmcheckext,CHECK-EP-PIPELINE-START,CHECK-O23SZ
@@ -84,6 +88,7 @@
 ; CHECK-O-NEXT: Running pass: LowerExpectIntrinsicPass
 ; CHECK-O3-NEXT: Running pass: CallSiteSplittingPass
 ; CHECK-O-NEXT: Finished llvm::Function pass manager run.
+; CHECK-EP-PIPELINE-EARLY-SIMPLIFICATION-NEXT: Running pass: NoOpModulePass
 ; CHECK-O-NEXT: Running pass: IPSCCPPass
 ; CHECK-O-NEXT: Running pass: CalledValuePropagationPass
 ; CHECK-O-NEXT: Running pass: GlobalOptPass
diff --git a/llvm/test/Other/pass-pipeline-parsing.ll b/llvm/test/Other/pass-pipeline-parsing.ll
index adf7554ac503b..4cff050f52df0 100644
--- a/llvm/test/Other/pass-pipeline-parsing.ll
+++ b/llvm/test/Other/pass-pipeline-parsing.ll
@@ -276,6 +276,9 @@
 ; RUN: opt -passes-ep-pipeline-start=bad -passes=no-op-function \
 ; RUN:       /dev/null -disable-output 2>&1 | FileCheck %s -check-prefix=PASSES-EP-PIPELINESTART-ERR
 ; PASSES-EP-PIPELINESTART-ERR: Could not parse -passes-ep-pipeline-start pipeline: unknown pass name 'bad'
+; RUN: opt -passes-ep-pipeline-early-simplification=bad -passes=no-op-function \
+; RUN:       /dev/null -disable-output 2>&1 | FileCheck %s -check-prefix=PASSES-EP-PIPELINEEARLYSIMPLIFICATION-ERR
+; PASSES-EP-PIPELINEEARLYSIMPLIFICATION-ERR: Could not parse -passes-ep-pipeline-early-simplification pipeline: unknown pass name 'bad'
 
 define void @f() {
 entry:
diff --git a/llvm/tools/opt/NewPMDriver.cpp b/llvm/tools/opt/NewPMDriver.cpp
index 224a3242eb17f..9badbdf64932d 100644
--- a/llvm/tools/opt/NewPMDriver.cpp
+++ b/llvm/tools/opt/NewPMDriver.cpp
@@ -107,6 +107,11 @@ static cl::opt<std::string> PipelineStartEPPipeline(
     cl::desc("A textual description of the module pass pipeline inserted at "
              "the PipelineStart extension point into default pipelines"),
     cl::Hidden);
+static cl::opt<std::string> PipelineEarlySimplificationEPPipeline(
+    "passes-ep-pipeline-early-simplification",
+    cl::desc("A textual description of the module pass pipeline inserted at "
+             "the EarlySimplification extension point into default pipelines"),
+    cl::Hidden);
 static cl::opt<std::string> OptimizerLastEPPipeline(
     "passes-ep-optimizer-last",
     cl::desc("A textual description of the module pass pipeline inserted at "
@@ -195,6 +200,13 @@ static void registerEPCallbacks(PassBuilder &PB) {
           ExitOnError Err("Unable to parse PipelineStartEP pipeline: ");
           Err(PB.parsePassPipeline(PM, PipelineStartEPPipeline));
         });
+  if (tryParsePipelineText<ModulePassManager>(
+          PB, PipelineEarlySimplificationEPPipeline))
+    PB.registerPipelineEarlySimplificationEPCallback(
+        [&PB](ModulePassManager &PM, PassBuilder::OptimizationLevel) {
+          ExitOnError Err("Unable to parse EarlySimplification pipeline: ");
+          Err(PB.parsePassPipeline(PM, PipelineEarlySimplificationEPPipeline));
+        });
   if (tryParsePipelineText<FunctionPassManager>(PB, OptimizerLastEPPipeline))
     PB.registerOptimizerLastEPCallback(
         [&PB](ModulePassManager &PM, PassBuilder::OptimizationLevel) {

From 9c588f53fc423dd0ed69250fbc93b37b40c0ef44 Mon Sep 17 00:00:00 2001
From: QingShan Zhang <qshanz@cn.ibm.com>
Date: Wed, 25 Nov 2020 05:37:15 +0000
Subject: [PATCH 2/3] [DAGCombine] Add hook to allow target specific test for
 sqrt input

PowerPC has instruction ftsqrt/xstsqrtdp etc to do the input test for software square root.
LLVM now tests it with smallest normalized value using abs + setcc. We should add hook to
target that has test instructions.

Reviewed By: Spatel, Chen Zheng, Qiu Chao Fang

Differential Revision: https://reviews.llvm.org/D80706
---
 llvm/include/llvm/CodeGen/TargetLowering.h    |  9 ++++
 llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 41 +++++++++--------
 llvm/lib/Target/PowerPC/PPCISelLowering.cpp   | 29 ++++++++++++
 llvm/lib/Target/PowerPC/PPCISelLowering.h     |  5 +++
 llvm/lib/Target/PowerPC/PPCInstrFormats.td    |  3 +-
 llvm/lib/Target/PowerPC/PPCInstrInfo.td       |  7 ++-
 llvm/lib/Target/PowerPC/PPCInstrVSX.td        |  3 +-
 llvm/test/CodeGen/PowerPC/fma-mutate.ll       |  7 +--
 llvm/test/CodeGen/PowerPC/recipest.ll         | 45 ++++++++-----------
 9 files changed, 96 insertions(+), 53 deletions(-)

diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h
index 164cbd7107132..16580a9160b9a 100644
--- a/llvm/include/llvm/CodeGen/TargetLowering.h
+++ b/llvm/include/llvm/CodeGen/TargetLowering.h
@@ -4277,6 +4277,15 @@ class TargetLowering : public TargetLoweringBase {
     return SDValue();
   }
 
+  /// Return a target-dependent comparison result if the input operand is
+  /// suitable for use with a square root estimate calculation. For example, the
+  /// comparison may check if the operand is NAN, INF, zero, normal, etc. The
+  /// result should be used as the condition operand for a select or branch.
+  virtual SDValue getSqrtInputTest(SDValue Operand, SelectionDAG &DAG,
+                                   const DenormalMode &Mode) const {
+    return SDValue();
+  }
+
   //===--------------------------------------------------------------------===//
   // Legalization utility functions
   //
diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index cae602d166d15..4ac1743d2d342 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -22056,26 +22056,31 @@ SDValue DAGCombiner::buildSqrtEstimateImpl(SDValue Op, SDNodeFlags Flags,
         // possibly a denormal. Force the answer to 0.0 for those cases.
         SDLoc DL(Op);
         EVT CCVT = getSetCCResultType(VT);
-        ISD::NodeType SelOpcode = VT.isVector() ? ISD::VSELECT : ISD::SELECT;
+        SDValue FPZero = DAG.getConstantFP(0.0, DL, VT);
         DenormalMode DenormMode = DAG.getDenormalMode(VT);
-        if (DenormMode.Input == DenormalMode::IEEE) {
-          // This is specifically a check for the handling of denormal inputs,
-          // not the result.
-
-          // fabs(X) < SmallestNormal ? 0.0 : Est
-          const fltSemantics &FltSem = DAG.EVTToAPFloatSemantics(VT);
-          APFloat SmallestNorm = APFloat::getSmallestNormalized(FltSem);
-          SDValue NormC = DAG.getConstantFP(SmallestNorm, DL, VT);
-          SDValue FPZero = DAG.getConstantFP(0.0, DL, VT);
-          SDValue Fabs = DAG.getNode(ISD::FABS, DL, VT, Op);
-          SDValue IsDenorm = DAG.getSetCC(DL, CCVT, Fabs, NormC, ISD::SETLT);
-          Est = DAG.getNode(SelOpcode, DL, VT, IsDenorm, FPZero, Est);
-        } else {
-          // X == 0.0 ? 0.0 : Est
-          SDValue FPZero = DAG.getConstantFP(0.0, DL, VT);
-          SDValue IsZero = DAG.getSetCC(DL, CCVT, Op, FPZero, ISD::SETEQ);
-          Est = DAG.getNode(SelOpcode, DL, VT, IsZero, FPZero, Est);
+        // Try the target specific test first.
+        SDValue Test = TLI.getSqrtInputTest(Op, DAG, DenormMode);
+        if (!Test) {
+          // If no test provided by target, testing it with denormal inputs to
+          // avoid wrong estimate.
+          if (DenormMode.Input == DenormalMode::IEEE) {
+            // This is specifically a check for the handling of denormal inputs,
+            // not the result.
+
+            // Test = fabs(X) < SmallestNormal
+            const fltSemantics &FltSem = DAG.EVTToAPFloatSemantics(VT);
+            APFloat SmallestNorm = APFloat::getSmallestNormalized(FltSem);
+            SDValue NormC = DAG.getConstantFP(SmallestNorm, DL, VT);
+            SDValue Fabs = DAG.getNode(ISD::FABS, DL, VT, Op);
+            Test = DAG.getSetCC(DL, CCVT, Fabs, NormC, ISD::SETLT);
+          } else
+            // Test = X == 0.0
+            Test = DAG.getSetCC(DL, CCVT, Op, FPZero, ISD::SETEQ);
         }
+        // Test ? 0.0 : Est
+        Est = DAG.getNode(Test.getValueType().isVector() ? ISD::VSELECT
+                                                         : ISD::SELECT,
+                          DL, VT, Test, FPZero, Est);
       }
     }
     return Est;
diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
index 10aecf97fcdf1..d19fbd477d77e 100644
--- a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
+++ b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
@@ -1447,6 +1447,8 @@ const char *PPCTargetLowering::getTargetNodeName(unsigned Opcode) const {
                                 return "PPCISD::FP_TO_SINT_IN_VSR";
   case PPCISD::FRE:             return "PPCISD::FRE";
   case PPCISD::FRSQRTE:         return "PPCISD::FRSQRTE";
+  case PPCISD::FTSQRT:
+    return "PPCISD::FTSQRT";
   case PPCISD::STFIWX:          return "PPCISD::STFIWX";
   case PPCISD::VPERM:           return "PPCISD::VPERM";
   case PPCISD::XXSPLT:          return "PPCISD::XXSPLT";
@@ -12758,6 +12760,33 @@ static int getEstimateRefinementSteps(EVT VT, const PPCSubtarget &Subtarget) {
   return RefinementSteps;
 }
 
+SDValue PPCTargetLowering::getSqrtInputTest(SDValue Op, SelectionDAG &DAG,
+                                            const DenormalMode &Mode) const {
+  // TODO - add support for v2f64/v4f32
+  EVT VT = Op.getValueType();
+  if (VT != MVT::f64)
+    return SDValue();
+
+  SDLoc DL(Op);
+  // The output register of FTSQRT is CR field.
+  SDValue FTSQRT = DAG.getNode(PPCISD::FTSQRT, DL, MVT::i32, Op);
+  // ftsqrt BF,FRB
+  // Let e_b be the unbiased exponent of the double-precision
+  // floating-point operand in register FRB.
+  // fe_flag is set to 1 if either of the following conditions occurs.
+  //   - The double-precision floating-point operand in register FRB is a zero,
+  //     a NaN, or an infinity, or a negative value.
+  //   - e_b is less than or equal to -970.
+  // Otherwise fe_flag is set to 0.
+  // Both VSX and non-VSX versions would set EQ bit in the CR if the number is
+  // not eligible for iteration. (zero/negative/infinity/nan or unbiased
+  // exponent is less than -970)
+  SDValue SRIdxVal = DAG.getTargetConstant(PPC::sub_eq, DL, MVT::i32);
+  return SDValue(DAG.getMachineNode(TargetOpcode::EXTRACT_SUBREG, DL, MVT::i1,
+                                    FTSQRT, SRIdxVal),
+                 0);
+}
+
 SDValue PPCTargetLowering::getSqrtEstimate(SDValue Operand, SelectionDAG &DAG,
                                            int Enabled, int &RefinementSteps,
                                            bool &UseOneConstNR,
diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.h b/llvm/lib/Target/PowerPC/PPCISelLowering.h
index 414a355264f83..6c4899fae22cb 100644
--- a/llvm/lib/Target/PowerPC/PPCISelLowering.h
+++ b/llvm/lib/Target/PowerPC/PPCISelLowering.h
@@ -89,6 +89,9 @@ namespace llvm {
     FRE,
     FRSQRTE,
 
+    /// Test instruction for software square root.
+    FTSQRT,
+
     /// VPERM - The PPC VPERM Instruction.
     ///
     VPERM,
@@ -1283,6 +1286,8 @@ namespace llvm {
                             bool Reciprocal) const override;
     SDValue getRecipEstimate(SDValue Operand, SelectionDAG &DAG, int Enabled,
                              int &RefinementSteps) const override;
+    SDValue getSqrtInputTest(SDValue Operand, SelectionDAG &DAG,
+                             const DenormalMode &Mode) const override;
     unsigned combineRepeatedFPDivisors() const override;
 
     SDValue
diff --git a/llvm/lib/Target/PowerPC/PPCInstrFormats.td b/llvm/lib/Target/PowerPC/PPCInstrFormats.td
index 5ff5fc78326ba..646efe64a22c7 100644
--- a/llvm/lib/Target/PowerPC/PPCInstrFormats.td
+++ b/llvm/lib/Target/PowerPC/PPCInstrFormats.td
@@ -637,9 +637,10 @@ class XForm_17<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr,
 }
 
 class XForm_17a<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr,
-               InstrItinClass itin>
+               InstrItinClass itin, list<dag> pattern>
   : XForm_17<opcode, xo, OOL, IOL, asmstr, itin > {
   let FRA = 0;
+  let Pattern = pattern;
 }
 
 class XForm_18<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr,
diff --git a/llvm/lib/Target/PowerPC/PPCInstrInfo.td b/llvm/lib/Target/PowerPC/PPCInstrInfo.td
index 2e77d04d4a79e..de9ae99adac73 100644
--- a/llvm/lib/Target/PowerPC/PPCInstrInfo.td
+++ b/llvm/lib/Target/PowerPC/PPCInstrInfo.td
@@ -74,6 +74,9 @@ def SDT_PPCcondbr : SDTypeProfile<0, 3, [
   SDTCisVT<0, i32>, SDTCisVT<2, OtherVT>
 ]>;
 
+def SDT_PPCFtsqrt : SDTypeProfile<1, 1, [
+  SDTCisVT<0, i32>]>;
+
 def SDT_PPClbrx : SDTypeProfile<1, 2, [
   SDTCisInt<0>, SDTCisPtrTy<1>, SDTCisVT<2, OtherVT>
 ]>;
@@ -124,6 +127,7 @@ def SDT_PPCFPMinMax : SDTypeProfile<1, 2, [
 
 def PPCfre    : SDNode<"PPCISD::FRE",     SDTFPUnaryOp, []>;
 def PPCfrsqrte: SDNode<"PPCISD::FRSQRTE", SDTFPUnaryOp, []>;
+def PPCftsqrt : SDNode<"PPCISD::FTSQRT",  SDT_PPCFtsqrt,[]>;
 
 def PPCfcfid  : SDNode<"PPCISD::FCFID",   SDTFPUnaryOp, []>;
 def PPCfcfidu : SDNode<"PPCISD::FCFIDU",  SDTFPUnaryOp, []>;
@@ -2643,7 +2647,8 @@ let isCompare = 1, mayRaiseFPException = 1, hasSideEffects = 0 in {
 def FTDIV: XForm_17<63, 128, (outs crrc:$crD), (ins f8rc:$fA, f8rc:$fB),
                       "ftdiv $crD, $fA, $fB", IIC_FPCompare>;
 def FTSQRT: XForm_17a<63, 160, (outs crrc:$crD), (ins f8rc:$fB),
-                      "ftsqrt $crD, $fB", IIC_FPCompare>;
+                      "ftsqrt $crD, $fB", IIC_FPCompare,
+                      [(set i32:$crD, (PPCftsqrt f64:$fB))]>;
 
 let mayRaiseFPException = 1, hasSideEffects = 0 in {
   let Interpretation64Bit = 1, isCodeGenOnly = 1 in
diff --git a/llvm/lib/Target/PowerPC/PPCInstrVSX.td b/llvm/lib/Target/PowerPC/PPCInstrVSX.td
index 1ffbd405d87aa..b023c05960639 100644
--- a/llvm/lib/Target/PowerPC/PPCInstrVSX.td
+++ b/llvm/lib/Target/PowerPC/PPCInstrVSX.td
@@ -629,7 +629,8 @@ let hasSideEffects = 0 in {
                          "xstdivdp $crD, $XA, $XB", IIC_FPCompare, []>;
   def XSTSQRTDP : XX2Form_1<60, 106,
                           (outs crrc:$crD), (ins vsfrc:$XB),
-                          "xstsqrtdp $crD, $XB", IIC_FPCompare, []>;
+                          "xstsqrtdp $crD, $XB", IIC_FPCompare,
+                          [(set i32:$crD, (PPCftsqrt f64:$XB))]>;
   def XVTDIVDP : XX3Form_1<60, 125,
                          (outs crrc:$crD), (ins vsrc:$XA, vsrc:$XB),
                          "xvtdivdp $crD, $XA, $XB", IIC_FPCompare, []>;
diff --git a/llvm/test/CodeGen/PowerPC/fma-mutate.ll b/llvm/test/CodeGen/PowerPC/fma-mutate.ll
index a1e3473edf222..62cce7362c682 100644
--- a/llvm/test/CodeGen/PowerPC/fma-mutate.ll
+++ b/llvm/test/CodeGen/PowerPC/fma-mutate.ll
@@ -9,12 +9,9 @@ declare double @llvm.sqrt.f64(double)
 define double @foo3_fmf(double %a) nounwind {
 ; CHECK-LABEL: foo3_fmf:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    xsabsdp 0, 1
-; CHECK-NEXT:    addis 3, 2, .LCPI0_2@toc@ha
-; CHECK-NEXT:    lfd 2, .LCPI0_2@toc@l(3)
-; CHECK-NEXT:    xscmpudp 0, 0, 2
+; CHECK-NEXT:    xstsqrtdp 0, 1
 ; CHECK-NEXT:    xxlxor 0, 0, 0
-; CHECK-NEXT:    blt 0, .LBB0_2
+; CHECK-NEXT:    bc 12, 2, .LBB0_2
 ; CHECK-NEXT:  # %bb.1:
 ; CHECK-NEXT:    xsrsqrtedp 0, 1
 ; CHECK-NEXT:    addis 3, 2, .LCPI0_0@toc@ha
diff --git a/llvm/test/CodeGen/PowerPC/recipest.ll b/llvm/test/CodeGen/PowerPC/recipest.ll
index e3894bcd23f5a..cd8520b35ffad 100644
--- a/llvm/test/CodeGen/PowerPC/recipest.ll
+++ b/llvm/test/CodeGen/PowerPC/recipest.ll
@@ -749,11 +749,8 @@ define <4 x float> @hoo2_safe(<4 x float> %a, <4 x float> %b) nounwind {
 define double @foo3_fmf(double %a) nounwind {
 ; CHECK-P7-LABEL: foo3_fmf:
 ; CHECK-P7:       # %bb.0:
-; CHECK-P7-NEXT:    fabs 0, 1
-; CHECK-P7-NEXT:    addis 3, 2, .LCPI20_2@toc@ha
-; CHECK-P7-NEXT:    lfd 2, .LCPI20_2@toc@l(3)
-; CHECK-P7-NEXT:    fcmpu 0, 0, 2
-; CHECK-P7-NEXT:    blt 0, .LBB20_2
+; CHECK-P7-NEXT:    ftsqrt 0, 1
+; CHECK-P7-NEXT:    bc 12, 2, .LBB20_2
 ; CHECK-P7-NEXT:  # %bb.1:
 ; CHECK-P7-NEXT:    frsqrte 0, 1
 ; CHECK-P7-NEXT:    addis 3, 2, .LCPI20_0@toc@ha
@@ -770,18 +767,15 @@ define double @foo3_fmf(double %a) nounwind {
 ; CHECK-P7-NEXT:    fmul 1, 1, 0
 ; CHECK-P7-NEXT:    blr
 ; CHECK-P7-NEXT:  .LBB20_2:
-; CHECK-P7-NEXT:    addis 3, 2, .LCPI20_3@toc@ha
-; CHECK-P7-NEXT:    lfs 1, .LCPI20_3@toc@l(3)
+; CHECK-P7-NEXT:    addis 3, 2, .LCPI20_2@toc@ha
+; CHECK-P7-NEXT:    lfs 1, .LCPI20_2@toc@l(3)
 ; CHECK-P7-NEXT:    blr
 ;
 ; CHECK-P8-LABEL: foo3_fmf:
 ; CHECK-P8:       # %bb.0:
-; CHECK-P8-NEXT:    xsabsdp 0, 1
-; CHECK-P8-NEXT:    addis 3, 2, .LCPI20_2@toc@ha
-; CHECK-P8-NEXT:    lfd 2, .LCPI20_2@toc@l(3)
-; CHECK-P8-NEXT:    xscmpudp 0, 0, 2
+; CHECK-P8-NEXT:    xstsqrtdp 0, 1
 ; CHECK-P8-NEXT:    xxlxor 0, 0, 0
-; CHECK-P8-NEXT:    blt 0, .LBB20_2
+; CHECK-P8-NEXT:    bc 12, 2, .LBB20_2
 ; CHECK-P8-NEXT:  # %bb.1:
 ; CHECK-P8-NEXT:    xsrsqrtedp 0, 1
 ; CHECK-P8-NEXT:    addis 3, 2, .LCPI20_0@toc@ha
@@ -803,12 +797,9 @@ define double @foo3_fmf(double %a) nounwind {
 ;
 ; CHECK-P9-LABEL: foo3_fmf:
 ; CHECK-P9:       # %bb.0:
-; CHECK-P9-NEXT:    addis 3, 2, .LCPI20_2@toc@ha
-; CHECK-P9-NEXT:    xsabsdp 0, 1
-; CHECK-P9-NEXT:    lfd 2, .LCPI20_2@toc@l(3)
-; CHECK-P9-NEXT:    xscmpudp 0, 0, 2
+; CHECK-P9-NEXT:    xstsqrtdp 0, 1
 ; CHECK-P9-NEXT:    xxlxor 0, 0, 0
-; CHECK-P9-NEXT:    blt 0, .LBB20_2
+; CHECK-P9-NEXT:    bc 12, 2, .LBB20_2
 ; CHECK-P9-NEXT:  # %bb.1:
 ; CHECK-P9-NEXT:    xsrsqrtedp 0, 1
 ; CHECK-P9-NEXT:    addis 3, 2, .LCPI20_0@toc@ha
@@ -1038,18 +1029,18 @@ define <2 x double> @hoo4_fmf(<2 x double> %a) #1 {
 ; CHECK-P7-LABEL: hoo4_fmf:
 ; CHECK-P7:       # %bb.0:
 ; CHECK-P7-NEXT:    addis 3, 2, .LCPI26_2@toc@ha
+; CHECK-P7-NEXT:    ftsqrt 0, 1
 ; CHECK-P7-NEXT:    fmr 3, 1
-; CHECK-P7-NEXT:    addis 4, 2, .LCPI26_1@toc@ha
+; CHECK-P7-NEXT:    addis 4, 2, .LCPI26_0@toc@ha
 ; CHECK-P7-NEXT:    lfs 0, .LCPI26_2@toc@l(3)
-; CHECK-P7-NEXT:    addis 3, 2, .LCPI26_0@toc@ha
-; CHECK-P7-NEXT:    lfs 4, .LCPI26_1@toc@l(4)
-; CHECK-P7-NEXT:    lfs 5, .LCPI26_0@toc@l(3)
-; CHECK-P7-NEXT:    fcmpu 0, 1, 0
+; CHECK-P7-NEXT:    addis 3, 2, .LCPI26_1@toc@ha
+; CHECK-P7-NEXT:    lfs 5, .LCPI26_0@toc@l(4)
+; CHECK-P7-NEXT:    lfs 4, .LCPI26_1@toc@l(3)
 ; CHECK-P7-NEXT:    fmr 1, 0
-; CHECK-P7-NEXT:    bne 0, .LBB26_3
+; CHECK-P7-NEXT:    bc 4, 2, .LBB26_3
 ; CHECK-P7-NEXT:  # %bb.1:
-; CHECK-P7-NEXT:    fcmpu 0, 2, 0
-; CHECK-P7-NEXT:    bne 0, .LBB26_4
+; CHECK-P7-NEXT:    ftsqrt 0, 2
+; CHECK-P7-NEXT:    bc 4, 2, .LBB26_4
 ; CHECK-P7-NEXT:  .LBB26_2:
 ; CHECK-P7-NEXT:    fmr 2, 0
 ; CHECK-P7-NEXT:    blr
@@ -1063,8 +1054,8 @@ define <2 x double> @hoo4_fmf(<2 x double> %a) #1 {
 ; CHECK-P7-NEXT:    fmadd 1, 3, 1, 5
 ; CHECK-P7-NEXT:    fmul 3, 3, 4
 ; CHECK-P7-NEXT:    fmul 1, 3, 1
-; CHECK-P7-NEXT:    fcmpu 0, 2, 0
-; CHECK-P7-NEXT:    beq 0, .LBB26_2
+; CHECK-P7-NEXT:    ftsqrt 0, 2
+; CHECK-P7-NEXT:    bc 12, 2, .LBB26_2
 ; CHECK-P7-NEXT:  .LBB26_4:
 ; CHECK-P7-NEXT:    frsqrte 0, 2
 ; CHECK-P7-NEXT:    fmul 3, 2, 0

From 9130651126b745b18138b816487cdeb8a689a27f Mon Sep 17 00:00:00 2001
From: Max Kazantsev <mkazantsev@azul.com>
Date: Wed, 25 Nov 2020 13:20:15 +0700
Subject: [PATCH 3/3] Revert "[SCEV] Generalize no-self-wrap check in
 isLoopInvariantExitCondDuringFirstIterations"

This reverts commit 7dcc8899174f44b7447bc48a9f2ff27f5458f8b7.

This patch introduced a logical error that breaks whole logic of this analysis.
All checks we are making are supposed to be loop-independent, so that we could
safely remove the range check. The 'nw' fact is loop-dependent, so we can remove
the check basing on facts from this very check.

Motivating examples will follow-up.
---
 llvm/lib/Analysis/ScalarEvolution.cpp | 31 +++++++++++++--------------
 1 file changed, 15 insertions(+), 16 deletions(-)

diff --git a/llvm/lib/Analysis/ScalarEvolution.cpp b/llvm/lib/Analysis/ScalarEvolution.cpp
index 08ed363918a95..5f77f4aa05c2c 100644
--- a/llvm/lib/Analysis/ScalarEvolution.cpp
+++ b/llvm/lib/Analysis/ScalarEvolution.cpp
@@ -9643,19 +9643,17 @@ ScalarEvolution::getLoopInvariantExitCondDuringFirstIterations(
   if (!ICmpInst::isRelational(Pred))
     return None;
 
+  // TODO: Support steps other than +/- 1.
   const SCEV *Step = AR->getStepRecurrence(*this);
-  bool IsStepNonPositive = isKnownNonPositive(Step);
-  if (!IsStepNonPositive && !isKnownNonNegative(Step))
+  auto *One = getOne(Step->getType());
+  auto *MinusOne = getNegativeSCEV(One);
+  if (Step != One && Step != MinusOne)
     return None;
-  bool HasNoSelfWrap = AR->hasNoSelfWrap();
-  if (!HasNoSelfWrap)
-    // If num iter has same type as the AddRec, and step is +/- 1, even max
-    // possible number of iterations is not enough to self-wrap.
-    if (MaxIter->getType() == AR->getType())
-      if (Step == getOne(AR->getType()) || Step == getMinusOne(AR->getType()))
-        HasNoSelfWrap = true;
-  // Only proceed with non-self-wrapping ARs.
-  if (!HasNoSelfWrap)
+
+  // Type mismatch here means that MaxIter is potentially larger than max
+  // unsigned value in start type, which mean we cannot prove no wrap for the
+  // indvar.
+  if (AR->getType() != MaxIter->getType())
     return None;
 
   // Value of IV on suggested last iteration.
@@ -9663,13 +9661,14 @@ ScalarEvolution::getLoopInvariantExitCondDuringFirstIterations(
   // Does it still meet the requirement?
   if (!isKnownPredicateAt(Pred, Last, RHS, Context))
     return None;
-  // We know that the addrec does not have a self-wrap. To prove that there is
-  // no signed/unsigned wrap, we need to check that
-  // Start <= Last for positive step or Start >= Last for negative step. Either
-  // works for zero step.
+  // Because step is +/- 1 and MaxIter has same type as Start (i.e. it does
+  // not exceed max unsigned value of this type), this effectively proves
+  // that there is no wrap during the iteration. To prove that there is no
+  // signed/unsigned wrap, we need to check that
+  // Start <= Last for step = 1 or Start >= Last for step = -1.
   ICmpInst::Predicate NoOverflowPred =
       CmpInst::isSigned(Pred) ? ICmpInst::ICMP_SLE : ICmpInst::ICMP_ULE;
-  if (IsStepNonPositive)
+  if (Step == MinusOne)
     NoOverflowPred = CmpInst::getSwappedPredicate(NoOverflowPred);
   const SCEV *Start = AR->getStart();
   if (!isKnownPredicateAt(NoOverflowPred, Start, Last, Context))