From f787fe15d8e1cb63b40235e781cd7c2e130bbcd6 Mon Sep 17 00:00:00 2001
From: Michael Liao <michael.hliao@gmail.com>
Date: Fri, 11 Sep 2020 01:58:11 -0400
Subject: [PATCH 1/9] [EarlyCSE] Remove unnecessary operand swap.

- As min/max are commutative operators, there is no need to swap
  operands. That breaks the convention calculating the hash value.
---
 llvm/lib/Transforms/Scalar/EarlyCSE.cpp | 1 -
 llvm/test/CodeGen/AMDGPU/sad.ll         | 2 +-
 2 files changed, 1 insertion(+), 2 deletions(-)

diff --git a/llvm/lib/Transforms/Scalar/EarlyCSE.cpp b/llvm/lib/Transforms/Scalar/EarlyCSE.cpp
index f71a2b9e003a91..e47ecb4fbb44ae 100644
--- a/llvm/lib/Transforms/Scalar/EarlyCSE.cpp
+++ b/llvm/lib/Transforms/Scalar/EarlyCSE.cpp
@@ -199,7 +199,6 @@ static bool matchSelectWithOptionalNotCond(Value *V, Value *&Cond, Value *&A,
   case CmpInst::ICMP_SLE:
   case CmpInst::ICMP_SGE:
     Pred = CmpInst::getInversePredicate(Pred);
-    std::swap(A, B);
     Inversed = true;
     break;
   default:
diff --git a/llvm/test/CodeGen/AMDGPU/sad.ll b/llvm/test/CodeGen/AMDGPU/sad.ll
index 3a4a2d07772c12..464b413e655880 100644
--- a/llvm/test/CodeGen/AMDGPU/sad.ll
+++ b/llvm/test/CodeGen/AMDGPU/sad.ll
@@ -1,4 +1,4 @@
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=kaveri -earlycse-debug-hash -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
 
 ; GCN-LABEL: {{^}}v_sad_u32_pat1:
 ; GCN: v_sad_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}

From 525c83cee00a3a92d9b1a9d6f39ee4fd6c0c798d Mon Sep 17 00:00:00 2001
From: Esme-Yi <esme.yi@ibm.com>
Date: Fri, 11 Sep 2020 07:16:58 +0000
Subject: [PATCH 2/9] [NFC][PowerPC] Add tests of constants-i64.

---
 llvm/test/CodeGen/PowerPC/constants-i64.ll | 70 ++++++++++++++++++----
 1 file changed, 58 insertions(+), 12 deletions(-)

diff --git a/llvm/test/CodeGen/PowerPC/constants-i64.ll b/llvm/test/CodeGen/PowerPC/constants-i64.ll
index 956845f5a5b359..38a765343fc74e 100644
--- a/llvm/test/CodeGen/PowerPC/constants-i64.ll
+++ b/llvm/test/CodeGen/PowerPC/constants-i64.ll
@@ -80,47 +80,93 @@ entry:
 ; CHECK: blr
 }
 
-define i64 @cn32_1() #0 {
+define i64 @uint32_1() #0 {
 entry:
   ret i64 3900000000
 
-; CHECK-LABEL: @cn32_1
+; CHECK-LABEL: @uint32_1
 ; CHECK: lis [[REG1:[0-9]+]], 232
 ; CHECK: ori [[REG2:[0-9]+]], [[REG1]], 30023
-; CHECK: sldi 3, [[REG1]], 8
+; CHECK: sldi 3, [[REG2]], 8
 ; CHECK: blr
 }
 
-define i32 @cn32_1_i32() #0 {
+define i32 @uint32_1_i32() #0 {
 entry:
   ret i32 -394967296
 
-; CHECK-LABEL: @cn32_1_i32
+; CHECK-LABEL: @uint32_1_i32
 ; CHECK: lis [[REG1:[0-9]+]], 232
 ; CHECK: ori [[REG2:[0-9]+]], [[REG1]], 30023
-; CHECK: sldi 3, [[REG1]], 8
+; CHECK: sldi 3, [[REG2]], 8
 ; CHECK: blr
 }
 
-define i64 @cn32_2() #0 {
+define i64 @uint32_2() #0 {
 entry:
   ret i64 4294967295
 
-; CHECK-LABEL: @cn32_2
+; CHECK-LABEL: @uint32_2
 ; CHECK: li [[REG1:[0-9]+]], 0
 ; CHECK: oris [[REG2:[0-9]+]], [[REG1]], 65535
-; CHECK: ori [[REG2:[0-9]+]], [[REG1]], 65535
+; CHECK: ori 3, [[REG2]], 65535
 ; CHECK: blr
 }
 
-define i32 @cn32_2_i32() #0 {
+define i32 @uint32_2_i32() #0 {
 entry:
   ret i32 -1
 
-; CHECK-LABEL: @cn32_2_i32
+; CHECK-LABEL: @uint32_2_i32
 ; CHECK: li [[REG1:[0-9]+]], 0
 ; CHECK: oris [[REG2:[0-9]+]], [[REG1]], 65535
-; CHECK: ori [[REG2:[0-9]+]], [[REG1]], 65535
+; CHECK: ori 3, [[REG2]], 65535
+; CHECK: blr
+}
+
+define i64 @uint32_3() #0 {
+entry:
+  ret i64 2147483648
+
+; CHECK-LABEL: @uint32_3
+; CHECK: li [[REG1:[0-9]+]], 1
+; CHECK: sldi 3, [[REG1]], 31
+; CHECK: blr
+}
+
+define i64 @uint32_4() #0 {
+entry:
+  ret i64 124800000032
+
+; CHECK-LABEL: @uint32_4
+; CHECK: li [[REG1:[0-9]+]], 29
+; CHECK: sldi [[REG2:[0-9]+]], [[REG1]], 32
+; CHECK: oris [[REG3:[0-9]+]], [[REG2]], 3752
+; CHECK: ori 3, [[REG3]], 57376
+; CHECK: blr
+}
+
+define i64 @cn_ones_1() #0 {
+entry:
+  ret i64 10460594175
+
+; CHECK-LABEL: @cn_ones_1
+; CHECK: li [[REG1:[0-9]+]], 2
+; CHECK: sldi [[REG2:[0-9]+]], [[REG1]], 32
+; CHECK: oris [[REG3:[0-9]+]], [[REG2]], 28543
+; CHECK: ori 3, [[REG3]], 65535
+; CHECK: blr
+}
+
+define i64 @cn_ones_2() #0 {
+entry:
+  ret i64 10459119615
+
+; CHECK-LABEL: @cn_ones_2
+; CHECK: li [[REG1:[0-9]+]], 2
+; CHECK: sldi [[REG2:[0-9]+]], [[REG1]], 32
+; CHECK: oris [[REG3:[0-9]+]], [[REG2]], 28521
+; CHECK: ori 3, [[REG3]], 32767
 ; CHECK: blr
 }
 

From e38be7091ee3d00430652aaa7b66ba3fc8394916 Mon Sep 17 00:00:00 2001
From: Guillaume Chatelet <gchatelet@google.com>
Date: Thu, 10 Sep 2020 14:27:27 +0000
Subject: [PATCH 3/9] [Clang] Clarify __builtin_memcpy_inline documentation

This patch updates the documentation about `__builtin_memcpy_inline` and reorders the sections so it is more consitent and understandable.

Differential Revision: https://reviews.llvm.org/D87458
---
 clang/docs/LanguageExtensions.rst | 36 ++++++++++++++++++-------------
 1 file changed, 21 insertions(+), 15 deletions(-)

diff --git a/clang/docs/LanguageExtensions.rst b/clang/docs/LanguageExtensions.rst
index 60b3f21b3e5004..073d9c86e22ff0 100644
--- a/clang/docs/LanguageExtensions.rst
+++ b/clang/docs/LanguageExtensions.rst
@@ -2408,20 +2408,6 @@ with ``__has_feature(cxx_constexpr_string_builtins)``.
 Memory builtins
 ---------------
 
- * ``__builtin_memcpy_inline``
-
-.. code-block:: c
-
-  void __builtin_memcpy_inline(void *dst, const void *src, size_t size);
-
-``__builtin_memcpy_inline(dst, src, size)`` is identical to
-``__builtin_memcpy(dst, src, size)`` except that the generated code is
-guaranteed not to call any external functions. See LLVM IR `llvm.memcpy.inline
-<https://llvm.org/docs/LangRef.html#llvm-memcpy-inline-intrinsic>`_ Intrinsic 
-for more information.
-
-Note that the `size` argument must be a compile time constant.
-
 Clang provides constant expression evaluation support for builtin forms of the
 following functions from the C standard library headers
 ``<string.h>`` and ``<wchar.h>``:
@@ -2439,7 +2425,27 @@ are pointers to arrays with the same trivially copyable element type, and the
 given size is an exact multiple of the element size that is no greater than
 the number of elements accessible through the source and destination operands.
 
-Constant evaluation support is not yet provided for ``__builtin_memcpy_inline``.
+Guaranteed inlined copy
+^^^^^^^^^^^^^^^^^^^^^^^
+
+.. code-block:: c
+
+  void __builtin_memcpy_inline(void *dst, const void *src, size_t size);
+
+
+``__builtin_memcpy_inline`` has been designed as a building block for efficient
+``memcpy`` implementations. It is identical to ``__builtin_memcpy`` but also
+guarantees not to call any external functions. See LLVM IR `llvm.memcpy.inline
+<https://llvm.org/docs/LangRef.html#llvm-memcpy-inline-intrinsic>`_ Intrinsic 
+for more information.
+
+This is useful to implement a custom version of ``memcpy``, implemement a
+``libc`` memcpy or work around the absence of a ``libc``.
+
+Note that the `size` argument must be a compile time constant.
+
+Note that this intrinsic cannot yet be called in a ``constexpr`` context.
+
 
 Atomic Min/Max builtins with memory ordering
 --------------------------------------------

From 46416f08031f6fcaccd9f51430f7a71c5f510495 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Martin=20Storsj=C3=B6?= <martin@martin.st>
Date: Thu, 10 Sep 2020 12:37:34 +0300
Subject: [PATCH 4/9] =?UTF-8?q?[CodeGen]=20[WinException]=C2=A0Remove=20a?=
 =?UTF-8?q?=20redundant=20explicit=20section=20switch=20for=20aarch64?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The following EmitWinEHHandlerData() implicitly switches to .xdata, just
like on x86_64.

This became orphaned from the original code requiring it in
0b61d220c9b1f0 / https://reviews.llvm.org/D61095.

Differential Revision: https://reviews.llvm.org/D87447
---
 llvm/lib/CodeGen/AsmPrinter/WinException.cpp | 9 ---------
 llvm/test/CodeGen/AArch64/win64-jumptable.ll | 1 -
 llvm/test/CodeGen/AArch64/wineh-mingw.ll     | 3 +--
 llvm/test/CodeGen/AArch64/wineh1.mir         | 1 -
 4 files changed, 1 insertion(+), 13 deletions(-)

diff --git a/llvm/lib/CodeGen/AsmPrinter/WinException.cpp b/llvm/lib/CodeGen/AsmPrinter/WinException.cpp
index cd8077e7d54865..c47ac7e17b6a17 100644
--- a/llvm/lib/CodeGen/AsmPrinter/WinException.cpp
+++ b/llvm/lib/CodeGen/AsmPrinter/WinException.cpp
@@ -258,15 +258,6 @@ void WinException::endFuncletImpl() {
     if (F.hasPersonalityFn())
       Per = classifyEHPersonality(F.getPersonalityFn()->stripPointerCasts());
 
-    // On funclet exit, we emit a fake "function" end marker, so that the call
-    // to EmitWinEHHandlerData below can calculate the size of the funclet or
-    // function.
-    if (isAArch64) {
-      MCSection *XData = Asm->OutStreamer->getAssociatedXDataSection(
-          Asm->OutStreamer->getCurrentSectionOnly());
-      Asm->OutStreamer->SwitchSection(XData);
-    }
-
     // Emit an UNWIND_INFO struct describing the prologue.
     Asm->OutStreamer->EmitWinEHHandlerData();
 
diff --git a/llvm/test/CodeGen/AArch64/win64-jumptable.ll b/llvm/test/CodeGen/AArch64/win64-jumptable.ll
index 0c61bcd52366a8..1983b2568cdee1 100644
--- a/llvm/test/CodeGen/AArch64/win64-jumptable.ll
+++ b/llvm/test/CodeGen/AArch64/win64-jumptable.ll
@@ -44,7 +44,6 @@ declare void @g(i32, i32)
 ; CHECK:    .word .LBB0_3-.LJTI0_0
 ; CHECK:    .word .LBB0_4-.LJTI0_0
 ; CHECK:    .word .LBB0_5-.LJTI0_0
-; CHECK:    .section  .xdata,"dr"
 ; CHECK:    .seh_handlerdata
 ; CHECK:    .text
 ; CHECK:    .seh_endproc
diff --git a/llvm/test/CodeGen/AArch64/wineh-mingw.ll b/llvm/test/CodeGen/AArch64/wineh-mingw.ll
index ff1a55711b9eae..d22c61fca75756 100644
--- a/llvm/test/CodeGen/AArch64/wineh-mingw.ll
+++ b/llvm/test/CodeGen/AArch64/wineh-mingw.ll
@@ -36,8 +36,7 @@ endtryfinally:
 ; WINEH: .seh_proc foo4
 ; WINEH: .seh_handler _d_eh_personality, @unwind, @except
 ; WINEH: ret
-; WINEH: .section .xdata,"dr"
-; WINEH-NEXT: .seh_handlerdata
+; WINEH:      .seh_handlerdata
 ; WINEH-NEXT: .text
 ; WINEH-NEXT: .seh_endproc
 ; WINEH: .section .xdata,"dr"
diff --git a/llvm/test/CodeGen/AArch64/wineh1.mir b/llvm/test/CodeGen/AArch64/wineh1.mir
index aed1550c54f731..2f73a5291ddd09 100644
--- a/llvm/test/CodeGen/AArch64/wineh1.mir
+++ b/llvm/test/CodeGen/AArch64/wineh1.mir
@@ -73,7 +73,6 @@
 # ASM: .seh_endepilogue
 
 # ASM: .seh_endfunclet
-# ASM: .section .xdata,"dr"
 # ASM: .seh_handlerdata
 # ASM: .text
 # ASM: .seh_endproc

From 700fbe591ac0f29c76e9f2bd77d752d4bd56d274 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Martin=20Storsj=C3=B6?= <martin@martin.st>
Date: Mon, 7 Sep 2020 14:45:37 +0300
Subject: [PATCH 5/9] [MC] [Win64EH] Canonicalize ARM64 unwind opcodes

Convert 2-byte opcodes to equivalent 1-byte ones.

Adjust the existing exhaustive testcase to avoid being altered by
the simplification rules (to keep that test exercising all individual
opcodes).

Fix the assembler parser limits for register pairs; for .seh_save_regp
and .seh_save_regp_x, we can allow up to x29, for a x29+x30 pair
(which gets remapped to the UOP_SaveFPLR(X) opcodes), for .seh_save_fregp
and .seh_save_fregpx, allow up to d14+d15.

Not creating .seh_save_next for float register pairs, as the
actual unwinder implementation in current versions of Windows is buggy
for that case.

This gives a minimal but measurable size reduction. (For a 6.5 MB
DLL with 300 KB .xdata, the .xdata shrinks by 48 bytes. The opcode
sequences are padded to a 4 byte boundary, so very small improvements
might not end up mattering directly.)

Differential Revision: https://reviews.llvm.org/D87367
---
 llvm/lib/MC/MCWin64EH.cpp                     |  61 ++++++++++
 .../AArch64/AsmParser/AArch64AsmParser.cpp    |   8 +-
 llvm/test/MC/AArch64/seh-optimize.s           | 106 ++++++++++++++++++
 llvm/test/MC/AArch64/seh.s                    |  18 +--
 4 files changed, 180 insertions(+), 13 deletions(-)
 create mode 100644 llvm/test/MC/AArch64/seh-optimize.s

diff --git a/llvm/lib/MC/MCWin64EH.cpp b/llvm/lib/MC/MCWin64EH.cpp
index fb0de40fc6d5f4..e9ab88234ad376 100644
--- a/llvm/lib/MC/MCWin64EH.cpp
+++ b/llvm/lib/MC/MCWin64EH.cpp
@@ -544,6 +544,63 @@ FindMatchingEpilog(const std::vector<WinEH::Instruction>& EpilogInstrs,
   return nullptr;
 }
 
+static void simplifyOpcodes(std::vector<WinEH::Instruction> &Instructions,
+                            bool Reverse) {
+  unsigned PrevOffset = -1;
+  unsigned PrevRegister = -1;
+
+  auto VisitInstruction = [&](WinEH::Instruction &Inst) {
+    // Convert 2-byte opcodes into equivalent 1-byte ones.
+    if (Inst.Operation == Win64EH::UOP_SaveRegP && Inst.Register == 29) {
+      Inst.Operation = Win64EH::UOP_SaveFPLR;
+    } else if (Inst.Operation == Win64EH::UOP_SaveRegPX &&
+               Inst.Register == 29) {
+      Inst.Operation = Win64EH::UOP_SaveFPLRX;
+    } else if (Inst.Operation == Win64EH::UOP_SaveRegPX &&
+               Inst.Register == 19 && Inst.Offset <= 248) {
+      Inst.Operation = Win64EH::UOP_SaveR19R20X;
+    } else if (Inst.Operation == Win64EH::UOP_AddFP && Inst.Offset == 0) {
+      Inst.Operation = Win64EH::UOP_SetFP;
+    } else if (Inst.Operation == Win64EH::UOP_SaveRegP &&
+               Inst.Register == PrevRegister + 2 &&
+               Inst.Offset == PrevOffset + 16) {
+      Inst.Operation = Win64EH::UOP_SaveNext;
+      // Intentionally not creating UOP_SaveNext for float register pairs,
+      // as current versions of Windows (up to at least 20.04) is buggy
+      // regarding SaveNext for float pairs.
+    }
+    // Update info about the previous instruction, for detecting if
+    // the next one can be made a UOP_SaveNext
+    if (Inst.Operation == Win64EH::UOP_SaveR19R20X) {
+      PrevOffset = 0;
+      PrevRegister = 19;
+    } else if (Inst.Operation == Win64EH::UOP_SaveRegPX) {
+      PrevOffset = 0;
+      PrevRegister = Inst.Register;
+    } else if (Inst.Operation == Win64EH::UOP_SaveRegP) {
+      PrevOffset = Inst.Offset;
+      PrevRegister = Inst.Register;
+    } else if (Inst.Operation == Win64EH::UOP_SaveNext) {
+      PrevRegister += 2;
+      PrevOffset += 16;
+    } else {
+      PrevRegister = -1;
+      PrevOffset = -1;
+    }
+  };
+
+  // Iterate over instructions in a forward order (for prologues),
+  // backwards for epilogues (i.e. always reverse compared to how the
+  // opcodes are stored).
+  if (Reverse) {
+    for (auto It = Instructions.rbegin(); It != Instructions.rend(); It++)
+      VisitInstruction(*It);
+  } else {
+    for (WinEH::Instruction &Inst : Instructions)
+      VisitInstruction(Inst);
+  }
+}
+
 // Populate the .xdata section.  The format of .xdata on ARM64 is documented at
 // https://docs.microsoft.com/en-us/cpp/build/arm64-exception-handling
 static void ARM64EmitUnwindInfo(MCStreamer &streamer, WinEH::FrameInfo *info) {
@@ -572,6 +629,10 @@ static void ARM64EmitUnwindInfo(MCStreamer &streamer, WinEH::FrameInfo *info) {
     return;
   }
 
+  simplifyOpcodes(info->Instructions, false);
+  for (auto &I : info->EpilogMap)
+    simplifyOpcodes(I.second, true);
+
   MCContext &context = streamer.getContext();
   MCSymbol *Label = context.createTempSymbol();
 
diff --git a/llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp b/llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp
index 08a29bbb3e87a0..502966c6336767 100644
--- a/llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp
+++ b/llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp
@@ -5725,7 +5725,7 @@ bool AArch64AsmParser::parseDirectiveSEHSaveRegX(SMLoc L) {
 bool AArch64AsmParser::parseDirectiveSEHSaveRegP(SMLoc L) {
   unsigned Reg;
   int64_t Offset;
-  if (parseRegisterInRange(Reg, AArch64::X0, AArch64::X19, AArch64::LR) ||
+  if (parseRegisterInRange(Reg, AArch64::X0, AArch64::X19, AArch64::FP) ||
       parseComma() || parseImmExpr(Offset))
     return true;
   getTargetStreamer().EmitARM64WinCFISaveRegP(Reg, Offset);
@@ -5737,7 +5737,7 @@ bool AArch64AsmParser::parseDirectiveSEHSaveRegP(SMLoc L) {
 bool AArch64AsmParser::parseDirectiveSEHSaveRegPX(SMLoc L) {
   unsigned Reg;
   int64_t Offset;
-  if (parseRegisterInRange(Reg, AArch64::X0, AArch64::X19, AArch64::X28) ||
+  if (parseRegisterInRange(Reg, AArch64::X0, AArch64::X19, AArch64::FP) ||
       parseComma() || parseImmExpr(Offset))
     return true;
   getTargetStreamer().EmitARM64WinCFISaveRegPX(Reg, Offset);
@@ -5789,7 +5789,7 @@ bool AArch64AsmParser::parseDirectiveSEHSaveFRegX(SMLoc L) {
 bool AArch64AsmParser::parseDirectiveSEHSaveFRegP(SMLoc L) {
   unsigned Reg;
   int64_t Offset;
-  if (parseRegisterInRange(Reg, AArch64::D0, AArch64::D8, AArch64::D15) ||
+  if (parseRegisterInRange(Reg, AArch64::D0, AArch64::D8, AArch64::D14) ||
       parseComma() || parseImmExpr(Offset))
     return true;
   getTargetStreamer().EmitARM64WinCFISaveFRegP(Reg, Offset);
@@ -5801,7 +5801,7 @@ bool AArch64AsmParser::parseDirectiveSEHSaveFRegP(SMLoc L) {
 bool AArch64AsmParser::parseDirectiveSEHSaveFRegPX(SMLoc L) {
   unsigned Reg;
   int64_t Offset;
-  if (parseRegisterInRange(Reg, AArch64::D0, AArch64::D8, AArch64::D15) ||
+  if (parseRegisterInRange(Reg, AArch64::D0, AArch64::D8, AArch64::D14) ||
       parseComma() || parseImmExpr(Offset))
     return true;
   getTargetStreamer().EmitARM64WinCFISaveFRegPX(Reg, Offset);
diff --git a/llvm/test/MC/AArch64/seh-optimize.s b/llvm/test/MC/AArch64/seh-optimize.s
new file mode 100644
index 00000000000000..0bf33af9cc75f6
--- /dev/null
+++ b/llvm/test/MC/AArch64/seh-optimize.s
@@ -0,0 +1,106 @@
+// This test checks that the unwinding opcodes are remapped to more
+// efficient ones where possible.
+
+// RUN: llvm-mc -triple aarch64-pc-win32 -filetype=obj %s -o %t.o
+// RUN: llvm-readobj -u %t.o | FileCheck %s
+
+// CHECK:      UnwindInformation [
+// CHECK-NEXT:   RuntimeFunction {
+// CHECK-NEXT:     Function: func
+// CHECK-NEXT:     ExceptionRecord: .xdata
+// CHECK-NEXT:     ExceptionData {
+// CHECK:            Prologue [
+// CHECK-NEXT:         0xd882              ; stp d10, d11, [sp, #16]
+// CHECK-NEXT:         0xda07              ; stp d8, d9, [sp, #-64]!
+// CHECK-NEXT:         0xe6                ; save next
+// CHECK-NEXT:         0x28                ; stp x19, x20, [sp, #-64]!
+// CHECK-NEXT:         0xca49              ; stp x28, x29, [sp, #72]
+// CHECK-NEXT:         0xe6                ; save next
+// CHECK-NEXT:         0xe6                ; save next
+// CHECK-NEXT:         0xe6                ; save next
+// CHECK-NEXT:         0xcc47              ; stp x20, x21, [sp, #-64]!
+// CHECK-NEXT:         0x42                ; stp x29, x30, [sp, #16]
+// CHECK-NEXT:         0xca02              ; stp x27, x28, [sp, #16]
+// CHECK-NEXT:         0x83                ; stp x29, x30, [sp, #-32]!
+// CHECK-NEXT:         0xce03              ; stp x27, x28, [sp, #-32]!
+// CHECK-NEXT:         0xe1                ; mov fp, sp
+// CHECK-NEXT:         0xe201              ; add fp, sp, #8
+// CHECK-NEXT:         0xe4                ; end
+// CHECK-NEXT:       ]
+// CHECK-NEXT:       EpilogueScopes [
+// CHECK-NEXT:         EpilogueScope {
+// CHECK:                Opcodes [
+// CHECK-NEXT:             0xc904              ; ldp x23, x24, [sp, #32]
+// CHECK-NEXT:             0xe6                ; restore next
+// CHECK-NEXT:             0xcc83              ; ldp x21, x22, [sp], #32
+// CHECK-NEXT:             0x24                ; ldp x19, x20, [sp], #32
+// CHECK-NEXT:             0xcc1f              ; ldp x19, x20, [sp], #256
+// CHECK-NEXT:             0xe4                ; end
+// CHECK-NEXT:           ]
+// CHECK-NEXT:         }
+// CHECK-NEXT:       ]
+// CHECK-NEXT:     }
+// CHECK-NEXT:   }
+// CHECK-NEXT: ]
+
+
+    .text
+    .globl func
+    .seh_proc func
+func:
+    add x29, sp, #8
+    .seh_add_fp 8
+    add x29, sp, #0
+    .seh_add_fp 0
+
+    stp x27, x28, [sp, #-32]!
+    .seh_save_regp_x x27, 32
+    stp x29, x30, [sp, #-32]!
+    .seh_save_regp_x x29, 32
+
+    stp x27, x28, [sp, #16]
+    .seh_save_regp x27, 16
+    stp x29, x30, [sp, #16]
+    .seh_save_regp x29, 16
+
+    stp x20, x21, [sp, #-64]!
+    .seh_save_regp_x x20, 64
+    stp x22, x23, [sp, #16]
+    .seh_save_regp x22, 16
+    stp x24, x25, [sp, #32]
+    .seh_save_next
+    stp x26, x27, [sp, #48]
+    .seh_save_regp x26, 48
+    stp x28, x29, [sp, #72]
+    .seh_save_regp x28, 72
+
+    stp x19, x20, [sp, #-64]!
+    .seh_save_r19r20_x 64
+    stp x21, x22, [sp, #16]
+    .seh_save_regp x21, 16
+
+    stp d8,  d9,  [sp, #-64]!
+    .seh_save_fregp_x d8, 64
+    stp d10, d11, [sp, #16]
+    // This is intentionally not converted into a save_next, to avoid
+    // bugs in the windows unwinder.
+    .seh_save_fregp d10, 16
+
+    .seh_endprologue
+
+    nop
+
+    .seh_startepilogue
+    ldp x27, x28, [sp, #32]
+    .seh_save_regp x23, 32
+    ldp x23, x24, [sp, #16]
+    .seh_save_regp x23, 16
+    ldp x21, x22, [sp], #32
+    .seh_save_regp_x x21, 32
+    ldp x19, x20, [sp], #32
+    .seh_save_regp_x x19, 32
+    ldp x19, x20, [sp], #256
+    .seh_save_regp_x x19, 256
+    .seh_endepilogue
+    ret
+    .seh_endproc
diff --git a/llvm/test/MC/AArch64/seh.s b/llvm/test/MC/AArch64/seh.s
index f7faa64b9309a8..4e235d032d68ee 100644
--- a/llvm/test/MC/AArch64/seh.s
+++ b/llvm/test/MC/AArch64/seh.s
@@ -64,8 +64,8 @@
 // CHECK-NEXT:         0xe202              ; add fp, sp, #16
 // CHECK-NEXT:         0xdd41              ; str d13, [sp, #8]
 // CHECK-NEXT:         0xde83              ; str d12, [sp, #-32]!
-// CHECK-NEXT:         0xd882              ; stp d10, d11, [sp, #16]
-// CHECK-NEXT:         0xda03              ; stp d8, d9, [sp, #-32]!
+// CHECK-NEXT:         0xd884              ; stp d10, d11, [sp, #32]
+// CHECK-NEXT:         0xda05              ; stp d8, d9, [sp, #-48]!
 // CHECK-NEXT:         0x83                ; stp x29, x30, [sp, #-32]!
 // CHECK-NEXT:         0x46                ; stp x29, x30, [sp, #48]
 // CHECK-NEXT:         0xd141              ; str x24, [sp, #8]
@@ -74,7 +74,7 @@
 // CHECK-NEXT:         0xc882              ; stp x21, x22, [sp, #16]
 // CHECK-NEXT:         0xd6c2              ; stp x25, lr, [sp, #16]
 // CHECK-NEXT:         0x24                ; stp x19, x20, [sp, #-32]!
-// CHECK-NEXT:         0xcc03              ; stp x19, x20, [sp, #-32]!
+// CHECK-NEXT:         0xcc83              ; stp x21, x22, [sp, #-32]!
 // CHECK-NEXT:         0x83                ; stp x29, x30, [sp, #-32]!
 // CHECK-NEXT:         0xe1                ; mov fp, sp
 // CHECK-NEXT:         0x01                ; sub sp, #16
@@ -113,8 +113,8 @@ func:
     .seh_set_fp
     stp x29, x30, [sp, #-32]!
     .seh_save_fplr_x 32
-    stp x19, x20, [sp, #-32]!
-    .seh_save_regp_x x19, 32
+    stp x21, x22, [sp, #-32]!
+    .seh_save_regp_x x21, 32
     stp x19, x20, [sp, #-32]!
     .seh_save_r19r20_x 32
     stp x25, x30, [sp, #16]
@@ -131,10 +131,10 @@ func:
     .seh_save_fplr 48
     stp x29, x30, [sp, #-32]!
     .seh_save_fplr_x 32
-    stp d8, d9, [sp, #-32]!
-    .seh_save_fregp_x d8, 32
-    stp d10, d11, [sp, #16]
-    .seh_save_fregp d10, 16
+    stp d8, d9, [sp, #-48]!
+    .seh_save_fregp_x d8, 48
+    stp d10, d11, [sp, #32]
+    .seh_save_fregp d10, 32
     str d12, [sp, #-32]!
     .seh_save_freg_x d12, 32
     str d13, [sp, #8]

From 1308bb99e06752ab0b5175c92da31083f91af921 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Martin=20Storsj=C3=B6?= <martin@martin.st>
Date: Tue, 8 Sep 2020 00:00:07 +0300
Subject: [PATCH 6/9] [MC] [Win64EH] Write packed ARM64 epilogues if possible

This gives a pretty substantial size reduction; for a 6.5 MB
DLL with 300 KB .xdata, the .xdata shrinks by 66 KB.

Differential Revision: https://reviews.llvm.org/D87369
---
 llvm/include/llvm/MC/MCWinEH.h           |   8 +
 llvm/lib/MC/MCWin64EH.cpp                |  57 ++++++-
 llvm/test/CodeGen/AArch64/wineh3.mir     |  22 +--
 llvm/test/CodeGen/AArch64/wineh6.mir     |  20 +--
 llvm/test/CodeGen/AArch64/wineh7.mir     |  19 +--
 llvm/test/MC/AArch64/seh-packed-epilog.s | 187 +++++++++++++++++++++++
 llvm/test/MC/AArch64/seh.s               |  16 +-
 7 files changed, 266 insertions(+), 63 deletions(-)
 create mode 100644 llvm/test/MC/AArch64/seh-packed-epilog.s

diff --git a/llvm/include/llvm/MC/MCWinEH.h b/llvm/include/llvm/MC/MCWinEH.h
index 53cffccce8c1ad..f05f5f1641cd04 100644
--- a/llvm/include/llvm/MC/MCWinEH.h
+++ b/llvm/include/llvm/MC/MCWinEH.h
@@ -26,6 +26,14 @@ struct Instruction {
 
   Instruction(unsigned Op, MCSymbol *L, unsigned Reg, unsigned Off)
     : Label(L), Offset(Off), Register(Reg), Operation(Op) {}
+
+  bool operator==(const Instruction &I) const {
+    // Check whether two instructions refer to the same operation
+    // applied at a different spot (i.e. pointing at a different label).
+    return Offset == I.Offset && Register == I.Register &&
+           Operation == I.Operation;
+  }
+  bool operator!=(const Instruction &I) const { return !(*this == I); }
 };
 
 struct FrameInfo {
diff --git a/llvm/lib/MC/MCWin64EH.cpp b/llvm/lib/MC/MCWin64EH.cpp
index e9ab88234ad376..a585b50828379d 100644
--- a/llvm/lib/MC/MCWin64EH.cpp
+++ b/llvm/lib/MC/MCWin64EH.cpp
@@ -264,8 +264,7 @@ static int64_t GetAbsDifference(MCStreamer &Streamer, const MCSymbol *LHS,
   return value;
 }
 
-static uint32_t
-ARM64CountOfUnwindCodes(const std::vector<WinEH::Instruction> &Insns) {
+static uint32_t ARM64CountOfUnwindCodes(ArrayRef<WinEH::Instruction> Insns) {
   uint32_t Count = 0;
   for (const auto &I : Insns) {
     switch (static_cast<Win64EH::UnwindOpcodes>(I.Operation)) {
@@ -553,18 +552,23 @@ static void simplifyOpcodes(std::vector<WinEH::Instruction> &Instructions,
     // Convert 2-byte opcodes into equivalent 1-byte ones.
     if (Inst.Operation == Win64EH::UOP_SaveRegP && Inst.Register == 29) {
       Inst.Operation = Win64EH::UOP_SaveFPLR;
+      Inst.Register = -1;
     } else if (Inst.Operation == Win64EH::UOP_SaveRegPX &&
                Inst.Register == 29) {
       Inst.Operation = Win64EH::UOP_SaveFPLRX;
+      Inst.Register = -1;
     } else if (Inst.Operation == Win64EH::UOP_SaveRegPX &&
                Inst.Register == 19 && Inst.Offset <= 248) {
       Inst.Operation = Win64EH::UOP_SaveR19R20X;
+      Inst.Register = -1;
     } else if (Inst.Operation == Win64EH::UOP_AddFP && Inst.Offset == 0) {
       Inst.Operation = Win64EH::UOP_SetFP;
     } else if (Inst.Operation == Win64EH::UOP_SaveRegP &&
                Inst.Register == PrevRegister + 2 &&
                Inst.Offset == PrevOffset + 16) {
       Inst.Operation = Win64EH::UOP_SaveNext;
+      Inst.Register = -1;
+      Inst.Offset = 0;
       // Intentionally not creating UOP_SaveNext for float register pairs,
       // as current versions of Windows (up to at least 20.04) is buggy
       // regarding SaveNext for float pairs.
@@ -601,6 +605,47 @@ static void simplifyOpcodes(std::vector<WinEH::Instruction> &Instructions,
   }
 }
 
+static int checkPackedEpilog(MCStreamer &streamer, WinEH::FrameInfo *info,
+                             int PrologCodeBytes) {
+  // Can only pack if there's one single epilog
+  if (info->EpilogMap.size() != 1)
+    return -1;
+
+  const std::vector<WinEH::Instruction> &Epilog =
+      info->EpilogMap.begin()->second;
+
+  // Can pack if the epilog is a subset of the prolog but not vice versa
+  if (Epilog.size() > info->Instructions.size())
+    return -1;
+
+  // Check that the epilog actually is a perfect match for the end (backwrds)
+  // of the prolog.
+  for (int I = Epilog.size() - 1; I >= 0; I--) {
+    if (info->Instructions[I] != Epilog[Epilog.size() - 1 - I])
+      return -1;
+  }
+
+  // Check that the epilog actually is at the very end of the function,
+  // otherwise it can't be packed.
+  uint32_t DistanceFromEnd = (uint32_t)GetAbsDifference(
+      streamer, info->FuncletOrFuncEnd, info->EpilogMap.begin()->first);
+  if (DistanceFromEnd / 4 != Epilog.size())
+    return -1;
+
+  int Offset = ARM64CountOfUnwindCodes(
+      ArrayRef<WinEH::Instruction>(&info->Instructions[Epilog.size()],
+                                   info->Instructions.size() - Epilog.size()));
+
+  // Check that the offset and prolog size fits in the first word; it's
+  // unclear whether the epilog count in the extension word can be taken
+  // as packed epilog offset.
+  if (Offset > 31 || PrologCodeBytes > 124)
+    return -1;
+
+  info->EpilogMap.clear();
+  return Offset;
+}
+
 // Populate the .xdata section.  The format of .xdata on ARM64 is documented at
 // https://docs.microsoft.com/en-us/cpp/build/arm64-exception-handling
 static void ARM64EmitUnwindInfo(MCStreamer &streamer, WinEH::FrameInfo *info) {
@@ -679,6 +724,8 @@ static void ARM64EmitUnwindInfo(MCStreamer &streamer, WinEH::FrameInfo *info) {
   uint32_t PrologCodeBytes = ARM64CountOfUnwindCodes(info->Instructions);
   uint32_t TotalCodeBytes = PrologCodeBytes;
 
+  int PackedEpilogOffset = checkPackedEpilog(streamer, info, PrologCodeBytes);
+
   // Process epilogs.
   MapVector<MCSymbol *, uint32_t> EpilogInfo;
   // Epilogs processed so far.
@@ -711,15 +758,17 @@ static void ARM64EmitUnwindInfo(MCStreamer &streamer, WinEH::FrameInfo *info) {
   uint32_t CodeWordsMod = TotalCodeBytes % 4;
   if (CodeWordsMod)
     CodeWords++;
-  uint32_t EpilogCount = info->EpilogMap.size();
+  uint32_t EpilogCount =
+      PackedEpilogOffset >= 0 ? PackedEpilogOffset : info->EpilogMap.size();
   bool ExtensionWord = EpilogCount > 31 || TotalCodeBytes > 124;
   if (!ExtensionWord) {
     row1 |= (EpilogCount & 0x1F) << 22;
     row1 |= (CodeWords & 0x1F) << 27;
   }
-  // E is always 0 right now, TODO: packed epilog setup
   if (info->HandlesExceptions) // X
     row1 |= 1 << 20;
+  if (PackedEpilogOffset >= 0) // E
+    row1 |= 1 << 21;
   row1 |= FuncLength & 0x3FFFF;
   streamer.emitInt32(row1);
 
diff --git a/llvm/test/CodeGen/AArch64/wineh3.mir b/llvm/test/CodeGen/AArch64/wineh3.mir
index 6cbe7f42dc5ece..d1ffa4aedc0857 100644
--- a/llvm/test/CodeGen/AArch64/wineh3.mir
+++ b/llvm/test/CodeGen/AArch64/wineh3.mir
@@ -8,9 +8,9 @@
 # CHECK-NEXT:      FunctionLength: 124
 # CHECK-NEXT:      Version: 0
 # CHECK-NEXT:      ExceptionData: No
-# CHECK-NEXT:      EpiloguePacked: No
-# CHECK-NEXT:      EpilogueScopes: 1
-# CHECK-NEXT:      ByteCodeLength: 32
+# CHECK-NEXT:      EpiloguePacked: Yes
+# CHECK-NEXT:      EpilogueOffset: 0
+# CHECK-NEXT:      ByteCodeLength: 16
 # CHECK-NEXT:      Prologue [
 # CHECK-NEXT:        0xc80c              ; stp x19, x20, [sp, #96]
 # CHECK-NEXT:        0xc88a              ; stp x21, x22, [sp, #80]
@@ -21,22 +21,6 @@
 # CHECK-NEXT:        0xda8d              ; stp d10, d11, [sp, #-112]!
 # CHECK-NEXT:        0xe4                ; end
 # CHECK-NEXT:      ]
-# CHECK-NEXT:      EpilogueScopes [
-# CHECK-NEXT:        EpilogueScope {
-# CHECK-NEXT:          StartOffset: 23
-# CHECK-NEXT:          EpilogueStartIndex: 15
-# CHECK-NEXT:          Opcodes [
-# CHECK-NEXT:            0xc80c              ; ldp x19, x20, [sp, #96]
-# CHECK-NEXT:            0xc88a              ; ldp x21, x22, [sp, #80]
-# CHECK-NEXT:            0xc908              ; ldp x23, x24, [sp, #64]
-# CHECK-NEXT:            0xc986              ; ldp x25, x26, [sp, #48]
-# CHECK-NEXT:            0xca04              ; ldp x27, x28, [sp, #32]
-# CHECK-NEXT:            0xd802              ; ldp d8, d9, [sp, #16]
-# CHECK-NEXT:            0xda8d              ; ldp d10, d11, [sp], #112
-# CHECK-NEXT:            0xe4                ; end
-# CHECK-NEXT:          ]
-# CHECK-NEXT:        }
-# CHECK-NEXT:      ]
 # CHECK-NEXT:    }
 ...
 ---
diff --git a/llvm/test/CodeGen/AArch64/wineh6.mir b/llvm/test/CodeGen/AArch64/wineh6.mir
index 95a11aa3c4e82b..e7592bd7114605 100644
--- a/llvm/test/CodeGen/AArch64/wineh6.mir
+++ b/llvm/test/CodeGen/AArch64/wineh6.mir
@@ -6,25 +6,19 @@
 # CHECK-NEXT:      FunctionLength: 92
 # CHECK-NEXT:      Version: 0
 # CHECK-NEXT:      ExceptionData: No
-# CHECK-NEXT:      EpiloguePacked: No
-# CHECK-NEXT:      EpilogueScopes: 1
-# CHECK-NEXT:      ByteCodeLength: 8
+# CHECK-NEXT:      EpiloguePacked: Yes
+# CHECK-NEXT:      EpilogueOffset: 1
+# CHECK-NEXT:      ByteCodeLength: 4
 # CHECK-NEXT:      Prologue [
 # CHECK-NEXT:        0x02                ; sub sp, #32
 # CHECK-NEXT:        0xe1                ; mov fp, sp
 # CHECK-NEXT:        0x81                ; stp x29, x30, [sp, #-16]!
 # CHECK-NEXT:        0xe4                ; end
 # CHECK-NEXT:      ]
-# CHECK-NEXT:      EpilogueScopes [
-# CHECK-NEXT:        EpilogueScope {
-# CHECK-NEXT:          StartOffset: 20
-# CHECK-NEXT:          EpilogueStartIndex: 4
-# CHECK-NEXT:          Opcodes [
-# CHECK-NEXT:            0xe1                ; mov sp, fp
-# CHECK-NEXT:            0x81                ; ldp x29, x30, [sp], #16
-# CHECK-NEXT:            0xe4                ; end
-# CHECK-NEXT:          ]
-# CHECK-NEXT:        }
+# CHECK-NEXT:      Epilogue [
+# CHECK-NEXT:        0xe1                ; mov sp, fp
+# CHECK-NEXT:        0x81                ; ldp x29, x30, [sp], #16
+# CHECK-NEXT:        0xe4                ; end
 # CHECK-NEXT:      ]
 # CHECK-NEXT:    }
 ...
diff --git a/llvm/test/CodeGen/AArch64/wineh7.mir b/llvm/test/CodeGen/AArch64/wineh7.mir
index da64b3c002f3d7..6bf06d80861a42 100644
--- a/llvm/test/CodeGen/AArch64/wineh7.mir
+++ b/llvm/test/CodeGen/AArch64/wineh7.mir
@@ -6,9 +6,9 @@
 # CHECK-NEXT:      FunctionLength: 72
 # CHECK-NEXT:      Version: 0
 # CHECK-NEXT:      ExceptionData: No
-# CHECK-NEXT:      EpiloguePacked: No
-# CHECK-NEXT:      EpilogueScopes: 1
-# CHECK-NEXT:      ByteCodeLength: 16
+# CHECK-NEXT:      EpiloguePacked: Yes
+# CHECK-NEXT:      EpilogueOffset: 0
+# CHECK-NEXT:      ByteCodeLength: 8
 # CHECK-NEXT:      Prologue [
 # CHECK-NEXT:        0xe204              ; add fp, sp, #32
 # CHECK-NEXT:        0x44                ; stp x29, x30, [sp, #32]
@@ -16,19 +16,6 @@
 # CHECK-NEXT:        0xcc85              ; stp x21, x22, [sp, #-48]!
 # CHECK-NEXT:        0xe4                ; end
 # CHECK-NEXT:      ]
-# CHECK-NEXT:      EpilogueScopes [
-# CHECK-NEXT:        EpilogueScope {
-# CHECK-NEXT:          StartOffset: 13
-# CHECK-NEXT:          EpilogueStartIndex: 8
-# CHECK-NEXT:          Opcodes [
-# CHECK-NEXT:            0xe204              ; sub sp, fp, #32
-# CHECK-NEXT:            0x44                ; ldp x29, x30, [sp, #32]
-# CHECK-NEXT:            0xc802              ; ldp x19, x20, [sp, #16]
-# CHECK-NEXT:            0xcc85              ; ldp x21, x22, [sp], #48
-# CHECK-NEXT:            0xe4                ; end
-# CHECK-NEXT:          ]
-# CHECK-NEXT:        }
-# CHECK-NEXT:      ]
 # CHECK-NEXT:    }
 # CHECK-NEXT:  }
 
diff --git a/llvm/test/MC/AArch64/seh-packed-epilog.s b/llvm/test/MC/AArch64/seh-packed-epilog.s
new file mode 100644
index 00000000000000..f9978ea7a1139f
--- /dev/null
+++ b/llvm/test/MC/AArch64/seh-packed-epilog.s
@@ -0,0 +1,187 @@
+// This test checks that the epilogue is packed where possible.
+
+// RUN: llvm-mc -triple aarch64-pc-win32 -filetype=obj %s -o %t.o
+// RUN: llvm-readobj -u %t.o | FileCheck %s
+
+// CHECK:      UnwindInformation [
+// CHECK-NEXT:   RuntimeFunction {
+// CHECK-NEXT:     Function: func
+// CHECK-NEXT:     ExceptionRecord: .xdata
+// CHECK-NEXT:     ExceptionData {
+// CHECK-NEXT:       FunctionLength:
+// CHECK-NEXT:       Version:
+// CHECK-NEXT:       ExceptionData:
+// CHECK-NEXT:       EpiloguePacked: Yes
+// CHECK-NEXT:       EpilogueOffset: 2
+// CHECK-NEXT:       ByteCodeLength:
+// CHECK-NEXT:       Prologue [
+// CHECK-NEXT:         0xdc04              ; str d8, [sp, #32]
+// CHECK-NEXT:         0xe1                ; mov fp, sp
+// CHECK-NEXT:         0x42                ; stp x29, x30, [sp, #16]
+// CHECK-NEXT:         0x85                ; stp x29, x30, [sp, #-48]!
+// CHECK-NEXT:         0xe6                ; save next
+// CHECK-NEXT:         0x24                ; stp x19, x20, [sp, #-32]!
+// CHECK-NEXT:         0xc842              ; stp x20, x21, [sp, #16]
+// CHECK-NEXT:         0x03                ; sub sp, #48
+// CHECK-NEXT:         0xe4                ; end
+// CHECK-NEXT:       ]
+// CHECK-NEXT:       Epilogue [
+// CHECK-NEXT:         0xe1                ; mov sp, fp
+// CHECK-NEXT:         0x42                ; ldp x29, x30, [sp, #16]
+// CHECK-NEXT:         0x85                ; ldp x29, x30, [sp], #48
+// CHECK-NEXT:         0xe6                ; restore next
+// CHECK-NEXT:         0x24                ; ldp x19, x20, [sp], #32
+// CHECK-NEXT:         0xc842              ; ldp x20, x21, [sp, #16]
+// CHECK-NEXT:         0x03                ; add sp, #48
+// CHECK-NEXT:         0xe4                ; end
+// CHECK-NEXT:       ]
+// CHECK-NEXT:     }
+// CHECK-NEXT:   }
+// CHECK:        RuntimeFunction {
+// CHECK-NEXT:     Function: packed2
+// CHECK-NEXT:     ExceptionRecord:
+// CHECK-NEXT:     ExceptionData {
+// CHECK:            ExceptionData:
+// CHECK-NEXT:       EpiloguePacked: Yes
+// CHECK:        RuntimeFunction {
+// CHECK-NEXT:     Function: nonpacked1
+// CHECK-NEXT:     ExceptionRecord:
+// CHECK-NEXT:     ExceptionData {
+// CHECK:            ExceptionData:
+// CHECK-NEXT:       EpiloguePacked: No
+// CHECK:        RuntimeFunction {
+// CHECK-NEXT:     Function: nonpacked2
+// CHECK-NEXT:     ExceptionRecord:
+// CHECK-NEXT:     ExceptionData {
+// CHECK:            ExceptionData:
+// CHECK-NEXT:       EpiloguePacked: No
+// CHECK:        RuntimeFunction {
+// CHECK-NEXT:     Function: nonpacked3
+// CHECK-NEXT:     ExceptionRecord:
+// CHECK-NEXT:     ExceptionData {
+// CHECK:            ExceptionData:
+// CHECK-NEXT:       EpiloguePacked: No
+
+    .text
+    .globl func
+    .seh_proc func
+func:
+    sub sp, sp, #48
+    .seh_stackalloc 48
+    // Check that canonical opcode forms (r19r20_x, fplr, fplr_x, save_next,
+    // set_fp) are treated as a match even if one (in prologue or epilogue)
+    // was simplified from the more generic opcodes.
+    stp x20, x21, [sp, #16]
+    .seh_save_regp x20, 16
+    stp x19, x20, [sp, #-32]!
+    .seh_save_r19r20_x 32
+    stp x21, x22, [sp, #16]
+    .seh_save_regp x21, 16
+    stp x29, x30, [sp, #-48]!
+    .seh_save_regp_x x29, 48
+    stp x29, x30, [sp, #16]
+    .seh_save_regp x29, 16
+    add x29, sp, #0
+    .seh_add_fp 0
+    str d8, [sp, #32]
+    .seh_save_freg d8, 32
+    .seh_endprologue
+
+    nop
+
+    .seh_startepilogue
+    mov sp, x29
+    .seh_set_fp
+    ldp x29, x30, [sp, #16]
+    .seh_save_fplr 16
+    ldp x29, x30, [sp, #-48]!
+    .seh_save_fplr_x 48
+    ldp x21, x22, [sp, #16]
+    .seh_save_next
+    ldp x19, x20, [sp], #32
+    .seh_save_regp_x x19, 32
+    ldp x20, x21, [sp, #16]
+    .seh_save_regp x20, 16
+    add sp, sp, #48
+    .seh_stackalloc 48
+    .seh_endepilogue
+    ret
+    .seh_endproc
+
+
+    // Test a perfectly matching epilog with no offset.
+    .seh_proc packed2
+packed2:
+    sub sp, sp, #48
+    .seh_stackalloc 48
+    stp x29, lr, [sp, #-32]!
+    .seh_save_fplr_x 32
+    .seh_endprologue
+    nop
+    .seh_startepilogue
+    ldp x29, lr, [sp], #32
+    .seh_save_fplr_x 32
+    add sp, sp, #48
+    .seh_stackalloc 48
+    .seh_endepilogue
+    ret
+    .seh_endproc
+
+
+    .seh_proc nonpacked1
+nonpacked1:
+    sub sp, sp, #48
+    .seh_stackalloc 48
+    .seh_endprologue
+
+    nop
+    .seh_startepilogue
+    add sp, sp, #48
+    .seh_stackalloc 48
+    .seh_endepilogue
+    // This epilogue isn't packed with the prologue, as it doesn't align with
+    // the end of the function (one extra nop before the ret).
+    nop
+    ret
+    .seh_endproc
+
+
+    .seh_proc nonpacked2
+nonpacked2:
+    sub sp, sp, #48
+    .seh_stackalloc 48
+    sub sp, sp, #32
+    .seh_stackalloc 32
+    .seh_endprologue
+
+    nop
+    .seh_startepilogue
+    // Not packed; the epilogue mismatches at the second opcode.
+    add sp, sp, #16
+    .seh_stackalloc 16
+    add sp, sp, #48
+    .seh_stackalloc 48
+    .seh_endepilogue
+    ret
+    .seh_endproc
+
+    .seh_proc nonpacked3
+nonpacked3:
+    sub sp, sp, #48
+    .seh_stackalloc 48
+    sub sp, sp, #32
+    .seh_stackalloc 32
+    .seh_endprologue
+
+    nop
+    .seh_startepilogue
+    // Not packed; the epilogue is longer than the prologue.
+    mov sp, x29
+    .seh_set_fp
+    add sp, sp, #32
+    .seh_stackalloc 32
+    add sp, sp, #48
+    .seh_stackalloc 48
+    .seh_endepilogue
+    ret
+    .seh_endproc
diff --git a/llvm/test/MC/AArch64/seh.s b/llvm/test/MC/AArch64/seh.s
index 4e235d032d68ee..0da956cbf2f5de 100644
--- a/llvm/test/MC/AArch64/seh.s
+++ b/llvm/test/MC/AArch64/seh.s
@@ -20,7 +20,7 @@
 // CHECK-NEXT:   }
 // CHECK:        Section {
 // CHECK:          Name: .xdata
-// CHECK:          RawDataSize: 56
+// CHECK:          RawDataSize: 52
 // CHECK:          RelocationCount: 1
 // CHECK:          Characteristics [
 // CHECK-NEXT:       ALIGN_4BYTES
@@ -41,7 +41,7 @@
 
 // CHECK-NEXT: Relocations [
 // CHECK-NEXT:   Section (4) .xdata {
-// CHECK-NEXT:     0x2C IMAGE_REL_ARM64_ADDR32NB __C_specific_handler
+// CHECK-NEXT:     0x28 IMAGE_REL_ARM64_ADDR32NB __C_specific_handler
 // CHECK-NEXT:   }
 // CHECK-NEXT:   Section (5) .pdata {
 // CHECK-NEXT:     0x0 IMAGE_REL_ARM64_ADDR32NB func
@@ -80,15 +80,9 @@
 // CHECK-NEXT:         0x01                ; sub sp, #16
 // CHECK-NEXT:         0xe4                ; end
 // CHECK-NEXT:       ]
-// CHECK-NEXT:       EpilogueScopes [
-// CHECK-NEXT:         EpilogueScope {
-// CHECK-NEXT:           StartOffset: 23
-// CHECK-NEXT:           EpilogueStartIndex: 33
-// CHECK-NEXT:           Opcodes [
-// CHECK-NEXT:             0x01                ; add sp, #16
-// CHECK-NEXT:             0xe4                ; end
-// CHECK-NEXT:           ]
-// CHECK-NEXT:         }
+// CHECK-NEXT:       Epilogue [
+// CHECK-NEXT:         0x01                ; add sp, #16
+// CHECK-NEXT:         0xe4                ; end
 // CHECK-NEXT:       ]
 // CHECK-NEXT:       ExceptionHandler [
 // CHECK-NEXT:         Routine: __C_specific_handler (0x0)

From 28012e00d80b994ef0709377da15e2b25e6c0b72 Mon Sep 17 00:00:00 2001
From: Yevgeny Rouban <yrouban@azul.com>
Date: Fri, 11 Sep 2020 12:55:24 +0700
Subject: [PATCH 7/9] [NewPM] Introduce PreserveCFG check

Check that all passes, which report they preserve CFG,
are really preserving CFG.
A new standard instrumentation is introduced. It can be
switched on/off by the flag verify-cfg-preserved, which
is on by default for debug builds.

Reviewers: kuhar, fedor.sergeev

Differential Revision: https://reviews.llvm.org/D81558
---
 .../llvm/Passes/StandardInstrumentations.h    |  52 ++++++
 llvm/lib/Passes/StandardInstrumentations.cpp  | 164 ++++++++++++++++++
 2 files changed, 216 insertions(+)

diff --git a/llvm/include/llvm/Passes/StandardInstrumentations.h b/llvm/include/llvm/Passes/StandardInstrumentations.h
index 795e2770bbe185..76e217c8997453 100644
--- a/llvm/include/llvm/Passes/StandardInstrumentations.h
+++ b/llvm/include/llvm/Passes/StandardInstrumentations.h
@@ -17,8 +17,11 @@
 
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/StringRef.h"
+#include "llvm/IR/BasicBlock.h"
 #include "llvm/IR/PassInstrumentation.h"
 #include "llvm/IR/PassTimingInfo.h"
+#include "llvm/IR/ValueHandle.h"
+#include "llvm/Support/CommandLine.h"
 
 #include <string>
 #include <utility>
@@ -26,6 +29,7 @@
 namespace llvm {
 
 class Module;
+class Function;
 
 /// Instrumentation to print IR before/after passes.
 ///
@@ -73,6 +77,53 @@ class PrintPassInstrumentation {
   bool DebugLogging;
 };
 
+class PreservedCFGCheckerInstrumentation {
+private:
+  // CFG is a map BB -> {(Succ, Multiplicity)}, where BB is a non-leaf basic
+  // block, {(Succ, Multiplicity)} set of all pairs of the block's successors
+  // and the multiplicity of the edge (BB->Succ). As the mapped sets are
+  // unordered the order of successors is not tracked by the CFG. In other words
+  // this allows basic block successors to be swapped by a pass without
+  // reporting a CFG change. CFG can be guarded by basic block tracking pointers
+  // in the Graph (BBGuard). That is if any of the block is deleted or RAUWed
+  // then the CFG is treated poisoned and no block pointer of the Graph is used.
+  struct CFG {
+    struct BBGuard final : public CallbackVH {
+      BBGuard(const BasicBlock *BB) : CallbackVH(BB) {}
+      void deleted() override { CallbackVH::deleted(); }
+      void allUsesReplacedWith(Value *) override { CallbackVH::deleted(); }
+      bool isPoisoned() const { return !getValPtr(); }
+    };
+
+    Optional<DenseMap<intptr_t, BBGuard>> BBGuards;
+    DenseMap<const BasicBlock *, DenseMap<const BasicBlock *, unsigned>> Graph;
+
+    CFG(const Function *F, bool TrackBBLifetime = false);
+
+    bool operator==(const CFG &G) const {
+      return !isPoisoned() && !G.isPoisoned() && Graph == G.Graph;
+    }
+
+    bool isPoisoned() const {
+      if (BBGuards)
+        for (auto &BB : *BBGuards) {
+          if (BB.second.isPoisoned())
+            return true;
+        }
+      return false;
+    }
+
+    static void printDiff(raw_ostream &out, const CFG &Before,
+                          const CFG &After);
+  };
+
+  SmallVector<std::pair<StringRef, Optional<CFG>>, 8> GraphStackBefore;
+
+public:
+  static cl::opt<bool> VerifyPreservedCFG;
+  void registerCallbacks(PassInstrumentationCallbacks &PIC);
+};
+
 /// This class provides an interface to register all the standard pass
 /// instrumentations and manages their state (if any).
 class StandardInstrumentations {
@@ -80,6 +131,7 @@ class StandardInstrumentations {
   PrintPassInstrumentation PrintPass;
   TimePassesHandler TimePasses;
   OptNoneInstrumentation OptNone;
+  PreservedCFGCheckerInstrumentation PreservedCFGChecker;
 
 public:
   StandardInstrumentations(bool DebugLogging) : PrintPass(DebugLogging) {}
diff --git a/llvm/lib/Passes/StandardInstrumentations.cpp b/llvm/lib/Passes/StandardInstrumentations.cpp
index da58fa57bdae76..2ee373b912be08 100644
--- a/llvm/lib/Passes/StandardInstrumentations.cpp
+++ b/llvm/lib/Passes/StandardInstrumentations.cpp
@@ -36,6 +36,14 @@ static cl::opt<bool>
                   cl::desc("Enable skipping optional passes optnone functions "
                            "under new pass manager"));
 
+cl::opt<bool> PreservedCFGCheckerInstrumentation::VerifyPreservedCFG(
+    "verify-cfg-preserved", cl::Hidden,
+#ifdef NDEBUG
+    cl::init(false));
+#else
+    cl::init(true));
+#endif
+
 // FIXME: Change `-debug-pass-manager` from boolean to enum type. Similar to
 // `-debug-pass` in legacy PM.
 static cl::opt<bool>
@@ -338,10 +346,166 @@ void PrintPassInstrumentation::registerCallbacks(
   });
 }
 
+PreservedCFGCheckerInstrumentation::CFG::CFG(const Function *F,
+                                             bool TrackBBLifetime) {
+  if (TrackBBLifetime)
+    BBGuards = DenseMap<intptr_t, BBGuard>(F->size());
+  for (const auto &BB : *F) {
+    if (BBGuards)
+      BBGuards->try_emplace(intptr_t(&BB), &BB);
+    for (auto *Succ : successors(&BB)) {
+      Graph[&BB][Succ]++;
+      if (BBGuards)
+        BBGuards->try_emplace(intptr_t(Succ), Succ);
+    }
+  }
+}
+
+static void printBBName(raw_ostream &out, const BasicBlock *BB) {
+  if (BB->hasName()) {
+    out << BB->getName() << "<" << BB << ">";
+    return;
+  }
+
+  if (!BB->getParent()) {
+    out << "unnamed_removed<" << BB << ">";
+    return;
+  }
+
+  if (BB == &BB->getParent()->getEntryBlock()) {
+    out << "entry"
+        << "<" << BB << ">";
+    return;
+  }
+
+  unsigned FuncOrderBlockNum = 0;
+  for (auto &FuncBB : *BB->getParent()) {
+    if (&FuncBB == BB)
+      break;
+    FuncOrderBlockNum++;
+  }
+  out << "unnamed_" << FuncOrderBlockNum << "<" << BB << ">";
+}
+
+void PreservedCFGCheckerInstrumentation::CFG::printDiff(raw_ostream &out,
+                                                        const CFG &Before,
+                                                        const CFG &After) {
+  assert(!After.isPoisoned());
+
+  // Print function name.
+  const CFG *FuncGraph = nullptr;
+  if (!After.Graph.empty())
+    FuncGraph = &After;
+  else if (!Before.isPoisoned() && !Before.Graph.empty())
+    FuncGraph = &Before;
+
+  if (FuncGraph)
+    out << "In function @"
+        << FuncGraph->Graph.begin()->first->getParent()->getName() << "\n";
+
+  if (Before.isPoisoned()) {
+    out << "Some blocks were deleted\n";
+    return;
+  }
+
+  // Find and print graph differences.
+  if (Before.Graph.size() != After.Graph.size())
+    out << "Different number of non-leaf basic blocks: before="
+        << Before.Graph.size() << ", after=" << After.Graph.size() << "\n";
+
+  for (auto &BB : Before.Graph) {
+    auto BA = After.Graph.find(BB.first);
+    if (BA == After.Graph.end()) {
+      out << "Non-leaf block ";
+      printBBName(out, BB.first);
+      out << " is removed (" << BB.second.size() << " successors)\n";
+    }
+  }
+
+  for (auto &BA : After.Graph) {
+    auto BB = Before.Graph.find(BA.first);
+    if (BB == Before.Graph.end()) {
+      out << "Non-leaf block ";
+      printBBName(out, BA.first);
+      out << " is added (" << BA.second.size() << " successors)\n";
+      continue;
+    }
+
+    if (BB->second == BA.second)
+      continue;
+
+    out << "Different successors of block ";
+    printBBName(out, BA.first);
+    out << " (unordered):\n";
+    out << "- before (" << BB->second.size() << "): ";
+    for (auto &SuccB : BB->second) {
+      printBBName(out, SuccB.first);
+      if (SuccB.second != 1)
+        out << "(" << SuccB.second << "), ";
+      else
+        out << ", ";
+    }
+    out << "\n";
+    out << "- after (" << BA.second.size() << "): ";
+    for (auto &SuccA : BA.second) {
+      printBBName(out, SuccA.first);
+      if (SuccA.second != 1)
+        out << "(" << SuccA.second << "), ";
+      else
+        out << ", ";
+    }
+    out << "\n";
+  }
+}
+
+void PreservedCFGCheckerInstrumentation::registerCallbacks(
+    PassInstrumentationCallbacks &PIC) {
+  if (!VerifyPreservedCFG)
+    return;
+
+  PIC.registerBeforeNonSkippedPassCallback([this](StringRef P, Any IR) {
+    if (any_isa<const Function *>(IR))
+      GraphStackBefore.emplace_back(P, CFG(any_cast<const Function *>(IR)));
+    else
+      GraphStackBefore.emplace_back(P, None);
+  });
+
+  PIC.registerAfterPassInvalidatedCallback(
+      [this](StringRef P, const PreservedAnalyses &PassPA) {
+        auto Before = GraphStackBefore.pop_back_val();
+        assert(Before.first == P &&
+               "Before and After callbacks must correspond");
+        (void)Before;
+      });
+
+  PIC.registerAfterPassCallback([this](StringRef P, Any IR,
+                                       const PreservedAnalyses &PassPA) {
+    auto Before = GraphStackBefore.pop_back_val();
+    assert(Before.first == P && "Before and After callbacks must correspond");
+    auto &GraphBefore = Before.second;
+
+    if (!PassPA.allAnalysesInSetPreserved<CFGAnalyses>())
+      return;
+
+    if (any_isa<const Function *>(IR)) {
+      assert(GraphBefore && "Must be built in BeforePassCallback");
+      CFG GraphAfter(any_cast<const Function *>(IR), false /* NeedsGuard */);
+      if (GraphAfter == *GraphBefore)
+        return;
+
+      dbgs() << "Error: " << P
+             << " reported it preserved CFG, but changes detected:\n";
+      CFG::printDiff(dbgs(), *GraphBefore, GraphAfter);
+      report_fatal_error(Twine("Preserved CFG changed by ", P));
+    }
+  });
+}
+
 void StandardInstrumentations::registerCallbacks(
     PassInstrumentationCallbacks &PIC) {
   PrintIR.registerCallbacks(PIC);
   PrintPass.registerCallbacks(PIC);
   TimePasses.registerCallbacks(PIC);
   OptNone.registerCallbacks(PIC);
+  PreservedCFGChecker.registerCallbacks(PIC);
 }

From 1e1770a07ec0f6a3576362ea5eb97aedd33f4b26 Mon Sep 17 00:00:00 2001
From: David Sherwood <david.sherwood@arm.com>
Date: Thu, 3 Sep 2020 11:57:55 +0100
Subject: [PATCH 8/9] [SVE][CodeGen] Fix InlineFunction for scalable vectors

When inlining functions containing allocas of scalable vectors we
cannot specify the size in the lifetime markers, since we don't
know this at compile time.

Added new test here:

  test/Transforms/Inline/AArch64/sve-alloca-merge.ll

Differential Revision: https://reviews.llvm.org/D87139
---
 llvm/lib/Transforms/Utils/InlineFunction.cpp  |  7 +++--
 .../Inline/AArch64/sve-alloca-merge.ll        | 29 +++++++++++++++++++
 2 files changed, 33 insertions(+), 3 deletions(-)
 create mode 100644 llvm/test/Transforms/Inline/AArch64/sve-alloca-merge.ll

diff --git a/llvm/lib/Transforms/Utils/InlineFunction.cpp b/llvm/lib/Transforms/Utils/InlineFunction.cpp
index 30726627bc8295..7ff21d7ee9ef61 100644
--- a/llvm/lib/Transforms/Utils/InlineFunction.cpp
+++ b/llvm/lib/Transforms/Utils/InlineFunction.cpp
@@ -2061,7 +2061,7 @@ llvm::InlineResult llvm::InlineFunction(CallBase &CB, InlineFunctionInfo &IFI,
           dyn_cast<ConstantInt>(AI->getArraySize())) {
         auto &DL = Caller->getParent()->getDataLayout();
         Type *AllocaType = AI->getAllocatedType();
-        uint64_t AllocaTypeSize = DL.getTypeAllocSize(AllocaType);
+        TypeSize AllocaTypeSize = DL.getTypeAllocSize(AllocaType);
         uint64_t AllocaArraySize = AIArraySize->getLimitedValue();
 
         // Don't add markers for zero-sized allocas.
@@ -2070,9 +2070,10 @@ llvm::InlineResult llvm::InlineFunction(CallBase &CB, InlineFunctionInfo &IFI,
 
         // Check that array size doesn't saturate uint64_t and doesn't
         // overflow when it's multiplied by type size.
-        if (AllocaArraySize != std::numeric_limits<uint64_t>::max() &&
+        if (!AllocaTypeSize.isScalable() &&
+            AllocaArraySize != std::numeric_limits<uint64_t>::max() &&
             std::numeric_limits<uint64_t>::max() / AllocaArraySize >=
-                AllocaTypeSize) {
+                AllocaTypeSize.getFixedSize()) {
           AllocaSize = ConstantInt::get(Type::getInt64Ty(AI->getContext()),
                                         AllocaArraySize * AllocaTypeSize);
         }
diff --git a/llvm/test/Transforms/Inline/AArch64/sve-alloca-merge.ll b/llvm/test/Transforms/Inline/AArch64/sve-alloca-merge.ll
new file mode 100644
index 00000000000000..c355388ed836f5
--- /dev/null
+++ b/llvm/test/Transforms/Inline/AArch64/sve-alloca-merge.ll
@@ -0,0 +1,29 @@
+; RUN: opt -mtriple=aarch64--linux-gnu -mattr=+sve < %s -inline -S | FileCheck %s
+
+define void @bar(<vscale x 2 x i64>* %a) {
+entry:
+  %b = alloca <vscale x 2 x i64>, align 16
+  store <vscale x 2 x i64> zeroinitializer, <vscale x 2 x i64>* %b, align 16
+  %c = load <vscale x 2 x i64>, <vscale x 2 x i64>* %a, align 16
+  %d = load <vscale x 2 x i64>, <vscale x 2 x i64>* %b, align 16
+  %e = add <vscale x 2 x i64> %c, %d
+  %f = add <vscale x 2 x i64> %e, %c
+  store <vscale x 2 x i64> %f, <vscale x 2 x i64>* %a, align 16
+  ret void
+}
+
+define i64 @foo() {
+; CHECK-LABEL: @foo(
+; CHECK: %0 = bitcast <vscale x 2 x i64>* %{{.*}} to i8*
+; CHECK-NEXT: call void @llvm.lifetime.start.p0i8(i64 -1, i8* %0)
+; CHECK: %1 = bitcast <vscale x 2 x i64>* %{{.*}} to i8*
+; CHECK-NEXT: call void @llvm.lifetime.end.p0i8(i64 -1, i8* %1)
+entry:
+  %a = alloca <vscale x 2 x i64>, align 16
+  store <vscale x 2 x i64> zeroinitializer, <vscale x 2 x i64>* %a, align 16
+  %a1 = bitcast <vscale x 2 x i64>* %a to i64*
+  store i64 1, i64* %a1, align 8
+  call void @bar(<vscale x 2 x i64>* %a)
+  %el = load i64, i64* %a1
+  ret i64 %el
+}

From d380b582f7f04f7635b1fbdb8347a6095660a1b6 Mon Sep 17 00:00:00 2001
From: MaheshRavishankar <ravishankarm@google.com>
Date: Thu, 10 Sep 2020 23:56:34 -0700
Subject: [PATCH 9/9] [mlir][Linalg] Make LinalgBaseTilingPattern not delete
 the original operation.

The LinalgTilingPattern class dervied from the base deletes the
original operation. This allows for the use case where the more
transformations are necessary on the original operation after
tiling. In such cases the pattern can derive from
LinalgBaseTilingPattern instead of LinalgTilingPattern.

Differential Revision: https://reviews.llvm.org/D87308
---
 .../mlir/Dialect/Linalg/Transforms/Transforms.h        | 10 +++++++++-
 mlir/lib/Dialect/Linalg/Transforms/Transforms.cpp      |  2 --
 2 files changed, 9 insertions(+), 3 deletions(-)

diff --git a/mlir/include/mlir/Dialect/Linalg/Transforms/Transforms.h b/mlir/include/mlir/Dialect/Linalg/Transforms/Transforms.h
index 3049570bd47b60..b55c429a9d02d7 100644
--- a/mlir/include/mlir/Dialect/Linalg/Transforms/Transforms.h
+++ b/mlir/include/mlir/Dialect/Linalg/Transforms/Transforms.h
@@ -313,6 +313,13 @@ struct LinalgTilingPattern : public LinalgBaseTilingPattern {
                       PatternBenefit benefit = 1)
       : LinalgBaseTilingPattern(OpTy::getOperationName(), context, options,
                                 marker, benefit) {}
+  LogicalResult matchAndRewrite(Operation *op,
+                                PatternRewriter &rewriter) const override {
+    if (failed(LinalgBaseTilingPattern::matchAndRewrite(op, rewriter)))
+      return failure();
+    rewriter.eraseOp(op);
+    return success();
+  }
 };
 
 ///
@@ -415,7 +422,8 @@ enum class LinalgLoweringType {
   AffineLoops = 2,
   ParallelLoops = 3
 };
-template <typename OpTy> struct LinalgLoweringPattern : public RewritePattern {
+template <typename OpTy>
+struct LinalgLoweringPattern : public RewritePattern {
   LinalgLoweringPattern(MLIRContext *context, LinalgLoweringType loweringType,
                         LinalgMarker marker = LinalgMarker(),
                         PatternBenefit benefit = 1)
diff --git a/mlir/lib/Dialect/Linalg/Transforms/Transforms.cpp b/mlir/lib/Dialect/Linalg/Transforms/Transforms.cpp
index afac3d5f5f9a43..c1aad620fe08af 100644
--- a/mlir/lib/Dialect/Linalg/Transforms/Transforms.cpp
+++ b/mlir/lib/Dialect/Linalg/Transforms/Transforms.cpp
@@ -126,8 +126,6 @@ LogicalResult mlir::linalg::LinalgBaseTilingPattern::matchAndRewrite(
 
   // New marker if specified.
   marker.replaceLinalgMarker(rewriter, res->op.getOperation());
-
-  rewriter.eraseOp(op);
   return success();
 }