Merged master:40b72c9c7920 into amd-gfx:b1dd8d87a662

Local branch amd-gfx b1dd8d8 Merged master:f980ed4184f9 into amd-gfx:411720708275 Remote branch master 40b72c9 [ARM] Extra MLA reductions tests. NFC
jaebaek · Sep 11, 2020 · ec51c0b · ec51c0b
2 parents b1dd8d8 + 40b72c9
commit ec51c0b
Show file tree

Hide file tree

Showing 11 changed files with 3,533 additions and 71 deletions.
diff --git a/clang/lib/Basic/Targets/PPC.h b/clang/lib/Basic/Targets/PPC.h
@@ -82,6 +82,7 @@ class LLVM_LIBRARY_VISIBILITY PPCTargetInfo : public TargetInfo {
     SimdDefaultAlign = 128;
     LongDoubleWidth = LongDoubleAlign = 128;
     LongDoubleFormat = &llvm::APFloat::PPCDoubleDouble();
+    HasStrictFP = true;
   }
 
   // Set the language option for altivec based on our value.

diff --git a/clang/test/CodeGen/builtins-ppc-fpconstrained.c b/clang/test/CodeGen/builtins-ppc-fpconstrained.c
@@ -2,14 +2,12 @@
 // RUN: %clang_cc1 -triple powerpc64le-gnu-linux -target-feature +vsx \
 // RUN: -emit-llvm %s -o - | FileCheck --check-prefix=CHECK-UNCONSTRAINED %s
 // RUN: %clang_cc1 -triple powerpc64le-gnu-linux -target-feature +vsx \
-// RUN: -fexperimental-strict-floating-point \
 // RUN:  -ffp-exception-behavior=strict -emit-llvm %s -o - | FileCheck \
 // RUN: --check-prefix=CHECK-CONSTRAINED -vv %s
 // RUN: %clang_cc1 -triple powerpc64le-gnu-linux -target-feature +vsx \
 // RUN: -fallow-half-arguments-and-returns -S -o - %s | \
 // RUN: FileCheck --check-prefix=CHECK-ASM --check-prefix=NOT-FIXME-CHECK  %s
 // RUN: %clang_cc1 -triple powerpc64le-gnu-linux -target-feature +vsx \
-// RUN: -fexperimental-strict-floating-point \
 // RUN: -fallow-half-arguments-and-returns -S -ffp-exception-behavior=strict \
 // RUN: -o - %s | FileCheck --check-prefix=CHECK-ASM \
 // RUN: --check-prefix=FIXME-CHECK  %s

diff --git a/compiler-rt/test/dfsan/event_callbacks.c b/compiler-rt/test/dfsan/event_callbacks.c
@@ -2,10 +2,6 @@
 // RUN: %clang_dfsan -O2 -mllvm -dfsan-event-callbacks %s %t-callbacks.o -o %t
 // RUN: %run %t FooBarBaz 2>&1 | FileCheck %s
 
-// See PR47488, parts of this test get optimized out by a more aggressive
-// dead store eliminator.
-// XFAIL: *
-
 // Tests that callbacks are inserted for store events when
 // -dfsan-event-callbacks is specified.
 
@@ -118,14 +114,16 @@ int main(int Argc, char *Argv[]) {
   LabelArgv = dfsan_create_label("Argv", 0);
   dfsan_set_label(LabelArgv, Argv[1], LenArgv);
 
-  char SinkBuf[64];
-  assert(LenArgv < sizeof(SinkBuf) - 1);
+  char Buf[64];
+  assert(LenArgv < sizeof(Buf) - 1);
 
   // CHECK: Label 4 copied to memory
-  memcpy(SinkBuf, Argv[1], LenArgv);
+  void *volatile SinkPtr = Buf;
+  memcpy(SinkPtr, Argv[1], LenArgv);
 
   // CHECK: Label 4 copied to memory
-  memmove(&SinkBuf[1], SinkBuf, LenArgv);
+  SinkPtr = &Buf[1];
+  memmove(SinkPtr, Buf, LenArgv);
 
   return 0;
 }

diff --git a/lld/ELF/Arch/PPC64.cpp b/lld/ELF/Arch/PPC64.cpp
@@ -62,6 +62,8 @@ enum DFormOpcd {
   ADDI = 14
 };
 
+constexpr uint32_t NOP = 0x60000000;
+
 enum class PPCLegacyInsn : uint32_t {
   NOINSN = 0,
   // Loads.
@@ -691,7 +693,7 @@ void PPC64::relaxGot(uint8_t *loc, const Relocation &rel, uint64_t val) const {
     writePrefixedInstruction(loc, pcRelInsn |
                                       ((totalDisp & 0x3ffff0000) << 16) |
                                       (totalDisp & 0xffff));
-    write32(loc + rel.addend, 0x60000000); // nop accessInsn.
+    write32(loc + rel.addend, NOP); // nop accessInsn.
     break;
   }
   default:
@@ -718,15 +720,15 @@ void PPC64::relaxTlsGdToLe(uint8_t *loc, const Relocation &rel,
 
   switch (rel.type) {
   case R_PPC64_GOT_TLSGD16_HA:
-    writeFromHalf16(loc, 0x60000000); // nop
+    writeFromHalf16(loc, NOP);
     break;
   case R_PPC64_GOT_TLSGD16:
   case R_PPC64_GOT_TLSGD16_LO:
     writeFromHalf16(loc, 0x3c6d0000); // addis r3, r13
     relocateNoSym(loc, R_PPC64_TPREL16_HA, val);
     break;
   case R_PPC64_TLSGD:
-    write32(loc, 0x60000000);     // nop
+    write32(loc, NOP);
     write32(loc + 4, 0x38630000); // addi r3, r3
     // Since we are relocating a half16 type relocation and Loc + 4 points to
     // the start of an instruction we need to advance the buffer by an extra
@@ -758,13 +760,13 @@ void PPC64::relaxTlsLdToLe(uint8_t *loc, const Relocation &rel,
 
   switch (rel.type) {
   case R_PPC64_GOT_TLSLD16_HA:
-    writeFromHalf16(loc, 0x60000000); // nop
+    writeFromHalf16(loc, NOP);
     break;
   case R_PPC64_GOT_TLSLD16_LO:
     writeFromHalf16(loc, 0x3c6d0000); // addis r3, r13, 0
     break;
   case R_PPC64_TLSLD:
-    write32(loc, 0x60000000);     // nop
+    write32(loc, NOP);
     write32(loc + 4, 0x38631000); // addi r3, r3, 4096
     break;
   case R_PPC64_DTPREL16:
@@ -829,7 +831,7 @@ void PPC64::relaxTlsIeToLe(uint8_t *loc, const Relocation &rel,
   unsigned offset = (config->ekind == ELF64BEKind) ? 2 : 0;
   switch (rel.type) {
   case R_PPC64_GOT_TPREL16_HA:
-    write32(loc - offset, 0x60000000); // nop
+    write32(loc - offset, NOP);
     break;
   case R_PPC64_GOT_TPREL16_LO_DS:
   case R_PPC64_GOT_TPREL16_DS: {
@@ -1128,7 +1130,7 @@ void PPC64::relocate(uint8_t *loc, const Relocation &rel, uint64_t val) const {
   case R_PPC64_REL16_HA:
   case R_PPC64_TPREL16_HA:
     if (config->tocOptimize && shouldTocOptimize && ha(val) == 0)
-      writeFromHalf16(loc, 0x60000000);
+      writeFromHalf16(loc, NOP);
     else
       write16(loc, ha(val));
     break;
@@ -1353,7 +1355,7 @@ void PPC64::relaxTlsGdToIe(uint8_t *loc, const Relocation &rel,
     return;
   }
   case R_PPC64_TLSGD:
-    write32(loc, 0x60000000);     // bl __tls_get_addr(sym@tlsgd) --> nop
+    write32(loc, NOP);            // bl __tls_get_addr(sym@tlsgd) --> nop
     write32(loc + 4, 0x7c636A14); // nop --> add r3, r3, r13
     return;
   default:
@@ -1424,7 +1426,7 @@ bool PPC64::adjustPrologueForCrossSplitStack(uint8_t *loc, uint8_t *end,
   uint32_t secondInstr = read32(loc + 8);
   if (!loImm && getPrimaryOpCode(secondInstr) == 14) {
     loImm = secondInstr & 0xFFFF;
-  } else if (secondInstr != 0x60000000) {
+  } else if (secondInstr != NOP) {
     return false;
   }
 
@@ -1438,7 +1440,7 @@ bool PPC64::adjustPrologueForCrossSplitStack(uint8_t *loc, uint8_t *end,
   };
   if (!checkRegOperands(firstInstr, 12, 1))
     return false;
-  if (secondInstr != 0x60000000 && !checkRegOperands(secondInstr, 12, 12))
+  if (secondInstr != NOP && !checkRegOperands(secondInstr, 12, 12))
     return false;
 
   int32_t stackFrameSize = (hiImm * 65536) + loImm;
@@ -1457,12 +1459,12 @@ bool PPC64::adjustPrologueForCrossSplitStack(uint8_t *loc, uint8_t *end,
   if (hiImm) {
     write32(loc + 4, 0x3D810000 | (uint16_t)hiImm);
     // If the low immediate is zero the second instruction will be a nop.
-    secondInstr = loImm ? 0x398C0000 | (uint16_t)loImm : 0x60000000;
+    secondInstr = loImm ? 0x398C0000 | (uint16_t)loImm : NOP;
     write32(loc + 8, secondInstr);
   } else {
     // addi r12, r1, imm
     write32(loc + 4, (0x39810000) | (uint16_t)loImm);
-    write32(loc + 8, 0x60000000);
+    write32(loc + 8, NOP);
   }
 
   return true;

diff --git a/llvm/docs/AMDGPUDwarfExtensionsForHeterogeneousDebugging.rst b/llvm/docs/AMDGPUDwarfExtensionsForHeterogeneousDebugging.rst
@@ -2678,7 +2678,7 @@ architectures.
 
 DWARF address space identifiers are used by:
 
-* The DWARF expession operations: ``DW_OP_LLVM_aspace_bregx``,
+* The DWARF expression operations: ``DW_OP_LLVM_aspace_bregx``,
   ``DW_OP_LLVM_form_aspace_address``, ``DW_OP_LLVM_implicit_aspace_pointer``,
   and ``DW_OP_xderef*``.
 
@@ -3387,7 +3387,7 @@ Standard Content Descriptions
     provided by the* ``DW_LNCT_path`` *field. When the source field is absent,
     consumers can access the file to get the source text.*
 
-    *This is particularly useful for programing languages that support runtime
+    *This is particularly useful for programming languages that support runtime
     compilation and runtime generation of source text. In these cases, the
     source text does not reside in any permanent file. For example, the OpenCL
     language [:ref:`OpenCL <amdgpu-dwarf-OpenCL>`] supports online compilation.*

diff --git a/llvm/examples/ThinLtoJIT/ThinLtoInstrumentationLayer.cpp b/llvm/examples/ThinLtoJIT/ThinLtoInstrumentationLayer.cpp
@@ -120,8 +120,8 @@ void ThinLtoInstrumentationLayer::nudgeIntoDiscovery(
   LLVM_DEBUG(dbgs() << "Nudged " << Count << " new functions into discovery\n");
 }
 
-void ThinLtoInstrumentationLayer::emit(
-    std::unique_ptr<MaterializationResponsibility> R, ThreadSafeModule TSM) {
+void ThinLtoInstrumentationLayer::emit(MaterializationResponsibility R,
+                                       ThreadSafeModule TSM) {
   TSM.withModuleDo([this](Module &M) {
     std::vector<Function *> FunctionsToInstrument;
 

diff --git a/llvm/examples/ThinLtoJIT/ThinLtoInstrumentationLayer.h b/llvm/examples/ThinLtoJIT/ThinLtoInstrumentationLayer.h
@@ -34,8 +34,7 @@ class ThinLtoInstrumentationLayer : public IRLayer {
 
   ~ThinLtoInstrumentationLayer() override;
 
-  void emit(std::unique_ptr<MaterializationResponsibility> R,
-            ThreadSafeModule TSM) override;
+  void emit(MaterializationResponsibility R, ThreadSafeModule TSM) override;
 
   unsigned reserveDiscoveryFlags(unsigned Count);
   void registerDiscoveryFlagOwners(std::vector<GlobalValue::GUID> Guids,

diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -3694,11 +3694,13 @@ static bool isLoadCombineCandidateImpl(Value *Root, unsigned NumElts,
                                        TargetTransformInfo *TTI) {
   // Look past the root to find a source value. Arbitrarily follow the
   // path through operand 0 of any 'or'. Also, peek through optional
-  // shift-left-by-constant.
+  // shift-left-by-multiple-of-8-bits.
   Value *ZextLoad = Root;
+  const APInt *ShAmtC;
   while (!isa<ConstantExpr>(ZextLoad) &&
          (match(ZextLoad, m_Or(m_Value(), m_Value())) ||
-          match(ZextLoad, m_Shl(m_Value(), m_Constant()))))
+          (match(ZextLoad, m_Shl(m_Value(), m_APInt(ShAmtC))) &&
+           ShAmtC->urem(8) == 0)))
     ZextLoad = cast<BinaryOperator>(ZextLoad)->getOperand(0);
 
   // Check if the input is an extended load of the required or/shift expression.