merge main into amd-staging

Change-Id: Ifd894fe72a12b977fa159bde1b206637f42dffa9
ROCm · Nov 10, 2024 · e553a98 · e553a98
2 parents 9a0cb31 + 7085ac8
commit e553a98
Show file tree

Hide file tree

Showing 35 changed files with 1,352 additions and 484 deletions.
diff --git a/clang/test/Driver/riscv-profiles.c b/clang/test/Driver/riscv-profiles.c
@@ -57,6 +57,7 @@
 // RVA22U64: "-target-feature" "+f"
 // RVA22U64: "-target-feature" "+d"
 // RVA22U64: "-target-feature" "+c"
+// RVA22U64: "-target-feature" "+b"
 // RVA22U64: "-target-feature" "+zic64b"
 // RVA22U64: "-target-feature" "+zicbom"
 // RVA22U64: "-target-feature" "+zicbop"
@@ -83,6 +84,7 @@
 // RVA22S64: "-target-feature" "+f"
 // RVA22S64: "-target-feature" "+d"
 // RVA22S64: "-target-feature" "+c"
+// RVA22S64: "-target-feature" "+b"
 // RVA22S64: "-target-feature" "+zic64b"
 // RVA22S64: "-target-feature" "+zicbom"
 // RVA22S64: "-target-feature" "+zicbop"
@@ -118,6 +120,7 @@
 // RVA23U64: "-target-feature" "+f"
 // RVA23U64: "-target-feature" "+d"
 // RVA23U64: "-target-feature" "+c"
+// RVA23U64: "-target-feature" "+b"
 // RVA23U64: "-target-feature" "+v"
 // RVA23U64: "-target-feature" "+zic64b"
 // RVA23U64: "-target-feature" "+zicbom"
@@ -156,6 +159,7 @@
 // RVA23S64: "-target-feature" "+f"
 // RVA23S64: "-target-feature" "+d"
 // RVA23S64: "-target-feature" "+c"
+// RVA23S64: "-target-feature" "+b"
 // RVA23S64: "-target-feature" "+v"
 // RVA23S64: "-target-feature" "+h"
 // RVA23S64: "-target-feature" "+zic64b"
@@ -217,6 +221,7 @@
 // RVB23U64: "-target-feature" "+f"
 // RVB23U64: "-target-feature" "+d"
 // RVB23U64: "-target-feature" "+c"
+// RVB23U64: "-target-feature" "+b"
 // RVB23U64: "-target-feature" "+zic64b"
 // RVB23U64: "-target-feature" "+zicbom"
 // RVB23U64: "-target-feature" "+zicbop"
@@ -249,6 +254,7 @@
 // RVB23S64: "-target-feature" "+f"
 // RVB23S64: "-target-feature" "+d"
 // RVB23S64: "-target-feature" "+c"
+// RVB23S64: "-target-feature" "+b"
 // RVB23S64: "-target-feature" "+zic64b"
 // RVB23S64: "-target-feature" "+zicbom"
 // RVB23S64: "-target-feature" "+zicbop"
@@ -290,6 +296,7 @@
 // RUN: %clang --target=riscv32 -### -c %s 2>&1 -march=rvm23u32 -menable-experimental-extensions \
 // RUN:   | FileCheck -check-prefix=RVM23U32 %s
 // RVM23U32: "-target-feature" "+m"
+// RVM23U32: "-target-feature" "+b"
 // RVM23U32: "-target-feature" "+zicbop"
 // RVM23U32: "-target-feature" "+zicond"
 // RVM23U32: "-target-feature" "+zicsr"
@@ -309,6 +316,7 @@
 // PROFILE-WITH-ADDITIONAL: "-target-feature" "+f"
 // PROFILE-WITH-ADDITIONAL: "-target-feature" "+d"
 // PROFILE-WITH-ADDITIONAL: "-target-feature" "+c"
+// PROFILE-WITH-ADDITIONAL: "-target-feature" "+b"
 // PROFILE-WITH-ADDITIONAL: "-target-feature" "+zicbom"
 // PROFILE-WITH-ADDITIONAL: "-target-feature" "+zicbop"
 // PROFILE-WITH-ADDITIONAL: "-target-feature" "+zicboz"

diff --git a/compiler-rt/lib/ctx_profile/CtxInstrContextNode.h b/compiler-rt/lib/ctx_profile/CtxInstrContextNode.h
@@ -8,9 +8,9 @@
 //==============================================================================
 //
 // NOTE!
-// llvm/lib/ProfileData/CtxInstrContextNode.h and
+// llvm/include/llvm/ProfileData/CtxInstrContextNode.h and
 //   compiler-rt/lib/ctx_profile/CtxInstrContextNode.h
-// must be exact copies of each other
+// must be exact copies of each other.
 //
 // compiler-rt creates these objects as part of the instrumentation runtime for
 // contextual profiling. LLVM only consumes them to convert a contextual tree
@@ -114,4 +114,4 @@ class ContextNode final {
 };
 } // namespace ctx_profile
 } // namespace llvm
-#endif
+#endif
diff --git a/lld/test/ELF/aarch64-feature-pac.s b/lld/test/ELF/aarch64-feature-pac.s
@@ -76,12 +76,14 @@
 # PACDYN-NOT:      0x0000000070000001 (AARCH64_BTI_PLT)
 # PACDYN-NOT:      0x0000000070000003 (AARCH64_PAC_PLT)
 
-## Turn on PAC entries with the -z pac-plt command line option. There are no
-## warnings in this case as the choice to use PAC in PLT entries is orthogonal
-## to the choice of using PAC in relocatable objects. The presence of the PAC
-## .note.gnu.property is an indication of preference by the relocatable object.
+## Turn on PAC entries with the -z pac-plt command line option. For files w/o
+## GNU_PROPERTY_AARCH64_FEATURE_1_PAC set in GNU_PROPERTY_AARCH64_FEATURE_1_AND
+## property, emit a warning.
+
+# RUN: ld.lld %t.o %t2.o -z pac-plt %t.so -o %tpacplt.exe 2>&1 | FileCheck -DFILE=%t2.o --check-prefix WARN %s
+
+# WARN: warning: [[FILE]]: -z pac-plt: file does not have GNU_PROPERTY_AARCH64_FEATURE_1_PAC property
 
-# RUN: ld.lld %t.o %t2.o -z pac-plt %t.so -o %tpacplt.exe
 # RUN: llvm-readelf -n %tpacplt.exe | FileCheck --check-prefix=PACPROP %s
 # RUN: llvm-readelf --dynamic-table %tpacplt.exe | FileCheck --check-prefix PACDYN2 %s
 # RUN: llvm-objdump --no-print-imm-hex -d --mattr=+v8.3a --no-show-raw-insn %tpacplt.exe | FileCheck --check-prefix PACPLT %s

diff --git a/llvm/docs/NVPTXUsage.rst b/llvm/docs/NVPTXUsage.rst
@@ -599,6 +599,70 @@ described in the ``s2g.tile`` mode intrinsics above.
 For more information, refer PTX ISA
 `<https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk-tensor>`_.
 
+'``llvm.nvvm.cp.async.bulk.tensor.prefetch.tile.[1-5]d``'
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Syntax:
+"""""""
+
+.. code-block:: llvm
+
+  declare void @llvm.nvvm.cp.async.bulk.tensor.prefetch.tile.1d(ptr %tensor_map, i32 %d0, i64 %ch, i1 %flag_ch)
+  declare void @llvm.nvvm.cp.async.bulk.tensor.prefetch.tile.2d(..., i32 %d0, i32 %d1, ...)
+  declare void @llvm.nvvm.cp.async.bulk.tensor.prefetch.tile.3d(..., i32 %d0, i32 %d1, i32 %d2, ...)
+  declare void @llvm.nvvm.cp.async.bulk.tensor.prefetch.tile.4d(..., i32 %d0, i32 %d1, i32 %d2, i32 %d3, ...)
+  declare void @llvm.nvvm.cp.async.bulk.tensor.prefetch.tile.5d(..., i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, ...)
+
+Overview:
+"""""""""
+
+The '``@llvm.nvvm.cp.async.bulk.tensor.prefetch.tile.[1-5]d``' intrinsics
+correspond to the ``cp.async.bulk.prefetch.tensor.[1-5]d.L2.global*`` set
+of PTX instructions. These instructions initiate an asynchronous prefetch
+of tensor data from global memory to the L2 cache. In tile mode, the
+multi-dimensional layout of the source tensor is preserved at the destination.
+The dimension of the tensor data ranges from 1d to 5d with the coordinates
+specified by the ``i32 %d0 ... i32 %d4`` arguments.
+
+* The last argument to these intrinsics is a boolean flag
+  indicating support for cache_hint. This flag argument must
+  be a compile-time constant. When set, it indicates a valid
+  cache_hint (``i64 %ch``) and generates the ``.L2::cache_hint``
+  variant of the PTX instruction.
+
+For more information, refer PTX ISA
+`<https://docs.nvidia.com/cuda/parallel-thread-execution/#data-movement-and-conversion-instructions-cp-async-bulk-prefetch-tensor>`_.
+
+'``llvm.nvvm.cp.async.bulk.tensor.prefetch.im2col.[1-5]d``'
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Syntax:
+"""""""
+
+.. code-block:: llvm
+
+  declare void @llvm.nvvm.cp.async.bulk.tensor.prefetch.im2col.3d(ptr %tensor_map, i32 %d0, i32 %d1, i32 %d2, i16 %im2col0, i64 %ch, i1 %flag_ch)
+  declare void @llvm.nvvm.cp.async.bulk.tensor.prefetch.im2col.4d(..., i32 %d0, i32 %d1, i32 %d2, i32 %d3, i16 %im2col0, i16 %im2col1, ...)
+  declare void @llvm.nvvm.cp.async.bulk.tensor.prefetch.im2col.5d(..., i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 %im2col0, i16 %im2col1, i16 %im2col2, ...)
+
+Overview:
+"""""""""
+
+The '``@llvm.nvvm.cp.async.bulk.tensor.prefetch.im2col.[1-5]d``' intrinsics
+correspond to the ``cp.async.bulk.prefetch.tensor.[1-5]d.L2.global*`` set
+of PTX instructions. These instructions initiate an asynchronous prefetch
+of tensor data from global memory to the L2 cache. In im2col mode, some
+dimensions of the source tensor are unrolled into a single dimensional
+column at the destination. In this mode, the tensor has to be at least
+three-dimensional. Along with the tensor coordinates, im2col offsets are
+also specified (denoted by ``i16 im2col0...i16 %im2col2``). The number
+of im2col offsets is two less than the number of dimensions of the tensor
+operation. The last argument to these intrinsics is a boolean flag, with
+the same functionality as described in the ``tile`` mode intrinsics above.
+
+For more information, refer PTX ISA
+`<https://docs.nvidia.com/cuda/parallel-thread-execution/#data-movement-and-conversion-instructions-cp-async-bulk-prefetch-tensor>`_.
+
 Other Intrinsics
 ----------------
 

diff --git a/llvm/include/llvm/CodeGen/GlobalISel/LegalizerInfo.h b/llvm/include/llvm/CodeGen/GlobalISel/LegalizerInfo.h
@@ -1102,6 +1102,13 @@ class LegalizeRuleSet {
     return minScalar(TypeIdx, MinTy).maxScalar(TypeIdx, MaxTy);
   }
 
+  LegalizeRuleSet &clampScalar(bool Pred, unsigned TypeIdx, const LLT MinTy,
+                               const LLT MaxTy) {
+    if (!Pred)
+      return *this;
+    return clampScalar(TypeIdx, MinTy, MaxTy);
+  }
+
   /// Limit the range of scalar sizes to MinTy and MaxTy.
   LegalizeRuleSet &clampScalarOrElt(unsigned TypeIdx, const LLT MinTy,
                                     const LLT MaxTy) {

diff --git a/llvm/include/llvm/ExecutionEngine/Orc/Core.h b/llvm/include/llvm/ExecutionEngine/Orc/Core.h
@@ -19,7 +19,9 @@
 #include "llvm/ADT/IntrusiveRefCntPtr.h"
 #include "llvm/ExecutionEngine/JITLink/JITLinkDylib.h"
 #include "llvm/ExecutionEngine/JITSymbol.h"
+#include "llvm/ExecutionEngine/Orc/CoreContainers.h"
 #include "llvm/ExecutionEngine/Orc/ExecutorProcessControl.h"
+#include "llvm/ExecutionEngine/Orc/MaterializationUnit.h"
 #include "llvm/ExecutionEngine/Orc/Shared/ExecutorAddress.h"
 #include "llvm/ExecutionEngine/Orc/Shared/ExecutorSymbolDef.h"
 #include "llvm/ExecutionEngine/Orc/Shared/WrapperFunctionUtils.h"
@@ -39,7 +41,6 @@ namespace orc {
 // Forward declare some classes.
 class AsynchronousSymbolQuery;
 class ExecutionSession;
-class MaterializationUnit;
 class MaterializationResponsibility;
 class JITDylib;
 class ResourceTracker;
@@ -109,23 +110,6 @@ class ResourceManager {
                                        ResourceKey SrcK) = 0;
 };
 
-/// A set of symbol names (represented by SymbolStringPtrs for
-//         efficiency).
-using SymbolNameSet = DenseSet<SymbolStringPtr>;
-
-/// A vector of symbol names.
-using SymbolNameVector = std::vector<SymbolStringPtr>;
-
-/// A map from symbol names (as SymbolStringPtrs) to JITSymbols
-/// (address/flags pairs).
-using SymbolMap = DenseMap<SymbolStringPtr, ExecutorSymbolDef>;
-
-/// A map from symbol names (as SymbolStringPtrs) to JITSymbolFlags.
-using SymbolFlagsMap = DenseMap<SymbolStringPtr, JITSymbolFlags>;
-
-/// A map from JITDylibs to sets of symbols.
-using SymbolDependenceMap = DenseMap<JITDylib *, SymbolNameSet>;
-
 /// Lookup flags that apply to each dylib in the search order for a lookup.
 ///
 /// If MatchHiddenSymbolsOnly is used (the default) for a given dylib, then
@@ -682,83 +666,6 @@ class MaterializationResponsibility {
   SymbolStringPtr InitSymbol;
 };
 
-/// A MaterializationUnit represents a set of symbol definitions that can
-///        be materialized as a group, or individually discarded (when
-///        overriding definitions are encountered).
-///
-/// MaterializationUnits are used when providing lazy definitions of symbols to
-/// JITDylibs. The JITDylib will call materialize when the address of a symbol
-/// is requested via the lookup method. The JITDylib will call discard if a
-/// stronger definition is added or already present.
-class MaterializationUnit {
-  friend class ExecutionSession;
-  friend class JITDylib;
-
-public:
-  static char ID;
-
-  struct Interface {
-    Interface() = default;
-    Interface(SymbolFlagsMap InitalSymbolFlags, SymbolStringPtr InitSymbol)
-        : SymbolFlags(std::move(InitalSymbolFlags)),
-          InitSymbol(std::move(InitSymbol)) {
-      assert((!this->InitSymbol || this->SymbolFlags.count(this->InitSymbol)) &&
-             "If set, InitSymbol should appear in InitialSymbolFlags map");
-    }
-
-    SymbolFlagsMap SymbolFlags;
-    SymbolStringPtr InitSymbol;
-  };
-
-  MaterializationUnit(Interface I)
-      : SymbolFlags(std::move(I.SymbolFlags)),
-        InitSymbol(std::move(I.InitSymbol)) {}
-  virtual ~MaterializationUnit() = default;
-
-  /// Return the name of this materialization unit. Useful for debugging
-  /// output.
-  virtual StringRef getName() const = 0;
-
-  /// Return the set of symbols that this source provides.
-  const SymbolFlagsMap &getSymbols() const { return SymbolFlags; }
-
-  /// Returns the initialization symbol for this MaterializationUnit (if any).
-  const SymbolStringPtr &getInitializerSymbol() const { return InitSymbol; }
-
-  /// Implementations of this method should materialize all symbols
-  ///        in the materialzation unit, except for those that have been
-  ///        previously discarded.
-  virtual void
-  materialize(std::unique_ptr<MaterializationResponsibility> R) = 0;
-
-  /// Called by JITDylibs to notify MaterializationUnits that the given symbol
-  /// has been overridden.
-  void doDiscard(const JITDylib &JD, const SymbolStringPtr &Name) {
-    SymbolFlags.erase(Name);
-    if (InitSymbol == Name) {
-      DEBUG_WITH_TYPE("orc", {
-        dbgs() << "In " << getName() << ": discarding init symbol \""
-               << *Name << "\"\n";
-      });
-      InitSymbol = nullptr;
-    }
-    discard(JD, std::move(Name));
-  }
-
-protected:
-  SymbolFlagsMap SymbolFlags;
-  SymbolStringPtr InitSymbol;
-
-private:
-  virtual void anchor();
-
-  /// Implementations of this method should discard the given symbol
-  ///        from the source (e.g. if the source is an LLVM IR Module and the
-  ///        symbol is a function, delete the function body or mark it available
-  ///        externally).
-  virtual void discard(const JITDylib &JD, const SymbolStringPtr &Name) = 0;
-};
-
 /// A MaterializationUnit implementation for pre-existing absolute symbols.
 ///
 /// All symbols will be resolved and marked ready as soon as the unit is

diff --git a/llvm/include/llvm/ExecutionEngine/Orc/CoreContainers.h b/llvm/include/llvm/ExecutionEngine/Orc/CoreContainers.h
@@ -0,0 +1,47 @@
+//===---- CoreContainers.h - Symbol Containers for Core APIs ----*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Symbol container types for core ORC APIs.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_EXECUTIONENGINE_ORC_CORECONTAINERS_H
+#define LLVM_EXECUTIONENGINE_ORC_CORECONTAINERS_H
+
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/DenseSet.h"
+#include "llvm/ExecutionEngine/JITSymbol.h"
+#include "llvm/ExecutionEngine/Orc/Shared/ExecutorSymbolDef.h"
+#include "llvm/ExecutionEngine/Orc/SymbolStringPool.h"
+
+#include <vector>
+
+namespace llvm::orc {
+
+class JITDylib;
+
+/// A set of symbol names (represented by SymbolStringPtrs for
+//         efficiency).
+using SymbolNameSet = DenseSet<SymbolStringPtr>;
+
+/// A vector of symbol names.
+using SymbolNameVector = std::vector<SymbolStringPtr>;
+
+/// A map from symbol names (as SymbolStringPtrs) to JITSymbols
+/// (address/flags pairs).
+using SymbolMap = DenseMap<SymbolStringPtr, ExecutorSymbolDef>;
+
+/// A map from symbol names (as SymbolStringPtrs) to JITSymbolFlags.
+using SymbolFlagsMap = DenseMap<SymbolStringPtr, JITSymbolFlags>;
+
+/// A map from JITDylibs to sets of symbols.
+using SymbolDependenceMap = DenseMap<JITDylib *, SymbolNameSet>;
+
+} // End namespace llvm::orc
+
+#endif // LLVM_EXECUTIONENGINE_ORC_CORECONTAINERS_H