Integrate LLVM at 9fa55ec3 (#18670)
Cherry-picks:
1. llvm/llvm-project#110918
2. llvm/llvm-project#110904
3. llvm/llvm-project#110927

The revision disables the pack/unpack decomposition when any of the inner
tiles is dynamic, because that leads to unbounded stack allocation
(introduced by the tensor.pad op). This was broken by the `Extend the logic
to generalise tensor.pack` commits. See
llvm/llvm-project@66f84c8
and
llvm/llvm-project@1c01bcb.
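
For reference, the guard added to both `setRootConfig` overloads in KernelDispatch.cpp (see the diff below) boils down to the check sketched here. This is a condensed sketch rather than code from the patch; the helper name `wantsDecomposition` and the header notes in the comments are assumptions.

```cpp
// Illustrative sketch, not part of the patch. Assumes the usual LLVM/MLIR
// headers are available (llvm/ADT/STLExtras.h for llvm::any_of,
// mlir/IR/OpDefinition.h for OpFoldResult) along with the IREE helpers
// isX86/isRISCV used elsewhere in this diff.
static bool wantsDecomposition(ArrayRef<OpFoldResult> mixedTiles,
                               IREE::HAL::ExecutableTargetAttr target) {
  // An inner tile is dynamic when its OpFoldResult holds an SSA Value rather
  // than a constant attribute; decomposing such a pack/unpack introduces a
  // tensor.pad whose size is unknown, i.e. unbounded stack allocation.
  bool hasDynamicInnerTile = llvm::any_of(
      mixedTiles, [](OpFoldResult ofr) { return ofr.is<Value>(); });
  // Request decomposition only when every inner tile is static and the
  // backend is neither x86 nor RISC-V, which prefer to keep the ops whole.
  return !hasDynamicInnerTile && !isX86(target) && !isRISCV(target);
}
```

When this condition holds, the pipeline config dictionary gains the `enable_decomposition` unit attribute, which `isDecompositionEnabled` later reads back when configuring the lowering pipeline.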

---------

Signed-off-by: Benoit Jacob <jacob.benoit.1@gmail.com>
Signed-off-by: hanhanW <hanhan0912@gmail.com>
Co-authored-by: hanhanW <hanhan0912@gmail.com>
bjacob and hanhanW authored Oct 2, 2024
1 parent cd48b10 commit 903ab0a
Showing 9 changed files with 114 additions and 16 deletions.
39 changes: 36 additions & 3 deletions compiler/src/iree/compiler/Codegen/LLVMCPU/KernelDispatch.cpp
@@ -30,6 +30,7 @@
#include "mlir/Dialect/Utils/IndexingUtils.h"
#include "mlir/Dialect/Utils/StaticValueUtils.h"
#include "mlir/IR/Matchers.h"
#include "mlir/IR/OpDefinition.h"
#include "mlir/IR/TypeUtilities.h"
#include "mlir/Interfaces/FunctionInterfaces.h"
#include "mlir/Transforms/GreedyPatternRewriteDriver.h"
@@ -272,14 +273,22 @@ getVectorPreProcStrategy(linalg::LinalgOp linalgOp) {
return VectorPreProcStrategy::None;
}

DictionaryAttr getPipelineConfWithPeelingAttr(MLIRContext *context) {
static DictionaryAttr getPipelineConfWithPeelingAttr(MLIRContext *context) {
auto enableLoopPeelingAttrName = getEnableLoopPeelingAttrName(context);
auto unitAttr = UnitAttr::get(context);

return DictionaryAttr::get(
context, ArrayRef<NamedAttribute>({enableLoopPeelingAttrName, unitAttr}));
}

static DictionaryAttr
getPipelineConfWithDecompositionAttr(MLIRContext *context) {
auto attrName = getEnableDecompositionAttrName(context);
auto unitAttr = UnitAttr::get(context);
return DictionaryAttr::get(context,
ArrayRef<NamedAttribute>({attrName, unitAttr}));
}

/// Looks for the `native_vector_size` attribute in the hal.executable.target
/// looked up from this op.
static int64_t
@@ -1690,11 +1699,23 @@ static LogicalResult setRootConfig(mlir::FunctionOpInterface entryPointFn,
distTileSizes[pos] = std::max<int64_t>(distTileSizes[pos], 1);
}

// Dynamic inner tiles lead to unbounded stack allocation (introduced by the
// tensor.pad op), so we do not decompose those cases. The x86 and RISC-V
// backends also prefer not to decompose the ops.
DictionaryAttr pipelineConfig;
auto target = IREE::HAL::ExecutableTargetAttr::lookup(entryPointFn);
bool hasDynamicInnerTile = llvm::any_of(
op.getMixedTiles(), [](OpFoldResult ofr) { return ofr.is<Value>(); });
if (!hasDynamicInnerTile && !isX86(target) && !isRISCV(target)) {
pipelineConfig = getPipelineConfWithDecompositionAttr(op.getContext());
}

SmallVector<int64_t> vecTileSizes = getPackVectorTileSizes(entryPointFn, op);
TileSizesListType tileSizesList = {distTileSizes, vecTileSizes};
return setOpConfigAndEntryPointFnTranslation(
entryPointFn, op, tileSizesList,
DispatchLoweringPassPipeline::CPUDataTiling);
DispatchLoweringPassPipeline::CPUDataTiling, /*workgroupSize=*/{},
/*subgroupSize=*/{}, pipelineConfig);
}

static LogicalResult setRootConfig(mlir::FunctionOpInterface entryPointFn,
@@ -1718,10 +1739,22 @@ static LogicalResult setRootConfig(mlir::FunctionOpInterface entryPointFn,
tileSizes[pos] = ShapedType::isDynamic(size) ? 1 : size;
}

// Dynamic inner tiles lead to unbounded stack allocation (introduced by the
// tensor.pad op), so we do not decompose those cases. The x86 and RISC-V
// backends also prefer not to decompose the ops.
DictionaryAttr pipelineConfig;
auto target = IREE::HAL::ExecutableTargetAttr::lookup(entryPointFn);
bool hasDynamicInnerTile = llvm::any_of(
op.getMixedTiles(), [](OpFoldResult ofr) { return ofr.is<Value>(); });
if (!hasDynamicInnerTile && !isX86(target) && !isRISCV(target)) {
pipelineConfig = getPipelineConfWithDecompositionAttr(op.getContext());
}

TileSizesListType tileSizesList = {distTileSizes, tileSizes};
return setOpConfigAndEntryPointFnTranslation(
entryPointFn, op, tileSizesList,
DispatchLoweringPassPipeline::CPUDataTiling);
DispatchLoweringPassPipeline::CPUDataTiling, /*workgroupSize=*/{},
/*subgroupSize=*/{}, pipelineConfig);
}

static LogicalResult setRootConfig(mlir::FunctionOpInterface entryPointFn,
@@ -100,8 +100,8 @@ void LLVMCPULowerExecutableTargetPass::runOnOperation() {
LLVMCPUPipelineOptions pipelineOpts;
if (isX86(target) || isRISCV(target)) {
pipelineOpts.useConfiguredVectorSizes = false;
pipelineOpts.decomposePackUnPackOps = false;
}
pipelineOpts.decomposePackUnPackOps = isDecompositionEnabled(funcOp);
pipelineOpts.lowerToAVX2 = hasAVX2Feature(target);
pipelineOpts.enableVectorMasking =
isX86(target) || isRISCV(target) ||
@@ -260,7 +260,7 @@ func.func @pack() attributes {hal.executable.target = #executable_target_system_
return
}
// CHECK-DAG: #[[CONFIG:.+]] = #iree_codegen.lowering_config<tile_sizes = {{\[}}[1, 16], [1, 1]]>
// CHECK-DAG: #[[TRANSLATION:.+]] = #iree_codegen.translation_info<CPUDataTiling>
// CHECK-DAG: #[[TRANSLATION:.+]] = #iree_codegen.translation_info<CPUDataTiling, {enable_decomposition}>
// CHECK: func.func @pack()
// CHECK-SAME: translation_info = #[[TRANSLATION]]
// CHECK: tensor.pack
@@ -293,14 +293,48 @@ func.func @unpack_outer_dynamic() attributes {hal.executable.target = #executabl
return
}
// CHECK-DAG: #[[CONFIG:.+]] = #iree_codegen.lowering_config<tile_sizes = {{\[}}[64, 64], [32, 16]]>
// CHECK-DAG: #[[TRANSLATION:.+]] = #iree_codegen.translation_info<CPUDataTiling>
// CHECK-DAG: #[[TRANSLATION:.+]] = #iree_codegen.translation_info<CPUDataTiling, {enable_decomposition}>
// CHECK: func.func @unpack_outer_dynamic()
// CHECK-SAME: translation_info = #[[TRANSLATION]]
// CHECK: tensor.unpack
// CHECK-SAME: lowering_config = #[[CONFIG]]

// -----

#executable_target_system_elf_arm_64_ = #hal.executable.target<"llvm-cpu", "system-elf-arm_64", {data_layout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128", native_vector_size = 16 : index, target_triple = "aarch64-none-linux-android30"}>
#pipeline_layout = #hal.pipeline.layout<constants = 6, bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>
func.func @unpack_fully_dynamic() attributes {hal.executable.target = #executable_target_system_elf_arm_64_} {
%c131072 = arith.constant 131072 : index
%c0 = arith.constant 0 : index
%0 = hal.interface.constant.load layout(#pipeline_layout) ordinal(0) : i32
%1 = hal.interface.constant.load layout(#pipeline_layout) ordinal(1) : i32
%2 = hal.interface.constant.load layout(#pipeline_layout) ordinal(2) : i32
%3 = hal.interface.constant.load layout(#pipeline_layout) ordinal(3) : i32
%4 = hal.interface.constant.load layout(#pipeline_layout) ordinal(4) : i32
%5 = hal.interface.constant.load layout(#pipeline_layout) ordinal(5) : i32
%6 = arith.index_castui %0 : i32 to index
%7 = arith.index_castui %1 : i32 to index
%8 = arith.index_castui %2 : i32 to index
%9 = arith.index_castui %3 : i32 to index
%10 = arith.index_castui %4 : i32 to index
%11 = arith.index_castui %5 : i32 to index
%12 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c0) : !flow.dispatch.tensor<readonly:tensor<?x?x?x?xi32>>{%6, %7, %10, %11}
%13 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%c131072) : !flow.dispatch.tensor<writeonly:tensor<?x?xi32>>{%8, %9}
%14 = flow.dispatch.tensor.load %12, offsets = [0, 0, 0, 0], sizes = [%6, %7, 32, 16], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?x?x?xi32>>{%6, %7, %10, %11} -> tensor<?x?x?x?xi32>
%15 = tensor.empty(%8, %9) : tensor<?x?xi32>
%unpack = tensor.unpack %14 inner_dims_pos = [0, 1] inner_tiles = [%10, %11] into %15 : tensor<?x?x?x?xi32> -> tensor<?x?xi32>
flow.dispatch.tensor.store %unpack, %13, offsets = [0, 0], sizes = [%8, %9], strides = [1, 1] : tensor<?x?xi32> -> !flow.dispatch.tensor<writeonly:tensor<?x?xi32>>{%8, %9}
return
}
// CHECK-DAG: #[[CONFIG:.+]] = #iree_codegen.lowering_config<tile_sizes = {{\[}}[64, 64], [1, 1]]>
// CHECK-DAG: #[[TRANSLATION:.+]] = #iree_codegen.translation_info<CPUDataTiling>
// CHECK: func.func @unpack_fully_dynamic()
// CHECK-SAME: translation_info = #[[TRANSLATION]]
// CHECK: tensor.unpack
// CHECK-SAME: lowering_config = #[[CONFIG]]

// -----

#pipeline_layout = #hal.pipeline.layout<bindings = [
#hal.pipeline.binding<storage_buffer>,
#hal.pipeline.binding<storage_buffer>,
11 changes: 11 additions & 0 deletions compiler/src/iree/compiler/Codegen/Utils/CPUUtils.cpp
@@ -20,6 +20,7 @@
namespace mlir::iree_compiler {

static const char kLoopPeelingAttrName[] = "enable_loop_peeling";
static const char kDecompositionAttrName[] = "enable_decomposition";

FailureOr<Operation *> getRootOperation(ArrayRef<Operation *> computeOps) {
Operation *rootOperation = nullptr;
@@ -66,6 +67,16 @@ FailureOr<Operation *> getRootOperation(ArrayRef<Operation *> computeOps) {
return rootOperation;
}

StringAttr getEnableDecompositionAttrName(MLIRContext *ctx) {
return StringAttr::get(ctx, kDecompositionAttrName);
}

bool isDecompositionEnabled(FunctionOpInterface funcOp) {
DictionaryAttr config = getTranslationInfo(funcOp).getConfiguration();

return config && config.contains(kDecompositionAttrName);
}

StringAttr getEnableLoopPeelingAttrName(MLIRContext *ctx) {
return StringAttr::get(ctx, kLoopPeelingAttrName);
}
9 changes: 9 additions & 0 deletions compiler/src/iree/compiler/Codegen/Utils/CPUUtils.h
@@ -19,6 +19,15 @@ namespace mlir::iree_compiler {
/// to the end of the function is the root op.
FailureOr<Operation *> getRootOperation(ArrayRef<Operation *> computeOps);

/// Creates a string attribute containing the name of the attribute that is
/// used to enable decomposition.
StringAttr getEnableDecompositionAttrName(MLIRContext *ctx);

/// Checks whether decomposition has been enabled for the input function. This
/// is inferred from the config dictionary attribute that is part of the
/// translation info corresponding to this function.
bool isDecompositionEnabled(FunctionOpInterface funcOp);

/// Creates a string attribute containing the name of the attribute that is
/// used to enable loop peeling.
StringAttr getEnableLoopPeelingAttrName(MLIRContext *ctx);
18 changes: 13 additions & 5 deletions compiler/src/iree/compiler/Dialect/HAL/Transforms/Passes.cpp
@@ -176,7 +176,8 @@ static void addExecutableSubstitutionPasses(OpPassManager &passManager,
}
if (!substitutions.empty()) {
SubstituteExecutablesPassOptions substituteOptions;
substituteOptions.substitutions = substitutions;
substituteOptions.substitutions.assign(substitutions.begin(),
substitutions.end());
passManager.addPass(
IREE::HAL::createSubstituteExecutablesPass(substituteOptions));
}
@@ -197,12 +198,19 @@ void buildHALDeviceAssignmentPassPipeline(
// Today we just assign devices from parameters but we should instead be
// performing analysis at the flow level and then doing magic device
// database lookups here.
passManager.addPass(IREE::HAL::createAssignLegacyTargetDevicesPass(
{&targetRegistry, assignmentOptions.legacyTargetBackends}));
AssignLegacyTargetDevicesPassOptions options;
options.targetRegistry = &targetRegistry;
options.targetBackends.assign(
assignmentOptions.legacyTargetBackends.begin(),
assignmentOptions.legacyTargetBackends.end());
passManager.addPass(
IREE::HAL::createAssignLegacyTargetDevicesPass(options));
}
if (!assignmentOptions.targetDevices.empty()) {
passManager.addPass(IREE::HAL::createAssignTargetDevicesPass(
{assignmentOptions.targetDevices}));
AssignTargetDevicesPassOptions options;
options.targetDevices.assign(assignmentOptions.targetDevices.begin(),
assignmentOptions.targetDevices.end());
passManager.addPass(IREE::HAL::createAssignTargetDevicesPass(options));
}

// Create globals for each device (if needed).
@@ -62,7 +62,7 @@ bool emitEncodeFnDefs(const llvm::RecordKeeper &recordKeeper, raw_ostream &os) {
}

os << " if (";
auto printOneCondition = [&](Record *encodingExpr) {
auto printOneCondition = [&](const Record *encodingExpr) {
StringRef expr = encodingExpr->getValueAsString("expr");
std::vector<StringRef> params =
encodingExpr->getValueAsListOfStrings("params");
9 changes: 6 additions & 3 deletions compiler/src/iree/compiler/GlobalOptimization/Passes.cpp
@@ -81,9 +81,12 @@ void buildGlobalOptimizationPassPipeline(
// parameters are available for folding.
if (!transformOptions.options.parameterImportPaths.empty()) {
IREE::IO::Parameters::ImportParametersPassOptions importParametersOptions;
importParametersOptions.scopePaths =
transformOptions.options.parameterImportPaths;
importParametersOptions.keys = transformOptions.options.parameterImportKeys;
importParametersOptions.scopePaths.assign(
transformOptions.options.parameterImportPaths.begin(),
transformOptions.options.parameterImportPaths.end());
importParametersOptions.keys.assign(
transformOptions.options.parameterImportKeys.begin(),
transformOptions.options.parameterImportKeys.end());
importParametersOptions.maximumSize =
transformOptions.options.parameterImportMaximumSize;
mainPassManager.addPass(IREE::IO::Parameters::createImportParametersPass(
2 changes: 1 addition & 1 deletion third_party/llvm-project
Submodule llvm-project updated 1112 files
