
[mlir][tensor] Add consumer fusion for tensor.pack op. #103715

Merged
merged 2 commits into llvm:main from yunfei/fuse_consumer_pack on Aug 23, 2024

Conversation

Yun-Fly
Contributor

@Yun-Fly Yun-Fly commented Aug 14, 2024

Add missing `getIterationDomainTileFromOperandTile` and `getTiledImplementationFromOperandTile` to `tensor.pack` and enable fusing it as a consumer.
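A condensed before/after sketch of the transformation this enables (distilled from the new test case below; SSA names are illustrative and the elementwise linalg.generic body is elided):

// Before: tensor.pack consumes the whole result of the loop.
%1 = scf.forall (%i, %j) in (2, 2) shared_outs(%out = %arg2) -> (tensor<64x32xf32>) {
  // ... elementwise linalg.generic producing %res : tensor<32x32xf32> ...
  scf.forall.in_parallel {
    tensor.parallel_insert_slice %res into %out[%i, %j] [32, 32] [1, 1] : tensor<32x32xf32> into tensor<64x32xf32>
  }
}
%pack = tensor.pack %1 outer_dims_perm = [0, 1] inner_dims_pos = [0] inner_tiles = [16] into %dest : tensor<64x32xf32> -> tensor<4x32x16xf32>

// After fusing the pack as a consumer: a tiled pack runs inside the loop and writes
// into the matching slice of the packed destination, which becomes a second shared_outs.
%2:2 = scf.forall (%i, %j) in (2, 2) shared_outs(%out = %arg2, %pack_out = %dest) -> (tensor<64x32xf32>, tensor<4x32x16xf32>) {
  // ... same linalg.generic producing %res : tensor<32x32xf32> ...
  %off = affine.apply affine_map<(d0) -> (d0 floordiv 16)>(%i)
  %dest_slice = tensor.extract_slice %pack_out[%off, %j, 0] [2, 32, 16] [1, 1, 1] : tensor<4x32x16xf32> to tensor<2x32x16xf32>
  %tiled_pack = tensor.pack %res outer_dims_perm = [0, 1] inner_dims_pos = [0] inner_tiles = [16] into %dest_slice : tensor<32x32xf32> -> tensor<2x32x16xf32>
  scf.forall.in_parallel {
    tensor.parallel_insert_slice %tiled_pack into %pack_out[%off, %j, 0] [2, 32, 16] [1, 1, 1] : tensor<2x32x16xf32> into tensor<4x32x16xf32>
    tensor.parallel_insert_slice %res into %out[%i, %j] [32, 32] [1, 1] : tensor<32x32xf32> into tensor<64x32xf32>
  }
}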

@llvmbot
Member

llvmbot commented Aug 14, 2024

@llvm/pr-subscribers-mlir-tensor
@llvm/pr-subscribers-mlir

@llvm/pr-subscribers-mlir-scf

Author: None (Yun-Fly)

Changes

Add missing `getIterationDomainTileFromOperandTile` and `getTiledImplementationFromOperandTile` to `tensor.pack` and enable fusing it as a consumer.


Full diff: https://github.com/llvm/llvm-project/pull/103715.diff

2 Files Affected:

  • (modified) mlir/lib/Dialect/Tensor/IR/TensorTilingInterfaceImpl.cpp (+91)
  • (modified) mlir/test/Interfaces/TilingInterface/tile-and-fuse-consumer.mlir (+59)
diff --git a/mlir/lib/Dialect/Tensor/IR/TensorTilingInterfaceImpl.cpp b/mlir/lib/Dialect/Tensor/IR/TensorTilingInterfaceImpl.cpp
index 361340a4e62f2d..51c232ae77fe6c 100644
--- a/mlir/lib/Dialect/Tensor/IR/TensorTilingInterfaceImpl.cpp
+++ b/mlir/lib/Dialect/Tensor/IR/TensorTilingInterfaceImpl.cpp
@@ -246,6 +246,97 @@ struct PackOpTiling
       return failure();
     return tilingResult.value();
   }
+
+  /// Method to return the position of iteration domain tile computed by the
+  /// tiled operation. In current `tensor.pack` context, the `resultOffsets` and
+  /// `resultSizes` only cover outer dimensions.
+  LogicalResult getIterationDomainTileFromOperandTile(
+      Operation *op, OpBuilder &b, unsigned operandNumber,
+      ArrayRef<OpFoldResult> offsets, ArrayRef<OpFoldResult> sizes,
+      SmallVectorImpl<OpFoldResult> &resultOffsets,
+      SmallVectorImpl<OpFoldResult> &resultSizes) const {
+    auto packOp = cast<PackOp>(op);
+    Location loc = packOp.getLoc();
+
+    SmallVector<OpFoldResult> outerDimOffsets, outerDimSizes;
+    DenseMap<int64_t, OpFoldResult> dimAndTileMapping =
+        packOp.getDimAndTileMapping();
+    for (auto dim : packOp.getOuterDimsPerm()) {
+      if (dimAndTileMapping.count(dim)) {
+        FailureOr<int64_t> cstSize =
+            ValueBoundsConstraintSet::computeConstantBound(
+                presburger::BoundType::UB, sizes[dim],
+                /*stopCondition=*/nullptr, /*closedUB=*/true);
+        std::optional<int64_t> cstInnerSize =
+            getConstantIntValue(dimAndTileMapping[dim]);
+        // Currently only expect perfect tiling cases.
+        if (failed(cstSize) || !cstInnerSize || *cstSize % *cstInnerSize != 0) {
+          return failure();
+        }
+
+        using AV = affine::AffineValueExpr;
+        affine::AffineBuilder ab(b, loc);
+        AffineExpr dim0, sym;
+        bindDims(b.getContext(), dim0);
+        bindSymbols(b.getContext(), sym);
+        auto avOffset = AV(dim0).bind(offsets[dim]);
+        auto avSize = AV(dim0).bind(sizes[dim]);
+        auto avTileSize = AV(sym).bind(dimAndTileMapping[dim]);
+        outerDimOffsets.push_back(ab.floor(avOffset, avTileSize));
+        outerDimSizes.push_back(ab.ceil(avSize, avTileSize));
+      } else {
+        outerDimOffsets.push_back(offsets[dim]);
+        outerDimSizes.push_back(sizes[dim]);
+      }
+    }
+
+    resultOffsets = outerDimOffsets;
+    resultSizes = outerDimSizes;
+    return success();
+  }
+
+  /// Method to return the tiled implementation of tensor.pack as a consumer.
+  FailureOr<TilingResult> getTiledImplementationFromOperandTile(
+      Operation *op, OpBuilder &b, unsigned operandNumber,
+      ArrayRef<OpFoldResult> offsets, ArrayRef<OpFoldResult> sizes) const {
+    auto packOp = cast<PackOp>(op);
+    Location loc = packOp.getLoc();
+
+    int64_t inputRank = packOp.getSourceRank();
+    auto oneAttr = b.getI64IntegerAttr(1);
+    SmallVector<OpFoldResult> strides(inputRank, oneAttr);
+
+    SmallVector<Value> tiledOperands;
+    tiledOperands.push_back(b.create<ExtractSliceOp>(loc, packOp.getSource(),
+                                                     offsets, sizes, strides));
+
+    SmallVector<OpFoldResult> outerDimOffsets, outerDimSizes;
+    if (failed(getIterationDomainTileFromOperandTile(
+            op, b, /*operandNumber=*/0, offsets, sizes, outerDimOffsets,
+            outerDimSizes)))
+      return failure();
+
+    SmallVector<OpFoldResult> outputOffsets, outputSizes;
+    if (failed(getResultTilePosition(op, b, 0, outerDimOffsets, outerDimSizes,
+                                     outputOffsets, outputSizes)))
+      return failure();
+
+    strides.append(packOp.getDestRank() - inputRank, oneAttr);
+    auto extractSlice = b.create<ExtractSliceOp>(
+        loc, packOp.getDest(), outputOffsets, outputSizes, strides);
+    tiledOperands.push_back(extractSlice);
+
+    if (auto val = packOp.getPaddingValue())
+      tiledOperands.push_back(val);
+    for (auto tile : packOp.getInnerTiles())
+      tiledOperands.push_back(tile);
+
+    Operation *tiledPackOp = b.create<PackOp>(
+        loc, TypeRange{extractSlice.getType()}, tiledOperands, op->getAttrs());
+
+    return TilingResult{{tiledPackOp},
+                        SmallVector<Value>(tiledPackOp->getResults())};
+  }
 };
 
 struct UnpackTileDimInfo {
diff --git a/mlir/test/Interfaces/TilingInterface/tile-and-fuse-consumer.mlir b/mlir/test/Interfaces/TilingInterface/tile-and-fuse-consumer.mlir
index 400b558e37fcda..741dfbfb1cd5c2 100644
--- a/mlir/test/Interfaces/TilingInterface/tile-and-fuse-consumer.mlir
+++ b/mlir/test/Interfaces/TilingInterface/tile-and-fuse-consumer.mlir
@@ -315,3 +315,62 @@ module attributes {transform.with_named_sequence} {
 //      CHECK:       }
 //      CHECK:   }
 //      CHECK:   return %[[FINAL_RESULT]]#1 :
+
+// -----
+
+#map = affine_map<(d0, d1) -> (d0, d1)>
+module {
+    func.func @fuse_pack_consumer_into_scf_forall(%arg0: tensor<32x32xf32>, %arg1: tensor<32x32xf32>, %arg2: tensor<64x32xf32>) -> tensor<4x32x16xf32> {
+        %c4 = arith.constant 4 : index
+        %c64 = arith.constant 64 : index
+        %c0 = arith.constant 0 : index
+        %1 = scf.forall (%arg3, %arg4) in (2, 2) shared_outs(%arg5 = %arg2) -> (tensor<64x32xf32>) {
+            %extracted_slice = tensor.extract_slice %arg5[%arg3, %arg4] [32, 32] [1, 1] : tensor<64x32xf32> to tensor<32x32xf32>
+            %3 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel"]} ins(%arg0, %arg1 : tensor<32x32xf32>, tensor<32x32xf32>) outs(%extracted_slice : tensor<32x32xf32>) {
+                ^bb0(%in: f32, %in_16: f32, %out: f32):
+                %13 = arith.mulf %in, %in_16 : f32
+                %14 = arith.addf %out, %13 : f32
+                linalg.yield %14 : f32
+            } -> tensor<32x32xf32>
+            scf.forall.in_parallel {
+                tensor.parallel_insert_slice %3 into %arg5[%arg3, %arg4] [32, 32] [1, 1] : tensor<32x32xf32> into tensor<64x32xf32>
+            }
+        }
+        %output = tensor.empty() : tensor<4x32x16xf32>
+        %pack = tensor.pack %1 outer_dims_perm = [0, 1] inner_dims_pos = [0] inner_tiles = [16] into %output : tensor<64x32xf32> -> tensor<4x32x16xf32>
+        return %pack : tensor<4x32x16xf32>
+    }
+}
+  
+module attributes {transform.with_named_sequence} {
+    transform.named_sequence @__transform_main(%arg1 : !transform.any_op {transform.readonly}) {
+        %slice_op = transform.structured.match ops{["tensor.parallel_insert_slice"]} in %arg1
+        : (!transform.any_op) -> !transform.any_op
+        %a, %b = transform.test.fuse_consumer %slice_op
+        : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
+        transform.yield
+    }
+}
+//      CHECK: #[[PACK_RESULT_MAP:.*]] = affine_map<(d0) -> (d0 floordiv 16)>
+//      CHECK: func.func @fuse_pack_consumer_into_scf_forall(
+// CHECK-SAME:     %[[ARG0:[a-zA-Z0-9]+]]: tensor<32x32xf32>
+// CHECK-SAME:     %[[ARG1:[a-zA-Z0-9]+]]: tensor<32x32xf32>
+// CHECK-SAME:     %[[ARG2:[a-zA-Z0-9]+]]: tensor<64x32xf32>)
+//      CHECK:   %[[OUT_INIT:.*]] = tensor.empty() : tensor<4x32x16xf32>
+//      CHECK:   %[[FINAL_RESULT:.*]]:2 = scf.forall (%[[IV1:.*]], %[[IV2:.*]]) in (2, 2)
+// CHECK-SAME:      shared_outs(%[[FIRST_OUT_ARG:.*]] = %[[ARG2]], %[[PACK_OUT_ARG:.*]] = %[[OUT_INIT]])
+// CHECK-SAME:   {
+//      CHECK:      %[[GENERIC_OUT_SLICE:.*]] = tensor.extract_slice %[[FIRST_OUT_ARG]][%[[IV1]], %[[IV2]]] [32, 32] [1, 1]
+//      CHECK:      %[[GENERIC_OUT:.*]] = linalg.generic
+// CHECK-SAME:              outs(%[[GENERIC_OUT_SLICE]] :
+//      CHECK:      %[[PACK_RESULT_OFFSET:.*]] = affine.apply #[[PACK_RESULT_MAP]](%[[IV1]])
+//      CHECK:      %[[TILED_PACK_DEST:.*]] = tensor.extract_slice %[[PACK_OUT_ARG]][%[[PACK_RESULT_OFFSET]], %[[IV2]], 0] [2, 32, 16] [1, 1, 1]
+//      CHECK:      %[[TILED_PACK_OUT:.*]] = tensor.pack %[[GENERIC_OUT]]
+// CHECK-SAME:                              outer_dims_perm = [0, 1] inner_dims_pos = [0] inner_tiles = [16]
+// CHECK-SAME:                              into %[[TILED_PACK_DEST]]
+//      CHECK:      scf.forall.in_parallel {
+//      CHECK:          tensor.parallel_insert_slice %[[TILED_PACK_OUT]] into %[[PACK_OUT_ARG]][%[[PACK_RESULT_OFFSET]],  %[[IV2]], 0] [2, 32, 16] [1, 1, 1]
+//      CHECK:          tensor.parallel_insert_slice %[[GENERIC_OUT]] into %[[FIRST_OUT_ARG]][%[[IV1]], %[[IV2]]] [32, 32] [1, 1]
+//      CHECK:       }
+//      CHECK:   }
+//      CHECK:   return %[[FINAL_RESULT]]#1 :

Contributor

@MaheshRavishankar MaheshRavishankar left a comment


A few questions, but this looks mostly OK to me.

@Yun-Fly Yun-Fly changed the title from "[mlir][scf] fuse tensor.pack as consumer" to "[mlir][tensor] Add consumer fusion for tensor.pack op." on Aug 16, 2024
@Yun-Fly
Contributor Author

Yun-Fly commented Aug 19, 2024

Hi, @MaheshRavishankar @hanhanW. Let me try to rephrase the corner case we may encounter:

%1 = tensor.empty() : tensor<30xf32>
%pack = tensor.pack %1 padding_value(%pad : f32) outer_dims_perm = [0] inner_dims_pos = [0] inner_tiles = [16] into %output : tensor<30xf32> -> tensor<2x16xf32>

If we do not consider tiling, %1 will be packed into two rows:

  1. [0, 1, 2, ..., 15] -> (0,0), (0,1), ..., (0,15)
  2. [16, 17, ..., 29] -> (1,0), (1,1), ..., (1,13), plus padding at (1,14), (1,15)

Now take tiling into consideration: suppose the tile size on the operand of the pack is 15, which is not a multiple of the inner tile 16.

  1. The first slice is extracted from elements (0) to (14) and inserted into (0,0)~(0,14) in the first row.
  2. The second slice is extracted from elements (15) to (29) and SHOULD be split across two rows of different lengths, i.e. (0,15) in the first row, then (1,0)~(1,13) plus padding at (1,14), (1,15) in the second row.

I am seeking your advice on how to deal with this. With the current coordinate mapping (outerDimOffset = floor(offset, innerTile), outerDimTileSize = ceil(tileSize, innerTile)), both the first and the second slice are mapped into the first row (outerDimOffset = 0 and outerDimTileSize = 1).

BTW, this issue exists even without padding, e.g.

%1 = tensor.empty() : tensor<30xf32>
%pack = tensor.pack %1 outer_dims_perm = [0] inner_dims_pos = [0] inner_tiles = [6] into %output : tensor<30xf32> -> tensor<5x6xf32>

where the tile size on the operand of the pack is 5.
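To make the second example concrete, consider the hypothetical operand tile at offset 5 with size 5 (illustrative IR, reusing %1 from the snippet above):

%slice = tensor.extract_slice %1[5] [5] [1] : tensor<30xf32> to tensor<5xf32>

Its elements 5..9 map to packed positions (0,5) and (1,0)~(1,3) of the tensor<5x6xf32> result, i.e. across two rows, so no single rectangular extract_slice of the destination can hold the tiled pack's result; the floor/ceil mapping above would assign outerDimOffset = floor(5, 6) = 0 and outerDimSize = ceil(5, 6) = 1, covering only the first row.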

@MaheshRavishankar
Contributor

Ok, so I was trying something. I initially thought that fusing the producer into the consumer, or the consumer into the producer, would give the same result. So I tried tiling the pack and fusing the producer.

Input IR:

func.func @pack_fusion(%arg0 : tensor<30xf32>) -> tensor<2x16xf32> {
  %empty = tensor.empty() : tensor<30xf32>
  %0 = linalg.generic {
      indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>],
      iterator_types = ["parallel"]}
      ins(%arg0 : tensor<30xf32>) outs(%empty : tensor<30xf32>) {
    ^bb0(%b0 : f32, %b1 : f32) :
      %1 = arith.addf %b0, %b0 : f32
      linalg.yield %1 : f32
  } -> tensor<30xf32>
  %empty1 = tensor.empty() : tensor<2x16xf32>
  %pad = arith.constant 0.0 : f32
  %pack = tensor.pack %0 padding_value(%pad : f32) outer_dims_perm = [0]
      inner_dims_pos = [0] inner_tiles = [16] into %empty1
      : tensor<30xf32> -> tensor<2x16xf32>
  return %pack : tensor<2x16xf32>
}

which, on using tileConsumerAndFuseProducer, gives

module {
  func.func @pack_fusion(%arg0: tensor<30xf32>) -> tensor<2x16xf32> {
    %cst = arith.constant 0.000000e+00 : f32
    %0 = tensor.empty() : tensor<2x16xf32>
    %1 = scf.forall (%arg1) in (2) shared_outs(%arg2 = %0) -> (tensor<2x16xf32>) {
      %2 = affine.min affine_map<(d0) -> (d0 * -16 + 30, 16)>(%arg1)
      %3 = affine.apply affine_map<(d0) -> (d0 * 16)>(%arg1)
      %extracted_slice = tensor.extract_slice %arg0[%3] [%2] [1] : tensor<30xf32> to tensor<?xf32>
      %4 = tensor.empty(%2) : tensor<?xf32>
      %5 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%extracted_slice : tensor<?xf32>) outs(%4 : tensor<?xf32>) {
      ^bb0(%in: f32, %out: f32):
        %6 = arith.addf %in, %in : f32
        linalg.yield %6 : f32
      } -> tensor<?xf32>
      %extracted_slice_0 = tensor.extract_slice %arg2[%arg1, 0] [1, 16] [1, 1] : tensor<2x16xf32> to tensor<1x16xf32>
      %pack = tensor.pack %5 padding_value(%cst : f32) outer_dims_perm = [0] inner_dims_pos = [0] inner_tiles = [16] into %extracted_slice_0 : tensor<?xf32> -> tensor<1x16xf32>
      scf.forall.in_parallel {
        tensor.parallel_insert_slice %pack into %arg2[%arg1, 0] [1, 16] [1, 1] : tensor<1x16xf32> into tensor<2x16xf32>
      }
    }
    return %1 : tensor<2x16xf32>
  }
}

But to get this I need to specify tile sizes in terms of the result dimension of the pack. So that works for tile consumer + fuse producer, but if you tile the producer first (the linalg.generic) you cannot do the same. Just leaving a note here. I am going to take a pass at this PR now.

Contributor

@MaheshRavishankar MaheshRavishankar left a comment


Ok, this makes sense to me now. Thanks.

@Yun-Fly Yun-Fly force-pushed the yunfei/fuse_consumer_pack branch from d7fe9c7 to cdbf9ed on August 20, 2024 06:27
@Yun-Fly
Contributor Author

Yun-Fly commented Aug 20, 2024

> But to get this I need to specify tile sizes in terms of the result dimension of the pack. So that works for tile consumer + fuse producer, but if you tile the producer first (the linalg.generic) you cannot do the same.

Yes, as I mentioned above, the mirrored case of fusing pack as a consumer is fusing unpack as a producer. @hanhanW contributed a lot to this and has more experience dealing with incomplete tiles. It would be great if you could help lift the restrictions left in this patch with a unified interface that addresses the incomplete-tile issue caused by either the pack or the unpack operation :). I am also willing to study this further. Thanks!
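For reference, a minimal sketch of that mirrored case (illustrative IR, not part of this patch; %packed, %empty and %init are assumed to be defined elsewhere):

// tensor.unpack feeding an elementwise consumer.
%unpack = tensor.unpack %packed outer_dims_perm = [0] inner_dims_pos = [0] inner_tiles = [16] into %empty : tensor<2x16xf32> -> tensor<30xf32>
%add = linalg.generic {
    indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>],
    iterator_types = ["parallel"]}
    ins(%unpack : tensor<30xf32>) outs(%init : tensor<30xf32>) {
  ^bb0(%in: f32, %out: f32):
    %0 = arith.addf %in, %in : f32
    linalg.yield %0 : f32
} -> tensor<30xf32>
// Tiling %add with tile size 15 and fusing the unpack as a producer needs elements
// 15..29 of the unpacked tensor, which live at packed positions (0,15) and (1,0)~(1,13):
// a tile that straddles two rows of the packed layout, i.e. the same incomplete-tile issue.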

@hanhanW
Contributor

hanhanW commented Aug 20, 2024

Thanks @MaheshRavishankar for taking over the review! I did not find cycles to do a deep review, but I can imagine the problem now. Hopefully I can get back to this and add that support later. And thanks a lot for pushing on this, @Yun-Fly!

@Yun-Fly
Contributor Author

Yun-Fly commented Aug 21, 2024

Thanks to all of you for getting involved despite busy schedules!

@Yun-Fly Yun-Fly requested a review from ZhennanQin August 22, 2024 02:52
@Yun-Fly Yun-Fly merged commit f06563a into llvm:main Aug 23, 2024
8 checks passed
cjdb pushed a commit to cjdb/llvm-project that referenced this pull request Aug 23, 2024
Add missing `getIterationDomainTileFromOperandTile` and `getTiledImplementationFromOperandTile` to `tensor.pack` and enable fusing it as a consumer. NOTE that, it only expects perfect tiling scenario without padding semantic currently.
dmpolukhin pushed a commit to dmpolukhin/llvm-project that referenced this pull request Sep 2, 2024
Add missing `getIterationDomainTileFromOperandTile` and `getTiledImplementationFromOperandTile` to `tensor.pack` and enable fusing it as a consumer. NOTE that, it only expects perfect tiling scenario without padding semantic currently.
mgehre-amd pushed a commit to Xilinx/llvm-project that referenced this pull request Nov 5, 2024
Add missing `getIterationDomainTileFromOperandTile` and `getTiledImplementationFromOperandTile` to `tensor.pack` and enable fusing it as a consumer. NOTE that, it only expects perfect tiling scenario without padding semantic currently.
mgehre-amd pushed a commit to Xilinx/llvm-project that referenced this pull request Nov 5, 2024
Add missing `getIterationDomainTileFromOperandTile` and `getTiledImplementationFromOperandTile` to `tensor.pack` and enable fusing it as a consumer. NOTE that, it only expects perfect tiling scenario without padding semantic currently.