From 14aac4339638dedc0ed18cc5ab35a346cda32e79 Mon Sep 17 00:00:00 2001 From: Cullen Rhodes Date: Mon, 16 Oct 2023 11:43:32 +0000 Subject: [PATCH] [mlir][ArmSME] Lower transfer_write + transpose to vertical store This patch extends the lowering of vector.transfer_write in VectorToArmSME to support in-flight transpose via SME vertical store. --- .../VectorToArmSME/VectorToArmSME.cpp | 47 ++++- .../Dialect/ArmSME/vector-ops-to-sme.mlir | 42 +++++ .../CPU/ArmSME/test-transfer-write-2d.mlir | 174 ++++++++++++++++++ 3 files changed, 260 insertions(+), 3 deletions(-) create mode 100644 mlir/test/Integration/Dialect/Vector/CPU/ArmSME/test-transfer-write-2d.mlir diff --git a/mlir/lib/Conversion/VectorToArmSME/VectorToArmSME.cpp b/mlir/lib/Conversion/VectorToArmSME/VectorToArmSME.cpp index 0cc5732c9212d1..40e8378306bbf2 100644 --- a/mlir/lib/Conversion/VectorToArmSME/VectorToArmSME.cpp +++ b/mlir/lib/Conversion/VectorToArmSME/VectorToArmSME.cpp @@ -136,13 +136,31 @@ struct TransferReadToArmSMELowering /// Conversion pattern for vector.transfer_write. 
/// -/// vector.transfer_write %vector, %source[%c0, %c0] : vector<[16]x[16]xi8>, -/// memref +/// --- +/// +/// Example 1: op with identity permutation map to horizontal +/// arm_sme.tile_store: +/// +/// vector.transfer_write %vector, %source[%c0, %c0] +/// {in_bounds = [true, true]} : vector<[16]x[16]xi8>, memref /// /// is converted to: /// /// arm_sme.tile_store %vector, %source[%c0, %c0] : memref, /// vector<[16]x[16]xi8> +/// --- +/// +/// Example 2: op with transpose permutation map to vertical arm_sme.tile_store +/// (in-flight transpose): +/// +/// vector.transfer_write %vector, %source[%c0, %c0] +/// {permutation_map = affine_map<(d0, d1) -> (d1, d0)>, +/// in_bounds = [true, true]} : vector<[16]x[16]xi8>, memref +/// +/// is converted to: +/// +/// arm_sme.tile_store %vector, %source[%c0, %c0] layout +/// : memref, vector<[16]x[16]xi8> struct TransferWriteToArmSMELowering : public OpRewritePattern { using OpRewritePattern::OpRewritePattern; @@ -153,12 +171,35 @@ struct TransferWriteToArmSMELowering if (!arm_sme::isValidSMETileVectorType(vType)) return failure(); + assert(writeOp.getTransferRank() == 2 && + "expected a permutation_map with result dims of the same rank as " + "the vector type"); + if (!llvm::isa(writeOp.getSource().getType())) return failure(); + // Out-of-bounds dims are not supported. 
+ if (writeOp.hasOutOfBoundsDim()) + return rewriter.notifyMatchFailure(writeOp, + "not inbounds transfer write"); + + arm_sme::TileSliceLayout layout; + + AffineExpr d0, d1; + bindDims(writeOp.getContext(), d0, d1); + AffineMap map = writeOp.getPermutationMap(); + if (map.isIdentity()) + layout = arm_sme::TileSliceLayout::Horizontal; + else if (map == AffineMap::get(map.getNumDims(), 0, {d1, d0}, + writeOp.getContext())) + layout = arm_sme::TileSliceLayout::Vertical; + else + return rewriter.notifyMatchFailure(writeOp, + "unsupported permutation map"); + rewriter.replaceOpWithNewOp( writeOp, writeOp.getVector(), writeOp.getSource(), writeOp.getIndices(), - writeOp.getMask()); + writeOp.getMask(), layout); return success(); } }; diff --git a/mlir/test/Dialect/ArmSME/vector-ops-to-sme.mlir b/mlir/test/Dialect/ArmSME/vector-ops-to-sme.mlir index f9251edbe658b6..e1a8a9ff9bf10a 100644 --- a/mlir/test/Dialect/ArmSME/vector-ops-to-sme.mlir +++ b/mlir/test/Dialect/ArmSME/vector-ops-to-sme.mlir @@ -337,6 +337,37 @@ func.func @transfer_write_2d_with_mask_f64(%vector : vector<[2]x[2]xf64>, %dest // ----- +/// in-flight transpose via vertical store. + +// CHECK-LABEL: func.func @transfer_write_2d_transpose_i64( +// CHECK-SAME: %[[VECTOR:.*]]: vector<[2]x[2]xi64>, +// CHECK-SAME: %[[DEST:.*]]: memref) { +// CHECK: %[[C0:.*]] = arith.constant 0 : index +// CHECK: arm_sme.tile_store %[[VECTOR]], %[[DEST]]{{\[}}%[[C0]], %[[C0]]] layout : memref, vector<[2]x[2]xi64> +func.func @transfer_write_2d_transpose_i64(%vector : vector<[2]x[2]xi64>, %dest : memref) { + %c0 = arith.constant 0 : index + vector.transfer_write %vector, %dest[%c0, %c0] {permutation_map = affine_map<(d0, d1) -> (d1, d0)>, in_bounds = [true, true]} : vector<[2]x[2]xi64>, memref + return +} + +// ----- + +/// in-flight transpose via vertical store with mask. 
+ +// CHECK-LABEL: func.func @transfer_write_2d_transpose_with_mask_bf16( +// CHECK-SAME: %[[VECTOR:.*]]: vector<[8]x[8]xbf16>, +// CHECK-SAME: %[[DEST:.*]]: memref, +// CHECK-SAME: %[[MASK:.*]]: vector<[8]x[8]xi1>) { +// CHECK: %[[C0:.*]] = arith.constant 0 : index +// CHECK: arm_sme.tile_store %[[VECTOR]], %[[DEST]]{{\[}}%[[C0]], %[[C0]]], %[[MASK]] layout : memref, vector<[8]x[8]xbf16> +func.func @transfer_write_2d_transpose_with_mask_bf16(%vector : vector<[8]x[8]xbf16>, %dest : memref, %mask : vector<[8]x[8]xi1>) { + %c0 = arith.constant 0 : index + vector.transfer_write %vector, %dest[%c0, %c0], %mask {permutation_map = affine_map<(d0, d1) -> (d1, d0)>, in_bounds = [true, true]} : vector<[8]x[8]xbf16>, memref + return +} + +// ----- + // The following tests check the 'vector.transfer_write' -> 'arm_sme.intr.zero' // lowering only occurs for vector types of correct rank, shape, element size // and number of scalable dims. @@ -398,6 +429,17 @@ func.func @transfer_write_2d__fixed(%vector : vector<16x16xi8>, %dest : memref, %dest : memref) { + %c0 = arith.constant 0 : index + vector.transfer_write %vector, %dest[%c0, %c0] : vector<[4]x[4]xf32>, memref + return +} + //===----------------------------------------------------------------------===// // vector.broadcast //===----------------------------------------------------------------------===// diff --git a/mlir/test/Integration/Dialect/Vector/CPU/ArmSME/test-transfer-write-2d.mlir b/mlir/test/Integration/Dialect/Vector/CPU/ArmSME/test-transfer-write-2d.mlir new file mode 100644 index 00000000000000..1cb685d7bc27cd --- /dev/null +++ b/mlir/test/Integration/Dialect/Vector/CPU/ArmSME/test-transfer-write-2d.mlir @@ -0,0 +1,174 @@ +// DEFINE: %{entry_point} = entry +// DEFINE: %{compile} = mlir-opt %s \ +// DEFINE: -enable-arm-streaming="mode=locally enable-za" \ +// DEFINE: -convert-vector-to-arm-sme -convert-arm-sme-to-scf \ +// DEFINE: -convert-vector-to-llvm="enable-arm-sme" -cse -canonicalize \ +// DEFINE: 
-allocate-arm-sme-tiles -test-lower-to-llvm +// DEFINE: %{run} = %mcr_aarch64_cmd \ +// DEFINE: -march=aarch64 -mattr=+sve,+sme \ +// DEFINE: -e %{entry_point} -entry-point-result=void \ +// DEFINE: -shared-libs=%mlir_runner_utils,%mlir_c_runner_utils + +// RUN: %{compile} | %{run} | FileCheck %s + +llvm.func @printCString(!llvm.ptr) + +// TODO: replace with vector.print once #68695 lands. +func.func @print_str(%str: !llvm.ptr>) attributes { enable_arm_streaming_ignore } { + %c0 = llvm.mlir.constant(0 : index) : i64 + %str_bytes = llvm.getelementptr %str[%c0, %c0] + : (!llvm.ptr>, i64, i64) -> !llvm.ptr + llvm.call @printCString(%str_bytes) : (!llvm.ptr) -> () + return +} + +// Vector store. +func.func @transfer_write_2d(%A : memref, %base1: index, %base2: index) { + %c0 = arith.constant 0.0 : f32 + %zero = vector.splat %c0 : vector<[4]x[4]xf32> + vector.transfer_write %zero, %A[%base1, %base2] {in_bounds=[true, true]} : + vector<[4]x[4]xf32>, memref + return +} + +// Masked vector store. +func.func @transfer_write_2d_mask(%A : memref, %base1: index, %base2: index) { + %c0 = arith.constant 0.0 : f32 + %c2 = arith.constant 2 : index + %c3 = arith.constant 3 : index + %mask = vector.create_mask %c2, %c3 : vector<[4]x[4]xi1> + %zero = vector.splat %c0 : vector<[4]x[4]xf32> + vector.transfer_write %zero, %A[%base1, %base2], %mask {in_bounds=[true, true]} : + vector<[4]x[4]xf32>, memref + return +} + +// Vector store + transpose. +func.func @transfer_write_2d_transposed(%A : memref, %base1: index, %base2: index) { + %0 = vector.load %A[%base1, %base2] : memref, vector<[4]x[4]xf32> + vector.transfer_write %0, %A[%base1, %base2] {permutation_map = affine_map<(d0, d1) -> (d1, d0)>, in_bounds=[true, true]} : + vector<[4]x[4]xf32>, memref + return +} + +// Masked vector store + transpose. 
+func.func @transfer_write_2d_mask_transposed(%A : memref, %base1: index, %base2: index) { + %c2 = arith.constant 2 : index + %c4 = arith.constant 4 : index + %mask = vector.create_mask %c4, %c2 : vector<[4]x[4]xi1> + %0 = vector.load %A[%base1, %base2] : memref, vector<[4]x[4]xf32> + vector.transfer_write %0, %A[%base1, %base2], %mask {permutation_map = affine_map<(d0, d1) -> (d1, d0)>, in_bounds=[true, true]} : + vector<[4]x[4]xf32>, memref + return +} + +// Vector load + print. +func.func @load_and_print(%A : memref, %base1: index, %base2: index) { + %tile_begin_str = llvm.mlir.addressof @tile_begin : !llvm.ptr> + + %0 = vector.load %A[%base1, %base2] : memref, vector<[4]x[4]xf32> + + func.call @print_str(%tile_begin_str) : (!llvm.ptr>) -> () + vector.print %0: vector<[4]x[4]xf32> + + return +} + +// Allocate heap memory of size 'd0' x 'd1' and initialize. +// +// Example: +// +// initialize_memory(%c4, %c5) +// +// 0, 1, 2, 3, 4 +// 10, 11, 12, 13, 14 +// 20, 21, 22, 23, 24 +// 30, 31, 32, 33, 34 +// +// Returns dynamic memref. It's the caller's responsibility to free the returned +// memref. 
+func.func @initialize_memory(%d0 : index, %d1 : index) -> memref { + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c1_f32 = arith.constant 1.0 : f32 + %c10_f32 = arith.constant 10.0 : f32 + + %A = memref.alloc(%d0, %d1) : memref + + %init = arith.constant 0.0 : f32 + scf.for %i = %c0 to %d0 step %c1 iter_args(%val = %init) -> f32 { + scf.for %j = %c0 to %d1 step %c1 iter_args(%inner_val = %val) -> f32 { + memref.store %inner_val, %A[%i, %j] : memref + %inner_val_next = arith.addf %inner_val, %c1_f32 : f32 + scf.yield %inner_val_next : f32 + } + %val_next = arith.addf %val, %c10_f32 : f32 + scf.yield %val_next : f32 + } + + return %A : memref +} + +func.func @entry() { + %c0 = arith.constant 0 : index + %c2 = arith.constant 2 : index + %c4 = arith.constant 4 : index + + // Allocate enough memory to load a 32-bit tile plus a tiny bit more to test + // non-zero offsets while remaining inbounds. + %vscale = vector.vscale + %svl_s = arith.muli %c4, %vscale : index + %svl_s_plus_two = arith.addi %svl_s, %c2 : index + + // 1. Initialize memory + // CHECK-LABEL: TILE BEGIN: + // CHECK-NEXT: ( 0, 1, 2, 3 + // CHECK-NEXT: ( 10, 11, 12, 13 + // CHECK-NEXT: ( 20, 21, 22, 23 + // CHECK-NEXT: ( 30, 31, 32, 33 + %A = call @initialize_memory(%svl_s_plus_two, %svl_s_plus_two) : (index, index) -> memref + call @load_and_print(%A, %c0, %c0) : (memref, index, index) -> () + + // 2. Write 2-D vector of zeroes to 1. at offset [2, 2]. + // CHECK-LABEL: TILE BEGIN: + // CHECK-NEXT: ( 0, 1, 2, 3 + // CHECK-NEXT: ( 10, 11, 12, 13 + // CHECK-NEXT: ( 20, 21, 0, 0 + // CHECK-NEXT: ( 30, 31, 0, 0 + call @transfer_write_2d(%A, %c2, %c2) : (memref, index, index) -> () + call @load_and_print(%A, %c0, %c0) : (memref, index, index) -> () + + // 3. Write 2-D vector of zeroes to 2. but with mask (nrows=2, ncols=3). 
+ // CHECK-LABEL: TILE BEGIN: + // CHECK-NEXT: ( 0, 0, 0, 3 + // CHECK-NEXT: ( 0, 0, 0, 13 + // CHECK-NEXT: ( 20, 21, 0, 0 + // CHECK-NEXT: ( 30, 31, 0, 0 + call @transfer_write_2d_mask(%A, %c0, %c0) : (memref, index, index) -> () + call @load_and_print(%A, %c0, %c0) : (memref, index, index) -> () + + // 4. Reload 3. + store + transpose. + // CHECK-LABEL: TILE BEGIN: + // CHECK-NEXT: ( 0, 0, 20, 30 + // CHECK-NEXT: ( 0, 0, 21, 31 + // CHECK-NEXT: ( 0, 0, 0, 0 + // CHECK-NEXT: ( 3, 13, 0, 0 + call @transfer_write_2d_transposed(%A, %c0, %c0) : (memref, index, index) -> () + call @load_and_print(%A, %c0, %c0) : (memref, index, index) -> () + + // 5. Reload 4. + store + transpose but with mask (nrows=4, ncols=2). + // The mask applies after permutation + // CHECK-LABEL: TILE BEGIN: + // CHECK-NEXT: ( 0, 0, 20, 30 + // CHECK-NEXT: ( 0, 0, 21, 31 + // CHECK-NEXT: ( 20, 21, 0, 0 + // CHECK-NEXT: ( 30, 31, 0, 0 + call @transfer_write_2d_mask_transposed(%A, %c0, %c0) : (memref, index, index) -> () + call @load_and_print(%A, %c0, %c0) : (memref, index, index) -> () + + memref.dealloc %A : memref + + return +} + +llvm.mlir.global internal constant @tile_begin("TILE BEGIN: \0A\00")