[GPU] Add MLP test and linalg.fill lowering in 'linalg-to-xegpu' #220

Merged · 28 commits · Sep 11, 2024
Changes from all commits
2 changes: 1 addition & 1 deletion .github/workflows/build-llvm.yml
@@ -32,7 +32,7 @@ jobs:

- uses: actions/checkout@v4
with:
-repository: Menooker/mlir-extensions
+repository: intel/mlir-extensions
ref: ${{ env.IMEX_HASH }}
path: mlir-extensions
if: ${{ matrix.build-type == 'IMEX' }}
2 changes: 1 addition & 1 deletion cmake/imex-version.txt
@@ -1 +1 @@
-ee459724294e165e360e1de72ad3b217eb9b6206
+6c2e414a953b9a118bce6adac21cf9d42630e674
2 changes: 1 addition & 1 deletion cmake/imex.cmake
@@ -14,7 +14,7 @@ if (NOT DEFINED IMEX_INCLUDES)

# TODO: Change to main https://github.com/intel/mlir-extensions when all the
# required functionality is merged.
-gc_fetch_content(imex "${IMEX_HASH}" https://github.com/Menooker/mlir-extensions
+gc_fetch_content(imex "${IMEX_HASH}" https://github.com/intel/mlir-extensions
SET IMEX_CHECK_LLVM_VERSION=ON IMEX_ENABLE_L0_RUNTIME=0
)

97 changes: 97 additions & 0 deletions lib/gc/Transforms/GPU/LinalgToXeGPU.cpp
@@ -1394,6 +1394,92 @@ struct ConvertNamedEltwiseToXeGPU : public OpRewritePattern<LinalgOpTy> {
LinalgToXeGPUOptions options;
};

// Create XeGPU kernel out of memory fill operation.
LogicalResult createMemoryFillKernel(linalg::LinalgOp linalgOp,
PatternRewriter &rewriter) {
Location loc = linalgOp.getLoc();
auto ctx = linalgOp.getContext();

auto scalar = linalgOp.getDpsInputs()[0];
auto output = linalgOp.getDpsInits()[0];
auto outputType = cast<ShapedType>(output.getType());
auto outputShape = outputType.getShape();

// Extract SIMD-sized sub-tiles: each sub-tile spans the full row width and
// as many rows as fit within the maximum number of SIMD elements.
int maxSizeSIMD = 256;
int64_t subTileCols = outputShape[1];
int64_t subTileRows = std::min(outputShape[0], maxSizeSIMD / subTileCols);

// Output descriptors for later stores.
SmallVector<Value> outputTiles = createDescriptorTiles(
rewriter, loc, output, outputShape, {0, 0}, {subTileRows, subTileCols});

SmallVector<Value> results;
for (size_t i = 0; i < outputTiles.size(); i++) {
// Broadcast the fill value and reshape it to match the sub-tile shape.
auto flatType = VectorType::get({subTileRows * subTileCols},
outputType.getElementType());
auto tileType = VectorType::get({subTileRows, subTileCols},
outputType.getElementType());
Value vec = rewriter.create<vector::BroadcastOp>(loc, flatType, scalar);
Value res = rewriter.create<vector::ShapeCastOp>(loc, tileType, vec);

if (!res)
return failure();

results.push_back(res);
}

// Store results.
auto writeCacheHint =
xegpu::CachePolicyAttr::get(ctx, xegpu::CachePolicy::WRITE_BACK);
for (size_t i = 0; i < outputTiles.size(); i++) {
rewriter.create<xegpu::StoreNdOp>(loc, results[i], outputTiles[i],
/*l1_hint=*/writeCacheHint,
/*l2_hint=*/writeCacheHint,
/*l3_hint=*/writeCacheHint);
}

rewriter.eraseOp(linalgOp);

return success();
}
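
// Illustrative sketch (not part of the patch): for a fill of a 32x64xf16
// memref, the logic above picks subTileCols = 64 and
// subTileRows = min(32, 256 / 64) = 4, so each of the eight sub-tiles is
// expected to lower to IR roughly of this shape (exact attribute syntax may
// differ across XeGPU dialect versions):
//
//   %desc = xegpu.create_nd_tdesc %out[0, 0]
//       : memref<32x64xf16> -> !xegpu.tensor_desc<4x64xf16>
//   %flat = vector.broadcast %cst : f16 to vector<256xf16>
//   %vec  = vector.shape_cast %flat : vector<256xf16> to vector<4x64xf16>
//   xegpu.store_nd %vec, %desc {l1_hint = #xegpu.cache_hint<write_back>,
//       l2_hint = #xegpu.cache_hint<write_back>,
//       l3_hint = #xegpu.cache_hint<write_back>}
//       : vector<4x64xf16>, !xegpu.tensor_desc<4x64xf16>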

// Convert a named fill operation to an XeGPU kernel.
template <typename LinalgOpTy>
struct ConvertMemoryFillToXeGPU : public OpRewritePattern<LinalgOpTy> {
using OpRewritePattern<LinalgOpTy>::OpRewritePattern;

ConvertMemoryFillToXeGPU(MLIRContext *ctx, LinalgToXeGPUOptions options)
: OpRewritePattern<LinalgOpTy>(ctx), options(options) {}

LogicalResult matchAndRewrite(LinalgOpTy linalgOp,
PatternRewriter &rewriter) const override {
if (!linalgOp.hasPureBufferSemantics()) {
return rewriter.notifyMatchFailure(
linalgOp, "Linalg eltwise to GPU expects memref type");
}
if (linalgOp.hasDynamicShape()) {
return rewriter.notifyMatchFailure(
linalgOp, "Expect static shape when mapping to GPU");
}
auto isInputValid =
success(linalgOp.isScalar(linalgOp.getDpsInputOperand(0)));
if (failed(isInputValid))
return isInputValid;

auto isOutputValid =
isValidMemrefOperand(linalgOp, linalgOp.getDpsInits()[0], rewriter);
if (failed(isOutputValid))
return isOutputValid;

return createMemoryFillKernel(linalgOp, rewriter);
}

private:
LinalgToXeGPUOptions options;
};

// TODO: Finalize BRGEMM support and register the pattern.
void populateLinalgGemmToXeGPUPatterns(RewritePatternSet &patterns,
LinalgToXeGPUOptions options) {
@@ -1418,6 +1504,12 @@ void populateLinalgEltwiseToXeGPUPatterns(RewritePatternSet &patterns,
options);
}

void populateLinalgMemoryFillToXeGPUPatterns(RewritePatternSet &patterns,
LinalgToXeGPUOptions options) {
patterns.add<ConvertMemoryFillToXeGPU<linalg::FillOp>>(patterns.getContext(),
options);
}

struct LinalgToXeGPU : public gc::impl::LinalgToXeGPUBase<LinalgToXeGPU> {
using LinalgToXeGPUBase::LinalgToXeGPUBase;

@@ -1429,6 +1521,11 @@ struct LinalgToXeGPU : public gc::impl::LinalgToXeGPUBase<LinalgToXeGPU> {
populateLinalgGemmToXeGPUPatterns(gemmPatterns, options);
(void)applyPatternsAndFoldGreedily(getOperation(), std::move(gemmPatterns));

// Convert memory fill ops.
RewritePatternSet fillPatterns(&getContext());
populateLinalgMemoryFillToXeGPUPatterns(fillPatterns, options);
(void)applyPatternsAndFoldGreedily(getOperation(), std::move(fillPatterns));

// Convert other remaining ops.
RewritePatternSet patterns(&getContext());
populateLinalgEltwiseToXeGPUPatterns(patterns, options);
2 changes: 1 addition & 1 deletion scripts/compile.sh
@@ -120,7 +120,7 @@ build_llvm() {
local mlir_ext_dir="$EXTERNALS_DIR/mlir-extensions"
if ! [ -d "$mlir_ext_dir" ]; then
cd "$EXTERNALS_DIR"
-git clone https://github.com/Menooker/mlir-extensions.git
+git clone https://github.com/intel/mlir-extensions.git
cd "$mlir_ext_dir"
else
cd "$mlir_ext_dir"
57 changes: 57 additions & 0 deletions test/mlir/test/gc/gpu-runner/XeGPU/f16_mlp_32x4096x4096x4096.mlir
@@ -0,0 +1,57 @@
// RUN: gc-opt %s --pass-pipeline='builtin.module(func.func(iterative-tiling-and-fusion{use-cost-model=0 default-tile-size=matmul:{16,16}}),eliminate-empty-tensors,empty-tensor-to-alloc-tensor,one-shot-bufferize{bufferize-function-boundaries=1 function-boundary-type-conversion=identity-layout-map},drop-equivalent-buffer-results,func.func(finalizing-bufferize),canonicalize,cse,drop-equivalent-buffer-results,expand-realloc,canonicalize,ownership-based-buffer-deallocation,canonicalize,buffer-deallocation-simplification,bufferization-lower-deallocations,cse,canonicalize,convert-bufferization-to-memref,func.func(scf-forall-to-parallel),func.func(linalg-to-xegpu{stages=1 dpas-tile=8,16,16 k-tile=16}),xegpu-fold-alias-ops,func.func(convert-linalg-to-parallel-loops),func.func(gpu-map-parallel-loops),func.func(convert-parallel-loops-to-gpu),func.func(insert-gpu-allocs),gpu-kernel-outlining,canonicalize,set-spirv-capabilities{client-api=opencl},gpu.module(set-spirv-abi-attrs{client-api=opencl}),lower-affine,imex-vector-linearize,gpu.module(convert-xegpu-to-vc),reconcile-unrealized-casts,bf16-to-gpu,gpu.module(convert-func-to-spirv),gpu.module(convert-vector-to-spirv),imex-convert-gpu-to-spirv,spirv.module(spirv-lower-abi-attrs,spirv-update-vce),func.func(llvm-request-c-wrappers),serialize-spirv,convert-vector-to-scf,convert-gpu-to-gpux,convert-scf-to-cf,convert-cf-to-llvm,convert-vector-to-llvm,convert-index-to-llvm,convert-arith-to-llvm,convert-func-to-llvm,convert-math-to-llvm,convert-gpux-to-llvm,convert-index-to-llvm,expand-strided-metadata,lower-affine,finalize-memref-to-llvm,reconcile-unrealized-casts)' \
Contributor:
Does this test pass on your machine? For me it fails with the following error:

incorrect lowering for 'linalg.fill'?
/home/jovyan/graph-compiler/test/mlir/test/gc/gpu-runner/XeGPU/f16_mlp_32x4096x4096x4096.mlir:14:10: error: 'func.call' op operand type mismatch: expected operand type 'vector<16x16xf16>', but provided 'vector<256xf16>' for operand number 9
    %4 = linalg.add ins(%arg2, %2 : tensor<32x4096xf16>, tensor<32x4096xf16>) 
         ^
/home/jovyan/graph-compiler/test/mlir/test/gc/gpu-runner/XeGPU/f16_mlp_32x4096x4096x4096.mlir:14:10: note: see current operation: "func.call"(%275, %276, %277, %278, %279, %280, %281, %282, %274, %247) <{callee = @llvm.genx.raw.sends2.noresult.i1.v8i32.v128i32}> : (i8, i8, i1, i8, i8, i8, i32, i32, vector<8xi32>, vector<256xf16>) -> ()
/home/jovyan/graph-compiler/test/mlir/test/gc/gpu-runner/XeGPU/f16_mlp_32x4096x4096x4096.mlir:26:11: error: 'func.call' op operand type mismatch: expected operand type 'vector<16x16xf16>', but provided 'vector<256xf16>' for operand number 9
    %12 = linalg.add ins(%arg4, %10 : tensor<32x4096xf16>, tensor<32x4096xf16>) 
          ^
/home/jovyan/graph-compiler/test/mlir/test/gc/gpu-runner/XeGPU/f16_mlp_32x4096x4096x4096.mlir:26:11: note: see current operation: "func.call"(%275, %276, %277, %278, %279, %280, %281, %282, %274, %247) <{callee = @llvm.genx.raw.sends2.noresult.i1.v8i32.v128i32}> : (i8, i8, i1, i8, i8, i8, i32, i32, vector<8xi32>, vector<256xf16>) -> ()

If I remove all linalg.fill ops from the test, it then fails with another error, caused by double deallocations added by the insert-gpu-allocs pass. This can be fixed with this patch to IMEX: Menooker/mlir-extensions#3 (have you applied this patch to your IMEX build? If so, we should probably merge it and update the IMEX version).

free() problem
0.      Program arguments: /home/jovyan/graph-compiler/build/bin/gc-cpu-runner -e main --entry-point-result=void --shared-libs=/home/jovyan/llvm/llvm-gc-master-patches-install/lib/libmlir_runner_utils.so,/home/jovyan/llvm/llvm-gc-master-patches-install/lib/libmlir_c_runner_utils.so,/home/jovyan/graph-compiler/build/lib/libGcOpenclRuntime.so
 #0 0x0000562ee351c2a0 llvm::sys::PrintStackTrace(llvm::raw_ostream&, int) (/home/jovyan/graph-compiler/build/bin/gc-cpu-runner+0x3b82a0)
 #1 0x0000562ee35193af llvm::sys::RunSignalHandlers() (/home/jovyan/graph-compiler/build/bin/gc-cpu-runner+0x3b53af)
 #2 0x0000562ee3519505 SignalHandler(int) Signals.cpp:0:0
 #3 0x00007f2d1e2716ac (/usr/lib/x86_64-linux-gnu/intel-opencl/libigdrcl.so+0x5436ac)
 #4 0x00007f2d544cf520 (/lib/x86_64-linux-gnu/libc.so.6+0x42520)
 #5 0x00007f2d545323fe __libc_free (/lib/x86_64-linux-gnu/libc.so.6+0xa53fe)
 #6 0x00007f2d54a097aa 
 #7 0x00007f2d54a0a09b 
 #8 0x00007f2d54a0a441 
 #9 0x0000562ee3ad5a0c compileAndExecute((anonymous namespace)::Options&, mlir::Operation*, llvm::StringRef, (anonymous namespace)::CompileAndExecuteConfig, void**, std::unique_ptr<llvm::TargetMachine, std::default_delete<llvm::TargetMachine>>) JitRunner.cpp:0:0
#10 0x0000562ee3ad5ead compileAndExecuteVoidFunction((anonymous namespace)::Options&, mlir::Operation*, llvm::StringRef, (anonymous namespace)::CompileAndExecuteConfig, std::unique_ptr<llvm::TargetMachine, std::default_delete<llvm::TargetMachine>>) JitRunner.cpp:0:0
#11 0x0000562ee3ad7473 mlir::JitRunnerMain(int, char**, mlir::DialectRegistry const&, mlir::JitRunnerConfig) (/home/jovyan/graph-compiler/build/bin/gc-cpu-runner+0x973473)
#12 0x0000562ee34546c0 std::vector<std::unique_ptr<mlir::DialectExtensionBase, std::default_delete<mlir::DialectExtensionBase>>, std::allocator<std::unique_ptr<mlir::DialectExtensionBase, std::default_delete<mlir::DialectExtensionBase>>>>::~vector() /usr/include/c++/11/bits/stl_vector.h:680:15
#13 0x0000562ee34546c0 mlir::DialectRegistry::~DialectRegistry() /home/jovyan/llvm/llvm-gc-master-patches-install/include/mlir/IR/DialectRegistry.h:139:7
#14 0x0000562ee34546c0 main /home/jovyan/graph-compiler/src/gc-cpu-runner/gc-cpu-runner.cpp:46:1
#15 0x00007f2d544b6d90 (/lib/x86_64-linux-gnu/libc.so.6+0x29d90)
#16 0x00007f2d544b6e40 __libc_start_main (/lib/x86_64-linux-gnu/libc.so.6+0x29e40)
#17 0x0000562ee3505195 _start (/home/jovyan/graph-compiler/build/bin/gc-cpu-runner+0x3a1195)

After removing linalg.fill and applying the patch above to IMEX, the test passes for me.

// RUN: | gc-cpu-runner -e main --entry-point-result=void \
// RUN: --shared-libs=%mlir_runner_utils,%mlir_c_runner_utils,%opencl_runtime | FileCheck %s

module {
func.func @linalg_mlp(%arg0: tensor<32x4096xf16>, %arg1: tensor<4096x4096xf16>, %arg2 : tensor<32x4096xf16>,
%arg3: tensor<4096x4096xf16>, %arg4 : tensor<32x4096xf16>) {
%cst = arith.constant 0.000000e+00 : f16
%0 = tensor.empty() : tensor<32x4096xf16>
%1 = linalg.fill ins(%cst : f16) outs(%0 : tensor<32x4096xf16>) -> tensor<32x4096xf16>
%2 = linalg.matmul ins(%arg0, %arg1 : tensor<32x4096xf16>, tensor<4096x4096xf16>)
outs(%1 : tensor<32x4096xf16>) -> (tensor<32x4096xf16>)
%3 = tensor.empty() : tensor<32x4096xf16>
%4 = linalg.add ins(%arg2, %2 : tensor<32x4096xf16>, tensor<32x4096xf16>)
outs(%3 : tensor<32x4096xf16>) -> tensor<32x4096xf16>
%5 = arith.constant dense<0.000000e+00> : tensor<32x4096xf16>
%6 = tensor.empty() : tensor<32x4096xf16>
%7 = linalg.max ins(%5, %4 : tensor<32x4096xf16>, tensor<32x4096xf16>)
outs(%6 : tensor<32x4096xf16>) -> tensor<32x4096xf16>

%8 = tensor.empty() : tensor<32x4096xf16>
Contributor:

do you use it anywhere?

Suggested change:
-%8 = tensor.empty() : tensor<32x4096xf16>

Contributor Author:

fixed

%9 = linalg.fill ins(%cst : f16) outs(%8 : tensor<32x4096xf16>) -> tensor<32x4096xf16>
%10 = linalg.matmul ins(%7, %arg3 : tensor<32x4096xf16>, tensor<4096x4096xf16>)
outs(%9 : tensor<32x4096xf16>) -> (tensor<32x4096xf16>)
%11 = tensor.empty() : tensor<32x4096xf16>
%12 = linalg.add ins(%arg4, %10 : tensor<32x4096xf16>, tensor<32x4096xf16>)
outs(%11 : tensor<32x4096xf16>) -> tensor<32x4096xf16>
%13 = arith.constant dense<0.000000e+00> : tensor<32x4096xf16>
%14 = tensor.empty() : tensor<32x4096xf16>
%15 = linalg.max ins(%13, %12 : tensor<32x4096xf16>, tensor<32x4096xf16>)
outs(%14 : tensor<32x4096xf16>) -> tensor<32x4096xf16>

%slice = tensor.extract_slice %15[0, 0][32, 1][1, 1] : tensor<32x4096xf16> to tensor<32xf16>
%cast = tensor.cast %slice : tensor<32xf16> to tensor<*xf16>
call @printMemrefF16(%cast) : (tensor<*xf16>) -> ()

return
}

func.func @main() {
%0 = arith.constant dense<0.01> : tensor<32x4096xf16>
%1 = arith.constant dense<0.01> : tensor<4096x4096xf16>
%2 = arith.constant dense<0.02> : tensor<32x4096xf16>
%3 = arith.constant dense<0.01> : tensor<4096x4096xf16>
%4 = arith.constant dense<0.02> : tensor<32x4096xf16>

func.call @linalg_mlp(%0, %1, %2, %3, %4) : (tensor<32x4096xf16>, tensor<4096x4096xf16>, tensor<32x4096xf16>,
tensor<4096x4096xf16>, tensor<32x4096xf16>) -> ()
return
}

func.func private @printMemrefF16(%ptr : tensor<*xf16>) attributes { llvm.emit_c_interface }
}

// CHECK: Unranked Memref base@{{(0x)?[-0-9a-fA-F]*}}
// CHECK-SAME: rank = 1 offset = 0 sizes = [32] strides = [4096] data =
// CHECK-NEXT: [17.625, 17.625, 17.625, 17.625, 17.625, 17.625, 17.625, 17.625, 17.625, 17.625, 17.625, 17.625, 17.625, 17.625, 17.625, 17.625, 17.625, 17.625, 17.625, 17.625, 17.625, 17.625, 17.625, 17.625, 17.625, 17.625, 17.625, 17.625, 17.625, 17.625, 17.625, 17.625]
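
// A rough sanity check of the CHECK value (assuming f32 accumulation in the
// matmuls and f16 storage of the results): each first-layer element is
// 4096 * 0.01 * 0.01 + 0.02 = 0.4296, unchanged by the max with zero (ReLU);
// each second-layer element is then 0.4296 * 0.01 * 4096 + 0.02 ~= 17.62,
// which lands on 17.625 once the upward f16 rounding of the 0.01 constants
// (~0.010002) is taken into account.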