Skip to content

Commit

Permalink
[XeVM] Add first integration tests (#425)
Browse files Browse the repository at this point in the history
Enable XeVM integration tests (load/store/dpas) using wrappers and `gc-gpu-runner`. To achieve this, GPU components and tools are decoupled from IMEX, and the `gpu-to-gpuocl` pass (part of `gc-gpu-runner`) is extended to support upstream GPU code.
  • Loading branch information
akroviakov authored Jan 21, 2025
1 parent 1fa5c26 commit d5e6a56
Show file tree
Hide file tree
Showing 70 changed files with 1,322 additions and 142 deletions.
12 changes: 10 additions & 2 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
################################################################################
# Copyright (C) 2024 Intel Corporation
# Copyright (C) 2025 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -45,7 +45,8 @@ option(GC_ENABLE_TEST_DNNL_API "Build the dnnl tests" ${GC_ENABLE_DNNL_API})
option(GC_ENABLE_TEST_MLIR "Build the mlir tests" ON)
option(GC_ENABLE_TOOLS "Build the tools" ON)
option(GC_ENABLE_OPT "Build gc-opt" ${GC_ENABLE_TOOLS})
option(GC_ENABLE_IMEX "Enable Intel® Extension for MLIR" OFF)
option(GC_ENABLE_IMEX "Enable Intel® Extension for MLIR (implicitly enables GPU compilation)" OFF)
option(GC_ENABLE_GPU "Enable GPU runtime and tools components" OFF)
# Python bindings are built by default (help string typo "Complier" fixed).
option(GC_ENABLE_BINDINGS_PYTHON "Enable Graph Compiler Python Binding" ON)
option(GC_DEV_LINK_LLVM_DYLIB "Link dynamic libraries of LLVM and MLIR. For developers only. Do not use it in packing the library." OFF)
option(GC_ENABLE_RUNTIME_NAIVE_BRGEMM "Use naive BRGEMM as runtime backend for debug purpose." OFF)
Expand All @@ -55,6 +56,10 @@ if(GC_ENABLE_LEGACY)
add_subdirectory(legacy/core)
endif()

# Normalize any truthy GC_ENABLE_GPU value (1, YES, TRUE, ...) to the literal ON.
# NOTE(review): presumably needed for the lit config, like the GC_ENABLE_IMEX
# block below — confirm.
if (GC_ENABLE_GPU)
set(GC_ENABLE_GPU ON)
endif()

if (GC_ENABLE_IMEX)
# normalize the value for lit config
set(GC_ENABLE_IMEX ON)
Expand All @@ -70,6 +75,9 @@ endif()
############################## Targets #########################################
# All common options, includes etc. are added to this interface target.
add_library(GcInterface INTERFACE)
# When GPU support is enabled, define GC_USE_GPU for every target that links
# against GcInterface. target_compile_definitions is used instead of passing a
# raw -D flag through target_compile_options so that CMake treats it as a
# preprocessor definition (portable flag spelling, de-duplicated).
if (GC_ENABLE_GPU)
  target_compile_definitions(GcInterface INTERFACE GC_USE_GPU)
endif()
target_compile_features(GcInterface INTERFACE cxx_std_17)
target_include_directories(GcInterface INTERFACE
$<BUILD_INTERFACE:${PROJECT_BINARY_DIR}/include>
Expand Down
3 changes: 2 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -76,5 +76,6 @@ Graph Compiler supports the following build-time options.
| GC_ENABLE_TEST | **ON**, OFF | Controls building the tests |
| GC_DEV_LINK_LLVM_DYLIB | ON, **OFF** | Controls dynamic linking of the LLVM/MLIR libraries, mainly for developers |
| GC_ENABLE_BINDINGS_PYTHON | **ON**, OFF | Controls building the Python API |
| GC_ENABLE_IMEX | ON, **OFF** | Whether to enable the GPU components |
| GC_ENABLE_IMEX | ON, **OFF** | Whether to enable the IMEX components |
| GC_ENABLE_GPU | ON, **OFF** | Whether to enable the GPU tools and components |

1 change: 1 addition & 0 deletions include/gc/Conversion/Passes.h
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
#define GC_CONVERSION_PASSES_H

#include "gc/Conversion/XeVMToLLVM/XeVMToLLVM.h"
#include "mlir/Pass/Pass.h"

namespace mlir {

Expand Down
70 changes: 70 additions & 0 deletions include/gc/Dialect/LLVMIR/XeVMOps.td
Original file line number Diff line number Diff line change
Expand Up @@ -221,6 +221,76 @@ def XeVM_BlockPrefetch2dOp : XeVM_Op<"blockprefetch2d">,
let hasVerifier = 1;
}

// Element types allowed in matrix vectors: 8/16/32-bit integers of any
// signedness, plus f32, f16 and bf16. Used to constrain the operands and the
// result of `xevm.dpas`.
def XeVM_MatrixElemType : AnyTypeOf<[AnyI8, AnyI16, AnyI32, F32, F16, BF16]>;

/// Enum attribute of the different precision types.
///
/// 32-bit integer enum generated as `PrecisionType` in the `::mlir::xevm`
/// C++ namespace. It is used by `xevm.dpas` for the `pa` / `pb` attributes
/// (precision of matrices A and B).
///
/// Each case is (C++ symbol, integer value, textual keyword). Note that the
/// signed-integer cases S8/S4/S2 are spelled "i8"/"i4"/"i2" in the textual
/// form, while the unsigned cases use a "u" prefix.
def XeVM_PrecisionTypeAttr : I32EnumAttr<"PrecisionType",
"XeVM precision type",
[
I32EnumAttrCase<"UNUSED", 0, "unused">,
I32EnumAttrCase<"U8", 1, "u8">,
I32EnumAttrCase<"U4", 2, "u4">,
I32EnumAttrCase<"U2", 3, "u2">,
I32EnumAttrCase<"S8", 4, "i8">,
I32EnumAttrCase<"S4", 5, "i4">,
I32EnumAttrCase<"S2", 6, "i2">,
I32EnumAttrCase<"BF8", 7, "bf8">,
I32EnumAttrCase<"TF32", 8, "tf32">,
I32EnumAttrCase<"BF16", 9, "bf16">,
I32EnumAttrCase<"FP16", 10, "f16">
]> {
let cppNamespace = "::mlir::xevm";
}

// `xevm.dpas`: matrix multiply-accumulate, D = C + A x B.
//
// Operands $c, $a and $b are rank-1 fixed-length vectors whose element type is
// one of XeVM_MatrixElemType; the result $d is a fixed-length vector over the
// same set of element types. $pa / $pb carry the precision of A and B as
// XeVM_PrecisionTypeAttr values and $rc is a 32-bit integer attribute holding
// the repeat count.
def XeVM_DPASOp : XeVM_Op<"dpas">,
Results<(outs FixedVectorOf<[XeVM_MatrixElemType]>:$d)>,
Arguments<(ins
FixedVectorOfRankAndType<[1], [XeVM_MatrixElemType]>:$c,
FixedVectorOfRankAndType<[1], [XeVM_MatrixElemType]>:$a,
FixedVectorOfRankAndType<[1], [XeVM_MatrixElemType]>:$b,
XeVM_PrecisionTypeAttr:$pa,
XeVM_PrecisionTypeAttr:$pb,
I32Attr:$rc
)> {

let summary = "Matrix multiply-add";

let description = [{
The `xevm.dpas` operation is a matrix multiplication plus accumulation:

D = C + A x B

where the A, B, C input matrices and the result D have shapes:
D : MxN
C : MxN
A : MxK
B : KxN

Shape restrictions:
M : must be 1, 2, 4, or 8
N : fixed execution size, must be 16
K : systolic_depth * OPS_PER_CHAN
OPS_PER_CHAN
1 : for TF32
2 : for 16-bit precision (BF, HF)
4 : for 8-bit precision (FP8, UB, B)
8 : for less-than 8 bit precision (U4/S4, U2/S2).

If systolic_depth is 8, K would be 8, 16, 32, or 64 (based on OPS_PER_CHAN).
$a, $b, $c, $d - matrix A, B, C, D, respectively
$pa, $pb - precision of matrix A and B respectively
$rc - repeat count

Further restrictions as well as more details can be found here:
https://registry.khronos.org/OpenCL/extensions/intel/cl_intel_subgroup_matrix_multiply_accumulate.html
}];

// Custom syntax: operands, then `{pa = ..., pb = ..., rc = ...}`, then the
// remaining attributes and the functional type of operands/results.
let assemblyFormat = [{
operands ` ` `{` `pa` `=` $pa `,` `pb` `=` $pb `,` `rc` `=` $rc `}` attr-dict `:` functional-type(operands, results)
}];

// NOTE(review): the custom verifier is disabled, so the shape restrictions
// listed in the description are presumably not checked by this op's verifier
// — confirm before relying on them.
// let hasVerifier = 1;
}

def XeVM_TargetAttr : XeVM_Attr<"XeVMTarget", "target"> {
let description = [{
Expand Down
2 changes: 1 addition & 1 deletion include/gc/ExecutionEngine/Driver/Driver.h
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ namespace mlir {
class DialectRegistry;
namespace gc {

const DialectRegistry &initCompilerAndGetDialects();
DialectRegistry &initCompilerAndGetDialects();

// the pointers to XXXMemRefType
using GeneralMemrefPtr = void *;
Expand Down
5 changes: 4 additions & 1 deletion include/gc/Transforms/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -1,8 +1,11 @@
if(GC_ENABLE_DNNL_API)
list(APPEND TABLEGEN_MACROS -DGC_HAS_ONEDNN_DIALECT)
endif()
if(GC_ENABLE_GPU)
list(APPEND TABLEGEN_MACROS -DGC_USE_GPU)
endif()
if(GC_ENABLE_IMEX)
list(APPEND TABLEGEN_MACROS -DGC_USE_IMEX)
list(APPEND TABLEGEN_MACROS -DGC_USE_IMEX -DGC_USE_GPU)
endif()

set(LLVM_TARGET_DEFINITIONS Passes.td)
Expand Down
3 changes: 2 additions & 1 deletion include/gc/Transforms/Passes.h
Original file line number Diff line number Diff line change
Expand Up @@ -115,7 +115,6 @@ std::unique_ptr<Pass> createMergeAllocPass();
void populateFrontendPasses(mlir::OpPassManager &);
void populateCPUPipeline(mlir::OpPassManager &);

#ifdef GC_USE_IMEX
struct GPUPipelineOptions : PassPipelineOptions<GPUPipelineOptions> {
Option<bool> isUsmArgs{
*this, "is-usm-args",
Expand All @@ -136,6 +135,8 @@ struct GPUPipelineOptions : PassPipelineOptions<GPUPipelineOptions> {
llvm::cl::init(false)};
};
void populateGPUPipeline(mlir::OpPassManager &, const GPUPipelineOptions &);
#ifdef GC_USE_IMEX
void populateIMEXPipeline(mlir::OpPassManager &, const GPUPipelineOptions &);
#endif

#define GEN_PASS_DECL
Expand Down
27 changes: 15 additions & 12 deletions include/gc/Transforms/Passes.td
Original file line number Diff line number Diff line change
Expand Up @@ -93,6 +93,21 @@ def LinalgToXeGPU : Pass<"linalg-to-xegpu", "func::FuncOp"> {
"DPAS register block sizes MxNxK">,
];
}
#endif

#ifdef GC_USE_GPU
// Module-level pass registered as `gpu-to-gpuocl`. It is only defined when
// GC_USE_GPU is set (see the enclosing #ifdef).
// The single option `call-finish` (C++ member `callFinish`, default false)
// requests a finish() call after each kernel launch.
def GpuToGpuOcl : Pass<"gpu-to-gpuocl", "ModuleOp"> {
let summary = "Convert the GPU operations to GpuOclRuntime calls.";
let description = [{
Convert the gpu alloc, dealloc, memcpy and launch operations to GpuOclRuntime calls.
}];
let options = [
Option<"callFinish", "call-finish", "bool",
/*default=*/"false",
"Call finish() after each kernel launch.">
];
}
#endif // GC_USE_GPU

def AddContextArg : Pass<"add-ctx-arg", "func::FuncOp"> {
let summary = "Add a context argument.";
Expand All @@ -109,17 +124,6 @@ def AllocsToSLM : Pass<"allocs-to-slm", "func::FuncOp"> {
];
}

def GpuToGpuOcl : Pass<"gpu-to-gpuocl", "ModuleOp"> {
let summary = "Convert the GPU operations to GpuOclRuntime calls.";
let description = [{
Convert the gpu alloc, dealloc, memcpy and launch operations to GpuOclRuntime calls.
}];
let options = [
Option<"callFinish", "call-finish", "bool",
/*default=*/"false",
"Call finish() after each kernel launch.">
];
}

def GpuTilingAndFusion : Pass<"gpu-tiling", "func::FuncOp"> {
let summary = "GPU tiling and fusion path.";
Expand Down Expand Up @@ -185,7 +189,6 @@ def GpuXeVMAttachTarget: Pass<"xevm-attach-target", ""> {
];
}

#endif // GC_USE_IMEX

def IterativeTilingAndFusion : Pass<"iterative-tiling-and-fusion",
"func::FuncOp"> {
Expand Down
4 changes: 1 addition & 3 deletions lib/gc/CAPI/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -4,9 +4,7 @@ set(GC_ALL_LIBS
GcAnalysis
MLIRCPURuntimeTransforms)

if(GC_ENABLE_IMEX)
list(APPEND GC_ALL_LIBS GcGpuPasses)
endif()
list(APPEND GC_ALL_LIBS GcGpuPasses)

add_mlir_public_c_api_library(GcCAPI
Dialects.cpp
Expand Down
Loading

0 comments on commit d5e6a56

Please sign in to comment.