diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPU.h b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPU.h
index 7aaa4ecc7ee77a..87aabdc015fea5 100644
--- a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPU.h
+++ b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPU.h
@@ -9,7 +9,12 @@
 #ifndef MLIR_DIALECT_XEGPU_IR_XEGPU_H
 #define MLIR_DIALECT_XEGPU_IR_XEGPU_H
 
-#include
+#include "mlir/Bytecode/BytecodeOpInterface.h"
+#include "mlir/IR/BuiltinTypes.h"
+#include "mlir/IR/Dialect.h"
+#include "mlir/Interfaces/ShapedOpInterfaces.h"
+#include "mlir/Interfaces/SideEffectInterfaces.h"
+#include "mlir/Interfaces/ViewLikeInterface.h"
 
 namespace mlir {
 namespace xegpu {
diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td
index bb325c272e3324..cd38549f1ccf43 100644
--- a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td
+++ b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td
@@ -10,6 +10,7 @@
 #define MLIR_DIALECT_XEGPU_IR_XEGPUATTRS_TD
 
 include "mlir/Dialect/XeGPU/IR/XeGPUDialect.td"
+include "mlir/IR/EnumAttr.td"
 
 class XeGPUAttr<string name, string attrMnemonic, list<Trait> traits = [],
                 string baseCppClass = "::mlir::Attribute">
@@ -17,4 +18,64 @@ class XeGPUAttr<string name, string attrMnemonic, list<Trait> traits = [],
   let mnemonic = attrMnemonic;
 }
+def XeGPU_TensorDescAttr: XeGPUAttr<"TensorDesc", "tdesc_attr"> {
+  let parameters = (ins
+    OptionalParameter<"MemoryScopeAttr">: $memory_scope,
+    OptionalParameter<"IntegerAttr", "1">: $array_length,
+    OptionalParameter<"BoolAttr", "true">: $boundary_check
+  );
+
+  let builders = [
+    AttrBuilder<(ins
+      CArg<"xegpu::MemoryScope", "xegpu::MemoryScope::Global">:$memory_scope,
+      CArg<"int", "1">:$array_length,
+      CArg<"bool", "true">: $boundary_check
+    )>
+  ];
+
+  let assemblyFormat = "`<` struct(params) `>`";
+}
+
+//===----------------------------------------------------------------------===//
+// XeGPU Memory Scope Enums.
+//===----------------------------------------------------------------------===//
+def XeGPU_MemoryScopeGlobal: I32EnumAttrCase<"Global", 0, "global">;
+def XeGPU_MemoryScopeShared: I32EnumAttrCase<"SLM", 1, "slm">;
+def XeGPU_MemoryScope: I32EnumAttr<"MemoryScope",
+      "The address space of the memory the tensor descriptor is created for",
+      [XeGPU_MemoryScopeGlobal, XeGPU_MemoryScopeShared]> {
+  let genSpecializedAttr = 0;
+  let cppNamespace = "::mlir::xegpu";
+}
+
+def XeGPU_MemoryScopeAttr:
+  EnumAttr<XeGPU_Dialect, XeGPU_MemoryScope, "memory_scope"> {
+    let assemblyFormat = "$value";
+}
+
+//===----------------------------------------------------------------------===//
+// XeGPU Cache Enums.
+//===----------------------------------------------------------------------===//
+def XeGPU_CachePolicyCached: I32EnumAttrCase<"CACHED", 0, "cached">; // valid for read and write
+def XeGPU_CachePolicyUncached: I32EnumAttrCase<"UNCACHED", 1, "uncached">; // valid for read and write
+def XeGPU_CachePolicyStreaming: I32EnumAttrCase<"STREAMING", 2, "streaming">; // valid for read only
+def XeGPU_CachePolicyInvalid: I32EnumAttrCase<"READ_INVALIDATE", 3, "read_invalidate">; // valid for read only
+def XeGPU_CachePolicyWriteBack: I32EnumAttrCase<"WRITE_BACK", 4, "write_back">; // valid for write only
+def XeGPU_CachePolicyWriteThrough: I32EnumAttrCase<"WRITE_THROUGH", 5, "write_through">; // valid for write only
+
+def XeGPU_CachePolicyEnums : I32EnumAttr<"CachePolicy", "Cache policy",
+  [XeGPU_CachePolicyCached, XeGPU_CachePolicyUncached,
+   XeGPU_CachePolicyStreaming, XeGPU_CachePolicyInvalid,
+   XeGPU_CachePolicyWriteBack, XeGPU_CachePolicyWriteThrough]> {
+  let genSpecializedAttr = 0;
+  let cppNamespace = "::mlir::xegpu";
+}
+
+def XeGPU_CacheHintAttr
+  : EnumAttr<XeGPU_Dialect, XeGPU_CachePolicyEnums, "cache_hint"> {
+    let assemblyFormat = "`<` $value `>`";
+}
+
+
+
 #endif // MLIR_DIALECT_XEGPU_IR_XEGPUATTRS_TD
diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUDialect.td b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUDialect.td
index 3851275ad30a0a..c2f09319c790e0 100644
--- a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUDialect.td
+++ b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUDialect.td
@@ -23,8 +23,8 @@ def XeGPU_Dialect : Dialect {
       the lower-level GPU compiler.
     }];
 
-  // let useDefaultTypePrinterParser = true;
-  // let useDefaultAttributePrinterParser = true;
+  let useDefaultTypePrinterParser = true;
+  let useDefaultAttributePrinterParser = true;
 }
 
 #endif // MLIR_DIALECT_XEGPU_IR_XEGPUDIALECT_TD
diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td
index 5825ef9195b03f..1f90dcb4bf55ad 100644
--- a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td
+++ b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td
@@ -9,10 +9,13 @@
 #ifndef MLIR_DIALECT_XEGPU_IR_XEGPUOPS_TD
 #define MLIR_DIALECT_XEGPU_IR_XEGPUOPS_TD
 
+include "mlir/IR/AttrTypeBase.td"
 include "mlir/Dialect/XeGPU/IR/XeGPUAttrs.td"
 include "mlir/Dialect/XeGPU/IR/XeGPUDialect.td"
 include "mlir/Dialect/XeGPU/IR/XeGPUTypes.td"
-
+include "mlir/Interfaces/ShapedOpInterfaces.td"
+include "mlir/Interfaces/SideEffectInterfaces.td"
+include "mlir/Interfaces/ViewLikeInterface.td"
 
 // Base class for dialect operations. This operation inherits from the base
 // `Op` class in OpBase.td, and provides:
@@ -20,7 +23,291 @@ include "mlir/Dialect/XeGPU/IR/XeGPUTypes.td"
 // * The mnemonic for the operation, or the name without the dialect prefix.
 // * A list of traits for the operation.
 class XeGPU_Op<string mnemonic, list<Trait> traits = []>:
-          Op<XeGPU_Dialect, mnemonic, traits>;
+          Op<XeGPU_Dialect, mnemonic, traits> {
+
+  code extraBaseClassDeclaration = [{
+    void printProperties(::mlir::MLIRContext *ctx,
+                         ::mlir::OpAsmPrinter &p, const Properties &prop) {
+      Attribute propAttr = getPropertiesAsAttr(ctx, prop);
+      if (propAttr)
+        p << "<" << propAttr << ">";
+    }
+
+    static ::mlir::ParseResult parseProperties(::mlir::OpAsmParser &parser,
+                                               ::mlir::OperationState &result) {
+      if (mlir::succeeded(parser.parseLess())) {
+        if (parser.parseAttribute(result.propertiesAttr) || parser.parseGreater())
+          return failure();
+      }
+      return success();
+    }
+
+  }];
+}
+
+
+def XeGPU_CreateNdDescOp: XeGPU_Op<"create_nd_tdesc", [Pure, ViewLikeOpInterface,
+                        AttrSizedOperandSegments, OffsetSizeAndStrideOpInterface]> {
+
+  let summary = "Create nd-tensor descriptor operation";
+  let description = [{
+    The "create_nd_tdesc" operation creates a TensorDescType which represents
+    a sub-view of a 2D memory region (it can be extended to support n-D memory
+    regions if needed in the future). Elements in the subview are contiguous in
+    each dimension. It encodes the following important information for supporting
+    Intel hardware features:
+
+    * source: an object representing (the starting address/pointer of) a 2D memory
+      region. It can be either a 2D memref object, or simply a pointer represented
+      by a uint64_t type. For the latter case, the shape and layout information of
+      the 2D memory region should be explicitly passed via the `dynamic_shape` and
+      `dynamic_strides` parameters.
+    * offsets: two index values representing the offsets from the "source" in each
+      dimension at which the subview of the target memory will be created. They are
+      encoded via two variables, "dynamic_offsets" and "static_offsets", so that they
+      can accept various forms, such as operands (e.g., [%c0, %c]) and attributes
+      (e.g., [2, 4]).
+    * shape: the shape information of the memory region pointed to by the "source".
+      It is typically encoded via the MemRefType of the source, e.g.,
+      memref<4096x4096xf16>. But if "source" is simply a pointer represented as a
+      uint64_t type, or a memref type without shape information, e.g., memref<?x?xf16>,
+      the shape information has to be explicitly passed via the "dynamic_shape"
+      argument. Currently "dynamic_shape" only accepts operands (e.g., [%c4096, %c4096]),
+      not attributes (e.g., [4096, 4096]).
+    * strides: the strides of the memory region pointed to by the "source". Similar to
+      shape, they are typically encoded via the MemRefType of the source too. But if
+      "source" is simply a pointer represented as a uint64_t type, or a memref type
+      without shape information, e.g., memref<?x?xf16>, the strides information has to
+      be explicitly passed via the "dynamic_strides" argument. And it currently only
+      accepts operands too.
+
+    Example 1 (suppose the tensor shape inferred by the compiler is 8x16):
+    %0 = memref.alloc() : memref<1024x1024xf32>
+    %c0 = arith.constant 0 : index
+    %c1 = arith.constant 1 : index
+    %1 = xegpu.create_nd_tdesc %0[%c0, %c0]: memref<1024x1024xf32> -> TensorDesc<8x16xf32>
+
+    Example 2 (suppose the tensor shape inferred by the compiler is 8x16):
+    %0 = memref.alloc(%h, %w) : memref<?x?xf32>
+    %c0 = arith.constant 0 : index
+    %c1 = arith.constant 1 : index
+    %1 = xegpu.create_nd_tdesc %0[%c0, %c0], [%h, %w], [%w, %c1]: memref<?x?xf32> -> TensorDesc<8x16xf32>
+
+    Example 3 (suppose the tensor shape inferred by the compiler is 8x16):
+    %0 = ... : ui64
+    %c0 = arith.constant 0 : index
+    %c1 = arith.constant 1 : index
+    %1 = xegpu.create_nd_tdesc %0[%c0, %c0], [%h, %w], [%w, %c1]: ui64 -> TensorDesc<8x16xf32>
+  }];
+
+  let arguments = (ins
+    XeGPU_BaseAddrType: $source,
+    Variadic<Index>: $offsets,
+    Variadic<Index>: $shape,
+    Variadic<Index>: $strides,
+    DenseI64ArrayAttr: $static_offsets
+  );
+  let results = (outs XeGPU_TensorDesc: $TensorDesc);
+
+  let assemblyFormat = [{
+    $source ``
+    custom<DynamicIndexList>($offsets, $static_offsets)
+    (`,` `[` $shape^ `]` `,` `[` $strides `]`)?
+    attr-dict `:` type($source) `->` qualified(type($TensorDesc))
+  }];
+
+  let hasVerifier = 1;
+
+  let builders = [
+    OpBuilder<(ins "Type": $tdesc, "TypedValue<MemRefType>": $source,
+                   "llvm::ArrayRef<OpFoldResult>": $offsets)>,
+
+    OpBuilder<(ins "Type": $tdesc, "TypedValue<MemRefType>": $source,
+                   "llvm::ArrayRef<OpFoldResult>": $offsets,
+                   "ValueRange": $shape, "ValueRange": $stride)>
+  ];
+
+  let extraClassDeclaration = extraBaseClassDeclaration # [{
+    /// Returns the type of the source memref operand.
+    Type getSourceType() {
+      return getSource().getType();
+    }
+
+    /// Returns the type of the result TensorDesc.
+    xegpu::TensorDescType getType() {
+      return getTensorDesc().getType();
+    }
+
+    /// Return the element type of the TensorDesc.
+    Type getElementType() {
+      return getType().getElementType();
+    }
+
+    /// Return the shape of the TensorDesc.
+    llvm::ArrayRef<int64_t> getTensorDescShape() {
+      return getType().getShape();
+    }
+
+    /// Wrapper for matching with OffsetSizeAndStrideOpInterface.
+    OperandRange getSizes() {
+      return getShape();
+    }
+
+    /// Wrapper for matching with OffsetSizeAndStrideOpInterface.
+    /// If the source is an IntegerType or `shape` is filled, it returns an
+    /// array of ShapedType::kDynamic, meaning the dynamic shape encoded in
+    /// the `shape` argument will be used. Presence of `shape` overrides the
+    /// static shape from the source memref type.
+    SmallVector<int64_t> getStaticSizes() {
+      if (getSourceType().isa<IntegerType>() || getShape().size()) {
+        auto dims = getMixedOffsets().size();
+        return SmallVector<int64_t>(dims, ShapedType::kDynamic);
+      }
+      auto memrefType = getSourceType().dyn_cast<MemRefType>();
+      return SmallVector<int64_t>(memrefType.getShape());
+    }
+
+    /// Wrapper for matching with OffsetSizeAndStrideOpInterface.
+    /// If the source is an IntegerType or `strides` is filled, it returns an
+    /// array of ShapedType::kDynamic, meaning the dynamic strides encoded in
+    /// the `strides` argument will be used. Presence of `strides` overrides
+    /// the static strides from the source memref type.
+    SmallVector<int64_t> getStaticStrides() {
+      if (getSourceType().isa<IntegerType>() || getStrides().size()) {
+        auto dims = getMixedOffsets().size();
+        return SmallVector<int64_t>(dims, ShapedType::kDynamic);
+      }
+      auto memrefType = getSourceType().dyn_cast<MemRefType>();
+      auto [strides, offset] = getStridesAndOffset(memrefType);
+      return strides;
+    }
+
+    /// Return the expected rank of each of the `static_offsets`,
+    /// `static_shape` and `static_strides` attributes.
+    std::array<unsigned, 3> getArrayAttrMaxRanks() {
+      unsigned rank;
+      if (auto ty = getSourceType().dyn_cast<MemRefType>()) {
+        rank = ty.getRank();
+      } else {
+        rank = (unsigned)getMixedOffsets().size();
+      }
+      return {rank, rank, rank};
+    }
+
+    /// Return the number of leading operands before the `offsets`,
+    /// `shape` and `strides` operands.
+    static unsigned getOffsetSizeAndStrideStartOperandIndex() { return 1; }
+
+    mlir::Value getViewSource() { return getSource(); }
+  }];
+}
+
+def XeGPU_PrefetchNdOp : XeGPU_Op<"prefetch_nd", []> {
+  let summary = "prefetches an nD block to cache";
+  let description = [{
+    It issues an instruction to prefetch the data from memory to each
+    level of the cache based on their cache policy.
+
+    Example:
+    ```
+      xegpu.prefetch_nd %tdesc {l1_hint = #xegpu.cache_hint,
+                                l2_hint = #xegpu.cache_hint,
+                                l3_hint = #xegpu.cache_hint}
+        : !xegpu.tensor_desc<8x16xf16>
+    ```
+
+  }];
+
+  let arguments = (ins XeGPU_TensorDesc: $TensorDesc,
+                       OptionalAttr<XeGPU_CacheHintAttr>: $l1_hint,
+                       OptionalAttr<XeGPU_CacheHintAttr>: $l2_hint,
+                       OptionalAttr<XeGPU_CacheHintAttr>: $l3_hint);
+
+  let extraClassDeclaration = extraBaseClassDeclaration;
+
+  let assemblyFormat = "$TensorDesc prop-dict attr-dict `:` qualified(type($TensorDesc))";
+}
+
+
+def XeGPU_LoadNdOp : XeGPU_Op<"load_nd"> {
+  let summary = "loads an n-D block from memory (represented by TensorDesc)"
+                " to registers (represented by vector)";
+  let description = [{
+    LoadNdOp essentially mimics the hardware block read instruction to read
+    a block of data from memory to register. It takes a set of optional cache
+    hints for each level of cache, L1, L2 and L3. If hardware does not have a
+    corresponding cache, the corresponding cache hint attribute will be masked.
+    The vnni transform is a hardware feature of Intel GPUs, used to do data
+    packing during the load for the B operand of a matrix operation when the
+    bit width of the data type is less than 32 bits, e.g., fp16. Transpose is
+    another Intel hardware feature, which transposes the data when loading it
+    if the data type is fp32 or fp64. vnni and transpose therefore cannot be
+    used at the same time.
+
+    Example:
+    ```
+      xegpu.load_nd %1 {transpose = [1, 0],
+                        l1_hint = #xegpu.cache_hint,
+                        l2_hint = #xegpu.cache_hint,
+                        l3_hint = #xegpu.cache_hint}
+        : !xegpu.tensor_desc<8x16xf32> -> vector<16x8xf32>
+    ```
+
+
+  }];
+
+  let arguments = (ins XeGPU_TensorDesc: $TensorDesc,
+                       OptionalAttr<I64Attr>: $vnni_axis,
+                       OptionalAttr<DenseI64ArrayAttr>: $transpose,
+                       OptionalAttr<XeGPU_CacheHintAttr>: $l1_hint,
+                       OptionalAttr<XeGPU_CacheHintAttr>: $l2_hint,
+                       OptionalAttr<XeGPU_CacheHintAttr>: $l3_hint);
+
+  let results = (outs XeGPU_ValueType: $value);
+
+  let extraClassDeclaration = extraBaseClassDeclaration # [{
+    VectorType getType() {
+      return llvm::dyn_cast<VectorType>(getValue().getType());
+    }
+
+    xegpu::TensorDescType getTensorDescType() {
+      return getTensorDesc().getType();
+    }
+  }];
+
+  let assemblyFormat = "$TensorDesc prop-dict attr-dict `:` qualified(type($TensorDesc)) `->` type($value)";
+  let hasVerifier = 1;
+}
+
+def XeGPU_StoreNdOp : XeGPU_Op<"store_nd", []> {
+  let summary = "stores an n-D block register region back to memory, currently only supports 2D";
+
+  let description = [{
+    StoreNdOp essentially mimics the hardware block write instruction to
+    write a block of data from register into the memory region as described
+    by the TensorDesc. It takes a set of optional cache hints for each level
+    of cache, L1, L2 and L3. If hardware does not have a corresponding cache,
+    the corresponding cache hint attribute will be masked.
+
+    Example:
+    ```
+      xegpu.store_nd %3, %2 {l1_hint = #xegpu.cache_hint,
+                             l2_hint = #xegpu.cache_hint,
+                             l3_hint = #xegpu.cache_hint}
+        : vector<8x16xf16>, !xegpu.tensor_desc<8x16xf16>
+    ```
+
+
+  }];
+
+  let arguments = (ins XeGPU_ValueType: $value,
+                       XeGPU_TensorDesc: $TensorDesc,
+                       OptionalAttr<XeGPU_CacheHintAttr>: $l1_hint,
+                       OptionalAttr<XeGPU_CacheHintAttr>: $l2_hint,
+                       OptionalAttr<XeGPU_CacheHintAttr>: $l3_hint);
+
+  let extraClassDeclaration = extraBaseClassDeclaration;
+  let assemblyFormat = [{$value `,` $TensorDesc prop-dict attr-dict
+                        `:` type($value) `,` qualified(type($TensorDesc))}];
+  let hasVerifier = 1;
+}
 #endif // MLIR_DIALECT_XEGPU_IR_XEGPUOPS_TD
diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUTypes.td b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUTypes.td
index 1d75bb4e2906fe..19ac1693712dd8 100644
--- a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUTypes.td
+++ b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUTypes.td
@@ -9,9 +9,9 @@
 #ifndef MLIR_DIALECT_XEGPU_IR_XEGPUTYPES_TD
 #define MLIR_DIALECT_XEGPU_IR_XEGPUTYPES_TD
 
-include "mlir/IR/BuiltinTypes.td"
 include "mlir/Dialect/XeGPU/IR/XeGPUAttrs.td"
 include "mlir/Dialect/XeGPU/IR/XeGPUDialect.td"
+include "mlir/IR/BuiltinTypes.td"
 
 def XeGPU_IntType: AnyTypeOf<[I1, I8, I16, I32, I64, SI1, SI8, SI16, SI32, SI64, UI1, UI8, UI16, UI32, UI64]>;
 def XeGPU_FloatType: AnyTypeOf<[F16, F32, F64, BF16, TF32]>;
@@ -30,4 +30,106 @@ class XeGPUTypeDef<string name, string typeMnemonic, list<Trait> traits = [],
   let mnemonic = typeMnemonic;
 }
 
+def XeGPU_TensorDesc: XeGPUTypeDef<"TensorDesc", "tensor_desc",
+        [ShapedTypeInterface], "::mlir::TensorType"> {
+  let summary = "TensorDesc describing a region of interest in the data.";
+  let description = [{
+    TensorDesc is a type designed to describe a region of interest in the data,
+    as well as some features that are unique to Intel hardware. Unlike the builtin
+    tensor type in MLIR, it essentially only contains the metadata and does not
+    hold the data itself. It is mainly designed to support 2D block load/store
+    and DPAS (matrix multiplication instruction) on Intel GPUs.
+    It encodes the following information:
+
+    * shape: the size/shape of the data block of interest, e.g., 8x16 means 8 rows,
+             each containing 16 contiguous data elements. The rows may or may not be
+             contiguous, depending on whether the encoding attribute is set or not.
+    * element_type: the data type of the data elements, e.g., f16, f32.
+
+    Similar to the builtin tensor, it also provides an optional attribute to encode
+    the following information via the TensorDescAttr object:
+    * memory_scope (xegpu::MemoryScope): [optional] where the data is located,
+             global memory or shared memory. It defaults to Global.
+    * array_length (int): [optional] The number of contiguous blocks of size `shape`
+             that will be loaded by a block load at a time. It defaults to 1.
+    * boundary_check (bool): [optional] indicates whether the operation detects the
+             boundary and pads with zero for out-of-boundary access. It defaults to
+             true (boundary check enabled).
+
+
+    Syntax:
+
+    ```
+    TensorDesc-type ::= `tensor_desc` `<` dim-list element-type (attr-list)? `>`
+    element-type ::= float-type | integer-type | index-type
+    dim-list := (static-dim-list `x`)?
+    static-dim-list ::= decimal-literal `x` decimal-literal
+    attr-list = (, memory_scope = value)? (, array_length = value)? (, boundary_check = value)?
+    ```
+
+    Examples:
+
+    ```mlir
+    // A block TensorDesc with 8x16 i32 elements
+    xegpu.tensor_desc<8x16xi32>
+
+    // A block TensorDesc with 8x16 f32 elements
+    xegpu.tensor_desc<8x16xf32>
+
+    // A TensorDesc with 8x16 f32 elements for a memory region in shared memory space.
+    xegpu.tensor_desc<8x16xf32, #xegpu.tdesc_attr<memory_scope = slm>>
+    ```
+  }];
+
+  let parameters = (ins ArrayRefParameter<"int64_t">: $shape,
+                        "mlir::Type": $elementType,
+                        OptionalParameter<"mlir::Attribute">: $encoding);
+
+  let extraClassDeclaration = [{
+    using TensorType::clone;
+    using mlir::ShapedType::Trait<TensorDescType>::getElementTypeBitWidth;
+    using mlir::ShapedType::Trait<TensorDescType>::getRank;
+    using mlir::ShapedType::Trait<TensorDescType>::getNumElements;
+    using mlir::ShapedType::Trait<TensorDescType>::isDynamicDim;
+    using mlir::ShapedType::Trait<TensorDescType>::hasStaticShape;
+    using mlir::ShapedType::Trait<TensorDescType>::getNumDynamicDims;
+    using mlir::ShapedType::Trait<TensorDescType>::getDimSize;
+    using mlir::ShapedType::Trait<TensorDescType>::getDynamicDimIndex;
+
+    TensorDescType clone(::mlir::Type elementType) {
+      return llvm::cast<TensorDescType>(cloneWith(getShape(), elementType));
+    }
+
+    TensorDescAttr getEncodingAsTensorDescAttr() const {
+      return llvm::dyn_cast_if_present<TensorDescAttr>(getEncoding());
+    }
+
+    xegpu::MemoryScope getMemoryScope() const {
+      auto attr = getEncodingAsTensorDescAttr();
+      if (attr && attr.getMemoryScope())
+        return attr.getMemoryScope().getValue();
+      // return default value
+      return MemoryScope::Global;
+    }
+
+    int getArrayLength() {
+      auto attr = getEncodingAsTensorDescAttr();
+      if (attr && attr.getArrayLength())
+        return attr.getArrayLength().getInt();
+      // return default value
+      return 1;
+    }
+
+    bool getBoundaryCheck() {
+      auto attr = getEncodingAsTensorDescAttr();
+      if (attr && attr.getBoundaryCheck())
+        return attr.getBoundaryCheck().getValue();
+      // return default value
+      return true;
+    }
+  }];
+
+  let hasCustomAssemblyFormat = true;
+
+}
+
 #endif // MLIR_DIALECT_XEGPU_IR_XEGPUTYPES_TD
diff --git a/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp b/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp
index 4f839ee773476b..0b3f4b9c9dbeae 100644
--- a/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp
+++ b/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp
@@ -6,7 +6,10 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include
+#include "mlir/Dialect/XeGPU/IR/XeGPU.h"
+#include "mlir/IR/Builders.h"
+#include "mlir/IR/DialectImplementation.h"
+#include "llvm/ADT/TypeSwitch.h"
 
 namespace mlir {
 namespace xegpu {
@@ -26,8 +29,72 @@ void XeGPUDialect::initialize() {
       >();
 }
 
-// this file is for position occupation,
-// we will add functions in following PRs.
+//===----------------------------------------------------------------------===//
+// XeGPU_TensorDescAttr
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// XeGPU_TensorDescType
+//===----------------------------------------------------------------------===//
+mlir::Type TensorDescType::parse(::mlir::AsmParser &parser) {
+  llvm::SmallVector<int64_t> shape;
+  mlir::Type elementType;
+  mlir::FailureOr<mlir::Attribute> encoding;
+
+  // Parse literal '<'
+  if (parser.parseLess())
+    return {};
+
+  auto shapeLoc = parser.getCurrentLocation();
+  if (mlir::failed(parser.parseDimensionList(shape))) {
+    parser.emitError(shapeLoc, "failed to parse parameter 'shape'");
+    return {};
+  }
+
+  auto elemTypeLoc = parser.getCurrentLocation();
+  if (mlir::failed(parser.parseType(elementType))) {
+    parser.emitError(elemTypeLoc, "failed to parse parameter 'elementType'");
+    return {};
+  }
+
+  // parse optional attributes
+  if (mlir::succeeded(parser.parseOptionalComma())) {
+    encoding = mlir::FieldParser<mlir::Attribute>::parse(parser);
+    if (mlir::failed(encoding)) {
+      parser.emitError(
+          parser.getCurrentLocation(),
+          "Failed to parse the attribute field for TensorDescType.\n");
+      return {};
+    }
+  }
+
+  // Parse literal '>'
+  if (parser.parseGreater())
+    return {};
+
+  return TensorDescType::get(parser.getContext(), shape, elementType,
+                             encoding.value_or(mlir::Attribute()));
+}
+
+void TensorDescType::print(::mlir::AsmPrinter &printer) const {
+  printer << "<";
+
+  auto shape = getShape();
+  for (int64_t dim : shape) {
+    if (mlir::ShapedType::isDynamic(dim))
+      printer << '?';
+    else
+      printer << dim;
+    printer << 'x';
+  }
+
+  printer << getElementType();
+
+  if (auto encoding = getEncoding())
+    printer << ", " << encoding;
+
+  printer << ">";
+}
+
 } // namespace xegpu
 } // namespace mlir
diff --git a/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp b/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp
index 0e89ac4df6ef28..3a75b173b757c5 100644
--- a/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp
+++ b/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp
@@ -6,14 +6,185 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include
+#include "mlir/Dialect/Utils/StaticValueUtils.h"
+#include "mlir/Dialect/XeGPU/IR/XeGPU.h"
+#include "mlir/IR/Builders.h"
 
 #define DEBUG_TYPE "xegpu"
 
 namespace mlir {
 namespace xegpu {
-// this file is for position occupation,
-// we will add functions in following PRs.
+
+static void transpose(llvm::ArrayRef<int64_t> trans,
+                      std::vector<int64_t> &shape) {
+  std::vector<int64_t> old = shape;
+  for (size_t i = 0; i < trans.size(); i++)
+    shape[i] = old[trans[i]];
+}
+
+template <typename T>
+static std::string makeString(T array, bool breakline = false) {
+  std::string buf;
+  buf.clear();
+  llvm::raw_string_ostream os(buf);
+  os << "[";
+  for (size_t i = 1; i < array.size(); i++) {
+    os << array[i - 1] << ", ";
+    if (breakline)
+      os << "\n\t\t";
+  }
+  os << array.back() << "]";
+  os.flush();
+  return buf;
+}
+
+//===----------------------------------------------------------------------===//
+// XeGPU_CreateNdDescOp
+//===----------------------------------------------------------------------===//
+void CreateNdDescOp::build(OpBuilder &builder, OperationState &state,
+                           Type tdesc, TypedValue<MemRefType> source,
+                           llvm::ArrayRef<OpFoldResult> offsets) {
+  auto ty = source.getType();
+  assert(ty && ty.hasStaticShape() && offsets.size() == (size_t)ty.getRank());
+
+  llvm::SmallVector<int64_t> staticOffsets;
+  llvm::SmallVector<Value> dynamicOffsets;
+  dispatchIndexOpFoldResults(offsets, dynamicOffsets, staticOffsets);
+
+  build(builder, state, tdesc, source, dynamicOffsets /* dynamic offsets */,
+        ValueRange({}) /* empty dynamic shape */,
+        ValueRange({}) /* empty dynamic strides */,
+        staticOffsets /* static offsets */);
+}
+
+void CreateNdDescOp::build(OpBuilder &builder, OperationState &state,
+                           Type tdesc, TypedValue<MemRefType> source,
+                           llvm::ArrayRef<OpFoldResult> offsets,
+                           ValueRange shape, ValueRange stride) {
+  assert(shape.size() && offsets.size() && stride.size() &&
+         shape.size() == stride.size() && shape.size() == offsets.size());
+
+  llvm::SmallVector<int64_t> staticOffsets;
+  llvm::SmallVector<Value> dynamicOffsets;
+
+  dispatchIndexOpFoldResults(offsets, dynamicOffsets, staticOffsets);
+
+  build(builder, state, tdesc, source, /* dynamic_offsets = */ dynamicOffsets,
+        /* dynamic shape = */ shape, /* dynamic strides = */ stride,
+        /* static offsets = */ staticOffsets);
+}
+
+LogicalResult CreateNdDescOp::verify() {
+  auto rank = (int64_t)getMixedOffsets().size();
+  bool invalidRank = (rank != 2);
+  bool invalidElemTy = false;
+
+  // check that the source type matches the rank if it is a memref.
+  // It should also have the same ElementType as the TensorDesc.
+  auto memrefTy = getSourceType().dyn_cast<MemRefType>();
+  if (memrefTy) {
+    invalidRank |= (memrefTy.getRank() != rank);
+    invalidElemTy |= memrefTy.getElementType() != getElementType();
+  }
+
+  // check that the result type matches the rank.
+  invalidRank |= (getType().getRank() != rank);
+
+  // mismatches among shape, strides, and offsets are
+  // already handled by OffsetSizeAndStrideOpInterface.
+  // So they are not checked here.
+  if (invalidRank)
+    return emitOpError(
+        "Expecting the rank of shape, strides, offsets, "
+        "source memref type (if source is a memref) and TensorDesc "
+        "to match with each other. They currently are 2D.");
+
+  if (invalidElemTy)
+    return emitOpError("TensorDesc should have the same element "
+                       "type as the source if it is a memref.\n");
+
+  return success();
+}
+
+//===----------------------------------------------------------------------===//
+// XeGPU_LoadNdOp
+//===----------------------------------------------------------------------===//
+LogicalResult LoadNdOp::verify() {
+  auto tdescTy = getTensorDescType();
+  auto valueTy = getType();
+
+  if (tdescTy.getRank() != 2)
+    return emitOpError(
+        "The TensorDesc for LoadNdOp should be a 2D TensorDesc.");
+
+  if (!valueTy)
+    return emitOpError("Invalid result, it should be a VectorType.\n");
+
+  auto tdescElemTy = tdescTy.getElementType();
+  auto valueElemTy = valueTy.getElementType();
+
+  if (tdescElemTy != valueElemTy)
+    return emitOpError(
+        "Value should have the same element type as TensorDesc.");
+
+  auto array_len = tdescTy.getArrayLength();
+  auto tdescShape = tdescTy.getShape().vec();
+  auto valueShape = valueTy.getShape().vec();
+
+  if (getTranspose()) {
+    auto trans = getTranspose().value();
+    if (tdescShape.size() >= trans.size())
+      transpose(trans, tdescShape);
+    else
+      emitWarning("Invalid transpose attr. It is ignored.");
+  }
+
+  if (getVnniAxis()) {
+    auto axis = getVnniAxis().value();
+    auto vnni_factor = valueShape.back();
+    tdescShape[axis] /= vnni_factor;
+    tdescShape.push_back(vnni_factor);
+  }
+
+  if (array_len > 1) {
+    auto it = tdescShape.begin();
+    tdescShape.insert(it, array_len);
+  }
+
+  if (tdescShape != valueShape)
+    return emitOpError() << "Result shape doesn't match TensorDesc shape. "
+                         << "The expected shape is " << makeString(tdescShape)
+                         << ". But the given shape is "
+                         << makeString(valueShape) << ".\n";
+  return success();
+}
+
+//===----------------------------------------------------------------------===//
+// XeGPU_StoreNdOp
+//===----------------------------------------------------------------------===//
+LogicalResult StoreNdOp::verify() {
+  auto dstTy = getTensorDesc().getType();                   // Tile
+  auto valTy = getValue().getType().dyn_cast<VectorType>(); // Vector
+
+  if (dstTy.getRank() != 2)
+    return emitOpError("Expecting a 2D TensorDesc shape.\n");
+
+  if (!valTy)
+    return emitOpError("Expecting a VectorType result.\n");
+
+  auto dstElemTy = dstTy.getElementType();
+  auto valElemTy = valTy.getElementType();
+
+  if (dstElemTy != valElemTy) {
+    return emitOpError() << "The element type of the value should "
+                            "match the element type of the TensorDesc.\n";
+  }
+
+  if (dstTy.getShape() != valTy.getShape())
+    return emitOpError()
+           << "The result shape should match the TensorDesc shape.\n";
+  return success();
+}
 } // namespace xegpu
 } // namespace mlir
diff --git a/mlir/test/Dialect/XeGPU/XeGPUOps.mlir b/mlir/test/Dialect/XeGPU/XeGPUOps.mlir
new file mode 100644
index 00000000000000..039346adbb851c
--- /dev/null
+++ b/mlir/test/Dialect/XeGPU/XeGPUOps.mlir
@@ -0,0 +1,62 @@
+// RUN: mlir-opt %s | FileCheck %s
+// Verify the printed output can be parsed.
+// RUN: mlir-opt %s | mlir-opt | FileCheck %s
+// Verify the generic form can be parsed.
+// RUN: mlir-opt -mlir-print-op-generic %s | mlir-opt | FileCheck %s + +// CHECK-LABEL: gpu.module @test { +gpu.module @test { +// CHECK: gpu.func @test_create_nd_tdesc_vc_1(%[[arg0:.*]]: memref<24x32xf32>) { +gpu.func @test_create_nd_tdesc_vc_1(%src: memref<24x32xf32>) { + // CHECK: %[[REG:.*]] = xegpu.create_nd_tdesc %arg0[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<8x16xf32> + %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<8x16xf32> + gpu.return +} + +// CHECK: gpu.func @test_create_nd_tdesc_vc_2(%[[arg0:.*]]: ui64, %[[arg1:.*]]: index, %[[arg2:.*]]: index, %[[arg3:.*]]: index, %[[arg4:.*]]: index) { +gpu.func @test_create_nd_tdesc_vc_2(%src: ui64, %w : index, %h : index, %x : index, %y : index) { + //CHECK: %[[C:.*]] = arith.constant 1 : index + %c1 = arith.constant 1 : index + // CHECK: %[[REG:.*]] = xegpu.create_nd_tdesc %[[arg0]][%[[arg3]], %[[arg4]]], [%[[arg2]], %[[arg1]]], [%[[arg1]], %[[C]]] : ui64 -> !xegpu.tensor_desc<8x16xf32> + %1 = xegpu.create_nd_tdesc %src[%x, %y], [%h, %w], [%w, %c1] : ui64 -> !xegpu.tensor_desc<8x16xf32> + gpu.return +} + +// CHECK: gpu.func @test_create_nd_tdesc_vc_3(%[[arg0:.*]]: memref<24x32xf32>) { +gpu.func @test_create_nd_tdesc_vc_3(%src: memref<24x32xf32>) { + // CHECK: %[[REG:.*]] = xegpu.create_nd_tdesc %[[arg0]][0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<24x16xf32, #xegpu.tdesc_attr + %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<24x16xf32, #xegpu.tdesc_attr> + gpu.return +} + +// CHECK: gpu.func @test_prefetch_nd_vc(%[[arg0:.*]]: memref<24x32xf16>) { +gpu.func @test_prefetch_nd_vc(%src: memref<24x32xf16>) { + // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %[[arg0]][0, 0] : memref<24x32xf16> -> !xegpu.tensor_desc<8x16xf16> + %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf16> -> !xegpu.tensor_desc<8x16xf16> + // CHECK: xegpu.prefetch_nd %[[R0]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<8x16xf16> + xegpu.prefetch_nd %1 <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint}>: !xegpu.tensor_desc<8x16xf16> + gpu.return +} + +// CHECK: func @test_load_nd_vc(%[[arg0:.*]]: memref<8x16xf16>) { +gpu.func @test_load_nd_vc(%src: memref<8x16xf16>) { + // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %arg0[0, 0] : memref<8x16xf16> -> !xegpu.tensor_desc<8x16xf16> + %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<8x16xf16> -> !xegpu.tensor_desc<8x16xf16> + // CHECK: %[[R1:.*]] = xegpu.load_nd %[[R0]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, vnni_axis = 0 : i64}> : !xegpu.tensor_desc<8x16xf16> -> vector<4x16x2xf16> + %2 = xegpu.load_nd %1 <{vnni_axis = 0, l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint}> + : !xegpu.tensor_desc<8x16xf16> -> vector<4x16x2xf16> + gpu.return +} + +// CHECK: func @test_store_nd_vc(%[[arg0:.*]]: memref<24x32xf16>) { +gpu.func @test_store_nd_vc(%dst: memref<24x32xf16>) { + // CHECK: %[[C:.*]] = arith.constant dense<1.000000e+00> : vector<24x32xf16> + %1 = arith.constant dense<1.0>: vector<24x32xf16> + // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %[[arg0]][0, 0] : memref<24x32xf16> -> !xegpu.tensor_desc<24x32xf16> + %2 = xegpu.create_nd_tdesc %dst[0, 0] : memref<24x32xf16> -> !xegpu.tensor_desc<24x32xf16> + // CHECK: xegpu.store_nd %[[C]], %[[R0]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint}> : vector<24x32xf16>, !xegpu.tensor_desc<24x32xf16> + xegpu.store_nd %1, %2 <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint}>: vector<24x32xf16>, 
!xegpu.tensor_desc<24x32xf16> + gpu.return +} + +} \ No newline at end of file
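---

Usage note (not part of the patch): the sketch below shows how the C++ builder declared for `create_nd_tdesc` in XeGPUOps.td above could be exercised from a pass or a unit test. It is a minimal, hypothetical example: the names `builder`, `loc`, and `src` are assumed to exist in the caller, and it relies on the builder signature declared in this patch (`Type` result type, `TypedValue<MemRefType>` source, `llvm::ArrayRef<OpFoldResult>` offsets), so treat it as a sketch rather than verified API usage.

```cpp
// Sketch only: assumes `builder` is positioned at a valid insertion point,
// `loc` is a Location, and `src` has type memref<24x32xf32> (mirroring
// test_create_nd_tdesc_vc_1 above). Names are illustrative.
#include "mlir/Dialect/XeGPU/IR/XeGPU.h"

using namespace mlir;

static Value createTopLeftDesc(OpBuilder &builder, Location loc,
                               TypedValue<MemRefType> src) {
  // No encoding attribute: per TensorDescAttr the defaults are global memory,
  // array_length = 1, and boundary_check = true.
  auto descTy = xegpu::TensorDescType::get(
      builder.getContext(), /*shape=*/{8, 16}, builder.getF32Type(),
      /*encoding=*/Attribute());

  // Constant offsets are passed as attributes (OpFoldResult), so they end up
  // in `static_offsets` and no dynamic offset operands are created.
  SmallVector<OpFoldResult> offsets{builder.getIndexAttr(0),
                                    builder.getIndexAttr(0)};
  return builder.create<xegpu::CreateNdDescOp>(loc, descTy, src, offsets);
}
```

The resulting descriptor value can then feed `xegpu.load_nd`, `xegpu.store_nd`, or `xegpu.prefetch_nd`, as the test file above demonstrates at the MLIR level.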