Skip to content

Commit

Permalink
[MLIR][GPU-LLVM] Convert gpu.func to llvm.func (llvm#101664)
Browse files Browse the repository at this point in the history
Add support in `-convert-gpu-to-llvm-spv` to convert `gpu.func` to
`llvm.func` operations.

- `spir_kernel`/`spir_func` calling conventions used for
kernels/functions.
- `workgroup` attributions encoded as additional `llvm.ptr<3>`
arguments.
- No attribute used to annotate kernels
- `reqd_work_group_size` attribute used to encode
`gpu.known_block_size`.
- `llvm.mlir.workgroup_attribution` used to encode workgroup attribution
sizes and element types. This is attached to the pointer arguments that
workgroup attributions are lowered to.

**Note**: A notable missing feature that will be addressed in a
follow-up PR is a `-use-bare-ptr-memref-call-conv` option to replace
MemRef arguments with bare pointers to the MemRef element types instead
of the current MemRef descriptor approach.

---------

Signed-off-by: Victor Perez <victor.perez@codeplay.com>
  • Loading branch information
victor-eds authored Aug 9, 2024
1 parent 4c5ef66 commit d45de80
Show file tree
Hide file tree
Showing 16 changed files with 463 additions and 115 deletions.
18 changes: 18 additions & 0 deletions mlir/include/mlir/Conversion/SPIRVCommon/AttrToLLVMConverter.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
//===- AttrToLLVMConverter.h - SPIR-V attributes conversion to LLVM - C++ -===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
#ifndef MLIR_CONVERSION_SPIRVCOMMON_ATTRTOLLVMCONVERTER_H_
#define MLIR_CONVERSION_SPIRVCOMMON_ATTRTOLLVMCONVERTER_H_

#include "mlir/Dialect/SPIRV/IR/SPIRVEnums.h"

namespace mlir {
/// Produces an unsigned value for the given SPIR-V `storageClass` under the
/// given `clientAPI`.
///
/// NOTE(review): only the declaration is visible here. Judging by the function
/// name and this header's title, the result is presumably the LLVM address
/// space that the storage class maps to for that client API -- confirm against
/// the definition.
unsigned storageClassToAddressSpace(spirv::ClientAPI clientAPI,
spirv::StorageClass storageClass);
} // namespace mlir

#endif // MLIR_CONVERSION_SPIRVCOMMON_ATTRTOLLVMCONVERTER_H_
21 changes: 21 additions & 0 deletions mlir/include/mlir/Dialect/LLVMIR/LLVMAttrDefs.td
Original file line number Diff line number Diff line change
Expand Up @@ -1104,4 +1104,25 @@ def TailCallKindAttr : LLVM_Attr<"TailCallKind", "tailcallkind"> {
let assemblyFormat = "`<` $tailCallKind `>`";
}

//===----------------------------------------------------------------------===//
// WorkgroupAttributionAttr
//===----------------------------------------------------------------------===//

// Attribute recording the element count and element type of a GPU workgroup
// attribution, meant to be attached to the `llvm.ptr` function argument that
// stands in for that attribution.
def WorkgroupAttributionAttr
: LLVM_Attr<"WorkgroupAttribution", "mlir.workgroup_attribution"> {
let summary = "GPU workgroup attribution information";
let description = [{
GPU workgroup attributions are `gpu.func` attributes encoding memory
allocations in the workgroup address space. These might be encoded as
`llvm.ptr` function arguments in our dialect, but then type and size
information would be dropped. This attribute can be attached to `llvm.ptr`
function arguments encoding GPU workgroup attributions to mark them as
arguments encoding workgroup attributions and keeping type and size
information in our dialect.
}];
// `num_elements`: number of elements in the attribution's memref.
// `element_type`: element type of the attribution, after conversion to the
// LLVM dialect type system.
let parameters = (ins "IntegerAttr":$num_elements,
"TypeAttr":$element_type);
// Printed and parsed as `<num_elements, element_type>`.
let assemblyFormat = "`<` $num_elements `,` $element_type `>`";
}

#endif // LLVMIR_ATTRDEFS
5 changes: 5 additions & 0 deletions mlir/include/mlir/Dialect/LLVMIR/LLVMDialect.td
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,11 @@ def LLVM_Dialect : Dialect {
let hasRegionResultAttrVerify = 1;
let hasOperationAttrVerify = 1;

let discardableAttrs = (ins
/// Attribute encoding size and type of GPU workgroup attributions.
"WorkgroupAttributionAttr":$workgroup_attribution
);

let extraClassDeclaration = [{
/// Name of the data layout attributes.
static StringRef getDataLayoutAttrName() { return "llvm.data_layout"; }
Expand Down
1 change: 1 addition & 0 deletions mlir/lib/Conversion/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,7 @@ add_subdirectory(SCFToGPU)
add_subdirectory(SCFToOpenMP)
add_subdirectory(SCFToSPIRV)
add_subdirectory(ShapeToStandard)
add_subdirectory(SPIRVCommon)
add_subdirectory(SPIRVToLLVM)
add_subdirectory(TensorToLinalg)
add_subdirectory(TensorToSPIRV)
Expand Down
175 changes: 129 additions & 46 deletions mlir/lib/Conversion/GPUCommon/GPUOpsLowering.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -25,29 +25,80 @@ GPUFuncOpLowering::matchAndRewrite(gpu::GPUFuncOp gpuFuncOp, OpAdaptor adaptor,
Location loc = gpuFuncOp.getLoc();

SmallVector<LLVM::GlobalOp, 3> workgroupBuffers;
workgroupBuffers.reserve(gpuFuncOp.getNumWorkgroupAttributions());
for (const auto [idx, attribution] :
llvm::enumerate(gpuFuncOp.getWorkgroupAttributions())) {
auto type = dyn_cast<MemRefType>(attribution.getType());
assert(type && type.hasStaticShape() && "unexpected type in attribution");

uint64_t numElements = type.getNumElements();

auto elementType =
cast<Type>(typeConverter->convertType(type.getElementType()));
auto arrayType = LLVM::LLVMArrayType::get(elementType, numElements);
std::string name =
std::string(llvm::formatv("__wg_{0}_{1}", gpuFuncOp.getName(), idx));
uint64_t alignment = 0;
if (auto alignAttr =
dyn_cast_or_null<IntegerAttr>(gpuFuncOp.getWorkgroupAttributionAttr(
idx, LLVM::LLVMDialect::getAlignAttrName())))
alignment = alignAttr.getInt();
auto globalOp = rewriter.create<LLVM::GlobalOp>(
gpuFuncOp.getLoc(), arrayType, /*isConstant=*/false,
LLVM::Linkage::Internal, name, /*value=*/Attribute(), alignment,
workgroupAddrSpace);
workgroupBuffers.push_back(globalOp);
if (encodeWorkgroupAttributionsAsArguments) {
// Append an `llvm.ptr` argument to the function signature to encode
// workgroup attributions.

ArrayRef<BlockArgument> workgroupAttributions =
gpuFuncOp.getWorkgroupAttributions();
size_t numAttributions = workgroupAttributions.size();

// Insert all arguments at the end.
unsigned index = gpuFuncOp.getNumArguments();
SmallVector<unsigned> argIndices(numAttributions, index);

// New arguments will simply be `llvm.ptr` with the correct address space
Type workgroupPtrType =
rewriter.getType<LLVM::LLVMPointerType>(workgroupAddrSpace);
SmallVector<Type> argTypes(numAttributions, workgroupPtrType);

// Attributes: noalias, llvm.mlir.workgroup_attribution(<size>, <type>)
std::array attrs{
rewriter.getNamedAttr(LLVM::LLVMDialect::getNoAliasAttrName(),
rewriter.getUnitAttr()),
rewriter.getNamedAttr(
getDialect().getWorkgroupAttributionAttrHelper().getName(),
rewriter.getUnitAttr()),
};
SmallVector<DictionaryAttr> argAttrs;
for (BlockArgument attribution : workgroupAttributions) {
auto attributionType = cast<MemRefType>(attribution.getType());
IntegerAttr numElements =
rewriter.getI64IntegerAttr(attributionType.getNumElements());
Type llvmElementType =
getTypeConverter()->convertType(attributionType.getElementType());
if (!llvmElementType)
return failure();
TypeAttr type = TypeAttr::get(llvmElementType);
attrs.back().setValue(
rewriter.getAttr<LLVM::WorkgroupAttributionAttr>(numElements, type));
argAttrs.push_back(rewriter.getDictionaryAttr(attrs));
}

// Location match function location
SmallVector<Location> argLocs(numAttributions, gpuFuncOp.getLoc());

// Perform signature modification
rewriter.modifyOpInPlace(
gpuFuncOp, [gpuFuncOp, &argIndices, &argTypes, &argAttrs, &argLocs]() {
static_cast<FunctionOpInterface>(gpuFuncOp).insertArguments(
argIndices, argTypes, argAttrs, argLocs);
});
} else {
workgroupBuffers.reserve(gpuFuncOp.getNumWorkgroupAttributions());
for (auto [idx, attribution] :
llvm::enumerate(gpuFuncOp.getWorkgroupAttributions())) {
auto type = dyn_cast<MemRefType>(attribution.getType());
assert(type && type.hasStaticShape() && "unexpected type in attribution");

uint64_t numElements = type.getNumElements();

auto elementType =
cast<Type>(typeConverter->convertType(type.getElementType()));
auto arrayType = LLVM::LLVMArrayType::get(elementType, numElements);
std::string name =
std::string(llvm::formatv("__wg_{0}_{1}", gpuFuncOp.getName(), idx));
uint64_t alignment = 0;
if (auto alignAttr = dyn_cast_or_null<IntegerAttr>(
gpuFuncOp.getWorkgroupAttributionAttr(
idx, LLVM::LLVMDialect::getAlignAttrName())))
alignment = alignAttr.getInt();
auto globalOp = rewriter.create<LLVM::GlobalOp>(
gpuFuncOp.getLoc(), arrayType, /*isConstant=*/false,
LLVM::Linkage::Internal, name, /*value=*/Attribute(), alignment,
workgroupAddrSpace);
workgroupBuffers.push_back(globalOp);
}
}

// Remap proper input types.
Expand Down Expand Up @@ -101,16 +152,19 @@ GPUFuncOpLowering::matchAndRewrite(gpu::GPUFuncOp gpuFuncOp, OpAdaptor adaptor,
// attribute. The former is necessary for further translation while the
// latter is expected by gpu.launch_func.
if (gpuFuncOp.isKernel()) {
attributes.emplace_back(kernelAttributeName, rewriter.getUnitAttr());
if (kernelAttributeName)
attributes.emplace_back(kernelAttributeName, rewriter.getUnitAttr());
// Set the dialect-specific block size attribute if there is one.
if (kernelBlockSizeAttributeName.has_value() && knownBlockSize) {
attributes.emplace_back(kernelBlockSizeAttributeName.value(),
knownBlockSize);
if (kernelBlockSizeAttributeName && knownBlockSize) {
attributes.emplace_back(kernelBlockSizeAttributeName, knownBlockSize);
}
}
LLVM::CConv callingConvention = gpuFuncOp.isKernel()
? kernelCallingConvention
: nonKernelCallingConvention;
auto llvmFuncOp = rewriter.create<LLVM::LLVMFuncOp>(
gpuFuncOp.getLoc(), gpuFuncOp.getName(), funcType,
LLVM::Linkage::External, /*dsoLocal=*/false, /*cconv=*/LLVM::CConv::C,
LLVM::Linkage::External, /*dsoLocal=*/false, callingConvention,
/*comdat=*/nullptr, attributes);

{
Expand All @@ -125,24 +179,51 @@ GPUFuncOpLowering::matchAndRewrite(gpu::GPUFuncOp gpuFuncOp, OpAdaptor adaptor,
rewriter.setInsertionPointToStart(&gpuFuncOp.front());
unsigned numProperArguments = gpuFuncOp.getNumArguments();

for (const auto [idx, global] : llvm::enumerate(workgroupBuffers)) {
auto ptrType = LLVM::LLVMPointerType::get(rewriter.getContext(),
global.getAddrSpace());
Value address = rewriter.create<LLVM::AddressOfOp>(
loc, ptrType, global.getSymNameAttr());
Value memory =
rewriter.create<LLVM::GEPOp>(loc, ptrType, global.getType(), address,
ArrayRef<LLVM::GEPArg>{0, 0});

// Build a memref descriptor pointing to the buffer to plug with the
// existing memref infrastructure. This may use more registers than
// otherwise necessary given that memref sizes are fixed, but we can try
// and canonicalize that away later.
Value attribution = gpuFuncOp.getWorkgroupAttributions()[idx];
auto type = cast<MemRefType>(attribution.getType());
auto descr = MemRefDescriptor::fromStaticShape(
rewriter, loc, *getTypeConverter(), type, memory);
signatureConversion.remapInput(numProperArguments + idx, descr);
if (encodeWorkgroupAttributionsAsArguments) {
// Build a MemRefDescriptor with each of the arguments added above.

unsigned numAttributions = gpuFuncOp.getNumWorkgroupAttributions();
assert(numProperArguments >= numAttributions &&
"Expecting attributions to be encoded as arguments already");

// Arguments encoding workgroup attributions will be in positions
// [numProperArguments, numProperArguments+numAttributions)
ArrayRef<BlockArgument> attributionArguments =
gpuFuncOp.getArguments().slice(numProperArguments - numAttributions,
numAttributions);
for (auto [idx, vals] : llvm::enumerate(llvm::zip_equal(
gpuFuncOp.getWorkgroupAttributions(), attributionArguments))) {
auto [attribution, arg] = vals;
auto type = cast<MemRefType>(attribution.getType());

// Arguments are of llvm.ptr type and attributions are of memref type:
// we need to wrap them in memref descriptors.
Value descr = MemRefDescriptor::fromStaticShape(
rewriter, loc, *getTypeConverter(), type, arg);

// And remap the arguments
signatureConversion.remapInput(numProperArguments + idx, descr);
}
} else {
for (const auto [idx, global] : llvm::enumerate(workgroupBuffers)) {
auto ptrType = LLVM::LLVMPointerType::get(rewriter.getContext(),
global.getAddrSpace());
Value address = rewriter.create<LLVM::AddressOfOp>(
loc, ptrType, global.getSymNameAttr());
Value memory =
rewriter.create<LLVM::GEPOp>(loc, ptrType, global.getType(),
address, ArrayRef<LLVM::GEPArg>{0, 0});

// Build a memref descriptor pointing to the buffer to plug with the
// existing memref infrastructure. This may use more registers than
// otherwise necessary given that memref sizes are fixed, but we can try
// and canonicalize that away later.
Value attribution = gpuFuncOp.getWorkgroupAttributions()[idx];
auto type = cast<MemRefType>(attribution.getType());
auto descr = MemRefDescriptor::fromStaticShape(
rewriter, loc, *getTypeConverter(), type, memory);
signatureConversion.remapInput(numProperArguments + idx, descr);
}
}

// Rewrite private memory attributions to alloca'ed buffers.
Expand Down Expand Up @@ -239,6 +320,8 @@ GPUFuncOpLowering::matchAndRewrite(gpu::GPUFuncOp gpuFuncOp, OpAdaptor adaptor,
copyPointerAttribute(LLVM::LLVMDialect::getDereferenceableAttrName());
copyPointerAttribute(
LLVM::LLVMDialect::getDereferenceableOrNullAttrName());
copyPointerAttribute(
LLVM::LLVMDialect::WorkgroupAttributionAttrHelper::getNameStr());
}
}
rewriter.eraseOp(gpuFuncOp);
Expand Down
59 changes: 47 additions & 12 deletions mlir/lib/Conversion/GPUCommon/GPUOpsLowering.h
Original file line number Diff line number Diff line change
Expand Up @@ -35,16 +35,41 @@ struct GPUDynamicSharedMemoryOpLowering
unsigned alignmentBit;
};

/// Options bundle consumed by `GPUFuncOpLowering` to configure how a
/// `gpu.func` is converted.
///
/// Every member carries a default member initializer (or is a null-by-default
/// attribute handle), so a default-constructed instance never holds an
/// indeterminate value. Aggregate initialization in declaration order keeps
/// working unchanged.
struct GPUFuncOpLoweringOptions {
  /// The address space to use for `alloca`s in private memory. Defaults to 0;
  /// set it explicitly for the target being lowered to.
  unsigned allocaAddrSpace = 0;
  /// The address space to use declaring workgroup memory. Defaults to 0; set
  /// it explicitly for the target being lowered to.
  unsigned workgroupAddrSpace = 0;

  /// The attribute name to use instead of `gpu.kernel`. Null if no attribute
  /// should be used.
  StringAttr kernelAttributeName;
  /// The attribute name to use to set the block size. Null if no attribute
  /// should be used.
  StringAttr kernelBlockSizeAttributeName;

  /// The calling convention to use for kernel functions.
  LLVM::CConv kernelCallingConvention = LLVM::CConv::C;
  /// The calling convention to use for non-kernel functions.
  LLVM::CConv nonKernelCallingConvention = LLVM::CConv::C;

  /// Whether to encode workgroup attributions as additional arguments instead
  /// of a global variable.
  bool encodeWorkgroupAttributionsAsArguments = false;
};

struct GPUFuncOpLowering : ConvertOpToLLVMPattern<gpu::GPUFuncOp> {
GPUFuncOpLowering(
const LLVMTypeConverter &converter, unsigned allocaAddrSpace,
unsigned workgroupAddrSpace, StringAttr kernelAttributeName,
std::optional<StringAttr> kernelBlockSizeAttributeName = std::nullopt)
GPUFuncOpLowering(const LLVMTypeConverter &converter,
const GPUFuncOpLoweringOptions &options)
: ConvertOpToLLVMPattern<gpu::GPUFuncOp>(converter),
allocaAddrSpace(allocaAddrSpace),
workgroupAddrSpace(workgroupAddrSpace),
kernelAttributeName(kernelAttributeName),
kernelBlockSizeAttributeName(kernelBlockSizeAttributeName) {}
allocaAddrSpace(options.allocaAddrSpace),
workgroupAddrSpace(options.workgroupAddrSpace),
kernelAttributeName(options.kernelAttributeName),
kernelBlockSizeAttributeName(options.kernelBlockSizeAttributeName),
kernelCallingConvention(options.kernelCallingConvention),
nonKernelCallingConvention(options.nonKernelCallingConvention),
encodeWorkgroupAttributionsAsArguments(
options.encodeWorkgroupAttributionsAsArguments) {}

LogicalResult
matchAndRewrite(gpu::GPUFuncOp gpuFuncOp, OpAdaptor adaptor,
Expand All @@ -56,11 +81,21 @@ struct GPUFuncOpLowering : ConvertOpToLLVMPattern<gpu::GPUFuncOp> {
/// The address space to use declaring workgroup memory.
unsigned workgroupAddrSpace;

/// The attribute name to use instead of `gpu.kernel`.
/// The attribute name to use instead of `gpu.kernel`. Null if no attribute
/// should be used.
StringAttr kernelAttributeName;

/// The attribute name to to set block size
std::optional<StringAttr> kernelBlockSizeAttributeName;
/// The attribute name to to set block size. Null if no attribute should be
/// used.
StringAttr kernelBlockSizeAttributeName;

/// The calling convention to use for kernel functions
LLVM::CConv kernelCallingConvention;
/// The calling convention to use for non-kernel functions
LLVM::CConv nonKernelCallingConvention;

/// Whether to encode workgroup attributions as additional arguments instead
/// of a global variable.
bool encodeWorkgroupAttributionsAsArguments;
};

/// The lowering of gpu.printf to a call to HIP hostcalls
Expand Down
2 changes: 2 additions & 0 deletions mlir/lib/Conversion/GPUToLLVMSPV/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,9 @@ add_mlir_conversion_library(MLIRGPUToLLVMSPV

LINK_LIBS PUBLIC
MLIRGPUDialect
MLIRGPUToGPURuntimeTransforms
MLIRLLVMCommonConversion
MLIRLLVMDialect
MLIRSPIRVAttrToLLVMConversion
MLIRSPIRVDialect
)
Loading

0 comments on commit d45de80

Please sign in to comment.