diff --git a/mlir/include/mlir/Conversion/SPIRVCommon/AttrToLLVMConverter.h b/mlir/include/mlir/Conversion/SPIRVCommon/AttrToLLVMConverter.h new file mode 100644 index 000000000000000..a99dd0fe6f133ee --- /dev/null +++ b/mlir/include/mlir/Conversion/SPIRVCommon/AttrToLLVMConverter.h @@ -0,0 +1,18 @@ +//===- AttrToLLVMConverter.h - SPIR-V attributes conversion to LLVM - C++ -===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +#ifndef MLIR_CONVERSION_SPIRVCOMMON_ATTRTOLLVMCONVERTER_H_ +#define MLIR_CONVERSION_SPIRVCOMMON_ATTRTOLLVMCONVERTER_H_ + +#include "mlir/Dialect/SPIRV/IR/SPIRVEnums.h" + +namespace mlir { +unsigned storageClassToAddressSpace(spirv::ClientAPI clientAPI, + spirv::StorageClass storageClass); +} // namespace mlir + +#endif // MLIR_CONVERSION_SPIRVCOMMON_ATTRTOLLVMCONVERTER_H_ diff --git a/mlir/include/mlir/Dialect/LLVMIR/LLVMAttrDefs.td b/mlir/include/mlir/Dialect/LLVMIR/LLVMAttrDefs.td index 529c458ce12540c..5d96f5063425886 100644 --- a/mlir/include/mlir/Dialect/LLVMIR/LLVMAttrDefs.td +++ b/mlir/include/mlir/Dialect/LLVMIR/LLVMAttrDefs.td @@ -1104,4 +1104,25 @@ def TailCallKindAttr : LLVM_Attr<"TailCallKind", "tailcallkind"> { let assemblyFormat = "`<` $tailCallKind `>`"; } +//===----------------------------------------------------------------------===// +// WorkgroupAttributionAttr +//===----------------------------------------------------------------------===// + +def WorkgroupAttributionAttr + : LLVM_Attr<"WorkgroupAttribution", "mlir.workgroup_attribution"> { + let summary = "GPU workgroup attribution information"; + let description = [{ + GPU workgroup attributions are `gpu.func` attributes encoding memory + allocations in the workgroup address space. 
These might be encoded as + `llvm.ptr` function arguments in our dialect, but then type and size + information would be dropped. This attribute can be attached to `llvm.ptr` + function arguments encoding GPU workgroup attributions to mark them as + arguments encoding workgroup attributions and keeping type and size + information in our dialect. + }]; + let parameters = (ins "IntegerAttr":$num_elements, + "TypeAttr":$element_type); + let assemblyFormat = "`<` $num_elements `,` $element_type `>`"; +} + #endif // LLVMIR_ATTRDEFS diff --git a/mlir/include/mlir/Dialect/LLVMIR/LLVMDialect.td b/mlir/include/mlir/Dialect/LLVMIR/LLVMDialect.td index c4c011f30b3bcd9..7dc5d0522910e3e 100644 --- a/mlir/include/mlir/Dialect/LLVMIR/LLVMDialect.td +++ b/mlir/include/mlir/Dialect/LLVMIR/LLVMDialect.td @@ -21,6 +21,11 @@ def LLVM_Dialect : Dialect { let hasRegionResultAttrVerify = 1; let hasOperationAttrVerify = 1; + let discardableAttrs = (ins + /// Attribute encoding size and type of GPU workgroup attributions. + "WorkgroupAttributionAttr":$workgroup_attribution + ); + let extraClassDeclaration = [{ /// Name of the data layout attributes. 
static StringRef getDataLayoutAttrName() { return "llvm.data_layout"; } diff --git a/mlir/lib/Conversion/CMakeLists.txt b/mlir/lib/Conversion/CMakeLists.txt index 80c8b84d9ae89a5..813f700c5556e18 100644 --- a/mlir/lib/Conversion/CMakeLists.txt +++ b/mlir/lib/Conversion/CMakeLists.txt @@ -53,6 +53,7 @@ add_subdirectory(SCFToGPU) add_subdirectory(SCFToOpenMP) add_subdirectory(SCFToSPIRV) add_subdirectory(ShapeToStandard) +add_subdirectory(SPIRVCommon) add_subdirectory(SPIRVToLLVM) add_subdirectory(TensorToLinalg) add_subdirectory(TensorToSPIRV) diff --git a/mlir/lib/Conversion/GPUCommon/GPUOpsLowering.cpp b/mlir/lib/Conversion/GPUCommon/GPUOpsLowering.cpp index 6053e34f30a418f..5b590a457f77142 100644 --- a/mlir/lib/Conversion/GPUCommon/GPUOpsLowering.cpp +++ b/mlir/lib/Conversion/GPUCommon/GPUOpsLowering.cpp @@ -25,29 +25,80 @@ GPUFuncOpLowering::matchAndRewrite(gpu::GPUFuncOp gpuFuncOp, OpAdaptor adaptor, Location loc = gpuFuncOp.getLoc(); SmallVector workgroupBuffers; - workgroupBuffers.reserve(gpuFuncOp.getNumWorkgroupAttributions()); - for (const auto [idx, attribution] : - llvm::enumerate(gpuFuncOp.getWorkgroupAttributions())) { - auto type = dyn_cast(attribution.getType()); - assert(type && type.hasStaticShape() && "unexpected type in attribution"); - - uint64_t numElements = type.getNumElements(); - - auto elementType = - cast(typeConverter->convertType(type.getElementType())); - auto arrayType = LLVM::LLVMArrayType::get(elementType, numElements); - std::string name = - std::string(llvm::formatv("__wg_{0}_{1}", gpuFuncOp.getName(), idx)); - uint64_t alignment = 0; - if (auto alignAttr = - dyn_cast_or_null(gpuFuncOp.getWorkgroupAttributionAttr( - idx, LLVM::LLVMDialect::getAlignAttrName()))) - alignment = alignAttr.getInt(); - auto globalOp = rewriter.create( - gpuFuncOp.getLoc(), arrayType, /*isConstant=*/false, - LLVM::Linkage::Internal, name, /*value=*/Attribute(), alignment, - workgroupAddrSpace); - workgroupBuffers.push_back(globalOp); + if 
(encodeWorkgroupAttributionsAsArguments) { + // Append one `llvm.ptr` argument per workgroup attribution to the + // function signature. + + ArrayRef workgroupAttributions = + gpuFuncOp.getWorkgroupAttributions(); + size_t numAttributions = workgroupAttributions.size(); + + // Insert all arguments at the end. + unsigned index = gpuFuncOp.getNumArguments(); + SmallVector argIndices(numAttributions, index); + + // New arguments will simply be `llvm.ptr` with the correct address space + Type workgroupPtrType = + rewriter.getType(workgroupAddrSpace); + SmallVector argTypes(numAttributions, workgroupPtrType); + + // Attributes: noalias, llvm.mlir.workgroup_attribution(<size>, <type>) + std::array attrs{ + rewriter.getNamedAttr(LLVM::LLVMDialect::getNoAliasAttrName(), + rewriter.getUnitAttr()), + rewriter.getNamedAttr( + getDialect().getWorkgroupAttributionAttrHelper().getName(), + rewriter.getUnitAttr()), + }; + SmallVector argAttrs; + for (BlockArgument attribution : workgroupAttributions) { + auto attributionType = cast(attribution.getType()); + IntegerAttr numElements = + rewriter.getI64IntegerAttr(attributionType.getNumElements()); + Type llvmElementType = + getTypeConverter()->convertType(attributionType.getElementType()); + if (!llvmElementType) + return failure(); + TypeAttr type = TypeAttr::get(llvmElementType); + attrs.back().setValue( + rewriter.getAttr(numElements, type)); + argAttrs.push_back(rewriter.getDictionaryAttr(attrs)); + } + + // Locations match the function location + SmallVector argLocs(numAttributions, gpuFuncOp.getLoc()); + + // Perform signature modification + rewriter.modifyOpInPlace( + gpuFuncOp, [gpuFuncOp, &argIndices, &argTypes, &argAttrs, &argLocs]() { + static_cast(gpuFuncOp).insertArguments( + argIndices, argTypes, argAttrs, argLocs); + }); + } else { + workgroupBuffers.reserve(gpuFuncOp.getNumWorkgroupAttributions()); + for (auto [idx, attribution] : + llvm::enumerate(gpuFuncOp.getWorkgroupAttributions())) { + auto type = 
dyn_cast(attribution.getType()); + assert(type && type.hasStaticShape() && "unexpected type in attribution"); + + uint64_t numElements = type.getNumElements(); + + auto elementType = + cast(typeConverter->convertType(type.getElementType())); + auto arrayType = LLVM::LLVMArrayType::get(elementType, numElements); + std::string name = + std::string(llvm::formatv("__wg_{0}_{1}", gpuFuncOp.getName(), idx)); + uint64_t alignment = 0; + if (auto alignAttr = dyn_cast_or_null( + gpuFuncOp.getWorkgroupAttributionAttr( + idx, LLVM::LLVMDialect::getAlignAttrName()))) + alignment = alignAttr.getInt(); + auto globalOp = rewriter.create( + gpuFuncOp.getLoc(), arrayType, /*isConstant=*/false, + LLVM::Linkage::Internal, name, /*value=*/Attribute(), alignment, + workgroupAddrSpace); + workgroupBuffers.push_back(globalOp); + } } // Remap proper input types. @@ -101,16 +152,19 @@ GPUFuncOpLowering::matchAndRewrite(gpu::GPUFuncOp gpuFuncOp, OpAdaptor adaptor, // attribute. The former is necessary for further translation while the // latter is expected by gpu.launch_func. if (gpuFuncOp.isKernel()) { - attributes.emplace_back(kernelAttributeName, rewriter.getUnitAttr()); + if (kernelAttributeName) + attributes.emplace_back(kernelAttributeName, rewriter.getUnitAttr()); // Set the dialect-specific block size attribute if there is one. - if (kernelBlockSizeAttributeName.has_value() && knownBlockSize) { - attributes.emplace_back(kernelBlockSizeAttributeName.value(), - knownBlockSize); + if (kernelBlockSizeAttributeName && knownBlockSize) { + attributes.emplace_back(kernelBlockSizeAttributeName, knownBlockSize); } } + LLVM::CConv callingConvention = gpuFuncOp.isKernel() + ? 
kernelCallingConvention + : nonKernelCallingConvention; auto llvmFuncOp = rewriter.create( gpuFuncOp.getLoc(), gpuFuncOp.getName(), funcType, - LLVM::Linkage::External, /*dsoLocal=*/false, /*cconv=*/LLVM::CConv::C, + LLVM::Linkage::External, /*dsoLocal=*/false, callingConvention, /*comdat=*/nullptr, attributes); { @@ -125,24 +179,51 @@ GPUFuncOpLowering::matchAndRewrite(gpu::GPUFuncOp gpuFuncOp, OpAdaptor adaptor, rewriter.setInsertionPointToStart(&gpuFuncOp.front()); unsigned numProperArguments = gpuFuncOp.getNumArguments(); - for (const auto [idx, global] : llvm::enumerate(workgroupBuffers)) { - auto ptrType = LLVM::LLVMPointerType::get(rewriter.getContext(), - global.getAddrSpace()); - Value address = rewriter.create( - loc, ptrType, global.getSymNameAttr()); - Value memory = - rewriter.create(loc, ptrType, global.getType(), address, - ArrayRef{0, 0}); - - // Build a memref descriptor pointing to the buffer to plug with the - // existing memref infrastructure. This may use more registers than - // otherwise necessary given that memref sizes are fixed, but we can try - // and canonicalize that away later. - Value attribution = gpuFuncOp.getWorkgroupAttributions()[idx]; - auto type = cast(attribution.getType()); - auto descr = MemRefDescriptor::fromStaticShape( - rewriter, loc, *getTypeConverter(), type, memory); - signatureConversion.remapInput(numProperArguments + idx, descr); + if (encodeWorkgroupAttributionsAsArguments) { + // Build a MemRefDescriptor with each of the arguments added above. 
+ + unsigned numAttributions = gpuFuncOp.getNumWorkgroupAttributions(); + assert(numProperArguments >= numAttributions && + "Expecting attributions to be encoded as arguments already"); + + // Arguments encoding workgroup attributions are the trailing arguments: + // [numProperArguments - numAttributions, numProperArguments) + ArrayRef attributionArguments = + gpuFuncOp.getArguments().slice(numProperArguments - numAttributions, + numAttributions); + for (auto [idx, vals] : llvm::enumerate(llvm::zip_equal( + gpuFuncOp.getWorkgroupAttributions(), attributionArguments))) { + auto [attribution, arg] = vals; + auto type = cast(attribution.getType()); + + // Arguments are of llvm.ptr type and attributions are of memref type: + // we need to wrap them in memref descriptors. + Value descr = MemRefDescriptor::fromStaticShape( + rewriter, loc, *getTypeConverter(), type, arg); + + // And remap the arguments + signatureConversion.remapInput(numProperArguments + idx, descr); + } + } else { + for (const auto [idx, global] : llvm::enumerate(workgroupBuffers)) { + auto ptrType = LLVM::LLVMPointerType::get(rewriter.getContext(), + global.getAddrSpace()); + Value address = rewriter.create( + loc, ptrType, global.getSymNameAttr()); + Value memory = + rewriter.create(loc, ptrType, global.getType(), + address, ArrayRef{0, 0}); + + // Build a memref descriptor pointing to the buffer to plug with the + // existing memref infrastructure. This may use more registers than + // otherwise necessary given that memref sizes are fixed, but we can try + // and canonicalize that away later. + Value attribution = gpuFuncOp.getWorkgroupAttributions()[idx]; + auto type = cast(attribution.getType()); + auto descr = MemRefDescriptor::fromStaticShape( + rewriter, loc, *getTypeConverter(), type, memory); + signatureConversion.remapInput(numProperArguments + idx, descr); + } } // Rewrite private memory attributions to alloca'ed buffers. 
@@ -239,6 +320,8 @@ GPUFuncOpLowering::matchAndRewrite(gpu::GPUFuncOp gpuFuncOp, OpAdaptor adaptor, copyPointerAttribute(LLVM::LLVMDialect::getDereferenceableAttrName()); copyPointerAttribute( LLVM::LLVMDialect::getDereferenceableOrNullAttrName()); + copyPointerAttribute( + LLVM::LLVMDialect::WorkgroupAttributionAttrHelper::getNameStr()); } } rewriter.eraseOp(gpuFuncOp); diff --git a/mlir/lib/Conversion/GPUCommon/GPUOpsLowering.h b/mlir/lib/Conversion/GPUCommon/GPUOpsLowering.h index 92e69badc27ddfc..444a07a93ca36e7 100644 --- a/mlir/lib/Conversion/GPUCommon/GPUOpsLowering.h +++ b/mlir/lib/Conversion/GPUCommon/GPUOpsLowering.h @@ -35,16 +35,41 @@ struct GPUDynamicSharedMemoryOpLowering unsigned alignmentBit; }; +struct GPUFuncOpLoweringOptions { + /// The address space to use for `alloca`s in private memory. + unsigned allocaAddrSpace; + /// The address space to use when declaring workgroup memory. + unsigned workgroupAddrSpace; + + /// The attribute name to use instead of `gpu.kernel`. Null if no attribute + /// should be used. + StringAttr kernelAttributeName; + /// The attribute name to set block size. Null if no attribute should be + /// used. + StringAttr kernelBlockSizeAttributeName; + + /// The calling convention to use for kernel functions. + LLVM::CConv kernelCallingConvention = LLVM::CConv::C; + /// The calling convention to use for non-kernel functions. + LLVM::CConv nonKernelCallingConvention = LLVM::CConv::C; + + /// Whether to encode workgroup attributions as additional arguments instead + /// of a global variable. 
+ bool encodeWorkgroupAttributionsAsArguments = false; +}; + struct GPUFuncOpLowering : ConvertOpToLLVMPattern { - GPUFuncOpLowering( - const LLVMTypeConverter &converter, unsigned allocaAddrSpace, - unsigned workgroupAddrSpace, StringAttr kernelAttributeName, - std::optional kernelBlockSizeAttributeName = std::nullopt) + GPUFuncOpLowering(const LLVMTypeConverter &converter, + const GPUFuncOpLoweringOptions &options) : ConvertOpToLLVMPattern(converter), - allocaAddrSpace(allocaAddrSpace), - workgroupAddrSpace(workgroupAddrSpace), - kernelAttributeName(kernelAttributeName), - kernelBlockSizeAttributeName(kernelBlockSizeAttributeName) {} + allocaAddrSpace(options.allocaAddrSpace), + workgroupAddrSpace(options.workgroupAddrSpace), + kernelAttributeName(options.kernelAttributeName), + kernelBlockSizeAttributeName(options.kernelBlockSizeAttributeName), + kernelCallingConvention(options.kernelCallingConvention), + nonKernelCallingConvention(options.nonKernelCallingConvention), + encodeWorkgroupAttributionsAsArguments( + options.encodeWorkgroupAttributionsAsArguments) {} LogicalResult matchAndRewrite(gpu::GPUFuncOp gpuFuncOp, OpAdaptor adaptor, @@ -56,11 +81,21 @@ struct GPUFuncOpLowering : ConvertOpToLLVMPattern { /// The address space to use declaring workgroup memory. unsigned workgroupAddrSpace; - /// The attribute name to use instead of `gpu.kernel`. + /// The attribute name to use instead of `gpu.kernel`. Null if no attribute + /// should be used. StringAttr kernelAttributeName; - - /// The attribute name to to set block size - std::optional kernelBlockSizeAttributeName; + /// The attribute name to set block size. Null if no attribute should be + /// used. 
+ StringAttr kernelBlockSizeAttributeName; + + /// The calling convention to use for kernel functions + LLVM::CConv kernelCallingConvention; + /// The calling convention to use for non-kernel functions + LLVM::CConv nonKernelCallingConvention; + + /// Whether to encode workgroup attributions as additional arguments instead + /// of a global variable. + bool encodeWorkgroupAttributionsAsArguments; }; /// The lowering of gpu.printf to a call to HIP hostcalls diff --git a/mlir/lib/Conversion/GPUToLLVMSPV/CMakeLists.txt b/mlir/lib/Conversion/GPUToLLVMSPV/CMakeLists.txt index da5650b2b68dded..d47c5e679d86e80 100644 --- a/mlir/lib/Conversion/GPUToLLVMSPV/CMakeLists.txt +++ b/mlir/lib/Conversion/GPUToLLVMSPV/CMakeLists.txt @@ -6,7 +6,9 @@ add_mlir_conversion_library(MLIRGPUToLLVMSPV LINK_LIBS PUBLIC MLIRGPUDialect + MLIRGPUToGPURuntimeTransforms MLIRLLVMCommonConversion MLIRLLVMDialect + MLIRSPIRVAttrToLLVMConversion MLIRSPIRVDialect ) diff --git a/mlir/lib/Conversion/GPUToLLVMSPV/GPUToLLVMSPV.cpp b/mlir/lib/Conversion/GPUToLLVMSPV/GPUToLLVMSPV.cpp index 27d63b5f8948d48..36e4a6a38a68e47 100644 --- a/mlir/lib/Conversion/GPUToLLVMSPV/GPUToLLVMSPV.cpp +++ b/mlir/lib/Conversion/GPUToLLVMSPV/GPUToLLVMSPV.cpp @@ -8,15 +8,18 @@ #include "mlir/Conversion/GPUToLLVMSPV/GPUToLLVMSPVPass.h" +#include "../GPUCommon/GPUOpsLowering.h" #include "mlir/Conversion/LLVMCommon/ConversionTarget.h" #include "mlir/Conversion/LLVMCommon/LoweringOptions.h" #include "mlir/Conversion/LLVMCommon/Pattern.h" #include "mlir/Conversion/LLVMCommon/TypeConverter.h" +#include "mlir/Conversion/SPIRVCommon/AttrToLLVMConverter.h" #include "mlir/Dialect/GPU/IR/GPUDialect.h" #include "mlir/Dialect/LLVMIR/LLVMAttrs.h" #include "mlir/Dialect/LLVMIR/LLVMDialect.h" #include "mlir/Dialect/LLVMIR/LLVMTypes.h" #include "mlir/Dialect/SPIRV/IR/SPIRVDialect.h" +#include "mlir/Dialect/SPIRV/IR/SPIRVEnums.h" #include "mlir/Dialect/SPIRV/IR/TargetAndABI.h" #include "mlir/IR/BuiltinTypes.h" #include "mlir/IR/Matchers.h" @@ 
-321,8 +324,8 @@ struct GPUToLLVMSPVConversionPass final LLVMConversionTarget target(*context); target.addIllegalOp(); + gpu::GPUFuncOp, gpu::GlobalIdOp, gpu::GridDimOp, + gpu::ReturnOp, gpu::ShuffleOp, gpu::ThreadIdOp>(); populateGpuToLLVMSPVConversionPatterns(converter, patterns); @@ -340,11 +343,27 @@ struct GPUToLLVMSPVConversionPass final namespace mlir { void populateGpuToLLVMSPVConversionPatterns(LLVMTypeConverter &typeConverter, RewritePatternSet &patterns) { - patterns.add, LaunchConfigOpConversion, LaunchConfigOpConversion, LaunchConfigOpConversion, LaunchConfigOpConversion>(typeConverter); + constexpr spirv::ClientAPI clientAPI = spirv::ClientAPI::OpenCL; + MLIRContext *context = &typeConverter.getContext(); + unsigned privateAddressSpace = + storageClassToAddressSpace(clientAPI, spirv::StorageClass::Function); + unsigned localAddressSpace = + storageClassToAddressSpace(clientAPI, spirv::StorageClass::Workgroup); + OperationName llvmFuncOpName(LLVM::LLVMFuncOp::getOperationName(), context); + StringAttr kernelBlockSizeAttributeName = + LLVM::LLVMFuncOp::getReqdWorkGroupSizeAttrName(llvmFuncOpName); + patterns.add( + typeConverter, + GPUFuncOpLoweringOptions{ + privateAddressSpace, localAddressSpace, + /*kernelAttributeName=*/{}, kernelBlockSizeAttributeName, + LLVM::CConv::SPIR_KERNEL, LLVM::CConv::SPIR_FUNC, + /*encodeWorkgroupAttributionsAsArguments=*/true}); } } // namespace mlir diff --git a/mlir/lib/Conversion/GPUToNVVM/LowerGpuOpsToNVVMOps.cpp b/mlir/lib/Conversion/GPUToNVVM/LowerGpuOpsToNVVMOps.cpp index faa97caacb88517..060a1e1e82f75e6 100644 --- a/mlir/lib/Conversion/GPUToNVVM/LowerGpuOpsToNVVMOps.cpp +++ b/mlir/lib/Conversion/GPUToNVVM/LowerGpuOpsToNVVMOps.cpp @@ -365,13 +365,15 @@ void mlir::populateGpuToNVVMConversionPatterns(LLVMTypeConverter &converter, // attributions since NVVM models it as `alloca`s in the default // memory space and does not support `alloca`s with addrspace(5). 
patterns.add( - converter, /*allocaAddrSpace=*/0, - /*workgroupAddrSpace=*/ - static_cast(NVVM::NVVMMemorySpace::kSharedMemorySpace), - StringAttr::get(&converter.getContext(), - NVVM::NVVMDialect::getKernelFuncAttrName()), - StringAttr::get(&converter.getContext(), - NVVM::NVVMDialect::getMaxntidAttrName())); + converter, + GPUFuncOpLoweringOptions{ + /*allocaAddrSpace=*/0, + /*workgroupAddrSpace=*/ + static_cast(NVVM::NVVMMemorySpace::kSharedMemorySpace), + StringAttr::get(&converter.getContext(), + NVVM::NVVMDialect::getKernelFuncAttrName()), + StringAttr::get(&converter.getContext(), + NVVM::NVVMDialect::getMaxntidAttrName())}); populateOpPatterns(converter, patterns, "__nv_fmodf", "__nv_fmod"); diff --git a/mlir/lib/Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp b/mlir/lib/Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp index 100181cdc69fe73..564bab1ad92b90e 100644 --- a/mlir/lib/Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp +++ b/mlir/lib/Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp @@ -372,10 +372,11 @@ void mlir::populateGpuToROCDLConversionPatterns( patterns.add(converter); patterns.add( converter, - /*allocaAddrSpace=*/ROCDL::ROCDLDialect::kPrivateMemoryAddressSpace, - /*workgroupAddrSpace=*/ROCDL::ROCDLDialect::kSharedMemoryAddressSpace, - rocdlDialect->getKernelAttrHelper().getName(), - rocdlDialect->getReqdWorkGroupSizeAttrHelper().getName()); + GPUFuncOpLoweringOptions{ + /*allocaAddrSpace=*/ROCDL::ROCDLDialect::kPrivateMemoryAddressSpace, + /*workgroupAddrSpace=*/ROCDL::ROCDLDialect::kSharedMemoryAddressSpace, + rocdlDialect->getKernelAttrHelper().getName(), + rocdlDialect->getReqdWorkGroupSizeAttrHelper().getName()}); if (Runtime::HIP == runtime) { patterns.add(converter); } else if (Runtime::OpenCL == runtime) { diff --git a/mlir/lib/Conversion/SPIRVCommon/AttrToLLVMConverter.cpp b/mlir/lib/Conversion/SPIRVCommon/AttrToLLVMConverter.cpp new file mode 100644 index 000000000000000..7f83a474c3f93c6 --- /dev/null +++ 
b/mlir/lib/Conversion/SPIRVCommon/AttrToLLVMConverter.cpp @@ -0,0 +1,60 @@ +//===- AttrToLLVMConverter.cpp - SPIR-V attributes conversion to LLVM -C++ ===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "mlir/Conversion/SPIRVCommon/AttrToLLVMConverter.h" + +namespace mlir { +namespace { + +//===----------------------------------------------------------------------===// +// Constants +//===----------------------------------------------------------------------===// + +constexpr unsigned defaultAddressSpace = 0; + +//===----------------------------------------------------------------------===// +// Utility functions +//===----------------------------------------------------------------------===// + +static unsigned +storageClassToOCLAddressSpace(spirv::StorageClass storageClass) { + // Based on + // https://registry.khronos.org/SPIR-V/specs/unified1/OpenCL.ExtendedInstructionSet.100.html#_binary_form + // and clang/lib/Basic/Targets/SPIR.h. 
+ switch (storageClass) { + case spirv::StorageClass::Function: + return 0; + case spirv::StorageClass::Input: + case spirv::StorageClass::CrossWorkgroup: + return 1; + case spirv::StorageClass::UniformConstant: + return 2; + case spirv::StorageClass::Workgroup: + return 3; + case spirv::StorageClass::Generic: + return 4; + case spirv::StorageClass::DeviceOnlyINTEL: + return 5; + case spirv::StorageClass::HostOnlyINTEL: + return 6; + default: + return defaultAddressSpace; + } +} +} // namespace + +unsigned storageClassToAddressSpace(spirv::ClientAPI clientAPI, + spirv::StorageClass storageClass) { + switch (clientAPI) { + case spirv::ClientAPI::OpenCL: + return storageClassToOCLAddressSpace(storageClass); + default: + return defaultAddressSpace; + } +} +} // namespace mlir diff --git a/mlir/lib/Conversion/SPIRVCommon/CMakeLists.txt b/mlir/lib/Conversion/SPIRVCommon/CMakeLists.txt new file mode 100644 index 000000000000000..cd5a4c225efbf42 --- /dev/null +++ b/mlir/lib/Conversion/SPIRVCommon/CMakeLists.txt @@ -0,0 +1,6 @@ +add_mlir_conversion_library(MLIRSPIRVAttrToLLVMConversion + AttrToLLVMConverter.cpp + + DEPENDS + MLIRSPIRVEnumsIncGen +) diff --git a/mlir/lib/Conversion/SPIRVToLLVM/CMakeLists.txt b/mlir/lib/Conversion/SPIRVToLLVM/CMakeLists.txt index 549785b154c1b2e..e563315d95c9ca6 100644 --- a/mlir/lib/Conversion/SPIRVToLLVM/CMakeLists.txt +++ b/mlir/lib/Conversion/SPIRVToLLVM/CMakeLists.txt @@ -18,6 +18,7 @@ add_mlir_conversion_library(MLIRSPIRVToLLVM MLIRLLVMCommonConversion MLIRLLVMDialect MLIRMemRefToLLVM + MLIRSPIRVAttrToLLVMConversion MLIRSPIRVDialect MLIRSPIRVUtils MLIRTransforms diff --git a/mlir/lib/Conversion/SPIRVToLLVM/SPIRVToLLVM.cpp b/mlir/lib/Conversion/SPIRVToLLVM/SPIRVToLLVM.cpp index da09384bfbe8954..ca7863163241985 100644 --- a/mlir/lib/Conversion/SPIRVToLLVM/SPIRVToLLVM.cpp +++ b/mlir/lib/Conversion/SPIRVToLLVM/SPIRVToLLVM.cpp @@ -13,6 +13,7 @@ #include "mlir/Conversion/SPIRVToLLVM/SPIRVToLLVM.h" #include 
"mlir/Conversion/LLVMCommon/Pattern.h" #include "mlir/Conversion/LLVMCommon/TypeConverter.h" +#include "mlir/Conversion/SPIRVCommon/AttrToLLVMConverter.h" #include "mlir/Dialect/LLVMIR/LLVMDialect.h" #include "mlir/Dialect/SPIRV/IR/SPIRVDialect.h" #include "mlir/Dialect/SPIRV/IR/SPIRVEnums.h" @@ -28,12 +29,6 @@ using namespace mlir; -//===----------------------------------------------------------------------===// -// Constants -//===----------------------------------------------------------------------===// - -constexpr unsigned defaultAddressSpace = 0; - //===----------------------------------------------------------------------===// // Utility functions //===----------------------------------------------------------------------===// @@ -273,47 +268,13 @@ static std::optional convertArrayType(spirv::ArrayType type, return LLVM::LLVMArrayType::get(llvmElementType, numElements); } -static unsigned mapToOpenCLAddressSpace(spirv::StorageClass storageClass) { - // Based on - // https://registry.khronos.org/SPIR-V/specs/unified1/OpenCL.ExtendedInstructionSet.100.html#_binary_form - // and clang/lib/Basic/Targets/SPIR.h. 
- switch (storageClass) { -#define STORAGE_SPACE_MAP(storage, space) \ - case spirv::StorageClass::storage: \ - return space; - STORAGE_SPACE_MAP(Function, 0) - STORAGE_SPACE_MAP(CrossWorkgroup, 1) - STORAGE_SPACE_MAP(Input, 1) - STORAGE_SPACE_MAP(UniformConstant, 2) - STORAGE_SPACE_MAP(Workgroup, 3) - STORAGE_SPACE_MAP(Generic, 4) - STORAGE_SPACE_MAP(DeviceOnlyINTEL, 5) - STORAGE_SPACE_MAP(HostOnlyINTEL, 6) -#undef STORAGE_SPACE_MAP - default: - return defaultAddressSpace; - } -} - -static unsigned mapToAddressSpace(spirv::ClientAPI clientAPI, - spirv::StorageClass storageClass) { - switch (clientAPI) { -#define CLIENT_MAP(client, storage) \ - case spirv::ClientAPI::client: \ - return mapTo##client##AddressSpace(storage); - CLIENT_MAP(OpenCL, storageClass) -#undef CLIENT_MAP - default: - return defaultAddressSpace; - } -} - /// Converts SPIR-V pointer type to LLVM pointer. Pointer's storage class is not /// modelled at the moment. static Type convertPointerType(spirv::PointerType type, LLVMTypeConverter &converter, spirv::ClientAPI clientAPI) { - unsigned addressSpace = mapToAddressSpace(clientAPI, type.getStorageClass()); + unsigned addressSpace = + storageClassToAddressSpace(clientAPI, type.getStorageClass()); return LLVM::LLVMPointerType::get(type.getContext(), addressSpace); } @@ -822,7 +783,7 @@ class GlobalVariablePattern : LLVM::Linkage::External; auto newGlobalOp = rewriter.replaceOpWithNewOp( op, dstType, isConstant, linkage, op.getSymName(), Attribute(), - /*alignment=*/0, mapToAddressSpace(clientAPI, storageClass)); + /*alignment=*/0, storageClassToAddressSpace(clientAPI, storageClass)); // Attach location attribute if applicable if (op.getLocationAttr()) diff --git a/mlir/test/Conversion/GPUToLLVMSPV/gpu-to-llvm-spv.mlir b/mlir/test/Conversion/GPUToLLVMSPV/gpu-to-llvm-spv.mlir index bd7e5d139b0010b..8e133288b832b68 100644 --- a/mlir/test/Conversion/GPUToLLVMSPV/gpu-to-llvm-spv.mlir +++ b/mlir/test/Conversion/GPUToLLVMSPV/gpu-to-llvm-spv.mlir @@ -377,3 
+377,129 @@ gpu.module @shuffles_mismatch { return } } + +// ----- + +gpu.module @kernels { + // CHECK: llvm.func spir_funccc @no_kernel() { + gpu.func @no_kernel() { + gpu.return + } + + // CHECK: llvm.func spir_kernelcc @kernel_no_arg() attributes {gpu.kernel} { + gpu.func @kernel_no_arg() kernel { + gpu.return + } + + // CHECK: llvm.func spir_kernelcc @kernel_with_args(%{{.*}}: f32, %{{.*}}: i64) attributes {gpu.kernel} { + gpu.func @kernel_with_args(%arg0: f32, %arg1: i64) kernel { + gpu.return + } + + // CHECK-64: llvm.func spir_kernelcc @kernel_with_conv_args(%{{.*}}: i64, %{{.*}}: !llvm.ptr, %{{.*}}: !llvm.ptr, %{{.*}}: i64) attributes {gpu.kernel} { + // CHECK-32: llvm.func spir_kernelcc @kernel_with_conv_args(%{{.*}}: i32, %{{.*}}: !llvm.ptr, %{{.*}}: !llvm.ptr, %{{.*}}: i32) attributes {gpu.kernel} { + gpu.func @kernel_with_conv_args(%arg0: index, %arg1: memref) kernel { + gpu.return + } + + // CHECK-64: llvm.func spir_kernelcc @kernel_with_sized_memref(%{{.*}}: !llvm.ptr, %{{.*}}: !llvm.ptr, %{{.*}}: i64, %{{.*}}: i64, %{{.*}}: i64) attributes {gpu.kernel} { + // CHECK-32: llvm.func spir_kernelcc @kernel_with_sized_memref(%{{.*}}: !llvm.ptr, %{{.*}}: !llvm.ptr, %{{.*}}: i32, %{{.*}}: i32, %{{.*}}: i32) attributes {gpu.kernel} { + gpu.func @kernel_with_sized_memref(%arg0: memref<1xindex>) kernel { + gpu.return + } + + // CHECK-64: llvm.func spir_kernelcc @kernel_with_ND_memref(%{{.*}}: !llvm.ptr, %{{.*}}: !llvm.ptr, %{{.*}}: i64, %{{.*}}: i64, %{{.*}}: i64, %{{.*}}: i64, %{{.*}}: i64, %{{.*}}: i64, %{{.*}}: i64) attributes {gpu.kernel} { + // CHECK-32: llvm.func spir_kernelcc @kernel_with_ND_memref(%{{.*}}: !llvm.ptr, %{{.*}}: !llvm.ptr, %{{.*}}: i32, %{{.*}}: i32, %{{.*}}: i32, %{{.*}}: i32, %{{.*}}: i32, %{{.*}}: i32, %{{.*}}: i32) attributes {gpu.kernel} { + gpu.func @kernel_with_ND_memref(%arg0: memref<128x128x128xindex>) kernel { + gpu.return + } +} + +// ----- + +gpu.module @kernels { +// CHECK-LABEL: llvm.func spir_kernelcc 
@kernel_with_private_attributions() attributes {gpu.kernel} { + +// Private attribution is converted to an llvm.alloca + +// CHECK: %[[VAL_2:.*]] = llvm.mlir.constant(32 : i64) : i64 +// CHECK: %[[VAL_3:.*]] = llvm.alloca %[[VAL_2]] x f32 : (i64) -> !llvm.ptr + +// MemRef descriptor built from allocated pointer + +// CHECK-64: %[[VAL_4:.*]] = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> +// CHECK-32: %[[VAL_4:.*]] = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i32, array<1 x i32>, array<1 x i32>)> + +// CHECK: %[[VAL_5:.*]] = llvm.insertvalue %[[VAL_3]], %[[VAL_4]][0] +// CHECK: llvm.insertvalue %[[VAL_3]], %[[VAL_5]][1] + +// Same code as above + +// CHECK: %[[VAL_14:.*]] = llvm.mlir.constant(16 : i64) : i64 +// CHECK: %[[VAL_15:.*]] = llvm.alloca %[[VAL_14]] x i16 : (i64) -> !llvm.ptr + +// CHECK-64: %[[VAL_16:.*]] = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> +// CHECK-32: %[[VAL_16:.*]] = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i32, array<1 x i32>, array<1 x i32>)> + +// CHECK: %[[VAL_17:.*]] = llvm.insertvalue %[[VAL_15]], %[[VAL_16]][0] +// CHECK: llvm.insertvalue %[[VAL_15]], %[[VAL_17]][1] + gpu.func @kernel_with_private_attributions() + private(%arg2: memref<32xf32>, %arg3: memref<16xi16>) + kernel { + gpu.return + } + +// Workgroup attributions are converted to an llvm.ptr<3> argument + +// CHECK-LABEL: llvm.func spir_kernelcc @kernel_with_workgoup_attributions( +// CHECK-SAME: %[[VAL_29:.*]]: !llvm.ptr<3> {llvm.noalias, llvm.workgroup_attribution = #llvm.mlir.workgroup_attribution<32 : i64, f32>}, +// CHECK-SAME: %[[VAL_30:.*]]: !llvm.ptr<3> {llvm.noalias, llvm.workgroup_attribution = #llvm.mlir.workgroup_attribution<16 : i64, i16>}) attributes {gpu.kernel} { + +// MemRef descriptor built from new argument + +// CHECK-64: %[[VAL_31:.*]] = llvm.mlir.undef : !llvm.struct<(ptr<3>, ptr<3>, i64, array<1 x i64>, array<1 x i64>)> +// CHECK-32: %[[VAL_31:.*]] = llvm.mlir.undef : 
!llvm.struct<(ptr<3>, ptr<3>, i32, array<1 x i32>, array<1 x i32>)> + +// CHECK: %[[VAL_32:.*]] = llvm.insertvalue %[[VAL_29]], %[[VAL_31]][0] +// CHECK: llvm.insertvalue %[[VAL_29]], %[[VAL_32]][1] + +// Same as above + +// CHECK-64: %[[VAL_41:.*]] = llvm.mlir.undef : !llvm.struct<(ptr<3>, ptr<3>, i64, array<1 x i64>, array<1 x i64>)> +// CHECK-32: %[[VAL_41:.*]] = llvm.mlir.undef : !llvm.struct<(ptr<3>, ptr<3>, i32, array<1 x i32>, array<1 x i32>)> + +// CHECK: %[[VAL_42:.*]] = llvm.insertvalue %[[VAL_30]], %[[VAL_41]][0] +// CHECK: llvm.insertvalue %[[VAL_30]], %[[VAL_42]][1] + gpu.func @kernel_with_workgoup_attributions() + workgroup(%arg2: memref<32xf32, 3>, %arg3: memref<16xi16, 3>) + kernel { + gpu.return + } + +// Check with both private and workgroup attributions. Simply check additional +// arguments and a llvm.alloca are present. + +// CHECK-LABEL: llvm.func spir_kernelcc @kernel_with_both_attributions( +// CHECK-SAME: %{{.*}}: !llvm.ptr<3> {llvm.noalias, llvm.workgroup_attribution = #llvm.mlir.workgroup_attribution<8 : i64, f32>}, +// CHECK-64-SAME: %{{.*}}: !llvm.ptr<3> {llvm.noalias, llvm.workgroup_attribution = #llvm.mlir.workgroup_attribution<16 : i64, i64>}) attributes {gpu.kernel} { +// CHECK-32-SAME: %{{.*}}: !llvm.ptr<3> {llvm.noalias, llvm.workgroup_attribution = #llvm.mlir.workgroup_attribution<16 : i64, i32>}) attributes {gpu.kernel} { + +// CHECK: %[[VAL_79:.*]] = llvm.mlir.constant(32 : i64) : i64 +// CHECK: %[[VAL_80:.*]] = llvm.alloca %[[VAL_79]] x i32 : (i64) -> !llvm.ptr + +// CHECK: %[[VAL_91:.*]] = llvm.mlir.constant(32 : i64) : i64 +// CHECK-64: %[[VAL_92:.*]] = llvm.alloca %[[VAL_91]] x i64 : (i64) -> !llvm.ptr +// CHECK-32: %[[VAL_92:.*]] = llvm.alloca %[[VAL_91]] x i32 : (i64) -> !llvm.ptr + gpu.func @kernel_with_both_attributions() + workgroup(%arg4: memref<8xf32, 3>, %arg5: memref<16xindex, 3>) + private(%arg6: memref<32xi32>, %arg7: memref<32xindex>) + kernel { + gpu.return + } + +// CHECK-LABEL: llvm.func spir_kernelcc 
@kernel_known_block_size +// CHECK-SAME: reqd_work_group_size = array + gpu.func @kernel_known_block_size() kernel attributes {known_block_size = array} { + gpu.return + } +} diff --git a/mlir/test/Dialect/LLVMIR/func.mlir b/mlir/test/Dialect/LLVMIR/func.mlir index 40b4e49f08a3ea1..e2a444c1faaba11 100644 --- a/mlir/test/Dialect/LLVMIR/func.mlir +++ b/mlir/test/Dialect/LLVMIR/func.mlir @@ -472,3 +472,10 @@ llvm.func @reqd_work_group_size_hint() attributes {reqd_work_group_size = array< // CHECK: @intel_reqd_sub_group_size_hint() // CHECK-SAME: intel_reqd_sub_group_size = 32 : i32 llvm.func @intel_reqd_sub_group_size_hint() attributes {llvm.intel_reqd_sub_group_size = 32 : i32} + +// ----- + +// CHECK: @workgroup_attribution +// CHECK-SAME: llvm.workgroup_attribution = #llvm.mlir.workgroup_attribution<512 : i64, i32> +// CHECK-SAME: llvm.workgroup_attribution = #llvm.mlir.workgroup_attribution<128 : i64, !llvm.struct<(i32, i64, f32)> +llvm.func @workgroup_attribution(%arg0: !llvm.ptr {llvm.workgroup_attribution = #llvm.mlir.workgroup_attribution<512 : i64, i32>}, %arg1: !llvm.ptr {llvm.workgroup_attribution = #llvm.mlir.workgroup_attribution<128 : i64, !llvm.struct<(i32, i64, f32)>>})