Skip to content

Commit

Permalink
[mlir][sparse][gpu] re-enable all GPU libgen tests (#72185)
Browse files Browse the repository at this point in the history
The previous change no longer properly used the GPU libgen pass (even though
most tests still passed by falling back to the CPU). This revision puts the
proper pass order into place. It also includes a bit of cleanup of the CPU
codegen vs. libgen setup.
  • Loading branch information
aartbik authored Nov 14, 2023
1 parent 57dd23b commit 5f32bcf
Show file tree
Hide file tree
Showing 15 changed files with 85 additions and 78 deletions.
3 changes: 1 addition & 2 deletions mlir/include/mlir/Dialect/SparseTensor/Pipelines/Passes.h
Original file line number Diff line number Diff line change
Expand Up @@ -144,8 +144,7 @@ struct SparseCompilerOptions

/// Projects out the options for `createSparsificationPass`.
SparsificationOptions sparsificationOptions() const {
return SparsificationOptions(parallelization, enableGPULibgen,
enableRuntimeLibrary);
return SparsificationOptions(parallelization, enableRuntimeLibrary);
}

/// Projects out the options for `createConvertVectorToLLVMPass`.
Expand Down
15 changes: 6 additions & 9 deletions mlir/include/mlir/Dialect/SparseTensor/Transforms/Passes.h
Original file line number Diff line number Diff line change
Expand Up @@ -74,15 +74,11 @@ std::unique_ptr<Pass> createPreSparsificationRewritePass();

/// Options for the Sparsification pass.
struct SparsificationOptions {
SparsificationOptions(SparseParallelizationStrategy p, bool gpuLibgen,
bool enableRT)
: parallelizationStrategy(p), enableGPULibgen(gpuLibgen),
enableRuntimeLibrary(enableRT) {}
SparsificationOptions(SparseParallelizationStrategy p, bool enableRT)
: parallelizationStrategy(p), enableRuntimeLibrary(enableRT) {}
SparsificationOptions()
: SparsificationOptions(SparseParallelizationStrategy::kNone, false,
true) {}
: SparsificationOptions(SparseParallelizationStrategy::kNone, true) {}
SparseParallelizationStrategy parallelizationStrategy;
bool enableGPULibgen;
bool enableRuntimeLibrary;
};

Expand Down Expand Up @@ -196,7 +192,8 @@ void populateSparseGPULibgenPatterns(RewritePatternSet &patterns,
bool enableRT);

std::unique_ptr<Pass> createSparseGPUCodegenPass();
std::unique_ptr<Pass> createSparseGPUCodegenPass(unsigned numThreads);
std::unique_ptr<Pass> createSparseGPUCodegenPass(unsigned numThreads,
bool enableRT);

//===----------------------------------------------------------------------===//
// The SparseStorageSpecifierToLLVM pass.
Expand Down Expand Up @@ -225,7 +222,7 @@ std::unique_ptr<Pass> createSparsificationAndBufferizationPass(
const SparsificationOptions &sparsificationOptions,
bool createSparseDeallocs, bool enableRuntimeLibrary,
bool enableBufferInitialization, unsigned vectorLength,
bool enableVLAVectorization, bool enableSIMDIndex32);
bool enableVLAVectorization, bool enableSIMDIndex32, bool enableGPULibgen);

//===----------------------------------------------------------------------===//
// Registration.
Expand Down
12 changes: 6 additions & 6 deletions mlir/include/mlir/Dialect/SparseTensor/Transforms/Passes.td
Original file line number Diff line number Diff line change
Expand Up @@ -105,7 +105,6 @@ def SparsificationPass : Pass<"sparsification", "ModuleOp"> {
"affine::AffineDialect",
"arith::ArithDialect",
"bufferization::BufferizationDialect",
"gpu::GPUDialect",
"LLVM::LLVMDialect",
"linalg::LinalgDialect",
"memref::MemRefDialect",
Expand All @@ -131,9 +130,6 @@ def SparsificationPass : Pass<"sparsification", "ModuleOp"> {
clEnumValN(mlir::SparseParallelizationStrategy::kAnyStorageAnyLoop,
"any-storage-any-loop",
"Enable sparse parallelization for any storage and loop."))}]>,
Option<"enableGPULibgen", "enable-gpu-libgen", "bool",
"false",
"Enable GPU acceleration by means of direct library calls (like cuSPARSE)">,
Option<"enableRuntimeLibrary", "enable-runtime-library", "bool",
"true", "Enable runtime library for manipulating sparse tensors">,
];
Expand Down Expand Up @@ -368,7 +364,9 @@ def SparseVectorization : Pass<"sparse-vectorization", "ModuleOp"> {
def SparseGPUCodegen : Pass<"sparse-gpu-codegen", "ModuleOp"> {
let summary = "Generates GPU code during sparsification";
let description = [{
Enables the sparsifier to use GPU acceleration.
Enables the sparsifier to use GPU acceleration. When the number of GPU
threads is set to zero, the pass tries to enable GPU acceleration by
means of direct library calls (like cuSPARSE).
}];
let constructor = "mlir::createSparseGPUCodegenPass()";
let dependentDialects = [
Expand All @@ -381,7 +379,9 @@ def SparseGPUCodegen : Pass<"sparse-gpu-codegen", "ModuleOp"> {
"sparse_tensor::SparseTensorDialect",
];
let options = [
Option<"numThreads", "num_threads", "int32_t", "1024", "Sets the number of GPU threads">,
Option<"numThreads", "num-threads", "int32_t", "1024", "Sets the number of GPU threads">,
Option<"enableRuntimeLibrary", "enable-runtime-library", "bool",
"true", "Enable runtime library for manipulating sparse tensors">,
];
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -31,18 +31,25 @@

void mlir::sparse_tensor::buildSparseCompiler(
OpPassManager &pm, const SparseCompilerOptions &options) {
// Rewrite named linalg ops into generic ops.
pm.addNestedPass<func::FuncOp>(createLinalgGeneralizationPass());

// Sparsification and bufferization mini-pipeline.
pm.addPass(createSparsificationAndBufferizationPass(
getBufferizationOptionsForSparsification(
options.testBufferizationAnalysisOnly),
options.sparsificationOptions(), options.createSparseDeallocs,
options.enableRuntimeLibrary, options.enableBufferInitialization,
options.vectorLength,
/*enableVLAVectorization=*/options.armSVE,
/*enableSIMDIndex32=*/options.force32BitVectorIndices));
/*enableSIMDIndex32=*/options.force32BitVectorIndices,
options.enableGPULibgen));

// Bail-early for test setup.
if (options.testBufferizationAnalysisOnly)
return;

// Storage specifier lowering and bufferization wrap-up.
pm.addPass(createStorageSpecifierToLLVMPass());
pm.addNestedPass<func::FuncOp>(createCanonicalizerPass());
pm.addNestedPass<func::FuncOp>(
Expand Down Expand Up @@ -72,8 +79,10 @@ void mlir::sparse_tensor::buildSparseCompiler(
pm.addNestedPass<func::FuncOp>(createConvertMathToLLVMPass());
pm.addPass(createConvertMathToLibmPass());
pm.addPass(createConvertComplexToLibmPass());

// Repeat convert-vector-to-llvm.
pm.addPass(createConvertVectorToLLVMPass(options.lowerVectorToLLVMOptions()));

pm.addPass(createConvertComplexToLLVMPass());
pm.addPass(createConvertVectorToLLVMPass(options.lowerVectorToLLVMOptions()));
pm.addPass(createConvertFuncToLLVMPass());
Expand Down
23 changes: 13 additions & 10 deletions mlir/lib/Dialect/SparseTensor/Transforms/SparseTensorPasses.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -82,19 +82,15 @@ struct SparsificationPass
SparsificationPass(const SparsificationPass &pass) = default;
SparsificationPass(const SparsificationOptions &options) {
parallelization = options.parallelizationStrategy;
enableGPULibgen = options.enableGPULibgen;
enableRuntimeLibrary = options.enableRuntimeLibrary;
}

void runOnOperation() override {
auto *ctx = &getContext();
// Translate strategy flags to strategy options.
SparsificationOptions options(parallelization, enableGPULibgen,
enableRuntimeLibrary);
// Apply GPU libgen (if requested), sparsification, and cleanup rewriting.
SparsificationOptions options(parallelization, enableRuntimeLibrary);
// Apply sparsification and cleanup rewriting.
RewritePatternSet patterns(ctx);
if (enableGPULibgen)
populateSparseGPULibgenPatterns(patterns, enableRuntimeLibrary);
populateSparsificationPatterns(patterns, options);
scf::ForOp::getCanonicalizationPatterns(patterns, ctx);
(void)applyPatternsAndFoldGreedily(getOperation(), std::move(patterns));
Expand Down Expand Up @@ -323,12 +319,18 @@ struct SparseGPUCodegenPass
: public impl::SparseGPUCodegenBase<SparseGPUCodegenPass> {
SparseGPUCodegenPass() = default;
SparseGPUCodegenPass(const SparseGPUCodegenPass &pass) = default;
SparseGPUCodegenPass(unsigned nT) { numThreads = nT; }
SparseGPUCodegenPass(unsigned nT, bool enableRT) {
numThreads = nT;
enableRuntimeLibrary = enableRT;
}

void runOnOperation() override {
auto *ctx = &getContext();
RewritePatternSet patterns(ctx);
populateSparseGPUCodegenPatterns(patterns, numThreads);
if (numThreads == 0)
populateSparseGPULibgenPatterns(patterns, enableRuntimeLibrary);
else
populateSparseGPUCodegenPatterns(patterns, numThreads);
(void)applyPatternsAndFoldGreedily(getOperation(), std::move(patterns));
}
};
Expand Down Expand Up @@ -457,8 +459,9 @@ std::unique_ptr<Pass> mlir::createSparseGPUCodegenPass() {
return std::make_unique<SparseGPUCodegenPass>();
}

std::unique_ptr<Pass> mlir::createSparseGPUCodegenPass(unsigned numThreads) {
return std::make_unique<SparseGPUCodegenPass>(numThreads);
std::unique_ptr<Pass> mlir::createSparseGPUCodegenPass(unsigned numThreads,
bool enableRT) {
return std::make_unique<SparseGPUCodegenPass>(numThreads, enableRT);
}

std::unique_ptr<Pass> mlir::createStorageSpecifierToLLVMPass() {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -65,15 +65,16 @@ class SparsificationAndBufferizationPass
const SparsificationOptions &sparsificationOptions,
bool createSparseDeallocs, bool enableRuntimeLibrary,
bool enableBufferInitialization, unsigned vectorLength,
bool enableVLAVectorization, bool enableSIMDIndex32)
bool enableVLAVectorization, bool enableSIMDIndex32, bool enableGPULibgen)
: bufferizationOptions(bufferizationOptions),
sparsificationOptions(sparsificationOptions),
createSparseDeallocs(createSparseDeallocs),
enableRuntimeLibrary(enableRuntimeLibrary),
enableBufferInitialization(enableBufferInitialization),
vectorLength(vectorLength),
enableVLAVectorization(enableVLAVectorization),
enableSIMDIndex32(enableSIMDIndex32) {}
enableSIMDIndex32(enableSIMDIndex32), enableGPULibgen(enableGPULibgen) {
}

/// Bufferize all dense ops. This assumes that no further analysis is needed
/// and that all required buffer copies were already inserted by
Expand Down Expand Up @@ -139,6 +140,8 @@ class SparsificationAndBufferizationPass
// of `bufferization.alloc_tensor` ops.
{
OpPassManager pm("builtin.module");
if (enableGPULibgen)
pm.addPass(createSparseGPUCodegenPass(0, enableRuntimeLibrary));
pm.addPass(createSparseReinterpretMapPass(ReinterpretMapScope::kAll));
pm.addPass(createSparsificationPass(sparsificationOptions));
pm.addNestedPass<func::FuncOp>(createStageSparseOperationsPass());
Expand Down Expand Up @@ -177,6 +180,7 @@ class SparsificationAndBufferizationPass
unsigned vectorLength;
bool enableVLAVectorization;
bool enableSIMDIndex32;
bool enableGPULibgen;
};

} // namespace sparse_tensor
Expand Down Expand Up @@ -210,18 +214,19 @@ std::unique_ptr<mlir::Pass> mlir::createSparsificationAndBufferizationPass() {
/*enableBufferInitialization=*/false,
/*vectorLength=*/0,
/*enableVLAVectorization=*/false,
/*enableSIMDIndex32=*/false);
/*enableSIMDIndex32=*/false,
/*enableGPULibgen=*/false);
}

std::unique_ptr<mlir::Pass> mlir::createSparsificationAndBufferizationPass(
const bufferization::OneShotBufferizationOptions &bufferizationOptions,
const SparsificationOptions &sparsificationOptions,
bool createSparseDeallocs, bool enableRuntimeLibrary,
bool enableBufferInitialization, unsigned vectorLength,
bool enableVLAVectorization, bool enableSIMDIndex32) {
bool enableVLAVectorization, bool enableSIMDIndex32, bool enableGPULibgen) {
return std::make_unique<
mlir::sparse_tensor::SparsificationAndBufferizationPass>(
bufferizationOptions, sparsificationOptions, createSparseDeallocs,
enableRuntimeLibrary, enableBufferInitialization, vectorLength,
enableVLAVectorization, enableSIMDIndex32);
enableVLAVectorization, enableSIMDIndex32, enableGPULibgen);
}
3 changes: 1 addition & 2 deletions mlir/test/Dialect/SparseTensor/GPU/gpu_matmul_lib.mlir
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
// RUN: mlir-opt %s --linalg-generalize-named-ops \
// RUN: --sparsification="enable-gpu-libgen" | FileCheck %s
// RUN: mlir-opt %s --linalg-generalize-named-ops --sparse-gpu-codegen="num-threads=0" | FileCheck %s

#CSR = #sparse_tensor.encoding<{ map = (d0, d1) -> (d0 : dense, d1 : compressed) }>

Expand Down
3 changes: 1 addition & 2 deletions mlir/test/Dialect/SparseTensor/GPU/gpu_matmul_lib_2to4.mlir
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
// RUN: mlir-opt %s --linalg-generalize-named-ops \
// RUN: --sparsification="enable-gpu-libgen" | FileCheck %s
// RUN: mlir-opt %s --linalg-generalize-named-ops --sparse-gpu-codegen="num-threads=0" | FileCheck %s

// CHECK-LABEL: func.func @matmul(
// CHECK-SAME: %[[VAL_0:.*0]]: tensor<?x?xf16>,
Expand Down
3 changes: 1 addition & 2 deletions mlir/test/Dialect/SparseTensor/GPU/gpu_matvec_lib.mlir
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
// RUN: mlir-opt %s --linalg-generalize-named-ops \
// RUN: --sparsification="enable-gpu-libgen" | FileCheck %s
// RUN: mlir-opt %s --linalg-generalize-named-ops --sparse-gpu-codegen="num-threads=0" | FileCheck %s

#SortedCOO = #sparse_tensor.encoding<{
map = (d0, d1) -> (d0 : compressed(nonunique), d1 : singleton)
Expand Down
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
// RUN: mlir-opt %s --sparsification="enable-gpu-libgen" | FileCheck %s
// RUN: mlir-opt %s --sparse-gpu-codegen="num-threads=0" | FileCheck %s

#trait_sampled_dense_dense = {
indexing_maps = [
Expand Down
2 changes: 1 addition & 1 deletion mlir/test/Dialect/SparseTensor/GPU/gpu_sddmm_lib.mlir
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
// RUN: mlir-opt %s --sparsification="enable-gpu-libgen" | FileCheck %s
// RUN: mlir-opt %s --sparse-gpu-codegen="num-threads=0" | FileCheck %s

#BSR = #sparse_tensor.encoding<{
map = (i, j) -> (
Expand Down
3 changes: 1 addition & 2 deletions mlir/test/Dialect/SparseTensor/GPU/gpu_spgemm_lib.mlir
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
// RUN: mlir-opt %s --linalg-generalize-named-ops \
// RUN: --sparsification="enable-gpu-libgen" | FileCheck %s
// RUN: mlir-opt %s --linalg-generalize-named-ops --sparse-gpu-codegen="num-threads=0" | FileCheck %s

#CSR = #sparse_tensor.encoding<{ map = (d0, d1) -> (d0 : dense, d1 : compressed) }>

Expand Down
Empty file.
Empty file.
Original file line number Diff line number Diff line change
Expand Up @@ -85,32 +85,30 @@ module {
// A kernel that computes a BSR sampled dense matrix matrix multiplication
// using a "spy" function and in-place update of the sampling sparse matrix.
//
// TODO: re-enable the following test.
//
// func.func @SDDMM_block(%args: tensor<?x?xf32, #BSR>,
// %arga: tensor<?x?xf32>,
// %argb: tensor<?x?xf32>) -> tensor<?x?xf32, #BSR> {
// %result = linalg.generic #trait_SDDMM
// ins(%arga, %argb: tensor<?x?xf32>, tensor<?x?xf32>)
// outs(%args: tensor<?x?xf32, #BSR>) {
// ^bb(%a: f32, %b: f32, %s: f32):
// %f0 = arith.constant 0.0 : f32
// %u = sparse_tensor.unary %s : f32 to f32
// present={
// ^bb0(%p: f32):
// %mul = arith.mulf %a, %b : f32
// sparse_tensor.yield %mul : f32
// }
// absent={}
// %r = sparse_tensor.reduce %s, %u, %f0 : f32 {
// ^bb0(%p: f32, %q: f32):
// %add = arith.addf %p, %q : f32
// sparse_tensor.yield %add : f32
// }
// linalg.yield %r : f32
// } -> tensor<?x?xf32, #BSR>
// return %result : tensor<?x?xf32, #BSR>
// }
// BSR variant of the sampled dense-dense matrix multiplication kernel.
// %args (BSR-encoded) is passed as the `outs` operand of the generic op, so
// it serves both as the sampling matrix and as the tensor being updated;
// %arga and %argb are the two dense inputs.
func.func @SDDMM_block(%args: tensor<?x?xf32, #BSR>,
%arga: tensor<?x?xf32>,
%argb: tensor<?x?xf32>) -> tensor<?x?xf32, #BSR> {
%result = linalg.generic #trait_SDDMM
ins(%arga, %argb: tensor<?x?xf32>, tensor<?x?xf32>)
outs(%args: tensor<?x?xf32, #BSR>) {
^bb(%a: f32, %b: f32, %s: f32):
// Zero constant used as the third operand of the reduction below.
%f0 = arith.constant 0.0 : f32
// The "spy": where %s is present, yield the product %a * %b; the absent
// region is left empty.
%u = sparse_tensor.unary %s : f32 to f32
present={
^bb0(%p: f32):
%mul = arith.mulf %a, %b : f32
sparse_tensor.yield %mul : f32
}
absent={}
// Custom reduction over %s and %u whose combining region is an addition.
%r = sparse_tensor.reduce %s, %u, %f0 : f32 {
^bb0(%p: f32, %q: f32):
%add = arith.addf %p, %q : f32
sparse_tensor.yield %add : f32
}
linalg.yield %r : f32
} -> tensor<?x?xf32, #BSR>
return %result : tensor<?x?xf32, #BSR>
}

func.func private @getTensorFilename(index) -> (!Filename)

Expand Down Expand Up @@ -153,15 +151,15 @@ module {
//
%fileName = call @getTensorFilename(%c0) : (index) -> (!Filename)
%m_csr = sparse_tensor.new %fileName : !Filename to tensor<?x?xf32, #CSR>
// %m_bsr = sparse_tensor.new %fileName : !Filename to tensor<?x?xf32, #BSR>
%m_bsr = sparse_tensor.new %fileName : !Filename to tensor<?x?xf32, #BSR>

// Call the kernel.
%0 = call @SDDMM(%m_csr, %a, %b)
: (tensor<?x?xf32, #CSR>,
tensor<?x?xf32>, tensor<?x?xf32>) -> tensor<?x?xf32, #CSR>
// %1 = call @SDDMM_block(%m_bsr, %a, %b)
// : (tensor<?x?xf32, #BSR>,
// tensor<?x?xf32>, tensor<?x?xf32>) -> tensor<?x?xf32, #BSR>
%1 = call @SDDMM_block(%m_bsr, %a, %b)
: (tensor<?x?xf32, #BSR>,
tensor<?x?xf32>, tensor<?x?xf32>) -> tensor<?x?xf32, #BSR>

//
// Print the result for verification. Note that the "spy" determines what
Expand All @@ -170,18 +168,18 @@ module {
// in the original zero positions).
//
// CHECK: ( 5, 10, 24, 19, 53, 42, 55, 56 )
// C_HECK-NEXT: ( 5, 10, 8, 19, 24, 24, 40, 53, 42, 55, 56, 64 )
// CHECK-NEXT: ( 5, 10, 8, 19, 24, 24, 40, 53, 42, 55, 56, 64 )
//
%v0 = sparse_tensor.values %0 : tensor<?x?xf32, #CSR> to memref<?xf32>
%vv0 = vector.transfer_read %v0[%c0], %d0 : memref<?xf32>, vector<8xf32>
vector.print %vv0 : vector<8xf32>
// %v1 = sparse_tensor.values %1 : tensor<?x?xf32, #BSR> to memref<?xf32>
// %vv1 = vector.transfer_read %v1[%c0], %d0 : memref<?xf32>, vector<12xf32>
// vector.print %vv1 : vector<12xf32>
%v1 = sparse_tensor.values %1 : tensor<?x?xf32, #BSR> to memref<?xf32>
%vv1 = vector.transfer_read %v1[%c0], %d0 : memref<?xf32>, vector<12xf32>
vector.print %vv1 : vector<12xf32>

// Release the resources.
bufferization.dealloc_tensor %0 : tensor<?x?xf32, #CSR>
// bufferization.dealloc_tensor %1 : tensor<?x?xf32, #BSR>
bufferization.dealloc_tensor %1 : tensor<?x?xf32, #BSR>

llvm.call @mgpuDestroySparseEnv() : () -> ()
return
Expand Down

0 comments on commit 5f32bcf

Please sign in to comment.