[mlir][sparse][gpu] re-enable all GPU libgen tests (#72185)

The previous change no longer properly used the GPU libgen pass (even though most tests still passed by falling back to CPU). This revision puts the proper pass order into place. It also cleans up the CPU codegen vs. libgen setup a bit.
llvm · Nov 14, 2023 · 5f32bcf · 5f32bcf
1 parent 57dd23b
commit 5f32bcf
Show file tree

Hide file tree

Showing 15 changed files with 85 additions and 78 deletions.
diff --git a/mlir/include/mlir/Dialect/SparseTensor/Pipelines/Passes.h b/mlir/include/mlir/Dialect/SparseTensor/Pipelines/Passes.h
@@ -144,8 +144,7 @@ struct SparseCompilerOptions
 
   /// Projects out the options for `createSparsificationPass`.
   SparsificationOptions sparsificationOptions() const {
-    return SparsificationOptions(parallelization, enableGPULibgen,
-                                 enableRuntimeLibrary);
+    return SparsificationOptions(parallelization, enableRuntimeLibrary);
   }
 
   /// Projects out the options for `createConvertVectorToLLVMPass`.

diff --git a/mlir/include/mlir/Dialect/SparseTensor/Transforms/Passes.h b/mlir/include/mlir/Dialect/SparseTensor/Transforms/Passes.h
@@ -74,15 +74,11 @@ std::unique_ptr<Pass> createPreSparsificationRewritePass();
 
 /// Options for the Sparsification pass.
 struct SparsificationOptions {
-  SparsificationOptions(SparseParallelizationStrategy p, bool gpuLibgen,
-                        bool enableRT)
-      : parallelizationStrategy(p), enableGPULibgen(gpuLibgen),
-        enableRuntimeLibrary(enableRT) {}
+  SparsificationOptions(SparseParallelizationStrategy p, bool enableRT)
+      : parallelizationStrategy(p), enableRuntimeLibrary(enableRT) {}
   SparsificationOptions()
-      : SparsificationOptions(SparseParallelizationStrategy::kNone, false,
-                              true) {}
+      : SparsificationOptions(SparseParallelizationStrategy::kNone, true) {}
   SparseParallelizationStrategy parallelizationStrategy;
-  bool enableGPULibgen;
   bool enableRuntimeLibrary;
 };
 
@@ -196,7 +192,8 @@ void populateSparseGPULibgenPatterns(RewritePatternSet &patterns,
                                      bool enableRT);
 
 std::unique_ptr<Pass> createSparseGPUCodegenPass();
-std::unique_ptr<Pass> createSparseGPUCodegenPass(unsigned numThreads);
+std::unique_ptr<Pass> createSparseGPUCodegenPass(unsigned numThreads,
+                                                 bool enableRT);
 
 //===----------------------------------------------------------------------===//
 // The SparseStorageSpecifierToLLVM pass.
@@ -225,7 +222,7 @@ std::unique_ptr<Pass> createSparsificationAndBufferizationPass(
     const SparsificationOptions &sparsificationOptions,
     bool createSparseDeallocs, bool enableRuntimeLibrary,
     bool enableBufferInitialization, unsigned vectorLength,
-    bool enableVLAVectorization, bool enableSIMDIndex32);
+    bool enableVLAVectorization, bool enableSIMDIndex32, bool enableGPULibgen);
 
 //===----------------------------------------------------------------------===//
 // Registration.

diff --git a/mlir/include/mlir/Dialect/SparseTensor/Transforms/Passes.td b/mlir/include/mlir/Dialect/SparseTensor/Transforms/Passes.td
@@ -105,7 +105,6 @@ def SparsificationPass : Pass<"sparsification", "ModuleOp"> {
     "affine::AffineDialect",
     "arith::ArithDialect",
     "bufferization::BufferizationDialect",
-    "gpu::GPUDialect",
     "LLVM::LLVMDialect",
     "linalg::LinalgDialect",
     "memref::MemRefDialect",
@@ -131,9 +130,6 @@ def SparsificationPass : Pass<"sparsification", "ModuleOp"> {
              clEnumValN(mlir::SparseParallelizationStrategy::kAnyStorageAnyLoop,
                         "any-storage-any-loop",
                         "Enable sparse parallelization for any storage and loop."))}]>,
-    Option<"enableGPULibgen", "enable-gpu-libgen", "bool",
-           "false",
-           "Enable GPU acceleration by means of direct library calls (like cuSPARSE)">,
     Option<"enableRuntimeLibrary", "enable-runtime-library", "bool",
            "true", "Enable runtime library for manipulating sparse tensors">,
   ];
@@ -368,7 +364,9 @@ def SparseVectorization : Pass<"sparse-vectorization", "ModuleOp"> {
 def SparseGPUCodegen : Pass<"sparse-gpu-codegen", "ModuleOp"> {
   let summary = "Generates GPU code during sparsification";
   let description = [{
-    Enables the sparsifier to use GPU acceleration.
+    Enables the sparsifier to use GPU acceleration. When the number of GPU
+    threads is set to zero, the pass tries to enable GPU acceleration by
+    means of direct library calls (like cuSPARSE).
   }];
   let constructor = "mlir::createSparseGPUCodegenPass()";
   let dependentDialects = [
@@ -381,7 +379,9 @@ def SparseGPUCodegen : Pass<"sparse-gpu-codegen", "ModuleOp"> {
     "sparse_tensor::SparseTensorDialect",
   ];
   let options = [
-    Option<"numThreads", "num_threads", "int32_t", "1024", "Sets the number of GPU threads">,
+    Option<"numThreads", "num-threads", "int32_t", "1024", "Sets the number of GPU threads">,
+    Option<"enableRuntimeLibrary", "enable-runtime-library", "bool",
+           "true", "Enable runtime library for manipulating sparse tensors">,
   ];
 }
 

diff --git a/mlir/lib/Dialect/SparseTensor/Pipelines/SparseTensorPipelines.cpp b/mlir/lib/Dialect/SparseTensor/Pipelines/SparseTensorPipelines.cpp
@@ -31,18 +31,25 @@
 
 void mlir::sparse_tensor::buildSparseCompiler(
     OpPassManager &pm, const SparseCompilerOptions &options) {
+  // Rewrite named linalg ops into generic ops.
   pm.addNestedPass<func::FuncOp>(createLinalgGeneralizationPass());
+
+  // Sparsification and bufferization mini-pipeline.
   pm.addPass(createSparsificationAndBufferizationPass(
       getBufferizationOptionsForSparsification(
           options.testBufferizationAnalysisOnly),
       options.sparsificationOptions(), options.createSparseDeallocs,
       options.enableRuntimeLibrary, options.enableBufferInitialization,
       options.vectorLength,
       /*enableVLAVectorization=*/options.armSVE,
-      /*enableSIMDIndex32=*/options.force32BitVectorIndices));
+      /*enableSIMDIndex32=*/options.force32BitVectorIndices,
+      options.enableGPULibgen));
+
+  // Bail-early for test setup.
   if (options.testBufferizationAnalysisOnly)
     return;
 
+  // Storage specifier lowering and bufferization wrap-up.
   pm.addPass(createStorageSpecifierToLLVMPass());
   pm.addNestedPass<func::FuncOp>(createCanonicalizerPass());
   pm.addNestedPass<func::FuncOp>(
@@ -72,8 +79,10 @@ void mlir::sparse_tensor::buildSparseCompiler(
   pm.addNestedPass<func::FuncOp>(createConvertMathToLLVMPass());
   pm.addPass(createConvertMathToLibmPass());
   pm.addPass(createConvertComplexToLibmPass());
+
   // Repeat convert-vector-to-llvm.
   pm.addPass(createConvertVectorToLLVMPass(options.lowerVectorToLLVMOptions()));
+
   pm.addPass(createConvertComplexToLLVMPass());
   pm.addPass(createConvertVectorToLLVMPass(options.lowerVectorToLLVMOptions()));
   pm.addPass(createConvertFuncToLLVMPass());

diff --git a/mlir/lib/Dialect/SparseTensor/Transforms/SparseTensorPasses.cpp b/mlir/lib/Dialect/SparseTensor/Transforms/SparseTensorPasses.cpp
@@ -82,19 +82,15 @@ struct SparsificationPass
   SparsificationPass(const SparsificationPass &pass) = default;
   SparsificationPass(const SparsificationOptions &options) {
     parallelization = options.parallelizationStrategy;
-    enableGPULibgen = options.enableGPULibgen;
     enableRuntimeLibrary = options.enableRuntimeLibrary;
   }
 
   void runOnOperation() override {
     auto *ctx = &getContext();
     // Translate strategy flags to strategy options.
-    SparsificationOptions options(parallelization, enableGPULibgen,
-                                  enableRuntimeLibrary);
-    // Apply GPU libgen (if requested), sparsification, and cleanup rewriting.
+    SparsificationOptions options(parallelization, enableRuntimeLibrary);
+    // Apply sparsification and cleanup rewriting.
     RewritePatternSet patterns(ctx);
-    if (enableGPULibgen)
-      populateSparseGPULibgenPatterns(patterns, enableRuntimeLibrary);
     populateSparsificationPatterns(patterns, options);
     scf::ForOp::getCanonicalizationPatterns(patterns, ctx);
     (void)applyPatternsAndFoldGreedily(getOperation(), std::move(patterns));
@@ -323,12 +319,18 @@ struct SparseGPUCodegenPass
     : public impl::SparseGPUCodegenBase<SparseGPUCodegenPass> {
   SparseGPUCodegenPass() = default;
   SparseGPUCodegenPass(const SparseGPUCodegenPass &pass) = default;
-  SparseGPUCodegenPass(unsigned nT) { numThreads = nT; }
+  SparseGPUCodegenPass(unsigned nT, bool enableRT) {
+    numThreads = nT;
+    enableRuntimeLibrary = enableRT;
+  }
 
   void runOnOperation() override {
     auto *ctx = &getContext();
     RewritePatternSet patterns(ctx);
-    populateSparseGPUCodegenPatterns(patterns, numThreads);
+    if (numThreads == 0)
+      populateSparseGPULibgenPatterns(patterns, enableRuntimeLibrary);
+    else
+      populateSparseGPUCodegenPatterns(patterns, numThreads);
     (void)applyPatternsAndFoldGreedily(getOperation(), std::move(patterns));
   }
 };
@@ -457,8 +459,9 @@ std::unique_ptr<Pass> mlir::createSparseGPUCodegenPass() {
   return std::make_unique<SparseGPUCodegenPass>();
 }
 
-std::unique_ptr<Pass> mlir::createSparseGPUCodegenPass(unsigned numThreads) {
-  return std::make_unique<SparseGPUCodegenPass>(numThreads);
+std::unique_ptr<Pass> mlir::createSparseGPUCodegenPass(unsigned numThreads,
+                                                       bool enableRT) {
+  return std::make_unique<SparseGPUCodegenPass>(numThreads, enableRT);
 }
 
 std::unique_ptr<Pass> mlir::createStorageSpecifierToLLVMPass() {

diff --git a/mlir/lib/Dialect/SparseTensor/Transforms/SparsificationAndBufferizationPass.cpp b/mlir/lib/Dialect/SparseTensor/Transforms/SparsificationAndBufferizationPass.cpp
@@ -65,15 +65,16 @@ class SparsificationAndBufferizationPass
       const SparsificationOptions &sparsificationOptions,
       bool createSparseDeallocs, bool enableRuntimeLibrary,
       bool enableBufferInitialization, unsigned vectorLength,
-      bool enableVLAVectorization, bool enableSIMDIndex32)
+      bool enableVLAVectorization, bool enableSIMDIndex32, bool enableGPULibgen)
       : bufferizationOptions(bufferizationOptions),
         sparsificationOptions(sparsificationOptions),
         createSparseDeallocs(createSparseDeallocs),
         enableRuntimeLibrary(enableRuntimeLibrary),
         enableBufferInitialization(enableBufferInitialization),
         vectorLength(vectorLength),
         enableVLAVectorization(enableVLAVectorization),
-        enableSIMDIndex32(enableSIMDIndex32) {}
+        enableSIMDIndex32(enableSIMDIndex32), enableGPULibgen(enableGPULibgen) {
+  }
 
   /// Bufferize all dense ops. This assumes that no further analysis is needed
   /// and that all required buffer copies were already inserted by
@@ -139,6 +140,8 @@ class SparsificationAndBufferizationPass
     // of `bufferization.alloc_tensor` ops.
     {
       OpPassManager pm("builtin.module");
+      if (enableGPULibgen)
+        pm.addPass(createSparseGPUCodegenPass(0, enableRuntimeLibrary));
       pm.addPass(createSparseReinterpretMapPass(ReinterpretMapScope::kAll));
       pm.addPass(createSparsificationPass(sparsificationOptions));
       pm.addNestedPass<func::FuncOp>(createStageSparseOperationsPass());
@@ -177,6 +180,7 @@ class SparsificationAndBufferizationPass
   unsigned vectorLength;
   bool enableVLAVectorization;
   bool enableSIMDIndex32;
+  bool enableGPULibgen;
 };
 
 } // namespace sparse_tensor
@@ -210,18 +214,19 @@ std::unique_ptr<mlir::Pass> mlir::createSparsificationAndBufferizationPass() {
       /*enableBufferInitialization=*/false,
       /*vectorLength=*/0,
       /*enableVLAVectorization=*/false,
-      /*enableSIMDIndex32=*/false);
+      /*enableSIMDIndex32=*/false,
+      /*enableGPULibgen=*/false);
 }
 
 std::unique_ptr<mlir::Pass> mlir::createSparsificationAndBufferizationPass(
     const bufferization::OneShotBufferizationOptions &bufferizationOptions,
     const SparsificationOptions &sparsificationOptions,
     bool createSparseDeallocs, bool enableRuntimeLibrary,
     bool enableBufferInitialization, unsigned vectorLength,
-    bool enableVLAVectorization, bool enableSIMDIndex32) {
+    bool enableVLAVectorization, bool enableSIMDIndex32, bool enableGPULibgen) {
   return std::make_unique<
       mlir::sparse_tensor::SparsificationAndBufferizationPass>(
       bufferizationOptions, sparsificationOptions, createSparseDeallocs,
       enableRuntimeLibrary, enableBufferInitialization, vectorLength,
-      enableVLAVectorization, enableSIMDIndex32);
+      enableVLAVectorization, enableSIMDIndex32, enableGPULibgen);
 }
diff --git a/mlir/test/Dialect/SparseTensor/GPU/gpu_matmul_lib.mlir b/mlir/test/Dialect/SparseTensor/GPU/gpu_matmul_lib.mlir
@@ -1,5 +1,4 @@
-// RUN: mlir-opt %s --linalg-generalize-named-ops \
-// RUN:             --sparsification="enable-gpu-libgen" | FileCheck %s
+// RUN: mlir-opt %s --linalg-generalize-named-ops --sparse-gpu-codegen="num-threads=0" | FileCheck %s
 
 #CSR = #sparse_tensor.encoding<{ map = (d0, d1) -> (d0 : dense, d1 : compressed) }>
 

diff --git a/mlir/test/Dialect/SparseTensor/GPU/gpu_matmul_lib_2to4.mlir b/mlir/test/Dialect/SparseTensor/GPU/gpu_matmul_lib_2to4.mlir
@@ -1,5 +1,4 @@
-// RUN: mlir-opt %s --linalg-generalize-named-ops \
-// RUN:             --sparsification="enable-gpu-libgen" | FileCheck %s
+// RUN: mlir-opt %s --linalg-generalize-named-ops --sparse-gpu-codegen="num-threads=0" | FileCheck %s
 
 // CHECK-LABEL:   func.func @matmul(
 // CHECK-SAME:      %[[VAL_0:.*0]]: tensor<?x?xf16>,

diff --git a/mlir/test/Dialect/SparseTensor/GPU/gpu_matvec_lib.mlir b/mlir/test/Dialect/SparseTensor/GPU/gpu_matvec_lib.mlir
@@ -1,5 +1,4 @@
-// RUN: mlir-opt %s --linalg-generalize-named-ops \
-// RUN:             --sparsification="enable-gpu-libgen" | FileCheck %s
+// RUN: mlir-opt %s --linalg-generalize-named-ops --sparse-gpu-codegen="num-threads=0" | FileCheck %s
 
 #SortedCOO = #sparse_tensor.encoding<{
   map = (d0, d1) -> (d0 : compressed(nonunique), d1 : singleton)

diff --git a/mlir/test/Dialect/SparseTensor/GPU/gpu_sampled_matmul_lib.mlir b/mlir/test/Dialect/SparseTensor/GPU/gpu_sampled_matmul_lib.mlir
@@ -1,4 +1,4 @@
-// RUN: mlir-opt %s --sparsification="enable-gpu-libgen" | FileCheck %s
+// RUN: mlir-opt %s --sparse-gpu-codegen="num-threads=0" | FileCheck %s
 
 #trait_sampled_dense_dense = {
   indexing_maps = [

diff --git a/mlir/test/Dialect/SparseTensor/GPU/gpu_sddmm_lib.mlir b/mlir/test/Dialect/SparseTensor/GPU/gpu_sddmm_lib.mlir
@@ -1,4 +1,4 @@
-// RUN: mlir-opt %s --sparsification="enable-gpu-libgen" | FileCheck %s
+// RUN: mlir-opt %s --sparse-gpu-codegen="num-threads=0" | FileCheck %s
 
 #BSR = #sparse_tensor.encoding<{
   map = (i, j) -> (

diff --git a/mlir/test/Dialect/SparseTensor/GPU/gpu_spgemm_lib.mlir b/mlir/test/Dialect/SparseTensor/GPU/gpu_spgemm_lib.mlir
@@ -1,5 +1,4 @@
-// RUN: mlir-opt %s --linalg-generalize-named-ops \
-// RUN:             --sparsification="enable-gpu-libgen" | FileCheck %s
+// RUN: mlir-opt %s --linalg-generalize-named-ops --sparse-gpu-codegen="num-threads=0" | FileCheck %s
 
 #CSR = #sparse_tensor.encoding<{ map = (d0, d1) -> (d0 : dense, d1 : compressed) }>
 

diff --git a/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sparse-matmul-lib.mlir b/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sparse-matmul-lib.mlir
diff --git a/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sparse-matvec-lib.mlir b/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sparse-matvec-lib.mlir
diff --git a/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sparse-sddmm-lib.mlir b/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sparse-sddmm-lib.mlir
@@ -85,32 +85,30 @@ module {
   // A kernel that computes a BSR sampled dense matrix matrix multiplication
   // using a "spy" function and in-place update of the sampling sparse matrix.
   //
-  // TODO: re-enable the following test.
-  //
-  // func.func @SDDMM_block(%args: tensor<?x?xf32, #BSR>,
-  //                        %arga: tensor<?x?xf32>,
-  //                        %argb: tensor<?x?xf32>) -> tensor<?x?xf32, #BSR> {
-  //   %result = linalg.generic #trait_SDDMM
-  //     ins(%arga, %argb: tensor<?x?xf32>, tensor<?x?xf32>)
-  //     outs(%args: tensor<?x?xf32, #BSR>) {
-  //       ^bb(%a: f32, %b: f32, %s: f32):
-  //          %f0 = arith.constant 0.0 : f32
-  //          %u = sparse_tensor.unary %s : f32 to f32
-  //            present={
-  //               ^bb0(%p: f32):
-  //                 %mul = arith.mulf %a, %b : f32
-  //                 sparse_tensor.yield %mul : f32
-  //            }
-  //            absent={}
-  //          %r = sparse_tensor.reduce %s, %u, %f0 : f32 {
-  //             ^bb0(%p: f32, %q: f32):
-  //               %add = arith.addf %p, %q : f32
-  //               sparse_tensor.yield %add : f32
-  //           }
-  //          linalg.yield %r : f32
-  //     } -> tensor<?x?xf32, #BSR>
-  //   return %result : tensor<?x?xf32, #BSR>
-  // }
+  func.func @SDDMM_block(%args: tensor<?x?xf32, #BSR>,
+                         %arga: tensor<?x?xf32>,
+                         %argb: tensor<?x?xf32>) -> tensor<?x?xf32, #BSR> {
+    %result = linalg.generic #trait_SDDMM
+      ins(%arga, %argb: tensor<?x?xf32>, tensor<?x?xf32>)
+      outs(%args: tensor<?x?xf32, #BSR>) {
+        ^bb(%a: f32, %b: f32, %s: f32):
+           %f0 = arith.constant 0.0 : f32
+           %u = sparse_tensor.unary %s : f32 to f32
+             present={
+                ^bb0(%p: f32):
+                  %mul = arith.mulf %a, %b : f32
+                  sparse_tensor.yield %mul : f32
+             }
+             absent={}
+           %r = sparse_tensor.reduce %s, %u, %f0 : f32 {
+              ^bb0(%p: f32, %q: f32):
+                %add = arith.addf %p, %q : f32
+                sparse_tensor.yield %add : f32
+            }
+           linalg.yield %r : f32
+      } -> tensor<?x?xf32, #BSR>
+    return %result : tensor<?x?xf32, #BSR>
+  }
 
   func.func private @getTensorFilename(index) -> (!Filename)
 
@@ -153,15 +151,15 @@ module {
     //
     %fileName = call @getTensorFilename(%c0) : (index) -> (!Filename)
     %m_csr = sparse_tensor.new %fileName : !Filename to tensor<?x?xf32, #CSR>
-    // %m_bsr = sparse_tensor.new %fileName : !Filename to tensor<?x?xf32, #BSR>
+    %m_bsr = sparse_tensor.new %fileName : !Filename to tensor<?x?xf32, #BSR>
 
     // Call the kernel.
     %0 = call @SDDMM(%m_csr, %a, %b)
        : (tensor<?x?xf32, #CSR>,
           tensor<?x?xf32>, tensor<?x?xf32>) -> tensor<?x?xf32, #CSR>
-    // %1 = call @SDDMM_block(%m_bsr, %a, %b)
-    //    : (tensor<?x?xf32, #BSR>,
-    //       tensor<?x?xf32>, tensor<?x?xf32>) -> tensor<?x?xf32, #BSR>
+    %1 = call @SDDMM_block(%m_bsr, %a, %b)
+       : (tensor<?x?xf32, #BSR>,
+          tensor<?x?xf32>, tensor<?x?xf32>) -> tensor<?x?xf32, #BSR>
 
     //
     // Print the result for verification. Note that the "spy" determines what
@@ -170,18 +168,18 @@ module {
     // in the original zero positions).
     //
     // CHECK:      ( 5, 10, 24, 19, 53, 42, 55, 56 )
-    // C_HECK-NEXT: ( 5, 10, 8, 19, 24, 24, 40, 53, 42, 55, 56, 64 )
+    // CHECK-NEXT: ( 5, 10, 8, 19, 24, 24, 40, 53, 42, 55, 56, 64 )
     //
     %v0 = sparse_tensor.values %0 : tensor<?x?xf32, #CSR> to memref<?xf32>
     %vv0 = vector.transfer_read %v0[%c0], %d0 : memref<?xf32>, vector<8xf32>
     vector.print %vv0 : vector<8xf32>
-    // %v1 = sparse_tensor.values %1 : tensor<?x?xf32, #BSR> to memref<?xf32>
-    // %vv1 = vector.transfer_read %v1[%c0], %d0 : memref<?xf32>, vector<12xf32>
-    // vector.print %vv1 : vector<12xf32>
+    %v1 = sparse_tensor.values %1 : tensor<?x?xf32, #BSR> to memref<?xf32>
+    %vv1 = vector.transfer_read %v1[%c0], %d0 : memref<?xf32>, vector<12xf32>
+    vector.print %vv1 : vector<12xf32>
 
     // Release the resources.
     bufferization.dealloc_tensor %0 : tensor<?x?xf32, #CSR>
-    // bufferization.dealloc_tensor %1 : tensor<?x?xf32, #BSR>
+    bufferization.dealloc_tensor %1 : tensor<?x?xf32, #BSR>
 
     llvm.call @mgpuDestroySparseEnv() : () -> ()
     return