Skip to content

Commit

Permalink
[SYCL][Fusion] Test kernel fusion and optimization (intel/llvm-test-suite#1535)
Browse files Browse the repository at this point in the history

Test different scenarios for kernel fusion, including creation of the fused kernel by the JIT compiler and performance optimizations such as dataflow internalization.

Automatically detect availability of the kernel fusion extension in the DPC++ build in `lit.cfg.py` and make it available for `REQUIRES` clauses.

Spec: intel#7098
Implementation: intel#7831

Signed-off-by: Lukas Sommer <lukas.sommer@codeplay.com>
  • Loading branch information
sommerlukas authored Jan 27, 2023
1 parent b6653f5 commit 60ab99e
Show file tree
Hide file tree
Showing 29 changed files with 2,140 additions and 8 deletions.
105 changes: 105 additions & 0 deletions SYCL/KernelFusion/abort_fusion.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,105 @@
// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out
// RUN: env SYCL_RT_WARNING_LEVEL=1 %CPU_RUN_PLACEHOLDER %t.out 2>&1\
// RUN: %CPU_CHECK_PLACEHOLDER
// RUN: env SYCL_RT_WARNING_LEVEL=1 %GPU_RUN_PLACEHOLDER %t.out 2>&1\
// RUN: %GPU_CHECK_PLACEHOLDER
// UNSUPPORTED: cuda || hip
// REQUIRES: fusion

// Test fusion being aborted: Different scenarios causing the JIT compiler
// to abort fusion due to constraint violations for fusion. Also check that
// warnings are printed when SYCL_RT_WARNING_LEVEL=1.

#include <sycl/sycl.hpp>

using namespace sycl;

// Number of elements in each data buffer used by this test.
constexpr size_t dataSize = 512;

// NOTE(review): unused in this file — apparently boilerplate shared with the
// internalization tests; harmless to keep, but could be removed.
enum class Internalization { None, Local, Private };

// Submit two kernels as a fusion candidate — the first with the
// caller-provided ND-range, the second with a fixed 1D ND-range — and report
// via stdout whether the expected result was produced. The verification works
// regardless of whether the JIT actually fused the kernels or aborted fusion
// and executed them separately.
template <typename Kernel1Name, typename Kernel2Name, int Kernel1Dim>
void performFusion(queue &q, range<Kernel1Dim> k1Global,
                   range<Kernel1Dim> k1Local) {
  int in[dataSize], tmp[dataSize], out[dataSize];

  // Initialize inputs; tmp/out start at a sentinel value.
  for (size_t idx = 0; idx < dataSize; ++idx) {
    in[idx] = idx;
    tmp[idx] = -1;
    out[idx] = -1;
  }

  // Buffer scope: destruction at the closing brace writes results back to the
  // host arrays before they are checked below.
  {
    buffer<int> bIn{in, range{dataSize}};
    buffer<int> bTmp{tmp, range{dataSize}};
    buffer<int> bOut{out, range{dataSize}};

    ext::codeplay::experimental::fusion_wrapper fw(q);
    fw.start_fusion();

    assert(fw.is_in_fusion_mode() && "Queue should be in fusion mode");

    // First kernel: tmp = in + 5, launched with the caller's ND-range.
    q.submit([&](handler &cgh) {
      auto inAcc = bIn.get_access(cgh);
      auto tmpAcc = bTmp.get_access(cgh);
      cgh.parallel_for<Kernel1Name>(nd_range<Kernel1Dim>{k1Global, k1Local},
                                    [=](item<Kernel1Dim> it) {
                                      auto linearId = it.get_linear_id();
                                      tmpAcc[linearId] = inAcc[linearId] + 5;
                                    });
    });

    // Second kernel: out = tmp * 2, always 1D with local size 8.
    q.submit([&](handler &cgh) {
      auto tmpAcc = bTmp.get_access(cgh);
      auto outAcc = bOut.get_access(cgh);
      cgh.parallel_for<Kernel2Name>(nd_range<1>{{dataSize}, {8}},
                                    [=](id<1> gid) {
                                      outAcc[gid] = tmpAcc[gid] * 2;
                                    });
    });

    fw.complete_fusion({ext::codeplay::experimental::property::no_barriers{}});

    assert(!fw.is_in_fusion_mode() &&
           "Queue should not be in fusion mode anymore");
  }

  // Check only the elements actually covered by the first kernel's global
  // range (which may be smaller than dataSize).
  size_t numErrors = 0;
  for (size_t idx = 0; idx < k1Global.size(); ++idx) {
    if (out[idx] != ((idx + 5) * 2)) {
      ++numErrors;
    }
  }
  std::cout << (numErrors ? "COMPUTATION ERROR\n" : "COMPUTATION OK\n");
}

int main() {

  // Kernel fusion must be explicitly requested on the queue via this property.
  queue q{ext::codeplay::experimental::property::queue::enable_fusion{}};

  // Scenario: Fusing two kernels with different dimensionality should lead to
  // fusion being aborted.
  performFusion<class Kernel1_1, class Kernel2_1>(q, range<2>{32, 16},
                                                  range<2>{1, 8});
  // CHECK: WARNING: Cannot fuse kernels with different dimensionality
  // CHECK-NEXT: COMPUTATION OK

  // Scenario: Fusing two kernels with different global size should lead to
  // fusion being aborted.
  performFusion<class Kernel1_2, class Kernel2_2>(q, range<1>{256},
                                                  range<1>{8});
  // NOTE(review): "kerneles" in the next line looks like a typo, but FileCheck
  // patterns must match the runtime's warning text verbatim — confirm against
  // the JIT compiler's actual message before changing it.
  // CHECK-NEXT: WARNING: Cannot fuse kerneles with different global size
  // CHECK-NEXT: COMPUTATION OK

  // Scenario: Fusing two kernels with different local size should lead to
  // fusion being aborted.
  performFusion<class Kernel1_3, class Kernel2_3>(q, range<1>{dataSize},
                                                  range<1>{16});
  // CHECK-NEXT: WARNING: Cannot fuse kernels with different local size
  // CHECK-NEXT: COMPUTATION OK

  return 0;
}
174 changes: 174 additions & 0 deletions SYCL/KernelFusion/abort_internalization.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,174 @@
// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out
// RUN: env SYCL_ENABLE_FUSION_CACHING=0 SYCL_RT_WARNING_LEVEL=1 %CPU_RUN_PLACEHOLDER %t.out 2>&1\
// RUN: %CPU_CHECK_PLACEHOLDER
// RUN: env SYCL_ENABLE_FUSION_CACHING=0 SYCL_RT_WARNING_LEVEL=1 %GPU_RUN_PLACEHOLDER %t.out 2>&1\
// RUN: %GPU_CHECK_PLACEHOLDER
// UNSUPPORTED: cuda || hip
// REQUIRES: fusion

// Test incomplete internalization: Different scenarios causing the JIT compiler
// to abort internalization due to target or parameter mismatch. Also check that
// warnings are printed when SYCL_RT_WARNING_LEVEL=1.

#include <sycl/sycl.hpp>

using namespace sycl;

// Number of elements in each data buffer used by every scenario below.
constexpr size_t dataSize = 512;

// Which dataflow-internalization property to attach to a kernel's accessor
// for the intermediate buffer.
enum class Internalization { None, Local, Private };

// Submit two kernels communicating through a temporary buffer, requesting the
// given internalization kind and work-group size for each kernel, then report
// the outcome via stdout. A local size of 0 launches that kernel with a plain
// range (no work-group size), which prevents local promotion.
void performFusion(queue &q, Internalization intKernel1,
                   size_t localSizeKernel1, Internalization intKernel2,
                   size_t localSizeKernel2,
                   bool expectInternalization = false) {
  int in[dataSize], tmp[dataSize], out[dataSize];
  for (size_t idx = 0; idx < dataSize; ++idx) {
    in[idx] = idx;
    tmp[idx] = -1; // Sentinel: survives only if the dataflow is internalized.
    out[idx] = -1;
  }

  // Map an internalization request to the corresponding accessor properties.
  auto promotionProps = [](Internalization kind) -> property_list {
    if (kind == Internalization::Private) {
      return {sycl::ext::codeplay::experimental::property::promote_private{}};
    }
    if (kind == Internalization::Local) {
      return {sycl::ext::codeplay::experimental::property::promote_local{}};
    }
    return {};
  };

  // Buffer scope: destruction writes results back to the host arrays.
  {
    buffer<int> bIn{in, range{dataSize}};
    buffer<int> bTmp{tmp, range{dataSize}};
    buffer<int> bOut{out, range{dataSize}};

    ext::codeplay::experimental::fusion_wrapper fw{q};
    fw.start_fusion();

    assert(fw.is_in_fusion_mode() && "Queue should be in fusion mode");

    // First kernel: tmp = in + 5.
    q.submit([&](handler &cgh) {
      auto inAcc = bIn.get_access(cgh);
      accessor<int> tmpAcc = bTmp.get_access(cgh, promotionProps(intKernel1));

      if (localSizeKernel1 > 0) {
        cgh.parallel_for<class Kernel1>(
            nd_range<1>{{dataSize}, {localSizeKernel1}},
            [=](id<1> i) { tmpAcc[i] = inAcc[i] + 5; });
      } else {
        cgh.parallel_for<class KernelOne>(
            dataSize, [=](id<1> i) { tmpAcc[i] = inAcc[i] + 5; });
      }
    });

    // Second kernel: out = tmp * 2.
    q.submit([&](handler &cgh) {
      accessor<int> tmpAcc = bTmp.get_access(cgh, promotionProps(intKernel2));
      auto outAcc = bOut.get_access(cgh);
      if (localSizeKernel2 > 0) {
        cgh.parallel_for<class Kernel2>(
            nd_range<1>{{dataSize}, {localSizeKernel2}},
            [=](id<1> i) { outAcc[i] = tmpAcc[i] * 2; });
      } else {
        cgh.parallel_for<class KernelTwo>(
            dataSize, [=](id<1> i) { outAcc[i] = tmpAcc[i] * 2; });
      }
    });

    fw.complete_fusion({ext::codeplay::experimental::property::no_barriers{}});

    assert(!fw.is_in_fusion_mode() &&
           "Queue should not be in fusion mode anymore");
  }

  // Check the results: out must hold the final values; tmp still holds the
  // sentinel iff the intermediate dataflow was internalized by the JIT.
  size_t numErrors = 0;
  size_t numInternalized = 0;
  for (size_t idx = 0; idx < dataSize; ++idx) {
    if (out[idx] != ((idx + 5) * 2)) {
      ++numErrors;
    }
    if (tmp[idx] == -1) {
      ++numInternalized;
    }
  }
  if (numErrors) {
    std::cout << "COMPUTATION ERROR\n";
    return;
  }
  if (!expectInternalization && numInternalized) {
    std::cout << "WRONG INTERNALIZATION\n";
    return;
  }
  std::cout << "COMPUTATION OK\n";
}

int main() {
  // Fusion must be explicitly enabled on the queue. The RUN lines also set
  // SYCL_ENABLE_FUSION_CACHING=0, presumably so every scenario triggers a
  // fresh JIT compilation and therefore emits its warning — verify against
  // the runtime's caching behavior.
  queue q{ext::codeplay::experimental::property::queue::enable_fusion{}};

  // Scenario: One accessor without internalization, one with local
  // internalization. Should fall back to no internalization and print a
  // warning.
  std::cout << "None, Local(0)\n";
  performFusion(q, Internalization::None, 0, Internalization::Local, 0);
  // CHECK: None, Local(0)
  // CHECK-NEXT: WARNING: Not performing specified local promotion, due to previous mismatch or because previous accessor specified no promotion
  // CHECK-NEXT: COMPUTATION OK

  // Scenario: One accessor without internalization, one with private
  // internalization. Should fall back to no internalization and print a
  // warning.
  std::cout << "None, Private\n";
  performFusion(q, Internalization::None, 0, Internalization::Private, 0);
  // CHECK-NEXT: None, Private
  // CHECK-NEXT: WARNING: Not performing specified private promotion, due to previous mismatch or because previous accessor specified no promotion
  // CHECK-NEXT: COMPUTATION OK

  // Scenario: Both accessor with local promotion, but the second kernel does
  // not specify a work-group size. No promotion should happen and a warning
  // should be printed.
  std::cout << "Local(8), Local(0)\n";
  performFusion(q, Internalization::Local, 8, Internalization::Local, 0);
  // CHECK-NEXT: Local(8), Local(0)
  // CHECK-NEXT: WARNING: Work-group size for local promotion not specified, not performing internalization
  // CHECK-NEXT: COMPUTATION OK

  // Scenario: Both accessor with local promotion, but the first kernel does
  // not specify a work-group size. No promotion should happen and a warning
  // should be printed.
  std::cout << "Local(0), Local(8)\n";
  performFusion(q, Internalization::Local, 0, Internalization::Local, 8);
  // CHECK-NEXT: Local(0), Local(8)
  // CHECK-NEXT: WARNING: Work-group size for local promotion not specified, not performing internalization
  // CHECK-NEXT: WARNING: Not performing specified local promotion, due to previous mismatch or because previous accessor specified no promotion
  // CHECK-NEXT: WARNING: Cannot fuse kernels with different local size
  // CHECK-NEXT: COMPUTATION OK

  // Scenario: Both accessor with local promotion, but the kernels specify
  // different work-group sizes. No promotion should happen and a warning should
  // be printed.
  std::cout << "Local(8), Local(16)\n";
  performFusion(q, Internalization::Local, 8, Internalization::Local, 16);
  // CHECK-NEXT: Local(8), Local(16)
  // CHECK-NEXT: WARNING: Not performing specified local promotion due to work-group size mismatch
  // CHECK-NEXT: WARNING: Cannot fuse kernels with different local size
  // CHECK-NEXT: COMPUTATION OK

  // Scenario: One accessor with local internalization, one with private
  // internalization. Should fall back to local internalization and print a
  // warning.
  std::cout << "Local(8), Private(8)\n";
  performFusion(q, Internalization::Local, 8, Internalization::Private, 8,
                /* expectInternalization */ true);
  // CHECK-NEXT: Local(8), Private(8)
  // CHECK-NEXT: WARNING: Performing local internalization instead, because previous accessor specified local promotion
  // CHECK-NEXT: COMPUTATION OK

  return 0;
}
83 changes: 83 additions & 0 deletions SYCL/KernelFusion/barrier_local_internalization.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,83 @@
// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out
// RUN: %CPU_RUN_PLACEHOLDER %t.out
// RUN: %GPU_RUN_PLACEHOLDER %t.out
// UNSUPPORTED: cuda || hip
// REQUIRES: fusion

// Test complete fusion with local internalization and a combination of kernels
// that require a work-group barrier to be inserted by fusion.

#include <sycl/sycl.hpp>

using namespace sycl;

int main() {
constexpr size_t dataSize = 512;
int in1[dataSize], in2[dataSize], in3[dataSize], tmp[dataSize], out[dataSize];

for (size_t i = 0; i < dataSize; ++i) {
in1[i] = i * 2;
in2[i] = i * 3;
in3[i] = i * 4;
tmp[i] = -1;
out[i] = -1;
}

queue q{ext::codeplay::experimental::property::queue::enable_fusion{}};

{
buffer<int> bIn1{in1, range{dataSize}};
buffer<int> bIn2{in2, range{dataSize}};
buffer<int> bIn3{in3, range{dataSize}};
buffer<int> bTmp{
tmp,
range{dataSize},
{sycl::ext::codeplay::experimental::property::promote_local{}}};
buffer<int> bOut{out, range{dataSize}};

ext::codeplay::experimental::fusion_wrapper fw{q};
fw.start_fusion();

assert(fw.is_in_fusion_mode() && "Queue should be in fusion mode");

q.submit([&](handler &cgh) {
auto accIn1 = bIn1.get_access(cgh);
auto accIn2 = bIn2.get_access(cgh);
auto accTmp = bTmp.get_access(cgh);
cgh.parallel_for<class KernelOne>(
nd_range<1>{{dataSize}, {32}}, [=](nd_item<1> i) {
auto workgroupSize = i.get_local_range(0);
auto baseOffset = i.get_group_linear_id() * workgroupSize;
auto localIndex = i.get_local_linear_id();
auto localOffset = (workgroupSize - 1) - localIndex;
accTmp[baseOffset + localOffset] =
accIn1[baseOffset + localOffset] +
accIn2[baseOffset + localOffset];
});
});

q.submit([&](handler &cgh) {
auto accTmp = bTmp.get_access(cgh);
auto accIn3 = bIn3.get_access(cgh);
auto accOut = bOut.get_access(cgh);
cgh.parallel_for<class KernelTwo>(
nd_range<1>{{dataSize}, {32}}, [=](nd_item<1> i) {
auto index = i.get_global_linear_id();
accOut[index] = accTmp[index] * accIn3[index];
});
});

fw.complete_fusion();

assert(!fw.is_in_fusion_mode() &&
"Queue should not be in fusion mode anymore");
}

// Check the results
for (size_t i = 0; i < dataSize; ++i) {
assert(out[i] == (20 * i * i) && "Computation error");
assert(tmp[i] == -1 && "Not internalized");
}

return 0;
}
Loading

0 comments on commit 60ab99e

Please sign in to comment.