From 568a96aabc6edabe8514ae163aecc64cd5a41878 Mon Sep 17 00:00:00 2001 From: "Mateusz P. Nowak" <112635238+mateuszpn@users.noreply.github.com> Date: Tue, 15 Oct 2024 13:57:26 +0200 Subject: [PATCH] Benchmark updates for faster run and more reliable results (#2164) --- scripts/benchmarks/benches/SobelFilter.py | 2 +- scripts/benchmarks/benches/bitcracker.py | 2 +- scripts/benchmarks/benches/compute.py | 9 +- scripts/benchmarks/benches/hashtable.py | 2 +- scripts/benchmarks/benches/quicksilver.py | 2 +- scripts/benchmarks/benches/result.py | 1 + scripts/benchmarks/benches/syclbench.py | 256 +++++++++++++--------- scripts/benchmarks/benches/velocity.py | 6 +- scripts/benchmarks/main.py | 38 ++-- 9 files changed, 187 insertions(+), 131 deletions(-) diff --git a/scripts/benchmarks/benches/SobelFilter.py b/scripts/benchmarks/benches/SobelFilter.py index b28681c2ee..b9e7619e47 100644 --- a/scripts/benchmarks/benches/SobelFilter.py +++ b/scripts/benchmarks/benches/SobelFilter.py @@ -35,5 +35,5 @@ def parse_output(self, stdout: str) -> float: if match: return round(float(match.group(1)) * 1000, 3) else: - raise ValueError("Failed to parse benchmark output.") + raise ValueError(f"{self.__class__.__name__}: Failed to parse benchmark output.") diff --git a/scripts/benchmarks/benches/bitcracker.py b/scripts/benchmarks/benches/bitcracker.py index 4b2f2aba4f..bb198433fa 100644 --- a/scripts/benchmarks/benches/bitcracker.py +++ b/scripts/benchmarks/benches/bitcracker.py @@ -31,4 +31,4 @@ def parse_output(self, stdout: str) -> float: if match: return float(match.group(1)) else: - raise ValueError("Failed to parse benchmark output.") + raise ValueError(f"{self.__class__.__name__}: Failed to parse benchmark output.") diff --git a/scripts/benchmarks/benches/compute.py b/scripts/benchmarks/benches/compute.py index 974286a9f5..473db80a75 100644 --- a/scripts/benchmarks/benches/compute.py +++ b/scripts/benchmarks/benches/compute.py @@ -20,7 +20,7 @@ def setup(self): if self.built: return - 
repo_path = git_clone(self.directory, "compute-benchmarks-repo", "https://github.com/intel/compute-benchmarks.git", "f6882552215736f90295244046fcb6e17fe53e83") + repo_path = git_clone(self.directory, "compute-benchmarks-repo", "https://github.com/intel/compute-benchmarks.git", "08c41bb8bc1762ad53c6194df6d36bfcceff4aa2") build_path = create_build_path(self.directory, 'compute-benchmarks-build') configure_command = [ @@ -34,12 +34,13 @@ def setup(self): f"-DBUILD_UR=ON", f"-Dunified-runtime_DIR={options.ur_dir}/lib/cmake/unified-runtime", ] - run(configure_command, add_sycl=True) + print(f"{self.__class__.__name__}: Run {configure_command}") + run(configure_command, add_sycl=True) + print(f"{self.__class__.__name__}: Run cmake --build {build_path} -j") run(f"cmake --build {build_path} -j", add_sycl=True) self.built = True - self.bins = os.path.join(build_path, 'bin') class ComputeBenchmark(Benchmark): def __init__(self, bench, name, test): @@ -58,8 +59,8 @@ def unit(self): return "μs" def setup(self): + self.benchmark_bin = os.path.join(self.bench.directory, 'compute-benchmarks-build', 'bin', self.bench_name) self.bench.setup() - self.benchmark_bin = os.path.join(self.bench.bins, self.bench_name) def run(self, env_vars) -> list[Result]: command = [ diff --git a/scripts/benchmarks/benches/hashtable.py b/scripts/benchmarks/benches/hashtable.py index 7558183bf0..c5ed397dbb 100644 --- a/scripts/benchmarks/benches/hashtable.py +++ b/scripts/benchmarks/benches/hashtable.py @@ -31,4 +31,4 @@ def parse_output(self, stdout: str) -> float: if match: return float(match.group(1)) else: - raise ValueError("Failed to parse keys per second from benchmark output.") + raise ValueError(f"{self.__class__.__name__}: Failed to parse keys per second from benchmark output.") diff --git a/scripts/benchmarks/benches/quicksilver.py b/scripts/benchmarks/benches/quicksilver.py index c864e6c368..b7600d11be 100644 --- a/scripts/benchmarks/benches/quicksilver.py +++ 
b/scripts/benchmarks/benches/quicksilver.py @@ -42,4 +42,4 @@ def parse_output(self, stdout: str) -> float: if match: return float(match.group(1)) else: - raise ValueError("Failed to parse benchmark output.") + raise ValueError("{self.__class__.__name__}: Failed to parse benchmark output.") diff --git a/scripts/benchmarks/benches/result.py b/scripts/benchmarks/benches/result.py index 896ff4da98..6fc7e16095 100644 --- a/scripts/benchmarks/benches/result.py +++ b/scripts/benchmarks/benches/result.py @@ -14,6 +14,7 @@ class Result: command: str env: str stdout: str + passed: bool = True unit: str = "" name: str = "" lower_is_better: bool = True diff --git a/scripts/benchmarks/benches/syclbench.py b/scripts/benchmarks/benches/syclbench.py index f52c68c2dd..b9d6e50623 100644 --- a/scripts/benchmarks/benches/syclbench.py +++ b/scripts/benchmarks/benches/syclbench.py @@ -22,9 +22,7 @@ def setup(self): if self.built: return - build_path = os.path.join(self.directory, 'sycl-bench-build') - create_build_path(build_path, '') - + build_path = create_build_path(self.directory, 'sycl-bench-build') repo_path = git_clone(self.directory, "sycl-bench-repo", "https://github.com/mateuszpn/sycl-bench.git", "1e6ab2cfd004a72c5336c26945965017e06eab71") configure_command = [ @@ -37,20 +35,17 @@ def setup(self): f"-DSYCL_IMPL=dpcpp" ] - print(f"Run {configure_command}") run(configure_command, add_sycl=True) - - print(f"Run cmake --build {build_path}") run(f"cmake --build {build_path} -j", add_sycl=True) self.built = True - self.bins = build_path class SyclBenchmark(Benchmark): def __init__(self, bench, name, test): self.bench = bench self.bench_name = name self.test = test + self.done = False super().__init__(bench.directory) def bin_args(self) -> list[str]: @@ -64,17 +59,19 @@ def unit(self): def setup(self): self.bench.setup() - self.benchmark_bin = os.path.join(self.bench.bins, self.bench_name) + self.benchmark_bin = os.path.join(self.directory, 'sycl-bench-build', self.bench_name) def 
run(self, env_vars) -> list[Result]: - outputfile = f"{self.bench.directory}/{self.test}.csv" + if self.done: + return + self.outputfile = os.path.join(self.bench.directory, self.test+".csv") + print(f"{self.__class__.__name__}: Results in {self.outputfile}") command = [ f"{self.benchmark_bin}", f"--warmup-run", - f"--num-runs=3", - f"--output={outputfile}" + f"--num-runs={options.iterations}", + f"--output={self.outputfile}" ] - bin_dir = self.bench.bins command += self.bin_args() env_vars.update(self.extra_env_vars()) @@ -82,26 +79,158 @@ def run(self, env_vars) -> list[Result]: # no output to stdout, all in outputfile self.run_bench(command, env_vars) - with open(outputfile, 'r') as f: + with open(self.outputfile, 'r') as f: reader = csv.reader(f) res_list = [] for row in reader: if not row[0].startswith('#'): res_list.append( Result(label=row[0], - value=float(row[12]) * 1000, # convert to ms - command=command, - env=env_vars, - stdout=row)) - + value=float(row[12]) * 1000, # convert to ms + passed=(row[1]=="PASS"), + command=command, + env=env_vars, + stdout=row)) + self.done = True return res_list def teardown(self): + print(f"Removing {self.outputfile}...") + os.remove(self.outputfile) return def name(self): return self.test +# multi benchmarks +class Blocked_transform(SyclBenchmark): + def __init__(self, bench): + super().__init__(bench, "blocked_transform", "BlockedTransform_multi") + + def bin_args(self) -> list[str]: + return [ + f"--size=2049", + f"--local=1024" + ] + +class DagTaskI(SyclBenchmark): + def __init__(self, bench): + super().__init__(bench, "dag_task_throughput_independent", "IndependentDAGTaskThroughput_multi") + + def bin_args(self) -> list[str]: + return [ + f"--size=32768", + ] + +class DagTaskS(SyclBenchmark): + def __init__(self, bench): + super().__init__(bench, "dag_task_throughput_sequential", "DAGTaskThroughput_multi") + + def bin_args(self) -> list[str]: + return [ + f"--size=327680", + ] + +class HostDevBandwidth(SyclBenchmark): 
+ def __init__(self, bench): + super().__init__(bench, "host_device_bandwidth", "HostDeviceBandwidth_multi") + + def bin_args(self) -> list[str]: + return [ + f"--size=512", + ] + +class LocalMem(SyclBenchmark): + def __init__(self, bench): + super().__init__(bench, "local_mem", f"LocalMem_multi") + + def bin_args(self) -> list[str]: + return [ + f"--size=10240000", + ] + +class Pattern_L2(SyclBenchmark): + def __init__(self, bench): + super().__init__(bench, "pattern_L2", "L2_multi") + + def bin_args(self) -> list[str]: + return [ + f"--size=1024000000", + ] + +class Reduction(SyclBenchmark): + def __init__(self, bench): + super().__init__(bench, "reduction", "Pattern_Reduction_multi") + + def bin_args(self) -> list[str]: + return [ + f"--size=10240000", + ] + +class ScalarProd(SyclBenchmark): + def __init__(self, bench): + super().__init__(bench, "scalar_prod", "ScalarProduct_multi") + + def bin_args(self) -> list[str]: + return [ + f"--size=102400000", + ] + +class SegmentReduction(SyclBenchmark): + def __init__(self, bench): + super().__init__(bench, "segmentedreduction", "Pattern_SegmentedReduction_multi") + + def bin_args(self) -> list[str]: + return [ + f"--size=102400000", + ] + +class UsmAccLatency(SyclBenchmark): + def __init__(self, bench): + super().__init__(bench, "usm_accessors_latency", "USM_Latency_multi") + + def bin_args(self) -> list[str]: + return [ + f"--size=4096", + ] + +class UsmAllocLatency(SyclBenchmark): + def __init__(self, bench): + super().__init__(bench, "usm_allocation_latency", "USM_Allocation_latency_multi") + + def bin_args(self) -> list[str]: + return [ + f"--size=1024000000", + ] + +class UsmInstrMix(SyclBenchmark): + def __init__(self, bench): + super().__init__(bench, "usm_instr_mix", "USM_Instr_Mix_multi") + + def bin_args(self) -> list[str]: + return [ + f"--size=8192", + ] + +class UsmPinnedOverhead(SyclBenchmark): + def __init__(self, bench): + super().__init__(bench, "usm_pinned_overhead", "USM_Pinned_Overhead_multi") + + 
def bin_args(self) -> list[str]: + return [ + f"--size=10240000", + ] + +class VecAdd(SyclBenchmark): + def __init__(self, bench): + super().__init__(bench, "vec_add", "VectorAddition_multi") + + def bin_args(self) -> list[str]: + return [ + f"--size=102400000", + ] + +# single benchmarks class Arith(SyclBenchmark): def __init__(self, bench): super().__init__(bench, "arith", "Arith_int32_512") @@ -152,7 +281,7 @@ def __init__(self, bench): def bin_args(self) -> list[str]: return [ - f"--size=20480", + f"--size=204800", ] class Correlation(SyclBenchmark): @@ -179,7 +308,7 @@ def __init__(self, bench): def bin_args(self) -> list[str]: return [ - f"--size=8192", + f"--size=1536", ] class Gesumv(SyclBenchmark): @@ -224,7 +353,7 @@ def __init__(self, bench): def bin_args(self) -> list[str]: return [ - f"--size=640000", + f"--size=4096", ] class MatmulChain(SyclBenchmark): @@ -269,7 +398,7 @@ def __init__(self, bench): def bin_args(self) -> list[str]: return [ - f"--size=--size=100000000", + f"--size=5000000000", ] class Syr2k(SyclBenchmark): @@ -278,7 +407,7 @@ def __init__(self, bench): def bin_args(self) -> list[str]: return [ - f"--size=6144", + f"--size=2048", ] class Syrk(SyclBenchmark): @@ -287,84 +416,5 @@ def __init__(self, bench): def bin_args(self) -> list[str]: return [ - f"--size=4096", - ] - -# multi benchmarks -class Blocked_transform(SyclBenchmark): - def __init__(self, bench): - super().__init__(bench, "blocked_transform", "BlockedTransform_multi") - - def bin_args(self) -> list[str]: - return [ - f"--size=16384", - f"--local=1024" - ] - -class DagTaskI(SyclBenchmark): - def __init__(self, bench): - super().__init__(bench, "dag_task_throughput_independent", "IndependentDAGTaskThroughput_multi") - - def bin_args(self) -> list[str]: - return [ - f"--size=32768", + f"--size=1024", ] - -class DagTaskS(SyclBenchmark): - def __init__(self, bench): - super().__init__(bench, "dag_task_throughput_sequential", "DAGTaskThroughput_multi") - - def bin_args(self) -> 
list[str]: - return [ - f"--size=327680", - ] - -class HostDevBandwidth(SyclBenchmark): - def __init__(self, bench): - super().__init__(bench, "host_device_bandwidth", "HostDeviceBandwidth_multi") - -class LocalMem(SyclBenchmark): - def __init__(self, bench): - super().__init__(bench, "local_mem", f"LocalMem_multi") - - def bin_args(self) -> list[str]: - return [ - f"--size=512", - ] - -class Pattern_L2(SyclBenchmark): - def __init__(self, bench): - super().__init__(bench, "pattern_L2", "L2_multi") - -class Reduction(SyclBenchmark): - def __init__(self, bench): - super().__init__(bench, "reduction", "Pattern_Reduction_multi") - -class ScalarProd(SyclBenchmark): - def __init__(self, bench): - super().__init__(bench, "scalar_prod", "ScalarProduct_multi") - -class SegmentReduction(SyclBenchmark): - def __init__(self, bench): - super().__init__(bench, "segmentedreduction", "Pattern_SegmentedReduction_multi") - -class UsmAccLatency(SyclBenchmark): - def __init__(self, bench): - super().__init__(bench, "usm_accessors_latency", "USM_Latency_multi") - -class UsmAllocLatency(SyclBenchmark): - def __init__(self, bench): - super().__init__(bench, "usm_allocation_latency", "USM_Allocation_latency_multi") - -class UsmInstrMix(SyclBenchmark): - def __init__(self, bench): - super().__init__(bench, "usm_instr_mix", "USM_Instr_Mix_multi") - -class UsmPinnedOverhead(SyclBenchmark): - def __init__(self, bench): - super().__init__(bench, "usm_pinned_overhead", "USM_Pinned_Overhead_multi") - -class VecAdd(SyclBenchmark): - def __init__(self, bench): - super().__init__(bench, "vec_add", "VectorAddition_multi") - diff --git a/scripts/benchmarks/benches/velocity.py b/scripts/benchmarks/benches/velocity.py index 9d79f78178..3c903bf11b 100644 --- a/scripts/benchmarks/benches/velocity.py +++ b/scripts/benchmarks/benches/velocity.py @@ -9,12 +9,11 @@ from utils.utils import run, create_build_path from .options import options import os -import re class VelocityBench: def __init__(self, 
directory): self.directory = directory - self.repo_path = git_clone(self.directory, "velocity-bench-repo", "https://github.com/oneapi-src/Velocity-Bench", "34ee4ebe18d91dfdd38b7d798fd986b41874fcbc") + self.repo_path = git_clone(self.directory, "velocity-bench-repo", "https://github.com/oneapi-src/Velocity-Bench/", "b22215c16f789100449c34bf4eaa3fb178983d69") class VelocityBase(Benchmark): def __init__(self, name: str, bin_name: str, vb: VelocityBench): @@ -29,6 +28,7 @@ def download_deps(self): def setup(self): self.download_deps() + self.benchmark_bin = os.path.join(self.directory, self.bench_name, self.bin_name) build_path = create_build_path(self.directory, self.bench_name) @@ -41,8 +41,6 @@ def setup(self): run(configure_command, {'CC': 'clang', 'CXX':'clang++'}, add_sycl=True) run(f"cmake --build {build_path} -j", add_sycl=True) - self.benchmark_bin = os.path.join(build_path, self.bin_name) - def bin_args(self) -> list[str]: return [] diff --git a/scripts/benchmarks/main.py b/scripts/benchmarks/main.py index e4844a6f09..dac43643fc 100755 --- a/scripts/benchmarks/main.py +++ b/scripts/benchmarks/main.py @@ -19,16 +19,17 @@ from output import generate_markdown import argparse import re +import subprocess # Update this if you are changing the layout of the results files -INTERNAL_WORKDIR_VERSION = '1.6' +INTERNAL_WORKDIR_VERSION = '1.7' def main(directory, additional_env_vars, save_name, compare_names, filter): prepare_workdir(directory, INTERNAL_WORKDIR_VERSION) cb = ComputeBench(directory) - sb = SyclBench(directory) vb = VelocityBench(directory) + sb = SyclBench(directory) benchmarks = [ # *** Compute benchmarks @@ -53,7 +54,7 @@ def main(directory, additional_env_vars, save_name, compare_names, filter): SobelFilter(vb), # *** sycl-bench multi benchmarks - Blocked_transform(sb), + # Blocked_transform(sb), # run time < 1ms DagTaskI(sb), DagTaskS(sb), HostDevBandwidth(sb), @@ -69,12 +70,12 @@ def main(directory, additional_env_vars, save_name, compare_names, 
filter): VecAdd(sb), # *** sycl-bench single benchmarks - TwoDConvolution(sb), + # TwoDConvolution(sb), # run time < 1ms Two_mm(sb), Three_mm(sb), - Arith(sb), + # Arith(sb), # run time < 1ms Atax(sb), - Atomic_reduction(sb), + # Atomic_reduction(sb), # run time < 1ms Bicg(sb), Correlation(sb), Covariance(sb), @@ -83,7 +84,7 @@ def main(directory, additional_env_vars, save_name, compare_names, filter): Gramschmidt(sb), KMeans(sb), LinRegCoeff(sb), - LinRegError(sb), + # LinRegError(sb), # run time < 1ms MatmulChain(sb), MolDyn(sb), Mvt(sb), @@ -117,24 +118,29 @@ def main(directory, additional_env_vars, save_name, compare_names, filter): bench_results = benchmark.run(merged_env_vars) if bench_results is not None: for bench_result in bench_results: - print(f"complete ({bench_result.label}: {bench_result.value} {benchmark.unit()}).") + if bench_result.passed: + print(f"complete ({bench_result.label}: {bench_result.value:.3f} {benchmark.unit()}).") + else: + print(f"complete ({bench_result.label}: verification FAILED)") iteration_results.append(bench_result) else: - print(f"did not finish.") + print(f"did not finish (OK for sycl-bench).") + break; if len(iteration_results) == 0: continue for label in set([result.label for result in iteration_results]): - label_results = [result for result in iteration_results if result.label == label] - label_results.sort(key=lambda res: res.value) - median_index = len(label_results) // 2 - median_result = label_results[median_index] + label_results = [result for result in iteration_results if result.label == label and result.passed == True] + if len(label_results) > 0: + label_results.sort(key=lambda res: res.value) + median_index = len(label_results) // 2 + median_result = label_results[median_index] - median_result.unit = benchmark.unit() - median_result.name = label + median_result.unit = benchmark.unit() + median_result.name = label - results.append(median_result) + results.append(median_result) except Exception as e: if 
options.exit_on_failure: raise e