From 568a96aabc6edabe8514ae163aecc64cd5a41878 Mon Sep 17 00:00:00 2001 From: "Mateusz P. Nowak" <112635238+mateuszpn@users.noreply.github.com> Date: Tue, 15 Oct 2024 13:57:26 +0200 Subject: [PATCH] Benchmark updates for faster run and more reliable results (#2164) --- scripts/benchmarks/benches/SobelFilter.py | 2 +- scripts/benchmarks/benches/bitcracker.py | 2 +- scripts/benchmarks/benches/compute.py | 9 +- scripts/benchmarks/benches/hashtable.py | 2 +- scripts/benchmarks/benches/quicksilver.py | 2 +- scripts/benchmarks/benches/result.py | 1 + scripts/benchmarks/benches/syclbench.py | 256 +++++++++++++--------- scripts/benchmarks/benches/velocity.py | 6 +- scripts/benchmarks/main.py | 38 ++-- 9 files changed, 187 insertions(+), 131 deletions(-) diff --git a/scripts/benchmarks/benches/SobelFilter.py b/scripts/benchmarks/benches/SobelFilter.py index b28681c2ee..b9e7619e47 100644 --- a/scripts/benchmarks/benches/SobelFilter.py +++ b/scripts/benchmarks/benches/SobelFilter.py @@ -35,5 +35,5 @@ def parse_output(self, stdout: str) -> float: if match: return round(float(match.group(1)) * 1000, 3) else: - raise ValueError("Failed to parse benchmark output.") + raise ValueError(f"{self.__class__.__name__}: Failed to parse benchmark output.") diff --git a/scripts/benchmarks/benches/bitcracker.py b/scripts/benchmarks/benches/bitcracker.py index 4b2f2aba4f..bb198433fa 100644 --- a/scripts/benchmarks/benches/bitcracker.py +++ b/scripts/benchmarks/benches/bitcracker.py @@ -31,4 +31,4 @@ def parse_output(self, stdout: str) -> float: if match: return float(match.group(1)) else: - raise ValueError("Failed to parse benchmark output.") + raise ValueError(f"{self.__class__.__name__}: Failed to parse benchmark output.") diff --git a/scripts/benchmarks/benches/compute.py b/scripts/benchmarks/benches/compute.py index 974286a9f5..473db80a75 100644 --- a/scripts/benchmarks/benches/compute.py +++ b/scripts/benchmarks/benches/compute.py @@ -20,7 +20,7 @@ def setup(self): if self.built: return - 
repo_path = git_clone(self.directory, "compute-benchmarks-repo", "https://github.com/intel/compute-benchmarks.git", "f6882552215736f90295244046fcb6e17fe53e83") + repo_path = git_clone(self.directory, "compute-benchmarks-repo", "https://github.com/intel/compute-benchmarks.git", "08c41bb8bc1762ad53c6194df6d36bfcceff4aa2") build_path = create_build_path(self.directory, 'compute-benchmarks-build') configure_command = [ @@ -34,12 +34,13 @@ def setup(self): f"-DBUILD_UR=ON", f"-Dunified-runtime_DIR={options.ur_dir}/lib/cmake/unified-runtime", ] - run(configure_command, add_sycl=True) + print(f"{self.__class__.__name__}: Run {configure_command}") + run(configure_command, add_sycl=True) + print(f"{self.__class__.__name__}: Run cmake --build {build_path} -j") run(f"cmake --build {build_path} -j", add_sycl=True) self.built = True - self.bins = os.path.join(build_path, 'bin') class ComputeBenchmark(Benchmark): def __init__(self, bench, name, test): @@ -58,8 +59,8 @@ def unit(self): return "μs" def setup(self): + self.benchmark_bin = os.path.join(self.bench.directory, 'compute-benchmarks-build', 'bin', self.bench_name) self.bench.setup() - self.benchmark_bin = os.path.join(self.bench.bins, self.bench_name) def run(self, env_vars) -> list[Result]: command = [ diff --git a/scripts/benchmarks/benches/hashtable.py b/scripts/benchmarks/benches/hashtable.py index 7558183bf0..c5ed397dbb 100644 --- a/scripts/benchmarks/benches/hashtable.py +++ b/scripts/benchmarks/benches/hashtable.py @@ -31,4 +31,4 @@ def parse_output(self, stdout: str) -> float: if match: return float(match.group(1)) else: - raise ValueError("Failed to parse keys per second from benchmark output.") + raise ValueError(f"{self.__class__.__name__}: Failed to parse keys per second from benchmark output.") diff --git a/scripts/benchmarks/benches/quicksilver.py b/scripts/benchmarks/benches/quicksilver.py index c864e6c368..b7600d11be 100644 --- a/scripts/benchmarks/benches/quicksilver.py +++ 
b/scripts/benchmarks/benches/quicksilver.py @@ -42,4 +42,4 @@ def parse_output(self, stdout: str) -> float: if match: return float(match.group(1)) else: - raise ValueError("Failed to parse benchmark output.") + raise ValueError("{self.__class__.__name__}: Failed to parse benchmark output.") diff --git a/scripts/benchmarks/benches/result.py b/scripts/benchmarks/benches/result.py index 896ff4da98..6fc7e16095 100644 --- a/scripts/benchmarks/benches/result.py +++ b/scripts/benchmarks/benches/result.py @@ -14,6 +14,7 @@ class Result: command: str env: str stdout: str + passed: bool = True unit: str = "" name: str = "" lower_is_better: bool = True diff --git a/scripts/benchmarks/benches/syclbench.py b/scripts/benchmarks/benches/syclbench.py index f52c68c2dd..b9d6e50623 100644 --- a/scripts/benchmarks/benches/syclbench.py +++ b/scripts/benchmarks/benches/syclbench.py @@ -22,9 +22,7 @@ def setup(self): if self.built: return - build_path = os.path.join(self.directory, 'sycl-bench-build') - create_build_path(build_path, '') - + build_path = create_build_path(self.directory, 'sycl-bench-build') repo_path = git_clone(self.directory, "sycl-bench-repo", "https://github.com/mateuszpn/sycl-bench.git", "1e6ab2cfd004a72c5336c26945965017e06eab71") configure_command = [ @@ -37,20 +35,17 @@ def setup(self): f"-DSYCL_IMPL=dpcpp" ] - print(f"Run {configure_command}") run(configure_command, add_sycl=True) - - print(f"Run cmake --build {build_path}") run(f"cmake --build {build_path} -j", add_sycl=True) self.built = True - self.bins = build_path class SyclBenchmark(Benchmark): def __init__(self, bench, name, test): self.bench = bench self.bench_name = name self.test = test + self.done = False super().__init__(bench.directory) def bin_args(self) -> list[str]: @@ -64,17 +59,19 @@ def unit(self): def setup(self): self.bench.setup() - self.benchmark_bin = os.path.join(self.bench.bins, self.bench_name) + self.benchmark_bin = os.path.join(self.directory, 'sycl-bench-build', self.bench_name) def 
run(self, env_vars) -> list[Result]: - outputfile = f"{self.bench.directory}/{self.test}.csv" + if self.done: + return + self.outputfile = os.path.join(self.bench.directory, self.test+".csv") + print(f"{self.__class__.__name__}: Results in {self.outputfile}") command = [ f"{self.benchmark_bin}", f"--warmup-run", - f"--num-runs=3", - f"--output={outputfile}" + f"--num-runs={options.iterations}", + f"--output={self.outputfile}" ] - bin_dir = self.bench.bins command += self.bin_args() env_vars.update(self.extra_env_vars()) @@ -82,26 +79,158 @@ def run(self, env_vars) -> list[Result]: # no output to stdout, all in outputfile self.run_bench(command, env_vars) - with open(outputfile, 'r') as f: + with open(self.outputfile, 'r') as f: reader = csv.reader(f) res_list = [] for row in reader: if not row[0].startswith('#'): res_list.append( Result(label=row[0], - value=float(row[12]) * 1000, # convert to ms - command=command, - env=env_vars, - stdout=row)) - + value=float(row[12]) * 1000, # convert to ms + passed=(row[1]=="PASS"), + command=command, + env=env_vars, + stdout=row)) + self.done = True return res_list def teardown(self): + print(f"Removing {self.outputfile}...") + os.remove(self.outputfile) return def name(self): return self.test +# multi benchmarks +class Blocked_transform(SyclBenchmark): + def __init__(self, bench): + super().__init__(bench, "blocked_transform", "BlockedTransform_multi") + + def bin_args(self) -> list[str]: + return [ + f"--size=2049", + f"--local=1024" + ] + +class DagTaskI(SyclBenchmark): + def __init__(self, bench): + super().__init__(bench, "dag_task_throughput_independent", "IndependentDAGTaskThroughput_multi") + + def bin_args(self) -> list[str]: + return [ + f"--size=32768", + ] + +class DagTaskS(SyclBenchmark): + def __init__(self, bench): + super().__init__(bench, "dag_task_throughput_sequential", "DAGTaskThroughput_multi") + + def bin_args(self) -> list[str]: + return [ + f"--size=327680", + ] + +class HostDevBandwidth(SyclBenchmark): 
+ def __init__(self, bench): + super().__init__(bench, "host_device_bandwidth", "HostDeviceBandwidth_multi") + + def bin_args(self) -> list[str]: + return [ + f"--size=512", + ] + +class LocalMem(SyclBenchmark): + def __init__(self, bench): + super().__init__(bench, "local_mem", f"LocalMem_multi") + + def bin_args(self) -> list[str]: + return [ + f"--size=10240000", + ] + +class Pattern_L2(SyclBenchmark): + def __init__(self, bench): + super().__init__(bench, "pattern_L2", "L2_multi") + + def bin_args(self) -> list[str]: + return [ + f"--size=1024000000", + ] + +class Reduction(SyclBenchmark): + def __init__(self, bench): + super().__init__(bench, "reduction", "Pattern_Reduction_multi") + + def bin_args(self) -> list[str]: + return [ + f"--size=10240000", + ] + +class ScalarProd(SyclBenchmark): + def __init__(self, bench): + super().__init__(bench, "scalar_prod", "ScalarProduct_multi") + + def bin_args(self) -> list[str]: + return [ + f"--size=102400000", + ] + +class SegmentReduction(SyclBenchmark): + def __init__(self, bench): + super().__init__(bench, "segmentedreduction", "Pattern_SegmentedReduction_multi") + + def bin_args(self) -> list[str]: + return [ + f"--size=102400000", + ] + +class UsmAccLatency(SyclBenchmark): + def __init__(self, bench): + super().__init__(bench, "usm_accessors_latency", "USM_Latency_multi") + + def bin_args(self) -> list[str]: + return [ + f"--size=4096", + ] + +class UsmAllocLatency(SyclBenchmark): + def __init__(self, bench): + super().__init__(bench, "usm_allocation_latency", "USM_Allocation_latency_multi") + + def bin_args(self) -> list[str]: + return [ + f"--size=1024000000", + ] + +class UsmInstrMix(SyclBenchmark): + def __init__(self, bench): + super().__init__(bench, "usm_instr_mix", "USM_Instr_Mix_multi") + + def bin_args(self) -> list[str]: + return [ + f"--size=8192", + ] + +class UsmPinnedOverhead(SyclBenchmark): + def __init__(self, bench): + super().__init__(bench, "usm_pinned_overhead", "USM_Pinned_Overhead_multi") + + 
def bin_args(self) -> list[str]: + return [ + f"--size=10240000", + ] + +class VecAdd(SyclBenchmark): + def __init__(self, bench): + super().__init__(bench, "vec_add", "VectorAddition_multi") + + def bin_args(self) -> list[str]: + return [ + f"--size=102400000", + ] + +# single benchmarks class Arith(SyclBenchmark): def __init__(self, bench): super().__init__(bench, "arith", "Arith_int32_512") @@ -152,7 +281,7 @@ def __init__(self, bench): def bin_args(self) -> list[str]: return [ - f"--size=20480", + f"--size=204800", ] class Correlation(SyclBenchmark): @@ -179,7 +308,7 @@ def __init__(self, bench): def bin_args(self) -> list[str]: return [ - f"--size=8192", + f"--size=1536", ] class Gesumv(SyclBenchmark): @@ -224,7 +353,7 @@ def __init__(self, bench): def bin_args(self) -> list[str]: return [ - f"--size=640000", + f"--size=4096", ] class MatmulChain(SyclBenchmark): @@ -269,7 +398,7 @@ def __init__(self, bench): def bin_args(self) -> list[str]: return [ - f"--size=--size=100000000", + f"--size=5000000000", ] class Syr2k(SyclBenchmark): @@ -278,7 +407,7 @@ def __init__(self, bench): def bin_args(self) -> list[str]: return [ - f"--size=6144", + f"--size=2048", ] class Syrk(SyclBenchmark): @@ -287,84 +416,5 @@ def __init__(self, bench): def bin_args(self) -> list[str]: return [ - f"--size=4096", - ] - -# multi benchmarks -class Blocked_transform(SyclBenchmark): - def __init__(self, bench): - super().__init__(bench, "blocked_transform", "BlockedTransform_multi") - - def bin_args(self) -> list[str]: - return [ - f"--size=16384", - f"--local=1024" - ] - -class DagTaskI(SyclBenchmark): - def __init__(self, bench): - super().__init__(bench, "dag_task_throughput_independent", "IndependentDAGTaskThroughput_multi") - - def bin_args(self) -> list[str]: - return [ - f"--size=32768", + f"--size=1024", ] - -class DagTaskS(SyclBenchmark): - def __init__(self, bench): - super().__init__(bench, "dag_task_throughput_sequential", "DAGTaskThroughput_multi") - - def bin_args(self) -> 
list[str]: - return [ - f"--size=327680", - ] - -class HostDevBandwidth(SyclBenchmark): - def __init__(self, bench): - super().__init__(bench, "host_device_bandwidth", "HostDeviceBandwidth_multi") - -class LocalMem(SyclBenchmark): - def __init__(self, bench): - super().__init__(bench, "local_mem", f"LocalMem_multi") - - def bin_args(self) -> list[str]: - return [ - f"--size=512", - ] - -class Pattern_L2(SyclBenchmark): - def __init__(self, bench): - super().__init__(bench, "pattern_L2", "L2_multi") - -class Reduction(SyclBenchmark): - def __init__(self, bench): - super().__init__(bench, "reduction", "Pattern_Reduction_multi") - -class ScalarProd(SyclBenchmark): - def __init__(self, bench): - super().__init__(bench, "scalar_prod", "ScalarProduct_multi") - -class SegmentReduction(SyclBenchmark): - def __init__(self, bench): - super().__init__(bench, "segmentedreduction", "Pattern_SegmentedReduction_multi") - -class UsmAccLatency(SyclBenchmark): - def __init__(self, bench): - super().__init__(bench, "usm_accessors_latency", "USM_Latency_multi") - -class UsmAllocLatency(SyclBenchmark): - def __init__(self, bench): - super().__init__(bench, "usm_allocation_latency", "USM_Allocation_latency_multi") - -class UsmInstrMix(SyclBenchmark): - def __init__(self, bench): - super().__init__(bench, "usm_instr_mix", "USM_Instr_Mix_multi") - -class UsmPinnedOverhead(SyclBenchmark): - def __init__(self, bench): - super().__init__(bench, "usm_pinned_overhead", "USM_Pinned_Overhead_multi") - -class VecAdd(SyclBenchmark): - def __init__(self, bench): - super().__init__(bench, "vec_add", "VectorAddition_multi") - diff --git a/scripts/benchmarks/benches/velocity.py b/scripts/benchmarks/benches/velocity.py index 9d79f78178..3c903bf11b 100644 --- a/scripts/benchmarks/benches/velocity.py +++ b/scripts/benchmarks/benches/velocity.py @@ -9,12 +9,11 @@ from utils.utils import run, create_build_path from .options import options import os -import re class VelocityBench: def __init__(self, 
directory): self.directory = directory - self.repo_path = git_clone(self.directory, "velocity-bench-repo", "https://github.com/oneapi-src/Velocity-Bench", "34ee4ebe18d91dfdd38b7d798fd986b41874fcbc") + self.repo_path = git_clone(self.directory, "velocity-bench-repo", "https://github.com/oneapi-src/Velocity-Bench/", "b22215c16f789100449c34bf4eaa3fb178983d69") class VelocityBase(Benchmark): def __init__(self, name: str, bin_name: str, vb: VelocityBench): @@ -29,6 +28,7 @@ def download_deps(self): def setup(self): self.download_deps() + self.benchmark_bin = os.path.join(self.directory, self.bench_name, self.bin_name) build_path = create_build_path(self.directory, self.bench_name) @@ -41,8 +41,6 @@ def setup(self): run(configure_command, {'CC': 'clang', 'CXX':'clang++'}, add_sycl=True) run(f"cmake --build {build_path} -j", add_sycl=True) - self.benchmark_bin = os.path.join(build_path, self.bin_name) - def bin_args(self) -> list[str]: return [] diff --git a/scripts/benchmarks/main.py b/scripts/benchmarks/main.py index e4844a6f09..dac43643fc 100755 --- a/scripts/benchmarks/main.py +++ b/scripts/benchmarks/main.py @@ -19,16 +19,17 @@ from output import generate_markdown import argparse import re +import subprocess # Update this if you are changing the layout of the results files -INTERNAL_WORKDIR_VERSION = '1.6' +INTERNAL_WORKDIR_VERSION = '1.7' def main(directory, additional_env_vars, save_name, compare_names, filter): prepare_workdir(directory, INTERNAL_WORKDIR_VERSION) cb = ComputeBench(directory) - sb = SyclBench(directory) vb = VelocityBench(directory) + sb = SyclBench(directory) benchmarks = [ # *** Compute benchmarks @@ -53,7 +54,7 @@ def main(directory, additional_env_vars, save_name, compare_names, filter): SobelFilter(vb), # *** sycl-bench multi benchmarks - Blocked_transform(sb), + # Blocked_transform(sb), # run time < 1ms DagTaskI(sb), DagTaskS(sb), HostDevBandwidth(sb), @@ -69,12 +70,12 @@ def main(directory, additional_env_vars, save_name, compare_names, 
filter): VecAdd(sb), # *** sycl-bench single benchmarks - TwoDConvolution(sb), + # TwoDConvolution(sb), # run time < 1ms Two_mm(sb), Three_mm(sb), - Arith(sb), + # Arith(sb), # run time < 1ms Atax(sb), - Atomic_reduction(sb), + # Atomic_reduction(sb), # run time < 1ms Bicg(sb), Correlation(sb), Covariance(sb), @@ -83,7 +84,7 @@ def main(directory, additional_env_vars, save_name, compare_names, filter): Gramschmidt(sb), KMeans(sb), LinRegCoeff(sb), - LinRegError(sb), + # LinRegError(sb), # run time < 1ms MatmulChain(sb), MolDyn(sb), Mvt(sb), @@ -117,24 +118,29 @@ def main(directory, additional_env_vars, save_name, compare_names, filter): bench_results = benchmark.run(merged_env_vars) if bench_results is not None: for bench_result in bench_results: - print(f"complete ({bench_result.label}: {bench_result.value} {benchmark.unit()}).") + if bench_result.passed: + print(f"complete ({bench_result.label}: {bench_result.value:.3f} {benchmark.unit()}).") + else: + print(f"complete ({bench_result.label}: verification FAILED)") iteration_results.append(bench_result) else: - print(f"did not finish.") + print(f"did not finish (OK for sycl-bench).") + break; if len(iteration_results) == 0: continue for label in set([result.label for result in iteration_results]): - label_results = [result for result in iteration_results if result.label == label] - label_results.sort(key=lambda res: res.value) - median_index = len(label_results) // 2 - median_result = label_results[median_index] + label_results = [result for result in iteration_results if result.label == label and result.passed == True] + if len(label_results) > 0: + label_results.sort(key=lambda res: res.value) + median_index = len(label_results) // 2 + median_result = label_results[median_index] - median_result.unit = benchmark.unit() - median_result.name = label + median_result.unit = benchmark.unit() + median_result.name = label - results.append(median_result) + results.append(median_result) except Exception as e: if 
options.exit_on_failure: raise e