From 81a68a418c5699ccef7e442af0588501f2f9e560 Mon Sep 17 00:00:00 2001
From: fjwillemsen
Date: Thu, 8 Feb 2024 16:57:19 +0100
Subject: [PATCH 01/26] Added RegisterObserver with common interface among backends

---
 kernel_tuner/backends/cupy.py      |  1 +
 kernel_tuner/backends/nvcuda.py    |  5 +++++
 kernel_tuner/backends/pycuda.py    |  1 +
 kernel_tuner/observers/register.py | 12 ++++++++++++
 4 files changed, 19 insertions(+)
 create mode 100644 kernel_tuner/observers/register.py

diff --git a/kernel_tuner/backends/cupy.py b/kernel_tuner/backends/cupy.py
index a1e13ff03..19cb55a93 100644
--- a/kernel_tuner/backends/cupy.py
+++ b/kernel_tuner/backends/cupy.py
@@ -132,6 +132,7 @@ def compile(self, kernel_instance):
         )

         self.func = self.current_module.get_function(kernel_name)
+        self.num_regs = self.func.num_regs
         return self.func

     def start_event(self):
diff --git a/kernel_tuner/backends/nvcuda.py b/kernel_tuner/backends/nvcuda.py
index c6fb73d5e..70bb637c5 100644
--- a/kernel_tuner/backends/nvcuda.py
+++ b/kernel_tuner/backends/nvcuda.py
@@ -192,6 +192,11 @@ def compile(self, kernel_instance):
             )
             cuda_error_check(err)

+            # get the number of registers per thread used in this kernel
+            num_regs = cuda.cuFuncGetAttribute(cuda.CUfunction_attribute.CU_FUNC_ATTRIBUTE_NUM_REGS, self.func)
+            assert num_regs[0] == 0, f"Retrieving number of registers per thread unsuccessful: code {num_regs[0]}"
+            self.num_regs = num_regs[1]
+
         except RuntimeError as re:
             _, n = nvrtc.nvrtcGetProgramLogSize(program)
             log = b" " * n
diff --git a/kernel_tuner/backends/pycuda.py b/kernel_tuner/backends/pycuda.py
index 3c168f824..3111d1a71 100644
--- a/kernel_tuner/backends/pycuda.py
+++ b/kernel_tuner/backends/pycuda.py
@@ -218,6 +218,7 @@ def compile(self, kernel_instance):
             )

             self.func = self.current_module.get_function(kernel_name)
+            self.num_regs = self.func.num_regs
             return self.func
         except drv.CompileError as e:
             if "uses too much shared data" in e.stderr:
diff --git a/kernel_tuner/observers/register.py b/kernel_tuner/observers/register.py
new file mode 100644
index 000000000..b1310db2f
--- /dev/null
+++ b/kernel_tuner/observers/register.py
@@ -0,0 +1,12 @@
+from kernel_tuner.observers.observer import BenchmarkObserver
+
+class RegisterObserver(BenchmarkObserver):
+    """Observer for counting the number of registers."""
+
+    def __init__(self) -> None:
+        super().__init__()
+
+    def get_results(self):
+        return {
+            "num_regs": self.dev.num_regs
+        }
\ No newline at end of file

From 943b3c470c72bde8141ee40a5c4647aad0050280 Mon Sep 17 00:00:00 2001
From: fjwillemsen
Date: Thu, 8 Feb 2024 17:24:36 +0100
Subject: [PATCH 02/26] Added test for RegisterObserver, added clause in case of mock test

---
 kernel_tuner/backends/pycuda.py |  3 ++-
 test/test_observers.py          | 26 +++++++++++++++++++++++++-
 2 files changed, 27 insertions(+), 2 deletions(-)

diff --git a/kernel_tuner/backends/pycuda.py b/kernel_tuner/backends/pycuda.py
index 3111d1a71..7fddc9393 100644
--- a/kernel_tuner/backends/pycuda.py
+++ b/kernel_tuner/backends/pycuda.py
@@ -218,7 +218,8 @@ def compile(self, kernel_instance):
             )

             self.func = self.current_module.get_function(kernel_name)
-            self.num_regs = self.func.num_regs
+            if not isinstance(self.func, str):
+                self.num_regs = self.func.num_regs
             return self.func
         except drv.CompileError as e:
             if "uses too much shared data" in e.stderr:
diff --git a/test/test_observers.py b/test/test_observers.py
index d881fed74..1765be334 100644
--- a/test/test_observers.py
+++ b/test/test_observers.py
@@ -2,9 +2,10 @@

 import kernel_tuner
 from kernel_tuner.observers.nvml import NVMLObserver
+from kernel_tuner.observers.register import RegisterObserver
 from kernel_tuner.observers.observer import BenchmarkObserver

-from .context import skip_if_no_pycuda, skip_if_no_pynvml
+from .context import skip_if_no_pycuda, skip_if_no_pynvml, skip_if_no_cupy, skip_if_no_cuda
 from .test_runners import env  # noqa: F401


@@ -20,6 +21,29 @@ def test_nvml_observer(env):
     assert "temperature" in result[0]
     assert result[0]["temperature"] > 0

+@skip_if_no_pycuda
+def test_register_observer_pycuda(env):
+    registerobserver = RegisterObserver()
+    env[-1]["block_size_x"] = [128]
+    result, _ = kernel_tuner.tune_kernel(*env, observers=[registerobserver], lang='CUDA')
+    assert "num_regs" in result[0]
+    assert result[0]["num_regs"] > 0
+
+@skip_if_no_cupy
+def test_register_observer_cupy(env):
+    registerobserver = RegisterObserver()
+    env[-1]["block_size_x"] = [128]
+    result, _ = kernel_tuner.tune_kernel(*env, observers=[registerobserver], lang='CuPy')
+    assert "num_regs" in result[0]
+    assert result[0]["num_regs"] > 0
+
+@skip_if_no_cuda
+def test_register_observer_nvcuda(env):
+    registerobserver = RegisterObserver()
+    env[-1]["block_size_x"] = [128]
+    result, _ = kernel_tuner.tune_kernel(*env, observers=[registerobserver], lang='NVCUDA')
+    assert "num_regs" in result[0]
+    assert result[0]["num_regs"] > 0

 @skip_if_no_pycuda
 def test_custom_observer(env):

From 1681730399f91d2143c0f7630b0c8e87a5438b42 Mon Sep 17 00:00:00 2001
From: fjwillemsen
Date: Fri, 9 Feb 2024 11:43:26 +0100
Subject: [PATCH 03/26] Added useful error message in case Register Observer is not supported

---
 kernel_tuner/observers/register.py | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/kernel_tuner/observers/register.py b/kernel_tuner/observers/register.py
index b1310db2f..92f22ffd8 100644
--- a/kernel_tuner/observers/register.py
+++ b/kernel_tuner/observers/register.py
@@ -7,6 +7,10 @@ def __init__(self) -> None:
         super().__init__()

     def get_results(self):
+        try:
+            registers_per_thread = self.dev.num_regs
+        except AttributeError:
+            raise NotImplementedError(f"Backend '{type(self.dev).__name__}' does not support counting of registers per thread")
         return {
-            "num_regs": self.dev.num_regs
+            "num_regs": registers_per_thread
         }
\ No newline at end of file
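The three patches above make register usage observable from an ordinary tuning run. A minimal usage sketch, not part of the patch set itself (the vector_add kernel and tuning setup are illustrative; on backends without register reporting the observer raises the NotImplementedError introduced in PATCH 03):

    import numpy as np
    import kernel_tuner
    from kernel_tuner.observers.register import RegisterObserver

    kernel_string = """
    __global__ void vector_add(float *c, float *a, float *b, int n) {
        int i = blockIdx.x * blockDim.x + threadIdx.x;
        if (i < n) {
            c[i] = a[i] + b[i];
        }
    }
    """

    size = 1000000
    a = np.random.randn(size).astype(np.float32)
    b = np.random.randn(size).astype(np.float32)
    c = np.zeros_like(a)
    args = [c, a, b, np.int32(size)]
    tune_params = {"block_size_x": [64, 128, 256]}

    # every benchmarked configuration now also carries a "num_regs" entry
    results, env = kernel_tuner.tune_kernel("vector_add", kernel_string, size, args,
                                            tune_params, observers=[RegisterObserver()], lang="CUDA")
    print(results[0]["num_regs"])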
From f153945b0f2c6a7a55d2dde4fa3f6ad1a8ce310e Mon Sep 17 00:00:00 2001
From: fjwillemsen
Date: Fri, 9 Feb 2024 11:44:11 +0100
Subject: [PATCH 04/26] Added tests for Register Observer for OpenCL and HIP backends

---
 test/test_observers.py | 57 ++++++++++++++++++++++++------------------
 1 file changed, 33 insertions(+), 24 deletions(-)

diff --git a/test/test_observers.py b/test/test_observers.py
index 1765be334..c1cc460a9 100644
--- a/test/test_observers.py
+++ b/test/test_observers.py
@@ -1,12 +1,14 @@
-
-
 import kernel_tuner
 from kernel_tuner.observers.nvml import NVMLObserver
 from kernel_tuner.observers.register import RegisterObserver
 from kernel_tuner.observers.observer import BenchmarkObserver

-from .context import skip_if_no_pycuda, skip_if_no_pynvml, skip_if_no_cupy, skip_if_no_cuda
+from .context import skip_if_no_pycuda, skip_if_no_pynvml, skip_if_no_cupy, skip_if_no_cuda, skip_if_no_opencl, skip_if_no_pyhip
 from .test_runners import env  # noqa: F401
+from .test_opencl_functions import env as env_opencl  # noqa: F401
+from .test_hip_functions import env as env_hip  # noqa: F401
+
+from pytest import raises


 @skip_if_no_pycuda
@@ -20,39 +24,46 @@ def test_nvml_observer(env):
     assert result[0]["temperature"] > 0

 @skip_if_no_pycuda
-def test_register_observer_pycuda(env):
-    registerobserver = RegisterObserver()
+def test_custom_observer(env):
     env[-1]["block_size_x"] = [128]
-    result, _ = kernel_tuner.tune_kernel(*env, observers=[registerobserver], lang='CUDA')
+
+    class MyObserver(BenchmarkObserver):
+        def get_results(self):
+            return {"name": self.dev.name}
+
+    result, _ = kernel_tuner.tune_kernel(*env, observers=[MyObserver()])
+
+    assert "name" in result[0]
+    assert len(result[0]["name"]) > 0
+
+@skip_if_no_pycuda
+def test_register_observer_pycuda(env):
+    result, _ = kernel_tuner.tune_kernel(*env, observers=[RegisterObserver()], lang='CUDA')
     assert "num_regs" in result[0]
     assert result[0]["num_regs"] > 0

 @skip_if_no_cupy
 def test_register_observer_cupy(env):
-    registerobserver = RegisterObserver()
-    env[-1]["block_size_x"] = [128]
-    result, _ = kernel_tuner.tune_kernel(*env, observers=[registerobserver], lang='CuPy')
+    result, _ = kernel_tuner.tune_kernel(*env, observers=[RegisterObserver()], lang='CuPy')
     assert "num_regs" in result[0]
     assert result[0]["num_regs"] > 0

 @skip_if_no_cuda
 def test_register_observer_nvcuda(env):
-    registerobserver = RegisterObserver()
-    env[-1]["block_size_x"] = [128]
-    result, _ = kernel_tuner.tune_kernel(*env, observers=[registerobserver], lang='NVCUDA')
+    result, _ = kernel_tuner.tune_kernel(*env, observers=[RegisterObserver()], lang='NVCUDA')
     assert "num_regs" in result[0]
     assert result[0]["num_regs"] > 0

-@skip_if_no_pycuda
-def test_custom_observer(env):
-    env[-1]["block_size_x"] = [128]
-
-    class MyObserver(BenchmarkObserver):
-        def get_results(self):
-            return {"name": self.dev.name}
-
-    result, _ = kernel_tuner.tune_kernel(*env, observers=[MyObserver()])
-
-    assert "name" in result[0]
-    assert len(result[0]["name"]) > 0
+@skip_if_no_opencl
+def test_register_observer_opencl(env_opencl):
+    with raises(NotImplementedError) as err:
+        kernel_tuner.tune_kernel(*env_opencl, observers=[RegisterObserver()], lang='OpenCL')
+    assert err.errisinstance(NotImplementedError)
+    assert "OpenCL" in str(err.value)
+
+@skip_if_no_pyhip
+def test_register_observer_hip(env_hip):
+    with raises(NotImplementedError) as err:
+        kernel_tuner.tune_kernel(*env_hip, observers=[RegisterObserver()], lang='HIP')
+    assert err.errisinstance(NotImplementedError)
+    assert "Hip" in str(err.value)

From 7bd7c2b7ac67b35add3060a42e0a069980f2f871 Mon Sep 17 00:00:00 2001
From: fjwillemsen
Date: Mon, 12 Feb 2024 12:07:34 +0100
Subject: [PATCH 05/26] Added instruction for pointing cache directory elsewhere

---
 CONTRIBUTING.rst | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/CONTRIBUTING.rst b/CONTRIBUTING.rst
index 65f7140ce..793a066cb 100644
--- a/CONTRIBUTING.rst
+++ b/CONTRIBUTING.rst
@@ -85,7 +85,7 @@ Steps without :bash:`sudo` access (e.g. on a cluster):
           - /path/to/directory
     * [Optional] both Mamba and Miniconda can be automatically activated via :bash:`~/.bashrc`. Do not forget to add these (usually provided at the end of the installation).
     * Exit the shell and re-enter to make sure Conda is available. :bash:`cd` to the kernel tuner directory.
-    * [Optional] if you have limited user folder space, the Pip cache can be pointed elsewhere with the environment variable :bash:`PIP_CACHE_DIR`. The cache location can be checked with :bash:`pip cache dir`.
+    * [Optional] if you have limited user folder space, the Pip cache can be pointed elsewhere with the environment variable :bash:`PIP_CACHE_DIR`. The cache location can be checked with :bash:`pip cache dir`. On Linu, to point the entire :bash:`~/.cache` default elsewhere, use the :bash:`XDG_CACHE_HOME` environment variable.
     * [Optional] update Conda if available before continuing: :bash:`conda update -n base -c conda-forge conda`.
 #. Setup a virtual environment: :bash:`conda create --name kerneltuner python=3.11` (or whatever Python version and environment name you prefer).
 #. Activate the virtual environment: :bash:`conda activate kerneltuner`.

From 9dea13795e1ccdc0d150ba75bb11c4966057df80 Mon Sep 17 00:00:00 2001
From: fjwillemsen
Date: Mon, 12 Feb 2024 14:50:30 +0100
Subject: [PATCH 06/26] Non-argument streams are now correctly passed in the CuPy and NVCUDA backends

---
 kernel_tuner/backends/cupy.py   | 2 ++
 kernel_tuner/backends/nvcuda.py | 2 ++
 2 files changed, 4 insertions(+)

diff --git a/kernel_tuner/backends/cupy.py b/kernel_tuner/backends/cupy.py
index 19cb55a93..2e494a14f 100644
--- a/kernel_tuner/backends/cupy.py
+++ b/kernel_tuner/backends/cupy.py
@@ -198,6 +198,8 @@ def run_kernel(self, func, gpu_args, threads, grid, stream=None):
             of the grid
         :type grid: tuple(int, int)
         """
+        if stream is None:
+            stream = self.stream
         func(grid, threads, gpu_args, stream=stream, shared_mem=self.smem_size)

     def memset(self, allocation, value, size):
diff --git a/kernel_tuner/backends/nvcuda.py b/kernel_tuner/backends/nvcuda.py
index 70bb637c5..4884a3a6d 100644
--- a/kernel_tuner/backends/nvcuda.py
+++ b/kernel_tuner/backends/nvcuda.py
@@ -278,6 +278,8 @@ def run_kernel(self, func, gpu_args, threads, grid, stream=None):
             of the grid
         :type grid: tuple(int, int)
         """
+        if stream is None:
+            stream = self.stream
         arg_types = list()
         for arg in gpu_args:
             if isinstance(arg, cuda.CUdeviceptr):

From df5414541d970472852ebe9cfb6c117e1ef27f94 Mon Sep 17 00:00:00 2001
From: fjwillemsen
Date: Thu, 15 Feb 2024 16:18:09 +0100
Subject: [PATCH 07/26] Fixed several issues pertaining to the setting of clocks, in particular one where the memory clock would always be seen as not-equal due to a rounding error

---
 kernel_tuner/observers/nvml.py | 44 +++++++++++++---------------------
 1 file changed, 16 insertions(+), 28 deletions(-)

diff --git a/kernel_tuner/observers/nvml.py b/kernel_tuner/observers/nvml.py
index d33327a3c..200d98992 100644
--- a/kernel_tuner/observers/nvml.py
+++ b/kernel_tuner/observers/nvml.py
@@ -135,6 +135,8 @@ def persistence_mode(self, new_mode):
             raise ValueError(
                 "Illegal value for persistence mode, should be either 0 or 1"
             )
+        if self.persistence_mode == new_mode:
+            return
         try:
             pynvml.nvmlDeviceSetPersistenceMode(self.dev, new_mode)
             self._persistence_mode = pynvml.nvmlDeviceGetPersistenceMode(self.dev)
@@ -168,21 +170,15 @@ def set_clocks(self, mem_clock, gr_clock):
                     self.nvidia_smi,
                     "-i",
                     str(self.id),
-                    "--lock-gpu-clocks=" + str(gr_clock) + "," + str(gr_clock),
                 ]
-                subprocess.run(args, check=True)
-                args = [
-                    "sudo",
-                    self.nvidia_smi,
-                    "-i",
-                    str(self.id),
-                    "--lock-memory-clocks=" + str(mem_clock) + "," + str(mem_clock),
-                ]
-                subprocess.run(args, check=True)
+                command_set_mem_clocks = f"--lock-memory-clocks={str(mem_clock)},{str(mem_clock)}"
+                command_set_gpu_clocks = f"--lock-gpu-clocks={str(gr_clock)},{str(gr_clock)}"
+                subprocess.run(args + [command_set_gpu_clocks], check=True)
+                subprocess.run(args + [command_set_mem_clocks], check=True)
         else:
             try:
-                if self.persistence_mode != 0:
-                    self.persistence_mode = 0
+                if self.persistence_mode != 1:
+                    self.persistence_mode = 1
             except Exception:
                 pass
             try:
@@ -233,24 +229,20 @@ def reset_clocks(self):
         if (
             gr_app_clock != self.gr_clock_default
             or mem_app_clock != self.mem_clock_default
-        ):
+        ):
             self.set_clocks(self.mem_clock_default, self.gr_clock_default)

     @property
     def gr_clock(self):
         """Control the graphics clock (may require permission), only values compatible with the memory clock can be set directly."""
-        return pynvml.nvmlDeviceGetClockInfo(self.dev, pynvml.NVML_CLOCK_GRAPHICS)
+        if self.use_locked_clocks:
+            return pynvml.nvmlDeviceGetClockInfo(self.dev, pynvml.NVML_CLOCK_GRAPHICS)
+        else:
+            return pynvml.nvmlDeviceGetApplicationsClock(self.dev, pynvml.NVML_CLOCK_GRAPHICS)

     @gr_clock.setter
     def gr_clock(self, new_clock):
-        cur_clock = (
-            pynvml.nvmlDeviceGetClockInfo(self.dev, pynvml.NVML_CLOCK_GRAPHICS)
-            if self.use_locked_clocks
-            else pynvml.nvmlDeviceGetApplicationsClock(
-                self.dev, pynvml.NVML_CLOCK_GRAPHICS
-            )
-        )
-        if new_clock != cur_clock:
+        if new_clock != self.gr_clock:
             self.set_clocks(self.mem_clock, new_clock)

     @property
@@ -268,12 +260,8 @@ def mem_clock(self):

     @mem_clock.setter
     def mem_clock(self, new_clock):
-        cur_clock = (
-            pynvml.nvmlDeviceGetClockInfo(self.dev, pynvml.NVML_CLOCK_MEM)
-            if self.use_locked_clocks
-            else pynvml.nvmlDeviceGetApplicationsClock(self.dev, pynvml.NVML_CLOCK_MEM)
-        )
-        if new_clock != cur_clock:
+        if new_clock != self.mem_clock:
+            print(f"mem_clock setter calls set_clocks because {new_clock=} != {cur_clock=}")
             self.set_clocks(new_clock, self.gr_clock)

     @property
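The clock-setting behaviour above is easiest to see in isolation. A hedged sketch of driving the nvml helper class directly (assumes pynvml is installed and the device grants permission, or that the nvidia-smi fallback is usable with sudo; the exact clock values are device-specific):

    from kernel_tuner.observers.nvml import nvml

    gpu = nvml(device_id=0, nvidia_smi_fallback="nvidia-smi")

    # pick a supported memory clock and a graphics clock that is compatible with it
    mem_clock = max(gpu.supported_mem_clocks)
    gr_clock = max(gpu.supported_gr_clocks[mem_clock])

    gpu.set_clocks(mem_clock, gr_clock)
    # ... run benchmarks at fixed clocks ...
    gpu.reset_clocks()  # restore the default clocks afterwards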
From 4cc4a1399b2f48fc6cdbc65e9ede021365f91aec Mon Sep 17 00:00:00 2001
From: fjwillemsen
Date: Thu, 15 Feb 2024 16:51:51 +0100
Subject: [PATCH 08/26] Time spent setting NVML parameters (clock & memory frequency, power) goes to framework time instead of benchmark time

---
 kernel_tuner/core.py               | 23 ++++++++++++++++-------
 kernel_tuner/runners/sequential.py |  2 +-
 2 files changed, 17 insertions(+), 8 deletions(-)

diff --git a/kernel_tuner/core.py b/kernel_tuner/core.py
index 174cd3af5..1cd39b6a3 100644
--- a/kernel_tuner/core.py
+++ b/kernel_tuner/core.py
@@ -391,12 +391,8 @@ def benchmark_continuous(self, func, gpu_args, threads, grid, result, duration):
             for obs in self.continuous_observers:
                 result.update(obs.get_results())

-    def benchmark(self, func, gpu_args, instance, verbose, objective):
-        """benchmark the kernel instance"""
-        logging.debug("benchmark " + instance.name)
-        logging.debug("thread block dimensions x,y,z=%d,%d,%d", *instance.threads)
-        logging.debug("grid dimensions x,y,z=%d,%d,%d", *instance.grid)
-
+    def set_nvml_parameters(self, instance):
+        """Set the NVML parameters. Avoids time spent setting parameters leaking into benchmark time."""
         if self.use_nvml:
             if "nvml_pwr_limit" in instance.params:
                 new_limit = int(
@@ -409,6 +405,15 @@ def benchmark(self, func, gpu_args, instance, verbose, objective):
             if "nvml_mem_clock" in instance.params:
                 self.nvml.mem_clock = instance.params["nvml_mem_clock"]

+    def benchmark(self, func, gpu_args, instance, verbose, objective, skip_nvml_setting=False):
+        """Benchmark the kernel instance."""
+        logging.debug("benchmark " + instance.name)
+        logging.debug("thread block dimensions x,y,z=%d,%d,%d", *instance.threads)
+        logging.debug("grid dimensions x,y,z=%d,%d,%d", *instance.grid)
+
+        if self.use_nvml and not skip_nvml_setting:
+            self.set_nvml_parameters(instance)
+
         # Call the observers to register the configuration to be benchmarked
         for obs in self.dev.observers:
             obs.register_configuration(instance.params)
@@ -577,11 +582,15 @@ def compile_and_benchmark(self, kernel_source, gpu_args, params, kernel_options, to):

             # benchmark
             if func:
+                # setting the NVML parameters here prevents this time from leaking into the benchmark time; it ends up in framework time instead
+                if self.use_nvml:
+                    self.set_nvml_parameters(instance)
                 start_benchmark = time.perf_counter()
                 result.update(
-                    self.benchmark(func, gpu_args, instance, verbose, to.objective)
+                    self.benchmark(func, gpu_args, instance, verbose, to.objective, skip_nvml_setting=True)
                 )
                 last_benchmark_time = 1000 * (time.perf_counter() - start_benchmark)
+                print(f"Benchmark time: {last_benchmark_time}")

         except Exception as e:
             # dump kernel sources to temp file
diff --git a/kernel_tuner/runners/sequential.py b/kernel_tuner/runners/sequential.py
index c493a0089..23d9dc2ba 100644
--- a/kernel_tuner/runners/sequential.py
+++ b/kernel_tuner/runners/sequential.py
@@ -100,7 +100,7 @@ def run(self, parameter_space, tuning_options):
             params = process_metrics(params, tuning_options.metrics)

             # get the framework time by estimating based on other times
-            total_time = 1000 * (perf_counter() - self.start_time) - warmup_time
+            total_time = 1000 * ((perf_counter() - self.start_time) - warmup_time)  # TODO is it valid that we deduct the warmup time here?
             params['strategy_time'] = self.last_strategy_time
             params['framework_time'] = max(total_time - (params['compile_time'] + params['verification_time'] + params['benchmark_time'] + params['strategy_time']), 0)
             params['timestamp'] = str(datetime.now(timezone.utc))

From e309bc1de7bcbf0291b099a3472de1c62099973d Mon Sep 17 00:00:00 2001
From: fjwillemsen
Date: Thu, 15 Feb 2024 16:59:53 +0100
Subject: [PATCH 09/26] Time spent setting NVML parameters (clock & memory frequency, power) goes to framework time instead of benchmark time

---
 kernel_tuner/core.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/kernel_tuner/core.py b/kernel_tuner/core.py
index 1cd39b6a3..18d3245d6 100644
--- a/kernel_tuner/core.py
+++ b/kernel_tuner/core.py
@@ -590,7 +590,6 @@ def compile_and_benchmark(self, kernel_source, gpu_args, params, kernel_options, to):
                     self.benchmark(func, gpu_args, instance, verbose, to.objective, skip_nvml_setting=True)
                 )
                 last_benchmark_time = 1000 * (time.perf_counter() - start_benchmark)
-                print(f"Benchmark time: {last_benchmark_time}")

             except Exception as e:
                 # dump kernel sources to temp file

From d6aac8b443d64fe6d4071a8ee19ca471c8a57308 Mon Sep 17 00:00:00 2001
From: Floris-Jan Willemsen
Date: Thu, 15 Feb 2024 17:08:51 +0100
Subject: [PATCH 10/26] Removed redundant print statement

---
 kernel_tuner/observers/nvml.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/kernel_tuner/observers/nvml.py b/kernel_tuner/observers/nvml.py
index 200d98992..0bd9adc84 100644
--- a/kernel_tuner/observers/nvml.py
+++ b/kernel_tuner/observers/nvml.py
@@ -261,7 +261,6 @@ def mem_clock(self):
     @mem_clock.setter
     def mem_clock(self, new_clock):
         if new_clock != self.mem_clock:
-            print(f"mem_clock setter calls set_clocks because {new_clock=} != {cur_clock=}")
             self.set_clocks(new_clock, self.gr_clock)

     @property

From a020791a555912ce98a136c652d6f343fc95ee1e Mon Sep 17 00:00:00 2001
From: fjwillemsen
Date: Wed, 28 Feb 2024 10:13:25 +0100
Subject: [PATCH 11/26] Added L2 cache size property to CUDA backends

---
 kernel_tuner/backends/cupy.py   | 1 +
 kernel_tuner/backends/nvcuda.py | 4 ++++
 kernel_tuner/backends/pycuda.py | 1 +
 3 files changed, 6 insertions(+)

diff --git a/kernel_tuner/backends/cupy.py b/kernel_tuner/backends/cupy.py
index 2e494a14f..e6d43cebf 100644
--- a/kernel_tuner/backends/cupy.py
+++ b/kernel_tuner/backends/cupy.py
@@ -46,6 +46,7 @@ def __init__(self, device=0, iterations=7, compiler_options=None, observers=None
         self.devprops = dev.attributes
         self.cc = dev.compute_capability
         self.max_threads = self.devprops["MaxThreadsPerBlock"]
+        self.cache_size_L2 = self.devprops["L2CacheSize"]
         self.iterations = iterations
         self.current_module = None
diff --git a/kernel_tuner/backends/nvcuda.py b/kernel_tuner/backends/nvcuda.py
index 4884a3a6d..068cf453d 100644
--- a/kernel_tuner/backends/nvcuda.py
+++ b/kernel_tuner/backends/nvcuda.py
@@ -66,6 +66,10 @@ def __init__(self, device=0, iterations=7, compiler_options=None, observers=None
             cudart.cudaDeviceAttr.cudaDevAttrMaxThreadsPerBlock, device
         )
         cuda_error_check(err)
+        err, self.cache_size_L2 = cudart.cudaDeviceGetAttribute(
+            cudart.cudaDeviceAttr.cudaDevAttrL2CacheSize, device
+        )
+        cuda_error_check(err)
         self.cc = f"{major}{minor}"
         self.iterations = iterations
         self.current_module = None
diff --git a/kernel_tuner/backends/pycuda.py b/kernel_tuner/backends/pycuda.py
index 7fddc9393..659f51594 100644
--- a/kernel_tuner/backends/pycuda.py
+++ b/kernel_tuner/backends/pycuda.py
@@ -101,6 +101,7 @@ def _finish_up():
             str(k): v for (k, v) in self.context.get_device().get_attributes().items()
         }
         self.max_threads = devprops["MAX_THREADS_PER_BLOCK"]
+        self.cache_size_L2 = devprops["L2_CACHE_SIZE"]
         cc = str(devprops.get("COMPUTE_CAPABILITY_MAJOR", "0")) + str(
             devprops.get("COMPUTE_CAPABILITY_MINOR", "0")
         )
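The new cache_size_L2 property mirrors what the underlying runtime reports for the device. A quick way to inspect the same value directly through CuPy, outside of the patched backends (assumes cupy and a CUDA device are available):

    import cupy as cp

    # the CuPy backend reads this same device attribute in its __init__
    l2_bytes = cp.cuda.Device(0).attributes["L2CacheSize"]
    print(f"L2 cache size: {l2_bytes / (1024 * 1024):.1f} MiB")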
From 6e6e5fb9bfeac34a87af19a1ea7c8f7e0e21d11a Mon Sep 17 00:00:00 2001
From: fjwillemsen
Date: Wed, 28 Feb 2024 10:18:45 +0100
Subject: [PATCH 12/26] Added specification to CUPY compiler options

---
 kernel_tuner/backends/cupy.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/kernel_tuner/backends/cupy.py b/kernel_tuner/backends/cupy.py
index e6d43cebf..9a817a1c6 100644
--- a/kernel_tuner/backends/cupy.py
+++ b/kernel_tuner/backends/cupy.py
@@ -125,6 +125,10 @@ def compile(self, kernel_instance):
         compiler_options = self.compiler_options
         if not any(["-std=" in opt for opt in self.compiler_options]):
             compiler_options = ["--std=c++11"] + self.compiler_options
+        if not any([b"--gpu-architecture=" in opt for opt in compiler_options]):
+            compiler_options.append(
+                f"--gpu-architecture=compute_{self.cc}".encode("UTF-8")
+            )

         options = tuple(compiler_options)

From f15338faf2c771078c01393254740c24dd99b5a9 Mon Sep 17 00:00:00 2001
From: fjwillemsen
Date: Wed, 28 Feb 2024 11:50:04 +0100
Subject: [PATCH 13/26] Added L2 cache size property to OpenCL, HIP and mocked PyCUDA backends

---
 kernel_tuner/backends/hip.py    | 1 +
 kernel_tuner/backends/opencl.py | 4 ++++
 test/test_pycuda_mocked.py      | 3 ++-
 3 files changed, 7 insertions(+), 1 deletion(-)

diff --git a/kernel_tuner/backends/hip.py b/kernel_tuner/backends/hip.py
index 1db4cb302..dc0ffb1cb 100644
--- a/kernel_tuner/backends/hip.py
+++ b/kernel_tuner/backends/hip.py
@@ -59,6 +59,7 @@ def __init__(self, device=0, iterations=7, compiler_options=None, observers=None
         self.name = self.hipProps._name.decode('utf-8')
         self.max_threads = self.hipProps.maxThreadsPerBlock
+        self.cache_size_L2 = self.hipProps.l2CacheSize
         self.device = device
         self.compiler_options = compiler_options or []
         self.iterations = iterations
diff --git a/kernel_tuner/backends/opencl.py b/kernel_tuner/backends/opencl.py
index af3be1c00..4946d804f 100644
--- a/kernel_tuner/backends/opencl.py
+++ b/kernel_tuner/backends/opencl.py
@@ -45,6 +45,10 @@ def __init__(
         self.max_threads = self.ctx.devices[0].get_info(
             cl.device_info.MAX_WORK_GROUP_SIZE
         )
+        # TODO the L2 cache size request fails
+        # self.cache_size_L2 = self.ctx.devices[0].get_info(
+        #     cl.device_affinity_domain.L2_CACHE
+        # )
         self.compiler_options = compiler_options or []

         # observer stuff
diff --git a/test/test_pycuda_mocked.py b/test/test_pycuda_mocked.py
index 21f352a3f..e47fc8e8e 100644
--- a/test/test_pycuda_mocked.py
+++ b/test/test_pycuda_mocked.py
@@ -13,7 +13,8 @@ def setup_mock(drv):
     context = Mock()
     devprops = {'MAX_THREADS_PER_BLOCK': 1024,
                 'COMPUTE_CAPABILITY_MAJOR': 5,
-                'COMPUTE_CAPABILITY_MINOR': 5}
+                'COMPUTE_CAPABILITY_MINOR': 5,
+                'L2_CACHE_SIZE': 4096}
     context.return_value.get_device.return_value.get_attributes.return_value = devprops
     context.return_value.get_device.return_value.compute_capability.return_value = "55"
     drv.Device.return_value.retain_primary_context.return_value = context()

From 00ac419643f237fd39a21601e45289332b70132b Mon Sep 17 00:00:00 2001
From: fjwillemsen
Date: Wed, 28 Feb 2024 12:54:04 +0100
Subject: [PATCH 14/26] Added function to check for compute capability validity, improved check on gpu-architecture compiler option, added gpu-architecture auto-adding to CuPy

---
 kernel_tuner/backends/cupy.py   | 11 +++++++----
 kernel_tuner/backends/nvcuda.py | 19 ++++++++++++-------
 kernel_tuner/util.py            |  6 ++++++
 3 files changed, 25 insertions(+), 11 deletions(-)

diff --git a/kernel_tuner/backends/cupy.py b/kernel_tuner/backends/cupy.py
index 9a817a1c6..9b06ddb20 100644
--- a/kernel_tuner/backends/cupy.py
+++ b/kernel_tuner/backends/cupy.py
@@ -1,10 +1,12 @@
 """This module contains all Cupy specific kernel_tuner functions."""
 from __future__ import print_function
+from warnings import warn

 import numpy as np

 from kernel_tuner.backends.backend import GPUBackend
 from kernel_tuner.observers.cupy import CupyRuntimeObserver
+from kernel_tuner.util import is_valid_nvrtc_gpu_arch_cc

 # embedded in try block to be able to generate documentation
 # and run tests without cupy installed
@@ -125,10 +127,11 @@ def compile(self, kernel_instance):
         compiler_options = self.compiler_options
         if not any(["-std=" in opt for opt in self.compiler_options]):
             compiler_options = ["--std=c++11"] + self.compiler_options
-        if not any([b"--gpu-architecture=" in opt for opt in compiler_options]):
-            compiler_options.append(
-                f"--gpu-architecture=compute_{self.cc}".encode("UTF-8")
-            )
+        if is_valid_nvrtc_gpu_arch_cc(self.cc):
+            if not any(["--gpu-architecture=" in opt or "-arch" in opt for opt in compiler_options]):
+                compiler_options.append(f"--gpu-architecture=compute_{self.cc}")
+        else:
+            warn(f"Could not add compiler option '--gpu-architecture=compute_{self.cc}' as {self.cc} is an invalid target")

         options = tuple(compiler_options)
diff --git a/kernel_tuner/backends/nvcuda.py b/kernel_tuner/backends/nvcuda.py
index 068cf453d..9c3c37097 100644
--- a/kernel_tuner/backends/nvcuda.py
+++ b/kernel_tuner/backends/nvcuda.py
@@ -1,9 +1,11 @@
 """This module contains all NVIDIA cuda-python specific kernel_tuner functions."""
+from warnings import warn
+
 import numpy as np

 from kernel_tuner.backends.backend import GPUBackend
 from kernel_tuner.observers.nvcuda import CudaRuntimeObserver
-from kernel_tuner.util import SkippableFailure, cuda_error_check
+from kernel_tuner.util import SkippableFailure, cuda_error_check, is_valid_nvrtc_gpu_arch_cc

 # embedded in try block to be able to generate documentation
 # and run tests without cuda-python installed
@@ -165,12 +167,15 @@ def compile(self, kernel_instance):
             compiler_options.append(b"--std=c++11")
         if not any(["--std=" in opt for opt in self.compiler_options]):
             self.compiler_options.append("--std=c++11")
-        if not any([b"--gpu-architecture=" in opt for opt in compiler_options]):
-            compiler_options.append(
-                f"--gpu-architecture=compute_{self.cc}".encode("UTF-8")
-            )
-        if not any(["--gpu-architecture=" in opt for opt in self.compiler_options]):
-            self.compiler_options.append(f"--gpu-architecture=compute_{self.cc}")
+        if is_valid_nvrtc_gpu_arch_cc(self.cc):
+            if not any([b"--gpu-architecture=" in opt or b"-arch" in opt for opt in compiler_options]):
+                compiler_options.append(
+                    f"--gpu-architecture=compute_{self.cc}".encode("UTF-8")
+                )
+            if not any(["--gpu-architecture=" in opt or "-arch" in opt for opt in self.compiler_options]):
+                self.compiler_options.append(f"--gpu-architecture=compute_{self.cc}")
+        else:
+            warn(f"Could not add compiler option '--gpu-architecture=compute_{self.cc}' as {self.cc} is an invalid target")

         err, program = nvrtc.nvrtcCreateProgram(
             str.encode(kernel_string), b"CUDAProgram", 0, [], []
diff --git a/kernel_tuner/util.py b/kernel_tuner/util.py
index 6e9cdf5b0..d0adeb047 100644
--- a/kernel_tuner/util.py
+++ b/kernel_tuner/util.py
@@ -570,6 +570,12 @@ def get_total_timings(results, env, overhead_time):
     return env


+def is_valid_nvrtc_gpu_arch_cc(compute_capability: str) -> bool:
+    """Returns whether the Compute Capability is a valid argument for NVRTC `--gpu-architecture=`, as per https://docs.nvidia.com/cuda/nvrtc/index.html#group__options."""
+    valid_cc = ['50', '52', '53', '60', '61', '62', '70', '72', '75', '80', '87', '89', '90', '90a']
+    return str(compute_capability) in valid_cc
+
+
 def print_config(config, tuning_options, runner):
     """Print the configuration string with tunable parameters and benchmark results."""
     print_config_output(tuning_options.tune_params, config, runner.quiet, tuning_options.metrics, runner.units)

From 55ab07473d4cc12501bce74eae305e5f2573c707 Mon Sep 17 00:00:00 2001
From: fjwillemsen
Date: Wed, 28 Feb 2024 12:55:17 +0100
Subject: [PATCH 15/26] Added a flush kernel to clear the L2 cache between runs

---
 kernel_tuner/core.py | 53 ++++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 51 insertions(+), 2 deletions(-)

diff --git a/kernel_tuner/core.py b/kernel_tuner/core.py
index 18d3245d6..ee831c836 100644
--- a/kernel_tuner/core.py
+++ b/kernel_tuner/core.py
@@ -340,14 +340,62 @@ def __init__(
         if not quiet:
             print("Using: " + self.dev.name)

-    def benchmark_default(self, func, gpu_args, threads, grid, result):
-        """Benchmark one kernel execution at a time"""
+        if lang.upper() not in ['OPENCL', 'C', 'FORTRAN']:
+            # flush the L2 cache, inspired by https://github.com/pytorch/FBGEMM/blob/eb3c304e6c213b81f2b2077813d3c6d16597aa97/fbgemm_gpu/bench/verify_fp16_stochastic_benchmark.cu#L130
+            flush_gpu_string = """
+                __global__ void flush_gpu(char* d_flush, char* d_flush2, bool do_write) {
+                    const int idx = blockIdx.x * blockDim.x + threadIdx.x;
+                    const char val = d_flush[idx];
+                    if (do_write * val) {
+                        d_flush2[idx] = val;
+                    }
+                }
+            """
+            cache_size = self.dev.cache_size_L2
+            d_flush = np.ones((cache_size), order='F').astype(np.float32)
+            d_flush2 = np.ones((cache_size), order='F').astype(np.float32)
+            self.flush_kernel_gpu_args = [d_flush, d_flush2, np.int32(True)]
+
+            from kernel_tuner.interface import Options
+            options = {
+                'kernel_name': 'flush_gpu',
+                'lang': 'CUDA',
+                'arguments': self.flush_kernel_gpu_args,
+                'problem_size': cache_size,
+                'grid_div_x': None,
+                'grid_div_y': None,
+                'grid_div_z': None,
+                'block_size_names': None,
+            }
+            options = Options(options)
+            flush_kernel_lang = lang.upper() if lang.upper() in ['CUDA', 'CUPY', 'NVCUDA'] else 'CUPY'
+            flush_kernel_source = KernelSource('flush_gpu', flush_gpu_string, flush_kernel_lang)
+            self.flush_kernel_instance = self.create_kernel_instance(flush_kernel_source, kernel_options=options, params=dict(), verbose=not quiet)
+            self.flush_kernel = self.compile_kernel(self.flush_kernel_instance, verbose=not quiet)
+            self.flush_kernel_gpu_args = self.ready_argument_list(self.flush_kernel_gpu_args)
+
+            # from kernel_tuner.kernelbuilder import PythonKernel
+            # self.flush_kernel = PythonKernel('flush_gpu', flush_gpu_string, cache_size, self.flush_kernel_gpu_args)
+
+    def flush_cache(self):
+        """This special function can be called to flush the L2 cache."""
+        if not hasattr(self, 'flush_kernel'):
+            return
+        self.dev.synchronize()
+        assert self.run_kernel(self.flush_kernel, self.flush_kernel_gpu_args, self.flush_kernel_instance)
+        # self.flush_kernel.run_kernel(self.flush_kernel.gpu_args)
+        self.dev.synchronize()
+
+    def benchmark_default(self, func, gpu_args, threads, grid, result, flush_cache=True):
+        """Benchmark one kernel execution at a time. Run with `flush_cache=True` to avoid caching effects between iterations."""
         observers = [
             obs for obs in self.dev.observers if not isinstance(obs, ContinuousObserver)
         ]

         self.dev.synchronize()
         for _ in range(self.iterations):
+            if flush_cache:
+                self.flush_cache()
             for obs in observers:
                 obs.before_start()
             self.dev.synchronize()
@@ -1008,3 +1056,4 @@ def wrap_templated_kernel(kernel_string, kernel_name):
     new_kernel_string += wrapper_function

     return new_kernel_string, name + "_wrapper"
+
\ No newline at end of file

From e106baeacbb4b12c0556e0cd90b6246d8f2f0704 Mon Sep 17 00:00:00 2001
From: fjwillemsen
Date: Wed, 28 Feb 2024 12:55:54 +0100
Subject: [PATCH 16/26] Added a flush kernel to clear the L2 cache between runs

---
 kernel_tuner/core.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/kernel_tuner/core.py b/kernel_tuner/core.py
index ee831c836..cf775d308 100644
--- a/kernel_tuner/core.py
+++ b/kernel_tuner/core.py
@@ -1056,4 +1056,3 @@ def wrap_templated_kernel(kernel_string, kernel_name):
     new_kernel_string += wrapper_function

     return new_kernel_string, name + "_wrapper"
-
\ No newline at end of file

From 0cb5e3a4a9acd4d3aecb19c1ad953d1b899c7d56 Mon Sep 17 00:00:00 2001
From: fjwillemsen
Date: Thu, 29 Feb 2024 21:46:27 +0100
Subject: [PATCH 17/26] Made function for scaling the compute capability to a valid one, added tests for this function, removed setting --gpu-architecture for CuPy as it is already set internally

---
 kernel_tuner/backends/cupy.py   |  7 +------
 kernel_tuner/backends/nvcuda.py | 17 +++++++----------
 kernel_tuner/util.py            | 22 +++++++++++++++++-----
 test/test_util_functions.py     | 13 +++++++++++++
 4 files changed, 38 insertions(+), 21 deletions(-)

diff --git a/kernel_tuner/backends/cupy.py b/kernel_tuner/backends/cupy.py
index 9b06ddb20..f53663daa 100644
--- a/kernel_tuner/backends/cupy.py
+++ b/kernel_tuner/backends/cupy.py
@@ -6,7 +6,6 @@

 from kernel_tuner.backends.backend import GPUBackend
 from kernel_tuner.observers.cupy import CupyRuntimeObserver
-from kernel_tuner.util import is_valid_nvrtc_gpu_arch_cc

 # embedded in try block to be able to generate documentation
 # and run tests without cupy installed
@@ -127,11 +126,7 @@ def compile(self, kernel_instance):
         compiler_options = self.compiler_options
         if not any(["-std=" in opt for opt in self.compiler_options]):
             compiler_options = ["--std=c++11"] + self.compiler_options
-        if is_valid_nvrtc_gpu_arch_cc(self.cc):
-            if not any(["--gpu-architecture=" in opt or "-arch" in opt for opt in compiler_options]):
-                compiler_options.append(f"--gpu-architecture=compute_{self.cc}")
-        else:
-            warn(f"Could not add compiler option '--gpu-architecture=compute_{self.cc}' as {self.cc} is an invalid target")
+        # CuPy already sets the --gpu-architecture by itself, as per https://github.com/cupy/cupy/blob/20ccd63c0acc40969c851b1917dedeb032209e8b/cupy/cuda/compiler.py#L145

         options = tuple(compiler_options)
diff --git a/kernel_tuner/backends/nvcuda.py b/kernel_tuner/backends/nvcuda.py
index 9c3c37097..0a74f6d9f 100644
--- a/kernel_tuner/backends/nvcuda.py
+++ b/kernel_tuner/backends/nvcuda.py
@@ -5,7 +5,7 @@

 from kernel_tuner.backends.backend import GPUBackend
 from kernel_tuner.observers.nvcuda import CudaRuntimeObserver
-from kernel_tuner.util import SkippableFailure, cuda_error_check, is_valid_nvrtc_gpu_arch_cc
+from kernel_tuner.util import SkippableFailure, cuda_error_check, to_valid_nvrtc_gpu_arch_cc

 # embedded in try block to be able to generate documentation
 # and run tests without cuda-python installed
@@ -167,15 +167,12 @@ def compile(self, kernel_instance):
             compiler_options.append(b"--std=c++11")
         if not any(["--std=" in opt for opt in self.compiler_options]):
             self.compiler_options.append("--std=c++11")
-        if is_valid_nvrtc_gpu_arch_cc(self.cc):
-            if not any([b"--gpu-architecture=" in opt or b"-arch" in opt for opt in compiler_options]):
-                compiler_options.append(
-                    f"--gpu-architecture=compute_{self.cc}".encode("UTF-8")
-                )
-            if not any(["--gpu-architecture=" in opt or "-arch" in opt for opt in self.compiler_options]):
-                self.compiler_options.append(f"--gpu-architecture=compute_{self.cc}")
-        else:
-            warn(f"Could not add compiler option '--gpu-architecture=compute_{self.cc}' as {self.cc} is an invalid target")
+        if not any([b"--gpu-architecture=" in opt or b"-arch" in opt for opt in compiler_options]):
+            compiler_options.append(
+                f"--gpu-architecture=compute_{to_valid_nvrtc_gpu_arch_cc(self.cc)}".encode("UTF-8")
+            )
+        if not any(["--gpu-architecture=" in opt or "-arch" in opt for opt in self.compiler_options]):
+            self.compiler_options.append(f"--gpu-architecture=compute_{to_valid_nvrtc_gpu_arch_cc(self.cc)}")

         err, program = nvrtc.nvrtcCreateProgram(
             str.encode(kernel_string), b"CUDAProgram", 0, [], []
diff --git a/kernel_tuner/util.py b/kernel_tuner/util.py
index d0adeb047..77aa2607d 100644
--- a/kernel_tuner/util.py
+++ b/kernel_tuner/util.py
@@ -221,7 +221,7 @@ def check_block_size_names(block_size_names):
     if not isinstance(block_size_names, list):
         raise ValueError("block_size_names should be a list of strings!")
     if len(block_size_names) > 3:
-        raise ValueError("block_size_names should not contain more than 3 names!")
+        raise ValueError(f"block_size_names should not contain more than 3 names! ({block_size_names=})")
     if not all([isinstance(name, "".__class__) for name in block_size_names]):
         raise ValueError("block_size_names should contain only strings!")

@@ -570,10 +570,22 @@ def get_total_timings(results, env, overhead_time):
     return env


-def is_valid_nvrtc_gpu_arch_cc(compute_capability: str) -> bool:
-    """Returns whether the Compute Capability is a valid argument for NVRTC `--gpu-architecture=`, as per https://docs.nvidia.com/cuda/nvrtc/index.html#group__options."""
-    valid_cc = ['50', '52', '53', '60', '61', '62', '70', '72', '75', '80', '87', '89', '90', '90a']
-    return str(compute_capability) in valid_cc
+def to_valid_nvrtc_gpu_arch_cc(compute_capability: str) -> str:
+    """Returns a valid Compute Capability for NVRTC `--gpu-architecture=`, as per https://docs.nvidia.com/cuda/nvrtc/index.html#group__options."""
+    valid_cc = ['50', '52', '53', '60', '61', '62', '70', '72', '75', '80', '87', '89', '90', '90a']  # must be in ascending order, when updating also update test_to_valid_nvrtc_gpu_arch_cc
+    compute_capability = str(compute_capability)
+    if len(compute_capability) < 2:
+        raise ValueError(f"Compute capability '{compute_capability}' must be at least of length 2, is {len(compute_capability)}")
+    if compute_capability in valid_cc:
+        return compute_capability
+    # if the compute capability does not match, scale down to the nearest matching
+    subset_cc = [cc for cc in valid_cc if compute_capability[0] == cc[0]]
+    if len(subset_cc) > 0:
+        # get the next-highest valid CC
+        highest_cc_index = max([i for i, cc in enumerate(subset_cc) if int(cc[1]) <= int(compute_capability[1])])
+        return subset_cc[highest_cc_index]
+    # if all else fails, return the default 52
+    return '52'


 def print_config(config, tuning_options, runner):
diff --git a/test/test_util_functions.py b/test/test_util_functions.py
index c66964b0e..a202d5de3 100644
--- a/test/test_util_functions.py
+++ b/test/test_util_functions.py
@@ -146,6 +146,19 @@ def test_get_thread_block_dimensions():
     assert threads[2] == 1


+def test_to_valid_nvrtc_gpu_arch_cc():
+    assert to_valid_nvrtc_gpu_arch_cc("89") == "89"
+    assert to_valid_nvrtc_gpu_arch_cc("88") == "87"
+    assert to_valid_nvrtc_gpu_arch_cc("86") == "80"
+    assert to_valid_nvrtc_gpu_arch_cc("40") == "52"
+    assert to_valid_nvrtc_gpu_arch_cc("90b") == "90a"
+    assert to_valid_nvrtc_gpu_arch_cc("91c") == "90a"
+    assert to_valid_nvrtc_gpu_arch_cc("10123001") == "52"
+    with pytest.raises(ValueError):
+        assert to_valid_nvrtc_gpu_arch_cc("")
+        assert to_valid_nvrtc_gpu_arch_cc("1")
+
+
 def test_prepare_kernel_string():
     kernel = "this is a weird kernel"
     grid = (3, 7)
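To see the net effect of the new helper on the NVRTC invocation, a short illustrative session (the value 86 is an example of a compute capability outside the whitelist; the expected output follows directly from the test cases above):

    from kernel_tuner.util import to_valid_nvrtc_gpu_arch_cc

    cc = "86"
    arch_flag = f"--gpu-architecture=compute_{to_valid_nvrtc_gpu_arch_cc(cc)}"
    print(arch_flag)  # prints: --gpu-architecture=compute_80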
From b6825060ce1dfa1efd4b10e1f6807e76189b898b Mon Sep 17 00:00:00 2001
From: fjwillemsen
Date: Fri, 1 Mar 2024 17:45:41 +0100
Subject: [PATCH 18/26] Applied suggestions from comments by @csbnw

---
 kernel_tuner/backends/cupy.py   |  3 +-
 kernel_tuner/backends/hip.py    |  1 -
 kernel_tuner/backends/nvcuda.py |  4 ---
 kernel_tuner/backends/opencl.py |  4 ---
 kernel_tuner/backends/pycuda.py |  1 -
 kernel_tuner/core.py            | 52 ++-------------------------------
 test/test_pycuda_mocked.py      |  3 +-
 test/test_util_functions.py     |  2 +-
 8 files changed, 5 insertions(+), 65 deletions(-)

diff --git a/kernel_tuner/backends/cupy.py b/kernel_tuner/backends/cupy.py
index f53663daa..914f211a7 100644
--- a/kernel_tuner/backends/cupy.py
+++ b/kernel_tuner/backends/cupy.py
@@ -47,7 +47,6 @@ def __init__(self, device=0, iterations=7, compiler_options=None, observers=None
         self.devprops = dev.attributes
         self.cc = dev.compute_capability
         self.max_threads = self.devprops["MaxThreadsPerBlock"]
-        self.cache_size_L2 = self.devprops["L2CacheSize"]
         self.iterations = iterations
         self.current_module = None
@@ -126,7 +125,7 @@ def compile(self, kernel_instance):
         compiler_options = self.compiler_options
         if not any(["-std=" in opt for opt in self.compiler_options]):
             compiler_options = ["--std=c++11"] + self.compiler_options
-        # CuPy already sets the --gpu-architecture by itself, as per https://github.com/cupy/cupy/blob/20ccd63c0acc40969c851b1917dedeb032209e8b/cupy/cuda/compiler.py#L145
+        # CuPy already sets the --gpu-architecture by itself, as per https://github.com/cupy/cupy/blob/main/cupy/cuda/compiler.py#L145

         options = tuple(compiler_options)
diff --git a/kernel_tuner/backends/hip.py b/kernel_tuner/backends/hip.py
index dc0ffb1cb..1db4cb302 100644
--- a/kernel_tuner/backends/hip.py
+++ b/kernel_tuner/backends/hip.py
@@ -59,7 +59,6 @@ def __init__(self, device=0, iterations=7, compiler_options=None, observers=None
         self.name = self.hipProps._name.decode('utf-8')
         self.max_threads = self.hipProps.maxThreadsPerBlock
-        self.cache_size_L2 = self.hipProps.l2CacheSize
         self.device = device
         self.compiler_options = compiler_options or []
         self.iterations = iterations
diff --git a/kernel_tuner/backends/nvcuda.py b/kernel_tuner/backends/nvcuda.py
index 0a74f6d9f..15259cb23 100644
--- a/kernel_tuner/backends/nvcuda.py
+++ b/kernel_tuner/backends/nvcuda.py
@@ -68,10 +68,6 @@ def __init__(self, device=0, iterations=7, compiler_options=None, observers=None
             cudart.cudaDeviceAttr.cudaDevAttrMaxThreadsPerBlock, device
         )
         cuda_error_check(err)
-        err, self.cache_size_L2 = cudart.cudaDeviceGetAttribute(
-            cudart.cudaDeviceAttr.cudaDevAttrL2CacheSize, device
-        )
-        cuda_error_check(err)
         self.cc = f"{major}{minor}"
         self.iterations = iterations
         self.current_module = None
diff --git a/kernel_tuner/backends/opencl.py b/kernel_tuner/backends/opencl.py
index 4946d804f..af3be1c00 100644
--- a/kernel_tuner/backends/opencl.py
+++ b/kernel_tuner/backends/opencl.py
@@ -45,10 +45,6 @@ def __init__(
         self.max_threads = self.ctx.devices[0].get_info(
             cl.device_info.MAX_WORK_GROUP_SIZE
         )
-        # TODO the L2 cache size request fails
-        # self.cache_size_L2 = self.ctx.devices[0].get_info(
-        #     cl.device_affinity_domain.L2_CACHE
-        # )
         self.compiler_options = compiler_options or []

         # observer stuff
diff --git a/kernel_tuner/backends/pycuda.py b/kernel_tuner/backends/pycuda.py
index 659f51594..7fddc9393 100644
--- a/kernel_tuner/backends/pycuda.py
+++ b/kernel_tuner/backends/pycuda.py
@@ -101,7 +101,6 @@ def _finish_up():
             str(k): v for (k, v) in self.context.get_device().get_attributes().items()
         }
         self.max_threads = devprops["MAX_THREADS_PER_BLOCK"]
-        self.cache_size_L2 = devprops["L2_CACHE_SIZE"]
         cc = str(devprops.get("COMPUTE_CAPABILITY_MAJOR", "0")) + str(
             devprops.get("COMPUTE_CAPABILITY_MINOR", "0")
         )
diff --git a/kernel_tuner/core.py b/kernel_tuner/core.py
index cf775d308..76bed9497 100644
--- a/kernel_tuner/core.py
+++ b/kernel_tuner/core.py
@@ -340,62 +340,14 @@ def __init__(
         if not quiet:
             print("Using: " + self.dev.name)

-        if lang.upper() not in ['OPENCL', 'C', 'FORTRAN']:
-            # flush the L2 cache, inspired by https://github.com/pytorch/FBGEMM/blob/eb3c304e6c213b81f2b2077813d3c6d16597aa97/fbgemm_gpu/bench/verify_fp16_stochastic_benchmark.cu#L130
-            flush_gpu_string = """
-                __global__ void flush_gpu(char* d_flush, char* d_flush2, bool do_write) {
-                    const int idx = blockIdx.x * blockDim.x + threadIdx.x;
-                    const char val = d_flush[idx];
-                    if (do_write * val) {
-                        d_flush2[idx] = val;
-                    }
-                }
-            """
-            cache_size = self.dev.cache_size_L2
-            d_flush = np.ones((cache_size), order='F').astype(np.float32)
-            d_flush2 = np.ones((cache_size), order='F').astype(np.float32)
-            self.flush_kernel_gpu_args = [d_flush, d_flush2, np.int32(True)]
-
-            from kernel_tuner.interface import Options
-            options = {
-                'kernel_name': 'flush_gpu',
-                'lang': 'CUDA',
-                'arguments': self.flush_kernel_gpu_args,
-                'problem_size': cache_size,
-                'grid_div_x': None,
-                'grid_div_y': None,
-                'grid_div_z': None,
-                'block_size_names': None,
-            }
-            options = Options(options)
-            flush_kernel_lang = lang.upper() if lang.upper() in ['CUDA', 'CUPY', 'NVCUDA'] else 'CUPY'
-            flush_kernel_source = KernelSource('flush_gpu', flush_gpu_string, flush_kernel_lang)
-            self.flush_kernel_instance = self.create_kernel_instance(flush_kernel_source, kernel_options=options, params=dict(), verbose=not quiet)
-            self.flush_kernel = self.compile_kernel(self.flush_kernel_instance, verbose=not quiet)
-            self.flush_kernel_gpu_args = self.ready_argument_list(self.flush_kernel_gpu_args)
-
-            # from kernel_tuner.kernelbuilder import PythonKernel
-            # self.flush_kernel = PythonKernel('flush_gpu', flush_gpu_string, cache_size, self.flush_kernel_gpu_args)
-
-    def flush_cache(self):
-        """This special function can be called to flush the L2 cache."""
-        if not hasattr(self, 'flush_kernel'):
-            return
-        self.dev.synchronize()
-        assert self.run_kernel(self.flush_kernel, self.flush_kernel_gpu_args, self.flush_kernel_instance)
-        # self.flush_kernel.run_kernel(self.flush_kernel.gpu_args)
-        self.dev.synchronize()
-
-    def benchmark_default(self, func, gpu_args, threads, grid, result, flush_cache=True):
-        """Benchmark one kernel execution at a time. Run with `flush_cache=True` to avoid caching effects between iterations."""
+    def benchmark_default(self, func, gpu_args, threads, grid, result):
+        """Benchmark one kernel execution at a time."""
         observers = [
             obs for obs in self.dev.observers if not isinstance(obs, ContinuousObserver)
         ]

         self.dev.synchronize()
         for _ in range(self.iterations):
-            if flush_cache:
-                self.flush_cache()
             for obs in observers:
                 obs.before_start()
             self.dev.synchronize()
diff --git a/test/test_pycuda_mocked.py b/test/test_pycuda_mocked.py
index e47fc8e8e..6bdfeef07 100644
--- a/test/test_pycuda_mocked.py
+++ b/test/test_pycuda_mocked.py
@@ -13,8 +13,7 @@ def setup_mock(drv):
     context = Mock()
     devprops = {'MAX_THREADS_PER_BLOCK': 1024,
                 'COMPUTE_CAPABILITY_MAJOR': 5,
-                'COMPUTE_CAPABILITY_MINOR': 5,
-                'L2_CACHE_SIZE': 4096}
+                'COMPUTE_CAPABILITY_MINOR': 5,}
     context.return_value.get_device.return_value.get_attributes.return_value = devprops
     context.return_value.get_device.return_value.compute_capability.return_value = "55"
     drv.Device.return_value.retain_primary_context.return_value = context()
diff --git a/test/test_util_functions.py b/test/test_util_functions.py
index a202d5de3..970603d83 100644
--- a/test/test_util_functions.py
+++ b/test/test_util_functions.py
@@ -153,7 +153,7 @@ def test_to_valid_nvrtc_gpu_arch_cc():
     assert to_valid_nvrtc_gpu_arch_cc("40") == "52"
     assert to_valid_nvrtc_gpu_arch_cc("90b") == "90a"
     assert to_valid_nvrtc_gpu_arch_cc("91c") == "90a"
-    assert to_valid_nvrtc_gpu_arch_cc("10123001") == "52"
+    assert to_valid_nvrtc_gpu_arch_cc("1234") == "52"
     with pytest.raises(ValueError):
         assert to_valid_nvrtc_gpu_arch_cc("")
         assert to_valid_nvrtc_gpu_arch_cc("1")

From da907b180ac70e0abe4da4ed8003b8c408b65ef4 Mon Sep 17 00:00:00 2001
From: fjwillemsen
Date: Fri, 1 Mar 2024 17:54:45 +0100
Subject: [PATCH 19/26] Removed redundant comments / printing

---
 kernel_tuner/runners/sequential.py | 2 +-
 kernel_tuner/util.py               | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/kernel_tuner/runners/sequential.py b/kernel_tuner/runners/sequential.py
index 23d9dc2ba..aeebd5116 100644
--- a/kernel_tuner/runners/sequential.py
+++ b/kernel_tuner/runners/sequential.py
@@ -100,7 +100,7 @@ def run(self, parameter_space, tuning_options):
             params = process_metrics(params, tuning_options.metrics)

             # get the framework time by estimating based on other times
-            total_time = 1000 * ((perf_counter() - self.start_time) - warmup_time)  # TODO is it valid that we deduct the warmup time here?
+            total_time = 1000 * ((perf_counter() - self.start_time) - warmup_time)
             params['strategy_time'] = self.last_strategy_time
             params['framework_time'] = max(total_time - (params['compile_time'] + params['verification_time'] + params['benchmark_time'] + params['strategy_time']), 0)
             params['timestamp'] = str(datetime.now(timezone.utc))
diff --git a/kernel_tuner/util.py b/kernel_tuner/util.py
index 77aa2607d..df73f2127 100644
--- a/kernel_tuner/util.py
+++ b/kernel_tuner/util.py
@@ -221,7 +221,7 @@ def check_block_size_names(block_size_names):
     if not isinstance(block_size_names, list):
         raise ValueError("block_size_names should be a list of strings!")
     if len(block_size_names) > 3:
-        raise ValueError(f"block_size_names should not contain more than 3 names! ({block_size_names=})")
+        raise ValueError("block_size_names should not contain more than 3 names!")
     if not all([isinstance(name, "".__class__) for name in block_size_names]):
         raise ValueError("block_size_names should contain only strings!")

From 10915131d6e0a39a52e5866ac24da9bc9f8db038 Mon Sep 17 00:00:00 2001
From: Ben van Werkhoven
Date: Mon, 22 Apr 2024 09:41:40 +0200
Subject: [PATCH 20/26] fix typo

---
 CONTRIBUTING.rst | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/CONTRIBUTING.rst b/CONTRIBUTING.rst
index 793a066cb..4047fde80 100644
--- a/CONTRIBUTING.rst
+++ b/CONTRIBUTING.rst
@@ -85,7 +85,7 @@ Steps without :bash:`sudo` access (e.g. on a cluster):
           - /path/to/directory
     * [Optional] both Mamba and Miniconda can be automatically activated via :bash:`~/.bashrc`. Do not forget to add these (usually provided at the end of the installation).
     * Exit the shell and re-enter to make sure Conda is available. :bash:`cd` to the kernel tuner directory.
-    * [Optional] if you have limited user folder space, the Pip cache can be pointed elsewhere with the environment variable :bash:`PIP_CACHE_DIR`. The cache location can be checked with :bash:`pip cache dir`. On Linu, to point the entire :bash:`~/.cache` default elsewhere, use the :bash:`XDG_CACHE_HOME` environment variable.
+    * [Optional] if you have limited user folder space, the Pip cache can be pointed elsewhere with the environment variable :bash:`PIP_CACHE_DIR`. The cache location can be checked with :bash:`pip cache dir`. On Linux, to point the entire :bash:`~/.cache` default elsewhere, use the :bash:`XDG_CACHE_HOME` environment variable.
     * [Optional] update Conda if available before continuing: :bash:`conda update -n base -c conda-forge conda`.
 #. Setup a virtual environment: :bash:`conda create --name kerneltuner python=3.11` (or whatever Python version and environment name you prefer).
 #. Activate the virtual environment: :bash:`conda activate kerneltuner`.

From 88830216c8e7a8215db93e67dd7f9f1dbce9e2b3 Mon Sep 17 00:00:00 2001
From: Ben van Werkhoven
Date: Mon, 22 Apr 2024 09:51:29 +0200
Subject: [PATCH 21/26] simplified code, break if persistence mode can't be set

---
 kernel_tuner/observers/nvml.py | 37 ++++++++++++++--------------------
 1 file changed, 15 insertions(+), 22 deletions(-)

diff --git a/kernel_tuner/observers/nvml.py b/kernel_tuner/observers/nvml.py
index 0bd9adc84..739c6864f 100644
--- a/kernel_tuner/observers/nvml.py
+++ b/kernel_tuner/observers/nvml.py
@@ -1,6 +1,7 @@
 import re
 import subprocess
 import time
+from warnings import warn

 import numpy as np

@@ -156,43 +157,35 @@ def set_clocks(self, mem_clock, gr_clock):
             raise ValueError("Illegal value for memory clock")
         if gr_clock not in self.supported_gr_clocks[mem_clock]:
             raise ValueError(f"Graphics clock incompatible with memory clock ({mem_clock}), compatible graphics clocks: {self.supported_gr_clocks[mem_clock]}")
-        self.modified_clocks = True
+
+        # Check whether persistence mode is set. Without persistence mode, setting the clocks is not meaningful
+        # I deliberately removed the try..except clause here, if setting persistence mode fails, setting the clocks should fail
+        if self.persistence_mode != 1:
+            self.persistence_mode = 1
+
         if self.use_locked_clocks:
-            if self.persistence_mode != 1:
-                self.persistence_mode = 1
             try:
                 pynvml.nvmlDeviceSetGpuLockedClocks(self.dev, gr_clock, gr_clock)
                 pynvml.nvmlDeviceSetMemoryLockedClocks(self.dev, mem_clock, mem_clock)
             except pynvml.NVMLError_NoPermission:
                 if self.nvidia_smi:
-                    args = [
-                        "sudo",
-                        self.nvidia_smi,
-                        "-i",
-                        str(self.id),
-                    ]
+                    args = ["sudo", self.nvidia_smi, "-i", str(self.id)]
                     command_set_mem_clocks = f"--lock-memory-clocks={str(mem_clock)},{str(mem_clock)}"
                     command_set_gpu_clocks = f"--lock-gpu-clocks={str(gr_clock)},{str(gr_clock)}"
                     subprocess.run(args + [command_set_gpu_clocks], check=True)
                     subprocess.run(args + [command_set_mem_clocks], check=True)
         else:
-            try:
-                if self.persistence_mode != 1:
-                    self.persistence_mode = 1
-            except Exception:
-                pass
             try:
                 pynvml.nvmlDeviceSetApplicationsClocks(self.dev, mem_clock, gr_clock)
             except pynvml.NVMLError_NoPermission:
                 if self.nvidia_smi:
-                    args = [
-                        "sudo",
-                        self.nvidia_smi,
-                        "-i",
-                        str(self.id),
-                        "--applications-clocks=" + str(mem_clock) + "," + str(gr_clock),
-                    ]
-                    subprocess.run(args, check=True)
+                    args = ["sudo", self.nvidia_smi, "-i", str(self.id)]
+                    command_set_clocks = f"--applications-clocks={str(mem_clock)},{str(gr_clock)}"
+                    subprocess.run(args + [command_set_clocks], check=True)
+
+        # Store the fact that we have modified the clocks
+        self.modified_clocks = True

     def reset_clocks(self):
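For a concrete sense of what the subprocess fallback in the simplified set_clocks executes, a hedged sketch with example values (the clock frequencies shown are illustrative; the actual values come from the device's supported-clocks tables):

    import subprocess

    device_id, mem_clock, gr_clock = 0, 5001, 1410  # example values
    args = ["sudo", "nvidia-smi", "-i", str(device_id)]
    # locked-clocks path
    subprocess.run(args + [f"--lock-gpu-clocks={gr_clock},{gr_clock}"], check=True)
    subprocess.run(args + [f"--lock-memory-clocks={mem_clock},{mem_clock}"], check=True)
    # application-clocks path
    subprocess.run(args + [f"--applications-clocks={mem_clock},{gr_clock}"], check=True)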
range checks fail @@ -52,9 +48,7 @@ def __init__( self.gr_clock_default = pynvml.nvmlDeviceGetDefaultApplicationsClock( self.dev, pynvml.NVML_CLOCK_GRAPHICS ) - self.mem_clock_default = pynvml.nvmlDeviceGetDefaultApplicationsClock( - self.dev, pynvml.NVML_CLOCK_MEM - ) + self.mem_clock_default = pynvml.nvmlDeviceGetDefaultApplicationsClock(self.dev, pynvml.NVML_CLOCK_MEM) except pynvml.NVMLError_NotSupported: self.gr_clock_default = None self.sm_clock_default = None @@ -67,9 +61,7 @@ def __init__( # gather the supported gr clocks for each supported mem clock into a dict self.supported_gr_clocks = {} for mem_clock in self.supported_mem_clocks: - supported_gr_clocks = pynvml.nvmlDeviceGetSupportedGraphicsClocks( - self.dev, mem_clock - ) + supported_gr_clocks = pynvml.nvmlDeviceGetSupportedGraphicsClocks(self.dev, mem_clock) self.supported_gr_clocks[mem_clock] = supported_gr_clocks # test whether locked gr clocks and mem clocks are supported @@ -132,9 +124,7 @@ def persistence_mode(self): @persistence_mode.setter def persistence_mode(self, new_mode): if new_mode not in [0, 1]: - raise ValueError( - "Illegal value for persistence mode, should be either 0 or 1" - ) + raise ValueError("Illegal value for persistence mode, should be either 0 or 1") if self.persistence_mode == new_mode: return try: @@ -155,7 +145,9 @@ def set_clocks(self, mem_clock, gr_clock): if mem_clock not in self.supported_mem_clocks: raise ValueError("Illegal value for memory clock") if gr_clock not in self.supported_gr_clocks[mem_clock]: - raise ValueError(f"Graphics clock incompatible with memory clock ({mem_clock}), compatible graphics clocks: {self.supported_gr_clocks[mem_clock]}") + raise ValueError( + f"Graphics clock incompatible with memory clock ({mem_clock}), compatible graphics clocks: {self.supported_gr_clocks[mem_clock]}" + ) # Check whether persistence mode is set. 
         # I deliberately removed the try..except clause here: if setting persistence mode fails, setting the clocks should also fail
@@ -185,7 +177,6 @@ def set_clocks(self, mem_clock, gr_clock):

         # Store the fact that we have modified the clocks
         self.modified_clocks = True

-
     def reset_clocks(self):
         """Reset the clocks to the default clock if the device uses a non default clock."""
         if self.use_locked_clocks:
@@ -212,16 +203,9 @@ def reset_clocks(self):
                     subprocess.run(args, check=True)

         elif self.gr_clock_default is not None:
-            gr_app_clock = pynvml.nvmlDeviceGetApplicationsClock(
-                self.dev, pynvml.NVML_CLOCK_GRAPHICS
-            )
-            mem_app_clock = pynvml.nvmlDeviceGetApplicationsClock(
-                self.dev, pynvml.NVML_CLOCK_MEM
-            )
-            if (
-                gr_app_clock != self.gr_clock_default
-                or mem_app_clock != self.mem_clock_default
-            ):
+            gr_app_clock = pynvml.nvmlDeviceGetApplicationsClock(self.dev, pynvml.NVML_CLOCK_GRAPHICS)
+            mem_app_clock = pynvml.nvmlDeviceGetApplicationsClock(self.dev, pynvml.NVML_CLOCK_MEM)
+            if gr_app_clock != self.gr_clock_default or mem_app_clock != self.mem_clock_default:
                 self.set_clocks(self.mem_clock_default, self.gr_clock_default)

     @property
@@ -246,9 +230,7 @@ def mem_clock(self):
             mem_clock = pynvml.nvmlDeviceGetClockInfo(self.dev, pynvml.NVML_CLOCK_MEM)
             return min(self.supported_mem_clocks, key=lambda x: abs(x - mem_clock))
         else:
-            return pynvml.nvmlDeviceGetApplicationsClock(
-                self.dev, pynvml.NVML_CLOCK_MEM
-            )
+            return pynvml.nvmlDeviceGetApplicationsClock(self.dev, pynvml.NVML_CLOCK_MEM)

     @mem_clock.setter
     def mem_clock(self, new_clock):
@@ -269,9 +251,7 @@ def auto_boost(self):
     def auto_boost(self, setting):
         # might need to use pynvml.NVML_FEATURE_DISABLED or pynvml.NVML_FEATURE_ENABLED instead of 0 or 1
         if setting not in [0, 1]:
-            raise ValueError(
-                "Illegal value for auto boost enabled, should be either 0 or 1"
-            )
+            raise ValueError("Illegal value for auto boost enabled, should be either 0 or 1")
         pynvml.nvmlDeviceSetAutoBoostedClocksEnabled(self.dev, setting)
         self._auto_boost = pynvml.nvmlDeviceGetAutoBoostedClocksEnabled(self.dev)[0]
@@ -363,9 +343,7 @@ def __init__(
         if any([obs in self.needs_power for obs in observables]):
             self.measure_power = True
             power_observables = [obs for obs in observables if obs in self.needs_power]
-            self.continuous_observer = NVMLPowerObserver(
-                power_observables, self, self.nvml, continous_duration
-            )
+            self.continuous_observer = NVMLPowerObserver(power_observables, self, self.nvml, continous_duration)

         # remove power observables
         self.observables = [obs for obs in observables if obs not in self.needs_power]
@@ -380,11 +358,7 @@ def __init__(
         for obs in self.observables:
             self.results[obs + "s"] = []

-        self.during_obs = [
-            obs
-            for obs in observables
-            if obs in ["core_freq", "mem_freq", "temperature"]
-        ]
+        self.during_obs = [obs for obs in observables if obs in ["core_freq", "mem_freq", "temperature"]]
         self.iteration = {obs: [] for obs in self.during_obs}

     def before_start(self):
@@ -406,15 +380,11 @@ def during(self):
         if "mem_freq" in self.observables:
             self.iteration["mem_freq"].append(self.nvml.mem_clock)
         if self.record_gr_voltage:
-            self.gr_voltage_readings.append(
-                [time.perf_counter() - self.t0, self.nvml.gr_voltage()]
-            )
+            self.gr_voltage_readings.append([time.perf_counter() - self.t0, self.nvml.gr_voltage()])

     def after_finish(self):
         if "temperature" in self.observables:
-            self.results["temperatures"].append(
-                np.average(self.iteration["temperature"])
-            )
+            self.results["temperatures"].append(np.average(self.iteration["temperature"]))
         if "core_freq" in self.observables:
             self.results["core_freqs"].append(np.average(self.iteration["core_freq"]))
         if "mem_freq" in self.observables:
@@ -423,12 +393,8 @@ def after_finish(self):
         if "gr_voltage" in self.observables:
             execution_time = time.time() - self.t0
             gr_voltage_readings = self.gr_voltage_readings
-            gr_voltage_readings = [
-                [0.0, gr_voltage_readings[0][1]]
-            ] + gr_voltage_readings
-            gr_voltage_readings = gr_voltage_readings + [
-                [execution_time, gr_voltage_readings[-1][1]]
-            ]
+            gr_voltage_readings = [[0.0, gr_voltage_readings[0][1]]] + gr_voltage_readings
+            gr_voltage_readings = gr_voltage_readings + [[execution_time, gr_voltage_readings[-1][1]]]
             # time in s, graphics voltage in millivolts
             self.results["gr_voltages"].append(np.average(gr_voltage_readings[:][:][1]))
@@ -490,8 +456,7 @@ def during(self):
         timestamp = time.perf_counter() - self.t0
         # only store the result if we get a new measurement from NVML
         if len(self.power_readings) == 0 or (
-            self.power_readings[-1][1] != power_usage
-            or timestamp - self.power_readings[-1][0] > 0.01
+            self.power_readings[-1][1] != power_usage or timestamp - self.power_readings[-1][0] > 0.01
         ):
             self.power_readings.append([timestamp, power_usage])
@@ -538,9 +503,7 @@ def get_nvml_pwr_limits(device, n=None, quiet=False):
         n = int((power_limit_max - power_limit_min) / power_limit_round) + 1

     # Rounded power limit values
-    power_limits = power_limit_round * np.round(
-        (np.linspace(power_limit_min, power_limit_max, n) / power_limit_round)
-    )
+    power_limits = power_limit_round * np.round((np.linspace(power_limit_min, power_limit_max, n) / power_limit_round))
     power_limits = sorted(list(set([int(power_limit) for power_limit in power_limits])))
     tune_params["nvml_pwr_limit"] = power_limits

From ee16b86d7f944225d9274997f8f5bdb1a367feab Mon Sep 17 00:00:00 2001
From: Ben van Werkhoven
Date: Mon, 22 Apr 2024 10:30:04 +0200
Subject: [PATCH 24/26] simplified to_valid_nvrtc_gpu_arch_cc

---
 kernel_tuner/util.py        | 17 +++--------------
 test/test_util_functions.py |  3 ---
 2 files changed, 3 insertions(+), 17 deletions(-)

diff --git a/kernel_tuner/util.py b/kernel_tuner/util.py
index df73f2127..64d5a618b 100644
--- a/kernel_tuner/util.py
+++ b/kernel_tuner/util.py
@@ -570,22 +570,11 @@ def get_total_timings(results, env, overhead_time):
     return env

+NVRTC_VALID_CC = np.array(['50', '52', '53', '60', '61', '62', '70', '72', '75', '80', '87', '89', '90', '90a'])
+
 def to_valid_nvrtc_gpu_arch_cc(compute_capability: str) -> str:
     """Returns a valid Compute Capability for NVRTC `--gpu-architecture=`, as per https://docs.nvidia.com/cuda/nvrtc/index.html#group__options."""
-    valid_cc = ['50', '52', '53', '60', '61', '62', '70', '72', '75', '80', '87', '89', '90', '90a'] # must be in ascending order, when updating also update test_to_valid_nvrtc_gpu_arch_cc
-    compute_capability = str(compute_capability)
-    if len(compute_capability) < 2:
-        raise ValueError(f"Compute capability '{compute_capability}' must be at least of length 2, is {len(compute_capability)}")
-    if compute_capability in valid_cc:
-        return compute_capability
-    # if the compute capability does not match, scale down to the nearest matching
-    subset_cc = [cc for cc in valid_cc if compute_capability[0] == cc[0]]
-    if len(subset_cc) > 0:
-        # get the next-highest valid CC
-        highest_cc_index = max([i for i, cc in enumerate(subset_cc) if int(cc[1]) <= int(compute_capability[1])])
-        return subset_cc[highest_cc_index]
-    # if all else fails, return the default 52
-    return '52'
+    return max(NVRTC_VALID_CC[NVRTC_VALID_CC<=compute_capability], default='52')


 def print_config(config, tuning_options, runner):
diff --git a/test/test_util_functions.py b/test/test_util_functions.py
index 970603d83..f3431991b 100644
--- a/test/test_util_functions.py
+++ b/test/test_util_functions.py
@@ -154,9 +154,6 @@ def test_to_valid_nvrtc_gpu_arch_cc():
     assert to_valid_nvrtc_gpu_arch_cc("90b") == "90a"
     assert to_valid_nvrtc_gpu_arch_cc("91c") == "90a"
     assert to_valid_nvrtc_gpu_arch_cc("1234") == "52"
-    with pytest.raises(ValueError):
-        assert to_valid_nvrtc_gpu_arch_cc("")
-        assert to_valid_nvrtc_gpu_arch_cc("1")


 def test_prepare_kernel_string():

From bd0a40c720d5c70d84debd203e16fca1c0d124b0 Mon Sep 17 00:00:00 2001
From: Ben van Werkhoven
Date: Mon, 22 Apr 2024 10:37:01 +0200
Subject: [PATCH 25/26] forgot to enclose in list

---
 kernel_tuner/observers/nvml.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/kernel_tuner/observers/nvml.py b/kernel_tuner/observers/nvml.py
index e90d8a72b..ed88acd3d 100644
--- a/kernel_tuner/observers/nvml.py
+++ b/kernel_tuner/observers/nvml.py
@@ -172,7 +172,7 @@ def set_clocks(self, mem_clock, gr_clock):
                 if self.nvidia_smi:
                     args = ["sudo", self.nvidia_smi, "-i", str(self.id)]
                     command_set_clocks = f"--applications-clocks={str(mem_clock)},{str(gr_clock)}"
-                    subprocess.run(args + command_set_clocks, check=True)
+                    subprocess.run(args + [command_set_clocks], check=True)

         # Store the fact that we have modified the clocks
         self.modified_clocks = True

From dfd3da98bb5743f09a4fe635875d804464f48670 Mon Sep 17 00:00:00 2001
From: Ben van Werkhoven
Date: Mon, 22 Apr 2024 10:56:32 +0200
Subject: [PATCH 26/26] simplified and formatted with black

---
 kernel_tuner/observers/register.py | 12 +++++-------
 1 file changed, 5 insertions(+), 7 deletions(-)

diff --git a/kernel_tuner/observers/register.py b/kernel_tuner/observers/register.py
index 92f22ffd8..ca0c0084d 100644
--- a/kernel_tuner/observers/register.py
+++ b/kernel_tuner/observers/register.py
@@ -1,16 +1,14 @@
 from kernel_tuner.observers.observer import BenchmarkObserver

+
 class RegisterObserver(BenchmarkObserver):
     """Observer for counting the number of registers."""

-    def __init__(self) -> None:
-        super().__init__()
-
     def get_results(self):
         try:
             registers_per_thread = self.dev.num_regs
         except AttributeError:
-            raise NotImplementedError(f"Backend '{type(self.dev).__name__}' does not support count of registers per thread")
-        return {
-            "num_regs": registers_per_thread
-        }
+            raise NotImplementedError(
+                f"Backend '{type(self.dev).__name__}' does not support counting the number of registers per thread"
+            )
+        return {"num_regs": registers_per_thread}
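A note on the simplified to_valid_nvrtc_gpu_arch_cc in PATCH 24: the one-liner works because NumPy compares the strings in NVRTC_VALID_CC elementwise and the array is kept in ascending order, so max() of the filtered subset yields the highest valid compute capability that does not exceed the input, with '52' as the fallback when the subset is empty. A minimal standalone sketch of the same logic (the '86' case is inferred from the code, not from a test in the patch):

    import numpy as np

    # mirrors kernel_tuner/util.py after PATCH 24
    NVRTC_VALID_CC = np.array(['50', '52', '53', '60', '61', '62', '70', '72', '75', '80', '87', '89', '90', '90a'])

    def to_valid_nvrtc_gpu_arch_cc(compute_capability: str) -> str:
        # lexicographic comparison drops every valid CC above the input; max()
        # then picks the highest remaining one; an input that sorts below '50'
        # (e.g. '1234') leaves an empty subset, so the default '52' is returned
        return max(NVRTC_VALID_CC[NVRTC_VALID_CC <= compute_capability], default='52')

    assert to_valid_nvrtc_gpu_arch_cc('86') == '80'    # scaled down to the nearest valid CC
    assert to_valid_nvrtc_gpu_arch_cc('90b') == '90a'  # as in the updated test
    assert to_valid_nvrtc_gpu_arch_cc('1234') == '52'  # sorts below '50', falls back to default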
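The one-character-class fix in PATCH 25 is worth spelling out: args is a list and command_set_clocks a plain string, and Python refuses to concatenate the two, so the nvidia-smi fallback for application clocks raised a TypeError instead of setting the clocks. A quick illustration with made-up argument values:

    args = ["sudo", "nvidia-smi", "-i", "0"]
    command_set_clocks = "--applications-clocks=5001,1410"

    try:
        args + command_set_clocks  # the broken form: TypeError, list + str
    except TypeError:
        pass

    # the corrected form: wrap the string in a list before concatenating
    assert args + [command_set_clocks] == ["sudo", "nvidia-smi", "-i", "0", "--applications-clocks=5001,1410"]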
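Finally, a sketch of how the simplified RegisterObserver from PATCH 26 behaves, using stand-in backend objects instead of a real compiled kernel; the stubs and the value 32 are purely illustrative, and in normal use self.dev is assigned by Kernel Tuner's runner rather than set by hand:

    from types import SimpleNamespace

    from kernel_tuner.observers.register import RegisterObserver

    observer = RegisterObserver()

    # a backend that exposes num_regs, as the CUDA backends do after compile()
    observer.dev = SimpleNamespace(num_regs=32)
    assert observer.get_results() == {"num_regs": 32}

    # a backend without num_regs triggers the NotImplementedError fallback,
    # whose message names the offending backend class
    observer.dev = SimpleNamespace()
    try:
        observer.get_results()
    except NotImplementedError as error:
        print(error)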