diff --git a/CONTRIBUTING.rst b/CONTRIBUTING.rst index 65f7140ce..4047fde80 100644 --- a/CONTRIBUTING.rst +++ b/CONTRIBUTING.rst @@ -85,7 +85,7 @@ Steps without :bash:`sudo` access (e.g. on a cluster): - /path/to/directory * [Optional] both Mamba and Miniconda can be automatically activated via :bash:`~/.bashrc`. Do not forget to add these (usually provided at the end of the installation). * Exit the shell and re-enter to make sure Conda is available. :bash:`cd` to the kernel tuner directory. - * [Optional] if you have limited user folder space, the Pip cache can be pointed elsewhere with the environment variable :bash:`PIP_CACHE_DIR`. The cache location can be checked with :bash:`pip cache dir`. + * [Optional] if you have limited user folder space, the Pip cache can be pointed elsewhere with the environment variable :bash:`PIP_CACHE_DIR`. The cache location can be checked with :bash:`pip cache dir`. On Linux, to point the entire :bash:`~/.cache` default elsewhere, use the :bash:`XDG_CACHE_HOME` environment variable. * [Optional] update Conda if available before continuing: :bash:`conda update -n base -c conda-forge conda`. #. Setup a virtual environment: :bash:`conda create --name kerneltuner python=3.11` (or whatever Python version and environment name you prefer). #. Activate the virtual environment: :bash:`conda activate kerneltuner`. 
diff --git a/kernel_tuner/backends/cupy.py b/kernel_tuner/backends/cupy.py index a1e13ff03..914f211a7 100644 --- a/kernel_tuner/backends/cupy.py +++ b/kernel_tuner/backends/cupy.py @@ -1,5 +1,6 @@ """This module contains all Cupy specific kernel_tuner functions.""" from __future__ import print_function +from warnings import warn import numpy as np @@ -124,6 +125,7 @@ def compile(self, kernel_instance): compiler_options = self.compiler_options if not any(["-std=" in opt for opt in self.compiler_options]): compiler_options = ["--std=c++11"] + self.compiler_options + # CuPy already sets the --gpu-architecture by itself, as per https://github.com/cupy/cupy/blob/main/cupy/cuda/compiler.py#L145 options = tuple(compiler_options) @@ -132,6 +134,7 @@ def compile(self, kernel_instance): ) self.func = self.current_module.get_function(kernel_name) + self.num_regs = self.func.num_regs return self.func def start_event(self): @@ -197,6 +200,8 @@ def run_kernel(self, func, gpu_args, threads, grid, stream=None): of the grid :type grid: tuple(int, int) """ + if stream is None: + stream = self.stream func(grid, threads, gpu_args, stream=stream, shared_mem=self.smem_size) def memset(self, allocation, value, size): diff --git a/kernel_tuner/backends/nvcuda.py b/kernel_tuner/backends/nvcuda.py index c6fb73d5e..15259cb23 100644 --- a/kernel_tuner/backends/nvcuda.py +++ b/kernel_tuner/backends/nvcuda.py @@ -1,9 +1,11 @@ """This module contains all NVIDIA cuda-python specific kernel_tuner functions.""" +from warnings import warn + import numpy as np from kernel_tuner.backends.backend import GPUBackend from kernel_tuner.observers.nvcuda import CudaRuntimeObserver -from kernel_tuner.util import SkippableFailure, cuda_error_check +from kernel_tuner.util import SkippableFailure, cuda_error_check, to_valid_nvrtc_gpu_arch_cc # embedded in try block to be able to generate documentation # and run tests without cuda-python installed @@ -161,12 +163,12 @@ def compile(self, kernel_instance): 
compiler_options.append(b"--std=c++11") if not any(["--std=" in opt for opt in self.compiler_options]): self.compiler_options.append("--std=c++11") - if not any([b"--gpu-architecture=" in opt for opt in compiler_options]): + if not any([b"--gpu-architecture=" in opt or b"-arch" in opt for opt in compiler_options]): compiler_options.append( - f"--gpu-architecture=compute_{self.cc}".encode("UTF-8") + f"--gpu-architecture=compute_{to_valid_nvrtc_gpu_arch_cc(self.cc)}".encode("UTF-8") ) - if not any(["--gpu-architecture=" in opt for opt in self.compiler_options]): - self.compiler_options.append(f"--gpu-architecture=compute_{self.cc}") + if not any(["--gpu-architecture=" in opt or "-arch" in opt for opt in self.compiler_options]): + self.compiler_options.append(f"--gpu-architecture=compute_{to_valid_nvrtc_gpu_arch_cc(self.cc)}") err, program = nvrtc.nvrtcCreateProgram( str.encode(kernel_string), b"CUDAProgram", 0, [], [] @@ -192,6 +194,11 @@ def compile(self, kernel_instance): ) cuda_error_check(err) + # get the number of registers per thread used in this kernel + num_regs = cuda.cuFuncGetAttribute(cuda.CUfunction_attribute.CU_FUNC_ATTRIBUTE_NUM_REGS, self.func) + assert num_regs[0] == 0, f"Retrieving number of registers per thread unsuccessful: code {num_regs[0]}" + self.num_regs = num_regs[1] + except RuntimeError as re: _, n = nvrtc.nvrtcGetProgramLogSize(program) log = b" " * n @@ -273,6 +280,8 @@ def run_kernel(self, func, gpu_args, threads, grid, stream=None): of the grid :type grid: tuple(int, int) """ + if stream is None: + stream = self.stream arg_types = list() for arg in gpu_args: if isinstance(arg, cuda.CUdeviceptr): diff --git a/kernel_tuner/backends/pycuda.py b/kernel_tuner/backends/pycuda.py index 3c168f824..7fddc9393 100644 --- a/kernel_tuner/backends/pycuda.py +++ b/kernel_tuner/backends/pycuda.py @@ -218,6 +218,8 @@ def compile(self, kernel_instance): ) self.func = self.current_module.get_function(kernel_name) + if not isinstance(self.func, str): + 
self.num_regs = self.func.num_regs return self.func except drv.CompileError as e: if "uses too much shared data" in e.stderr: diff --git a/kernel_tuner/core.py b/kernel_tuner/core.py index 174cd3af5..76bed9497 100644 --- a/kernel_tuner/core.py +++ b/kernel_tuner/core.py @@ -341,7 +341,7 @@ def __init__( print("Using: " + self.dev.name) def benchmark_default(self, func, gpu_args, threads, grid, result): - """Benchmark one kernel execution at a time""" + """Benchmark one kernel execution at a time.""" observers = [ obs for obs in self.dev.observers if not isinstance(obs, ContinuousObserver) ] @@ -391,12 +391,8 @@ def benchmark_continuous(self, func, gpu_args, threads, grid, result, duration): for obs in self.continuous_observers: result.update(obs.get_results()) - def benchmark(self, func, gpu_args, instance, verbose, objective): - """benchmark the kernel instance""" - logging.debug("benchmark " + instance.name) - logging.debug("thread block dimensions x,y,z=%d,%d,%d", *instance.threads) - logging.debug("grid dimensions x,y,z=%d,%d,%d", *instance.grid) - + def set_nvml_parameters(self, instance): + """Set the NVML parameters. 
Avoids setting time leaking into benchmark time.""" if self.use_nvml: if "nvml_pwr_limit" in instance.params: new_limit = int( @@ -409,6 +405,15 @@ def benchmark(self, func, gpu_args, instance, verbose, objective): if "nvml_mem_clock" in instance.params: self.nvml.mem_clock = instance.params["nvml_mem_clock"] + def benchmark(self, func, gpu_args, instance, verbose, objective, skip_nvml_setting=False): + """Benchmark the kernel instance.""" + logging.debug("benchmark " + instance.name) + logging.debug("thread block dimensions x,y,z=%d,%d,%d", *instance.threads) + logging.debug("grid dimensions x,y,z=%d,%d,%d", *instance.grid) + + if self.use_nvml and not skip_nvml_setting: + self.set_nvml_parameters(instance) + # Call the observers to register the configuration to be benchmarked for obs in self.dev.observers: obs.register_configuration(instance.params) @@ -577,9 +582,12 @@ def compile_and_benchmark(self, kernel_source, gpu_args, params, kernel_options, # benchmark if func: + # setting the NVML parameters here avoids this time from leaking into the benchmark time, ends up in framework time instead + if self.use_nvml: + self.set_nvml_parameters(instance) start_benchmark = time.perf_counter() result.update( - self.benchmark(func, gpu_args, instance, verbose, to.objective) + self.benchmark(func, gpu_args, instance, verbose, to.objective, skip_nvml_setting=True) ) last_benchmark_time = 1000 * (time.perf_counter() - start_benchmark) diff --git a/kernel_tuner/observers/nvml.py b/kernel_tuner/observers/nvml.py index 18945352a..0fd812a34 100644 --- a/kernel_tuner/observers/nvml.py +++ b/kernel_tuner/observers/nvml.py @@ -15,9 +15,7 @@ class nvml: """Class that gathers the NVML functionality for one device.""" - def __init__( - self, device_id=0, nvidia_smi_fallback="nvidia-smi", use_locked_clocks=False - ): + def __init__(self, device_id=0, nvidia_smi_fallback="nvidia-smi", use_locked_clocks=False): """Create object to control device using NVML.""" pynvml.nvmlInit() 
self.dev = pynvml.nvmlDeviceGetHandleByIndex(device_id) @@ -26,9 +24,7 @@ def __init__( try: self.pwr_limit_default = pynvml.nvmlDeviceGetPowerManagementLimit(self.dev) - self.pwr_constraints = pynvml.nvmlDeviceGetPowerManagementLimitConstraints( - self.dev - ) + self.pwr_constraints = pynvml.nvmlDeviceGetPowerManagementLimitConstraints(self.dev) except pynvml.NVMLError_NotSupported: self.pwr_limit_default = None # inverted range to make all range checks fail @@ -52,9 +48,7 @@ def __init__( self.gr_clock_default = pynvml.nvmlDeviceGetDefaultApplicationsClock( self.dev, pynvml.NVML_CLOCK_GRAPHICS ) - self.mem_clock_default = pynvml.nvmlDeviceGetDefaultApplicationsClock( - self.dev, pynvml.NVML_CLOCK_MEM - ) + self.mem_clock_default = pynvml.nvmlDeviceGetDefaultApplicationsClock(self.dev, pynvml.NVML_CLOCK_MEM) except pynvml.NVMLError_NotSupported: self.gr_clock_default = None self.sm_clock_default = None @@ -67,9 +61,7 @@ def __init__( # gather the supported gr clocks for each supported mem clock into a dict self.supported_gr_clocks = {} for mem_clock in self.supported_mem_clocks: - supported_gr_clocks = pynvml.nvmlDeviceGetSupportedGraphicsClocks( - self.dev, mem_clock - ) + supported_gr_clocks = pynvml.nvmlDeviceGetSupportedGraphicsClocks(self.dev, mem_clock) self.supported_gr_clocks[mem_clock] = supported_gr_clocks # test whether locked gr clocks and mem clocks are supported @@ -132,9 +124,9 @@ def persistence_mode(self): @persistence_mode.setter def persistence_mode(self, new_mode): if new_mode not in [0, 1]: - raise ValueError( - "Illegal value for persistence mode, should be either 0 or 1" - ) + raise ValueError("Illegal value for persistence mode, should be either 0 or 1") + if self.persistence_mode == new_mode: + return try: pynvml.nvmlDeviceSetPersistenceMode(self.dev, new_mode) self._persistence_mode = pynvml.nvmlDeviceGetPersistenceMode(self.dev) @@ -153,50 +145,37 @@ def set_clocks(self, mem_clock, gr_clock): if mem_clock not in 
self.supported_mem_clocks: raise ValueError("Illegal value for memory clock") if gr_clock not in self.supported_gr_clocks[mem_clock]: - raise ValueError(f"Graphics clock incompatible with memory clock ({mem_clock}), compatible graphics clocks: {self.supported_gr_clocks[mem_clock]}") - self.modified_clocks = True + raise ValueError( + f"Graphics clock incompatible with memory clock ({mem_clock}), compatible graphics clocks: {self.supported_gr_clocks[mem_clock]}" + ) + + # Check whether persistence mode is set. Without persistence mode, setting the clocks is not meaningful + # I deliberately removed the try..except clause here, if setting persistence mode fails, setting the clocks should fail + if self.persistence_mode != 1: + self.persistence_mode = 1 + if self.use_locked_clocks: - if self.persistence_mode != 1: - self.persistence_mode = 1 try: pynvml.nvmlDeviceSetGpuLockedClocks(self.dev, gr_clock, gr_clock) pynvml.nvmlDeviceSetMemoryLockedClocks(self.dev, mem_clock, mem_clock) except pynvml.NVMLError_NoPermission: if self.nvidia_smi: - args = [ - "sudo", - self.nvidia_smi, - "-i", - str(self.id), - "--lock-gpu-clocks=" + str(gr_clock) + "," + str(gr_clock), - ] - subprocess.run(args, check=True) - args = [ - "sudo", - self.nvidia_smi, - "-i", - str(self.id), - "--lock-memory-clocks=" + str(mem_clock) + "," + str(mem_clock), - ] - subprocess.run(args, check=True) + args = ["sudo", self.nvidia_smi, "-i", str(self.id)] + command_set_mem_clocks = f"--lock-memory-clocks={str(mem_clock)},{str(mem_clock)}" + command_set_gpu_clocks = f"--lock-gpu-clocks={str(gr_clock)},{str(gr_clock)}" + subprocess.run(args + [command_set_gpu_clocks], check=True) + subprocess.run(args + [command_set_mem_clocks], check=True) else: - try: - if self.persistence_mode != 0: - self.persistence_mode = 0 - except Exception: - pass try: pynvml.nvmlDeviceSetApplicationsClocks(self.dev, mem_clock, gr_clock) except pynvml.NVMLError_NoPermission: if self.nvidia_smi: - args = [ - "sudo", - 
self.nvidia_smi, - "-i", - str(self.id), - "--applications-clocks=" + str(mem_clock) + "," + str(gr_clock), - ] - subprocess.run(args, check=True) + args = ["sudo", self.nvidia_smi, "-i", str(self.id)] + command_set_clocks = f"--applications-clocks={str(mem_clock)},{str(gr_clock)}" + subprocess.run(args + [command_set_clocks], check=True) + + # Store the fact that we have modified the clocks + self.modified_clocks = True def reset_clocks(self): """Reset the clocks to the default clock if the device uses a non default clock.""" @@ -224,33 +203,22 @@ def reset_clocks(self): subprocess.run(args, check=True) elif self.gr_clock_default is not None: - gr_app_clock = pynvml.nvmlDeviceGetApplicationsClock( - self.dev, pynvml.NVML_CLOCK_GRAPHICS - ) - mem_app_clock = pynvml.nvmlDeviceGetApplicationsClock( - self.dev, pynvml.NVML_CLOCK_MEM - ) - if ( - gr_app_clock != self.gr_clock_default - or mem_app_clock != self.mem_clock_default - ): + gr_app_clock = pynvml.nvmlDeviceGetApplicationsClock(self.dev, pynvml.NVML_CLOCK_GRAPHICS) + mem_app_clock = pynvml.nvmlDeviceGetApplicationsClock(self.dev, pynvml.NVML_CLOCK_MEM) + if gr_app_clock != self.gr_clock_default or mem_app_clock != self.mem_clock_default: self.set_clocks(self.mem_clock_default, self.gr_clock_default) @property def gr_clock(self): """Control the graphics clock (may require permission), only values compatible with the memory clock can be set directly.""" - return pynvml.nvmlDeviceGetClockInfo(self.dev, pynvml.NVML_CLOCK_GRAPHICS) + if self.use_locked_clocks: + return pynvml.nvmlDeviceGetClockInfo(self.dev, pynvml.NVML_CLOCK_GRAPHICS) + else: + return pynvml.nvmlDeviceGetApplicationsClock(self.dev, pynvml.NVML_CLOCK_GRAPHICS) @gr_clock.setter def gr_clock(self, new_clock): - cur_clock = ( - pynvml.nvmlDeviceGetClockInfo(self.dev, pynvml.NVML_CLOCK_GRAPHICS) - if self.use_locked_clocks - else pynvml.nvmlDeviceGetApplicationsClock( - self.dev, pynvml.NVML_CLOCK_GRAPHICS - ) - ) - if new_clock != cur_clock: + if 
new_clock != self.gr_clock: self.set_clocks(self.mem_clock, new_clock) @property @@ -262,18 +230,11 @@ def mem_clock(self): mem_clock = pynvml.nvmlDeviceGetClockInfo(self.dev, pynvml.NVML_CLOCK_MEM) return min(self.supported_mem_clocks, key=lambda x: abs(x - mem_clock)) else: - return pynvml.nvmlDeviceGetApplicationsClock( - self.dev, pynvml.NVML_CLOCK_MEM - ) + return pynvml.nvmlDeviceGetApplicationsClock(self.dev, pynvml.NVML_CLOCK_MEM) @mem_clock.setter def mem_clock(self, new_clock): - cur_clock = ( - pynvml.nvmlDeviceGetClockInfo(self.dev, pynvml.NVML_CLOCK_MEM) - if self.use_locked_clocks - else pynvml.nvmlDeviceGetApplicationsClock(self.dev, pynvml.NVML_CLOCK_MEM) - ) - if new_clock != cur_clock: + if new_clock != self.mem_clock: self.set_clocks(new_clock, self.gr_clock) @property @@ -290,9 +251,7 @@ def auto_boost(self): def auto_boost(self, setting): # might need to use pynvml.NVML_FEATURE_DISABLED or pynvml.NVML_FEATURE_ENABLED instead of 0 or 1 if setting not in [0, 1]: - raise ValueError( - "Illegal value for auto boost enabled, should be either 0 or 1" - ) + raise ValueError("Illegal value for auto boost enabled, should be either 0 or 1") pynvml.nvmlDeviceSetAutoBoostedClocksEnabled(self.dev, setting) self._auto_boost = pynvml.nvmlDeviceGetAutoBoostedClocksEnabled(self.dev)[0] @@ -385,9 +344,7 @@ def __init__( if any([obs in self.needs_power for obs in observables]): self.measure_power = True power_observables = [obs for obs in observables if obs in self.needs_power] - self.continuous_observer = NVMLPowerObserver( - power_observables, self, self.nvml, continous_duration - ) + self.continuous_observer = NVMLPowerObserver(power_observables, self, self.nvml, continous_duration) # remove power observables self.observables = [obs for obs in observables if obs not in self.needs_power] @@ -402,11 +359,7 @@ def __init__( for obs in self.observables: self.results[obs + "s"] = [] - self.during_obs = [ - obs - for obs in observables - if obs in ["core_freq", 
"mem_freq", "temperature"] - ] + self.during_obs = [obs for obs in observables if obs in ["core_freq", "mem_freq", "temperature"]] self.iteration = {obs: [] for obs in self.during_obs} def before_start(self): @@ -428,15 +381,11 @@ def during(self): if "mem_freq" in self.observables: self.iteration["mem_freq"].append(self.nvml.mem_clock) if self.record_gr_voltage: - self.gr_voltage_readings.append( - [time.perf_counter() - self.t0, self.nvml.gr_voltage()] - ) + self.gr_voltage_readings.append([time.perf_counter() - self.t0, self.nvml.gr_voltage()]) def after_finish(self): if "temperature" in self.observables: - self.results["temperatures"].append( - np.average(self.iteration["temperature"]) - ) + self.results["temperatures"].append(np.average(self.iteration["temperature"])) if "core_freq" in self.observables: self.results["core_freqs"].append(np.average(self.iteration["core_freq"])) if "mem_freq" in self.observables: @@ -445,12 +394,8 @@ def after_finish(self): if "gr_voltage" in self.observables: execution_time = time.time() - self.t0 gr_voltage_readings = self.gr_voltage_readings - gr_voltage_readings = [ - [0.0, gr_voltage_readings[0][1]] - ] + gr_voltage_readings - gr_voltage_readings = gr_voltage_readings + [ - [execution_time, gr_voltage_readings[-1][1]] - ] + gr_voltage_readings = [[0.0, gr_voltage_readings[0][1]]] + gr_voltage_readings + gr_voltage_readings = gr_voltage_readings + [[execution_time, gr_voltage_readings[-1][1]]] # time in s, graphics voltage in millivolts self.results["gr_voltages"].append(np.average(gr_voltage_readings[:][:][1])) @@ -512,8 +457,7 @@ def during(self): timestamp = time.perf_counter() - self.t0 # only store the result if we get a new measurement from NVML if len(self.power_readings) == 0 or ( - self.power_readings[-1][1] != power_usage - or timestamp - self.power_readings[-1][0] > 0.01 + self.power_readings[-1][1] != power_usage or timestamp - self.power_readings[-1][0] > 0.01 ): self.power_readings.append([timestamp, 
power_usage]) @@ -560,9 +504,7 @@ def get_nvml_pwr_limits(device, n=None, quiet=False): n = int((power_limit_max - power_limit_min) / power_limit_round) + 1 # Rounded power limit values - power_limits = power_limit_round * np.round( - (np.linspace(power_limit_min, power_limit_max, n) / power_limit_round) - ) + power_limits = power_limit_round * np.round((np.linspace(power_limit_min, power_limit_max, n) / power_limit_round)) power_limits = sorted(list(set([int(power_limit) for power_limit in power_limits]))) tune_params["nvml_pwr_limit"] = power_limits diff --git a/kernel_tuner/observers/register.py b/kernel_tuner/observers/register.py new file mode 100644 index 000000000..ca0c0084d --- /dev/null +++ b/kernel_tuner/observers/register.py @@ -0,0 +1,14 @@ +from kernel_tuner.observers.observer import BenchmarkObserver + + +class RegisterObserver(BenchmarkObserver): + """Observer for counting the number of registers.""" + + def get_results(self): + try: + registers_per_thread = self.dev.num_regs + except AttributeError: + raise NotImplementedError( + f"Backend '{type(self.dev).__name__}' does not support count of registers per thread" + ) + return {"num_regs": registers_per_thread} diff --git a/kernel_tuner/runners/sequential.py b/kernel_tuner/runners/sequential.py index c493a0089..aeebd5116 100644 --- a/kernel_tuner/runners/sequential.py +++ b/kernel_tuner/runners/sequential.py @@ -100,7 +100,7 @@ def run(self, parameter_space, tuning_options): params = process_metrics(params, tuning_options.metrics) # get the framework time by estimating based on other times - total_time = 1000 * (perf_counter() - self.start_time) - warmup_time + total_time = 1000 * ((perf_counter() - self.start_time) - warmup_time) params['strategy_time'] = self.last_strategy_time params['framework_time'] = max(total_time - (params['compile_time'] + params['verification_time'] + params['benchmark_time'] + params['strategy_time']), 0) params['timestamp'] = str(datetime.now(timezone.utc)) diff --git 
a/kernel_tuner/util.py b/kernel_tuner/util.py index 6e9cdf5b0..64d5a618b 100644 --- a/kernel_tuner/util.py +++ b/kernel_tuner/util.py @@ -570,6 +570,13 @@ def get_total_timings(results, env, overhead_time): return env +NVRTC_VALID_CC = np.array(['50', '52', '53', '60', '61', '62', '70', '72', '75', '80', '87', '89', '90', '90a']) + +def to_valid_nvrtc_gpu_arch_cc(compute_capability: str) -> str: + """Returns a valid Compute Capability for NVRTC `--gpu-architecture=`, as per https://docs.nvidia.com/cuda/nvrtc/index.html#group__options.""" + return max(NVRTC_VALID_CC[NVRTC_VALID_CC<=compute_capability], default='52') + + def print_config(config, tuning_options, runner): """Print the configuration string with tunable parameters and benchmark results.""" print_config_output(tuning_options.tune_params, config, runner.quiet, tuning_options.metrics, runner.units) diff --git a/test/test_observers.py b/test/test_observers.py index d881fed74..c1cc460a9 100644 --- a/test/test_observers.py +++ b/test/test_observers.py @@ -1,11 +1,14 @@ - - import kernel_tuner from kernel_tuner.observers.nvml import NVMLObserver +from kernel_tuner.observers.register import RegisterObserver from kernel_tuner.observers.observer import BenchmarkObserver -from .context import skip_if_no_pycuda, skip_if_no_pynvml +from .context import skip_if_no_pycuda, skip_if_no_pynvml, skip_if_no_cupy, skip_if_no_cuda, skip_if_no_opencl, skip_if_no_pyhip from .test_runners import env # noqa: F401 +from .test_opencl_functions import env as env_opencl # noqa: F401 +from .test_hip_functions import env as env_hip # noqa: F401 + +from pytest import raises @skip_if_no_pycuda @@ -20,7 +23,6 @@ def test_nvml_observer(env): assert "temperature" in result[0] assert result[0]["temperature"] > 0 - @skip_if_no_pycuda def test_custom_observer(env): env[-1]["block_size_x"] = [128] @@ -34,3 +36,34 @@ def get_results(self): assert "name" in result[0] assert len(result[0]["name"]) > 0 +@skip_if_no_pycuda +def 
test_register_observer_pycuda(env): + result, _ = kernel_tuner.tune_kernel(*env, observers=[RegisterObserver()], lang='CUDA') + assert "num_regs" in result[0] + assert result[0]["num_regs"] > 0 + +@skip_if_no_cupy +def test_register_observer_cupy(env): + result, _ = kernel_tuner.tune_kernel(*env, observers=[RegisterObserver()], lang='CuPy') + assert "num_regs" in result[0] + assert result[0]["num_regs"] > 0 + +@skip_if_no_cuda +def test_register_observer_nvcuda(env): + result, _ = kernel_tuner.tune_kernel(*env, observers=[RegisterObserver()], lang='NVCUDA') + assert "num_regs" in result[0] + assert result[0]["num_regs"] > 0 + +@skip_if_no_opencl +def test_register_observer_opencl(env_opencl): + with raises(NotImplementedError) as err: + kernel_tuner.tune_kernel(*env_opencl, observers=[RegisterObserver()], lang='OpenCL') + assert err.errisinstance(NotImplementedError) + assert "OpenCL" in str(err.value) + +@skip_if_no_pyhip +def test_register_observer_hip(env_hip): + with raises(NotImplementedError) as err: + kernel_tuner.tune_kernel(*env_hip, observers=[RegisterObserver()], lang='HIP') + assert err.errisinstance(NotImplementedError) + assert "Hip" in str(err.value) diff --git a/test/test_pycuda_mocked.py b/test/test_pycuda_mocked.py index 21f352a3f..6bdfeef07 100644 --- a/test/test_pycuda_mocked.py +++ b/test/test_pycuda_mocked.py @@ -13,7 +13,7 @@ def setup_mock(drv): context = Mock() devprops = {'MAX_THREADS_PER_BLOCK': 1024, 'COMPUTE_CAPABILITY_MAJOR': 5, - 'COMPUTE_CAPABILITY_MINOR': 5} + 'COMPUTE_CAPABILITY_MINOR': 5,} context.return_value.get_device.return_value.get_attributes.return_value = devprops context.return_value.get_device.return_value.compute_capability.return_value = "55" drv.Device.return_value.retain_primary_context.return_value = context() diff --git a/test/test_util_functions.py b/test/test_util_functions.py index c66964b0e..f3431991b 100644 --- a/test/test_util_functions.py +++ b/test/test_util_functions.py @@ -146,6 +146,16 @@ def 
test_get_thread_block_dimensions(): assert threads[2] == 1 +def test_to_valid_nvrtc_gpu_arch_cc(): + assert to_valid_nvrtc_gpu_arch_cc("89") == "89" + assert to_valid_nvrtc_gpu_arch_cc("88") == "87" + assert to_valid_nvrtc_gpu_arch_cc("86") == "80" + assert to_valid_nvrtc_gpu_arch_cc("40") == "52" + assert to_valid_nvrtc_gpu_arch_cc("90b") == "90a" + assert to_valid_nvrtc_gpu_arch_cc("91c") == "90a" + assert to_valid_nvrtc_gpu_arch_cc("1234") == "52" + + def test_prepare_kernel_string(): kernel = "this is a weird kernel" grid = (3, 7)