From 4daae027b78cd2bbf2a0c53e116fd9c5e10dad90 Mon Sep 17 00:00:00 2001 From: stijn Date: Wed, 8 Mar 2023 13:34:48 +0100 Subject: [PATCH 01/24] Add `TunablePrecision` object and example on how to use it --- examples/cuda/accuracy.py | 46 +++++++++++++++++++++ kernel_tuner/accuracy.py | 86 +++++++++++++++++++++++++++++++++++++++ kernel_tuner/core.py | 55 +++++++++++++++++++++---- 3 files changed, 180 insertions(+), 7 deletions(-) create mode 100644 examples/cuda/accuracy.py create mode 100644 kernel_tuner/accuracy.py diff --git a/examples/cuda/accuracy.py b/examples/cuda/accuracy.py new file mode 100644 index 000000000..59655ecad --- /dev/null +++ b/examples/cuda/accuracy.py @@ -0,0 +1,46 @@ +#!/usr/bin/env python + +import numpy +from pprint import pprint +from kernel_tuner import tune_kernel +from kernel_tuner.accuracy import TunablePrecision + +def tune(): + kernel_string = """ + #include + using half = __half; + + __global__ void vector_add(int n, float_type* left, float_type* right, float_type* output) { + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < n) { + output[i] = left[i] + right[i]; + } + } + """ + + size = 10000000 + + n = numpy.int32(size) + a = numpy.random.randn(size).astype(numpy.float64) + b = numpy.random.randn(size).astype(numpy.float64) + c = numpy.zeros_like(b) + + args = [ + n, + TunablePrecision("float_type", a), + TunablePrecision("float_type", b), + TunablePrecision("float_type", c), + ] + + tune_params = dict() + tune_params["block_size_x"] = [128+64*i for i in range(15)] + tune_params["float_type"] = ["float", "double", "half"] + + results, env = tune_kernel("vector_add", kernel_string, size, args, tune_params) + + pprint(results) + + +if __name__ == "__main__": + tune() diff --git a/kernel_tuner/accuracy.py b/kernel_tuner/accuracy.py new file mode 100644 index 000000000..4fae643fc --- /dev/null +++ b/kernel_tuner/accuracy.py @@ -0,0 +1,86 @@ +from collections import UserDict +from typing import Dict +import numpy as np + + +class Tunable(UserDict): + def __init__(self, param_key: str, arrays: Dict): + """Create a new ``Tunable``. + + ``Tunable`` can be used one of the input arguments when tuning kernels. It can contain + several arrays and the array that will be used during benchmark of one kernel configuration + can be selected based on a tunable parameter. + + Example + ------- + For example, it is possible to define a tunable parameter called ``matrix_layout`` and then + tunable for Fortran-order or C-order memory layout by passing the following object as a + kernel argument: + + ``` + Tunable("matrix_layout", dict("c"=matrix, "f"=matrix.transpose())) + ``` + + :param param_key: The tunable parameter used to select the array for benchmarking. + :param arrays: A dictionary that maps the parameter value to arrays. 
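+
+        A list or tuple may also be passed; in that case the positional indices
+        are used as the parameter values (mirroring the ``dict(enumerate(arrays))``
+        conversion below), for example
+        ``Tunable("matrix_layout", [matrix, matrix.transpose()])``.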
+ """ + if isinstance(arrays, (tuple, list)): + arrays = dict(enumerate(arrays)) + + super().__init__(arrays) + self.param_key = param_key + + def select_for_configuration(self, params): + if callable(self.param_key): + key = self.param_key(params) + elif self.param_key in params: + key = params[self.param_key] + else: + key = eval(self.param_key, params, params) + + if key not in self: + list = ", ".join(map(str, self.keys())) + raise KeyError(f"'{key}' is not a valid parameter value, should be one of: {list}") + + return self[key] + + def __call__(self, params): + return self.select_for_configuration(params) + + +class TunablePrecision(Tunable): + def __init__(self, param_key: str, array: np.ndarray, dtypes: Dict[str, np.dtype] = None): + """ + Create a new ``TunablePrecision``. + + ``TunablePrecision`` can be used one of the input arguments when tuning kernels. It + contains the same array data, but stored using different levels of precision. This can + be used to tune the optimal precision for a kernel argument. + + :param param_key: The tunable parameter used to select the precision for benchmarking. + :param array: The input array. Will be converted to the given precision levels. + :param dtypes: Dictionary that maps names to numpy data types. + """ + # If no dtypes are given, generate a default list + if not dtypes: + dtypes = dict( + half=np.half, + float=np.single, + double=np.double) + + # Try to get bfloat16 from tensorflow if available. + try: + import tensorflow + dtypes["bfloat16"] = tensorflow.bfloat16.as_numpy_dtype + except ImportError: + pass # Ignore error if tensorflow is not available + + # If dtype is a list, convert it to a dictionary + if isinstance(dtypes, (list, tuple)): + dtypes = dict((name, np.dtype(name)) for name in dtypes) + + arrays = dict() + for precision, dtype in dtypes.items(): + arrays[precision] = np.array(array).astype(dtype) + + super().__init__(param_key, arrays) diff --git a/kernel_tuner/core.py b/kernel_tuner/core.py index 1eeeb0d8d..7194456ba 100644 --- a/kernel_tuner/core.py +++ b/kernel_tuner/core.py @@ -6,6 +6,8 @@ import re import numpy as np +from kernel_tuner.accuracy import Tunable + try: import cupy as cp except ImportError: @@ -239,7 +241,7 @@ def __init__(self, kernel_source, device=0, platform=0, quiet=False, compiler=No elif lang.upper() == "OPENCL": dev = OpenCLFunctions(device, platform, compiler_options=compiler_options, iterations=iterations, observers=observers) elif lang.upper() in ["C", "FORTRAN"]: - dev = CFunctions(compiler=compiler, compiler_options=compiler_options, iterations=iterations) + dev = CFunctions(compiler=compiler, compiler_options=compiler_options, iterations=iterations, observers=observers) elif lang.upper() == "HIP": dev = HipFunctions(device, compiler_options=compiler_options, iterations=iterations, observers=observers) else: @@ -409,24 +411,40 @@ def check_kernel_output(self, func, gpu_args, instance, answer, atol, verify, ve if not correct: raise RuntimeError("Kernel result verification failed for: " + util.get_config_string(instance.params)) - def compile_and_benchmark(self, kernel_source, gpu_args, params, kernel_options, to): - """ Compile and benchmark a kernel instance based on kernel strings and parameters """ - instance_string = util.get_instance_string(params) + def preprocess_gpu_arguments(self, old_arguments, params): + """ Get a flat list of arguments based on the configuration given by `params` """ + new_arguments = [] + + for argument in old_arguments: + if isinstance(argument, Tunable): + 
new_arguments.append(argument.select_for_configuration(params)) + else: + new_arguments.append(argument) + + return new_arguments + def compile_and_benchmark(self, kernel_source, gpu_args, params, kernel_options, to): # reset previous timers last_compilation_time = None last_verification_time = None last_benchmark_time = None - logging.debug('compile_and_benchmark ' + instance_string) - verbose = to.verbose result = {} + # Compile and benchmark a kernel instance based on kernel strings and parameters + instance_string = util.get_instance_string(params) + + logging.debug('compile_and_benchmark ' + instance_string) + instance = self.create_kernel_instance(kernel_source, kernel_options, params, verbose) if isinstance(instance, util.ErrorConfig): result[to.objective] = util.InvalidConfig() else: + + # Preprocess the argument list. This is required to deal with `MixedPrecisionArray`s + gpu_args = self.preprocess_gpu_arguments(gpu_args, params) + try: # compile the kernel start_compilation = time.perf_counter() @@ -554,7 +572,30 @@ def memcpy_dtoh(self, dest, src): def ready_argument_list(self, arguments): """ready argument list to be passed to the kernel, allocates gpu mem if necessary""" - return self.dev.ready_argument_list(arguments) + flat_args = [] + + # Flatten all arguments into a single list. Required to deal with `MixedPrecisionArray`s + for argument in arguments: + if isinstance(argument, Tunable): + flat_args.extend(argument.values()) + else: + flat_args.append(argument) + + flag_gpu_args = iter(self.dev.ready_argument_list(flat_args)) + + # Unflatten the arguments back into arrays. + gpu_args = [] + for argument in arguments: + if isinstance(argument, Tunable): + arrays = dict() + for key in argument: + arrays[key] = next(flag_gpu_args) + + gpu_args.append(Tunable(argument.param_key, arrays)) + else: + gpu_args.append(next(flag_gpu_args)) + + return gpu_args def run_kernel(self, func, gpu_args, instance): """ Run a compiled kernel instance on a device """ From ee93e2a09af8a9d5002fd8d0a6aa7e6ac39dbc34 Mon Sep 17 00:00:00 2001 From: stijn Date: Wed, 15 Mar 2023 15:13:19 +0100 Subject: [PATCH 02/24] Add initial support for `AccuracyObserver` --- examples/cuda/accuracy.py | 32 +++++++++++++++++-- kernel_tuner/accuracy.py | 30 ++++++++++++++++-- kernel_tuner/core.py | 65 ++++++++++++++++++++++++++++----------- kernel_tuner/util.py | 7 +++++ 4 files changed, 111 insertions(+), 23 deletions(-) diff --git a/examples/cuda/accuracy.py b/examples/cuda/accuracy.py index 59655ecad..87485493f 100644 --- a/examples/cuda/accuracy.py +++ b/examples/cuda/accuracy.py @@ -4,13 +4,27 @@ from pprint import pprint from kernel_tuner import tune_kernel from kernel_tuner.accuracy import TunablePrecision +from kernel_tuner.observers import AccuracyObserver + + +class MyObserver(AccuracyObserver): + def __init__(self): + self.error = None + + def process_kernel_output(self, answer, outputs): + self.error = numpy.average((answer[-1] - outputs[-1].astype(numpy.float64))**2) + + def get_results(self): + return dict(error=self.error) + def tune(): kernel_string = """ #include using half = __half; - __global__ void vector_add(int n, float_type* left, float_type* right, float_type* output) { + template + __global__ void vector_add(int n, const T* left, const T* right, T* output) { int i = blockDim.x * blockIdx.x + threadIdx.x; if (i < n) { @@ -19,7 +33,7 @@ def tune(): } """ - size = 10000000 + size = 100000000 n = numpy.int32(size) a = numpy.random.randn(size).astype(numpy.float64) @@ -33,11 +47,23 @@ def tune(): 
TunablePrecision("float_type", c), ] + answer = [None, None, None, a + b] + tune_params = dict() tune_params["block_size_x"] = [128+64*i for i in range(15)] tune_params["float_type"] = ["float", "double", "half"] - results, env = tune_kernel("vector_add", kernel_string, size, args, tune_params) + observers = [MyObserver()] + + results, env = tune_kernel( + "vector_add", + kernel_string, + size, + args, + tune_params, + answer=answer, + observers=observers, + lang="cupy") pprint(results) diff --git a/kernel_tuner/accuracy.py b/kernel_tuner/accuracy.py index 4fae643fc..711902689 100644 --- a/kernel_tuner/accuracy.py +++ b/kernel_tuner/accuracy.py @@ -2,6 +2,8 @@ from typing import Dict import numpy as np +from kernel_tuner.observers import AccuracyObserver + class Tunable(UserDict): def __init__(self, param_key: str, arrays: Dict): @@ -70,8 +72,9 @@ def __init__(self, param_key: str, array: np.ndarray, dtypes: Dict[str, np.dtype # Try to get bfloat16 from tensorflow if available. try: - import tensorflow - dtypes["bfloat16"] = tensorflow.bfloat16.as_numpy_dtype + #import tensorflow + #dtypes["bfloat16"] = tensorflow.bfloat16.as_numpy_dtype + pass except ImportError: pass # Ignore error if tensorflow is not available @@ -84,3 +87,26 @@ def __init__(self, param_key: str, array: np.ndarray, dtypes: Dict[str, np.dtype arrays[precision] = np.array(array).astype(dtype) super().__init__(param_key, arrays) + + +class CalculateErrorObserver(AccuracyObserver): + def __init__(self, metric=None, key="error"): + # The default metric is the mean squared error + if metric is None: + metric = lambda a, b: np.average(np.square(a - b)) + + self.key = key + self.function = metric + self.result = None + + def process_kernel_output(self, answers, outputs): + errors = [] + + for answer, output in zip(answers, outputs): + if answer is not None: + errors.append(self.metric(answer, output)) + + self.result = max(errors) + + def get_results(self): + return dict([(self.key, self.result)]) diff --git a/kernel_tuner/core.py b/kernel_tuner/core.py index 7194456ba..128d88eb7 100644 --- a/kernel_tuner/core.py +++ b/kernel_tuner/core.py @@ -14,7 +14,7 @@ cp = np from kernel_tuner.observers.nvml import NVMLObserver -from kernel_tuner.observers.observer import ContinuousObserver +from kernel_tuner.observers.observer import ContinuousObserver, AccuracyObserver from kernel_tuner.backends.cupy import CupyFunctions from kernel_tuner.backends.pycuda import PyCudaFunctions from kernel_tuner.backends.nvcuda import CudaFunctions @@ -250,6 +250,7 @@ def __init__(self, kernel_source, device=0, platform=0, quiet=False, compiler=No #look for NVMLObserver in observers, if present, enable special tunable parameters through nvml self.use_nvml = False self.continuous_observers = [] + self.accuracy_observers = [] if observers: for obs in observers: if isinstance(obs, NVMLObserver): @@ -257,6 +258,9 @@ def __init__(self, kernel_source, device=0, platform=0, quiet=False, compiler=No self.use_nvml = True if hasattr(obs, "continuous_observer"): self.continuous_observers.append(obs.continuous_observer) + if isinstance(obs, AccuracyObserver): + self.accuracy_observers.append(obs) + self.iterations = iterations @@ -337,6 +341,10 @@ def benchmark(self, func, gpu_args, instance, verbose, objective): if "nvml_mem_clock" in instance.params: self.nvml.mem_clock = instance.params["nvml_mem_clock"] + # Call the observers to register the configuration to be benchmarked + for obs in self.dev.observers: + obs.register_configuration(instance.params) + result = 
{} try: self.benchmark_default(func, gpu_args, instance.threads, instance.grid, result) @@ -372,13 +380,18 @@ def check_kernel_output(self, func, gpu_args, instance, answer, atol, verify, ve logging.debug('check_kernel_output') #if not using custom verify function, check if the length is the same - if not verify and len(instance.arguments) != len(answer): - raise TypeError("The length of argument list and provided results do not match.") + if answer: + if len(instance.arguments) != len(answer): + raise TypeError("The length of argument list and provided results do not match.") + + should_sync = [answer[i] is not None for i, arg in enumerate(instance.arguments)] + else: + should_sync = [isinstance(arg, (np.ndarray, cp.ndarray, torch.Tensor)) for arg in instance.arguments] #re-copy original contents of output arguments to GPU memory, to overwrite any changes #by earlier kernel runs for i, arg in enumerate(instance.arguments): - if (verify or answer[i] is not None) and isinstance(arg, (np.ndarray, cp.ndarray, torch.Tensor)): + if should_sync[i]: self.dev.memcpy_htod(gpu_args[i], arg) #run the kernel @@ -389,24 +402,38 @@ def check_kernel_output(self, func, gpu_args, instance, answer, atol, verify, ve #retrieve gpu results to host memory result_host = [] for i, arg in enumerate(instance.arguments): - if (verify or answer[i] is not None) and isinstance(arg, (np.ndarray, cp.ndarray)): - result_host.append(np.zeros_like(arg)) - self.dev.memcpy_dtoh(result_host[-1], gpu_args[i]) - elif isinstance(arg, torch.Tensor) and isinstance(answer[i], torch.Tensor): - if not answer[i].is_cuda: - #if the answer is on the host, copy gpu output to host as well - result_host.append(torch.zeros_like(answer[i])) - self.dev.memcpy_dtoh(result_host[-1], gpu_args[i].tensor) + if should_sync[i]: + if isinstance(arg, (np.ndarray, cp.ndarray)): + result_host.append(np.zeros_like(arg)) + self.dev.memcpy_dtoh(result_host[-1], gpu_args[i]) + elif isinstance(arg, torch.Tensor) and isinstance(answer[i], torch.Tensor): + if not answer[i].is_cuda: + #if the answer is on the host, copy gpu output to host as well + result_host.append(torch.zeros_like(answer[i])) + self.dev.memcpy_dtoh(result_host[-1], gpu_args[i].tensor) + else: + result_host.append(gpu_args[i].tensor) else: - result_host.append(gpu_args[i].tensor) + # We should sync this argument, but we do not know how to transfer this type of argument + # What do we do? Should we throw an error? + result_host.append(None) else: result_host.append(None) - #if the user has specified a custom verify function, then call it, else use default based on numpy allclose + # Call the accuracy observers + for obs in self.accuracy_observers: + obs.process_kernel_output(answer, result_host) + + # There are three scenarios: + # - if there is a custom verify function, call that. + # - otherwise, if there are no accuracy observer, call the default verify function + # - otherwise, the answer is correct (we assume the accuracy observers verified the output) if verify: correct = verify(answer, result_host, atol=atol) - else: + elif not self.accuracy_observers: correct = _default_verify_function(instance, answer, result_host, atol, verbose) + else: + correct = True if not correct: raise RuntimeError("Kernel result verification failed for: " + util.get_config_string(instance.params)) @@ -444,7 +471,6 @@ def compile_and_benchmark(self, kernel_source, gpu_args, params, kernel_options, # Preprocess the argument list. 
This is required to deal with `MixedPrecisionArray`s gpu_args = self.preprocess_gpu_arguments(gpu_args, params) - try: # compile the kernel start_compilation = time.perf_counter() @@ -466,7 +492,7 @@ def compile_and_benchmark(self, kernel_source, gpu_args, params, kernel_options, last_compilation_time = 1000 * (time.perf_counter() - start_compilation) # test kernel for correctness - if func and (to.answer or to.verify): + if func and (to.answer or to.verify or self.accuracy_observers): start_verification = time.perf_counter() self.check_kernel_output(func, gpu_args, instance, to.answer, to.atol, to.verify, verbose) last_verification_time = 1000 * (time.perf_counter() - start_verification) @@ -559,8 +585,11 @@ def create_kernel_instance(self, kernel_source, kernel_options, params, verbose) if kernel_source.lang in ["CUDA", "NVCUDA"] and "<" in name and ">" in name: kernel_string, name = wrap_templated_kernel(kernel_string, name) + # Preprocess GPU arguments. Require for handling `Tunable` arguments + arguments = self.preprocess_gpu_arguments(kernel_options.arguments, params) + #collect everything we know about this instance and return it - return KernelInstance(name, kernel_source, kernel_string, temp_files, threads, grid, params, kernel_options.arguments) + return KernelInstance(name, kernel_source, kernel_string, temp_files, threads, grid, params, arguments) def get_environment(self): """Return dictionary with information about the environment""" diff --git a/kernel_tuner/util.py b/kernel_tuner/util.py index 0e3dee4d6..cadc8ee14 100644 --- a/kernel_tuner/util.py +++ b/kernel_tuner/util.py @@ -14,6 +14,9 @@ import numpy as np from constraint import Constraint, AllDifferentConstraint, AllEqualConstraint, MaxSumConstraint, ExactSumConstraint, MinSumConstraint, InSetConstraint, NotInSetConstraint, SomeInSetConstraint, SomeNotInSetConstraint, FunctionConstraint + +from kernel_tuner.accuracy import Tunable + try: import cupy as cp except ImportError: @@ -124,6 +127,10 @@ def check_argument_list(kernel_name, kernel_string, args): for (i, arg) in enumerate(args): kernel_argument = arguments[i] + # Fix to deal with tunable arguments + if isinstance(arg, Tunable): + continue + if not isinstance(arg, (np.ndarray, np.generic, cp.ndarray, torch.Tensor)): raise TypeError("Argument at position " + str(i) + " of type: " + str(type(arg)) + " should be of type np.ndarray or numpy scalar") From 2b99fc7abe3dcaa85085d1139dd9c8cd9951dab6 Mon Sep 17 00:00:00 2001 From: stijn Date: Wed, 29 Mar 2023 13:38:53 +0200 Subject: [PATCH 03/24] Extend documentation in `accuracy.py` --- kernel_tuner/accuracy.py | 87 ++++++++++++++++++++++++++-------------- 1 file changed, 57 insertions(+), 30 deletions(-) diff --git a/kernel_tuner/accuracy.py b/kernel_tuner/accuracy.py index 711902689..e78797809 100644 --- a/kernel_tuner/accuracy.py +++ b/kernel_tuner/accuracy.py @@ -7,23 +7,24 @@ class Tunable(UserDict): def __init__(self, param_key: str, arrays: Dict): - """Create a new ``Tunable``. - - ``Tunable`` can be used one of the input arguments when tuning kernels. It can contain - several arrays and the array that will be used during benchmark of one kernel configuration - can be selected based on a tunable parameter. + """The ``Tunable`` object is used as an input argument when tuning + kernels. It is a container that holds several arrays internally and + selects one array during benchmarking based on a tunable parameter. 
Example ------- - For example, it is possible to define a tunable parameter called ``matrix_layout`` and then - tunable for Fortran-order or C-order memory layout by passing the following object as a - kernel argument: + Consider this example:: + + arg = Tunable("matrix_layout", dict("c"=matrix, "f"=matrix.transpose())) - ``` - Tunable("matrix_layout", dict("c"=matrix, "f"=matrix.transpose())) - ``` + In this example, we create a Tunable object that selects either matrix + or matrix.transpose() for benchmarking, depending on the value of the + tunable parameter "matrix_layout". The arrays argument is a dictionary + that maps the tunable parameter values "c" and "f" to the arrays matrix + and matrix.transpose(), respectively. During benchmarking, the Tunable + object selects the appropriate array based on the value of "matrix_layout". - :param param_key: The tunable parameter used to select the array for benchmarking. + :param param_key: : The tunable parameter used to select the array for benchmarking. :param arrays: A dictionary that maps the parameter value to arrays. """ if isinstance(arrays, (tuple, list)): @@ -42,7 +43,9 @@ def select_for_configuration(self, params): if key not in self: list = ", ".join(map(str, self.keys())) - raise KeyError(f"'{key}' is not a valid parameter value, should be one of: {list}") + raise KeyError( + f"'{key}' is not a valid parameter value, should be one of: {list}" + ) return self[key] @@ -51,29 +54,39 @@ def __call__(self, params): class TunablePrecision(Tunable): - def __init__(self, param_key: str, array: np.ndarray, dtypes: Dict[str, np.dtype] = None): - """ - Create a new ``TunablePrecision``. + def __init__( + self, param_key: str, array: np.ndarray, dtypes: Dict[str, np.dtype] = None + ): + """The ``Tunable`` object is used as an input argument when tuning + kernels. It is a container that internally holds several arrays + containing the same data, but stored in using different levels of + precision. During benchamrking, one array is selected based on a + tunable parameter ``param_key``. + + Example + ------- + Consider this example:: - ``TunablePrecision`` can be used one of the input arguments when tuning kernels. It - contains the same array data, but stored using different levels of precision. This can - be used to tune the optimal precision for a kernel argument. + arg = TunablePrecision("matrix_type", matrix) - :param param_key: The tunable parameter used to select the precision for benchmarking. - :param array: The input array. Will be converted to the given precision levels. - :param dtypes: Dictionary that maps names to numpy data types. + This creates a ``TunablePrecision`` argument that selects the required + floating-point precision for ``matrix`` based on the tunable parameter + ``"matrix_type"``. + + :param param_key: The tunable parameter used to select the level of precision. + :param array: The input array. It will automatically be converted to + all data types given by ``dtypes``. + :param dtypes: Dictionary that maps names to numpy data types. The default + types are ``double``, ``float``, and ``half``. """ # If no dtypes are given, generate a default list if not dtypes: - dtypes = dict( - half=np.half, - float=np.single, - double=np.double) + dtypes = dict(half=np.half, float=np.single, double=np.double) # Try to get bfloat16 from tensorflow if available. 
try: - #import tensorflow - #dtypes["bfloat16"] = tensorflow.bfloat16.as_numpy_dtype + # import tensorflow + # dtypes["bfloat16"] = tensorflow.bfloat16.as_numpy_dtype pass except ImportError: pass # Ignore error if tensorflow is not available @@ -89,14 +102,28 @@ def __init__(self, param_key: str, array: np.ndarray, dtypes: Dict[str, np.dtype super().__init__(param_key, arrays) -class CalculateErrorObserver(AccuracyObserver): +class ErrorMetricObserver(AccuracyObserver): + """An ``AccuracyObserver`` that measure the error of the outputs produced + by a kernel by comparing it against reference outputs. + + By default, it uses the mean-squared error (MSE) and appends this to + the results with a metric called ``error``. + """ + def __init__(self, metric=None, key="error"): + """Create a new ``AccuracyObserver``. + + :param metric: The error metric. Should be function that accepts two numpy + arrays as arguments (the reference output and the kernel output) + :param key: The name of this metric in the results. + """ + # The default metric is the mean squared error if metric is None: metric = lambda a, b: np.average(np.square(a - b)) self.key = key - self.function = metric + self.metric = metric self.result = None def process_kernel_output(self, answers, outputs): From 777a2ff8fcca732685382f9f10c2a60f8e026c03 Mon Sep 17 00:00:00 2001 From: stijn Date: Wed, 5 Apr 2023 15:57:57 +0200 Subject: [PATCH 04/24] Use `bfloat16` from `bfloat16` package instead of tensorflow --- kernel_tuner/accuracy.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/kernel_tuner/accuracy.py b/kernel_tuner/accuracy.py index e78797809..a0a04a123 100644 --- a/kernel_tuner/accuracy.py +++ b/kernel_tuner/accuracy.py @@ -83,10 +83,10 @@ def __init__( if not dtypes: dtypes = dict(half=np.half, float=np.single, double=np.double) - # Try to get bfloat16 from tensorflow if available. + # Try to get bfloat16 if available. 
try: - # import tensorflow - # dtypes["bfloat16"] = tensorflow.bfloat16.as_numpy_dtype + from bfloat16 import bfloat16 + dtypes["bfloat16"] = bfloat16 pass except ImportError: pass # Ignore error if tensorflow is not available From 95c2d99a1ed70f42ee2719149c7ff8c4ee6849fc Mon Sep 17 00:00:00 2001 From: stijn Date: Wed, 19 Apr 2023 10:44:33 +0200 Subject: [PATCH 05/24] Make `SequentialRunner` always compute metrics for every configuration --- kernel_tuner/runners/sequential.py | 15 ++++++++------- kernel_tuner/util.py | 7 +++---- 2 files changed, 11 insertions(+), 11 deletions(-) diff --git a/kernel_tuner/runners/sequential.py b/kernel_tuner/runners/sequential.py index 99f2ac972..5c9d438f3 100644 --- a/kernel_tuner/runners/sequential.py +++ b/kernel_tuner/runners/sequential.py @@ -92,17 +92,14 @@ def run(self, parameter_space, tuning_options): warmup_time = 1e3 * (perf_counter() - warmup_time) result = self.dev.compile_and_benchmark(self.kernel_source, self.gpu_args, params, self.kernel_options, tuning_options) - params.update(result) - # only compute metrics on configs that have not errored - if tuning_options.objective in result and isinstance(result[tuning_options.objective], ErrorConfig): + if isinstance(result.get(tuning_options.objective), ErrorConfig): logging.debug('kernel configuration was skipped silently due to compile or runtime failure') - elif tuning_options.metrics: - params = process_metrics(params, tuning_options.metrics) - # print configuration to the console - print_config_output(tuning_options.tune_params, params, self.quiet, tuning_options.metrics, self.units) + # only compute metrics on configs that have not errored + if not isinstance(params.get(tuning_options.objective), ErrorConfig): + params = process_metrics(params, tuning_options.metrics) # get the framework time by estimating based on other times total_time = 1000 * (perf_counter() - self.start_time) - warmup_time @@ -112,6 +109,10 @@ def run(self, parameter_space, tuning_options): self.start_time = perf_counter() if result: + # print configuration to the console + print_config_output(tuning_options.tune_params, params, self.quiet, tuning_options.metrics, self.units) + + # add configuration to cache store_cache(x_int, params, tuning_options) # all visited configurations are added to results to provide a trace for optimization strategies diff --git a/kernel_tuner/util.py b/kernel_tuner/util.py index cadc8ee14..fbf949ffd 100644 --- a/kernel_tuner/util.py +++ b/kernel_tuner/util.py @@ -518,10 +518,9 @@ def process_metrics(params, metrics): value = v(params) else: raise ValueError("metric dicts values should be strings or callable") - if not k in params: - params[k] = value - else: - raise ValueError("metric dicts keys should not already exist in params") + + # We overwrite any existing values for the given key + params[k] = value return params From 16d17645d859a6db1d9a40e39fdec9d3777f95ec Mon Sep 17 00:00:00 2001 From: stijn Date: Wed, 19 Apr 2023 11:09:04 +0200 Subject: [PATCH 06/24] Fix test for `process_metrics` to allow overwriting existing results --- kernel_tuner/runners/sequential.py | 2 +- test/test_util_functions.py | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/kernel_tuner/runners/sequential.py b/kernel_tuner/runners/sequential.py index 5c9d438f3..3ddcc827a 100644 --- a/kernel_tuner/runners/sequential.py +++ b/kernel_tuner/runners/sequential.py @@ -98,7 +98,7 @@ def run(self, parameter_space, tuning_options): logging.debug('kernel configuration was skipped silently due to 
compile or runtime failure') # only compute metrics on configs that have not errored - if not isinstance(params.get(tuning_options.objective), ErrorConfig): + if tuning_options.metrics and not isinstance(params.get(tuning_options.objective), ErrorConfig): params = process_metrics(params, tuning_options.metrics) # get the framework time by estimating based on other times diff --git a/test/test_util_functions.py b/test/test_util_functions.py index b3896a1ad..378bca229 100644 --- a/test/test_util_functions.py +++ b/test/test_util_functions.py @@ -667,15 +667,15 @@ def test_process_metrics(): with pytest.raises(ValueError): params = process_metrics(params, {}) - # test ValueError is raised when b already exists in params + # test if a metric overrides any existing metrics params = { "x": 15, "b": 12 } metrics = OrderedDict() metrics["b"] = "x" - with pytest.raises(ValueError): - params = process_metrics(params, metrics) + params = process_metrics(params, metrics) + assert params["b"] == 15 def test_parse_restrictions(): From 74f87e0b89649ad7328985c198a0008c61bb0348 Mon Sep 17 00:00:00 2001 From: stijn Date: Wed, 19 Apr 2023 11:14:28 +0200 Subject: [PATCH 07/24] Re-add `Observer.register_configuration` that was lost due in merge with master --- kernel_tuner/observers/observer.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/kernel_tuner/observers/observer.py b/kernel_tuner/observers/observer.py index c3203afb8..203805dd9 100644 --- a/kernel_tuner/observers/observer.py +++ b/kernel_tuner/observers/observer.py @@ -8,6 +8,11 @@ def register_device(self, dev): """Sets self.dev, for inspection by the observer at various points during benchmarking""" self.dev = dev + def register_configuration(self, params): + """Called once before benchmarking of a single kernel configuration. The `params` argument is a `dict` + that stores the configuration parameters.""" + pass + def before_start(self): """before start is called every iteration before the kernel starts""" pass From ee351be2ec15d05e777f0878c4e1c8c47b060d59 Mon Sep 17 00:00:00 2001 From: stijn Date: Tue, 9 May 2023 12:02:34 +0200 Subject: [PATCH 08/24] Support scalars in `TunablePrecision` --- kernel_tuner/accuracy.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/kernel_tuner/accuracy.py b/kernel_tuner/accuracy.py index a0a04a123..f3ba9075e 100644 --- a/kernel_tuner/accuracy.py +++ b/kernel_tuner/accuracy.py @@ -97,7 +97,13 @@ def __init__( arrays = dict() for precision, dtype in dtypes.items(): - arrays[precision] = np.array(array).astype(dtype) + # We convert the array into a `np.ndarray` by using `np.array`. + # However, if the value is a numpy scalar, then we do not want to + # convert it into an array but instead keep the original value + if not np.isinstance(array, np.generic) + array = np.array(array) + + arrays[precision] = array.astype(dtype) super().__init__(param_key, arrays) From 0789e01e58f6d3c81142529d8f7615c5074ff9ef Mon Sep 17 00:00:00 2001 From: stijn Date: Tue, 9 May 2023 12:03:13 +0200 Subject: [PATCH 09/24] Rename `flag_gpu_args` to `flat_gpu_args` in `core.py` --- kernel_tuner/accuracy.py | 2 +- kernel_tuner/core.py | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/kernel_tuner/accuracy.py b/kernel_tuner/accuracy.py index f3ba9075e..764ba72f4 100644 --- a/kernel_tuner/accuracy.py +++ b/kernel_tuner/accuracy.py @@ -100,7 +100,7 @@ def __init__( # We convert the array into a `np.ndarray` by using `np.array`. 
# However, if the value is a numpy scalar, then we do not want to # convert it into an array but instead keep the original value - if not np.isinstance(array, np.generic) + if not isinstance(array, np.generic): array = np.array(array) arrays[precision] = array.astype(dtype) diff --git a/kernel_tuner/core.py b/kernel_tuner/core.py index 128d88eb7..7149cb578 100644 --- a/kernel_tuner/core.py +++ b/kernel_tuner/core.py @@ -610,7 +610,7 @@ def ready_argument_list(self, arguments): else: flat_args.append(argument) - flag_gpu_args = iter(self.dev.ready_argument_list(flat_args)) + flat_gpu_args = iter(self.dev.ready_argument_list(flat_args)) # Unflatten the arguments back into arrays. gpu_args = [] @@ -618,11 +618,11 @@ def ready_argument_list(self, arguments): if isinstance(argument, Tunable): arrays = dict() for key in argument: - arrays[key] = next(flag_gpu_args) + arrays[key] = next(flat_gpu_args) gpu_args.append(Tunable(argument.param_key, arrays)) else: - gpu_args.append(next(flag_gpu_args)) + gpu_args.append(next(flat_gpu_args)) return gpu_args From 00d65221009bf8c0d20daa2e7749c856ebbebdc1 Mon Sep 17 00:00:00 2001 From: stijn Date: Tue, 13 Jun 2023 12:44:05 +0200 Subject: [PATCH 10/24] Add support for observers in C backend --- kernel_tuner/backends/c.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/kernel_tuner/backends/c.py b/kernel_tuner/backends/c.py index 486181cc3..f47f228fa 100644 --- a/kernel_tuner/backends/c.py +++ b/kernel_tuner/backends/c.py @@ -43,12 +43,15 @@ class CFunctions(CompilerBackend): """Class that groups the code for running and compiling C functions""" - def __init__(self, iterations=7, compiler_options=None, compiler=None): + def __init__(self, iterations=7, compiler_options=None, compiler=None, observers=None): """instantiate CFunctions object used for interacting with C code :param iterations: Number of iterations used while benchmarking a kernel, 7 by default. :type iterations: int """ + self.observers = observers or [] + self.observers.append(CRuntimeObserver(self)) + self.iterations = iterations self.max_threads = 1024 self.compiler_options = compiler_options @@ -56,7 +59,6 @@ def __init__(self, iterations=7, compiler_options=None, compiler=None): self.compiler = compiler or "g++" self.lib = None self.using_openmp = False - self.observers = [CRuntimeObserver(self)] self.last_result = None try: From 24e14ed1afcd2c1c90e09b1181f606b5246ec3f4 Mon Sep 17 00:00:00 2001 From: stijn Date: Tue, 27 Jun 2023 12:03:53 +0200 Subject: [PATCH 11/24] Allow `run_kernel` to deal with `Tunable` arguments --- kernel_tuner/interface.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/kernel_tuner/interface.py b/kernel_tuner/interface.py index 15fe04ec1..b72753377 100644 --- a/kernel_tuner/interface.py +++ b/kernel_tuner/interface.py @@ -746,7 +746,7 @@ def run_kernel( if log: logging.basicConfig(filename=kernel_name + datetime.now().strftime("%Y%m%d-%H:%M:%S") + ".log", level=log) - kernelsource = core.KernelSource(kernel_name, kernel_source, lang) + kernelsource = core.KernelSource(kernel_name, kernel_source, lang, defines) _check_user_input(kernel_name, kernelsource, arguments, block_size_names) @@ -758,6 +758,9 @@ def run_kernel( # detect language and create the right device function interface dev = core.DeviceInterface(kernelsource, iterations=1, **device_options) + # Preprocess GPU arguments. 
Require for handling `Tunable` arguments + arguments = dev.preprocess_gpu_arguments(arguments, params) + # move data to the GPU gpu_args = dev.ready_argument_list(arguments) From cd996b92cad321a6245972176374ac10e2ad429b Mon Sep 17 00:00:00 2001 From: stijn Date: Mon, 3 Jul 2023 12:40:11 +0200 Subject: [PATCH 12/24] Support more names for floating-point types in TunablePrecision --- kernel_tuner/accuracy.py | 25 ++++++++++++++++++++++++- 1 file changed, 24 insertions(+), 1 deletion(-) diff --git a/kernel_tuner/accuracy.py b/kernel_tuner/accuracy.py index 764ba72f4..567142d44 100644 --- a/kernel_tuner/accuracy.py +++ b/kernel_tuner/accuracy.py @@ -53,6 +53,28 @@ def __call__(self, params): return self.select_for_configuration(params) +def _to_float_dtype(x): + """Convert a string to a numpy data type (``dtype``). This function recognizes + common names (such as ``f16`` or ``kfloat``), and uses ``np.dtype(x)`` as a + fallback. + """ + if isinstance(x, str): + x = x.lower() + + if x in ("bfloat16", "bf16", "kbfloat16", "__nv_bfloat16"): + from bfloat16 import bfloat16 + + return bfloat16 + if x in ("half", "f16", "float16", "__half", "khalf", 16): + return np.half + if x in ("float", "single", "f32", "float32", "kfloat", 32): + return np.float32 + if x in ("double", "f64", "float64", "kdouble", 64): + return np.float64 + + return np.dtype(x) + + class TunablePrecision(Tunable): def __init__( self, param_key: str, array: np.ndarray, dtypes: Dict[str, np.dtype] = None @@ -86,6 +108,7 @@ def __init__( # Try to get bfloat16 if available. try: from bfloat16 import bfloat16 + dtypes["bfloat16"] = bfloat16 pass except ImportError: @@ -93,7 +116,7 @@ def __init__( # If dtype is a list, convert it to a dictionary if isinstance(dtypes, (list, tuple)): - dtypes = dict((name, np.dtype(name)) for name in dtypes) + dtypes = dict((name, _to_float_dtype(name)) for name in dtypes) arrays = dict() for precision, dtype in dtypes.items(): From c798e0fb0c996f274e488cf496013940bd3ae5ae Mon Sep 17 00:00:00 2001 From: stijn Date: Mon, 3 Jul 2023 12:40:43 +0200 Subject: [PATCH 13/24] Support several well-known error metrics in ErrorObserver --- kernel_tuner/accuracy.py | 52 +++++++++++++++++++++++++++++++++++++++- 1 file changed, 51 insertions(+), 1 deletion(-) diff --git a/kernel_tuner/accuracy.py b/kernel_tuner/accuracy.py index 567142d44..85447d9de 100644 --- a/kernel_tuner/accuracy.py +++ b/kernel_tuner/accuracy.py @@ -131,7 +131,54 @@ def __init__( super().__init__(param_key, arrays) -class ErrorMetricObserver(AccuracyObserver): +class AccuracyObserver(BenchmarkObserver): + """Observer that can verify or measure the accuracy of the output produced by a kernel.""" + + @abstractmethod + def process_kernel_output(self, answer, output): + """method will be called once before benchmarking of a single kernel configuration. The arguments + provided are the `answer` as passed `tune_kernel` and the `output` produced by the kernel + """ + pass + + +def error_metric_from_name(key): + """Find the error metric function for the given name. + + Returns an function that takes two parameters (the real values and the + estimated values) as numpy array and returns the error between the two + according to the given error metric. 
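+
+    For example (an illustrative sketch; the argument names are placeholders)::
+
+        mse = error_metric_from_name("mse")
+        error = mse(reference_output, kernel_output)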
+ + Valid values for the ``key`` are: + + * MSE (mean square error) + * RSME (Root mean square error) + * MAE (mean absolute error) + * MRE (mean relative error) + * MALE (mean absolute log error) + * RMSLE (root mean square log error) + """ + key = key.lower().strip().replace("_", " ") + + if key in ("mse", "smd", "mean square error"): + return lambda a, b: np.average(np.square(a - b)) + elif key in ("rmse", "rsmd", "root mean square error"): + return lambda a, b: np.sqrt(np.average(np.square(a - b))) + elif key in ("nrmse", "nrmsd"): + return lambda a, b: np.sqrt(np.average(np.square(a - b))) / np.average(a) + elif key in ("mae", "absolute error", "absolute", "mean absolute error", "abs"): + return lambda a, b: np.average(np.abs(a - b)) + elif key in ("mre", "relative error", "relative", "mean relative error", "rel"): + return lambda a, b: np.average(np.abs(a - b) / np.abs(a) - 1) + elif key in ("male", "mean absolute log error"): + return lambda a, b: np.average(np.abs(np.log(a) - np.log(b))) + elif key in ("rmsle", "root mean square log error"): + return lambda a, b: np.sqrt(np.average(np.square(np.log(a) - np.log(b)))) + else: + raise ValuError(f"invalid error metric provided: {key}") + + +class ErrorObserver(AccuracyObserver): """An ``AccuracyObserver`` that measure the error of the outputs produced by a kernel by comparing it against reference outputs. @@ -151,6 +198,9 @@ def __init__(self, metric=None, key="error"): if metric is None: metric = lambda a, b: np.average(np.square(a - b)) + if isinstance(metric, str): + metric = error_metric_from_name(metric) + self.key = key self.metric = metric self.result = None From 4bffd6e5f3a4ae257e1590ec04e8f1e7bf50eb20 Mon Sep 17 00:00:00 2001 From: stijn Date: Tue, 4 Jul 2023 14:08:38 +0200 Subject: [PATCH 14/24] Fix division by zero in ErrorObserver when ground-truth contains zeros --- kernel_tuner/accuracy.py | 45 +++++++++++++++++++++++++++------------- 1 file changed, 31 insertions(+), 14 deletions(-) diff --git a/kernel_tuner/accuracy.py b/kernel_tuner/accuracy.py index 85447d9de..7c4ea8c7d 100644 --- a/kernel_tuner/accuracy.py +++ b/kernel_tuner/accuracy.py @@ -145,8 +145,8 @@ def process_kernel_output(self, answer, output): def error_metric_from_name(key): """Find the error metric function for the given name. - Returns an function that takes two parameters (the real values and the - estimated values) as numpy array and returns the error between the two + Returns an function that takes two parameters (the ground-truth and the + estimated values) as numpy arrays and returns the error between the two according to the given error metric. 
Valid values for the ``key`` are: @@ -157,32 +157,50 @@ def error_metric_from_name(key): * MRE (mean relative error) * MALE (mean absolute log error) * RMSLE (root mean square log error) + * max (maximum absolute error) + * max_rel (maximum relative error) """ + + # Small value to prevent division by zero in relative metrics + EPS = np.finfo(np.float64).eps + + # lowercase the metric name key = key.lower().strip().replace("_", " ") if key in ("mse", "smd", "mean square error"): - return lambda a, b: np.average(np.square(a - b)) + metric = lambda a, b: np.average(np.square(a - b)) elif key in ("rmse", "rsmd", "root mean square error"): - return lambda a, b: np.sqrt(np.average(np.square(a - b))) + metric = lambda a, b: np.sqrt(np.average(np.square(a - b))) elif key in ("nrmse", "nrmsd"): - return lambda a, b: np.sqrt(np.average(np.square(a - b))) / np.average(a) + metric = lambda a, b: np.sqrt(np.average(np.square(a - b))) / np.average(a) elif key in ("mae", "absolute error", "absolute", "mean absolute error", "abs"): - return lambda a, b: np.average(np.abs(a - b)) + metric = lambda a, b: np.average(np.abs(a - b)) elif key in ("mre", "relative error", "relative", "mean relative error", "rel"): - return lambda a, b: np.average(np.abs(a - b) / np.abs(a) - 1) + metric = lambda a, b: np.average(np.abs(a - b) / np.maximum(np.abs(a), EPS)) elif key in ("male", "mean absolute log error"): - return lambda a, b: np.average(np.abs(np.log(a) - np.log(b))) + metric = lambda a, b: np.average(np.abs(np.log(a + EPS) - np.log(b + EPS))) elif key in ("rmsle", "root mean square log error"): - return lambda a, b: np.sqrt(np.average(np.square(np.log(a) - np.log(b)))) + metric = lambda a, b: np.sqrt( + np.average(np.square(np.log(a + EPS) - np.log(b + EPS))) + ) + elif key in ("max", "max_abs", "maximum", "maximum absolute"): + metric = lambda a, b: np.amax(np.abs(a - b)) + elif key in ("max_rel", "maximum relative"): + metric = lambda a, b: np.amax(np.abs(a - b) / np.maximum(np.abs(a), EPS)) else: - raise ValuError(f"invalid error metric provided: {key}") + raise ValueError(f"invalid error metric provided: {key}") + + # cast both arguments to f64 before passing them to the metric + return lambda a, b, metric=metric: metric( + a.astype(np.float64, copy=False), b.astype(np.float64, copy=False) + ) class ErrorObserver(AccuracyObserver): """An ``AccuracyObserver`` that measure the error of the outputs produced by a kernel by comparing it against reference outputs. - By default, it uses the mean-squared error (MSE) and appends this to + By default, it uses the root mean-squared error (RMSE) and appends this to the results with a metric called ``error``. 
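+
+    Example (a sketch; assumes ``answer`` holds the reference outputs that are
+    also passed to ``tune_kernel``)::
+
+        observer = ErrorObserver(metric="rmse")
+        results, env = tune_kernel(..., answer=answer, observers=[observer])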
""" @@ -196,9 +214,8 @@ def __init__(self, metric=None, key="error"): # The default metric is the mean squared error if metric is None: - metric = lambda a, b: np.average(np.square(a - b)) - - if isinstance(metric, str): + metric = error_metric_from_name("rmse") + elif isinstance(metric, str): metric = error_metric_from_name(metric) self.key = key From d67eefdda7bf46f384cea8ef29371c155eda33ac Mon Sep 17 00:00:00 2001 From: stijn Date: Tue, 4 Jul 2023 14:27:20 +0200 Subject: [PATCH 15/24] Add RMSRE error metric --- kernel_tuner/accuracy.py | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/kernel_tuner/accuracy.py b/kernel_tuner/accuracy.py index 7c4ea8c7d..382fc4afc 100644 --- a/kernel_tuner/accuracy.py +++ b/kernel_tuner/accuracy.py @@ -153,23 +153,24 @@ def error_metric_from_name(key): * MSE (mean square error) * RSME (Root mean square error) + * NRMSE (normalized root mean square error) + * RMSRE (root mean square relative error) + * RMSLE (root mean square log error) * MAE (mean absolute error) * MRE (mean relative error) * MALE (mean absolute log error) - * RMSLE (root mean square log error) * max (maximum absolute error) - * max_rel (maximum relative error) """ # Small value to prevent division by zero in relative metrics EPS = np.finfo(np.float64).eps # lowercase the metric name - key = key.lower().strip().replace("_", " ") + key = key.lower().replace("_", " ").strip() if key in ("mse", "smd", "mean square error"): metric = lambda a, b: np.average(np.square(a - b)) - elif key in ("rmse", "rsmd", "root mean square error"): + elif key in ("rmse", "rmsd", "root mean square error"): metric = lambda a, b: np.sqrt(np.average(np.square(a - b))) elif key in ("nrmse", "nrmsd"): metric = lambda a, b: np.sqrt(np.average(np.square(a - b))) / np.average(a) @@ -177,15 +178,19 @@ def error_metric_from_name(key): metric = lambda a, b: np.average(np.abs(a - b)) elif key in ("mre", "relative error", "relative", "mean relative error", "rel"): metric = lambda a, b: np.average(np.abs(a - b) / np.maximum(np.abs(a), EPS)) + elif key in ("rmsre", "root mean square relative error"): + metric = lambda a, b: np.sqrt( + np.average(np.square(a - b) / np.maximum(np.square(a), EPS**2)) + ) elif key in ("male", "mean absolute log error"): metric = lambda a, b: np.average(np.abs(np.log(a + EPS) - np.log(b + EPS))) elif key in ("rmsle", "root mean square log error"): metric = lambda a, b: np.sqrt( np.average(np.square(np.log(a + EPS) - np.log(b + EPS))) ) - elif key in ("max", "max_abs", "maximum", "maximum absolute"): + elif key in ("max", "max abs", "maximum", "maximum absolute"): metric = lambda a, b: np.amax(np.abs(a - b)) - elif key in ("max_rel", "maximum relative"): + elif key in ("max rel", "maximum relative"): metric = lambda a, b: np.amax(np.abs(a - b) / np.maximum(np.abs(a), EPS)) else: raise ValueError(f"invalid error metric provided: {key}") From 8412f7809a4e8032dbf0b61d09a4656c7f1f827a Mon Sep 17 00:00:00 2001 From: stijn Date: Tue, 8 Aug 2023 16:01:42 +0200 Subject: [PATCH 16/24] Fix crash when object is `ErrorConfig` --- kernel_tuner/runners/sequential.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel_tuner/runners/sequential.py b/kernel_tuner/runners/sequential.py index 3ddcc827a..2b0362930 100644 --- a/kernel_tuner/runners/sequential.py +++ b/kernel_tuner/runners/sequential.py @@ -94,7 +94,7 @@ def run(self, parameter_space, tuning_options): result = self.dev.compile_and_benchmark(self.kernel_source, self.gpu_args, params, 
self.kernel_options, tuning_options) params.update(result) - if isinstance(result.get(tuning_options.objective), ErrorConfig): + if tuning_options.objective in result and isinstance(result[tuning_options.objective], ErrorConfig): logging.debug('kernel configuration was skipped silently due to compile or runtime failure') # only compute metrics on configs that have not errored From 6bb6156b5020346921428c93b3fd41f79cf65231 Mon Sep 17 00:00:00 2001 From: stijn Date: Mon, 2 Oct 2023 10:02:15 +0200 Subject: [PATCH 17/24] Add support for custom tolerances for relative error metrics --- kernel_tuner/accuracy.py | 44 ++++++++++++++++++++++++---------------- 1 file changed, 26 insertions(+), 18 deletions(-) diff --git a/kernel_tuner/accuracy.py b/kernel_tuner/accuracy.py index 382fc4afc..823f7093d 100644 --- a/kernel_tuner/accuracy.py +++ b/kernel_tuner/accuracy.py @@ -142,7 +142,7 @@ def process_kernel_output(self, answer, output): pass -def error_metric_from_name(key): +def error_metric_from_name(key, EPS=1e-8): """Find the error metric function for the given name. Returns an function that takes two parameters (the ground-truth and the @@ -160,10 +160,10 @@ def error_metric_from_name(key): * MRE (mean relative error) * MALE (mean absolute log error) * max (maximum absolute error) - """ - # Small value to prevent division by zero in relative metrics - EPS = np.finfo(np.float64).eps + The value of `EPS` is used for relative errors to prevent division by zero. + `` + """ # lowercase the metric name key = key.lower().replace("_", " ").strip() @@ -173,14 +173,16 @@ def error_metric_from_name(key): elif key in ("rmse", "rmsd", "root mean square error"): metric = lambda a, b: np.sqrt(np.average(np.square(a - b))) elif key in ("nrmse", "nrmsd"): - metric = lambda a, b: np.sqrt(np.average(np.square(a - b))) / np.average(a) + metric = lambda a, b: np.sqrt( + np.average(np.square(a - b)) / np.average(np.square(a)) + ) elif key in ("mae", "absolute error", "absolute", "mean absolute error", "abs"): metric = lambda a, b: np.average(np.abs(a - b)) elif key in ("mre", "relative error", "relative", "mean relative error", "rel"): metric = lambda a, b: np.average(np.abs(a - b) / np.maximum(np.abs(a), EPS)) elif key in ("rmsre", "root mean square relative error"): metric = lambda a, b: np.sqrt( - np.average(np.square(a - b) / np.maximum(np.square(a), EPS**2)) + np.average(np.square(a - b) / np.square(np.maximum(a, EPS))) ) elif key in ("male", "mean absolute log error"): metric = lambda a, b: np.average(np.abs(np.log(a + EPS) - np.log(b + EPS))) @@ -202,26 +204,32 @@ def error_metric_from_name(key): class ErrorObserver(AccuracyObserver): - """An ``AccuracyObserver`` that measure the error of the outputs produced - by a kernel by comparing it against reference outputs. + """``ErrorObserver`` measures the error of the output produced by a kernel + by comparing it against a reference output. - By default, it uses the root mean-squared error (RMSE) and appends this to - the results with a metric called ``error``. + By default, it uses the root mean-squared error (RMSE) and uses the + parameter key ``error``. """ - def __init__(self, metric=None, key="error"): + def __init__(self, metric=None, key="error", *, atol=1e-8): """Create a new ``AccuracyObserver``. - :param metric: The error metric. Should be function that accepts two numpy - arrays as arguments (the reference output and the kernel output) + :param metric: The error metric. 
This should be a string that is + accepted by ``error_metric_from_name`` such as ``"absolute error"`` + or ``"relative error"``. Alternatively, it can be + function that accepts two numpy arrays as arguments + (the reference output and the kernel output) :param key: The name of this metric in the results. + :param atol: The absolute tolerance used in relative metrics to prevent + division by zero. """ - # The default metric is the mean squared error - if metric is None: - metric = error_metric_from_name("rmse") - elif isinstance(metric, str): - metric = error_metric_from_name(metric) + if not metric: + metric = "rmse" + + # If it is a string, convert it to a function + if isinstance(metric, str): + metric = error_metric_from_name(metric, atol) self.key = key self.metric = metric From 5070c97404cebb1dbbeb14e1b0cd7c3f1b02ac9f Mon Sep 17 00:00:00 2001 From: stijn Date: Mon, 2 Oct 2023 11:44:04 +0200 Subject: [PATCH 18/24] Rename `AccuracyObserver` to `OutputObserver` and `ErrorObserver` to `AccuracyObserver` --- kernel_tuner/accuracy.py | 121 +++++++++++++++++++++++++-------------- kernel_tuner/core.py | 21 +++---- 2 files changed, 88 insertions(+), 54 deletions(-) diff --git a/kernel_tuner/accuracy.py b/kernel_tuner/accuracy.py index 823f7093d..9e5cfc5f2 100644 --- a/kernel_tuner/accuracy.py +++ b/kernel_tuner/accuracy.py @@ -1,15 +1,16 @@ from collections import UserDict from typing import Dict import numpy as np +import logging from kernel_tuner.observers import AccuracyObserver class Tunable(UserDict): def __init__(self, param_key: str, arrays: Dict): - """The ``Tunable`` object is used as an input argument when tuning + """The ``Tunable`` object can be used as an input argument when tuning kernels. It is a container that holds several arrays internally and - selects one array during benchmarking based on a tunable parameter. + selects one array during benchmarking based on the value of a tunable parameter. Example ------- @@ -19,13 +20,14 @@ def __init__(self, param_key: str, arrays: Dict): In this example, we create a Tunable object that selects either matrix or matrix.transpose() for benchmarking, depending on the value of the - tunable parameter "matrix_layout". The arrays argument is a dictionary - that maps the tunable parameter values "c" and "f" to the arrays matrix - and matrix.transpose(), respectively. During benchmarking, the Tunable - object selects the appropriate array based on the value of "matrix_layout". + tunable parameter "matrix_layout". The first argument is the name of the tunable + paramater. The second argument is a dictionary that maps the tunable parameter + values "c" and "f" to the arrays ``matrix`` and ``matrix.transpose()``, respectively. + During benchmarking, the Tunable object selects the array passed to the kernel based + on the value of "matrix_layout". :param param_key: : The tunable parameter used to select the array for benchmarking. - :param arrays: A dictionary that maps the parameter value to arrays. + :param arrays: A dictionary that maps the value of that tunable parameter to options. 
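+
+        The ``param_key`` may also be a callable or a Python expression over the
+        tunable parameters, for example (illustrative)::
+
+            Tunable(lambda p: p["matrix_layout"], {"c": matrix, "f": matrix.transpose()})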
""" if isinstance(arrays, (tuple, list)): arrays = dict(enumerate(arrays)) @@ -35,25 +37,25 @@ def __init__(self, param_key: str, arrays: Dict): def select_for_configuration(self, params): if callable(self.param_key): - key = self.param_key(params) + option = self.param_key(params) elif self.param_key in params: - key = params[self.param_key] + option = params[self.param_key] else: - key = eval(self.param_key, params, params) + option = eval(self.param_key, params, params) - if key not in self: - list = ", ".join(map(str, self.keys())) + if option not in self.data: + list = ", ".join(map(str, self.data.keys())) raise KeyError( - f"'{key}' is not a valid parameter value, should be one of: {list}" + f"'{option}' is not a valid parameter value, should be one of: {list}" ) - return self[key] + return self.data[option] def __call__(self, params): return self.select_for_configuration(params) -def _to_float_dtype(x): +def _to_float_dtype(x: str) -> np.dtype: """Convert a string to a numpy data type (``dtype``). This function recognizes common names (such as ``f16`` or ``kfloat``), and uses ``np.dtype(x)`` as a fallback. @@ -79,11 +81,11 @@ class TunablePrecision(Tunable): def __init__( self, param_key: str, array: np.ndarray, dtypes: Dict[str, np.dtype] = None ): - """The ``Tunable`` object is used as an input argument when tuning + """The ``Tunable`` object can be used as an input argument when tuning kernels. It is a container that internally holds several arrays containing the same data, but stored in using different levels of - precision. During benchamrking, one array is selected based on a - tunable parameter ``param_key``. + precision. During benchamrking, one array is selected based on the value + of the tunable parameter called ``param_key``. Example ------- @@ -110,9 +112,16 @@ def __init__( from bfloat16 import bfloat16 dtypes["bfloat16"] = bfloat16 - pass except ImportError: - pass # Ignore error if tensorflow is not available + try: + from tensorflow import bfloat16 + + dtypes["bfloat16"] = bfloat16.as_numpy_dtype + except ImportError: + logging.warning( + "could not find `bfloat16` data type for numpy, " + + "please install either the package `bfloat16` or `tensorflow`" + ) # If dtype is a list, convert it to a dictionary if isinstance(dtypes, (list, tuple)): @@ -131,8 +140,8 @@ def __init__( super().__init__(param_key, arrays) -class AccuracyObserver(BenchmarkObserver): - """Observer that can verify or measure the accuracy of the output produced by a kernel.""" +class OutputObserver(BenchmarkObserver): + """Observer that can verify or measure something about the output produced by a kernel.""" @abstractmethod def process_kernel_output(self, answer, output): @@ -168,43 +177,67 @@ def error_metric_from_name(key, EPS=1e-8): # lowercase the metric name key = key.lower().replace("_", " ").strip() - if key in ("mse", "smd", "mean square error"): - metric = lambda a, b: np.average(np.square(a - b)) - elif key in ("rmse", "rmsd", "root mean square error"): - metric = lambda a, b: np.sqrt(np.average(np.square(a - b))) - elif key in ("nrmse", "nrmsd"): - metric = lambda a, b: np.sqrt( - np.average(np.square(a - b)) / np.average(np.square(a)) - ) + if key in ("mse", "mean square error"): + + def metric(a, b): + return np.average(np.square(a - b)) + + elif key in ("rmse", "root mean square error"): + + def metric(a, b): + return np.sqrt(np.average(np.square(a - b))) + + elif key in ("nrmse", "normalized root mean square error"): + + def metric(a, b): + return np.sqrt(np.average(np.square(a - 
b)) / np.average(np.square(a))) + elif key in ("mae", "absolute error", "absolute", "mean absolute error", "abs"): - metric = lambda a, b: np.average(np.abs(a - b)) + + def metric(a, b): + return np.average(np.abs(a - b)) + elif key in ("mre", "relative error", "relative", "mean relative error", "rel"): - metric = lambda a, b: np.average(np.abs(a - b) / np.maximum(np.abs(a), EPS)) + + def metric(a, b): + return np.average(np.abs(a - b) / np.maximum(np.abs(a), EPS)) + elif key in ("rmsre", "root mean square relative error"): - metric = lambda a, b: np.sqrt( - np.average(np.square(a - b) / np.square(np.maximum(a, EPS))) - ) + + def metric(a, b): + return np.sqrt(np.average(np.square(a - b) / np.square(np.maximum(a, EPS)))) + elif key in ("male", "mean absolute log error"): - metric = lambda a, b: np.average(np.abs(np.log(a + EPS) - np.log(b + EPS))) + + def metric(a, b): + return np.average(np.abs(np.log(a + EPS) - np.log(b + EPS))) + elif key in ("rmsle", "root mean square log error"): - metric = lambda a, b: np.sqrt( - np.average(np.square(np.log(a + EPS) - np.log(b + EPS))) - ) + + def metric(a, b): + return np.sqrt(np.average(np.square(np.log(a + EPS) - np.log(b + EPS)))) + elif key in ("max", "max abs", "maximum", "maximum absolute"): - metric = lambda a, b: np.amax(np.abs(a - b)) + + def metric(a, b): + return np.amax(np.abs(a - b)) + elif key in ("max rel", "maximum relative"): - metric = lambda a, b: np.amax(np.abs(a - b) / np.maximum(np.abs(a), EPS)) + + def metric(a, b): + return np.amax(np.abs(a - b) / np.maximum(np.abs(a), EPS)) + else: raise ValueError(f"invalid error metric provided: {key}") # cast both arguments to f64 before passing them to the metric - return lambda a, b, metric=metric: metric( + return lambda a, b: metric( a.astype(np.float64, copy=False), b.astype(np.float64, copy=False) ) -class ErrorObserver(AccuracyObserver): - """``ErrorObserver`` measures the error of the output produced by a kernel +class AccuracyObserver(OutputObserver): + """``AccuracyObserver`` measures the error of the output produced by a kernel by comparing it against a reference output. 
By default, it uses the root mean-squared error (RMSE) and uses the diff --git a/kernel_tuner/core.py b/kernel_tuner/core.py index 7149cb578..41e19f741 100644 --- a/kernel_tuner/core.py +++ b/kernel_tuner/core.py @@ -13,8 +13,9 @@ except ImportError: cp = np +from kernel_tuner.accuracy import Tunable from kernel_tuner.observers.nvml import NVMLObserver -from kernel_tuner.observers.observer import ContinuousObserver, AccuracyObserver +from kernel_tuner.observers.observer import ContinuousObserver, OutputObserver from kernel_tuner.backends.cupy import CupyFunctions from kernel_tuner.backends.pycuda import PyCudaFunctions from kernel_tuner.backends.nvcuda import CudaFunctions @@ -250,7 +251,7 @@ def __init__(self, kernel_source, device=0, platform=0, quiet=False, compiler=No #look for NVMLObserver in observers, if present, enable special tunable parameters through nvml self.use_nvml = False self.continuous_observers = [] - self.accuracy_observers = [] + self.output_observers = [] if observers: for obs in observers: if isinstance(obs, NVMLObserver): @@ -258,8 +259,8 @@ def __init__(self, kernel_source, device=0, platform=0, quiet=False, compiler=No self.use_nvml = True if hasattr(obs, "continuous_observer"): self.continuous_observers.append(obs.continuous_observer) - if isinstance(obs, AccuracyObserver): - self.accuracy_observers.append(obs) + if isinstance(obs, OutputObserver): + self.output_observers.append(obs) self.iterations = iterations @@ -420,17 +421,17 @@ def check_kernel_output(self, func, gpu_args, instance, answer, atol, verify, ve else: result_host.append(None) - # Call the accuracy observers - for obs in self.accuracy_observers: + # Call the output observers + for obs in self.output_observers: obs.process_kernel_output(answer, result_host) # There are three scenarios: # - if there is a custom verify function, call that. - # - otherwise, if there are no accuracy observer, call the default verify function + # - otherwise, if there are no output observers, call the default verify function # - otherwise, the answer is correct (we assume the accuracy observers verified the output) if verify: correct = verify(answer, result_host, atol=atol) - elif not self.accuracy_observers: + elif not self.output_observers: correct = _default_verify_function(instance, answer, result_host, atol, verbose) else: correct = True @@ -468,9 +469,9 @@ def compile_and_benchmark(self, kernel_source, gpu_args, params, kernel_options, if isinstance(instance, util.ErrorConfig): result[to.objective] = util.InvalidConfig() else: - # Preprocess the argument list. 
This is required to deal with `MixedPrecisionArray`s gpu_args = self.preprocess_gpu_arguments(gpu_args, params) + try: # compile the kernel start_compilation = time.perf_counter() @@ -492,7 +493,7 @@ def compile_and_benchmark(self, kernel_source, gpu_args, params, kernel_options, last_compilation_time = 1000 * (time.perf_counter() - start_compilation) # test kernel for correctness - if func and (to.answer or to.verify or self.accuracy_observers): + if func and (to.answer or to.verify or self.output_observers): start_verification = time.perf_counter() self.check_kernel_output(func, gpu_args, instance, to.answer, to.atol, to.verify, verbose) last_verification_time = 1000 * (time.perf_counter() - start_verification) From ff1034688355be6601d76ef8847032e849460c99 Mon Sep 17 00:00:00 2001 From: stijn Date: Mon, 2 Oct 2023 11:17:40 +0200 Subject: [PATCH 19/24] Make `error_metric_from_name` more resilient in handling user provided metric name --- kernel_tuner/accuracy.py | 68 +++++++++++++++++++++++++++------------- 1 file changed, 47 insertions(+), 21 deletions(-) diff --git a/kernel_tuner/accuracy.py b/kernel_tuner/accuracy.py index 9e5cfc5f2..65667a591 100644 --- a/kernel_tuner/accuracy.py +++ b/kernel_tuner/accuracy.py @@ -2,6 +2,7 @@ from typing import Dict import numpy as np import logging +import re from kernel_tuner.observers import AccuracyObserver @@ -151,7 +152,7 @@ def process_kernel_output(self, answer, output): pass -def error_metric_from_name(key, EPS=1e-8): +def error_metric_from_name(user_key, EPS=1e-8): """Find the error metric function for the given name. Returns an function that takes two parameters (the ground-truth and the @@ -169,66 +170,90 @@ def error_metric_from_name(key, EPS=1e-8): * MRE (mean relative error) * MALE (mean absolute log error) * max (maximum absolute error) + * max rel (maximum relative error) The value of `EPS` is used for relative errors to prevent division by zero. 
`` """ - # lowercase the metric name - key = key.lower().replace("_", " ").strip() - - if key in ("mse", "mean square error"): + # Prepocess the provided name: + # - convert to lowercase + # - remove the word "error" + # - remove underscores and dashes + # - strip whitespaces + # - replace common abreviations + key = user_key.lower() + key = re.sub(r"\berror\b", " ", key) + key = re.sub(r"[\s_-]+", " ", key) + key = key.strip() + + replacements = { + "average": "mean", + "avg": "mean", + "square": "squared", + "sq": "squared", + "max": "maximum", + "rel": "relative", + "abs": "absolute", + "log": "logarithmic", + } + + for pattern, replacement in replacements.items(): + key = re.sub(rf"\b{pattern}\b", replacement, key) + + # Select the right metric + if key in ("mse", "mean squared"): def metric(a, b): return np.average(np.square(a - b)) - elif key in ("rmse", "root mean square error"): + elif key in ("rmse", "root mean squared"): def metric(a, b): return np.sqrt(np.average(np.square(a - b))) - elif key in ("nrmse", "normalized root mean square error"): + elif key in ("nrmse", "normalized root mean squared"): def metric(a, b): return np.sqrt(np.average(np.square(a - b)) / np.average(np.square(a))) - elif key in ("mae", "absolute error", "absolute", "mean absolute error", "abs"): + elif key in ("mae", "absolute", "mean absolute"): def metric(a, b): return np.average(np.abs(a - b)) - elif key in ("mre", "relative error", "relative", "mean relative error", "rel"): + elif key in ("mre", "relative", "mean relative"): def metric(a, b): return np.average(np.abs(a - b) / np.maximum(np.abs(a), EPS)) - elif key in ("rmsre", "root mean square relative error"): + elif key in ("rmsre", "root mean squared relative"): def metric(a, b): return np.sqrt(np.average(np.square(a - b) / np.square(np.maximum(a, EPS)))) - elif key in ("male", "mean absolute log error"): + elif key in ("male", "mean absolute logarithmic"): def metric(a, b): return np.average(np.abs(np.log(a + EPS) - np.log(b + EPS))) - elif key in ("rmsle", "root mean square log error"): + elif key in ("rmsle", "root mean squared logarithmic"): def metric(a, b): return np.sqrt(np.average(np.square(np.log(a + EPS) - np.log(b + EPS)))) - elif key in ("max", "max abs", "maximum", "maximum absolute"): + elif key in ("maximum absolute", "maximum"): def metric(a, b): return np.amax(np.abs(a - b)) - elif key in ("max rel", "maximum relative"): + elif key in ("maximum relative",): def metric(a, b): return np.amax(np.abs(a - b) / np.maximum(np.abs(a), EPS)) else: - raise ValueError(f"invalid error metric provided: {key}") + raise ValueError(f"invalid error metric provided: {user_key}") # cast both arguments to f64 before passing them to the metric return lambda a, b: metric( @@ -237,11 +262,11 @@ def metric(a, b): class AccuracyObserver(OutputObserver): - """``AccuracyObserver`` measures the error of the output produced by a kernel - by comparing it against a reference output. + """``AccuracyObserver`` measures the error on the output produced by a kernel + by comparing the output against a reference output. By default, it uses the root mean-squared error (RMSE) and uses the - parameter key ``error``. + metric name ``"error"``. """ def __init__(self, metric=None, key="error", *, atol=1e-8): @@ -249,14 +274,15 @@ def __init__(self, metric=None, key="error", *, atol=1e-8): :param metric: The error metric. This should be a string that is accepted by ``error_metric_from_name`` such as ``"absolute error"`` - or ``"relative error"``. 
Alternatively, it can be + or ``"relative error"``. Alternatively, it can be a function that accepts two numpy arrays as arguments (the reference output and the kernel output) :param key: The name of this metric in the results. - :param atol: The absolute tolerance used in relative metrics to prevent - division by zero. + :param atol: The tolerance used in relative metrics to prevent + division by zero. It is ignored by absolute error metrics. """ + # Default metric is RMSE if not metric: metric = "rmse" From afc8c7f0c9decc7b3b54f57916e7d40fe08644c6 Mon Sep 17 00:00:00 2001 From: stijn Date: Mon, 2 Oct 2023 11:21:43 +0200 Subject: [PATCH 20/24] Move `OutputObserver` from `accuracy` to `observers` --- kernel_tuner/accuracy.py | 13 +------------ kernel_tuner/observers/__init__.py | 2 +- kernel_tuner/observers/observer.py | 15 ++++++++++++++- 3 files changed, 16 insertions(+), 14 deletions(-) diff --git a/kernel_tuner/accuracy.py b/kernel_tuner/accuracy.py index 65667a591..34598276f 100644 --- a/kernel_tuner/accuracy.py +++ b/kernel_tuner/accuracy.py @@ -4,7 +4,7 @@ import logging import re -from kernel_tuner.observers import AccuracyObserver +from .observers import OutputObserver class Tunable(UserDict): @@ -141,17 +141,6 @@ def __init__( super().__init__(param_key, arrays) -class OutputObserver(BenchmarkObserver): - """Observer that can verify or measure something about the output produced by a kernel.""" - - @abstractmethod - def process_kernel_output(self, answer, output): - """method will be called once before benchmarking of a single kernel configuration. The arguments - provided are the `answer` as passed `tune_kernel` and the `output` produced by the kernel - """ - pass - - def error_metric_from_name(user_key, EPS=1e-8): """Find the error metric function for the given name. diff --git a/kernel_tuner/observers/__init__.py b/kernel_tuner/observers/__init__.py index 199dd7e15..ad27791d5 100644 --- a/kernel_tuner/observers/__init__.py +++ b/kernel_tuner/observers/__init__.py @@ -1 +1 @@ -from .observer import BenchmarkObserver, IterationObserver, ContinuousObserver +from .observer import BenchmarkObserver, IterationObserver, ContinuousObserver, OutputObserver diff --git a/kernel_tuner/observers/observer.py b/kernel_tuner/observers/observer.py index 203805dd9..056a64a6a 100644 --- a/kernel_tuner/observers/observer.py +++ b/kernel_tuner/observers/observer.py @@ -12,7 +12,7 @@ def register_configuration(self, params): """Called once before benchmarking of a single kernel configuration. The `params` argument is a `dict` that stores the configuration parameters.""" pass - + def before_start(self): """before start is called every iteration before the kernel starts""" pass @@ -45,3 +45,16 @@ class IterationObserver(BenchmarkObserver): class ContinuousObserver(BenchmarkObserver): pass + + +class OutputObserver(BenchmarkObserver): + """Observer that can verify or measure something about the output produced by a kernel.""" + + @abstractmethod + def process_kernel_output(self, answer, output): + """method will be called once before benchmarking of a single kernel configuration. 
The arguments + provided are the `answer` as passed `tune_kernel` and the `output` produced by the kernel + """ + pass + + From 754fc45f95991f32d7c916da3a8936849645db37 Mon Sep 17 00:00:00 2001 From: stijn Date: Mon, 2 Oct 2023 11:35:52 +0200 Subject: [PATCH 21/24] Rename method of `OutputObserver` from `process_kernel_output` to `process_output` --- kernel_tuner/accuracy.py | 4 ++-- kernel_tuner/core.py | 3 +-- kernel_tuner/observers/observer.py | 2 +- kernel_tuner/observers/powersensor.py | 2 +- kernel_tuner/runners/sequential.py | 1 + 5 files changed, 6 insertions(+), 6 deletions(-) diff --git a/kernel_tuner/accuracy.py b/kernel_tuner/accuracy.py index 34598276f..1a24fef13 100644 --- a/kernel_tuner/accuracy.py +++ b/kernel_tuner/accuracy.py @@ -4,7 +4,7 @@ import logging import re -from .observers import OutputObserver +from kernel_tuner.observers import OutputObserver class Tunable(UserDict): @@ -283,7 +283,7 @@ def __init__(self, metric=None, key="error", *, atol=1e-8): self.metric = metric self.result = None - def process_kernel_output(self, answers, outputs): + def process_output(self, answers, outputs): errors = [] for answer, output in zip(answers, outputs): diff --git a/kernel_tuner/core.py b/kernel_tuner/core.py index 41e19f741..1a7be0562 100644 --- a/kernel_tuner/core.py +++ b/kernel_tuner/core.py @@ -6,7 +6,6 @@ import re import numpy as np -from kernel_tuner.accuracy import Tunable try: import cupy as cp @@ -423,7 +422,7 @@ def check_kernel_output(self, func, gpu_args, instance, answer, atol, verify, ve # Call the output observers for obs in self.output_observers: - obs.process_kernel_output(answer, result_host) + obs.process_output(answer, result_host) # There are three scenarios: # - if there is a custom verify function, call that. diff --git a/kernel_tuner/observers/observer.py b/kernel_tuner/observers/observer.py index 056a64a6a..493de94f8 100644 --- a/kernel_tuner/observers/observer.py +++ b/kernel_tuner/observers/observer.py @@ -51,7 +51,7 @@ class OutputObserver(BenchmarkObserver): """Observer that can verify or measure something about the output produced by a kernel.""" @abstractmethod - def process_kernel_output(self, answer, output): + def process_output(self, answer, output): """method will be called once before benchmarking of a single kernel configuration. 
The arguments provided are the `answer` as passed `tune_kernel` and the `output` produced by the kernel """ diff --git a/kernel_tuner/observers/powersensor.py b/kernel_tuner/observers/powersensor.py index 33ff979e2..6d07e8977 100644 --- a/kernel_tuner/observers/powersensor.py +++ b/kernel_tuner/observers/powersensor.py @@ -52,7 +52,7 @@ def after_finish(self): self.results["ps_energy"].append(ps_measured_e) if "ps_power" in self.observables: ps_measured_t = ( - end_state.time_at_read - self.begin_state.time_at_read + end_state.time_at_read - self.begin_state.time_at_read ) # seconds self.results["ps_power"].append(ps_measured_e / ps_measured_t) # Watt diff --git a/kernel_tuner/runners/sequential.py b/kernel_tuner/runners/sequential.py index 2b0362930..352a8321e 100644 --- a/kernel_tuner/runners/sequential.py +++ b/kernel_tuner/runners/sequential.py @@ -92,6 +92,7 @@ def run(self, parameter_space, tuning_options): warmup_time = 1e3 * (perf_counter() - warmup_time) result = self.dev.compile_and_benchmark(self.kernel_source, self.gpu_args, params, self.kernel_options, tuning_options) + params.update(result) if tuning_options.objective in result and isinstance(result[tuning_options.objective], ErrorConfig): From 91ed8926e4da8f71d9b57c4b8cca9b8aa6a2055f Mon Sep 17 00:00:00 2001 From: stijn Date: Mon, 2 Oct 2023 13:05:44 +0200 Subject: [PATCH 22/24] Add tests for `kernel_tuner.accuracy` --- kernel_tuner/accuracy.py | 46 ++++++++++++++--------- kernel_tuner/core.py | 31 +++++++-------- test/test_accuracy.py | 81 ++++++++++++++++++++++++++++++++++++++++ test/test_core.py | 18 +++++++++ 4 files changed, 144 insertions(+), 32 deletions(-) create mode 100644 test/test_accuracy.py diff --git a/kernel_tuner/accuracy.py b/kernel_tuner/accuracy.py index 1a24fef13..491541909 100644 --- a/kernel_tuner/accuracy.py +++ b/kernel_tuner/accuracy.py @@ -56,6 +56,28 @@ def __call__(self, params): return self.select_for_configuration(params) +def _find_bfloat16_if_available(): + # Try to get bfloat16 if available. + try: + from bfloat16 import bfloat16 + return bfloat16 + except ImportError: + pass + + try: + from tensorflow import bfloat16 + return bfloat16.as_numpy_dtype + except ImportError: + pass + + logging.warning( + "could not find `bfloat16` data type for numpy, " + + "please install either the package `bfloat16` or `tensorflow`" + ) + + return None + + def _to_float_dtype(x: str) -> np.dtype: """Convert a string to a numpy data type (``dtype``). This function recognizes common names (such as ``f16`` or ``kfloat``), and uses ``np.dtype(x)`` as a @@ -65,9 +87,10 @@ def _to_float_dtype(x: str) -> np.dtype: x = x.lower() if x in ("bfloat16", "bf16", "kbfloat16", "__nv_bfloat16"): - from bfloat16 import bfloat16 + result = _find_bfloat16_if_available() + if result is not None: + return result - return bfloat16 if x in ("half", "f16", "float16", "__half", "khalf", 16): return np.half if x in ("float", "single", "f32", "float32", "kfloat", 32): @@ -108,21 +131,10 @@ def __init__( if not dtypes: dtypes = dict(half=np.half, float=np.single, double=np.double) - # Try to get bfloat16 if available. 
- try: - from bfloat16 import bfloat16 - + bfloat16 = _find_bfloat16_if_available() + if bfloat16 is not None: dtypes["bfloat16"] = bfloat16 - except ImportError: - try: - from tensorflow import bfloat16 - dtypes["bfloat16"] = bfloat16.as_numpy_dtype - except ImportError: - logging.warning( - "could not find `bfloat16` data type for numpy, " - + "please install either the package `bfloat16` or `tensorflow`" - ) # If dtype is a list, convert it to a dictionary if isinstance(dtypes, (list, tuple)): @@ -224,12 +236,12 @@ def metric(a, b): elif key in ("male", "mean absolute logarithmic"): def metric(a, b): - return np.average(np.abs(np.log(a + EPS) - np.log(b + EPS))) + return np.average(np.abs(np.log10(a + EPS) - np.log10(b + EPS))) elif key in ("rmsle", "root mean squared logarithmic"): def metric(a, b): - return np.sqrt(np.average(np.square(np.log(a + EPS) - np.log(b + EPS)))) + return np.sqrt(np.average(np.square(np.log10(a + EPS) - np.log10(b + EPS)))) elif key in ("maximum absolute", "maximum"): diff --git a/kernel_tuner/core.py b/kernel_tuner/core.py index 1a7be0562..973c0ee06 100644 --- a/kernel_tuner/core.py +++ b/kernel_tuner/core.py @@ -438,18 +438,6 @@ def check_kernel_output(self, func, gpu_args, instance, answer, atol, verify, ve if not correct: raise RuntimeError("Kernel result verification failed for: " + util.get_config_string(instance.params)) - def preprocess_gpu_arguments(self, old_arguments, params): - """ Get a flat list of arguments based on the configuration given by `params` """ - new_arguments = [] - - for argument in old_arguments: - if isinstance(argument, Tunable): - new_arguments.append(argument.select_for_configuration(params)) - else: - new_arguments.append(argument) - - return new_arguments - def compile_and_benchmark(self, kernel_source, gpu_args, params, kernel_options, to): # reset previous timers last_compilation_time = None @@ -469,7 +457,7 @@ def compile_and_benchmark(self, kernel_source, gpu_args, params, kernel_options, result[to.objective] = util.InvalidConfig() else: # Preprocess the argument list. This is required to deal with `MixedPrecisionArray`s - gpu_args = self.preprocess_gpu_arguments(gpu_args, params) + gpu_args = _preprocess_gpu_arguments(gpu_args, params) try: # compile the kernel @@ -586,7 +574,7 @@ def create_kernel_instance(self, kernel_source, kernel_options, params, verbose) kernel_string, name = wrap_templated_kernel(kernel_string, name) # Preprocess GPU arguments. Require for handling `Tunable` arguments - arguments = self.preprocess_gpu_arguments(kernel_options.arguments, params) + arguments = _preprocess_gpu_arguments(kernel_options.arguments, params) #collect everything we know about this instance and return it return KernelInstance(name, kernel_source, kernel_string, temp_files, threads, grid, params, arguments) @@ -603,7 +591,7 @@ def ready_argument_list(self, arguments): """ready argument list to be passed to the kernel, allocates gpu mem if necessary""" flat_args = [] - # Flatten all arguments into a single list. Required to deal with `MixedPrecisionArray`s + # Flatten all arguments into a single list. 
Required to deal with `Tunable`s for argument in arguments: if isinstance(argument, Tunable): flat_args.extend(argument.values()) @@ -644,6 +632,19 @@ def run_kernel(self, func, gpu_args, instance): return True +def _preprocess_gpu_arguments(old_arguments, params): + """ Get a flat list of arguments based on the configuration given by `params` """ + new_arguments = [] + + for argument in old_arguments: + if isinstance(argument, Tunable): + new_arguments.append(argument.select_for_configuration(params)) + else: + new_arguments.append(argument) + + return new_arguments + + def _default_verify_function(instance, answer, result_host, atol, verbose): """default verify function based on np.allclose""" diff --git a/test/test_accuracy.py b/test/test_accuracy.py new file mode 100644 index 000000000..8406d68aa --- /dev/null +++ b/test/test_accuracy.py @@ -0,0 +1,81 @@ +import kernel_tuner + +import numpy as np +import pytest + + +def test_tunable(): + from kernel_tuner.accuracy import Tunable + + # Test with string as key + x = Tunable("foo", dict(a=1, b=2)) + assert x(dict(foo="a")) == 1 + assert x(dict(foo="b")) == 2 + + with pytest.raises(KeyError): + assert x(dict(foo="c")) == 3 + + # Test with lambda as key + x = Tunable(lambda p: p["foo"] + p["bar"], dict(ab=1, bc=2)) + assert x(dict(foo="a", bar="b")) == 1 + assert x(dict(foo="b", bar="c")) == 2 + + with pytest.raises(KeyError): + assert x(dict(foo="c", bar="d")) == 3 + + +def test_to_float_dtype(): + from kernel_tuner.accuracy import _to_float_dtype + + ## Unfortunately, numpy does not offer bfloat16 + # assert _to_float_dtype("bfloat16") == np.bfloat16 + + assert _to_float_dtype("half") == np.float16 + assert _to_float_dtype("f16") == np.float16 + assert _to_float_dtype("float16") == np.float16 + + assert _to_float_dtype("float") == np.float32 + assert _to_float_dtype("f32") == np.float32 + assert _to_float_dtype("float32") == np.float32 + + assert _to_float_dtype("double") == np.float64 + assert _to_float_dtype("f64") == np.float64 + assert _to_float_dtype("float64") == np.float64 + + +def test_tunable_precision(): + from kernel_tuner.accuracy import TunablePrecision + + inputs = np.array([1, 2, 3], dtype=np.float64) + x = TunablePrecision( + "foo", inputs, dict(float16=np.half, float32=np.float32, float64=np.double) + ) + + assert np.all(x(dict(foo="float16")) == inputs) + assert x(dict(foo="float16")).dtype == np.half + + assert np.all(x(dict(foo="float32")) == inputs) + assert x(dict(foo="float32")).dtype == np.float32 + + assert np.all(x(dict(foo="float64")) == inputs) + assert x(dict(foo="float64")).dtype == np.double + + +def test_error_metric_from_name(): + from kernel_tuner.accuracy import error_metric_from_name + from math import sqrt + + eps = 0.1 + a = np.array([0, 1, 2, 3]) + b = np.array([1, 1, 2, 5]) + + assert error_metric_from_name("mse")(a, b) == pytest.approx(1.25) + assert error_metric_from_name("rmse")(a, b) == pytest.approx(sqrt(1.25)) + assert error_metric_from_name("nrmse")(a, b) == pytest.approx(sqrt(1.25 / 3.5)) + assert error_metric_from_name("mae")(a, b) == pytest.approx(0.75) + assert error_metric_from_name("mre", eps)(a, b) == pytest.approx(2.666666666666666) + assert error_metric_from_name("rmsre", eps)(a, b) == pytest.approx(5.011098792790969) + assert error_metric_from_name("male", eps)(a, b) == pytest.approx(0.3144002918554722) + assert error_metric_from_name("rmsle", eps)(a, b) == pytest.approx(1.224514683450185) + assert error_metric_from_name("maximum abs")(a, b) == pytest.approx(2) + assert 
error_metric_from_name("maximum rel", eps)(a, b) == pytest.approx(10) diff --git a/test/test_core.py b/test/test_core.py index a2c8b61ed..a8624470e 100644 --- a/test/test_core.py +++ b/test/test_core.py @@ -178,6 +178,24 @@ def test_default_verify_function_scalar(): assert core._default_verify_function(instance, answer, result_host, 0.1, False) +def test_preprocess_gpu_arguments(): + from kernel_tuner.accuracy import Tunable + + arguments = [ + Tunable("foo", dict(a=1, b=2)), + Tunable(lambda p: p["foo"] + p["bar"], dict(ax=3, bx=4)), + ] + + params = dict(foo="a", bar="x") + + expected = [ + 1, + 3, + ] + + assert core._preprocess_gpu_arguments(arguments, params) == expected + + def test_split_argument_list(): test_string = "T *c, const T *__restrict__ a, T\n *\n b\n , int n" ans1, ans2 = core.split_argument_list([s.strip() for s in test_string.split(',')]) From 24346cd41a855d24dba9a47365379e67cbc1b492 Mon Sep 17 00:00:00 2001 From: stijn Date: Mon, 2 Oct 2023 13:23:10 +0200 Subject: [PATCH 23/24] Update accuracy example to use currect API --- examples/cuda/accuracy.py | 24 ++++++++---------------- 1 file changed, 8 insertions(+), 16 deletions(-) diff --git a/examples/cuda/accuracy.py b/examples/cuda/accuracy.py index 87485493f..bf00aa72b 100644 --- a/examples/cuda/accuracy.py +++ b/examples/cuda/accuracy.py @@ -3,19 +3,7 @@ import numpy from pprint import pprint from kernel_tuner import tune_kernel -from kernel_tuner.accuracy import TunablePrecision -from kernel_tuner.observers import AccuracyObserver - - -class MyObserver(AccuracyObserver): - def __init__(self): - self.error = None - - def process_kernel_output(self, answer, outputs): - self.error = numpy.average((answer[-1] - outputs[-1].astype(numpy.float64))**2) - - def get_results(self): - return dict(error=self.error) +from kernel_tuner.accuracy import TunablePrecision, AccuracyObserver def tune(): @@ -50,10 +38,13 @@ def tune(): answer = [None, None, None, a + b] tune_params = dict() - tune_params["block_size_x"] = [128+64*i for i in range(15)] + tune_params["block_size_x"] = [32, 64, 128, 256, 512, 1024] tune_params["float_type"] = ["float", "double", "half"] - observers = [MyObserver()] + observers = [ + AccuracyObserver("RMSE", "error_rmse"), + AccuracyObserver("MRE", "error_relative"), + ] results, env = tune_kernel( "vector_add", @@ -63,7 +54,8 @@ def tune(): tune_params, answer=answer, observers=observers, - lang="cupy") + lang="CUDA", + ) pprint(results) From 647c04bb16fe6d3c3ee0ac445f84cc5ea7d7c97f Mon Sep 17 00:00:00 2001 From: stijn Date: Mon, 2 Oct 2023 13:42:06 +0200 Subject: [PATCH 24/24] Fix failing test for `error_metric_from_name` --- test/test_accuracy.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/test_accuracy.py b/test/test_accuracy.py index 8406d68aa..1e5070637 100644 --- a/test/test_accuracy.py +++ b/test/test_accuracy.py @@ -76,6 +76,6 @@ def test_error_metric_from_name(): assert error_metric_from_name("mre", eps)(a, b) == pytest.approx(2.666666666666666) assert error_metric_from_name("rmsre", eps)(a, b) == pytest.approx(5.011098792790969) assert error_metric_from_name("male", eps)(a, b) == pytest.approx(0.3144002918554722) - assert error_metric_from_name("rmsle", eps)(a, b) == pytest.approx(1.224514683450185) + assert error_metric_from_name("rmsle", eps)(a, b) == pytest.approx(0.5317999700319226) assert error_metric_from_name("maximum abs")(a, b) == pytest.approx(2) assert error_metric_from_name("maximum rel", eps)(a, b) == pytest.approx(10)
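For reference, a minimal sketch of how the metric-name normalization introduced in patch 19 behaves when constructing error metrics. The inputs and expected values mirror test/test_accuracy.py; the values in the comments are assumptions based on those tests, not output taken from the patches.

import numpy as np
from kernel_tuner.accuracy import error_metric_from_name

a = np.array([0.0, 1.0, 2.0, 3.0])  # reference output
b = np.array([1.0, 1.0, 2.0, 5.0])  # kernel output

# "Root-Mean-Square Error" is lowercased, stripped of the word "error",
# and normalized to "root mean squared", which maps to the RMSE metric.
rmse = error_metric_from_name("Root-Mean-Square Error")

# "avg abs" expands to "mean absolute", i.e. the MAE metric.
mae = error_metric_from_name("avg abs")

print(rmse(a, b))  # sqrt(1.25), approximately 1.118
print(mae(a, b))   # 0.75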
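Likewise, a sketch of a user-defined output observer against the renamed API (``OutputObserver`` with ``process_output``). The class name and result key are hypothetical; ``get_results`` is assumed to follow the same contract as the ``MyObserver`` example that patch 23 removes from examples/cuda/accuracy.py.

import numpy as np
from kernel_tuner.observers import OutputObserver


class MaxAbsErrorObserver(OutputObserver):
    # Hypothetical observer: records the maximum absolute error over all
    # output arguments for which a reference answer was given.

    def __init__(self, key="max_abs_error"):
        self.key = key
        self.result = None

    def process_output(self, answer, output):
        # `answer` and `output` are lists of kernel arguments; entries are
        # None for arguments that have no reference value in `answer`.
        errors = [
            np.amax(np.abs(np.asarray(ref, dtype=np.float64) - np.asarray(out, dtype=np.float64)))
            for ref, out in zip(answer, output)
            if ref is not None
        ]
        self.result = float(max(errors))

    def get_results(self):
        return {self.key: self.result}

Passed to ``tune_kernel`` via ``observers=[MaxAbsErrorObserver()]`` together with an ``answer`` list, this reports a ``max_abs_error`` entry per configuration, alongside or instead of the built-in ``AccuracyObserver``.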