From 4daae027b78cd2bbf2a0c53e116fd9c5e10dad90 Mon Sep 17 00:00:00 2001 From: stijn Date: Wed, 8 Mar 2023 13:34:48 +0100 Subject: [PATCH 01/24] Add `TunablePrecision` object and example on how to use it --- examples/cuda/accuracy.py | 46 +++++++++++++++++++++ kernel_tuner/accuracy.py | 86 +++++++++++++++++++++++++++++++++++++++ kernel_tuner/core.py | 55 +++++++++++++++++++++---- 3 files changed, 180 insertions(+), 7 deletions(-) create mode 100644 examples/cuda/accuracy.py create mode 100644 kernel_tuner/accuracy.py diff --git a/examples/cuda/accuracy.py b/examples/cuda/accuracy.py new file mode 100644 index 000000000..59655ecad --- /dev/null +++ b/examples/cuda/accuracy.py @@ -0,0 +1,46 @@ +#!/usr/bin/env python + +import numpy +from pprint import pprint +from kernel_tuner import tune_kernel +from kernel_tuner.accuracy import TunablePrecision + +def tune(): + kernel_string = """ + #include + using half = __half; + + __global__ void vector_add(int n, float_type* left, float_type* right, float_type* output) { + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < n) { + output[i] = left[i] + right[i]; + } + } + """ + + size = 10000000 + + n = numpy.int32(size) + a = numpy.random.randn(size).astype(numpy.float64) + b = numpy.random.randn(size).astype(numpy.float64) + c = numpy.zeros_like(b) + + args = [ + n, + TunablePrecision("float_type", a), + TunablePrecision("float_type", b), + TunablePrecision("float_type", c), + ] + + tune_params = dict() + tune_params["block_size_x"] = [128+64*i for i in range(15)] + tune_params["float_type"] = ["float", "double", "half"] + + results, env = tune_kernel("vector_add", kernel_string, size, args, tune_params) + + pprint(results) + + +if __name__ == "__main__": + tune() diff --git a/kernel_tuner/accuracy.py b/kernel_tuner/accuracy.py new file mode 100644 index 000000000..4fae643fc --- /dev/null +++ b/kernel_tuner/accuracy.py @@ -0,0 +1,86 @@ +from collections import UserDict +from typing import Dict +import numpy as np + + +class Tunable(UserDict): + def __init__(self, param_key: str, arrays: Dict): + """Create a new ``Tunable``. + + ``Tunable`` can be used one of the input arguments when tuning kernels. It can contain + several arrays and the array that will be used during benchmark of one kernel configuration + can be selected based on a tunable parameter. + + Example + ------- + For example, it is possible to define a tunable parameter called ``matrix_layout`` and then + tunable for Fortran-order or C-order memory layout by passing the following object as a + kernel argument: + + ``` + Tunable("matrix_layout", dict("c"=matrix, "f"=matrix.transpose())) + ``` + + :param param_key: The tunable parameter used to select the array for benchmarking. + :param arrays: A dictionary that maps the parameter value to arrays. 
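+
+        A list or tuple may also be passed; in that case the positional indices
+        are used as the parameter values (mirroring the ``dict(enumerate(arrays))``
+        conversion below), for example
+        ``Tunable("matrix_layout", [matrix, matrix.transpose()])``.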
+ """ + if isinstance(arrays, (tuple, list)): + arrays = dict(enumerate(arrays)) + + super().__init__(arrays) + self.param_key = param_key + + def select_for_configuration(self, params): + if callable(self.param_key): + key = self.param_key(params) + elif self.param_key in params: + key = params[self.param_key] + else: + key = eval(self.param_key, params, params) + + if key not in self: + list = ", ".join(map(str, self.keys())) + raise KeyError(f"'{key}' is not a valid parameter value, should be one of: {list}") + + return self[key] + + def __call__(self, params): + return self.select_for_configuration(params) + + +class TunablePrecision(Tunable): + def __init__(self, param_key: str, array: np.ndarray, dtypes: Dict[str, np.dtype] = None): + """ + Create a new ``TunablePrecision``. + + ``TunablePrecision`` can be used one of the input arguments when tuning kernels. It + contains the same array data, but stored using different levels of precision. This can + be used to tune the optimal precision for a kernel argument. + + :param param_key: The tunable parameter used to select the precision for benchmarking. + :param array: The input array. Will be converted to the given precision levels. + :param dtypes: Dictionary that maps names to numpy data types. + """ + # If no dtypes are given, generate a default list + if not dtypes: + dtypes = dict( + half=np.half, + float=np.single, + double=np.double) + + # Try to get bfloat16 from tensorflow if available. + try: + import tensorflow + dtypes["bfloat16"] = tensorflow.bfloat16.as_numpy_dtype + except ImportError: + pass # Ignore error if tensorflow is not available + + # If dtype is a list, convert it to a dictionary + if isinstance(dtypes, (list, tuple)): + dtypes = dict((name, np.dtype(name)) for name in dtypes) + + arrays = dict() + for precision, dtype in dtypes.items(): + arrays[precision] = np.array(array).astype(dtype) + + super().__init__(param_key, arrays) diff --git a/kernel_tuner/core.py b/kernel_tuner/core.py index 1eeeb0d8d..7194456ba 100644 --- a/kernel_tuner/core.py +++ b/kernel_tuner/core.py @@ -6,6 +6,8 @@ import re import numpy as np +from kernel_tuner.accuracy import Tunable + try: import cupy as cp except ImportError: @@ -239,7 +241,7 @@ def __init__(self, kernel_source, device=0, platform=0, quiet=False, compiler=No elif lang.upper() == "OPENCL": dev = OpenCLFunctions(device, platform, compiler_options=compiler_options, iterations=iterations, observers=observers) elif lang.upper() in ["C", "FORTRAN"]: - dev = CFunctions(compiler=compiler, compiler_options=compiler_options, iterations=iterations) + dev = CFunctions(compiler=compiler, compiler_options=compiler_options, iterations=iterations, observers=observers) elif lang.upper() == "HIP": dev = HipFunctions(device, compiler_options=compiler_options, iterations=iterations, observers=observers) else: @@ -409,24 +411,40 @@ def check_kernel_output(self, func, gpu_args, instance, answer, atol, verify, ve if not correct: raise RuntimeError("Kernel result verification failed for: " + util.get_config_string(instance.params)) - def compile_and_benchmark(self, kernel_source, gpu_args, params, kernel_options, to): - """ Compile and benchmark a kernel instance based on kernel strings and parameters """ - instance_string = util.get_instance_string(params) + def preprocess_gpu_arguments(self, old_arguments, params): + """ Get a flat list of arguments based on the configuration given by `params` """ + new_arguments = [] + + for argument in old_arguments: + if isinstance(argument, Tunable): + 
new_arguments.append(argument.select_for_configuration(params)) + else: + new_arguments.append(argument) + + return new_arguments + def compile_and_benchmark(self, kernel_source, gpu_args, params, kernel_options, to): # reset previous timers last_compilation_time = None last_verification_time = None last_benchmark_time = None - logging.debug('compile_and_benchmark ' + instance_string) - verbose = to.verbose result = {} + # Compile and benchmark a kernel instance based on kernel strings and parameters + instance_string = util.get_instance_string(params) + + logging.debug('compile_and_benchmark ' + instance_string) + instance = self.create_kernel_instance(kernel_source, kernel_options, params, verbose) if isinstance(instance, util.ErrorConfig): result[to.objective] = util.InvalidConfig() else: + + # Preprocess the argument list. This is required to deal with `MixedPrecisionArray`s + gpu_args = self.preprocess_gpu_arguments(gpu_args, params) + try: # compile the kernel start_compilation = time.perf_counter() @@ -554,7 +572,30 @@ def memcpy_dtoh(self, dest, src): def ready_argument_list(self, arguments): """ready argument list to be passed to the kernel, allocates gpu mem if necessary""" - return self.dev.ready_argument_list(arguments) + flat_args = [] + + # Flatten all arguments into a single list. Required to deal with `MixedPrecisionArray`s + for argument in arguments: + if isinstance(argument, Tunable): + flat_args.extend(argument.values()) + else: + flat_args.append(argument) + + flag_gpu_args = iter(self.dev.ready_argument_list(flat_args)) + + # Unflatten the arguments back into arrays. + gpu_args = [] + for argument in arguments: + if isinstance(argument, Tunable): + arrays = dict() + for key in argument: + arrays[key] = next(flag_gpu_args) + + gpu_args.append(Tunable(argument.param_key, arrays)) + else: + gpu_args.append(next(flag_gpu_args)) + + return gpu_args def run_kernel(self, func, gpu_args, instance): """ Run a compiled kernel instance on a device """ From ee93e2a09af8a9d5002fd8d0a6aa7e6ac39dbc34 Mon Sep 17 00:00:00 2001 From: stijn Date: Wed, 15 Mar 2023 15:13:19 +0100 Subject: [PATCH 02/24] Add initial support for `AccuracyObserver` --- examples/cuda/accuracy.py | 32 +++++++++++++++++-- kernel_tuner/accuracy.py | 30 ++++++++++++++++-- kernel_tuner/core.py | 65 ++++++++++++++++++++++++++++----------- kernel_tuner/util.py | 7 +++++ 4 files changed, 111 insertions(+), 23 deletions(-) diff --git a/examples/cuda/accuracy.py b/examples/cuda/accuracy.py index 59655ecad..87485493f 100644 --- a/examples/cuda/accuracy.py +++ b/examples/cuda/accuracy.py @@ -4,13 +4,27 @@ from pprint import pprint from kernel_tuner import tune_kernel from kernel_tuner.accuracy import TunablePrecision +from kernel_tuner.observers import AccuracyObserver + + +class MyObserver(AccuracyObserver): + def __init__(self): + self.error = None + + def process_kernel_output(self, answer, outputs): + self.error = numpy.average((answer[-1] - outputs[-1].astype(numpy.float64))**2) + + def get_results(self): + return dict(error=self.error) + def tune(): kernel_string = """ #include using half = __half; - __global__ void vector_add(int n, float_type* left, float_type* right, float_type* output) { + template + __global__ void vector_add(int n, const T* left, const T* right, T* output) { int i = blockDim.x * blockIdx.x + threadIdx.x; if (i < n) { @@ -19,7 +33,7 @@ def tune(): } """ - size = 10000000 + size = 100000000 n = numpy.int32(size) a = numpy.random.randn(size).astype(numpy.float64) @@ -33,11 +47,23 @@ def tune(): 
TunablePrecision("float_type", c), ] + answer = [None, None, None, a + b] + tune_params = dict() tune_params["block_size_x"] = [128+64*i for i in range(15)] tune_params["float_type"] = ["float", "double", "half"] - results, env = tune_kernel("vector_add", kernel_string, size, args, tune_params) + observers = [MyObserver()] + + results, env = tune_kernel( + "vector_add", + kernel_string, + size, + args, + tune_params, + answer=answer, + observers=observers, + lang="cupy") pprint(results) diff --git a/kernel_tuner/accuracy.py b/kernel_tuner/accuracy.py index 4fae643fc..711902689 100644 --- a/kernel_tuner/accuracy.py +++ b/kernel_tuner/accuracy.py @@ -2,6 +2,8 @@ from typing import Dict import numpy as np +from kernel_tuner.observers import AccuracyObserver + class Tunable(UserDict): def __init__(self, param_key: str, arrays: Dict): @@ -70,8 +72,9 @@ def __init__(self, param_key: str, array: np.ndarray, dtypes: Dict[str, np.dtype # Try to get bfloat16 from tensorflow if available. try: - import tensorflow - dtypes["bfloat16"] = tensorflow.bfloat16.as_numpy_dtype + #import tensorflow + #dtypes["bfloat16"] = tensorflow.bfloat16.as_numpy_dtype + pass except ImportError: pass # Ignore error if tensorflow is not available @@ -84,3 +87,26 @@ def __init__(self, param_key: str, array: np.ndarray, dtypes: Dict[str, np.dtype arrays[precision] = np.array(array).astype(dtype) super().__init__(param_key, arrays) + + +class CalculateErrorObserver(AccuracyObserver): + def __init__(self, metric=None, key="error"): + # The default metric is the mean squared error + if metric is None: + metric = lambda a, b: np.average(np.square(a - b)) + + self.key = key + self.function = metric + self.result = None + + def process_kernel_output(self, answers, outputs): + errors = [] + + for answer, output in zip(answers, outputs): + if answer is not None: + errors.append(self.metric(answer, output)) + + self.result = max(errors) + + def get_results(self): + return dict([(self.key, self.result)]) diff --git a/kernel_tuner/core.py b/kernel_tuner/core.py index 7194456ba..128d88eb7 100644 --- a/kernel_tuner/core.py +++ b/kernel_tuner/core.py @@ -14,7 +14,7 @@ cp = np from kernel_tuner.observers.nvml import NVMLObserver -from kernel_tuner.observers.observer import ContinuousObserver +from kernel_tuner.observers.observer import ContinuousObserver, AccuracyObserver from kernel_tuner.backends.cupy import CupyFunctions from kernel_tuner.backends.pycuda import PyCudaFunctions from kernel_tuner.backends.nvcuda import CudaFunctions @@ -250,6 +250,7 @@ def __init__(self, kernel_source, device=0, platform=0, quiet=False, compiler=No #look for NVMLObserver in observers, if present, enable special tunable parameters through nvml self.use_nvml = False self.continuous_observers = [] + self.accuracy_observers = [] if observers: for obs in observers: if isinstance(obs, NVMLObserver): @@ -257,6 +258,9 @@ def __init__(self, kernel_source, device=0, platform=0, quiet=False, compiler=No self.use_nvml = True if hasattr(obs, "continuous_observer"): self.continuous_observers.append(obs.continuous_observer) + if isinstance(obs, AccuracyObserver): + self.accuracy_observers.append(obs) + self.iterations = iterations @@ -337,6 +341,10 @@ def benchmark(self, func, gpu_args, instance, verbose, objective): if "nvml_mem_clock" in instance.params: self.nvml.mem_clock = instance.params["nvml_mem_clock"] + # Call the observers to register the configuration to be benchmarked + for obs in self.dev.observers: + obs.register_configuration(instance.params) + result = 
{} try: self.benchmark_default(func, gpu_args, instance.threads, instance.grid, result) @@ -372,13 +380,18 @@ def check_kernel_output(self, func, gpu_args, instance, answer, atol, verify, ve logging.debug('check_kernel_output') #if not using custom verify function, check if the length is the same - if not verify and len(instance.arguments) != len(answer): - raise TypeError("The length of argument list and provided results do not match.") + if answer: + if len(instance.arguments) != len(answer): + raise TypeError("The length of argument list and provided results do not match.") + + should_sync = [answer[i] is not None for i, arg in enumerate(instance.arguments)] + else: + should_sync = [isinstance(arg, (np.ndarray, cp.ndarray, torch.Tensor)) for arg in instance.arguments] #re-copy original contents of output arguments to GPU memory, to overwrite any changes #by earlier kernel runs for i, arg in enumerate(instance.arguments): - if (verify or answer[i] is not None) and isinstance(arg, (np.ndarray, cp.ndarray, torch.Tensor)): + if should_sync[i]: self.dev.memcpy_htod(gpu_args[i], arg) #run the kernel @@ -389,24 +402,38 @@ def check_kernel_output(self, func, gpu_args, instance, answer, atol, verify, ve #retrieve gpu results to host memory result_host = [] for i, arg in enumerate(instance.arguments): - if (verify or answer[i] is not None) and isinstance(arg, (np.ndarray, cp.ndarray)): - result_host.append(np.zeros_like(arg)) - self.dev.memcpy_dtoh(result_host[-1], gpu_args[i]) - elif isinstance(arg, torch.Tensor) and isinstance(answer[i], torch.Tensor): - if not answer[i].is_cuda: - #if the answer is on the host, copy gpu output to host as well - result_host.append(torch.zeros_like(answer[i])) - self.dev.memcpy_dtoh(result_host[-1], gpu_args[i].tensor) + if should_sync[i]: + if isinstance(arg, (np.ndarray, cp.ndarray)): + result_host.append(np.zeros_like(arg)) + self.dev.memcpy_dtoh(result_host[-1], gpu_args[i]) + elif isinstance(arg, torch.Tensor) and isinstance(answer[i], torch.Tensor): + if not answer[i].is_cuda: + #if the answer is on the host, copy gpu output to host as well + result_host.append(torch.zeros_like(answer[i])) + self.dev.memcpy_dtoh(result_host[-1], gpu_args[i].tensor) + else: + result_host.append(gpu_args[i].tensor) else: - result_host.append(gpu_args[i].tensor) + # We should sync this argument, but we do not know how to transfer this type of argument + # What do we do? Should we throw an error? + result_host.append(None) else: result_host.append(None) - #if the user has specified a custom verify function, then call it, else use default based on numpy allclose + # Call the accuracy observers + for obs in self.accuracy_observers: + obs.process_kernel_output(answer, result_host) + + # There are three scenarios: + # - if there is a custom verify function, call that. + # - otherwise, if there are no accuracy observer, call the default verify function + # - otherwise, the answer is correct (we assume the accuracy observers verified the output) if verify: correct = verify(answer, result_host, atol=atol) - else: + elif not self.accuracy_observers: correct = _default_verify_function(instance, answer, result_host, atol, verbose) + else: + correct = True if not correct: raise RuntimeError("Kernel result verification failed for: " + util.get_config_string(instance.params)) @@ -444,7 +471,6 @@ def compile_and_benchmark(self, kernel_source, gpu_args, params, kernel_options, # Preprocess the argument list. 
This is required to deal with `MixedPrecisionArray`s gpu_args = self.preprocess_gpu_arguments(gpu_args, params) - try: # compile the kernel start_compilation = time.perf_counter() @@ -466,7 +492,7 @@ def compile_and_benchmark(self, kernel_source, gpu_args, params, kernel_options, last_compilation_time = 1000 * (time.perf_counter() - start_compilation) # test kernel for correctness - if func and (to.answer or to.verify): + if func and (to.answer or to.verify or self.accuracy_observers): start_verification = time.perf_counter() self.check_kernel_output(func, gpu_args, instance, to.answer, to.atol, to.verify, verbose) last_verification_time = 1000 * (time.perf_counter() - start_verification) @@ -559,8 +585,11 @@ def create_kernel_instance(self, kernel_source, kernel_options, params, verbose) if kernel_source.lang in ["CUDA", "NVCUDA"] and "<" in name and ">" in name: kernel_string, name = wrap_templated_kernel(kernel_string, name) + # Preprocess GPU arguments. Require for handling `Tunable` arguments + arguments = self.preprocess_gpu_arguments(kernel_options.arguments, params) + #collect everything we know about this instance and return it - return KernelInstance(name, kernel_source, kernel_string, temp_files, threads, grid, params, kernel_options.arguments) + return KernelInstance(name, kernel_source, kernel_string, temp_files, threads, grid, params, arguments) def get_environment(self): """Return dictionary with information about the environment""" diff --git a/kernel_tuner/util.py b/kernel_tuner/util.py index 0e3dee4d6..cadc8ee14 100644 --- a/kernel_tuner/util.py +++ b/kernel_tuner/util.py @@ -14,6 +14,9 @@ import numpy as np from constraint import Constraint, AllDifferentConstraint, AllEqualConstraint, MaxSumConstraint, ExactSumConstraint, MinSumConstraint, InSetConstraint, NotInSetConstraint, SomeInSetConstraint, SomeNotInSetConstraint, FunctionConstraint + +from kernel_tuner.accuracy import Tunable + try: import cupy as cp except ImportError: @@ -124,6 +127,10 @@ def check_argument_list(kernel_name, kernel_string, args): for (i, arg) in enumerate(args): kernel_argument = arguments[i] + # Fix to deal with tunable arguments + if isinstance(arg, Tunable): + continue + if not isinstance(arg, (np.ndarray, np.generic, cp.ndarray, torch.Tensor)): raise TypeError("Argument at position " + str(i) + " of type: " + str(type(arg)) + " should be of type np.ndarray or numpy scalar") From 2b99fc7abe3dcaa85085d1139dd9c8cd9951dab6 Mon Sep 17 00:00:00 2001 From: stijn Date: Wed, 29 Mar 2023 13:38:53 +0200 Subject: [PATCH 03/24] Extend documentation in `accuracy.py` --- kernel_tuner/accuracy.py | 87 ++++++++++++++++++++++++++-------------- 1 file changed, 57 insertions(+), 30 deletions(-) diff --git a/kernel_tuner/accuracy.py b/kernel_tuner/accuracy.py index 711902689..e78797809 100644 --- a/kernel_tuner/accuracy.py +++ b/kernel_tuner/accuracy.py @@ -7,23 +7,24 @@ class Tunable(UserDict): def __init__(self, param_key: str, arrays: Dict): - """Create a new ``Tunable``. - - ``Tunable`` can be used one of the input arguments when tuning kernels. It can contain - several arrays and the array that will be used during benchmark of one kernel configuration - can be selected based on a tunable parameter. + """The ``Tunable`` object is used as an input argument when tuning + kernels. It is a container that holds several arrays internally and + selects one array during benchmarking based on a tunable parameter. 
Example ------- - For example, it is possible to define a tunable parameter called ``matrix_layout`` and then - tunable for Fortran-order or C-order memory layout by passing the following object as a - kernel argument: + Consider this example:: + + arg = Tunable("matrix_layout", dict("c"=matrix, "f"=matrix.transpose())) - ``` - Tunable("matrix_layout", dict("c"=matrix, "f"=matrix.transpose())) - ``` + In this example, we create a Tunable object that selects either matrix + or matrix.transpose() for benchmarking, depending on the value of the + tunable parameter "matrix_layout". The arrays argument is a dictionary + that maps the tunable parameter values "c" and "f" to the arrays matrix + and matrix.transpose(), respectively. During benchmarking, the Tunable + object selects the appropriate array based on the value of "matrix_layout". - :param param_key: The tunable parameter used to select the array for benchmarking. + :param param_key: : The tunable parameter used to select the array for benchmarking. :param arrays: A dictionary that maps the parameter value to arrays. """ if isinstance(arrays, (tuple, list)): @@ -42,7 +43,9 @@ def select_for_configuration(self, params): if key not in self: list = ", ".join(map(str, self.keys())) - raise KeyError(f"'{key}' is not a valid parameter value, should be one of: {list}") + raise KeyError( + f"'{key}' is not a valid parameter value, should be one of: {list}" + ) return self[key] @@ -51,29 +54,39 @@ def __call__(self, params): class TunablePrecision(Tunable): - def __init__(self, param_key: str, array: np.ndarray, dtypes: Dict[str, np.dtype] = None): - """ - Create a new ``TunablePrecision``. + def __init__( + self, param_key: str, array: np.ndarray, dtypes: Dict[str, np.dtype] = None + ): + """The ``Tunable`` object is used as an input argument when tuning + kernels. It is a container that internally holds several arrays + containing the same data, but stored in using different levels of + precision. During benchamrking, one array is selected based on a + tunable parameter ``param_key``. + + Example + ------- + Consider this example:: - ``TunablePrecision`` can be used one of the input arguments when tuning kernels. It - contains the same array data, but stored using different levels of precision. This can - be used to tune the optimal precision for a kernel argument. + arg = TunablePrecision("matrix_type", matrix) - :param param_key: The tunable parameter used to select the precision for benchmarking. - :param array: The input array. Will be converted to the given precision levels. - :param dtypes: Dictionary that maps names to numpy data types. + This creates a ``TunablePrecision`` argument that selects the required + floating-point precision for ``matrix`` based on the tunable parameter + ``"matrix_type"``. + + :param param_key: The tunable parameter used to select the level of precision. + :param array: The input array. It will automatically be converted to + all data types given by ``dtypes``. + :param dtypes: Dictionary that maps names to numpy data types. The default + types are ``double``, ``float``, and ``half``. """ # If no dtypes are given, generate a default list if not dtypes: - dtypes = dict( - half=np.half, - float=np.single, - double=np.double) + dtypes = dict(half=np.half, float=np.single, double=np.double) # Try to get bfloat16 from tensorflow if available. 
try: - #import tensorflow - #dtypes["bfloat16"] = tensorflow.bfloat16.as_numpy_dtype + # import tensorflow + # dtypes["bfloat16"] = tensorflow.bfloat16.as_numpy_dtype pass except ImportError: pass # Ignore error if tensorflow is not available @@ -89,14 +102,28 @@ def __init__(self, param_key: str, array: np.ndarray, dtypes: Dict[str, np.dtype super().__init__(param_key, arrays) -class CalculateErrorObserver(AccuracyObserver): +class ErrorMetricObserver(AccuracyObserver): + """An ``AccuracyObserver`` that measure the error of the outputs produced + by a kernel by comparing it against reference outputs. + + By default, it uses the mean-squared error (MSE) and appends this to + the results with a metric called ``error``. + """ + def __init__(self, metric=None, key="error"): + """Create a new ``AccuracyObserver``. + + :param metric: The error metric. Should be function that accepts two numpy + arrays as arguments (the reference output and the kernel output) + :param key: The name of this metric in the results. + """ + # The default metric is the mean squared error if metric is None: metric = lambda a, b: np.average(np.square(a - b)) self.key = key - self.function = metric + self.metric = metric self.result = None def process_kernel_output(self, answers, outputs): From 777a2ff8fcca732685382f9f10c2a60f8e026c03 Mon Sep 17 00:00:00 2001 From: stijn Date: Wed, 5 Apr 2023 15:57:57 +0200 Subject: [PATCH 04/24] Use `bfloat16` from `bfloat16` package instead of tensorflow --- kernel_tuner/accuracy.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/kernel_tuner/accuracy.py b/kernel_tuner/accuracy.py index e78797809..a0a04a123 100644 --- a/kernel_tuner/accuracy.py +++ b/kernel_tuner/accuracy.py @@ -83,10 +83,10 @@ def __init__( if not dtypes: dtypes = dict(half=np.half, float=np.single, double=np.double) - # Try to get bfloat16 from tensorflow if available. + # Try to get bfloat16 if available. 
try: - # import tensorflow - # dtypes["bfloat16"] = tensorflow.bfloat16.as_numpy_dtype + from bfloat16 import bfloat16 + dtypes["bfloat16"] = bfloat16 pass except ImportError: pass # Ignore error if tensorflow is not available From 95c2d99a1ed70f42ee2719149c7ff8c4ee6849fc Mon Sep 17 00:00:00 2001 From: stijn Date: Wed, 19 Apr 2023 10:44:33 +0200 Subject: [PATCH 05/24] Make `SequentialRunner` always compute metrics for every configuration --- kernel_tuner/runners/sequential.py | 15 ++++++++------- kernel_tuner/util.py | 7 +++---- 2 files changed, 11 insertions(+), 11 deletions(-) diff --git a/kernel_tuner/runners/sequential.py b/kernel_tuner/runners/sequential.py index 99f2ac972..5c9d438f3 100644 --- a/kernel_tuner/runners/sequential.py +++ b/kernel_tuner/runners/sequential.py @@ -92,17 +92,14 @@ def run(self, parameter_space, tuning_options): warmup_time = 1e3 * (perf_counter() - warmup_time) result = self.dev.compile_and_benchmark(self.kernel_source, self.gpu_args, params, self.kernel_options, tuning_options) - params.update(result) - # only compute metrics on configs that have not errored - if tuning_options.objective in result and isinstance(result[tuning_options.objective], ErrorConfig): + if isinstance(result.get(tuning_options.objective), ErrorConfig): logging.debug('kernel configuration was skipped silently due to compile or runtime failure') - elif tuning_options.metrics: - params = process_metrics(params, tuning_options.metrics) - # print configuration to the console - print_config_output(tuning_options.tune_params, params, self.quiet, tuning_options.metrics, self.units) + # only compute metrics on configs that have not errored + if not isinstance(params.get(tuning_options.objective), ErrorConfig): + params = process_metrics(params, tuning_options.metrics) # get the framework time by estimating based on other times total_time = 1000 * (perf_counter() - self.start_time) - warmup_time @@ -112,6 +109,10 @@ def run(self, parameter_space, tuning_options): self.start_time = perf_counter() if result: + # print configuration to the console + print_config_output(tuning_options.tune_params, params, self.quiet, tuning_options.metrics, self.units) + + # add configuration to cache store_cache(x_int, params, tuning_options) # all visited configurations are added to results to provide a trace for optimization strategies diff --git a/kernel_tuner/util.py b/kernel_tuner/util.py index cadc8ee14..fbf949ffd 100644 --- a/kernel_tuner/util.py +++ b/kernel_tuner/util.py @@ -518,10 +518,9 @@ def process_metrics(params, metrics): value = v(params) else: raise ValueError("metric dicts values should be strings or callable") - if not k in params: - params[k] = value - else: - raise ValueError("metric dicts keys should not already exist in params") + + # We overwrite any existing values for the given key + params[k] = value return params From 16d17645d859a6db1d9a40e39fdec9d3777f95ec Mon Sep 17 00:00:00 2001 From: stijn Date: Wed, 19 Apr 2023 11:09:04 +0200 Subject: [PATCH 06/24] Fix test for `process_metrics` to allow overwriting existing results --- kernel_tuner/runners/sequential.py | 2 +- test/test_util_functions.py | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/kernel_tuner/runners/sequential.py b/kernel_tuner/runners/sequential.py index 5c9d438f3..3ddcc827a 100644 --- a/kernel_tuner/runners/sequential.py +++ b/kernel_tuner/runners/sequential.py @@ -98,7 +98,7 @@ def run(self, parameter_space, tuning_options): logging.debug('kernel configuration was skipped silently due to 
compile or runtime failure') # only compute metrics on configs that have not errored - if not isinstance(params.get(tuning_options.objective), ErrorConfig): + if tuning_options.metrics and not isinstance(params.get(tuning_options.objective), ErrorConfig): params = process_metrics(params, tuning_options.metrics) # get the framework time by estimating based on other times diff --git a/test/test_util_functions.py b/test/test_util_functions.py index b3896a1ad..378bca229 100644 --- a/test/test_util_functions.py +++ b/test/test_util_functions.py @@ -667,15 +667,15 @@ def test_process_metrics(): with pytest.raises(ValueError): params = process_metrics(params, {}) - # test ValueError is raised when b already exists in params + # test if a metric overrides any existing metrics params = { "x": 15, "b": 12 } metrics = OrderedDict() metrics["b"] = "x" - with pytest.raises(ValueError): - params = process_metrics(params, metrics) + params = process_metrics(params, metrics) + assert params["b"] == 15 def test_parse_restrictions(): From 74f87e0b89649ad7328985c198a0008c61bb0348 Mon Sep 17 00:00:00 2001 From: stijn Date: Wed, 19 Apr 2023 11:14:28 +0200 Subject: [PATCH 07/24] Re-add `Observer.register_configuration` that was lost due in merge with master --- kernel_tuner/observers/observer.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/kernel_tuner/observers/observer.py b/kernel_tuner/observers/observer.py index c3203afb8..203805dd9 100644 --- a/kernel_tuner/observers/observer.py +++ b/kernel_tuner/observers/observer.py @@ -8,6 +8,11 @@ def register_device(self, dev): """Sets self.dev, for inspection by the observer at various points during benchmarking""" self.dev = dev + def register_configuration(self, params): + """Called once before benchmarking of a single kernel configuration. The `params` argument is a `dict` + that stores the configuration parameters.""" + pass + def before_start(self): """before start is called every iteration before the kernel starts""" pass From ee351be2ec15d05e777f0878c4e1c8c47b060d59 Mon Sep 17 00:00:00 2001 From: stijn Date: Tue, 9 May 2023 12:02:34 +0200 Subject: [PATCH 08/24] Support scalars in `TunablePrecision` --- kernel_tuner/accuracy.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/kernel_tuner/accuracy.py b/kernel_tuner/accuracy.py index a0a04a123..f3ba9075e 100644 --- a/kernel_tuner/accuracy.py +++ b/kernel_tuner/accuracy.py @@ -97,7 +97,13 @@ def __init__( arrays = dict() for precision, dtype in dtypes.items(): - arrays[precision] = np.array(array).astype(dtype) + # We convert the array into a `np.ndarray` by using `np.array`. + # However, if the value is a numpy scalar, then we do not want to + # convert it into an array but instead keep the original value + if not np.isinstance(array, np.generic) + array = np.array(array) + + arrays[precision] = array.astype(dtype) super().__init__(param_key, arrays) From 0789e01e58f6d3c81142529d8f7615c5074ff9ef Mon Sep 17 00:00:00 2001 From: stijn Date: Tue, 9 May 2023 12:03:13 +0200 Subject: [PATCH 09/24] Rename `flag_gpu_args` to `flat_gpu_args` in `core.py` --- kernel_tuner/accuracy.py | 2 +- kernel_tuner/core.py | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/kernel_tuner/accuracy.py b/kernel_tuner/accuracy.py index f3ba9075e..764ba72f4 100644 --- a/kernel_tuner/accuracy.py +++ b/kernel_tuner/accuracy.py @@ -100,7 +100,7 @@ def __init__( # We convert the array into a `np.ndarray` by using `np.array`. 
# However, if the value is a numpy scalar, then we do not want to # convert it into an array but instead keep the original value - if not np.isinstance(array, np.generic) + if not isinstance(array, np.generic): array = np.array(array) arrays[precision] = array.astype(dtype) diff --git a/kernel_tuner/core.py b/kernel_tuner/core.py index 128d88eb7..7149cb578 100644 --- a/kernel_tuner/core.py +++ b/kernel_tuner/core.py @@ -610,7 +610,7 @@ def ready_argument_list(self, arguments): else: flat_args.append(argument) - flag_gpu_args = iter(self.dev.ready_argument_list(flat_args)) + flat_gpu_args = iter(self.dev.ready_argument_list(flat_args)) # Unflatten the arguments back into arrays. gpu_args = [] @@ -618,11 +618,11 @@ def ready_argument_list(self, arguments): if isinstance(argument, Tunable): arrays = dict() for key in argument: - arrays[key] = next(flag_gpu_args) + arrays[key] = next(flat_gpu_args) gpu_args.append(Tunable(argument.param_key, arrays)) else: - gpu_args.append(next(flag_gpu_args)) + gpu_args.append(next(flat_gpu_args)) return gpu_args From 00d65221009bf8c0d20daa2e7749c856ebbebdc1 Mon Sep 17 00:00:00 2001 From: stijn Date: Tue, 13 Jun 2023 12:44:05 +0200 Subject: [PATCH 10/24] Add support for observers in C backend --- kernel_tuner/backends/c.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/kernel_tuner/backends/c.py b/kernel_tuner/backends/c.py index 486181cc3..f47f228fa 100644 --- a/kernel_tuner/backends/c.py +++ b/kernel_tuner/backends/c.py @@ -43,12 +43,15 @@ class CFunctions(CompilerBackend): """Class that groups the code for running and compiling C functions""" - def __init__(self, iterations=7, compiler_options=None, compiler=None): + def __init__(self, iterations=7, compiler_options=None, compiler=None, observers=None): """instantiate CFunctions object used for interacting with C code :param iterations: Number of iterations used while benchmarking a kernel, 7 by default. :type iterations: int """ + self.observers = observers or [] + self.observers.append(CRuntimeObserver(self)) + self.iterations = iterations self.max_threads = 1024 self.compiler_options = compiler_options @@ -56,7 +59,6 @@ def __init__(self, iterations=7, compiler_options=None, compiler=None): self.compiler = compiler or "g++" self.lib = None self.using_openmp = False - self.observers = [CRuntimeObserver(self)] self.last_result = None try: From 24e14ed1afcd2c1c90e09b1181f606b5246ec3f4 Mon Sep 17 00:00:00 2001 From: stijn Date: Tue, 27 Jun 2023 12:03:53 +0200 Subject: [PATCH 11/24] Allow `run_kernel` to deal with `Tunable` arguments --- kernel_tuner/interface.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/kernel_tuner/interface.py b/kernel_tuner/interface.py index 15fe04ec1..b72753377 100644 --- a/kernel_tuner/interface.py +++ b/kernel_tuner/interface.py @@ -746,7 +746,7 @@ def run_kernel( if log: logging.basicConfig(filename=kernel_name + datetime.now().strftime("%Y%m%d-%H:%M:%S") + ".log", level=log) - kernelsource = core.KernelSource(kernel_name, kernel_source, lang) + kernelsource = core.KernelSource(kernel_name, kernel_source, lang, defines) _check_user_input(kernel_name, kernelsource, arguments, block_size_names) @@ -758,6 +758,9 @@ def run_kernel( # detect language and create the right device function interface dev = core.DeviceInterface(kernelsource, iterations=1, **device_options) + # Preprocess GPU arguments. 
Require for handling `Tunable` arguments + arguments = dev.preprocess_gpu_arguments(arguments, params) + # move data to the GPU gpu_args = dev.ready_argument_list(arguments) From cd996b92cad321a6245972176374ac10e2ad429b Mon Sep 17 00:00:00 2001 From: stijn Date: Mon, 3 Jul 2023 12:40:11 +0200 Subject: [PATCH 12/24] Support more names for floating-point types in TunablePrecision --- kernel_tuner/accuracy.py | 25 ++++++++++++++++++++++++- 1 file changed, 24 insertions(+), 1 deletion(-) diff --git a/kernel_tuner/accuracy.py b/kernel_tuner/accuracy.py index 764ba72f4..567142d44 100644 --- a/kernel_tuner/accuracy.py +++ b/kernel_tuner/accuracy.py @@ -53,6 +53,28 @@ def __call__(self, params): return self.select_for_configuration(params) +def _to_float_dtype(x): + """Convert a string to a numpy data type (``dtype``). This function recognizes + common names (such as ``f16`` or ``kfloat``), and uses ``np.dtype(x)`` as a + fallback. + """ + if isinstance(x, str): + x = x.lower() + + if x in ("bfloat16", "bf16", "kbfloat16", "__nv_bfloat16"): + from bfloat16 import bfloat16 + + return bfloat16 + if x in ("half", "f16", "float16", "__half", "khalf", 16): + return np.half + if x in ("float", "single", "f32", "float32", "kfloat", 32): + return np.float32 + if x in ("double", "f64", "float64", "kdouble", 64): + return np.float64 + + return np.dtype(x) + + class TunablePrecision(Tunable): def __init__( self, param_key: str, array: np.ndarray, dtypes: Dict[str, np.dtype] = None @@ -86,6 +108,7 @@ def __init__( # Try to get bfloat16 if available. try: from bfloat16 import bfloat16 + dtypes["bfloat16"] = bfloat16 pass except ImportError: @@ -93,7 +116,7 @@ def __init__( # If dtype is a list, convert it to a dictionary if isinstance(dtypes, (list, tuple)): - dtypes = dict((name, np.dtype(name)) for name in dtypes) + dtypes = dict((name, _to_float_dtype(name)) for name in dtypes) arrays = dict() for precision, dtype in dtypes.items(): From c798e0fb0c996f274e488cf496013940bd3ae5ae Mon Sep 17 00:00:00 2001 From: stijn Date: Mon, 3 Jul 2023 12:40:43 +0200 Subject: [PATCH 13/24] Support several well-known error metrics in ErrorObserver --- kernel_tuner/accuracy.py | 52 +++++++++++++++++++++++++++++++++++++++- 1 file changed, 51 insertions(+), 1 deletion(-) diff --git a/kernel_tuner/accuracy.py b/kernel_tuner/accuracy.py index 567142d44..85447d9de 100644 --- a/kernel_tuner/accuracy.py +++ b/kernel_tuner/accuracy.py @@ -131,7 +131,54 @@ def __init__( super().__init__(param_key, arrays) -class ErrorMetricObserver(AccuracyObserver): +class AccuracyObserver(BenchmarkObserver): + """Observer that can verify or measure the accuracy of the output produced by a kernel.""" + + @abstractmethod + def process_kernel_output(self, answer, output): + """method will be called once before benchmarking of a single kernel configuration. The arguments + provided are the `answer` as passed `tune_kernel` and the `output` produced by the kernel + """ + pass + + +def error_metric_from_name(key): + """Find the error metric function for the given name. + + Returns an function that takes two parameters (the real values and the + estimated values) as numpy array and returns the error between the two + according to the given error metric. 
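+
+    For example (an illustrative sketch; the argument names are placeholders)::
+
+        mse = error_metric_from_name("mse")
+        error = mse(reference_output, kernel_output)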
+ + Valid values for the ``key`` are: + + * MSE (mean square error) + * RSME (Root mean square error) + * MAE (mean absolute error) + * MRE (mean relative error) + * MALE (mean absolute log error) + * RMSLE (root mean square log error) + """ + key = key.lower().strip().replace("_", " ") + + if key in ("mse", "smd", "mean square error"): + return lambda a, b: np.average(np.square(a - b)) + elif key in ("rmse", "rsmd", "root mean square error"): + return lambda a, b: np.sqrt(np.average(np.square(a - b))) + elif key in ("nrmse", "nrmsd"): + return lambda a, b: np.sqrt(np.average(np.square(a - b))) / np.average(a) + elif key in ("mae", "absolute error", "absolute", "mean absolute error", "abs"): + return lambda a, b: np.average(np.abs(a - b)) + elif key in ("mre", "relative error", "relative", "mean relative error", "rel"): + return lambda a, b: np.average(np.abs(a - b) / np.abs(a) - 1) + elif key in ("male", "mean absolute log error"): + return lambda a, b: np.average(np.abs(np.log(a) - np.log(b))) + elif key in ("rmsle", "root mean square log error"): + return lambda a, b: np.sqrt(np.average(np.square(np.log(a) - np.log(b)))) + else: + raise ValuError(f"invalid error metric provided: {key}") + + +class ErrorObserver(AccuracyObserver): """An ``AccuracyObserver`` that measure the error of the outputs produced by a kernel by comparing it against reference outputs. @@ -151,6 +198,9 @@ def __init__(self, metric=None, key="error"): if metric is None: metric = lambda a, b: np.average(np.square(a - b)) + if isinstance(metric, str): + metric = error_metric_from_name(metric) + self.key = key self.metric = metric self.result = None From 4bffd6e5f3a4ae257e1590ec04e8f1e7bf50eb20 Mon Sep 17 00:00:00 2001 From: stijn Date: Tue, 4 Jul 2023 14:08:38 +0200 Subject: [PATCH 14/24] Fix division by zero in ErrorObserver when ground-truth contains zeros --- kernel_tuner/accuracy.py | 45 +++++++++++++++++++++++++++------------- 1 file changed, 31 insertions(+), 14 deletions(-) diff --git a/kernel_tuner/accuracy.py b/kernel_tuner/accuracy.py index 85447d9de..7c4ea8c7d 100644 --- a/kernel_tuner/accuracy.py +++ b/kernel_tuner/accuracy.py @@ -145,8 +145,8 @@ def process_kernel_output(self, answer, output): def error_metric_from_name(key): """Find the error metric function for the given name. - Returns an function that takes two parameters (the real values and the - estimated values) as numpy array and returns the error between the two + Returns an function that takes two parameters (the ground-truth and the + estimated values) as numpy arrays and returns the error between the two according to the given error metric. 
Valid values for the ``key`` are: @@ -157,32 +157,50 @@ def error_metric_from_name(key): * MRE (mean relative error) * MALE (mean absolute log error) * RMSLE (root mean square log error) + * max (maximum absolute error) + * max_rel (maximum relative error) """ + + # Small value to prevent division by zero in relative metrics + EPS = np.finfo(np.float64).eps + + # lowercase the metric name key = key.lower().strip().replace("_", " ") if key in ("mse", "smd", "mean square error"): - return lambda a, b: np.average(np.square(a - b)) + metric = lambda a, b: np.average(np.square(a - b)) elif key in ("rmse", "rsmd", "root mean square error"): - return lambda a, b: np.sqrt(np.average(np.square(a - b))) + metric = lambda a, b: np.sqrt(np.average(np.square(a - b))) elif key in ("nrmse", "nrmsd"): - return lambda a, b: np.sqrt(np.average(np.square(a - b))) / np.average(a) + metric = lambda a, b: np.sqrt(np.average(np.square(a - b))) / np.average(a) elif key in ("mae", "absolute error", "absolute", "mean absolute error", "abs"): - return lambda a, b: np.average(np.abs(a - b)) + metric = lambda a, b: np.average(np.abs(a - b)) elif key in ("mre", "relative error", "relative", "mean relative error", "rel"): - return lambda a, b: np.average(np.abs(a - b) / np.abs(a) - 1) + metric = lambda a, b: np.average(np.abs(a - b) / np.maximum(np.abs(a), EPS)) elif key in ("male", "mean absolute log error"): - return lambda a, b: np.average(np.abs(np.log(a) - np.log(b))) + metric = lambda a, b: np.average(np.abs(np.log(a + EPS) - np.log(b + EPS))) elif key in ("rmsle", "root mean square log error"): - return lambda a, b: np.sqrt(np.average(np.square(np.log(a) - np.log(b)))) + metric = lambda a, b: np.sqrt( + np.average(np.square(np.log(a + EPS) - np.log(b + EPS))) + ) + elif key in ("max", "max_abs", "maximum", "maximum absolute"): + metric = lambda a, b: np.amax(np.abs(a - b)) + elif key in ("max_rel", "maximum relative"): + metric = lambda a, b: np.amax(np.abs(a - b) / np.maximum(np.abs(a), EPS)) else: - raise ValuError(f"invalid error metric provided: {key}") + raise ValueError(f"invalid error metric provided: {key}") + + # cast both arguments to f64 before passing them to the metric + return lambda a, b, metric=metric: metric( + a.astype(np.float64, copy=False), b.astype(np.float64, copy=False) + ) class ErrorObserver(AccuracyObserver): """An ``AccuracyObserver`` that measure the error of the outputs produced by a kernel by comparing it against reference outputs. - By default, it uses the mean-squared error (MSE) and appends this to + By default, it uses the root mean-squared error (RMSE) and appends this to the results with a metric called ``error``. 
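+
+    Example (a sketch; assumes ``answer`` holds the reference outputs that are
+    also passed to ``tune_kernel``)::
+
+        observer = ErrorObserver(metric="rmse")
+        results, env = tune_kernel(..., answer=answer, observers=[observer])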
""" @@ -196,9 +214,8 @@ def __init__(self, metric=None, key="error"): # The default metric is the mean squared error if metric is None: - metric = lambda a, b: np.average(np.square(a - b)) - - if isinstance(metric, str): + metric = error_metric_from_name("rmse") + elif isinstance(metric, str): metric = error_metric_from_name(metric) self.key = key From d67eefdda7bf46f384cea8ef29371c155eda33ac Mon Sep 17 00:00:00 2001 From: stijn Date: Tue, 4 Jul 2023 14:27:20 +0200 Subject: [PATCH 15/24] Add RMSRE error metric --- kernel_tuner/accuracy.py | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/kernel_tuner/accuracy.py b/kernel_tuner/accuracy.py index 7c4ea8c7d..382fc4afc 100644 --- a/kernel_tuner/accuracy.py +++ b/kernel_tuner/accuracy.py @@ -153,23 +153,24 @@ def error_metric_from_name(key): * MSE (mean square error) * RSME (Root mean square error) + * NRMSE (normalized root mean square error) + * RMSRE (root mean square relative error) + * RMSLE (root mean square log error) * MAE (mean absolute error) * MRE (mean relative error) * MALE (mean absolute log error) - * RMSLE (root mean square log error) * max (maximum absolute error) - * max_rel (maximum relative error) """ # Small value to prevent division by zero in relative metrics EPS = np.finfo(np.float64).eps # lowercase the metric name - key = key.lower().strip().replace("_", " ") + key = key.lower().replace("_", " ").strip() if key in ("mse", "smd", "mean square error"): metric = lambda a, b: np.average(np.square(a - b)) - elif key in ("rmse", "rsmd", "root mean square error"): + elif key in ("rmse", "rmsd", "root mean square error"): metric = lambda a, b: np.sqrt(np.average(np.square(a - b))) elif key in ("nrmse", "nrmsd"): metric = lambda a, b: np.sqrt(np.average(np.square(a - b))) / np.average(a) @@ -177,15 +178,19 @@ def error_metric_from_name(key): metric = lambda a, b: np.average(np.abs(a - b)) elif key in ("mre", "relative error", "relative", "mean relative error", "rel"): metric = lambda a, b: np.average(np.abs(a - b) / np.maximum(np.abs(a), EPS)) + elif key in ("rmsre", "root mean square relative error"): + metric = lambda a, b: np.sqrt( + np.average(np.square(a - b) / np.maximum(np.square(a), EPS**2)) + ) elif key in ("male", "mean absolute log error"): metric = lambda a, b: np.average(np.abs(np.log(a + EPS) - np.log(b + EPS))) elif key in ("rmsle", "root mean square log error"): metric = lambda a, b: np.sqrt( np.average(np.square(np.log(a + EPS) - np.log(b + EPS))) ) - elif key in ("max", "max_abs", "maximum", "maximum absolute"): + elif key in ("max", "max abs", "maximum", "maximum absolute"): metric = lambda a, b: np.amax(np.abs(a - b)) - elif key in ("max_rel", "maximum relative"): + elif key in ("max rel", "maximum relative"): metric = lambda a, b: np.amax(np.abs(a - b) / np.maximum(np.abs(a), EPS)) else: raise ValueError(f"invalid error metric provided: {key}") From 8412f7809a4e8032dbf0b61d09a4656c7f1f827a Mon Sep 17 00:00:00 2001 From: stijn Date: Tue, 8 Aug 2023 16:01:42 +0200 Subject: [PATCH 16/24] Fix crash when object is `ErrorConfig` --- kernel_tuner/runners/sequential.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel_tuner/runners/sequential.py b/kernel_tuner/runners/sequential.py index 3ddcc827a..2b0362930 100644 --- a/kernel_tuner/runners/sequential.py +++ b/kernel_tuner/runners/sequential.py @@ -94,7 +94,7 @@ def run(self, parameter_space, tuning_options): result = self.dev.compile_and_benchmark(self.kernel_source, self.gpu_args, params, 
self.kernel_options, tuning_options) params.update(result) - if isinstance(result.get(tuning_options.objective), ErrorConfig): + if tuning_options.objective in result and isinstance(result[tuning_options.objective], ErrorConfig): logging.debug('kernel configuration was skipped silently due to compile or runtime failure') # only compute metrics on configs that have not errored From 6bb6156b5020346921428c93b3fd41f79cf65231 Mon Sep 17 00:00:00 2001 From: stijn Date: Mon, 2 Oct 2023 10:02:15 +0200 Subject: [PATCH 17/24] Add support for custom tolerances for relative error metrics --- kernel_tuner/accuracy.py | 44 ++++++++++++++++++++++++---------------- 1 file changed, 26 insertions(+), 18 deletions(-) diff --git a/kernel_tuner/accuracy.py b/kernel_tuner/accuracy.py index 382fc4afc..823f7093d 100644 --- a/kernel_tuner/accuracy.py +++ b/kernel_tuner/accuracy.py @@ -142,7 +142,7 @@ def process_kernel_output(self, answer, output): pass -def error_metric_from_name(key): +def error_metric_from_name(key, EPS=1e-8): """Find the error metric function for the given name. Returns an function that takes two parameters (the ground-truth and the @@ -160,10 +160,10 @@ def error_metric_from_name(key): * MRE (mean relative error) * MALE (mean absolute log error) * max (maximum absolute error) - """ - # Small value to prevent division by zero in relative metrics - EPS = np.finfo(np.float64).eps + The value of `EPS` is used for relative errors to prevent division by zero. + `` + """ # lowercase the metric name key = key.lower().replace("_", " ").strip() @@ -173,14 +173,16 @@ def error_metric_from_name(key): elif key in ("rmse", "rmsd", "root mean square error"): metric = lambda a, b: np.sqrt(np.average(np.square(a - b))) elif key in ("nrmse", "nrmsd"): - metric = lambda a, b: np.sqrt(np.average(np.square(a - b))) / np.average(a) + metric = lambda a, b: np.sqrt( + np.average(np.square(a - b)) / np.average(np.square(a)) + ) elif key in ("mae", "absolute error", "absolute", "mean absolute error", "abs"): metric = lambda a, b: np.average(np.abs(a - b)) elif key in ("mre", "relative error", "relative", "mean relative error", "rel"): metric = lambda a, b: np.average(np.abs(a - b) / np.maximum(np.abs(a), EPS)) elif key in ("rmsre", "root mean square relative error"): metric = lambda a, b: np.sqrt( - np.average(np.square(a - b) / np.maximum(np.square(a), EPS**2)) + np.average(np.square(a - b) / np.square(np.maximum(a, EPS))) ) elif key in ("male", "mean absolute log error"): metric = lambda a, b: np.average(np.abs(np.log(a + EPS) - np.log(b + EPS))) @@ -202,26 +204,32 @@ def error_metric_from_name(key): class ErrorObserver(AccuracyObserver): - """An ``AccuracyObserver`` that measure the error of the outputs produced - by a kernel by comparing it against reference outputs. + """``ErrorObserver`` measures the error of the output produced by a kernel + by comparing it against a reference output. - By default, it uses the root mean-squared error (RMSE) and appends this to - the results with a metric called ``error``. + By default, it uses the root mean-squared error (RMSE) and uses the + parameter key ``error``. """ - def __init__(self, metric=None, key="error"): + def __init__(self, metric=None, key="error", *, atol=1e-8): """Create a new ``AccuracyObserver``. - :param metric: The error metric. Should be function that accepts two numpy - arrays as arguments (the reference output and the kernel output) + :param metric: The error metric. 
This should be a string that is + accepted by ``error_metric_from_name`` such as ``"absolute error"`` + or ``"relative error"``. Alternatively, it can be + function that accepts two numpy arrays as arguments + (the reference output and the kernel output) :param key: The name of this metric in the results. + :param atol: The absolute tolerance used in relative metrics to prevent + division by zero. """ - # The default metric is the mean squared error - if metric is None: - metric = error_metric_from_name("rmse") - elif isinstance(metric, str): - metric = error_metric_from_name(metric) + if not metric: + metric = "rmse" + + # If it is a string, convert it to a function + if isinstance(metric, str): + metric = error_metric_from_name(metric, atol) self.key = key self.metric = metric From 5070c97404cebb1dbbeb14e1b0cd7c3f1b02ac9f Mon Sep 17 00:00:00 2001 From: stijn Date: Mon, 2 Oct 2023 11:44:04 +0200 Subject: [PATCH 18/24] Rename `AccuracyObserver` to `OutputObserver` and `ErrorObserver` to `AccuracyObserver` --- kernel_tuner/accuracy.py | 121 +++++++++++++++++++++++++-------------- kernel_tuner/core.py | 21 +++---- 2 files changed, 88 insertions(+), 54 deletions(-) diff --git a/kernel_tuner/accuracy.py b/kernel_tuner/accuracy.py index 823f7093d..9e5cfc5f2 100644 --- a/kernel_tuner/accuracy.py +++ b/kernel_tuner/accuracy.py @@ -1,15 +1,16 @@ from collections import UserDict from typing import Dict import numpy as np +import logging from kernel_tuner.observers import AccuracyObserver class Tunable(UserDict): def __init__(self, param_key: str, arrays: Dict): - """The ``Tunable`` object is used as an input argument when tuning + """The ``Tunable`` object can be used as an input argument when tuning kernels. It is a container that holds several arrays internally and - selects one array during benchmarking based on a tunable parameter. + selects one array during benchmarking based on the value of a tunable parameter. Example ------- @@ -19,13 +20,14 @@ def __init__(self, param_key: str, arrays: Dict): In this example, we create a Tunable object that selects either matrix or matrix.transpose() for benchmarking, depending on the value of the - tunable parameter "matrix_layout". The arrays argument is a dictionary - that maps the tunable parameter values "c" and "f" to the arrays matrix - and matrix.transpose(), respectively. During benchmarking, the Tunable - object selects the appropriate array based on the value of "matrix_layout". + tunable parameter "matrix_layout". The first argument is the name of the tunable + paramater. The second argument is a dictionary that maps the tunable parameter + values "c" and "f" to the arrays ``matrix`` and ``matrix.transpose()``, respectively. + During benchmarking, the Tunable object selects the array passed to the kernel based + on the value of "matrix_layout". :param param_key: : The tunable parameter used to select the array for benchmarking. - :param arrays: A dictionary that maps the parameter value to arrays. + :param arrays: A dictionary that maps the value of that tunable parameter to options. 
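+
+        The ``param_key`` may also be a callable or a Python expression over the
+        tunable parameters, for example (illustrative)::
+
+            Tunable(lambda p: p["matrix_layout"], {"c": matrix, "f": matrix.transpose()})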
""" if isinstance(arrays, (tuple, list)): arrays = dict(enumerate(arrays)) @@ -35,25 +37,25 @@ def __init__(self, param_key: str, arrays: Dict): def select_for_configuration(self, params): if callable(self.param_key): - key = self.param_key(params) + option = self.param_key(params) elif self.param_key in params: - key = params[self.param_key] + option = params[self.param_key] else: - key = eval(self.param_key, params, params) + option = eval(self.param_key, params, params) - if key not in self: - list = ", ".join(map(str, self.keys())) + if option not in self.data: + list = ", ".join(map(str, self.data.keys())) raise KeyError( - f"'{key}' is not a valid parameter value, should be one of: {list}" + f"'{option}' is not a valid parameter value, should be one of: {list}" ) - return self[key] + return self.data[option] def __call__(self, params): return self.select_for_configuration(params) -def _to_float_dtype(x): +def _to_float_dtype(x: str) -> np.dtype: """Convert a string to a numpy data type (``dtype``). This function recognizes common names (such as ``f16`` or ``kfloat``), and uses ``np.dtype(x)`` as a fallback. @@ -79,11 +81,11 @@ class TunablePrecision(Tunable): def __init__( self, param_key: str, array: np.ndarray, dtypes: Dict[str, np.dtype] = None ): - """The ``Tunable`` object is used as an input argument when tuning + """The ``Tunable`` object can be used as an input argument when tuning kernels. It is a container that internally holds several arrays containing the same data, but stored in using different levels of - precision. During benchamrking, one array is selected based on a - tunable parameter ``param_key``. + precision. During benchamrking, one array is selected based on the value + of the tunable parameter called ``param_key``. Example ------- @@ -110,9 +112,16 @@ def __init__( from bfloat16 import bfloat16 dtypes["bfloat16"] = bfloat16 - pass except ImportError: - pass # Ignore error if tensorflow is not available + try: + from tensorflow import bfloat16 + + dtypes["bfloat16"] = bfloat16.as_numpy_dtype + except ImportError: + logging.warning( + "could not find `bfloat16` data type for numpy, " + + "please install either the package `bfloat16` or `tensorflow`" + ) # If dtype is a list, convert it to a dictionary if isinstance(dtypes, (list, tuple)): @@ -131,8 +140,8 @@ def __init__( super().__init__(param_key, arrays) -class AccuracyObserver(BenchmarkObserver): - """Observer that can verify or measure the accuracy of the output produced by a kernel.""" +class OutputObserver(BenchmarkObserver): + """Observer that can verify or measure something about the output produced by a kernel.""" @abstractmethod def process_kernel_output(self, answer, output): @@ -168,43 +177,67 @@ def error_metric_from_name(key, EPS=1e-8): # lowercase the metric name key = key.lower().replace("_", " ").strip() - if key in ("mse", "smd", "mean square error"): - metric = lambda a, b: np.average(np.square(a - b)) - elif key in ("rmse", "rmsd", "root mean square error"): - metric = lambda a, b: np.sqrt(np.average(np.square(a - b))) - elif key in ("nrmse", "nrmsd"): - metric = lambda a, b: np.sqrt( - np.average(np.square(a - b)) / np.average(np.square(a)) - ) + if key in ("mse", "mean square error"): + + def metric(a, b): + return np.average(np.square(a - b)) + + elif key in ("rmse", "root mean square error"): + + def metric(a, b): + return np.sqrt(np.average(np.square(a - b))) + + elif key in ("nrmse", "normalized root mean square error"): + + def metric(a, b): + return np.sqrt(np.average(np.square(a - 
b)) / np.average(np.square(a))) + elif key in ("mae", "absolute error", "absolute", "mean absolute error", "abs"): - metric = lambda a, b: np.average(np.abs(a - b)) + + def metric(a, b): + return np.average(np.abs(a - b)) + elif key in ("mre", "relative error", "relative", "mean relative error", "rel"): - metric = lambda a, b: np.average(np.abs(a - b) / np.maximum(np.abs(a), EPS)) + + def metric(a, b): + return np.average(np.abs(a - b) / np.maximum(np.abs(a), EPS)) + elif key in ("rmsre", "root mean square relative error"): - metric = lambda a, b: np.sqrt( - np.average(np.square(a - b) / np.square(np.maximum(a, EPS))) - ) + + def metric(a, b): + return np.sqrt(np.average(np.square(a - b) / np.square(np.maximum(a, EPS)))) + elif key in ("male", "mean absolute log error"): - metric = lambda a, b: np.average(np.abs(np.log(a + EPS) - np.log(b + EPS))) + + def metric(a, b): + return np.average(np.abs(np.log(a + EPS) - np.log(b + EPS))) + elif key in ("rmsle", "root mean square log error"): - metric = lambda a, b: np.sqrt( - np.average(np.square(np.log(a + EPS) - np.log(b + EPS))) - ) + + def metric(a, b): + return np.sqrt(np.average(np.square(np.log(a + EPS) - np.log(b + EPS)))) + elif key in ("max", "max abs", "maximum", "maximum absolute"): - metric = lambda a, b: np.amax(np.abs(a - b)) + + def metric(a, b): + return np.amax(np.abs(a - b)) + elif key in ("max rel", "maximum relative"): - metric = lambda a, b: np.amax(np.abs(a - b) / np.maximum(np.abs(a), EPS)) + + def metric(a, b): + return np.amax(np.abs(a - b) / np.maximum(np.abs(a), EPS)) + else: raise ValueError(f"invalid error metric provided: {key}") # cast both arguments to f64 before passing them to the metric - return lambda a, b, metric=metric: metric( + return lambda a, b: metric( a.astype(np.float64, copy=False), b.astype(np.float64, copy=False) ) -class ErrorObserver(AccuracyObserver): - """``ErrorObserver`` measures the error of the output produced by a kernel +class AccuracyObserver(OutputObserver): + """``AccuracyObserver`` measures the error of the output produced by a kernel by comparing it against a reference output. 
By default, it uses the root mean-squared error (RMSE) and uses the diff --git a/kernel_tuner/core.py b/kernel_tuner/core.py index 7149cb578..41e19f741 100644 --- a/kernel_tuner/core.py +++ b/kernel_tuner/core.py @@ -13,8 +13,9 @@ except ImportError: cp = np +from kernel_tuner.accuracy import Tunable from kernel_tuner.observers.nvml import NVMLObserver -from kernel_tuner.observers.observer import ContinuousObserver, AccuracyObserver +from kernel_tuner.observers.observer import ContinuousObserver, OutputObserver from kernel_tuner.backends.cupy import CupyFunctions from kernel_tuner.backends.pycuda import PyCudaFunctions from kernel_tuner.backends.nvcuda import CudaFunctions @@ -250,7 +251,7 @@ def __init__(self, kernel_source, device=0, platform=0, quiet=False, compiler=No #look for NVMLObserver in observers, if present, enable special tunable parameters through nvml self.use_nvml = False self.continuous_observers = [] - self.accuracy_observers = [] + self.output_observers = [] if observers: for obs in observers: if isinstance(obs, NVMLObserver): @@ -258,8 +259,8 @@ def __init__(self, kernel_source, device=0, platform=0, quiet=False, compiler=No self.use_nvml = True if hasattr(obs, "continuous_observer"): self.continuous_observers.append(obs.continuous_observer) - if isinstance(obs, AccuracyObserver): - self.accuracy_observers.append(obs) + if isinstance(obs, OutputObserver): + self.output_observers.append(obs) self.iterations = iterations @@ -420,17 +421,17 @@ def check_kernel_output(self, func, gpu_args, instance, answer, atol, verify, ve else: result_host.append(None) - # Call the accuracy observers - for obs in self.accuracy_observers: + # Call the output observers + for obs in self.output_observers: obs.process_kernel_output(answer, result_host) # There are three scenarios: # - if there is a custom verify function, call that. - # - otherwise, if there are no accuracy observer, call the default verify function + # - otherwise, if there are no output observers, call the default verify function # - otherwise, the answer is correct (we assume the accuracy observers verified the output) if verify: correct = verify(answer, result_host, atol=atol) - elif not self.accuracy_observers: + elif not self.output_observers: correct = _default_verify_function(instance, answer, result_host, atol, verbose) else: correct = True @@ -468,9 +469,9 @@ def compile_and_benchmark(self, kernel_source, gpu_args, params, kernel_options, if isinstance(instance, util.ErrorConfig): result[to.objective] = util.InvalidConfig() else: - # Preprocess the argument list. 
This is required to deal with `MixedPrecisionArray`s gpu_args = self.preprocess_gpu_arguments(gpu_args, params) + try: # compile the kernel start_compilation = time.perf_counter() @@ -492,7 +493,7 @@ def compile_and_benchmark(self, kernel_source, gpu_args, params, kernel_options, last_compilation_time = 1000 * (time.perf_counter() - start_compilation) # test kernel for correctness - if func and (to.answer or to.verify or self.accuracy_observers): + if func and (to.answer or to.verify or self.output_observers): start_verification = time.perf_counter() self.check_kernel_output(func, gpu_args, instance, to.answer, to.atol, to.verify, verbose) last_verification_time = 1000 * (time.perf_counter() - start_verification) From ff1034688355be6601d76ef8847032e849460c99 Mon Sep 17 00:00:00 2001 From: stijn Date: Mon, 2 Oct 2023 11:17:40 +0200 Subject: [PATCH 19/24] Make `error_metric_from_name` more resilient in handling user provided metric name --- kernel_tuner/accuracy.py | 68 +++++++++++++++++++++++++++------------- 1 file changed, 47 insertions(+), 21 deletions(-) diff --git a/kernel_tuner/accuracy.py b/kernel_tuner/accuracy.py index 9e5cfc5f2..65667a591 100644 --- a/kernel_tuner/accuracy.py +++ b/kernel_tuner/accuracy.py @@ -2,6 +2,7 @@ from typing import Dict import numpy as np import logging +import re from kernel_tuner.observers import AccuracyObserver @@ -151,7 +152,7 @@ def process_kernel_output(self, answer, output): pass -def error_metric_from_name(key, EPS=1e-8): +def error_metric_from_name(user_key, EPS=1e-8): """Find the error metric function for the given name. Returns an function that takes two parameters (the ground-truth and the @@ -169,66 +170,90 @@ def error_metric_from_name(key, EPS=1e-8): * MRE (mean relative error) * MALE (mean absolute log error) * max (maximum absolute error) + * max rel (maximum relative error) The value of `EPS` is used for relative errors to prevent division by zero. 
`` """ - # lowercase the metric name - key = key.lower().replace("_", " ").strip() - - if key in ("mse", "mean square error"): + # Prepocess the provided name: + # - convert to lowercase + # - remove the word "error" + # - remove underscores and dashes + # - strip whitespaces + # - replace common abreviations + key = user_key.lower() + key = re.sub(r"\berror\b", " ", key) + key = re.sub(r"[\s_-]+", " ", key) + key = key.strip() + + replacements = { + "average": "mean", + "avg": "mean", + "square": "squared", + "sq": "squared", + "max": "maximum", + "rel": "relative", + "abs": "absolute", + "log": "logarithmic", + } + + for pattern, replacement in replacements.items(): + key = re.sub(rf"\b{pattern}\b", replacement, key) + + # Select the right metric + if key in ("mse", "mean squared"): def metric(a, b): return np.average(np.square(a - b)) - elif key in ("rmse", "root mean square error"): + elif key in ("rmse", "root mean squared"): def metric(a, b): return np.sqrt(np.average(np.square(a - b))) - elif key in ("nrmse", "normalized root mean square error"): + elif key in ("nrmse", "normalized root mean squared"): def metric(a, b): return np.sqrt(np.average(np.square(a - b)) / np.average(np.square(a))) - elif key in ("mae", "absolute error", "absolute", "mean absolute error", "abs"): + elif key in ("mae", "absolute", "mean absolute"): def metric(a, b): return np.average(np.abs(a - b)) - elif key in ("mre", "relative error", "relative", "mean relative error", "rel"): + elif key in ("mre", "relative", "mean relative"): def metric(a, b): return np.average(np.abs(a - b) / np.maximum(np.abs(a), EPS)) - elif key in ("rmsre", "root mean square relative error"): + elif key in ("rmsre", "root mean squared relative"): def metric(a, b): return np.sqrt(np.average(np.square(a - b) / np.square(np.maximum(a, EPS)))) - elif key in ("male", "mean absolute log error"): + elif key in ("male", "mean absolute logarithmic"): def metric(a, b): return np.average(np.abs(np.log(a + EPS) - np.log(b + EPS))) - elif key in ("rmsle", "root mean square log error"): + elif key in ("rmsle", "root mean squared logarithmic"): def metric(a, b): return np.sqrt(np.average(np.square(np.log(a + EPS) - np.log(b + EPS)))) - elif key in ("max", "max abs", "maximum", "maximum absolute"): + elif key in ("maximum absolute", "maximum"): def metric(a, b): return np.amax(np.abs(a - b)) - elif key in ("max rel", "maximum relative"): + elif key in ("maximum relative",): def metric(a, b): return np.amax(np.abs(a - b) / np.maximum(np.abs(a), EPS)) else: - raise ValueError(f"invalid error metric provided: {key}") + raise ValueError(f"invalid error metric provided: {user_key}") # cast both arguments to f64 before passing them to the metric return lambda a, b: metric( @@ -237,11 +262,11 @@ def metric(a, b): class AccuracyObserver(OutputObserver): - """``AccuracyObserver`` measures the error of the output produced by a kernel - by comparing it against a reference output. + """``AccuracyObserver`` measures the error on the output produced by a kernel + by comparing the output against a reference output. By default, it uses the root mean-squared error (RMSE) and uses the - parameter key ``error``. + metric name ``"error"``. """ def __init__(self, metric=None, key="error", *, atol=1e-8): @@ -249,14 +274,15 @@ def __init__(self, metric=None, key="error", *, atol=1e-8): :param metric: The error metric. This should be a string that is accepted by ``error_metric_from_name`` such as ``"absolute error"`` - or ``"relative error"``. 
Alternatively, it can be + or ``"relative error"``. Alternatively, it can be a function that accepts two numpy arrays as arguments (the reference output and the kernel output) :param key: The name of this metric in the results. - :param atol: The absolute tolerance used in relative metrics to prevent - division by zero. + :param atol: The tolerance used in relative metrics to prevent + division by zero. It is ignored by absolute error metrics. """ + # Default metric is RMSE if not metric: metric = "rmse" From afc8c7f0c9decc7b3b54f57916e7d40fe08644c6 Mon Sep 17 00:00:00 2001 From: stijn Date: Mon, 2 Oct 2023 11:21:43 +0200 Subject: [PATCH 20/24] Move `OutputObserver` from `accuracy` to `observers` --- kernel_tuner/accuracy.py | 13 +------------ kernel_tuner/observers/__init__.py | 2 +- kernel_tuner/observers/observer.py | 15 ++++++++++++++- 3 files changed, 16 insertions(+), 14 deletions(-) diff --git a/kernel_tuner/accuracy.py b/kernel_tuner/accuracy.py index 65667a591..34598276f 100644 --- a/kernel_tuner/accuracy.py +++ b/kernel_tuner/accuracy.py @@ -4,7 +4,7 @@ import logging import re -from kernel_tuner.observers import AccuracyObserver +from .observers import OutputObserver class Tunable(UserDict): @@ -141,17 +141,6 @@ def __init__( super().__init__(param_key, arrays) -class OutputObserver(BenchmarkObserver): - """Observer that can verify or measure something about the output produced by a kernel.""" - - @abstractmethod - def process_kernel_output(self, answer, output): - """method will be called once before benchmarking of a single kernel configuration. The arguments - provided are the `answer` as passed `tune_kernel` and the `output` produced by the kernel - """ - pass - - def error_metric_from_name(user_key, EPS=1e-8): """Find the error metric function for the given name. diff --git a/kernel_tuner/observers/__init__.py b/kernel_tuner/observers/__init__.py index 199dd7e15..ad27791d5 100644 --- a/kernel_tuner/observers/__init__.py +++ b/kernel_tuner/observers/__init__.py @@ -1 +1 @@ -from .observer import BenchmarkObserver, IterationObserver, ContinuousObserver +from .observer import BenchmarkObserver, IterationObserver, ContinuousObserver, OutputObserver diff --git a/kernel_tuner/observers/observer.py b/kernel_tuner/observers/observer.py index 203805dd9..056a64a6a 100644 --- a/kernel_tuner/observers/observer.py +++ b/kernel_tuner/observers/observer.py @@ -12,7 +12,7 @@ def register_configuration(self, params): """Called once before benchmarking of a single kernel configuration. The `params` argument is a `dict` that stores the configuration parameters.""" pass - + def before_start(self): """before start is called every iteration before the kernel starts""" pass @@ -45,3 +45,16 @@ class IterationObserver(BenchmarkObserver): class ContinuousObserver(BenchmarkObserver): pass + + +class OutputObserver(BenchmarkObserver): + """Observer that can verify or measure something about the output produced by a kernel.""" + + @abstractmethod + def process_kernel_output(self, answer, output): + """method will be called once before benchmarking of a single kernel configuration. 
The arguments + provided are the `answer` as passed `tune_kernel` and the `output` produced by the kernel + """ + pass + + From 754fc45f95991f32d7c916da3a8936849645db37 Mon Sep 17 00:00:00 2001 From: stijn Date: Mon, 2 Oct 2023 11:35:52 +0200 Subject: [PATCH 21/24] Rename method of `OutputObserver` from `process_kernel_output` to `process_output` --- kernel_tuner/accuracy.py | 4 ++-- kernel_tuner/core.py | 3 +-- kernel_tuner/observers/observer.py | 2 +- kernel_tuner/observers/powersensor.py | 2 +- kernel_tuner/runners/sequential.py | 1 + 5 files changed, 6 insertions(+), 6 deletions(-) diff --git a/kernel_tuner/accuracy.py b/kernel_tuner/accuracy.py index 34598276f..1a24fef13 100644 --- a/kernel_tuner/accuracy.py +++ b/kernel_tuner/accuracy.py @@ -4,7 +4,7 @@ import logging import re -from .observers import OutputObserver +from kernel_tuner.observers import OutputObserver class Tunable(UserDict): @@ -283,7 +283,7 @@ def __init__(self, metric=None, key="error", *, atol=1e-8): self.metric = metric self.result = None - def process_kernel_output(self, answers, outputs): + def process_output(self, answers, outputs): errors = [] for answer, output in zip(answers, outputs): diff --git a/kernel_tuner/core.py b/kernel_tuner/core.py index 41e19f741..1a7be0562 100644 --- a/kernel_tuner/core.py +++ b/kernel_tuner/core.py @@ -6,7 +6,6 @@ import re import numpy as np -from kernel_tuner.accuracy import Tunable try: import cupy as cp @@ -423,7 +422,7 @@ def check_kernel_output(self, func, gpu_args, instance, answer, atol, verify, ve # Call the output observers for obs in self.output_observers: - obs.process_kernel_output(answer, result_host) + obs.process_output(answer, result_host) # There are three scenarios: # - if there is a custom verify function, call that. diff --git a/kernel_tuner/observers/observer.py b/kernel_tuner/observers/observer.py index 056a64a6a..493de94f8 100644 --- a/kernel_tuner/observers/observer.py +++ b/kernel_tuner/observers/observer.py @@ -51,7 +51,7 @@ class OutputObserver(BenchmarkObserver): """Observer that can verify or measure something about the output produced by a kernel.""" @abstractmethod - def process_kernel_output(self, answer, output): + def process_output(self, answer, output): """method will be called once before benchmarking of a single kernel configuration. 
The arguments provided are the `answer` as passed `tune_kernel` and the `output` produced by the kernel """ diff --git a/kernel_tuner/observers/powersensor.py b/kernel_tuner/observers/powersensor.py index 33ff979e2..6d07e8977 100644 --- a/kernel_tuner/observers/powersensor.py +++ b/kernel_tuner/observers/powersensor.py @@ -52,7 +52,7 @@ def after_finish(self): self.results["ps_energy"].append(ps_measured_e) if "ps_power" in self.observables: ps_measured_t = ( - end_state.time_at_read - self.begin_state.time_at_read + end_state.time_at_read - self.begin_state.time_at_read ) # seconds self.results["ps_power"].append(ps_measured_e / ps_measured_t) # Watt diff --git a/kernel_tuner/runners/sequential.py b/kernel_tuner/runners/sequential.py index 2b0362930..352a8321e 100644 --- a/kernel_tuner/runners/sequential.py +++ b/kernel_tuner/runners/sequential.py @@ -92,6 +92,7 @@ def run(self, parameter_space, tuning_options): warmup_time = 1e3 * (perf_counter() - warmup_time) result = self.dev.compile_and_benchmark(self.kernel_source, self.gpu_args, params, self.kernel_options, tuning_options) + params.update(result) if tuning_options.objective in result and isinstance(result[tuning_options.objective], ErrorConfig): From 91ed8926e4da8f71d9b57c4b8cca9b8aa6a2055f Mon Sep 17 00:00:00 2001 From: stijn Date: Mon, 2 Oct 2023 13:05:44 +0200 Subject: [PATCH 22/24] Add tests for `kernel_tuner.accuracy` --- kernel_tuner/accuracy.py | 46 ++++++++++++++--------- kernel_tuner/core.py | 31 +++++++-------- test/test_accuracy.py | 81 ++++++++++++++++++++++++++++++++++++++++ test/test_core.py | 18 +++++++++ 4 files changed, 144 insertions(+), 32 deletions(-) create mode 100644 test/test_accuracy.py diff --git a/kernel_tuner/accuracy.py b/kernel_tuner/accuracy.py index 1a24fef13..491541909 100644 --- a/kernel_tuner/accuracy.py +++ b/kernel_tuner/accuracy.py @@ -56,6 +56,28 @@ def __call__(self, params): return self.select_for_configuration(params) +def _find_bfloat16_if_available(): + # Try to get bfloat16 if available. + try: + from bfloat16 import bfloat16 + return bfloat16 + except ImportError: + pass + + try: + from tensorflow import bfloat16 + return bfloat16.as_numpy_dtype + except ImportError: + pass + + logging.warning( + "could not find `bfloat16` data type for numpy, " + + "please install either the package `bfloat16` or `tensorflow`" + ) + + return None + + def _to_float_dtype(x: str) -> np.dtype: """Convert a string to a numpy data type (``dtype``). This function recognizes common names (such as ``f16`` or ``kfloat``), and uses ``np.dtype(x)`` as a @@ -65,9 +87,10 @@ def _to_float_dtype(x: str) -> np.dtype: x = x.lower() if x in ("bfloat16", "bf16", "kbfloat16", "__nv_bfloat16"): - from bfloat16 import bfloat16 + result = _find_bfloat16_if_available() + if result is not None: + return result - return bfloat16 if x in ("half", "f16", "float16", "__half", "khalf", 16): return np.half if x in ("float", "single", "f32", "float32", "kfloat", 32): @@ -108,21 +131,10 @@ def __init__( if not dtypes: dtypes = dict(half=np.half, float=np.single, double=np.double) - # Try to get bfloat16 if available. 
- try: - from bfloat16 import bfloat16 - + bfloat16 = _find_bfloat16_if_available() + if bfloat16 is not None: dtypes["bfloat16"] = bfloat16 - except ImportError: - try: - from tensorflow import bfloat16 - dtypes["bfloat16"] = bfloat16.as_numpy_dtype - except ImportError: - logging.warning( - "could not find `bfloat16` data type for numpy, " - + "please install either the package `bfloat16` or `tensorflow`" - ) # If dtype is a list, convert it to a dictionary if isinstance(dtypes, (list, tuple)): @@ -224,12 +236,12 @@ def metric(a, b): elif key in ("male", "mean absolute logarithmic"): def metric(a, b): - return np.average(np.abs(np.log(a + EPS) - np.log(b + EPS))) + return np.average(np.abs(np.log10(a + EPS) - np.log10(b + EPS))) elif key in ("rmsle", "root mean squared logarithmic"): def metric(a, b): - return np.sqrt(np.average(np.square(np.log(a + EPS) - np.log(b + EPS)))) + return np.sqrt(np.average(np.square(np.log10(a + EPS) - np.log10(b + EPS)))) elif key in ("maximum absolute", "maximum"): diff --git a/kernel_tuner/core.py b/kernel_tuner/core.py index 1a7be0562..973c0ee06 100644 --- a/kernel_tuner/core.py +++ b/kernel_tuner/core.py @@ -438,18 +438,6 @@ def check_kernel_output(self, func, gpu_args, instance, answer, atol, verify, ve if not correct: raise RuntimeError("Kernel result verification failed for: " + util.get_config_string(instance.params)) - def preprocess_gpu_arguments(self, old_arguments, params): - """ Get a flat list of arguments based on the configuration given by `params` """ - new_arguments = [] - - for argument in old_arguments: - if isinstance(argument, Tunable): - new_arguments.append(argument.select_for_configuration(params)) - else: - new_arguments.append(argument) - - return new_arguments - def compile_and_benchmark(self, kernel_source, gpu_args, params, kernel_options, to): # reset previous timers last_compilation_time = None @@ -469,7 +457,7 @@ def compile_and_benchmark(self, kernel_source, gpu_args, params, kernel_options, result[to.objective] = util.InvalidConfig() else: # Preprocess the argument list. This is required to deal with `MixedPrecisionArray`s - gpu_args = self.preprocess_gpu_arguments(gpu_args, params) + gpu_args = _preprocess_gpu_arguments(gpu_args, params) try: # compile the kernel @@ -586,7 +574,7 @@ def create_kernel_instance(self, kernel_source, kernel_options, params, verbose) kernel_string, name = wrap_templated_kernel(kernel_string, name) # Preprocess GPU arguments. Require for handling `Tunable` arguments - arguments = self.preprocess_gpu_arguments(kernel_options.arguments, params) + arguments = _preprocess_gpu_arguments(kernel_options.arguments, params) #collect everything we know about this instance and return it return KernelInstance(name, kernel_source, kernel_string, temp_files, threads, grid, params, arguments) @@ -603,7 +591,7 @@ def ready_argument_list(self, arguments): """ready argument list to be passed to the kernel, allocates gpu mem if necessary""" flat_args = [] - # Flatten all arguments into a single list. Required to deal with `MixedPrecisionArray`s + # Flatten all arguments into a single list. 
Required to deal with `Tunable`s for argument in arguments: if isinstance(argument, Tunable): flat_args.extend(argument.values()) @@ -644,6 +632,19 @@ def run_kernel(self, func, gpu_args, instance): return True +def _preprocess_gpu_arguments(old_arguments, params): + """ Get a flat list of arguments based on the configuration given by `params` """ + new_arguments = [] + + for argument in old_arguments: + if isinstance(argument, Tunable): + new_arguments.append(argument.select_for_configuration(params)) + else: + new_arguments.append(argument) + + return new_arguments + + def _default_verify_function(instance, answer, result_host, atol, verbose): """default verify function based on np.allclose""" diff --git a/test/test_accuracy.py b/test/test_accuracy.py new file mode 100644 index 000000000..8406d68aa --- /dev/null +++ b/test/test_accuracy.py @@ -0,0 +1,81 @@ +import kernel_tuner + +import numpy as np +import pytest + + +def test_tunable(): + from kernel_tuner.accuracy import Tunable + + # Test with string as key + x = Tunable("foo", dict(a=1, b=2)) + assert x(dict(foo="a")) == 1 + assert x(dict(foo="b")) == 2 + + with pytest.raises(KeyError): + assert x(dict(foo="c")) == 3 + + # Test with lambda as key + x = Tunable(lambda p: p["foo"] + p["bar"], dict(ab=1, bc=2)) + assert x(dict(foo="a", bar="b")) == 1 + assert x(dict(foo="b", bar="c")) == 2 + + with pytest.raises(KeyError): + assert x(dict(foo="c", bar="d")) == 3 + + +def test_to_float_dtype(): + from kernel_tuner.accuracy import _to_float_dtype + + ## Unfortunately, numpy does not offer bfloat16 + # assert _to_float_dtype("bfloat16") == np.bfloat16 + + assert _to_float_dtype("half") == np.float16 + assert _to_float_dtype("f16") == np.float16 + assert _to_float_dtype("float16") == np.float16 + + assert _to_float_dtype("float") == np.float32 + assert _to_float_dtype("f32") == np.float32 + assert _to_float_dtype("float32") == np.float32 + + assert _to_float_dtype("double") == np.float64 + assert _to_float_dtype("f64") == np.float64 + assert _to_float_dtype("float64") == np.float64 + + +def test_tunable_precision(): + from kernel_tuner.accuracy import TunablePrecision + + inputs = np.array([1, 2, 3], dtype=np.float64) + x = TunablePrecision( + "foo", inputs, dict(float16=np.half, float32=np.float32, float64=np.double) + ) + + assert np.all(x(dict(foo="float16")) == inputs) + assert x(dict(foo="float16")).dtype == np.half + + assert np.all(x(dict(foo="float32")) == inputs) + assert x(dict(foo="float32")).dtype == np.float32 + + assert np.all(x(dict(foo="float64")) == inputs) + assert x(dict(foo="float64")).dtype == np.double + + +def test_error_metric_from_name(): + from kernel_tuner.accuracy import error_metric_from_name + from math import sqrt + + eps = 0.1 + a = np.array([0, 1, 2, 3]) + b = np.array([1, 1, 2, 5]) + + assert error_metric_from_name("mse")(a, b) == pytest.approx(1.25) + assert error_metric_from_name("rmse")(a, b) == pytest.approx(sqrt(1.25)) + assert error_metric_from_name("nrmse")(a, b) == pytest.approx(sqrt(1.25 / 3.5)) + assert error_metric_from_name("mae")(a, b) == pytest.approx(0.75) + assert error_metric_from_name("mre", eps)(a, b) == pytest.approx(2.666666666666666) + assert error_metric_from_name("rmsre", eps)(a, b) == pytest.approx(5.011098792790969) + assert error_metric_from_name("male", eps)(a, b) == pytest.approx(0.3144002918554722) + assert error_metric_from_name("rmsle", eps)(a, b) == pytest.approx(1.224514683450185) + assert error_metric_from_name("maximum abs")(a, b) == pytest.approx(2) + assert 
error_metric_from_name("maximum rel", eps)(a, b) == pytest.approx(10) diff --git a/test/test_core.py b/test/test_core.py index a2c8b61ed..a8624470e 100644 --- a/test/test_core.py +++ b/test/test_core.py @@ -178,6 +178,24 @@ def test_default_verify_function_scalar(): assert core._default_verify_function(instance, answer, result_host, 0.1, False) +def test_preprocess_gpu_arguments(): + from kernel_tuner.accuracy import Tunable + + arguments = [ + Tunable("foo", dict(a=1, b=2)), + Tunable(lambda p: p["foo"] + p["bar"], dict(ax=3, bx=4)), + ] + + params = dict(foo="a", bar="x") + + expected = [ + 1, + 3, + ] + + assert core._preprocess_gpu_arguments(arguments, params) == expected + + def test_split_argument_list(): test_string = "T *c, const T *__restrict__ a, T\n *\n b\n , int n" ans1, ans2 = core.split_argument_list([s.strip() for s in test_string.split(',')]) From 24346cd41a855d24dba9a47365379e67cbc1b492 Mon Sep 17 00:00:00 2001 From: stijn Date: Mon, 2 Oct 2023 13:23:10 +0200 Subject: [PATCH 23/24] Update accuracy example to use currect API --- examples/cuda/accuracy.py | 24 ++++++++---------------- 1 file changed, 8 insertions(+), 16 deletions(-) diff --git a/examples/cuda/accuracy.py b/examples/cuda/accuracy.py index 87485493f..bf00aa72b 100644 --- a/examples/cuda/accuracy.py +++ b/examples/cuda/accuracy.py @@ -3,19 +3,7 @@ import numpy from pprint import pprint from kernel_tuner import tune_kernel -from kernel_tuner.accuracy import TunablePrecision -from kernel_tuner.observers import AccuracyObserver - - -class MyObserver(AccuracyObserver): - def __init__(self): - self.error = None - - def process_kernel_output(self, answer, outputs): - self.error = numpy.average((answer[-1] - outputs[-1].astype(numpy.float64))**2) - - def get_results(self): - return dict(error=self.error) +from kernel_tuner.accuracy import TunablePrecision, AccuracyObserver def tune(): @@ -50,10 +38,13 @@ def tune(): answer = [None, None, None, a + b] tune_params = dict() - tune_params["block_size_x"] = [128+64*i for i in range(15)] + tune_params["block_size_x"] = [32, 64, 128, 256, 512, 1024] tune_params["float_type"] = ["float", "double", "half"] - observers = [MyObserver()] + observers = [ + AccuracyObserver("RMSE", "error_rmse"), + AccuracyObserver("MRE", "error_relative"), + ] results, env = tune_kernel( "vector_add", @@ -63,7 +54,8 @@ def tune(): tune_params, answer=answer, observers=observers, - lang="cupy") + lang="CUDA", + ) pprint(results) From 647c04bb16fe6d3c3ee0ac445f84cc5ea7d7c97f Mon Sep 17 00:00:00 2001 From: stijn Date: Mon, 2 Oct 2023 13:42:06 +0200 Subject: [PATCH 24/24] Fix failing test for `error_metric_from_name` --- test/test_accuracy.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/test_accuracy.py b/test/test_accuracy.py index 8406d68aa..1e5070637 100644 --- a/test/test_accuracy.py +++ b/test/test_accuracy.py @@ -76,6 +76,6 @@ def test_error_metric_from_name(): assert error_metric_from_name("mre", eps)(a, b) == pytest.approx(2.666666666666666) assert error_metric_from_name("rmsre", eps)(a, b) == pytest.approx(5.011098792790969) assert error_metric_from_name("male", eps)(a, b) == pytest.approx(0.3144002918554722) - assert error_metric_from_name("rmsle", eps)(a, b) == pytest.approx(1.224514683450185) + assert error_metric_from_name("rmsle", eps)(a, b) == pytest.approx(0.5317999700319226) assert error_metric_from_name("maximum abs")(a, b) == pytest.approx(2) assert error_metric_from_name("maximum rel", eps)(a, b) == pytest.approx(10)
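For reference, a minimal sketch of how the metric-name normalization introduced in patch 19 behaves when constructing error metrics. The inputs and expected values mirror test/test_accuracy.py; the values in the comments are assumptions based on those tests, not output taken from the patches.

import numpy as np
from kernel_tuner.accuracy import error_metric_from_name

a = np.array([0.0, 1.0, 2.0, 3.0])  # reference output
b = np.array([1.0, 1.0, 2.0, 5.0])  # kernel output

# "Root-Mean-Square Error" is lowercased, stripped of the word "error",
# and normalized to "root mean squared", which maps to the RMSE metric.
rmse = error_metric_from_name("Root-Mean-Square Error")

# "avg abs" expands to "mean absolute", i.e. the MAE metric.
mae = error_metric_from_name("avg abs")

print(rmse(a, b))  # sqrt(1.25), approximately 1.118
print(mae(a, b))   # 0.75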
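Likewise, a sketch of a user-defined output observer against the renamed API (``OutputObserver`` with ``process_output``). The class name and result key are hypothetical; ``get_results`` is assumed to follow the same contract as the ``MyObserver`` example that patch 23 removes from examples/cuda/accuracy.py.

import numpy as np
from kernel_tuner.observers import OutputObserver


class MaxAbsErrorObserver(OutputObserver):
    # Hypothetical observer: records the maximum absolute error over all
    # output arguments for which a reference answer was given.

    def __init__(self, key="max_abs_error"):
        self.key = key
        self.result = None

    def process_output(self, answer, output):
        # `answer` and `output` are lists of kernel arguments; entries are
        # None for arguments that have no reference value in `answer`.
        errors = [
            np.amax(np.abs(np.asarray(ref, dtype=np.float64) - np.asarray(out, dtype=np.float64)))
            for ref, out in zip(answer, output)
            if ref is not None
        ]
        self.result = float(max(errors))

    def get_results(self):
        return {self.key: self.result}

Passed to ``tune_kernel`` via ``observers=[MaxAbsErrorObserver()]`` together with an ``answer`` list, this reports a ``max_abs_error`` entry per configuration, alongside or instead of the built-in ``AccuracyObserver``.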