
Merge pull request #248 from KernelTuner/directives
Improved OpenACC support
benvanwerkhoven authored May 23, 2024
2 parents 902340d + 88bde19 commit 9e8a59a
Showing 9 changed files with 798 additions and 327 deletions.
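At a glance, the updated examples use the directives helpers like this (a minimal sketch assembled from the diffs below; `code` stands for any source string annotated with tuner start/stop directives):

from kernel_tuner import tune_kernel
from kernel_tuner.utils.directives import (
    Code,
    OpenACC,
    Cxx,
    extract_directive_signature,
    extract_directive_code,
    extract_preprocessor,
    generate_directive_function,
    extract_directive_data,
    allocate_signature_memory,
)

code = "..."  # an annotated source string, e.g. the C++ example below

# describe the directive language and host language of the annotated source
app = Code(OpenACC(), Cxx())

# extract preprocessor lines, function signature, and body of the tunable region
preprocessor = extract_preprocessor(code)
signature = extract_directive_signature(code, app)
body = extract_directive_code(code, app)

# allocate host memory for the arguments declared in the tuner directive
data = extract_directive_data(code, app)
args = allocate_signature_memory(data["vector_add"], preprocessor)

# generate a self-contained function that Kernel Tuner can compile and benchmark
kernel_string = generate_directive_function(
    preprocessor, signature["vector_add"], body["vector_add"], app, data=data["vector_add"]
)

tune_params = {"nthreads": [32 * i for i in range(1, 33)]}
tune_kernel(
    "vector_add",
    kernel_string,
    0,
    args,
    tune_params,
    compiler="nvc++",
    compiler_options=["-fast", "-acc=gpu"],
)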
2 changes: 1 addition & 1 deletion examples/README.rst
@@ -18,7 +18,7 @@ Below we list the example applications and the features they illustrate.

Vector Add
----------
-[`CUDA <https://github.com/kerneltuner/kernel_tuner/blob/master/examples/cuda/vector_add.py>`__] [`CUDA-C++ <https://github.com/kerneltuner/kernel_tuner/blob/master/examples/cuda-c++/vector_add.py>`__] [`OpenCL <https://github.com/kerneltuner/kernel_tuner/blob/master/examples/opencl/vector_add.py>`__] [`C <https://github.com/kerneltuner/kernel_tuner/blob/master/examples/c/vector_add.py>`__] [`Fortran <https://github.com/kerneltuner/kernel_tuner/blob/master/examples/fortran/vector_add.py>`__]
+[`CUDA <https://github.com/kerneltuner/kernel_tuner/blob/master/examples/cuda/vector_add.py>`__] [`CUDA-C++ <https://github.com/kerneltuner/kernel_tuner/blob/master/examples/cuda-c++/vector_add.py>`__] [`OpenCL <https://github.com/kerneltuner/kernel_tuner/blob/master/examples/opencl/vector_add.py>`__] [`C <https://github.com/kerneltuner/kernel_tuner/blob/master/examples/c/vector_add.py>`__] [`Fortran <https://github.com/kerneltuner/kernel_tuner/blob/master/examples/fortran/vector_add.py>`__] [`OpenACC-C++ <https://github.com/kerneltuner/kernel_tuner/blob/master/examples/directives/vector_add_c_openacc.py>`__] [`OpenACC-Fortran <https://github.com/kerneltuner/kernel_tuner/blob/master/examples/directives/vector_add_fortran_openacc.py>`__]
- use Kernel Tuner to tune a simple kernel

Stencil
examples/directives/vector_add_c_openacc.py
@@ -3,19 +3,21 @@

from kernel_tuner import tune_kernel
from kernel_tuner.utils.directives import (
+    Code,
+    OpenACC,
+    Cxx,
extract_directive_signature,
extract_directive_code,
extract_preprocessor,
generate_directive_function,
extract_directive_data,
allocate_signature_memory,
)
-from collections import OrderedDict

code = """
#include <stdlib.h>
-#define VECTOR_SIZE 65536
+#define VECTOR_SIZE 1000000
int main(void) {
int size = VECTOR_SIZE;
@@ -24,7 +26,7 @@
float * c = (float *) malloc(VECTOR_SIZE * sizeof(float));
#pragma tuner start vector_add a(float*:VECTOR_SIZE) b(float*:VECTOR_SIZE) c(float*:VECTOR_SIZE) size(int:VECTOR_SIZE)
-#pragma acc parallel num_gangs(ngangs) vector_length(nthreads)
+#pragma acc parallel vector_length(nthreads)
#pragma acc loop
for ( int i = 0; i < size; i++ ) {
c[i] = a[i] + b[i];
@@ -37,21 +39,23 @@
}
"""

-# Extract tunable directive and generate kernel_string
+# Extract tunable directive
+app = Code(OpenACC(), Cxx())
preprocessor = extract_preprocessor(code)
-signature = extract_directive_signature(code)
-body = extract_directive_code(code)
-kernel_string = generate_directive_function(
-    preprocessor, signature["vector_add"], body["vector_add"]
-)

+signature = extract_directive_signature(code, app)
+body = extract_directive_code(code, app)
# Allocate memory on the host
-data = extract_directive_data(code)
+data = extract_directive_data(code, app)
args = allocate_signature_memory(data["vector_add"], preprocessor)
+# Generate kernel string
+kernel_string = generate_directive_function(
+    preprocessor, signature["vector_add"], body["vector_add"], app, data=data["vector_add"]
+)

-tune_params = OrderedDict()
-tune_params["ngangs"] = [2**i for i in range(0, 15)]
-tune_params["nthreads"] = [2**i for i in range(0, 11)]
+tune_params = dict()
+tune_params["nthreads"] = [32 * i for i in range(1, 33)]
+metrics = dict()
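+# GB/s: per element, two 4-byte loads (a, b) and one 4-byte store (c); x["time"] is in ms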
metrics["GB/s"] = lambda x: ((2 * 4 * len(args[0])) + (4 * len(args[0]))) / (x["time"] / 10**3) / 10**9

answer = [None, None, args[0] + args[1], None]

@@ -61,6 +65,7 @@
0,
args,
tune_params,
+    metrics=metrics,
answer=answer,
compiler_options=["-fast", "-acc=gpu"],
compiler="nvc++",
examples/directives/vector_add_fortran_openacc.py
@@ -3,26 +3,28 @@

from kernel_tuner import tune_kernel
from kernel_tuner.utils.directives import (
+    Code,
+    OpenACC,
+    Fortran,
extract_directive_signature,
extract_directive_code,
extract_preprocessor,
generate_directive_function,
extract_directive_data,
allocate_signature_memory,
)
-from collections import OrderedDict

code = """
-#define VECTOR_SIZE 65536
+#define VECTOR_SIZE 1000000
subroutine vector_add(A, B, C, n)
use iso_c_binding
real (c_float), intent(out), dimension(VECTOR_SIZE) :: C
real (c_float), intent(in), dimension(VECTOR_SIZE) :: A, B
integer (c_int), intent(in) :: n
-!$tuner start vector_add A(float*:VECTOR_SIZE) B(float*:VECTOR_SIZE) C(float*:VECTOR_SIZE) n(int:VECTOR_SIZE)
-!$acc parallel loop num_gangs(ngangs) vector_length(vlength)
+!$tuner start vector_add A(float*:VECTOR_SIZE) B(float*:VECTOR_SIZE) C(float*:VECTOR_SIZE) n(int:VECTOR_SIZE) i(int:VECTOR_SIZE)
+!$acc parallel loop vector_length(nthreads)
do i = 1, n
C(i) = A(i) + B(i)
end do
@@ -32,30 +34,33 @@
end subroutine vector_add
"""

-# Extract tunable directive and generate kernel_string
+# Extract tunable directive
+app = Code(OpenACC(), Fortran())
preprocessor = extract_preprocessor(code)
-signature = extract_directive_signature(code)
-body = extract_directive_code(code)
-kernel_string = generate_directive_function(
-    preprocessor, signature["vector_add"], body["vector_add"]
-)

+signature = extract_directive_signature(code, app)
+body = extract_directive_code(code, app)
# Allocate memory on the host
-data = extract_directive_data(code)
+data = extract_directive_data(code, app)
args = allocate_signature_memory(data["vector_add"], preprocessor)
+# Generate kernel string
+kernel_string = generate_directive_function(
+    preprocessor, signature["vector_add"], body["vector_add"], app, data=data["vector_add"]
+)

-tune_params = OrderedDict()
-tune_params["ngangs"] = [2**i for i in range(0, 15)]
-tune_params["vlength"] = [2**i for i in range(0, 11)]
+tune_params = dict()
+tune_params["nthreads"] = [32 * i for i in range(1, 33)]
+metrics = dict()
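+# GB/s: per element, two 4-byte loads (A, B) and one 4-byte store (C); x["time"] is in ms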
metrics["GB/s"] = lambda x: ((2 * 4 * len(args[0])) + (4 * len(args[0]))) / (x["time"] / 10**3) / 10**9

-answer = [None, None, args[0] + args[1], None]
+answer = [None, None, args[0] + args[1], None, None]
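+# the extra None skips verification of the loop index i, newly declared in the tuner start line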

tune_kernel(
"vector_add",
kernel_string,
0,
args,
tune_params,
+    metrics=metrics,
answer=answer,
compiler_options=["-fast", "-acc=gpu"],
compiler="nvfortran",
26 changes: 10 additions & 16 deletions kernel_tuner/backends/compiler.py
@@ -146,9 +146,7 @@ def ready_argument_list(self, arguments):

for i, arg in enumerate(arguments):
if not (isinstance(arg, (np.ndarray, np.number)) or is_cupy_array(arg)):
-                raise TypeError(
-                    f"Argument is not numpy or cupy ndarray or numpy scalar but a {type(arg)}"
-                )
+                raise TypeError(f"Argument is not numpy or cupy ndarray or numpy scalar but a {type(arg)}")
dtype_str = str(arg.dtype)
if isinstance(arg, np.ndarray):
if dtype_str in dtype_map.keys():
@@ -210,11 +208,7 @@ def compile(self, kernel_instance):

# detect whether to use nvcc as default instead of g++, may overrule an explicitly passed g++
if (
-            (
-                (suffix == ".cu")
-                or ("#include <cuda" in kernel_string)
-                or ("cudaMemcpy" in kernel_string)
-            )
+            ((suffix == ".cu") or ("#include <cuda" in kernel_string) or ("cudaMemcpy" in kernel_string))
and self.compiler == "g++"
and self.nvcc_available
):
@@ -271,11 +265,7 @@ def compile(self, kernel_instance):
if platform.system() == "Darwin":
lib_extension = ".dylib"

-        subprocess.check_call(
-            [self.compiler, "-c", source_file]
-            + compiler_options
-            + ["-o", filename + ".o"]
-        )
+        subprocess.check_call([self.compiler, "-c", source_file] + compiler_options + ["-o", filename + ".o"])
subprocess.check_call(
[self.compiler, filename + ".o"]
+ compiler_options
@@ -319,7 +309,7 @@ def synchronize(self):
C backend does not support asynchronous launches"""
pass

-    def run_kernel(self, func, c_args, threads, grid):
+    def run_kernel(self, func, c_args, threads, grid, stream=None):
"""runs the kernel once, returns whatever the kernel returns
:param func: A C function compiled for this specific configuration
@@ -331,11 +321,15 @@ def run_kernel(self, func, c_args, threads, grid):
:type c_args: list(Argument)
:param threads: Ignored, but left as argument for now to have the same
-            interface as CudaFunctions and OpenCLFunctions.
+            interface as Backend.
:type threads: any
:param grid: Ignored, but left as argument for now to have the same
-            interface as CudaFunctions and OpenCLFunctions.
+            interface as Backend.
:type grid: any
+        :param stream: Ignored, but left as argument for now to have the same
+            interface as Backend.
+        :type stream: any
:returns: A robust average of values returned by the C function.
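For illustration, a direct call against this backend might look as follows (a sketch; cfunc, func, and c_args are hypothetical and assumed to come from this backend's constructor, compile, and ready_argument_list):

result = cfunc.run_kernel(func, c_args, threads=None, grid=None)  # stream defaults to None and is ignored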
(5 of the 9 changed files are not shown.)
