
Merge pull request #248 from KernelTuner/directives
Improved OpenACC support
benvanwerkhoven authored May 23, 2024
2 parents 902340d + 88bde19 commit 9e8a59a
Showing 9 changed files with 798 additions and 327 deletions.
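At a glance, the updated examples use the directives helpers like this (a minimal sketch assembled from the diffs below; `code` stands for any source string annotated with tuner start/stop directives):

from kernel_tuner import tune_kernel
from kernel_tuner.utils.directives import (
    Code,
    OpenACC,
    Cxx,
    extract_directive_signature,
    extract_directive_code,
    extract_preprocessor,
    generate_directive_function,
    extract_directive_data,
    allocate_signature_memory,
)

code = "..."  # an annotated source string, e.g. the C++ example below

# describe the directive language and host language of the annotated source
app = Code(OpenACC(), Cxx())

# extract preprocessor lines, function signature, and body of the tunable region
preprocessor = extract_preprocessor(code)
signature = extract_directive_signature(code, app)
body = extract_directive_code(code, app)

# allocate host memory for the arguments declared in the tuner directive
data = extract_directive_data(code, app)
args = allocate_signature_memory(data["vector_add"], preprocessor)

# generate a self-contained function that Kernel Tuner can compile and benchmark
kernel_string = generate_directive_function(
    preprocessor, signature["vector_add"], body["vector_add"], app, data=data["vector_add"]
)

tune_params = {"nthreads": [32 * i for i in range(1, 33)]}
tune_kernel(
    "vector_add",
    kernel_string,
    0,
    args,
    tune_params,
    compiler="nvc++",
    compiler_options=["-fast", "-acc=gpu"],
)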
2 changes: 1 addition & 1 deletion examples/README.rst
@@ -18,7 +18,7 @@ Below we list the example applications and the features they illustrate.

Vector Add
----------
-[`CUDA <https://github.com/kerneltuner/kernel_tuner/blob/master/examples/cuda/vector_add.py>`__] [`CUDA-C++ <https://github.com/kerneltuner/kernel_tuner/blob/master/examples/cuda-c++/vector_add.py>`__] [`OpenCL <https://github.com/kerneltuner/kernel_tuner/blob/master/examples/opencl/vector_add.py>`__] [`C <https://github.com/kerneltuner/kernel_tuner/blob/master/examples/c/vector_add.py>`__] [`Fortran <https://github.com/kerneltuner/kernel_tuner/blob/master/examples/fortran/vector_add.py>`__]
+[`CUDA <https://github.com/kerneltuner/kernel_tuner/blob/master/examples/cuda/vector_add.py>`__] [`CUDA-C++ <https://github.com/kerneltuner/kernel_tuner/blob/master/examples/cuda-c++/vector_add.py>`__] [`OpenCL <https://github.com/kerneltuner/kernel_tuner/blob/master/examples/opencl/vector_add.py>`__] [`C <https://github.com/kerneltuner/kernel_tuner/blob/master/examples/c/vector_add.py>`__] [`Fortran <https://github.com/kerneltuner/kernel_tuner/blob/master/examples/fortran/vector_add.py>`__] [`OpenACC-C++ <https://github.com/kerneltuner/kernel_tuner/blob/master/examples/directives/vector_add_c_openacc.py>`__] [`OpenACC-Fortran <https://github.com/kerneltuner/kernel_tuner/blob/master/examples/directives/vector_add_fortran_openacc.py>`__]
- use Kernel Tuner to tune a simple kernel

Stencil
examples/directives/vector_add_c_openacc.py
@@ -3,19 +3,21 @@

from kernel_tuner import tune_kernel
from kernel_tuner.utils.directives import (
+    Code,
+    OpenACC,
+    Cxx,
extract_directive_signature,
extract_directive_code,
extract_preprocessor,
generate_directive_function,
extract_directive_data,
allocate_signature_memory,
)
-from collections import OrderedDict

code = """
#include <stdlib.h>
-#define VECTOR_SIZE 65536
+#define VECTOR_SIZE 1000000
int main(void) {
int size = VECTOR_SIZE;
@@ -24,7 +26,7 @@
float * c = (float *) malloc(VECTOR_SIZE * sizeof(float));
#pragma tuner start vector_add a(float*:VECTOR_SIZE) b(float*:VECTOR_SIZE) c(float*:VECTOR_SIZE) size(int:VECTOR_SIZE)
-#pragma acc parallel num_gangs(ngangs) vector_length(nthreads)
+#pragma acc parallel vector_length(nthreads)
#pragma acc loop
for ( int i = 0; i < size; i++ ) {
c[i] = a[i] + b[i];
@@ -37,21 +39,23 @@
}
"""

-# Extract tunable directive and generate kernel_string
+# Extract tunable directive
+app = Code(OpenACC(), Cxx())
preprocessor = extract_preprocessor(code)
-signature = extract_directive_signature(code)
-body = extract_directive_code(code)
-kernel_string = generate_directive_function(
-    preprocessor, signature["vector_add"], body["vector_add"]
-)

+signature = extract_directive_signature(code, app)
+body = extract_directive_code(code, app)
# Allocate memory on the host
-data = extract_directive_data(code)
+data = extract_directive_data(code, app)
args = allocate_signature_memory(data["vector_add"], preprocessor)
+# Generate kernel string
+kernel_string = generate_directive_function(
+    preprocessor, signature["vector_add"], body["vector_add"], app, data=data["vector_add"]
+)

-tune_params = OrderedDict()
-tune_params["ngangs"] = [2**i for i in range(0, 15)]
-tune_params["nthreads"] = [2**i for i in range(0, 11)]
+tune_params = dict()
+tune_params["nthreads"] = [32 * i for i in range(1, 33)]
+metrics = dict()
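+# GB/s: per element, two 4-byte loads (a, b) and one 4-byte store (c); x["time"] is in ms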
metrics["GB/s"] = lambda x: ((2 * 4 * len(args[0])) + (4 * len(args[0]))) / (x["time"] / 10**3) / 10**9

answer = [None, None, args[0] + args[1], None]

@@ -61,6 +65,7 @@
0,
args,
tune_params,
+    metrics=metrics,
answer=answer,
compiler_options=["-fast", "-acc=gpu"],
compiler="nvc++",
examples/directives/vector_add_fortran_openacc.py
@@ -3,26 +3,28 @@

from kernel_tuner import tune_kernel
from kernel_tuner.utils.directives import (
+    Code,
+    OpenACC,
+    Fortran,
extract_directive_signature,
extract_directive_code,
extract_preprocessor,
generate_directive_function,
extract_directive_data,
allocate_signature_memory,
)
-from collections import OrderedDict

code = """
-#define VECTOR_SIZE 65536
+#define VECTOR_SIZE 1000000
subroutine vector_add(A, B, C, n)
use iso_c_binding
real (c_float), intent(out), dimension(VECTOR_SIZE) :: C
real (c_float), intent(in), dimension(VECTOR_SIZE) :: A, B
integer (c_int), intent(in) :: n
-!$tuner start vector_add A(float*:VECTOR_SIZE) B(float*:VECTOR_SIZE) C(float*:VECTOR_SIZE) n(int:VECTOR_SIZE)
-!$acc parallel loop num_gangs(ngangs) vector_length(vlength)
+!$tuner start vector_add A(float*:VECTOR_SIZE) B(float*:VECTOR_SIZE) C(float*:VECTOR_SIZE) n(int:VECTOR_SIZE) i(int:VECTOR_SIZE)
+!$acc parallel loop vector_length(nthreads)
do i = 1, n
C(i) = A(i) + B(i)
end do
@@ -32,30 +34,33 @@
end subroutine vector_add
"""

-# Extract tunable directive and generate kernel_string
+# Extract tunable directive
+app = Code(OpenACC(), Fortran())
preprocessor = extract_preprocessor(code)
-signature = extract_directive_signature(code)
-body = extract_directive_code(code)
-kernel_string = generate_directive_function(
-    preprocessor, signature["vector_add"], body["vector_add"]
-)

+signature = extract_directive_signature(code, app)
+body = extract_directive_code(code, app)
# Allocate memory on the host
-data = extract_directive_data(code)
+data = extract_directive_data(code, app)
args = allocate_signature_memory(data["vector_add"], preprocessor)
+# Generate kernel string
+kernel_string = generate_directive_function(
+    preprocessor, signature["vector_add"], body["vector_add"], app, data=data["vector_add"]
+)

-tune_params = OrderedDict()
-tune_params["ngangs"] = [2**i for i in range(0, 15)]
-tune_params["vlength"] = [2**i for i in range(0, 11)]
+tune_params = dict()
+tune_params["nthreads"] = [32 * i for i in range(1, 33)]
+metrics = dict()
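+# GB/s: per element, two 4-byte loads (A, B) and one 4-byte store (C); x["time"] is in ms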
metrics["GB/s"] = lambda x: ((2 * 4 * len(args[0])) + (4 * len(args[0]))) / (x["time"] / 10**3) / 10**9

-answer = [None, None, args[0] + args[1], None]
+answer = [None, None, args[0] + args[1], None, None]
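+# the extra None skips verification of the loop index i, newly declared in the tuner start line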

tune_kernel(
"vector_add",
kernel_string,
0,
args,
tune_params,
+    metrics=metrics,
answer=answer,
compiler_options=["-fast", "-acc=gpu"],
compiler="nvfortran",
26 changes: 10 additions & 16 deletions kernel_tuner/backends/compiler.py
@@ -146,9 +146,7 @@ def ready_argument_list(self, arguments):

for i, arg in enumerate(arguments):
if not (isinstance(arg, (np.ndarray, np.number)) or is_cupy_array(arg)):
-                raise TypeError(
-                    f"Argument is not numpy or cupy ndarray or numpy scalar but a {type(arg)}"
-                )
+                raise TypeError(f"Argument is not numpy or cupy ndarray or numpy scalar but a {type(arg)}")
dtype_str = str(arg.dtype)
if isinstance(arg, np.ndarray):
if dtype_str in dtype_map.keys():
@@ -210,11 +208,7 @@ def compile(self, kernel_instance):

# detect whether to use nvcc as default instead of g++, may overrule an explicitly passed g++
if (
-            (
-                (suffix == ".cu")
-                or ("#include <cuda" in kernel_string)
-                or ("cudaMemcpy" in kernel_string)
-            )
+            ((suffix == ".cu") or ("#include <cuda" in kernel_string) or ("cudaMemcpy" in kernel_string))
and self.compiler == "g++"
and self.nvcc_available
):
@@ -271,11 +265,7 @@ def compile(self, kernel_instance):
if platform.system() == "Darwin":
lib_extension = ".dylib"

-        subprocess.check_call(
-            [self.compiler, "-c", source_file]
-            + compiler_options
-            + ["-o", filename + ".o"]
-        )
+        subprocess.check_call([self.compiler, "-c", source_file] + compiler_options + ["-o", filename + ".o"])
subprocess.check_call(
[self.compiler, filename + ".o"]
+ compiler_options
@@ -319,7 +309,7 @@ def synchronize(self):
C backend does not support asynchronous launches"""
pass

-    def run_kernel(self, func, c_args, threads, grid):
+    def run_kernel(self, func, c_args, threads, grid, stream=None):
"""runs the kernel once, returns whatever the kernel returns
:param func: A C function compiled for this specific configuration
@@ -331,11 +321,15 @@ def run_kernel(self, func, c_args, threads, grid):
:type c_args: list(Argument)
:param threads: Ignored, but left as argument for now to have the same
-            interface as CudaFunctions and OpenCLFunctions.
+            interface as Backend.
:type threads: any
:param grid: Ignored, but left as argument for now to have the same
-            interface as CudaFunctions and OpenCLFunctions.
+            interface as Backend.
:type grid: any
+        :param stream: Ignored, but left as argument for now to have the same
+            interface as Backend.
+        :type stream: any
:returns: A robust average of values returned by the C function.
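For illustration, a direct call against this backend might look as follows (a sketch; cfunc, func, and c_args are hypothetical and assumed to come from this backend's constructor, compile, and ready_argument_list):

result = cfunc.run_kernel(func, c_args, threads=None, grid=None)  # stream defaults to None and is ignored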
(5 of the 9 changed files are not shown.)
