Skip to content

Commit

Permalink
Merge pull request #269 from KernelTuner/directives
Browse files Browse the repository at this point in the history
Directives: summer refactoring
  • Loading branch information
isazi authored Aug 20, 2024
2 parents 6ad115c + a950997 commit f307f50
Show file tree
Hide file tree
Showing 3 changed files with 132 additions and 35 deletions.
57 changes: 57 additions & 0 deletions examples/directives/matrix_multiply_c_openacc.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@
#!/usr/bin/env python
"""This is an example tuning a naive matrix multiplication using the simplified directives interface"""

from kernel_tuner import tune_kernel
from kernel_tuner.utils.directives import (
    Code,
    OpenACC,
    Cxx,
    process_directives,
)

# Matrix dimension (matrices are N x N). Keep in sync with the `#define N`
# inside the C source below: the metrics use this Python value, the kernel
# uses the preprocessor one.
N = 4096

# Annotated C source. The region between `#pragma tuner start mm ...` and
# `#pragma tuner stop` is what gets extracted as the tunable function "mm";
# `nthreads` is left undefined here because it is the tunable parameter.
code = """
#define N 4096
void matrix_multiply(float *A, float *B, float *C) {
#pragma tuner start mm A(float*:NN) B(float*:NN) C(float*:NN)
float temp_sum = 0.0f;
#pragma acc parallel vector_length(nthreads)
#pragma acc loop gang collapse(2)
for ( int i = 0; i < N; i++) {
for ( int j = 0; j < N; j++ ) {
temp_sum = 0.0f;
#pragma acc loop vector reduction(+:temp_sum)
for ( int k = 0; k < N; k++ ) {
temp_sum += A[(i * N) + k] * B[(k * N) + j];
}
C[(i * N) + j] = temp_sum;
}
}
#pragma tuner stop
}
"""

# Extract tunable directive
app = Code(OpenACC(), Cxx())
# `NN` is the array length used in the `tuner start` pragma (N * N elements).
dims = {"NN": N**2}
kernel_string, kernel_args = process_directives(app, code, user_dimensions=dims)

# Candidate vector lengths: 32, 64, ..., 1024.
tune_params = {"nthreads": [32 * i for i in range(1, 33)]}

# Metrics are inserted in dependency order: the later ones read "time_s".
metrics = {}
# NOTE(review): assumes x["time"] is reported in milliseconds -- confirm.
metrics["time_s"] = lambda x: x["time"] / 10**3
# Two 4-byte loads per inner iteration plus one 4-byte store per output element.
metrics["GB/s"] = lambda x: ((N**3 * 2 * 4) + (N**2 * 4)) / x["time_s"] / 10**9
# One multiply and one add per inner iteration, i.e. 2 * N^3 floating-point ops.
metrics["GFLOP/s"] = lambda x: (N**3 * 2) / x["time_s"] / 10**9

tune_kernel(
    "mm",
    kernel_string["mm"],
    0,  # problem size argument; passed as 0 in this example
    kernel_args["mm"],
    tune_params,
    metrics=metrics,
    compiler_options=["-fast", "-acc=gpu"],
    compiler="nvc++",
)
104 changes: 72 additions & 32 deletions kernel_tuner/utils/directives.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,37 @@
from typing import Any
from typing import Any, Tuple
from abc import ABC, abstractmethod
import numpy as np

# Function templates
# Skeleton of a generated C++ tunable function. Each <!?...?!> marker is a
# placeholder that is substituted with str.replace when the function is
# generated: preprocessor lines, user-dimension #defines, the function
# signature, optional initialization code, the body, and optional
# deinitialization code. The function is emitted with C linkage (extern "C")
# and <chrono> is always included.
cpp_template: str = """
<!?PREPROCESSOR?!>
<!?USER_DEFINES?!>
#include <chrono>
extern "C" <!?SIGNATURE?!> {
<!?INITIALIZATION?!>
<!?BODY?!>
<!?DEINITIALIZATION?!>
}
"""

# Skeleton of a generated Fortran tunable function, wrapped in module `kt`
# with `use iso_c_binding`. Each <!?...?!> marker is a placeholder that is
# substituted with str.replace when the function is generated; <!?NAME?!> is
# the function name used to close the `end function` statement.
f90_template: str = """
<!?PREPROCESSOR?!>
<!?USER_DEFINES?!>
module kt
use iso_c_binding
contains
<!?SIGNATURE?!>
<!?INITIALIZATION?!>
<!?BODY?!>
<!?DEINITIALIZATION?!>
end function <!?NAME?!>
end module kt
"""


class Directive(ABC):
"""Base class for all directives"""
Expand Down Expand Up @@ -339,7 +369,7 @@ def wrap_timing_fortran(code: str) -> str:

def end_timing_cxx(code: str) -> str:
    """Append the C++ statement that returns the measured time.

    In C++ the generated function has to return the elapsed time itself, so a
    ``return elapsed_time.count();`` line is added after ``code``.

    Args:
        code: C++ source to extend. It is expected to define ``elapsed_time``
            (presumably via the timing prologue -- verify against the caller).

    Returns:
        ``code`` followed by a newline, the return statement, and a trailing
        newline.
    """
    # Single return: the previous body carried a second, unreachable return
    # that built exactly the same string.
    return "\n".join([code, "return elapsed_time.count();\n"])


def wrap_data(code: str, langs: Code, data: dict, preprocessor: list = None, user_dimensions: dict = None) -> str:
Expand All @@ -355,7 +385,7 @@ def wrap_data(code: str, langs: Code, data: dict, preprocessor: list = None, use
elif is_openacc(langs.directive) and is_fortran(langs.language):
intro += create_data_directive_openacc_fortran(name, size)
outro += exit_data_directive_openacc_fortran(name, size)
return intro + code + outro
return "\n".join([intro, code, outro])


def extract_directive_code(code: str, langs: Code, kernel_name: str = None) -> dict:
Expand Down Expand Up @@ -529,42 +559,34 @@ def generate_directive_function(
) -> str:
"""Generate tunable function for one directive"""

code = "\n".join(preprocessor) + "\n"
if user_dimensions is not None:
# add user dimensions to preprocessor
for key, value in user_dimensions.items():
code += f"#define {key} {value}\n"
if is_cxx(langs.language) and "#include <chrono>" not in preprocessor:
code += "\n#include <chrono>\n"
if is_cxx(langs.language):
code += 'extern "C" ' + signature + "{\n"
elif is_fortran(langs.language):
code += "\nmodule kt\nuse iso_c_binding\ncontains\n"
code += "\n" + signature
if len(initialization) > 1:
code += initialization + "\n"
if data is not None:
body = add_present_openacc(body, langs, data, preprocessor, user_dimensions)
if is_cxx(langs.language):
code = cpp_template
body = start_timing_cxx(body)
if data is not None:
code += wrap_data(body + "\n", langs, data, preprocessor, user_dimensions)
else:
code += body
code = end_timing_cxx(code)
if len(deinitialization) > 1:
code += deinitialization + "\n"
code += "\n}"
body = wrap_data(body + "\n", langs, data, preprocessor, user_dimensions)
body = end_timing_cxx(body)
elif is_fortran(langs.language):
code = f90_template
body = wrap_timing(body, langs.language)
if data is not None:
code += wrap_data(body + "\n", langs, data, preprocessor, user_dimensions)
else:
code += body + "\n"
if len(deinitialization) > 1:
code += deinitialization + "\n"
body = wrap_data(body + "\n", langs, data, preprocessor, user_dimensions)
name = signature.split(" ")[1].split("(")[0]
code += f"\nend function {name}\nend module kt\n"
code = code.replace("<!?NAME?!>", name)
code = code.replace("<!?PREPROCESSOR?!>", "\n".join(preprocessor))
# if present, add user specific dimensions as defines
if user_dimensions is not None:
user_defines = ""
for key, value in user_dimensions.items():
user_defines += f"#define {key} {value}\n"
code = code.replace("<!?USER_DEFINES?!>", user_defines)
else:
code = code.replace("<!?USER_DEFINES?!>", "")
code = code.replace("<!?SIGNATURE?!>", signature)
code = code.replace("<!?INITIALIZATION?!>", initialization)
code = code.replace("<!?DEINITIALIZATION?!>", deinitialization)
if data is not None:
body = add_present_openacc(body, langs, data, preprocessor, user_dimensions)
code = code.replace("<!?BODY?!>", body)

return code

Expand Down Expand Up @@ -662,3 +684,21 @@ def add_present_openacc_fortran(name: str, size: ArraySize) -> str:
else:
md_size = fortran_md_size(size)
return f" present({name}({','.join(md_size)})) "


def process_directives(langs: Code, source: str, user_dimensions: dict = None) -> Tuple[dict, dict]:
    """Turn every tunable directive in ``source`` into a ready-to-tune function.

    The source is scanned once for preprocessor lines, directive signatures,
    bodies, data descriptions and (de)initialization code; then, for each
    signature found, a tunable function is generated and its arguments are
    allocated.

    Args:
        langs: the directive/language combination describing ``source``.
        source: the annotated source code to process.
        user_dimensions: optional mapping of dimension names to values,
            forwarded to function generation and memory allocation.

    Returns:
        A pair ``(kernel_strings, kernel_args)``; both dictionaries share the
        keys of the extracted signatures.
    """
    preprocessor = extract_preprocessor(source)
    signatures = extract_directive_signature(source, langs)
    bodies = extract_directive_code(source, langs)
    data = extract_directive_data(source, langs)
    init = extract_initialization_code(source, langs)
    deinit = extract_deinitialization_code(source, langs)

    functions = {}
    arguments = {}
    for name, signature in signatures.items():
        # Generate the function first, then allocate its arguments, one kernel at a time.
        functions[name] = generate_directive_function(
            preprocessor,
            signature,
            bodies[name],
            langs,
            data[name],
            init,
            deinit,
            user_dimensions,
        )
        arguments[name] = allocate_signature_memory(data[name], preprocessor, user_dimensions)
    return functions, arguments
6 changes: 3 additions & 3 deletions test/utils/test_directives.py
Original file line number Diff line number Diff line change
Expand Up @@ -105,13 +105,13 @@ def test_wrap_data():
code_f90 = "! this is a comment\n"
data = {"array": ["int*", "size"]}
preprocessor = ["#define size 42"]
expected_cxx = "#pragma acc enter data create(array[:42])\n#pragma acc update device(array[:42])\n// this is a comment\n#pragma acc exit data copyout(array[:42])\n"
expected_cxx = "#pragma acc enter data create(array[:42])\n#pragma acc update device(array[:42])\n\n// this is a comment\n\n#pragma acc exit data copyout(array[:42])\n"
assert wrap_data(code_cxx, acc_cxx, data, preprocessor, None) == expected_cxx
expected_f90 = "!$acc enter data create(array(:42))\n!$acc update device(array(:42))\n! this is a comment\n!$acc exit data copyout(array(:42))\n"
expected_f90 = "!$acc enter data create(array(:42))\n!$acc update device(array(:42))\n\n! this is a comment\n\n!$acc exit data copyout(array(:42))\n"
assert wrap_data(code_f90, acc_f90, data, preprocessor, None) == expected_f90
data = {"matrix": ["float*", "rows,cols"]}
preprocessor = ["#define rows 42", "#define cols 84"]
expected_f90 = "!$acc enter data create(matrix(:42,:84))\n!$acc update device(matrix(:42,:84))\n! this is a comment\n!$acc exit data copyout(matrix(:42,:84))\n"
expected_f90 = "!$acc enter data create(matrix(:42,:84))\n!$acc update device(matrix(:42,:84))\n\n! this is a comment\n\n!$acc exit data copyout(matrix(:42,:84))\n"
assert wrap_data(code_f90, acc_f90, data, preprocessor, None) == expected_f90
dimensions = {"rows": 42, "cols": 84}
assert wrap_data(code_f90, acc_f90, data, user_dimensions=dimensions) == expected_f90
Expand Down

0 comments on commit f307f50

Please sign in to comment.