diff --git a/examples/README.rst b/examples/README.rst index 7fd4832cf..e2749882d 100644 --- a/examples/README.rst +++ b/examples/README.rst @@ -18,7 +18,7 @@ Below we list the example applications and the features they illustrate. Vector Add ---------- -[`CUDA `__] [`CUDA-C++ `__] [`OpenCL `__] [`C `__] [`Fortran `__] +[`CUDA `__] [`CUDA-C++ `__] [`OpenCL `__] [`C `__] [`Fortran `__] [`OpenACC-C++ `__] [`OpenACC-Fortran `__] - use Kernel Tuner to tune a simple kernel Stencil diff --git a/examples/c/vector_add_openacc.py b/examples/directives/vector_add_c_openacc.py similarity index 67% rename from examples/c/vector_add_openacc.py rename to examples/directives/vector_add_c_openacc.py index 8894de1e2..c062f2b1e 100644 --- a/examples/c/vector_add_openacc.py +++ b/examples/directives/vector_add_c_openacc.py @@ -3,6 +3,9 @@ from kernel_tuner import tune_kernel from kernel_tuner.utils.directives import ( + Code, + OpenACC, + Cxx, extract_directive_signature, extract_directive_code, extract_preprocessor, @@ -10,12 +13,11 @@ extract_directive_data, allocate_signature_memory, ) -from collections import OrderedDict code = """ #include -#define VECTOR_SIZE 65536 +#define VECTOR_SIZE 1000000 int main(void) { int size = VECTOR_SIZE; @@ -24,7 +26,7 @@ float * c = (float *) malloc(VECTOR_SIZE * sizeof(float)); #pragma tuner start vector_add a(float*:VECTOR_SIZE) b(float*:VECTOR_SIZE) c(float*:VECTOR_SIZE) size(int:VECTOR_SIZE) - #pragma acc parallel num_gangs(ngangs) vector_length(nthreads) + #pragma acc parallel vector_length(nthreads) #pragma acc loop for ( int i = 0; i < size; i++ ) { c[i] = a[i] + b[i]; @@ -37,21 +39,23 @@ } """ -# Extract tunable directive and generate kernel_string +# Extract tunable directive +app = Code(OpenACC(), Cxx()) preprocessor = extract_preprocessor(code) -signature = extract_directive_signature(code) -body = extract_directive_code(code) -kernel_string = generate_directive_function( - preprocessor, signature["vector_add"], body["vector_add"] -) - +signature = extract_directive_signature(code, app) +body = extract_directive_code(code, app) # Allocate memory on the host -data = extract_directive_data(code) +data = extract_directive_data(code, app) args = allocate_signature_memory(data["vector_add"], preprocessor) +# Generate kernel string +kernel_string = generate_directive_function( + preprocessor, signature["vector_add"], body["vector_add"], app, data=data["vector_add"] +) -tune_params = OrderedDict() -tune_params["ngangs"] = [2**i for i in range(0, 15)] -tune_params["nthreads"] = [2**i for i in range(0, 11)] +tune_params = dict() +tune_params["nthreads"] = [32 * i for i in range(1, 33)] +metrics = dict() +metrics["GB/s"] = lambda x: ((2 * 4 * len(args[0])) + (4 * len(args[0]))) / (x["time"] / 10**3) / 10**9 answer = [None, None, args[0] + args[1], None] @@ -61,6 +65,7 @@ 0, args, tune_params, + metrics=metrics, answer=answer, compiler_options=["-fast", "-acc=gpu"], compiler="nvc++", diff --git a/examples/fortran/vector_add_openacc.py b/examples/directives/vector_add_fortran_openacc.py similarity index 60% rename from examples/fortran/vector_add_openacc.py rename to examples/directives/vector_add_fortran_openacc.py index 60058e9eb..29e94646a 100644 --- a/examples/fortran/vector_add_openacc.py +++ b/examples/directives/vector_add_fortran_openacc.py @@ -3,6 +3,9 @@ from kernel_tuner import tune_kernel from kernel_tuner.utils.directives import ( + Code, + OpenACC, + Fortran, extract_directive_signature, extract_directive_code, extract_preprocessor, @@ -10,10 +13,9 @@ extract_directive_data, allocate_signature_memory, ) -from collections import OrderedDict code = """ -#define VECTOR_SIZE 65536 +#define VECTOR_SIZE 1000000 subroutine vector_add(A, B, C, n) use iso_c_binding @@ -21,8 +23,8 @@ real (c_float), intent(in), dimension(VECTOR_SIZE) :: A, B integer (c_int), intent(in) :: n - !$tuner start vector_add A(float*:VECTOR_SIZE) B(float*:VECTOR_SIZE) C(float*:VECTOR_SIZE) n(int:VECTOR_SIZE) - !$acc parallel loop num_gangs(ngangs) vector_length(vlength) + !$tuner start vector_add A(float*:VECTOR_SIZE) B(float*:VECTOR_SIZE) C(float*:VECTOR_SIZE) n(int:VECTOR_SIZE) i(int:VECTOR_SIZE) + !$acc parallel loop vector_length(nthreads) do i = 1, n C(i) = A(i) + B(i) end do @@ -32,23 +34,25 @@ end subroutine vector_add """ -# Extract tunable directive and generate kernel_string +# Extract tunable directive +app = Code(OpenACC(), Fortran()) preprocessor = extract_preprocessor(code) -signature = extract_directive_signature(code) -body = extract_directive_code(code) -kernel_string = generate_directive_function( - preprocessor, signature["vector_add"], body["vector_add"] -) - +signature = extract_directive_signature(code, app) +body = extract_directive_code(code, app) # Allocate memory on the host -data = extract_directive_data(code) +data = extract_directive_data(code, app) args = allocate_signature_memory(data["vector_add"], preprocessor) +# Generate kernel string +kernel_string = generate_directive_function( + preprocessor, signature["vector_add"], body["vector_add"], app, data=data["vector_add"] +) -tune_params = OrderedDict() -tune_params["ngangs"] = [2**i for i in range(0, 15)] -tune_params["vlength"] = [2**i for i in range(0, 11)] +tune_params = dict() +tune_params["nthreads"] = [32 * i for i in range(1, 33)] +metrics = dict() +metrics["GB/s"] = lambda x: ((2 * 4 * len(args[0])) + (4 * len(args[0]))) / (x["time"] / 10**3) / 10**9 -answer = [None, None, args[0] + args[1], None] +answer = [None, None, args[0] + args[1], None, None] tune_kernel( "vector_add", @@ -56,6 +60,7 @@ 0, args, tune_params, + metrics=metrics, answer=answer, compiler_options=["-fast", "-acc=gpu"], compiler="nvfortran", diff --git a/kernel_tuner/backends/compiler.py b/kernel_tuner/backends/compiler.py index 2cccae523..b5724a1a0 100644 --- a/kernel_tuner/backends/compiler.py +++ b/kernel_tuner/backends/compiler.py @@ -146,9 +146,7 @@ def ready_argument_list(self, arguments): for i, arg in enumerate(arguments): if not (isinstance(arg, (np.ndarray, np.number)) or is_cupy_array(arg)): - raise TypeError( - f"Argument is not numpy or cupy ndarray or numpy scalar but a {type(arg)}" - ) + raise TypeError(f"Argument is not numpy or cupy ndarray or numpy scalar but a {type(arg)}") dtype_str = str(arg.dtype) if isinstance(arg, np.ndarray): if dtype_str in dtype_map.keys(): @@ -210,11 +208,7 @@ def compile(self, kernel_instance): # detect whether to use nvcc as default instead of g++, may overrule an explicitly passed g++ if ( - ( - (suffix == ".cu") - or ("#include str: + pass + + +class Language(ABC): + """Base class for all languages""" + + @abstractmethod + def get(self) -> str: + pass + + +class OpenACC(Directive): + """Class to represent OpenACC""" + + def get(self) -> str: + return "openacc" + + +class Cxx(Language): + """Class to represent C++ code""" + + def get(self) -> str: + return "cxx" + + +class Fortran(Language): + """Class to represent Fortran code""" + + def get(self) -> str: + return "fortran" + + +class Code(object): + """Class to represent the directive and host code of the application""" + + def __init__(self, directive: Directive, lang: Language): + self.directive = directive + self.language = lang + + +def is_openacc(directive: Directive) -> bool: + """Check if a directive is OpenACC""" + return isinstance(directive, OpenACC) + + +def is_cxx(lang: Language) -> bool: + """Check if language is C++""" + return isinstance(lang, Cxx) + + +def is_fortran(lang: Language) -> bool: + """Check if language is Fortran""" + return isinstance(lang, Fortran) + + +def line_contains_openacc_directive(line: str, lang: Language) -> bool: + """Check if line contains an OpenACC directive or not""" + if is_cxx(lang): + return line_contains_openacc_directive_cxx(line) + elif is_fortran(lang): + return line_contains_openacc_directive_fortran(line) + return False + + +def line_contains_openacc_directive_cxx(line: str) -> bool: + """Check if a line of code contains a C++ OpenACC directive or not""" + return line_contains(line, "#pragma acc") + + +def line_contains_openacc_directive_fortran(line: str) -> bool: + """Check if a line of code contains a Fortran OpenACC directive or not""" + return line_contains(line, "!$acc") + + +def line_contains_openacc_parallel_directive(line: str, lang: Language) -> bool: + """Check if line contains an OpenACC parallel directive or not""" + if is_cxx(lang): + return line_contains_openacc_parallel_directive_cxx(line) + elif is_fortran(lang): + return line_contains_openacc_parallel_directive_fortran(line) + return False + + +def line_contains_openacc_parallel_directive_cxx(line: str) -> bool: + """Check if a line of code contains a C++ OpenACC parallel directive or not""" + return line_contains(line, "#pragma acc parallel") + + +def line_contains_openacc_parallel_directive_fortran(line: str) -> bool: + """Check if a line of code contains a Fortran OpenACC parallel directive or not""" + return line_contains(line, "!$acc parallel") + + +def line_contains(line: str, target: str) -> bool: + """Generic helper to check if a line contains the target""" + return target in line + + +def openacc_directive_contains_clause(line: str, clauses: list) -> bool: + """Check if an OpenACC directive contains one clause from a list""" + for clause in clauses: + if clause in line: + return True + return False + + +def openacc_directive_contains_data_clause(line: str) -> bool: + """Check if an OpenACC directive contains one data clause""" + data_clauses = ["copy", "copyin", "copyout", "create", "no_create", "present", "device_ptr", "attach"] + return openacc_directive_contains_clause(line, data_clauses) + + +def create_data_directive_openacc(name: str, size: int, lang: Language) -> str: + """Create a data directive for a given language""" + if is_cxx(lang): + return create_data_directive_openacc_cxx(name, size) + elif is_fortran(lang): + return create_data_directive_openacc_fortran(name, size) + return "" + + +def create_data_directive_openacc_cxx(name: str, size: int) -> str: + """Create C++ OpenACC code to allocate and copy data""" + return f"#pragma acc enter data create({name}[:{size}])\n#pragma acc update device({name}[:{size}])\n" + + +def create_data_directive_openacc_fortran(name: str, size: int) -> str: + """Create Fortran OpenACC code to allocate and copy data""" + return f"!$acc enter data create({name}(:{size}))\n!$acc update device({name}(:{size}))\n" + + +def exit_data_directive_openacc(name: str, size: int, lang: Language) -> str: + """Create code to copy data back for a given language""" + if is_cxx(lang): + return exit_data_directive_openacc_cxx(name, size) + elif is_fortran(lang): + return exit_data_directive_openacc_fortran(name, size) + return "" + + +def exit_data_directive_openacc_cxx(name: str, size: int) -> str: + """Create C++ OpenACC code to copy back data""" + return f"#pragma acc exit data copyout({name}[:{size}])\n" + + +def exit_data_directive_openacc_fortran(name: str, size: int) -> str: + """Create Fortran OpenACC code to copy back data""" + return f"!$acc exit data copyout({name}(:{size}))\n" + + def correct_kernel(kernel_name: str, line: str) -> bool: - return f" {kernel_name} " in line or ( - kernel_name in line and len(line.partition(kernel_name)[2]) == 0 - ) + """Checks if the line contains the correct kernel name""" + return f" {kernel_name} " in line or (kernel_name in line and len(line.partition(kernel_name)[2]) == 0) -def cpp_or_f90(code: str) -> tuple: - return "#pragma acc" in code, "!$acc" in code +def find_size_in_preprocessor(dimension: str, preprocessor: list) -> int: + """Find the dimension of a directive defined value in the preprocessor""" + ret_size = None + for line in preprocessor: + if f"#define {dimension}" in line: + try: + ret_size = int(line.split(" ")[2]) + break + except ValueError: + continue + return ret_size -def extract_code(start: str, stop: str, code: str, kernel_name: str = None) -> dict: +def extract_code(start: str, stop: str, code: str, langs: Code, kernel_name: str = None) -> dict: """Extract an arbitrary section of code""" found_section = False sections = dict() tmp_string = list() name = "" init_found = 0 - cpp, f90 = cpp_or_f90(code) for line in code.replace("\\\n", "").split("\n"): if found_section: @@ -34,9 +198,9 @@ def extract_code(start: str, stop: str, code: str, kernel_name: str = None) -> d if kernel_name is None or correct_kernel(kernel_name, line): found_section = True try: - if cpp: + if is_cxx(langs.language): name = line.strip().split(" ")[3] - elif f90: + elif is_fortran(langs.language): name = line.strip().split(" ")[2] except IndexError: name = f"init_{init_found}" @@ -45,45 +209,147 @@ def extract_code(start: str, stop: str, code: str, kernel_name: str = None) -> d return sections -def extract_directive_code(code: str, kernel_name: str = None) -> dict: - """Extract explicitly marked directive sections from code""" - cpp, f90 = cpp_or_f90(code) +def parse_size(size: Any, preprocessor: list = None, dimensions: dict = None) -> int: + """Converts an arbitrary object into an integer representing memory size""" + ret_size = None + if type(size) is not int: + try: + # Try to convert the size to an integer + ret_size = int(size) + except ValueError: + # If size cannot be natively converted to an int, we try to derive it from the preprocessor + if preprocessor is not None: + if "," in size: + ret_size = 1 + for dimension in size.split(","): + ret_size *= find_size_in_preprocessor(dimension, preprocessor) + else: + ret_size = find_size_in_preprocessor(size, preprocessor) + # If size cannot be natively converted, nor retrieved from the preprocessor, we check user provided values + if dimensions is not None: + if size in dimensions.keys(): + try: + ret_size = int(dimensions[size]) + except ValueError: + # User error, no mitigation + return ret_size + elif "," in size: + ret_size = 1 + for dimension in size.split(","): + try: + ret_size *= int(dimensions[dimension]) + except ValueError: + # User error, no mitigation + return None + else: + # size is already an int. no need for conversion + ret_size = size + + return ret_size + + +def wrap_timing(code: str, lang: Language) -> str: + """Helper to wrap timing code around the provided code""" + if is_cxx(lang): + return end_timing_cxx(start_timing_cxx(code)) + elif is_fortran(lang): + return wrap_timing_fortran(code) + return "" + + +def start_timing_cxx(code: str) -> str: + """Wrap C++ timing code around the provided code""" + + start = "auto kt_timing_start = std::chrono::steady_clock::now();" + end = "auto kt_timing_end = std::chrono::steady_clock::now();" + timing = "std::chrono::duration elapsed_time = kt_timing_end - kt_timing_start;" - if cpp: + return "\n".join([start, code, end, timing]) + + +def wrap_timing_fortran(code: str) -> str: + """Wrap Fortran timing code around the provided code""" + + start = "call system_clock(kt_timing_start, kt_rate)" + end = "call system_clock(kt_timing_end)" + timing = "timing = (real(kt_timing_end - kt_timing_start) / real(kt_rate)) * 1e3" + + return "\n".join([start, code, end, timing]) + + +def end_timing_cxx(code: str) -> str: + """In C++ we need to return the measured time""" + return code + "\nreturn elapsed_time.count();\n" + + +def wrap_data(code: str, langs: Code, data: dict, preprocessor: list = None, user_dimensions: dict = None) -> str: + """Insert data directives before and after the timed code""" + intro = str() + outro = str() + for name in data.keys(): + if "*" in data[name][0]: + size = parse_size(data[name][1], preprocessor=preprocessor, dimensions=user_dimensions) + if is_openacc(langs.directive) and is_cxx(langs.language): + intro += create_data_directive_openacc_cxx(name, size) + outro += exit_data_directive_openacc_cxx(name, size) + elif is_openacc(langs.directive) and is_fortran(langs.language): + intro += create_data_directive_openacc_fortran(name, size) + outro += exit_data_directive_openacc_fortran(name, size) + return intro + code + outro + + +def extract_directive_code(code: str, langs: Code, kernel_name: str = None) -> dict: + """Extract explicitly marked directive sections from code""" + if is_cxx(langs.language): start_string = "#pragma tuner start" end_string = "#pragma tuner stop" - elif f90: + elif is_fortran(langs.language): start_string = "!$tuner start" end_string = "!$tuner stop" - return extract_code(start_string, end_string, code, kernel_name) + return extract_code(start_string, end_string, code, langs, kernel_name) -def extract_initialization_code(code: str) -> str: +def extract_initialization_code(code: str, langs: Code) -> str: """Extract the initialization section from code""" - cpp, f90 = cpp_or_f90(code) - - if cpp: + if is_cxx(langs.language): start_string = "#pragma tuner initialize" end_string = "#pragma tuner stop" - elif f90: + elif is_fortran(langs.language): start_string = "!$tuner initialize" end_string = "!$tuner stop" - init_code = extract_code(start_string, end_string, code) + init_code = extract_code(start_string, end_string, code, langs) if len(init_code) >= 1: - return "\n".join(init_code.values()) + return "\n".join(init_code.values()) + "\n" else: return "" -def extract_directive_signature(code: str, kernel_name: str = None) -> dict: +def format_argument_fortran(p_type: str, p_size: int, p_name: str) -> str: + """Format the argument for Fortran code""" + argument = "" + if "float*" in p_type: + argument = f"real (c_float), dimension({p_size}) :: {p_name}" + elif "double*" in p_type: + argument = f"real (c_double), dimension({p_size}) :: {p_name}" + elif "int*" in p_type: + argument = f"integer (c_int), dimension({p_size}) :: {p_name}" + elif "float" in p_type: + argument = f"real (c_float), value :: {p_name}" + elif "double" in p_type: + argument = f"real (c_double), value :: {p_name}" + elif "int" in p_type: + argument = f"integer (c_int), value :: {p_name}" + return argument + + +def extract_directive_signature(code: str, langs: Code, kernel_name: str = None) -> dict: """Extract the user defined signature for directive sections""" - cpp, f90 = cpp_or_f90(code) - if cpp: + if is_cxx(langs.language): start_string = "#pragma tuner start" - elif f90: + elif is_fortran(langs.language): start_string = "!$tuner start" signatures = dict() @@ -91,10 +357,10 @@ def extract_directive_signature(code: str, kernel_name: str = None) -> dict: if start_string in line: if kernel_name is None or correct_kernel(kernel_name, line): tmp_string = line.strip().split(" ") - if cpp: + if is_cxx(langs.language): name = tmp_string[3] tmp_string = tmp_string[4:] - elif f90: + elif is_fortran(langs.language): name = tmp_string[2] tmp_string = tmp_string[3:] params = list() @@ -107,16 +373,16 @@ def extract_directive_signature(code: str, kernel_name: str = None) -> dict: p_type = p_type.split(":")[0] if "*" in p_type: p_type = p_type.replace("*", " * restrict") - if cpp: + if is_cxx(langs.language): params.append(f"{p_type} {p_name}") - elif f90: + elif is_fortran(langs.language): params.append(p_name) - if cpp: + if is_cxx(langs.language): signatures[name] = f"float {name}({', '.join(params)})" - elif f90: + elif is_fortran(langs.language): signatures[ name - ] = f"function {name}({', '.join(params)}) result(timing)\nuse iso_c_binding\n" + ] = f"function {name}({', '.join(params)}) result(timing)\nuse iso_c_binding\nimplicit none\n" params = list() for param in tmp_string: if len(param) == 0: @@ -126,46 +392,31 @@ def extract_directive_signature(code: str, kernel_name: str = None) -> dict: p_type = param[1:-1] p_size = p_type.split(":")[1] p_type = p_type.split(":")[0] - if "float*" in p_type: - params.append( - f"real (c_float), dimension({p_size}) :: {p_name}" - ) - elif "double*" in p_type: - params.append( - f"real (c_double), dimension({p_size}) :: {p_name}" - ) - elif "int*" in p_type: - params.append( - f"integer (c_int), dimension({p_size}) :: {p_name}" - ) - elif "float" in p_type: - params.append(f"real (c_float), value :: {p_name}") - elif "double" in p_type: - params.append(f"real (c_double), value :: {p_name}") - elif "int" in p_type: - params.append(f"integer (c_int), value :: {p_name}") + params.append(format_argument_fortran(p_type, p_size, p_name)) signatures[name] += "\n".join(params) + "\n" + signatures[ + name + ] += "integer(c_int):: kt_timing_start\nreal(c_float):: kt_rate\ninteger(c_int):: kt_timing_end\nreal(c_float):: timing\n" return signatures -def extract_directive_data(code: str, kernel_name: str = None) -> dict: +def extract_directive_data(code: str, langs: Code, kernel_name: str = None) -> dict: """Extract the data used in the directive section""" - cpp, f90 = cpp_or_f90(code) - if cpp: + if is_cxx(langs.language): start_string = "#pragma tuner start" - elif f90: + elif is_fortran(langs.language): start_string = "!$tuner start" data = dict() for line in code.replace("\\\n", "").split("\n"): if start_string in line: if kernel_name is None or correct_kernel(kernel_name, line): - if cpp: + if is_cxx(langs.language): name = line.strip().split(" ")[3] tmp_string = line.strip().split(" ")[4:] - elif f90: + elif is_fortran(langs.language): name = line.strip().split(" ")[2] tmp_string = line.strip().split(" ")[3:] data[name] = dict() @@ -196,86 +447,139 @@ def extract_preprocessor(code: str) -> list: return preprocessor -def wrap_timing(code: str) -> str: - """Wrap timing code around the provided code""" - cpp, f90 = cpp_or_f90(code) - - if cpp: - start = "auto kt_timing_start = std::chrono::steady_clock::now();" - end = "auto kt_timing_end = std::chrono::steady_clock::now();" - timing = "std::chrono::duration elapsed_time = kt_timing_end - kt_timing_start;" - ret = "return elapsed_time.count();" - elif f90: - start = "integer (c_int) :: kt_timing_start\nreal (c_float) :: kt_rate\ninteger (c_int) :: kt_timing_end\nreal (c_float) :: timing\ncall system_clock(kt_timing_start, kt_rate)" - end = "call system_clock(kt_timing_end)" - timing = "timing = (real(kt_timing_end - kt_timing_start) / real(kt_rate)) * 1e3" - ret = "" - - return "\n".join([start, code, end, timing, ret]) - - def generate_directive_function( - preprocessor: str, signature: str, body: str, initialization: str = "" + preprocessor: list, + signature: str, + body: str, + langs: Code, + data: dict = None, + initialization: str = "", + user_dimensions: dict = None, ) -> str: """Generate tunable function for one directive""" - cpp, f90 = cpp_or_f90(body) - code = "\n".join(preprocessor) - if cpp and "#include " not in preprocessor: + code = "\n".join(preprocessor) + "\n" + if user_dimensions is not None: + # add user dimensions to preprocessor + for key, value in user_dimensions.items(): + code += f"#define {key} {value}\n" + if is_cxx(langs.language) and "#include " not in preprocessor: code += "\n#include \n" - if cpp: + if is_cxx(langs.language): code += 'extern "C" ' + signature + "{\n" - elif f90: + elif is_fortran(langs.language): code += "\nmodule kt\nuse iso_c_binding\ncontains\n" code += "\n" + signature if len(initialization) > 1: code += initialization + "\n" - code += wrap_timing(body) - if cpp: + if data is not None: + body = add_present_openacc(body, langs, data, preprocessor, user_dimensions) + if is_cxx(langs.language): + body = start_timing_cxx(body) + if data is not None: + code += wrap_data(body + "\n", langs, data, preprocessor, user_dimensions) + else: + code += body + code = end_timing_cxx(code) code += "\n}" - elif f90: + elif is_fortran(langs.language): + body = wrap_timing(body, langs.language) + if data is not None: + code += wrap_data(body + "\n", langs, data, preprocessor, user_dimensions) + else: + code += body name = signature.split(" ")[1].split("(")[0] code += f"\nend function {name}\nend module kt\n" return code -def allocate_signature_memory(data: dict, preprocessor: list = None) -> list: +def allocate_array(p_type: str, size: int) -> np.ndarray: + """Allocate a Numpy array""" + max_int = 1024 + array = None + if p_type == "float*": + array = np.random.rand(size).astype(np.float32) + elif p_type == "double*": + array = np.random.rand(size).astype(np.float64) + elif p_type == "int*": + array = np.random.randint(max_int, size=size) + else: + # The parameter is an array of user defined types + array = np.random.rand(size).astype(np.byte) + return array + + +def allocate_scalar(p_type: str, size: int) -> np.number: + """Allocate a Numpy scalar""" + scalar = None + if p_type == "float": + scalar = np.float32(size) + elif p_type == "double": + scalar = np.float64(size) + elif p_type == "int": + scalar = np.int32(size) + else: + # The parameter is some user defined type + scalar = np.byte(size) + return scalar + + +def allocate_signature_memory(data: dict, preprocessor: list = None, user_dimensions: dict = None) -> list: """Allocates the data needed by a kernel and returns the arguments array""" args = [] - max_int = 1024 for parameter in data.keys(): p_type = data[parameter][0] - size = data[parameter][1] - if type(size) is not int: - try: - # Try to convert the size to an integer - size = int(size) - except ValueError: - # If size cannot be natively converted to string, we try to derive it from the preprocessor - for line in preprocessor: - if f"#define {size}" in line: - try: - size = int(line.split(" ")[2]) - break - except ValueError: - continue + size = parse_size(data[parameter][1], preprocessor, user_dimensions) if "*" in p_type: - # The parameter is an array - if p_type == "float*": - args.append(np.random.rand(size).astype(np.float32)) - elif p_type == "double*": - args.append(np.random.rand(size).astype(np.float64)) - elif p_type == "int*": - args.append(np.random.randint(max_int, size=size)) + args.append(allocate_array(p_type, size)) else: - # The parameter is a scalar - if p_type == "float": - args.append(np.float32(size)) - elif p_type == "double": - args.append(np.float64(size)) - elif p_type == "int": - args.append(np.int32(size)) + args.append(allocate_scalar(p_type, size)) return args + + +def add_new_line(line: str) -> str: + """Adds the new line character to the end of the line if not present""" + if line.rfind("\n") != len(line) - 1: + return line + "\n" + return line + + +def add_present_openacc( + code: str, langs: Code, data: dict, preprocessor: list = None, user_dimensions: dict = None +) -> str: + """Add the present clause to OpenACC directive""" + new_body = "" + for line in code.replace("\\\n", "").split("\n"): + if not line_contains_openacc_parallel_directive(line, langs.language): + new_body += line + else: + # The line contains an OpenACC directive + if openacc_directive_contains_data_clause(line): + # The OpenACC directive manages memory, do not interfere + return code + else: + new_line = line.replace("\n", "") + present_clause = "" + for name in data.keys(): + if "*" in data[name][0]: + size = parse_size(data[name][1], preprocessor=preprocessor, dimensions=user_dimensions) + if is_cxx(langs.language): + present_clause += add_present_openacc_cxx(name, size) + elif is_fortran(langs.language): + present_clause += add_present_openacc_fortran(name, size) + new_body += new_line + present_clause.rstrip() + "\n" + new_body = add_new_line(new_body) + return new_body + + +def add_present_openacc_cxx(name: str, size: int) -> str: + """Create present clause for C++ OpenACC directive""" + return f" present({name}[:{size}]) " + + +def add_present_openacc_fortran(name: str, size: int) -> str: + """Create present clause for Fortran OpenACC directive""" + return f" present({name}(:{size})) " diff --git a/test/test_compiler_functions.py b/test/test_compiler_functions.py index 5060e2203..0c9d7f86a 100644 --- a/test/test_compiler_functions.py +++ b/test/test_compiler_functions.py @@ -354,7 +354,7 @@ def test_complies_fortran_function_no_module(): cfunc = CompilerFunctions(compiler="gfortran") func = cfunc.compile(kernel_instance) - result = cfunc.run_kernel(func, [], (), ()) + result = cfunc.run_kernel(func, [], (), (), None) assert np.isclose(result, 42.0) @@ -386,7 +386,7 @@ def test_complies_fortran_function_with_module(): cfunc = CompilerFunctions(compiler="gfortran") func = cfunc.compile(kernel_instance) - result = cfunc.run_kernel(func, [], (), ()) + result = cfunc.run_kernel(func, [], (), (), None) assert np.isclose(result, 42.0) diff --git a/test/test_utils_directives.py b/test/test_utils_directives.py deleted file mode 100644 index df8f36e14..000000000 --- a/test/test_utils_directives.py +++ /dev/null @@ -1,164 +0,0 @@ -from kernel_tuner.utils.directives import * - - -def test_extract_directive_code(): - code = """ - #include - - #define VECTOR_SIZE 65536 - - int main(void) { - int size = VECTOR_SIZE; - __restrict float * a = (float *) malloc(VECTOR_SIZE * sizeof(float)); - __restrict float * b = (float *) malloc(VECTOR_SIZE * sizeof(float)); - __restrict float * c = (float *) malloc(VECTOR_SIZE * sizeof(float)); - - #pragma tuner start initialize - #pragma acc parallel - #pragma acc loop - for ( int i = 0; i < size; i++ ) { - a[i] = i; - b[i] = i + 1; - } - #pragma tuner stop - - #pragma tuner start vector_add - #pragma acc parallel - #pragma acc loop - for ( int i = 0; i < size; i++ ) { - c[i] = a[i] + b[i]; - } - #pragma tuner stop - - free(a); - free(b); - free(c); - } - """ - expected_one = """ #pragma acc parallel - #pragma acc loop - for ( int i = 0; i < size; i++ ) { - a[i] = i; - b[i] = i + 1; - }""" - expected_two = """ #pragma acc parallel - #pragma acc loop - for ( int i = 0; i < size; i++ ) { - c[i] = a[i] + b[i]; - }""" - returns = extract_directive_code(code) - assert len(returns) == 2 - assert expected_one in returns["initialize"] - assert expected_two in returns["vector_add"] - assert expected_one not in returns["vector_add"] - returns = extract_directive_code(code, "vector") - assert len(returns) == 0 - - code = """ - !$tuner start vector_add - !$acc parallel loop num_gangs(ngangs) vector_length(vlength) - do i = 1, N - C(i) = A(i) + B(i) - end do - !$acc end parallel loop - !$tuner stop - """ - expected = """ !$acc parallel loop num_gangs(ngangs) vector_length(vlength) - do i = 1, N - C(i) = A(i) + B(i) - end do - !$acc end parallel loop""" - returns = extract_directive_code(code, "vector_add") - assert len(returns) == 1 - assert expected in returns["vector_add"] - - -def test_extract_preprocessor(): - code = """ - #include - - #define VECTOR_SIZE 65536 - - int main(void) { - int size = VECTOR_SIZE; - __restrict float * a = (float *) malloc(VECTOR_SIZE * sizeof(float)); - __restrict float * b = (float *) malloc(VECTOR_SIZE * sizeof(float)); - __restrict float * c = (float *) malloc(VECTOR_SIZE * sizeof(float)); - - #pragma tuner start - #pragma acc parallel - #pragma acc loop - for ( int i = 0; i < size; i++ ) { - a[i] = i; - b[i] = i + 1; - } - #pragma tuner stop - - #pragma tuner start - #pragma acc parallel - #pragma acc loop - for ( int i = 0; i < size; i++ ) { - c[i] = a[i] + b[i]; - } - #pragma tuner stop - - free(a); - free(b); - free(c); - } - """ - expected = [" #include ", " #define VECTOR_SIZE 65536"] - results = extract_preprocessor(code) - assert len(results) == 2 - for item in expected: - assert item in results - - -def test_wrap_timing(): - code = "#pragma acc\nfor ( int i = 0; i < size; i++ ) {\nc[i] = a[i] + b[i];\n}" - wrapped = wrap_timing(code) - assert ( - wrapped - == "auto kt_timing_start = std::chrono::steady_clock::now();\n#pragma acc\nfor ( int i = 0; i < size; i++ ) {\nc[i] = a[i] + b[i];\n}\nauto kt_timing_end = std::chrono::steady_clock::now();\nstd::chrono::duration elapsed_time = kt_timing_end - kt_timing_start;\nreturn elapsed_time.count();" - ) - - -def test_extract_directive_signature(): - code = "#pragma tuner start vector_add a(float*:VECTOR_SIZE) b(float*:VECTOR_SIZE) c(float*:VECTOR_SIZE) size(int:VECTOR_SIZE) \n#pragma acc" - signatures = extract_directive_signature(code) - assert len(signatures) == 1 - assert ( - "float vector_add(float * restrict a, float * restrict b, float * restrict c, int size)" - in signatures["vector_add"] - ) - signatures = extract_directive_signature(code, "vector_add") - assert len(signatures) == 1 - assert ( - "float vector_add(float * restrict a, float * restrict b, float * restrict c, int size)" - in signatures["vector_add"] - ) - signatures = extract_directive_signature(code, "vector_add_ext") - assert len(signatures) == 0 - code = "!$tuner start vector_add A(float*:VECTOR_SIZE) B(float*:VECTOR_SIZE) C(float*:VECTOR_SIZE) n(int:VECTOR_SIZE)\n!$acc" - signatures = extract_directive_signature(code) - assert len(signatures) == 1 - assert "function vector_add(A, B, C, n)" in signatures["vector_add"] - - -def test_extract_directive_data(): - code = "#pragma tuner start vector_add a(float*:VECTOR_SIZE) b(float*:VECTOR_SIZE) c(float*:VECTOR_SIZE) size(int:VECTOR_SIZE)\n#pragma acc" - data = extract_directive_data(code) - assert len(data) == 1 - assert len(data["vector_add"]) == 4 - assert "float*" in data["vector_add"]["b"] - assert "int" not in data["vector_add"]["c"] - assert "VECTOR_SIZE" in data["vector_add"]["size"] - data = extract_directive_data(code, "vector_add_double") - assert len(data) == 0 - code = "!$tuner start vector_add A(float*:VECTOR_SIZE) B(float*:VECTOR_SIZE) C(float*:VECTOR_SIZE) n(int:VECTOR_SIZE)\n!$acc" - data = extract_directive_data(code) - assert len(data) == 1 - assert len(data["vector_add"]) == 4 - assert "float*" in data["vector_add"]["B"] - assert "int" not in data["vector_add"]["C"] - assert "VECTOR_SIZE" in data["vector_add"]["n"] diff --git a/test/utils/__init__.py b/test/utils/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/test/utils/test_directives.py b/test/utils/test_directives.py new file mode 100644 index 000000000..7b98a8a27 --- /dev/null +++ b/test/utils/test_directives.py @@ -0,0 +1,327 @@ +from pytest import raises + +from kernel_tuner.utils.directives import * + + +def test_is_openacc(): + assert is_openacc(OpenACC()) + assert not is_openacc(None) + + +def test_is_cxx(): + assert is_cxx(Cxx()) + assert not is_cxx(Fortran()) + assert not is_cxx(None) + + +def test_is_fortran(): + assert is_fortran(Fortran()) + assert not is_fortran(Cxx()) + assert not is_fortran(None) + + +def test_line_contains_openacc_directive(): + cxx_code = "int main(void) {\n#pragma acc parallel}" + f90_code = "!$acc parallel" + assert line_contains_openacc_directive(cxx_code, Cxx()) + assert not line_contains_openacc_directive(f90_code, Cxx()) + assert line_contains_openacc_directive(f90_code, Fortran()) + assert not line_contains_openacc_directive(cxx_code, Fortran()) + assert not line_contains_openacc_directive(cxx_code, None) + + +def test_line_contains_openacc_parallel_directive(): + assert line_contains_openacc_parallel_directive("#pragma acc parallel wait", Cxx()) + assert line_contains_openacc_parallel_directive("!$acc parallel", Fortran()) + assert not line_contains_openacc_parallel_directive("#pragma acc loop", Cxx()) + assert not line_contains_openacc_parallel_directive("!$acc loop", Fortran()) + assert not line_contains_openacc_parallel_directive("!$acc parallel", None) + + +def test_openacc_directive_contains_data_clause(): + assert openacc_directive_contains_data_clause("#pragma acc parallel present(A[:1089])") + assert not openacc_directive_contains_data_clause("#pragma acc parallel for") + + +def test_create_data_directive(): + assert ( + create_data_directive_openacc("array", 1024, Cxx()) + == "#pragma acc enter data create(array[:1024])\n#pragma acc update device(array[:1024])\n" + ) + assert ( + create_data_directive_openacc("matrix", 35, Fortran()) + == "!$acc enter data create(matrix(:35))\n!$acc update device(matrix(:35))\n" + ) + assert create_data_directive_openacc("array", 1024, None) == "" + + +def test_exit_data_directive(): + assert exit_data_directive_openacc("array", 1024, Cxx()) == "#pragma acc exit data copyout(array[:1024])\n" + assert exit_data_directive_openacc("matrix", 35, Fortran()) == "!$acc exit data copyout(matrix(:35))\n" + assert exit_data_directive_openacc("matrix", 1024, None) == "" + + +def test_correct_kernel(): + assert correct_kernel("vector_add", "tuner start vector_add") + assert correct_kernel("vector_add", "tuner start vector_add a(float:size)") + assert not correct_kernel("vector_add", "tuner start gemm") + assert not correct_kernel("vector_add", "tuner start gemm a(float:size) b(float:size)") + + +def test_parse_size(): + assert parse_size(128) == 128 + assert parse_size("16") == 16 + assert parse_size("test") is None + assert parse_size("n", ["#define n 1024\n"]) == 1024 + assert parse_size("n,m", ["#define n 16\n", "#define m 32\n"]) == 512 + assert parse_size("n", ["#define size 512\n"], {"n": 32}) == 32 + assert parse_size("m", ["#define size 512\n"], {"n": 32}) is None + assert parse_size("rows,cols", dimensions={"rows": 16, "cols": 8}) == 128 + + +def test_wrap_timing(): + code = "#pragma acc\nfor ( int i = 0; i < size; i++ ) {\nc[i] = a[i] + b[i];\n}" + wrapped = wrap_timing(code, Cxx()) + assert ( + wrapped + == "auto kt_timing_start = std::chrono::steady_clock::now();\n#pragma acc\nfor ( int i = 0; i < size; i++ ) {\nc[i] = a[i] + b[i];\n}\nauto kt_timing_end = std::chrono::steady_clock::now();\nstd::chrono::duration elapsed_time = kt_timing_end - kt_timing_start;\nreturn elapsed_time.count();\n" + ) + + +def test_wrap_data(): + acc_cxx = Code(OpenACC(), Cxx()) + acc_f90 = Code(OpenACC(), Fortran()) + code_cxx = "// this is a comment\n" + code_f90 = "! this is a comment\n" + data = {"array": ["int*", "size"]} + preprocessor = ["#define size 42"] + expected_cxx = "#pragma acc enter data create(array[:42])\n#pragma acc update device(array[:42])\n// this is a comment\n#pragma acc exit data copyout(array[:42])\n" + assert wrap_data(code_cxx, acc_cxx, data, preprocessor, None) == expected_cxx + expected_f90 = "!$acc enter data create(array(:42))\n!$acc update device(array(:42))\n! this is a comment\n!$acc exit data copyout(array(:42))\n" + assert wrap_data(code_f90, acc_f90, data, preprocessor, None) == expected_f90 + + +def test_extract_directive_code(): + code = """ + #include + + #define VECTOR_SIZE 65536 + + int main(void) { + int size = VECTOR_SIZE; + __restrict float * a = (float *) malloc(VECTOR_SIZE * sizeof(float)); + __restrict float * b = (float *) malloc(VECTOR_SIZE * sizeof(float)); + __restrict float * c = (float *) malloc(VECTOR_SIZE * sizeof(float)); + + #pragma tuner start initialize + #pragma acc parallel + #pragma acc loop + for ( int i = 0; i < size; i++ ) { + a[i] = i; + b[i] = i + 1; + } + #pragma tuner stop + + #pragma tuner start vector_add + #pragma acc parallel + #pragma acc loop + for ( int i = 0; i < size; i++ ) { + c[i] = a[i] + b[i]; + } + #pragma tuner stop + + free(a); + free(b); + free(c); + } + """ + expected_one = """ #pragma acc parallel + #pragma acc loop + for ( int i = 0; i < size; i++ ) { + a[i] = i; + b[i] = i + 1; + }""" + expected_two = """ #pragma acc parallel + #pragma acc loop + for ( int i = 0; i < size; i++ ) { + c[i] = a[i] + b[i]; + }""" + acc_cxx = Code(OpenACC(), Cxx()) + returns = extract_directive_code(code, acc_cxx) + assert len(returns) == 2 + assert expected_one in returns["initialize"] + assert expected_two in returns["vector_add"] + assert expected_one not in returns["vector_add"] + returns = extract_directive_code(code, acc_cxx, "vector") + assert len(returns) == 0 + + code = """ + !$tuner start vector_add + !$acc parallel loop num_gangs(ngangs) vector_length(vlength) + do i = 1, N + C(i) = A(i) + B(i) + end do + !$acc end parallel loop + !$tuner stop + """ + expected = """ !$acc parallel loop num_gangs(ngangs) vector_length(vlength) + do i = 1, N + C(i) = A(i) + B(i) + end do + !$acc end parallel loop""" + returns = extract_directive_code(code, Code(OpenACC(), Fortran()), "vector_add") + assert len(returns) == 1 + assert expected in returns["vector_add"] + + +def test_extract_preprocessor(): + code = """ + #include + + #define VECTOR_SIZE 65536 + + int main(void) { + int size = VECTOR_SIZE; + __restrict float * a = (float *) malloc(VECTOR_SIZE * sizeof(float)); + __restrict float * b = (float *) malloc(VECTOR_SIZE * sizeof(float)); + __restrict float * c = (float *) malloc(VECTOR_SIZE * sizeof(float)); + + #pragma tuner start + #pragma acc parallel + #pragma acc loop + for ( int i = 0; i < size; i++ ) { + a[i] = i; + b[i] = i + 1; + } + #pragma tuner stop + + #pragma tuner start + #pragma acc parallel + #pragma acc loop + for ( int i = 0; i < size; i++ ) { + c[i] = a[i] + b[i]; + } + #pragma tuner stop + + free(a); + free(b); + free(c); + } + """ + expected = [" #include ", " #define VECTOR_SIZE 65536"] + results = extract_preprocessor(code) + assert len(results) == 2 + for item in expected: + assert item in results + + +def test_extract_directive_signature(): + acc_cxx = Code(OpenACC(), Cxx()) + code = "#pragma tuner start vector_add a(float*:VECTOR_SIZE) b(float*:VECTOR_SIZE) c(float*:VECTOR_SIZE) size(int:VECTOR_SIZE) \n#pragma acc" + signatures = extract_directive_signature(code, acc_cxx) + assert len(signatures) == 1 + assert ( + "float vector_add(float * restrict a, float * restrict b, float * restrict c, int size)" + in signatures["vector_add"] + ) + signatures = extract_directive_signature(code, acc_cxx, "vector_add") + assert len(signatures) == 1 + assert ( + "float vector_add(float * restrict a, float * restrict b, float * restrict c, int size)" + in signatures["vector_add"] + ) + signatures = extract_directive_signature(code, acc_cxx, "vector_add_ext") + assert len(signatures) == 0 + code = "!$tuner start vector_add A(float*:VECTOR_SIZE) B(float*:VECTOR_SIZE) C(float*:VECTOR_SIZE) n(int:VECTOR_SIZE)\n!$acc" + signatures = extract_directive_signature(code, Code(OpenACC(), Fortran())) + assert len(signatures) == 1 + assert "function vector_add(A, B, C, n)" in signatures["vector_add"] + + +def test_extract_directive_data(): + acc_cxx = Code(OpenACC(), Cxx()) + code = "#pragma tuner start vector_add a(float*:VECTOR_SIZE) b(float*:VECTOR_SIZE) c(float*:VECTOR_SIZE) size(int:VECTOR_SIZE)\n#pragma acc" + data = extract_directive_data(code, acc_cxx) + assert len(data) == 1 + assert len(data["vector_add"]) == 4 + assert "float*" in data["vector_add"]["b"] + assert "int" not in data["vector_add"]["c"] + assert "VECTOR_SIZE" in data["vector_add"]["size"] + data = extract_directive_data(code, acc_cxx, "vector_add_double") + assert len(data) == 0 + acc_f90 = Code(OpenACC(), Fortran()) + code = "!$tuner start vector_add A(float*:VECTOR_SIZE) B(float*:VECTOR_SIZE) C(float*:VECTOR_SIZE) n(int:VECTOR_SIZE)\n!$acc" + data = extract_directive_data(code, acc_f90) + assert len(data) == 1 + assert len(data["vector_add"]) == 4 + assert "float*" in data["vector_add"]["B"] + assert "int" not in data["vector_add"]["C"] + assert "VECTOR_SIZE" in data["vector_add"]["n"] + code = ( + "!$tuner start matrix_add A(float*:N_ROWS,N_COLS) B(float*:N_ROWS,N_COLS) nr(int:N_ROWS) nc(int:N_COLS)\n!$acc" + ) + data = extract_directive_data(code, acc_f90) + assert len(data) == 1 + assert len(data["matrix_add"]) == 4 + assert "float*" in data["matrix_add"]["A"] + assert "N_ROWS,N_COLS" in data["matrix_add"]["B"] + + +def test_allocate_signature_memory(): + code = "#pragma tuner start vector_add a(float*:VECTOR_SIZE) b(float*:VECTOR_SIZE) c(float*:VECTOR_SIZE) size(int:VECTOR_SIZE)\n#pragma acc" + data = extract_directive_data(code, Code(OpenACC(), Cxx())) + with raises(TypeError): + _ = allocate_signature_memory(data["vector_add"]) + preprocessor = ["#define VECTOR_SIZE 1024\n"] + args = allocate_signature_memory(data["vector_add"], preprocessor) + assert type(args[0]) is np.ndarray + assert type(args[1]) is not np.float64 + assert args[2].dtype == "float32" + assert type(args[3]) is np.int32 + assert args[3] == 1024 + user_values = dict() + user_values["VECTOR_SIZE"] = 1024 + args = allocate_signature_memory(data["vector_add"], user_dimensions=user_values) + assert type(args[0]) is np.ndarray + assert type(args[1]) is not np.float64 + assert args[2].dtype == "float32" + assert type(args[3]) is np.int32 + code = ( + "!$tuner start matrix_add A(float*:N_ROWS,N_COLS) B(float*:N_ROWS,N_COLS) nr(int:N_ROWS) nc(int:N_COLS)\n!$acc" + ) + data = extract_directive_data(code, Code(OpenACC(), Fortran())) + preprocessor = ["#define N_ROWS 128\n", "#define N_COLS 512\n"] + args = allocate_signature_memory(data["matrix_add"], preprocessor) + assert args[2] == 128 + assert len(args[0]) == (128 * 512) + user_values = dict() + user_values["N_ROWS"] = 32 + user_values["N_COLS"] = 16 + args = allocate_signature_memory(data["matrix_add"], user_dimensions=user_values) + assert args[3] == 16 + assert len(args[1]) == 512 + + +def test_extract_initialization_code(): + code_cpp = "#pragma tuner initialize\nconst int value = 42;\n#pragma tuner stop\n" + code_f90 = "!$tuner initialize\ninteger :: value\n!$tuner stop\n" + assert extract_initialization_code(code_cpp, Code(OpenACC(), Cxx())) == "const int value = 42;\n" + assert extract_initialization_code(code_f90, Code(OpenACC(), Fortran())) == "integer :: value\n" + + +def test_add_present_openacc(): + acc_cxx = Code(OpenACC(), Cxx()) + acc_f90 = Code(OpenACC(), Fortran()) + code_cxx = "#pragma acc parallel num_gangs(32)\n#pragma acc\n" + code_f90 = "!$acc parallel async num_workers(16)\n" + data = {"array": ["int*", "size"]} + preprocessor = ["#define size 42"] + expected_cxx = "#pragma acc parallel num_gangs(32) present(array[:42])\n#pragma acc\n" + assert add_present_openacc(code_cxx, acc_cxx, data, preprocessor, None) == expected_cxx + expected_f90 = "!$acc parallel async num_workers(16) present(array(:42))\n" + assert add_present_openacc(code_f90, acc_f90, data, preprocessor, None) == expected_f90 + code_f90 = "!$acc parallel async num_workers(16) copy(array(:42))\n" + assert add_present_openacc(code_f90, acc_f90, data, preprocessor, None) == code_f90 + code_cxx = "#pragma acc parallel num_gangs(32)\n\t#pragma acc loop\n\t//for loop\n" + expected_cxx = "#pragma acc parallel num_gangs(32) present(array[:42])\n\t#pragma acc loop\n\t//for loop\n" + assert add_present_openacc(code_cxx, acc_cxx, data, preprocessor, None) == expected_cxx