diff --git a/docs/developer_guides/caching.rst b/docs/developer_guides/caching.rst new file mode 100644 index 0000000000..49186ca472 --- /dev/null +++ b/docs/developer_guides/caching.rst @@ -0,0 +1,24 @@ +.. _caching: + +Caching Mechanism in Numba-dpex +================================ + +Caching is done by saving the compiled kernel code, the ELF object of the executable code. By using the kernel code, cached kernels have minimal overhead because no compilation is needed. + +Unlike Numba, we do not perform file-based caching, instead we use an Least Recently Used (LRU) caching mechanism. However when a kernel needs to be evicted, we utilize numba's file-based caching mechanism described `here `_. + +Algorithm +========== + +The caching mechanism for Numba-dpex works as follows: The cache is an LRU cache backed by an ordered dictionary mapped onto a doubly linked list. The tail of the list contains the most recently used (MRU) kernel and the head of the list contains the least recently used (LRU) kernel. The list has a fixed size. If a new kernel arrives to be cached and if the size is already on the maximum limit, the algorithm evicts the LRU kernel to make room for the MRU kernel. The evicted item will be serialized and pickled into a file using Numba's caching mechanism. + +Everytime whenever a kernel needs to be retrieved from the cache, the mechanism will look for the kernel in the cache and will be loaded if it's already present. However, if the program is seeking for a kernel that has been evicted, the algorithm will load it from the file and enqueue in the cache. + +Settings +======== + +Therefore, we employ similar environment variables as used in Numba, i.e. ``NUMBA_CACHE_DIR`` etc. However we add three more environment variables to control the caching mechanism. + +- In order to specify cache capacity, one can use ``NUMBA_DPEX_CACHE_SIZE``. By default, it's set to 10. +- ``NUMBA_DPEX_ENABLE_CACHE`` can be used to enable/disable the caching mechanism. By default it's enabled, i.e. set to 1. +- In order to enable the debugging messages related to caching, one can set ``NUMBA_DPEX_DEBUG_CACHE`` to 1. All environment variables are defined in :file:`numba_dpex/config.py`. diff --git a/numba_dpex/codegen.py b/numba_dpex/codegen.py index b54d46741c..9f45b444d5 100644 --- a/numba_dpex/codegen.py +++ b/numba_dpex/codegen.py @@ -66,6 +66,9 @@ def _init(self, llvm_module): assert list(llvm_module.global_variables) == [], "Module isn't empty" self._data_layout = SPIR_DATA_LAYOUT[utils.MACHINE_BITS] self._target_data = ll.create_target_data(self._data_layout) + self._tm_features = ( + "" # We need this for chaching, not sure about this value for now + ) def _create_empty_module(self, name): ir_module = lc.Module(name) diff --git a/numba_dpex/compiler.py b/numba_dpex/compiler.py deleted file mode 100644 index 09872b9870..0000000000 --- a/numba_dpex/compiler.py +++ /dev/null @@ -1,833 +0,0 @@ -# SPDX-FileCopyrightText: 2020 - 2022 Intel Corporation -# -# SPDX-License-Identifier: Apache-2.0 - -import copy -import ctypes -import warnings -from inspect import signature -from types import FunctionType - -import dpctl -import dpctl.program as dpctl_prog -import dpctl.utils -import numpy as np -from numba.core import compiler, ir, types -from numba.core.compiler import CompilerBase, DefaultPassBuilder -from numba.core.compiler_lock import global_compiler_lock -from numba.core.typing.templates import AbstractTemplate, ConcreteTemplate - -from numba_dpex import config -from numba_dpex.core.exceptions import KernelHasReturnValueError -from numba_dpex.core.types import Array, USMNdArray -from numba_dpex.dpctl_support import dpctl_version -from numba_dpex.parfor_diagnostics import ExtendedParforDiagnostics -from numba_dpex.utils import ( - IndeterminateExecutionQueueError, - as_usm_obj, - cfd_ctx_mgr_wrng_msg, - copy_from_numpy_to_usm_obj, - copy_to_numpy_from_usm_obj, - get_info_from_suai, - has_usm_memory, - mix_datatype_err_msg, -) - -from . import spirv_generator -from .passbuilder import PassBuilder - -_RO_KERNEL_ARG = "read_only" -_WO_KERNEL_ARG = "write_only" -_RW_KERNEL_ARG = "read_write" - - -def _raise_datatype_mixed_error(argtypes): - error_message = mix_datatype_err_msg + ("%s" % str(argtypes)) - raise TypeError(error_message) - - -def _raise_no_device_found_error(): - error_message = ( - "No SYCL device specified. " - "Usage : jit_fn[device, globalsize, localsize](...)" - ) - raise ValueError(error_message) - - -def _raise_invalid_kernel_enqueue_args(): - error_message = ( - "Incorrect number of arguments for enqueuing numba_dpex.kernel. " - "Usage: device_env, global size, local size. " - "The local size argument is optional." - ) - raise ValueError(error_message) - - -def get_ordered_arg_access_types(pyfunc, access_types): - # Construct a list of access type of each arg according to their position - ordered_arg_access_types = [] - sig = signature(pyfunc, follow_wrapped=False) - for idx, arg_name in enumerate(sig.parameters): - if access_types: - for key in access_types: - if arg_name in access_types[key]: - ordered_arg_access_types.append(key) - if len(ordered_arg_access_types) <= idx: - ordered_arg_access_types.append(None) - - return ordered_arg_access_types - - -class Compiler(CompilerBase): - """The DPEX compiler pipeline.""" - - def define_pipelines(self): - # this maintains the objmode fallback behaviour - pms = [] - self.state.parfor_diagnostics = ExtendedParforDiagnostics() - self.state.metadata[ - "parfor_diagnostics" - ] = self.state.parfor_diagnostics - if not self.state.flags.force_pyobject: - pms.append(PassBuilder.define_nopython_pipeline(self.state)) - if self.state.status.can_fallback or self.state.flags.force_pyobject: - pms.append( - DefaultPassBuilder.define_objectmode_pipeline(self.state) - ) - return pms - - -@global_compiler_lock -def compile_with_depx(pyfunc, return_type, args, is_kernel, debug=None): - """ - Compiles the function using the dpex compiler pipeline and returns the - compiled result. - - Args: - pyfunc: The Python function to be compiled. - return_type: The Numba type of the return value. - args: The list of arguments sent to the Python function. - is_kernel (bool): Indicates whether the function is decorated - with @numba_depx.kernel or not. - debug (bool): Flag to turn debug mode ON/OFF. - - Returns: - cres: Compiled result. - - Raises: - TypeError: @numba_depx.kernel does not allow users to return any - value. TypeError is raised when users do. - - """ - # First compilation will trigger the initialization of the backend. - from .core.descriptor import dpex_target - - typingctx = dpex_target.typing_context - targetctx = dpex_target.target_context - - flags = compiler.Flags() - # Do not compile (generate native code), just lower (to LLVM) - flags.debuginfo = config.DEBUGINFO_DEFAULT - flags.no_compile = True - flags.no_cpython_wrapper = True - flags.nrt = False - - if debug is not None: - flags.debuginfo = debug - - # Run compilation pipeline - if isinstance(pyfunc, FunctionType): - cres = compiler.compile_extra( - typingctx=typingctx, - targetctx=targetctx, - func=pyfunc, - args=args, - return_type=return_type, - flags=flags, - locals={}, - pipeline_class=Compiler, - ) - elif isinstance(pyfunc, ir.FunctionIR): - cres = compiler.compile_ir( - typingctx=typingctx, - targetctx=targetctx, - func_ir=pyfunc, - args=args, - return_type=return_type, - flags=flags, - locals={}, - pipeline_class=Compiler, - ) - else: - assert 0 - - if ( - is_kernel - and cres.signature.return_type is not None - and cres.signature.return_type != types.void - ): - raise KernelHasReturnValueError( - kernel_name=pyfunc.__name__, return_type=cres.signature.return_type - ) - - # Linking depending libraries - library = cres.library - library.finalize() - - return cres - - -def compile_kernel(sycl_queue, pyfunc, args, access_types, debug=None): - # For any array we only accept numba_dpex.types.Array - for arg in args: - if isinstance(arg, types.npytypes.Array) and not ( - isinstance(arg, Array) or isinstance(arg, USMNdArray) - ): - raise TypeError( - "Only numba_dpex.core.types.USMNdArray " - + "objects are supported as kernel arguments. " - + "Received %s" % (type(arg)) - ) - - if config.DEBUG: - print("compile_kernel", args) - debug = True - if not sycl_queue: - # We expect the sycl_queue to be provided when this function is called - raise ValueError("SYCL queue is required for compiling a kernel") - - cres = compile_with_depx( - pyfunc=pyfunc, return_type=None, args=args, is_kernel=True, debug=debug - ) - func = cres.library.get_function(cres.fndesc.llvm_func_name) - kernel = cres.target_context.prepare_ocl_kernel(func, cres.signature.args) - - # A reference to the target context is stored in the Kernel to - # reference the context later in code generation. For example, we link - # the kernel object with a spir_func defining atomic operations only - # when atomic operations are used in the kernel. - oclkern = Kernel( - context=cres.target_context, - sycl_queue=sycl_queue, - llvm_module=kernel.module, - name=kernel.name, - argtypes=cres.signature.args, - ordered_arg_access_types=access_types, - ) - return oclkern - - -def compile_kernel_parfor( - sycl_queue, func_ir, args, args_with_addrspaces, debug=None -): - # We only accept numba_dpex.core.types.Array type - for arg in args_with_addrspaces: - if isinstance(arg, types.npytypes.Array) and not isinstance(arg, Array): - raise TypeError( - "Only numba_dpex.core.types.Array objects are supported as " - + "kernel arguments. Received %s" % (type(arg)) - ) - if config.DEBUG: - print("compile_kernel_parfor", args) - for a in args_with_addrspaces: - print(a, type(a)) - if isinstance(a, types.npytypes.Array): - print("addrspace:", a.addrspace) - - cres = compile_with_depx( - pyfunc=func_ir, - return_type=None, - args=args_with_addrspaces, - is_kernel=True, - debug=debug, - ) - func = cres.library.get_function(cres.fndesc.llvm_func_name) - - if config.DEBUG: - print("compile_kernel_parfor signature", cres.signature.args) - for a in cres.signature.args: - print(a, type(a)) - - kernel = cres.target_context.prepare_ocl_kernel(func, cres.signature.args) - oclkern = Kernel( - context=cres.target_context, - sycl_queue=sycl_queue, - llvm_module=kernel.module, - name=kernel.name, - argtypes=args_with_addrspaces, - ) - - return oclkern - - -def compile_func(pyfunc, return_type, args, debug=None): - cres = compile_with_depx( - pyfunc=pyfunc, - return_type=return_type, - args=args, - is_kernel=False, - debug=debug, - ) - func = cres.library.get_function(cres.fndesc.llvm_func_name) - cres.target_context.mark_ocl_device(func) - devfn = DpexFunction(cres) - - class _function_template(ConcreteTemplate): - key = devfn - cases = [cres.signature] - - cres.typing_context.insert_user_function(devfn, _function_template) - libs = [cres.library] - cres.target_context.insert_user_function(devfn, cres.fndesc, libs) - return devfn - - -def compile_func_template(pyfunc, debug=None): - """Compile a DpexFunctionTemplate""" - from .core.descriptor import dpex_target - - dft = DpexFunctionTemplate(pyfunc, debug=debug) - - class _function_template(AbstractTemplate): - key = dft - - def generic(self, args, kws): - assert not kws - return dft.compile(args) - - typingctx = dpex_target.typing_context - typingctx.insert_user_function(dft, _function_template) - return dft - - -class DpexFunctionTemplate(object): - """Unmaterialized dpex function""" - - def __init__(self, pyfunc, debug=None): - self.py_func = pyfunc - self.debug = debug - # self.inline = inline - self._compileinfos = {} - - def compile(self, args): - """Compile the function for the given argument types. - - Each signature is compiled once by caching the compiled function inside - this object. - """ - if args not in self._compileinfos: - cres = compile_with_depx( - pyfunc=self.py_func, - return_type=None, - args=args, - is_kernel=False, - debug=self.debug, - ) - func = cres.library.get_function(cres.fndesc.llvm_func_name) - cres.target_context.mark_ocl_device(func) - first_definition = not self._compileinfos - self._compileinfos[args] = cres - libs = [cres.library] - - if first_definition: - # First definition - cres.target_context.insert_user_function( - self, cres.fndesc, libs - ) - else: - cres.target_context.add_user_function(self, cres.fndesc, libs) - - else: - cres = self._compileinfos[args] - - return cres.signature - - -class DpexFunction(object): - def __init__(self, cres): - self.cres = cres - - -def _ensure_valid_work_item_grid(val, sycl_queue): - - if not isinstance(val, (tuple, list, int)): - error_message = ( - "Cannot create work item dimension from provided argument" - ) - raise ValueError(error_message) - - if isinstance(val, int): - val = [val] - - # TODO: we need some way to check the max dimensions - """ - if len(val) > device_env.get_max_work_item_dims(): - error_message = ("Unsupported number of work item dimensions ") - raise ValueError(error_message) - """ - - return list( - val[::-1] - ) # reversing due to sycl and opencl interop kernel range mismatch semantic - - -def _ensure_valid_work_group_size(val, work_item_grid): - - if not isinstance(val, (tuple, list, int)): - error_message = ( - "Cannot create work item dimension from provided argument" - ) - raise ValueError(error_message) - - if isinstance(val, int): - val = [val] - - if len(val) != len(work_item_grid): - error_message = ( - "Unsupported number of work item dimensions, " - + "dimensions of global and local work items has to be the same " - ) - raise ValueError(error_message) - - return list( - val[::-1] - ) # reversing due to sycl and opencl interop kernel range mismatch semantic - - -class KernelBase(object): - """Define interface for configurable kernels""" - - def __init__(self): - self.global_size = [] - self.local_size = [] - self.sycl_queue = None - - # list of supported access types, stored in dict for fast lookup - self.valid_access_types = { - _RO_KERNEL_ARG: _RO_KERNEL_ARG, - _WO_KERNEL_ARG: _WO_KERNEL_ARG, - _RW_KERNEL_ARG: _RW_KERNEL_ARG, - } - - def copy(self): - return copy.copy(self) - - def configure(self, sycl_queue, global_size, local_size=None): - """Configure the OpenCL kernel. The local_size can be None""" - clone = self.copy() - clone.global_size = global_size - clone.local_size = local_size - clone.sycl_queue = sycl_queue - - return clone - - def __getitem__(self, args): - """Mimick CUDA python's square-bracket notation for configuration. - This assumes the argument to be: - `global size, local size` - """ - ls = None - nargs = len(args) - # Check if the kernel enquing arguments are sane - if nargs < 1 or nargs > 2: - _raise_invalid_kernel_enqueue_args - - sycl_queue = dpctl.get_current_queue() - - gs = _ensure_valid_work_item_grid(args[0], sycl_queue) - # If the optional local size argument is provided - if nargs == 2 and args[1] != []: - ls = _ensure_valid_work_group_size(args[1], gs) - - return self.configure(sycl_queue, gs, ls) - - -class Kernel(KernelBase): - """ - A OCL kernel object - """ - - def __init__( - self, - context, - sycl_queue, - llvm_module, - name, - argtypes, - ordered_arg_access_types=None, - ): - super(Kernel, self).__init__() - self._llvm_module = llvm_module - self.assembly = self.binary = llvm_module.__str__() - self.entry_name = name - self.argument_types = tuple(argtypes) - self.ordered_arg_access_types = ordered_arg_access_types - self._argloc = [] - self.sycl_queue = sycl_queue - self.context = context - - dpctl_create_program_from_spirv_flags = [] - # First-time compilation using SPIRV-Tools - if config.DEBUG: - with open("llvm_kernel.ll", "w") as f: - f.write(self.binary) - - if config.DEBUG or config.OPT == 0: - # if debug is ON we need to pass additional - # flags to igc. - dpctl_create_program_from_spirv_flags = ["-g", "-cl-opt-disable"] - - self.spirv_bc = spirv_generator.llvm_to_spirv( - self.context, self.assembly, self._llvm_module.as_bitcode() - ) - - # create a program - self.program = dpctl_prog.create_program_from_spirv( - self.sycl_queue, - self.spirv_bc, - " ".join(dpctl_create_program_from_spirv_flags), - ) - # create a kernel - self.kernel = self.program.get_sycl_kernel(self.entry_name) - - def __call__(self, *args): - """ - Create a list of the kernel arguments by unpacking pyobject values - into ctypes values. - """ - - kernelargs = [] - internal_device_arrs = [] - for ty, val, access_type in zip( - self.argument_types, args, self.ordered_arg_access_types - ): - self._unpack_argument( - ty, - val, - self.sycl_queue, - kernelargs, - internal_device_arrs, - access_type, - ) - - self.sycl_queue.submit( - self.kernel, kernelargs, self.global_size, self.local_size - ) - self.sycl_queue.wait() - - for ty, val, i_dev_arr, access_type in zip( - self.argument_types, - args, - internal_device_arrs, - self.ordered_arg_access_types, - ): - self._pack_argument( - ty, val, self.sycl_queue, i_dev_arr, access_type - ) - - def _pack_argument(self, ty, val, sycl_queue, device_arr, access_type): - """ - Copy device data back to host - """ - if device_arr and ( - access_type not in self.valid_access_types - or access_type in self.valid_access_types - and self.valid_access_types[access_type] != _RO_KERNEL_ARG - ): - # We copy the data back from usm allocated data - # container to original data container. - usm_mem, orig_ndarr, packed_ndarr, packed = device_arr - copy_to_numpy_from_usm_obj(usm_mem, packed_ndarr) - if packed: - np.copyto(orig_ndarr, packed_ndarr) - - def _unpack_device_array_argument( - self, size, itemsize, buf, shape, strides, ndim, kernelargs - ): - """ - Implements the unpacking logic for array arguments. - - Args: - size: Total number of elements in the array. - itemsize: Size in bytes of each element in the array. - buf: The pointer to the memory. - shape: The shape of the array. - ndim: Number of dimension. - kernelargs: Array where the arguments of the kernel is stored. - """ - # meminfo - kernelargs.append(ctypes.c_size_t(0)) - # parent - kernelargs.append(ctypes.c_size_t(0)) - kernelargs.append(ctypes.c_longlong(size)) - kernelargs.append(ctypes.c_longlong(itemsize)) - kernelargs.append(buf) - for ax in range(ndim): - kernelargs.append(ctypes.c_longlong(shape[ax])) - for ax in range(ndim): - kernelargs.append(ctypes.c_longlong(strides[ax])) - - def _unpack_USMNdArray(self, val, kernelargs): - ( - usm_mem, - total_size, - shape, - ndim, - itemsize, - strides, - dtype, - ) = get_info_from_suai(val) - - self._unpack_device_array_argument( - total_size, - itemsize, - usm_mem, - shape, - strides, - ndim, - kernelargs, - ) - - def _unpack_Array( - self, val, sycl_queue, kernelargs, device_arrs, access_type - ): - packed_val = val - usm_mem = has_usm_memory(val) - if usm_mem is None: - default_behavior = self.check_for_invalid_access_type(access_type) - usm_mem = as_usm_obj(val, queue=sycl_queue, copy=False) - - orig_val = val - packed = False - if not val.flags.c_contiguous: - # If the numpy.ndarray is not C-contiguous - # we pack the strided array into a packed array. - # This allows us to treat the data from here on as C-contiguous. - # While packing we treat the data as C-contiguous. - # We store the reference of both (strided and packed) - # array and during unpacking we use numpy.copyto() to copy - # the data back from the packed temporary array to the - # original strided array. - packed_val = val.flatten(order="C") - packed = True - - if ( - default_behavior - or self.valid_access_types[access_type] == _RO_KERNEL_ARG - or self.valid_access_types[access_type] == _RW_KERNEL_ARG - ): - copy_from_numpy_to_usm_obj(usm_mem, packed_val) - - device_arrs[-1] = (usm_mem, orig_val, packed_val, packed) - - self._unpack_device_array_argument( - packed_val.size, - packed_val.dtype.itemsize, - usm_mem, - packed_val.shape, - packed_val.strides, - packed_val.ndim, - kernelargs, - ) - - def _unpack_argument( - self, ty, val, sycl_queue, kernelargs, device_arrs, access_type - ): - """ - Unpacks the arguments that are to be passed to the SYCL kernel from - Numba types to Ctypes. - - Args: - ty: The data types of the kernel argument defined as in instance of - numba.types. - val: The value of the kernel argument. - sycl_queue (dpctl.SyclQueue): A ``dpctl.SyclQueue`` object. The - queue object will be used whenever USM memory allocation is - needed during unpacking of an numpy.ndarray argument. - kernelargs (list): The list of kernel arguments into which the - current kernel argument will be appended. - device_arrs (list): A list of tuples that is used to store the - triples corresponding to the USM memorry allocated for an - ``numpy.ndarray`` argument, a wrapper ``ndarray`` created from - the USM memory, and the original ``ndarray`` argument. - access_type : The type of access for an array argument. - - Raises: - NotImplementedError: If the type of argument is not yet supported, - then a ``NotImplementedError`` is raised. - - """ - - device_arrs.append(None) - - if isinstance(ty, USMNdArray): - self._unpack_USMNdArray(val, kernelargs) - elif isinstance(ty, types.Array): - self._unpack_Array( - val, sycl_queue, kernelargs, device_arrs, access_type - ) - elif ty == types.int64: - cval = ctypes.c_longlong(val) - kernelargs.append(cval) - elif ty == types.uint64: - cval = ctypes.c_ulonglong(val) - kernelargs.append(cval) - elif ty == types.int32: - cval = ctypes.c_int(val) - kernelargs.append(cval) - elif ty == types.uint32: - cval = ctypes.c_uint(val) - kernelargs.append(cval) - elif ty == types.float64: - cval = ctypes.c_double(val) - kernelargs.append(cval) - elif ty == types.float32: - cval = ctypes.c_float(val) - kernelargs.append(cval) - elif ty == types.boolean: - cval = ctypes.c_uint8(int(val)) - kernelargs.append(cval) - elif ty == types.complex64: - raise NotImplementedError(ty, val) - elif ty == types.complex128: - raise NotImplementedError(ty, val) - else: - raise NotImplementedError(ty, val) - - def check_for_invalid_access_type(self, access_type): - if access_type not in self.valid_access_types: - msg = ( - "[!] %s is not a valid access type. " - "Supported access types are [" % (access_type) - ) - for key in self.valid_access_types: - msg += " %s |" % (key) - - msg = msg[:-1] + "]" - if access_type is not None: - print(msg) - return True - else: - return False - - -class JitKernel(KernelBase): - def __init__(self, func, debug, access_types): - - super(JitKernel, self).__init__() - - self.py_func = func - self.definitions = {} - self.debug = debug - self.access_types = access_types - - from .core.descriptor import dpex_target - - self.typingctx = dpex_target.typing_context - - def _get_argtypes(self, *args): - """ - Convenience function to get the type of each argument. - """ - return tuple([self.typingctx.resolve_argument_type(a) for a in args]) - - def _datatype_is_same(self, argtypes): - """ - This function will determine if there is any argument of type array and - in case there are multiple array types if they are all of the same type. - - Args: - argtypes: Numba type for each argument passed to a JitKernel. - - Returns: - array_type: None if there are no argument of type array, or the - Numba type in case there is array type argument. - bool: True if no array type arguments or if all array type arguments - are of same Numba type, False otherwise. - - """ - array_type = None - for i, argtype in enumerate(argtypes): - arg_is_array_type = isinstance(argtype, USMNdArray) or isinstance( - argtype, types.Array - ) - if array_type is None and arg_is_array_type: - array_type = argtype - elif ( - array_type is not None - and arg_is_array_type - and type(argtype) is not type(array_type) - ): - return None, False - return array_type, True - - def __call__(self, *args, **kwargs): - assert not kwargs, "Keyword Arguments are not supported" - - argtypes = self._get_argtypes(*args) - compute_queue = None - - # Get the array type and whether all array are of same type or not - array_type, uniform = self._datatype_is_same(argtypes) - if not uniform: - _raise_datatype_mixed_error(argtypes) - - if type(array_type) == USMNdArray: - if dpctl.is_in_device_context(): - warnings.warn(cfd_ctx_mgr_wrng_msg) - - queues = [] - for i, argtype in enumerate(argtypes): - if type(argtype) == USMNdArray: - memory = dpctl.memory.as_usm_memory(args[i]) - if dpctl_version < (0, 12): - queue = memory._queue - else: - queue = memory.sycl_queue - queues.append(queue) - - # dpctl.utils.get_exeuction_queue() checks if the queues passed are - # equivalent and returns a SYCL queue if they are equivalent and - # None if they are not. - compute_queue = dpctl.utils.get_execution_queue(queues) - if compute_queue is None: - raise IndeterminateExecutionQueueError( - "Data passed as argument are not equivalent. Please " - "create dpctl.tensor.usm_ndarray with equivalent SYCL queue." - ) - - if compute_queue is None: - try: - compute_queue = dpctl.get_current_queue() - except: - _raise_no_device_found_error() - - kernel = self.specialize(argtypes, compute_queue) - cfg = kernel.configure( - kernel.sycl_queue, self.global_size, self.local_size - ) - cfg(*args) - - def specialize(self, argtypes, queue): - # We specialize for argtypes and queue. These two are used as key for - # caching as well. - assert queue is not None - - sycl_ctx = None - kernel = None - # we were previously using the _env_ptr of the device_env, the sycl_queue - # should be sufficient to cache the compiled kernel for now, but we should - # use the device type to cache such kernels. - key_definitions = argtypes - result = self.definitions.get(key_definitions) - if result: - sycl_ctx, kernel = result - - if sycl_ctx and sycl_ctx == queue.sycl_context: - return kernel - else: - kernel = compile_kernel( - queue, self.py_func, argtypes, self.access_types, self.debug - ) - self.definitions[key_definitions] = (queue.sycl_context, kernel) - return kernel diff --git a/numba_dpex/config.py b/numba_dpex/config.py index 3cbbca2ba1..7fc121d059 100644 --- a/numba_dpex/config.py +++ b/numba_dpex/config.py @@ -86,6 +86,21 @@ def __getattr__(name): "NUMBA_DPEX_DEBUGINFO", int, config.DEBUGINFO_DEFAULT ) +# configs for caching +# To see the debug messages for the caching. +# Execute like: +# NUMBA_DPEX_DEBUG_CACHE=1 python +DEBUG_CACHE = _readenv("NUMBA_DPEX_DEBUG_CACHE", int, 0) +# This is a global flag to turn the caching on/off, +# regardless of whatever has been specified in Dispatcher. +# Useful for debugging. Execute like: +# NUMBA_DPEX_ENABLE_CACHE=0 python +# to turn off the caching globally. +ENABLE_CACHE = _readenv("NUMBA_DPEX_ENABLE_CACHE", int, 1) +# Capacity of the cache, execute it like: +# NUMBA_DPEX_CACHE_SIZE=20 python +CACHE_SIZE = _readenv("NUMBA_DPEX_CACHE_SIZE", int, 10) + TESTING_SKIP_NO_DPNP = _readenv("NUMBA_DPEX_TESTING_SKIP_NO_DPNP", int, 0) TESTING_SKIP_NO_DEBUGGING = _readenv( "NUMBA_DPEX_TESTING_SKIP_NO_DEBUGGING", int, 1 diff --git a/numba_dpex/core/caching.py b/numba_dpex/core/caching.py new file mode 100644 index 0000000000..1c46485c6c --- /dev/null +++ b/numba_dpex/core/caching.py @@ -0,0 +1,515 @@ +# SPDX-FileCopyrightText: 2020 - 2022 Intel Corporation +# +# SPDX-License-Identifier: Apache-2.0 + +import hashlib +import sys +from abc import ABCMeta, abstractmethod + +from numba.core.caching import CacheImpl, IndexDataCacheFile +from numba.core.serialize import dumps + +from numba_dpex import config +from numba_dpex.core.types import USMNdArray + + +def build_key(argtypes, pyfunc, codegen, backend=None, device_type=None): + """Constructs a key from python function, context, backend and the device + type. + + Compute index key for the given argument types and codegen. It includes a + description of the OS, target architecture and hashes of the bytecode for + the function and, if the function has a __closure__, a hash of the + cell_contents.type + + Args: + argtypes : A tuple of numba types corresponding to the arguments to the + compiled function. + pyfunc : The Python function that is to be compiled and cached. + codegen (numba.core.codegen.Codegen): + The codegen object found from the target context. + backend (enum, optional): A 'backend_type' enum. + Defaults to None. + device_type (enum, optional): A 'device_type' enum. + Defaults to None. + + Returns: + tuple: A tuple of return type, argtpes, magic_tuple of codegen + and another tuple of hashcodes from bytecode and cell_contents. + """ + + codebytes = pyfunc.__code__.co_code + if pyfunc.__closure__ is not None: + try: + cvars = tuple([x.cell_contents for x in pyfunc.__closure__]) + # Note: cloudpickle serializes a function differently depending + # on how the process is launched; e.g. multiprocessing.Process + cvarbytes = dumps(cvars) + except: + cvarbytes = b"" # a temporary solution for function template + else: + cvarbytes = b"" + + argtylist = list(argtypes) + for i, argty in enumerate(argtylist): + if isinstance(argty, USMNdArray): + # Convert the USMNdArray to an abridged type that disregards the + # usm_type, device, queue, address space attributes. + argtylist[i] = (argty.ndim, argty.dtype, argty.layout) + + argtypes = tuple(argtylist) + + return ( + argtypes, + codegen.magic_tuple(), + backend, + device_type, + ( + hashlib.sha256(codebytes).hexdigest(), + hashlib.sha256(cvarbytes).hexdigest(), + ), + ) + + +class _CacheImpl(CacheImpl): + """Implementation of `CacheImpl` to be used by subclasses of `_Cache`. + + This class is an implementation of `CacheImpl` to be used by subclasses + of `_Cache`. To be assigned in `_impl_class`. Implements the more common + and core mechanism for the caching. + + """ + + def reduce(self, data): + """Serialize an object before caching. + Args: + data (object): The object to be serialized before pickling. + """ + # TODO: Implement, but looks like we might not need it at all. + # Look at numba.core.caching for how to implement. + pass + + def rebuild(self, target_context, reduced_data): + """Deserialize after unpickling from the cache. + Args: + target_context (numba_dpex.core.target.DpexTargetContext): + The target context for the kernel. + reduced_data (object): The data to be deserialzed after unpickling. + """ + # TODO: Implement, but looks like we might not need it at all. + # Look at numba.core.caching for how to implement. + pass + + def check_cachable(self, cres): + """Check if a certain object is cacheable. + + Args: + cres (object): The object to be cached. For example, if the object + is `CompileResult`, then you might want to follow the similar + checks as has been done in + `numba.core.caching.CompileResultCacheImpl`. + + Returns: + bool: Return `True` if cacheable, otherwise `False`. + """ + # TODO: Although, for the time being, assuming all Kernels in numba_dpex + # are always cachable. However, we might need to add some bells and + # whistles in the future. Look at numba.core.caching for how to + # implement. + return True + + +class AbstractCache(metaclass=ABCMeta): + """Abstract cache class to specify basic caching operations. + + This class will be used to create an non-functional dummy cache + (i.e. NullCache) and other functional cache. The dummy cache + will be used as a placeholder when caching is disabled. + + Args: + metaclass (type, optional): Metaclass for the abstract class. + Defaults to ABCMeta. + """ + + @abstractmethod + def get(self): + """An abstract method to retrieve item from the cache.""" + + @abstractmethod + def put(self, key, value): + """An abstract method to save item into the cache. + + Args: + key (object): The key for the data + (i.e. compiled kernel/function etc.). + value (object): The data (i.e. compiled kernel/function) + to be saved. + """ + + +class NullCache(AbstractCache): + """A dummy cache used if user decides to disable caching. + + If the caching is disabled this class will be used to + perform all caching operations, all of which will be basically + NOP. This idea is copied from numba. + + Args: + AbstractCache (class): The abstract cache from which all + other caching classes will be derived. + """ + + def get(self, key): + """Function to get an item (i.e. compiled kernel/function) + from the cache + + Args: + key (object): The key to retrieve the + data (i.e. compiled kernel/function) + + Returns: + None: Returns None. + """ + return None + + def put(self, key, value): + """Function to save a compiled kernel/function + into the cache. + + Args: + key (object): The key to the data (i.e. compiled kernel/function). + value (object): The data to be cached (i.e. + compiled kernel/function). + """ + pass + + +class Node: + """A 'Node' class for LRUCache.""" + + def __init__(self, key, value): + """Constructor for the Node. + + Args: + key (object): The key to the value. + value (object): The data to be saved. + """ + self.key = key + self.value = value + self.next = None + self.previous = None + + def __str__(self): + """__str__ for Node. + + Returns: + str: A human readable representation of a Node. + """ + return "(" + str(self.key) + ": " + str(self.value) + ")" + + def __repr__(self): + """__repr__ for Node + + Returns: + str: A human readable representation of a Node. + """ + return self.__str__() + + +class LRUCache(AbstractCache): + """LRUCache implementation for caching kernels, + functions and modules. + + The cache is basically a doubly-linked-list backed + with a dictionary as a lookup table. + """ + + def __init__(self, capacity=10, pyfunc=None): + """Constructor for LRUCache. + + Args: + capacity (int, optional): The max capacity of the cache. + Defaults to 10. + pyfunc (NoneType, optional): A python function to be cached. + Defaults to None. + """ + self._capacity = capacity + self._lookup = {} + self._evicted = {} + self._dummy = Node(0, 0) + self._head = self._dummy.next + self._tail = self._dummy.next + self._pyfunc = pyfunc + self._cache_file = None + # if pyfunc is specified, we will use files for evicted items + if self._pyfunc is not None: + # _CacheImpl object to be used + self._impl_class = _CacheImpl + self._impl = self._impl_class(self._pyfunc) + self._cache_path = self._impl.locator.get_cache_path() + # This may be a bit strict but avoids us maintaining a magic number + source_stamp = self._impl.locator.get_source_stamp() + filename_base = self._impl.filename_base + self._cache_file = IndexDataCacheFile( + cache_path=self._cache_path, + filename_base=filename_base, + source_stamp=source_stamp, + ) + + @property + def head(self): + """Get the head of the cache. + + This is used for testing/debugging purposes. + + Returns: + Node: The head of the cache. + """ + return self._head + + @property + def tail(self): + """Get the tail of the cache. + + This is used for testing/debugging purposes. + + Returns: + Node: The tail of the cache. + """ + return self._tail + + @property + def evicted(self): + """Get the list of evicted items from the cache. + + This is used for testing/debugging purposes. + + Returns: + dict: A table of evicted items from the cache. + """ + return self._evicted + + def _get_memsize(self, obj, seen=None): + """Recursively finds size of *almost any* object. + + This function might be useful in the future when + size based (not count based) cache limit will be + implemented. + + Args: + obj (object): Any object. + seen (set, optional): Set of seen object id(). + Defaults to None. + + Returns: + int: Size of the object in bytes. + """ + size = sys.getsizeof(obj) + if seen is None: + seen = set() + obj_id = id(obj) + if obj_id in seen: + return 0 + # Important mark as seen *before* entering recursion to gracefully + # handle self-referential objects + seen.add(obj_id) + if isinstance(obj, dict): + size += sum([self._get_memsize(v, seen) for v in obj.values()]) + size += sum([self._get_memsize(k, seen) for k in obj.keys()]) + elif hasattr(obj, "__dict__"): + size += self._get_memsize(obj.__dict__, seen) + elif hasattr(obj, "__iter__") and not isinstance( + obj, (str, bytes, bytearray) + ): + size += sum([self._get_memsize(i, seen) for i in obj]) + return size + + def size(self): + """Get the current size of the cache. + + Returns: + int: The current number of items in the cache. + """ + return len(self._lookup) + + def memsize(self): + """Get the total memory size of the cache. + + This function might be useful in the future when + size based (not count based) cache limit will be + implemented. + + Returns: + int: Get the total memory size of the cache in bytes. + """ + size = 0 + current = self._head + while current: + size = size + self._get_memsize(current.value) + current = current.next + return size + + def __str__(self): + """__str__ function for the cache + + Returns: + str: A human readable representation of the cache. + """ + items = [] + current = self._head + while current: + items.append(str(current)) + current = current.next + return "{" + ", ".join(items) + "}" + + def __repr__(self): + """__repr__ function for the cache + + Returns: + str: A human readable representation of the cache. + """ + return self.__str__() + + def clean(self): + """Clean the cache""" + self._lookup = {} + self._evicted = {} + self._dummy = Node(0, 0) + self._head = self._dummy.next + self._tail = self._dummy.next + + def _remove_head(self): + """Remove the head of the cache""" + if not self._head: + return + prev = self._head + self._head = self._head.next + if self._head: + self._head.previous = None + del prev + + def _append_tail(self, new_node): + """Add the new node to the tail end""" + if not self._tail: + self._head = self._tail = new_node + else: + self._tail.next = new_node + new_node.previous = self._tail + self._tail = self._tail.next + + def _unlink_node(self, node): + """Unlink current linked node""" + if node is None: + return + + if self._head is node: + self._head = node.next + if node.next: + node.next.previous = None + node.previous, node.next = None, None + return + + # removing the node from somewhere in the middle; update pointers + prev, nex = node.previous, node.next + prev.next = nex + nex.previous = prev + node.previous, node.next = None, None + + def get(self, key): + """Get the value associated with the key. + + Args: + key (object): A key for the lookup table. + + Returns: + object: The value associated with the key. + """ + + if key not in self._lookup: + if key not in self._evicted: + return None + elif self._cache_file: + value = self._cache_file.load(key) + if config.DEBUG_CACHE: + print( + "[cache]: unpickled an evicted artifact, " + "key: {0:s}.".format(str(key)) + ) + else: + value = self._evicted[key] + self.put(key, value) + return value + else: + if config.DEBUG_CACHE: + print( + "[cache] size: {0:d}, loading artifact, key: {1:s}".format( + len(self._lookup), str(key) + ) + ) + node = self._lookup[key] + + if node is not self._tail: + self._unlink_node(node) + self._append_tail(node) + + return node.value + + def put(self, key, value): + """Store the key-value pair into the cache. + + Args: + key (object): The key for the data. + value (object): The data to be saved. + """ + if key in self._lookup: + if config.DEBUG_CACHE: + print( + "[cache] size: {0:d}, storing artifact, key: {1:s}".format( + len(self._lookup), str(key) + ) + ) + self._lookup[key].value = value + self.get(key) + return + + if key in self._evicted: + self._evicted.pop(key) + + if len(self._lookup) >= self._capacity: + # remove head node and correspond key + if self._cache_file: + if config.DEBUG_CACHE: + print( + "[cache] size: {0:d}, pickling the LRU item, " + "key: {1:s}, indexed at {2:s}.".format( + len(self._lookup), + str(self._head.key), + self._cache_file._index_path, + ) + ) + self._cache_file.save(self._head.key, self._head.value) + self._evicted[ + self._head.key + ] = None # as we are using cache files, we save memory + else: + self._evicted[self._head.key] = self._head.value + self._lookup.pop(self._head.key) + if config.DEBUG_CACHE: + print( + "[cache] size: {0:d}, capacity exceeded, evicted".format( + len(self._lookup) + ), + self._head.key, + ) + self._remove_head() + + # add new node and hash key + new_node = Node(key, value) + self._lookup[key] = new_node + self._append_tail(new_node) + if config.DEBUG_CACHE: + print( + "[cache] size: {0:d}, saved artifact, key: {1:s}".format( + len(self._lookup), str(key) + ) + ) diff --git a/numba_dpex/core/compiler.py b/numba_dpex/core/compiler.py new file mode 100644 index 0000000000..5dd0445ee2 --- /dev/null +++ b/numba_dpex/core/compiler.py @@ -0,0 +1,301 @@ +# SPDX-FileCopyrightText: 2022 Intel Corporation +# +# SPDX-License-Identifier: Apache-2.0 + +from types import FunctionType + +from numba.core import compiler, ir +from numba.core import types as numba_types +from numba.core.compiler import CompilerBase +from numba.core.compiler_lock import global_compiler_lock +from numba.core.compiler_machinery import PassManager +from numba.core.typed_passes import ( + AnnotateTypes, + InlineOverloads, + IRLegalization, + NopythonRewrites, + NoPythonSupportedFeatureValidation, + NopythonTypeInference, + PreLowerStripPhis, +) +from numba.core.untyped_passes import ( + DeadBranchPrune, + FindLiterallyCalls, + FixupArgs, + GenericRewrites, + InlineClosureLikes, + InlineInlinables, + IRProcessing, + LiteralPropagationSubPipelinePass, + LiteralUnroll, + MakeFunctionToJitFunction, + ReconstructSSA, + RewriteSemanticConstants, + TranslateByteCode, + WithLifting, +) + +from numba_dpex import config +from numba_dpex.core.exceptions import ( + KernelHasReturnValueError, + UnreachableError, + UnsupportedCompilationModeError, +) +from numba_dpex.core.passes.passes import ( + ConstantSizeStaticLocalMemoryPass, + DpexLowering, + DumpParforDiagnostics, + NoPythonBackend, + ParforPass, + PreParforPass, +) +from numba_dpex.core.passes.rename_numpy_functions_pass import ( + RewriteNdarrayFunctionsPass, + RewriteOverloadedNumPyFunctionsPass, +) +from numba_dpex.parfor_diagnostics import ExtendedParforDiagnostics + + +class PassBuilder(object): + """ + A pass builder to run dpex's code-generation and optimization passes. + + Unlike Numba, dpex's pass builder does not offer objectmode and + interpreted passes. + """ + + @staticmethod + def define_untyped_pipeline(state, name="dpex_untyped"): + """Returns an untyped part of the nopython pipeline + + The pipeline of untyped passes is duplicated from Numba's compiler. We + are adding couple of passes to the pipeline to change specific numpy + overloads. + """ + pm = PassManager(name) + if state.func_ir is None: + pm.add_pass(TranslateByteCode, "analyzing bytecode") + pm.add_pass(FixupArgs, "fix up args") + pm.add_pass(IRProcessing, "processing IR") + pm.add_pass(WithLifting, "Handle with contexts") + + # --- Begin dpex passes added to the untyped pipeline --# + + # The RewriteOverloadedNumPyFunctionsPass rewrites the module namespace + # of specific NumPy functions to dpnp, as we overload these functions + # differently. + pm.add_pass( + RewriteOverloadedNumPyFunctionsPass, + "Rewrite name of Numpy functions to overload already overloaded " + + "function", + ) + # Add pass to ensure when users allocate static constant memory the + # size of the allocation is a constant and not specified by a closure + # variable. + pm.add_pass( + ConstantSizeStaticLocalMemoryPass, + "dpex constant size for static local memory", + ) + + # --- End of dpex passes added to the untyped pipeline --# + + # inline closures early in case they are using nonlocal's + # see issue #6585. + pm.add_pass( + InlineClosureLikes, "inline calls to locally defined closures" + ) + + # pre typing + if not state.flags.no_rewrites: + pm.add_pass(RewriteSemanticConstants, "rewrite semantic constants") + pm.add_pass(DeadBranchPrune, "dead branch pruning") + pm.add_pass(GenericRewrites, "nopython rewrites") + + # convert any remaining closures into functions + pm.add_pass( + MakeFunctionToJitFunction, + "convert make_function into JIT functions", + ) + # inline functions that have been determined as inlinable and rerun + # branch pruning, this needs to be run after closures are inlined as + # the IR repr of a closure masks call sites if an inlinable is called + # inside a closure + pm.add_pass(InlineInlinables, "inline inlinable functions") + if not state.flags.no_rewrites: + pm.add_pass(DeadBranchPrune, "dead branch pruning") + + pm.add_pass(FindLiterallyCalls, "find literally calls") + pm.add_pass(LiteralUnroll, "handles literal_unroll") + + if state.flags.enable_ssa: + pm.add_pass(ReconstructSSA, "ssa") + + pm.add_pass(LiteralPropagationSubPipelinePass, "Literal propagation") + + pm.finalize() + return pm + + @staticmethod + def define_typed_pipeline(state, name="dpex_typed"): + """Returns the typed part of the nopython pipeline""" + pm = PassManager(name) + # typing + pm.add_pass(NopythonTypeInference, "nopython frontend") + # Annotate only once legalized + pm.add_pass(AnnotateTypes, "annotate types") + pm.add_pass( + RewriteNdarrayFunctionsPass, + "Rewrite numpy.ndarray functions to dpnp.ndarray functions", + ) + + # strip phis + pm.add_pass(PreLowerStripPhis, "remove phis nodes") + + # optimization + pm.add_pass(InlineOverloads, "inline overloaded functions") + pm.add_pass(PreParforPass, "Preprocessing for parfors") + if not state.flags.no_rewrites: + pm.add_pass(NopythonRewrites, "nopython rewrites") + pm.add_pass(ParforPass, "convert to parfors") + + pm.finalize() + return pm + + @staticmethod + def define_nopython_lowering_pipeline(state, name="dpex_nopython_lowering"): + """Returns an nopython mode pipeline based PassManager""" + pm = PassManager(name) + + # legalize + pm.add_pass( + NoPythonSupportedFeatureValidation, + "ensure features that are in use are in a valid form", + ) + pm.add_pass(IRLegalization, "ensure IR is legal prior to lowering") + + # lower + pm.add_pass(DpexLowering, "Custom Lowerer with auto-offload support") + pm.add_pass(NoPythonBackend, "nopython mode backend") + pm.add_pass(DumpParforDiagnostics, "dump parfor diagnostics") + + pm.finalize() + return pm + + @staticmethod + def define_nopython_pipeline(state, name="dpex_nopython"): + """Returns an nopython mode pipeline based PassManager""" + # compose pipeline from untyped, typed and lowering parts + dpb = PassBuilder + pm = PassManager(name) + untyped_passes = dpb.define_untyped_pipeline(state) + pm.passes.extend(untyped_passes.passes) + + typed_passes = dpb.define_typed_pipeline(state) + pm.passes.extend(typed_passes.passes) + + lowering_passes = dpb.define_nopython_lowering_pipeline(state) + pm.passes.extend(lowering_passes.passes) + + pm.finalize() + return pm + + +class Compiler(CompilerBase): + """Dpex's compiler pipeline.""" + + def define_pipelines(self): + # this maintains the objmode fallback behaviour + pms = [] + self.state.parfor_diagnostics = ExtendedParforDiagnostics() + self.state.metadata[ + "parfor_diagnostics" + ] = self.state.parfor_diagnostics + if not self.state.flags.force_pyobject: + pms.append(PassBuilder.define_nopython_pipeline(self.state)) + if self.state.status.can_fallback or self.state.flags.force_pyobject: + raise UnsupportedCompilationModeError() + return pms + + +@global_compiler_lock +def compile_with_dpex( + pyfunc, + pyfunc_name, + args, + return_type, + target_context, + typing_context, + debug=None, + is_kernel=True, + extra_compile_flags=None, +): + """ + Compiles a function using the dpex compiler pipeline and returns the + compiled result. + + Args: + args: The list of arguments passed to the kernel. + debug (bool): Optional flag to turn on debug mode compilation. + extra_compile_flags: Extra flags passed to the compiler. + + Returns: + cres: Compiled result. + + Raises: + KernelHasReturnValueError: If the compiled function returns a + non-void value. + """ + # First compilation will trigger the initialization of the backend. + typingctx = typing_context + targetctx = target_context + + flags = compiler.Flags() + # Do not compile the function to a binary, just lower to LLVM + flags.debuginfo = config.DEBUGINFO_DEFAULT + flags.no_compile = True + flags.no_cpython_wrapper = True + flags.nrt = False + + if debug is not None: + flags.debuginfo = debug + + # Run compilation pipeline + if isinstance(pyfunc, FunctionType): + cres = compiler.compile_extra( + typingctx=typingctx, + targetctx=targetctx, + func=pyfunc, + args=args, + return_type=return_type, + flags=flags, + locals={}, + pipeline_class=Compiler, + ) + elif isinstance(pyfunc, ir.FunctionIR): + cres = compiler.compile_ir( + typingctx=typingctx, + targetctx=targetctx, + func_ir=pyfunc, + args=args, + return_type=return_type, + flags=flags, + locals={}, + pipeline_class=Compiler, + ) + else: + raise UnreachableError() + + if ( + is_kernel + and cres.signature.return_type is not None + and cres.signature.return_type != numba_types.void + ): + raise KernelHasReturnValueError( + kernel_name=pyfunc_name, + return_type=cres.signature.return_type, + ) + # Linking depending libraries + library = cres.library + library.finalize() + + return cres diff --git a/numba_dpex/core/exceptions.py b/numba_dpex/core/exceptions.py index 35273fe981..3aefd4f489 100644 --- a/numba_dpex/core/exceptions.py +++ b/numba_dpex/core/exceptions.py @@ -2,7 +2,7 @@ # # SPDX-License-Identifier: Apache-2.0 -"""The module defines the custom exception classes used in numba_dpex. +"""The module defines the custom error classes used in numba_dpex. """ from warnings import warn @@ -23,14 +23,20 @@ class KernelHasReturnValueError(Exception): the kernel function. """ - def __init__(self, kernel_name, return_type) -> None: + def __init__(self, kernel_name, return_type, sig=None) -> None: self.return_type = return_type - self.kernel_name = kernel_name - self.message = ( - f'Kernel "{self.kernel_name}" has a return value ' - f'of type "{self.return_type}". ' - "A numba-dpex kernel must have a void return type." - ) + if sig: + self.message = ( + f'Specialized kernel signature "{sig}" has a return value ' + f'of type "{return_type}". ' + "A numba-dpex kernel must have a void return type." + ) + else: + self.message = ( + f'Kernel "{kernel_name}" has a return value ' + f'of type "{return_type}". ' + "A numba-dpex kernel must have a void return type." + ) super().__init__(self.message) @@ -82,6 +88,72 @@ def __init__(self, kernel_name) -> None: super().__init__(self.message) +class IllegalRangeValueError(Exception): + def __init__(self, kernel_name) -> None: + self.message = ( + f"Kernel {kernel_name} cannot be dispatched with the " + "specified range. The range should be specified as a list, tuple, " + "or an int." + ) + super().__init__(self.message) + + +class UnsupportedNumberOfRangeDimsError(Exception): + def __init__(self, kernel_name, ndims, max_work_item_dims) -> None: + self.message = ( + f"Specified range for kernel {kernel_name} has {ndims} dimensions, " + f"the device supports only {max_work_item_dims} dimensional " + "ranges." + ) + super().__init__(self.message) + + +class UnmatchedNumberOfRangeDimsError(Exception): + def __init__(self, kernel_name, global_ndims, local_ndims) -> None: + self.message = ( + f"Specified global_range for kernel {kernel_name} has {global_ndims} dimensions, " + f"while specified local_range with dimenstions of {local_ndims} doesn't match " + "with global_range." + ) + super().__init__(self.message) + + +class UnsupportedWorkItemSizeError(Exception): + """ + + Args: + Exception (_type_): _description_ + """ + + def __init__( + self, kernel_name, dim, requested_work_items, supported_work_items + ) -> None: + self.message = ( + f"Attempting to launch kernel {kernel_name} with " + f"{requested_work_items} work items in dimension {dim} is not " + f"supported. The device supports only {supported_work_items} " + f"work items for dimension {dim}." + ) + super().__init__(self.message) + + +class UnsupportedGroupWorkItemSizeError(Exception): + """ + + Args: + Exception (_type_): _description_ + """ + + def __init__(self, kernel_name, dim, work_groups, work_items) -> None: + self.message = ( + f"Attempting to launch kernel {kernel_name} with " + f"{work_groups} global work groups and {work_items} local work items " + f"in dimension {dim} is not supported. The global work groups must be " + f"able to divide local work items evenly." + ) + super().__init__(self.message) + + class ComputeFollowsDataInferenceError(Exception): """Exception raised when an execution queue for a given array expression or a kernel function could not be deduced using the compute-follows-data @@ -112,8 +184,8 @@ def __init__( self, kernel_name, ndarray_argnum_list=None, *, usmarray_argnum_list ) -> None: if ndarray_argnum_list and usmarray_argnum_list: - ndarray_args = ",".join(ndarray_argnum_list) - usmarray_args = ",".join(usmarray_argnum_list) + ndarray_args = ",".join([str(i) for i in ndarray_argnum_list]) + usmarray_args = ",".join([str(i) for i in usmarray_argnum_list]) self.message = ( f'Kernel "{kernel_name}" has arguments of both usm_ndarray and ' "non-usm_ndarray types. Mixing of arguments of different " @@ -122,7 +194,7 @@ def __init__( f"and arguments {usmarray_args} are usm arrays." ) elif usmarray_argnum_list: - usmarray_args = ",".join(usmarray_argnum_list) + usmarray_args = ",".join([str(i) for i in usmarray_argnum_list]) self.message = ( f'Execution queue for kernel "{kernel_name}" could ' "be deduced using compute follows data programming model. The " @@ -218,3 +290,118 @@ def __init__(self) -> None: else: self.message = "Unreachable code executed." super().__init__(self.message) + + +class UnsupportedKernelArgumentError(Exception): + """Exception raised when the type of a kernel argument is not supported by + the compiler. + + Args: + type (str): The type of the unsupported argument. + value (object): The Python object passed as a kernel argument. + kernel_name (str): Name of kernel where the error was raised. + """ + + def __init__(self, type, value, kernel_name="") -> None: + self.message = ( + f"Argument {value} passed to kernel {kernel_name} is of an " + f"unsupported type ({type})." + ) + super().__init__(self.message) + + +class SUAIProtocolError(Exception): + """Exception raised when an array-like object passed to a kernel is + neither a NumPy array nor does it implement the __sycl_usm_array_interface__ + attribute. + + Args: + kernel_name (str): Name of kernel where the error was raised. + arg: Array-like object + """ + + def __init__(self, kernel_name, arg) -> None: + self.message = ( + f'Array-like argument {arg} passed to kernel "{kernel_name}" ' + "is neither a NumPy array nor implement the " + "__sycl_usm_array_interface__." + ) + super().__init__(self.message) + + +class UnsupportedAccessQualifierError(Exception): + """Exception raised when an illegal access specifier value is specified for + a NumPy array argument passed to a kernel. + + Args: + kernel_name (str): Name of kernel where the error was raised. + array_val: name of the array argument with the illegal access specifier. + illegal_access_type (str): The illegal access specifier string. + legal_access_list (str): Joined string for the legal access specifiers. + """ + + def __init__( + self, kernel_name, array_val, illegal_access_type, legal_access_list + ) -> None: + self.message = ( + f"Invalid access type {illegal_access_type} applied to " + f'array {array_val} argument passed to kernel "{kernel_name}". ' + f"Legal access specifiers are {legal_access_list}." + ) + + super().__init__(self.message) + + +class UnsupportedCompilationModeError(Exception): + def __init__(self) -> None: + self.message = ( + 'The dpex compiler does not support the "force_pyobject" setting.' + ) + super().__init__(self.message) + + +class InvalidKernelSpecializationError(Exception): + """Exception raised when a the specialization argument types are not + supported by the dpex kernel decorator. + + The exception is raised whenever an unsupported kernel argument is + provided in the specialization signature passed to a dpex kernel decorator + instance. For example, dpex kernels require arrays to be of USMNdArray type + and no other Array type, such as NumPy ndarray, are supported. If the + signature has an non USMNdArray Array type the exception is raised. + + Args: + kernel_name (str): Name of kernel where the error was raised. + invalid_sig: Unsupported signature. + unsupported_argnum_list : The list of argument numbers that are + unsupported. + """ + + def __init__( + self, kernel_name, invalid_sig, unsupported_argnum_list + ) -> None: + unsupported = ",".join([str(i) for i in unsupported_argnum_list]) + self.message = ( + f"Kernel {kernel_name} cannot be specialized for " + f'"{invalid_sig}". Arguments {unsupported} are not supported.' + ) + + super().__init__(self.message) + + +class MissingSpecializationError(Exception): + """Exception raised when a specialized JitKernel was called with arguments + that do not match any of the specialized versions of the JitKernel. + + Args: + kernel_name (str): Name of kernel where the error was raised. + sig: Unsupported argument types used to call a specialized JitKernel. + """ + + def __init__(self, kernel_name, argtypes) -> None: + self.message = ( + f"No specialized version of the kernel {kernel_name} " + f"exists for argument types: {argtypes}." + ) + + super().__init__(self.message) diff --git a/numba_dpex/core/kernel_interface/__init__.py b/numba_dpex/core/kernel_interface/__init__.py new file mode 100644 index 0000000000..21f8040397 --- /dev/null +++ b/numba_dpex/core/kernel_interface/__init__.py @@ -0,0 +1,6 @@ +# SPDX-FileCopyrightText: 2020 - 2022 Intel Corporation +# +# SPDX-License-Identifier: Apache-2.0 + +"""Defines the interface for kernel compilation using numba-dpex. +""" diff --git a/numba_dpex/core/kernel_interface/arg_pack_unpacker.py b/numba_dpex/core/kernel_interface/arg_pack_unpacker.py new file mode 100644 index 0000000000..e7e2e447d3 --- /dev/null +++ b/numba_dpex/core/kernel_interface/arg_pack_unpacker.py @@ -0,0 +1,232 @@ +# SPDX-FileCopyrightText: 2022 Intel Corporation +# +# SPDX-License-Identifier: Apache-2.0 + +import ctypes +import logging + +import dpctl.memory as dpctl_mem +import numpy as np +from numba.core import types + +import numba_dpex.utils as utils +from numba_dpex.core.exceptions import ( + UnsupportedAccessQualifierError, + UnsupportedKernelArgumentError, +) +from numba_dpex.core.types import USMNdArray +from numba_dpex.core.utils import get_info_from_suai + + +class _NumPyArrayPackerPayload: + def __init__(self, usm_mem, orig_val, packed_val, packed) -> None: + self._usm_mem = usm_mem + self._orig_val = orig_val + self._packed_val = packed_val + self._packed = packed + + +class Packer: + """Implements the functionality to unpack a Python object passed as an + argument to a numba_dpex kernel fucntion into corresponding ctype object. + """ + + # TODO: Remove after NumPy support is removed + _access_types = ("read_only", "write_only", "read_write") + + def _check_for_invalid_access_type(self, array_val, access_type): + if access_type and access_type not in Packer._access_types: + raise UnsupportedAccessQualifierError( + self._pyfunc_name, + array_val, + access_type, + ",".join(Packer._access_types), + ) + + def _unpack_array_helper(self, size, itemsize, buf, shape, strides, ndim): + """ + Implements the unpacking logic for array arguments. + + Args: + size: Total number of elements in the array. + itemsize: Size in bytes of each element in the array. + buf: The pointer to the memory. + shape: The shape of the array. + ndim: Number of dimension. + + Returns: + A list a ctype value for each array attribute argument + """ + unpacked_array_attrs = [] + + # meminfo (FIXME: should be removed and the USMNdArray type modified + # once NumPy support is removed) + unpacked_array_attrs.append(ctypes.c_size_t(0)) + # parent (FIXME: Evaluate if the attribute should be removed and the + # USMNdArray type modified once NumPy support is removed) + unpacked_array_attrs.append(ctypes.c_size_t(0)) + unpacked_array_attrs.append(ctypes.c_longlong(size)) + unpacked_array_attrs.append(ctypes.c_longlong(itemsize)) + unpacked_array_attrs.append(buf) + for ax in range(ndim): + unpacked_array_attrs.append(ctypes.c_longlong(shape[ax])) + for ax in range(ndim): + unpacked_array_attrs.append(ctypes.c_longlong(strides[ax])) + + return unpacked_array_attrs + + def _unpack_usm_array(self, val): + """Flattens an object of USMNdArray type into ctypes objects to be + passed as kernel arguments. + + Args: + val : An object of dpctl.types.UsmNdArray type. + + Returns: + list: A list of ctype objects representing the flattened usm_ndarray + """ + suai_attrs = get_info_from_suai(val) + + return self._unpack_array_helper( + size=suai_attrs.size, + itemsize=suai_attrs.itemsize, + buf=suai_attrs.data, + shape=suai_attrs.shape, + strides=suai_attrs.strides, + ndim=suai_attrs.dimensions, + ) + + def _unpack_array(self, val, access_type): + """Deprecated to be removed once NumPy array support in kernels is + removed. + """ + packed_val = val + # Check if the NumPy array is backed by USM memory + usm_mem = utils.has_usm_memory(val) + + # If the NumPy array is not USM backed, then copy to a USM memory + # object. Add an entry to the repack_map so that on exit from kernel + # the data from the USM object can be copied back into the NumPy array. + if usm_mem is None: + self._check_for_invalid_access_type(val, access_type) + usm_mem = utils.as_usm_obj(val, queue=self._queue, copy=False) + + orig_val = val + packed = False + if not val.flags.c_contiguous: + # If the numpy.ndarray is not C-contiguous + # we pack the strided array into a packed array. + # This allows us to treat the data from here on as C-contiguous. + # While packing we treat the data as C-contiguous. + # We store the reference of both (strided and packed) + # array and during unpacking we use numpy.copyto() to copy + # the data back from the packed temporary array to the + # original strided array. + packed_val = val.flatten(order="C") + packed = True + + if access_type == "read_only": + utils.copy_from_numpy_to_usm_obj(usm_mem, packed_val) + elif access_type == "read_write": + utils.copy_from_numpy_to_usm_obj(usm_mem, packed_val) + # Store to the repack map + self._repack_list.append( + _NumPyArrayPackerPayload( + usm_mem, orig_val, packed_val, packed + ) + ) + elif access_type == "write_only": + self._repack_list.append( + _NumPyArrayPackerPayload( + usm_mem, orig_val, packed_val, packed + ) + ) + else: + utils.copy_from_numpy_to_usm_obj(usm_mem, packed_val) + self._repack_list.append( + _NumPyArrayPackerPayload( + usm_mem, orig_val, packed_val, packed + ) + ) + + return self._unpack_array_helper( + packed_val.size, + packed_val.dtype.itemsize, + usm_mem, + packed_val.shape, + packed_val.strides, + packed_val.ndim, + ) + + def _unpack_argument(self, ty, val): + """ + Unpack a Python object into one or more ctype values using Numba's + type-inference machinery. + + Args: + ty: The data types of the kernel argument defined as in instance of + numba.types. + val: The value of the kernel argument. + + Raises: + UnsupportedKernelArgumentError: When the argument is of an + unsupported type. + + """ + + if isinstance(ty, USMNdArray): + return self._unpack_usm_array(val) + elif ty == types.int64: + return ctypes.c_longlong(val) + elif ty == types.uint64: + return ctypes.c_ulonglong(val) + elif ty == types.int32: + return ctypes.c_int(val) + elif ty == types.uint32: + return ctypes.c_uint(val) + elif ty == types.float64: + return ctypes.c_double(val) + elif ty == types.float32: + return ctypes.c_float(val) + elif ty == types.boolean: + return ctypes.c_uint8(int(val)) + elif ty == types.complex64: + raise UnsupportedKernelArgumentError(ty, val, self._pyfunc_name) + elif ty == types.complex128: + raise UnsupportedKernelArgumentError(ty, val, self._pyfunc_name) + else: + raise UnsupportedKernelArgumentError(ty, val, self._pyfunc_name) + + def __init__( + self, + kernel_name, + arg_list, + argty_list, + ) -> None: + """Initializes new Packer object and unpacks the input argument list. + + Args: + arg_list (list): A list of arguments to be unpacked + argty_list (list): A list of Numba inferred types for each argument. + + """ + self._pyfunc_name = kernel_name + self._arg_list = arg_list + self._argty_list = argty_list + + # loop over the arg_list and generate the kernelargs list + self._unpacked_args = [] + for i, val in enumerate(arg_list): + arg = self._unpack_argument( + ty=argty_list[i], + val=val, + ) + if type(arg) == list: + self._unpacked_args.extend(arg) + else: + self._unpacked_args.append(arg) + + @property + def unpacked_args(self): + """Returns the list of unpacked arguments created by a Packer object.""" + return self._unpacked_args diff --git a/numba_dpex/core/kernel_interface/dispatcher.py b/numba_dpex/core/kernel_interface/dispatcher.py new file mode 100644 index 0000000000..04a9a70228 --- /dev/null +++ b/numba_dpex/core/kernel_interface/dispatcher.py @@ -0,0 +1,618 @@ +# SPDX-FileCopyrightText: 2022 Intel Corporation +# +# SPDX-License-Identifier: Apache-2.0 + + +import copy +from inspect import signature +from warnings import warn + +import dpctl +import dpctl.program as dpctl_prog +from numba.core import sigutils +from numba.core.types import Array as NpArrayType +from numba.core.types import void + +from numba_dpex import config +from numba_dpex.core.caching import LRUCache, NullCache, build_key +from numba_dpex.core.descriptor import dpex_target +from numba_dpex.core.exceptions import ( + ComputeFollowsDataInferenceError, + ExecutionQueueInferenceError, + IllegalRangeValueError, + InvalidKernelLaunchArgsError, + InvalidKernelSpecializationError, + KernelHasReturnValueError, + MissingSpecializationError, + UnknownGlobalRangeError, + UnmatchedNumberOfRangeDimsError, + UnsupportedBackendError, + UnsupportedGroupWorkItemSizeError, + UnsupportedNumberOfRangeDimsError, + UnsupportedWorkItemSizeError, +) +from numba_dpex.core.kernel_interface.arg_pack_unpacker import Packer +from numba_dpex.core.kernel_interface.spirv_kernel import SpirvKernel +from numba_dpex.core.types import USMNdArray + + +def get_ordered_arg_access_types(pyfunc, access_types): + """Deprecated and to be removed in next release.""" + # Construct a list of access type of each arg according to their position + ordered_arg_access_types = [] + sig = signature(pyfunc, follow_wrapped=False) + for idx, arg_name in enumerate(sig.parameters): + if access_types: + for key in access_types: + if arg_name in access_types[key]: + ordered_arg_access_types.append(key) + if len(ordered_arg_access_types) <= idx: + ordered_arg_access_types.append(None) + + return ordered_arg_access_types + + +class JitKernel: + """Functor to wrap a kernel function and JIT compile and dispatch it to a + specified SYCL queue. + + A JitKernel is returned by the kernel decorator and wraps an instance of a + device kernel function. A device kernel function is specialized for a + backend may represent a binary object in a lower-level IR. Currently, only + SPIR-V binary format device functions for level-zero and opencl backends + are supported. + + """ + + # The list of SYCL backends supported by the Dispatcher + _supported_backends = ["opencl", "level_zero"] + + def __init__( + self, + pyfunc, + debug_flags=None, + compile_flags=None, + specialization_sigs=None, + enable_cache=True, + ): + self.typingctx = dpex_target.typing_context + self.pyfunc = pyfunc + self.debug_flags = debug_flags + self.compile_flags = compile_flags + self.kernel_name = pyfunc.__name__ + + # TODO: To be removed once the__getitem__ is removed + self._global_range = None + self._local_range = None + + # caching related attributes + if not config.ENABLE_CACHE: + self._cache = NullCache() + elif enable_cache: + self._cache = LRUCache( + capacity=config.CACHE_SIZE, pyfunc=self.pyfunc + ) + else: + self._cache = NullCache() + self._cache_hits = 0 + + if debug_flags or config.OPT == 0: + # if debug is ON we need to pass additional + # flags to igc. + self._create_sycl_kernel_bundle_flags = ["-g", "-cl-opt-disable"] + else: + self._create_sycl_kernel_bundle_flags = [] + + # Specialization of kernel based on signatures. If specialization + # signatures are found, they are compiled ahead of time and cached. + if specialization_sigs: + self._has_specializations = True + self._specialization_cache = LRUCache( + capacity=config.CACHE_SIZE, pyfunc=self.pyfunc + ) + for sig in specialization_sigs: + self._specialize(sig) + if self._specialization_cache.size() == 0: + raise AssertionError( + "JitKernel could not be specialized for signatures: " + + specialization_sigs + ) + else: + self._has_specializations = False + self._specialization_cache = NullCache() + + @property + def cache(self): + return self._cache + + @property + def cache_hits(self): + return self._cache_hits + + def _compile_and_cache(self, argtypes, cache): + """Helper function to compile the Python function or Numba FunctionIR + object passed to a JitKernel and store it in an internal cache. + """ + # We always compile the kernel using the dpex_target. + typingctx = dpex_target.typing_context + targetctx = dpex_target.target_context + + kernel = SpirvKernel(self.pyfunc, self.kernel_name) + kernel.compile( + args=argtypes, + typing_ctx=typingctx, + target_ctx=targetctx, + debug=self.debug_flags, + compile_flags=self.compile_flags, + ) + + device_driver_ir_module = kernel.device_driver_ir_module + kernel_module_name = kernel.module_name + + key = build_key( + tuple(argtypes), + self.pyfunc, + kernel.target_context.codegen(), + ) + cache.put(key, (device_driver_ir_module, kernel_module_name)) + + return device_driver_ir_module, kernel_module_name + + def _specialize(self, sig): + """Compiles a device kernel ahead of time based on provided signature. + + Args: + sig: The signature on which the kernel is to be specialized. + """ + + argtypes, return_type = sigutils.normalize_signature(sig) + + # Check if signature has a non-void return type + if return_type and return_type != void: + raise KernelHasReturnValueError( + kernel_name=None, return_type=return_type, sig=sig + ) + + # USMNdarray check + usmarray_argnums = [] + usmndarray_argtypes = [] + unsupported_argnum_list = [] + + for i, argtype in enumerate(argtypes): + # FIXME: Add checks for other types of unsupported kernel args, e.g. + # complex. + + # Check if a non-USMNdArray Array type is passed to the kernel + if isinstance(argtype, NpArrayType) and not isinstance( + argtype, USMNdArray + ): + unsupported_argnum_list.append(i) + elif isinstance(argtype, USMNdArray): + usmarray_argnums.append(i) + usmndarray_argtypes.append(argtype) + + if unsupported_argnum_list: + raise InvalidKernelSpecializationError( + kernel_name=self.kernel_name, + invalid_sig=sig, + unsupported_argnum_list=unsupported_argnum_list, + ) + + self._compile_and_cache( + argtypes=argtypes, + cache=self._specialization_cache, + ) + + def _check_size(self, dim, size, size_limit): + """Checks if the range value is sane based on the number of work items + supported by the device. + """ + + if size > size_limit: + raise UnsupportedWorkItemSizeError( + kernel_name=self.kernel_name, + dim=dim, + requested_work_items=size, + supported_work_items=size_limit, + ) + + def _check_range(self, range, device): + """Checks if the requested range to launch the kernel is valid. + + Range is checked against the number of dimensions and if the range + argument is specified as a valid list of tuple. + """ + + if not isinstance(range, (tuple, list)): + raise IllegalRangeValueError(self.kernel_name) + + max_work_item_dims = device.max_work_item_dims + + if len(range) > max_work_item_dims: + raise UnsupportedNumberOfRangeDimsError( + kernel_name=self.kernel_name, + ndims=len(range), + max_work_item_dims=max_work_item_dims, + ) + + def _check_ndrange(self, global_range, local_range, device): + """Checks if the specified nd_range (global_range, local_range) is + legal for a device on which the kernel will be launched. + """ + self._check_range(local_range, device) + + self._check_range(global_range, device) + if len(local_range) != len(global_range): + raise UnmatchedNumberOfRangeDimsError( + kernel_name=self.kernel_name, + global_ndims=len(global_range), + local_ndims=len(local_range), + ) + + for i in range(len(global_range)): + self._check_size(i, local_range[i], device.max_work_item_sizes[i]) + if global_range[i] % local_range[i] != 0: + raise UnsupportedGroupWorkItemSizeError( + kernel_name=self.kernel_name, + dim=i, + work_groups=global_range[i], + work_items=local_range[i], + ) + + def _chk_compute_follows_data_compliance(self, usm_array_arglist): + """Check if all the usm ndarray's have the same device. + + Extracts the device filter string from the Numba inferred USMNdArray + type. Check if the devices corresponding to the filter string are + equivalent and return a ``dpctl.SyclDevice`` object corresponding to the + common filter string. + + If an exception occurred in creating a ``dpctl.SyclDevice``, or the + devices are not equivalent then returns None. + + Args: + usm_array_arglist : A list of usm_ndarray types specified as + arguments to the kernel. + + Returns: + A ``dpctl.SyclDevice`` object if all USMNdArray have same device, or + else None is returned. + """ + + queue = None + + for usm_array in usm_array_arglist: + _queue = usm_array.queue + if not queue: + queue = _queue + else: + if _queue != queue: + return None + + return queue + + def _determine_kernel_launch_queue(self, args, argtypes): + """Determines the queue where the kernel is to be launched. + + The execution queue is derived using the following algorithm. In future, + support for ``numpy.ndarray`` and ``dpctl.device_context`` is to be + removed and queue derivation will follows Python Array API's + "compute follows data" logic. + + Check if there are array arguments. + True: + Check if all array arguments are of type numpy.ndarray + (numba.types.Array) + True: + Check if the kernel was invoked from within a + dpctl.device_context. + True: + Provide a deprecation warning for device_context use and + point to using dpctl.tensor.usm_ndarray or dpnp.ndarray + + return dpctl.get_current_queue + False: + Raise ExecutionQueueInferenceError + False: + Check if all of the arrays are USMNdarray + True: + Check if execution queue could be inferred using + compute follows data rules + True: + return the compute follows data inferred queue + False: + Raise ComputeFollowsDataInferenceError + False: + Raise ComputeFollowsDataInferenceError + False: + Check if the kernel was invoked from within a dpctl.device_context. + True: + Provide a deprecation warning for device_context use and + point to using dpctl.tensor.usm_ndarray of dpnp.ndarray + + return dpctl.get_current_queue + False: + Raise ExecutionQueueInferenceError + + Args: + args : A list of arguments passed to the kernel stored in the + launcher. + argtypes : The Numba inferred type for each argument. + + Returns: + A queue the common queue used to allocate the arrays. If no such + queue exists, then raises an Exception. + + Raises: + ComputeFollowsDataInferenceError: If the queue could not be inferred + using compute follows data rules. + ExecutionQueueInferenceError: If the queue could not be inferred + using the dpctl queue manager. + """ + + # FIXME: The args parameter is not needed once numpy support is removed + + # Needed as USMNdArray derives from Array + array_argnums = [ + i + for i, _ in enumerate(args) + if isinstance(argtypes[i], NpArrayType) + and not isinstance(argtypes[i], USMNdArray) + ] + usmarray_argnums = [ + i for i, _ in enumerate(args) if isinstance(argtypes[i], USMNdArray) + ] + + # if usm and non-usm array arguments are getting mixed, then the + # execution queue cannot be inferred using compute follows data rules. + if array_argnums and usmarray_argnums: + raise ComputeFollowsDataInferenceError( + array_argnums, usmarray_argnum_list=usmarray_argnums + ) + elif array_argnums and not usmarray_argnums: + raise ExecutionQueueInferenceError(self.kernel_name) + elif usmarray_argnums and not array_argnums: + usm_array_args = [ + argtype + for i, argtype in enumerate(argtypes) + if i in usmarray_argnums + ] + + queue = self._chk_compute_follows_data_compliance(usm_array_args) + + if not queue: + raise ComputeFollowsDataInferenceError( + self.kernel_name, usmarray_argnum_list=usmarray_argnums + ) + + return queue + else: + raise ExecutionQueueInferenceError(self.kernel_name) + + def __getitem__(self, args): + """Mimic's ``numba.cuda`` square-bracket notation for configuring the + global_range and local_range settings when launching a kernel on a + SYCL queue. + + When a Python function decorated with the @kernel decorator, + is invoked it creates a KernelLauncher object. Calling the + KernelLauncher objects ``__getitem__`` function inturn clones the object + and sets the ``global_range`` and optionally the ``local_range`` + attributes with the arguments passed to ``__getitem__``. + + Args: + args (tuple): A tuple of tuples that specify the global and + optionally the local range for the kernel execution. If the + argument is a two-tuple of tuple, then it is assumed that both + global and local range options are specified. The first entry is + considered to be the global range and the second the local range. + + If only a single tuple value is provided, then the kernel is + launched with only a global range and the local range configuration + is decided by the SYCL runtime. + + Returns: + KernelLauncher: A clone of the KernelLauncher object, but with the + global_range and local_range attributes initialized. + + .. deprecated:: 0.19 + """ + + # if not (isinstance(args, Range) or isinstance(args, NdRange)): + # warn( + # "Setting ranges directly in the [] (__getitem__) method " + # + "is deprecated. The index space for the kernel should be " + # + "specified via a Range or an NdRange instance.", + # DeprecationWarning, + # stacklevel=2, + # ) + + # Check if an int is passed, if so the only global range from int + + # If it is a tuple and the individual elements of the tuple are not + # tuples, then treat it as a multi-dimensional global range + + # If it is a tuple and each element is a tuple then length of the tuple + # needs to be two. First element of the tuple is the global range + # the next is the local range. + + nargs = len(args) + # Check if the kernel launch arguments are sane. + if nargs < 1: + raise UnknownGlobalRangeError(kernel_name=self.kernel_name) + elif nargs > 2: + raise InvalidKernelLaunchArgsError( + kernel_name=self.kernel_name, args=args + ) + self._global_range = args[0] + if nargs == 2 and args[1] != []: + self._local_range = args[1] + else: + self._local_range = None + + return copy.copy(self) + + def _get_ranges(self, global_range, local_range, device): + """Helper to get the global and local range values needed to launch a + kernel. + + The global and local range arguments can either be provided using the + __getitem__ method or as keyword arguments to the __call__ method. + The function verifies that the range values are specified using at least + one of the method. + + Args: + global_range (list or tuple): The global range to be used for kernel + launch. + local_range (list or tuple): The local range to be used for kernel + launch. + device (dpctl.SyclDevice): The device on which to launch the kernel. + + Raises: + UnknownGlobalRangeError: When no global range was specified for + kernel launch. + """ + if global_range: + if self._global_range: + warn( + "Ignoring the previously set value of global_range and " + + "using the value specified at the kernel call site." + ) + else: + if self._global_range: + warn( + "Use of __getitem__ to set the global_range attribute is " + + 'deprecated. Use the keyword argument "global_range" ' + + "when calling the kernel to specify the global range." + ) + global_range = self._global_range + else: + raise UnknownGlobalRangeError(self.kernel_name) + + if local_range: + if self._local_range: + warn( + "Ignoring the previously set value of local_range and " + + "using the value specified at the kernel call site.." + ) + else: + if self._local_range: + warn( + "Use of __getitem__ to set the local_range attribute is " + + 'deprecated. Use the keyword argument "local_range" ' + + "when calling the kernel to specify the local range." + ) + local_range = self._local_range + else: + local_range = None + warn( + "Kernel to be submitted without a local range letting " + + "the SYCL runtime select a local range. The behavior " + + "can lead to suboptimal performance in certain cases. " + + "Consider setting the local range value for the kernel " + + "execution.\n" + + "The local_range keyword may be made a required argument " + + "in the future when calling a kernel." + ) + + if isinstance(global_range, int): + global_range = [global_range] + + # If only global range value is provided, then the kernel is invoked + # over an N-dimensional index space defined by a SYCL range, where + # N is one, two or three. + # If both local and global range values are specified the kernel is + # invoked using a SYCL nd_range + + if global_range and not local_range: + self._check_range(global_range, device) + # FIXME:[::-1] is done as OpenCL and SYCl have different orders when + # it comes to specifying dimensions. + global_range = list(global_range)[::-1] + else: + if isinstance(local_range, int): + local_range = [local_range] + self._check_ndrange( + global_range=global_range, + local_range=local_range, + device=device, + ) + global_range = list(global_range)[::-1] + local_range = list(local_range)[::-1] + + return (global_range, local_range) + + def __call__(self, *args): + """Functor to launch a kernel. + + Args: + args: List of arguments passed to the kernel + """ + argtypes = [self.typingctx.resolve_argument_type(arg) for arg in args] + # FIXME: For specialized and ahead of time compiled and cached kernels, + # the CFD check was already done statically. The run-time check is + # redundant. We should avoid these checks for the specialized case. + exec_queue = self._determine_kernel_launch_queue(args, argtypes) + backend = exec_queue.backend + + if exec_queue.backend not in [ + dpctl.backend_type.opencl, + dpctl.backend_type.level_zero, + ]: + raise UnsupportedBackendError( + self.kernel_name, backend, JitKernel._supported_backends + ) + + # load the kernel from cache + key = build_key( + tuple(argtypes), + self.pyfunc, + dpex_target.target_context.codegen(), + ) + + # If the JitKernel was specialized then raise exception if argtypes + # do not match one of the specialized versions. + if self._has_specializations: + artifact = self._specialization_cache.get(key) + if artifact is not None: + device_driver_ir_module, kernel_module_name = artifact + else: + raise MissingSpecializationError(self.kernel_name, argtypes) + else: + artifact = self._cache.get(key) + # if the kernel was not previously cached, compile it. + if artifact is not None: + device_driver_ir_module, kernel_module_name = artifact + self._cache_hits += 1 + else: + ( + device_driver_ir_module, + kernel_module_name, + ) = self._compile_and_cache( + argtypes=argtypes, + cache=self._cache, + ) + + # create a sycl::KernelBundle + kernel_bundle = dpctl_prog.create_program_from_spirv( + exec_queue, + device_driver_ir_module, + " ".join(self._create_sycl_kernel_bundle_flags), + ) + # get the sycl::kernel + sycl_kernel = kernel_bundle.get_sycl_kernel(kernel_module_name) + + packer = Packer( + kernel_name=self.kernel_name, + arg_list=args, + argty_list=argtypes, + ) + + exec_queue.submit( + sycl_kernel, + packer.unpacked_args, + self._global_range, + self._local_range, + ) + + exec_queue.wait() diff --git a/numba_dpex/core/kernel_interface/func.py b/numba_dpex/core/kernel_interface/func.py new file mode 100644 index 0000000000..42fe2e19c1 --- /dev/null +++ b/numba_dpex/core/kernel_interface/func.py @@ -0,0 +1,105 @@ +# SPDX-FileCopyrightText: 2022 Intel Corporation +# +# SPDX-License-Identifier: Apache-2.0 + +"""_summary_ +""" + + +from numba.core.typing.templates import AbstractTemplate, ConcreteTemplate + +from numba_dpex.core.compiler import compile_with_dpex +from numba_dpex.core.descriptor import dpex_target + + +def compile_func(pyfunc, return_type, args, debug=None): + cres = compile_with_dpex( + pyfunc=pyfunc, + pyfunc_name=pyfunc.__name__, + return_type=return_type, + target_context=dpex_target.target_context, + typing_context=dpex_target.typing_context, + args=args, + is_kernel=False, + debug=debug, + ) + func = cres.library.get_function(cres.fndesc.llvm_func_name) + cres.target_context.mark_ocl_device(func) + devfn = DpexFunction(cres) + + class _function_template(ConcreteTemplate): + key = devfn + cases = [cres.signature] + + cres.typing_context.insert_user_function(devfn, _function_template) + libs = [cres.library] + cres.target_context.insert_user_function(devfn, cres.fndesc, libs) + return devfn + + +def compile_func_template(pyfunc, debug=None): + """Compile a DpexFunctionTemplate""" + + dft = DpexFunctionTemplate(pyfunc, debug=debug) + + class _function_template(AbstractTemplate): + key = dft + + def generic(self, args, kws): + if kws: + raise AssertionError("No keyword arguments allowed.") + return dft.compile(args) + + dpex_target.typing_context.insert_user_function(dft, _function_template) + return dft + + +class DpexFunctionTemplate(object): + """Unmaterialized dpex function""" + + def __init__(self, pyfunc, debug=None): + self.py_func = pyfunc + self.debug = debug + self._compileinfos = {} + + def compile(self, args): + """Compile a dpex.func decorated Python function with the given + argument types. + + Each signature is compiled once by caching the compiled function inside + this object. + """ + if args not in self._compileinfos: + cres = compile_with_dpex( + pyfunc=self.py_func, + pyfunc_name=self.py_func.__name__, + return_type=None, + target_context=dpex_target.target_context, + typing_context=dpex_target.typing_context, + args=args, + is_kernel=False, + debug=self.debug, + ) + func = cres.library.get_function(cres.fndesc.llvm_func_name) + cres.target_context.mark_ocl_device(func) + first_definition = not self._compileinfos + self._compileinfos[args] = cres + libs = [cres.library] + + if first_definition: + # First definition + cres.target_context.insert_user_function( + self, cres.fndesc, libs + ) + else: + cres.target_context.add_user_function(self, cres.fndesc, libs) + + else: + cres = self._compileinfos[args] + + return cres.signature + + +class DpexFunction(object): + def __init__(self, cres): + self.cres = cres diff --git a/numba_dpex/core/kernel_interface/indexers.py b/numba_dpex/core/kernel_interface/indexers.py new file mode 100644 index 0000000000..098702460b --- /dev/null +++ b/numba_dpex/core/kernel_interface/indexers.py @@ -0,0 +1,65 @@ +# SPDX-FileCopyrightText: 2022 Intel Corporation +# +# SPDX-License-Identifier: Apache-2.0 + + +class Range: + """Defines an 1,2,or 3 dimensional index space over which a kernel is + executed. + + The Range class is analogous to SYCL's ``sycl::range`` class. + """ + + def __init__(self, dim0, dim1=None, dim2=None): + self._dim0 = dim0 + self._dim1 = dim1 + self._dim2 = dim2 + + if not self._dim0: + raise ValueError + + if self._dim2 and not self._dim1: + raise ValueError + + if not isinstance(self._dim0, int): + raise ValueError + + if self._dim1 and not isinstance(self._dim1, int): + raise ValueError + + if self._dim2 and not isinstance(self._dim2, int): + raise ValueError + + def get(self, dim): + if not isinstance(dim, int): + raise ValueError + + if dim == 0: + return self._dim0 + elif dim == 1: + return self._dim1 + elif dim == 2: + return self._dim3 + else: + raise ValueError + + def size(self): + size = self._dim0 + if self._dim1: + size *= self._dim1 + if self._dim2: + size *= self._dim2 + + return size + + def rank(self): + rank = 1 + + # We already checked in init that if dim2 is set that dim1 has + # to be set as well + if self._dim1: + rank += 1 + elif self._dim2: + rank += 1 + + return rank diff --git a/numba_dpex/core/kernel_interface/kernel_base.py b/numba_dpex/core/kernel_interface/kernel_base.py new file mode 100644 index 0000000000..b0dd31f5b3 --- /dev/null +++ b/numba_dpex/core/kernel_interface/kernel_base.py @@ -0,0 +1,60 @@ +# SPDX-FileCopyrightText: 2022 Intel Corporation +# +# SPDX-License-Identifier: Apache-2.0 + +import abc + + +class KernelInterface(metaclass=abc.ABCMeta): + """An interface for compute kernel that was generated either from a + Python function object or as a Numba IR FunctionType object. + + Args: + metaclass (optional): The interface is derived from abc.ABCMeta. + + Raises: + NotImplementedError: The interface does not implement any of the + methods and subclasses are required to implement them. + """ + + @classmethod + def __subclasshook__(cls, subclass): + return hasattr( + (subclass, "llvm_module") + and hasattr(subclass, "device_driver_ir_module") + and hasattr(subclass, "pyfunc_name") + and hasattr(subclass, "module_name") + and hasattr(subclass, "compile") + and callable(subclass.compile) + ) + + # TODO Add a property for argtypes + + @property + @abc.abstractmethod + def llvm_module(self): + """The LLVM IR Module corresponding to the Kernel instance.""" + raise NotImplementedError + + @property + @abc.abstractmethod + def device_driver_ir_module(self): + """The module in a device IR (such as SPIR-V or PTX) format.""" + raise NotImplementedError + + @property + @abc.abstractmethod + def pyfunc_name(self): + """The Python function name corresponding to the Kernel instance.""" + raise NotImplementedError + + @property + @abc.abstractmethod + def module_name(self): + """The LLVM module name for the compiled kernel.""" + raise NotImplementedError + + @abc.abstractmethod + def compile(self, target_ctx, typing_ctx, args, debug, compile_flags): + """Abstract method to compile a Kernel instance.""" + raise NotImplementedError diff --git a/numba_dpex/core/kernel_interface/spirv_kernel.py b/numba_dpex/core/kernel_interface/spirv_kernel.py new file mode 100644 index 0000000000..932f4edd9a --- /dev/null +++ b/numba_dpex/core/kernel_interface/spirv_kernel.py @@ -0,0 +1,149 @@ +# SPDX-FileCopyrightText: 2022 Intel Corporation +# +# SPDX-License-Identifier: Apache-2.0 + +import logging +from types import FunctionType + +from numba.core import ir + +from numba_dpex import spirv_generator +from numba_dpex.core.compiler import compile_with_dpex +from numba_dpex.core.exceptions import UncompiledKernelError, UnreachableError + +from .kernel_base import KernelInterface + + +class SpirvKernel(KernelInterface): + def __init__(self, func, func_name) -> None: + """Represents a SPIR-V module compiled for a Python function. + + Args: + func: The function to be compiled. Can be a Python function or a + Numba IR object representing a function. + func_name (str): Name of the function being compiled + + Raises: + UnreachableError: An internal error indicating an unexpected code + path was executed. + """ + self._llvm_module = None + self._device_driver_ir_module = None + self._module_name = None + self._pyfunc_name = func_name + self._func = func + if isinstance(func, FunctionType): + self._func_ty = FunctionType + elif isinstance(func, ir.FunctionIR): + self._func_ty = ir.FunctionIR + else: + raise UnreachableError() + self._target_context = None + + @property + def llvm_module(self): + """The LLVM IR Module corresponding to the Kernel instance.""" + if self._llvm_module: + return self._llvm_module + else: + raise UncompiledKernelError(self._pyfunc_name) + + @property + def device_driver_ir_module(self): + """The module in a device IR (such as SPIR-V or PTX) format.""" + if self._device_driver_ir_module: + return self._device_driver_ir_module + else: + raise UncompiledKernelError(self._pyfunc_name) + + @property + def pyfunc_name(self): + """The Python function name corresponding to the kernel.""" + return self._pyfunc_name + + @property + def module_name(self): + """The name of the compiled LLVM module for the kernel.""" + if self._module_name: + return self._module_name + else: + raise UncompiledKernelError(self._pyfunc_name) + + @property + def target_context(self): + """Returns the target context that was used to compile the kernel. + + Raises: + UncompiledKernelError: If the kernel was not yet compiled. + + Returns: + target context used to compile the kernel + """ + if self._target_context: + return self._target_context + else: + raise UncompiledKernelError(self._pyfunc_name) + + @property + def typing_context(self): + """Returns the typing context that was used to compile the kernel. + + Raises: + UncompiledKernelError: If the kernel was not yet compiled. + + Returns: + typing context used to compile the kernel + """ + if self._typing_context: + return self._typing_context + else: + raise UncompiledKernelError(self._pyfunc_name) + + def compile( + self, + target_ctx, + typing_ctx, + args, + debug, + compile_flags, + ): + """Compiles a kernel using numba_dpex.core.compiler.Compiler. + + Args: + args (_type_): _description_ + debug (_type_): _description_ + compile_flags (_type_): _description_ + """ + + logging.debug("compiling SpirvKernel with arg types", args) + + self._target_context = target_ctx + self._typing_context = typing_ctx + + cres = compile_with_dpex( + self._func, + self._pyfunc_name, + args=args, + return_type=None, + debug=debug, + is_kernel=True, + typing_context=typing_ctx, + target_context=target_ctx, + extra_compile_flags=compile_flags, + ) + + func = cres.library.get_function(cres.fndesc.llvm_func_name) + kernel = cres.target_context.prepare_ocl_kernel( + func, cres.signature.args + ) + self._llvm_module = kernel.module.__str__() + self._module_name = kernel.name + + # FIXME: There is no need to serialize the bitcode. It can be passed to + # llvm-spirv directly via stdin. + + # FIXME: There is no need for spirv-dis. We cause use --to-text + # (or --spirv-text) to convert SPIRV to text + self._device_driver_ir_module = spirv_generator.llvm_to_spirv( + self._target_context, self._llvm_module, kernel.module.as_bitcode() + ) diff --git a/numba_dpex/core/passes/lowerer.py b/numba_dpex/core/passes/lowerer.py index d48926d265..b12cf31f5b 100644 --- a/numba_dpex/core/passes/lowerer.py +++ b/numba_dpex/core/passes/lowerer.py @@ -11,6 +11,7 @@ from collections import OrderedDict import dpctl +import dpctl.program as dpctl_prog import numba import numpy as np from numba.core import compiler, funcdesc, ir, lowering, sigutils, types @@ -44,6 +45,7 @@ import numba_dpex as dpex from numba_dpex import config +from numba_dpex.core.descriptor import dpex_target from numba_dpex.core.target import DpexTargetContext from numba_dpex.core.types import Array from numba_dpex.dpctl_iface import KernelLaunchOps @@ -52,6 +54,57 @@ from .dufunc_inliner import dufunc_inliner +def _compile_kernel_parfor( + sycl_queue, kernel_name, func_ir, args, args_with_addrspaces, debug=None +): + # We only accept numba_dpex.core.types.Array type + for arg in args_with_addrspaces: + if isinstance(arg, types.npytypes.Array) and not isinstance(arg, Array): + raise TypeError( + "Only numba_dpex.core.types.Array objects are supported as " + + "kernel arguments. Received %s" % (type(arg)) + ) + if config.DEBUG: + print("compile_kernel_parfor", args) + for a in args_with_addrspaces: + print(a, type(a)) + if isinstance(a, types.npytypes.Array): + print("addrspace:", a.addrspace) + + # Create a SPIRVKernel object + kernel = dpex.core.kernel_interface.spirv_kernel.SpirvKernel( + func_ir, kernel_name + ) + + # compile the kernel + kernel.compile( + args=args_with_addrspaces, + typing_ctx=dpex_target.typing_context, + target_ctx=dpex_target.target_context, + debug=debug, + compile_flags=None, + ) + + # Compile a SYCL Kernel object rom the SPIRVKernel + + dpctl_create_program_from_spirv_flags = [] + + if debug or config.OPT == 0: + # if debug is ON we need to pass additional flags to igc. + dpctl_create_program_from_spirv_flags = ["-g", "-cl-opt-disable"] + + # create a program + kernel_bundle = dpctl_prog.create_program_from_spirv( + sycl_queue, + kernel.device_driver_ir_module, + " ".join(dpctl_create_program_from_spirv_flags), + ) + # create a kernel + sycl_kernel = kernel_bundle.get_sycl_kernel(kernel.module_name) + + return sycl_kernel + + def _print_block(block): for i, inst in enumerate(block.body): print(" ", i, inst) @@ -268,13 +321,9 @@ def _create_gufunc_for_parfor_body( lowerer, parfor, typemap, - typingctx, - targetctx, flags, loop_ranges, - locals, has_aliases, - index_var_typ, races, ): """ @@ -656,8 +705,9 @@ def print_arg_with_addrspaces(args): print("after DUFunc inline".center(80, "-")) gufunc_ir.dump() - kernel_func = dpex.compiler.compile_kernel_parfor( + sycl_kernel = _compile_kernel_parfor( dpctl.get_current_queue(), + gufunc_name, gufunc_ir, gufunc_param_types, param_types_addrspaces, @@ -669,7 +719,7 @@ def print_arg_with_addrspaces(args): if config.DEBUG_ARRAY_OPT: print("kernel_sig = ", kernel_sig) - return kernel_func, parfor_args, kernel_sig, func_arg_types, setitems + return sycl_kernel, parfor_args, kernel_sig, func_arg_types, setitems def _lower_parfor_gufunc(lowerer, parfor): @@ -762,13 +812,9 @@ def _lower_parfor_gufunc(lowerer, parfor): lowerer, parfor, typemap, - typingctx, - targetctx, flags, loop_ranges, - {}, bool(alias_map), - index_var_typ, parfor.races, ) finally: @@ -893,7 +939,7 @@ def bump_alpha(c, class_map): def generate_kernel_launch_ops( lowerer, - cres, + kernel, gu_signature, outer_sig, expr_args, @@ -930,7 +976,7 @@ def generate_kernel_launch_ops( print("modified_arrays", modified_arrays) # get dpex_cpu_portion_lowerer object - kernel_launcher = KernelLaunchOps(lowerer, cres.kernel, num_inputs) + kernel_launcher = KernelLaunchOps(lowerer, kernel, num_inputs) # Get a pointer to the current queue curr_queue = kernel_launcher.get_current_queue() @@ -977,7 +1023,7 @@ def val_type_or_none(context, lowerer, x): ] all_args = [loadvar_or_none(lowerer, x) for x in expr_args[:ninouts]] - keep_alive_kernels.append(cres) + keep_alive_kernels.append(kernel) # Call clSetKernelArg for each arg and create arg array for # the enqueue function. Put each part of each argument into @@ -1065,7 +1111,7 @@ def relatively_deep_copy(obj, memo): from numba.core.typing.templates import Signature from numba.np.ufunc.dufunc import DUFunc - from numba_dpex.compiler import DpexFunctionTemplate + from numba_dpex.core.kernel_interface.func import DpexFunctionTemplate # objects which shouldn't or can't be copied and it's ok not to copy it. if isinstance( @@ -1198,14 +1244,15 @@ def relatively_deep_copy(obj, memo): memo[obj_id] = cpy return cpy - # some python objects are not copyable. In such case exception would be raised - # it is just a convinient point to find such objects + # some python objects are not copyable. In such case exception would be + # raised it is just a convinient point to find such objects try: cpy = copy.copy(obj) except Exception as e: raise e - # __slots__ for subclass specify only members declared in subclass. So to get all members we need to go through + # __slots__ for subclass specify only members declared in subclass. So to + # get all members we need to go through # all supeclasses def get_slots_members(obj): keys = [] diff --git a/numba_dpex/core/target.py b/numba_dpex/core/target.py index 7afb88efc6..f594fc4f64 100644 --- a/numba_dpex/core/target.py +++ b/numba_dpex/core/target.py @@ -9,20 +9,23 @@ from llvmlite import ir as llvmir from llvmlite.llvmpy import core as lc from numba import typeof -from numba.core import cgutils, datamodel, types, typing, utils +from numba.core import cgutils, types, typing, utils from numba.core.base import BaseContext from numba.core.callconv import MinimalCallConv from numba.core.registry import cpu_target from numba.core.target_extension import GPU, target_registry +from numba.core.types import Array as NpArrayType from numba.core.utils import cached_property from numba_dpex.core.datamodel.models import _init_data_model_manager +from numba_dpex.core.exceptions import UnsupportedKernelArgumentError +from numba_dpex.core.typeconv import to_usm_ndarray +from numba_dpex.core.types import USMNdArray +from numba_dpex.core.utils import get_info_from_suai from numba_dpex.utils import ( address_space, calling_conv, - has_usm_memory, npytypes_array_to_dpex_array, - suai_to_dpex_array, ) from .. import codegen @@ -67,21 +70,26 @@ def resolve_argument_type(self, val): """ try: - _type = type(typeof(val)) + type(typeof(val)) except ValueError: - # For arbitrary array that is not recognized by Numba, - # we will end up in this path. We check if the array - # has __sycl_usm_array_interface__ attribute. If yes, - # we create the necessary Numba type to represent it - # and send it back. - if has_usm_memory(val) is not None: - return suai_to_dpex_array(val) - - if _type is types.npytypes.Array: - # Convert npytypes.Array to numba_dpex.core.types.Array - return npytypes_array_to_dpex_array(typeof(val)) - else: - return super().resolve_argument_type(val) + # When an array-like kernel argument is not recognized by + # numba-dpex, this additional check sees if the array-like object + # implements the __sycl_usm_array_interface__ protocol. For such + # cases, we treat the object as an UsmNdArray type. + try: + suai_attrs = get_info_from_suai(val) + return to_usm_ndarray(suai_attrs) + except Exception: + raise UnsupportedKernelArgumentError( + type=str(type(val)), value=val + ) + + if isinstance(typeof(val), NpArrayType) and not isinstance( + typeof(val), USMNdArray + ): + raise UnsupportedKernelArgumentError(type=str(type(val)), value=val) + + return super().resolve_argument_type(val) def load_additional_registries(self): """Register the OpenCL API and math and other functions.""" diff --git a/numba_dpex/core/typeconv/__init__.py b/numba_dpex/core/typeconv/__init__.py new file mode 100644 index 0000000000..61bb97a839 --- /dev/null +++ b/numba_dpex/core/typeconv/__init__.py @@ -0,0 +1,7 @@ +# SPDX-FileCopyrightText: 2020 - 2023 Intel Corporation +# +# SPDX-License-Identifier: Apache-2.0 + +from .array_conversion import to_usm_ndarray + +__all__ = ["to_usm_ndarray"] diff --git a/numba_dpex/core/typeconv/array_conversion.py b/numba_dpex/core/typeconv/array_conversion.py new file mode 100644 index 0000000000..5096045a90 --- /dev/null +++ b/numba_dpex/core/typeconv/array_conversion.py @@ -0,0 +1,46 @@ +# SPDX-FileCopyrightText: 2020 - 2023 Intel Corporation +# +# SPDX-License-Identifier: Apache-2.0 + +from numba.np import numpy_support + +from numba_dpex.core.types import USMNdArray +from numba_dpex.core.utils import get_info_from_suai +from numba_dpex.utils.constants import address_space + + +def to_usm_ndarray(suai_attrs, addrspace=address_space.GLOBAL): + """Converts an array-like object that has the _sycl_usm_array_interface__ + attribute to numba_dpex.types.UsmNdArray. + + Args: + suai_attrs: The extracted SUAI information for an array-like object. + addrspace: Address space this array is allocated in. + + Returns: The Numba type for SUAI array. + + Raises: + NotImplementedError: If the dtype of the passed array is not supported. + """ + try: + dtype = numpy_support.from_dtype(suai_attrs.dtype) + except NotImplementedError: + raise ValueError("Unsupported array dtype: %s" % (dtype,)) + + # If converting from an unknown array-like object that implements + # __sycl_usm_array_interface__, the layout is always hard-coded to + # C-contiguous. + layout = "C" + + return USMNdArray( + dtype=dtype, + ndim=suai_attrs.dimensions, + layout=layout, + usm_type=suai_attrs.usm_type, + device=suai_attrs.device, + queue=suai_attrs.queue, + readonly=not suai_attrs.is_writable, + name=None, + aligned=True, + addrspace=addrspace, + ) diff --git a/numba_dpex/core/types/usm_ndarray_type.py b/numba_dpex/core/types/usm_ndarray_type.py index e156d7a5c7..b2b8cdd24b 100644 --- a/numba_dpex/core/types/usm_ndarray_type.py +++ b/numba_dpex/core/types/usm_ndarray_type.py @@ -2,12 +2,15 @@ # # SPDX-License-Identifier: Apache-2.0 +"""A type class to represent dpctl.tensor.usm_ndarray type in Numba +""" + +import dpctl import dpctl.tensor from numba.core.typeconv import Conversion from numba.core.types.npytypes import Array -"""A type class to represent dpctl.tensor.usm_ndarray type in Numba -""" +from numba_dpex.utils import address_space class USMNdArray(Array): @@ -18,25 +21,46 @@ def __init__( dtype, ndim, layout, - usm_type, - device, + usm_type="unknown", + device="unknown", + queue=None, readonly=False, name=None, aligned=True, - addrspace=None, + addrspace=address_space.GLOBAL, ): self.usm_type = usm_type - self.device = device self.addrspace = addrspace + # Normalize the device filter string and get the fully qualified three + # tuple (backend:device_type:device_num) filter string from dpctl. + if device != "unknown": + _d = dpctl.SyclDevice(device) + self.device = _d.filter_string + else: + self.device = "unknown" + + self.queue = queue + if name is None: type_name = "usm_ndarray" if readonly: type_name = "readonly " + type_name if not aligned: type_name = "unaligned " + type_name - name_parts = (type_name, dtype, ndim, layout, usm_type, device) - name = "%s(%s, %sd, %s, %s, %s)" % name_parts + name_parts = ( + type_name, + dtype, + ndim, + layout, + self.addrspace, + usm_type, + self.device, + ) + name = ( + "%s(dtype=%s, ndim=%s, layout=%s, address_space=%s, " + "usm_type=%s, sycl_device=%s)" % name_parts + ) super().__init__( dtype, @@ -86,8 +110,16 @@ def unify(self, typingctx, other): """ Unify this with the *other* USMNdArray. """ - # If other is array and the ndim matches - if isinstance(other, USMNdArray) and other.ndim == self.ndim: + # If other is array and the ndim, usm_type, address_space, and device + # attributes match + + if ( + isinstance(other, USMNdArray) + and other.ndim == self.ndim + and self.device == other.device + and self.addrspace == other.addrspace + and self.usm_type == other.usm_type + ): # If dtype matches or other.dtype is undefined (inferred) if other.dtype == self.dtype or not other.dtype.is_precise(): if self.layout == other.layout: @@ -102,6 +134,9 @@ def unify(self, typingctx, other): layout=layout, readonly=readonly, aligned=aligned, + usm_type=self.usm_type, + device=self.device, + addrspace=self.addrspace, ) def can_convert_to(self, typingctx, other): diff --git a/numba_dpex/core/typing/typeof.py b/numba_dpex/core/typing/typeof.py index 144acfa5de..3293db3553 100644 --- a/numba_dpex/core/typing/typeof.py +++ b/numba_dpex/core/typing/typeof.py @@ -62,5 +62,6 @@ def typeof_usm_ndarray(val, c): readonly=readonly, usm_type=usm_type, device=device, + queue=val.sycl_queue, addrspace=address_space.GLOBAL, ) diff --git a/numba_dpex/core/utils/__init__.py b/numba_dpex/core/utils/__init__.py new file mode 100644 index 0000000000..78bf969d57 --- /dev/null +++ b/numba_dpex/core/utils/__init__.py @@ -0,0 +1,10 @@ +# SPDX-FileCopyrightText: 2020 - 2022 Intel Corporation +# +# SPDX-License-Identifier: Apache-2.0 + +from .suai_helper import SyclUSMArrayInterface, get_info_from_suai + +__all__ = [ + "get_info_from_suai", + "SyclUSMArrayInterface", +] diff --git a/numba_dpex/core/utils/suai_helper.py b/numba_dpex/core/utils/suai_helper.py new file mode 100644 index 0000000000..4dd40880d9 --- /dev/null +++ b/numba_dpex/core/utils/suai_helper.py @@ -0,0 +1,155 @@ +# SPDX-FileCopyrightText: 2020 - 2022 Intel Corporation +# +# SPDX-License-Identifier: Apache-2.0 + +import logging + +import dpctl +import dpctl.memory as dpctl_mem +import numpy as np + + +class SyclUSMArrayInterface: + """Stores as attributes the information extracted from a + __sycl_usm_array_interface__ dictionary as defined by dpctl.memory.Memory* + classes. + """ + + def __init__( + self, + data, + writable, + size, + shape, + dimensions, + itemsize, + strides, + dtype, + usm_type, + device, + queue, + ): + self._data = data + self._data_writeable = writable + self._size = size + self._shape = shape + self._dimensions = dimensions + self._itemsize = itemsize + self._strides = strides + self._dtype = dtype + self._usm_type = usm_type + self._device = device + self._queue = queue + + @property + def data(self): + return self._data + + @property + def is_writable(self): + return self._data_writeable + + @property + def size(self): + return self._size + + @property + def shape(self): + return self._shape + + @property + def dimensions(self): + return self._dimensions + + @property + def itemsize(self): + return self._itemsize + + @property + def strides(self): + return self._strides + + @property + def dtype(self): + return self._dtype + + @property + def usm_type(self): + return self._usm_type + + @property + def device(self): + return self._device + + @property + def queue(self): + return self._queue + + +def get_info_from_suai(obj): + """ + Extracts the metadata of an object of type UsmNdArray using the objects + __sycl_usm_array_interface__ (SUAI) attribute. + + The ``dpctl.memory.as_usm_memory`` function converts the array-like + object into a dpctl.memory.USMMemory object. Using the ``as_usm_memory`` + is an implicit way to verify if the array-like object is a legal + SYCL USM memory back Python object that can be passed to a dpex kernel. + + Args: + obj: array-like object with a SUAI attribute. + + Returns: + A SyclUSMArrayInterface object + + """ + + # dpctl.as_usm_memory validated if an array-like object, obj, has a well + # defined __sycl_usm_array_interface__ dictionary and converts it into a + # dpctl.memory.Memory* object. + try: + usm_mem = dpctl_mem.as_usm_memory(obj) + except Exception as e: + logging.exception( + "Array like object with __sycl_usm_array_interface__ could not be " + "converted to a dpctl.memory.Memory* object." + ) + raise e + + # The data attribute of __sycl_usm_array_interface__ is a 2-tuple. + # The first element is the data pointer and the second a boolean + # value indicating if the data is writable. + is_writable = usm_mem.__sycl_usm_array_interface__["data"][1] + + shape = obj.__sycl_usm_array_interface__["shape"] + total_size = np.prod(shape) + ndim = len(shape) + dtype = np.dtype(obj.__sycl_usm_array_interface__["typestr"]) + itemsize = dtype.itemsize + + strides = obj.__sycl_usm_array_interface__["strides"] + if strides is None: + strides = [1] * ndim + for i in reversed(range(1, ndim)): + strides[i - 1] = strides[i] * shape[i] + strides = tuple(strides) + + syclobj = usm_mem.sycl_queue + device = syclobj.sycl_device.filter_string + usm_type = usm_mem.get_usm_type() + + suai_info = SyclUSMArrayInterface( + data=usm_mem, + writable=is_writable, + size=total_size, + usm_type=usm_type, + device=device, + queue=syclobj, + shape=shape, + dimensions=ndim, + itemsize=itemsize, + strides=strides, + dtype=dtype, + ) + + return suai_info diff --git a/numba_dpex/decorators.py b/numba_dpex/decorators.py index fdfe224df5..0dce245976 100644 --- a/numba_dpex/decorators.py +++ b/numba_dpex/decorators.py @@ -2,74 +2,87 @@ # # SPDX-License-Identifier: Apache-2.0 -import dpctl -from numba.core import sigutils, types +import inspect -from numba_dpex.core.exceptions import KernelHasReturnValueError -from numba_dpex.utils import npytypes_array_to_dpex_array +from numba.core import sigutils, types -from .compiler import ( - JitKernel, +from numba_dpex.core.kernel_interface.dispatcher import JitKernel +from numba_dpex.core.kernel_interface.func import ( compile_func, compile_func_template, - get_ordered_arg_access_types, ) +from numba_dpex.utils import npytypes_array_to_dpex_array -def kernel(signature=None, access_types=None, debug=None): - """The decorator to write a numba_dpex kernel function. +def kernel( + func_or_sig=None, + access_types=None, + debug=None, + enable_cache=True, +): + """A decorator to define a kernel function. A kernel function is conceptually equivalent to a SYCL kernel function, and gets compiled into either an OpenCL or a LevelZero SPIR-V binary kernel. - A dpex kernel imposes the following restrictions: + A kernel decorated Python function has the following restrictions: - * A numba_dpex.kernel function can not return any value. - * All array arguments passed to a kernel should be of the same type - and have the same dtype. + * The function can not return any value. + * All array arguments passed to a kernel should adhere to compute + follows data programming model. """ - if signature is None: - return autojit(debug=debug, access_types=access_types) - elif not sigutils.is_signature(signature): - func = signature - return autojit(debug=debug, access_types=access_types)(func) - else: - return _kernel_jit(signature, debug, access_types) + def _kernel_dispatcher(pyfunc, sigs=None): -def autojit(debug=None, access_types=None): - def _kernel_autojit(pyfunc): - ordered_arg_access_types = get_ordered_arg_access_types( - pyfunc, access_types + return JitKernel( + pyfunc=pyfunc, + debug_flags=debug, + enable_cache=enable_cache, + specialization_sigs=sigs, ) - return JitKernel(pyfunc, debug, ordered_arg_access_types) - return _kernel_autojit - - -def _kernel_jit(signature, debug, access_types): - argtypes, rettype = sigutils.normalize_signature(signature) - argtypes = tuple( - [ - npytypes_array_to_dpex_array(ty) - if isinstance(ty, types.npytypes.Array) - else ty - for ty in argtypes - ] - ) - - def _wrapped(pyfunc): - current_queue = dpctl.get_current_queue() - ordered_arg_access_types = get_ordered_arg_access_types( - pyfunc, access_types + if func_or_sig is None: + return _kernel_dispatcher + elif isinstance(func_or_sig, str): + raise NotImplementedError( + "Specifying signatures as string is not yet supported by numba-dpex" ) - # We create an instance of JitKernel to make sure at call time - # we are going through the caching mechanism. - kernel = JitKernel(pyfunc, debug, ordered_arg_access_types) - # This will make sure we are compiling eagerly. - kernel.specialize(argtypes, current_queue) - return kernel - - return _wrapped + elif isinstance(func_or_sig, list) or sigutils.is_signature(func_or_sig): + # String signatures are not supported as passing usm_ndarray type as + # a string is not possible. Numba's sigutils relies on the type being + # available in Numba's types.__dpct__ and dpex types are not registered + # there yet. + if isinstance(func_or_sig, list): + for sig in func_or_sig: + if isinstance(sig, str): + raise NotImplementedError( + "Specifying signatures as string is not yet supported " + "by numba-dpex" + ) + # Specialized signatures can either be a single signature or a list. + # In case only one signature is provided convert it to a list + if not isinstance(func_or_sig, list): + func_or_sig = [func_or_sig] + + def _specialized_kernel_dispatcher(pyfunc): + + return JitKernel( + pyfunc=pyfunc, + debug_flags=debug, + enable_cache=enable_cache, + specialization_sigs=func_or_sig, + ) + + return _specialized_kernel_dispatcher + else: + func = func_or_sig + if not inspect.isfunction(func): + raise ValueError( + "Argument passed to the kernel decorator is neither a " + "function object, nor a signature. If you are trying to " + "specialize the kernel that takes a single argument, specify " + "the return type as void explicitly." + ) + return _kernel_dispatcher(func) def func(signature=None, debug=None): @@ -84,14 +97,6 @@ def func(signature=None, debug=None): def _func_jit(signature, debug=None): argtypes, restype = sigutils.normalize_signature(signature) - argtypes = tuple( - [ - npytypes_array_to_dpex_array(ty) - if isinstance(ty, types.npytypes.Array) - else ty - for ty in argtypes - ] - ) def _wrapped(pyfunc): return compile_func(pyfunc, restype, argtypes, debug=debug) diff --git a/numba_dpex/device_init.py b/numba_dpex/device_init.py index 3452ffdcb9..ad1ed3457a 100644 --- a/numba_dpex/device_init.py +++ b/numba_dpex/device_init.py @@ -33,6 +33,6 @@ from . import initialize from .core import target -from .decorators import autojit, func, kernel +from .decorators import func, kernel initialize.load_dpctl_sycl_interface() diff --git a/numba_dpex/examples/kernel/device_func.py b/numba_dpex/examples/kernel/device_func.py index 2ebc57537a..507ca0377f 100644 --- a/numba_dpex/examples/kernel/device_func.py +++ b/numba_dpex/examples/kernel/device_func.py @@ -10,7 +10,8 @@ N = 10 -# A device callable function that can be invoked from ``kernel`` and other device functions +# A device callable function that can be invoked from ``kernel`` and other +# device functions @ndpex.func def a_device_function(a): return a + 1 diff --git a/numba_dpex/examples/kernel/kernel_specialization.py b/numba_dpex/examples/kernel/kernel_specialization.py new file mode 100644 index 0000000000..5798b80488 --- /dev/null +++ b/numba_dpex/examples/kernel/kernel_specialization.py @@ -0,0 +1,135 @@ +# SPDX-FileCopyrightText: 2020 - 2022 Intel Corporation +# +# SPDX-License-Identifier: Apache-2.0 + +import logging + +import dpctl.tensor as dpt +import numpy as np + +import numba_dpex as dpex +from numba_dpex import float32, int64, usm_ndarray +from numba_dpex.core.exceptions import ( + InvalidKernelSpecializationError, + MissingSpecializationError, +) + +# Similar to Numba, numba-dpex supports eager compilation of functions. The +# following examples demonstrate the feature for numba_dpex.kernel and presents +# usage scenarios and current limitations. + +# ------------ Example 1. ------------ # + +# Define type specializations using the numba_dpex usm_ndarray data type. +i64arrty = usm_ndarray(int64, 1, "C", usm_type="device", device="0") +f32arrty = usm_ndarray(float32, 1, "C", usm_type="device", device="0") + + +# specialize a kernel for the i64arrty +@dpex.kernel((i64arrty, i64arrty, i64arrty)) +def data_parallel_sum(a, b, c): + """ + Vector addition using the ``kernel`` decorator. + """ + i = dpex.get_global_id(0) + c[i] = a[i] + b[i] + + +# run the specialized kernel +a = dpt.ones(1024, dtype=dpt.int64, device="0") +b = dpt.ones(1024, dtype=dpt.int64, device="0") +c = dpt.zeros(1024, dtype=dpt.int64, device="0") + +data_parallel_sum[ + 1024, +](a, b, c) + +npc = dpt.asnumpy(c) +npc_expected = np.full(1024, 2, dtype=np.int64) +assert np.array_equal(npc, npc_expected) + + +# ------------ Example 2. ------------ # + +# Multiple signatures can be specified as a list to eager compile multiple +# versions of the kernel. + +# specialize a kernel for the i64arrty +@dpex.kernel([(i64arrty, i64arrty, i64arrty), (f32arrty, f32arrty, f32arrty)]) +def data_parallel_sum2(a, b, c): + """ + Vector addition using the ``kernel`` decorator. + """ + i = dpex.get_global_id(0) + c[i] = a[i] + b[i] + + +# run the i64 specialized kernel +a = dpt.ones(1024, dtype=dpt.int64, device="0") +b = dpt.ones(1024, dtype=dpt.int64, device="0") +c = dpt.zeros(1024, dtype=dpt.int64, device="0") + +data_parallel_sum2[ + 1024, +](a, b, c) + +npc = dpt.asnumpy(c) +npc_expected = np.full(1024, 2, dtype=np.int64) +assert np.array_equal(npc, npc_expected) + +# run the f32 specialized kernel +a = dpt.ones(1024, dtype=dpt.float32, device="0") +b = dpt.ones(1024, dtype=dpt.float32, device="0") +c = dpt.zeros(1024, dtype=dpt.float32, device="0") + +data_parallel_sum2[ + 1024, +](a, b, c) + +npc = dpt.asnumpy(c) +npc_expected = np.full(1024, 2, dtype=np.float32) +assert np.array_equal(npc, npc_expected) + + +# ------------ Example 3. ------------ # + +# A specialized kernel cannot be jit compiled. Calling a specialized kernel +# with arguments having type different from the specialization will result in +# an MissingSpecializationError. + +a = dpt.ones(1024, dtype=dpt.int32) +b = dpt.ones(1024, dtype=dpt.int32) +c = dpt.zeros(1024, dtype=dpt.int32) + +try: + data_parallel_sum[ + 1024, + ](a, b, c) +except MissingSpecializationError as mse: + print(mse) + + +# ------------ Example 4. ------------ # + +# Numba_dpex does not support NumPy arrays as kernel arguments and all +# array arguments should be inferable as a numba_dpex.types.usm_ndarray. Trying +# to eager compile with a NumPy array-based signature will lead to an +# InvalidKernelSpecializationError + +try: + dpex.kernel((int64[::1], int64[::1], int64[::1])) +except InvalidKernelSpecializationError: + logging.exception() + + +# ------------ Limitations ------------ # + + +# Specifying signatures using strings is not yet supported. The limitation is +# due to numba_dpex relying on Numba's sigutils module to parse signatures. +# Sigutils only recognizes Numba types specified as strings. + +try: + dpex.kernel("(i64arrty, i64arrty, i64arrty)") +except NotImplementedError: + logging.exception() diff --git a/numba_dpex/offload_dispatcher.py b/numba_dpex/offload_dispatcher.py index 172607f813..8efb760adc 100644 --- a/numba_dpex/offload_dispatcher.py +++ b/numba_dpex/offload_dispatcher.py @@ -22,7 +22,7 @@ def __init__( pipeline_class=compiler.Compiler, ): if dpex_config.HAS_NON_HOST_DEVICE: - from numba_dpex.compiler import Compiler + from numba_dpex.core.compiler import Compiler targetoptions["parallel"] = True dispatcher.Dispatcher.__init__( diff --git a/numba_dpex/passbuilder.py b/numba_dpex/passbuilder.py deleted file mode 100644 index 999b356b11..0000000000 --- a/numba_dpex/passbuilder.py +++ /dev/null @@ -1,145 +0,0 @@ -# SPDX-FileCopyrightText: 2020 - 2022 Intel Corporation -# -# SPDX-License-Identifier: Apache-2.0 - -from numba.core.compiler_machinery import PassManager -from numba.core.typed_passes import ( - AnnotateTypes, - InlineOverloads, - IRLegalization, - NopythonRewrites, - NoPythonSupportedFeatureValidation, - NopythonTypeInference, - PreLowerStripPhis, -) -from numba.core.untyped_passes import ( - DeadBranchPrune, - FindLiterallyCalls, - FixupArgs, - GenericRewrites, - InlineClosureLikes, - InlineInlinables, - IRProcessing, - LiteralUnroll, - MakeFunctionToJitFunction, - ReconstructSSA, - RewriteSemanticConstants, - TranslateByteCode, - WithLifting, -) - -from numba_dpex.core.passes.passes import ( - ConstantSizeStaticLocalMemoryPass, - DpexLowering, - DumpParforDiagnostics, - NoPythonBackend, - ParforPass, - PreParforPass, -) -from numba_dpex.core.passes.rename_numpy_functions_pass import ( - RewriteNdarrayFunctionsPass, - RewriteOverloadedNumPyFunctionsPass, -) - - -class PassBuilder(object): - """ - This is a pass builder to run Intel GPU/CPU specific - code-generation and optimization passes. This pass builder does - not offer objectmode and interpreted passes. - """ - - @staticmethod - def default_numba_nopython_pipeline(state, pm): - """Adds the default set of NUMBA passes to the pass manager""" - if state.func_ir is None: - pm.add_pass(TranslateByteCode, "analyzing bytecode") - pm.add_pass(FixupArgs, "fix up args") - pm.add_pass(IRProcessing, "processing IR") - pm.add_pass(WithLifting, "Handle with contexts") - - # this pass rewrites name of NumPy functions we intend to overload - pm.add_pass( - RewriteOverloadedNumPyFunctionsPass, - "Rewrite name of Numpy functions to overload already overloaded function", - ) - - # Add pass to ensure when users are allocating static - # constant memory the size is a constant and can not - # come from a closure variable - pm.add_pass( - ConstantSizeStaticLocalMemoryPass, - "dpex constant size for static local memory", - ) - - # inline closures early in case they are using nonlocal's - # see issue #6585. - pm.add_pass( - InlineClosureLikes, "inline calls to locally defined closures" - ) - - # pre typing - if not state.flags.no_rewrites: - pm.add_pass(RewriteSemanticConstants, "rewrite semantic constants") - pm.add_pass(DeadBranchPrune, "dead branch pruning") - pm.add_pass(GenericRewrites, "nopython rewrites") - - # convert any remaining closures into functions - pm.add_pass( - MakeFunctionToJitFunction, - "convert make_function into JIT functions", - ) - # inline functions that have been determined as inlinable and rerun - # branch pruning, this needs to be run after closures are inlined as - # the IR repr of a closure masks call sites if an inlinable is called - # inside a closure - pm.add_pass(InlineInlinables, "inline inlinable functions") - if not state.flags.no_rewrites: - pm.add_pass(DeadBranchPrune, "dead branch pruning") - - pm.add_pass(FindLiterallyCalls, "find literally calls") - pm.add_pass(LiteralUnroll, "handles literal_unroll") - - if state.flags.enable_ssa: - pm.add_pass(ReconstructSSA, "ssa") - - # typing - pm.add_pass(NopythonTypeInference, "nopython frontend") - pm.add_pass(AnnotateTypes, "annotate types") - - pm.add_pass( - RewriteNdarrayFunctionsPass, - "Rewrite numpy.ndarray functions to dpnp.ndarray functions", - ) - - # strip phis - pm.add_pass(PreLowerStripPhis, "remove phis nodes") - - # optimisation - pm.add_pass(InlineOverloads, "inline overloaded functions") - - @staticmethod - def define_nopython_pipeline(state, name="dpex_nopython"): - """Returns an nopython mode pipeline based PassManager""" - pm = PassManager(name) - PassBuilder.default_numba_nopython_pipeline(state, pm) - - # Intel GPU/CPU specific optimizations - pm.add_pass(PreParforPass, "Preprocessing for parfors") - if not state.flags.no_rewrites: - pm.add_pass(NopythonRewrites, "nopython rewrites") - pm.add_pass(ParforPass, "convert to parfors") - - # legalise - pm.add_pass( - NoPythonSupportedFeatureValidation, - "ensure features that are in use are in a valid form", - ) - pm.add_pass(IRLegalization, "ensure IR is legal prior to lowering") - - # lower - pm.add_pass(DpexLowering, "Custom Lowerer with auto-offload support") - pm.add_pass(NoPythonBackend, "nopython mode backend") - pm.add_pass(DumpParforDiagnostics, "dump parfor diagnostics") - pm.finalize() - return pm diff --git a/numba_dpex/tests/integration/test_sycl_usm_array_iface_interop.py b/numba_dpex/tests/integration/test_sycl_usm_array_iface_interop.py index 143b992059..0c83486dfe 100644 --- a/numba_dpex/tests/integration/test_sycl_usm_array_iface_interop.py +++ b/numba_dpex/tests/integration/test_sycl_usm_array_iface_interop.py @@ -33,7 +33,7 @@ def test_kernel_valid_usm_obj(dtype): """Test if a ``numba_dpex.kernel`` function accepts a DuckUSMArray argument. The ``DuckUSMArray`` uses ``dpctl.memory`` to allocate a Python object that - defines a __sycl_usm_array__interface__ attribute. We test if + defines a ``__sycl_usm_array_interface__`` attribute. We test if ``numba_dpex`` recognizes the ``DuckUSMArray`` as a valid USM-backed Python object and accepts it as a kernel argument. diff --git a/numba_dpex/tests/kernel_tests/test_atomic_op.py b/numba_dpex/tests/kernel_tests/test_atomic_op.py index 1d568027d5..95e25d08f9 100644 --- a/numba_dpex/tests/kernel_tests/test_atomic_op.py +++ b/numba_dpex/tests/kernel_tests/test_atomic_op.py @@ -2,14 +2,13 @@ # # SPDX-License-Identifier: Apache-2.0 -import os - import dpctl import numpy as np import pytest import numba_dpex as dpex from numba_dpex import config +from numba_dpex.core.descriptor import dpex_target from numba_dpex.tests._helper import filter_strings, override_config global_size = 100 @@ -176,7 +175,6 @@ def skip_if_disabled(*args): return pytest.param(*args, marks=skip_NATIVE_FP_ATOMICS_0) -@pytest.mark.parametrize("filter_str", filter_strings) @skip_no_atomic_support @pytest.mark.parametrize( "NATIVE_FP_ATOMICS, expected_native_atomic_for_device", @@ -197,7 +195,6 @@ def skip_if_disabled(*args): ) @pytest.mark.parametrize("dtype", list_of_f_dtypes) def test_atomic_fp_native( - filter_str, NATIVE_FP_ATOMICS, expected_native_atomic_for_device, function_generator, @@ -206,20 +203,27 @@ def test_atomic_fp_native( dtype, ): function = function_generator(operator_name, dtype) - kernel = dpex.kernel(function) - argtypes = kernel._get_argtypes(np.array([0], dtype)) + kernel = dpex.core.kernel_interface.spirv_kernel.SpirvKernel( + function, function.__name__ + ) + args = [np.array([0], dtype)] + argtypes = [ + dpex.core.descriptor.dpex_target.typing_context.resolve_argument_type( + arg + ) + for arg in args + ] with override_config("NATIVE_FP_ATOMICS", NATIVE_FP_ATOMICS): - - with dpctl.device_context(filter_str) as sycl_queue: - - specialized_kernel = kernel[ - global_size, dpex.DEFAULT_LOCAL_SIZE - ].specialize(argtypes, sycl_queue) - - is_native_atomic = ( - expected_spirv_function in specialized_kernel.assembly - ) - assert is_native_atomic == expected_native_atomic_for_device( - filter_str - ) + kernel.compile( + args=argtypes, + debug=None, + compile_flags=None, + target_ctx=dpex_target.target_context, + typing_ctx=dpex_target.typing_context, + ) + + is_native_atomic = expected_spirv_function in kernel._llvm_module + assert is_native_atomic == expected_native_atomic_for_device( + dpctl.select_default_device().filter_string + ) diff --git a/numba_dpex/tests/kernel_tests/test_barrier.py b/numba_dpex/tests/kernel_tests/test_barrier.py index 8bc8ef5299..3c65093273 100644 --- a/numba_dpex/tests/kernel_tests/test_barrier.py +++ b/numba_dpex/tests/kernel_tests/test_barrier.py @@ -2,20 +2,22 @@ # # SPDX-License-Identifier: Apache-2.0 -import platform - import dpctl +import dpctl.tensor as dpt import numpy as np import pytest import numba_dpex as dpex +from numba_dpex import float32, usm_ndarray, void from numba_dpex.tests._helper import filter_strings +f32arrty = usm_ndarray(float32, 1, "C") + @pytest.mark.parametrize("filter_str", filter_strings) def test_proper_lowering(filter_str): # This will trigger eager compilation - @dpex.kernel("void(float32[::1])") + @dpex.kernel(void(f32arrty)) def twice(A): i = dpex.get_global_id(0) d = A[i] @@ -23,19 +25,17 @@ def twice(A): A[i] = d * 2 N = 256 - arr = np.random.random(N).astype(np.float32) - orig = arr.copy() - - with dpctl.device_context(filter_str): - twice[N, N // 2](arr) - + arr = dpt.arange(N, dtype=dpt.float32) + orig = dpt.asnumpy(arr) + twice[N, N // 2](arr) + after = dpt.asnumpy(arr) # The computation is correct? - np.testing.assert_allclose(orig * 2, arr) + np.testing.assert_allclose(orig * 2, after) @pytest.mark.parametrize("filter_str", filter_strings) def test_no_arg_barrier_support(filter_str): - @dpex.kernel("void(float32[::1])") + @dpex.kernel(void(f32arrty)) def twice(A): i = dpex.get_global_id(0) d = A[i] @@ -44,21 +44,19 @@ def twice(A): A[i] = d * 2 N = 256 - arr = np.random.random(N).astype(np.float32) - orig = arr.copy() - - with dpctl.device_context(filter_str): - twice[N, dpex.DEFAULT_LOCAL_SIZE](arr) - + arr = dpt.arange(N, dtype=dpt.float32) + orig = dpt.asnumpy(arr) + twice[N, dpex.DEFAULT_LOCAL_SIZE](arr) + after = dpt.asnumpy(arr) # The computation is correct? - np.testing.assert_allclose(orig * 2, arr) + np.testing.assert_allclose(orig * 2, after) @pytest.mark.parametrize("filter_str", filter_strings) def test_local_memory(filter_str): blocksize = 10 - @dpex.kernel("void(float32[::1])") + @dpex.kernel(void(f32arrty)) def reverse_array(A): lm = dpex.local.array(shape=10, dtype=np.float32) i = dpex.get_global_id(0) @@ -70,11 +68,9 @@ def reverse_array(A): # write A[i] += lm[blocksize - 1 - i] - arr = np.arange(blocksize).astype(np.float32) - orig = arr.copy() - - with dpctl.device_context(filter_str): - reverse_array[blocksize, blocksize](arr) - + arr = dpt.arange(blocksize, dtype=dpt.float32) + orig = dpt.asnumpy(arr) + reverse_array[blocksize, blocksize](arr) + after = dpt.asnumpy(arr) expected = orig[::-1] + orig - np.testing.assert_allclose(expected, arr) + np.testing.assert_allclose(expected, after) diff --git a/numba_dpex/tests/kernel_tests/test_caching.py b/numba_dpex/tests/kernel_tests/test_caching.py index 09ceb27d57..801cd4193c 100644 --- a/numba_dpex/tests/kernel_tests/test_caching.py +++ b/numba_dpex/tests/kernel_tests/test_caching.py @@ -3,76 +3,48 @@ # SPDX-License-Identifier: Apache-2.0 import dpctl +import dpctl.tensor as dpt import numpy as np import pytest import numba_dpex as dpex +from numba_dpex.core.kernel_interface.dispatcher import JitKernel from numba_dpex.tests._helper import filter_strings @pytest.mark.parametrize("filter_str", filter_strings) -def test_caching_kernel_using_same_queue(filter_str): - """Test kernel caching when the same queue is used to submit a kernel - multiple times. +def test_caching_hit_counts(filter_str): + """Tests the correct number of cache hits. + If a Dispatcher is invoked 10 times and if the caching is enabled, + then the total number of cache hits will be 9. Given the fact that + the first time the kernel will be compiled and it will be loaded + off the cache for the next time on. Args: - filter_str: SYCL filter selector string + filter_str (str): The device name coming from filter_strings in + ._helper.py """ - global_size = 10 - N = global_size - def data_parallel_sum(a, b, c): + def data_parallel_sum(x, y, z): + """ + Vector addition using the ``kernel`` decorator. + """ i = dpex.get_global_id(0) - c[i] = a[i] + b[i] + z[i] = x[i] + y[i] - a = np.array(np.random.random(N), dtype=np.float32) - b = np.array(np.random.random(N), dtype=np.float32) - c = np.ones_like(a) + a = dpt.arange(0, 100, device=filter_str) + b = dpt.arange(0, 100, device=filter_str) + c = dpt.zeros_like(a, device=filter_str) - with dpctl.device_context(filter_str) as gpu_queue: - func = dpex.kernel(data_parallel_sum) - cached_kernel = func[global_size, dpex.DEFAULT_LOCAL_SIZE].specialize( - func._get_argtypes(a, b, c), gpu_queue - ) + expected = dpt.asnumpy(a) + dpt.asnumpy(b) - for i in range(10): - _kernel = func[global_size, dpex.DEFAULT_LOCAL_SIZE].specialize( - func._get_argtypes(a, b, c), gpu_queue - ) - assert _kernel == cached_kernel - - -@pytest.mark.parametrize("filter_str", filter_strings) -def test_caching_kernel_using_same_context(filter_str): - """Test kernel caching for the scenario where different SYCL queues that - share a SYCL context are used to submit a kernel. - - Args: - filter_str: SYCL filter selector string - """ - global_size = 10 - N = global_size - - def data_parallel_sum(a, b, c): - i = dpex.get_global_id(0) - c[i] = a[i] + b[i] + d = JitKernel( + data_parallel_sum, + ) - a = np.array(np.random.random(N), dtype=np.float32) - b = np.array(np.random.random(N), dtype=np.float32) - c = np.ones_like(a) + N = 10 + for i in range(N): + d(a, b, c, global_range=[100]) + actual = dpt.asnumpy(c) - # Set the global queue to the default device so that the cached_kernel gets - # created for that device - dpctl.set_global_queue(filter_str) - func = dpex.kernel(data_parallel_sum) - default_queue = dpctl.get_current_queue() - cached_kernel = func[global_size, dpex.DEFAULT_LOCAL_SIZE].specialize( - func._get_argtypes(a, b, c), default_queue - ) - for i in range(0, 10): - # Each iteration create a fresh queue that will share the same context - with dpctl.device_context(filter_str) as gpu_queue: - _kernel = func[global_size, dpex.DEFAULT_LOCAL_SIZE].specialize( - func._get_argtypes(a, b, c), gpu_queue - ) - assert _kernel == cached_kernel + assert np.array_equal(expected, actual) and (d.cache_hits == N - 1) diff --git a/numba_dpex/tests/kernel_tests/test_compute_follows_data.py b/numba_dpex/tests/kernel_tests/test_compute_follows_data.py index ad16d9c921..91f233a3ff 100644 --- a/numba_dpex/tests/kernel_tests/test_compute_follows_data.py +++ b/numba_dpex/tests/kernel_tests/test_compute_follows_data.py @@ -10,17 +10,12 @@ import pytest import numba_dpex +from numba_dpex.core.exceptions import ComputeFollowsDataInferenceError from numba_dpex.tests._helper import ( filter_strings, skip_no_level_zero_gpu, skip_no_opencl_gpu, ) -from numba_dpex.utils import ( - IndeterminateExecutionQueueError, - IndeterminateExecutionQueueError_msg, - cfd_ctx_mgr_wrng_msg, - mix_datatype_err_msg, -) global_size = 10 local_size = 1 @@ -115,7 +110,7 @@ def test_ndarray_argtype(offload_device, input_arrays): def test_mix_argtype(offload_device, input_arrays): usm_type = "device" - a, b, expected = input_arrays + a, b, _ = input_arrays got = np.ones_like(a) device = dpctl.SyclDevice(offload_device) @@ -136,11 +131,9 @@ def test_mix_argtype(offload_device, input_arrays): buffer_ctor_kwargs={"queue": queue}, ) - with pytest.raises(TypeError) as error_msg: + with pytest.raises(ComputeFollowsDataInferenceError): sum_kernel[global_size, local_size](da, b, dc) - assert mix_datatype_err_msg in error_msg - @pytest.mark.parametrize("offload_device", filter_strings) def test_context_manager_with_usm_ndarray(offload_device, input_arrays): @@ -235,9 +228,8 @@ def test_equivalent_usm_ndarray(input_arrays): buffer_ctor_kwargs={"queue": queue1}, ) - with pytest.raises(IndeterminateExecutionQueueError) as error_msg: + with pytest.raises(ComputeFollowsDataInferenceError): sum_kernel[global_size, local_size](da, not_equivalent_db, dc) - assert IndeterminateExecutionQueueError_msg in str(error_msg.value) sum_kernel[global_size, local_size](da, equivalent_db, dc) dc.usm_data.copy_to_host(got.reshape((-1)).view("|u1")) diff --git a/numba_dpex/tests/kernel_tests/test_kernel_has_return_value_error.py b/numba_dpex/tests/kernel_tests/test_kernel_has_return_value_error.py index 3140cffd0d..d417924513 100644 --- a/numba_dpex/tests/kernel_tests/test_kernel_has_return_value_error.py +++ b/numba_dpex/tests/kernel_tests/test_kernel_has_return_value_error.py @@ -2,12 +2,14 @@ # # SPDX-License-Identifier: Apache-2.0 -import dpctl +import dpctl.tensor as dpt import numpy as np import pytest import numba_dpex as dpex -from numba_dpex.tests._helper import filter_strings +from numba_dpex import int32, usm_ndarray + +i32arrty = usm_ndarray(int32, 1, "C") def f(a): @@ -16,7 +18,7 @@ def f(a): list_of_sig = [ None, - ("int32[::1](int32[::1])"), + (i32arrty(i32arrty)), ] @@ -25,13 +27,9 @@ def sig(request): return request.param -@pytest.mark.parametrize("filter_str", filter_strings) -def test_return(filter_str, sig): - a = np.array(np.random.random(122), np.int32) +def test_return(sig): + a = dpt.arange(1024, dtype=dpt.int32, device="0") with pytest.raises(dpex.core.exceptions.KernelHasReturnValueError): kernel = dpex.kernel(sig)(f) - - device = dpctl.SyclDevice(filter_str) - with dpctl.device_context(device): - kernel[a.size, dpex.DEFAULT_LOCAL_SIZE](a) + kernel[a.size, dpex.DEFAULT_LOCAL_SIZE](a) diff --git a/numba_dpex/tests/kernel_tests/test_kernel_specialization.py b/numba_dpex/tests/kernel_tests/test_kernel_specialization.py new file mode 100644 index 0000000000..e99d0eeabb --- /dev/null +++ b/numba_dpex/tests/kernel_tests/test_kernel_specialization.py @@ -0,0 +1,96 @@ +# SPDX-FileCopyrightText: 2022 - 2023 Intel Corporation +# +# SPDX-License-Identifier: Apache-2.0 + +import dpctl.tensor as dpt +import pytest + +import numba_dpex as dpex +from numba_dpex import float32, int64, usm_ndarray +from numba_dpex.core.exceptions import ( + InvalidKernelSpecializationError, + MissingSpecializationError, +) + +i64arrty = usm_ndarray(int64, 1, "C") +f32arrty = usm_ndarray(float32, 1, "C") + +specialized_kernel1 = dpex.kernel((i64arrty, i64arrty, i64arrty)) +specialized_kernel2 = dpex.kernel( + [(i64arrty, i64arrty, i64arrty), (f32arrty, f32arrty, f32arrty)] +) + + +def data_parallel_sum(a, b, c): + """ + Vector addition using the ``kernel`` decorator. + """ + i = dpex.get_global_id(0) + c[i] = a[i] + b[i] + + +def test_single_specialization(): + """Test if a kernel can be specialized with a single signature.""" + jitkernel = specialized_kernel1(data_parallel_sum) + assert jitkernel._specialization_cache.size() == 1 + + +def test_multiple_specialization(): + """Test if a kernel can be specialized with multiple signatures.""" + jitkernel = specialized_kernel2(data_parallel_sum) + assert jitkernel._specialization_cache.size() == 2 + + +def test_invalid_specialization_error(): + """Test if an InvalidKernelSpecializationError is raised when attempting to + specialize with NumPy arrays. + """ + specialized_kernel3 = dpex.kernel((int64[::1], int64[::1], int64[::1])) + with pytest.raises(InvalidKernelSpecializationError): + specialized_kernel3(data_parallel_sum) + + +def test_missing_specialization_error(): + """Test if a MissingSpecializationError is raised when calling a + specialized kernel with unsupported arguments. + """ + a = dpt.ones(1024, dtype=dpt.int32) + b = dpt.ones(1024, dtype=dpt.int32) + c = dpt.zeros(1024, dtype=dpt.int32) + + with pytest.raises(MissingSpecializationError): + specialized_kernel1(data_parallel_sum)[ + 1024, + ](a, b, c) + + +def test_execution_of_specialized_kernel(): + """Test if the specialized kernel is correctly executed.""" + a = dpt.ones(1024, dtype=dpt.int64) + b = dpt.ones(1024, dtype=dpt.int64) + c = dpt.zeros(1024, dtype=dpt.int64) + + specialized_kernel1(data_parallel_sum)[ + 1024, + ](a, b, c) + + npc = dpt.asnumpy(c) + import numpy as np + + npc_expected = np.full(1024, 2, dtype=np.int64) + assert np.array_equal(npc, npc_expected) + + +def test_string_specialization(): + """Test if NotImplementedError is raised when signature is a string""" + + with pytest.raises(NotImplementedError): + dpex.kernel("(i64arrty, i64arrty, i64arrty)") + + with pytest.raises(NotImplementedError): + dpex.kernel( + ["(i64arrty, i64arrty, i64arrty)", "(f32arrty, f32arrty, f32arrty)"] + ) + + with pytest.raises(ValueError): + dpex.kernel((i64arrty)) diff --git a/numba_dpex/tests/kernel_tests/test_ndrange_exceptions.py b/numba_dpex/tests/kernel_tests/test_ndrange_exceptions.py new file mode 100644 index 0000000000..e92fdbea28 --- /dev/null +++ b/numba_dpex/tests/kernel_tests/test_ndrange_exceptions.py @@ -0,0 +1,38 @@ +# SPDX-FileCopyrightText: 2020 - 2022 Intel Corporation +# +# SPDX-License-Identifier: Apache-2.0 +import dpctl.tensor as dpt +import pytest + +import numba_dpex as ndpx +from numba_dpex.core.exceptions import ( + UnmatchedNumberOfRangeDimsError, + UnsupportedGroupWorkItemSizeError, +) + + +# Data parallel kernel implementing vector sum +@ndpx.kernel +def kernel_vector_sum(a, b, c): + i = ndpx.get_global_id(0) + c[i] = a[i] + b[i] + + +@pytest.mark.parametrize( + "error, ndrange", + [ + (UnmatchedNumberOfRangeDimsError, ((2, 2), (1, 1, 1))), + (UnsupportedGroupWorkItemSizeError, ((3, 3, 3), (2, 2, 2))), + ], +) +def test_ndrange_config_error(error, ndrange): + """Test if a exception is raised when calling a + ndrange kernel with unspported arguments. + """ + + a = dpt.ones(1024, dtype=dpt.int32, device="0") + b = dpt.ones(1024, dtype=dpt.int32, device="0") + c = dpt.zeros(1024, dtype=dpt.int64, device="0") + + with pytest.raises(error): + kernel_vector_sum[ndrange](a, b, c) diff --git a/numba_dpex/tests/test_black_scholes.py b/numba_dpex/tests/test_black_scholes.py deleted file mode 100644 index 57d928a39c..0000000000 --- a/numba_dpex/tests/test_black_scholes.py +++ /dev/null @@ -1,151 +0,0 @@ -# Copyright 2020 - 2022 Intel Corporation -# -# SPDX-License-Identifier: Apache-2.0 - -import math -import time - -import dpctl -import numpy as np - -import numba_dpex as dpex -from numba_dpex.tests._helper import skip_no_opencl_gpu - -RISKFREE = 0.02 -VOLATILITY = 0.30 - -A1 = 0.31938153 -A2 = -0.356563782 -A3 = 1.781477937 -A4 = -1.821255978 -A5 = 1.330274429 -RSQRT2PI = 0.39894228040143267793994605993438 - - -def cnd(d): - K = 1.0 / (1.0 + 0.2316419 * np.abs(d)) - ret_val = ( - RSQRT2PI - * np.exp(-0.5 * d * d) - * (K * (A1 + K * (A2 + K * (A3 + K * (A4 + K * A5))))) - ) - return np.where(d > 0, 1.0 - ret_val, ret_val) - - -def black_scholes( - callResult, - putResult, - stockPrice, - optionStrike, - optionYears, - Riskfree, - Volatility, -): - S = stockPrice - X = optionStrike - T = optionYears - R = Riskfree - V = Volatility - sqrtT = np.sqrt(T) - d1 = (np.log(S / X) + (R + 0.5 * V * V) * T) / (V * sqrtT) - d2 = d1 - V * sqrtT - cndd1 = cnd(d1) - cndd2 = cnd(d2) - - expRT = np.exp(-R * T) - callResult[:] = S * cndd1 - X * expRT * cndd2 - putResult[:] = X * expRT * (1.0 - cndd2) - S * (1.0 - cndd1) - - -def randfloat(rand_var, low, high): - return (1.0 - rand_var) * low + rand_var * high - - -@skip_no_opencl_gpu -class TestBlackScholesKernel: - def test_black_scholes(self): - OPT_N = 400 - iterations = 2 - - stockPrice = randfloat(np.random.random(OPT_N), 5.0, 30.0) - optionStrike = randfloat(np.random.random(OPT_N), 1.0, 100.0) - optionYears = randfloat(np.random.random(OPT_N), 0.25, 10.0) - - callResultNumpy = np.zeros(OPT_N) - putResultNumpy = -np.ones(OPT_N) - - callResultNumbapro = np.zeros(OPT_N) - putResultNumbapro = -np.ones(OPT_N) - - # numpy - for i in range(iterations): - black_scholes( - callResultNumpy, - putResultNumpy, - stockPrice, - optionStrike, - optionYears, - RISKFREE, - VOLATILITY, - ) - - # numba_dpex - @dpex.kernel - def black_scholes_dpex(callResult, putResult, S, X, T, R, V): - i = dpex.get_global_id(0) - if i >= S.shape[0]: - return - sqrtT = math.sqrt(T[i]) - d1 = (math.log(S[i] / X[i]) + (R + 0.5 * V * V) * T[i]) / ( - V * sqrtT - ) - d2 = d1 - V * sqrtT - - K = 1.0 / (1.0 + 0.2316419 * math.fabs(d1)) - cndd1 = ( - RSQRT2PI - * math.exp(-0.5 * d1 * d1) - * (K * (A1 + K * (A2 + K * (A3 + K * (A4 + K * A5))))) - ) - if d1 > 0: - cndd1 = 1.0 - cndd1 - - K = 1.0 / (1.0 + 0.2316419 * math.fabs(d2)) - cndd2 = ( - RSQRT2PI - * math.exp(-0.5 * d2 * d2) - * (K * (A1 + K * (A2 + K * (A3 + K * (A4 + K * A5))))) - ) - if d2 > 0: - cndd2 = 1.0 - cndd2 - - expRT = math.exp((-1.0 * R) * T[i]) - callResult[i] = S[i] * cndd1 - X[i] * expRT * cndd2 - putResult[i] = X[i] * expRT * (1.0 - cndd2) - S[i] * (1.0 - cndd1) - - # numba - time0 = time.time() - blockdim = 512, 1 - griddim = int(math.ceil(float(OPT_N) / blockdim[0])), 1 - - with dpctl.device_context("opencl:gpu"): - time1 = time.time() - for i in range(iterations): - black_scholes_dpex[blockdim, griddim]( - callResultNumbapro, - putResultNumbapro, - stockPrice, - optionStrike, - optionYears, - RISKFREE, - VOLATILITY, - ) - - dt = time1 - time0 # noqa - - delta = np.abs(callResultNumpy - callResultNumbapro) - L1norm = delta.sum() / np.abs(callResultNumpy).sum() - - max_abs_err = delta.max() - assert L1norm < 1e-13 - assert max_abs_err < 1e-13 diff --git a/numba_dpex/tests/test_debuginfo.py b/numba_dpex/tests/test_debuginfo.py index 3c9a92f655..4b2eadf15e 100644 --- a/numba_dpex/tests/test_debuginfo.py +++ b/numba_dpex/tests/test_debuginfo.py @@ -6,12 +6,11 @@ import re -import dpctl import pytest from numba.core import types import numba_dpex as dpex -from numba_dpex import compiler +from numba_dpex.core.descriptor import dpex_target from numba_dpex.tests._helper import override_config from numba_dpex.utils import npytypes_array_to_dpex_array @@ -23,11 +22,18 @@ def debug_option(request): return request.param -def get_kernel_ir(sycl_queue, fn, sig, debug=None): - kernel = compiler.compile_kernel( - sycl_queue, fn.py_func, sig, None, debug=debug +def get_kernel_ir(fn, sig, debug=None): + kernel = dpex.core.kernel_interface.spirv_kernel.SpirvKernel( + fn, fn.__name__ ) - return kernel.assembly + kernel.compile( + args=sig, + target_ctx=dpex_target.target_context, + typing_ctx=dpex_target.typing_context, + debug=debug, + compile_flags=None, + ) + return kernel.llvm_module def make_check(ir, val_to_search): @@ -45,15 +51,11 @@ def test_debug_flag_generates_ir_with_debuginfo(debug_option): Check debug info is emitting to IR if debug parameter is set to True """ - @dpex.kernel def foo(x): x = 1 # noqa - sycl_queue = dpctl.get_current_queue() sig = (types.int32,) - - kernel_ir = get_kernel_ir(sycl_queue, foo, sig, debug=debug_option) - + kernel_ir = get_kernel_ir(foo, sig, debug=debug_option) tag = "!dbg" if debug_option: @@ -68,7 +70,6 @@ def test_debug_info_locals_vars_on_no_opt(): if debug parameter is set to True and optimization is O0 """ - @dpex.kernel def foo(var_a, var_b, var_c): i = dpex.get_global_id(0) var_c[i] = var_a[i] + var_b[i] @@ -79,8 +80,6 @@ def foo(var_a, var_b, var_c): '!DILocalVariable(name: "var_c"', '!DILocalVariable(name: "i"', ] - - sycl_queue = dpctl.get_current_queue() sig = ( npytypes_array_to_dpex_array(types.float32[:]), npytypes_array_to_dpex_array(types.float32[:]), @@ -88,7 +87,7 @@ def foo(var_a, var_b, var_c): ) with override_config("OPT", 0): - kernel_ir = get_kernel_ir(sycl_queue, foo, sig, debug=True) + kernel_ir = get_kernel_ir(foo, sig, debug=True) for tag in ir_tags: assert tag in kernel_ir @@ -100,7 +99,6 @@ def test_debug_kernel_local_vars_in_ir(): created in kernel """ - @dpex.kernel def foo(arr): index = dpex.get_global_id(0) local_d = 9 * 99 + 5 @@ -110,11 +108,8 @@ def foo(arr): '!DILocalVariable(name: "index"', '!DILocalVariable(name: "local_d"', ] - - sycl_queue = dpctl.get_current_queue() sig = (npytypes_array_to_dpex_array(types.float32[:]),) - - kernel_ir = get_kernel_ir(sycl_queue, foo, sig, debug=True) + kernel_ir = get_kernel_ir(foo, sig, debug=True) for tag in ir_tags: assert tag in kernel_ir @@ -130,7 +125,6 @@ def func_sum(a, b): result = a + b return result - @dpex.kernel(debug=debug_option) def data_parallel_sum(a, b, c): i = dpex.get_global_id(0) c[i] = func_sum(a[i], b[i]) @@ -140,16 +134,13 @@ def data_parallel_sum(a, b, c): r'\!DISubprogram\(name: ".*data_parallel_sum"', ] - sycl_queue = dpctl.get_current_queue() sig = ( npytypes_array_to_dpex_array(types.float32[:]), npytypes_array_to_dpex_array(types.float32[:]), npytypes_array_to_dpex_array(types.float32[:]), ) - kernel_ir = get_kernel_ir( - sycl_queue, data_parallel_sum, sig, debug=debug_option - ) + kernel_ir = get_kernel_ir(data_parallel_sum, sig, debug=debug_option) for tag in ir_tags: assert debug_option == make_check(kernel_ir, tag) @@ -165,7 +156,6 @@ def func_sum(a, b): result = a + b return result - @dpex.kernel def data_parallel_sum(a, b, c): i = dpex.get_global_id(0) c[i] = func_sum(a[i], b[i]) @@ -175,7 +165,6 @@ def data_parallel_sum(a, b, c): r'\!DISubprogram\(name: ".*data_parallel_sum"', ] - sycl_queue = dpctl.get_current_queue() sig = ( npytypes_array_to_dpex_array(types.float32[:]), npytypes_array_to_dpex_array(types.float32[:]), @@ -183,14 +172,13 @@ def data_parallel_sum(a, b, c): ) with override_config("DEBUGINFO_DEFAULT", int(debug_option)): - kernel_ir = get_kernel_ir(sycl_queue, data_parallel_sum, sig) + kernel_ir = get_kernel_ir(data_parallel_sum, sig) for tag in ir_tags: assert debug_option == make_check(kernel_ir, tag) def test_debuginfo_DISubprogram_linkageName(): - @dpex.kernel def func(a, b): i = dpex.get_global_id(0) b[i] = a[i] @@ -199,20 +187,18 @@ def func(a, b): r'\!DISubprogram\(.*linkageName: ".*e4func.*"', ] - sycl_queue = dpctl.get_current_queue() sig = ( npytypes_array_to_dpex_array(types.float32[:]), npytypes_array_to_dpex_array(types.float32[:]), ) - kernel_ir = get_kernel_ir(sycl_queue, func, sig, debug=True) + kernel_ir = get_kernel_ir(func, sig, debug=True) for tag in ir_tags: assert make_check(kernel_ir, tag) def test_debuginfo_DICompileUnit_language_and_producer(): - @dpex.kernel def func(a, b): i = dpex.get_global_id(0) b[i] = a[i] @@ -222,13 +208,12 @@ def func(a, b): r'\!DICompileUnit\(.*producer: "numba-dpex"', ] - sycl_queue = dpctl.get_current_queue() sig = ( npytypes_array_to_dpex_array(types.float32[:]), npytypes_array_to_dpex_array(types.float32[:]), ) - kernel_ir = get_kernel_ir(sycl_queue, func, sig, debug=True) + kernel_ir = get_kernel_ir(func, sig, debug=True) for tag in ir_tags: assert make_check(kernel_ir, tag) diff --git a/numba_dpex/tests/test_device_array_args.py b/numba_dpex/tests/test_device_array_args.py index cc50c48854..4ddd848983 100644 --- a/numba_dpex/tests/test_device_array_args.py +++ b/numba_dpex/tests/test_device_array_args.py @@ -5,7 +5,7 @@ # SPDX-License-Identifier: Apache-2.0 import dpctl -import numpy as np +import dpctl.tensor as dpt import numba_dpex as dpex from numba_dpex.tests._helper import skip_no_opencl_cpu, skip_no_opencl_gpu @@ -20,28 +20,35 @@ def data_parallel_sum(a, b, c): global_size = 64 N = global_size -a = np.array(np.random.random(N), dtype=np.float32) -b = np.array(np.random.random(N), dtype=np.float32) -d = a + b +a = dpt.ones(N, dtype=dpt.float32) +b = dpt.ones(N, dtype=dpt.float32) @skip_no_opencl_cpu class TestArrayArgsGPU: def test_device_array_args_cpu(self): - c = np.ones_like(a) + c = dpt.ones_like(a) with dpctl.device_context("opencl:cpu"): data_parallel_sum[global_size, dpex.DEFAULT_LOCAL_SIZE](a, b, c) - assert np.all(c == d) + npc = dpt.asnumpy(c) + import numpy as np + + npc_expected = np.full(N, 2.0, dtype=np.float32) + assert np.all(npc == npc_expected) @skip_no_opencl_gpu class TestArrayArgsCPU: def test_device_array_args_gpu(self): - c = np.ones_like(a) + c = dpt.ones_like(a) with dpctl.device_context("opencl:gpu"): data_parallel_sum[global_size, dpex.DEFAULT_LOCAL_SIZE](a, b, c) - assert np.all(c == d) + npc = dpt.asnumpy(c) + import numpy as np + + npc_expected = np.full(N, 2.0, dtype=np.float32) + assert np.all(npc == npc_expected) diff --git a/numba_dpex/tests/test_dppy_fallback.py b/numba_dpex/tests/test_dpex_fallback.py similarity index 100% rename from numba_dpex/tests/test_dppy_fallback.py rename to numba_dpex/tests/test_dpex_fallback.py diff --git a/numba_dpex/tests/test_dppy_func.py b/numba_dpex/tests/test_dpex_func.py similarity index 70% rename from numba_dpex/tests/test_dppy_func.py rename to numba_dpex/tests/test_dpex_func.py index d469063243..3d0ad75c47 100644 --- a/numba_dpex/tests/test_dppy_func.py +++ b/numba_dpex/tests/test_dpex_func.py @@ -3,7 +3,7 @@ # SPDX-License-Identifier: Apache-2.0 import dpctl -import numpy as np +import dpctl.tensor as dpt import numba_dpex as dpex from numba_dpex.tests._helper import skip_no_opencl_gpu @@ -23,14 +23,17 @@ def f(a, b): i = dpex.get_global_id(0) b[i] = g(a[i]) - a = np.ones(self.N) - b = np.ones(self.N) + a = dpt.ones(self.N, dtype=dpt.int32) + b = dpt.ones(self.N, dtype=dpt.int32) device = dpctl.SyclDevice("opencl:gpu") with dpctl.device_context(device): f[self.N, dpex.DEFAULT_LOCAL_SIZE](a, b) - assert np.all(b == 2) + npb = dpt.asnumpy(b) + import numpy as np + + assert np.all(npb == 2) def test_func_ndarray(self): @dpex.func @@ -47,15 +50,17 @@ def h(a, b): i = dpex.get_global_id(0) b[i] = g(a[i]) + 1 - a = np.ones(self.N) - b = np.ones(self.N) + a = dpt.ones(self.N, dtype=dpt.int32) + b = dpt.ones(self.N, dtype=dpt.int32) device = dpctl.SyclDevice("opencl:gpu") with dpctl.device_context(device): f[self.N, dpex.DEFAULT_LOCAL_SIZE](a, b) + npb = dpt.asnumpy(b) + import numpy as np - assert np.all(b == 2) + assert np.all(npb == 2) h[self.N, dpex.DEFAULT_LOCAL_SIZE](a, b) - - assert np.all(b == 3) + npb = dpt.asnumpy(b) + assert np.all(npb == 3) diff --git a/numba_dpex/tests/test_no_copy_usm_shared.py b/numba_dpex/tests/test_no_copy_usm_shared.py index e4ced748c8..3d0ab97539 100644 --- a/numba_dpex/tests/test_no_copy_usm_shared.py +++ b/numba_dpex/tests/test_no_copy_usm_shared.py @@ -10,7 +10,7 @@ from numba.core import compiler, cpu from numba.core.registry import cpu_target -from numba_dpex.compiler import Compiler +from numba_dpex.core.compiler import Compiler from numba_dpex.tests._helper import skip_no_opencl_gpu diff --git a/numba_dpex/tests/test_vectorize.py b/numba_dpex/tests/test_vectorize.py index af00468cf7..dfd9a692d7 100644 --- a/numba_dpex/tests/test_vectorize.py +++ b/numba_dpex/tests/test_vectorize.py @@ -23,6 +23,7 @@ def shape(request): return request.param +@pytest.mark.xfail @pytest.mark.parametrize("filter_str", filter_strings) def test_njit(filter_str): @vectorize(nopython=True) @@ -66,6 +67,7 @@ def input_type(request): return request.param +@pytest.mark.xfail @pytest.mark.parametrize("filter_str", filter_strings) def test_vectorize(filter_str, shape, dtypes, input_type): def vector_add(a, b): diff --git a/numba_dpex/utils/__init__.py b/numba_dpex/utils/__init__.py index 345bf9f208..cf8068e8c0 100644 --- a/numba_dpex/utils/__init__.py +++ b/numba_dpex/utils/__init__.py @@ -10,7 +10,6 @@ as_usm_obj, copy_from_numpy_to_usm_obj, copy_to_numpy_from_usm_obj, - get_info_from_suai, has_usm_memory, ) from numba_dpex.utils.constants import address_space, calling_conv @@ -28,10 +27,7 @@ mix_datatype_err_msg, ) from numba_dpex.utils.misc import IndeterminateExecutionQueueError -from numba_dpex.utils.type_conversion_fns import ( - npytypes_array_to_dpex_array, - suai_to_dpex_array, -) +from numba_dpex.utils.type_conversion_fns import npytypes_array_to_dpex_array __all__ = [ "LLVMTypes", @@ -42,7 +38,6 @@ "get_one", "npytypes_array_to_dpex_array", "npytypes_array_to_dpex_array", - "suai_to_dpex_array", "address_space", "calling_conv", "has_usm_memory", @@ -53,5 +48,4 @@ "cfd_ctx_mgr_wrng_msg", "IndeterminateExecutionQueueError_msg", "mix_datatype_err_msg", - "get_info_from_suai", ] diff --git a/numba_dpex/utils/array_utils.py b/numba_dpex/utils/array_utils.py index 054910c633..a6e1ce6bf3 100644 --- a/numba_dpex/utils/array_utils.py +++ b/numba_dpex/utils/array_utils.py @@ -20,40 +20,6 @@ ] -def get_info_from_suai(obj): - """ - Convenience function to gather information from __sycl_usm_array_interface__. - - Args: - obj: Array with SUAI attribute. - - Returns: - usm_mem: USM memory object. - total_size: Total number of items in the array. - shape: Shape of the array. - ndim: Total number of dimensions. - itemsize: Size of each item. - strides: Stride of the array. - dtype: Dtype of the array. - """ - usm_mem = dpctl_mem.as_usm_memory(obj) - - assert usm_mem is not None - - shape = obj.__sycl_usm_array_interface__["shape"] - total_size = np.prod(obj.__sycl_usm_array_interface__["shape"]) - ndim = len(obj.__sycl_usm_array_interface__["shape"]) - itemsize = np.dtype(obj.__sycl_usm_array_interface__["typestr"]).itemsize - dtype = np.dtype(obj.__sycl_usm_array_interface__["typestr"]) - strides = obj.__sycl_usm_array_interface__["strides"] - if strides is None: - strides = [1] * ndim - for i in reversed(range(1, ndim)): - strides[i - 1] = strides[i] * shape[i] - strides = tuple(strides) - return usm_mem, total_size, shape, ndim, itemsize, strides, dtype - - def has_usm_memory(obj): """ Determine and return a SYCL device accessible object. diff --git a/numba_dpex/utils/type_conversion_fns.py b/numba_dpex/utils/type_conversion_fns.py index 5cfb76aad8..c9f02bcbed 100644 --- a/numba_dpex/utils/type_conversion_fns.py +++ b/numba_dpex/utils/type_conversion_fns.py @@ -9,14 +9,12 @@ """ from numba.core import types -from numba.np import numpy_support from numba_dpex.core.types import Array -from .array_utils import get_info_from_suai from .constants import address_space -__all__ = ["npytypes_array_to_dpex_array", "suai_to_dpex_array"] +__all__ = ["npytypes_array_to_dpex_array"] def npytypes_array_to_dpex_array(arrtype, addrspace=address_space.GLOBAL): @@ -63,47 +61,3 @@ def npytypes_array_to_dpex_array(arrtype, addrspace=address_space.GLOBAL): ) else: raise NotImplementedError - - -def suai_to_dpex_array(arr, addrspace=address_space.GLOBAL): - """Create type for Array with __sycl_usm_array_interface__ (SUAI) attribute. - - This function creates a Numba type for arrays with SUAI attribute. - - Args: - arr: Array with SUAI attribute. - addrspace: Address space this array is allocated in. - - Returns: The Numba type for SUAI array. - - Raises: - NotImplementedError: If the dtype of the passed array is not supported. - """ - from numba_dpex.core.types import USMNdArray - - ( - usm_mem, - total_size, - shape, - ndim, - itemsize, - strides, - dtype, - ) = get_info_from_suai(arr) - - try: - dtype = numpy_support.from_dtype(dtype) - except NotImplementedError: - raise ValueError("Unsupported array dtype: %s" % (dtype,)) - - layout = "C" - readonly = False - - return USMNdArray( - dtype, - ndim, - layout, - None, - readonly, - addrspace=addrspace, - )