From 9f8d1ac6a6516c7c4d54ac3ef2b8ac931191e46d Mon Sep 17 00:00:00 2001 From: Yevhenii Havrylko Date: Tue, 2 Apr 2024 17:22:14 -0400 Subject: [PATCH 1/3] Add nd_item to reduction template --- .../parfors/kernel_templates/reduction_template.py | 13 ++++++++----- numba_dpex/core/parfors/reduction_kernel_builder.py | 11 ++++++++++- 2 files changed, 18 insertions(+), 6 deletions(-) diff --git a/numba_dpex/core/parfors/kernel_templates/reduction_template.py b/numba_dpex/core/parfors/kernel_templates/reduction_template.py index 8752adf2f5..f039f1e6a3 100644 --- a/numba_dpex/core/parfors/kernel_templates/reduction_template.py +++ b/numba_dpex/core/parfors/kernel_templates/reduction_template.py @@ -55,7 +55,7 @@ def _generate_kernel_stub_as_string(self): gufunc_txt = "" gufunc_txt += "def " + self._kernel_name - gufunc_txt += "(" + (", ".join(self._kernel_params)) + "):\n" + gufunc_txt += "(nd_item, " + (", ".join(self._kernel_params)) + "):\n" global_id_dim = 0 for_loop_dim = self._parfor_dim @@ -64,14 +64,17 @@ def _generate_kernel_stub_as_string(self): else: global_id_dim = self._parfor_dim + gufunc_txt += " group = nd_item.get_group()\n" for dim in range(global_id_dim): dstr = str(dim) gufunc_txt += ( - f" {self._ivar_names[dim]} = dpex.get_global_id({dstr})\n" + f" {self._ivar_names[dim]} = nd_item.get_global_id({dstr})\n" ) - gufunc_txt += f" local_id{dim} = dpex.get_local_id({dstr})\n" - gufunc_txt += f" local_size{dim} = dpex.get_local_size({dstr})\n" - gufunc_txt += f" group_id{dim} = dpex.get_group_id({dstr})\n" + gufunc_txt += f" local_id{dim} = nd_item.get_local_id({dstr})\n" + gufunc_txt += ( + f" local_size{dim} = group.get_local_range({dstr})\n" + ) + gufunc_txt += f" group_id{dim} = group.get_group_id({dstr})\n" # Allocate local_sums arrays for each reduction variable. for redvar in self._redvars: diff --git a/numba_dpex/core/parfors/reduction_kernel_builder.py b/numba_dpex/core/parfors/reduction_kernel_builder.py index 24649f106b..4c2741ba77 100644 --- a/numba_dpex/core/parfors/reduction_kernel_builder.py +++ b/numba_dpex/core/parfors/reduction_kernel_builder.py @@ -20,6 +20,7 @@ from numba.core.typing import signature from numba_dpex.core.types import DpctlSyclQueue +from numba_dpex.core.types.kernel_api.index_space_ids import NdItemType from .kernel_builder import _print_body # saved for debug from .kernel_builder import ( @@ -48,6 +49,7 @@ def create_reduction_main_kernel_for_parfor( """ loc = parfor_node.init_block.loc + parfor_dim = len(parfor_node.loop_nests) for race in parfor_node.races: msg = ( @@ -84,7 +86,7 @@ def create_reduction_main_kernel_for_parfor( sentinel_name=sentinel_name, loop_ranges=loop_ranges, param_dict=reductionKernelVar.param_dict, - parfor_dim=len(parfor_node.loop_nests), + parfor_dim=parfor_dim, redvars=reductionKernelVar.parfor_redvars, parfor_args=reductionKernelVar.parfor_params, parfor_reddict=parfor_reddict, @@ -136,6 +138,13 @@ def create_reduction_main_kernel_for_parfor( if not has_aliases: flags.noalias = True + # The first argument to a range kernel is a kernel_api.NdItem object. The + # ``NdItem`` object is used by the kernel_api.spirv backend to generate the + # correct SPIR-V indexing instructions. Since, the argument is not something + # available originally in the kernel_param_types, we add it at this point to + # make sure the kernel signature matches the actual generated code. + ty_item = NdItemType(parfor_dim) + kernel_param_types = (ty_item, *kernel_param_types) kernel_sig = signature(types.none, *kernel_param_types) # FIXME: A better design is required so that we do not have to create a From d5d48c07dc00c4f9d9aa5c92d6418f9fc0b82e2d Mon Sep 17 00:00:00 2001 From: Yevhenii Havrylko Date: Tue, 2 Apr 2024 19:08:05 -0400 Subject: [PATCH 2/3] Migrate parfor local accessor to new api --- numba_dpex/core/parfors/kernel_builder.py | 4 ++ .../kernel_templates/reduction_template.py | 11 +----- numba_dpex/core/parfors/parfor_lowerer.py | 20 +++++++++- .../core/parfors/reduction_kernel_builder.py | 38 +++++++++++++++---- 4 files changed, 55 insertions(+), 18 deletions(-) diff --git a/numba_dpex/core/parfors/kernel_builder.py b/numba_dpex/core/parfors/kernel_builder.py index a6dbcb5d66..1eda1664ae 100644 --- a/numba_dpex/core/parfors/kernel_builder.py +++ b/numba_dpex/core/parfors/kernel_builder.py @@ -44,6 +44,8 @@ def __init__( kernel_args, kernel_arg_types, queue: dpctl.SyclQueue, + local_accessors=None, + work_group_size=None, ): self.name = name self.kernel = kernel @@ -51,6 +53,8 @@ def __init__( self.kernel_args = kernel_args self.kernel_arg_types = kernel_arg_types self.queue = queue + self.local_accessors = local_accessors + self.work_group_size = work_group_size def _print_block(block): diff --git a/numba_dpex/core/parfors/kernel_templates/reduction_template.py b/numba_dpex/core/parfors/kernel_templates/reduction_template.py index f039f1e6a3..96b8913106 100644 --- a/numba_dpex/core/parfors/kernel_templates/reduction_template.py +++ b/numba_dpex/core/parfors/kernel_templates/reduction_template.py @@ -30,8 +30,8 @@ def __init__( parfor_args, parfor_reddict, redvars_dict, + local_accessors_dict, typemap, - work_group_size, ) -> None: self._kernel_name = kernel_name self._kernel_params = kernel_params @@ -44,8 +44,8 @@ def __init__( self._parfor_args = parfor_args self._parfor_reddict = parfor_reddict self._redvars_dict = redvars_dict + self._local_accessors_dict = local_accessors_dict self._typemap = typemap - self._work_group_size = work_group_size self._kernel_txt = self._generate_kernel_stub_as_string() self._kernel_ir = self._generate_kernel_ir() @@ -76,13 +76,6 @@ def _generate_kernel_stub_as_string(self): ) gufunc_txt += f" group_id{dim} = group.get_group_id({dstr})\n" - # Allocate local_sums arrays for each reduction variable. - for redvar in self._redvars: - rtyp = str(self._typemap[redvar]) - redvar = self._redvars_dict[redvar] - gufunc_txt += f" local_sums_{redvar} = \ - dpex.local.array({self._work_group_size}, dpnp.{rtyp})\n" - for dim in range(global_id_dim, for_loop_dim): for indent in range(1 + (dim - global_id_dim)): gufunc_txt += " " diff --git a/numba_dpex/core/parfors/parfor_lowerer.py b/numba_dpex/core/parfors/parfor_lowerer.py index e54d54171b..41bf86868f 100644 --- a/numba_dpex/core/parfors/parfor_lowerer.py +++ b/numba_dpex/core/parfors/parfor_lowerer.py @@ -169,8 +169,24 @@ def _submit_parfor_kernel( queue_ref = kl_builder.get_queue(exec_queue=kernel_fn.queue) kernel_args = [] - for arg in kernel_fn.kernel_args: - kernel_args.append(_getvar(lowerer, arg)) + for i, arg in enumerate(kernel_fn.kernel_args): + if ( + kernel_fn.local_accessors is not None + and arg in kernel_fn.local_accessors + ): + wg_size = lowerer.context.get_constant( + types.intp, kernel_fn.work_group_size + ) + la_shape = cgutils.pack_array(lowerer.builder, [wg_size]) + arg_ty = kernel_fn.kernel_arg_types[i] + la = cgutils.create_struct_proxy(arg_ty)( + lowerer.context, + lowerer.builder, + ) + la.shape = la_shape + kernel_args.append(la._getvalue()) + else: + kernel_args.append(_getvar(lowerer, arg)) kernel_ref_addr = kernel_fn.kernel.addressof_ref() kernel_ref = lowerer.builder.inttoptr( diff --git a/numba_dpex/core/parfors/reduction_kernel_builder.py b/numba_dpex/core/parfors/reduction_kernel_builder.py index 4c2741ba77..6180ff5ef1 100644 --- a/numba_dpex/core/parfors/reduction_kernel_builder.py +++ b/numba_dpex/core/parfors/reduction_kernel_builder.py @@ -19,8 +19,10 @@ ) from numba.core.typing import signature +from numba_dpex.core.parfors.reduction_helper import ReductionKernelVariables from numba_dpex.core.types import DpctlSyclQueue from numba_dpex.core.types.kernel_api.index_space_ids import NdItemType +from numba_dpex.core.types.kernel_api.local_accessor import LocalAccessorType from .kernel_builder import _print_body # saved for debug from .kernel_builder import ( @@ -41,7 +43,7 @@ def create_reduction_main_kernel_for_parfor( typemap, flags, has_aliases, - reductionKernelVar, + reductionKernelVar: ReductionKernelVariables, parfor_reddict=None, ): """ @@ -79,20 +81,35 @@ def create_reduction_main_kernel_for_parfor( except KeyError: pass + parfor_params = reductionKernelVar.parfor_params.copy() + parfor_legalized_params = reductionKernelVar.parfor_legalized_params.copy() + parfor_param_types = reductionKernelVar.param_types.copy() + local_accessors_dict = {} + for k, v in reductionKernelVar.redvars_legal_dict.items(): + la_var = "local_sums_" + v + local_accessors_dict[k] = la_var + idx = reductionKernelVar.parfor_params.index(k) + arr_ty = reductionKernelVar.param_types[idx] + la_ty = LocalAccessorType(parfor_dim, arr_ty.dtype) + + parfor_params.append(la_var) + parfor_legalized_params.append(la_var) + parfor_param_types.append(la_ty) + kernel_template = TreeReduceIntermediateKernelTemplate( kernel_name=kernel_name, - kernel_params=reductionKernelVar.parfor_legalized_params, + kernel_params=parfor_legalized_params, ivar_names=reductionKernelVar.legal_loop_indices, sentinel_name=sentinel_name, loop_ranges=loop_ranges, param_dict=reductionKernelVar.param_dict, parfor_dim=parfor_dim, redvars=reductionKernelVar.parfor_redvars, - parfor_args=reductionKernelVar.parfor_params, + parfor_args=parfor_params, parfor_reddict=parfor_reddict, redvars_dict=reductionKernelVar.redvars_legal_dict, + local_accessors_dict=local_accessors_dict, typemap=typemap, - work_group_size=reductionKernelVar.work_group_size, ) kernel_ir = kernel_template.kernel_ir @@ -116,7 +133,7 @@ def create_reduction_main_kernel_for_parfor( new_var_dict[name] = mk_unique_var(name) replace_var_names(kernel_ir.blocks, new_var_dict) - kernel_param_types = reductionKernelVar.param_types + kernel_param_types = parfor_param_types kernel_stub_last_label = max(kernel_ir.blocks.keys()) + 1 # Add kernel stub last label to each parfor.loop_body label to prevent # label conflicts. @@ -164,13 +181,20 @@ def create_reduction_main_kernel_for_parfor( flags.noalias = old_alias + parfor_params = ( + reductionKernelVar.parfor_params.copy() + + parfor_params[len(reductionKernelVar.parfor_params) :] # noqa: $203 + ) + return ParforKernel( name=kernel_name, kernel=sycl_kernel, signature=kernel_sig, - kernel_args=reductionKernelVar.parfor_params, - kernel_arg_types=reductionKernelVar.func_arg_types, + kernel_args=parfor_params, + kernel_arg_types=parfor_param_types, queue=exec_queue, + local_accessors=set(local_accessors_dict.values()), + work_group_size=reductionKernelVar.work_group_size, ) From 40c465b7f416fe03b2b40001adb65992a4b6f69b Mon Sep 17 00:00:00 2001 From: Yevhenii Havrylko Date: Wed, 3 Apr 2024 14:50:43 -0400 Subject: [PATCH 3/3] Remove local arrays in reduction remainder kernel --- .../kernel_templates/reduction_template.py | 24 +++++-------------- 1 file changed, 6 insertions(+), 18 deletions(-) diff --git a/numba_dpex/core/parfors/kernel_templates/reduction_template.py b/numba_dpex/core/parfors/kernel_templates/reduction_template.py index 96b8913106..34b637412f 100644 --- a/numba_dpex/core/parfors/kernel_templates/reduction_template.py +++ b/numba_dpex/core/parfors/kernel_templates/reduction_template.py @@ -278,10 +278,13 @@ def _generate_kernel_stub_as_string(self): ) for redvar in self._redvars: + rtyp = str(self._typemap[redvar]) legal_redvar = self._redvars_dict[redvar] gufunc_txt += " " gufunc_txt += legal_redvar + " = " - gufunc_txt += f"{self._parfor_reddict[redvar].init_val}\n" + gufunc_txt += ( + f"dpnp.{rtyp}({self._parfor_reddict[redvar].init_val})\n" + ) gufunc_txt += ( " " @@ -290,32 +293,17 @@ def _generate_kernel_stub_as_string(self): + f"{self._global_size_var_name[0]} + j\n" ) - for redvar in self._redvars: - rtyp = str(self._typemap[redvar]) - redvar = self._redvars_dict[redvar] - gufunc_txt += ( - " " - + f"local_sums_{redvar} = " - + f"dpex.local.array(1, dpnp.{rtyp})\n" - ) - gufunc_txt += " " + self._sentinel_name + " = 0\n" - for i, redvar in enumerate(self._redvars): - legal_redvar = self._redvars_dict[redvar] - gufunc_txt += ( - " " + f"local_sums_{legal_redvar}[0] = {legal_redvar}\n" - ) - for i, redvar in enumerate(self._redvars): legal_redvar = self._redvars_dict[redvar] redop = self._parfor_reddict[redvar].redop if redop == operator.iadd: gufunc_txt += f" {self._final_sum_var_name[i]}[0] += \ - local_sums_{legal_redvar}[0]\n" + {legal_redvar}\n" elif redop == operator.imul: gufunc_txt += f" {self._final_sum_var_name[i]}[0] *= \ - local_sums_{legal_redvar}[0]\n" + {legal_redvar}\n" else: raise NotImplementedError