diff --git a/.github/workflows/conda-package.yml b/.github/workflows/conda-package.yml index 3cea2d65e5..1c6c6ab96c 100644 --- a/.github/workflows/conda-package.yml +++ b/.github/workflows/conda-package.yml @@ -91,7 +91,7 @@ jobs: strategy: matrix: python: ["3.8", "3.9", "3.10"] - numba: ["0.56"] + numba: ["0.57"] dpnp: ["0.11"] steps: diff --git a/conda-recipe/meta.yaml b/conda-recipe/meta.yaml index 431b1739d7..8ac7f1b530 100644 --- a/conda-recipe/meta.yaml +++ b/conda-recipe/meta.yaml @@ -18,14 +18,14 @@ requirements: - python - setuptools >=63.* - cython - - numba 0.56* + - numba 0.57* - dpctl >=0.14* - dpnp >=0.11* - dpcpp-llvm-spirv - wheel run: - python - - numba >=0.56* + - numba >=0.57* - dpctl >=0.14* - spirv-tools - dpcpp-llvm-spirv diff --git a/environment/coverage.yml b/environment/coverage.yml index 9404befc45..6516932f4d 100644 --- a/environment/coverage.yml +++ b/environment/coverage.yml @@ -12,7 +12,7 @@ dependencies: - gxx_linux-64 - dpcpp_linux-64 - cython - - numba 0.56* + - numba 0.57* - dppy/label/dev:dpctl - dppy/label/dev:dpnp - spirv-tools diff --git a/environment/docs.yml b/environment/docs.yml index 6a1489d337..62a2df0b0c 100644 --- a/environment/docs.yml +++ b/environment/docs.yml @@ -11,7 +11,7 @@ dependencies: - gxx_linux-64 - dpcpp_linux-64 - cython - - numba 0.56* + - numba 0.57* - dpctl 0.14* - dpnp >=0.10.2 - spirv-tools diff --git a/numba_dpex/__init__.py b/numba_dpex/__init__.py index 9ee692931d..b224744cb9 100644 --- a/numba_dpex/__init__.py +++ b/numba_dpex/__init__.py @@ -14,18 +14,23 @@ import dpctl import llvmlite.binding as ll -import numba -from numba.core import ir_utils +from numba import __version__ as numba_version from numba.np import arrayobj -from numba.np.ufunc import array_exprs from numba.np.ufunc.decorators import Vectorize -from numba_dpex._patches import _empty_nd_impl, _is_ufunc, _mk_alloc +from numba_dpex._patches import _empty_nd_impl from numba_dpex.vectorizers import Vectorize as DpexVectorize +from 
.numba_patches import ( + patch_arrayexpr_tree_to_ir, + patch_is_ufunc, + patch_mk_alloc, +) + # Monkey patches -array_exprs._is_ufunc = _is_ufunc -ir_utils.mk_alloc = _mk_alloc +patch_is_ufunc.patch() +patch_mk_alloc.patch() +patch_arrayexpr_tree_to_ir.patch() arrayobj._empty_nd_impl = _empty_nd_impl @@ -58,8 +63,6 @@ def load_dpctl_sycl_interface(): else: raise ImportError - Vectorize.target_registry.ondemand["dpex"] = lambda: DpexVectorize - def parse_sem_version(version_string: str) -> Tuple[int, int, int]: """Parse sem version into tuple of three integers. If there is a suffix like @@ -76,19 +79,19 @@ def parse_sem_version(version_string: str) -> Tuple[int, int, int]: ) -numba_version = parse_sem_version(numba.__version__) -if numba_version < (0, 56, 4): +numba_sem_version = parse_sem_version(numba_version) +if numba_sem_version < (0, 57, 0): logging.warning( - "numba_dpex needs numba 0.56.4, using " + "numba_dpex needs numba 0.57.0, using " f"numba={numba_version} may cause unexpected behavior" ) -dpctl_version = tuple(map(int, dpctl.__version__.split(".")[:2])) -if dpctl_version < (0, 14): +dpctl_sem_version = parse_sem_version(dpctl.__version__) +if dpctl_sem_version < (0, 14): logging.warning( "numba_dpex needs dpctl 0.14 or greater, using " - f"dpctl={dpctl_version} may cause unexpected behavior" + f"dpctl={dpctl_sem_version} may cause unexpected behavior" ) from numba import prange # noqa E402 @@ -139,6 +142,8 @@ def parse_sem_version(version_string: str) -> Tuple[int, int, int]: else: raise ImportError("No non-host SYCL device found to execute kernels.") +Vectorize.target_registry.ondemand["dpex"] = lambda: DpexVectorize + from numba_dpex._version import get_versions # noqa E402 __version__ = get_versions()["version"] diff --git a/numba_dpex/_patches.py b/numba_dpex/_patches.py index cb996a1a9f..cbf464af03 100644 --- a/numba_dpex/_patches.py +++ b/numba_dpex/_patches.py @@ -2,17 +2,11 @@ # # SPDX-License-Identifier: Apache-2.0 -import numpy from 
llvmlite import ir as llvmir from llvmlite.ir import Constant from numba.core import cgutils from numba.core import config as numba_config -from numba.core import ir, types -from numba.core.ir_utils import ( - convert_size_to_var, - get_np_ufunc_typ, - mk_unique_var, -) +from numba.core import types from numba.core.typing import signature from numba.extending import intrinsic, overload_classmethod from numba.np.arrayobj import ( @@ -21,156 +15,10 @@ make_array, populate_array, ) -from numba.np.ufunc.dufunc import DUFunc from numba_dpex.core.runtime import context as dpexrt from numba_dpex.core.types import DpnpNdArray -# Numpy array constructors - - -def _is_ufunc(func): - return isinstance(func, (numpy.ufunc, DUFunc)) or hasattr( - func, "is_dpnp_ufunc" - ) - - -def _mk_alloc( - typingctx, typemap, calltypes, lhs, size_var, dtype, scope, loc, lhs_typ -): - """generate an array allocation with np.empty() and return list of nodes. - size_var can be an int variable or tuple of int variables. - lhs_typ is the type of the array being allocated. 
- """ - out = [] - ndims = 1 - size_typ = types.intp - if isinstance(size_var, tuple): - if len(size_var) == 1: - size_var = size_var[0] - size_var = convert_size_to_var(size_var, typemap, scope, loc, out) - else: - # tuple_var = build_tuple([size_var...]) - ndims = len(size_var) - tuple_var = ir.Var(scope, mk_unique_var("$tuple_var"), loc) - if typemap: - typemap[tuple_var.name] = types.containers.UniTuple( - types.intp, ndims - ) - # constant sizes need to be assigned to vars - new_sizes = [ - convert_size_to_var(s, typemap, scope, loc, out) - for s in size_var - ] - tuple_call = ir.Expr.build_tuple(new_sizes, loc) - tuple_assign = ir.Assign(tuple_call, tuple_var, loc) - out.append(tuple_assign) - size_var = tuple_var - size_typ = types.containers.UniTuple(types.intp, ndims) - - if hasattr(lhs_typ, "__allocate__"): - return lhs_typ.__allocate__( - typingctx, - typemap, - calltypes, - lhs, - size_var, - dtype, - scope, - loc, - lhs_typ, - size_typ, - out, - ) - - # g_np_var = Global(numpy) - g_np_var = ir.Var(scope, mk_unique_var("$np_g_var"), loc) - if typemap: - typemap[g_np_var.name] = types.misc.Module(numpy) - g_np = ir.Global("np", numpy, loc) - g_np_assign = ir.Assign(g_np, g_np_var, loc) - # attr call: empty_attr = getattr(g_np_var, empty) - empty_attr_call = ir.Expr.getattr(g_np_var, "empty", loc) - attr_var = ir.Var(scope, mk_unique_var("$empty_attr_attr"), loc) - if typemap: - typemap[attr_var.name] = get_np_ufunc_typ(numpy.empty) - attr_assign = ir.Assign(empty_attr_call, attr_var, loc) - # Assume str(dtype) returns a valid type - dtype_str = str(dtype) - # alloc call: lhs = empty_attr(size_var, typ_var) - typ_var = ir.Var(scope, mk_unique_var("$np_typ_var"), loc) - if typemap: - typemap[typ_var.name] = types.functions.NumberClass(dtype) - # If dtype is a datetime/timedelta with a unit, - # then it won't return a valid type and instead can be created - # with a string. i.e. 
"datetime64[ns]") - if ( - isinstance(dtype, (types.NPDatetime, types.NPTimedelta)) - and dtype.unit != "" - ): - typename_const = ir.Const(dtype_str, loc) - typ_var_assign = ir.Assign(typename_const, typ_var, loc) - else: - if dtype_str == "bool": - # empty doesn't like 'bool' sometimes (e.g. kmeans example) - dtype_str = "bool_" - np_typ_getattr = ir.Expr.getattr(g_np_var, dtype_str, loc) - typ_var_assign = ir.Assign(np_typ_getattr, typ_var, loc) - alloc_call = ir.Expr.call(attr_var, [size_var, typ_var], (), loc) - - if calltypes: - cac = typemap[attr_var.name].get_call_type( - typingctx, [size_typ, types.functions.NumberClass(dtype)], {} - ) - # By default, all calls to "empty" are typed as returning a standard - # NumPy ndarray. If we are allocating a ndarray subclass here then - # just change the return type to be that of the subclass. - cac._return_type = ( - lhs_typ.copy(layout="C") if lhs_typ.layout == "F" else lhs_typ - ) - calltypes[alloc_call] = cac - if lhs_typ.layout == "F": - empty_c_typ = lhs_typ.copy(layout="C") - empty_c_var = ir.Var(scope, mk_unique_var("$empty_c_var"), loc) - if typemap: - typemap[empty_c_var.name] = lhs_typ.copy(layout="C") - empty_c_assign = ir.Assign(alloc_call, empty_c_var, loc) - - # attr call: asfortranarray = getattr(g_np_var, asfortranarray) - asfortranarray_attr_call = ir.Expr.getattr( - g_np_var, "asfortranarray", loc - ) - afa_attr_var = ir.Var( - scope, mk_unique_var("$asfortran_array_attr"), loc - ) - if typemap: - typemap[afa_attr_var.name] = get_np_ufunc_typ(numpy.asfortranarray) - afa_attr_assign = ir.Assign(asfortranarray_attr_call, afa_attr_var, loc) - # call asfortranarray - asfortranarray_call = ir.Expr.call(afa_attr_var, [empty_c_var], (), loc) - if calltypes: - calltypes[asfortranarray_call] = typemap[ - afa_attr_var.name - ].get_call_type(typingctx, [empty_c_typ], {}) - - asfortranarray_assign = ir.Assign(asfortranarray_call, lhs, loc) - - out.extend( - [ - g_np_assign, - attr_assign, - typ_var_assign, - 
empty_c_assign, - afa_attr_assign, - asfortranarray_assign, - ] - ) - else: - alloc_assign = ir.Assign(alloc_call, lhs, loc) - out.extend([g_np_assign, attr_assign, typ_var_assign, alloc_assign]) - - return out - def _empty_nd_impl(context, builder, arrtype, shapes): """Utility function used for allocating a new array during LLVM code diff --git a/numba_dpex/config.py b/numba_dpex/config.py index b6bb16f8c3..609fbc2cc7 100644 --- a/numba_dpex/config.py +++ b/numba_dpex/config.py @@ -53,8 +53,6 @@ def __getattr__(name): # Dump offload diagnostics OFFLOAD_DIAGNOSTICS = _readenv("NUMBA_DPEX_OFFLOAD_DIAGNOSTICS", int, 0) -FALLBACK_ON_CPU = _readenv("NUMBA_DPEX_FALLBACK_ON_CPU", int, 1) - # Activate Native floating point atomcis support for supported devices. # Requires llvm-spirv supporting the FP atomics extension NATIVE_FP_ATOMICS = _readenv("NUMBA_DPEX_ACTIVATE_ATOMICS_FP_NATIVE", int, 0) diff --git a/numba_dpex/core/parfors/__init__.py b/numba_dpex/core/parfors/__init__.py new file mode 100644 index 0000000000..00fec5f515 --- /dev/null +++ b/numba_dpex/core/parfors/__init__.py @@ -0,0 +1,3 @@ +# SPDX-FileCopyrightText: 2020 - 2023 Intel Corporation +# +# SPDX-License-Identifier: Apache-2.0 diff --git a/numba_dpex/core/utils/kernel_builder.py b/numba_dpex/core/parfors/kernel_builder.py similarity index 98% rename from numba_dpex/core/utils/kernel_builder.py rename to numba_dpex/core/parfors/kernel_builder.py index 20c9cdeaba..01c2bd4c0a 100644 --- a/numba_dpex/core/utils/kernel_builder.py +++ b/numba_dpex/core/parfors/kernel_builder.py @@ -21,14 +21,14 @@ replace_var_names, ) from numba.core.typing import signature +from numba.parfors import parfor import numba_dpex as dpex from numba_dpex import config from ..descriptor import dpex_kernel_target -from ..passes import parfor from ..types.dpnp_ndarray_type import DpnpNdArray -from .kernel_templates import RangeKernelTemplate +from ..utils.kernel_templates import RangeKernelTemplate class ParforKernel: @@ -338,7 
+338,7 @@ def create_kernel_for_parfor( print("legal parfor_params = ", parfor_params, type(parfor_params)) # Determine the unique names of the kernel functions. - kernel_name = "__numba_parfor_kernel_%s" % (parfor_node.id) + kernel_name = "__dpex_parfor_kernel_%s" % (parfor_node.id) kernel_template = RangeKernelTemplate( kernel_name=kernel_name, @@ -493,8 +493,7 @@ def update_sentinel(kernel_ir, sentinel_name, kernel_body, new_label): # parfor body. prev_block.append(ir.Jump(body_first_label, loc)) - # Add all the parfor loop body blocks to the gufunc function's - # IR. + # Add all the parfor loop body blocks to the kernel IR for loop, b in kernel_body.items(): kernel_ir.blocks[loop] = copy.copy(b) kernel_ir.blocks[loop].body = copy.copy( diff --git a/numba_dpex/core/passes/parfor_lowering_pass.py b/numba_dpex/core/parfors/parfor_lowerer.py similarity index 72% rename from numba_dpex/core/passes/parfor_lowering_pass.py rename to numba_dpex/core/parfors/parfor_lowerer.py index 8f64a0d05d..7b0c053e98 100644 --- a/numba_dpex/core/passes/parfor_lowering_pass.py +++ b/numba_dpex/core/parfors/parfor_lowerer.py @@ -4,38 +4,71 @@ import copy -from numba.core import funcdesc, ir, types -from numba.core.compiler_machinery import LoweringPass, register_pass -from numba.core.lowering import Lower -from numba.parfors.parfor_lowering import ( - _lower_parfor_parallel as _lower_parfor_parallel_std, +from llvmlite import ir as llvmir +from numba.core import cgutils, ir, types +from numba.parfors.parfor import ( + find_potential_aliases_parfor, + get_parfor_outputs, ) from numba_dpex import config from numba_dpex.core.utils.kernel_launcher import KernelLaunchIRBuilder -from numba_dpex.core.utils.reduction_helper import ( +from numba_dpex.core.parfors.reduction_helper import ( ReductionHelper, ReductionKernelVariables, ) from ..exceptions import UnsupportedParforError from ..types.dpnp_ndarray_type import DpnpNdArray -from ..utils.kernel_builder import create_kernel_for_parfor 
-from ..utils.reduction_kernel_builder import ( +from .kernel_builder import create_kernel_for_parfor +from .reduction_kernel_builder import ( create_reduction_main_kernel_for_parfor, create_reduction_remainder_kernel_for_parfor, ) -from .parfor import Parfor, find_potential_aliases_parfor, get_parfor_outputs # A global list of kernels to keep the objects alive indefinitely. keep_alive_kernels = [] -def _getvar_or_none(lowerer, x): - try: - return lowerer.getvar(x) - except: - return None +def _getvar(lowerer, x): + """Returns the LLVM Value corresponding to a Numba IR variable. + + Depending on whether Numba's sroa-like optimization is enabled, the + LLVM value for a Numba IR variable is found in either the ``varmap`` + or the ``blk_local_varmap`` of the ``lowerer``. If the LLVM Value is not a + pointer, e.g., in case of function args with sroa optimization enabled, then + this function creates an alloca and stores the Value into the new alloca Value and returns + it. The extra alloca is needed as all inputs to a kernel function need to + be passed by reference and not value. + + Args: + lowerer: The Numba Lower instance used to lower the function. + x: Numba IR variable name used to lookup the corresponding + LLVM Value. + + Raises: + AssertionError: If the LLVM Value for ``x`` does not exist in either + the ``varmap`` or the ``blk_local_varmap``. 
+ + Returns: An LLVM Value object + + """ + var_val = None + if x in lowerer._blk_local_varmap: + var_val = lowerer._blk_local_varmap[x] + elif x in lowerer.varmap: + var_val = lowerer.varmap[x] + + if var_val: + if not isinstance(var_val.type, llvmir.PointerType): + with lowerer.builder.goto_entry_block(): + var_val_ptr = lowerer.builder.alloca(var_val.type) + lowerer.builder.store(var_val, var_val_ptr) + return var_val_ptr + else: + return var_val + else: + raise AssertionError("No llvm Value found for kernel arg") def _load_range(lowerer, value): @@ -95,14 +128,11 @@ def _build_kernel_arglist(self, kernel_fn, lowerer): self.args_ty_list = self.kernel_builder.allocate_kernel_arg_ty_array( num_flattened_args ) - # Populate the args_list and the args_ty_list LLVM arrays self.kernel_arg_num = 0 for arg_num, arg in enumerate(kernel_fn.kernel_args): argtype = kernel_fn.kernel_arg_types[arg_num] - llvm_val = _getvar_or_none(lowerer, arg) - if not llvm_val: - raise AssertionError + llvm_val = _getvar(lowerer, arg) if isinstance(argtype, DpnpNdArray): self.kernel_builder.build_array_arg( array_val=llvm_val, @@ -476,14 +506,18 @@ class ParforLowerFactory: """A pseudo-factory class that maps a device filter string to a lowering function. - Each parfor can have a "lowerer" attribute that determines how the parfor - node is to be lowered to LLVM IR. The factory class maintains a static map - that for every device type (filter string) encountered so far to a lowerer - function for that device type. At this point numba-dpex does not generate - device-specific code and there lowerer is always same for all devices. - By generating different instances we make sure prfors that will execute on - distinct devices as determined by compute-follows-data programming model are - never fused together. + Each Parfor instruction can have an optional "lowerer" attribute. The + lowerer attribute determines how the parfor instruction should be lowered + to LLVM IR. 
In addition, the lowerer attribute decides which parfor + instructions can be fused together. + + The factory class maintains a dictionary mapping every device + type (filter string) encountered so far to a lowerer function for that + device type. At this point numba-dpex does not generate device-specific code + and the lowerer used is the same for all device types. However, as a different + ParforLowerImpl instance is returned for every parfor instruction that has + a distinct compute-follows-data inferred device, it prevents illegal + parfor fusion. """ device_to_lowerer_map = {} @@ -497,143 +531,3 @@ def get_lowerer(cls, device): ParforLowerFactory.device_to_lowerer_map[device] = lowerer return lowerer - - -class WrapperDefaultLower(Lower): - @property - def _disable_sroa_like_opt(self): - """We always return True.""" - return True - - -def lower_parfor_dpex(lowerer, parfor): - parfor.lowerer = ParforLowerImpl()._lower_parfor_as_kernel - if parfor.lowerer is None: - _lower_parfor_parallel_std(lowerer, parfor) - else: - parfor.lowerer(lowerer, parfor) - - -class _ParforLower(Lower): - """Extends standard lowering to accommodate parfor.Parfor nodes that may - have the ``lowerer`` attribute set. 
- """ - - def __init__(self, context, library, fndesc, func_ir, metadata=None): - Lower.__init__(self, context, library, fndesc, func_ir, metadata) - self.dpex_lower = self._lower( - context, library, fndesc, func_ir, metadata - ) - - def _lower(self, context, library, fndesc, func_ir, metadata): - """Create Lower with changed linkageName in debug info""" - lower = WrapperDefaultLower(context, library, fndesc, func_ir, metadata) - - # Debuginfo - if context.enable_debuginfo: - from numba.core.funcdesc import default_mangler, qualifying_prefix - - from numba_dpex.debuginfo import DpexDIBuilder - - qualprefix = qualifying_prefix(fndesc.modname, fndesc.qualname) - mangled_qualname = default_mangler(qualprefix, fndesc.argtypes) - - lower.debuginfo = DpexDIBuilder( - module=lower.module, - filepath=func_ir.loc.filename, - linkage_name=mangled_qualname, - cgctx=context, - ) - - return lower - - def lower(self): - context = self.dpex_lower.context - - # Only Numba's CPUContext has the `lower_extension` attribute - context.lower_extensions[Parfor] = lower_parfor_dpex - self.dpex_lower.lower() - self.base_lower = self.dpex_lower - - self.env = self.base_lower.env - self.call_helper = self.base_lower.call_helper - - def create_cpython_wrapper(self, release_gil=False): - return self.base_lower.create_cpython_wrapper(release_gil) - - -@register_pass(mutates_CFG=True, analysis_only=False) -class ParforLoweringPass(LoweringPass): - """A custom lowering pass that does dpex-specific lowering of parfor - nodes. - - FIXME: Redesign once numba-dpex supports Numba 0.57 - """ - - _name = "dpjit_lowering" - - def __init__(self): - LoweringPass.__init__(self) - - def run_pass(self, state): - if state.library is None: - codegen = state.targetctx.codegen() - state.library = codegen.create_library(state.func_id.func_qualname) - # Enable object caching upfront, so that the library can - # be later serialized. 
- state.library.enable_object_caching() - - targetctx = state.targetctx - - library = state.library - interp = state.func_ir - typemap = state.typemap - restype = state.return_type - calltypes = state.calltypes - flags = state.flags - metadata = state.metadata - - kwargs = {} - - # for support numba 0.54 and <=0.55.0dev0=*_469 - if hasattr(flags, "get_mangle_string"): - kwargs["abi_tags"] = flags.get_mangle_string() - # Lowering - fndesc = funcdesc.PythonFunctionDescriptor.from_specialized_function( - interp, - typemap, - restype, - calltypes, - mangler=targetctx.mangler, - inline=flags.forceinline, - noalias=flags.noalias, - **kwargs, - ) - - with targetctx.push_code_library(library): - lower = _ParforLower( - targetctx, library, fndesc, interp, metadata=metadata - ) - lower.lower() - if not flags.no_cpython_wrapper: - lower.create_cpython_wrapper(flags.release_gil) - - env = lower.env - call_helper = lower.call_helper - del lower - - from numba.core.compiler import _LowerResult # TODO: move this - - if flags.no_compile: - state["cr"] = _LowerResult(fndesc, call_helper, cfunc=None, env=env) - else: - # Prepare for execution - cfunc = targetctx.get_executable(library, fndesc, env) - # Insert native function for use by other jitted-functions. - # We also register its library to allow for inlining. 
- targetctx.insert_user_function(cfunc, fndesc, [library]) - state["cr"] = _LowerResult( - fndesc, call_helper, cfunc=cfunc, env=env - ) - - return True diff --git a/numba_dpex/core/utils/reduction_helper.py b/numba_dpex/core/parfors/reduction_helper.py similarity index 99% rename from numba_dpex/core/utils/reduction_helper.py rename to numba_dpex/core/parfors/reduction_helper.py index dfa038c71c..cb8471ee19 100644 --- a/numba_dpex/core/utils/reduction_helper.py +++ b/numba_dpex/core/parfors/reduction_helper.py @@ -13,13 +13,13 @@ remove_dels, replace_var_names, ) +from numba.parfors import parfor from numba.parfors.parfor_lowering_utils import ParforLoweringBuilder from numba_dpex import utils from numba_dpex.core.utils.kernel_launcher import KernelLaunchIRBuilder from numba_dpex.dpctl_iface import DpctlCAPIFnBuilder -from ..passes import parfor from ..types.dpnp_ndarray_type import DpnpNdArray @@ -296,7 +296,7 @@ def __init__( param_types = [_to_scalar_from_0d(typemap[v]) for v in parfor_params] - # Calculate types of args passed to gufunc. + # Calculate types of args passed to the kernel function. func_arg_types = [typemap[v] for v in (parfor_inputs + parfor_outputs)] # Replace illegal parameter names in the loop body with legal ones. 
diff --git a/numba_dpex/core/utils/reduction_kernel_builder.py b/numba_dpex/core/parfors/reduction_kernel_builder.py similarity index 94% rename from numba_dpex/core/utils/reduction_kernel_builder.py rename to numba_dpex/core/parfors/reduction_kernel_builder.py index 0571aa0ede..3e2d7d0f51 100644 --- a/numba_dpex/core/utils/reduction_kernel_builder.py +++ b/numba_dpex/core/parfors/reduction_kernel_builder.py @@ -18,6 +18,10 @@ ) from numba.core.typing import signature +from ..utils.kernel_templates.reduction_template import ( + RemainderReduceIntermediateKernelTemplate, + TreeReduceIntermediateKernelTemplate, +) from .kernel_builder import _print_body # saved for debug from .kernel_builder import ( ParforKernel, @@ -25,10 +29,6 @@ _to_scalar_from_0d, update_sentinel, ) -from .kernel_templates.reduction_template import ( - RemainderReduceIntermediateKernelTemplate, - TreeReduceIntermediateKernelTemplate, -) def create_reduction_main_kernel_for_parfor( @@ -57,8 +57,8 @@ def create_reduction_main_kernel_for_parfor( loop_body_var_table = get_name_var_table(reductionKernelVar.loop_body) sentinel_name = get_unused_var_name("__sentinel__", loop_body_var_table) - # Determine the unique names of the scheduling and gufunc functions. - kernel_name = "__numba_parfor_gufunc_%s" % (parfor_node.id) + # Determine the unique names of the scheduling and kernel functions. + kernel_name = "__dpex_reduction_parfor_%s" % (parfor_node.id) # swap s.2 (redvar) with partial_sum for i, name in enumerate(reductionKernelVar.parfor_params): @@ -112,11 +112,11 @@ def create_reduction_main_kernel_for_parfor( replace_var_names(kernel_ir.blocks, new_var_dict) kernel_param_types = reductionKernelVar.param_types - gufunc_stub_last_label = max(kernel_ir.blocks.keys()) + 1 - # Add gufunc stub last label to each parfor.loop_body label to prevent + kernel_stub_last_label = max(kernel_ir.blocks.keys()) + 1 + # Add kernel stub last label to each parfor.loop_body label to prevent # label conflicts. 
loop_body = add_offset_to_labels( - reductionKernelVar.loop_body, gufunc_stub_last_label + reductionKernelVar.loop_body, kernel_stub_last_label ) # new label for splitting sentinel block new_label = max(loop_body.keys()) + 1 @@ -205,7 +205,7 @@ def create_reduction_remainder_kernel_for_parfor( name = reductionHelper.final_sum_var.name final_sum_var_name.append(name) - kernel_name = "__numba_parfor_gufunc_%s_sum2" % (parfor_node.id) + kernel_name = "__dpex_reduction_parfor_%s_remainder" % (parfor_node.id) partial_sum_var_dict = legalize_names(partial_sum_var_name) global_size_var_dict = legalize_names(global_size_var_name) @@ -313,12 +313,12 @@ kernel_param_types = reductionKernelVar.param_types - gufunc_stub_last_label = max(kernel_ir.blocks.keys()) + 1 + kernel_stub_last_label = max(kernel_ir.blocks.keys()) + 1 - # Add gufunc stub last label to each parfor.loop_body label to prevent + # Add kernel stub last label to each parfor.loop_body label to prevent # label conflicts. 
loop_body = add_offset_to_labels( - reductionKernelVar.loop_body, gufunc_stub_last_label + reductionKernelVar.loop_body, kernel_stub_last_label ) # new label for splitting sentinel block new_label = max(loop_body.keys()) + 1 diff --git a/numba_dpex/core/passes/__init__.py b/numba_dpex/core/passes/__init__.py index 5d50252a3b..d090fb56f5 100644 --- a/numba_dpex/core/passes/__init__.py +++ b/numba_dpex/core/passes/__init__.py @@ -3,25 +3,10 @@ # SPDX-License-Identifier: Apache-2.0 from .parfor_legalize_cfd_pass import ParforLegalizeCFDPass -from .parfor_lowering_pass import ParforLoweringPass -from .passes import ( - DumpParforDiagnostics, - NoPythonBackend, - ParforFusionPass, - ParforPass, - ParforPreLoweringPass, - PreParforPass, - SplitParforPass, -) +from .passes import DumpParforDiagnostics, NoPythonBackend __all__ = [ "DumpParforDiagnostics", - "ParforLoweringPass", "ParforLegalizeCFDPass", - "ParforFusionPass", - "ParforPreLoweringPass", - "ParforPass", - "PreParforPass", - "SplitParforPass", "NoPythonBackend", ] diff --git a/numba_dpex/core/passes/lowerer.py b/numba_dpex/core/passes/lowerer.py deleted file mode 100644 index f476065f40..0000000000 --- a/numba_dpex/core/passes/lowerer.py +++ /dev/null @@ -1,1470 +0,0 @@ -# SPDX-FileCopyrightText: 2020 - 2023 Intel Corporation -# -# SPDX-License-Identifier: Apache-2.0 - -import ast -import copy -import linecache -import os -import sys -import warnings -from collections import OrderedDict - -import dpctl -import dpctl.program as dpctl_prog -import numba -import numpy as np -from numba.core import compiler, funcdesc, ir, lowering, sigutils, types -from numba.core.errors import ( - NumbaParallelSafetyWarning, - NumbaPerformanceWarning, -) -from numba.core.ir_utils import ( - add_offset_to_labels, - build_definitions, - find_callname, - find_potential_aliases, - get_call_table, - get_name_var_table, - get_np_ufunc_typ, - get_unused_var_name, - guard, - is_const_call, - is_pure, - legalize_names, - mk_unique_var, - 
remove_dead, - remove_dels, - rename_labels, - replace_var_names, - visit_vars_inner, -) -from numba.core.typing import signature -from numba.parfors import parfor -from numba.parfors.parfor_lowering import _lower_parfor_parallel - -import numba_dpex as dpex -from numba_dpex import config -from numba_dpex.core.descriptor import dpex_kernel_target -from numba_dpex.core.targets.kernel_target import DpexKernelTargetContext -from numba_dpex.core.types import Array -from numba_dpex.dpctl_iface import KernelLaunchOps -from numba_dpex.utils import address_space, npytypes_array_to_dpex_array - -from .dufunc_inliner import dufunc_inliner - - -def _compile_kernel_parfor( - sycl_queue, kernel_name, func_ir, args, args_with_addrspaces, debug=False -): - # We only accept numba_dpex.core.types.Array type - for arg in args_with_addrspaces: - if isinstance(arg, types.npytypes.Array) and not isinstance(arg, Array): - raise TypeError( - "Only numba_dpex.core.types.Array objects are supported as " - + "kernel arguments. Received %s" % (type(arg)) - ) - if config.DEBUG: - print("compile_kernel_parfor", args) - for a in args_with_addrspaces: - print(a, type(a)) - if isinstance(a, types.npytypes.Array): - print("addrspace:", a.addrspace) - - # Create a SPIRVKernel object - kernel = dpex.core.kernel_interface.spirv_kernel.SpirvKernel( - func_ir, kernel_name - ) - - # compile the kernel - kernel.compile( - args=args_with_addrspaces, - typing_ctx=dpex_kernel_target.typing_context, - target_ctx=dpex_kernel_target.target_context, - debug=debug, - compile_flags=None, - ) - - # Compile a SYCL Kernel object rom the SPIRVKernel - - dpctl_create_program_from_spirv_flags = [] - - if debug or config.OPT == 0: - # if debug is ON we need to pass additional flags to igc. 
- dpctl_create_program_from_spirv_flags = ["-g", "-cl-opt-disable"] - - # create a program - kernel_bundle = dpctl_prog.create_program_from_spirv( - sycl_queue, - kernel.device_driver_ir_module, - " ".join(dpctl_create_program_from_spirv_flags), - ) - # create a kernel - sycl_kernel = kernel_bundle.get_sycl_kernel(kernel.module_name) - - return sycl_kernel - - -def _print_block(block): - for i, inst in enumerate(block.body): - print(" ", i, inst) - - -def _print_body(body_dict): - """Pretty-print a set of IR blocks.""" - for label, block in body_dict.items(): - print("label: ", label) - _print_block(block) - - -# This loop scheduler is pretty basic, there is only -# 3 dimension allowed in OpenCL, so to make the backend -# functional we will schedule the first 3 dimensions -# through OpenCL and generate for loops for the remaining -# dimensions -def _schedule_loop(parfor_dim, legal_loop_indices, loop_ranges, param_dict): - gufunc_txt = "" - global_id_dim = 0 - for_loop_dim = parfor_dim - - if parfor_dim > 3: - global_id_dim = 3 - else: - global_id_dim = parfor_dim - - for eachdim in range(global_id_dim): - gufunc_txt += ( - " " - + legal_loop_indices[eachdim] - + " = " - + "dpex.get_global_id(" - + str(eachdim) - + ")\n" - ) - - for eachdim in range(global_id_dim, for_loop_dim): - for indent in range(1 + (eachdim - global_id_dim)): - gufunc_txt += " " - - start, stop, step = loop_ranges[eachdim] - start = param_dict.get(str(start), start) - stop = param_dict.get(str(stop), stop) - gufunc_txt += ( - "for " - + legal_loop_indices[eachdim] - + " in range(" - + str(start) - + ", " - + str(stop) - + " + 1):\n" - ) - - for eachdim in range(global_id_dim, for_loop_dim): - for indent in range(1 + (eachdim - global_id_dim)): - gufunc_txt += " " - - return gufunc_txt - - -def _dbgprint_after_each_array_assignments(lowerer, loop_body, typemap): - for label, block in loop_body.items(): - new_block = block.copy() - new_block.clear() - loc = block.loc - scope = block.scope - for 
inst in block.body: - new_block.append(inst) - # Append print after assignment - if isinstance(inst, ir.Assign): - # Only apply to numbers - if typemap[inst.target.name] not in types.number_domain: - continue - - # Make constant string - strval = "{} =".format(inst.target.name) - strconsttyp = types.StringLiteral(strval) - - lhs = ir.Var(scope, mk_unique_var("str_const"), loc) - assign_lhs = ir.Assign( - value=ir.Const(value=strval, loc=loc), target=lhs, loc=loc - ) - typemap[lhs.name] = strconsttyp - new_block.append(assign_lhs) - - # Make print node - print_node = ir.Print( - args=[lhs, inst.target], vararg=None, loc=loc - ) - new_block.append(print_node) - sig = numba.typing.signature( - types.none, typemap[lhs.name], typemap[inst.target.name] - ) - lowerer.fndesc.calltypes[print_node] = sig - loop_body[label] = new_block - - -def replace_var_with_array_in_block(vars, block, typemap, calltypes): - new_block = [] - for inst in block.body: - if isinstance(inst, ir.Assign) and inst.target.name in vars: - const_node = ir.Const(0, inst.loc) - const_var = ir.Var( - inst.target.scope, mk_unique_var("$const_ind_0"), inst.loc - ) - typemap[const_var.name] = types.uintp - const_assign = ir.Assign(const_node, const_var, inst.loc) - new_block.append(const_assign) - - setitem_node = ir.SetItem( - inst.target, const_var, inst.value, inst.loc - ) - calltypes[setitem_node] = signature( - types.none, - types.npytypes.Array(typemap[inst.target.name], 1, "C"), - types.intp, - typemap[inst.target.name], - ) - new_block.append(setitem_node) - continue - elif isinstance(inst, parfor.Parfor): - replace_var_with_array_internal( - vars, {0: inst.init_block}, typemap, calltypes - ) - replace_var_with_array_internal( - vars, inst.loop_body, typemap, calltypes - ) - - new_block.append(inst) - return new_block - - -def replace_var_with_array_internal(vars, loop_body, typemap, calltypes): - for label, block in loop_body.items(): - block.body = replace_var_with_array_in_block( - vars, block, 
typemap, calltypes - ) - - -def replace_var_with_array(vars, loop_body, typemap, calltypes): - replace_var_with_array_internal(vars, loop_body, typemap, calltypes) - for v in vars: - el_typ = typemap[v] - typemap.pop(v, None) - typemap[v] = types.npytypes.Array(el_typ, 1, "C") - - -def wrap_loop_body(loop_body): - blocks = loop_body.copy() # shallow copy is enough - first_label = min(blocks.keys()) - last_label = max(blocks.keys()) - loc = blocks[last_label].loc - blocks[last_label].body.append(ir.Jump(first_label, loc)) - return blocks - - -def unwrap_loop_body(loop_body): - last_label = max(loop_body.keys()) - loop_body[last_label].body = loop_body[last_label].body[:-1] - - -def legalize_names_with_typemap(names, typemap): - """We use ir_utils.legalize_names to replace internal IR variable names - containing illegal characters (e.g. period) with a legal character - (underscore) so as to create legal variable names. - The original variable names are in the typemap so we also - need to add the legalized name to the typemap as well. - """ - outdict = legalize_names(names) - # For each pair in the dict of legalized names... - for x, y in outdict.items(): - # If the name had some legalization change to it... - if x != y: - # Set the type of the new name the same as the type of the old name. 
- typemap[y] = typemap[x] - return outdict - - -def to_scalar_from_0d(x): - if isinstance(x, types.ArrayCompatible): - if x.ndim == 0: - return x.dtype - return x - - -def find_setitems_block(setitems, block, typemap): - for inst in block.body: - if isinstance(inst, ir.StaticSetItem) or isinstance(inst, ir.SetItem): - setitems.add(inst.target.name) - elif isinstance(inst, parfor.Parfor): - find_setitems_block(setitems, inst.init_block, typemap) - find_setitems_body(setitems, inst.loop_body, typemap) - - -def find_setitems_body(setitems, loop_body, typemap): - """ - Find the arrays that are written into (goes into setitems) - """ - for label, block in loop_body.items(): - find_setitems_block(setitems, block, typemap) - - -def _create_gufunc_for_regular_parfor(): - # TODO - pass - - -def _create_gufunc_for_reduction_parfor(): - raise ValueError("Reductions are not yet supported via parfor") - - -def _create_gufunc_for_parfor_body( - lowerer, - parfor, - typemap, - flags, - loop_ranges, - has_aliases, - races, -): - """ - Takes a parfor and creates a gufunc function for its body. There - are two parts to this function: - - 1) Code to iterate across the iteration space as defined by - the schedule. - 2) The parfor body that does the work for a single point in - the iteration space. - - Part 1 is created as Python text for simplicity with a sentinel - assignment to mark the point in the IR where the parfor body - should be added. This Python text is 'exec'ed into existence and its - IR retrieved with run_frontend. The IR is scanned for the sentinel - assignment where that basic block is split and the IR for the parfor - body inserted. - """ - - loc = parfor.init_block.loc - - # The parfor body and the main function body share ir.Var nodes. - # We have to do some replacements of Var names in the parfor body - # to make them legal parameter names. If we don't copy then the - # Vars in the main function also would incorrectly change their name. 
- - loop_body = copy.copy(parfor.loop_body) - remove_dels(loop_body) - - parfor_dim = len(parfor.loop_nests) - loop_indices = [l.index_variable.name for l in parfor.loop_nests] - - # Get all the parfor params. - parfor_params = parfor.params - - for start, stop, step in loop_ranges: - if isinstance(start, ir.Var): - parfor_params.add(start.name) - if isinstance(stop, ir.Var): - parfor_params.add(stop.name) - - # Get just the outputs of the parfor. - parfor_outputs = numba.parfors.parfor.get_parfor_outputs( - parfor, parfor_params - ) - - # Get all parfor reduction vars, and operators. - typemap = lowerer.fndesc.typemap - - parfor_redvars, parfor_reddict = numba.parfors.parfor.get_parfor_reductions( - lowerer.func_ir, parfor, parfor_params, lowerer.fndesc.calltypes - ) - has_reduction = False if len(parfor_redvars) == 0 else True - - if has_reduction: - _create_gufunc_for_reduction_parfor() - - # Compute just the parfor inputs as a set difference. - parfor_inputs = sorted(list(set(parfor_params) - set(parfor_outputs))) - - for race in races: - msg = ( - "Variable %s used in parallel loop may be written " - "to simultaneously by multiple workers and may result " - "in non-deterministic or unintended results." % race - ) - warnings.warn(NumbaParallelSafetyWarning(msg, loc)) - replace_var_with_array(races, loop_body, typemap, lowerer.fndesc.calltypes) - - if config.DEBUG_ARRAY_OPT >= 1: - print("parfor_params = ", parfor_params, type(parfor_params)) - print("parfor_outputs = ", parfor_outputs, type(parfor_outputs)) - print("parfor_inputs = ", parfor_inputs, type(parfor_inputs)) - - # Reorder all the params so that inputs go first then outputs. 
- parfor_params = parfor_inputs + parfor_outputs - - def addrspace_from(params, def_addr): - addrspaces = [] - for p in params: - if isinstance(to_scalar_from_0d(typemap[p]), types.npytypes.Array): - addrspaces.append(def_addr) - else: - addrspaces.append(None) - return addrspaces - - addrspaces = addrspace_from(parfor_params, address_space.GLOBAL) - - if config.DEBUG_ARRAY_OPT >= 1: - print("parfor_params = ", parfor_params, type(parfor_params)) - print("loop_indices = ", loop_indices, type(loop_indices)) - print("loop_body = ", loop_body, type(loop_body)) - _print_body(loop_body) - - # Some Var are not legal parameter names so create a dict of - # potentially illegal param name to guaranteed legal name. - param_dict = legalize_names_with_typemap(parfor_params, typemap) - if config.DEBUG_ARRAY_OPT >= 1: - print("param_dict = ", sorted(param_dict.items()), type(param_dict)) - - # Some loop_indices are not legal parameter names so create a dict - # of potentially illegal loop index to guaranteed legal name. - ind_dict = legalize_names_with_typemap(loop_indices, typemap) - # Compute a new list of legal loop index names. - legal_loop_indices = [ind_dict[v] for v in loop_indices] - - if config.DEBUG_ARRAY_OPT >= 1: - print("ind_dict = ", sorted(ind_dict.items()), type(ind_dict)) - print( - "legal_loop_indices = ", - legal_loop_indices, - type(legal_loop_indices), - ) - - for pd in parfor_params: - print("pd = ", pd) - print("pd type = ", typemap[pd], type(typemap[pd])) - - # Get the types of each parameter. - param_types = [to_scalar_from_0d(typemap[v]) for v in parfor_params] - - param_types_addrspaces = copy.copy(param_types) - - # Calculate types of args passed to gufunc. - func_arg_types = [typemap[v] for v in (parfor_inputs + parfor_outputs)] - assert len(param_types_addrspaces) == len(addrspaces) - for i in range(len(param_types_addrspaces)): - if addrspaces[i] is not None: - # Convert numba.types.Array to numba_dpex.core.types.Array data - # type. 
Our Array type allows us to specify an address space for the - # data and other pointer arguments for the array. - param_types_addrspaces[i] = npytypes_array_to_dpex_array( - param_types_addrspaces[i], addrspaces[i] - ) - - def print_arg_with_addrspaces(args): - for a in args: - print(a, type(a)) - if isinstance(a, types.npytypes.Array): - print("addrspace:", a.addrspace) - - if config.DEBUG_ARRAY_OPT >= 1: - print_arg_with_addrspaces(param_types) - print("func_arg_types = ", func_arg_types, type(func_arg_types)) - - # Replace illegal parameter names in the loop body with legal ones. - replace_var_names(loop_body, param_dict) - # remember the name before legalizing as the actual arguments - parfor_args = parfor_params - # Change parfor_params to be legal names. - parfor_params = [param_dict[v] for v in parfor_params] - parfor_params_orig = parfor_params - - parfor_params = [] - ascontig = False - for pindex in range(len(parfor_params_orig)): - if ( - ascontig - and pindex < len(parfor_inputs) - and isinstance(param_types[pindex], types.npytypes.Array) - ): - parfor_params.append(parfor_params_orig[pindex] + "param") - else: - parfor_params.append(parfor_params_orig[pindex]) - - # Change parfor body to replace illegal loop index vars with legal ones. - replace_var_names(loop_body, ind_dict) - loop_body_var_table = get_name_var_table(loop_body) - sentinel_name = get_unused_var_name("__sentinel__", loop_body_var_table) - - if config.DEBUG_ARRAY_OPT >= 1: - print("legal parfor_params = ", parfor_params, type(parfor_params)) - - # Determine the unique names of the scheduling and gufunc functions. - gufunc_name = "__numba_parfor_gufunc_%s" % (parfor.id) - - if config.DEBUG_ARRAY_OPT: - # print("sched_func_name ", type(sched_func_name), sched_func_name) - print("gufunc_name ", type(gufunc_name), gufunc_name) - - gufunc_txt = "" - - # Create the gufunc function. 
- gufunc_txt += "def " + gufunc_name - gufunc_txt += "(" + (", ".join(parfor_params)) + "):\n" - - gufunc_txt += _schedule_loop( - parfor_dim, legal_loop_indices, loop_ranges, param_dict - ) - - # Add the sentinel assignment so that we can find the loop body position - # in the IR. - gufunc_txt += " " - gufunc_txt += sentinel_name + " = 0\n" - - # gufunc returns nothing - gufunc_txt += " return None\n" - - if config.DEBUG_ARRAY_OPT: - print("gufunc_txt = ", type(gufunc_txt), "\n", gufunc_txt) - sys.stdout.flush() - # Force gufunc outline into existence. - globls = {"np": np, "numba": numba, "dpex": dpex} - locls = {} - exec(gufunc_txt, globls, locls) - gufunc_func = locls[gufunc_name] - - if config.DEBUG_ARRAY_OPT: - print("gufunc_func = ", type(gufunc_func), "\n", gufunc_func) - # Get the IR for the gufunc outline. - gufunc_ir = compiler.run_frontend(gufunc_func) - - if config.DEBUG_ARRAY_OPT: - print("gufunc_ir dump ", type(gufunc_ir)) - gufunc_ir.dump() - print("loop_body dump ", type(loop_body)) - _print_body(loop_body) - - # rename all variables in gufunc_ir afresh - var_table = get_name_var_table(gufunc_ir.blocks) - new_var_dict = {} - reserved_names = ( - [sentinel_name] + list(param_dict.values()) + legal_loop_indices - ) - for name, var in var_table.items(): - if not (name in reserved_names): - new_var_dict[name] = mk_unique_var(name) - replace_var_names(gufunc_ir.blocks, new_var_dict) - if config.DEBUG_ARRAY_OPT: - print("gufunc_ir dump after renaming ") - gufunc_ir.dump() - - prs_dict = {} - pss_dict = {} - pspmd_dict = {} - - gufunc_param_types = param_types - - if config.DEBUG_ARRAY_OPT: - print( - "gufunc_param_types = ", - type(gufunc_param_types), - "\n", - gufunc_param_types, - ) - - gufunc_stub_last_label = max(gufunc_ir.blocks.keys()) + 1 - - # Add gufunc stub last label to each parfor.loop_body label to prevent - # label conflicts. 
- loop_body = add_offset_to_labels(loop_body, gufunc_stub_last_label) - # new label for splitting sentinel block - new_label = max(loop_body.keys()) + 1 - - # If enabled, add a print statement after every assignment. - if config.DEBUG_ARRAY_OPT_RUNTIME: - _dbgprint_after_each_array_assignments(lowerer, loop_body, typemap) - - if config.DEBUG_ARRAY_OPT: - print("parfor loop body") - _print_body(loop_body) - - wrapped_blocks = wrap_loop_body(loop_body) - # hoisted, not_hoisted = hoist(parfor_params, loop_body, - # typemap, wrapped_blocks) - setitems = set() - find_setitems_body(setitems, loop_body, typemap) - - hoisted = [] - not_hoisted = [] - - start_block = gufunc_ir.blocks[min(gufunc_ir.blocks.keys())] - start_block.body = start_block.body[:-1] + hoisted + [start_block.body[-1]] - unwrap_loop_body(loop_body) - - # store hoisted into diagnostics - diagnostics = lowerer.metadata["parfor_diagnostics"] - diagnostics.hoist_info[parfor.id] = { - "hoisted": hoisted, - "not_hoisted": not_hoisted, - } - - lowerer.metadata["parfor_diagnostics"].extra_info[str(parfor.id)] = str( - dpctl.get_current_queue().get_sycl_device().name - ) - - if config.DEBUG_ARRAY_OPT: - print("After hoisting") - _print_body(loop_body) - - # Search all the block in the gufunc outline for the sentinel assignment. - for label, block in gufunc_ir.blocks.items(): - for i, inst in enumerate(block.body): - if ( - isinstance(inst, ir.Assign) - and inst.target.name == sentinel_name - ): - # We found the sentinel assignment. - loc = inst.loc - scope = block.scope - # split block across __sentinel__ - # A new block is allocated for the statements prior to the - # sentinel but the new block maintains the current block label. - prev_block = ir.Block(scope, loc) - prev_block.body = block.body[:i] - - # The current block is used for statements after the sentinel. - block.body = block.body[i + 1 :] - # But the current block gets a new label. 
- body_first_label = min(loop_body.keys()) - - # The previous block jumps to the minimum labelled block of the - # parfor body. - prev_block.append(ir.Jump(body_first_label, loc)) - # Add all the parfor loop body blocks to the gufunc function's - # IR. - for l, b in loop_body.items(): - gufunc_ir.blocks[l] = b - body_last_label = max(loop_body.keys()) - gufunc_ir.blocks[new_label] = block - gufunc_ir.blocks[label] = prev_block - # Add a jump from the last parfor body block to the block - # containing statements after the sentinel. - gufunc_ir.blocks[body_last_label].append( - ir.Jump(new_label, loc) - ) - break - else: - continue - break - - if config.DEBUG_ARRAY_OPT: - print("gufunc_ir last dump before renaming") - gufunc_ir.dump() - - gufunc_ir.blocks = rename_labels(gufunc_ir.blocks) - remove_dels(gufunc_ir.blocks) - - if config.DEBUG_ARRAY_OPT: - sys.stdout.flush() - - if config.DEBUG_ARRAY_OPT: - print("gufunc_ir last dump") - gufunc_ir.dump() - print("flags", flags) - print("typemap", typemap) - - old_alias = flags.noalias - if not has_aliases: - if config.DEBUG_ARRAY_OPT: - print("No aliases found so adding noalias flag.") - flags.noalias = True - - remove_dead(gufunc_ir.blocks, gufunc_ir.arg_names, gufunc_ir, typemap) - - if config.DEBUG_ARRAY_OPT: - print("gufunc_ir after remove dead") - gufunc_ir.dump() - - kernel_sig = signature(types.none, *gufunc_param_types) - - if config.DEBUG_ARRAY_OPT: - sys.stdout.flush() - - if config.DEBUG_ARRAY_OPT: - print("before DUFunc inlining".center(80, "-")) - gufunc_ir.dump() - - # Inlining all DUFuncs - dufunc_inliner( - gufunc_ir, - lowerer.fndesc.calltypes, - typemap, - lowerer.context.typing_context, - lowerer.context, - ) - - if config.DEBUG_ARRAY_OPT: - print("after DUFunc inline".center(80, "-")) - gufunc_ir.dump() - - sycl_kernel = _compile_kernel_parfor( - dpctl.get_current_queue(), - gufunc_name, - gufunc_ir, - gufunc_param_types, - param_types_addrspaces, - debug=flags.debuginfo, - ) - - flags.noalias = 
old_alias - - if config.DEBUG_ARRAY_OPT: - print("kernel_sig = ", kernel_sig) - - return sycl_kernel, parfor_args, kernel_sig, func_arg_types, setitems - - -def _lower_parfor_gufunc(lowerer, parfor): - """Lowerer that handles LLVM code generation for parfor. - This function lowers a parfor IR node to LLVM. - The general approach is as follows: - 1) The code from the parfor's init block is lowered normally - in the context of the current function. - 2) The body of the parfor is transformed into a gufunc function. - 3) Code is inserted into the main function that calls do_scheduling - to divide the iteration space for each thread, allocates - reduction arrays, calls the gufunc function, and then invokes - the reduction function across the reduction arrays to produce - the final reduction values. - """ - - typingctx = lowerer.context.typing_context - targetctx = lowerer.context - # We copy the typemap here because for race condition variable we'll - # update their type to array so they can be updated by the gufunc. - orig_typemap = lowerer.fndesc.typemap - # replace original typemap with copy and restore the original at the end. 
- lowerer.fndesc.typemap = copy.copy(orig_typemap) - if config.DEBUG_ARRAY_OPT: - print("lowerer.fndesc", lowerer.fndesc, type(lowerer.fndesc)) - typemap = lowerer.fndesc.typemap - varmap = lowerer.varmap - - if config.DEBUG_ARRAY_OPT: - print("_lower_parfor_parallel") - parfor.dump() - - loc = parfor.init_block.loc - scope = parfor.init_block.scope - - # produce instructions for init_block - if config.DEBUG_ARRAY_OPT: - print("init_block = ", parfor.init_block, type(parfor.init_block)) - for instr in parfor.init_block.body: - if config.DEBUG_ARRAY_OPT: - print("lower init_block instr = ", instr) - lowerer.lower_inst(instr) - - for racevar in parfor.races: - if racevar not in varmap: - rvtyp = typemap[racevar] - rv = ir.Var(scope, racevar, loc) - lowerer._alloca_var(rv.name, rvtyp) - - alias_map = {} - arg_aliases = {} - numba.parfors.parfor.find_potential_aliases_parfor( - parfor, parfor.params, typemap, lowerer.func_ir, alias_map, arg_aliases - ) - if config.DEBUG_ARRAY_OPT: - print("alias_map", alias_map) - print("arg_aliases", arg_aliases) - - # run get_parfor_outputs() and get_parfor_reductions() before - # gufunc creation since Jumps are modified so CFG of loop_body - # dict will become invalid - assert parfor.params is not None - - parfor_output_arrays = numba.parfors.parfor.get_parfor_outputs( - parfor, parfor.params - ) - - # compile parfor body as a separate function to be used with GUFuncWrapper - flags = copy.copy(parfor.flags) - flags.error_model = "numpy" - - # Can't get here unless flags.set('auto_parallel', ParallelOptions(True)) - index_var_typ = typemap[parfor.loop_nests[0].index_variable.name] - - # index variables should have the same type, check rest of indices - for l in parfor.loop_nests[1:]: - assert typemap[l.index_variable.name] == index_var_typ - - numba.parfors.parfor.sequential_parfor_lowering = True - loop_ranges = [(l.start, l.stop, l.step) for l in parfor.loop_nests] - - try: - ( - func, - func_args, - func_sig, - func_arg_types, - 
modified_arrays, - ) = _create_gufunc_for_parfor_body( - lowerer, - parfor, - typemap, - flags, - loop_ranges, - bool(alias_map), - parfor.races, - ) - finally: - numba.parfors.parfor.sequential_parfor_lowering = False - - # get the shape signature - get_shape_classes = parfor.get_shape_classes - - num_inputs = len(func_args) - len(parfor_output_arrays) - if config.DEBUG_ARRAY_OPT: - print("func", func, type(func)) - print("func_args", func_args, type(func_args)) - print("func_sig", func_sig, type(func_sig)) - print("num_inputs = ", num_inputs) - print("parfor_outputs = ", parfor_output_arrays) - - # call the func in parallel by wrapping it with ParallelGUFuncBuilder - if config.DEBUG_ARRAY_OPT: - print("loop_nests = ", parfor.loop_nests) - print("loop_ranges = ", loop_ranges) - - gu_signature = _create_shape_signature( - parfor.get_shape_classes, - num_inputs, - func_args, - func_sig, - parfor.races, - typemap, - ) - - generate_kernel_launch_ops( - lowerer, - func, - gu_signature, - func_sig, - func_args, - num_inputs, - func_arg_types, - loop_ranges, - modified_arrays, - ) - - if config.DEBUG_ARRAY_OPT: - sys.stdout.flush() - - # Restore the original typemap of the function that was replaced - # temporarily at the beginning of this function. 
- lowerer.fndesc.typemap = orig_typemap - - -def _create_shape_signature( - get_shape_classes, - num_inputs, - # num_reductions, - args, - func_sig, - races, - typemap, -): - """Create shape signature for GUFunc""" - if config.DEBUG_ARRAY_OPT: - print("_create_shape_signature", num_inputs, args) - arg_start_print = 0 - for i in args[arg_start_print:]: - print("argument", i, type(i), get_shape_classes(i, typemap=typemap)) - - # num_inouts = len(args) - num_reductions - num_inouts = len(args) - # maximum class number for array shapes - classes = [ - get_shape_classes(var, typemap=typemap) if var not in races else (-1,) - for var in args[1:] - ] - class_set = set() - for _class in classes: - if _class: - for i in _class: - class_set.add(i) - max_class = max(class_set) + 1 if class_set else 0 - classes.insert(0, (max_class,)) # force set the class of 'sched' argument - class_set.add(max_class) - class_map = {} - # TODO: use prefix + class number instead of single char - alphabet = ord("a") - for n in class_set: - if n >= 0: - class_map[n] = chr(alphabet) - alphabet += 1 - - alpha_dict = {"latest_alpha": alphabet} - - def bump_alpha(c, class_map): - if c >= 0: - return class_map[c] - else: - alpha_dict["latest_alpha"] += 1 - return chr(alpha_dict["latest_alpha"]) - - gu_sin = [] - gu_sout = [] - count = 0 - syms_sin = () - - if config.DEBUG_ARRAY_OPT: - print("args", args) - print("classes", classes) - - for cls, arg in zip(classes, args): - count = count + 1 - if cls: - dim_syms = tuple(bump_alpha(c, class_map) for c in cls) - else: - dim_syms = () - gu_sin.append(dim_syms) - syms_sin += dim_syms - return (gu_sin, gu_sout) - - -# Keep all the kernels and programs created alive indefinitely. -keep_alive_kernels = [] - - -def generate_kernel_launch_ops( - lowerer, - kernel, - gu_signature, - outer_sig, - expr_args, - num_inputs, - expr_arg_types, - loop_ranges, - modified_arrays, -): - """ - Adds the call to the gufunc function from the main function. 
- """ - context = lowerer.context - builder = lowerer.builder - sin, sout = gu_signature - num_dim = len(loop_ranges) - - if config.DEBUG_ARRAY_OPT: - print("generate_kernel_launch_ops") - print("args = ", expr_args) - print( - "outer_sig = ", - outer_sig.args, - outer_sig.return_type, - outer_sig.recvr, - outer_sig.pysig, - ) - print("loop_ranges = ", loop_ranges) - print("expr_args", expr_args) - print("expr_arg_types", expr_arg_types) - print("gu_signature", gu_signature) - print("sin", sin) - print("sout", sout) - print("cres", cres, type(cres)) - print("modified_arrays", modified_arrays) - - # get dpex_cpu_portion_lowerer object - kernel_launcher = KernelLaunchOps(lowerer, kernel, num_inputs) - - # Get a pointer to the current queue - curr_queue = kernel_launcher.get_current_queue() - - # Compute number of args - num_expanded_args = 0 - - for arg_type in expr_arg_types: - if isinstance(arg_type, types.npytypes.Array): - num_expanded_args += 5 + (2 * arg_type.ndim) - else: - num_expanded_args += 1 - - if config.DEBUG_ARRAY_OPT: - print("num_expanded_args = ", num_expanded_args) - - # now that we know the total number of kernel args, lets allocate - # a kernel_arg array - kernel_launcher.allocate_kernel_arg_array(num_expanded_args) - - ninouts = len(expr_args) - - def getvar_or_none(lowerer, x): - try: - return lowerer.getvar(x) - except: - return None - - def loadvar_or_none(lowerer, x): - try: - return lowerer.loadvar(x) - except: - return None - - def val_type_or_none(context, lowerer, x): - try: - return context.get_value_type(lowerer.fndesc.typemap[x]) - except: - return None - - all_llvm_args = [getvar_or_none(lowerer, x) for x in expr_args[:ninouts]] - all_val_types = [ - val_type_or_none(context, lowerer, x) for x in expr_args[:ninouts] - ] - all_args = [loadvar_or_none(lowerer, x) for x in expr_args[:ninouts]] - - keep_alive_kernels.append(kernel) - - # Call clSetKernelArg for each arg and create arg array for - # the enqueue function. 
Put each part of each argument into - # kernel_arg_array. - for var, llvm_arg, arg_type, gu_sig, val_type, index in zip( - expr_args, - all_llvm_args, - expr_arg_types, - sin + sout, - all_val_types, - range(len(expr_args)), - ): - if config.DEBUG_ARRAY_OPT: - print( - "var:", - var, - type(var), - "\n\tllvm_arg:", - llvm_arg, - type(llvm_arg), - "\n\targ_type:", - arg_type, - type(arg_type), - "\n\tgu_sig:", - gu_sig, - "\n\tval_type:", - val_type, - type(val_type), - "\n\tindex:", - index, - ) - kernel_launcher.process_kernel_arg( - var, llvm_arg, arg_type, index, modified_arrays, curr_queue - ) - - # loadvars for loop_ranges - def load_range(v): - if isinstance(v, ir.Var): - return lowerer.loadvar(v.name) - else: - return context.get_constant(types.uintp, v) - - num_dim = len(loop_ranges) - for i in range(num_dim): - start, stop, step = loop_ranges[i] - start = load_range(start) - stop = load_range(stop) - assert step == 1 # We do not support loop steps other than 1 - step = load_range(step) - loop_ranges[i] = (start, stop, step) - - kernel_launcher.enqueue_kernel_and_copy_back(loop_ranges, curr_queue) - - # At this point we can free the DPCTLSyclQueueRef (curr_queue) - kernel_launcher.free_queue(sycl_queue_val=curr_queue) - - -from numba.core.lowering import Lower - - -class CopyIRException(RuntimeError): - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - - -def relatively_deep_copy(obj, memo): - # WARNING: there are some issues with genarators which were not investigated - # and root cause is not found. Though copied IR seems to work fine there are - # some extra references kept on generator objects which may result in a - # memory leak. 
- - obj_id = id(obj) - if obj_id in memo: - return memo[obj_id] - - from ctypes import _CFuncPtr - from types import ModuleType - - from numba.core.bytecode import FunctionIdentity - from numba.core.compiler import CompileResult - from numba.core.dispatcher import _DispatcherBase - from numba.core.types.abstract import Type - from numba.core.types.functions import Dispatcher, Function - from numba.core.typing.templates import Signature - from numba.np.ufunc.dufunc import DUFunc - - from numba_dpex.core.kernel_interface.func import DpexFunctionTemplate - - # objects which shouldn't or can't be copied and it's ok not to copy it. - if isinstance( - obj, - ( - FunctionIdentity, - _DispatcherBase, - Function, - Type, - Dispatcher, - ModuleType, - Signature, - DpexFunctionTemplate, - CompileResult, - DUFunc, - _CFuncPtr, - type, - str, - bool, - type(None), - ), - ): - return obj - - from numba.core.funcdesc import FunctionDescriptor - from numba.core.ir import FreeVar, FunctionIR, Global - from numba.core.postproc import PostProcessor - - if isinstance(obj, FunctionDescriptor): - cpy = FunctionDescriptor( - native=obj.native, - modname=obj.modname, - qualname=obj.qualname, - unique_name=obj.unique_name, - doc=obj.doc, - typemap=relatively_deep_copy(obj.typemap, memo), - restype=obj.restype, - calltypes=relatively_deep_copy(obj.calltypes, memo), - args=obj.args, - kws=obj.kws, - mangler=None, - argtypes=relatively_deep_copy(obj.argtypes, memo), - inline=obj.inline, - noalias=obj.noalias, - env_name=obj.env_name, - global_dict=obj.global_dict, - ) - # mangler parameter is not saved in FunctionDescriptor, but used to generated name. - # So pass None as mangler parameter and then copy mangled_name by hands - cpy.mangled_name = obj.mangled_name - - memo[obj_id] = cpy - - return cpy - - if isinstance(obj, FunctionIR): - # PostProcessor do the following: - # 1. canonicolize cfg, modifying IR - # 2. fills internal generators status - # 3. 
creates and fills VariableLifetime object - # We can't copy this objects. So in order to have copy of it we need run PostProcessor on copied IR. - # This means, that in case PostProcess wasn't run for original object copied object would defer. - # In order to avoid this we are running PostProcess on original object firstly. - # This means that copy of IR actually has a side effect on it. - pp = PostProcessor(obj) - pp.run() - cpy = FunctionIR( - blocks=relatively_deep_copy(obj.blocks, memo), - is_generator=relatively_deep_copy(obj.is_generator, memo), - func_id=relatively_deep_copy(obj.func_id, memo), - loc=obj.loc, - definitions=relatively_deep_copy(obj._definitions, memo), - arg_count=obj.arg_count, - arg_names=relatively_deep_copy(obj.arg_names, memo), - ) - pp = PostProcessor(cpy) - pp.run() - - memo[obj_id] = cpy - - return cpy - - if isinstance(obj, Global): - cpy = Global(name=obj.name, value=obj.value, loc=obj.loc) - memo[obj_id] = cpy - - return cpy - - if isinstance(obj, FreeVar): - cpy = FreeVar( - index=obj.index, name=obj.name, value=obj.value, loc=obj.loc - ) - memo[obj_id] = cpy - - return cpy - - # for containers we need to copy container itself first. And then fill it with copied items. - if isinstance(obj, list): - cpy = copy.copy(obj) - cpy.clear() - for item in obj: - cpy.append(relatively_deep_copy(item, memo)) - memo[obj_id] = cpy - return cpy - elif isinstance(obj, dict): - cpy = copy.copy(obj) - cpy.clear() - for key, item in obj.items(): - cpy[relatively_deep_copy(key, memo)] = relatively_deep_copy( - item, memo - ) - memo[obj_id] = cpy - return cpy - elif isinstance(obj, tuple): - # subclass constructors could have different parameters than superclass. - # e.g. tuple and namedtuple constructors accepts quite different parameters. 
- # it is better to have separate section for namedtuple - tpl = tuple([relatively_deep_copy(item, memo) for item in obj]) - if type(obj) == tuple: - cpy = tpl - else: - cpy = type(obj)(*tpl) - memo[obj_id] = cpy - return cpy - elif isinstance(obj, set): - cpy = copy.copy(obj) - cpy.clear() - for item in obj: - cpy.add(relatively_deep_copy(item, memo)) - memo[obj_id] = cpy - return cpy - - # some python objects are not copyable. In such case exception would be - # raised it is just a convinient point to find such objects - try: - cpy = copy.copy(obj) - except Exception as e: - raise e - - # __slots__ for subclass specify only members declared in subclass. So to - # get all members we need to go through - # all supeclasses - def get_slots_members(obj): - keys = [] - typ = obj - if not isinstance(typ, type): - typ = type(obj) - - try: - if len(typ.__slots__): - keys.extend(typ.__slots__) - if len(typ.__bases__): - for base in typ.__bases__: - keys.extend(get_slots_members(base)) - except: - pass - - return keys - - memo[obj_id] = cpy - keys = [] - - # Objects have either __dict__ or __slots__ or neither. - # If object has none of it and it is copyable we already made a copy, just return it - # If object is not copyable we shouldn't reach this point. 
- try: - keys = obj.__dict__.keys() - except: - try: - obj.__slots__ - keys = get_slots_members(obj) - except: - return cpy - - for key in keys: - attr = getattr(obj, key) - attr_cpy = relatively_deep_copy(attr, memo) - setattr(cpy, key, attr_cpy) - - return cpy - - -class WrapperDefaultLower(Lower): - @property - def _disable_sroa_like_opt(self): - """We always return True.""" - return True - - -class DPEXLowerer(Lower): - def __init__(self, context, library, fndesc, func_ir, metadata=None): - Lower.__init__(self, context, library, fndesc, func_ir, metadata) - memo = {} - - fndesc_cpu = relatively_deep_copy(fndesc, memo) - func_ir_cpu = relatively_deep_copy(func_ir, memo) - - cpu_context = ( - context.cpu_context - if isinstance(context, DpexKernelTargetContext) - else context - ) - self.gpu_lower = self._lower( - context, library, fndesc, func_ir, metadata - ) - self.cpu_lower = self._lower( - cpu_context, library, fndesc_cpu, func_ir_cpu, metadata - ) - - def _lower(self, context, library, fndesc, func_ir, metadata): - """Create Lower with changed linkageName in debug info""" - lower = WrapperDefaultLower(context, library, fndesc, func_ir, metadata) - - # Debuginfo - if context.enable_debuginfo: - from numba.core.funcdesc import default_mangler, qualifying_prefix - - from numba_dpex.debuginfo import DpexDIBuilder - - qualprefix = qualifying_prefix(fndesc.modname, fndesc.qualname) - mangled_qualname = default_mangler(qualprefix, fndesc.argtypes) - - lower.debuginfo = DpexDIBuilder( - module=lower.module, - filepath=func_ir.loc.filename, - linkage_name=mangled_qualname, - cgctx=context, - ) - - return lower - - def lower(self): - """Custom lowering function to support offloading of kernels. - - The lowerer has a builtin fallback mechanism for parfor functions. We - first try to lower a parfor onto a SYCL device, if the lowering fails - then we fallback to the default Numba lowering to CPU. The lowering - follow the following steps: - - 1. 
Start lowering of parent function - 2. Try to lower parfor onto the specified SYCL device - 2.a. The ``lower_parfor_rollback`` function prepares function to - lower onto to the specified SYCL device and inserts the - ``get_global_id`` intrinsic function. - - 2.a.a. Start lowering the parfor body and execute - ``DpexLowerer.lower()`` again. - 2.a.b. If the lowering fails, throw an exception. - 2.b. The ``lower_parfor_rollback`` catches the exception and - restores the parfor body to its initial state. - 2.c. Then throw an exception inside ``lower_parfor_rollback`` - that will be caught inside ``DpexLowerer.lower()``. - 3. Catch exception and start parfor lowering with the default Numba CPU - context. - - TODO/FIXME The rollback approach only works in case no device specific - modifications were added to function containing the parfor node. If the - function has any device specific modifications, a different solution - should be used. - - Raises: - Exception: If a parfor node could not be lowered to a SYCL device. - - """ - try: - context = self.gpu_lower.context - try: - # Only Numba's CPUContext has the `lower_extension` attribute - lower_extension_parfor = context.lower_extensions[parfor.Parfor] - context.lower_extensions[parfor.Parfor] = lower_parfor_rollback - except Exception as e: - if config.DEBUG: - print(e) - pass - - self.gpu_lower.lower() - # if lower does not crash, and parfor_diagnostics is empty then it - # is a kernel function. 
- if not self.gpu_lower.metadata["parfor_diagnostics"].extra_info: - str_name = ( - dpctl.get_current_queue().get_sycl_device().filter_string - ) - self.gpu_lower.metadata["parfor_diagnostics"].extra_info[ - "kernel" - ] = str_name - self.base_lower = self.gpu_lower - - try: - context.lower_extensions[parfor.Parfor] = lower_extension_parfor - except Exception as e: - if config.DEBUG: - print(e) - pass - except Exception as e: - if config.DEBUG: - import traceback - - device_filter_str = ( - dpctl.get_current_queue().get_sycl_device().filter_string - ) - print( - "Failed to offload parfor to " - + device_filter_str - + ". Due to:\n", - e, - ) - print(traceback.format_exc()) - - if config.FALLBACK_ON_CPU == 1: - self.cpu_lower.context.lower_extensions[ - parfor.Parfor - ] = _lower_parfor_parallel - self.cpu_lower.lower() - self.base_lower = self.cpu_lower - else: - raise e - - self.env = self.base_lower.env - self.call_helper = self.base_lower.call_helper - - def create_cpython_wrapper(self, release_gil=False): - return self.base_lower.create_cpython_wrapper(release_gil) - - -def copy_block(block): - memo = {} - new_block = ir.Block(block.scope, block.loc) - new_block.body = [relatively_deep_copy(stmt, memo) for stmt in block.body] - return new_block - - -def lower_parfor_rollback(lowerer, parfor): - try: - _lower_parfor_gufunc(lowerer, parfor) - if config.DEBUG: - device_filter_str = ( - dpctl.get_current_queue().get_sycl_device().filter_string - ) - - msg = "Parfor offloaded to " + device_filter_str - print(msg, parfor.loc) - except Exception as e: - device_filter_str = ( - dpctl.get_current_queue().get_sycl_device().filter_string - ) - msg = ( - "Failed to offload parfor to " + device_filter_str + ". Falling " - "back to default CPU parallelization. Please file a bug report " - "at https://github.com/IntelPython/numba-dpex. To help us debug " - "the issue, please add the traceback to the bug report." 
- ) - if not config.DEBUG: - msg += " Set the environment variable NUMBA_DPEX_DEBUG to 1 to " - msg += "generate a traceback." - - warnings.warn(NumbaPerformanceWarning(msg, parfor.loc)) - raise e diff --git a/numba_dpex/core/passes/parfor.py b/numba_dpex/core/passes/parfor.py deleted file mode 100644 index 454d63d350..0000000000 --- a/numba_dpex/core/passes/parfor.py +++ /dev/null @@ -1,6007 +0,0 @@ -# -# Copyright (c) 2017 Intel Corporation -# SPDX-License-Identifier: BSD-2-Clause -# - -""" -This module transforms data-parallel operations such as Numpy calls into -'Parfor' nodes, which are nested loops that can be parallelized. -It also implements optimizations such as loop fusion, and extends the rest of -compiler analysis and optimizations to support Parfors. -This is similar to ParallelAccelerator package in Julia: -https://github.com/IntelLabs/ParallelAccelerator.jl -'Parallelizing Julia with a Non-invasive DSL', T. Anderson et al., ECOOP'17. -""" -import copy -import inspect -import linecache -import math -import operator -import os -import sys -import textwrap -import types as pytypes # avoid confusion with numba.types -from collections import OrderedDict, defaultdict, namedtuple -from contextlib import contextmanager -from dataclasses import make_dataclass -from functools import reduce - -import numba.core.ir -import numba.cpython.builtins -import numpy -import numpy as np -from llvmlite import ir as lir -from numba import pndindex, prange -from numba.core import ( - analysis, - config, - errors, - ir, - ir_utils, - postproc, - rewrites, - typeinfer, - types, - typing, - utils, -) -from numba.core.analysis import ( - compute_cfg_from_blocks, - compute_dead_maps, - compute_live_map, - compute_use_defs, -) -from numba.core.controlflow import CFGraph -from numba.core.extending import lower_builtin, overload, register_jitable -from numba.core.imputils import impl_ret_untracked -from numba.core.ir_utils import ( - GuardException, - add_offset_to_labels, - 
apply_copy_propagate, - build_definitions, - canonicalize_array_math, - compile_to_numba_ir, - copy_propagate, - dprint_func_ir, - find_build_sequence, - find_callname, - find_potential_aliases, - find_topo_order, - get_block_copies, - get_call_table, - get_definition, - get_name_var_table, - get_np_ufunc_typ, - get_stmt_writes, - guard, - has_no_side_effect, - index_var_of_get_setitem, - is_get_setitem, - is_getitem, - is_setitem, - mk_alloc, - mk_loop_header, - mk_range_block, - mk_unique_var, - next_label, - remove_dead, - rename_labels, - replace_arg_nodes, - replace_returns, - replace_var_names, - replace_vars, - replace_vars_inner, - require, - set_index_var_of_get_setitem, - simplify, - simplify_CFG, - transfer_scope, - visit_vars, - visit_vars_inner, -) -from numba.core.types.functions import Function -from numba.core.typing import npydecl, signature -from numba.core.typing.templates import AbstractTemplate, infer_global -from numba.np.npdatetime_helpers import datetime_maximum, datetime_minimum -from numba.np.numpy_support import as_dtype, numpy_version -from numba.parfors import array_analysis -from numba.parfors.array_analysis import ( - assert_equiv, - random_1arg_size, - random_2arg_sizelast, - random_3arg_sizelast, - random_calls, - random_int_args, -) -from numba.stencils import stencilparfor -from numba.stencils.stencilparfor import StencilPass - -# circular dependency: import numba.npyufunc.dufunc.DUFunc - -# wrapped pretty print -_termwidth = 80 -_txtwrapper = textwrap.TextWrapper(width=_termwidth, drop_whitespace=False) - - -def print_wrapped(x): - for l in x.splitlines(): - [print(y) for y in _txtwrapper.wrap(l)] - - -sequential_parfor_lowering = False - - -# init_prange is a sentinel call that specifies the start of the initialization -# code for the computation in the upcoming prange call -# This lets the prange pass to put the code in the generated parfor's init_block -def init_prange(): - return - - -@overload(init_prange) -def 
init_prange_overload(): - def no_op(): - return - - return no_op - - -class internal_prange(object): - def __new__(cls, *args): - return range(*args) - - -def min_parallel_impl(return_type, arg): - # XXX: use prange for 1D arrays since pndindex returns a 1-tuple instead of - # integer. This causes type and fusion issues. - if arg.ndim == 0: - - def min_1(in_arr): - return in_arr[()] - - elif arg.ndim == 1: - if isinstance(arg.dtype, (types.NPDatetime, types.NPTimedelta)): - # NaT is always returned if it is in the array - def min_1(in_arr): - numba.parfors.parfor.init_prange() - min_checker(len(in_arr)) - val = numba.cpython.builtins.get_type_max_value(in_arr.dtype) - for i in numba.parfors.parfor.internal_prange(len(in_arr)): - val = datetime_minimum(val, in_arr[i]) - return val - - else: - - def min_1(in_arr): - numba.parfors.parfor.init_prange() - min_checker(len(in_arr)) - val = numba.cpython.builtins.get_type_max_value(in_arr.dtype) - for i in numba.parfors.parfor.internal_prange(len(in_arr)): - val = min(val, in_arr[i]) - return val - - else: - - def min_1(in_arr): - numba.parfors.parfor.init_prange() - min_checker(len(in_arr)) - val = numba.cpython.builtins.get_type_max_value(in_arr.dtype) - for i in numba.pndindex(in_arr.shape): - val = min(val, in_arr[i]) - return val - - return min_1 - - -def max_parallel_impl(return_type, arg): - if arg.ndim == 0: - - def max_1(in_arr): - return in_arr[()] - - elif arg.ndim == 1: - if isinstance(arg.dtype, (types.NPDatetime, types.NPTimedelta)): - # NaT is always returned if it is in the array - def max_1(in_arr): - numba.parfors.parfor.init_prange() - max_checker(len(in_arr)) - val = numba.cpython.builtins.get_type_min_value(in_arr.dtype) - for i in numba.parfors.parfor.internal_prange(len(in_arr)): - val = datetime_maximum(val, in_arr[i]) - return val - - else: - - def max_1(in_arr): - numba.parfors.parfor.init_prange() - max_checker(len(in_arr)) - val = numba.cpython.builtins.get_type_min_value(in_arr.dtype) - for i 
in numba.parfors.parfor.internal_prange(len(in_arr)): - val = max(val, in_arr[i]) - return val - - else: - - def max_1(in_arr): - numba.parfors.parfor.init_prange() - max_checker(len(in_arr)) - val = numba.cpython.builtins.get_type_min_value(in_arr.dtype) - for i in numba.pndindex(in_arr.shape): - val = max(val, in_arr[i]) - return val - - return max_1 - - -def argmin_parallel_impl(in_arr): - numba.parfors.parfor.init_prange() - argmin_checker(len(in_arr)) - A = in_arr.ravel() - init_val = numba.cpython.builtins.get_type_max_value(A.dtype) - ival = typing.builtins.IndexValue(0, init_val) - for i in numba.parfors.parfor.internal_prange(len(A)): - curr_ival = typing.builtins.IndexValue(i, A[i]) - ival = min(ival, curr_ival) - return ival.index - - -def argmax_parallel_impl(in_arr): - numba.parfors.parfor.init_prange() - argmax_checker(len(in_arr)) - A = in_arr.ravel() - init_val = numba.cpython.builtins.get_type_min_value(A.dtype) - ival = typing.builtins.IndexValue(0, init_val) - for i in numba.parfors.parfor.internal_prange(len(A)): - curr_ival = typing.builtins.IndexValue(i, A[i]) - ival = max(ival, curr_ival) - return ival.index - - -def dotvv_parallel_impl(a, b): - numba.parfors.parfor.init_prange() - l = a.shape[0] - m = b.shape[0] - # TODO: investigate assert_equiv - # assert_equiv("sizes of l, m do not match", l, m) - s = 0 - for i in numba.parfors.parfor.internal_prange(l): - s += a[i] * b[i] - return s - - -def dotvm_parallel_impl(a, b): - numba.parfors.parfor.init_prange() - l = a.shape - m, n = b.shape - # TODO: investigate assert_equiv - # assert_equiv("Sizes of l, m do not match", l, m) - c = np.zeros(n, a.dtype) - # TODO: evaluate dotvm implementation options - # for i in prange(n): - # s = 0 - # for j in range(m): - # s += a[j] * b[j, i] - # c[i] = s - for i in numba.parfors.parfor.internal_prange(m): - c += a[i] * b[i, :] - return c - - -def dotmv_parallel_impl(a, b): - numba.parfors.parfor.init_prange() - m, n = a.shape - l = b.shape - # TODO: 
investigate assert_equiv - # assert_equiv("sizes of n, l do not match", n, l) - c = np.empty(m, a.dtype) - for i in numba.parfors.parfor.internal_prange(m): - s = 0 - for j in range(n): - s += a[i, j] * b[j] - c[i] = s - return c - - -def dot_parallel_impl(return_type, atyp, btyp): - # Note that matrix matrix multiply is not translated. - if isinstance(atyp, types.npytypes.Array) and isinstance( - btyp, types.npytypes.Array - ): - if atyp.ndim == btyp.ndim == 1: - return dotvv_parallel_impl - # TODO: evaluate support for dotvm and enable - # elif atyp.ndim == 1 and btyp.ndim == 2: - # return dotvm_parallel_impl - elif atyp.ndim == 2 and btyp.ndim == 1: - return dotmv_parallel_impl - - -def sum_parallel_impl(return_type, arg): - zero = return_type(0) - - if arg.ndim == 0: - - def sum_1(in_arr): - return in_arr[()] - - elif arg.ndim == 1: - - def sum_1(in_arr): - numba.parfors.parfor.init_prange() - val = zero - for i in numba.parfors.parfor.internal_prange(len(in_arr)): - val += in_arr[i] - return val - - else: - - def sum_1(in_arr): - numba.parfors.parfor.init_prange() - val = zero - for i in numba.pndindex(in_arr.shape): - val += in_arr[i] - return val - - return sum_1 - - -def prod_parallel_impl(return_type, arg): - one = return_type(1) - - if arg.ndim == 0: - - def prod_1(in_arr): - return in_arr[()] - - elif arg.ndim == 1: - - def prod_1(in_arr): - numba.parfors.parfor.init_prange() - val = one - for i in numba.parfors.parfor.internal_prange(len(in_arr)): - val *= in_arr[i] - return val - - else: - - def prod_1(in_arr): - numba.parfors.parfor.init_prange() - val = one - for i in numba.pndindex(in_arr.shape): - val *= in_arr[i] - return val - - return prod_1 - - -def mean_parallel_impl(return_type, arg): - # can't reuse sum since output type is different - zero = return_type(0) - - if arg.ndim == 0: - - def mean_1(in_arr): - return in_arr[()] - - elif arg.ndim == 1: - - def mean_1(in_arr): - numba.parfors.parfor.init_prange() - val = zero - for i in 
numba.parfors.parfor.internal_prange(len(in_arr)): - val += in_arr[i] - return val / len(in_arr) - - else: - - def mean_1(in_arr): - numba.parfors.parfor.init_prange() - val = zero - for i in numba.pndindex(in_arr.shape): - val += in_arr[i] - return val / in_arr.size - - return mean_1 - - -def var_parallel_impl(return_type, arg): - if arg.ndim == 0: - - def var_1(in_arr): - return 0 - - elif arg.ndim == 1: - - def var_1(in_arr): - # Compute the mean - m = in_arr.mean() - # Compute the sum of square diffs - numba.parfors.parfor.init_prange() - ssd = 0 - for i in numba.parfors.parfor.internal_prange(len(in_arr)): - val = in_arr[i] - m - ssd += np.real(val * np.conj(val)) - return ssd / len(in_arr) - - else: - - def var_1(in_arr): - # Compute the mean - m = in_arr.mean() - # Compute the sum of square diffs - numba.parfors.parfor.init_prange() - ssd = 0 - for i in numba.pndindex(in_arr.shape): - val = in_arr[i] - m - ssd += np.real(val * np.conj(val)) - return ssd / in_arr.size - - return var_1 - - -def std_parallel_impl(return_type, arg): - def std_1(in_arr): - return in_arr.var() ** 0.5 - - return std_1 - - -def arange_parallel_impl(return_type, *args): - dtype = as_dtype(return_type.dtype) - - def arange_1(stop): - return np.arange(0, stop, 1, dtype) - - def arange_2(start, stop): - return np.arange(start, stop, 1, dtype) - - def arange_3(start, stop, step): - return np.arange(start, stop, step, dtype) - - if any(isinstance(a, types.Complex) for a in args): - - def arange_4(start, stop, step, dtype): - numba.parfors.parfor.init_prange() - nitems_c = (stop - start) / step - nitems_r = math.ceil(nitems_c.real) - nitems_i = math.ceil(nitems_c.imag) - nitems = int(max(min(nitems_i, nitems_r), 0)) - arr = np.empty(nitems, dtype) - for i in numba.parfors.parfor.internal_prange(nitems): - arr[i] = start + i * step - return arr - - else: - - def arange_4(start, stop, step, dtype): - numba.parfors.parfor.init_prange() - nitems_r = math.ceil((stop - start) / step) - nitems = 
int(max(nitems_r, 0)) - arr = np.empty(nitems, dtype) - val = start - for i in numba.parfors.parfor.internal_prange(nitems): - arr[i] = start + i * step - return arr - - if len(args) == 1: - return arange_1 - elif len(args) == 2: - return arange_2 - elif len(args) == 3: - return arange_3 - elif len(args) == 4: - return arange_4 - else: - raise ValueError("parallel arange with types {}".format(args)) - - -def linspace_parallel_impl(return_type, *args): - dtype = as_dtype(return_type.dtype) - - def linspace_2(start, stop): - return np.linspace(start, stop, 50) - - def linspace_3(start, stop, num): - numba.parfors.parfor.init_prange() - arr = np.empty(num, dtype) - div = num - 1 - delta = stop - start - arr[0] = start - for i in numba.parfors.parfor.internal_prange(num): - arr[i] = start + delta * (i / div) - return arr - - if len(args) == 2: - return linspace_2 - elif len(args) == 3: - return linspace_3 - else: - raise ValueError("parallel linspace with types {}".format(args)) - - -swap_functions_map = { - ("argmin", "numpy"): lambda r, a: argmin_parallel_impl, - ("argmax", "numpy"): lambda r, a: argmax_parallel_impl, - ("min", "numpy"): min_parallel_impl, - ("max", "numpy"): max_parallel_impl, - ("amin", "numpy"): min_parallel_impl, - ("amax", "numpy"): max_parallel_impl, - ("sum", "numpy"): sum_parallel_impl, - ("prod", "numpy"): prod_parallel_impl, - ("mean", "numpy"): mean_parallel_impl, - ("var", "numpy"): var_parallel_impl, - ("std", "numpy"): std_parallel_impl, - ("dot", "numpy"): dot_parallel_impl, - ("arange", "numpy"): arange_parallel_impl, - ("linspace", "numpy"): linspace_parallel_impl, -} - - -def fill_parallel_impl(return_type, arr, val): - """Parallel implementation of ndarray.fill. The array on - which to operate is retrieved from get_call_name and - is passed along with the value to fill. 
- """ - if arr.ndim == 1: - - def fill_1(in_arr, val): - numba.parfors.parfor.init_prange() - for i in numba.parfors.parfor.internal_prange(len(in_arr)): - in_arr[i] = val - return None - - else: - - def fill_1(in_arr, val): - numba.parfors.parfor.init_prange() - for i in numba.pndindex(in_arr.shape): - in_arr[i] = val - return None - - return fill_1 - - -replace_functions_ndarray = { - "fill": fill_parallel_impl, -} - - -@register_jitable -def max_checker(arr_size): - if arr_size == 0: - raise ValueError( - ( - "zero-size array to reduction operation " - "maximum which has no identity" - ) - ) - - -@register_jitable -def min_checker(arr_size): - if arr_size == 0: - raise ValueError( - ( - "zero-size array to reduction operation " - "minimum which has no identity" - ) - ) - - -@register_jitable -def argmin_checker(arr_size): - if arr_size == 0: - raise ValueError("attempt to get argmin of an empty sequence") - - -@register_jitable -def argmax_checker(arr_size): - if arr_size == 0: - raise ValueError("attempt to get argmax of an empty sequence") - - -checker_impl = namedtuple("checker_impl", ["name", "func"]) - -replace_functions_checkers_map = { - ("argmin", "numpy"): checker_impl("argmin_checker", argmin_checker), - ("argmax", "numpy"): checker_impl("argmax_checker", argmax_checker), - ("min", "numpy"): checker_impl("min_checker", min_checker), - ("max", "numpy"): checker_impl("max_checker", max_checker), - ("amin", "numpy"): checker_impl("min_checker", min_checker), - ("amax", "numpy"): checker_impl("max_checker", max_checker), -} - - -class LoopNest(object): - - """The LoopNest class holds information of a single loop including - the index variable (of a non-negative integer value), and the - range variable, e.g. range(r) is 0 to r-1 with step size 1. 
- """ - - def __init__(self, index_variable, start, stop, step): - self.index_variable = index_variable - self.start = start - self.stop = stop - self.step = step - - def __repr__(self): - return "LoopNest(index_variable = {}, range = ({}, {}, {}))".format( - self.index_variable, self.start, self.stop, self.step - ) - - def list_vars(self): - all_uses = [] - all_uses.append(self.index_variable) - if isinstance(self.start, ir.Var): - all_uses.append(self.start) - if isinstance(self.stop, ir.Var): - all_uses.append(self.stop) - if isinstance(self.step, ir.Var): - all_uses.append(self.step) - return all_uses - - -class Parfor(ir.Expr, ir.Stmt): - id_counter = 0 - - def __init__( - self, - loop_nests, - init_block, - loop_body, - loc, - index_var, - equiv_set, - pattern, - flags, - *, # only specify the options below by keyword - no_sequential_lowering=False, - races=set(), - ): - super(Parfor, self).__init__(op="parfor", loc=loc) - - self.id = type(self).id_counter - type(self).id_counter += 1 - # self.input_info = input_info - # self.output_info = output_info - self.loop_nests = loop_nests - self.init_block = init_block - self.loop_body = loop_body - self.index_var = index_var - self.params = None # filled right before parallel lowering - self.equiv_set = equiv_set - # The parallel patterns this parfor was generated from and their options - # for example, a parfor could be from the stencil pattern with - # the neighborhood option - assert len(pattern) > 1 - self.patterns = [pattern] - self.flags = flags - # if True, this parfor shouldn't be lowered sequentially even with the - # sequential lowering option - self.no_sequential_lowering = no_sequential_lowering - self.races = races - self.redvars = [] - self.reddict = {} - # If the lowerer is None then the standard lowerer will be used. - # This can be set to a function to have that function act as the lowerer - # for this parfor. 
This lowerer field will also prevent parfors from - # being fused unless they have use the same lowerer. - self.lowerer = None - if config.DEBUG_ARRAY_OPT_STATS: - fmt = "Parallel for-loop #{} is produced from pattern '{}' at {}" - print(fmt.format(self.id, pattern, loc)) - - def __repr__(self): - return ( - "id=" - + str(self.id) - + repr(self.loop_nests) - + repr(self.loop_body) - + repr(self.index_var) - ) - - def list_vars(self): - """list variables used (read/written) in this parfor by - traversing the body and combining block uses. - """ - all_uses = [] - for l, b in self.loop_body.items(): - for stmt in b.body: - all_uses += stmt.list_vars() - - for loop in self.loop_nests: - all_uses += loop.list_vars() - - for stmt in self.init_block.body: - all_uses += stmt.list_vars() - - return all_uses - - def get_shape_classes(self, var, typemap=None): - """get the shape classes for a given variable. - If a typemap is specified then use it for type resolution - """ - # We get shape classes from the equivalence set but that - # keeps its own typemap at a time prior to lowering. So - # if something is added during lowering then we can pass - # in a type map to use. We temporarily replace the - # equivalence set typemap, do the work and then restore - # the original on the way out. 
- if typemap is not None: - save_typemap = self.equiv_set.typemap - self.equiv_set.typemap = typemap - res = self.equiv_set.get_shape_classes(var) - if typemap is not None: - self.equiv_set.typemap = save_typemap - return res - - def dump(self, file=None): - file = file or sys.stdout - print(("begin parfor {}".format(self.id)).center(20, "-"), file=file) - print("index_var = ", self.index_var, file=file) - print("params = ", self.params, file=file) - print("races = ", self.races, file=file) - for loopnest in self.loop_nests: - print(loopnest, file=file) - print("init block:", file=file) - self.init_block.dump(file) - for offset, block in sorted(self.loop_body.items()): - print("label %s:" % (offset,), file=file) - block.dump(file) - print(("end parfor {}".format(self.id)).center(20, "-"), file=file) - - def validate_params(self, typemap): - """ - Check that Parfors params are of valid types. - """ - if self.params is None: - msg = ( - "Cannot run parameter validation on a Parfor with params " - "not set" - ) - raise ValueError(msg) - for p in self.params: - ty = typemap.get(p) - if ty is None: - msg = ( - "Cannot validate parameter %s, there is no type " - "information available" - ) - raise ValueError(msg) - if isinstance(ty, types.BaseTuple): - if ty.count > config.PARFOR_MAX_TUPLE_SIZE: - msg = ( - "Use of a tuple (%s) of length %d in a parallel region " - "exceeds the maximum supported tuple size. Since " - "Generalized Universal Functions back parallel regions " - "and those do not support tuples, tuples passed to " - "parallel regions are unpacked if their size is below " - "a certain threshold, currently configured to be %d. " - "This threshold can be modified using the Numba " - "environment variable NUMBA_PARFOR_MAX_TUPLE_SIZE." 
- ) - raise errors.UnsupportedParforsError( - msg % (p, ty.count, config.PARFOR_MAX_TUPLE_SIZE), - self.loc, - ) - - -def _analyze_parfor(parfor, equiv_set, typemap, array_analysis): - """Recursive array analysis for parfor nodes.""" - func_ir = array_analysis.func_ir - parfor_blocks = wrap_parfor_blocks(parfor) - # Since init_block get label 0 after wrap, we need to save - # the equivset for the real block label 0. - backup_equivset = array_analysis.equiv_sets.get(0, None) - array_analysis.run(parfor_blocks, equiv_set) - unwrap_parfor_blocks(parfor, parfor_blocks) - parfor.equiv_set = array_analysis.equiv_sets[0] - # Restore equivset for block 0 after parfor is unwrapped - if backup_equivset: - array_analysis.equiv_sets[0] = backup_equivset - return [], [] - - -array_analysis.array_analysis_extensions[Parfor] = _analyze_parfor - - -class ParforDiagnostics(object): - """Holds parfor diagnostic info, this is accumulated throughout the - PreParforPass and ParforPass, also in the closure inlining! 
- """ - - def __init__(self): - # holds ref to the function for which this is providing diagnostics - self.func = None - # holds a map of the replaced functions - self.replaced_fns = dict() - # used to identify "internal" parfor functions - self.internal_name = "__numba_parfor_gufunc" - self.fusion_info = defaultdict(list) - self.nested_fusion_info = defaultdict(list) - self.fusion_reports = [] - self.hoist_info = {} - self.has_setup = False - - def setup(self, func_ir, fusion_enabled): - self.func_ir = func_ir - self.name = self.func_ir.func_id.func_qualname - self.line = self.func_ir.loc - self.fusion_enabled = fusion_enabled - if self.internal_name in self.name: - self.purpose = "Internal parallel function" - else: - self.purpose = "Function %s, %s" % (self.name, self.line) - # we store a reference to the parfors prior to fusion etc, the parfors - # do get mangled in the fusion process but in a predetermined manner - # and by holding a reference here the "before" state can be printed - self.initial_parfors = self.get_parfors() - self.has_setup = True - - @property - def has_setup(self): - return self._has_setup - - @has_setup.setter - def has_setup(self, state): - self._has_setup = state - - def count_parfors(self, blocks=None): - return len(self.get_parfors()) - - def _get_nested_parfors(self, parfor, parfors_list): - blocks = wrap_parfor_blocks(parfor) - self._get_parfors(blocks, parfors_list) - unwrap_parfor_blocks(parfor) - - def _get_parfors(self, blocks, parfors_list): - for label, blk in blocks.items(): - for stmt in blk.body: - if isinstance(stmt, Parfor): - parfors_list.append(stmt) - self._get_nested_parfors(stmt, parfors_list) - - def get_parfors(self): - parfors_list = [] - self._get_parfors(self.func_ir.blocks, parfors_list) - return parfors_list - - def hoisted_allocations(self): - allocs = [] - for pf_id, data in self.hoist_info.items(): - stmt = data.get("hoisted", []) - for inst in stmt: - if isinstance(inst.value, ir.Expr): - if inst.value.op 
== "call": - call = guard(find_callname, self.func_ir, inst.value) - if call is not None and call == ("empty", "numpy"): - allocs.append(inst) - return allocs - - def compute_graph_info(self, _a): - """ - compute adjacency list of the fused loops - and find the roots in of the lists - """ - a = copy.deepcopy(_a) - if a == {}: - return [], set() - - vtx = set() - for v in a.values(): - for x in v: - vtx.add(x) - - # find roots - potential_roots = set(a.keys()) - roots = potential_roots - vtx - if roots is None: - roots = set() - - # populate rest of adjacency list - not_roots = set() - for x in range(max(set(a.keys()).union(vtx)) + 1): - val = a.get(x) - if val is not None: - a[x] = val - elif val is []: - not_roots.add(x) # debug only - else: - a[x] = [] - - # fold adjacency list into an actual list ordered - # by vtx - l = [] - for x in sorted(a.keys()): - l.append(a[x]) - - return l, roots # , not_roots - - def get_stats(self, fadj, nadj, root): - """ - Computes the number of fused and serialized loops - based on a fusion adjacency list `fadj` and a nested - parfors adjacency list `nadj` for the root, `root` - """ - - def count_root(fadj, nadj, root, nfused, nserial): - for k in nadj[root]: - nserial += 1 - if nadj[k] == []: - nfused += len(fadj[k]) - else: - nf, ns = count_root(fadj, nadj, k, nfused, nserial) - nfused += nf - nserial = ns - return nfused, nserial - - nfused, nserial = count_root(fadj, nadj, root, 0, 0) - return nfused, nserial - - def reachable_nodes(self, adj, root): - """ - returns a list of nodes reachable in an adjacency list from a - specified root - """ - fusers = [] - fusers.extend(adj[root]) - for k in adj[root]: - if adj[k] != []: - fusers.extend(self.reachable_nodes(adj, k)) - return fusers - - def sort_pf_by_line(self, pf_id, parfors_simple): - """ - pd_id - the parfors id - parfors_simple - the simple parfors map - """ - # this sorts parfors by source line number - pf = parfors_simple[pf_id][0] - pattern = pf.patterns[0] - line = 
max(0, pf.loc.line - 1) # why are these out by 1 ?! - filename = self.func_ir.loc.filename - nadj, nroots = self.compute_graph_info(self.nested_fusion_info) - fadj, froots = self.compute_graph_info(self.fusion_info) - graphs = [nadj, fadj] - - # If the parfor is internal, like internal prange, then the - # default line number is from its location in the numba source - # To get a more accurate line number, this first checks the - # adjacency graph for fused parfors that might not be internal - # and uses the minimum line number from there. If that fails - # (case where there's just a single internal parfor) the IR - # is walked backwards from the parfor location and the first non - # parfor statement line number is used. - if isinstance(pattern, tuple): - if pattern[1] == "internal": - reported_loc = pattern[2][1] - if reported_loc.filename == filename: - return max(0, reported_loc.line - 1) - else: - # first recurse and check the adjacency list for - # something that is not an in internal parfor - tmp = [] - for adj in graphs: - if adj: # graph may be empty, e.g. 
no nesting - for k in adj[pf_id]: - tmp.append( - self.sort_pf_by_line(k, parfors_simple) - ) - if tmp: - return max(0, min(tmp) - 1) - # second run through the parfor block to see if there's - # and reference to a line number in the user source - for blk in pf.loop_body.values(): - for stmt in blk.body: - if stmt.loc.filename == filename: - return max(0, stmt.loc.line - 1) - # finally run through the func_ir and look for the - # first non-parfor statement prior to this one and - # grab the line from that - for blk in self.func_ir.blocks.values(): - try: - idx = blk.body.index(pf) - for i in range(idx - 1, 0, -1): - stmt = blk.body[i] - if not isinstance(stmt, Parfor): - line = max(0, stmt.loc.line - 1) - break - except ValueError: - pass - return line - - def get_parfors_simple(self, print_loop_search): - parfors_simple = dict() - - # print in line order, parfors loop id is based on discovery order - for pf in sorted(self.initial_parfors, key=lambda x: x.loc.line): - # use 0 here, the parfors are mutated by the time this routine - # is called, however, fusion appends the patterns so we can just - # pull in the first as a "before fusion" emulation - r_pattern = pf.patterns[0] - pattern = pf.patterns[0] - loc = pf.loc - if isinstance(pattern, tuple): - if pattern[0] == "prange": - if pattern[1] == "internal": - replfn = ".".join(reversed(list(pattern[2][0]))) - loc = pattern[2][1] - r_pattern = "%s %s" % ( - replfn, - "(internal parallel version)", - ) - elif pattern[1] == "user": - r_pattern = "user defined prange" - elif pattern[1] == "pndindex": - r_pattern = "internal pndindex" # FIXME: trace this! 
- else: - assert 0 - fmt = "Parallel for-loop #%s: is produced from %s:\n %s\n \n" - if print_loop_search: - print_wrapped(fmt % (pf.id, loc, r_pattern)) - parfors_simple[pf.id] = (pf, loc, r_pattern) - return parfors_simple - - def get_all_lines(self, parfors_simple): - # ensure adjacency lists are the same size for both sets of info - # (nests and fusion may not traverse the same space, for - # convenience [] is used as a condition to halt recursion) - fadj, froots = self.compute_graph_info(self.fusion_info) - nadj, _nroots = self.compute_graph_info(self.nested_fusion_info) - - if len(fadj) > len(nadj): - lim = len(fadj) - tmp = nadj - else: - lim = len(nadj) - tmp = fadj - for x in range(len(tmp), lim): - tmp.append([]) - - # This computes the roots of true loop nests (i.e. loops containing - # loops opposed to just a loop that's a root). - nroots = set() - if _nroots: - for r in _nroots: - if nadj[r] != []: - nroots.add(r) - all_roots = froots ^ nroots - - # This computes all the parfors at the top level that are either: - # - roots of loop fusion - # - roots of true loop nests - # it then combines these based on source line number for ease of - # producing output ordered in a manner similar to the code structure - froots_lines = {} - for x in froots: - line = self.sort_pf_by_line(x, parfors_simple) - froots_lines[line] = "fuse", x, fadj - - nroots_lines = {} - for x in nroots: - line = self.sort_pf_by_line(x, parfors_simple) - nroots_lines[line] = "nest", x, nadj - - all_lines = froots_lines.copy() - all_lines.update(nroots_lines) - return all_lines - - def source_listing(self, parfors_simple, purpose_str): - filename = self.func_ir.loc.filename - count = self.count_parfors() - func_name = self.func_ir.func_id.func - try: - lines = inspect.getsource(func_name).splitlines() - except OSError: # generated function - lines = None - if lines and parfors_simple: - src_width = max([len(x) for x in lines]) - map_line_to_pf = defaultdict(list) # parfors can alias lines 
- for k, v in parfors_simple.items(): - # TODO: do a better job of tracking parfors that are not in - # this file but are referred to, e.g. np.arange() - if parfors_simple[k][1].filename == filename: - match_line = self.sort_pf_by_line(k, parfors_simple) - map_line_to_pf[match_line].append(str(k)) - - max_pf_per_line = max( - [1] + [len(x) for x in map_line_to_pf.values()] - ) - width = src_width + (1 + max_pf_per_line * (len(str(count)) + 2)) - newlines = [] - newlines.append("\n") - newlines.append("Parallel loop listing for %s" % purpose_str) - newlines.append(width * "-" + "|loop #ID") - fmt = "{0:{1}}| {2}" - # why are these off by 1? - lstart = max(0, self.func_ir.loc.line - 1) - for no, line in enumerate(lines, lstart): - pf_ids = map_line_to_pf.get(no, None) - if pf_ids is not None: - pfstr = "#" + ", ".join(pf_ids) - else: - pfstr = "" - stripped = line.strip("\n") - srclen = len(stripped) - if pf_ids: - l = fmt.format(width * "-", width, pfstr) - else: - l = fmt.format(width * " ", width, pfstr) - newlines.append(stripped + l[srclen:]) - print("\n".join(newlines)) - else: - print("No source available") - - def print_unoptimised(self, lines): - # This prints the unoptimised parfors state - sword = "+--" - fac = len(sword) - fadj, froots = self.compute_graph_info(self.fusion_info) - nadj, _nroots = self.compute_graph_info(self.nested_fusion_info) - - if len(fadj) > len(nadj): - lim = len(fadj) - tmp = nadj - else: - lim = len(nadj) - tmp = fadj - for x in range(len(tmp), lim): - tmp.append([]) - - def print_nest(fadj_, nadj_, theroot, reported, region_id): - def print_g(fadj_, nadj_, nroot, depth): - print_wrapped( - fac * depth * " " + "%s%s %s" % (sword, nroot, "(parallel)") - ) - for k in nadj_[nroot]: - if nadj_[k] == []: - msg = [] - msg.append( - fac * (depth + 1) * " " - + "%s%s %s" % (sword, k, "(parallel)") - ) - if fadj_[k] != [] and k not in reported: - fused = self.reachable_nodes(fadj_, k) - for i in fused: - msg.append( - fac * (depth + 1) * " 
" - + "%s%s %s" % (sword, i, "(parallel)") - ) - reported.append(k) - print_wrapped("\n".join(msg)) - else: - print_g(fadj_, nadj_, k, depth + 1) - - if nadj_[theroot] != []: - print_wrapped("Parallel region %s:" % region_id) - print_g(fadj_, nadj_, theroot, 0) - print("\n") - region_id = region_id + 1 - return region_id - - def print_fuse(ty, pf_id, adj, depth, region_id): - msg = [] - print_wrapped("Parallel region %s:" % region_id) - msg.append( - fac * depth * " " + "%s%s %s" % (sword, pf_id, "(parallel)") - ) - if adj[pf_id] != []: - fused = sorted(self.reachable_nodes(adj, pf_id)) - for k in fused: - msg.append( - fac * depth * " " + "%s%s %s" % (sword, k, "(parallel)") - ) - region_id = region_id + 1 - print_wrapped("\n".join(msg)) - print("\n") - return region_id - - # Walk the parfors by src line and print optimised structure - region_id = 0 - reported = [] - for line, info in sorted(lines.items()): - opt_ty, pf_id, adj = info - if opt_ty == "fuse": - if pf_id not in reported: - region_id = print_fuse("f", pf_id, adj, 0, region_id) - elif opt_ty == "nest": - region_id = print_nest(fadj, nadj, pf_id, reported, region_id) - else: - assert 0 - - def print_optimised(self, lines): - # This prints the optimised output based on the transforms that - # occurred during loop fusion and rewriting of loop nests - sword = "+--" - fac = len(sword) - fadj, froots = self.compute_graph_info(self.fusion_info) - nadj, _nroots = self.compute_graph_info(self.nested_fusion_info) - - if len(fadj) > len(nadj): - lim = len(fadj) - tmp = nadj - else: - lim = len(nadj) - tmp = fadj - for x in range(len(tmp), lim): - tmp.append([]) - - summary = dict() - # region : {fused, serialized} - - def print_nest(fadj_, nadj_, theroot, reported, region_id): - def print_g(fadj_, nadj_, nroot, depth): - for k in nadj_[nroot]: - msg = fac * depth * " " + "%s%s %s" % (sword, k, "(serial") - if nadj_[k] == []: - fused = [] - if fadj_[k] != [] and k not in reported: - fused = 
sorted(self.reachable_nodes(fadj_, k)) - msg += ", fused with loop(s): " - msg += ", ".join([str(x) for x in fused]) - msg += ")" - reported.append(k) - print_wrapped(msg) - summary[region_id]["fused"] += len(fused) - else: - print_wrapped(msg + ")") - print_g(fadj_, nadj_, k, depth + 1) - summary[region_id]["serialized"] += 1 - - if nadj_[theroot] != []: - print_wrapped("Parallel region %s:" % region_id) - print_wrapped("%s%s %s" % (sword, theroot, "(parallel)")) - summary[region_id] = { - "root": theroot, - "fused": 0, - "serialized": 0, - } - print_g(fadj_, nadj_, theroot, 1) - print("\n") - region_id = region_id + 1 - return region_id - - def print_fuse(ty, pf_id, adj, depth, region_id): - print_wrapped("Parallel region %s:" % region_id) - msg = fac * depth * " " + "%s%s %s" % (sword, pf_id, "(parallel") - fused = [] - if adj[pf_id] != []: - fused = sorted(self.reachable_nodes(adj, pf_id)) - msg += ", fused with loop(s): " - msg += ", ".join([str(x) for x in fused]) - - summary[region_id] = { - "root": pf_id, - "fused": len(fused), - "serialized": 0, - } - msg += ")" - print_wrapped(msg) - print("\n") - region_id = region_id + 1 - return region_id - - # Walk the parfors by src line and print optimised structure - region_id = 0 - reported = [] - for line, info in sorted(lines.items()): - opt_ty, pf_id, adj = info - if opt_ty == "fuse": - if pf_id not in reported: - region_id = print_fuse("f", pf_id, adj, 0, region_id) - elif opt_ty == "nest": - region_id = print_nest(fadj, nadj, pf_id, reported, region_id) - else: - assert 0 - - # print the summary of the fuse/serialize rewrite - if summary: - for k, v in sorted(summary.items()): - msg = ( - "\n \nParallel region %s (loop #%s) had %s " "loop(s) fused" - ) - root = v["root"] - fused = v["fused"] - serialized = v["serialized"] - if serialized != 0: - msg += ( - " and %s loop(s) " - "serialized as part of the larger " - "parallel loop (#%s)." 
- ) - print_wrapped(msg % (k, root, fused, serialized, root)) - else: - msg += "." - print_wrapped(msg % (k, root, fused)) - else: - print_wrapped("Parallel structure is already optimal.") - - def allocation_hoist(self): - found = False - print("Allocation hoisting:") - for pf_id, data in self.hoist_info.items(): - stmt = data.get("hoisted", []) - for inst in stmt: - if isinstance(inst.value, ir.Expr): - try: - attr = inst.value.attr - if attr == "empty": - msg = ( - "The memory allocation derived from the " - "instruction at %s is hoisted out of the " - "parallel loop labelled #%s (it will be " - "performed before the loop is executed and " - "reused inside the loop):" - ) - loc = inst.loc - print_wrapped(msg % (loc, pf_id)) - try: - path = os.path.relpath(loc.filename) - except ValueError: - path = os.path.abspath(loc.filename) - lines = linecache.getlines(path) - if lines and loc.line: - print_wrapped( - " Allocation:: " - + lines[ - 0 if loc.line < 2 else loc.line - 1 - ].strip() - ) - print_wrapped( - " - numpy.empty() is used for the allocation.\n" - ) - found = True - except (KeyError, AttributeError): - pass - if not found: - print_wrapped("No allocation hoisting found") - - def instruction_hoist(self): - print("") - print("Instruction hoisting:") - hoist_info_printed = False - if self.hoist_info: - for pf_id, data in self.hoist_info.items(): - hoisted = data.get("hoisted", None) - not_hoisted = data.get("not_hoisted", None) - if not hoisted and not not_hoisted: - print("loop #%s has nothing to hoist." 
% pf_id) - continue - - print("loop #%s:" % pf_id) - if hoisted: - print(" Has the following hoisted:") - [print(" %s" % y) for y in hoisted] - hoist_info_printed = True - if not_hoisted: - print(" Failed to hoist the following:") - [print(" %s: %s" % (y, x)) for x, y in not_hoisted] - hoist_info_printed = True - if not hoist_info_printed: - print_wrapped("No instruction hoisting found") - print_wrapped(80 * "-") - - def dump(self, level=1): - if not self.has_setup: - raise RuntimeError("self.setup has not been called") - name = self.func_ir.func_id.func_qualname - line = self.func_ir.loc - if self.internal_name in name: - purpose_str = "Internal parallel functions " - purpose = "internal" - else: - purpose_str = " Function %s, %s " % (name, line) - purpose = "user" - - print_loop_search = False - print_source_listing = False - print_fusion_search = False - print_fusion_summary = False - print_loopnest_rewrite = False - print_pre_optimised = False - print_post_optimised = False - print_allocation_hoist = False - print_instruction_hoist = False - print_internal = False - - # each level switches on progressively more output - if level in (1, 2, 3, 4): - print_source_listing = True - print_post_optimised = True - else: - raise ValueError( - "Report level unknown, should be one of 1, 2, 3, 4" - ) - - if level in (2, 3, 4): - print_pre_optimised = True - - if level in (3, 4): - print_allocation_hoist = True - - if level == 3: - print_fusion_summary = True - print_loopnest_rewrite = True - - if level == 4: - print_fusion_search = True - print_instruction_hoist = True - print_internal = True - - if purpose == "internal" and not print_internal: - return - - print_wrapped("\n ") - print_wrapped(_termwidth * "=") - print_wrapped( - (" Parallel Accelerator Optimizing: %s " % purpose_str).center( - _termwidth, "=" - ) - ) - print_wrapped(_termwidth * "=") - print_wrapped("") - - # ----------- search section - if print_loop_search: - print_wrapped("Looking for parallel 
loops".center(_termwidth, "-")) - parfors_simple = self.get_parfors_simple(print_loop_search) - - count = self.count_parfors() - if print_loop_search: - print_wrapped("\nFound %s parallel loops." % count) - print_wrapped("-" * _termwidth) - - # ----------- augmented source section - filename = self.func_ir.loc.filename - try: - # Try to get a relative path - # ipython/jupyter input just returns as filename - path = os.path.relpath(filename) - except ValueError: - # Fallback to absolute path if error occurred in getting the - # relative path. - # This may happen on windows if the drive is different - path = os.path.abspath(filename) - - if print_source_listing: - self.source_listing(parfors_simple, purpose_str) - - # ---------- these are used a lot here on in - sword = "+--" - parfors = self.get_parfors() # this is the mutated parfors - parfor_ids = [x.id for x in parfors] - n_parfors = len(parfor_ids) - - # ----------- loop fusion section - if print_fusion_search or print_fusion_summary: - if not sequential_parfor_lowering: - print_wrapped(" Fusing loops ".center(_termwidth, "-")) - msg = ( - "Attempting fusion of parallel loops (combines loops " - "with similar properties)...\n" - ) - print_wrapped(msg) - else: - msg = "Performing sequential lowering of loops...\n" - print_wrapped(msg) - print_wrapped(_termwidth * "-") - # if there are some parfors, print information about them! 
- if n_parfors > -1: - - def dump_graph_indented(a, root_msg, node_msg): - fac = len(sword) - - def print_graph(adj, roots): - def print_g(adj, root, depth): - for k in adj[root]: - print_wrapped( - fac * depth * " " - + "%s%s %s" % (sword, k, node_msg) - ) - if adj[k] != []: - print_g(adj, k, depth + 1) - - for r in roots: - print_wrapped("%s%s %s" % (sword, r, root_msg)) - print_g(l, r, 1) - print_wrapped("") - - l, roots = self.compute_graph_info(a) - print_graph(l, roots) - - if print_fusion_search: - for report in self.fusion_reports: - l1, l2, msg = report - print_wrapped( - " Trying to fuse loops #%s and #%s:" % (l1, l2) - ) - print_wrapped(" %s" % msg) - - if self.fusion_info != {}: - if print_fusion_summary: - print_wrapped("\n \nFused loop summary:\n") - - dump_graph_indented( - self.fusion_info, - "has the following loops fused into it:", - "(fused)", - ) - - if print_fusion_summary: - if self.fusion_enabled: - after_fusion = ( - "Following the attempted fusion of parallel for-loops" - ) - else: - after_fusion = "With fusion disabled" - - print_wrapped( - ( - "\n{} there are {} parallel for-loop(s) (originating from loops labelled: {})." 
- ).format( - after_fusion, - n_parfors, - ", ".join(["#%s" % x for x in parfor_ids]), - ) - ) - print_wrapped(_termwidth * "-") - print_wrapped("") - - # ----------- loop nest section - if print_loopnest_rewrite: - if self.nested_fusion_info != {}: - print_wrapped( - (" Optimising loop nests ").center(_termwidth, "-") - ) - print_wrapped( - "Attempting loop nest rewrites (optimising for the largest parallel loops)...\n " - ) - root_msg = "is a parallel loop" - node_msg = "--> rewritten as a serial loop" - dump_graph_indented( - self.nested_fusion_info, root_msg, node_msg - ) - print_wrapped(_termwidth * "-") - print_wrapped("") - - # ---------- compute various properties and orderings in the data for subsequent use - all_lines = self.get_all_lines(parfors_simple) - - if print_pre_optimised: - print(" Before Optimisation ".center(_termwidth, "-")) - self.print_unoptimised(all_lines) - print(_termwidth * "-") - - if print_post_optimised: - print(" After Optimisation ".center(_termwidth, "-")) - self.print_optimised(all_lines) - print(_termwidth * "-") - print_wrapped("") - print_wrapped(_termwidth * "-") - print_wrapped("\n ") - - # ----------- LICM section - if print_allocation_hoist or print_instruction_hoist: - print_wrapped("Loop invariant code motion".center(80, "-")) - - if print_allocation_hoist: - self.allocation_hoist() - - if print_instruction_hoist: - self.instruction_hoist() - - else: # there are no parfors - print_wrapped( - "Function %s, %s, has no parallel for-loops.".format(name, line) - ) - - def __str__(self): - r = "ParforDiagnostics:\n" - r += repr(self.replaced_fns) - return r - - def __repr__(self): - r = "ParforDiagnostics" - return r - - -class PreParforPass(object): - """Preprocessing for the Parfor pass. It mostly inlines parallel - implementations of numpy functions if available. 
- """ - - def __init__( - self, - func_ir, - typemap, - calltypes, - typingctx, - targetctx, - options, - swapped={}, - replace_functions_map=None, - ): - self.func_ir = func_ir - self.typemap = typemap - self.calltypes = calltypes - self.typingctx = typingctx - self.targetctx = targetctx - self.options = options - # diagnostics - self.swapped = swapped - if replace_functions_map is None: - replace_functions_map = swap_functions_map - self.replace_functions_map = replace_functions_map - self.stats = { - "replaced_func": 0, - "replaced_dtype": 0, - } - - def run(self): - """Run pre-parfor processing pass.""" - # e.g. convert A.sum() to np.sum(A) for easier match and optimization - canonicalize_array_math( - self.func_ir, self.typemap, self.calltypes, self.typingctx - ) - if self.options.numpy: - self._replace_parallel_functions(self.func_ir.blocks) - self.func_ir.blocks = simplify_CFG(self.func_ir.blocks) - - def _replace_parallel_functions(self, blocks): - """ - Replace functions with their parallel implementation in - replace_functions_map if available. - The implementation code is inlined to enable more optimization. 
- """ - swapped = self.swapped - from numba.core.inline_closurecall import inline_closure_call - - work_list = list(blocks.items()) - while work_list: - label, block = work_list.pop() - for i, instr in enumerate(block.body): - if isinstance(instr, ir.Assign): - lhs = instr.target - lhs_typ = self.typemap[lhs.name] - expr = instr.value - if isinstance(expr, ir.Expr) and expr.op == "call": - # Try and inline known calls with their parallel implementations - def replace_func(): - func_def = get_definition(self.func_ir, expr.func) - callname = find_callname(self.func_ir, expr) - repl_func = self.replace_functions_map.get( - callname, None - ) - # Handle method on array type - if ( - repl_func is None - and len(callname) == 2 - and isinstance(callname[1], ir.Var) - and isinstance( - self.typemap[callname[1].name], - types.npytypes.Array, - ) - ): - repl_func = replace_functions_ndarray.get( - callname[0], None - ) - if repl_func is not None: - # Add the array that the method is on to the arg list. 
- expr.args.insert(0, callname[1]) - - require(repl_func is not None) - typs = tuple( - self.typemap[x.name] for x in expr.args - ) - try: - new_func = repl_func(lhs_typ, *typs) - except: - new_func = None - require(new_func is not None) - g = copy.copy(self.func_ir.func_id.func.__globals__) - g["numba"] = numba - g["np"] = numpy - g["math"] = math - # if the function being inlined has a function - # checking the inputs, find it and add it to globals - check = replace_functions_checkers_map.get( - callname, None - ) - if check is not None: - g[check.name] = check.func - # inline the parallel implementation - new_blocks, _ = inline_closure_call( - self.func_ir, - g, - block, - i, - new_func, - self.typingctx, - self.targetctx, - typs, - self.typemap, - self.calltypes, - work_list, - ) - call_table = get_call_table( - new_blocks, topological_ordering=False - ) - - # find the prange in the new blocks and record it for use in diagnostics - for call in call_table: - for k, v in call.items(): - if v[0] == "internal_prange": - swapped[k] = [ - callname, - repl_func.__name__, - func_def, - block.body[i].loc, - ] - break - return True - - if guard(replace_func): - self.stats["replaced_func"] += 1 - break - elif ( - isinstance(expr, ir.Expr) - and expr.op == "getattr" - and expr.attr == "dtype" - ): - # Replace getattr call "A.dtype" with numpy.dtype(). - # This helps remove superfluous dependencies from parfor. - typ = self.typemap[expr.value.name] - if isinstance(typ, types.npytypes.Array): - # Convert A.dtype to four statements. - # 1) Get numpy global. - # 2) Create var for known type of array as string - # constant. e.g. 'float64' - # 3) Get dtype function from numpy module. - # 4) Create var for numpy.dtype(var from #2). - - # Create var for numpy module. 
- dtype = typ.dtype - scope = block.scope - loc = instr.loc - g_np_var = ir.Var( - scope, mk_unique_var("$np_g_var"), loc - ) - self.typemap[g_np_var.name] = types.misc.Module( - numpy - ) - g_np = ir.Global("np", numpy, loc) - g_np_assign = ir.Assign(g_np, g_np_var, loc) - - # Create var for the inferred type of the array - # e.g., 'float64' - dtype_str = str(dtype) - if dtype_str == "bool": - dtype_str = "bool_" - typ_var = ir.Var( - scope, mk_unique_var("$np_typ_var"), loc - ) - self.typemap[typ_var.name] = types.StringLiteral( - dtype_str - ) - typ_var_assign = ir.Assign( - ir.Const(dtype_str, loc), typ_var, loc - ) - - # Get the dtype function from the numpy module. - dtype_attr_var = ir.Var( - scope, mk_unique_var("$dtype_attr_var"), loc - ) - temp = find_template(numpy.dtype) - tfunc = numba.core.types.Function(temp) - tfunc.get_call_type( - self.typingctx, - (self.typemap[typ_var.name],), - {}, - ) - self.typemap[ - dtype_attr_var.name - ] = types.functions.Function(temp) - dtype_attr_getattr = ir.Expr.getattr( - g_np_var, "dtype", loc - ) - dtype_attr_assign = ir.Assign( - dtype_attr_getattr, dtype_attr_var, loc - ) - - # Call numpy.dtype on the statically coded type two steps above. - dtype_var = ir.Var( - scope, mk_unique_var("$dtype_var"), loc - ) - self.typemap[dtype_var.name] = types.npytypes.DType( - dtype - ) - dtype_getattr = ir.Expr.call( - dtype_attr_var, [typ_var], (), loc - ) - dtype_assign = ir.Assign( - dtype_getattr, dtype_var, loc - ) - self.calltypes[dtype_getattr] = signature( - self.typemap[dtype_var.name], - self.typemap[typ_var.name], - ) - - # The original A.dtype rhs is replaced with result of this call. - instr.value = dtype_var - # Add statements to body of the code. 
- block.body.insert(0, dtype_assign) - block.body.insert(0, dtype_attr_assign) - block.body.insert(0, typ_var_assign) - block.body.insert(0, g_np_assign) - self.stats["replaced_dtype"] += 1 - break - - -def find_template(op): - for ft in numba.core.typing.templates.builtin_registry.functions: - if ft.key == op: - return ft - - -class ParforPassStates: - """This class encapsulates all internal states of the ParforPass.""" - - def __init__( - self, - func_ir, - typemap, - calltypes, - return_type, - typingctx, - targetctx, - options, - flags, - metadata, - diagnostics=ParforDiagnostics(), - ): - self.func_ir = func_ir - self.typemap = typemap - self.calltypes = calltypes - self.typingctx = typingctx - self.targetctx = targetctx - self.return_type = return_type - self.options = options - self.diagnostics = diagnostics - self.swapped_fns = diagnostics.replaced_fns - self.fusion_info = diagnostics.fusion_info - self.nested_fusion_info = diagnostics.nested_fusion_info - - self.array_analysis = array_analysis.ArrayAnalysis( - self.typingctx, - self.func_ir, - self.typemap, - self.calltypes, - ) - - ir_utils._the_max_label.update(max(func_ir.blocks.keys())) - self.flags = flags - self.metadata = metadata - if "parfors" not in metadata: - metadata["parfors"] = {} - - -class ConvertInplaceBinop: - """Parfor subpass to convert setitem on Arrays""" - - def __init__(self, pass_states): - """ - Parameters - ---------- - pass_states : ParforPassStates - """ - self.pass_states = pass_states - self.rewritten = [] - - def run(self, blocks): - pass_states = self.pass_states - # convert expressions like A += ... where A is an array. 
- topo_order = find_topo_order(blocks) - # variables available in the program so far (used for finding map - # functions in array_expr lowering) - for label in topo_order: - block = blocks[label] - new_body = [] - equiv_set = pass_states.array_analysis.get_equiv_set(label) - for instr in block.body: - if isinstance(instr, ir.Assign): - lhs = instr.target - expr = instr.value - if isinstance(expr, ir.Expr) and expr.op == "inplace_binop": - loc = expr.loc - target = expr.lhs - value = expr.rhs - target_typ = pass_states.typemap[target.name] - value_typ = pass_states.typemap[value.name] - # Handle A op= ... - if isinstance(target_typ, types.npytypes.Array): - # RHS is an array - if isinstance(value_typ, types.npytypes.Array): - new_instr = self._inplace_binop_to_parfor( - equiv_set, - loc, - expr.immutable_fn, - target, - value, - ) - self.rewritten.append( - dict( - old=instr, - new=new_instr, - reason="inplace_binop", - ), - ) - instr = [new_instr, ir.Assign(target, lhs, loc)] - if isinstance(instr, list): - new_body.extend(instr) - else: - new_body.append(instr) - block.body = new_body - - def _inplace_binop_to_parfor(self, equiv_set, loc, op, target, value): - """generate parfor from setitem node with a boolean or slice array indices. - The value can be either a scalar or an array variable, and if a boolean index - is used for the latter case, the same index must be used for the value too. 
- """ - pass_states = self.pass_states - scope = target.scope - arr_typ = pass_states.typemap[target.name] - el_typ = arr_typ.dtype - init_block = ir.Block(scope, loc) - value_typ = pass_states.typemap[value.name] - - size_vars = equiv_set.get_shape(target) - - # generate loopnests and size variables from target correlations - index_vars, loopnests = _mk_parfor_loops( - pass_states.typemap, size_vars, scope, loc - ) - - # generate body - body_label = next_label() - body_block = ir.Block(scope, loc) - index_var, index_var_typ = _make_index_var( - pass_states.typemap, scope, index_vars, body_block - ) - - # Read value. - value_var = ir.Var(scope, mk_unique_var("$value_var"), loc) - pass_states.typemap[value_var.name] = value_typ.dtype - getitem_call = ir.Expr.getitem(value, index_var, loc) - pass_states.calltypes[getitem_call] = signature( - value_typ.dtype, value_typ, index_var_typ - ) - body_block.body.append(ir.Assign(getitem_call, value_var, loc)) - - # Read target - target_var = ir.Var(scope, mk_unique_var("$target_var"), loc) - pass_states.typemap[target_var.name] = el_typ - getitem_call = ir.Expr.getitem(target, index_var, loc) - pass_states.calltypes[getitem_call] = signature( - el_typ, arr_typ, index_var_typ - ) - body_block.body.append(ir.Assign(getitem_call, target_var, loc)) - - # Create temp to hold result. - expr_out_var = ir.Var(scope, mk_unique_var("$expr_out_var"), loc) - pass_states.typemap[expr_out_var.name] = el_typ - - # Create binop and assign result to temporary. 
- binop_expr = ir.Expr.binop(op, target_var, value_var, loc) - body_block.body.append(ir.Assign(binop_expr, expr_out_var, loc)) - unified_type = self.pass_states.typingctx.unify_pairs( - el_typ, value_typ.dtype - ) - pass_states.calltypes[binop_expr] = signature( - unified_type, unified_type, unified_type - ) - - # Write to target - setitem_node = ir.SetItem(target, index_var, expr_out_var, loc) - pass_states.calltypes[setitem_node] = signature( - types.none, arr_typ, index_var_typ, el_typ - ) - body_block.body.append(setitem_node) - - parfor = Parfor( - loopnests, - init_block, - {}, - loc, - index_var, - equiv_set, - ("inplace_binop", ""), - pass_states.flags, - ) - parfor.loop_body = {body_label: body_block} - if config.DEBUG_ARRAY_OPT >= 1: - print("parfor from inplace_binop") - parfor.dump() - return parfor - - def _type_getitem(self, args): - fnty = operator.getitem - return self.pass_states.typingctx.resolve_function_type( - fnty, tuple(args), {} - ) - - -def get_index_var(x): - return x.index if isinstance(x, ir.SetItem) else x.index_var - - -class ConvertSetItemPass: - """Parfor subpass to convert setitem on Arrays""" - - def __init__(self, pass_states): - """ - Parameters - ---------- - pass_states : ParforPassStates - """ - self.pass_states = pass_states - self.rewritten = [] - - def run(self, blocks): - pass_states = self.pass_states - # convert setitem expressions like A[C] = c or A[C] = B[C] to parfor, - # where C is a boolean array. 
- topo_order = find_topo_order(blocks) - # variables available in the program so far (used for finding map - # functions in array_expr lowering) - for label in topo_order: - block = blocks[label] - new_body = [] - equiv_set = pass_states.array_analysis.get_equiv_set(label) - for instr in block.body: - if isinstance(instr, (ir.StaticSetItem, ir.SetItem)): - loc = instr.loc - target = instr.target - index = get_index_var(instr) - value = instr.value - target_typ = pass_states.typemap[target.name] - index_typ = pass_states.typemap[index.name] - value_typ = pass_states.typemap[value.name] - # Handle A[boolean_array] = - if isinstance(target_typ, types.npytypes.Array): - if ( - isinstance(index_typ, types.npytypes.Array) - and isinstance(index_typ.dtype, types.Boolean) - and target_typ.ndim == index_typ.ndim - ): - # RHS is a scalar number - if isinstance(value_typ, types.Number): - new_instr = self._setitem_to_parfor( - equiv_set, loc, target, index, value - ) - self.rewritten.append( - dict( - old=instr, - new=new_instr, - reason="masked_assign_broadcast_scalar", - ), - ) - instr = new_instr - # RHS is an array - elif isinstance(value_typ, types.npytypes.Array): - val_def = guard( - get_definition, - pass_states.func_ir, - value.name, - ) - if ( - isinstance(val_def, ir.Expr) - and val_def.op == "getitem" - and val_def.index.name == index.name - ): - new_instr = self._setitem_to_parfor( - equiv_set, - loc, - target, - index, - val_def.value, - ) - self.rewritten.append( - dict( - old=instr, - new=new_instr, - reason="masked_assign_array", - ), - ) - instr = new_instr - else: - # Handle A[:] = x - shape = equiv_set.get_shape(instr) - # Don't converted broadcasted setitems into parfors. - if isinstance(index_typ, types.BaseTuple): - # The sliced dims are those in the index that - # are made of slices. Count the numbers of slices - # in the index tuple. 
- sliced_dims = len( - list( - filter( - lambda x: isinstance( - x, types.misc.SliceType - ), - index_typ.types, - ) - ) - ) - elif isinstance(index_typ, types.misc.SliceType): - # For singular indices there can be a bare slice - # and if so there is one dimension being set. - sliced_dims = 1 - else: - sliced_dims = 0 - - # Only create a parfor for this setitem if we know the - # shape of the output and number of dimensions set is - # equal to the number of dimensions on the right side. - if shape is not None and ( - not isinstance(value_typ, types.npytypes.Array) - or sliced_dims == value_typ.ndim - ): - new_instr = self._setitem_to_parfor( - equiv_set, - loc, - target, - index, - value, - shape=shape, - ) - self.rewritten.append( - dict( - old=instr, new=new_instr, reason="slice" - ), - ) - instr = new_instr - new_body.append(instr) - block.body = new_body - - def _setitem_to_parfor( - self, equiv_set, loc, target, index, value, shape=None - ): - """generate parfor from setitem node with a boolean or slice array indices. - The value can be either a scalar or an array variable, and if a boolean index - is used for the latter case, the same index must be used for the value too. - """ - pass_states = self.pass_states - scope = target.scope - arr_typ = pass_states.typemap[target.name] - el_typ = arr_typ.dtype - index_typ = pass_states.typemap[index.name] - init_block = ir.Block(scope, loc) - - if shape: - # Slice index is being used on the target array, we'll have to create - # a sub-array so that the target dimension matches the given shape. 
- assert isinstance(index_typ, types.BaseTuple) or isinstance( - index_typ, types.SliceType - ) - # setitem has a custom target shape - size_vars = shape - # create a new target array via getitem - subarr_var = ir.Var(scope, mk_unique_var("$subarr"), loc) - getitem_call = ir.Expr.getitem(target, index, loc) - subarr_typ = typing.arraydecl.get_array_index_type( - arr_typ, index_typ - ).result - pass_states.typemap[subarr_var.name] = subarr_typ - pass_states.calltypes[getitem_call] = self._type_getitem( - (arr_typ, index_typ) - ) - init_block.append(ir.Assign(getitem_call, subarr_var, loc)) - target = subarr_var - else: - # Otherwise it is a boolean array that is used as index. - assert isinstance(index_typ, types.ArrayCompatible) - size_vars = equiv_set.get_shape(target) - bool_typ = index_typ.dtype - - # generate loopnests and size variables from lhs correlations - loopnests = [] - index_vars = [] - for size_var in size_vars: - index_var = ir.Var(scope, mk_unique_var("parfor_index"), loc) - index_vars.append(index_var) - pass_states.typemap[index_var.name] = types.uintp - loopnests.append(LoopNest(index_var, 0, size_var, 1)) - - # generate body - body_label = next_label() - body_block = ir.Block(scope, loc) - index_var, index_var_typ = _make_index_var( - pass_states.typemap, scope, index_vars, body_block - ) - parfor = Parfor( - loopnests, - init_block, - {}, - loc, - index_var, - equiv_set, - ("setitem", ""), - pass_states.flags, - ) - if shape: - # slice subarray - parfor.loop_body = {body_label: body_block} - true_block = body_block - end_label = None - else: - # boolean mask - true_label = next_label() - true_block = ir.Block(scope, loc) - end_label = next_label() - end_block = ir.Block(scope, loc) - parfor.loop_body = { - body_label: body_block, - true_label: true_block, - end_label: end_block, - } - mask_var = ir.Var(scope, mk_unique_var("$mask_var"), loc) - pass_states.typemap[mask_var.name] = bool_typ - mask_val = ir.Expr.getitem(index, index_var, loc) - 
body_block.body.extend( - [ - ir.Assign(mask_val, mask_var, loc), - ir.Branch(mask_var, true_label, end_label, loc), - ] - ) - - value_typ = pass_states.typemap[value.name] - if isinstance(value_typ, types.npytypes.Array): - value_var = ir.Var(scope, mk_unique_var("$value_var"), loc) - pass_states.typemap[value_var.name] = value_typ.dtype - getitem_call = ir.Expr.getitem(value, index_var, loc) - pass_states.calltypes[getitem_call] = signature( - value_typ.dtype, value_typ, index_var_typ - ) - true_block.body.append(ir.Assign(getitem_call, value_var, loc)) - else: - value_var = value - setitem_node = ir.SetItem(target, index_var, value_var, loc) - pass_states.calltypes[setitem_node] = signature( - types.none, pass_states.typemap[target.name], index_var_typ, el_typ - ) - true_block.body.append(setitem_node) - if end_label: - true_block.body.append(ir.Jump(end_label, loc)) - - if config.DEBUG_ARRAY_OPT >= 1: - print("parfor from setitem") - parfor.dump() - return parfor - - def _type_getitem(self, args): - fnty = operator.getitem - return self.pass_states.typingctx.resolve_function_type( - fnty, tuple(args), {} - ) - - -def _make_index_var(typemap, scope, index_vars, body_block, force_tuple=False): - """When generating a SetItem call to an array in a parfor, the general - strategy is to generate a tuple if the array is more than 1 dimension. - If it is 1 dimensional then you can use a simple variable. This routine - is also used when converting pndindex to parfor but pndindex requires a - tuple even if the iteration space is 1 dimensional. The pndindex use of - this function will use force_tuple to make the output index a tuple even - if it is one dimensional. 
- """ - ndims = len(index_vars) - loc = body_block.loc - if ndims > 1 or force_tuple: - tuple_var = ir.Var(scope, mk_unique_var("$parfor_index_tuple_var"), loc) - typemap[tuple_var.name] = types.containers.UniTuple(types.uintp, ndims) - tuple_call = ir.Expr.build_tuple(list(index_vars), loc) - tuple_assign = ir.Assign(tuple_call, tuple_var, loc) - body_block.body.append(tuple_assign) - return tuple_var, types.containers.UniTuple(types.uintp, ndims) - elif ndims == 1: - return index_vars[0], types.uintp - else: - raise errors.UnsupportedRewriteError( - "Parfor does not handle arrays of dimension 0", - loc=loc, - ) - - -def _mk_parfor_loops(typemap, size_vars, scope, loc): - """ - Create loop index variables and build LoopNest objects for a parfor. - """ - loopnests = [] - index_vars = [] - for size_var in size_vars: - index_var = ir.Var(scope, mk_unique_var("parfor_index"), loc) - index_vars.append(index_var) - typemap[index_var.name] = types.uintp - loopnests.append(LoopNest(index_var, 0, size_var, 1)) - return index_vars, loopnests - - -class ConvertNumpyPass: - """ - Convert supported Numpy functions, as well as arrayexpr nodes, to - parfor nodes. 
- """ - - def __init__(self, pass_states): - self.pass_states = pass_states - self.rewritten = [] - - def run(self, blocks): - pass_states = self.pass_states - topo_order = find_topo_order(blocks) - # variables available in the program so far (used for finding map - # functions in array_expr lowering) - avail_vars = [] - for label in topo_order: - block = blocks[label] - new_body = [] - equiv_set = pass_states.array_analysis.get_equiv_set(label) - for instr in block.body: - if isinstance(instr, ir.Assign): - expr = instr.value - lhs = instr.target - lhs_typ = self.pass_states.typemap[lhs.name] - if self._is_C_or_F_order(lhs_typ): - if guard(self._is_supported_npycall, expr): - new_instr = self._numpy_to_parfor( - equiv_set, lhs, expr - ) - if new_instr is not None: - self.rewritten.append( - dict( - old=instr, - new=new_instr, - reason="numpy_allocator", - ) - ) - instr = new_instr - elif ( - isinstance(expr, ir.Expr) and expr.op == "arrayexpr" - ): - new_instr = self._arrayexpr_to_parfor( - equiv_set, lhs, expr, avail_vars - ) - self.rewritten.append( - dict( - old=instr, - new=new_instr, - reason="arrayexpr", - ) - ) - instr = new_instr - avail_vars.append(lhs.name) - new_body.append(instr) - block.body = new_body - - def _is_C_order(self, arr_name): - if isinstance(arr_name, types.npytypes.Array): - return arr_name.layout == "C" and arr_name.ndim > 0 - elif arr_name is str: - typ = self.pass_states.typemap[arr_name] - return ( - isinstance(typ, types.npytypes.Array) - and typ.layout == "C" - and typ.ndim > 0 - ) - else: - return False - - def _is_C_or_F_order(self, arr_name): - if isinstance(arr_name, types.npytypes.Array): - return ( - arr_name.layout == "C" or arr_name.layout == "F" - ) and arr_name.ndim > 0 - elif arr_name is str: - typ = self.pass_states.typemap[arr_name] - return ( - isinstance(typ, types.npytypes.Array) - and (typ.layout == "C" or typ.layout == "F") - and typ.ndim > 0 - ) - else: - return False - - def _arrayexpr_to_parfor(self, equiv_set, 
lhs, arrayexpr, avail_vars): - """generate parfor from arrayexpr node, which is essentially a - map with recursive tree. - """ - pass_states = self.pass_states - scope = lhs.scope - loc = lhs.loc - expr = arrayexpr.expr - arr_typ = pass_states.typemap[lhs.name] - el_typ = arr_typ.dtype - # generate loopnests and size variables from lhs correlations - size_vars = equiv_set.get_shape(lhs) - index_vars, loopnests = _mk_parfor_loops( - pass_states.typemap, size_vars, scope, loc - ) - - # generate init block and body - init_block = ir.Block(scope, loc) - init_block.body = mk_alloc( - pass_states.typingctx, - pass_states.typemap, - pass_states.calltypes, - lhs, - tuple(size_vars), - el_typ, - scope, - loc, - pass_states.typemap[lhs.name], - ) - body_label = next_label() - body_block = ir.Block(scope, loc) - expr_out_var = ir.Var(scope, mk_unique_var("$expr_out_var"), loc) - pass_states.typemap[expr_out_var.name] = el_typ - - index_var, index_var_typ = _make_index_var( - pass_states.typemap, scope, index_vars, body_block - ) - - body_block.body.extend( - _arrayexpr_tree_to_ir( - pass_states.func_ir, - pass_states.typingctx, - pass_states.typemap, - pass_states.calltypes, - equiv_set, - init_block, - expr_out_var, - expr, - index_var, - index_vars, - avail_vars, - ) - ) - - pat = ("array expression {}".format(repr_arrayexpr(arrayexpr.expr)),) - - parfor = Parfor( - loopnests, - init_block, - {}, - loc, - index_var, - equiv_set, - pat[0], - pass_states.flags, - ) - - setitem_node = ir.SetItem(lhs, index_var, expr_out_var, loc) - pass_states.calltypes[setitem_node] = signature( - types.none, pass_states.typemap[lhs.name], index_var_typ, el_typ - ) - body_block.body.append(setitem_node) - parfor.loop_body = {body_label: body_block} - if config.DEBUG_ARRAY_OPT >= 1: - print("parfor from arrayexpr") - parfor.dump() - return parfor - - def _is_supported_npycall(self, expr): - """check if we support parfor translation for - this Numpy call. 
- """ - call_name, mod_name = find_callname(self.pass_states.func_ir, expr) - if not (isinstance(mod_name, str) and mod_name.startswith("numpy")): - return False - if call_name in ["zeros", "ones"]: - return True - if mod_name == "numpy.random" and call_name in random_calls: - return True - # TODO: add more calls - return False - - def _numpy_to_parfor(self, equiv_set, lhs, expr): - call_name, mod_name = find_callname(self.pass_states.func_ir, expr) - args = expr.args - kws = dict(expr.kws) - if call_name in ["zeros", "ones"] or mod_name == "numpy.random": - return self._numpy_map_to_parfor( - equiv_set, call_name, lhs, args, kws, expr - ) - # return error if we couldn't handle it (avoid rewrite infinite loop) - raise errors.UnsupportedRewriteError( - f"parfor translation failed for {expr}", - loc=expr.loc, - ) - - def _numpy_map_to_parfor(self, equiv_set, call_name, lhs, args, kws, expr): - """generate parfor from Numpy calls that are maps.""" - pass_states = self.pass_states - scope = lhs.scope - loc = lhs.loc - arr_typ = pass_states.typemap[lhs.name] - el_typ = arr_typ.dtype - - # generate loopnests and size variables from lhs correlations - size_vars = equiv_set.get_shape(lhs) - if size_vars is None: - if config.DEBUG_ARRAY_OPT >= 1: - print("Could not convert numpy map to parfor, unknown size") - return None - - index_vars, loopnests = _mk_parfor_loops( - pass_states.typemap, size_vars, scope, loc - ) - - # generate init block and body - init_block = ir.Block(scope, loc) - init_block.body = mk_alloc( - pass_states.typingctx, - pass_states.typemap, - pass_states.calltypes, - lhs, - tuple(size_vars), - el_typ, - scope, - loc, - pass_states.typemap[lhs.name], - ) - body_label = next_label() - body_block = ir.Block(scope, loc) - expr_out_var = ir.Var(scope, mk_unique_var("$expr_out_var"), loc) - pass_states.typemap[expr_out_var.name] = el_typ - - index_var, index_var_typ = _make_index_var( - pass_states.typemap, scope, index_vars, body_block - ) - - if call_name 
== "zeros": - value = ir.Const(el_typ(0), loc) - elif call_name == "ones": - value = ir.Const(el_typ(1), loc) - elif call_name in random_calls: - # remove size arg to reuse the call expr for single value - _remove_size_arg(call_name, expr) - # update expr type - new_arg_typs, new_kw_types = _get_call_arg_types( - expr, pass_states.typemap - ) - pass_states.calltypes.pop(expr) - pass_states.calltypes[expr] = pass_states.typemap[ - expr.func.name - ].get_call_type(typing.Context(), new_arg_typs, new_kw_types) - value = expr - else: - raise NotImplementedError( - "Map of numpy.{} to parfor is not implemented".format(call_name) - ) - - value_assign = ir.Assign(value, expr_out_var, loc) - body_block.body.append(value_assign) - - setitem_node = ir.SetItem(lhs, index_var, expr_out_var, loc) - pass_states.calltypes[setitem_node] = signature( - types.none, pass_states.typemap[lhs.name], index_var_typ, el_typ - ) - body_block.body.append(setitem_node) - - parfor = Parfor( - loopnests, - init_block, - {}, - loc, - index_var, - equiv_set, - ( - "{} function".format( - call_name, - ), - "NumPy mapping", - ), - pass_states.flags, - ) - parfor.loop_body = {body_label: body_block} - if config.DEBUG_ARRAY_OPT >= 1: - print("generated parfor for numpy map:") - parfor.dump() - return parfor - - -class ConvertReducePass: - """ - Find reduce() calls and convert them to parfors. 
- """ - - def __init__(self, pass_states): - self.pass_states = pass_states - self.rewritten = [] - - def run(self, blocks): - pass_states = self.pass_states - - topo_order = find_topo_order(blocks) - for label in topo_order: - block = blocks[label] - new_body = [] - equiv_set = pass_states.array_analysis.get_equiv_set(label) - for instr in block.body: - parfor = None - if isinstance(instr, ir.Assign): - loc = instr.loc - lhs = instr.target - expr = instr.value - callname = guard(find_callname, pass_states.func_ir, expr) - if callname == ("reduce", "builtins") or callname == ( - "reduce", - "_functools", - ): - # reduce function with generic function - parfor = guard( - self._reduce_to_parfor, - equiv_set, - lhs, - expr.args, - loc, - ) - if parfor: - self.rewritten.append( - dict( - new=parfor, - old=instr, - reason="reduce", - ) - ) - instr = parfor - new_body.append(instr) - block.body = new_body - return - - def _reduce_to_parfor(self, equiv_set, lhs, args, loc): - """ - Convert a reduce call to a parfor. - The call arguments should be (call_name, array, init_value). 
- """ - pass_states = self.pass_states - - scope = lhs.scope - call_name = args[0] - in_arr = args[1] - arr_def = get_definition(pass_states.func_ir, in_arr.name) - - mask_var = None - mask_indices = None - - # Search for array[boolean_mask] - mask_query_result = guard( - _find_mask, pass_states.typemap, pass_states.func_ir, arr_def - ) - if mask_query_result: - in_arr, mask_var, mask_typ, mask_indices = mask_query_result - - init_val = args[2] - size_vars = equiv_set.get_shape( - in_arr if mask_indices is None else mask_var - ) - if size_vars is None: - return None - - index_vars, loopnests = _mk_parfor_loops( - pass_states.typemap, size_vars, scope, loc - ) - mask_index = index_vars - if mask_indices: - # the following is never tested - raise AssertionError("unreachable") - index_vars = tuple(x if x else index_vars[0] for x in mask_indices) - acc_var = lhs - - # init block has to init the reduction variable - init_block = ir.Block(scope, loc) - init_block.body.append(ir.Assign(init_val, acc_var, loc)) - - # produce loop body - body_label = next_label() - index_var, loop_body = self._mk_reduction_body( - call_name, scope, loc, index_vars, in_arr, acc_var - ) - if mask_indices: - # the following is never tested - raise AssertionError("unreachable") - index_var = mask_index[0] - - if mask_var is not None: - true_label = min(loop_body.keys()) - false_label = max(loop_body.keys()) - body_block = ir.Block(scope, loc) - loop_body[body_label] = body_block - mask = ir.Var(scope, mk_unique_var("$mask_val"), loc) - pass_states.typemap[mask.name] = mask_typ - mask_val = ir.Expr.getitem(mask_var, index_var, loc) - body_block.body.extend( - [ - ir.Assign(mask_val, mask, loc), - ir.Branch(mask, true_label, false_label, loc), - ] - ) - - parfor = Parfor( - loopnests, - init_block, - loop_body, - loc, - index_var, - equiv_set, - ("{} function".format(call_name), "reduction"), - pass_states.flags, - ) - if config.DEBUG_ARRAY_OPT >= 1: - print("parfor from reduction") - 
parfor.dump() - return parfor - - def _mk_reduction_body( - self, call_name, scope, loc, index_vars, in_arr, acc_var - ): - """ - Produce the body blocks for a reduction function indicated by call_name. - """ - from numba.core.inline_closurecall import check_reduce_func - - pass_states = self.pass_states - reduce_func = get_definition(pass_states.func_ir, call_name) - fcode = check_reduce_func(pass_states.func_ir, reduce_func) - - arr_typ = pass_states.typemap[in_arr.name] - in_typ = arr_typ.dtype - body_block = ir.Block(scope, loc) - index_var, index_var_type = _make_index_var( - pass_states.typemap, scope, index_vars, body_block - ) - - tmp_var = ir.Var(scope, mk_unique_var("$val"), loc) - pass_states.typemap[tmp_var.name] = in_typ - getitem_call = ir.Expr.getitem(in_arr, index_var, loc) - pass_states.calltypes[getitem_call] = signature( - in_typ, arr_typ, index_var_type - ) - body_block.append(ir.Assign(getitem_call, tmp_var, loc)) - - reduce_f_ir = compile_to_numba_ir( - fcode, - pass_states.func_ir.func_id.func.__globals__, - pass_states.typingctx, - pass_states.targetctx, - (in_typ, in_typ), - pass_states.typemap, - pass_states.calltypes, - ) - loop_body = reduce_f_ir.blocks - end_label = next_label() - end_block = ir.Block(scope, loc) - loop_body[end_label] = end_block - first_reduce_label = min(reduce_f_ir.blocks.keys()) - first_reduce_block = reduce_f_ir.blocks[first_reduce_label] - body_block.body.extend(first_reduce_block.body) - first_reduce_block.body = body_block.body - replace_arg_nodes(first_reduce_block, [acc_var, tmp_var]) - replace_returns(loop_body, acc_var, end_label) - return index_var, loop_body - - -class ConvertLoopPass: - """Build Parfor nodes from prange loops.""" - - def __init__(self, pass_states): - self.pass_states = pass_states - self.rewritten = [] - - def run(self, blocks): - pass_states = self.pass_states - - call_table, _ = get_call_table(blocks) - cfg = compute_cfg_from_blocks(blocks) - usedefs = compute_use_defs(blocks) - 
live_map = compute_live_map(cfg, blocks, usedefs.usemap, usedefs.defmap) - loops = cfg.loops() - sized_loops = [(loops[k], len(loops[k].body)) for k in loops.keys()] - moved_blocks = [] - # We go over all loops, smaller loops first (inner first) - for loop, s in sorted(sized_loops, key=lambda tup: tup[1]): - if len(loop.entries) != 1 or len(loop.exits) != 1: - continue - entry = list(loop.entries)[0] - for inst in blocks[entry].body: - # if prange or pndindex call - if ( - isinstance(inst, ir.Assign) - and isinstance(inst.value, ir.Expr) - and inst.value.op == "call" - and self._is_parallel_loop(inst.value.func.name, call_table) - ): - # Here we've found a parallel loop, either prange or pndindex. - # We create a parfor from this loop and then overwrite the contents - # of the original loop header block to contain this parfor and then - # a jump to the original loop exit block. Other blocks in the - # original loop are discarded. - body_labels = [ - l for l in loop.body if l in blocks and l != loop.header - ] - args = inst.value.args - loop_kind, loop_replacing = self._get_loop_kind( - inst.value.func.name, call_table - ) - # Get the body of the header of the loops minus the branch terminator - # The general approach is to prepend the header block to the first - # body block and then let dead code removal handle removing unneeded - # statements. Not all statements in the header block are unnecessary. - header_body = blocks[loop.header].body[:-1] - # find loop index variable (pair_first in header block) - loop_index = None - for hbi, stmt in enumerate(header_body): - if ( - isinstance(stmt, ir.Assign) - and isinstance(stmt.value, ir.Expr) - and stmt.value.op == "pair_first" - ): - loop_index = stmt.target.name - li_index = hbi - break - assert loop_index is not None - # Remove pair_first from header. - # We have to remove the pair_first by hand since it causes problems - # for some code below if we don't. 
- header_body = ( - header_body[:li_index] + header_body[li_index + 1 :] - ) - - # loop_index may be assigned to other vars - # get header copies to find all of them - cps, _ = get_block_copies( - {0: blocks[loop.header]}, pass_states.typemap - ) - cps = cps[0] - loop_index_vars = set(t for t, v in cps if v == loop_index) - loop_index_vars.add(loop_index) - - scope = blocks[entry].scope - loc = inst.loc - equiv_set = pass_states.array_analysis.get_equiv_set( - loop.header - ) - init_block = ir.Block(scope, loc) - init_block.body = self._get_prange_init_block( - blocks[entry], call_table, args - ) - loop_body = {l: blocks[l] for l in body_labels} - # Add an empty block to the end of loop body - end_label = next_label() - loop_body[end_label] = ir.Block(scope, loc) - - # Detect races in the prange. - # Races are defs in the parfor body that are live at the exit block. - bodydefs = set() - for bl in body_labels: - bodydefs = bodydefs.union(usedefs.defmap[bl]) - exit_lives = set() - for bl in loop.exits: - exit_lives = exit_lives.union(live_map[bl]) - races = bodydefs.intersection(exit_lives) - # It is possible for the result of an ir.Global to be flagged - # as a race if it is defined in this Parfor and then used in - # a subsequent Parfor. push_call_vars() in the Parfor pass - # copies such ir.Global nodes into the Parfors in which they - # are used so no need to treat things of type Module as a race. - races = races.intersection( - { - x - for x in races - if not isinstance( - pass_states.typemap[x], types.misc.Module - ) - } - ) - - # replace jumps to header block with the end block - for l in body_labels: - last_inst = loop_body[l].body[-1] - if ( - isinstance(last_inst, ir.Jump) - and last_inst.target == loop.header - ): - last_inst.target = end_label - - def find_indexed_arrays(): - """find expressions that involve getitem using the - index variable. Return both the arrays and expressions. 
- """ - indices = copy.copy(loop_index_vars) - for block in loop_body.values(): - for inst in block.find_insts(ir.Assign): - if ( - isinstance(inst.value, ir.Var) - and inst.value.name in indices - ): - indices.add(inst.target.name) - arrs = [] - exprs = [] - for block in loop_body.values(): - for inst in block.body: - lv = set(x.name for x in inst.list_vars()) - if lv & indices: - if lv.issubset(indices): - continue - require(isinstance(inst, ir.Assign)) - expr = inst.value - require( - isinstance(expr, ir.Expr) - and expr.op - in ["getitem", "static_getitem"] - ) - arrs.append(expr.value.name) - exprs.append(expr) - return arrs, exprs - - mask_var = None - mask_indices = None - - def find_mask_from_size(size_var): - """Find the case where size_var is defined by A[M].shape, - where M is a boolean array. - """ - size_def = get_definition(pass_states.func_ir, size_var) - require( - size_def - and isinstance(size_def, ir.Expr) - and size_def.op == "getattr" - and size_def.attr == "shape" - ) - arr_var = size_def.value - live_vars = set.union( - *[live_map[l] for l in loop.exits] - ) - index_arrs, index_exprs = find_indexed_arrays() - require([arr_var.name] == list(index_arrs)) - # input array has to be dead after loop - require(arr_var.name not in live_vars) - # loop for arr's definition, where size = arr.shape - arr_def = get_definition( - pass_states.func_ir, size_def.value - ) - result = _find_mask( - pass_states.typemap, pass_states.func_ir, arr_def - ) - - # The following is never tested. - raise AssertionError("unreachable") - # Found the mask. 
- # Replace B[i] with A[i], where B = A[M] - for expr in index_exprs: - expr.value = result[0] - return result - - # pndindex and prange are provably positive except when - # user provides negative start to prange() - unsigned_index = True - # TODO: support array mask optimization for prange - # TODO: refactor and simplify array mask optimization - if loop_kind == "pndindex": - assert equiv_set.has_shape(args[0]) - # see if input array to pndindex is output of array - # mask like B = A[M] - result = guard(find_mask_from_size, args[0]) - if result: - in_arr, mask_var, mask_typ, mask_indices = result - else: - in_arr = args[0] - assert isinstance(in_arr, ir.Var) - in_arr_typ = pass_states.typemap[in_arr.name] - if isinstance(in_arr_typ, types.Integer): - index_var = ir.Var( - scope, mk_unique_var("parfor_index"), loc - ) - pass_states.typemap[index_var.name] = types.uintp - loops = [LoopNest(index_var, 0, in_arr, 1)] - index_vars = [index_var] - else: - size_vars = equiv_set.get_shape( - in_arr if mask_indices is None else mask_var - ) - index_vars, loops = _mk_parfor_loops( - pass_states.typemap, - size_vars, - scope, - loc, - ) - assert len(loops) > 0 - orig_index = index_vars - if mask_indices: - # replace mask indices if required; - # integer indices of original array should be used - # instead of parfor indices - index_vars = tuple( - x if x else index_vars[0] for x in mask_indices - ) - first_body_block = loop_body[min(loop_body.keys())] - body_block = ir.Block(scope, loc) - index_var, index_var_typ = _make_index_var( - pass_states.typemap, - scope, - index_vars, - body_block, - force_tuple=True, - ) - body = body_block.body + first_body_block.body - first_body_block.body = body - if mask_indices: - orig_index_var = orig_index[0] - else: - orig_index_var = index_var - - # if masked array optimization is being applied, create - # the branch for array selection - if mask_var is not None: - # The following code are not tested - raise AssertionError("unreachable") 
- body_label = next_label() - # loop_body needs new labels greater than body_label - loop_body = add_offset_to_labels( - loop_body, - body_label - min(loop_body.keys()) + 1, - ) - labels = loop_body.keys() - true_label = min(labels) - false_label = max(labels) - body_block = ir.Block(scope, loc) - loop_body[body_label] = body_block - mask = ir.Var( - scope, mk_unique_var("$mask_val"), loc - ) - pass_states.typemap[mask.name] = mask_typ - mask_val = ir.Expr.getitem( - mask_var, orig_index_var, loc - ) - body_block.body.extend( - [ - ir.Assign(mask_val, mask, loc), - ir.Branch( - mask, true_label, false_label, loc - ), - ] - ) - else: # prange - start = 0 - step = 1 - size_var = args[0] - if len(args) == 2: - start = args[0] - size_var = args[1] - if len(args) == 3: - start = args[0] - size_var = args[1] - try: - step = pass_states.func_ir.get_definition( - args[2] - ) - except KeyError: - raise errors.UnsupportedRewriteError( - "Only known step size is supported for prange", - loc=inst.loc, - ) - if not isinstance(step, ir.Const): - raise errors.UnsupportedRewriteError( - "Only constant step size is supported for prange", - loc=inst.loc, - ) - step = step.value - if step != 1: - raise errors.UnsupportedRewriteError( - "Only constant step size of 1 is supported for prange", - loc=inst.loc, - ) - index_var = ir.Var( - scope, mk_unique_var("parfor_index"), loc - ) - # assume user-provided start to prange can be negative - # this is the only case parfor can have negative index - if isinstance(start, int) and start >= 0: - index_var_typ = types.uintp - else: - index_var_typ = types.intp - unsigned_index = False - loops = [LoopNest(index_var, start, size_var, step)] - pass_states.typemap[index_var.name] = index_var_typ - - # We can't just drop the header block since there can be things - # in there other than the prange looping infrastructure. 
- # So we just add the header to the first loop body block (minus the - # branch) and let dead code elimination remove the unnecessary parts. - first_body_label = min(loop_body.keys()) - loop_body[first_body_label].body = ( - header_body + loop_body[first_body_label].body - ) - - index_var_map = {v: index_var for v in loop_index_vars} - replace_vars(loop_body, index_var_map) - if unsigned_index: - # need to replace signed array access indices to enable - # optimizations (see #2846) - self._replace_loop_access_indices( - loop_body, loop_index_vars, index_var - ) - parfor = Parfor( - loops, - init_block, - loop_body, - loc, - orig_index_var if mask_indices else index_var, - equiv_set, - ("prange", loop_kind, loop_replacing), - pass_states.flags, - races=races, - ) - - blocks[loop.header].body = [parfor] - # We have to insert the header_body after the parfor because in - # a Numba loop this will be executed one more times before the - # branch and may contain instructions such as variable renamings - # that are relied upon later. 
- blocks[loop.header].body.extend(header_body) - blocks[loop.header].body.append( - ir.Jump(list(loop.exits)[0], loc) - ) - self.rewritten.append( - dict( - old_loop=loop, - new=parfor, - reason="loop", - ) - ) - # remove loop blocks from top level dict - for l in body_labels: - if l != loop.header: - blocks.pop(l) - if config.DEBUG_ARRAY_OPT >= 1: - print("parfor from loop") - parfor.dump() - - def _is_parallel_loop(self, func_var, call_table): - # prange can be either getattr (numba.prange) or global (prange) - if func_var not in call_table: - return False - call = call_table[func_var] - return len(call) > 0 and ( - call[0] == "prange" - or call[0] == prange - or call[0] == "internal_prange" - or call[0] == internal_prange - or call[0] == "pndindex" - or call[0] == pndindex - ) - - def _get_loop_kind(self, func_var, call_table): - """see if prange is user prange or internal""" - pass_states = self.pass_states - # prange can be either getattr (numba.prange) or global (prange) - assert func_var in call_table - call = call_table[func_var] - assert len(call) > 0 - kind = "user", "" - if call[0] == "internal_prange" or call[0] == internal_prange: - try: - kind = "internal", ( - pass_states.swapped_fns[func_var][0], - pass_states.swapped_fns[func_var][-1], - ) - except KeyError: - # FIXME: Fix this issue... the code didn't manage to trace the - # swapout for func_var so set the kind as internal so that the - # transform can occur, it's just not tracked - kind = "internal", ("", "") - elif call[0] == "pndindex" or call[0] == pndindex: - kind = "pndindex", "" - return kind - - def _get_prange_init_block(self, entry_block, call_table, prange_args): - """ - If there is init_prange, find the code between init_prange and prange - calls. Remove the code from entry_block and return it. 
- """ - init_call_ind = -1 - prange_call_ind = -1 - init_body = [] - for i, inst in enumerate(entry_block.body): - # if init_prange call - if ( - isinstance(inst, ir.Assign) - and isinstance(inst.value, ir.Expr) - and inst.value.op == "call" - and self._is_prange_init(inst.value.func.name, call_table) - ): - init_call_ind = i - if ( - isinstance(inst, ir.Assign) - and isinstance(inst.value, ir.Expr) - and inst.value.op == "call" - and self._is_parallel_loop(inst.value.func.name, call_table) - ): - prange_call_ind = i - if init_call_ind != -1 and prange_call_ind != -1: - # we save instructions that are used to calculate prange call args - # in the entry block. The rest go to parfor init_block - arg_related_vars = {v.name for v in prange_args} - saved_nodes = [] - for i in reversed(range(init_call_ind + 1, prange_call_ind)): - inst = entry_block.body[i] - inst_vars = {v.name for v in inst.list_vars()} - if arg_related_vars & inst_vars: - arg_related_vars |= inst_vars - saved_nodes.append(inst) - else: - init_body.append(inst) - - init_body.reverse() - saved_nodes.reverse() - entry_block.body = ( - entry_block.body[:init_call_ind] - + saved_nodes - + entry_block.body[prange_call_ind + 1 :] - ) - - return init_body - - def _is_prange_init(self, func_var, call_table): - if func_var not in call_table: - return False - call = call_table[func_var] - return len(call) > 0 and ( - call[0] == "init_prange" or call[0] == init_prange - ) - - def _replace_loop_access_indices(self, loop_body, index_set, new_index): - """ - Replace array access indices in a loop body with a new index. - index_set has all the variables that are equivalent to loop index. 
- """ - # treat new index like others since replacing it with itself is ok - index_set.add(new_index.name) - - with dummy_return_in_loop_body(loop_body): - labels = find_topo_order(loop_body) - - first_label = labels[0] - added_indices = set() - - # traverse loop body and replace indices in getitem/setitem with - # new_index if possible. - # also, find equivalent indices defined in first block. - for l in labels: - block = loop_body[l] - for stmt in block.body: - if isinstance(stmt, ir.Assign) and isinstance( - stmt.value, ir.Var - ): - # the first block dominates others so we can use copies - # of indices safely - if ( - l == first_label - and stmt.value.name in index_set - and stmt.target.name not in index_set - ): - index_set.add(stmt.target.name) - added_indices.add(stmt.target.name) - # make sure parallel index is not overwritten - else: - scope = block.scope - - def unver(name): - from numba.core import errors - - try: - return scope.get_exact(name).unversioned_name - except errors.NotDefinedError: - return name - - if unver(stmt.target.name) in map( - unver, index_set - ) and unver(stmt.target.name) != unver(stmt.value.name): - raise errors.UnsupportedRewriteError( - "Overwrite of parallel loop index", - loc=stmt.target.loc, - ) - - if is_get_setitem(stmt): - index = index_var_of_get_setitem(stmt) - # statics can have none indices - if index is None: - continue - ind_def = guard( - get_definition, - self.pass_states.func_ir, - index, - lhs_only=True, - ) - if index.name in index_set or ( - ind_def is not None and ind_def.name in index_set - ): - set_index_var_of_get_setitem(stmt, new_index) - # corner case where one dimension of a multi-dim access - # should be replaced - guard( - self._replace_multi_dim_ind, - ind_def, - index_set, - new_index, - ) - - if isinstance(stmt, Parfor): - self._replace_loop_access_indices( - stmt.loop_body, index_set, new_index - ) - - # remove added indices for correct recursive parfor handling - index_set -= added_indices - 
return - - def _replace_multi_dim_ind(self, ind_var, index_set, new_index): - """ - replace individual indices in multi-dimensional access variable, which - is a build_tuple - """ - pass_states = self.pass_states - require(ind_var is not None) - # check for Tuple instead of UniTuple since some dims could be slices - require( - isinstance( - pass_states.typemap[ind_var.name], (types.Tuple, types.UniTuple) - ) - ) - ind_def_node = get_definition(pass_states.func_ir, ind_var) - require( - isinstance(ind_def_node, ir.Expr) - and ind_def_node.op == "build_tuple" - ) - ind_def_node.items = [ - new_index if v.name in index_set else v for v in ind_def_node.items - ] - - -def _find_mask(typemap, func_ir, arr_def): - """check if an array is of B[...M...], where M is a - boolean array, and other indices (if available) are ints. - If found, return B, M, M's type, and a tuple representing mask indices. - Otherwise, raise GuardException. - """ - require(isinstance(arr_def, ir.Expr) and arr_def.op == "getitem") - value = arr_def.value - index = arr_def.index - value_typ = typemap[value.name] - index_typ = typemap[index.name] - ndim = value_typ.ndim - require(isinstance(value_typ, types.npytypes.Array)) - if ( - isinstance(index_typ, types.npytypes.Array) - and isinstance(index_typ.dtype, types.Boolean) - and ndim == index_typ.ndim - ): - return value, index, index_typ.dtype, None - elif isinstance(index_typ, types.BaseTuple): - # Handle multi-dimension differently by requiring - # all indices to be constant except the one for mask. 
- seq, op = find_build_sequence(func_ir, index) - require(op == "build_tuple" and len(seq) == ndim) - count_consts = 0 - mask_indices = [] - mask_var = None - for ind in seq: - index_typ = typemap[ind.name] - # Handle boolean mask - if isinstance(index_typ, types.npytypes.Array) and isinstance( - index_typ.dtype, types.Boolean - ): - mask_var = ind - mask_typ = index_typ.dtype - mask_indices.append(None) - # Handle integer array selector - elif isinstance(index_typ, types.npytypes.Array) and isinstance( - index_typ.dtype, types.Integer - ): - mask_var = ind - mask_typ = index_typ.dtype - mask_indices.append(None) - # Handle integer index - elif isinstance(index_typ, types.Integer): - count_consts += 1 - mask_indices.append(ind) - - require(mask_var and count_consts == ndim - 1) - return value, mask_var, mask_typ, mask_indices - raise GuardException - - -class ParforPass(ParforPassStates): - - """ParforPass class is responsible for converting NumPy - calls in Numba intermediate representation to Parfors, which - will lower into either sequential or parallel loops during lowering - stage. - """ - - def _pre_run(self): - # run array analysis, a pre-requisite for parfor translation - self.array_analysis.run(self.func_ir.blocks) - # NOTE: Prepare _the_max_label. 
See #6102 - ir_utils._the_max_label.update( - ir_utils.find_max_label(self.func_ir.blocks) - ) - - def run(self): - """run parfor conversion pass: replace Numpy calls - with Parfors when possible and optimize the IR.""" - self._pre_run() - # run stencil translation to parfor - if self.options.stencil: - stencil_pass = StencilPass( - self.func_ir, - self.typemap, - self.calltypes, - self.array_analysis, - self.typingctx, - self.targetctx, - self.flags, - ) - stencil_pass.run() - if self.options.setitem: - ConvertSetItemPass(self).run(self.func_ir.blocks) - if self.options.numpy: - ConvertNumpyPass(self).run(self.func_ir.blocks) - if self.options.reduction: - ConvertReducePass(self).run(self.func_ir.blocks) - if self.options.prange: - ConvertLoopPass(self).run(self.func_ir.blocks) - if self.options.inplace_binop: - ConvertInplaceBinop(self).run(self.func_ir.blocks) - - # setup diagnostics now parfors are found - self.diagnostics.setup(self.func_ir, self.options.fusion) - - dprint_func_ir(self.func_ir, "after parfor pass") - - def _find_mask(self, arr_def): - """check if an array is of B[...M...], where M is a - boolean array, and other indices (if available) are ints. - If found, return B, M, M's type, and a tuple representing mask indices. - Otherwise, raise GuardException. - """ - return _find_mask(self.typemap, self.func_ir, arr_def) - - def _mk_parfor_loops(self, size_vars, scope, loc): - """ - Create loop index variables and build LoopNest objects for a parfor. 
- """ - return _mk_parfor_loops(self.typemap, size_vars, scope, loc) - - -class ParforFusionPass(ParforPassStates): - - """ParforFusionPass class is responsible for fusing parfors""" - - def run(self): - """run parfor fusion pass""" - - # simplify CFG of parfor body loops since nested parfors with extra - # jumps can be created with prange conversion - n_parfors = simplify_parfor_body_CFG(self.func_ir.blocks) - # simplify before fusion - simplify( - self.func_ir, self.typemap, self.calltypes, self.metadata["parfors"] - ) - # need two rounds of copy propagation to enable fusion of long sequences - # of parfors like test_fuse_argmin (some PYTHONHASHSEED values since - # apply_copies_parfor depends on set order for creating dummy assigns) - simplify( - self.func_ir, self.typemap, self.calltypes, self.metadata["parfors"] - ) - - if self.options.fusion and n_parfors >= 2: - self.func_ir._definitions = build_definitions(self.func_ir.blocks) - self.array_analysis.equiv_sets = dict() - self.array_analysis.run(self.func_ir.blocks) - - # Get parfor params to calculate reductions below. - _, parfors = get_parfor_params( - self.func_ir.blocks, - self.options.fusion, - self.nested_fusion_info, - ) - - # Find reductions so that fusion can be disallowed if a - # subsequent parfor read a reduction variable. 
- for p in parfors: - p.redvars, p.reddict = get_parfor_reductions( - self.func_ir, p, p.params, self.calltypes - ) - - # reorder statements to maximize fusion - # push non-parfors down - maximize_fusion( - self.func_ir, - self.func_ir.blocks, - self.typemap, - up_direction=False, - ) - dprint_func_ir(self.func_ir, "after maximize fusion down") - self.fuse_parfors( - self.array_analysis, - self.func_ir.blocks, - self.func_ir, - self.typemap, - ) - dprint_func_ir(self.func_ir, "after first fuse") - # push non-parfors up - maximize_fusion(self.func_ir, self.func_ir.blocks, self.typemap) - dprint_func_ir(self.func_ir, "after maximize fusion up") - # try fuse again after maximize - self.fuse_parfors( - self.array_analysis, - self.func_ir.blocks, - self.func_ir, - self.typemap, - ) - dprint_func_ir(self.func_ir, "after fusion") - # remove dead code after fusion to remove extra arrays and variables - simplify( - self.func_ir, - self.typemap, - self.calltypes, - self.metadata["parfors"], - ) - - def fuse_parfors(self, array_analysis, blocks, func_ir, typemap): - for label, block in blocks.items(): - equiv_set = array_analysis.get_equiv_set(label) - fusion_happened = True - while fusion_happened: - fusion_happened = False - new_body = [] - i = 0 - while i < len(block.body) - 1: - stmt = block.body[i] - next_stmt = block.body[i + 1] - if isinstance(stmt, Parfor) and isinstance( - next_stmt, Parfor - ): - # we have to update equiv_set since they have changed due to - # variables being renamed before fusion. 
- equiv_set = array_analysis.get_equiv_set(label) - stmt.equiv_set = equiv_set - next_stmt.equiv_set = equiv_set - fused_node, fuse_report = try_fuse( - equiv_set, - stmt, - next_stmt, - self.metadata["parfors"], - func_ir, - typemap, - ) - # accumulate fusion reports - self.diagnostics.fusion_reports.append(fuse_report) - if fused_node is not None: - fusion_happened = True - self.diagnostics.fusion_info[stmt.id].extend( - [next_stmt.id] - ) - new_body.append(fused_node) - self.fuse_recursive_parfor( - fused_node, equiv_set, func_ir, typemap - ) - i += 2 - continue - new_body.append(stmt) - if isinstance(stmt, Parfor): - self.fuse_recursive_parfor( - stmt, equiv_set, func_ir, typemap - ) - i += 1 - new_body.append(block.body[-1]) - block.body = new_body - return - - def fuse_recursive_parfor(self, parfor, equiv_set, func_ir, typemap): - blocks = wrap_parfor_blocks(parfor) - maximize_fusion(self.func_ir, blocks, self.typemap) - dprint_func_ir( - self.func_ir, "after recursive maximize fusion down", blocks - ) - arr_analysis = array_analysis.ArrayAnalysis( - self.typingctx, self.func_ir, self.typemap, self.calltypes - ) - arr_analysis.run(blocks, equiv_set) - self.fuse_parfors(arr_analysis, blocks, func_ir, typemap) - unwrap_parfor_blocks(parfor) - - -class ParforPreLoweringPass(ParforPassStates): - - """ParforPreLoweringPass class is responsible for preparing parfors for lowering.""" - - def run(self): - """run parfor prelowering pass""" - - # push function call variables inside parfors so gufunc function - # wouldn't need function variables as argument - push_call_vars(self.func_ir.blocks, {}, {}, self.typemap) - dprint_func_ir(self.func_ir, "after push call vars") - # simplify again - simplify( - self.func_ir, self.typemap, self.calltypes, self.metadata["parfors"] - ) - dprint_func_ir(self.func_ir, "after optimization") - if config.DEBUG_ARRAY_OPT >= 1: - print("variable types: ", sorted(self.typemap.items())) - print("call types: ", self.calltypes) - - if 
config.DEBUG_ARRAY_OPT >= 3: - for block_label, block in self.func_ir.blocks.items(): - new_block = [] - scope = block.scope - for stmt in block.body: - new_block.append(stmt) - if isinstance(stmt, ir.Assign): - loc = stmt.loc - lhs = stmt.target - rhs = stmt.value - lhs_typ = self.typemap[lhs.name] - print( - "Adding print for assignment to ", - lhs.name, - lhs_typ, - type(lhs_typ), - ) - if lhs_typ in types.number_domain or isinstance( - lhs_typ, types.Literal - ): - str_var = ir.Var( - scope, mk_unique_var("str_var"), loc - ) - self.typemap[str_var.name] = types.StringLiteral( - lhs.name - ) - lhs_const = ir.Const(lhs.name, loc) - str_assign = ir.Assign(lhs_const, str_var, loc) - new_block.append(str_assign) - str_print = ir.Print([str_var], None, loc) - self.calltypes[str_print] = signature( - types.none, self.typemap[str_var.name] - ) - new_block.append(str_print) - ir_print = ir.Print([lhs], None, loc) - self.calltypes[ir_print] = signature( - types.none, lhs_typ - ) - new_block.append(ir_print) - block.body = new_block - - if self.func_ir.is_generator: - fix_generator_types( - self.func_ir.generator_info, self.return_type, self.typemap - ) - if sequential_parfor_lowering: - lower_parfor_sequential( - self.typingctx, - self.func_ir, - self.typemap, - self.calltypes, - self.metadata, - ) - else: - # prepare for parallel lowering - # add parfor params to parfors here since lowering is destructive - # changing the IR after this is not allowed - parfor_ids, parfors = get_parfor_params( - self.func_ir.blocks, - self.options.fusion, - self.nested_fusion_info, - ) - - # Validate reduction in parfors. 
- for p in parfors: - p.redvars, p.reddict = get_parfor_reductions( - self.func_ir, p, p.params, self.calltypes - ) - - # Validate parameters: - for p in parfors: - p.validate_params(self.typemap) - - if config.DEBUG_ARRAY_OPT_STATS: - name = self.func_ir.func_id.func_qualname - n_parfors = len(parfor_ids) - if n_parfors > 0: - after_fusion = ( - "After fusion" - if self.options.fusion - else "With fusion disabled" - ) - print( - ( - "{}, function {} has " - "{} parallel for-loop(s) #{}." - ).format(after_fusion, name, n_parfors, parfor_ids) - ) - else: - print("Function {} has no Parfor.".format(name)) - - -def _remove_size_arg(call_name, expr): - "remove size argument from args or kws" - # remove size kwarg - kws = dict(expr.kws) - kws.pop("size", "") - expr.kws = tuple(kws.items()) - - # remove size arg if available - if call_name in random_1arg_size + random_int_args: - # these calls have only a "size" argument or list of ints - # so remove all args - expr.args = [] - - if call_name in random_3arg_sizelast: - # normal, uniform, ... 
have 3 args, last one is size - if len(expr.args) == 3: - expr.args.pop() - - if call_name in random_2arg_sizelast: - # have 2 args, last one is size - if len(expr.args) == 2: - expr.args.pop() - - if call_name == "randint": - # has 4 args, 3rd one is size - if len(expr.args) == 3: - expr.args.pop() - if len(expr.args) == 4: - dt_arg = expr.args.pop() - expr.args.pop() # remove size - expr.args.append(dt_arg) - - if call_name == "triangular": - # has 4 args, last one is size - if len(expr.args) == 4: - expr.args.pop() - - return - - -def _get_call_arg_types(expr, typemap): - new_arg_typs = [] - for arg in expr.args: - new_arg_typs.append(typemap[arg.name]) - - new_kw_types = {} - for name, arg in expr.kws: - new_kw_types[name] = typemap[arg.name] - - return tuple(new_arg_typs), new_kw_types - - -def _ufunc_to_parfor_instr( - typemap, - op, - avail_vars, - loc, - scope, - func_ir, - out_ir, - arg_vars, - typingctx, - calltypes, - expr_out_var, -): - func_var_name = _find_func_var(typemap, op, avail_vars, loc=loc) - func_var = ir.Var(scope, mk_unique_var(func_var_name), loc) - typemap[func_var.name] = typemap[func_var_name] - func_var_def = copy.deepcopy(func_ir.get_definition(func_var_name)) - if ( - isinstance(func_var_def, ir.Expr) - and func_var_def.op == "getattr" - and func_var_def.attr == "sqrt" - ): - g_math_var = ir.Var(scope, mk_unique_var("$math_g_var"), loc) - typemap[g_math_var.name] = types.misc.Module(math) - g_math = ir.Global("math", math, loc) - g_math_assign = ir.Assign(g_math, g_math_var, loc) - func_var_def = ir.Expr.getattr(g_math_var, "sqrt", loc) - out_ir.append(g_math_assign) - # out_ir.append(func_var_def) - ir_expr = ir.Expr.call(func_var, arg_vars, (), loc) - call_typ = typemap[func_var.name].get_call_type( - typingctx, tuple(typemap[a.name] for a in arg_vars), {} - ) - calltypes[ir_expr] = call_typ - el_typ = call_typ.return_type - # signature(el_typ, el_typ) - out_ir.append(ir.Assign(func_var_def, func_var, loc)) - 
out_ir.append(ir.Assign(ir_expr, expr_out_var, loc)) - - -def _arrayexpr_tree_to_ir( - func_ir, - typingctx, - typemap, - calltypes, - equiv_set, - init_block, - expr_out_var, - expr, - parfor_index_tuple_var, - all_parfor_indices, - avail_vars, -): - """generate IR from array_expr's expr tree recursively. Assign output to - expr_out_var and returns the whole IR as a list of Assign nodes. - """ - el_typ = typemap[expr_out_var.name] - scope = expr_out_var.scope - loc = expr_out_var.loc - out_ir = [] - - if isinstance(expr, tuple): - op, arr_expr_args = expr - arg_vars = [] - for arg in arr_expr_args: - arg_out_var = ir.Var(scope, mk_unique_var("$arg_out_var"), loc) - typemap[arg_out_var.name] = el_typ - out_ir += _arrayexpr_tree_to_ir( - func_ir, - typingctx, - typemap, - calltypes, - equiv_set, - init_block, - arg_out_var, - arg, - parfor_index_tuple_var, - all_parfor_indices, - avail_vars, - ) - arg_vars.append(arg_out_var) - if op in npydecl.supported_array_operators: - el_typ1 = typemap[arg_vars[0].name] - if len(arg_vars) == 2: - el_typ2 = typemap[arg_vars[1].name] - func_typ = typingctx.resolve_function_type( - op, (el_typ1, el_typ2), {} - ) - ir_expr = ir.Expr.binop(op, arg_vars[0], arg_vars[1], loc) - if op == operator.truediv: - func_typ, ir_expr = _gen_np_divide( - arg_vars[0], arg_vars[1], out_ir, typemap - ) - else: - func_typ = typingctx.resolve_function_type(op, (el_typ1,), {}) - ir_expr = ir.Expr.unary(op, arg_vars[0], loc) - calltypes[ir_expr] = func_typ - el_typ = func_typ.return_type - out_ir.append(ir.Assign(ir_expr, expr_out_var, loc)) - for T in array_analysis.MAP_TYPES: - if isinstance(op, T): - # elif isinstance(op, (np.ufunc, DUFunc)): - # function calls are stored in variables which are not removed - # op is typing_key to the variables type - _ufunc_to_parfor_instr( - typemap, - op, - avail_vars, - loc, - scope, - func_ir, - out_ir, - arg_vars, - typingctx, - calltypes, - expr_out_var, - ) - if hasattr(op, "is_dpnp_ufunc"): - 
_ufunc_to_parfor_instr( - typemap, - op, - avail_vars, - loc, - scope, - func_ir, - out_ir, - arg_vars, - typingctx, - calltypes, - expr_out_var, - ) - elif isinstance(expr, ir.Var): - var_typ = typemap[expr.name] - if isinstance(var_typ, types.Array): - el_typ = var_typ.dtype - ir_expr = _gen_arrayexpr_getitem( - equiv_set, - expr, - parfor_index_tuple_var, - all_parfor_indices, - el_typ, - calltypes, - typingctx, - typemap, - init_block, - out_ir, - ) - else: - # assert typemap[expr.name]==el_typ - el_typ = var_typ - ir_expr = expr - out_ir.append(ir.Assign(ir_expr, expr_out_var, loc)) - elif isinstance(expr, ir.Const): - el_typ = typing.Context().resolve_value_type(expr.value) - out_ir.append(ir.Assign(expr, expr_out_var, loc)) - - if len(out_ir) == 0: - raise errors.UnsupportedRewriteError( - f"Don't know how to translate array expression '{expr:r}'", - loc=expr.loc, - ) - typemap.pop(expr_out_var.name, None) - typemap[expr_out_var.name] = el_typ - return out_ir - - -def _gen_np_divide(arg1, arg2, out_ir, typemap): - """generate np.divide() instead of / for array_expr to get numpy error model - like inf for division by zero (test_division_by_zero). 
- """ - scope = arg1.scope - loc = arg1.loc - # g_np_var = Global(numpy) - g_np_var = ir.Var(scope, mk_unique_var("$np_g_var"), loc) - typemap[g_np_var.name] = types.misc.Module(numpy) - g_np = ir.Global("np", numpy, loc) - g_np_assign = ir.Assign(g_np, g_np_var, loc) - # attr call: div_attr = getattr(g_np_var, divide) - div_attr_call = ir.Expr.getattr(g_np_var, "divide", loc) - attr_var = ir.Var(scope, mk_unique_var("$div_attr"), loc) - func_var_typ = get_np_ufunc_typ(numpy.divide) - typemap[attr_var.name] = func_var_typ - attr_assign = ir.Assign(div_attr_call, attr_var, loc) - # divide call: div_attr(arg1, arg2) - div_call = ir.Expr.call(attr_var, [arg1, arg2], (), loc) - func_typ = func_var_typ.get_call_type( - typing.Context(), [typemap[arg1.name], typemap[arg2.name]], {} - ) - out_ir.extend([g_np_assign, attr_assign]) - return func_typ, div_call - - -def _gen_arrayexpr_getitem( - equiv_set, - var, - parfor_index_tuple_var, - all_parfor_indices, - el_typ, - calltypes, - typingctx, - typemap, - init_block, - out_ir, -): - """if there is implicit dimension broadcast, generate proper access variable - for getitem. For example, if indices are (i1,i2,i3) but shape is (c1,0,c3), - generate a tuple with (i1,0,i3) for access. Another example: for (i1,i2,i3) - and (c1,c2) generate (i2,i3). 
- """ - loc = var.loc - index_var = parfor_index_tuple_var - var_typ = typemap[var.name] - ndims = typemap[var.name].ndim - num_indices = len(all_parfor_indices) - size_vars = equiv_set.get_shape(var) or [] - size_consts = [equiv_set.get_equiv_const(x) for x in size_vars] - # Handle array-scalar - if ndims == 0: - # call np.ravel - ravel_var = ir.Var(var.scope, mk_unique_var("$ravel"), loc) - ravel_typ = types.npytypes.Array( - dtype=var_typ.dtype, ndim=1, layout="C" - ) - typemap[ravel_var.name] = ravel_typ - stmts = ir_utils.gen_np_call( - "ravel", - numpy.ravel, - ravel_var, - [var], - typingctx, - typemap, - calltypes, - ) - init_block.body.extend(stmts) - var = ravel_var - # Const(0) - const_node = ir.Const(0, var.loc) - const_var = ir.Var(var.scope, mk_unique_var("$const_ind_0"), loc) - typemap[const_var.name] = types.uintp - const_assign = ir.Assign(const_node, const_var, loc) - out_ir.append(const_assign) - index_var = const_var - # Handle 1d array - elif ndims == 1: - # Use last index for 1D arrays - index_var = all_parfor_indices[-1] - # Handle known constant size - elif any([x is not None for x in size_consts]): - # Need a tuple as index - ind_offset = num_indices - ndims - tuple_var = ir.Var( - var.scope, mk_unique_var("$parfor_index_tuple_var_bcast"), loc - ) - typemap[tuple_var.name] = types.containers.UniTuple(types.uintp, ndims) - # Just in case, const var for size 1 dim access index: $const0 = - # Const(0) - const_node = ir.Const(0, var.loc) - const_var = ir.Var(var.scope, mk_unique_var("$const_ind_0"), loc) - typemap[const_var.name] = types.uintp - const_assign = ir.Assign(const_node, const_var, loc) - out_ir.append(const_assign) - index_vars = [] - for i in reversed(range(ndims)): - size_var = size_vars[i] - size_const = size_consts[i] - if size_const == 1: - index_vars.append(const_var) - else: - index_vars.append(all_parfor_indices[ind_offset + i]) - index_vars = list(reversed(index_vars)) - tuple_call = ir.Expr.build_tuple(index_vars, loc) - 
tuple_assign = ir.Assign(tuple_call, tuple_var, loc) - out_ir.append(tuple_assign) - index_var = tuple_var - - ir_expr = ir.Expr.getitem(var, index_var, loc) - calltypes[ir_expr] = signature( - el_typ, typemap[var.name], typemap[index_var.name] - ) - return ir_expr - - -def _find_func_var(typemap, func, avail_vars, loc): - """find variable in typemap which represents the function func.""" - for v in avail_vars: - t = typemap[v] - # Function types store actual functions in typing_key. - if isinstance(t, Function) and t.typing_key == func: - return v - raise errors.UnsupportedRewriteError( - "ufunc call variable not found", loc=loc - ) - - -def lower_parfor_sequential(typingctx, func_ir, typemap, calltypes, metadata): - ir_utils._the_max_label.update(ir_utils.find_max_label(func_ir.blocks)) - parfor_found = False - new_blocks = {} - scope = next(iter(func_ir.blocks.values())).scope - for block_label, block in func_ir.blocks.items(): - block_label, parfor_found = _lower_parfor_sequential_block( - block_label, - block, - new_blocks, - typemap, - calltypes, - parfor_found, - scope=scope, - ) - # old block stays either way - new_blocks[block_label] = block - func_ir.blocks = new_blocks - # rename only if parfor found and replaced (avoid test_flow_control error) - if parfor_found: - func_ir.blocks = rename_labels(func_ir.blocks) - dprint_func_ir(func_ir, "after parfor sequential lowering") - simplify(func_ir, typemap, calltypes, metadata["parfors"]) - dprint_func_ir(func_ir, "after parfor sequential simplify") - - -def _lower_parfor_sequential_block( - block_label, block, new_blocks, typemap, calltypes, parfor_found, scope -): - i = _find_first_parfor(block.body) - while i != -1: - parfor_found = True - inst = block.body[i] - loc = inst.init_block.loc - # split block across parfor - prev_block = ir.Block(scope, loc) - prev_block.body = block.body[:i] - block.body = block.body[i + 1 :] - # previous block jump to parfor init block - init_label = next_label() - 
prev_block.body.append(ir.Jump(init_label, loc)) - new_blocks[init_label] = transfer_scope(inst.init_block, scope) - new_blocks[block_label] = prev_block - block_label = next_label() - - ndims = len(inst.loop_nests) - for i in range(ndims): - loopnest = inst.loop_nests[i] - # create range block for loop - range_label = next_label() - header_label = next_label() - range_block = mk_range_block( - typemap, - loopnest.start, - loopnest.stop, - loopnest.step, - calltypes, - scope, - loc, - ) - range_block.body[-1].target = header_label # fix jump target - phi_var = range_block.body[-2].target - new_blocks[range_label] = range_block - header_block = mk_loop_header( - typemap, phi_var, calltypes, scope, loc - ) - header_block.body[-2].target = loopnest.index_variable - new_blocks[header_label] = header_block - # jump to this new inner loop - if i == 0: - inst.init_block.body.append(ir.Jump(range_label, loc)) - header_block.body[-1].falsebr = block_label - else: - new_blocks[prev_header_label].body[-1].truebr = range_label - header_block.body[-1].falsebr = prev_header_label - prev_header_label = header_label # to set truebr next loop - - # last body block jump to inner most header - body_last_label = max(inst.loop_body.keys()) - inst.loop_body[body_last_label].body.append(ir.Jump(header_label, loc)) - # inner most header jumps to first body block - body_first_label = min(inst.loop_body.keys()) - header_block.body[-1].truebr = body_first_label - # add parfor body to blocks - for l, b in inst.loop_body.items(): - l, parfor_found = _lower_parfor_sequential_block( - l, b, new_blocks, typemap, calltypes, parfor_found, scope=scope - ) - new_blocks[l] = transfer_scope(b, scope) - i = _find_first_parfor(block.body) - return block_label, parfor_found - - -def _find_first_parfor(body): - for i, inst in enumerate(body): - if isinstance(inst, Parfor) and not inst.no_sequential_lowering: - return i - return -1 - - -def get_parfor_params(blocks, options_fusion, fusion_info): - """find 
variables used in body of parfors from outside and save them. - computed as live variables at entry of first block. - """ - - # since parfor wrap creates a back-edge to first non-init basic block, - # live_map[first_non_init_block] contains variables defined in parfor body - # that could be undefined before. So we only consider variables that are - # actually defined before the parfor body in the program. - parfor_ids = set() - parfors = [] - pre_defs = set() - _, all_defs = compute_use_defs(blocks) - topo_order = find_topo_order(blocks) - for label in topo_order: - block = blocks[label] - for i, parfor in _find_parfors(block.body): - # find variable defs before the parfor in the same block - dummy_block = ir.Block(block.scope, block.loc) - dummy_block.body = block.body[:i] - before_defs = compute_use_defs({0: dummy_block}).defmap[0] - pre_defs |= before_defs - params = get_parfor_params_inner( - parfor, - pre_defs, - options_fusion, - fusion_info, - ) - parfor.params, parfor.races = _combine_params_races_for_ssa_names( - block.scope, - params, - parfor.races, - ) - parfor_ids.add(parfor.id) - parfors.append(parfor) - - pre_defs |= all_defs[label] - return parfor_ids, parfors - - -def _combine_params_races_for_ssa_names(scope, params, races): - """Returns `(params|races1, races1)`, where `races1` contains all variables - in `races` are NOT referring to the same unversioned (SSA) variables in - `params`. 
- """ - - def unversion(k): - try: - return scope.get_exact(k).unversioned_name - except ir.NotDefinedError: - # XXX: it's a bug that something references an undefined name - return k - - races1 = set(races) - unver_params = list(map(unversion, params)) - - for rv in races: - if any(unversion(rv) == pv for pv in unver_params): - races1.discard(rv) - else: - break - - return params | races1, races1 - - -def get_parfor_params_inner(parfor, pre_defs, options_fusion, fusion_info): - blocks = wrap_parfor_blocks(parfor) - cfg = compute_cfg_from_blocks(blocks) - usedefs = compute_use_defs(blocks) - live_map = compute_live_map(cfg, blocks, usedefs.usemap, usedefs.defmap) - parfor_ids, _ = get_parfor_params(blocks, options_fusion, fusion_info) - n_parfors = len(parfor_ids) - if n_parfors > 0: - if config.DEBUG_ARRAY_OPT_STATS: - after_fusion = ( - "After fusion" if options_fusion else "With fusion disabled" - ) - print( - ( - "{}, parallel for-loop {} has " "nested Parfor(s) #{}." - ).format(after_fusion, parfor.id, n_parfors, parfor_ids) - ) - fusion_info[parfor.id] = list(parfor_ids) - - unwrap_parfor_blocks(parfor) - keylist = sorted(live_map.keys()) - init_block = keylist[0] - first_non_init_block = keylist[1] - - before_defs = usedefs.defmap[init_block] | pre_defs - params = live_map[first_non_init_block] & before_defs - return params - - -def _find_parfors(body): - for i, inst in enumerate(body): - if isinstance(inst, Parfor): - yield i, inst - - -def get_parfor_outputs(parfor, parfor_params): - """get arrays that are written to inside the parfor and need to be passed - as parameters to gufunc. - """ - # FIXME: The following assumes the target of all SetItem are outputs, - # which is wrong! 
- last_label = max(parfor.loop_body.keys()) - outputs = [] - for blk in parfor.loop_body.values(): - for stmt in blk.body: - if ( - isinstance(stmt, (ir.StaticSetItem, ir.SetItem)) - and get_index_var(stmt).name == parfor.index_var.name - ): - outputs.append(stmt.target.name) - # make sure these written arrays are in parfor parameters (live coming in) - outputs = list(set(outputs) & set(parfor_params)) - return sorted(outputs) - - -_RedVarInfo = make_dataclass( - "_RedVarInfo", - ["init_val", "reduce_nodes", "redop"], - frozen=True, -) - - -def get_parfor_reductions( - func_ir, - parfor, - parfor_params, - calltypes, - reductions=None, - reduce_varnames=None, - param_uses=None, - param_nodes=None, - var_to_param=None, -): - """find variables that are updated using their previous values and an array - item accessed with parfor index, e.g. s = s+A[i] - """ - if reductions is None: - reductions = {} - if reduce_varnames is None: - reduce_varnames = [] - - # for each param variable, find what other variables are used to update it - # also, keep the related nodes - if param_uses is None: - param_uses = defaultdict(list) - if param_nodes is None: - param_nodes = defaultdict(list) - if var_to_param is None: - var_to_param = {} - - blocks = wrap_parfor_blocks(parfor) - topo_order = find_topo_order(blocks) - topo_order = topo_order[1:] # ignore init block - unwrap_parfor_blocks(parfor) - - for label in reversed(topo_order): - for stmt in reversed(parfor.loop_body[label].body): - if isinstance(stmt, ir.Assign) and ( - stmt.target.name in parfor_params - or stmt.target.name in var_to_param - ): - lhs = stmt.target - rhs = stmt.value - cur_param = ( - lhs if lhs.name in parfor_params else var_to_param[lhs.name] - ) - used_vars = [] - if isinstance(rhs, ir.Var): - used_vars = [rhs.name] - elif isinstance(rhs, ir.Expr): - used_vars = [v.name for v in stmt.value.list_vars()] - param_uses[cur_param].extend(used_vars) - for v in used_vars: - var_to_param[v] = cur_param - # save 
copy of dependent stmt - stmt_cp = copy.deepcopy(stmt) - if stmt.value in calltypes: - calltypes[stmt_cp.value] = calltypes[stmt.value] - param_nodes[cur_param].append(stmt_cp) - if isinstance(stmt, Parfor): - # recursive parfors can have reductions like test_prange8 - get_parfor_reductions( - func_ir, - stmt, - parfor_params, - calltypes, - reductions, - reduce_varnames, - None, - param_nodes, - var_to_param, - ) - - for param, used_vars in param_uses.items(): - # a parameter is a reduction variable if its value is used to update it - # check reduce_varnames since recursive parfors might have processed - # param already - param_name = param.name - if param_name in used_vars and param_name not in reduce_varnames: - param_nodes[param].reverse() - reduce_nodes = get_reduce_nodes(param, param_nodes[param], func_ir) - # Certain kinds of ill-formed Python (like potentially undefined - # variables) in combination with SSA can make things look like - # reductions except that they don't have reduction operators. - # If we get to this point but don't find a reduction operator - # then assume it is this situation and just don't treat this - # variable as a reduction. - if reduce_nodes is not None: - reduce_varnames.append(param_name) - check_conflicting_reduction_operators(param, reduce_nodes) - gri_out = guard(get_reduction_init, reduce_nodes) - if gri_out is not None: - init_val, redop = gri_out - else: - init_val = None - redop = None - reductions[param_name] = _RedVarInfo( - init_val=init_val, - reduce_nodes=reduce_nodes, - redop=redop, - ) - - return reduce_varnames, reductions - - -def check_conflicting_reduction_operators(param, nodes): - """In prange, a user could theoretically specify conflicting - reduction operators. For example, in one spot it is += and - another spot *=. Here, we raise an exception if multiple - different reduction operators are used in one prange. 
- """ - first_red_func = None - for node in nodes: - if ( - isinstance(node, ir.Assign) - and isinstance(node.value, ir.Expr) - and node.value.op == "inplace_binop" - ): - if first_red_func is None: - first_red_func = node.value.fn - else: - if first_red_func != node.value.fn: - msg = ( - "Reduction variable %s has multiple conflicting " - "reduction operators." % param.unversioned_name - ) - raise errors.UnsupportedRewriteError(msg, node.loc) - - -def get_reduction_init(nodes): - """ - Get initial value for known reductions. - Currently, only += and *= are supported. - """ - require(len(nodes) >= 1) - # there could be multiple extra assignments after the reduce node - # See: test_reduction_var_reuse - acc_expr = list(filter(lambda x: isinstance(x.value, ir.Expr), nodes))[ - -1 - ].value - require(isinstance(acc_expr, ir.Expr) and acc_expr.op == "inplace_binop") - if acc_expr.fn == operator.iadd or acc_expr.fn == operator.isub: - return 0, acc_expr.fn - if acc_expr.fn == operator.imul or acc_expr.fn == operator.itruediv: - return 1, acc_expr.fn - return None, None - - -def supported_reduction(x, func_ir): - if x.op == "inplace_binop" or x.op == "binop": - if x.fn == operator.ifloordiv or x.fn == operator.floordiv: - raise errors.NumbaValueError( - ( - "Parallel floordiv reductions are not supported. " - "If all divisors are integers then a floordiv " - "reduction can in some cases be parallelized as " - "a multiply reduction followed by a floordiv of " - "the resulting product." 
- ), - x.loc, - ) - supps = [ - operator.iadd, - operator.isub, - operator.imul, - operator.itruediv, - operator.add, - operator.sub, - operator.mul, - operator.truediv, - ] - return x.fn in supps - if x.op == "call": - callname = guard(find_callname, func_ir, x) - if callname in [ - ("max", "builtins"), - ("min", "builtins"), - ("datetime_minimum", "numba.np.npdatetime_helpers"), - ("datetime_maximum", "numba.np.npdatetime_helpers"), - ]: - return True - return False - - -def get_reduce_nodes(reduction_node, nodes, func_ir): - """ - Get nodes that combine the reduction variable with a sentinel variable. - Recognizes the first node that combines the reduction variable with another - variable. - """ - reduce_nodes = None - defs = {} - - def lookup(var, varonly=True): - val = defs.get(var.name, None) - if isinstance(val, ir.Var): - return lookup(val) - else: - return var if (varonly or val is None) else val - - name = reduction_node.name - unversioned_name = reduction_node.unversioned_name - for i, stmt in enumerate(nodes): - lhs = stmt.target - rhs = stmt.value - defs[lhs.name] = rhs - if isinstance(rhs, ir.Var) and rhs.name in defs: - rhs = lookup(rhs) - if isinstance(rhs, ir.Expr): - in_vars = set(lookup(v, True).name for v in rhs.list_vars()) - if name in in_vars: - # reductions like sum have an assignment afterwards - # e.g. $2 = a + $1; a = $2 - # reductions that are functions calls like max() don't have an - # extra assignment afterwards - if ( - not ( - i + 1 < len(nodes) - and isinstance(nodes[i + 1], ir.Assign) - and nodes[i + 1].target.unversioned_name - == unversioned_name - ) - and lhs.unversioned_name != unversioned_name - ): - raise ValueError( - f"Use of reduction variable {unversioned_name!r} other " - "than in a supported reduction function is not " - "permitted." - ) - - if not supported_reduction(rhs, func_ir): - raise ValueError( - ( - "Use of reduction variable " - + unversioned_name - + " in an unsupported reduction function." 
- ) - ) - args = [(x.name, lookup(x, True)) for x in get_expr_args(rhs)] - non_red_args = [x for (x, y) in args if y.name != name] - assert len(non_red_args) == 1 - args = [(x, y) for (x, y) in args if x != y.name] - replace_dict = dict(args) - replace_dict[non_red_args[0]] = ir.Var( - lhs.scope, name + "#init", lhs.loc - ) - replace_vars_inner(rhs, replace_dict) - reduce_nodes = nodes[i:] - break - return reduce_nodes - - -def get_expr_args(expr): - """ - Get arguments of an expression node - """ - if expr.op in ["binop", "inplace_binop"]: - return [expr.lhs, expr.rhs] - if expr.op == "call": - return [v for v in expr.args] - raise NotImplementedError("get arguments for expression {}".format(expr)) - - -def visit_parfor_pattern_vars(parfor, callback, cbdata): - # currently, only stencil pattern has variables - for pattern in parfor.patterns: - if pattern[0] == "stencil": - left_lengths = pattern[1][0] - for i in range(len(left_lengths)): - if isinstance(left_lengths[i], ir.Var): - left_lengths[i] = visit_vars_inner( - left_lengths[i], callback, cbdata - ) - right_lengths = pattern[1][1] - for i in range(len(right_lengths)): - if isinstance(right_lengths[i], ir.Var): - right_lengths[i] = visit_vars_inner( - right_lengths[i], callback, cbdata - ) - - -def visit_vars_parfor(parfor, callback, cbdata): - if config.DEBUG_ARRAY_OPT >= 1: - print("visiting parfor vars for:", parfor) - print("cbdata: ", sorted(cbdata.items())) - for l in parfor.loop_nests: - l.index_variable = visit_vars_inner(l.index_variable, callback, cbdata) - if isinstance(l.start, ir.Var): - l.start = visit_vars_inner(l.start, callback, cbdata) - if isinstance(l.stop, ir.Var): - l.stop = visit_vars_inner(l.stop, callback, cbdata) - if isinstance(l.step, ir.Var): - l.step = visit_vars_inner(l.step, callback, cbdata) - visit_vars({-1: parfor.init_block}, callback, cbdata) - visit_parfor_pattern_vars(parfor, callback, cbdata) - visit_vars(parfor.loop_body, callback, cbdata) - return - - -# add call to 
visit parfor variable -ir_utils.visit_vars_extensions[Parfor] = visit_vars_parfor - - -def parfor_defs(parfor, use_set=None, def_set=None): - """list variables written in this parfor by recursively - calling compute_use_defs() on body and combining block defs. - """ - if use_set is None: - use_set = set() - if def_set is None: - def_set = set() - blocks = wrap_parfor_blocks(parfor) - uses, defs = compute_use_defs(blocks) - cfg = compute_cfg_from_blocks(blocks) - last_label = max(blocks.keys()) - unwrap_parfor_blocks(parfor) - - # Conservatively, only add defs for blocks that are definitely executed - # Go through blocks in order, as if they are statements of the block that - # includes the parfor, and update uses/defs. - - # no need for topo order of ir_utils - topo_order = cfg.topo_order() - # blocks that dominate last block are definitely executed - definitely_executed = cfg.dominators()[last_label] - # except loop bodies that might not execute - for loop in cfg.loops().values(): - definitely_executed -= loop.body - for label in topo_order: - if label in definitely_executed: - # see compute_use_defs() in analysis.py - # variables defined in the block that includes the parfor are not - # uses of that block (are not potentially live in the beginning of - # the block) - use_set.update(uses[label] - def_set) - def_set.update(defs[label]) - else: - use_set.update(uses[label] - def_set) - - # treat loop variables and size variables as use - loop_vars = { - l.start.name for l in parfor.loop_nests if isinstance(l.start, ir.Var) - } - loop_vars |= { - l.stop.name for l in parfor.loop_nests if isinstance(l.stop, ir.Var) - } - loop_vars |= { - l.step.name for l in parfor.loop_nests if isinstance(l.step, ir.Var) - } - use_set.update(loop_vars - def_set) - use_set |= get_parfor_pattern_vars(parfor) - - return analysis._use_defs_result(usemap=use_set, defmap=def_set) - - -analysis.ir_extension_usedefs[Parfor] = parfor_defs - - -def _parfor_use_alloca(parfor, alloca_set): - """ 
- Reduction variables for parfors and the reduction variables within - nested parfors must be stack allocated. - """ - alloca_set |= set(parfor.redvars) - - blocks = wrap_parfor_blocks(parfor) - alloca_set |= analysis.must_use_alloca(blocks) - - unwrap_parfor_blocks(parfor) - - -analysis.ir_extension_use_alloca[Parfor] = _parfor_use_alloca - - -def parfor_insert_dels(parfor, curr_dead_set): - """insert dels in parfor. input: dead variable set right after parfor. - returns the variables for which del was inserted. - """ - blocks = wrap_parfor_blocks(parfor) - cfg = compute_cfg_from_blocks(blocks) - usedefs = compute_use_defs(blocks) - live_map = compute_live_map(cfg, blocks, usedefs.usemap, usedefs.defmap) - dead_map = compute_dead_maps(cfg, blocks, live_map, usedefs.defmap) - - # treat loop variables and size variables as live - loop_vars = { - l.start.name for l in parfor.loop_nests if isinstance(l.start, ir.Var) - } - loop_vars |= { - l.stop.name for l in parfor.loop_nests if isinstance(l.stop, ir.Var) - } - loop_vars |= { - l.step.name for l in parfor.loop_nests if isinstance(l.step, ir.Var) - } - loop_vars |= {l.index_variable.name for l in parfor.loop_nests} - # for var_list in parfor.array_analysis.array_size_vars.values(): - # loop_vars |= {v.name for v in var_list if isinstance(v, ir.Var)} - - dead_set = set() - for label in blocks.keys(): - # only kill vars that are actually dead at the parfor's block - dead_map.internal[label] &= curr_dead_set - dead_map.internal[label] -= loop_vars - dead_set |= dead_map.internal[label] - dead_map.escaping[label] &= curr_dead_set - dead_map.escaping[label] -= loop_vars - dead_set |= dead_map.escaping[label] - - # dummy class to replace func_ir. 
_patch_var_dels only accesses blocks - class DummyFuncIR(object): - def __init__(self, blocks): - self.blocks = blocks - - post_proc = postproc.PostProcessor(DummyFuncIR(blocks)) - post_proc._patch_var_dels(dead_map.internal, dead_map.escaping) - unwrap_parfor_blocks(parfor) - - return dead_set | loop_vars - - -postproc.ir_extension_insert_dels[Parfor] = parfor_insert_dels - - -def maximize_fusion(func_ir, blocks, typemap, up_direction=True): - """ - Reorder statements to maximize parfor fusion. Push all parfors up or down - so they are adjacent. - """ - call_table, _ = get_call_table(blocks) - alias_map, arg_aliases = find_potential_aliases( - blocks, func_ir.arg_names, typemap, func_ir - ) - for block in blocks.values(): - order_changed = True - while order_changed: - order_changed = maximize_fusion_inner( - func_ir, block, call_table, alias_map, arg_aliases, up_direction - ) - - -def maximize_fusion_inner( - func_ir, block, call_table, alias_map, arg_aliases, up_direction=True -): - order_changed = False - i = 0 - # i goes to body[-3] (i+1 to body[-2]) since body[-1] is terminator and - # shouldn't be reordered - while i < len(block.body) - 2: - stmt = block.body[i] - next_stmt = block.body[i + 1] - can_reorder = ( - _can_reorder_stmts( - stmt, next_stmt, func_ir, call_table, alias_map, arg_aliases - ) - if up_direction - else _can_reorder_stmts( - next_stmt, stmt, func_ir, call_table, alias_map, arg_aliases - ) - ) - if can_reorder: - block.body[i] = next_stmt - block.body[i + 1] = stmt - order_changed = True - i += 1 - return order_changed - - -def expand_aliases(the_set, alias_map, arg_aliases): - ret = set() - for i in the_set: - if i in alias_map: - ret = ret.union(alias_map[i]) - if i in arg_aliases: - ret = ret.union(arg_aliases) - ret.add(i) - return ret - - -def _can_reorder_stmts( - stmt, next_stmt, func_ir, call_table, alias_map, arg_aliases -): - """ - Check dependencies to determine if a parfor can be reordered in the IR block - with a non-parfor 
statement. - """ - # swap only parfors with non-parfors - # don't reorder calls with side effects (e.g. file close) - # only read-read dependencies are OK - # make sure there is no write-write, write-read dependencies - if ( - isinstance(stmt, Parfor) - and not isinstance(next_stmt, Parfor) - and not isinstance(next_stmt, ir.Print) - and ( - not isinstance(next_stmt, ir.Assign) - or has_no_side_effect(next_stmt.value, set(), call_table) - or guard(is_assert_equiv, func_ir, next_stmt.value) - ) - ): - stmt_accesses = expand_aliases( - {v.name for v in stmt.list_vars()}, alias_map, arg_aliases - ) - stmt_writes = expand_aliases( - get_parfor_writes(stmt), alias_map, arg_aliases - ) - next_accesses = expand_aliases( - {v.name for v in next_stmt.list_vars()}, alias_map, arg_aliases - ) - next_writes = expand_aliases( - get_stmt_writes(next_stmt), alias_map, arg_aliases - ) - if ( - len((stmt_writes & next_accesses) | (next_writes & stmt_accesses)) - == 0 - ): - return True - return False - - -def is_assert_equiv(func_ir, expr): - func_name, mod_name = find_callname(func_ir, expr) - return func_name == "assert_equiv" - - -def get_parfor_writes(parfor): - assert isinstance(parfor, Parfor) - writes = set() - blocks = parfor.loop_body.copy() - blocks[-1] = parfor.init_block - for block in blocks.values(): - for stmt in block.body: - writes.update(get_stmt_writes(stmt)) - if isinstance(stmt, Parfor): - writes.update(get_parfor_writes(stmt)) - return writes - - -FusionReport = namedtuple("FusionReport", ["first", "second", "message"]) - - -def try_fuse(equiv_set, parfor1, parfor2, metadata, func_ir, typemap): - """try to fuse parfors and return a fused parfor, otherwise return None""" - dprint("try_fuse: trying to fuse \n", parfor1, "\n", parfor2) - - # default report is None - report = None - - # fusion of parfors with different lowerers is not possible - if parfor1.lowerer != parfor2.lowerer: - dprint("try_fuse: parfors different lowerers") - msg = "- fusion failed: 
lowerer mismatch" - report = FusionReport(parfor1.id, parfor2.id, msg) - return None, report - - # fusion of parfors with different dimensions not supported yet - if len(parfor1.loop_nests) != len(parfor2.loop_nests): - dprint("try_fuse: parfors number of dimensions mismatch") - msg = "- fusion failed: number of loops mismatched, %s, %s." - fmt = "parallel loop #%s has a nest of %s loops" - l1 = fmt % (parfor1.id, len(parfor1.loop_nests)) - l2 = fmt % (parfor2.id, len(parfor2.loop_nests)) - report = FusionReport(parfor1.id, parfor2.id, msg % (l1, l2)) - return None, report - - ndims = len(parfor1.loop_nests) - # all loops should be equal length - - def is_equiv(x, y): - return x == y or equiv_set.is_equiv(x, y) - - def get_user_varname(v): - """get original variable name by user if possible""" - if not isinstance(v, ir.Var): - return v - v = v.name - if "var_rename_map" in metadata and v in metadata["var_rename_map"]: - user_varname = metadata["var_rename_map"][v] - return user_varname - return v - - for i in range(ndims): - nest1 = parfor1.loop_nests[i] - nest2 = parfor2.loop_nests[i] - if not ( - is_equiv(nest1.start, nest2.start) - and is_equiv(nest1.stop, nest2.stop) - and is_equiv(nest1.step, nest2.step) - ): - dprint("try_fuse: parfor dimension correlation mismatch", i) - msg = "- fusion failed: loop dimension mismatched in axis %s. 
" - msg += "slice(%s, %s, %s) != " % ( - get_user_varname(nest1.start), - get_user_varname(nest1.stop), - get_user_varname(nest1.step), - ) - msg += "slice(%s, %s, %s)" % ( - get_user_varname(nest2.start), - get_user_varname(nest2.stop), - get_user_varname(nest2.step), - ) - report = FusionReport(parfor1.id, parfor2.id, msg % i) - return None, report - - func_ir._definitions = build_definitions(func_ir.blocks) - # TODO: make sure parfor1's reduction output is not used in parfor2 - # only data parallel loops - p1_cross_dep, p1_ip, p1_ia, p1_non_ia = has_cross_iter_dep( - parfor1, func_ir, typemap - ) - if not p1_cross_dep: - p2_cross_dep = has_cross_iter_dep( - parfor2, func_ir, typemap, p1_ip, p1_ia, p1_non_ia - )[0] - else: - p2_cross_dep = True - - if p1_cross_dep or p2_cross_dep: - dprint("try_fuse: parfor cross iteration dependency found") - msg = ( - "- fusion failed: cross iteration dependency found " - "between loops #%s and #%s" - ) - report = FusionReport( - parfor1.id, parfor2.id, msg % (parfor1.id, parfor2.id) - ) - return None, report - - # find parfor1's defs, only body is considered since init_block will run - # first after fusion as well - p1_body_usedefs = compute_use_defs(parfor1.loop_body) - p1_body_defs = set() - for defs in p1_body_usedefs.defmap.values(): - p1_body_defs |= defs - # Add reduction variables from parfor1 to the set of body defs - # so that if parfor2 reads the reduction variable it won't fuse. - p1_body_defs |= set(parfor1.redvars) - - p2_usedefs = compute_use_defs(parfor2.loop_body) - p2_uses = compute_use_defs({0: parfor2.init_block}).usemap[0] - for uses in p2_usedefs.usemap.values(): - p2_uses |= uses - - if not p1_body_defs.isdisjoint(p2_uses): - dprint("try_fuse: parfor2 depends on parfor1 body") - msg = ( - "- fusion failed: parallel loop %s has a dependency on the " - "body of parallel loop %s. 
" - ) - report = FusionReport( - parfor1.id, parfor2.id, msg % (parfor1.id, parfor2.id) - ) - return None, report - - return fuse_parfors_inner(parfor1, parfor2) - - -def fuse_parfors_inner(parfor1, parfor2): - # fuse parfor2 into parfor1 - # append parfor2's init block on parfor1's - parfor1.init_block.body.extend(parfor2.init_block.body) - - # append parfor2's first block to parfor1's last block - parfor2_first_label = min(parfor2.loop_body.keys()) - parfor2_first_block = parfor2.loop_body[parfor2_first_label].body - parfor1_first_label = min(parfor1.loop_body.keys()) - parfor1_last_label = max(parfor1.loop_body.keys()) - parfor1.loop_body[parfor1_last_label].body.extend(parfor2_first_block) - - # add parfor2 body blocks to parfor1's except first - parfor1.loop_body.update(parfor2.loop_body) - parfor1.loop_body.pop(parfor2_first_label) - - # replace parfor2 indices with parfor1's - ndims = len(parfor1.loop_nests) - index_dict = {parfor2.index_var.name: parfor1.index_var} - for i in range(ndims): - index_dict[ - parfor2.loop_nests[i].index_variable.name - ] = parfor1.loop_nests[i].index_variable - replace_vars(parfor1.loop_body, index_dict) - - # re-order labels from min to max - blocks = wrap_parfor_blocks(parfor1, entry_label=parfor1_first_label) - blocks = rename_labels(blocks) - unwrap_parfor_blocks(parfor1, blocks) - - nameset = set(x.name for x in index_dict.values()) - remove_duplicate_definitions(parfor1.loop_body, nameset) - parfor1.patterns.extend(parfor2.patterns) - if config.DEBUG_ARRAY_OPT_STATS: - print( - "Parallel for-loop #{} is fused into for-loop #{}.".format( - parfor2.id, parfor1.id - ) - ) - - msg = ( - "- fusion succeeded: parallel for-loop #{} is fused into for-loop #{}." 
- ) - msg = msg.format(parfor2.id, parfor1.id) - report = FusionReport(parfor1.id, parfor2.id, msg) - - return parfor1, report - - -def remove_duplicate_definitions(blocks, nameset): - """Remove duplicated definition for variables in the given nameset, which - is often a result of parfor fusion. - """ - for label, block in blocks.items(): - body = block.body - new_body = [] - defined = set() - for inst in body: - if isinstance(inst, ir.Assign): - name = inst.target.name - if name in nameset: - if name in defined: - continue - defined.add(name) - new_body.append(inst) - block.body = new_body - return - - -def has_cross_iter_dep( - parfor, - func_ir, - typemap, - index_positions=None, - indexed_arrays=None, - non_indexed_arrays=None, -): - # We should assume there is cross iteration dependency unless we can - # prove otherwise. Return True if there is a cross-iter dependency - # that should prevent fusion, False if fusion is okay. - # Also returns index_positions, indexed_arrays, and non_indexed_arrays - # who purpose is described below so that subsequent additional - # has_cross_iter_dep calls for other parfors can build on the same - # data structures to make sure that the array accesses generate no - # cross-iter dependencies both within a parfor but also across parfors. - - # TODO: make it more accurate using ud-chains - - # Get the index variable used by this parfor. - # This will hold other variables with equivalent value, e.g., a = index_var - indices = {l.index_variable.name for l in parfor.loop_nests} - # This set will store variables that are (potentially recursively) - # defined in relation to an index variable, e.g., a = index_var + 1. - # A getitem that uses an index variable from this set will be considered - # as potentially having a cross-iter dependency and so won't fuse. - derived_from_indices = set() - # For the first parfor considered for fusion, the latter 3 args will be None - # and initialized to empty. 
For the second parfor, the structures from the - # previous parfor are passed in so that cross-parfor violations of the - # below comments can prevent fusion. - # - # index_positions keeps track of which index positions have had an index - # variable used for them and which ones haven't for each possible array - # dimensionality. After the first array access is seen, if subsequent - # ones use a parfor index for a different dimension then we conservatively - # say that we can't fuse. For example, if a 2D array is accessed with - # a[parfor_index, 0] then index_positions[2] will be (True, False) and - # if a[0, parfor_index] happens later which is (False, True) then this - # conflicts with the previous value and will prevent fusion. - # - # indexed_arrays records arrays that are accessed with at least one - # parfor index. If such an array is later accessed with indices that - # don't include a parfor index then conservatively assume we can't fuse. - # - # non_indexed_arrays holds arrays that are indexed without a parfor index. - # If an array first accessed without a parfor index is later indexed - # with one then conservatively assume we can't fuse. - if index_positions is None: - index_positions = {} - if indexed_arrays is None: - indexed_arrays = set() - if non_indexed_arrays is None: - non_indexed_arrays = set() - - def add_check_position( - new_position, - array_accessed, - index_positions, - indexed_arrays, - non_indexed_arrays, - ): - """Returns True if there is a reason to prevent fusion based - on the rules described above. - new_position will be a list or tuple of booleans that - says whether the index in that spot is a parfor index - or not. array_accessed is the array on which the access - is occurring.""" - - # Convert list indices to tuple for generality. - if isinstance(new_position, list): - new_position = tuple(new_position) - - # If none of the indices are based on a parfor index. 
- if True not in new_position: - # See if this array has been accessed before with a - # a parfor index and if so say that we can't fuse. - if array_accessed in indexed_arrays: - return True - else: - # Either array is already in non_indexed arrays or we - # will add it. Either way, this index usage can fuse. - non_indexed_arrays.add(array_accessed) - return False - - # Fallthrough for cases where one of the indices is a parfor index. - # If this array was previously accessed without a parfor index then - # conservatively say we can't fuse. - if array_accessed in non_indexed_arrays: - return True - - indexed_arrays.add(array_accessed) - - npsize = len(new_position) - # See if we have not seen a npsize dimensioned array accessed before. - if npsize not in index_positions: - # If not then add current set of parfor/non-parfor indices and - # indicate it is safe as it is the first usage. - index_positions[npsize] = new_position - return False - - # Here we have a subsequent access to a npsize-dimensioned array. - # Make sure we see the same combination of parfor/non-parfor index - # indices that we've seen before. If not then return True saying - # that we can't fuse. - return index_positions[npsize] != new_position - - def check_index( - stmt_index, - array_accessed, - index_positions, - indexed_arrays, - non_indexed_arrays, - derived_from_indices, - ): - """Looks at the indices of a getitem or setitem to see if there - is a reason that they would prevent fusion. - Returns True if fusion should be prohibited, False otherwise. - """ - if isinstance(stmt_index, ir.Var): - # If the array is 2+ dimensions then the index should be a tuple. - if isinstance(typemap[stmt_index.name], types.BaseTuple): - # Get how the index tuple is constructed. - fbs_res = guard(find_build_sequence, func_ir, stmt_index) - if fbs_res is not None: - ind_seq, _ = fbs_res - # If any indices are derived from an index is used then - # return True to say we can't fuse. 
- if all( - [ - x.name in indices - or x.name not in derived_from_indices - for x in ind_seq - ] - ): - # Get position in index tuple where parfor indices used. - new_index_positions = [ - x.name in indices for x in ind_seq - ] - # Make sure that we aren't accessing a given array with - # different indices in a different order. - return add_check_position( - new_index_positions, - array_accessed, - index_positions, - indexed_arrays, - non_indexed_arrays, - ) - else: - # index derived from a parfor index used so no fusion - return True - else: - # Don't know how the index tuple is built so - # have to assume fusion can't happen. - return True - else: - # Should be for 1D arrays. - if stmt_index.name in indices: - # Array indexed by a parfor index variable. - # Make sure this 1D access is consistent with prior ones. - return add_check_position( - (True,), - array_accessed, - index_positions, - indexed_arrays, - non_indexed_arrays, - ) - elif stmt_index.name in derived_from_indices: - # If we ever index an array with something calculated - # from an index then no fusion. - return True - else: - # Some kind of index that isn't a parfor index or - # one derived from one, e.g., a constant. Make sure - # this is consistent with prior accessed of this array. - return add_check_position( - (False,), - array_accessed, - index_positions, - indexed_arrays, - non_indexed_arrays, - ) - else: - # We don't know how to handle non-Var indices so no fusion. - return True - - # All branches above should cover all the cases and each should - # return so we should never get here. - raise errors.InternalError( - "Some code path in the parfor fusion " - "cross-iteration dependency checker " - "check_index didn't return a result." - ) - - # Iterate through all the statements in the parfor. - for b in parfor.loop_body.values(): - for stmt in b.body: - # Make sure SetItem accesses are fusion safe. 
- if isinstance(stmt, (ir.SetItem, ir.StaticSetItem)): - if isinstance(typemap[stmt.target.name], types.npytypes.Array): - # Check index safety with prior array accesses. - if check_index( - stmt.index, - stmt.target.name, - index_positions, - indexed_arrays, - non_indexed_arrays, - derived_from_indices, - ): - return ( - True, - index_positions, - indexed_arrays, - non_indexed_arrays, - ) - # Fusion safe so go to next statement. - continue - elif isinstance(stmt, ir.Assign): - # If stmt of form a = parfor_index then add "a" to set of - # parfor indices. - if isinstance(stmt.value, ir.Var): - if stmt.value.name in indices: - indices.add(stmt.target.name) - continue - elif isinstance(stmt.value, ir.Expr): - op = stmt.value.op - # Make sure getitem accesses are fusion safe. - if op in ["getitem", "static_getitem"]: - if isinstance( - typemap[stmt.value.value.name], types.npytypes.Array - ): - # Check index safety with prior array accesses. - if check_index( - stmt.value.index, - stmt.value.value.name, - index_positions, - indexed_arrays, - non_indexed_arrays, - derived_from_indices, - ): - return ( - True, - index_positions, - indexed_arrays, - non_indexed_arrays, - ) - # Fusion safe so go to next statement. - continue - elif op == "call": - # If there is a call in the parfor body that takes some - # array parameter then we have no way to analyze what - # that call is doing so presume it is unsafe for fusion. - if any( - [ - isinstance( - typemap[x.name], types.npytypes.Array - ) - for x in stmt.value.list_vars() - ] - ): - return ( - True, - index_positions, - indexed_arrays, - non_indexed_arrays, - ) - - # Get the vars used by this non-setitem/getitem statement. 
- rhs_vars = [x.name for x in stmt.value.list_vars()] - # If a parfor index is used as part of this statement or - # something previous determined to be derived from a parfor - # index then add the target variable to the set of - # variables that are derived from parfors and so should - # prevent fusion if used as an index. - if not indices.isdisjoint( - rhs_vars - ) or not derived_from_indices.isdisjoint(rhs_vars): - derived_from_indices.add(stmt.target.name) - - return False, index_positions, indexed_arrays, non_indexed_arrays - - -def dprint(*s): - if config.DEBUG_ARRAY_OPT >= 1: - print(*s) - - -def get_parfor_pattern_vars(parfor): - """get the variables used in parfor pattern information""" - out = set() - # currently, only stencil pattern has variables - for pattern in parfor.patterns: - if pattern[0] == "stencil": - left_lengths = pattern[1][0] - right_lengths = pattern[1][1] - for v in left_lengths + right_lengths: - if isinstance(v, ir.Var): - out.add(v.name) - return out - - -def remove_dead_parfor( - parfor, lives, lives_n_aliases, arg_aliases, alias_map, func_ir, typemap -): - """remove dead code inside parfor including get/sets""" - - with dummy_return_in_loop_body(parfor.loop_body): - labels = find_topo_order(parfor.loop_body) - - # get/setitem replacement should ideally use dataflow to propagate setitem - # saved values, but for simplicity we handle the common case of propagating - # setitems in the first block (which is dominant) if the array is not - # potentially changed in any way - first_label = labels[0] - first_block_saved_values = {} - _update_parfor_get_setitems( - parfor.loop_body[first_label].body, - parfor.index_var, - alias_map, - first_block_saved_values, - lives_n_aliases, - ) - - # remove saved first block setitems if array potentially changed later - saved_arrs = set(first_block_saved_values.keys()) - for l in labels: - if l == first_label: - continue - for stmt in parfor.loop_body[l].body: - if ( - isinstance(stmt, ir.Assign) - and 
isinstance(stmt.value, ir.Expr) - and stmt.value.op == "getitem" - and stmt.value.index.name == parfor.index_var.name - ): - continue - varnames = set(v.name for v in stmt.list_vars()) - rm_arrs = varnames & saved_arrs - for a in rm_arrs: - first_block_saved_values.pop(a, None) - - # replace getitems with available value - # e.g. A[i] = v; ... s = A[i] -> s = v - for l in labels: - if l == first_label: - continue - block = parfor.loop_body[l] - saved_values = first_block_saved_values.copy() - _update_parfor_get_setitems( - block.body, - parfor.index_var, - alias_map, - saved_values, - lives_n_aliases, - ) - - # after getitem replacement, remove extra setitems - blocks = parfor.loop_body.copy() # shallow copy is enough - last_label = max(blocks.keys()) - return_label, tuple_var = _add_liveness_return_block( - blocks, lives_n_aliases, typemap - ) - # jump to return label - jump = ir.Jump(return_label, ir.Loc("parfors_dummy", -1)) - blocks[last_label].body.append(jump) - cfg = compute_cfg_from_blocks(blocks) - usedefs = compute_use_defs(blocks) - live_map = compute_live_map(cfg, blocks, usedefs.usemap, usedefs.defmap) - alias_set = set(alias_map.keys()) - - for label, block in blocks.items(): - new_body = [] - in_lives = {v.name for v in block.terminator.list_vars()} - # find live variables at the end of block - for out_blk, _data in cfg.successors(label): - in_lives |= live_map[out_blk] - for stmt in reversed(block.body): - # aliases of lives are also live for setitems - alias_lives = in_lives & alias_set - for v in alias_lives: - in_lives |= alias_map[v] - if ( - isinstance(stmt, (ir.StaticSetItem, ir.SetItem)) - and get_index_var(stmt).name == parfor.index_var.name - and stmt.target.name not in in_lives - and stmt.target.name not in arg_aliases - ): - continue - in_lives |= {v.name for v in stmt.list_vars()} - new_body.append(stmt) - new_body.reverse() - block.body = new_body - - typemap.pop(tuple_var.name) # remove dummy tuple type - blocks[last_label].body.pop() 
# remove jump - - """ - Process parfor body recursively. - Note that this is the only place in this function that uses the - argument lives instead of lives_n_aliases. The former does not - include the aliases of live variables but only the live variable - names themselves. See a comment in this function for how that - is used. - """ - remove_dead_parfor_recursive( - parfor, lives, arg_aliases, alias_map, func_ir, typemap - ) - - # remove parfor if empty - is_empty = len(parfor.init_block.body) == 0 - for block in parfor.loop_body.values(): - is_empty &= len(block.body) == 0 - if is_empty: - return None - return parfor - - -def _update_parfor_get_setitems( - block_body, index_var, alias_map, saved_values, lives -): - """ - replace getitems of a previously set array in a block of parfor loop body - """ - for stmt in block_body: - if ( - isinstance(stmt, (ir.StaticSetItem, ir.SetItem)) - and get_index_var(stmt).name == index_var.name - and stmt.target.name not in lives - ): - # saved values of aliases of SetItem target array are invalid - for w in alias_map.get(stmt.target.name, []): - saved_values.pop(w, None) - # set saved value after invalidation since alias_map may - # contain the array itself (e.g. 
pi example) - saved_values[stmt.target.name] = stmt.value - continue - if isinstance(stmt, ir.Assign) and isinstance(stmt.value, ir.Expr): - rhs = stmt.value - if rhs.op == "getitem" and isinstance(rhs.index, ir.Var): - if rhs.index.name == index_var.name: - # replace getitem if value saved - stmt.value = saved_values.get(rhs.value.name, rhs) - continue - # conservative assumption: array is modified if referenced - # remove all referenced arrays - for v in stmt.list_vars(): - saved_values.pop(v.name, None) - # aliases are potentially modified as well - for w in alias_map.get(v.name, []): - saved_values.pop(w, None) - - return - - -ir_utils.remove_dead_extensions[Parfor] = remove_dead_parfor - - -def remove_dead_parfor_recursive( - parfor, lives, arg_aliases, alias_map, func_ir, typemap -): - """create a dummy function from parfor and call remove dead recursively""" - blocks = parfor.loop_body.copy() # shallow copy is enough - first_body_block = min(blocks.keys()) - assert first_body_block > 0 # we are using 0 for init block here - last_label = max(blocks.keys()) - - """ - Previously, this statement used lives_n_aliases. That had the effect of - keeping variables in the init_block alive if they aliased an array that - was later written to. By using just lives to indicate which variables - names are live at exit of the parfor but then using alias_map for the - actual recursive dead code removal, we keep any writes to aliased arrays - alive but also allow aliasing assignments (i.e., a = b) to be eliminated - so long as 'b' is not written to through the variable 'a' later on. - This makes assignment handling of remove_dead_block work properly since - it allows distinguishing between live variables and their aliases. 
- """ - return_label, tuple_var = _add_liveness_return_block(blocks, lives, typemap) - - # branch back to first body label to simulate loop - scope = blocks[last_label].scope - - branchcond = ir.Var( - scope, mk_unique_var("$branchcond"), ir.Loc("parfors_dummy", -1) - ) - typemap[branchcond.name] = types.boolean - - branch = ir.Branch( - branchcond, first_body_block, return_label, ir.Loc("parfors_dummy", -1) - ) - blocks[last_label].body.append(branch) - - # add dummy jump in init_block for CFG to work - blocks[0] = parfor.init_block - blocks[0].body.append( - ir.Jump(first_body_block, ir.Loc("parfors_dummy", -1)) - ) - - # args var including aliases is ok - remove_dead(blocks, arg_aliases, func_ir, typemap, alias_map, arg_aliases) - typemap.pop(tuple_var.name) # remove dummy tuple type - blocks[0].body.pop() # remove dummy jump - blocks[last_label].body.pop() # remove branch - return - - -def _add_liveness_return_block(blocks, lives, typemap): - last_label = max(blocks.keys()) - return_label = last_label + 1 - - loc = blocks[last_label].loc - scope = blocks[last_label].scope - blocks[return_label] = ir.Block(scope, loc) - - # add lives in a dummpy return to last block to avoid their removal - tuple_var = ir.Var(scope, mk_unique_var("$tuple_var"), loc) - # dummy type for tuple_var - typemap[tuple_var.name] = types.containers.UniTuple(types.uintp, 2) - live_vars = [ir.Var(scope, v, loc) for v in lives] - tuple_call = ir.Expr.build_tuple(live_vars, loc) - blocks[return_label].body.append(ir.Assign(tuple_call, tuple_var, loc)) - blocks[return_label].body.append(ir.Return(tuple_var, loc)) - return return_label, tuple_var - - -def find_potential_aliases_parfor( - parfor, args, typemap, func_ir, alias_map, arg_aliases -): - blocks = wrap_parfor_blocks(parfor) - ir_utils.find_potential_aliases( - blocks, args, typemap, func_ir, alias_map, arg_aliases - ) - unwrap_parfor_blocks(parfor) - return - - -ir_utils.alias_analysis_extensions[Parfor] = find_potential_aliases_parfor 
- - -def simplify_parfor_body_CFG(blocks): - """simplify CFG of body loops in parfors""" - n_parfors = 0 - for block in blocks.values(): - for stmt in block.body: - if isinstance(stmt, Parfor): - n_parfors += 1 - parfor = stmt - # add dummy return to enable CFG creation - # can't use dummy_return_in_loop_body since body changes - last_block = parfor.loop_body[max(parfor.loop_body.keys())] - scope = last_block.scope - loc = ir.Loc("parfors_dummy", -1) - const = ir.Var(scope, mk_unique_var("$const"), loc) - last_block.body.append(ir.Assign(ir.Const(0, loc), const, loc)) - last_block.body.append(ir.Return(const, loc)) - parfor.loop_body = simplify_CFG(parfor.loop_body) - last_block = parfor.loop_body[max(parfor.loop_body.keys())] - last_block.body.pop() - # call on body recursively - simplify_parfor_body_CFG(parfor.loop_body) - return n_parfors - - -def wrap_parfor_blocks(parfor, entry_label=None): - """wrap parfor blocks for analysis/optimization like CFG""" - blocks = parfor.loop_body.copy() # shallow copy is enough - if entry_label is None: - entry_label = min(blocks.keys()) - assert entry_label > 0 # we are using 0 for init block here - - # add dummy jump in init_block for CFG to work - blocks[0] = parfor.init_block - blocks[0].body.append(ir.Jump(entry_label, blocks[0].loc)) - for block in blocks.values(): - if len(block.body) == 0 or (not block.body[-1].is_terminator): - block.body.append(ir.Jump(entry_label, block.loc)) - return blocks - - -def unwrap_parfor_blocks(parfor, blocks=None): - """ - unwrap parfor blocks after analysis/optimization. - Allows changes to the parfor loop. 
- """ - if blocks is not None: - # make sure init block isn't removed - init_block_label = min(blocks.keys()) - # update loop body blocks - blocks.pop(init_block_label) - parfor.loop_body = blocks - - # make sure dummy jump to loop body isn't altered - first_body_label = min(parfor.loop_body.keys()) - assert isinstance(parfor.init_block.body[-1], ir.Jump) - - # remove dummy jump to loop body - parfor.init_block.body.pop() - - # make sure dummy jump back to loop body isn't altered - for block in parfor.loop_body.values(): - if ( - isinstance(block.body[-1], ir.Jump) - and block.body[-1].target == first_body_label - ): - # remove dummy jump back to loop - block.body.pop() - return - - -def get_copies_parfor(parfor, typemap): - """find copies generated/killed by parfor""" - blocks = wrap_parfor_blocks(parfor) - in_copies_parfor, out_copies_parfor = copy_propagate(blocks, typemap) - in_gen_copies, in_extra_kill = get_block_copies(blocks, typemap) - unwrap_parfor_blocks(parfor) - - # parfor's extra kill is kills of its init block, - # and all possible gens and kills of it's body loop. 
- # body doesn't gen and only kills since it may or may not run - # TODO: save copies that are repeated in parfor - kill_set = in_extra_kill[0] - for label in parfor.loop_body.keys(): - kill_set |= {l for l, r in in_gen_copies[label]} - kill_set |= in_extra_kill[label] - - # gen copies is copies generated by init that are not killed by body - last_label = max(parfor.loop_body.keys()) - gens = out_copies_parfor[last_label] & in_gen_copies[0] - - if config.DEBUG_ARRAY_OPT >= 1: - print("copy propagate parfor gens:", gens, "kill_set", kill_set) - return gens, kill_set - - -ir_utils.copy_propagate_extensions[Parfor] = get_copies_parfor - - -def apply_copies_parfor( - parfor, var_dict, name_var_table, typemap, calltypes, save_copies -): - """apply copy propagate recursively in parfor""" - # replace variables in pattern metadata like stencil neighborhood - for i, pattern in enumerate(parfor.patterns): - if pattern[0] == "stencil": - parfor.patterns[i] = ( - "stencil", - replace_vars_inner(pattern[1], var_dict), - ) - - # replace loop boundary variables - for l in parfor.loop_nests: - l.start = replace_vars_inner(l.start, var_dict) - l.stop = replace_vars_inner(l.stop, var_dict) - l.step = replace_vars_inner(l.step, var_dict) - - blocks = wrap_parfor_blocks(parfor) - # add dummy assigns for each copy - assign_list = [] - for lhs_name, rhs in var_dict.items(): - assign_list.append( - ir.Assign(rhs, name_var_table[lhs_name], ir.Loc("dummy", -1)) - ) - blocks[0].body = assign_list + blocks[0].body - in_copies_parfor, out_copies_parfor = copy_propagate(blocks, typemap) - apply_copy_propagate( - blocks, - in_copies_parfor, - name_var_table, - typemap, - calltypes, - save_copies, - ) - unwrap_parfor_blocks(parfor) - # remove dummy assignments - blocks[0].body = blocks[0].body[len(assign_list) :] - return - - -ir_utils.apply_copy_propagate_extensions[Parfor] = apply_copies_parfor - - -def push_call_vars( - blocks, saved_globals, saved_getattrs, typemap, nested=False -): - 
"""push call variables to right before their call site. - assuming one global/getattr is created for each call site and control flow - doesn't change it. - """ - for block in blocks.values(): - new_body = [] - # global/attr variables that are defined in this block already, - # no need to reassign them - block_defs = set() - # Some definitions are copied right before the call but then we - # need to rename that symbol in that block so that typing won't - # generate an error trying to lock the save var twice. - # In rename_dict, we collect the symbols that must be renamed in - # this block. We collect them then apply the renaming at the end. - rename_dict = {} - for stmt in block.body: - - def process_assign(stmt): - if isinstance(stmt, ir.Assign): - rhs = stmt.value - lhs = stmt.target - if isinstance(rhs, ir.Global): - saved_globals[lhs.name] = stmt - block_defs.add(lhs.name) - elif isinstance(rhs, ir.Expr) and rhs.op == "getattr": - if ( - rhs.value.name in saved_globals - or rhs.value.name in saved_getattrs - ): - saved_getattrs[lhs.name] = stmt - block_defs.add(lhs.name) - - if not nested and isinstance(stmt, Parfor): - for s in stmt.init_block.body: - process_assign(s) - pblocks = stmt.loop_body.copy() - push_call_vars( - pblocks, saved_globals, saved_getattrs, typemap, nested=True - ) - new_body.append(stmt) - continue - else: - process_assign(stmt) - for v in stmt.list_vars(): - new_body += _get_saved_call_nodes( - v.name, - saved_globals, - saved_getattrs, - block_defs, - rename_dict, - ) - new_body.append(stmt) - block.body = new_body - # If there is anything to rename then apply the renaming here. - if len(rename_dict) > 0: - # Fix-up the typing for the renamed vars. - for k, v in rename_dict.items(): - typemap[v] = typemap[k] - # This is only to call replace_var_names which takes a dict. 
- temp_blocks = {0: block} - replace_var_names(temp_blocks, rename_dict) - - return - - -def _get_saved_call_nodes( - fname, saved_globals, saved_getattrs, block_defs, rename_dict -): - """Implement the copying of globals or getattrs for the purposes noted in - push_call_vars. We make a new var and assign to it a copy of the - global or getattr. We remember this new assignment node and add an - entry in the renaming dictionary so that for this block the original - var name is replaced by the new var name we created. - """ - nodes = [] - while fname not in block_defs and ( - fname in saved_globals or fname in saved_getattrs - ): - - def rename_global_or_getattr( - obj, var_base, nodes, block_defs, rename_dict - ): - assert isinstance(obj, ir.Assign) - renamed_var = ir.Var( - obj.target.scope, mk_unique_var(var_base), obj.target.loc - ) - renamed_assign = ir.Assign( - copy.deepcopy(obj.value), renamed_var, obj.loc - ) - nodes.append(renamed_assign) - block_defs.add(obj.target.name) - rename_dict[obj.target.name] = renamed_assign.target.name - - if fname in saved_globals: - rename_global_or_getattr( - saved_globals[fname], - "$push_global_to_block", - nodes, - block_defs, - rename_dict, - ) - fname = "_PA_DONE" - elif fname in saved_getattrs: - rename_global_or_getattr( - saved_getattrs[fname], - "$push_getattr_to_block", - nodes, - block_defs, - rename_dict, - ) - fname = saved_getattrs[fname].value.value.name - nodes.reverse() - return nodes - - -def repr_arrayexpr(arrayexpr): - """Extract operators from arrayexpr to represent it abstractly as a string.""" - if isinstance(arrayexpr, tuple): - opr = arrayexpr[0] - # sometimes opr is not string like '+', but is a ufunc object - if not isinstance(opr, str): - if hasattr(opr, "__name__"): - opr = opr.__name__ - else: - opr = "_" # can return dummy since repr is not critical - args = arrayexpr[1] - if len(args) == 1: - return "({}({}))".format(opr, repr_arrayexpr(args[0])) - else: - opr = " " + opr + " " - return 
"({})".format(opr.join([repr_arrayexpr(x) for x in args])) - elif isinstance(arrayexpr, numba.core.ir.Var): - name = arrayexpr.name - if name.startswith("$"): - return "'%s' (temporary variable)" % name - else: - return name - elif isinstance(arrayexpr, numba.core.ir.Const): - return repr(arrayexpr.value) - else: - return "_" - - -def fix_generator_types(generator_info, return_type, typemap): - """postproc updates generator_info with live variables after transformations - but generator variables have types in return_type that are updated here. - """ - new_state_types = [] - for v in generator_info.state_vars: - new_state_types.append(typemap[v]) - return_type.state_types = tuple(new_state_types) - return - - -def get_parfor_call_table(parfor, call_table=None, reverse_call_table=None): - if call_table is None: - call_table = {} - if reverse_call_table is None: - reverse_call_table = {} - blocks = wrap_parfor_blocks(parfor) - call_table, reverse_call_table = get_call_table( - blocks, call_table, reverse_call_table - ) - unwrap_parfor_blocks(parfor) - return call_table, reverse_call_table - - -ir_utils.call_table_extensions[Parfor] = get_parfor_call_table - - -def get_parfor_tuple_table(parfor, tuple_table=None): - if tuple_table is None: - tuple_table = {} - blocks = wrap_parfor_blocks(parfor) - tuple_table = ir_utils.get_tuple_table(blocks, tuple_table) - unwrap_parfor_blocks(parfor) - return tuple_table - - -ir_utils.tuple_table_extensions[Parfor] = get_parfor_tuple_table - - -def get_parfor_array_accesses(parfor, accesses=None): - if accesses is None: - accesses = set() - blocks = wrap_parfor_blocks(parfor) - accesses = ir_utils.get_array_accesses(blocks, accesses) - unwrap_parfor_blocks(parfor) - return accesses - - -# parfor handler is same as -ir_utils.array_accesses_extensions[Parfor] = get_parfor_array_accesses - - -def parfor_add_offset_to_labels(parfor, offset): - blocks = wrap_parfor_blocks(parfor) - blocks = add_offset_to_labels(blocks, offset) - 
blocks[0] = blocks[offset] - blocks.pop(offset) - unwrap_parfor_blocks(parfor, blocks) - return - - -ir_utils.add_offset_to_labels_extensions[Parfor] = parfor_add_offset_to_labels - - -def parfor_find_max_label(parfor): - blocks = wrap_parfor_blocks(parfor) - max_label = ir_utils.find_max_label(blocks) - unwrap_parfor_blocks(parfor) - return max_label - - -ir_utils.find_max_label_extensions[Parfor] = parfor_find_max_label - - -def parfor_typeinfer(parfor, typeinferer): - save_blocks = typeinferer.blocks - blocks = wrap_parfor_blocks(parfor) - index_vars = [l.index_variable for l in parfor.loop_nests] - # no need to handle parfor.index_var (tuple of variables), since it will be - # assigned to a tuple from individual indices - first_block = min(blocks.keys()) - loc = blocks[first_block].loc - # XXX - index_assigns = [ - ir.Assign(ir.Const(1, loc=loc, use_literal_type=False), v, loc) - for v in index_vars - ] - save_first_block_body = blocks[first_block].body - blocks[first_block].body = index_assigns + blocks[first_block].body - typeinferer.blocks = blocks - typeinferer.build_constraint() - typeinferer.blocks = save_blocks - blocks[first_block].body = save_first_block_body - unwrap_parfor_blocks(parfor) - - -typeinfer.typeinfer_extensions[Parfor] = parfor_typeinfer - - -def build_parfor_definitions(parfor, definitions=None): - """get variable definition table for parfors""" - if definitions is None: - definitions = defaultdict(list) - - # avoid wrap_parfor_blocks() since build_definitions is called inside - # find_potential_aliases_parfor where the parfor is already wrapped - build_definitions(parfor.loop_body, definitions) - build_definitions({0: parfor.init_block}, definitions) - return definitions - - -ir_utils.build_defs_extensions[Parfor] = build_parfor_definitions - - -@contextmanager -def dummy_return_in_loop_body(loop_body): - """adds dummy return to last block of parfor loop body for CFG computation""" - # max is last block since we add it manually for 
prange - last_label = max(loop_body.keys()) - scope = loop_body[last_label].scope - const = ir.Var(scope, mk_unique_var("$const"), ir.Loc("parfors_dummy", -1)) - loop_body[last_label].body.append( - ir.Return(const, ir.Loc("parfors_dummy", -1)) - ) - yield - # remove dummy return - loop_body[last_label].body.pop() - - -@infer_global(reduce) -class ReduceInfer(AbstractTemplate): - def generic(self, args, kws): - assert not kws - if len(args) != 3: - raise errors.NumbaAssertionError("len(args) != 3") - assert isinstance(args[1], types.Array) - return signature(args[1].dtype, *args) - - -def ensure_parallel_support(): - """Check if the platform supports parallel=True and raise if it does not.""" - if config.IS_32BITS: - msg = ( - "The 'parallel' target is not currently supported on 32 bit " - "hardware." - ) - raise errors.UnsupportedParforsError(msg) diff --git a/numba_dpex/core/passes/parfor_legalize_cfd_pass.py b/numba_dpex/core/passes/parfor_legalize_cfd_pass.py index 264e62f181..fbdaef4737 100644 --- a/numba_dpex/core/passes/parfor_legalize_cfd_pass.py +++ b/numba_dpex/core/passes/parfor_legalize_cfd_pass.py @@ -5,18 +5,17 @@ from numba.core import ir, types from numba.core.compiler_machinery import FunctionPass, register_pass from numba.core.ir_utils import find_topo_order - -from numba_dpex.core.exceptions import ComputeFollowsDataInferenceError -from numba_dpex.core.passes.parfor_lowering_pass import ParforLowerFactory -from numba_dpex.core.types.dpnp_ndarray_type import DpnpNdArray - -from .parfor import ( +from numba.parfors.parfor import ( Parfor, ParforDiagnostics, get_parfor_outputs, get_parfor_params, ) +from numba_dpex.core.exceptions import ComputeFollowsDataInferenceError +from numba_dpex.core.parfors.parfor_lowerer import ParforLowerFactory +from numba_dpex.core.types.dpnp_ndarray_type import DpnpNdArray + class ParforLegalizeCFDPassImpl: diff --git a/numba_dpex/core/passes/passes.py b/numba_dpex/core/passes/passes.py index c5ab12271d..19aa5b67a5 
100644 --- a/numba_dpex/core/passes/passes.py +++ b/numba_dpex/core/passes/passes.py @@ -3,30 +3,18 @@ # SPDX-License-Identifier: Apache-2.0 import warnings -from contextlib import contextmanager -import numpy as np -from numba.core import errors, funcdesc, ir, typing, utils +from numba.core import errors, ir, typing from numba.core.compiler_machinery import ( AnalysisPass, FunctionPass, - LoweringPass, register_pass, ) from numba.core.ir_utils import remove_dels from numba.core.typed_passes import NativeLowering -from numba.parfors.parfor import ParforPass as _numba_parfor_ParforPass -from numba.parfors.parfor import PreParforPass as _numba_parfor_PreParforPass -from numba.parfors.parfor import swap_functions_map from numba_dpex import config -from .lowerer import DPEXLowerer -from .parfor import Parfor -from .parfor import ParforFusionPass as _parfor_ParforFusionPass -from .parfor import ParforPass as _parfor_ParforPass -from .parfor import ParforPreLoweringPass as _parfor_ParforPreLoweringPass - @register_pass(mutates_CFG=True, analysis_only=False) class ConstantSizeStaticLocalMemoryPass(FunctionPass): @@ -137,329 +125,6 @@ def run_pass(self, state): return True -@register_pass(mutates_CFG=True, analysis_only=False) -class PreParforPass(FunctionPass): - _name = "dpex_pre_parfor_pass" - - def __init__(self): - FunctionPass.__init__(self) - - def run_pass(self, state): - """ - Preprocessing for data-parallel computations. - """ - - # Ensure we have an IR and type information. 
- assert state.func_ir - functions_map = swap_functions_map.copy() - # FIXME: remove once reduction is implemented - functions_map.pop(("dot", "numpy"), None) - functions_map.pop(("sum", "numpy"), None) - functions_map.pop(("prod", "numpy"), None) - functions_map.pop(("argmax", "numpy"), None) - functions_map.pop(("max", "numpy"), None) - functions_map.pop(("argmin", "numpy"), None) - functions_map.pop(("min", "numpy"), None) - functions_map.pop(("mean", "numpy"), None) - - preparfor_pass = _numba_parfor_PreParforPass( - state.func_ir, - state.type_annotation.typemap, - state.type_annotation.calltypes, - state.typingctx, - state.targetctx, - state.flags.auto_parallel, - state.parfor_diagnostics.replaced_fns, - replace_functions_map=functions_map, - ) - - preparfor_pass.run() - - if config.DEBUG or config.DUMP_IR: - name = state.func_ir.func_id.func_qualname - print(("IR DUMP: %s" % name).center(80, "-")) - state.func_ir.dump() - - return True - - -@register_pass(mutates_CFG=True, analysis_only=False) -class ParforPass(FunctionPass): - _name = "dpex_parfor_pass" - - def __init__(self): - FunctionPass.__init__(self) - - def run_pass(self, state): - """ - Convert data-parallel computations into Parfor nodes - """ - # Ensure we have an IR and type information. - assert state.func_ir - parfor_pass = _numba_parfor_ParforPass( - state.func_ir, - state.type_annotation.typemap, - state.type_annotation.calltypes, - state.return_type, - state.typingctx, - state.targetctx, - state.flags.auto_parallel, - state.flags, - state.metadata, - state.parfor_diagnostics, - ) - - parfor_pass.run() - - if config.DEBUG or config.DUMP_IR: - name = state.func_ir.func_id.func_qualname - print(("IR DUMP: %s" % name).center(80, "-")) - state.func_ir.dump() - - return True - - -# this is here so it pickles and for no other reason -def _reload_parfors(): - """Reloader for cached parfors""" - # Re-initialize the parallel backend when load from cache. 
- from numba.np.ufunc.parallel import _launch_threads - - _launch_threads() - - -@register_pass(mutates_CFG=True, analysis_only=False) -class SplitParforPass(FunctionPass): - _name = "dpex_parfor_pass" - - def __init__(self): - FunctionPass.__init__(self) - - def run_pass(self, state): - """ - Convert data-parallel computations into Parfor nodes - """ - # Ensure we have an IR and type information. - assert state.func_ir - parfor_pass = _parfor_ParforPass( - state.func_ir, - state.typemap, - state.calltypes, - state.return_type, - state.typingctx, - state.targetctx, - state.flags.auto_parallel, - state.flags, - state.metadata, - state.parfor_diagnostics, - ) - parfor_pass.run() - - # check the parfor pass worked and warn if it didn't - has_parfor = False - for blk in state.func_ir.blocks.values(): - for stmnt in blk.body: - if isinstance(stmnt, Parfor): - has_parfor = True - break - else: - continue - break - - if not has_parfor: - # parfor calls the compiler chain again with a string - if not ( - config.DISABLE_PERFORMANCE_WARNINGS - or state.func_ir.loc.filename == "" - ): - url = ( - "https://numba.readthedocs.io/en/stable/user/" - "parallel.html#diagnostics" - ) - msg = ( - "\nThe keyword argument 'parallel=True' was specified " - "but no transformation for parallel execution was " - "possible.\n\nTo find out why, try turning on parallel " - "diagnostics, see %s for help." % url - ) - warnings.warn( - errors.NumbaPerformanceWarning(msg, state.func_ir.loc) - ) - - # Add reload function to initialize the parallel backend. - state.reload_init.append(_reload_parfors) - return True - - -@register_pass(mutates_CFG=True, analysis_only=False) -class ParforFusionPass(FunctionPass): - _name = "parfor_fusion_pass" - - def __init__(self): - FunctionPass.__init__(self) - - def run_pass(self, state): - """ - Do fusion of parfor nodes. - """ - # Ensure we have an IR and type information. 
- assert state.func_ir - parfor_pass = _parfor_ParforFusionPass( - state.func_ir, - state.typemap, - state.calltypes, - state.return_type, - state.typingctx, - state.targetctx, - state.flags.auto_parallel, - state.flags, - state.metadata, - state.parfor_diagnostics, - ) - parfor_pass.run() - - return True - - -@register_pass(mutates_CFG=True, analysis_only=False) -class ParforPreLoweringPass(FunctionPass): - _name = "parfor_prelowering_pass" - - def __init__(self): - FunctionPass.__init__(self) - - def run_pass(self, state): - """ - Prepare parfors for lowering. - """ - # Ensure we have an IR and type information. - assert state.func_ir - parfor_pass = _parfor_ParforPreLoweringPass( - state.func_ir, - state.typemap, - state.calltypes, - state.return_type, - state.typingctx, - state.targetctx, - state.flags.auto_parallel, - state.flags, - state.metadata, - state.parfor_diagnostics, - ) - parfor_pass.run() - - return True - - -@contextmanager -def fallback_context(state, msg): - """ - Wraps code that would signal a fallback to object mode - """ - try: - yield - except Exception as e: - if not state.status.can_fallback: - raise - else: - if utils.PYVERSION >= (3,): - # Clear all references attached to the traceback - e = e.with_traceback(None) - # this emits a warning containing the error message body in the - # case of fallback from npm to objmode - loop_lift = "" if state.flags.enable_looplift else "OUT" - msg_rewrite = ( - "\nCompilation is falling back to object mode " - "WITH%s looplifting enabled because %s" % (loop_lift, msg) - ) - warnings.warn_explicit( - "%s due to: %s" % (msg_rewrite, e), - errors.NumbaWarning, - state.func_id.filename, - state.func_id.firstlineno, - ) - raise - - -@register_pass(mutates_CFG=True, analysis_only=False) -class DpexLowering(LoweringPass): - _name = "dpex_lowering" - - def __init__(self): - LoweringPass.__init__(self) - - def run_pass(self, state): - if state.library is None: - codegen = state.targetctx.codegen() - 
state.library = codegen.create_library(state.func_id.func_qualname) - # Enable object caching upfront, so that the library can - # be later serialized. - state.library.enable_object_caching() - - targetctx = state.targetctx - - library = state.library - interp = state.func_ir - typemap = state.typemap - restype = state.return_type - calltypes = state.calltypes - flags = state.flags - metadata = state.metadata - - msg = "Function %s failed at nopython " "mode lowering" % ( - state.func_id.func_name, - ) - with fallback_context(state, msg): - kwargs = {} - - # for support numba 0.54 and <=0.55.0dev0=*_469 - if hasattr(flags, "get_mangle_string"): - kwargs["abi_tags"] = flags.get_mangle_string() - # Lowering - fndesc = ( - funcdesc.PythonFunctionDescriptor.from_specialized_function( - interp, - typemap, - restype, - calltypes, - mangler=targetctx.mangler, - inline=flags.forceinline, - noalias=flags.noalias, - **kwargs, - ) - ) - - with targetctx.push_code_library(library): - lower = DPEXLowerer( - targetctx, library, fndesc, interp, metadata=metadata - ) - lower.lower() - if not flags.no_cpython_wrapper: - lower.create_cpython_wrapper(flags.release_gil) - - env = lower.env - call_helper = lower.call_helper - del lower - - from numba.core.compiler import _LowerResult # TODO: move this - - if flags.no_compile: - state["cr"] = _LowerResult( - fndesc, call_helper, cfunc=None, env=env - ) - else: - # Prepare for execution - cfunc = targetctx.get_executable(library, fndesc, env) - # Insert native function for use by other jitted-functions. - # We also register its library to allow for inlining. 
- targetctx.insert_user_function(cfunc, fndesc, [library]) - state["cr"] = _LowerResult( - fndesc, call_helper, cfunc=cfunc, env=env - ) - - return True - - @register_pass(mutates_CFG=True, analysis_only=False) class NoPythonBackend(FunctionPass): _name = "dpex_nopython_backend" diff --git a/numba_dpex/core/pipelines/dpjit_compiler.py b/numba_dpex/core/pipelines/dpjit_compiler.py index 433dfd9a27..f84469d528 100644 --- a/numba_dpex/core/pipelines/dpjit_compiler.py +++ b/numba_dpex/core/pipelines/dpjit_compiler.py @@ -8,9 +8,13 @@ AnnotateTypes, InlineOverloads, IRLegalization, + NativeParforLowering, NopythonRewrites, NoPythonSupportedFeatureValidation, NopythonTypeInference, + ParforFusionPass, + ParforPass, + ParforPreLoweringPass, PreLowerStripPhis, PreParforPass, ) @@ -19,11 +23,7 @@ from numba_dpex.core.passes import ( DumpParforDiagnostics, NoPythonBackend, - ParforFusionPass, ParforLegalizeCFDPass, - ParforLoweringPass, - ParforPreLoweringPass, - SplitParforPass, ) from numba_dpex.parfor_diagnostics import ExtendedParforDiagnostics @@ -53,7 +53,7 @@ def define_typed_pipeline(state, name="dpex_dpjit_typed"): pm.add_pass(PreParforPass, "Preprocessing for parfors") if not state.flags.no_rewrites: pm.add_pass(NopythonRewrites, "nopython rewrites") - pm.add_pass(SplitParforPass, "convert to parfors") + pm.add_pass(ParforPass, "convert to parfors") pm.add_pass( ParforLegalizeCFDPass, "Legalize parfors for compute follows data" ) @@ -76,7 +76,9 @@ def define_nopython_lowering_pipeline(state, name="dpex_dpjit_lowering"): pm.add_pass(IRLegalization, "ensure IR is legal prior to lowering") # lower - pm.add_pass(ParforLoweringPass, "Custom lowerer for dpex parfor nodes") + pm.add_pass( + NativeParforLowering, "lowerer with support for parfor nodes" + ) pm.add_pass(NoPythonBackend, "nopython mode backend") pm.add_pass(DumpParforDiagnostics, "dump parfor diagnostics") diff --git a/numba_dpex/core/targets/kernel_target.py b/numba_dpex/core/targets/kernel_target.py index 
c0ee52e916..9085fd9979 100644 --- a/numba_dpex/core/targets/kernel_target.py +++ b/numba_dpex/core/targets/kernel_target.py @@ -5,7 +5,7 @@ import re from functools import cached_property -import numpy as np +import dpnp from llvmlite import binding as ll from llvmlite import ir as llvmir from numba import typeof @@ -298,37 +298,36 @@ def init(self): def create_module(self, name): return self._internal_codegen._create_empty_module(name) - def replace_numpy_ufunc_with_opencl_supported_functions(self): + def replace_dpnp_ufunc_with_ocl_intrinsics(self): from numba_dpex.ocl.mathimpl import lower_ocl_impl, sig_mapper ufuncs = [ - ("fabs", np.fabs), - ("exp", np.exp), - ("log", np.log), - ("log10", np.log10), - ("expm1", np.expm1), - ("log1p", np.log1p), - ("sqrt", np.sqrt), - ("sin", np.sin), - ("cos", np.cos), - ("tan", np.tan), - ("asin", np.arcsin), - ("acos", np.arccos), - ("atan", np.arctan), - ("atan2", np.arctan2), - ("sinh", np.sinh), - ("cosh", np.cosh), - ("tanh", np.tanh), - ("asinh", np.arcsinh), - ("acosh", np.arccosh), - ("atanh", np.arctanh), - ("ldexp", np.ldexp), - ("floor", np.floor), - ("ceil", np.ceil), - ("trunc", np.trunc), - ("hypot", np.hypot), - ("exp2", np.exp2), - ("log2", np.log2), + ("fabs", dpnp.fabs), + ("exp", dpnp.exp), + ("log", dpnp.log), + ("log10", dpnp.log10), + ("expm1", dpnp.expm1), + ("log1p", dpnp.log1p), + ("sqrt", dpnp.sqrt), + ("sin", dpnp.sin), + ("cos", dpnp.cos), + ("tan", dpnp.tan), + ("asin", dpnp.arcsin), + ("acos", dpnp.arccos), + ("atan", dpnp.arctan), + ("atan2", dpnp.arctan2), + ("sinh", dpnp.sinh), + ("cosh", dpnp.cosh), + ("tanh", dpnp.tanh), + ("asinh", dpnp.arcsinh), + ("acosh", dpnp.arccosh), + ("atanh", dpnp.arctanh), + ("floor", dpnp.floor), + ("ceil", dpnp.ceil), + ("trunc", dpnp.trunc), + ("hypot", dpnp.hypot), + ("exp2", dpnp.exp2), + ("log2", dpnp.log2), ] for name, ufunc in ufuncs: @@ -344,23 +343,24 @@ def replace_numpy_ufunc_with_opencl_supported_functions(self): def load_additional_registries(self): 
"""Register OpenCL functions into numba_depx's target context. - To make sure we are calling supported OpenCL math functions, we - replace some of NUMBA's NumPy ufunc with OpenCL versions of those - functions. The replacement is done after the OpenCL functions have - been registered into the target context. + To make sure we are calling supported OpenCL math functions, we replace + the dpnp functions that default to NUMBA's NumPy ufunc with OpenCL + intrinsics that are equivalent to those functions. The replacement is + done after the OpenCL functions have been registered into the + target context. """ - from numba.np import npyimpl + from numba_dpex.dpnp_iface import dpnpimpl from ... import printimpl from ...ocl import mathimpl, oclimpl self.insert_func_defn(oclimpl.registry.functions) self.insert_func_defn(mathimpl.registry.functions) - self.insert_func_defn(npyimpl.registry.functions) + self.insert_func_defn(dpnpimpl.registry.functions) self.install_registry(printimpl.registry) - # Replace NumPy functions with their OpenCL versions. - self.replace_numpy_ufunc_with_opencl_supported_functions() + # Replace dpnp math functions with their OpenCL versions. + self.replace_dpnp_ufunc_with_ocl_intrinsics() @cached_property def call_conv(self): diff --git a/numba_dpex/dpnp_iface/dpnpimpl.py b/numba_dpex/dpnp_iface/dpnpimpl.py index 7873bbc5b5..da2a2b6cb8 100644 --- a/numba_dpex/dpnp_iface/dpnpimpl.py +++ b/numba_dpex/dpnp_iface/dpnpimpl.py @@ -2,19 +2,37 @@ # # SPDX-License-Identifier: Apache-2.0 +import copy + import dpnp +from numba.core.imputils import Registry from numba.np import npyimpl from numba_dpex.core.typing.dpnpdecl import _unsupported from numba_dpex.dpnp_iface import dpnp_ufunc_db +registry = Registry("dpnpimpl") + def _register_dpnp_ufuncs(): + """Adds dpnp ufuncs to the dpnpimpl.registry. 
+ + The npyimpl.registry is searched for all registered ufuncs and we copy the + implementations and register them in a dpnp-specific registry defined in the + current module. The numpy ufuncs are deep copied so as to not mutate the + original functions by changes we introduce in the DpexKernelTarget. + + Raises: + RuntimeError: If the signature of the ufunc takes more than two input + args. + """ kernels = {} - # NOTE: Assuming ufunc implementation for the CPUContext. + for ufunc in dpnp_ufunc_db.get_ufuncs(): kernels[ufunc] = npyimpl.register_ufunc_kernel( - ufunc, npyimpl._ufunc_db_function(ufunc) + ufunc, + copy.copy(npyimpl._ufunc_db_function(ufunc)), + registry.lower, ) for _op_map in ( @@ -27,9 +45,13 @@ def _register_dpnp_ufuncs(): ufunc = getattr(dpnp, ufunc_name) kernel = kernels[ufunc] if ufunc.nin == 1: - npyimpl.register_unary_operator_kernel(operator, ufunc, kernel) + npyimpl.register_unary_operator_kernel( + operator, ufunc, kernel, registry.lower + ) elif ufunc.nin == 2: - npyimpl.register_binary_operator_kernel(operator, ufunc, kernel) + npyimpl.register_binary_operator_kernel( + operator, ufunc, kernel, registry.lower + ) else: raise RuntimeError( "There shouldn't be any non-unary or binary operators" @@ -43,11 +65,11 @@ def _register_dpnp_ufuncs(): kernel = kernels[ufunc] if ufunc.nin == 1: npyimpl.register_unary_operator_kernel( - operator, ufunc, kernel, inplace=True + operator, ufunc, kernel, registry.lower, inplace=True ) elif ufunc.nin == 2: npyimpl.register_binary_operator_kernel( - operator, ufunc, kernel, inplace=True + operator, ufunc, kernel, registry.lower, inplace=True ) else: raise RuntimeError( @@ -55,4 +77,5 @@ def _register_dpnp_ufuncs(): ) +# Initialize the registry that stores the dpnp ufuncs _register_dpnp_ufuncs() diff --git a/numba_dpex/numba_patches/__init__.py b/numba_dpex/numba_patches/__init__.py new file mode 100644 index 0000000000..3a217e6325 --- /dev/null +++ b/numba_dpex/numba_patches/__init__.py @@ -0,0 +1,3 @@ +# 
SPDX-FileCopyrightText: 2023 Intel Corporation +# +# SPDX-License-Identifier: Apache-2.0 diff --git a/numba_dpex/numba_patches/patch_arrayexpr_tree_to_ir.py b/numba_dpex/numba_patches/patch_arrayexpr_tree_to_ir.py new file mode 100644 index 0000000000..e0d42d71f8 --- /dev/null +++ b/numba_dpex/numba_patches/patch_arrayexpr_tree_to_ir.py @@ -0,0 +1,189 @@ +# SPDX-FileCopyrightText: 2023 Intel Corporation +# +# SPDX-License-Identifier: Apache-2.0 + + +def patch(): + """ + Patches the _arrayexpr_tree_to_ir function in numba.parfor.parfor.py to + support array expression nodes that were generated from dpnp expressions. + """ + + import copy + import math + import operator + + from numba.core import errors, ir, types, typing + from numba.core.ir_utils import mk_unique_var + from numba.core.typing import npydecl + from numba.parfors import array_analysis, parfor + + def _ufunc_to_parfor_instr( + typemap, + op, + avail_vars, + loc, + scope, + func_ir, + out_ir, + arg_vars, + typingctx, + calltypes, + expr_out_var, + ): + func_var_name = parfor._find_func_var(typemap, op, avail_vars, loc=loc) + func_var = ir.Var(scope, mk_unique_var(func_var_name), loc) + typemap[func_var.name] = typemap[func_var_name] + func_var_def = copy.deepcopy(func_ir.get_definition(func_var_name)) + if ( + isinstance(func_var_def, ir.Expr) + and func_var_def.op == "getattr" + and func_var_def.attr == "sqrt" + ): + g_math_var = ir.Var(scope, mk_unique_var("$math_g_var"), loc) + typemap[g_math_var.name] = types.misc.Module(math) + g_math = ir.Global("math", math, loc) + g_math_assign = ir.Assign(g_math, g_math_var, loc) + func_var_def = ir.Expr.getattr(g_math_var, "sqrt", loc) + out_ir.append(g_math_assign) + # out_ir.append(func_var_def) + ir_expr = ir.Expr.call(func_var, arg_vars, (), loc) + call_typ = typemap[func_var.name].get_call_type( + typingctx, tuple(typemap[a.name] for a in arg_vars), {} + ) + calltypes[ir_expr] = call_typ + el_typ = call_typ.return_type + # signature(el_typ, el_typ) + 
out_ir.append(ir.Assign(func_var_def, func_var, loc)) + out_ir.append(ir.Assign(ir_expr, expr_out_var, loc)) + + return el_typ + + def _arrayexpr_tree_to_ir( + func_ir, + typingctx, + typemap, + calltypes, + equiv_set, + init_block, + expr_out_var, + expr, + parfor_index_tuple_var, + all_parfor_indices, + avail_vars, + ): + """generate IR from array_expr's expr tree recursively. Assign output to + expr_out_var and returns the whole IR as a list of Assign nodes. + """ + el_typ = typemap[expr_out_var.name] + scope = expr_out_var.scope + loc = expr_out_var.loc + out_ir = [] + + if isinstance(expr, tuple): + op, arr_expr_args = expr + arg_vars = [] + for arg in arr_expr_args: + arg_out_var = ir.Var(scope, mk_unique_var("$arg_out_var"), loc) + typemap[arg_out_var.name] = el_typ + out_ir += _arrayexpr_tree_to_ir( + func_ir, + typingctx, + typemap, + calltypes, + equiv_set, + init_block, + arg_out_var, + arg, + parfor_index_tuple_var, + all_parfor_indices, + avail_vars, + ) + arg_vars.append(arg_out_var) + if op in npydecl.supported_array_operators: + el_typ1 = typemap[arg_vars[0].name] + if len(arg_vars) == 2: + el_typ2 = typemap[arg_vars[1].name] + func_typ = typingctx.resolve_function_type( + op, (el_typ1, el_typ2), {} + ) + ir_expr = ir.Expr.binop(op, arg_vars[0], arg_vars[1], loc) + if op == operator.truediv: + func_typ, ir_expr = parfor._gen_np_divide( + arg_vars[0], arg_vars[1], out_ir, typemap + ) + else: + func_typ = typingctx.resolve_function_type( + op, (el_typ1,), {} + ) + ir_expr = ir.Expr.unary(op, arg_vars[0], loc) + calltypes[ir_expr] = func_typ + el_typ = func_typ.return_type + out_ir.append(ir.Assign(ir_expr, expr_out_var, loc)) + for T in array_analysis.MAP_TYPES: + if isinstance(op, T): + # elif isinstance(op, (np.ufunc, DUFunc)): + # function calls are stored in variables which are not removed + # op is typing_key to the variables type + el_typ = _ufunc_to_parfor_instr( + typemap, + op, + avail_vars, + loc, + scope, + func_ir, + out_ir, + arg_vars, + 
typingctx, + calltypes, + expr_out_var, + ) + if hasattr(op, "is_dpnp_ufunc"): + el_typ = _ufunc_to_parfor_instr( + typemap, + op, + avail_vars, + loc, + scope, + func_ir, + out_ir, + arg_vars, + typingctx, + calltypes, + expr_out_var, + ) + elif isinstance(expr, ir.Var): + var_typ = typemap[expr.name] + if isinstance(var_typ, types.Array): + el_typ = var_typ.dtype + ir_expr = parfor._gen_arrayexpr_getitem( + equiv_set, + expr, + parfor_index_tuple_var, + all_parfor_indices, + el_typ, + calltypes, + typingctx, + typemap, + init_block, + out_ir, + ) + else: + # assert typemap[expr.name]==el_typ + el_typ = var_typ + ir_expr = expr + out_ir.append(ir.Assign(ir_expr, expr_out_var, loc)) + elif isinstance(expr, ir.Const): + el_typ = typing.Context().resolve_value_type(expr.value) + out_ir.append(ir.Assign(expr, expr_out_var, loc)) + + if len(out_ir) == 0: + raise errors.UnsupportedRewriteError( + f"Don't know how to translate array expression '{expr!r}'", + loc=expr.loc, + ) + typemap.pop(expr_out_var.name, None) + typemap[expr_out_var.name] = el_typ + return out_ir + + parfor._arrayexpr_tree_to_ir = _arrayexpr_tree_to_ir diff --git a/numba_dpex/numba_patches/patch_is_ufunc.py b/numba_dpex/numba_patches/patch_is_ufunc.py new file mode 100644 index 0000000000..ee8b394b5a --- /dev/null +++ b/numba_dpex/numba_patches/patch_is_ufunc.py @@ -0,0 +1,23 @@ +# SPDX-FileCopyrightText: 2020 - 2022 Intel Corporation +# +# SPDX-License-Identifier: Apache-2.0 + + +def patch(): + """Patches the numba.np.ufunc.array_exprs._is_ufunc function to make it + possible to support dpnp universal functions (ufuncs). + + The extra condition is the check for the "is_dpnp_ufunc" attribute to + identify a non-NumPy ufunc. 
+ """ + import numpy + from numba.np.ufunc.dufunc import DUFunc + + def _is_ufunc(func): + return isinstance(func, (numpy.ufunc, DUFunc)) or hasattr( + func, "is_dpnp_ufunc" + ) + + from numba.np.ufunc import array_exprs + + array_exprs._is_ufunc = _is_ufunc diff --git a/numba_dpex/numba_patches/patch_mk_alloc.py b/numba_dpex/numba_patches/patch_mk_alloc.py new file mode 100644 index 0000000000..792ff7f686 --- /dev/null +++ b/numba_dpex/numba_patches/patch_mk_alloc.py @@ -0,0 +1,175 @@ +# SPDX-FileCopyrightText: 2023 Intel Corporation +# +# SPDX-License-Identifier: Apache-2.0 + + +def patch(): + """ + Patches the numba.core.ir_utils.mk_alloc function to support non-NumPy array + types. + + The patch extends the ir_utils.mk_alloc function that is used by + numba.parfors.parfor to allocate an empty temp array inside a parfor + section. Without the patch Numba always allocates the temp using + numpy.empty, with the patch the function checks if the array type to be + allocated has a __allocate__ method, and if so uses that method to allocate + the temporary array. + + TODO: Already merged into Numba main and should be removed when numba-dpex + is ported to Numba 0.58 + """ + import numpy + from numba.core import ir, ir_utils, types + from numba.core.ir_utils import ( + convert_size_to_var, + get_np_ufunc_typ, + mk_unique_var, + ) + from numba.parfors import parfor + + def _mk_alloc( + typingctx, typemap, calltypes, lhs, size_var, dtype, scope, loc, lhs_typ + ): + """generate an array allocation with np.empty() and return list of + nodes. size_var can be an int variable or tuple of int variables. + lhs_typ is the type of the array being allocated. 
+ """ + out = [] + ndims = 1 + size_typ = types.intp + if isinstance(size_var, tuple): + if len(size_var) == 1: + size_var = size_var[0] + size_var = convert_size_to_var( + size_var, typemap, scope, loc, out + ) + else: + # tuple_var = build_tuple([size_var...]) + ndims = len(size_var) + tuple_var = ir.Var(scope, mk_unique_var("$tuple_var"), loc) + if typemap: + typemap[tuple_var.name] = types.containers.UniTuple( + types.intp, ndims + ) + # constant sizes need to be assigned to vars + new_sizes = [ + convert_size_to_var(s, typemap, scope, loc, out) + for s in size_var + ] + tuple_call = ir.Expr.build_tuple(new_sizes, loc) + tuple_assign = ir.Assign(tuple_call, tuple_var, loc) + out.append(tuple_assign) + size_var = tuple_var + size_typ = types.containers.UniTuple(types.intp, ndims) + + if hasattr(lhs_typ, "__allocate__"): + return lhs_typ.__allocate__( + typingctx, + typemap, + calltypes, + lhs, + size_var, + dtype, + scope, + loc, + lhs_typ, + size_typ, + out, + ) + + # g_np_var = Global(numpy) + g_np_var = ir.Var(scope, mk_unique_var("$np_g_var"), loc) + if typemap: + typemap[g_np_var.name] = types.misc.Module(numpy) + g_np = ir.Global("np", numpy, loc) + g_np_assign = ir.Assign(g_np, g_np_var, loc) + # attr call: empty_attr = getattr(g_np_var, empty) + empty_attr_call = ir.Expr.getattr(g_np_var, "empty", loc) + attr_var = ir.Var(scope, mk_unique_var("$empty_attr_attr"), loc) + if typemap: + typemap[attr_var.name] = get_np_ufunc_typ(numpy.empty) + attr_assign = ir.Assign(empty_attr_call, attr_var, loc) + # Assume str(dtype) returns a valid type + dtype_str = str(dtype) + # alloc call: lhs = empty_attr(size_var, typ_var) + typ_var = ir.Var(scope, mk_unique_var("$np_typ_var"), loc) + if typemap: + typemap[typ_var.name] = types.functions.NumberClass(dtype) + # If dtype is a datetime/timedelta with a unit, + # then it won't return a valid type and instead can be created + # with a string. i.e. 
"datetime64[ns]") + if ( + isinstance(dtype, (types.NPDatetime, types.NPTimedelta)) + and dtype.unit != "" + ): + typename_const = ir.Const(dtype_str, loc) + typ_var_assign = ir.Assign(typename_const, typ_var, loc) + else: + if dtype_str == "bool": + # empty doesn't like 'bool' sometimes (e.g. kmeans example) + dtype_str = "bool_" + np_typ_getattr = ir.Expr.getattr(g_np_var, dtype_str, loc) + typ_var_assign = ir.Assign(np_typ_getattr, typ_var, loc) + alloc_call = ir.Expr.call(attr_var, [size_var, typ_var], (), loc) + + if calltypes: + cac = typemap[attr_var.name].get_call_type( + typingctx, [size_typ, types.functions.NumberClass(dtype)], {} + ) + # By default, all calls to "empty" are typed as returning a standard + # NumPy ndarray. If we are allocating a ndarray subclass here then + # just change the return type to be that of the subclass. + cac._return_type = ( + lhs_typ.copy(layout="C") if lhs_typ.layout == "F" else lhs_typ + ) + calltypes[alloc_call] = cac + if lhs_typ.layout == "F": + empty_c_typ = lhs_typ.copy(layout="C") + empty_c_var = ir.Var(scope, mk_unique_var("$empty_c_var"), loc) + if typemap: + typemap[empty_c_var.name] = lhs_typ.copy(layout="C") + empty_c_assign = ir.Assign(alloc_call, empty_c_var, loc) + + # attr call: asfortranarray = getattr(g_np_var, asfortranarray) + asfortranarray_attr_call = ir.Expr.getattr( + g_np_var, "asfortranarray", loc + ) + afa_attr_var = ir.Var( + scope, mk_unique_var("$asfortran_array_attr"), loc + ) + if typemap: + typemap[afa_attr_var.name] = get_np_ufunc_typ( + numpy.asfortranarray + ) + afa_attr_assign = ir.Assign( + asfortranarray_attr_call, afa_attr_var, loc + ) + # call asfortranarray + asfortranarray_call = ir.Expr.call( + afa_attr_var, [empty_c_var], (), loc + ) + if calltypes: + calltypes[asfortranarray_call] = typemap[ + afa_attr_var.name + ].get_call_type(typingctx, [empty_c_typ], {}) + + asfortranarray_assign = ir.Assign(asfortranarray_call, lhs, loc) + + out.extend( + [ + g_np_assign, + attr_assign, + 
typ_var_assign, + empty_c_assign, + afa_attr_assign, + asfortranarray_assign, + ] + ) + else: + alloc_assign = ir.Assign(alloc_call, lhs, loc) + out.extend([g_np_assign, attr_assign, typ_var_assign, alloc_assign]) + + return out + + ir_utils.mk_alloc = _mk_alloc + parfor.mk_alloc = _mk_alloc diff --git a/numba_dpex/tests/_helper.py b/numba_dpex/tests/_helper.py index c3c6b32ee7..b87b562ee3 100644 --- a/numba_dpex/tests/_helper.py +++ b/numba_dpex/tests/_helper.py @@ -10,7 +10,7 @@ import dpctl import pytest -from numba_dpex import config, numba_version +from numba_dpex import config, numba_sem_version def has_opencl_gpu(): @@ -113,7 +113,7 @@ def is_windows(): ] skip_no_numba056 = pytest.mark.skipif( - numba_version < (0, 56), reason="Need Numba 0.56 or higher" + numba_sem_version < (0, 56), reason="Need Numba 0.56 or higher" ) skip_no_gdb = pytest.mark.skipif( diff --git a/numba_dpex/tests/core/types/USMNdAArray/test_usm_ndarray_creation.py b/numba_dpex/tests/core/types/USMNdArray/test_usm_ndarray_creation.py similarity index 100% rename from numba_dpex/tests/core/types/USMNdAArray/test_usm_ndarray_creation.py rename to numba_dpex/tests/core/types/USMNdArray/test_usm_ndarray_creation.py diff --git a/numba_dpex/tests/core/types/USMNdAArray/test_usm_ndarray_type.py b/numba_dpex/tests/core/types/USMNdArray/test_usm_ndarray_type.py similarity index 100% rename from numba_dpex/tests/core/types/USMNdAArray/test_usm_ndarray_type.py rename to numba_dpex/tests/core/types/USMNdArray/test_usm_ndarray_type.py diff --git a/numba_dpex/tests/misc/test_dpctl_version.py b/numba_dpex/tests/misc/test_dpctl_version.py index d723264de4..b055817c87 100644 --- a/numba_dpex/tests/misc/test_dpctl_version.py +++ b/numba_dpex/tests/misc/test_dpctl_version.py @@ -4,12 +4,12 @@ import dpctl -from numba_dpex import dpctl_version +from numba_dpex import dpctl_sem_version def test_dpctl_version(): dpctl_v = dpctl.__version__ - computed_v = ".".join(str(n) for n in dpctl_version) + computed_v = 
".".join(str(n) for n in dpctl_sem_version) n = len(computed_v) assert n <= len(dpctl_v) assert computed_v == dpctl_v[:n] diff --git a/setup.py b/setup.py index f1eebd7309..92e69fc482 100644 --- a/setup.py +++ b/setup.py @@ -152,7 +152,7 @@ def _llvm_spirv(): ) build_requires = ["cython"] install_requires = [ - "numba >={}".format("0.56"), + "numba >={}".format("0.57"), "dpctl", "packaging", ]