diff --git a/.github/workflows/conda-package.yml b/.github/workflows/conda-package.yml
index 7df9afcfe7..2c8d8a2ed1 100644
--- a/.github/workflows/conda-package.yml
+++ b/.github/workflows/conda-package.yml
@@ -152,7 +152,7 @@ jobs:
           conda activate numba_dpex_env
           # echo "libintelocl.so" | tee /etc/OpenCL/vendors/intel-cpu.icd
           export OCL_ICD_FILENAMES=libintelocl.so
-          for script in $(find . \( -not -name "_*" -not -name "vectorize.py" -and -name "*.py" \))
+          for script in $(find . \( -not -name "_*" -not -name "vectorize.py" -not -name "scan.py" -and -name "*.py" \))
           do
             echo "Executing ${script}"
             python ${script} || exit 1
diff --git a/numba_dpex/examples/atomic_op.py b/numba_dpex/examples/atomic_op.py
deleted file mode 100644
index 1155fa38ac..0000000000
--- a/numba_dpex/examples/atomic_op.py
+++ /dev/null
@@ -1,52 +0,0 @@
-# SPDX-FileCopyrightText: 2020 - 2022 Intel Corporation
-#
-# SPDX-License-Identifier: Apache-2.0
-
-import dpctl
-import numpy as np
-
-import numba_dpex as dpex
-
-
-def main():
-    """
-    The example demonstrates the use of numba_dpex's ``atomic_add`` intrinsic
-    function on a SYCL GPU device. The ``dpctl.select_gpu_device`` is
-    equivalent to ``sycl::gpu_selector`` and returns a sycl::device of type GPU.
-
-    If we want to generate native floating point atomics for supported
-    SYCL devices we need to set two environment variables:
-    NUMBA_DPEX_ACTIVATE_ATOMICS_FP_NATIVE=1
-    NUMBA_DPEX_LLVM_SPIRV_ROOT=/path/to/dpcpp/provided/llvm_spirv
-
-    To run this example:
-    NUMBA_DPEX_ACTIVATE_ATOMICS_FP_NATIVE=1 NUMBA_DPEX_LLVM_SPIRV_ROOT=/path/to/dpcpp/provided/llvm_spirv python atomic_op.py
-
-    Without these two environment variables numba_dpex will use other
-    implementation for floating point atomics.
-    """
-
-    @dpex.kernel
-    def atomic_add(a):
-        dpex.atomic.add(a, 0, 1)
-
-    global_size = 100
-    a = np.array([0], dtype=np.float32)
-
-    # Use the environment variable SYCL_DEVICE_FILTER to change the default device.
-    # See https://github.com/intel/llvm/blob/sycl/sycl/doc/EnvironmentVariables.md#sycl_device_filter.
-    device = dpctl.select_default_device()
-    print("Using device ...")
-    device.print_device_info()
-
-    with dpctl.device_context(device):
-        atomic_add[global_size, dpex.DEFAULT_LOCAL_SIZE](a)
-
-    # Expected 100, because global_size = 100
-    print(a)
-
-    print("Done...")
-
-
-if __name__ == "__main__":
-    main()
diff --git a/numba_dpex/examples/barrier.py b/numba_dpex/examples/barrier.py
deleted file mode 100644
index 8092115663..0000000000
--- a/numba_dpex/examples/barrier.py
+++ /dev/null
@@ -1,88 +0,0 @@
-# SPDX-FileCopyrightText: 2020 - 2022 Intel Corporation
-#
-# SPDX-License-Identifier: Apache-2.0
-
-import dpctl
-import numpy as np
-from numba import float32
-
-import numba_dpex as dpex
-
-
-def no_arg_barrier_support():
-    """
-    This example demonstrates the usage of numba_dpex's ``barrier``
-    intrinsic function. The ``barrier`` function is usable only inside
-    a ``kernel`` and is equivalent to OpenCL's ``barrier`` function.
-    """
-
-    @dpex.kernel
-    def twice(A):
-        i = dpex.get_global_id(0)
-        d = A[i]
-        # no argument defaults to global mem fence
-        dpex.barrier()
-        A[i] = d * 2
-
-    N = 10
-    arr = np.arange(N).astype(np.float32)
-    print(arr)
-
-    # Use the environment variable SYCL_DEVICE_FILTER to change the default device.
-    # See https://github.com/intel/llvm/blob/sycl/sycl/doc/EnvironmentVariables.md#sycl_device_filter.
-    device = dpctl.select_default_device()
-    print("Using device ...")
-    device.print_device_info()
-
-    with dpctl.device_context(device):
-        twice[N, dpex.DEFAULT_LOCAL_SIZE](arr)
-
-    # the output should be `arr * 2, i.e. [0, 2, 4, 6, ...]`
-    print(arr)
-
-
-def local_memory():
-    """
-    This example demonstrates the usage of numba-dpex's `local.array`
-    intrinsic function. The function is used to create a static array
-    allocated on the devices local address space.
-    """
-    blocksize = 10
-
-    @dpex.kernel
-    def reverse_array(A):
-        lm = dpex.local.array(shape=10, dtype=float32)
-        i = dpex.get_global_id(0)
-
-        # preload
-        lm[i] = A[i]
-        # barrier local or global will both work as we only have one work group
-        dpex.barrier(dpex.CLK_LOCAL_MEM_FENCE)  # local mem fence
-        # write
-        A[i] += lm[blocksize - 1 - i]
-
-    arr = np.arange(blocksize).astype(np.float32)
-    print(arr)
-
-    # Use the environment variable SYCL_DEVICE_FILTER to change the default device.
-    # See https://github.com/intel/llvm/blob/sycl/sycl/doc/EnvironmentVariables.md#sycl_device_filter.
-    device = dpctl.select_default_device()
-    print("Using device ...")
-    device.print_device_info()
-
-    with dpctl.device_context(device):
-        reverse_array[blocksize, dpex.DEFAULT_LOCAL_SIZE](arr)
-
-    # the output should be `orig[::-1] + orig, i.e. [9, 9, 9, ...]``
-    print(arr)
-
-
-def main():
-    no_arg_barrier_support()
-    local_memory()
-
-    print("Done...")
-
-
-if __name__ == "__main__":
-    main()
diff --git a/numba_dpex/examples/blacksholes_kernel.py b/numba_dpex/examples/blacksholes_kernel.py
deleted file mode 100644
index d2eb80d3b8..0000000000
--- a/numba_dpex/examples/blacksholes_kernel.py
+++ /dev/null
@@ -1,102 +0,0 @@
-# SPDX-FileCopyrightText: 2020 - 2022 Intel Corporation
-#
-# SPDX-License-Identifier: Apache-2.0
-
-import math
-
-import dpctl
-import numpy as np
-
-import numba_dpex as dpex
-
-RISKFREE = 0.02
-VOLATILITY = 0.30
-
-A1 = 0.31938153
-A2 = -0.356563782
-A3 = 1.781477937
-A4 = -1.821255978
-A5 = 1.330274429
-RSQRT2PI = 0.39894228040143267793994605993438
-
-
-def randfloat(rand_var, low, high):
-    return (1.0 - rand_var) * low + rand_var * high
-
-
-OPT_N = 400
-iterations = 2
-
-stockPrice = randfloat(np.random.random(OPT_N), 5.0, 30.0)
-optionStrike = randfloat(np.random.random(OPT_N), 1.0, 100.0)
-optionYears = randfloat(np.random.random(OPT_N), 0.25, 10.0)
-callResult = np.zeros(OPT_N)
-putResult = -np.ones(OPT_N)
-
-
-@dpex.kernel
-def blackscholes_kernel(callResult, putResult, S, X, T, R, V):
-    """
-    A simple implementation of the Black-Scholes formula using explicit
-    OpenCL-syle kernel programming model.
-    """
-    i = dpex.get_global_id(0)
-    if i >= S.shape[0]:
-        return
-    sqrtT = math.sqrt(T[i])
-    d1 = (math.log(S[i] / X[i]) + (R + 0.5 * V * V) * T[i]) / (V * sqrtT)
-    d2 = d1 - V * sqrtT
-
-    K = 1.0 / (1.0 + 0.2316419 * math.fabs(d1))
-    cndd1 = (
-        RSQRT2PI
-        * math.exp(-0.5 * d1 * d1)
-        * (K * (A1 + K * (A2 + K * (A3 + K * (A4 + K * A5)))))
-    )
-    if d1 > 0:
-        cndd1 = 1.0 - cndd1
-
-    K = 1.0 / (1.0 + 0.2316419 * math.fabs(d2))
-    cndd2 = (
-        RSQRT2PI
-        * math.exp(-0.5 * d2 * d2)
-        * (K * (A1 + K * (A2 + K * (A3 + K * (A4 + K * A5)))))
-    )
-    if d2 > 0:
-        cndd2 = 1.0 - cndd2
-
-    expRT = math.exp((-1.0 * R) * T[i])
-    callResult[i] = S[i] * cndd1 - X[i] * expRT * cndd2
-    putResult[i] = X[i] * expRT * (1.0 - cndd2) - S[i] * (1.0 - cndd1)
-
-
-def main():
-    blockdim = 512, 1
-    griddim = int(math.ceil(float(OPT_N) / blockdim[0])), 1
-
-    # Use the environment variable SYCL_DEVICE_FILTER to change the default device.
-    # See https://github.com/intel/llvm/blob/sycl/sycl/doc/EnvironmentVariables.md#sycl_device_filter.
-    device = dpctl.select_default_device()
-    print("Using device ...")
-    device.print_device_info()
-
-    with dpctl.device_context(device):
-        for i in range(iterations):
-            blackscholes_kernel[blockdim, griddim](
-                callResult,
-                putResult,
-                stockPrice,
-                optionStrike,
-                optionYears,
-                RISKFREE,
-                VOLATILITY,
-            )
-
-    print("callResult : \n", callResult)
-    print("putResult : \n", putResult)
-
-    print("Done...")
-
-
-if __name__ == "__main__":
-    main()
diff --git a/numba_dpex/examples/dpex_func.py b/numba_dpex/examples/dpex_func.py
deleted file mode 100644
index 7d39ff1d10..0000000000
--- a/numba_dpex/examples/dpex_func.py
+++ /dev/null
@@ -1,56 +0,0 @@
-# SPDX-FileCopyrightText: 2020 - 2022 Intel Corporation
-#
-# SPDX-License-Identifier: Apache-2.0
-
-import dpctl
-import numpy as np
-
-import numba_dpex as dpex
-
-
-@dpex.func
-def a_device_function(a):
-    """
-    A ``func`` is a device callable function that can be invoked from
-    ``kernel`` and other ``func`` functions.
-    """
-    return a + 1
-
-
-@dpex.func
-def another_device_function(a):
-    return a_device_function(a)
-
-
-@dpex.kernel
-def a_kernel_function(a, b):
-    i = dpex.get_global_id(0)
-    b[i] = another_device_function(a[i])
-
-
-def driver(a, b, N):
-    print(b)
-    print("--------")
-    a_kernel_function[N, dpex.DEFAULT_LOCAL_SIZE](a, b)
-    print(b)
-
-
-def main():
-    N = 10
-    a = np.ones(N)
-    b = np.ones(N)
-
-    # Use the environment variable SYCL_DEVICE_FILTER to change the default device.
-    # See https://github.com/intel/llvm/blob/sycl/sycl/doc/EnvironmentVariables.md#sycl_device_filter.
-    device = dpctl.select_default_device()
-    print("Using device ...")
-    device.print_device_info()
-
-    with dpctl.device_context(device):
-        driver(a, b, N)
-
-    print("Done...")
-
-
-if __name__ == "__main__":
-    main()
diff --git a/numba_dpex/examples/dpex_with_context.py b/numba_dpex/examples/dpex_with_context.py
deleted file mode 100644
index 8a58c6c144..0000000000
--- a/numba_dpex/examples/dpex_with_context.py
+++ /dev/null
@@ -1,51 +0,0 @@
-# SPDX-FileCopyrightText: 2020 - 2022 Intel Corporation
-#
-# SPDX-License-Identifier: Apache-2.0
-
-"""
-The numba_dpex extension adds an automatic offload optimizer to
-numba. The optimizer automatically detects data-parallel code
-regions in a numba.jit function and then offloads the data-parallel
-regions to a SYCL device. The optimizer is triggered when a numba.jit
-function is invoked inside ``dpctl.device_context`` scope.
-
-This example demonstrates the usage of numba_dpex's automatic offload
-functionality. Note that numba_dpex should be installed in your
-environment for the example to work.
-"""
-
-import dpctl
-import numpy as np
-from numba import njit, prange
-
-
-@njit
-def add_two_arrays(b, c):
-    a = np.empty_like(b)
-    for i in prange(len(b)):
-        a[i] = b[i] + c[i]
-
-    return a
-
-
-def main():
-    N = 10
-    b = np.ones(N)
-    c = np.ones(N)
-
-    # Use the environment variable SYCL_DEVICE_FILTER to change the default device.
-    # See https://github.com/intel/llvm/blob/sycl/sycl/doc/EnvironmentVariables.md#sycl_device_filter.
-    device = dpctl.select_default_device()
-    print("Using device ...")
-    device.print_device_info()
-
-    with dpctl.device_context(device):
-        result = add_two_arrays(b, c)
-
-    print("Result :", result)
-
-    print("Done...")
-
-
-if __name__ == "__main__":
-    main()
diff --git a/numba_dpex/examples/kernel/atomic_op.py b/numba_dpex/examples/kernel/atomic_op.py
new file mode 100644
index 0000000000..4e11546958
--- /dev/null
+++ b/numba_dpex/examples/kernel/atomic_op.py
@@ -0,0 +1,30 @@
+# SPDX-FileCopyrightText: 2020 - 2022 Intel Corporation
+#
+# SPDX-License-Identifier: Apache-2.0
+
+import dpnp as np
+
+import numba_dpex as ndpex
+
+
+@ndpex.kernel
+def atomic_reduction(a):
+    idx = ndpex.get_global_id(0)
+    ndpex.atomic.add(a, 0, a[idx])
+
+
+def main():
+    N = 10
+    a = np.arange(N)
+
+    print("Using device ...")
+    print(a.device)
+
+    atomic_reduction[N, ndpex.DEFAULT_LOCAL_SIZE](a)
+    print("Reduction sum =", a[0])
+
+    print("Done...")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/numba_dpex/examples/kernel/black_scholes.py b/numba_dpex/examples/kernel/black_scholes.py
new file mode 100644
index 0000000000..fa61cab3cd
--- /dev/null
+++ b/numba_dpex/examples/kernel/black_scholes.py
@@ -0,0 +1,107 @@
+# Copyright 2022 Intel Corporation
+#
+# SPDX-License-Identifier: Apache 2.0
+
+
+from math import erf, exp, log, sqrt
+
+import dpnp as np
+
+import numba_dpex as ndpx
+
+# Stock price range
+S0L = 10.0
+S0H = 50.0
+
+# Strike range
+XL = 10.0
+XH = 50.0
+
+# Maturity range
+TL = 1.0
+TH = 2.0
+
+# Risk-free rate assumed constant
+RISK_FREE = 0.1
+
+# Volatility assumed constants
+VOLATILITY = 0.2
+
+# Number of call-put options
+NOPT = 1024 * 1024
+
+# Random seed
+SEED = 777
+
+
+def initialize():
+    np.random.seed(SEED)
+    price = np.random.uniform(S0L, S0H, NOPT)
+    strike = np.random.uniform(XL, XH, NOPT)
+    t = np.random.uniform(TL, TH, NOPT)
+    rate = RISK_FREE
+    volatility = VOLATILITY
+    call = np.empty(NOPT)
+    put = np.empty(NOPT)
+
+    return price, strike, t, rate, volatility, call, put
+
+
+@ndpx.kernel(
+    access_types={
+        "read_only": ["price", "strike", "t"],
+        "write_only": ["call", "put"],
+    }
+)
+def kernel_black_scholes(price, strike, t, rate, volatility, call, put):
+    # Scalars
+    mr = -rate
+    sig_sig_two = volatility * volatility * 2.0
+
+    # Current index
+    i = ndpx.get_global_id(0)
+
+    # Get inputs into private memory
+    p = price[i]
+    s = strike[i]
+    tt = t[i]
+
+    a = log(p / s)
+    b = tt * mr
+
+    z = tt * sig_sig_two
+    c = 0.25 * z
+    y = 1.0 / sqrt(z)
+
+    w1 = (a - b + c) * y
+    w2 = (a - b - c) * y
+
+    d1 = 0.5 + 0.5 * erf(w1)
+    d2 = 0.5 + 0.5 * erf(w2)
+
+    se = exp(b) * s
+
+    r = p * d1 - se * d2
+
+    # Write back results
+    call[i] = r
+    put[i] = r - p + se
+
+
+def main():
+    price, strike, t, rate, volatility, call, put = initialize()
+
+    print("Using device ...")
+    print(price.device)
+
+    kernel_black_scholes[NOPT, ndpx.DEFAULT_LOCAL_SIZE](
+        price, strike, t, rate, volatility, call, put
+    )
+
+    print("Call:", call)
+    print("Put:", put)
+    print("Done...")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/numba_dpex/examples/kernel/device_func.py b/numba_dpex/examples/kernel/device_func.py
new file mode 100644
index 0000000000..2ebc57537a
--- /dev/null
+++ b/numba_dpex/examples/kernel/device_func.py
@@ -0,0 +1,52 @@
+# SPDX-FileCopyrightText: 2020 - 2022 Intel Corporation
+#
+# SPDX-License-Identifier: Apache-2.0
+
+import dpnp as np
+
+import numba_dpex as ndpex
+
+# Array size
+N = 10
+
+
+# A device callable function that can be invoked from ``kernel`` and other device functions
+@ndpex.func
+def a_device_function(a):
+    return a + 1
+
+
+# A device callable function can call another device function
+@ndpex.func
+def another_device_function(a):
+    return a_device_function(a * 2)
+
+
+# A kernel function that calls the device function
+@ndpex.kernel
+def a_kernel_function(a, b):
+    i = ndpex.get_global_id(0)
+    b[i] = another_device_function(a[i])
+
+
+# Utility function for printing
+def driver(a, b, N):
+    print("A=", a)
+    a_kernel_function[N, ndpex.DEFAULT_LOCAL_SIZE](a, b)
+    print("B=", b)
+
+
+# Main function
+def main():
+    a = np.ones(N)
+    b = np.ones(N)
+
+    print("Using device ...")
+    print(a.device)
+    driver(a, b, N)
+
+    print("Done...")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/numba_dpex/examples/kernel/interpolation.py b/numba_dpex/examples/kernel/interpolation.py
new file mode 100644
index 0000000000..06632d98c0
--- /dev/null
+++ b/numba_dpex/examples/kernel/interpolation.py
@@ -0,0 +1,131 @@
+# SPDX-FileCopyrightText: 2020 - 2022 Intel Corporation
+#
+# SPDX-License-Identifier: Apache-2.0
+
+import dpnp as np
+from numba import float32
+from numpy.testing import assert_almost_equal
+
+import numba_dpex as ndpex
+
+# Interpolation domain
+XLO = 10.0
+XHI = 90.0
+
+# Number of cubic polynomial segments
+N_SEGMENTS = 8
+
+LOCAL_SIZE = 128  # Number of batches
+N_POINTS_PER_WORK_ITEM = (
+    4  # Number of points in the batch. Each work item processes one batch
+)
+N_POINTS_PER_WORK_GROUP = (
+    N_POINTS_PER_WORK_ITEM * LOCAL_SIZE
+)  # Number of points processed by a work group
+N_POINTS = N_POINTS_PER_WORK_GROUP * N_SEGMENTS  # Total number of points
+
+# Natural cubic spline coefficients in interval [10, 90] with uniform grid
+# Each work group processes its own segment
+COEFFICIENTS = np.asarray(
+    [
+        [
+            -0.008086340206185568,
+            0.242590206185567,
+            -0.6172680412371134,
+            0,
+        ],  # [10, 20]
+        [
+            0.015431701030927836,
+            -1.168492268041237,
+            27.60438144329897,
+            -188.1443298969072,
+        ],  # [20, 30]
+        [
+            -0.0036404639175257733,
+            0.5480025773195877,
+            -23.890463917525775,
+            326.8041237113402,
+        ],  # [30, 40]
+        [
+            -0.010869845360824743,
+            1.4155283505154639,
+            -58.59149484536083,
+            789.4845360824743,
+        ],  # [40, 50]
+        [
+            -0.0028801546391752576,
+            0.21707474226804124,
+            1.3311855670103092,
+            -209.22680412371133,
+        ],  # [50, 60]
+        [
+            0.042390463917525774,
+            -7.9316365979381445,
+            490.25386597938143,
+            -9987.680412371134,
+        ],  # [60, 70]
+        [
+            -0.061681701030927835,
+            13.923518041237113,
+            -1039.6069587628865,
+            25709.072164948455,
+        ],  # [70, 80]
+        [
+            0.029336340206185568,
+            -7.920811855670103,
+            707.9394329896908,
+            -20892.164948453606,
+        ],
+    ],  # [80, 90]
+    dtype=np.float32,  # We use single precision for interpolation
+)
+
+
+@ndpex.kernel()
+def kernel_polynomial(x, y, coefficients):
+    c = ndpex.private.array(
+        4, dtype=float32
+    )  # Coefficients of a polynomial of a given segment
+    z = ndpex.private.array(1, dtype=float32)  # Keep x[i] in private memory
+
+    gid = ndpex.get_global_id(0)
+    gr_id = ndpex.get_group_id(0)
+
+    # Polynomial coefficients are fixed within a workgroup
+    c[0] = coefficients[gr_id][0]
+    c[1] = coefficients[gr_id][1]
+    c[2] = coefficients[gr_id][2]
+    c[3] = coefficients[gr_id][3]
+
+    # Each work item processes N_POINTS_PER_WORK_ITEM points
+    for i in range(
+        gid * N_POINTS_PER_WORK_ITEM, (gid + 1) * N_POINTS_PER_WORK_ITEM, 1
+    ):
+        z[0] = x[i]  # Copy current point into the private memory
+        y[i] = ((c[0] * z[0] + c[1]) * z[0] + c[2]) * z[0] + c[
+            3
+        ]  # Coefficients are in the private memory too
+
+
+def main():
+    # Create arrays on the default device
+    xp = np.arange(XLO, XHI, (XHI - XLO) / N_POINTS)
+    yp = np.empty(xp.shape)
+
+    print("Using device ...")
+    print(xp.device)
+
+    kernel_polynomial[N_POINTS // N_POINTS_PER_WORK_ITEM, LOCAL_SIZE](
+        xp, yp, COEFFICIENTS
+    )
+
+    # Copy results back to the host
+    nyp = np.asnumpy(yp)
+    # Basic check for correctness
+    assert_almost_equal(nyp[2047], 39.97161865234375)
+
+    print("Done...")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/numba_dpex/examples/kernel/scan.py b/numba_dpex/examples/kernel/scan.py
new file mode 100644
index 0000000000..85ecdf86a5
--- /dev/null
+++ b/numba_dpex/examples/kernel/scan.py
@@ -0,0 +1,69 @@
+# SPDX-FileCopyrightText: 2020 - 2022 Intel Corporation
+#
+# SPDX-License-Identifier: Apache-2.0
+
+# scan.py is not working due to issue: https://github.com/IntelPython/numba-dpex/issues/829
+
+import dpnp as np
+
+import numba_dpex as ndpx
+
+# 1D array size
+N = 64
+
+
+# Implements Hillis-Steele prefix sum algorithm
+@ndpx.kernel
+def kernel_hillis_steele_scan(a):
+    # Get local and global id and workgroup size
+    gid = ndpx.get_global_id(0)
+    lid = ndpx.get_local_id(0)
+    ls = ndpx.get_local_size(0)
+
+    # Create temporals in local memory
+    b = ndpx.local.array(ls, dtype=a.dtype)
+    c = ndpx.local.array(ls, dtype=a.dtype)
+
+    # Initialize locals
+    c[lid] = b[lid] = a[gid]
+    ndpx.barrier(ndpx.LOCAL_MEM_FENCE)
+
+    # Calculate prefix sum
+    d = 1
+    while d < ls:
+        if lid > d:
+            c[lid] = b[lid] + b[lid - d]
+        else:
+            c[lid] = b[lid]
+
+        ndpx.barrier(ndpx.LOCAL_MEM_FENCE)
+
+        # Swap c and b
+        e = c[lid]
+        c[lid] = b[lid]
+        b[lid] = e
+
+        # Double the stride
+        d *= 2
+
+    ndpx.barrier()  # The same as ndpx.barrier(ndpx.GLOBAL_MEM_FENCE)
+    a[gid] = b[lid]
+
+
+def main():
+    arr = np.arange(N)
+    print("Original array:", arr)
+
+    print("Using device ...")
+    print(arr.device)
+    kernel_hillis_steele_scan[N, ndpx.DEFAULT_LOCAL_SIZE](arr)
+
+    # the output should be [0, 1, 3, 6, ...]
+    arr_np = np.asnumpy(arr)
+    print(arr_np)
+
+    print("Done...")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/numba_dpex/examples/kernel/vector_sum.py b/numba_dpex/examples/kernel/vector_sum.py
new file mode 100644
index 0000000000..4ebda8f38a
--- /dev/null
+++ b/numba_dpex/examples/kernel/vector_sum.py
@@ -0,0 +1,47 @@
+# SPDX-FileCopyrightText: 2020 - 2022 Intel Corporation
+#
+# SPDX-License-Identifier: Apache-2.0
+
+import dpnp
+import numpy.testing as testing
+
+import numba_dpex as ndpx
+
+
+# Data parallel kernel implementing vector sum
+@ndpx.kernel
+def kernel_vector_sum(a, b, c):
+    i = ndpx.get_global_id(0)
+    c[i] = a[i] + b[i]
+
+
+# Utility function for printing and testing
+def driver(a, b, c, global_size):
+
+    kernel_vector_sum[global_size, ndpx.DEFAULT_LOCAL_SIZE](a, b, c)
+
+    a_np = dpnp.asnumpy(a)  # Copy dpnp array a to NumPy array a_np
+    b_np = dpnp.asnumpy(b)  # Copy dpnp array b to NumPy array b_np
+    c_np = dpnp.asnumpy(c)  # Copy dpnp array c to NumPy array c_np
+    testing.assert_equal(c_np, a_np + b_np)
+
+
+# Main function
+def main():
+    N = 10
+    global_size = N
+    print("Vector size N", N)
+
+    # Create random vectors on the default device
+    a = dpnp.random.random(N)
+    b = dpnp.random.random(N)
+    c = dpnp.ones_like(a)
+
+    print("Using device ...")
+    print(a.device)
+    driver(a, b, c, global_size)
+    print("Done...")
+
+
+if __name__ == "__main__":
+    main()