diff --git a/docs/_templates/autoapi/index.rst b/docs/_templates/autoapi/index.rst
index 1af3e006b2..6f37ea6912 100644
--- a/docs/_templates/autoapi/index.rst
+++ b/docs/_templates/autoapi/index.rst
@@ -4,9 +4,11 @@ API Reference
 This page contains auto-generated API reference documentation [#f1]_.
 
 .. toctree::
-   :maxdepth: 2
+   :maxdepth: 1
 
    numba_dpex/kernel_api/index
+   numba_dpex/experimental/decorators/index
+   numba_dpex/experimental/launcher/index
 
    {% for page in pages %}
    {% if page.top_level_object and page.display %}
diff --git a/docs/source/conf.py b/docs/source/conf.py
index 72a445165c..6b053075c3 100644
--- a/docs/source/conf.py
+++ b/docs/source/conf.py
@@ -27,7 +27,6 @@
     "sphinx.ext.extlinks",
     "sphinx.ext.githubpages",
     "sphinx.ext.napoleon",
-    "sphinx.ext.autosectionlabel",
     "sphinxcontrib.programoutput",
     "sphinxcontrib.googleanalytics",
     "myst_parser",
@@ -114,19 +113,10 @@
 
 # -- Auto API configurations ---------------------------------------------------
 
-
-# def skip_util_classes(app, what, name, obj, skip, options):
-#     if what == "module" and "experimental" in name:
-#         if what == "module" and "decorators" not in name:
-#             skip = True
-#     return skip
-
-
-# def setup(sphinx):
-#     sphinx.connect("autoapi-skip-member", skip_util_classes)
-
-
-autoapi_dirs = ["../../numba_dpex/kernel_api"]
+autoapi_dirs = [
+    "../../numba_dpex/kernel_api",
+    "../../numba_dpex/experimental",
+]
 autoapi_type = "python"
 
 autoapi_template_dir = "_templates/autoapi"
@@ -160,3 +150,14 @@ def prepare_jinja_env(jinja_env) -> None:
 
 
 autoapi_prepare_jinja_env = prepare_jinja_env
+
+
+def skip_member(app, what, name, obj, skip, options):
+    # skip submodules
+    if what == "module":
+        skip = True
+    return skip
+
+
+def setup(sphinx):
+    sphinx.connect("autoapi-skip-member", skip_member)
diff --git a/docs/source/experimental/index.rst b/docs/source/experimental/index.rst
new file mode 100644
index 0000000000..0737f2d4d2
--- /dev/null
+++ b/docs/source/experimental/index.rst
@@ -0,0 +1,126 @@
+.. _index:
+.. include:: ./../ext_links.txt
+
+Experimental Features
+=====================
+
+Numba-dpex includes various experimental features that are not yet suitable for
+everyday production usage, but are included as an engineering preview.
+The most prominent experimental features currently included in numba-dpex are
+listed in this section.
+
+
+Compiling and Offloading ``dpnp`` statements
+--------------------------------------------
+
+Data Parallel Extension for NumPy* (`dpnp`_) is a drop-in NumPy* replacement
+library built using the oneAPI software stack including `oneMKL`_, `oneDPL`_ and
+`SYCL*`_. numba-dpex has experimental support for compiling a subset of dpnp
+functions. The feature is enabled by the :py:func:`numba_dpex.dpjit` decorator.
+
+An example of a supported usage of dpnp in numba-dpex is provided in the
+following code snippet:
+
+.. code-block:: python
+
+    import dpnp
+    from numba_dpex import dpjit
+
+
+    @dpjit
+    def foo():
+        a = dpnp.ones(1024, device="gpu")
+        return dpnp.sqrt(a)
+
+
+    a = foo()
+    print(a)
+    print(type(a))
+
+
+Offloading ``prange`` loops
+---------------------------
+
+numba-dpex supports using the ``numba.prange`` statements with
+``dpnp.ndarray`` objects. All such ``prange`` loops are offloaded as kernels and
+executed on a device inferred using the compute follows data programming model.
+The next examples shows using a ``prange`` loop.
+
+
+.. code-block:: python
+
+    import dpnp
+    from numba_dpex import dpjit, prange
+
+
+    @dpjit
+    def foo():
+        x = dpnp.ones(1024, device="gpu")
+        o = dpnp.empty_like(a)
+        for i in prange(x.shape[0]):
+            o[i] = x[i] * x[i]
+        return o
+
+
+    c = foo()
+    print(c)
+    print(type(c))
+
+
+``prange`` loop statements can also be used to write reduction loops as
+demonstrated by the following naive pairwise distance computation.
+
+.. code-block:: python
+
+  from numba_dpex import dpjit, prange
+  import dpnp
+  import dpctl
+
+
+  @dpjit
+  def pairwise_distance(X1, X2, D):
+      """Naïve pairwise distance impl - take an array representing M points in N
+      dimensions, and return the M x M matrix of Euclidean distances
+
+      Args:
+          X1 : Set of points
+          X2 : Set of points
+          D  : Outputted distance matrix
+      """
+      # Size of inputs
+      X1_rows = X1.shape[0]
+      X2_rows = X2.shape[0]
+      X1_cols = X1.shape[1]
+
+      float0 = X1.dtype.type(0.0)
+
+      # Outermost parallel loop over the matrix X1
+      for i in prange(X1_rows):
+          # Loop over the matrix X2
+          for j in range(X2_rows):
+              d = float0
+              # Compute exclidean distance
+              for k in range(X1_cols):
+                  tmp = X1[i, k] - X2[j, k]
+                  d += tmp * tmp
+              # Write computed distance to distance matrix
+              D[i, j] = dpnp.sqrt(d)
+
+
+  q = dpctl.SyclQueue()
+  X1 = dpnp.ones((10, 2), sycl_queue=q)
+  X2 = dpnp.zeros((10, 2), sycl_queue=q)
+  D = dpnp.empty((10, 2), sycl_queue=q)
+
+  pairwise_distance(X1, X2, D)
+  print(D)
+
+
+Kernel fusion
+-------------
+
+.. ``numba-dpex`` can identify each NumPy* (or ``dpnp``) array expression as a
+.. data-parallel kernel and fuse them together to generate a single SYCL kernel.
+.. The kernel is automatically offloaded to the specified device where the fusion
+.. operation is invoked. Here is a simple example of a Black-Scholes formula
+.. computation where kernel fusion occurs at different ``dpnp`` math functions:
diff --git a/docs/source/ext_links.txt b/docs/source/ext_links.txt
index 13a02c5f0d..ee01b1a83c 100644
--- a/docs/source/ext_links.txt
+++ b/docs/source/ext_links.txt
@@ -25,3 +25,6 @@
 .. _Intel VTune Profiler: https://www.intel.com/content/www/us/en/developer/tools/oneapi/vtune-profiler.html
 .. _Intel Advisor: https://www.intel.com/content/www/us/en/developer/tools/oneapi/advisor.html
 .. _oneMKL: https://www.intel.com/content/www/us/en/docs/oneapi/programming-guide/2023-2/intel-oneapi-math-kernel-library-onemkl.html
+.. _oneDPL: https://www.intel.com/content/www/us/en/developer/tools/oneapi/dpc-library.html#gs.5izf63
+.. _UXL: https://uxlfoundation.org/
+.. _oneAPI GPU optimization guide: https://www.intel.com/content/www/us/en/docs/oneapi/optimization-guide-gpu/2024-0/general-purpose-computing-on-gpu.html
diff --git a/docs/source/glossary.rst b/docs/source/glossary.rst
deleted file mode 100644
index 5c3414c7f8..0000000000
--- a/docs/source/glossary.rst
+++ /dev/null
@@ -1,5 +0,0 @@
-.. _glossary:
-.. include:: ./ext_links.txt
-
-Glossary
-========
diff --git a/docs/source/index.rst b/docs/source/index.rst
index 1bf6f0ab62..8a540cac8f 100644
--- a/docs/source/index.rst
+++ b/docs/source/index.rst
@@ -41,26 +41,26 @@ Data Parallel Extension for Numba*
 .. module:: numba_dpex
 
 .. toctree::
-   :maxdepth: 2
+   :maxdepth: 1
 
    overview
    getting_started
    programming_model
    user_guide/index
    autoapi/index
+   experimental/index
    useful_links
 
 .. toctree::
-   :maxdepth: 2
+   :maxdepth: 1
    :caption: Development
 
    contribution_guide
 
 .. toctree::
-   :maxdepth: 2
+   :maxdepth: 1
    :caption: Misc Notes
 
    examples
-   glossary
    license
    release-notes
diff --git a/docs/source/overview.rst b/docs/source/overview.rst
index 25ad9c139b..6db2ceabaa 100644
--- a/docs/source/overview.rst
+++ b/docs/source/overview.rst
@@ -4,147 +4,86 @@
 Overview
 ========
 
-Data Parallel Extension for Numba* (`numba-dpex`_) is an extension to
-the `Numba*`_ Python JIT compiler adding an architecture-agnostic kernel
-programming API, and a new front-end to compile the Data Parallel Extension
-for Numpy* (`dpnp`_) library. The ``dpnp`` Python library is a data-parallel
-implementation of `NumPy*`_'s API using the `SYCL*`_ language.
-
-.. ``numba-dpex``'s support for ``dpnp`` compilation is a new way for Numba* users
-.. to write code in a NumPy-like API that is already supported by Numba*, while at
-.. the same time automatically running such code parallelly on various types of
-.. architecture.
-
-``numba-dpex`` is an open-source project and can be installed as part of `Intel
-AI Analytics Toolkit`_ or the `Intel Distribution for Python*`_. The package is
-also available on Anaconda cloud and as a Docker image on GitHub. Please refer
-the :doc:`getting_started` page to learn more.
-
-Main Features
--------------
-
-Portable Kernel Programming
-~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-The ``numba-dpex`` kernel programming API has a design similar to Numba's
-``cuda.jit`` sub-module. The API is modeled after the `SYCL*`_ language and uses
-the `DPC++`_ SYCL runtime. Currently, compilation of kernels is supported for
-SPIR-V-based OpenCL and `oneAPI Level Zero`_ devices CPU and GPU devices. In the
-future, compilation support for other types of hardware that are supported by
-DPC++ will be added.
-
-The following example illustrates a vector addition kernel written with
-``numba-dpex`` kernel API.
+Data Parallel Extension for Numba* (`numba-dpex`_) is a free and open-source
+LLVM-based code generator for portable accelerator programming in Python. The
+code generator implements a new pseudo-kernel programming domain-specific
+language (DSL) called `KAPI` that is modeled after the C++ DSL `SYCL*`_. The
+SYCL language is an open standard developed under the Unified Acceleration
+Foundation (`UXL`_) as a vendor-agnostic way of programming different types of
+data-parallel hardware such as multi-core CPUs, GPUs, and FPGAs. Numba-dpex and
+KAPI aim to bring the same vendor-agnostic and standard-compliant programming
+model to Python.
+
+Numba-dpex is built on top of the open-source `Numba*`_ JIT compiler that
+implements a CPython bytecode parser and code generator to lower the bytecode to
+LLVM IR. The Numba* compiler is able to compile a large sub-set of Python and
+most of the NumPy library. Numba-dpex uses Numba*'s tooling to implement the
+parsing and typing support for the data types and functions defined in the KAPI
+DSL. A custom code generator is then used to lower KAPI to a form of LLVM IR
+that includes special LLVM instructions that define a low-level data-parallel
+kernel API. Thus, a function defined in KAPI is compiled to a data-parallel
+kernel that can run on different types of hardware. Currently, compilation of
+KAPI is possible for x86 CPU devices, Intel Gen9 integrated GPUs, Intel UHD
+integrated GPUs, and Intel discrete GPUs.
+
+
+The following example shows a pairwise distance matrix computation in KAPI.
 
 .. code-block:: python
 
-    import dpnp
-    import numba_dpex as dpex
-
-
-    @dpex.kernel
-    def vecadd_kernel(a, b, c):
-        i = dpex.get_global_id(0)
-        c[i] = a[i] + b[i]
-
-
-    a = dpnp.ones(1024, device="gpu")
-    b = dpnp.ones(1024, device="gpu")
-    c = dpnp.empty_like(a)
-
-    vecadd_kernel[dpex.Range(1024)](a, b, c)
-    print(c)
-
-In the above example, three arrays are allocated on a default ``gpu`` device
-using the ``dpnp`` library. The arrays are then passed as input arguments to the
-kernel function. The compilation target and the subsequent execution of the
-kernel is determined by the input arguments and follow the
-"compute-follows-data" programming model as specified in the `Python* Array API
-Standard`_. To change the execution target to a CPU, the device keyword needs to
-be changed to ``cpu`` when allocating the ``dpnp`` arrays. It is also possible
-to leave the ``device`` keyword undefined and let the ``dpnp`` library select a
-default device based on environment flag settings. Refer the
-:doc:`user_guide/kernel_programming/index` for further details.
-
-``dpjit`` decorator
-~~~~~~~~~~~~~~~~~~~
-
-The ``numba-dpex`` package provides a new decorator ``dpjit`` that extends
-Numba's ``njit`` decorator. The new decorator is equivalent to
-``numba.njit(parallel=True)``, but additionally supports compiling ``dpnp``
-functions, ``prange`` loops, and array expressions that use ``dpnp.ndarray``
-objects.
-
-Unlike Numba's NumPy parallelization that only supports CPUs, ``dpnp``
-expressions are first converted to data-parallel kernels and can then be
-`offloaded` to different types of devices. As ``dpnp`` implements the same API
-as NumPy*, an existing ``numba.njit`` decorated function that uses
-``numpy.ndarray`` may be refactored to use ``dpnp.ndarray`` and decorated with
-``dpjit``. Such a refactoring can allow the parallel regions to be offloaded
-to a supported GPU device, providing users an additional option to execute their
-code parallelly.
-
-The vector addition example depicted using the kernel API can also be
-expressed in several different ways using ``dpjit``.
-
-.. code-block:: python
-
-    import dpnp
-    import numba_dpex as dpex
-
-
-    @dpex.dpjit
-    def vecadd_v1(a, b):
-        return a + b
-
+    from numba_dpex import kernel_api as kapi
+    import math
 
-    @dpex.dpjit
-    def vecadd_v2(a, b):
-        return dpnp.add(a, b)
 
+    def pairwise_distance_kernel(item: kapi.Item, data, distance):
+        i = item.get_id(0)
+        j = item.get_id(1)
 
-    @dpex.dpjit
-    def vecadd_v3(a, b):
-        c = dpnp.empty_like(a)
-        for i in prange(a.shape[0]):
-            c[i] = a[i] + b[i]
-        return c
+        data_dims = data.shape[1]
 
-As with the kernel API example, a ``dpjit`` function if invoked with ``dpnp``
-input arguments follows the compute-follows-data programming model. Refer
-:doc:`user_manual/dpnp_offload/index` for further details.
+        d = data.dtype.type(0.0)
+        for k in range(data_dims):
+            tmp = data[i, k] - data[j, k]
+            d += tmp * tmp
 
+        distance[j, i] = math.sqrt(d)
 
-.. Project Goal
-.. ------------
 
-.. If C++ is not your language, you can skip writing data-parallel kernels in SYCL
-.. and directly write them in Python.
+Skipping over much of the language details, at a high-level the
+``pairwise_distance_kernel`` can be viewed as a data-parallel function that gets
+executed individually by a set of "work items". That is, each work item runs the
+same function for a subset of the elements of the input ``data`` and
+``distance`` arrays. For programmers familiar with the CUDA or OpenCL languages,
+it is the same programming model that is referred to as Single Program Multiple
+Data (SPMD). As Python has no concept of a work item the KAPI function itself is
+sequential and needs to be compiled to convert it into a parallel version. The
+next example shows the changes to the original script to compile and run the
+``pairwise_distance_kernel`` in parallel.
 
-.. Our package ``numba-dpex`` extends the Numba compiler to allow kernel creation
-.. directly in Python via a custom compute API
-
-
-.. Contributing
-.. ------------
-
-.. Refer the `contributing guide
-.. <https://github.com/IntelPython/numba-dpex/blob/main/CONTRIBUTING>`_ for
-.. information on coding style and standards used in ``numba-dpex``.
-
-.. License
-.. -------
-
-.. ``numba-dpex`` is Licensed under Apache License 2.0 that can be found in `LICENSE
-.. <https://github.com/IntelPython/numba-dpex/blob/main/LICENSE>`_. All usage and
-.. contributions to the project are subject to the terms and conditions of this
-.. license.
+.. code-block:: python
 
+    from numba_dpex import kernel, call_kernel
+    import dpnp
 
-.. Along with the kernel programming API an auto-offload feature is also provided.
-.. The feature enables automatic generation of kernels from data-parallel NumPy
-.. library calls and array expressions, Numba ``prange`` loops, and `other
-.. "data-parallel by construction" expressions
-.. <https://numba.pydata.org/numba-doc/latest/user/parallel.html>`_ that Numba is
-.. able to parallelize. Following two examples demonstrate the two ways in which
-.. kernels may be written using numba-dpex.
+    data = dpnp.random.ranf((10000, 3), device="gpu")
+    distance = dpnp.empty(shape=(data.shape[0], data.shape[0]), device="gpu")
+    exec_range = kapi.Range(data.shape[0], data.shape[0])
+    call_kernel(kernel(pairwise_distance_kernel), exec_range, data, distance)
+
+To compile a KAPI function into a data-parallel kernel and run it on a device,
+three things need to be done: allocate the arguments to the function on the
+device where the function is to execute, compile the function by applying a
+numba-dpex decorator, and `launch` or execute the compiled kernel on the device.
+
+Allocating arrays or scalars to be passed to a compiled KAPI function is not
+done directly in numba-dpex. Instead, numba-dpex supports passing in
+tensors/ndarrays created using either the `dpnp`_ NumPy drop-in replacement
+library or the `dpctl`_ SYCl-based Python Array API library. To trigger
+compilation, the ``numba_dpex.kernel`` decorator has to be used, and finally to
+launch a compiled kernel the ``numba_dpex.call_kernel`` function should be
+invoked.
+
+For a more detailed description about programming with numba-dpex, refer
+the :doc:`programming_model`, :doc:`user_guide/index` and the
+:doc:`autoapi/index` sections of the documentation. To setup numba-dpex and try
+it out refer the :doc:`getting_started` section.
diff --git a/docs/source/programming_model.rst b/docs/source/programming_model.rst
index 168120fc9d..42e972c6b0 100644
--- a/docs/source/programming_model.rst
+++ b/docs/source/programming_model.rst
@@ -227,10 +227,8 @@ three keyword arguments present in all `array creation functions
 For example, consider
 
 .. code:: python
-   # Use usm_type = 'device' to get USM-device allocation (default),
-   #     usm_type = 'shared' to get USM-shared allocation,
-   #     usm_type = 'host'   to get USM-host allocation
-   # def dpt.empty(..., device=None, usm_type=None, sycl_queue=None) -> dpctl.tensor.usm_ndarray: ...
+
+    # TODO
 
 The keyword ``device`` is `mandated by the array API
 <https://data-apis.org/array-api/latest/design_topics/device_support.html#syntax-for-device-assignment>`__.
diff --git a/docs/source/user_guide/config.rst b/docs/source/user_guide/config.rst
index a9c188d660..e98b85521c 100644
--- a/docs/source/user_guide/config.rst
+++ b/docs/source/user_guide/config.rst
@@ -3,7 +3,10 @@
 Configuration Options for ``numba-dpex``
 ========================================
 
-``numba-dpex`` provides a set of environment variables and flags for configuring different aspects of the compilation, debugging and execution of programs. The configuration flags of ``numba-dpex`` are mostly inherited from those of Numba*. They are defined in :file:`numba_dpex/core/config.py`.
+``numba-dpex`` provides a set of environment variables and flags for configuring
+different aspects of the compilation, debugging and execution of programs. The
+configuration flags of ``numba-dpex`` are mostly inherited from those of Numba*.
+They are defined in :file:`numba_dpex/core/config.py`.
 
 .. note::
     In order to enable/disable each of the configuration flags, a ``NUMBA_DPEX``
diff --git a/docs/source/user_guide/dpnp_offload.rst b/docs/source/user_guide/dpnp_offload.rst
deleted file mode 100644
index d04183df20..0000000000
--- a/docs/source/user_guide/dpnp_offload.rst
+++ /dev/null
@@ -1,219 +0,0 @@
-.. include:: ./../ext_links.txt
-
-Compiling and Offloading ``dpnp`` statements
-============================================
-
-Data Parallel Extension for NumPy* (``dpnp``) is a drop-in ``NumPy*``
-replacement library built on top of oneMKL and SYCL. ``numba-dpex`` allows
-various ``dpnp`` library function calls to be JIT-compiled using the
-``numba_dpex.dpjit`` decorator. Presently, ``numba-dpex`` can compile several
-``dpnp`` array constructors (``ones``, ``zeros``, ``full``, ``empty``), most
-universal functions, ``prange`` loops, and vector expressions using
-``dpnp.ndarray`` objects.
-
-An example of a supported usage of ``dpnp`` statements in ``numba-dpex`` is
-provided in the following code snippet:
-
-
-.. ``numba-dpex`` implements its own runtime library to support offloading ``dpnp``
-.. library functions to SYCL devices. For each ``dpnp`` function signature to be
-.. offloaded, ``numba-dpex`` implements the corresponding direct SYCL function call
-.. in the runtime and the function call is inlined in the generated LLVM IR.
-
-.. code-block:: python
-
-    import dpnp
-    from numba_dpex import dpjit
-
-
-    @dpjit
-    def foo():
-        a = dpnp.ones(1024, device="gpu")
-        return dpnp.sqrt(a)
-
-
-    a = foo()
-    print(a)
-    print(type(a))
-
-.. :samp:`dpnp.ones(10)` will be called through |ol_dpnp_ones(...)|_.
-
-
-.. Design
-.. -------
-
-.. ``numba_dpex`` uses the |numba.extending.overload| decorator to create a Numba*
-.. implementation of a function that can be used in `nopython mode`_ functions.
-.. This is done through translation of ``dpnp`` function signature so that they can
-.. be called in ``numba_dpex.dpjit`` decorated code.
-
-.. The specific SYCL operation for a certain ``dpnp`` function is performed by the
-.. runtime interface. During compiling a function decorated with the ``@dpjit``
-.. decorator, ``numba-dpex`` generates the corresponding SYCL function call through
-.. its runtime library and injects it into the LLVM IR through
-.. |numba.extending.intrinsic|_. The ``@intrinsic`` decorator is used for marking a
-.. ``dpnp`` function as typing and implementing the function in nopython mode using
-.. the `llvmlite IRBuilder API`_. This is an escape hatch to build custom LLVM IR
-.. that will be inlined into the caller.
-
-.. The code injection logic to enable ``dpnp`` functions calls in the Numba IR is
-.. implemented by :mod:`numba_dpex.core.dpnp_iface.arrayobj` module which replaces
-.. Numba*'s :mod:`numba.np.arrayobj`. Each ``dpnp`` function signature is provided
-.. with a concrete implementation to generates the actual code using Numba's
-.. ``overload`` function API. e.g.:
-
-.. .. code-block:: python
-
-..     @overload(dpnp.ones, prefer_literal=True)
-..     def ol_dpnp_ones(
-..         shape, dtype=None, order="C", device=None, usm_type="device", sycl_queue=None
-..     ):
-..         ...
-
-.. The corresponding intrinsic implementation is in :file:`numba_dpex/dpnp_iface/_intrinsic.py`.
-
-.. .. code-block:: python
-
-..    @intrinsic
-..    def impl_dpnp_ones(
-..        ty_context,
-..        ty_shape,
-..        ty_dtype,
-..        ty_order,
-..        ty_device,
-..        ty_usm_type,
-..        ty_sycl_queue,
-..        ty_retty_ref,
-..    ):
-..        ...
-
-Parallel Range
----------------
-
-``numba-dpex`` supports using the ``numba.prange`` statements with
-``dpnp.ndarray`` objects. All such ``prange`` loops are offloaded as kernels and
-executed on a device inferred using the compute follows data programming model.
-The next examples shows using a ``prange`` loop.
-
-.. implements the ability to run loops in parallel, the language
-.. construct is adapted from Numba*'s ``prange`` concept that was initially
-.. designed to run OpenMP parallel for loops. In Numba*, the loop-body is scheduled
-.. in seperate threads, and they execute in a ``nopython`` Numba* context.
-.. ``prange`` automatically takes care of data privatization. ``numba-dpex``
-.. employs the ``prange`` compilation mechanism to offload parallel loop like
-.. programming constructs onto SYCL enabled devices.
-
-.. The ``prange`` compilation pass is delegated through Numba's
-.. :file:`numba/parfor/parfor_lowering.py` module where ``numba-dpex`` provides
-.. :file:`numba_dpex/core/parfors/parfor_lowerer.py` module to be used as the
-.. *lowering* mechanism through
-.. :py:class:`numba_dpex.core.parfors.parfor_lowerer.ParforLowerImpl` class. This
-.. provides a custom lowerer for ``prange`` nodes that generates a SYCL kernel for
-.. a ``prange`` node and submits it to a queue. Here is an example of a ``prange``
-.. use case in ``@dpjit`` context:
-
-.. code-block:: python
-
-    import dpnp
-    from numba_dpex import dpjit, prange
-
-
-    @dpjit
-    def foo():
-        x = dpnp.ones(1024, device="gpu")
-        o = dpnp.empty_like(a)
-        for i in prange(x.shape[0]):
-            o[i] = x[i] * x[i]
-        return o
-
-
-    c = foo()
-    print(c)
-    print(type(c))
-
-.. Each ``prange`` instruction in Numba* has an optional *lowerer* attribute. The
-.. lowerer attribute determines how the parfor instruction should be lowered to
-.. LLVM IR. In addition, the lower attribute decides which ``prange`` instructions
-.. can be fused together. At this point ``numba-dpex`` does not generate
-.. device-specific code and the lowerer used is same for all device types. However,
-.. a different :py:class:`numba_dpex.core.parfors.parfor_lowerer.ParforLowerImpl`
-.. instance is returned for every ``prange`` instruction for each corresponding CFD
-.. (Compute Follows Data) inferred device to prevent illegal ``prange`` fusion.
-
-``prange`` loop statements can also be used to write reduction loops as
-demonstrated by the following naive pairwise distance computation.
-
-.. code-block:: python
-
-  from numba_dpex import dpjit, prange
-  import dpnp
-  import dpctl
-
-
-  @dpjit
-  def pairwise_distance(X1, X2, D):
-      """Naïve pairwise distance impl - take an array representing M points in N
-      dimensions, and return the M x M matrix of Euclidean distances
-
-      Args:
-          X1 : Set of points
-          X2 : Set of points
-          D  : Outputted distance matrix
-      """
-      # Size of inputs
-      X1_rows = X1.shape[0]
-      X2_rows = X2.shape[0]
-      X1_cols = X1.shape[1]
-
-      float0 = X1.dtype.type(0.0)
-
-      # Outermost parallel loop over the matrix X1
-      for i in prange(X1_rows):
-          # Loop over the matrix X2
-          for j in range(X2_rows):
-              d = float0
-              # Compute exclidean distance
-              for k in range(X1_cols):
-                  tmp = X1[i, k] - X2[j, k]
-                  d += tmp * tmp
-              # Write computed distance to distance matrix
-              D[i, j] = dpnp.sqrt(d)
-
-
-  q = dpctl.SyclQueue()
-  X1 = dpnp.ones((10, 2), sycl_queue=q)
-  X2 = dpnp.zeros((10, 2), sycl_queue=q)
-  D = dpnp.empty((10, 2), sycl_queue=q)
-
-  pairwise_distance(X1, X2, D)
-  print(D)
-
-
-.. Fusion of Kernels
-.. ------------------
-
-.. ``numba-dpex`` can identify each NumPy* (or ``dpnp``) array expression as a
-.. data-parallel kernel and fuse them together to generate a single SYCL kernel.
-.. The kernel is automatically offloaded to the specified device where the fusion
-.. operation is invoked. Here is a simple example of a Black-Scholes formula
-.. computation where kernel fusion occurs at different ``dpnp`` math functions:
-
-.. .. literalinclude:: ./../../../numba_dpex/examples/blacksholes_njit.py
-..    :language: python
-..    :pyobject: blackscholes
-..    :caption: **EXAMPLE:** Data parallel kernel implementing the vector sum a+b
-..    :name: blackscholes_dpjit
-
-
-.. .. |numba.extending.overload| replace:: ``numba.extending.overload``
-.. .. |numba.extending.intrinsic| replace:: ``numba.extending.intrinsic``
-.. .. |ol_dpnp_ones(...)| replace:: ``ol_dpnp_ones(...)``
-.. .. |numba.np.arrayobj| replace:: ``numba.np.arrayobj``
-
-.. .. _low-level API: https://github.com/IntelPython/dpnp/tree/master/dpnp/backend
-.. .. _`ol_dpnp_ones(...)`: https://github.com/IntelPython/numba-dpex/blob/main/numba_dpex/dpnp_iface/arrayobj.py#L358
-.. .. _`numba.extending.overload`: https://numba.pydata.org/numba-doc/latest/extending/high-level.html#implementing-functions
-.. .. _`numba.extending.intrinsic`: https://numba.pydata.org/numba-doc/latest/extending/high-level.html#implementing-intrinsics
-.. .. _nopython mode: https://numba.pydata.org/numba-doc/latest/glossary.html#term-nopython-mode
-.. .. _`numba.np.arrayobj`: https://github.com/numba/numba/blob/main/numba/np/arrayobj.py
-.. .. _`llvmlite IRBuilder API`: http://llvmlite.pydata.org/en/latest/user-guide/ir/ir-builder.html
diff --git a/docs/source/user_guide/index.rst b/docs/source/user_guide/index.rst
index 1cf7381bac..46efb768de 100644
--- a/docs/source/user_guide/index.rst
+++ b/docs/source/user_guide/index.rst
@@ -8,6 +8,5 @@ Tutorials
     :maxdepth: 2
 
     kernel_programming/index
-    dpnp_offload
     debugging/index
     config
diff --git a/docs/source/user_guide/kernel_programming/atomic-operations.rst b/docs/source/user_guide/kernel_programming/atomic-operations.rst
deleted file mode 100644
index 07c2be4795..0000000000
--- a/docs/source/user_guide/kernel_programming/atomic-operations.rst
+++ /dev/null
@@ -1,27 +0,0 @@
-Supported Atomic Operations
-===========================
-
-Numba-dpex supports some of the atomic operations supported in DPC++.
-Those that are presently implemented are as follows:
-
-.. automodule:: numba_dpex.ocl.stubs
-   :members: atomic
-   :noindex:
-
-Example
--------
-
-Example usage of atomic operations
-
-.. literalinclude:: ./../../../../numba_dpex/examples/kernel/atomic_op.py
-   :pyobject: main
-
-.. note::
-
-    The ``numba_dpex.atomic.add`` function is analogous to The
-    ``numba.cuda.atomic.add`` provided by the ``numba.cuda`` backend.
-
-Full examples
--------------
-
-- :file:`numba_dpex/examples/atomic_op.py`
diff --git a/docs/source/user_guide/kernel_programming/index.rst b/docs/source/user_guide/kernel_programming/index.rst
index 0008eccc6c..b40d5845fe 100644
--- a/docs/source/user_guide/kernel_programming/index.rst
+++ b/docs/source/user_guide/kernel_programming/index.rst
@@ -1,53 +1,106 @@
 .. _index:
 .. include:: ./../../ext_links.txt
 
-Kernel Programming Basics
-=========================
-
-`Data Parallel Extensions for Python*`_ introduce a concept of an *offload kernel*, defined as
-a part of a Python program being submitted for execution to the device queue.
-
-.. image:: ./../../../asset/images/kernel-queue-device.png
-    :scale: 50%
-    :align: center
-    :alt: Offload Kernel
-
-There are multiple ways how to write offload kernels. CUDA*, OpenCl*, and SYCL* offer similar programming model
-known as the *data parallel kernel programming*. In this model you express the work in terms of *work items*.
-You split data into small pieces, and each piece will be a unit of work, or a *work item*. The total number of
-work items is called *global size*. You can also group work items into bigger chunks called *work groups*.
-The number of work items in the work group is called the *local size*.
-
-.. image:: ./../../../asset/images/kernel_prog_model.png
-    :scale: 50%
-    :align: center
-    :alt: Offload Kernel
-
-In this example there are 48 *work items* (8 in dimension 0, and 6 in dimension 1), that is the *global size* is 48.
-Work items are grouped in *work groups* with the *local size* 8 (4 in dimension 0, and 2 in dimension 1). There are
-total 48/8 = 6 work groups.
-
-In the *data parallel kernel programming* model you write a function that processes a given work item.
-Such a function is called the *data parallel kernel*.
-
-**Data Parallel Extension for Numba** offers a way to write data parallel kernels directly using Python using
-``numba_dpex.kernel``. It bears similarities with ``numba.cuda`` and ``numba.roc``, but unlike these proprietary
-programming models ``numba_dpex`` is built on top of `SYCL*`_ , which is hardware agnostic, meaning
-that with ``numba_dpex.kernel`` programming model you will be able to write a portable code targeting different
-hardware vendors.
-
-.. note::
-   The current version of ``numba-dpex`` supports Intel SYCL devices only
-
-.. toctree::
-   :caption: This document will cover the following chapters:
-   :maxdepth: 2
-
-   writing_kernels
-   synchronization
-   device-functions
-   atomic-operations
-   memory_allocation_address_space
-   reduction
-   ufunc
-   supported-python-features
+Kernel Programming
+==================
+
+The tutorial covers the most important features of the KAPI kernel programming
+API and introduces the concepts needed to express data-parallel kernels in
+numba-dpex.
+
+
+Preliminary concepts
+--------------------
+
+Data parallelism
+++++++++++++++++
+
+Single Program Multiple Data
+++++++++++++++++++++++++++++
+
+Range v/s Nd-Range Kernels
+++++++++++++++++++++++++++
+
+Work items and Work groups
+++++++++++++++++++++++++++
+
+Basic concepts
+--------------
+
+
+Writing a *range* kernel
+++++++++++++++++++++++++
+
+A *range* kernel represents the simplest form of parallelism that can be
+expressed in KAPI. A range kernel represents a data-parallel execution of the
+same function by a set of work items. In KAPI, an instance of the
+:py:class:`numba_dpex.kernel_api.Range` class represents the set of work items
+and each work item in the ``Range`` is represented by an instance of the
+:py:class:`numba_dpex.kernel_api.Item` class. As such these two classes are
+essential to writing a range kernel in KAPI.
+
+.. literalinclude:: ./../../../../numba_dpex/examples/kernel/vector_sum.py
+   :language: python
+   :lines: 8-9, 11-15
+   :caption: **EXAMPLE:** A KAPI range kernel
+   :name: ex_kernel_declaration_vector_sum
+
+:ref:`ex_kernel_declaration_vector_sum` shows an example of a range kernel.
+Every range kernel requires its first argument to be an ``Item`` and
+needs to be launched via :py:func:`numba_dpex.experimental.launcher.call_kernel`
+by passing an instance a ``Range`` object.
+
+Do note that a ``Range`` object only controls the creation of work items, the
+distribution of work and data over a ``Range`` still needs to be defined by the
+user-written function. In the example, each work item access a single element of
+each of the three array and performs a single addition operation. It is possible
+to write the kernel differently so that each work item accesses multiple data
+elements or conditionally performs different amount of work. The data access
+patterns in a work item can have performance implications and programmers should
+refer a more topical material such as the `oneAPI GPU optimization guide`_ to
+learn more.
+
+A range kernel is meant to express a basic `parallel-for` calculation that is
+ideally suited for embarrassingly parallel kernels such as elementwise
+computations over ndarrays. The API for expressing a range kernel does not
+allow advanced features such as synchronization of work items and fine-grained
+control over memory allocation on a device.
+
+Writing an *nd-range* kernel
+++++++++++++++++++++++++++++
+
+The ``device_func`` decorator
++++++++++++++++++++++++++++++
+
+Supported mathematical operations
++++++++++++++++++++++++++++++++++
+
+Supported Python operators
+++++++++++++++++++++++++++
+
+Supported kernel arguments
+++++++++++++++++++++++++++
+
+Launching a kernel
+++++++++++++++++++
+
+Advanced topics
+---------------
+
+Local memory allocation
++++++++++++++++++++++++
+
+Private memory allocation
++++++++++++++++++++++++++
+
+Group barrier synchronization
++++++++++++++++++++++++++++++
+
+Atomic operations
++++++++++++++++++
+
+Async kernel execution
+++++++++++++++++++++++
+
+Specializing a kernel or a device_func
+++++++++++++++++++++++++++++++++++++++
diff --git a/docs/source/user_guide/kernel_programming/memory_allocation_address_space.rst b/docs/source/user_guide/kernel_programming/memory_allocation_address_space.rst
deleted file mode 100644
index 7fc02e6a58..0000000000
--- a/docs/source/user_guide/kernel_programming/memory_allocation_address_space.rst
+++ /dev/null
@@ -1,36 +0,0 @@
-Supported Address Space Qualifiers
-==================================
-
-The address space qualifier may be used to specify the region of memory that is
-used to allocate the object.
-
-Numba-dpex supports three disjoint named address spaces:
-
-1. Global Address Space
-    Global Address Space refers to memory objects allocated from the global
-    memory pool and will be shared among all work-items. Arguments passed to any
-    kernel are allocated in the global address space. In the below example,
-    arguments `a`, `b` and `c` will be allocated in the global address space:
-
-    .. literalinclude:: ./../../../../numba_dpex/examples/kernel/vector_sum.py
-
-
-2. Local Address Space
-    Local Address Space refers to memory objects that need to be allocated in
-    local memory pool and are shared by all work-items of a work-group.
-    Numba-dpex does not support passing arguments that are allocated in the
-    local address space to `@numba_dpex.kernel`. Users are allowed to allocate
-    static arrays in the local address space inside the `@numba_dpex.kernel`. In
-    the example below `numba_dpex.local.array(shape, dtype)` is the API used to
-    allocate a static array in the local address space:
-
-    .. literalinclude:: ./../../../../numba_dpex/examples/barrier.py
-      :lines: 54-87
-
-3. Private Address Space
-    Private Address Space refers to memory objects that are local to each
-    work-item and is not shared with any other work-item. In the example below
-    `numba_dpex.private.array(shape, dtype)` is the API used to allocate a
-    static array in the private address space:
-
-    .. literalinclude:: ./../../../../numba_dpex/examples/kernel/kernel_private_memory.py
diff --git a/docs/source/user_guide/kernel_programming/random.rst b/docs/source/user_guide/kernel_programming/random.rst
deleted file mode 100644
index e2aef1d14b..0000000000
--- a/docs/source/user_guide/kernel_programming/random.rst
+++ /dev/null
@@ -1,58 +0,0 @@
-Random Number Generation
-========================
-
-Numba-dpex does not provide a random number generation algorithm that can be
-executed on the SYCL-supported Device.
-
-Numba-dpex provides access to NumPy random algorithms that can be executed on the
-SYCL-supported device via integration with `dpnp Random`_.
-
-.. _`dpnp Random`: https://intelpython.github.io/dpnp/reference/comparison.html#random-sampling
-
-
-Supported functions
--------------------
-
-Simple random
-`````````````
-
-- `random <https://numpy.org/doc/1.16/reference/generated/numpy.random.random.html#numpy.random.random>`_
-- `sample <https://numpy.org/doc/1.16/reference/generated/numpy.random.sample.html#numpy.random.sample>`_
-- `ranf <https://numpy.org/doc/1.16/reference/generated/numpy.random.ranf.html#numpy.random.ranf>`_
-- `random_sample <https://numpy.org/doc/1.16/reference/generated/numpy.random.random_sample.html#numpy.random.random_sample>`_
-- `rand <https://numpy.org/doc/1.16/reference/generated/numpy.random.rand.html#numpy.random.rand>`_
-- `randint <https://numpy.org/doc/1.16/reference/generated/numpy.random.randint.html#numpy.random.randint>`_
-- `random_integers <https://numpy.org/doc/1.16/reference/generated/numpy.random.random_integers.html#numpy.random.random_integers>`_
-
-Distribution
-````````````
-
-- `beta <https://numpy.org/doc/1.16/reference/generated/numpy.random.beta.html#numpy.random.beta>`_
-- `binomial <https://numpy.org/doc/1.16/reference/generated/numpy.random.binomial.html#numpy.random.binomial>`_
-- `chisquare <https://numpy.org/doc/1.16/reference/generated/numpy.random.chisquare.html#numpy.random.chisquare>`_
-- `exponential <https://numpy.org/doc/1.16/reference/generated/numpy.random.exponential.html#numpy.random.exponential>`_
-- `gamma <https://numpy.org/doc/1.16/reference/generated/numpy.random.gamma.html#numpy.random.gamma>`_
-- `geometric <https://numpy.org/doc/1.16/reference/generated/numpy.random.geometric.html#numpy.random.geometric>`_
-- `gumbel <https://numpy.org/doc/1.16/reference/generated/numpy.random.gumbel.html#numpy.random.gumbel>`_
-- `hypergeometric <https://numpy.org/doc/1.16/reference/generated/numpy.random.hypergeometric.html#numpy.random.hypergeometric>`_
-- `laplace <https://numpy.org/doc/1.16/reference/generated/numpy.random.laplace.html#numpy.random.laplace>`_
-- `lognormal <https://numpy.org/doc/1.16/reference/generated/numpy.random.lognormal.html#numpy.random.lognormal>`_
-- `multinomial <https://numpy.org/doc/1.16/reference/generated/numpy.random.multinomial.html#numpy.random.multinomial>`_
-- `multivariate_normal <https://numpy.org/doc/1.16/reference/generated/numpy.random.multivariate_normal.html#numpy.random.multivariate_normal>`_
-- `negative_binomial <https://numpy.org/doc/1.16/reference/generated/numpy.random.negative_binomial.html#numpy.random.negative_binomial>`_
-- `normal <https://numpy.org/doc/1.16/reference/generated/numpy.random.normal.html#numpy.random.normal>`_
-- `poisson <https://numpy.org/doc/1.16/reference/generated/numpy.random.poisson.html#numpy.random.poisson>`_
-- `rayleigh <https://numpy.org/doc/1.16/reference/generated/numpy.random.rayleigh.html#numpy.random.rayleigh>`_
-- `standard_cauchy <https://numpy.org/doc/1.16/reference/generated/numpy.random.standard_cauchy.html#numpy.random.standard_cauchy>`_
-- `standard_exponential <https://numpy.org/doc/1.16/reference/generated/numpy.random.standard_exponential.html#numpy.random.standard_exponential>`_
-- `standard_gamma <https://numpy.org/doc/1.16/reference/generated/numpy.random.standard_gamma.html#numpy.random.standard_gamma>`_
-- `standard_normal <https://numpy.org/doc/1.16/reference/generated/numpy.random.standard_normal.html#numpy.random.standard_normal>`_
-- `uniform <https://numpy.org/doc/1.16/reference/generated/numpy.random.uniform.html#numpy.random.uniform>`_
-- `weibull <https://numpy.org/doc/1.16/reference/generated/numpy.random.weibull.html#numpy.random.weibull>`_
-
-Example:
-
-.. note::
-    To ensure the code is executed on GPU set ``DEBUG=1`` (or ``NUMBA_DPEX_DEBUG=1``) and look to stdout
-
-.. literalinclude:: ./../../../../numba_dpex/examples/rand.py
diff --git a/docs/source/user_guide/kernel_programming/ufunc.rst b/docs/source/user_guide/kernel_programming/ufunc.rst
deleted file mode 100644
index eb1b47d99b..0000000000
--- a/docs/source/user_guide/kernel_programming/ufunc.rst
+++ /dev/null
@@ -1,52 +0,0 @@
-Universal Functions
-===================
-
-Numba provides a `set of decorators
-<https://numba.pydata.org/numba-doc/latest/user/vectorize.html>`_ to create
-`NumPy universal functions
-<http://docs.scipy.org/doc/numpy/reference/ufuncs.html>`_-like routines that are
-JIT compiled. Although, a close analog to NumPy universal functions Numba's
-``@vectorize`` are not fully compatible with a regular NumPy ufunc.. Refer
-`Creating NumPy universal functions`_ for details.
-
-
-Numba-dpex only supports ``numba.vectorize`` decorator and not yet the
-``numba.guvectorize`` decorator. Another present limitation is that
-numba-dpex ufunc kernels cannot invoke a ``numba_dpex.kernel`` function.
-.. Ongoing work is in progress to address these limitations.
-
-Example 1: Basic Usage
-----------------------
-
-Full example can be found at :file:`numba_dpex/examples/vectorize.py`.
-
-.. literalinclude:: ./../../../../numba_dpex/examples/vectorize.py
-   :pyobject: ufunc_kernel
-
-.. literalinclude:: ./../../../../numba_dpex/examples/vectorize.py
-   :pyobject: test_njit
-
-Example 2: Calling ``numba.vectorize`` inside a ``numba_dpex.kernel``
----------------------------------------------------------------------
-
-Full example can be found at :file:`numba_dpex/examples/blacksholes_njit.py`.
-
-.. literalinclude:: ./../../../../numba_dpex/examples/blacksholes_njit.py
-   :pyobject: cndf2
-
-.. note::
-
-    ``numba.cuda`` requires ``target='cuda'`` parameter for ``numba.vectorize``
-    and ``numba.guvectorize`` functions. Numba-dpex eschews the ``target``
-    parameter for ``@vectorize`` and infers the target from the
-    ``dpctl.device_context`` in which the ``numba.vectorize`` function is
-    called.
-
-Full Examples
--------------
-
-- :file:`numba_dpex/examples/vectorize.py`
-- :file:`numba_dpex/examples/blacksholes_njit.py`
-
-.. _`Universal functions (ufunc)`: http://docs.scipy.org/doc/numpy/reference/ufuncs.html
-.. _`Creating NumPy universal functions`: https://numba.pydata.org/numba-doc/latest/user/vectorize.html
diff --git a/docs/source/user_guide/kernel_programming/writing_kernels.rst b/docs/source/user_guide/kernel_programming/writing_kernels.rst
deleted file mode 100644
index 5989314aeb..0000000000
--- a/docs/source/user_guide/kernel_programming/writing_kernels.rst
+++ /dev/null
@@ -1,97 +0,0 @@
-.. include:: ./../../ext_links.txt
-
-Writing Data Parallel Kernels
-=============================
-
-Kernel Declaration
-------------------
-A kernel function is a device function that is meant to be called from host
-code, where a device can be any SYCL supported device such as a GPU, CPU, or an
-FPGA. The main characteristics of a kernel function are:
-
-- **Scalars must be passed as an array**. Kernels operate with ``dpnp`` array
-  arguments only. If your want a scalar argument, then represent it as
-  0-dimensional ``dpnp`` array.
-
-.. note::
-   Please refer to `Data Parallel Extension for Numpy*`_ to learn more about ``dpnp``.
-
-- **Kernels cannot explicitly return a value**. All result data must be written
-  to ``dpnp`` array passed as a function's argument.
-
-Here is an example of a kernel that computes sum of two vectors ``a`` and ``b``.
-Arguments are two input vectors ``a`` and ``b`` and one output vector ``c`` for
-storing the result of vector summation:
-
-.. literalinclude:: ./../../../../numba_dpex/examples/kernel/vector_sum.py
-   :language: python
-   :lines: 8-9, 11-15
-   :caption: **EXAMPLE:** Data parallel kernel implementing the vector sum a+b
-   :name: ex_kernel_declaration_vector_sum
-
-
-Kernel Invocation
-------------------
-
-The kernel launch parameter syntax for specifying global and local sizes are
-similar to ``SYCL``'s ``range`` and ``ndrange`` classes. The global and local
-sizes need to be specified with ``numba_dpex``'s ``Range`` and ``NdRange``
-classes.
-
-For example, below is a following kernel that computes a sum of two vectors:
-
-.. literalinclude:: ./../../../../numba_dpex/examples/kernel/vector_sum.py
-   :language: python
-   :lines: 8-9, 11-15
-   :caption: **EXAMPLE:** A vector sum kernel
-   :name: vector_sum_kernel
-
-If the ``global size`` parameter is needed to run, it could be like this (where
-``global_size`` is an ``int``):
-
-.. literalinclude:: ./../../../../numba_dpex/examples/kernel/vector_sum.py
-   :language: python
-   :lines: 8-9, 18-24
-   :emphasize-lines: 5
-   :caption: **EXAMPLE:** A vector sum kernel with a global size/range
-   :name: vector_sum_kernel_with_launch_param
-
-If both local and global ranges are needed, they can be specified using two
-instances of ``Range`` inside an ``NdRange`` object. For example, below is a
-kernel to compute pair-wise Euclidean distances of n-dimensional data points:
-
-.. literalinclude:: ./../../../../numba_dpex/examples/kernel/pairwise_distance.py
-   :language: python
-   :lines: 14-15, 36-51
-   :caption: **EXAMPLE:** A kernel to compute pair-wise Euclidean distances
-   :name: pairwise_distance_kernel
-
-Now the local and global sizes can be specified as follows (here both ``args.n``
-and ``args.l`` are ``int``):
-
-.. literalinclude:: ./../../../../numba_dpex/examples/kernel/pairwise_distance.py
-   :language: python
-   :lines: 14-15, 27-31, 54-67
-   :emphasize-lines: 4,6,13
-   :caption: **EXAMPLE:** A kernel to compute pair-wise Euclidean distances with
-               a global and a local size/range
-   :name: pairwise_distance_kernel_with_launch_param
-
-
-Kernel Indexing Functions
--------------------------
-
-In *data parallel kernel programming* all work items are enumerated and accessed
-by their index. You will use ``numba_dpex.get_global_id()`` function to get the
-index of a current work item from the kernel. The total number of work items can
-be determined by calling ``numba_dpex.get_global_size()`` function.
-
-The work group size can be determined by calling ``numba_dpex.get_local_size()``
-function. Work items in the current work group are accessed by calling
-``numba_dpex.get_local_id()``.
-
-The total number of work groups are determined by calling
-``numba_dpex.get_num_groups()`` function. The current work group index is
-obtained by calling ``numba_dpex.get_group_id()`` function.
-
-.. _Black Scholes: https://en.wikipedia.org/wiki/Black%E2%80%93Scholes_model
diff --git a/numba_dpex/core/kernel_interface/ranges_overloads.py b/numba_dpex/core/kernel_interface/ranges_overloads.py
index f0ed884c8b..696b438d4a 100644
--- a/numba_dpex/core/kernel_interface/ranges_overloads.py
+++ b/numba_dpex/core/kernel_interface/ranges_overloads.py
@@ -33,14 +33,14 @@ def codegen(context, builder, sig, args):
             range_struct.dim1 = dim1
         else:
             range_struct.dim1 = llvmir.Constant(
-                llvmir.types.IntType(64), Range.UNDEFINED_DIMENSION
+                llvmir.types.IntType(64), Range._UNDEFINED_DIMENSION
             )
 
         if not isinstance(sig.args[2], types.NoneType):
             range_struct.dim2 = dim2
         else:
             range_struct.dim2 = llvmir.Constant(
-                llvmir.types.IntType(64), Range.UNDEFINED_DIMENSION
+                llvmir.types.IntType(64), Range._UNDEFINED_DIMENSION
             )
 
         range_struct.ndim = llvmir.Constant(llvmir.types.IntType(64), typ.ndim)
diff --git a/numba_dpex/experimental/launcher.py b/numba_dpex/experimental/launcher.py
index b9c017d395..82809a4c9e 100644
--- a/numba_dpex/experimental/launcher.py
+++ b/numba_dpex/experimental/launcher.py
@@ -220,18 +220,19 @@ def codegen(
 
 @dpjit
 def call_kernel(kernel_fn, index_space, *kernel_args) -> None:
-    """Calls a numba_dpex.kernel decorated function from CPython or from another
-    dpjit function. Kernel execution happens in synchronous way, so the thread
-    will be blocked till the kernel done execution.
+    """Compiles and synchronously executes a kernel function.
+
+    Kernel execution happens in synchronous way, so the main thread will be
+    blocked till the kernel done execution.
 
     Args:
         kernel_fn (numba_dpex.experimental.KernelDispatcher): A
-        numba_dpex.kernel decorated function that is compiled to a
-        KernelDispatcher by numba_dpex.
-        index_space (Range | NdRange): A numba_dpex.Range or numba_dpex.NdRange
-        type object that specifies the index space for the kernel.
+            :func:`numba_dpex.experimental.kernel` decorated function that is
+            compiled to a ``KernelDispatcher``.
+        index_space (Range | NdRange): A Range or NdRange type object that
+            specifies the index space for the kernel.
         kernel_args : List of objects that are passed to the numba_dpex.kernel
-        decorated function.
+            decorated function.
     """
     _submit_kernel_sync(  # pylint: disable=E1120
         kernel_fn,
@@ -247,25 +248,32 @@ def call_kernel_async(
     dependent_events: list[dpctl.SyclEvent],
     *kernel_args
 ) -> tuple[dpctl.SyclEvent, dpctl.SyclEvent]:
-    """Calls a numba_dpex.kernel decorated function from CPython or from another
-    dpjit function. Kernel execution happens in asynchronous way, so the thread
-    will not be blocked till the kernel done execution. That means that it is
-    user responsibility to properly use any memory used by kernel until the
-    kernel execution is completed.
+    """Compiles and asynchronously executes a kernel function.
+
+    Calls a :func:`numba_dpex.experimental.kernel` decorated function
+    asynchronously from CPython or from a :func:`numba_dpex.dpjit` function. As
+    the kernel execution happens asynchronously, so the main thread will not be
+    blocked till the kernel done execution. Instead the function returns back to
+    caller a handle for an *event* to track kernel execution. It is a user's
+    responsibility to properly track kernel execution completion and not use any
+    data that may still be used by the kernel prior to the kernel's completion.
 
     Args:
-        kernel_fn (numba_dpex.experimental.KernelDispatcher): A
-        numba_dpex.kernel decorated function that is compiled to a
-        KernelDispatcher by numba_dpex.
-        index_space (Range | NdRange): A numba_dpex.Range or numba_dpex.NdRange
-        type object that specifies the index space for the kernel.
+        kernel_fn (KernelDispatcher): A
+            :func:`numba_dpex.experimental.kernel` decorated function that is
+            compiled to a ``KernelDispatcher``.
+        index_space (Range | NdRange): A Range or NdRange type object that
+            specifies the index space for the kernel.
         kernel_args : List of objects that are passed to the numba_dpex.kernel
-        decorated function.
+            decorated function.
 
     Returns:
-        pair of host event and device event. Host event represent host task
-        that releases use of any kernel argument so it can be deallocated.
-        This task may be executed only after device task is done.
+        A pair of ``dpctl.SyclEvent`` objects. The pair of events constitute of
+        a host task and an event associated with the kernel execution. The event
+        associated with the kernel execution indicates the execution status of
+        the submitted kernel function. The host task manages the lifetime of any
+        PyObject passed in as a kernel argument and automatically decrements the
+        reference count of the object on kernel execution completion.
     """
     return _submit_kernel_async(  # pylint: disable=E1120
         kernel_fn,
diff --git a/numba_dpex/kernel_api/atomic_ref.py b/numba_dpex/kernel_api/atomic_ref.py
index 9759cecf95..c8ab205890 100644
--- a/numba_dpex/kernel_api/atomic_ref.py
+++ b/numba_dpex/kernel_api/atomic_ref.py
@@ -10,9 +10,10 @@
 
 
 class AtomicRef:
-    """Analogue to the ``sycl::atomic_ref`` type. An atomic reference is a
-    view into a data container that can be then updated atomically using any of
-    the ``fetch_*`` member functions of the class.
+    """Analogue to the ``sycl::atomic_ref`` class.
+
+    An atomic reference is a view into a data container that can be then updated
+    atomically using any of the ``fetch_*`` member functions of the class.
     """
 
     def __init__(  # pylint: disable=too-many-arguments
@@ -66,7 +67,6 @@ def fetch_add(self, val):
             val : Value to be added to the object referenced by the AtomicRef.
 
         Returns: The original value of the object referenced by the AtomicRef.
-
         """
         old = self._ref[self._index].copy()
         self._ref[self._index] += val
@@ -79,10 +79,9 @@ def fetch_sub(self, val):
 
         Args:
             val : Value to be subtracted from the object referenced by the
-            AtomicRef.
+                AtomicRef.
 
         Returns: The original value of the object referenced by the AtomicRef.
-
         """
         old = self._ref[self._index].copy()
         self._ref[self._index] -= val
@@ -95,10 +94,9 @@ def fetch_min(self, val):
 
         Args:
             val : Value to be compared against the object referenced by the
-            AtomicRef.
+                AtomicRef.
 
         Returns: The original value of the object referenced by the AtomicRef.
-
         """
         old = self._ref[self._index].copy()
         self._ref[self._index] = min(old, val)
@@ -111,10 +109,9 @@ def fetch_max(self, val):
 
         Args:
             val : Value to be compared against the object referenced by the
-            AtomicRef.
+                AtomicRef.
 
         Returns: The original value of the object referenced by the AtomicRef.
-
         """
         old = self._ref[self._index].copy()
         self._ref[self._index] = max(old, val)
@@ -127,10 +124,9 @@ def fetch_and(self, val):
 
         Args:
             val : Value to be bitwise ANDed against the object referenced by
-            the AtomicRef.
+                the AtomicRef.
 
         Returns: The original value of the object referenced by the AtomicRef.
-
         """
         old = self._ref[self._index].copy()
         self._ref[self._index] &= val
@@ -143,10 +139,9 @@ def fetch_or(self, val):
 
         Args:
             val : Value to be bitwise ORed against the object referenced by
-            the AtomicRef.
+                the AtomicRef.
 
         Returns: The original value of the object referenced by the AtomicRef.
-
         """
         old = self._ref[self._index].copy()
         self._ref[self._index] |= val
@@ -159,7 +154,7 @@ def fetch_xor(self, val):
 
         Args:
             val : Value to be bitwise XORed against the object referenced by
-            the AtomicRef.
+                the AtomicRef.
 
         Returns: The original value of the object referenced by the AtomicRef.
 
@@ -172,7 +167,6 @@ def load(self):
         """Loads the value of the object referenced by the AtomicRef.
 
         Returns: The value of the object referenced by the AtomicRef.
-
         """
         return self._ref[self._index]
 
@@ -180,22 +174,20 @@ def store(self, val):
         """Stores operand ``val`` to the object referenced by the AtomicRef.
 
         Args:
-            val : Value to be stored in the object referenced by
-            the AtomicRef.
-
+            val : Value to be stored in the object referenced by the AtomicRef.
         """
         self._ref[self._index] = val
 
     def exchange(self, val):
         """Replaces the value of the object referenced by the AtomicRef
-        with value of ``val``. Returns the original value of the referenced object.
+        with value of ``val``. Returns the original value of the referenced
+        object.
 
         Args:
             val : Value to be exchanged against the object referenced by
-            the AtomicRef.
+                the AtomicRef.
 
         Returns: The original value of the object referenced by the AtomicRef.
-
         """
         old = self._ref[self._index].copy()
         self._ref[self._index] = val
@@ -203,23 +195,21 @@ def exchange(self, val):
 
     def compare_exchange(self, expected, desired, expected_idx=0):
         """Compares the value of the object referenced by the AtomicRef
-        against the value of ``expected[expected_idx]``.
-        If the values are equal, replaces the value of the
-        referenced object with the value of ``desired``.
-        Otherwise assigns the original value of the
-        referenced object to ``expected[expected_idx]``.
+        against the value of ``expected[expected_idx]``. If the values are
+        equal, replaces the value of the referenced object with the value of
+        ``desired``. Otherwise assigns the original value of the referenced
+        object to ``expected[expected_idx]``.
 
         Args:
-            expected : Array containing the expected value of the
-            object referenced by the AtomicRef.
-            desired : Value that replaces the value of the object
-            referenced by the AtomicRef.
+            expected : Array containing the expected value of the object
+                referenced by the AtomicRef.
+            desired : Value that replaces the value of the object referenced by
+                the AtomicRef.
             expected_idx: Offset in `expected` array where the expected
             value of the object referenced by the AtomicRef is present.
 
-        Returns: Returns ``True`` if the comparison operation and
-        replacement operation were successful.
-
+        Returns: ``True`` if the comparison operation and replacement operation
+            were successful.
         """
         if self._ref[self._index] == expected[expected_idx]:
             self._ref[self._index] = desired
diff --git a/numba_dpex/kernel_api/barrier.py b/numba_dpex/kernel_api/barrier.py
index 744262403d..3fd78fe80e 100644
--- a/numba_dpex/kernel_api/barrier.py
+++ b/numba_dpex/kernel_api/barrier.py
@@ -2,7 +2,7 @@
 #
 # SPDX-License-Identifier: Apache-2.0
 
-"""Python functions that simulate SYCL's barrier primitives.
+"""Python functions that simulate SYCL's group_barrier function.
 """
 
 from .index_space_ids import Group
@@ -14,13 +14,13 @@ def group_barrier(group: Group, fence_scope=MemoryScope.WORK_GROUP):
 
     The function is modeled after the ``sycl::group_barrier`` function. It
     synchronizes work within a group of work items. All the work-items
-    of the group must execute the barrier construct before any work-item
+    of the group must execute the barrier call before any work-item
     continues execution beyond the barrier.
 
     The ``group_barrier`` performs mem-fence operations ensuring that memory
     accesses issued before the barrier are not re-ordered with those issued
-    after the barrier: all work-items in group g execute a release fence prior
-    to synchronizing at the barrier, all work-items in group g execute an
+    after the barrier: all work-items in group G execute a release fence prior
+    to synchronizing at the barrier, all work-items in group G execute an
     acquire fence afterwards, and there is an implicit synchronization of these
     fences as if provided by an explicit atomic operation on an atomic object.
 
diff --git a/numba_dpex/kernel_api/index_space_ids.py b/numba_dpex/kernel_api/index_space_ids.py
index fb2018dd20..e53b81f58f 100644
--- a/numba_dpex/kernel_api/index_space_ids.py
+++ b/numba_dpex/kernel_api/index_space_ids.py
@@ -119,8 +119,10 @@ def leader(self, work_item_id):
 
 
 class Item:
-    """Analogue to the ``sycl::item`` type. Identifies an instance of the
-    function object executing at each point in an Range.
+    """Analogue to the ``sycl::item`` class.
+
+    Identifies an instance of the function object executing at each point in an
+    :class:`.Range`.
     """
 
     def __init__(self, extent: Range, index: list):
@@ -170,7 +172,7 @@ def get_range(self, idx):
 
     @property
     def dimensions(self) -> int:
-        """Returns the rank of a Item object.
+        """Returns the number of dimensions of a Item object.
 
         Returns:
             int: Number of dimensions in the Item object
@@ -179,8 +181,10 @@ def dimensions(self) -> int:
 
 
 class NdItem:
-    """Analogue to the ``sycl::nd_item`` type. Identifies an instance of the
-    function object executing at each point in an NdRange.
+    """Analogue to the ``sycl::nd_item`` class.
+
+    Identifies an instance of the function object executing at each point in an
+    :class:`.NdRange`.
     """
 
     # TODO: define group type
@@ -201,8 +205,7 @@ def get_global_id(self, idx):
         return self._global_item.get_id(idx)
 
     def get_global_linear_id(self):
-        """Get the global linear id associated with this item for all
-        dimensions.
+        """Get the linearized global id for the item for all dimensions.
 
         Returns:
             int: The global linear id.
diff --git a/numba_dpex/kernel_api/memory_enums.py b/numba_dpex/kernel_api/memory_enums.py
index ef738daf2b..5631e88191 100644
--- a/numba_dpex/kernel_api/memory_enums.py
+++ b/numba_dpex/kernel_api/memory_enums.py
@@ -11,8 +11,9 @@
 
 class MemoryOrder(FlagEnum):
     """
-    An enumeration of the supported ``sycl::memory_order`` values in dpcpp. The
-    integer values of the enums is kept consistent with the corresponding
+    An enumeration of the supported ``sycl::memory_order`` values.
+
+    The integer values of the enums is kept consistent with the corresponding
     implementation in dpcpp.
 
     =====================   ============
@@ -37,8 +38,9 @@ class MemoryOrder(FlagEnum):
 
 class MemoryScope(FlagEnum):
     """
-    An enumeration of SYCL memory scope. For more details please refer to
-    SYCL 2020 specification, section 3.8.3.2
+    An enumeration of the supported ``sycl::memory_scope`` values.
+
+    For more details please refer to SYCL 2020 specification, section 3.8.3.2
 
     ===============  ============
     Memory Scope     Enum value
@@ -59,7 +61,7 @@ class MemoryScope(FlagEnum):
 
 
 class AddressSpace(FlagEnum):
-    """The address space values supported by numba_dpex.
+    """An enumeration of the supported address space values.
 
     ==================   ============
     Address space        Value
diff --git a/numba_dpex/kernel_api/ranges.py b/numba_dpex/kernel_api/ranges.py
index 85cea812af..e4a131544a 100644
--- a/numba_dpex/kernel_api/ranges.py
+++ b/numba_dpex/kernel_api/ranges.py
@@ -27,7 +27,7 @@ class Range(tuple):
     the behavior of `sycl::range`.
     """
 
-    UNDEFINED_DIMENSION = -1
+    _UNDEFINED_DIMENSION = -1
 
     def __new__(cls, dim0, dim1=None, dim2=None):
         """Constructs a 1, 2, or 3 dimensional range.
@@ -114,7 +114,7 @@ def dim1(self) -> int:
         try:
             return self[1]
         except IndexError:
-            return Range.UNDEFINED_DIMENSION
+            return Range._UNDEFINED_DIMENSION
 
     @property
     def dim2(self) -> int:
@@ -127,7 +127,7 @@ def dim2(self) -> int:
         try:
             return self[2]
         except IndexError:
-            return Range.UNDEFINED_DIMENSION
+            return Range._UNDEFINED_DIMENSION
 
 
 class NdRange: