diff --git a/docs/_templates/autoapi/index.rst b/docs/_templates/autoapi/index.rst index 1af3e006b2..6f37ea6912 100644 --- a/docs/_templates/autoapi/index.rst +++ b/docs/_templates/autoapi/index.rst @@ -4,9 +4,11 @@ API Reference This page contains auto-generated API reference documentation [#f1]_. .. toctree:: - :maxdepth: 2 + :maxdepth: 1 numba_dpex/kernel_api/index + numba_dpex/experimental/decorators/index + numba_dpex/experimental/launcher/index {% for page in pages %} {% if page.top_level_object and page.display %} diff --git a/docs/source/conf.py b/docs/source/conf.py index 72a445165c..6b053075c3 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -27,7 +27,6 @@ "sphinx.ext.extlinks", "sphinx.ext.githubpages", "sphinx.ext.napoleon", - "sphinx.ext.autosectionlabel", "sphinxcontrib.programoutput", "sphinxcontrib.googleanalytics", "myst_parser", @@ -114,19 +113,10 @@ # -- Auto API configurations --------------------------------------------------- - -# def skip_util_classes(app, what, name, obj, skip, options): -# if what == "module" and "experimental" in name: -# if what == "module" and "decorators" not in name: -# skip = True -# return skip - - -# def setup(sphinx): -# sphinx.connect("autoapi-skip-member", skip_util_classes) - - -autoapi_dirs = ["../../numba_dpex/kernel_api"] +autoapi_dirs = [ + "../../numba_dpex/kernel_api", + "../../numba_dpex/experimental", +] autoapi_type = "python" autoapi_template_dir = "_templates/autoapi" @@ -160,3 +150,14 @@ def prepare_jinja_env(jinja_env) -> None: autoapi_prepare_jinja_env = prepare_jinja_env + + +def skip_member(app, what, name, obj, skip, options): + # skip submodules + if what == "module": + skip = True + return skip + + +def setup(sphinx): + sphinx.connect("autoapi-skip-member", skip_member) diff --git a/docs/source/experimental/index.rst b/docs/source/experimental/index.rst new file mode 100644 index 0000000000..0737f2d4d2 --- /dev/null +++ b/docs/source/experimental/index.rst @@ -0,0 +1,126 @@ +.. _index: +.. include:: ./../ext_links.txt + +Experimental Features +===================== + +Numba-dpex includes various experimental features that are not yet suitable for +everyday production usage, but are included as an engineering preview. +The most prominent experimental features currently included in numba-dpex are +listed in this section. + + +Compiling and Offloading ``dpnp`` statements +-------------------------------------------- + +Data Parallel Extension for NumPy* (`dpnp`_) is a drop-in NumPy* replacement +library built using the oneAPI software stack including `oneMKL`_, `oneDPL`_ and +`SYCL*`_. numba-dpex has experimental support for compiling a subset of dpnp +functions. The feature is enabled by the :py:func:`numba_dpex.dpjit` decorator. + +An example of a supported usage of dpnp in numba-dpex is provided in the +following code snippet: + +.. code-block:: python + + import dpnp + from numba_dpex import dpjit + + + @dpjit + def foo(): + a = dpnp.ones(1024, device="gpu") + return dpnp.sqrt(a) + + + a = foo() + print(a) + print(type(a)) + + +Offloading ``prange`` loops +--------------------------- + +numba-dpex supports using the ``numba.prange`` statements with +``dpnp.ndarray`` objects. All such ``prange`` loops are offloaded as kernels and +executed on a device inferred using the compute follows data programming model. +The next examples shows using a ``prange`` loop. + + +.. code-block:: python + + import dpnp + from numba_dpex import dpjit, prange + + + @dpjit + def foo(): + x = dpnp.ones(1024, device="gpu") + o = dpnp.empty_like(a) + for i in prange(x.shape[0]): + o[i] = x[i] * x[i] + return o + + + c = foo() + print(c) + print(type(c)) + + +``prange`` loop statements can also be used to write reduction loops as +demonstrated by the following naive pairwise distance computation. + +.. code-block:: python + + from numba_dpex import dpjit, prange + import dpnp + import dpctl + + + @dpjit + def pairwise_distance(X1, X2, D): + """Naïve pairwise distance impl - take an array representing M points in N + dimensions, and return the M x M matrix of Euclidean distances + + Args: + X1 : Set of points + X2 : Set of points + D : Outputted distance matrix + """ + # Size of inputs + X1_rows = X1.shape[0] + X2_rows = X2.shape[0] + X1_cols = X1.shape[1] + + float0 = X1.dtype.type(0.0) + + # Outermost parallel loop over the matrix X1 + for i in prange(X1_rows): + # Loop over the matrix X2 + for j in range(X2_rows): + d = float0 + # Compute exclidean distance + for k in range(X1_cols): + tmp = X1[i, k] - X2[j, k] + d += tmp * tmp + # Write computed distance to distance matrix + D[i, j] = dpnp.sqrt(d) + + + q = dpctl.SyclQueue() + X1 = dpnp.ones((10, 2), sycl_queue=q) + X2 = dpnp.zeros((10, 2), sycl_queue=q) + D = dpnp.empty((10, 2), sycl_queue=q) + + pairwise_distance(X1, X2, D) + print(D) + + +Kernel fusion +------------- + +.. ``numba-dpex`` can identify each NumPy* (or ``dpnp``) array expression as a +.. data-parallel kernel and fuse them together to generate a single SYCL kernel. +.. The kernel is automatically offloaded to the specified device where the fusion +.. operation is invoked. Here is a simple example of a Black-Scholes formula +.. computation where kernel fusion occurs at different ``dpnp`` math functions: diff --git a/docs/source/ext_links.txt b/docs/source/ext_links.txt index 13a02c5f0d..ee01b1a83c 100644 --- a/docs/source/ext_links.txt +++ b/docs/source/ext_links.txt @@ -25,3 +25,6 @@ .. _Intel VTune Profiler: https://www.intel.com/content/www/us/en/developer/tools/oneapi/vtune-profiler.html .. _Intel Advisor: https://www.intel.com/content/www/us/en/developer/tools/oneapi/advisor.html .. _oneMKL: https://www.intel.com/content/www/us/en/docs/oneapi/programming-guide/2023-2/intel-oneapi-math-kernel-library-onemkl.html +.. _oneDPL: https://www.intel.com/content/www/us/en/developer/tools/oneapi/dpc-library.html#gs.5izf63 +.. _UXL: https://uxlfoundation.org/ +.. _oneAPI GPU optimization guide: https://www.intel.com/content/www/us/en/docs/oneapi/optimization-guide-gpu/2024-0/general-purpose-computing-on-gpu.html diff --git a/docs/source/glossary.rst b/docs/source/glossary.rst deleted file mode 100644 index 5c3414c7f8..0000000000 --- a/docs/source/glossary.rst +++ /dev/null @@ -1,5 +0,0 @@ -.. _glossary: -.. include:: ./ext_links.txt - -Glossary -======== diff --git a/docs/source/index.rst b/docs/source/index.rst index 1bf6f0ab62..8a540cac8f 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -41,26 +41,26 @@ Data Parallel Extension for Numba* .. module:: numba_dpex .. toctree:: - :maxdepth: 2 + :maxdepth: 1 overview getting_started programming_model user_guide/index autoapi/index + experimental/index useful_links .. toctree:: - :maxdepth: 2 + :maxdepth: 1 :caption: Development contribution_guide .. toctree:: - :maxdepth: 2 + :maxdepth: 1 :caption: Misc Notes examples - glossary license release-notes diff --git a/docs/source/overview.rst b/docs/source/overview.rst index 25ad9c139b..6db2ceabaa 100644 --- a/docs/source/overview.rst +++ b/docs/source/overview.rst @@ -4,147 +4,86 @@ Overview ======== -Data Parallel Extension for Numba* (`numba-dpex`_) is an extension to -the `Numba*`_ Python JIT compiler adding an architecture-agnostic kernel -programming API, and a new front-end to compile the Data Parallel Extension -for Numpy* (`dpnp`_) library. The ``dpnp`` Python library is a data-parallel -implementation of `NumPy*`_'s API using the `SYCL*`_ language. - -.. ``numba-dpex``'s support for ``dpnp`` compilation is a new way for Numba* users -.. to write code in a NumPy-like API that is already supported by Numba*, while at -.. the same time automatically running such code parallelly on various types of -.. architecture. - -``numba-dpex`` is an open-source project and can be installed as part of `Intel -AI Analytics Toolkit`_ or the `Intel Distribution for Python*`_. The package is -also available on Anaconda cloud and as a Docker image on GitHub. Please refer -the :doc:`getting_started` page to learn more. - -Main Features -------------- - -Portable Kernel Programming -~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -The ``numba-dpex`` kernel programming API has a design similar to Numba's -``cuda.jit`` sub-module. The API is modeled after the `SYCL*`_ language and uses -the `DPC++`_ SYCL runtime. Currently, compilation of kernels is supported for -SPIR-V-based OpenCL and `oneAPI Level Zero`_ devices CPU and GPU devices. In the -future, compilation support for other types of hardware that are supported by -DPC++ will be added. - -The following example illustrates a vector addition kernel written with -``numba-dpex`` kernel API. +Data Parallel Extension for Numba* (`numba-dpex`_) is a free and open-source +LLVM-based code generator for portable accelerator programming in Python. The +code generator implements a new pseudo-kernel programming domain-specific +language (DSL) called `KAPI` that is modeled after the C++ DSL `SYCL*`_. The +SYCL language is an open standard developed under the Unified Acceleration +Foundation (`UXL`_) as a vendor-agnostic way of programming different types of +data-parallel hardware such as multi-core CPUs, GPUs, and FPGAs. Numba-dpex and +KAPI aim to bring the same vendor-agnostic and standard-compliant programming +model to Python. + +Numba-dpex is built on top of the open-source `Numba*`_ JIT compiler that +implements a CPython bytecode parser and code generator to lower the bytecode to +LLVM IR. The Numba* compiler is able to compile a large sub-set of Python and +most of the NumPy library. Numba-dpex uses Numba*'s tooling to implement the +parsing and typing support for the data types and functions defined in the KAPI +DSL. A custom code generator is then used to lower KAPI to a form of LLVM IR +that includes special LLVM instructions that define a low-level data-parallel +kernel API. Thus, a function defined in KAPI is compiled to a data-parallel +kernel that can run on different types of hardware. Currently, compilation of +KAPI is possible for x86 CPU devices, Intel Gen9 integrated GPUs, Intel UHD +integrated GPUs, and Intel discrete GPUs. + + +The following example shows a pairwise distance matrix computation in KAPI. .. code-block:: python - import dpnp - import numba_dpex as dpex - - - @dpex.kernel - def vecadd_kernel(a, b, c): - i = dpex.get_global_id(0) - c[i] = a[i] + b[i] - - - a = dpnp.ones(1024, device="gpu") - b = dpnp.ones(1024, device="gpu") - c = dpnp.empty_like(a) - - vecadd_kernel[dpex.Range(1024)](a, b, c) - print(c) - -In the above example, three arrays are allocated on a default ``gpu`` device -using the ``dpnp`` library. The arrays are then passed as input arguments to the -kernel function. The compilation target and the subsequent execution of the -kernel is determined by the input arguments and follow the -"compute-follows-data" programming model as specified in the `Python* Array API -Standard`_. To change the execution target to a CPU, the device keyword needs to -be changed to ``cpu`` when allocating the ``dpnp`` arrays. It is also possible -to leave the ``device`` keyword undefined and let the ``dpnp`` library select a -default device based on environment flag settings. Refer the -:doc:`user_guide/kernel_programming/index` for further details. - -``dpjit`` decorator -~~~~~~~~~~~~~~~~~~~ - -The ``numba-dpex`` package provides a new decorator ``dpjit`` that extends -Numba's ``njit`` decorator. The new decorator is equivalent to -``numba.njit(parallel=True)``, but additionally supports compiling ``dpnp`` -functions, ``prange`` loops, and array expressions that use ``dpnp.ndarray`` -objects. - -Unlike Numba's NumPy parallelization that only supports CPUs, ``dpnp`` -expressions are first converted to data-parallel kernels and can then be -`offloaded` to different types of devices. As ``dpnp`` implements the same API -as NumPy*, an existing ``numba.njit`` decorated function that uses -``numpy.ndarray`` may be refactored to use ``dpnp.ndarray`` and decorated with -``dpjit``. Such a refactoring can allow the parallel regions to be offloaded -to a supported GPU device, providing users an additional option to execute their -code parallelly. - -The vector addition example depicted using the kernel API can also be -expressed in several different ways using ``dpjit``. - -.. code-block:: python - - import dpnp - import numba_dpex as dpex - - - @dpex.dpjit - def vecadd_v1(a, b): - return a + b - + from numba_dpex import kernel_api as kapi + import math - @dpex.dpjit - def vecadd_v2(a, b): - return dpnp.add(a, b) + def pairwise_distance_kernel(item: kapi.Item, data, distance): + i = item.get_id(0) + j = item.get_id(1) - @dpex.dpjit - def vecadd_v3(a, b): - c = dpnp.empty_like(a) - for i in prange(a.shape[0]): - c[i] = a[i] + b[i] - return c + data_dims = data.shape[1] -As with the kernel API example, a ``dpjit`` function if invoked with ``dpnp`` -input arguments follows the compute-follows-data programming model. Refer -:doc:`user_manual/dpnp_offload/index` for further details. + d = data.dtype.type(0.0) + for k in range(data_dims): + tmp = data[i, k] - data[j, k] + d += tmp * tmp + distance[j, i] = math.sqrt(d) -.. Project Goal -.. ------------ -.. If C++ is not your language, you can skip writing data-parallel kernels in SYCL -.. and directly write them in Python. +Skipping over much of the language details, at a high-level the +``pairwise_distance_kernel`` can be viewed as a data-parallel function that gets +executed individually by a set of "work items". That is, each work item runs the +same function for a subset of the elements of the input ``data`` and +``distance`` arrays. For programmers familiar with the CUDA or OpenCL languages, +it is the same programming model that is referred to as Single Program Multiple +Data (SPMD). As Python has no concept of a work item the KAPI function itself is +sequential and needs to be compiled to convert it into a parallel version. The +next example shows the changes to the original script to compile and run the +``pairwise_distance_kernel`` in parallel. -.. Our package ``numba-dpex`` extends the Numba compiler to allow kernel creation -.. directly in Python via a custom compute API - - -.. Contributing -.. ------------ - -.. Refer the `contributing guide -.. `_ for -.. information on coding style and standards used in ``numba-dpex``. - -.. License -.. ------- - -.. ``numba-dpex`` is Licensed under Apache License 2.0 that can be found in `LICENSE -.. `_. All usage and -.. contributions to the project are subject to the terms and conditions of this -.. license. +.. code-block:: python + from numba_dpex import kernel, call_kernel + import dpnp -.. Along with the kernel programming API an auto-offload feature is also provided. -.. The feature enables automatic generation of kernels from data-parallel NumPy -.. library calls and array expressions, Numba ``prange`` loops, and `other -.. "data-parallel by construction" expressions -.. `_ that Numba is -.. able to parallelize. Following two examples demonstrate the two ways in which -.. kernels may be written using numba-dpex. + data = dpnp.random.ranf((10000, 3), device="gpu") + distance = dpnp.empty(shape=(data.shape[0], data.shape[0]), device="gpu") + exec_range = kapi.Range(data.shape[0], data.shape[0]) + call_kernel(kernel(pairwise_distance_kernel), exec_range, data, distance) + +To compile a KAPI function into a data-parallel kernel and run it on a device, +three things need to be done: allocate the arguments to the function on the +device where the function is to execute, compile the function by applying a +numba-dpex decorator, and `launch` or execute the compiled kernel on the device. + +Allocating arrays or scalars to be passed to a compiled KAPI function is not +done directly in numba-dpex. Instead, numba-dpex supports passing in +tensors/ndarrays created using either the `dpnp`_ NumPy drop-in replacement +library or the `dpctl`_ SYCl-based Python Array API library. To trigger +compilation, the ``numba_dpex.kernel`` decorator has to be used, and finally to +launch a compiled kernel the ``numba_dpex.call_kernel`` function should be +invoked. + +For a more detailed description about programming with numba-dpex, refer +the :doc:`programming_model`, :doc:`user_guide/index` and the +:doc:`autoapi/index` sections of the documentation. To setup numba-dpex and try +it out refer the :doc:`getting_started` section. diff --git a/docs/source/programming_model.rst b/docs/source/programming_model.rst index 168120fc9d..42e972c6b0 100644 --- a/docs/source/programming_model.rst +++ b/docs/source/programming_model.rst @@ -227,10 +227,8 @@ three keyword arguments present in all `array creation functions For example, consider .. code:: python - # Use usm_type = 'device' to get USM-device allocation (default), - # usm_type = 'shared' to get USM-shared allocation, - # usm_type = 'host' to get USM-host allocation - # def dpt.empty(..., device=None, usm_type=None, sycl_queue=None) -> dpctl.tensor.usm_ndarray: ... + + # TODO The keyword ``device`` is `mandated by the array API `__. diff --git a/docs/source/user_guide/config.rst b/docs/source/user_guide/config.rst index a9c188d660..e98b85521c 100644 --- a/docs/source/user_guide/config.rst +++ b/docs/source/user_guide/config.rst @@ -3,7 +3,10 @@ Configuration Options for ``numba-dpex`` ======================================== -``numba-dpex`` provides a set of environment variables and flags for configuring different aspects of the compilation, debugging and execution of programs. The configuration flags of ``numba-dpex`` are mostly inherited from those of Numba*. They are defined in :file:`numba_dpex/core/config.py`. +``numba-dpex`` provides a set of environment variables and flags for configuring +different aspects of the compilation, debugging and execution of programs. The +configuration flags of ``numba-dpex`` are mostly inherited from those of Numba*. +They are defined in :file:`numba_dpex/core/config.py`. .. note:: In order to enable/disable each of the configuration flags, a ``NUMBA_DPEX`` diff --git a/docs/source/user_guide/dpnp_offload.rst b/docs/source/user_guide/dpnp_offload.rst deleted file mode 100644 index d04183df20..0000000000 --- a/docs/source/user_guide/dpnp_offload.rst +++ /dev/null @@ -1,219 +0,0 @@ -.. include:: ./../ext_links.txt - -Compiling and Offloading ``dpnp`` statements -============================================ - -Data Parallel Extension for NumPy* (``dpnp``) is a drop-in ``NumPy*`` -replacement library built on top of oneMKL and SYCL. ``numba-dpex`` allows -various ``dpnp`` library function calls to be JIT-compiled using the -``numba_dpex.dpjit`` decorator. Presently, ``numba-dpex`` can compile several -``dpnp`` array constructors (``ones``, ``zeros``, ``full``, ``empty``), most -universal functions, ``prange`` loops, and vector expressions using -``dpnp.ndarray`` objects. - -An example of a supported usage of ``dpnp`` statements in ``numba-dpex`` is -provided in the following code snippet: - - -.. ``numba-dpex`` implements its own runtime library to support offloading ``dpnp`` -.. library functions to SYCL devices. For each ``dpnp`` function signature to be -.. offloaded, ``numba-dpex`` implements the corresponding direct SYCL function call -.. in the runtime and the function call is inlined in the generated LLVM IR. - -.. code-block:: python - - import dpnp - from numba_dpex import dpjit - - - @dpjit - def foo(): - a = dpnp.ones(1024, device="gpu") - return dpnp.sqrt(a) - - - a = foo() - print(a) - print(type(a)) - -.. :samp:`dpnp.ones(10)` will be called through |ol_dpnp_ones(...)|_. - - -.. Design -.. ------- - -.. ``numba_dpex`` uses the |numba.extending.overload| decorator to create a Numba* -.. implementation of a function that can be used in `nopython mode`_ functions. -.. This is done through translation of ``dpnp`` function signature so that they can -.. be called in ``numba_dpex.dpjit`` decorated code. - -.. The specific SYCL operation for a certain ``dpnp`` function is performed by the -.. runtime interface. During compiling a function decorated with the ``@dpjit`` -.. decorator, ``numba-dpex`` generates the corresponding SYCL function call through -.. its runtime library and injects it into the LLVM IR through -.. |numba.extending.intrinsic|_. The ``@intrinsic`` decorator is used for marking a -.. ``dpnp`` function as typing and implementing the function in nopython mode using -.. the `llvmlite IRBuilder API`_. This is an escape hatch to build custom LLVM IR -.. that will be inlined into the caller. - -.. The code injection logic to enable ``dpnp`` functions calls in the Numba IR is -.. implemented by :mod:`numba_dpex.core.dpnp_iface.arrayobj` module which replaces -.. Numba*'s :mod:`numba.np.arrayobj`. Each ``dpnp`` function signature is provided -.. with a concrete implementation to generates the actual code using Numba's -.. ``overload`` function API. e.g.: - -.. .. code-block:: python - -.. @overload(dpnp.ones, prefer_literal=True) -.. def ol_dpnp_ones( -.. shape, dtype=None, order="C", device=None, usm_type="device", sycl_queue=None -.. ): -.. ... - -.. The corresponding intrinsic implementation is in :file:`numba_dpex/dpnp_iface/_intrinsic.py`. - -.. .. code-block:: python - -.. @intrinsic -.. def impl_dpnp_ones( -.. ty_context, -.. ty_shape, -.. ty_dtype, -.. ty_order, -.. ty_device, -.. ty_usm_type, -.. ty_sycl_queue, -.. ty_retty_ref, -.. ): -.. ... - -Parallel Range ---------------- - -``numba-dpex`` supports using the ``numba.prange`` statements with -``dpnp.ndarray`` objects. All such ``prange`` loops are offloaded as kernels and -executed on a device inferred using the compute follows data programming model. -The next examples shows using a ``prange`` loop. - -.. implements the ability to run loops in parallel, the language -.. construct is adapted from Numba*'s ``prange`` concept that was initially -.. designed to run OpenMP parallel for loops. In Numba*, the loop-body is scheduled -.. in seperate threads, and they execute in a ``nopython`` Numba* context. -.. ``prange`` automatically takes care of data privatization. ``numba-dpex`` -.. employs the ``prange`` compilation mechanism to offload parallel loop like -.. programming constructs onto SYCL enabled devices. - -.. The ``prange`` compilation pass is delegated through Numba's -.. :file:`numba/parfor/parfor_lowering.py` module where ``numba-dpex`` provides -.. :file:`numba_dpex/core/parfors/parfor_lowerer.py` module to be used as the -.. *lowering* mechanism through -.. :py:class:`numba_dpex.core.parfors.parfor_lowerer.ParforLowerImpl` class. This -.. provides a custom lowerer for ``prange`` nodes that generates a SYCL kernel for -.. a ``prange`` node and submits it to a queue. Here is an example of a ``prange`` -.. use case in ``@dpjit`` context: - -.. code-block:: python - - import dpnp - from numba_dpex import dpjit, prange - - - @dpjit - def foo(): - x = dpnp.ones(1024, device="gpu") - o = dpnp.empty_like(a) - for i in prange(x.shape[0]): - o[i] = x[i] * x[i] - return o - - - c = foo() - print(c) - print(type(c)) - -.. Each ``prange`` instruction in Numba* has an optional *lowerer* attribute. The -.. lowerer attribute determines how the parfor instruction should be lowered to -.. LLVM IR. In addition, the lower attribute decides which ``prange`` instructions -.. can be fused together. At this point ``numba-dpex`` does not generate -.. device-specific code and the lowerer used is same for all device types. However, -.. a different :py:class:`numba_dpex.core.parfors.parfor_lowerer.ParforLowerImpl` -.. instance is returned for every ``prange`` instruction for each corresponding CFD -.. (Compute Follows Data) inferred device to prevent illegal ``prange`` fusion. - -``prange`` loop statements can also be used to write reduction loops as -demonstrated by the following naive pairwise distance computation. - -.. code-block:: python - - from numba_dpex import dpjit, prange - import dpnp - import dpctl - - - @dpjit - def pairwise_distance(X1, X2, D): - """Naïve pairwise distance impl - take an array representing M points in N - dimensions, and return the M x M matrix of Euclidean distances - - Args: - X1 : Set of points - X2 : Set of points - D : Outputted distance matrix - """ - # Size of inputs - X1_rows = X1.shape[0] - X2_rows = X2.shape[0] - X1_cols = X1.shape[1] - - float0 = X1.dtype.type(0.0) - - # Outermost parallel loop over the matrix X1 - for i in prange(X1_rows): - # Loop over the matrix X2 - for j in range(X2_rows): - d = float0 - # Compute exclidean distance - for k in range(X1_cols): - tmp = X1[i, k] - X2[j, k] - d += tmp * tmp - # Write computed distance to distance matrix - D[i, j] = dpnp.sqrt(d) - - - q = dpctl.SyclQueue() - X1 = dpnp.ones((10, 2), sycl_queue=q) - X2 = dpnp.zeros((10, 2), sycl_queue=q) - D = dpnp.empty((10, 2), sycl_queue=q) - - pairwise_distance(X1, X2, D) - print(D) - - -.. Fusion of Kernels -.. ------------------ - -.. ``numba-dpex`` can identify each NumPy* (or ``dpnp``) array expression as a -.. data-parallel kernel and fuse them together to generate a single SYCL kernel. -.. The kernel is automatically offloaded to the specified device where the fusion -.. operation is invoked. Here is a simple example of a Black-Scholes formula -.. computation where kernel fusion occurs at different ``dpnp`` math functions: - -.. .. literalinclude:: ./../../../numba_dpex/examples/blacksholes_njit.py -.. :language: python -.. :pyobject: blackscholes -.. :caption: **EXAMPLE:** Data parallel kernel implementing the vector sum a+b -.. :name: blackscholes_dpjit - - -.. .. |numba.extending.overload| replace:: ``numba.extending.overload`` -.. .. |numba.extending.intrinsic| replace:: ``numba.extending.intrinsic`` -.. .. |ol_dpnp_ones(...)| replace:: ``ol_dpnp_ones(...)`` -.. .. |numba.np.arrayobj| replace:: ``numba.np.arrayobj`` - -.. .. _low-level API: https://github.com/IntelPython/dpnp/tree/master/dpnp/backend -.. .. _`ol_dpnp_ones(...)`: https://github.com/IntelPython/numba-dpex/blob/main/numba_dpex/dpnp_iface/arrayobj.py#L358 -.. .. _`numba.extending.overload`: https://numba.pydata.org/numba-doc/latest/extending/high-level.html#implementing-functions -.. .. _`numba.extending.intrinsic`: https://numba.pydata.org/numba-doc/latest/extending/high-level.html#implementing-intrinsics -.. .. _nopython mode: https://numba.pydata.org/numba-doc/latest/glossary.html#term-nopython-mode -.. .. _`numba.np.arrayobj`: https://github.com/numba/numba/blob/main/numba/np/arrayobj.py -.. .. _`llvmlite IRBuilder API`: http://llvmlite.pydata.org/en/latest/user-guide/ir/ir-builder.html diff --git a/docs/source/user_guide/index.rst b/docs/source/user_guide/index.rst index 1cf7381bac..46efb768de 100644 --- a/docs/source/user_guide/index.rst +++ b/docs/source/user_guide/index.rst @@ -8,6 +8,5 @@ Tutorials :maxdepth: 2 kernel_programming/index - dpnp_offload debugging/index config diff --git a/docs/source/user_guide/kernel_programming/atomic-operations.rst b/docs/source/user_guide/kernel_programming/atomic-operations.rst deleted file mode 100644 index 07c2be4795..0000000000 --- a/docs/source/user_guide/kernel_programming/atomic-operations.rst +++ /dev/null @@ -1,27 +0,0 @@ -Supported Atomic Operations -=========================== - -Numba-dpex supports some of the atomic operations supported in DPC++. -Those that are presently implemented are as follows: - -.. automodule:: numba_dpex.ocl.stubs - :members: atomic - :noindex: - -Example -------- - -Example usage of atomic operations - -.. literalinclude:: ./../../../../numba_dpex/examples/kernel/atomic_op.py - :pyobject: main - -.. note:: - - The ``numba_dpex.atomic.add`` function is analogous to The - ``numba.cuda.atomic.add`` provided by the ``numba.cuda`` backend. - -Full examples -------------- - -- :file:`numba_dpex/examples/atomic_op.py` diff --git a/docs/source/user_guide/kernel_programming/index.rst b/docs/source/user_guide/kernel_programming/index.rst index 0008eccc6c..b40d5845fe 100644 --- a/docs/source/user_guide/kernel_programming/index.rst +++ b/docs/source/user_guide/kernel_programming/index.rst @@ -1,53 +1,106 @@ .. _index: .. include:: ./../../ext_links.txt -Kernel Programming Basics -========================= - -`Data Parallel Extensions for Python*`_ introduce a concept of an *offload kernel*, defined as -a part of a Python program being submitted for execution to the device queue. - -.. image:: ./../../../asset/images/kernel-queue-device.png - :scale: 50% - :align: center - :alt: Offload Kernel - -There are multiple ways how to write offload kernels. CUDA*, OpenCl*, and SYCL* offer similar programming model -known as the *data parallel kernel programming*. In this model you express the work in terms of *work items*. -You split data into small pieces, and each piece will be a unit of work, or a *work item*. The total number of -work items is called *global size*. You can also group work items into bigger chunks called *work groups*. -The number of work items in the work group is called the *local size*. - -.. image:: ./../../../asset/images/kernel_prog_model.png - :scale: 50% - :align: center - :alt: Offload Kernel - -In this example there are 48 *work items* (8 in dimension 0, and 6 in dimension 1), that is the *global size* is 48. -Work items are grouped in *work groups* with the *local size* 8 (4 in dimension 0, and 2 in dimension 1). There are -total 48/8 = 6 work groups. - -In the *data parallel kernel programming* model you write a function that processes a given work item. -Such a function is called the *data parallel kernel*. - -**Data Parallel Extension for Numba** offers a way to write data parallel kernels directly using Python using -``numba_dpex.kernel``. It bears similarities with ``numba.cuda`` and ``numba.roc``, but unlike these proprietary -programming models ``numba_dpex`` is built on top of `SYCL*`_ , which is hardware agnostic, meaning -that with ``numba_dpex.kernel`` programming model you will be able to write a portable code targeting different -hardware vendors. - -.. note:: - The current version of ``numba-dpex`` supports Intel SYCL devices only - -.. toctree:: - :caption: This document will cover the following chapters: - :maxdepth: 2 - - writing_kernels - synchronization - device-functions - atomic-operations - memory_allocation_address_space - reduction - ufunc - supported-python-features +Kernel Programming +================== + +The tutorial covers the most important features of the KAPI kernel programming +API and introduces the concepts needed to express data-parallel kernels in +numba-dpex. + + +Preliminary concepts +-------------------- + +Data parallelism +++++++++++++++++ + +Single Program Multiple Data +++++++++++++++++++++++++++++ + +Range v/s Nd-Range Kernels +++++++++++++++++++++++++++ + +Work items and Work groups +++++++++++++++++++++++++++ + +Basic concepts +-------------- + + +Writing a *range* kernel +++++++++++++++++++++++++ + +A *range* kernel represents the simplest form of parallelism that can be +expressed in KAPI. A range kernel represents a data-parallel execution of the +same function by a set of work items. In KAPI, an instance of the +:py:class:`numba_dpex.kernel_api.Range` class represents the set of work items +and each work item in the ``Range`` is represented by an instance of the +:py:class:`numba_dpex.kernel_api.Item` class. As such these two classes are +essential to writing a range kernel in KAPI. + +.. literalinclude:: ./../../../../numba_dpex/examples/kernel/vector_sum.py + :language: python + :lines: 8-9, 11-15 + :caption: **EXAMPLE:** A KAPI range kernel + :name: ex_kernel_declaration_vector_sum + +:ref:`ex_kernel_declaration_vector_sum` shows an example of a range kernel. +Every range kernel requires its first argument to be an ``Item`` and +needs to be launched via :py:func:`numba_dpex.experimental.launcher.call_kernel` +by passing an instance a ``Range`` object. + +Do note that a ``Range`` object only controls the creation of work items, the +distribution of work and data over a ``Range`` still needs to be defined by the +user-written function. In the example, each work item access a single element of +each of the three array and performs a single addition operation. It is possible +to write the kernel differently so that each work item accesses multiple data +elements or conditionally performs different amount of work. The data access +patterns in a work item can have performance implications and programmers should +refer a more topical material such as the `oneAPI GPU optimization guide`_ to +learn more. + +A range kernel is meant to express a basic `parallel-for` calculation that is +ideally suited for embarrassingly parallel kernels such as elementwise +computations over ndarrays. The API for expressing a range kernel does not +allow advanced features such as synchronization of work items and fine-grained +control over memory allocation on a device. + +Writing an *nd-range* kernel +++++++++++++++++++++++++++++ + +The ``device_func`` decorator ++++++++++++++++++++++++++++++ + +Supported mathematical operations ++++++++++++++++++++++++++++++++++ + +Supported Python operators +++++++++++++++++++++++++++ + +Supported kernel arguments +++++++++++++++++++++++++++ + +Launching a kernel +++++++++++++++++++ + +Advanced topics +--------------- + +Local memory allocation ++++++++++++++++++++++++ + +Private memory allocation ++++++++++++++++++++++++++ + +Group barrier synchronization ++++++++++++++++++++++++++++++ + +Atomic operations ++++++++++++++++++ + +Async kernel execution +++++++++++++++++++++++ + +Specializing a kernel or a device_func +++++++++++++++++++++++++++++++++++++++ diff --git a/docs/source/user_guide/kernel_programming/memory_allocation_address_space.rst b/docs/source/user_guide/kernel_programming/memory_allocation_address_space.rst deleted file mode 100644 index 7fc02e6a58..0000000000 --- a/docs/source/user_guide/kernel_programming/memory_allocation_address_space.rst +++ /dev/null @@ -1,36 +0,0 @@ -Supported Address Space Qualifiers -================================== - -The address space qualifier may be used to specify the region of memory that is -used to allocate the object. - -Numba-dpex supports three disjoint named address spaces: - -1. Global Address Space - Global Address Space refers to memory objects allocated from the global - memory pool and will be shared among all work-items. Arguments passed to any - kernel are allocated in the global address space. In the below example, - arguments `a`, `b` and `c` will be allocated in the global address space: - - .. literalinclude:: ./../../../../numba_dpex/examples/kernel/vector_sum.py - - -2. Local Address Space - Local Address Space refers to memory objects that need to be allocated in - local memory pool and are shared by all work-items of a work-group. - Numba-dpex does not support passing arguments that are allocated in the - local address space to `@numba_dpex.kernel`. Users are allowed to allocate - static arrays in the local address space inside the `@numba_dpex.kernel`. In - the example below `numba_dpex.local.array(shape, dtype)` is the API used to - allocate a static array in the local address space: - - .. literalinclude:: ./../../../../numba_dpex/examples/barrier.py - :lines: 54-87 - -3. Private Address Space - Private Address Space refers to memory objects that are local to each - work-item and is not shared with any other work-item. In the example below - `numba_dpex.private.array(shape, dtype)` is the API used to allocate a - static array in the private address space: - - .. literalinclude:: ./../../../../numba_dpex/examples/kernel/kernel_private_memory.py diff --git a/docs/source/user_guide/kernel_programming/random.rst b/docs/source/user_guide/kernel_programming/random.rst deleted file mode 100644 index e2aef1d14b..0000000000 --- a/docs/source/user_guide/kernel_programming/random.rst +++ /dev/null @@ -1,58 +0,0 @@ -Random Number Generation -======================== - -Numba-dpex does not provide a random number generation algorithm that can be -executed on the SYCL-supported Device. - -Numba-dpex provides access to NumPy random algorithms that can be executed on the -SYCL-supported device via integration with `dpnp Random`_. - -.. _`dpnp Random`: https://intelpython.github.io/dpnp/reference/comparison.html#random-sampling - - -Supported functions -------------------- - -Simple random -````````````` - -- `random `_ -- `sample `_ -- `ranf `_ -- `random_sample `_ -- `rand `_ -- `randint `_ -- `random_integers `_ - -Distribution -```````````` - -- `beta `_ -- `binomial `_ -- `chisquare `_ -- `exponential `_ -- `gamma `_ -- `geometric `_ -- `gumbel `_ -- `hypergeometric `_ -- `laplace `_ -- `lognormal `_ -- `multinomial `_ -- `multivariate_normal `_ -- `negative_binomial `_ -- `normal `_ -- `poisson `_ -- `rayleigh `_ -- `standard_cauchy `_ -- `standard_exponential `_ -- `standard_gamma `_ -- `standard_normal `_ -- `uniform `_ -- `weibull `_ - -Example: - -.. note:: - To ensure the code is executed on GPU set ``DEBUG=1`` (or ``NUMBA_DPEX_DEBUG=1``) and look to stdout - -.. literalinclude:: ./../../../../numba_dpex/examples/rand.py diff --git a/docs/source/user_guide/kernel_programming/ufunc.rst b/docs/source/user_guide/kernel_programming/ufunc.rst deleted file mode 100644 index eb1b47d99b..0000000000 --- a/docs/source/user_guide/kernel_programming/ufunc.rst +++ /dev/null @@ -1,52 +0,0 @@ -Universal Functions -=================== - -Numba provides a `set of decorators -`_ to create -`NumPy universal functions -`_-like routines that are -JIT compiled. Although, a close analog to NumPy universal functions Numba's -``@vectorize`` are not fully compatible with a regular NumPy ufunc.. Refer -`Creating NumPy universal functions`_ for details. - - -Numba-dpex only supports ``numba.vectorize`` decorator and not yet the -``numba.guvectorize`` decorator. Another present limitation is that -numba-dpex ufunc kernels cannot invoke a ``numba_dpex.kernel`` function. -.. Ongoing work is in progress to address these limitations. - -Example 1: Basic Usage ----------------------- - -Full example can be found at :file:`numba_dpex/examples/vectorize.py`. - -.. literalinclude:: ./../../../../numba_dpex/examples/vectorize.py - :pyobject: ufunc_kernel - -.. literalinclude:: ./../../../../numba_dpex/examples/vectorize.py - :pyobject: test_njit - -Example 2: Calling ``numba.vectorize`` inside a ``numba_dpex.kernel`` ---------------------------------------------------------------------- - -Full example can be found at :file:`numba_dpex/examples/blacksholes_njit.py`. - -.. literalinclude:: ./../../../../numba_dpex/examples/blacksholes_njit.py - :pyobject: cndf2 - -.. note:: - - ``numba.cuda`` requires ``target='cuda'`` parameter for ``numba.vectorize`` - and ``numba.guvectorize`` functions. Numba-dpex eschews the ``target`` - parameter for ``@vectorize`` and infers the target from the - ``dpctl.device_context`` in which the ``numba.vectorize`` function is - called. - -Full Examples -------------- - -- :file:`numba_dpex/examples/vectorize.py` -- :file:`numba_dpex/examples/blacksholes_njit.py` - -.. _`Universal functions (ufunc)`: http://docs.scipy.org/doc/numpy/reference/ufuncs.html -.. _`Creating NumPy universal functions`: https://numba.pydata.org/numba-doc/latest/user/vectorize.html diff --git a/docs/source/user_guide/kernel_programming/writing_kernels.rst b/docs/source/user_guide/kernel_programming/writing_kernels.rst deleted file mode 100644 index 5989314aeb..0000000000 --- a/docs/source/user_guide/kernel_programming/writing_kernels.rst +++ /dev/null @@ -1,97 +0,0 @@ -.. include:: ./../../ext_links.txt - -Writing Data Parallel Kernels -============================= - -Kernel Declaration ------------------- -A kernel function is a device function that is meant to be called from host -code, where a device can be any SYCL supported device such as a GPU, CPU, or an -FPGA. The main characteristics of a kernel function are: - -- **Scalars must be passed as an array**. Kernels operate with ``dpnp`` array - arguments only. If your want a scalar argument, then represent it as - 0-dimensional ``dpnp`` array. - -.. note:: - Please refer to `Data Parallel Extension for Numpy*`_ to learn more about ``dpnp``. - -- **Kernels cannot explicitly return a value**. All result data must be written - to ``dpnp`` array passed as a function's argument. - -Here is an example of a kernel that computes sum of two vectors ``a`` and ``b``. -Arguments are two input vectors ``a`` and ``b`` and one output vector ``c`` for -storing the result of vector summation: - -.. literalinclude:: ./../../../../numba_dpex/examples/kernel/vector_sum.py - :language: python - :lines: 8-9, 11-15 - :caption: **EXAMPLE:** Data parallel kernel implementing the vector sum a+b - :name: ex_kernel_declaration_vector_sum - - -Kernel Invocation ------------------- - -The kernel launch parameter syntax for specifying global and local sizes are -similar to ``SYCL``'s ``range`` and ``ndrange`` classes. The global and local -sizes need to be specified with ``numba_dpex``'s ``Range`` and ``NdRange`` -classes. - -For example, below is a following kernel that computes a sum of two vectors: - -.. literalinclude:: ./../../../../numba_dpex/examples/kernel/vector_sum.py - :language: python - :lines: 8-9, 11-15 - :caption: **EXAMPLE:** A vector sum kernel - :name: vector_sum_kernel - -If the ``global size`` parameter is needed to run, it could be like this (where -``global_size`` is an ``int``): - -.. literalinclude:: ./../../../../numba_dpex/examples/kernel/vector_sum.py - :language: python - :lines: 8-9, 18-24 - :emphasize-lines: 5 - :caption: **EXAMPLE:** A vector sum kernel with a global size/range - :name: vector_sum_kernel_with_launch_param - -If both local and global ranges are needed, they can be specified using two -instances of ``Range`` inside an ``NdRange`` object. For example, below is a -kernel to compute pair-wise Euclidean distances of n-dimensional data points: - -.. literalinclude:: ./../../../../numba_dpex/examples/kernel/pairwise_distance.py - :language: python - :lines: 14-15, 36-51 - :caption: **EXAMPLE:** A kernel to compute pair-wise Euclidean distances - :name: pairwise_distance_kernel - -Now the local and global sizes can be specified as follows (here both ``args.n`` -and ``args.l`` are ``int``): - -.. literalinclude:: ./../../../../numba_dpex/examples/kernel/pairwise_distance.py - :language: python - :lines: 14-15, 27-31, 54-67 - :emphasize-lines: 4,6,13 - :caption: **EXAMPLE:** A kernel to compute pair-wise Euclidean distances with - a global and a local size/range - :name: pairwise_distance_kernel_with_launch_param - - -Kernel Indexing Functions -------------------------- - -In *data parallel kernel programming* all work items are enumerated and accessed -by their index. You will use ``numba_dpex.get_global_id()`` function to get the -index of a current work item from the kernel. The total number of work items can -be determined by calling ``numba_dpex.get_global_size()`` function. - -The work group size can be determined by calling ``numba_dpex.get_local_size()`` -function. Work items in the current work group are accessed by calling -``numba_dpex.get_local_id()``. - -The total number of work groups are determined by calling -``numba_dpex.get_num_groups()`` function. The current work group index is -obtained by calling ``numba_dpex.get_group_id()`` function. - -.. _Black Scholes: https://en.wikipedia.org/wiki/Black%E2%80%93Scholes_model diff --git a/numba_dpex/core/kernel_interface/ranges_overloads.py b/numba_dpex/core/kernel_interface/ranges_overloads.py index f0ed884c8b..696b438d4a 100644 --- a/numba_dpex/core/kernel_interface/ranges_overloads.py +++ b/numba_dpex/core/kernel_interface/ranges_overloads.py @@ -33,14 +33,14 @@ def codegen(context, builder, sig, args): range_struct.dim1 = dim1 else: range_struct.dim1 = llvmir.Constant( - llvmir.types.IntType(64), Range.UNDEFINED_DIMENSION + llvmir.types.IntType(64), Range._UNDEFINED_DIMENSION ) if not isinstance(sig.args[2], types.NoneType): range_struct.dim2 = dim2 else: range_struct.dim2 = llvmir.Constant( - llvmir.types.IntType(64), Range.UNDEFINED_DIMENSION + llvmir.types.IntType(64), Range._UNDEFINED_DIMENSION ) range_struct.ndim = llvmir.Constant(llvmir.types.IntType(64), typ.ndim) diff --git a/numba_dpex/experimental/launcher.py b/numba_dpex/experimental/launcher.py index b9c017d395..82809a4c9e 100644 --- a/numba_dpex/experimental/launcher.py +++ b/numba_dpex/experimental/launcher.py @@ -220,18 +220,19 @@ def codegen( @dpjit def call_kernel(kernel_fn, index_space, *kernel_args) -> None: - """Calls a numba_dpex.kernel decorated function from CPython or from another - dpjit function. Kernel execution happens in synchronous way, so the thread - will be blocked till the kernel done execution. + """Compiles and synchronously executes a kernel function. + + Kernel execution happens in synchronous way, so the main thread will be + blocked till the kernel done execution. Args: kernel_fn (numba_dpex.experimental.KernelDispatcher): A - numba_dpex.kernel decorated function that is compiled to a - KernelDispatcher by numba_dpex. - index_space (Range | NdRange): A numba_dpex.Range or numba_dpex.NdRange - type object that specifies the index space for the kernel. + :func:`numba_dpex.experimental.kernel` decorated function that is + compiled to a ``KernelDispatcher``. + index_space (Range | NdRange): A Range or NdRange type object that + specifies the index space for the kernel. kernel_args : List of objects that are passed to the numba_dpex.kernel - decorated function. + decorated function. """ _submit_kernel_sync( # pylint: disable=E1120 kernel_fn, @@ -247,25 +248,32 @@ def call_kernel_async( dependent_events: list[dpctl.SyclEvent], *kernel_args ) -> tuple[dpctl.SyclEvent, dpctl.SyclEvent]: - """Calls a numba_dpex.kernel decorated function from CPython or from another - dpjit function. Kernel execution happens in asynchronous way, so the thread - will not be blocked till the kernel done execution. That means that it is - user responsibility to properly use any memory used by kernel until the - kernel execution is completed. + """Compiles and asynchronously executes a kernel function. + + Calls a :func:`numba_dpex.experimental.kernel` decorated function + asynchronously from CPython or from a :func:`numba_dpex.dpjit` function. As + the kernel execution happens asynchronously, so the main thread will not be + blocked till the kernel done execution. Instead the function returns back to + caller a handle for an *event* to track kernel execution. It is a user's + responsibility to properly track kernel execution completion and not use any + data that may still be used by the kernel prior to the kernel's completion. Args: - kernel_fn (numba_dpex.experimental.KernelDispatcher): A - numba_dpex.kernel decorated function that is compiled to a - KernelDispatcher by numba_dpex. - index_space (Range | NdRange): A numba_dpex.Range or numba_dpex.NdRange - type object that specifies the index space for the kernel. + kernel_fn (KernelDispatcher): A + :func:`numba_dpex.experimental.kernel` decorated function that is + compiled to a ``KernelDispatcher``. + index_space (Range | NdRange): A Range or NdRange type object that + specifies the index space for the kernel. kernel_args : List of objects that are passed to the numba_dpex.kernel - decorated function. + decorated function. Returns: - pair of host event and device event. Host event represent host task - that releases use of any kernel argument so it can be deallocated. - This task may be executed only after device task is done. + A pair of ``dpctl.SyclEvent`` objects. The pair of events constitute of + a host task and an event associated with the kernel execution. The event + associated with the kernel execution indicates the execution status of + the submitted kernel function. The host task manages the lifetime of any + PyObject passed in as a kernel argument and automatically decrements the + reference count of the object on kernel execution completion. """ return _submit_kernel_async( # pylint: disable=E1120 kernel_fn, diff --git a/numba_dpex/kernel_api/atomic_ref.py b/numba_dpex/kernel_api/atomic_ref.py index 9759cecf95..c8ab205890 100644 --- a/numba_dpex/kernel_api/atomic_ref.py +++ b/numba_dpex/kernel_api/atomic_ref.py @@ -10,9 +10,10 @@ class AtomicRef: - """Analogue to the ``sycl::atomic_ref`` type. An atomic reference is a - view into a data container that can be then updated atomically using any of - the ``fetch_*`` member functions of the class. + """Analogue to the ``sycl::atomic_ref`` class. + + An atomic reference is a view into a data container that can be then updated + atomically using any of the ``fetch_*`` member functions of the class. """ def __init__( # pylint: disable=too-many-arguments @@ -66,7 +67,6 @@ def fetch_add(self, val): val : Value to be added to the object referenced by the AtomicRef. Returns: The original value of the object referenced by the AtomicRef. - """ old = self._ref[self._index].copy() self._ref[self._index] += val @@ -79,10 +79,9 @@ def fetch_sub(self, val): Args: val : Value to be subtracted from the object referenced by the - AtomicRef. + AtomicRef. Returns: The original value of the object referenced by the AtomicRef. - """ old = self._ref[self._index].copy() self._ref[self._index] -= val @@ -95,10 +94,9 @@ def fetch_min(self, val): Args: val : Value to be compared against the object referenced by the - AtomicRef. + AtomicRef. Returns: The original value of the object referenced by the AtomicRef. - """ old = self._ref[self._index].copy() self._ref[self._index] = min(old, val) @@ -111,10 +109,9 @@ def fetch_max(self, val): Args: val : Value to be compared against the object referenced by the - AtomicRef. + AtomicRef. Returns: The original value of the object referenced by the AtomicRef. - """ old = self._ref[self._index].copy() self._ref[self._index] = max(old, val) @@ -127,10 +124,9 @@ def fetch_and(self, val): Args: val : Value to be bitwise ANDed against the object referenced by - the AtomicRef. + the AtomicRef. Returns: The original value of the object referenced by the AtomicRef. - """ old = self._ref[self._index].copy() self._ref[self._index] &= val @@ -143,10 +139,9 @@ def fetch_or(self, val): Args: val : Value to be bitwise ORed against the object referenced by - the AtomicRef. + the AtomicRef. Returns: The original value of the object referenced by the AtomicRef. - """ old = self._ref[self._index].copy() self._ref[self._index] |= val @@ -159,7 +154,7 @@ def fetch_xor(self, val): Args: val : Value to be bitwise XORed against the object referenced by - the AtomicRef. + the AtomicRef. Returns: The original value of the object referenced by the AtomicRef. @@ -172,7 +167,6 @@ def load(self): """Loads the value of the object referenced by the AtomicRef. Returns: The value of the object referenced by the AtomicRef. - """ return self._ref[self._index] @@ -180,22 +174,20 @@ def store(self, val): """Stores operand ``val`` to the object referenced by the AtomicRef. Args: - val : Value to be stored in the object referenced by - the AtomicRef. - + val : Value to be stored in the object referenced by the AtomicRef. """ self._ref[self._index] = val def exchange(self, val): """Replaces the value of the object referenced by the AtomicRef - with value of ``val``. Returns the original value of the referenced object. + with value of ``val``. Returns the original value of the referenced + object. Args: val : Value to be exchanged against the object referenced by - the AtomicRef. + the AtomicRef. Returns: The original value of the object referenced by the AtomicRef. - """ old = self._ref[self._index].copy() self._ref[self._index] = val @@ -203,23 +195,21 @@ def exchange(self, val): def compare_exchange(self, expected, desired, expected_idx=0): """Compares the value of the object referenced by the AtomicRef - against the value of ``expected[expected_idx]``. - If the values are equal, replaces the value of the - referenced object with the value of ``desired``. - Otherwise assigns the original value of the - referenced object to ``expected[expected_idx]``. + against the value of ``expected[expected_idx]``. If the values are + equal, replaces the value of the referenced object with the value of + ``desired``. Otherwise assigns the original value of the referenced + object to ``expected[expected_idx]``. Args: - expected : Array containing the expected value of the - object referenced by the AtomicRef. - desired : Value that replaces the value of the object - referenced by the AtomicRef. + expected : Array containing the expected value of the object + referenced by the AtomicRef. + desired : Value that replaces the value of the object referenced by + the AtomicRef. expected_idx: Offset in `expected` array where the expected value of the object referenced by the AtomicRef is present. - Returns: Returns ``True`` if the comparison operation and - replacement operation were successful. - + Returns: ``True`` if the comparison operation and replacement operation + were successful. """ if self._ref[self._index] == expected[expected_idx]: self._ref[self._index] = desired diff --git a/numba_dpex/kernel_api/barrier.py b/numba_dpex/kernel_api/barrier.py index 744262403d..3fd78fe80e 100644 --- a/numba_dpex/kernel_api/barrier.py +++ b/numba_dpex/kernel_api/barrier.py @@ -2,7 +2,7 @@ # # SPDX-License-Identifier: Apache-2.0 -"""Python functions that simulate SYCL's barrier primitives. +"""Python functions that simulate SYCL's group_barrier function. """ from .index_space_ids import Group @@ -14,13 +14,13 @@ def group_barrier(group: Group, fence_scope=MemoryScope.WORK_GROUP): The function is modeled after the ``sycl::group_barrier`` function. It synchronizes work within a group of work items. All the work-items - of the group must execute the barrier construct before any work-item + of the group must execute the barrier call before any work-item continues execution beyond the barrier. The ``group_barrier`` performs mem-fence operations ensuring that memory accesses issued before the barrier are not re-ordered with those issued - after the barrier: all work-items in group g execute a release fence prior - to synchronizing at the barrier, all work-items in group g execute an + after the barrier: all work-items in group G execute a release fence prior + to synchronizing at the barrier, all work-items in group G execute an acquire fence afterwards, and there is an implicit synchronization of these fences as if provided by an explicit atomic operation on an atomic object. diff --git a/numba_dpex/kernel_api/index_space_ids.py b/numba_dpex/kernel_api/index_space_ids.py index fb2018dd20..e53b81f58f 100644 --- a/numba_dpex/kernel_api/index_space_ids.py +++ b/numba_dpex/kernel_api/index_space_ids.py @@ -119,8 +119,10 @@ def leader(self, work_item_id): class Item: - """Analogue to the ``sycl::item`` type. Identifies an instance of the - function object executing at each point in an Range. + """Analogue to the ``sycl::item`` class. + + Identifies an instance of the function object executing at each point in an + :class:`.Range`. """ def __init__(self, extent: Range, index: list): @@ -170,7 +172,7 @@ def get_range(self, idx): @property def dimensions(self) -> int: - """Returns the rank of a Item object. + """Returns the number of dimensions of a Item object. Returns: int: Number of dimensions in the Item object @@ -179,8 +181,10 @@ def dimensions(self) -> int: class NdItem: - """Analogue to the ``sycl::nd_item`` type. Identifies an instance of the - function object executing at each point in an NdRange. + """Analogue to the ``sycl::nd_item`` class. + + Identifies an instance of the function object executing at each point in an + :class:`.NdRange`. """ # TODO: define group type @@ -201,8 +205,7 @@ def get_global_id(self, idx): return self._global_item.get_id(idx) def get_global_linear_id(self): - """Get the global linear id associated with this item for all - dimensions. + """Get the linearized global id for the item for all dimensions. Returns: int: The global linear id. diff --git a/numba_dpex/kernel_api/memory_enums.py b/numba_dpex/kernel_api/memory_enums.py index ef738daf2b..5631e88191 100644 --- a/numba_dpex/kernel_api/memory_enums.py +++ b/numba_dpex/kernel_api/memory_enums.py @@ -11,8 +11,9 @@ class MemoryOrder(FlagEnum): """ - An enumeration of the supported ``sycl::memory_order`` values in dpcpp. The - integer values of the enums is kept consistent with the corresponding + An enumeration of the supported ``sycl::memory_order`` values. + + The integer values of the enums is kept consistent with the corresponding implementation in dpcpp. ===================== ============ @@ -37,8 +38,9 @@ class MemoryOrder(FlagEnum): class MemoryScope(FlagEnum): """ - An enumeration of SYCL memory scope. For more details please refer to - SYCL 2020 specification, section 3.8.3.2 + An enumeration of the supported ``sycl::memory_scope`` values. + + For more details please refer to SYCL 2020 specification, section 3.8.3.2 =============== ============ Memory Scope Enum value @@ -59,7 +61,7 @@ class MemoryScope(FlagEnum): class AddressSpace(FlagEnum): - """The address space values supported by numba_dpex. + """An enumeration of the supported address space values. ================== ============ Address space Value diff --git a/numba_dpex/kernel_api/ranges.py b/numba_dpex/kernel_api/ranges.py index 85cea812af..e4a131544a 100644 --- a/numba_dpex/kernel_api/ranges.py +++ b/numba_dpex/kernel_api/ranges.py @@ -27,7 +27,7 @@ class Range(tuple): the behavior of `sycl::range`. """ - UNDEFINED_DIMENSION = -1 + _UNDEFINED_DIMENSION = -1 def __new__(cls, dim0, dim1=None, dim2=None): """Constructs a 1, 2, or 3 dimensional range. @@ -114,7 +114,7 @@ def dim1(self) -> int: try: return self[1] except IndexError: - return Range.UNDEFINED_DIMENSION + return Range._UNDEFINED_DIMENSION @property def dim2(self) -> int: @@ -127,7 +127,7 @@ def dim2(self) -> int: try: return self[2] except IndexError: - return Range.UNDEFINED_DIMENSION + return Range._UNDEFINED_DIMENSION class NdRange: