diff --git a/.github/workflows/conda-package.yml b/.github/workflows/conda-package.yml index 7df9afcfe7..2c8d8a2ed1 100644 --- a/.github/workflows/conda-package.yml +++ b/.github/workflows/conda-package.yml @@ -152,7 +152,7 @@ jobs: conda activate numba_dpex_env # echo "libintelocl.so" | tee /etc/OpenCL/vendors/intel-cpu.icd export OCL_ICD_FILENAMES=libintelocl.so - for script in $(find . \( -not -name "_*" -not -name "vectorize.py" -and -name "*.py" \)) + for script in $(find . \( -not -name "_*" -not -name "vectorize.py" -not -name "scan.py" -and -name "*.py" \)) do echo "Executing ${script}" python ${script} || exit 1 diff --git a/numba_dpex/examples/atomic_op.py b/numba_dpex/examples/atomic_op.py deleted file mode 100644 index 1155fa38ac..0000000000 --- a/numba_dpex/examples/atomic_op.py +++ /dev/null @@ -1,52 +0,0 @@ -# SPDX-FileCopyrightText: 2020 - 2022 Intel Corporation -# -# SPDX-License-Identifier: Apache-2.0 - -import dpctl -import numpy as np - -import numba_dpex as dpex - - -def main(): - """ - The example demonstrates the use of numba_dpex's ``atomic_add`` intrinsic - function on a SYCL GPU device. The ``dpctl.select_gpu_device`` is - equivalent to ``sycl::gpu_selector`` and returns a sycl::device of type GPU. - - If we want to generate native floating point atomics for supported - SYCL devices we need to set two environment variables: - NUMBA_DPEX_ACTIVATE_ATOMICS_FP_NATIVE=1 - NUMBA_DPEX_LLVM_SPIRV_ROOT=/path/to/dpcpp/provided/llvm_spirv - - To run this example: - NUMBA_DPEX_ACTIVATE_ATOMICS_FP_NATIVE=1 NUMBA_DPEX_LLVM_SPIRV_ROOT=/path/to/dpcpp/provided/llvm_spirv python atomic_op.py - - Without these two environment variables numba_dpex will use other - implementation for floating point atomics. - """ - - @dpex.kernel - def atomic_add(a): - dpex.atomic.add(a, 0, 1) - - global_size = 100 - a = np.array([0], dtype=np.float32) - - # Use the environment variable SYCL_DEVICE_FILTER to change the default device. - # See https://github.com/intel/llvm/blob/sycl/sycl/doc/EnvironmentVariables.md#sycl_device_filter. - device = dpctl.select_default_device() - print("Using device ...") - device.print_device_info() - - with dpctl.device_context(device): - atomic_add[global_size, dpex.DEFAULT_LOCAL_SIZE](a) - - # Expected 100, because global_size = 100 - print(a) - - print("Done...") - - -if __name__ == "__main__": - main() diff --git a/numba_dpex/examples/barrier.py b/numba_dpex/examples/barrier.py deleted file mode 100644 index 8092115663..0000000000 --- a/numba_dpex/examples/barrier.py +++ /dev/null @@ -1,88 +0,0 @@ -# SPDX-FileCopyrightText: 2020 - 2022 Intel Corporation -# -# SPDX-License-Identifier: Apache-2.0 - -import dpctl -import numpy as np -from numba import float32 - -import numba_dpex as dpex - - -def no_arg_barrier_support(): - """ - This example demonstrates the usage of numba_dpex's ``barrier`` - intrinsic function. The ``barrier`` function is usable only inside - a ``kernel`` and is equivalent to OpenCL's ``barrier`` function. - """ - - @dpex.kernel - def twice(A): - i = dpex.get_global_id(0) - d = A[i] - # no argument defaults to global mem fence - dpex.barrier() - A[i] = d * 2 - - N = 10 - arr = np.arange(N).astype(np.float32) - print(arr) - - # Use the environment variable SYCL_DEVICE_FILTER to change the default device. - # See https://github.com/intel/llvm/blob/sycl/sycl/doc/EnvironmentVariables.md#sycl_device_filter. - device = dpctl.select_default_device() - print("Using device ...") - device.print_device_info() - - with dpctl.device_context(device): - twice[N, dpex.DEFAULT_LOCAL_SIZE](arr) - - # the output should be `arr * 2, i.e. [0, 2, 4, 6, ...]` - print(arr) - - -def local_memory(): - """ - This example demonstrates the usage of numba-dpex's `local.array` - intrinsic function. The function is used to create a static array - allocated on the devices local address space. - """ - blocksize = 10 - - @dpex.kernel - def reverse_array(A): - lm = dpex.local.array(shape=10, dtype=float32) - i = dpex.get_global_id(0) - - # preload - lm[i] = A[i] - # barrier local or global will both work as we only have one work group - dpex.barrier(dpex.CLK_LOCAL_MEM_FENCE) # local mem fence - # write - A[i] += lm[blocksize - 1 - i] - - arr = np.arange(blocksize).astype(np.float32) - print(arr) - - # Use the environment variable SYCL_DEVICE_FILTER to change the default device. - # See https://github.com/intel/llvm/blob/sycl/sycl/doc/EnvironmentVariables.md#sycl_device_filter. - device = dpctl.select_default_device() - print("Using device ...") - device.print_device_info() - - with dpctl.device_context(device): - reverse_array[blocksize, dpex.DEFAULT_LOCAL_SIZE](arr) - - # the output should be `orig[::-1] + orig, i.e. [9, 9, 9, ...]`` - print(arr) - - -def main(): - no_arg_barrier_support() - local_memory() - - print("Done...") - - -if __name__ == "__main__": - main() diff --git a/numba_dpex/examples/blacksholes_kernel.py b/numba_dpex/examples/blacksholes_kernel.py deleted file mode 100644 index d2eb80d3b8..0000000000 --- a/numba_dpex/examples/blacksholes_kernel.py +++ /dev/null @@ -1,102 +0,0 @@ -# SPDX-FileCopyrightText: 2020 - 2022 Intel Corporation -# -# SPDX-License-Identifier: Apache-2.0 - -import math - -import dpctl -import numpy as np - -import numba_dpex as dpex - -RISKFREE = 0.02 -VOLATILITY = 0.30 - -A1 = 0.31938153 -A2 = -0.356563782 -A3 = 1.781477937 -A4 = -1.821255978 -A5 = 1.330274429 -RSQRT2PI = 0.39894228040143267793994605993438 - - -def randfloat(rand_var, low, high): - return (1.0 - rand_var) * low + rand_var * high - - -OPT_N = 400 -iterations = 2 - -stockPrice = randfloat(np.random.random(OPT_N), 5.0, 30.0) -optionStrike = randfloat(np.random.random(OPT_N), 1.0, 100.0) -optionYears = randfloat(np.random.random(OPT_N), 0.25, 10.0) -callResult = np.zeros(OPT_N) -putResult = -np.ones(OPT_N) - - -@dpex.kernel -def blackscholes_kernel(callResult, putResult, S, X, T, R, V): - """ - A simple implementation of the Black-Scholes formula using explicit - OpenCL-syle kernel programming model. - """ - i = dpex.get_global_id(0) - if i >= S.shape[0]: - return - sqrtT = math.sqrt(T[i]) - d1 = (math.log(S[i] / X[i]) + (R + 0.5 * V * V) * T[i]) / (V * sqrtT) - d2 = d1 - V * sqrtT - - K = 1.0 / (1.0 + 0.2316419 * math.fabs(d1)) - cndd1 = ( - RSQRT2PI - * math.exp(-0.5 * d1 * d1) - * (K * (A1 + K * (A2 + K * (A3 + K * (A4 + K * A5))))) - ) - if d1 > 0: - cndd1 = 1.0 - cndd1 - - K = 1.0 / (1.0 + 0.2316419 * math.fabs(d2)) - cndd2 = ( - RSQRT2PI - * math.exp(-0.5 * d2 * d2) - * (K * (A1 + K * (A2 + K * (A3 + K * (A4 + K * A5))))) - ) - if d2 > 0: - cndd2 = 1.0 - cndd2 - - expRT = math.exp((-1.0 * R) * T[i]) - callResult[i] = S[i] * cndd1 - X[i] * expRT * cndd2 - putResult[i] = X[i] * expRT * (1.0 - cndd2) - S[i] * (1.0 - cndd1) - - -def main(): - blockdim = 512, 1 - griddim = int(math.ceil(float(OPT_N) / blockdim[0])), 1 - - # Use the environment variable SYCL_DEVICE_FILTER to change the default device. - # See https://github.com/intel/llvm/blob/sycl/sycl/doc/EnvironmentVariables.md#sycl_device_filter. - device = dpctl.select_default_device() - print("Using device ...") - device.print_device_info() - - with dpctl.device_context(device): - for i in range(iterations): - blackscholes_kernel[blockdim, griddim]( - callResult, - putResult, - stockPrice, - optionStrike, - optionYears, - RISKFREE, - VOLATILITY, - ) - - print("callResult : \n", callResult) - print("putResult : \n", putResult) - - print("Done...") - - -if __name__ == "__main__": - main() diff --git a/numba_dpex/examples/dpex_func.py b/numba_dpex/examples/dpex_func.py deleted file mode 100644 index 7d39ff1d10..0000000000 --- a/numba_dpex/examples/dpex_func.py +++ /dev/null @@ -1,56 +0,0 @@ -# SPDX-FileCopyrightText: 2020 - 2022 Intel Corporation -# -# SPDX-License-Identifier: Apache-2.0 - -import dpctl -import numpy as np - -import numba_dpex as dpex - - -@dpex.func -def a_device_function(a): - """ - A ``func`` is a device callable function that can be invoked from - ``kernel`` and other ``func`` functions. - """ - return a + 1 - - -@dpex.func -def another_device_function(a): - return a_device_function(a) - - -@dpex.kernel -def a_kernel_function(a, b): - i = dpex.get_global_id(0) - b[i] = another_device_function(a[i]) - - -def driver(a, b, N): - print(b) - print("--------") - a_kernel_function[N, dpex.DEFAULT_LOCAL_SIZE](a, b) - print(b) - - -def main(): - N = 10 - a = np.ones(N) - b = np.ones(N) - - # Use the environment variable SYCL_DEVICE_FILTER to change the default device. - # See https://github.com/intel/llvm/blob/sycl/sycl/doc/EnvironmentVariables.md#sycl_device_filter. - device = dpctl.select_default_device() - print("Using device ...") - device.print_device_info() - - with dpctl.device_context(device): - driver(a, b, N) - - print("Done...") - - -if __name__ == "__main__": - main() diff --git a/numba_dpex/examples/dpex_with_context.py b/numba_dpex/examples/dpex_with_context.py deleted file mode 100644 index 8a58c6c144..0000000000 --- a/numba_dpex/examples/dpex_with_context.py +++ /dev/null @@ -1,51 +0,0 @@ -# SPDX-FileCopyrightText: 2020 - 2022 Intel Corporation -# -# SPDX-License-Identifier: Apache-2.0 - -""" -The numba_dpex extension adds an automatic offload optimizer to -numba. The optimizer automatically detects data-parallel code -regions in a numba.jit function and then offloads the data-parallel -regions to a SYCL device. The optimizer is triggered when a numba.jit -function is invoked inside ``dpctl.device_context`` scope. - -This example demonstrates the usage of numba_dpex's automatic offload -functionality. Note that numba_dpex should be installed in your -environment for the example to work. -""" - -import dpctl -import numpy as np -from numba import njit, prange - - -@njit -def add_two_arrays(b, c): - a = np.empty_like(b) - for i in prange(len(b)): - a[i] = b[i] + c[i] - - return a - - -def main(): - N = 10 - b = np.ones(N) - c = np.ones(N) - - # Use the environment variable SYCL_DEVICE_FILTER to change the default device. - # See https://github.com/intel/llvm/blob/sycl/sycl/doc/EnvironmentVariables.md#sycl_device_filter. - device = dpctl.select_default_device() - print("Using device ...") - device.print_device_info() - - with dpctl.device_context(device): - result = add_two_arrays(b, c) - - print("Result :", result) - - print("Done...") - - -if __name__ == "__main__": - main() diff --git a/numba_dpex/examples/kernel/atomic_op.py b/numba_dpex/examples/kernel/atomic_op.py new file mode 100644 index 0000000000..4e11546958 --- /dev/null +++ b/numba_dpex/examples/kernel/atomic_op.py @@ -0,0 +1,30 @@ +# SPDX-FileCopyrightText: 2020 - 2022 Intel Corporation +# +# SPDX-License-Identifier: Apache-2.0 + +import dpnp as np + +import numba_dpex as ndpex + + +@ndpex.kernel +def atomic_reduction(a): + idx = ndpex.get_global_id(0) + ndpex.atomic.add(a, 0, a[idx]) + + +def main(): + N = 10 + a = np.arange(N) + + print("Using device ...") + print(a.device) + + atomic_reduction[N, ndpex.DEFAULT_LOCAL_SIZE](a) + print("Reduction sum =", a[0]) + + print("Done...") + + +if __name__ == "__main__": + main() diff --git a/numba_dpex/examples/kernel/black_scholes.py b/numba_dpex/examples/kernel/black_scholes.py new file mode 100644 index 0000000000..fa61cab3cd --- /dev/null +++ b/numba_dpex/examples/kernel/black_scholes.py @@ -0,0 +1,107 @@ +# Copyright 2022 Intel Corporation +# +# SPDX-License-Identifier: Apache 2.0 + + +from math import erf, exp, log, sqrt + +import dpnp as np + +import numba_dpex as ndpx + +# Stock price range +S0L = 10.0 +S0H = 50.0 + +# Strike range +XL = 10.0 +XH = 50.0 + +# Maturity range +TL = 1.0 +TH = 2.0 + +# Risk-free rate assumed constant +RISK_FREE = 0.1 + +# Volatility assumed constants +VOLATILITY = 0.2 + +# Number of call-put options +NOPT = 1024 * 1024 + +# Random seed +SEED = 777 + + +def initialize(): + np.random.seed(SEED) + price = np.random.uniform(S0L, S0H, NOPT) + strike = np.random.uniform(XL, XH, NOPT) + t = np.random.uniform(TL, TH, NOPT) + rate = RISK_FREE + volatility = VOLATILITY + call = np.empty(NOPT) + put = np.empty(NOPT) + + return price, strike, t, rate, volatility, call, put + + +@ndpx.kernel( + access_types={ + "read_only": ["price", "strike", "t"], + "write_only": ["call", "put"], + } +) +def kernel_black_scholes(price, strike, t, rate, volatility, call, put): + # Scalars + mr = -rate + sig_sig_two = volatility * volatility * 2.0 + + # Current index + i = ndpx.get_global_id(0) + + # Get inputs into private memory + p = price[i] + s = strike[i] + tt = t[i] + + a = log(p / s) + b = tt * mr + + z = tt * sig_sig_two + c = 0.25 * z + y = 1.0 / sqrt(z) + + w1 = (a - b + c) * y + w2 = (a - b - c) * y + + d1 = 0.5 + 0.5 * erf(w1) + d2 = 0.5 + 0.5 * erf(w2) + + se = exp(b) * s + + r = p * d1 - se * d2 + + # Write back results + call[i] = r + put[i] = r - p + se + + +def main(): + price, strike, t, rate, volatility, call, put = initialize() + + print("Using device ...") + print(price.device) + + kernel_black_scholes[NOPT, ndpx.DEFAULT_LOCAL_SIZE]( + price, strike, t, rate, volatility, call, put + ) + + print("Call:", call) + print("Put:", put) + print("Done...") + + +if __name__ == "__main__": + main() diff --git a/numba_dpex/examples/kernel/device_func.py b/numba_dpex/examples/kernel/device_func.py new file mode 100644 index 0000000000..2ebc57537a --- /dev/null +++ b/numba_dpex/examples/kernel/device_func.py @@ -0,0 +1,52 @@ +# SPDX-FileCopyrightText: 2020 - 2022 Intel Corporation +# +# SPDX-License-Identifier: Apache-2.0 + +import dpnp as np + +import numba_dpex as ndpex + +# Array size +N = 10 + + +# A device callable function that can be invoked from ``kernel`` and other device functions +@ndpex.func +def a_device_function(a): + return a + 1 + + +# A device callable function can call another device function +@ndpex.func +def another_device_function(a): + return a_device_function(a * 2) + + +# A kernel function that calls the device function +@ndpex.kernel +def a_kernel_function(a, b): + i = ndpex.get_global_id(0) + b[i] = another_device_function(a[i]) + + +# Utility function for printing +def driver(a, b, N): + print("A=", a) + a_kernel_function[N, ndpex.DEFAULT_LOCAL_SIZE](a, b) + print("B=", b) + + +# Main function +def main(): + a = np.ones(N) + b = np.ones(N) + + print("Using device ...") + print(a.device) + driver(a, b, N) + + print("Done...") + + +if __name__ == "__main__": + main() diff --git a/numba_dpex/examples/kernel/interpolation.py b/numba_dpex/examples/kernel/interpolation.py new file mode 100644 index 0000000000..06632d98c0 --- /dev/null +++ b/numba_dpex/examples/kernel/interpolation.py @@ -0,0 +1,131 @@ +# SPDX-FileCopyrightText: 2020 - 2022 Intel Corporation +# +# SPDX-License-Identifier: Apache-2.0 + +import dpnp as np +from numba import float32 +from numpy.testing import assert_almost_equal + +import numba_dpex as ndpex + +# Interpolation domain +XLO = 10.0 +XHI = 90.0 + +# Number of cubic polynomial segments +N_SEGMENTS = 8 + +LOCAL_SIZE = 128 # Number of batches +N_POINTS_PER_WORK_ITEM = ( + 4 # Number of points in the batch. Each work item processes one batch +) +N_POINTS_PER_WORK_GROUP = ( + N_POINTS_PER_WORK_ITEM * LOCAL_SIZE +) # Number of points processed by a work group +N_POINTS = N_POINTS_PER_WORK_GROUP * N_SEGMENTS # Total number of points + +# Natural cubic spline coefficients in interval [10, 90] with uniform grid +# Each work group processes its own segment +COEFFICIENTS = np.asarray( + [ + [ + -0.008086340206185568, + 0.242590206185567, + -0.6172680412371134, + 0, + ], # [10, 20] + [ + 0.015431701030927836, + -1.168492268041237, + 27.60438144329897, + -188.1443298969072, + ], # [20, 30] + [ + -0.0036404639175257733, + 0.5480025773195877, + -23.890463917525775, + 326.8041237113402, + ], # [30, 40] + [ + -0.010869845360824743, + 1.4155283505154639, + -58.59149484536083, + 789.4845360824743, + ], # [40, 50] + [ + -0.0028801546391752576, + 0.21707474226804124, + 1.3311855670103092, + -209.22680412371133, + ], # [50, 60] + [ + 0.042390463917525774, + -7.9316365979381445, + 490.25386597938143, + -9987.680412371134, + ], # [60, 70] + [ + -0.061681701030927835, + 13.923518041237113, + -1039.6069587628865, + 25709.072164948455, + ], # [70, 80] + [ + 0.029336340206185568, + -7.920811855670103, + 707.9394329896908, + -20892.164948453606, + ], + ], # [80, 90] + dtype=np.float32, # We use single precision for interpolation +) + + +@ndpex.kernel() +def kernel_polynomial(x, y, coefficients): + c = ndpex.private.array( + 4, dtype=float32 + ) # Coefficients of a polynomial of a given segment + z = ndpex.private.array(1, dtype=float32) # Keep x[i] in private memory + + gid = ndpex.get_global_id(0) + gr_id = ndpex.get_group_id(0) + + # Polynomial coefficients are fixed within a workgroup + c[0] = coefficients[gr_id][0] + c[1] = coefficients[gr_id][1] + c[2] = coefficients[gr_id][2] + c[3] = coefficients[gr_id][3] + + # Each work item processes N_POINTS_PER_WORK_ITEM points + for i in range( + gid * N_POINTS_PER_WORK_ITEM, (gid + 1) * N_POINTS_PER_WORK_ITEM, 1 + ): + z[0] = x[i] # Copy current point into the private memory + y[i] = ((c[0] * z[0] + c[1]) * z[0] + c[2]) * z[0] + c[ + 3 + ] # Coefficients are in the private memory too + + +def main(): + # Create arrays on the default device + xp = np.arange(XLO, XHI, (XHI - XLO) / N_POINTS) + yp = np.empty(xp.shape) + + print("Using device ...") + print(xp.device) + + kernel_polynomial[N_POINTS // N_POINTS_PER_WORK_ITEM, LOCAL_SIZE]( + xp, yp, COEFFICIENTS + ) + + # Copy results back to the host + nyp = np.asnumpy(yp) + # Basic check for correctness + assert_almost_equal(nyp[2047], 39.97161865234375) + + print("Done...") + + +if __name__ == "__main__": + main() diff --git a/numba_dpex/examples/kernel/scan.py b/numba_dpex/examples/kernel/scan.py new file mode 100644 index 0000000000..85ecdf86a5 --- /dev/null +++ b/numba_dpex/examples/kernel/scan.py @@ -0,0 +1,69 @@ +# SPDX-FileCopyrightText: 2020 - 2022 Intel Corporation +# +# SPDX-License-Identifier: Apache-2.0 + +# scan.py is not working due to issue: https://github.com/IntelPython/numba-dpex/issues/829 + +import dpnp as np + +import numba_dpex as ndpx + +# 1D array size +N = 64 + + +# Implements Hillis-Steele prefix sum algorithm +@ndpx.kernel +def kernel_hillis_steele_scan(a): + # Get local and global id and workgroup size + gid = ndpx.get_global_id(0) + lid = ndpx.get_local_id(0) + ls = ndpx.get_local_size(0) + + # Create temporals in local memory + b = ndpx.local.array(ls, dtype=a.dtype) + c = ndpx.local.array(ls, dtype=a.dtype) + + # Initialize locals + c[lid] = b[lid] = a[gid] + ndpx.barrier(ndpx.LOCAL_MEM_FENCE) + + # Calculate prefix sum + d = 1 + while d < ls: + if lid > d: + c[lid] = b[lid] + b[lid - d] + else: + c[lid] = b[lid] + + ndpx.barrier(ndpx.LOCAL_MEM_FENCE) + + # Swap c and b + e = c[lid] + c[lid] = b[lid] + b[lid] = e + + # Double the stride + d *= 2 + + ndpx.barrier() # The same as ndpx.barrier(ndpx.GLOBAL_MEM_FENCE) + a[gid] = b[lid] + + +def main(): + arr = np.arange(N) + print("Original array:", arr) + + print("Using device ...") + print(arr.device) + kernel_hillis_steele_scan[N, ndpx.DEFAULT_LOCAL_SIZE](arr) + + # the output should be [0, 1, 3, 6, ...] + arr_np = np.asnumpy(arr) + print(arr_np) + + print("Done...") + + +if __name__ == "__main__": + main() diff --git a/numba_dpex/examples/kernel/vector_sum.py b/numba_dpex/examples/kernel/vector_sum.py new file mode 100644 index 0000000000..4ebda8f38a --- /dev/null +++ b/numba_dpex/examples/kernel/vector_sum.py @@ -0,0 +1,47 @@ +# SPDX-FileCopyrightText: 2020 - 2022 Intel Corporation +# +# SPDX-License-Identifier: Apache-2.0 + +import dpnp +import numpy.testing as testing + +import numba_dpex as ndpx + + +# Data parallel kernel implementing vector sum +@ndpx.kernel +def kernel_vector_sum(a, b, c): + i = ndpx.get_global_id(0) + c[i] = a[i] + b[i] + + +# Utility function for printing and testing +def driver(a, b, c, global_size): + + kernel_vector_sum[global_size, ndpx.DEFAULT_LOCAL_SIZE](a, b, c) + + a_np = dpnp.asnumpy(a) # Copy dpnp array a to NumPy array a_np + b_np = dpnp.asnumpy(b) # Copy dpnp array b to NumPy array b_np + c_np = dpnp.asnumpy(c) # Copy dpnp array c to NumPy array c_np + testing.assert_equal(c_np, a_np + b_np) + + +# Main function +def main(): + N = 10 + global_size = N + print("Vector size N", N) + + # Create random vectors on the default device + a = dpnp.random.random(N) + b = dpnp.random.random(N) + c = dpnp.ones_like(a) + + print("Using device ...") + print(a.device) + driver(a, b, c, global_size) + print("Done...") + + +if __name__ == "__main__": + main()