From 2fcaa4c05eabd8005a2ebd41a4f939a28d3f595f Mon Sep 17 00:00:00 2001 From: Vincent Favre-Nicolin Date: Sat, 22 Oct 2022 20:46:30 +0200 Subject: [PATCH 01/10] During accuracy tests, make sure the array after an R2C transform still has the fastest axis along the last dimension (X) - since this is not the case after a 2D numpy rfftn transform. Fixes #19 --- pyvkfft/accuracy.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/pyvkfft/accuracy.py b/pyvkfft/accuracy.py index 636c9bd..e51f6dd 100644 --- a/pyvkfft/accuracy.py +++ b/pyvkfft/accuracy.py @@ -387,7 +387,9 @@ def test_accuracy(backend, shape, ndim, axes, dtype, inplace, norm, use_lut, r2c d0n = d0.astype(np.clongdouble) else: d0n = d0 - + if np.argmin(d0.strides) != d0.ndim - 1: + # np.fft.rfftn can change the fast axis + d0 = d0.copy(order='C') if 'opencl' in backend: d_gpu = cla.to_device(queue, d0) else: From 4728252223a19725599731fcc528da1902dfd952 Mon Sep 17 00:00:00 2001 From: Vincent Favre-Nicolin Date: Sat, 22 Oct 2022 20:52:26 +0200 Subject: [PATCH 02/10] More elegant re-ordering array during accuracy test, when necessary --- pyvkfft/accuracy.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyvkfft/accuracy.py b/pyvkfft/accuracy.py index e51f6dd..192cb22 100644 --- a/pyvkfft/accuracy.py +++ b/pyvkfft/accuracy.py @@ -389,7 +389,7 @@ def test_accuracy(backend, shape, ndim, axes, dtype, inplace, norm, use_lut, r2c d0n = d0 if np.argmin(d0.strides) != d0.ndim - 1: # np.fft.rfftn can change the fast axis - d0 = d0.copy(order='C') + d0 = np.asarray(d0, order='C') if 'opencl' in backend: d_gpu = cla.to_device(queue, d0) else: From d75c620d9d1fdd91d06454439a7d731f7166de70 Mon Sep 17 00:00:00 2001 From: Vincent Favre-Nicolin Date: Sun, 23 Oct 2022 14:13:46 +0200 Subject: [PATCH 03/10] Update changelog --- CHANGELOG.rst | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/CHANGELOG.rst b/CHANGELOG.rst index 0ceff86..9e96b51 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -1,3 +1,7 @@ +Version 2022.2 (2022-XX-XX) +----------------------------- +* Fix R2C tests when using numpy (scipy unavailable) [#19] + Version 2022.1.1 (2022-02-14) ----------------------------- * Correct the dtype of the returned array for fft.rfftn() and fft.irfftn() From 59f9ebd9a453b8a205d6404372f187bdcd874270 Mon Sep 17 00:00:00 2001 From: Vincent Favre-Nicolin Date: Sun, 23 Oct 2022 14:41:45 +0200 Subject: [PATCH 04/10] Add support for non C-ordered arrays (F-order or different types of strides). Supports both C2C and R2C (as long as the fast axis is transformed). Adresses #20 --- pyvkfft/accuracy.py | 95 ++++++++++++++++++++++++---------- pyvkfft/base.py | 28 ++++++++-- pyvkfft/cuda.py | 9 ++-- pyvkfft/opencl.py | 6 ++- pyvkfft/test/test_fft.py | 107 +++++++++++++++++++++++---------------- 5 files changed, 164 insertions(+), 81 deletions(-) diff --git a/pyvkfft/accuracy.py b/pyvkfft/accuracy.py index 192cb22..b7bd2e1 100644 --- a/pyvkfft/accuracy.py +++ b/pyvkfft/accuracy.py @@ -154,12 +154,12 @@ def li(a, b): def test_accuracy(backend, shape, ndim, axes, dtype, inplace, norm, use_lut, r2c=False, dct=False, gpu_name=None, stream=None, queue=None, return_array=False, init_array=None, verbose=False, - colour_output=False, ref_long_double=True): + colour_output=False, ref_long_double=True, order='C'): """ Measure the :param backend: either 'pyopencl', 'pycuda' or 'cupy' :param shape: the shape of the array to test. If this is an inplace r2c, the - x-axis length must be even, and two extra values will be appended along x, + fast-axis length must be even, and two extra values will be appended along x, so the actual transform shape is the one supplied :param ndim: the number of FFT dimensions. Can be None if axes is given :param axes: the transform axes. Supersedes ndim @@ -187,6 +187,10 @@ def test_accuracy(backend, shape, ndim, axes, dtype, inplace, norm, use_lut, r2c :param colour_output: if True, use some colour to tag the quality of the accuracy :param ref_long_double: if True and scipy is available, long double precision will be used for the reference transform. Otherwise, this is ignored. + :param order: either 'C' (default C-contiguous) or 'F' to test a different + stride. Note that for the latter, a 3D transform on a 4D array will not + be supported as the last transform axis would be on the 4th dimension + (once ordered by stride). :return: a dictionary with (l2_fft, li_fft, l2_ifft, li_ifft, tol, dt_array, dt_app, dt_fft, dt_ifft, src_unchanged_fft, src_unchanged_ifft, tol_test, str), with the L2 and Linf normalised norms comparing pyvkfft's result with either @@ -204,6 +208,7 @@ def test_accuracy(backend, shape, ndim, axes, dtype, inplace, norm, use_lut, r2c All input parameters are also returned as key/values, except stream, queue, return_array, ini_array and verbose. """ + ndims = len(shape) if backend == "cupy" and has_cupy: mempool = cp.get_default_memory_pool() if mempool is not None: # Is that test necessary ? @@ -231,10 +236,16 @@ def test_accuracy(backend, shape, ndim, axes, dtype, inplace, norm, use_lut, r2c # Add two extra columns in the source array # so the transform has the desired shape shape = list(shape) - shape[-1] += 2 + if order == 'C': + shape[-1] += 2 + else: + shape[0] += 2 else: shapec = list(shape) - shapec[-1] = shapec[-1] // 2 + 1 + if order == 'C': + shapec[-1] = shapec[-1] // 2 + 1 + else: + shapec[0] = shapec[-1] // 2 + 1 shapec = tuple(shapec) else: shapec = tuple(shape) @@ -244,7 +255,10 @@ def test_accuracy(backend, shape, ndim, axes, dtype, inplace, norm, use_lut, r2c if r2c: if inplace: d0 = np.empty(shape, dtype=dtypef) - d0[..., :-2] = init_array + if order == 'C': + d0[..., :-2] = init_array + else: + d0[:-2] = init_array else: d0 = init_array.astype(dtypef) elif dct: @@ -257,11 +271,14 @@ def test_accuracy(backend, shape, ndim, axes, dtype, inplace, norm, use_lut, r2c else: d0 = (np.random.uniform(-0.5, 0.5, shape) + 1j * np.random.uniform(-0.5, 0.5, shape)).astype(dtype) + if order != 'C': + d0 = np.asarray(d0, order=order) + t1 = timeit.default_timer() if 'opencl' in backend: app = clVkFFTApp(d0.shape, d0.dtype, queue, ndim=ndim, norm=norm, - axes=axes, useLUT=use_lut, inplace=inplace, r2c=r2c, dct=dct) + axes=axes, useLUT=use_lut, inplace=inplace, r2c=r2c, dct=dct, strides=d0.strides) t2 = timeit.default_timer() d_gpu = cla.to_device(queue, d0) else: @@ -276,13 +293,30 @@ def test_accuracy(backend, shape, ndim, axes, dtype, inplace, norm, use_lut, r2c d_gpu = to_gpu(d0) if axes is None: - axes_numpy = list(range(len(shape)))[-ndim:] + axes_numpy = list(range(ndims))[-ndim:] else: - axes_numpy = axes + # Make sure axes indices are >0 + axes_numpy = [ax if ax >= 0 else ndims + ax for ax in axes] + + # Need the fast axis for R2C (last for 'C' order, first for 'F') + fast_axis = np.argmin(d0.strides) + + if r2c: + if fast_axis not in axes_numpy: + raise RuntimeError("The fast axis must be transformed for R2C") + + # For R2C, we need the same fast axis as on the GPU, or the + # half-hermitian result won't look the same + if fast_axis != axes_numpy[-1]: + axes_numpy.remove(fast_axis) + axes_numpy.append(fast_axis) + # if order != 'C': + # print("R2C", ndims, ndim, shape, axes, axes_numpy, fast_axis, inplace) + # base FFT scale for numpy (not used for DCT) s = np.sqrt(np.prod([d0.shape[i] for i in axes_numpy])) if r2c and inplace: - s = np.sqrt(s ** 2 / d0.shape[-1] * (d0.shape[-1] - 2)) + s = np.sqrt(s ** 2 / d0.shape[fast_axis] * (d0.shape[fast_axis] - 2)) # Tolerance estimated from accuracy notebook if dtype in (np.complex64, np.float32): @@ -301,11 +335,11 @@ def test_accuracy(backend, shape, ndim, axes, dtype, inplace, norm, use_lut, r2c else: if r2c: if backend == "pyopencl": - d1_gpu = cla.empty(queue, shapec, dtype=dtype) + d1_gpu = cla.empty(queue, shapec, dtype=dtype, order=order) elif backend == "pycuda": - d1_gpu = cua.empty(shapec, dtype=dtype) + d1_gpu = cua.empty(shapec, dtype=dtype, order=order) elif backend == "cupy": - d1_gpu = cp.empty(shapec, dtype=dtype) + d1_gpu = cp.empty(shapec, dtype=dtype, order=order) else: d1_gpu = d_gpu.copy() @@ -324,7 +358,8 @@ def test_accuracy(backend, shape, ndim, axes, dtype, inplace, norm, use_lut, r2c if r2c: if inplace: - d = rfftn(d0n[..., :-2], axes=axes_numpy) / s + # Need to cut the fastest axis by 2 + d = rfftn(np.take(d0n, range(d0n.shape[fast_axis] - 2), axis=fast_axis), axes=axes_numpy) / s else: d = rfftn(d0n, axes=axes_numpy) / s elif dct: @@ -348,12 +383,17 @@ def test_accuracy(backend, shape, ndim, axes, dtype, inplace, norm, use_lut, r2c t = "C2C" if r2c and inplace: tmp = list(d0.shape) - tmp[-1] -= 2 - shstr = str(tuple(tmp)).replace(" ", "") - if ",)" in shstr: - shstr = shstr.replace(",)", "+2)") + if order == 'C': + tmp[-1] -= 2 + shstr = str(tuple(tmp)).replace(" ", "") + if ",)" in shstr: + shstr = shstr.replace(",)", "+2)") + else: + shstr = shstr.replace(")", "+2)") else: - shstr = shstr.replace(")", "+2)") + tmp[0] -= 2 + shstr = "(%d+2," % tmp[0] + shstr += str(tuple(tmp)).replace(" ", "")[1:] else: shstr = str(d0.shape).replace(" ", "") shax = str(axes).replace(" ", "") @@ -364,9 +404,9 @@ def test_accuracy(backend, shape, ndim, axes, dtype, inplace, norm, use_lut, r2c stol = "%6.2e < %6.2e (%5.3f)" % (ni, tol, ni / tol) verb_out = "%8s %4s %14s axes=%10s ndim=%4s %10s lut=%4s inplace=%d " \ - " norm=%4s %5s: n2=%6.2e ninf=%s %d" % \ + " norm=%4s %s %5s: n2=%6.2e ninf=%s %d" % \ (backend, t, shstr, shax, str(ndim), str(d0.dtype), - str(use_lut), int(inplace), str(norm), "FFT", n2, stol, src_unchanged_fft) + str(use_lut), int(inplace), str(norm), order, "FFT", n2, stol, src_unchanged_fft) t3 = timeit.default_timer() @@ -387,9 +427,11 @@ def test_accuracy(backend, shape, ndim, axes, dtype, inplace, norm, use_lut, r2c d0n = d0.astype(np.clongdouble) else: d0n = d0 - if np.argmin(d0.strides) != d0.ndim - 1: + if (np.argmin(d0.strides) != d0.ndim - 1) and order == 'C': # np.fft.rfftn can change the fast axis d0 = np.asarray(d0, order='C') + if order != 'C': + d0 = np.asarray(d0, order=order) if 'opencl' in backend: d_gpu = cla.to_device(queue, d0) else: @@ -400,11 +442,11 @@ def test_accuracy(backend, shape, ndim, axes, dtype, inplace, norm, use_lut, r2c else: if r2c: if backend == "pyopencl": - d1_gpu = cla.empty(queue, shape, dtype=dtypef) + d1_gpu = cla.empty(queue, shape, dtype=dtypef, order=order) elif backend == "pycuda": - d1_gpu = cua.empty(shape, dtype=dtypef) + d1_gpu = cua.empty(shape, dtype=dtypef, order=order) elif backend == "cupy": - d1_gpu = cp.empty(shape, dtype=dtypef) + d1_gpu = cp.empty(shape, dtype=dtypef, order=order) else: d1_gpu = d_gpu.copy() @@ -426,7 +468,8 @@ def test_accuracy(backend, shape, ndim, axes, dtype, inplace, norm, use_lut, r2c assert d1_gpu.dtype == dtype, "The array type is incorrect after an inplace iFFT" if r2c and inplace: - n2i, nii = l2(d, d1_gpu.get()[..., :-2]), li(d, d1_gpu.get()[..., :-2]) + tmp = np.take(d1_gpu.get(), range(d1_gpu.shape[fast_axis] - 2), axis=fast_axis) + n2i, nii = l2(d, tmp), li(d, tmp) else: n2i, nii = l2(d, d1_gpu.get()), li(d, d1_gpu.get()) @@ -463,7 +506,7 @@ def test_accuracy(backend, shape, ndim, axes, dtype, inplace, norm, use_lut, r2c "dt_fft": t3 - t2, "dt_ifft": t4 - t3, "src_unchanged_fft": src_unchanged_fft, "src_unchanged_ifft": src_unchanged_ifft, "tol_test": max(ni, nii) < tol, "str": verb_out, "backend": backend, "shape": shape0, "ndim": ndim, "axes": axes, "dtype": dtype0, "inplace": inplace, - "norm": norm, "use_lut": use_lut, "r2c": r2c, "dct": dct, "gpu_name": gpu_name} + "norm": norm, "use_lut": use_lut, "r2c": r2c, "dct": dct, "gpu_name": gpu_name, "order": order} if return_array: res["d0"] = d0 diff --git a/pyvkfft/base.py b/pyvkfft/base.py index dfefabc..c066e94 100644 --- a/pyvkfft/base.py +++ b/pyvkfft/base.py @@ -276,7 +276,7 @@ def radix_gen_n(nmax, max_size, radix, ndim=None, even=False, exclude_one=True, return v -def calc_transform_axes(shape, axes=None, ndim=None): +def calc_transform_axes(shape, axes=None, ndim=None, strides=None): """ Compute the final shape of the array to be passed to VkFFT, and the axes for which the transform should be skipped. @@ -290,7 +290,8 @@ def calc_transform_axes(shape, axes=None, ndim=None): - shape=(4,5,6,7) and axes=(2,3): the first two axes will be collapsed to a (20,6,7) axis - shape=(4,5,6,7,8,9) and axes=(2,3): the first two axes will be collapsed - to a (20,6,7) axis and a batch size of 8*9=81 will be used + to a (20,6,7) axis (the first axis is skipped) and a batch size + of 8*9=81 will be used Examples of impossible transforms: - shape=(4,5,6,7) with ndim=4: only 3D transforms are allowed - shape=(4,5,6,7) with axes=(1,2,3): the index of the last transformed @@ -303,6 +304,8 @@ def calc_transform_axes(shape, axes=None, ndim=None): are transformed, or up to ndim. :param ndim: the number of dimensions for the transform. If None, the number of axes is used + :param strides: the array strides. If None, a C-order is assumed + with the fastest axes along the last dimensions (numpy default) :return: (shape, skip_axis, ndim) with the 4D shape after collapsing consecutive non-transformed axes (padded with ones if necessary, with the order adequate for VkFFT (nx, ny, nz, n_batch), @@ -324,6 +327,16 @@ def calc_transform_axes(shape, axes=None, ndim=None): else: if ndim1 != len(axes): raise RuntimeError("The number of transform axes does not match ndim:", axes, ndim) + + if strides is not None and len(shape) > 1: + idx = np.argsort(strides)[::-1] + if not np.alltrue((idx[1:] - idx[:-1]) > 0): + # Array is not C-ordered, need to move axes + idxend = -len(idx) + idx + # print("Re-ordering:", shape1, np.take(shape1, idx), "axes:", axes, [idxend[i] for i in axes]) + shape1 = np.take(shape1, idx).tolist() + axes = [idxend[i] for i in axes] + # Collapse non-transform axes when possible skip_axis = [True for i in range(len(shape1))] for i in axes: @@ -421,7 +434,7 @@ class VkFFTApp: """ def __init__(self, shape, dtype: type, ndim=None, inplace=True, norm=1, - r2c=False, dct=False, axes=None, **kwargs): + r2c=False, dct=False, axes=None, strides=None, **kwargs): """ Init function for the VkFFT application. @@ -460,6 +473,7 @@ def __init__(self, shape, dtype: type, ndim=None, inplace=True, norm=1, :param axes: a list or tuple of axes along which the transform should be made. if None, the transform is done along the ndim fastest axes, or all axes if ndim is None. Not allowed for R2C transforms + :param strides: the array strides - needed if not C-ordered. :raises RuntimeError: if the transform dimensions are not allowed by VkFFT. """ self.app = None @@ -470,9 +484,11 @@ def __init__(self, shape, dtype: type, ndim=None, inplace=True, norm=1, raise RuntimeError("R2C or DCT selected but input type is not real !") if r2c and axes is not None: raise RuntimeError("axes=... is not allowed for R2C transforms") + if strides is not None and dct: + raise RuntimeError("strides=... is not allowed for DCT transforms (needs a C-ordered array)") # Get the final shape passed to VkFFT, collapsing non-transform axes - # as necessary. The calculated shape has 4 dimensions (nx, ny, nz, n_batch) - self.shape, self.skip_axis, self.ndim = calc_transform_axes(shape, axes, ndim) + # as necessary. The calculated shape has 4 dimensions (nx, ny, nz, n_batch). + self.shape, self.skip_axis, self.ndim = calc_transform_axes(shape, axes, ndim, strides) self.inplace = inplace self.r2c = r2c if dct is False: @@ -546,6 +562,8 @@ def _get_fft_scale(self, norm): ndim_real += 1 s = np.sqrt(s) if self.r2c and self.inplace: + # Note: this is still correct for non-C-ordered arrays since + # self.shape has been re-ordered s *= np.sqrt((self.shape[0] - 2) / self.shape[0]) if self.dct: s *= 2 ** (0.5 * ndim_real) diff --git a/pyvkfft/cuda.py b/pyvkfft/cuda.py index e799eec..60f1c42 100644 --- a/pyvkfft/cuda.py +++ b/pyvkfft/cuda.py @@ -64,7 +64,7 @@ class VkFFTApp(VkFFTAppBase): """ def __init__(self, shape, dtype: type, ndim=None, inplace=True, stream=None, norm=1, - r2c=False, dct=False, axes=None, **kwargs): + r2c=False, dct=False, axes=None, strides=None, **kwargs): """ :param shape: the shape of the array to be transformed. The number @@ -106,10 +106,13 @@ def __init__(self, shape, dtype: type, ndim=None, inplace=True, stream=None, nor :param axes: a list or tuple of axes along which the transform should be made. if None, the transform is done along the ndim fastest axes, or all axes if ndim is None. Not allowed for R2C transforms + :param strides: the array strides - needed if not C-ordered. :raises RuntimeError: if the initialisation fails, e.g. if the CUDA - driver has not been properly initialised. + driver has not been properly initialised, or if the transform dimensions + are not allowed by VkFFT. """ - super().__init__(shape, dtype, ndim=ndim, inplace=inplace, norm=norm, r2c=r2c, dct=dct, axes=axes, **kwargs) + super().__init__(shape, dtype, ndim=ndim, inplace=inplace, norm=norm, r2c=r2c, + dct=dct, axes=axes, strides=strides, **kwargs) self.stream = stream diff --git a/pyvkfft/opencl.py b/pyvkfft/opencl.py index daaedab..fc1193e 100644 --- a/pyvkfft/opencl.py +++ b/pyvkfft/opencl.py @@ -53,7 +53,7 @@ class VkFFTApp(VkFFTAppBase): """ def __init__(self, shape, dtype: type, queue: cl.CommandQueue, ndim=None, inplace=True, norm=1, - r2c=False, dct=False, axes=None, **kwargs): + r2c=False, dct=False, axes=None, strides=None, **kwargs): """ Init function for the VkFFT application. @@ -93,11 +93,13 @@ def __init__(self, shape, dtype: type, queue: cl.CommandQueue, ndim=None, inplac :param axes: a list or tuple of axes along which the transform should be made. if None, the transform is done along the ndim fastest axes, or all axes if ndim is None. Not allowed for R2C transforms + :param strides: the array strides - needed if not C-ordered. :raises RuntimeError: if the initialisation fails, e.g. if the GPU driver has not been properly initialised, or if the transform dimensions are not allowed by VkFFT. """ - super().__init__(shape, dtype, ndim=ndim, inplace=inplace, norm=norm, r2c=r2c, dct=dct, axes=axes, **kwargs) + super().__init__(shape, dtype, ndim=ndim, inplace=inplace, norm=norm, r2c=r2c, + dct=dct, axes=axes, strides=strides, **kwargs) self.queue = queue diff --git a/pyvkfft/test/test_fft.py b/pyvkfft/test/test_fft.py index 1e43177..86d61e4 100644 --- a/pyvkfft/test/test_fft.py +++ b/pyvkfft/test/test_fft.py @@ -200,7 +200,7 @@ def run_fft(self, vbackend, vn, dims_max=4, ndim_max=3, shuffle_axes=True, + 1j * np.random.uniform(-0.5, 0.5, sh)).astype(dtype) if vlut == "auto": if dtype in (np.float64, np.complex128): - # By default LUT is enabled for complex128, no need to test twice + # By default, LUT is enabled for complex128, no need to test twice tmp = [None] else: tmp = [None, True] @@ -209,50 +209,67 @@ def run_fft(self, vbackend, vn, dims_max=4, ndim_max=3, shuffle_axes=True, for use_lut in tmp: for inplace in vinplace: for norm in vnorm: - with self.subTest(backend=backend, n=n, dims=dims, ndim=ndim, - axes=axes, dtype=np.dtype(dtype), norm=norm, - use_lut=use_lut, inplace=inplace, - r2c=r2c, dct=dct): - ct += 1 - if not dry_run: - res = test_accuracy(backend, sh, ndim, axes, dtype, inplace, - norm, use_lut, r2c=r2c, dct=dct, - gpu_name=self.gpu, - stream=None, queue=cq, - return_array=False, init_array=d0, - verbose=verbose) - npr = primes(n) - ni, n2 = res["ni"], res["n2"] - nii, n2i = res["nii"], res["n2i"] - tol = res["tol"] - src1 = res["src_unchanged_fft"] - src2 = res["src_unchanged_ifft"] - self.assertTrue(ni < tol, "Accuracy mismatch after FFT, " - "n2=%8e ni=%8e>%8e" % - (n2, ni, tol)) - self.assertTrue(nii < tol, "Accuracy mismatch after iFFT, " - "n2=%8e ni=%8e>%8e" % - (n2, nii, tol)) - if not inplace: - self.assertTrue(src1, "The source array was modified " - "during the FFT") - nmaxr2c1d = 3072 * (1 + int( - dtype in (np.float32, np.complex64))) - if not r2c or (ndim == 1 and max(npr) <= 13) \ - and n < nmaxr2c1d: - self.assertTrue(src2, + vorder = ['C', 'F'] + if dims == 1 or dims > 3 or dct: + vorder = ['C'] + if r2c: + if ndim is not None: + if dims != ndim: + vorder = ['C'] + if axes is not None: + # TODO : also test F-order when ndim%8e" % + (n2, ni, tol)) + self.assertTrue(nii < tol, + "Accuracy mismatch after iFFT, " + "n2=%8e ni=%8e>%8e" % + (n2, nii, tol)) + if not inplace: + self.assertTrue(src1, "The source array was modified " - "during the iFFT") - else: - kwargs = {"backend": backend, "shape": sh, - "ndim": ndim, "axes": axes, - "dtype": dtype, "inplace": inplace, - "norm": norm, "use_lut": use_lut, - "r2c": r2c, "dct": dct, - "gpu_name": self.gpu, "stream": None, - "verbose": False, - "colour_output": self.colour} - vkwargs.append(kwargs) + "during the FFT") + nmaxr2c1d = 3072 * (1 + int( + dtype in (np.float32, np.complex64))) + if not r2c or (ndim == 1 and max(npr) <= 13) \ + and n < nmaxr2c1d: + self.assertTrue(src2, + "The source array was modified " + "during the iFFT") + else: + kwargs = {"backend": backend, "shape": sh, + "ndim": ndim, "axes": axes, + "dtype": dtype, "inplace": inplace, + "norm": norm, "use_lut": use_lut, + "r2c": r2c, "dct": dct, + "gpu_name": self.gpu, "stream": None, + "verbose": False, "order": order, + "colour_output": self.colour} + vkwargs.append(kwargs) return ct, vkwargs @@ -262,7 +279,7 @@ def run_fft_parallel(self, vkwargs): for res in pool.imap(test_accuracy_kwargs, vkwargs): with self.subTest(backend=res['backend'], n=max(res['shape']), ndim=res['ndim'], dtype=np.dtype(res['dtype']), norm=res['norm'], use_lut=res['use_lut'], - inplace=res['inplace'], r2c=res['r2c'], dct=res['dct']): + inplace=res['inplace'], r2c=res['r2c'], dct=res['dct'], order=res['order']): n = max(res['shape']) npr = primes(n) ni, n2 = res["ni"], res["n2"] From 04028426eb4370632cfa8e7bef4b7350f0759736 Mon Sep 17 00:00:00 2001 From: Vincent Favre-Nicolin Date: Sun, 23 Oct 2022 16:56:23 +0200 Subject: [PATCH 05/10] Non-C-contiguous arrays: add systematic test, fix DCT check, upodate some doc and outputs --- pyvkfft/accuracy.py | 9 ++++++--- pyvkfft/base.py | 13 ++++++++++--- pyvkfft/scripts/pyvkfft_test.py | 3 +++ pyvkfft/test/test_fft.py | 14 ++++++++------ 4 files changed, 27 insertions(+), 12 deletions(-) diff --git a/pyvkfft/accuracy.py b/pyvkfft/accuracy.py index b7bd2e1..b487f00 100644 --- a/pyvkfft/accuracy.py +++ b/pyvkfft/accuracy.py @@ -152,7 +152,7 @@ def li(a, b): return abs(a - b).max() / abs(a).max() -def test_accuracy(backend, shape, ndim, axes, dtype, inplace, norm, use_lut, r2c=False, dct=False, +def test_accuracy(backend, shape, ndim, axes, dtype, inplace, norm, use_lut, r2c=False, dct=False, fstride=False, gpu_name=None, stream=None, queue=None, return_array=False, init_array=None, verbose=False, colour_output=False, ref_long_double=True, order='C'): """ @@ -392,8 +392,11 @@ def test_accuracy(backend, shape, ndim, axes, dtype, inplace, norm, use_lut, r2c shstr = shstr.replace(")", "+2)") else: tmp[0] -= 2 - shstr = "(%d+2," % tmp[0] - shstr += str(tuple(tmp)).replace(" ", "")[1:] + if len(tmp) == 1: + shstr = "(%d+2)," % tmp[0] + else: + shstr = "(%d+2," % tmp[0] + shstr += str(tuple(tmp[1:])).replace(" ", "").replace(",)", ")")[1:] else: shstr = str(d0.shape).replace(" ", "") shax = str(axes).replace(" ", "") diff --git a/pyvkfft/base.py b/pyvkfft/base.py index c066e94..74ba9fb 100644 --- a/pyvkfft/base.py +++ b/pyvkfft/base.py @@ -465,7 +465,11 @@ def __init__(self, shape, dtype: type, ndim=None, inplace=True, norm=1, reinterpret the type) will have a shape (..., nx//2 + 1). For an out-of-place transform, if the input (real) shape is (..., nx), the output (complex) shape should be (..., nx//2+1). - Note that for C2R transforms with ndim>=2, the source (complex) array + Note 1: the above shape changes are true for C-contiguous arrays; + generally the axis which is halved by the R2C transform always is + the fast axis -with a stride of 1 element. For F-contiguous arrays + this will be the first dimension instead of the last. + Note 2:for C2R transforms with ndim>=2, the source (complex) array is modified. :param dct: used to perform a Direct Cosine Transform (DCT) aka a R2R transform. An integer can be given to specify the type of DCT (1, 2, 3 or 4). @@ -484,8 +488,11 @@ def __init__(self, shape, dtype: type, ndim=None, inplace=True, norm=1, raise RuntimeError("R2C or DCT selected but input type is not real !") if r2c and axes is not None: raise RuntimeError("axes=... is not allowed for R2C transforms") - if strides is not None and dct: - raise RuntimeError("strides=... is not allowed for DCT transforms (needs a C-ordered array)") + if strides is not None and dct and len(shape) > 1: + # TODO: update support status for non-C-contiguous DCT transforms + s = np.array(strides) + if not np.alltrue((s[:-1] - s[1:]) > 0): + raise RuntimeError("A C-contiguous array is required for DCT transforms") # Get the final shape passed to VkFFT, collapsing non-transform axes # as necessary. The calculated shape has 4 dimensions (nx, ny, nz, n_batch). self.shape, self.skip_axis, self.ndim = calc_transform_axes(shape, axes, ndim, strides) diff --git a/pyvkfft/scripts/pyvkfft_test.py b/pyvkfft/scripts/pyvkfft_test.py index 4724af4..514202d 100755 --- a/pyvkfft/scripts/pyvkfft_test.py +++ b/pyvkfft/scripts/pyvkfft_test.py @@ -301,6 +301,8 @@ def main(): "can be slower (or much slower on some architectures).") sysgrp.add_argument('--r2c', action='store_true', help="Test real-to-complex transform " "(default is c2c)") + sysgrp.add_argument('--fstride', action='store_true', + help="Test F-ordered arrays (default is C-ordered). Not supported for DCT") sysgrp.add_argument('--radix', action='store', nargs='*', type=int, help="Perform only radix transforms. If no value is given, all available " "radix transforms are allowed. Alternatively a list can be given: " @@ -372,6 +374,7 @@ def main(): t.norm = args.norm[0] t.nproc = args.nproc[0] t.r2c = args.r2c + t.fstride = args.fstride t.radix = args.radix t.ref_long_double = args.ref_long_double t.max_pow = None if args.radix_max_pow is None else args.radix_max_pow[0] diff --git a/pyvkfft/test/test_fft.py b/pyvkfft/test/test_fft.py index 86d61e4..ac097ad 100644 --- a/pyvkfft/test/test_fft.py +++ b/pyvkfft/test/test_fft.py @@ -443,6 +443,7 @@ class TestFFTSystematic(unittest.TestCase): gpu = None inplace = False lut = False + fstride = False max_pow = None max_nb_tests = 1000 nb_test = 0 # Number of tests actually run @@ -515,7 +516,8 @@ def test_systematic(self): kwargs = {"backend": backend, "shape": s, "ndim": len(s), "axes": self.axes, "dtype": self.dtype, "inplace": self.inplace, "norm": self.norm, "use_lut": self.lut, "r2c": self.r2c, "dct": self.dct, "gpu_name": self.gpu, "stream": None, "verbose": False, - "colour_output": self.colour, "ref_long_double": self.ref_long_double} + "colour_output": self.colour, "ref_long_double": self.ref_long_double, + "order": 'F' if self.fstride else 'C'} vkwargs.append(kwargs) if self.db is not None: # TODO secure the db with a context 'with' @@ -523,7 +525,7 @@ def test_systematic(self): dbc = db.cursor() dbc.execute('CREATE TABLE IF NOT EXISTS pyvkfft_test (epoch int, hostname int,' 'backend text, language text, transform text, axes text, array_shape text,' - 'ndims int, ndim int, precision int, inplace int, norm int, lut int,' + 'ndims int, ndim int, precision int, inplace int, norm int, lut int, fstride int,' 'n int, n2_fft float, n2_ifft float, ni_fft float, ni_ifft float, tolerance float,' 'dt_app float, dt_fft float, dt_ifft float, src_unchanged_fft int, src_unchanged_ifft int,' 'gpu_name text, success int, error int, vkfft_error_code int)') @@ -563,7 +565,7 @@ def test_systematic(self): # as e.g. "float32" instead of "" with self.subTest(backend=backend, shape=sh, ndim=ndim, dtype=np.dtype(self.dtype), norm=self.norm, use_lut=self.lut, - inplace=self.inplace, r2c=self.r2c, dct=self.dct): + inplace=self.inplace, r2c=self.r2c, dct=self.dct, fstride=self.fstride): if self.serial: res = test_accuracy_kwargs(v) else: @@ -600,12 +602,12 @@ def test_systematic(self): succ = False if self.db is not None: dbc.execute('INSERT INTO pyvkfft_test VALUES (?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,' - '?,?,?,?,?,?,?,?,?,?,?,?,?)', + '?,?,?,?,?,?,?,?,?,?,?,?,?,?)', (time.time(), hostname, backend, lang, transform, str(res['axes']).encode('ascii'), str(res['shape']).encode('ascii'), len(res['shape']), ndim, np.dtype(self.dtype).itemsize, - self.inplace, self.norm, self.lut, int(max(res['shape'])), float(n2), - float(n2i), + self.inplace, self.norm, self.lut, self.fstride, + int(max(res['shape'])), float(n2), float(n2i), float(ni), float(nii), float(tol), res["dt_app"], res["dt_fft"], res["dt_ifft"], int(src1), int(src2), res["gpu_name"].encode('ascii'), int(succ), 0, 0)) From fe7bafad7ffe5438f3a7a2c001acc049f350e1a9 Mon Sep 17 00:00:00 2001 From: Vincent Favre-Nicolin Date: Sat, 26 Nov 2022 15:10:02 +0100 Subject: [PATCH 06/10] Also display the platforma and device name along vkfft error check --- pyvkfft/opencl.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pyvkfft/opencl.py b/pyvkfft/opencl.py index fc1193e..1734d73 100644 --- a/pyvkfft/opencl.py +++ b/pyvkfft/opencl.py @@ -114,7 +114,8 @@ def __init__(self, shape, dtype: type, queue: cl.CommandQueue, ndim=None, inplac raise RuntimeError("Error creating VkFFTConfiguration. Was the OpenCL context properly initialised ?") res = ctypes.c_int(0) self.app = _vkfft_opencl.init_app(self.config, queue.int_ptr, ctypes.byref(res)) - check_vkfft_result(res, shape, dtype, ndim, inplace, norm, r2c, dct, axes, "opencl") + check_vkfft_result(res, shape, dtype, ndim, inplace, norm, r2c, dct, axes, "opencl:%s:%s" % + (queue.device.platform.name, queue.device.name)) if self.app is None: raise RuntimeError("Error creating VkFFTApplication. Was the OpenCL context properly initialised ?") From 66fe9443fa101e6073a98c9bb0b2097ee86ceb1f Mon Sep 17 00:00:00 2001 From: Vincent Favre-Nicolin Date: Sat, 26 Nov 2022 15:11:06 +0100 Subject: [PATCH 07/10] Check also for OSError in fftp.py. Should fix #21 --- pyvkfft/fft.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pyvkfft/fft.py b/pyvkfft/fft.py index 3358dbd..b882766 100644 --- a/pyvkfft/fft.py +++ b/pyvkfft/fft.py @@ -21,14 +21,14 @@ import pycuda.gpuarray as cua if has_cupy: import cupy as cp -except ImportError: +except (ImportError, OSError): has_cupy, has_pycuda = False, False try: from .opencl import VkFFTApp as VkFFTApp_cl, cla, vkfft_version has_opencl = True -except ImportError: +except (ImportError, OSError): has_opencl = False From 3efa26ab67b35fbfe4a8ab95aa3497c8080b7848 Mon Sep 17 00:00:00 2001 From: Vincent Favre-Nicolin Date: Thu, 5 Jan 2023 14:44:29 +0100 Subject: [PATCH 08/10] Fix tests using F-strided arrays. Allow to select the backend also for the basic test, and add the option to select the opencl platform. --- pyvkfft/accuracy.py | 31 +++++++----- pyvkfft/scripts/pyvkfft_test.py | 13 +++++ pyvkfft/test/test_fft.py | 84 ++++++++++++++------------------- 3 files changed, 68 insertions(+), 60 deletions(-) diff --git a/pyvkfft/accuracy.py b/pyvkfft/accuracy.py index b487f00..483b702 100644 --- a/pyvkfft/accuracy.py +++ b/pyvkfft/accuracy.py @@ -66,7 +66,7 @@ gpu_ctx_dic = {} -def init_ctx(backend, gpu_name=None, verbose=False): +def init_ctx(backend, gpu_name=None, opencl_platform=None, verbose=False): if backend in gpu_ctx_dic: return if backend == "pycuda": @@ -96,6 +96,9 @@ def init_ctx(backend, gpu_name=None, verbose=False): for p in cl.get_platforms(): if d is not None: break + if opencl_platform is not None: + if opencl_platform.lower() not in p.name.lower(): + continue for d0 in p.get_devices(): if d0.type & cl.device_type.GPU: if gpu_name is not None: @@ -106,10 +109,8 @@ def init_ctx(backend, gpu_name=None, verbose=False): if d is not None: break if d is None: - if gpu_name is not None: - raise RuntimeError("Selected backend is pyopencl, but no device found (name=%s)" % gpu_name) - else: - raise RuntimeError("Selected backend is pyopencl, but no device found") + raise RuntimeError("Selected backend is pyopencl, but no device found (name=%s, platform=%s)" % + (gpu_name, opencl_platform)) cl_ctx = cl.Context([d]) cq = cl.CommandQueue(cl_ctx) gpu_ctx_dic["pyopencl"] = d, cl_ctx, cq, 'cl_khr_fp64' in cq.device.extensions @@ -152,9 +153,9 @@ def li(a, b): return abs(a - b).max() / abs(a).max() -def test_accuracy(backend, shape, ndim, axes, dtype, inplace, norm, use_lut, r2c=False, dct=False, fstride=False, - gpu_name=None, stream=None, queue=None, return_array=False, init_array=None, verbose=False, - colour_output=False, ref_long_double=True, order='C'): +def test_accuracy(backend, shape, ndim, axes, dtype, inplace, norm, use_lut, r2c=False, dct=False, + gpu_name=None, opencl_platform=None, stream=None, queue=None, return_array=False, + init_array=None, verbose=False, colour_output=False, ref_long_double=True, order='C'): """ Measure the :param backend: either 'pyopencl', 'pycuda' or 'cupy' @@ -175,6 +176,8 @@ def test_accuracy(backend, shape, ndim, axes, dtype, inplace, norm, use_lut, r2c tested (native scipy normalisation). :param gpu_name: the name of the gpu to use. If None, the first available for the backend will be used. + :param opencl_platform: the name of the OpenCL platform to use. If None, the first available + will be used. :param stream: the cuda stream to use, or None :param queue: the opencl queue to use (mandatory for the 'pyopencl' backend) :param return_array: if True, will return the generated random array so it can be @@ -216,7 +219,7 @@ def test_accuracy(backend, shape, ndim, axes, dtype, inplace, norm, use_lut, r2c # N parallel process so memory management must be done manually mempool.free_all_blocks() t0 = timeit.default_timer() - init_ctx(backend, gpu_name=gpu_name, verbose=False) + init_ctx(backend, gpu_name=gpu_name, opencl_platform=opencl_platform, verbose=False) if backend == "pyopencl" and queue is None: queue = gpu_ctx_dic["pyopencl"][2] shape0 = shape @@ -281,14 +284,17 @@ def test_accuracy(backend, shape, ndim, axes, dtype, inplace, norm, use_lut, r2c axes=axes, useLUT=use_lut, inplace=inplace, r2c=r2c, dct=dct, strides=d0.strides) t2 = timeit.default_timer() d_gpu = cla.to_device(queue, d0) + empty_like = cla.empty_like else: if backend == "pycuda": to_gpu = cua.to_gpu + empty_like = cua.empty_like else: to_gpu = cp.array + empty_like = cp.empty_like app = cuVkFFTApp(d0.shape, d0.dtype, ndim=ndim, norm=norm, axes=axes, - useLUT=use_lut, inplace=inplace, r2c=r2c, dct=dct, stream=stream) + useLUT=use_lut, inplace=inplace, r2c=r2c, dct=dct, strides=d0.strides, stream=stream) t2 = timeit.default_timer() d_gpu = to_gpu(d0) @@ -341,7 +347,8 @@ def test_accuracy(backend, shape, ndim, axes, dtype, inplace, norm, use_lut, r2c elif backend == "cupy": d1_gpu = cp.empty(shapec, dtype=dtype, order=order) else: - d1_gpu = d_gpu.copy() + # Note that pycuda's gpuarray.copy (as of 2022.2.2) does not copy strides + d1_gpu = empty_like(d_gpu) if has_scipy and ref_long_double: # Use long double precision @@ -451,7 +458,7 @@ def test_accuracy(backend, shape, ndim, axes, dtype, inplace, norm, use_lut, r2c elif backend == "cupy": d1_gpu = cp.empty(shape, dtype=dtypef, order=order) else: - d1_gpu = d_gpu.copy() + d1_gpu = empty_like(d_gpu) d1_gpu = app.ifft(d_gpu, d1_gpu) if not dct: diff --git a/pyvkfft/scripts/pyvkfft_test.py b/pyvkfft/scripts/pyvkfft_test.py index 514202d..3e6d8bf 100755 --- a/pyvkfft/scripts/pyvkfft_test.py +++ b/pyvkfft/scripts/pyvkfft_test.py @@ -216,6 +216,8 @@ def main(): "which can be displayed by clicking on the type of transform.") parser.add_argument('--gpu', action='store', help="Name (or sub-string) of the GPU to use") + parser.add_argument('--opencl_platform', action='store', + help="Name (or sub-string) of the opencl platform to use (case-insensitive)") parser.add_argument('--mailto', action='store', help="Email address the results will be sent to") parser.add_argument('--mailto_fail', action='store', @@ -365,6 +367,7 @@ def main(): t.dry_run = args.dry_run t.dtype = np.float64 if args.double else np.float32 t.gpu = args.gpu + t.opencl_platform = args.opencl_platform t.graph = args.graph t.inplace = args.inplace t.lut = args.lut @@ -373,6 +376,7 @@ def main(): # t.ndims = args.ndims t.norm = args.norm[0] t.nproc = args.nproc[0] + t.opencl_platform = args.opencl_platform t.r2c = args.r2c t.fstride = args.fstride t.radix = args.radix @@ -407,6 +411,15 @@ def main(): t.colour = args.colour t.gpu = args.gpu t.nproc = args.nproc[0] + t.opencl_platform = args.opencl_platform + t.vbackend = args.backend + # KLUDGE: remove cuda stream test if neither cupy or pycuda are used + test_cuda_stream = False + for b in args.backend: + if 'pycuda' in b.lower(): + test_cuda_stream = True + if not test_cuda_stream: + del t.test_pycuda_streams suite = unittest.defaultTestLoader.loadTestsFromTestCase(t) if t.verbose: res = unittest.TextTestRunner(verbosity=2).run(suite) diff --git a/pyvkfft/test/test_fft.py b/pyvkfft/test/test_fft.py index ac097ad..e353704 100644 --- a/pyvkfft/test/test_fft.py +++ b/pyvkfft/test/test_fft.py @@ -68,6 +68,18 @@ class TestFFT(unittest.TestCase): nproc = 1 verbose = False colour = False + opencl_platform = None + vbackend = None + + def setUp(self) -> None: + if self.vbackend is None: + self.vbackend = [] + if has_pycuda: + self.vbackend.append("pycuda") + if has_cupy: + self.vbackend.append("cupy") + if has_pyopencl: + self.vbackend.append("pyopencl") def test_backend(self): self.assertTrue(has_pycuda or has_pyopencl or has_cupy, @@ -76,17 +88,9 @@ def test_backend(self): @unittest.skipIf(not (has_pycuda or has_cupy or has_pyopencl), "No OpenCL/CUDA backend is available") def test_simple_fft(self): """Test the simple fft API""" - vbackend = [] - if has_pycuda: - vbackend.append("pycuda") - if has_cupy: - vbackend.append("cupy") - if has_pyopencl: - vbackend.append("pyopencl") - - for backend in vbackend: + for backend in self.vbackend: with self.subTest(backend=backend): - init_ctx(backend, gpu_name=self.gpu, verbose=False) + init_ctx(backend, gpu_name=self.gpu, opencl_platform=self.opencl_platform, verbose=False) if backend == "pycuda": dc = cua.to_gpu(ascent().astype(np.complex64)) dr = cua.to_gpu(ascent().astype(np.float32)) @@ -162,7 +166,7 @@ def run_fft(self, vbackend, vn, dims_max=4, ndim_max=3, shuffle_axes=True, ct = 0 vkwargs = [] for backend in vbackend: - init_ctx(backend, gpu_name=self.gpu, verbose=False) + init_ctx(backend, gpu_name=self.gpu, opencl_platform=self.opencl_platform, verbose=False) cq = gpu_ctx_dic["pyopencl"][2] if backend == "pyopencl" else None for n in vn: for dims in range(1, dims_max + 1): @@ -228,13 +232,15 @@ def run_fft(self, vbackend, vn, dims_max=4, ndim_max=3, shuffle_axes=True, r2c=r2c, dct=dct, order=order): ct += 1 if not dry_run: - res = test_accuracy(backend, sh, ndim, axes, dtype, - inplace, - norm, use_lut, r2c=r2c, dct=dct, - gpu_name=self.gpu, - stream=None, queue=cq, - return_array=False, init_array=d0, - verbose=verbose, order=order) + res = \ + test_accuracy(backend, sh, ndim, axes, dtype, + inplace, + norm, use_lut, r2c=r2c, dct=dct, + gpu_name=self.gpu, + opencl_platform=self.opencl_platform, + stream=None, queue=cq, + return_array=False, init_array=d0, + verbose=verbose, order=order) npr = primes(n) ni, n2 = res["ni"], res["n2"] nii, n2i = res["nii"], res["n2i"] @@ -266,7 +272,9 @@ def run_fft(self, vbackend, vn, dims_max=4, ndim_max=3, shuffle_axes=True, "dtype": dtype, "inplace": inplace, "norm": norm, "use_lut": use_lut, "r2c": r2c, "dct": dct, - "gpu_name": self.gpu, "stream": None, + "gpu_name": self.gpu, + "opencl_platform": self.opencl_platform, + "stream": None, "verbose": False, "order": order, "colour_output": self.colour} vkwargs.append(kwargs) @@ -302,15 +310,8 @@ def run_fft_parallel(self, vkwargs): @unittest.skipIf(not (has_pycuda or has_cupy or has_pyopencl), "No OpenCL/CUDA backend is available") def test_c2c(self): """Run C2C tests""" - vbackend = [] - if has_pycuda: - vbackend.append("pycuda") - if has_cupy: - vbackend.append("cupy") - if has_pyopencl: - vbackend.append("pyopencl") - for backend in vbackend: - init_ctx(backend, gpu_name=self.gpu, verbose=False) + for backend in self.vbackend: + init_ctx(backend, gpu_name=self.gpu, opencl_platform=self.opencl_platform, verbose=False) has_cl_fp64 = gpu_ctx_dic["pyopencl"][3] if backend == "pyopencl" else True ct = 0 vkwargs = [] @@ -335,15 +336,8 @@ def test_c2c(self): @unittest.skipIf(not (has_pycuda or has_cupy or has_pyopencl), "No OpenCL/CUDA backend is available") def test_r2c(self): """Run R2C tests""" - vbackend = [] - if has_pycuda: - vbackend.append("pycuda") - if has_cupy: - vbackend.append("cupy") - if has_pyopencl: - vbackend.append("pyopencl") - for backend in vbackend: - init_ctx(backend, gpu_name=self.gpu, verbose=False) + for backend in self.vbackend: + init_ctx(backend, gpu_name=self.gpu, opencl_platform=self.opencl_platform, verbose=False) has_cl_fp64 = gpu_ctx_dic["pyopencl"][3] if backend == "pyopencl" else True ct = 0 vkwargs = [] @@ -369,15 +363,8 @@ def test_r2c(self): @unittest.skipIf(not has_dct_ref, "scipy and pyfftw are not available - cannot test DCT") def test_dct(self): """Run DCT tests""" - vbackend = [] - if has_pycuda: - vbackend.append("pycuda") - if has_cupy: - vbackend.append("cupy") - if has_pyopencl: - vbackend.append("pyopencl") - for backend in vbackend: - init_ctx(backend, gpu_name=self.gpu, verbose=False) + for backend in self.vbackend: + init_ctx(backend, gpu_name=self.gpu, opencl_platform=self.opencl_platform, verbose=False) has_cl_fp64 = gpu_ctx_dic["pyopencl"][3] if backend == "pyopencl" else True ct = 0 vkwargs = [] @@ -403,7 +390,7 @@ def test_pycuda_streams(self): """ for dtype in (np.complex64, np.complex128): with self.subTest(dtype=np.dtype(dtype)): - init_ctx("pycuda", gpu_name=self.gpu, verbose=False) + init_ctx("pycuda", gpu_name=self.gpu, opencl_platform=self.opencl_platform, verbose=False) if dtype == np.complex64: rtol = 1e-6 else: @@ -452,6 +439,7 @@ class TestFFTSystematic(unittest.TestCase): # t.ndims = args.ndims norm = 1 nproc = 1 + opencl_platform = None r2c = False radix = None range = 2, 128 @@ -473,7 +461,7 @@ def setUp(self) -> None: self.vbackend.append("cupy") if has_pyopencl: self.vbackend.append("pyopencl") - init_ctx("pyopencl", gpu_name=self.gpu, verbose=False) + init_ctx("pyopencl", gpu_name=self.gpu, opencl_platform=self.opencl_platform, verbose=False) self.cq, self.has_cl_fp64 = gpu_ctx_dic["pyopencl"][2:] self.assertTrue(not self.bluestein or self.radix is None, "Cannot select both Bluestein and radix") if not self.bluestein and self.radix is None: From b4b195d2c51363ad486bdbb5acf507e3e586157f Mon Sep 17 00:00:00 2001 From: Vincent Favre-Nicolin Date: Thu, 5 Jan 2023 21:05:18 +0100 Subject: [PATCH 09/10] Add support for F-ordered array in the simple fft interface (C2C and R2C). Fox use of opencl_platform in tests --- CHANGELOG.rst | 6 +++- README.rst | 4 ++- pyvkfft/fft.py | 41 ++++++++++++++++----------- pyvkfft/scripts/pyvkfft_test_suite.py | 3 ++ pyvkfft/test/test_fft.py | 3 +- 5 files changed, 38 insertions(+), 19 deletions(-) diff --git a/CHANGELOG.rst b/CHANGELOG.rst index 9e96b51..6dbfad9 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -1,6 +1,10 @@ -Version 2022.2 (2022-XX-XX) +Version 2023.1 (2021-XX-XX) ----------------------------- * Fix R2C tests when using numpy (scipy unavailable) [#19] +* Add support for F-ordered arrays (C2C and R2C) +* Allow selection of backend for non-systemetic pvkfft-test +* Add parameter for the OpenCL platform in pyvkfft-test +* Fix simple fft interface import when only pycuda is used Version 2022.1.1 (2022-02-14) ----------------------------- diff --git a/README.rst b/README.rst index 5a20cd0..fda6eea 100644 --- a/README.rst +++ b/README.rst @@ -79,7 +79,8 @@ Features - array can be have more dimensions than the FFT (batch transforms). - arbitrary array size, using Bluestein algorithm for prime numbers>13 (note that in this case the performance can be significantly lower, up to ~4x, depending on the transform size, - see example performance plot below) + see example performance plot below). Now also uses Rader's FFT algorithm for primes from + 17 up to max shared memory length (~10000, see VkFFT's doc for details) - transform along a given list of axes - this requires that after collapsing non-transformed axes, the last transformed axis is at most along the 3rd dimension, e.g. the following axes are allowed: (-2,-3), (-1,-3), (-1,-4), (-4,-5),... @@ -87,6 +88,7 @@ Features This is not allowed for R2C transforms. - normalisation=0 (array L2 norm * array size on each transform) and 1 (the backward transform divides the L2 norm by the array size, so FFT*iFFT restores the original array) +- Support for C (default) and F-ordered arrays, for C2C and R2C transforms - unit tests for all transforms: see test sub-directory. Note that these take a **long** time to finish due to the exhaustive number of sub-tests. - Note that out-of-place C2R transform currently destroys the complex array for FFT dimensions >=2 diff --git a/pyvkfft/fft.py b/pyvkfft/fft.py index b882766..8f193d0 100644 --- a/pyvkfft/fft.py +++ b/pyvkfft/fft.py @@ -51,13 +51,18 @@ def _prepare_transform(src, dest, cl_queue, r2c=False): :param r2c: if True, this is for an R2C transform, so adapt the destination array accordingly. :return: a tuple (backend, inplace, dest, cl_queue), also appending the - destination dtype for an r2c transform. + destination dtype for a r2c transform. """ backend = Backend.UNKNOWN + fastidx = np.argmin(src.strides) # fast axis is the last only for C-ordered arrays + if fastidx == src.ndim - 1: + order = 'C' + else: + order = 'F' if r2c: if src.dtype in [np.float16, np.float32, np.float64]: sh = list(src.shape) - sh[-1] = sh[-1] // 2 + 1 + sh[fastidx] = sh[fastidx] // 2 + 1 dtype = np.complex64 if src.dtype == np.float16: dtype = complex32 @@ -65,7 +70,7 @@ def _prepare_transform(src, dest, cl_queue, r2c=False): dtype = np.complex128 else: sh = list(src.shape) - sh[-1] = (sh[-1] - 1) * 2 + sh[fastidx] = (sh[fastidx] - 1) * 2 dtype = np.float32 if src.dtype == complex32: dtype = np.float16 @@ -82,7 +87,7 @@ def _prepare_transform(src, dest, cl_queue, r2c=False): src_ptr = int(src.gpudata) if dest is None: if r2c: - dest = cua.empty(tuple(sh), dtype=dtype, allocator=src.allocator) + dest = cua.empty(tuple(sh), dtype=dtype, allocator=src.allocator, order=order) else: dest = cua.empty_like(src) dest_ptr = int(dest.gpudata) @@ -93,7 +98,7 @@ def _prepare_transform(src, dest, cl_queue, r2c=False): src_ptr = src.data.int_ptr if dest is None: if r2c: - dest = cla.empty(src.queue, tuple(sh), dtype=dtype, allocator=src.allocator) + dest = cla.empty(src.queue, tuple(sh), dtype=dtype, allocator=src.allocator, order=order) else: dest = cla.empty_like(src) dest_ptr = dest.data.int_ptr @@ -106,7 +111,7 @@ def _prepare_transform(src, dest, cl_queue, r2c=False): src_ptr = src.__cuda_array_interface__['data'][0] if dest is None: if r2c: - dest = cp.empty(tuple(sh), dtype=dtype) + dest = cp.empty(tuple(sh), dtype=dtype, order=order) else: dest = cp.empty_like(src) dest_ptr = dest.__cuda_array_interface__['data'][0] @@ -127,23 +132,23 @@ def _prepare_transform(src, dest, cl_queue, r2c=False): @lru_cache(maxsize=FFT_CACHE_NB) -def _get_fft_app(backend, shape, dtype, inplace, ndim, axes, norm, cuda_stream, cl_queue): +def _get_fft_app(backend, shape, dtype, inplace, ndim, axes, norm, cuda_stream, cl_queue, strides=None): if backend in [Backend.PYCUDA, Backend.CUPY]: return VkFFTApp_cuda(shape, dtype, ndim=ndim, inplace=inplace, - stream=cuda_stream, norm=norm, axes=axes) + stream=cuda_stream, norm=norm, axes=axes, strides=strides) elif backend == Backend.PYOPENCL: return VkFFTApp_cl(shape, dtype, cl_queue, ndim=ndim, inplace=inplace, - norm=norm, axes=axes) + norm=norm, axes=axes, strides=strides) @lru_cache(maxsize=FFT_CACHE_NB) -def _get_rfft_app(backend, shape, dtype, inplace, ndim, norm, cuda_stream, cl_queue): +def _get_rfft_app(backend, shape, dtype, inplace, ndim, norm, cuda_stream, cl_queue, strides=None): if backend in [Backend.PYCUDA, Backend.CUPY]: return VkFFTApp_cuda(shape, dtype, ndim=ndim, inplace=inplace, - stream=cuda_stream, norm=norm, r2c=True) + stream=cuda_stream, norm=norm, r2c=True, strides=strides) elif backend == Backend.PYOPENCL: return VkFFTApp_cl(shape, dtype, cl_queue, ndim=ndim, inplace=inplace, - norm=norm, r2c=True) + norm=norm, r2c=True, strides=strides) @lru_cache(maxsize=FFT_CACHE_NB) @@ -192,7 +197,8 @@ def fftn(src, dest=None, ndim=None, norm=1, axes=None, cuda_stream=None, cl_queu :return: the destination array if return_scale is False, or (dest, scale) """ backend, inplace, dest, cl_queue = _prepare_transform(src, dest, cl_queue, False) - app = _get_fft_app(backend, src.shape, src.dtype, inplace, ndim, axes, norm, cuda_stream, cl_queue) + app = _get_fft_app(backend, src.shape, src.dtype, inplace, ndim, axes, norm, cuda_stream, cl_queue, + strides=src.strides) app.fft(src, dest) if return_scale: s = app.get_fft_scale() @@ -235,7 +241,8 @@ def ifftn(src, dest=None, ndim=None, norm=1, axes=None, cuda_stream=None, cl_que :return: the destination array if return_scale is False, or (dest, scale) """ backend, inplace, dest, cl_queue = _prepare_transform(src, dest, cl_queue, False) - app = _get_fft_app(backend, src.shape, src.dtype, inplace, ndim, axes, norm, cuda_stream, cl_queue) + app = _get_fft_app(backend, src.shape, src.dtype, inplace, ndim, axes, norm, cuda_stream, cl_queue, + strides=src.strides) app.ifft(src, dest) if return_scale: s = app.get_fft_scale() @@ -282,7 +289,8 @@ def rfftn(src, dest=None, ndim=None, norm=1, cuda_stream=None, cl_queue=None, with the appropriate type. """ backend, inplace, dest, cl_queue, dtype = _prepare_transform(src, dest, cl_queue, True) - app = _get_rfft_app(backend, src.shape, src.dtype, inplace, ndim, norm, cuda_stream, cl_queue) + app = _get_rfft_app(backend, src.shape, src.dtype, inplace, ndim, norm, cuda_stream, cl_queue, + strides=src.strides) app.fft(src, dest) if return_scale: s = app.get_fft_scale() @@ -329,7 +337,8 @@ def irfftn(src, dest=None, ndim=None, norm=1, cuda_stream=None, cl_queue=None, with the appropriate type. """ backend, inplace, dest, cl_queue, dtype = _prepare_transform(src, dest, cl_queue, True) - app = _get_rfft_app(backend, dest.shape, dest.dtype, inplace, ndim, norm, cuda_stream, cl_queue) + app = _get_rfft_app(backend, dest.shape, dest.dtype, inplace, ndim, norm, cuda_stream, cl_queue, + strides=dest.strides) app.ifft(src, dest) if return_scale: s = app.get_fft_scale() diff --git a/pyvkfft/scripts/pyvkfft_test_suite.py b/pyvkfft/scripts/pyvkfft_test_suite.py index 71908ee..ac836ff 100644 --- a/pyvkfft/scripts/pyvkfft_test_suite.py +++ b/pyvkfft/scripts/pyvkfft_test_suite.py @@ -17,9 +17,12 @@ def main(): gpu_gb = 32 dry_run = False backend = 'cupy' + opencl_platform = None # Basic test com = "pyvkfft-test --nproc %d --html --range-mb 0 4100" % nproc0 + if opencl_platform is not None: + com += ' --opencl_platform ' + opencl_platform if dry_run: print(com) else: diff --git a/pyvkfft/test/test_fft.py b/pyvkfft/test/test_fft.py index e353704..e41675d 100644 --- a/pyvkfft/test/test_fft.py +++ b/pyvkfft/test/test_fft.py @@ -503,7 +503,8 @@ def test_systematic(self): for s in self.vshape: kwargs = {"backend": backend, "shape": s, "ndim": len(s), "axes": self.axes, "dtype": self.dtype, "inplace": self.inplace, "norm": self.norm, "use_lut": self.lut, - "r2c": self.r2c, "dct": self.dct, "gpu_name": self.gpu, "stream": None, "verbose": False, + "r2c": self.r2c, "dct": self.dct, "gpu_name": self.gpu, + "opencl_platform": self.opencl_platform, "stream": None, "verbose": False, "colour_output": self.colour, "ref_long_double": self.ref_long_double, "order": 'F' if self.fstride else 'C'} vkwargs.append(kwargs) From c0d333b261d4e74d81fc0efacd4616d478e6b1a0 Mon Sep 17 00:00:00 2001 From: Vincent Favre-Nicolin Date: Thu, 5 Jan 2023 21:24:15 +0100 Subject: [PATCH 10/10] Update notebooks --- examples/pyvkfft-tests-CUDA.ipynb | 318 +++++++++++++++++++++++++++- examples/pyvkfft-tests-OpenCL.ipynb | 208 +++++++++++++++++- 2 files changed, 517 insertions(+), 9 deletions(-) diff --git a/examples/pyvkfft-tests-CUDA.ipynb b/examples/pyvkfft-tests-CUDA.ipynb index e290be7..5a34f77 100644 --- a/examples/pyvkfft-tests-CUDA.ipynb +++ b/examples/pyvkfft-tests-CUDA.ipynb @@ -68,9 +68,14 @@ "\n", "import pyvkfft.cuda\n", "from pyvkfft.cuda import VkFFTApp\n", + "from pyvkfft.fft import fftn as vkfftn, ifftn as vkifftn, rfftn as vkrfftn, irfftn as vkirfftn\n", + "from pyvkfft.accuracy import l2, li\n", "import matplotlib.pyplot as plt\n", "from matplotlib.colors import LogNorm\n", - "from scipy.misc import ascent\n", + "try:\n", + " from scipy.datasets import ascent\n", + "except:\n", + " from scipy.misc import ascent\n", "import numpy as np\n", "from scipy.fft import fftn, ifftn, fftshift, rfftn, irfftn, dctn, idctn\n", "import timeit\n" @@ -489,6 +494,311 @@ " print()" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Complex<->Complex, inplace, F-ordering" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "d0 = np.asarray(ascent().astype(np.complex64), order='F')\n", + "for axes in [None, (-1,), (-2,)]:\n", + " d = to_gpu(d0)\n", + " app = VkFFTApp(d.shape, d.dtype, strides=d.strides, axes=axes)\n", + "\n", + " plt.figure(figsize=(15, 5))\n", + " plt.subplot(121)\n", + " plt.imshow(np.fft.fftshift(abs(np.fft.fftn(d0, axes=axes))), norm=LogNorm())\n", + " plt.colorbar()\n", + "\n", + " d = app.fft(d)\n", + "\n", + " plt.subplot(122)\n", + " plt.imshow(np.fft.fftshift(abs(d.get())), norm=LogNorm())\n", + " plt.colorbar()\n", + "\n", + " d = app.ifft(d)\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Complex<->Complex, out-of-place, F-ordering" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "d0 = np.asarray(ascent().astype(np.complex64), order='F')\n", + "axes = (-1,)\n", + "d1 = to_gpu(d0)\n", + "d2 = gpu_empty_like(d1)\n", + "app = VkFFTApp(d1.shape, d1.dtype, strides=d1.strides, axes=axes, inplace=False)\n", + "\n", + "plt.figure(figsize=(15, 5))\n", + "plt.subplot(121)\n", + "plt.imshow(np.fft.fftshift(abs(np.fft.fftn(d0, axes=axes))), norm=LogNorm())\n", + "plt.colorbar()\n", + "\n", + "print((abs(d.get()) ** 2).sum())\n", + "d2 = app.fft(d1, d2)\n", + "\n", + "plt.subplot(122)\n", + "plt.imshow(np.fft.fftshift(abs(d2.get())), norm=LogNorm())\n", + "plt.colorbar()\n", + "\n", + "d1 = app.ifft(d2, d1)\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Complex<->Complex, inplace, F-ordering, simple FFT interface" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "order = 'F'\n", + "for axes in [None, (-1,), (-2,)]:\n", + " d0 = np.asarray(ascent().astype(np.complex64), order=order)\n", + " d = to_gpu(d0)\n", + "\n", + " plt.figure(figsize=(15, 5))\n", + " plt.subplot(121)\n", + " dn = np.fft.fftshift(abs(np.fft.fftn(d0, axes=axes)))\n", + " plt.imshow(dn, norm=LogNorm())\n", + " plt.colorbar()\n", + "\n", + " d = vkfftn(d, d, axes=axes)\n", + "\n", + " plt.subplot(122)\n", + " plt.imshow(np.fft.fftshift(abs(d.get())), norm=LogNorm())\n", + " plt.colorbar()\n", + "\n", + " d = vkifftn(d, d, axes=axes)\n", + " \n", + " # Test accuracy on random array\n", + " shape, dtype = d0.shape, d0.dtype\n", + " d0 = (np.random.uniform(-0.5, 0.5, shape) + 1j * np.random.uniform(-0.5, 0.5, shape)).astype(dtype)\n", + " d0 = np.asarray(d0, order=order)\n", + " d = to_gpu(d0)\n", + " d = vkfftn(d, d, axes=axes)\n", + " dn = np.fft.fftn(d0, axes=axes)\n", + " print(l2(dn, d.get()), li(dn, d.get()))\n", + " d = to_gpu(d0)\n", + " d = vkifftn(d, d, axes=axes)\n", + " dn = np.fft.ifftn(d0, axes=axes)\n", + " print(l2(dn, d.get()), li(dn, d.get()))\n", + " print()\n", + "\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Complex<->Complex, out-of-place, F-ordering, simple FFT interface\n", + "The destination array is automatically allocated" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "order = 'F'\n", + "for axes in [None, (-1,), (-2,)]:\n", + " d0 = np.asarray(ascent().astype(np.complex64), order=order)\n", + " d = to_gpu(d0)\n", + "\n", + " plt.figure(figsize=(15, 5))\n", + " plt.subplot(121)\n", + " dn = np.fft.fftshift(abs(np.fft.fftn(d0, axes=axes)))\n", + " plt.imshow(dn, norm=LogNorm())\n", + " plt.colorbar()\n", + "\n", + " d = vkfftn(d, axes=axes)\n", + "\n", + " plt.subplot(122)\n", + " plt.imshow(np.fft.fftshift(abs(d.get())), norm=LogNorm())\n", + " plt.colorbar()\n", + "\n", + " d = vkifftn(d, axes=axes)\n", + " \n", + " # Test accuracy on random array\n", + " shape, dtype = d0.shape, d0.dtype\n", + " d0 = (np.random.uniform(-0.5, 0.5, shape) + 1j * np.random.uniform(-0.5, 0.5, shape)).astype(dtype)\n", + " d0 = np.asarray(d0, order=order)\n", + " d = to_gpu(d0)\n", + " d = vkfftn(d, axes=axes)\n", + " dn = np.fft.fftn(d0, axes=axes)\n", + " print(l2(dn, d.get()), li(dn, d.get()))\n", + " d = to_gpu(d0)\n", + " d = vkifftn(d, axes=axes)\n", + " dn = np.fft.ifftn(d0, axes=axes)\n", + " print(l2(dn, d.get()), li(dn, d.get()))\n", + " print()\n", + "\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Complex<->Complex, out-of-place pre-allocated, F-ordering, simple FFT interface\n", + "The destination array is pre-allocated here" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "order = 'F'\n", + "for axes in [None, (-1,), (-2,)]:\n", + " d0 = np.asarray(ascent().astype(np.complex64), order=order)\n", + " d = to_gpu(d0)\n", + " dest = gpu_empty_like(d)\n", + "\n", + " plt.figure(figsize=(15, 5))\n", + " plt.subplot(121)\n", + " dn = np.fft.fftshift(abs(np.fft.fftn(d0, axes=axes)))\n", + " plt.imshow(dn, norm=LogNorm())\n", + " plt.colorbar()\n", + "\n", + " dest = vkfftn(d, dest, axes=axes)\n", + "\n", + " plt.subplot(122)\n", + " plt.imshow(np.fft.fftshift(abs(dest.get())), norm=LogNorm())\n", + " plt.colorbar()\n", + "\n", + " dest = vkifftn(d, axes=axes)\n", + " \n", + " # Test accuracy on random array\n", + " shape, dtype = d0.shape, d0.dtype\n", + " d0 = (np.random.uniform(-0.5, 0.5, shape) + 1j * np.random.uniform(-0.5, 0.5, shape)).astype(dtype)\n", + " d0 = np.asarray(d0, order=order)\n", + " \n", + " d = to_gpu(d0)\n", + " dest = vkfftn(d, dest, axes=axes)\n", + " dn = np.fft.fftn(d0, axes=axes)\n", + " print(l2(dn, dest.get()), li(dn, dest.get()))\n", + " \n", + " d = to_gpu(d0)\n", + " dest = vkifftn(d, dest, axes=axes)\n", + " dn = np.fft.ifftn(d0, axes=axes)\n", + " print(l2(dn, dest.get()), li(dn, dest.get()))\n", + " print()\n", + "\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Real<->Complex (half-Hermitian) inplace transform, F-ordered, simple fft interface" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# single precision, R2C inplace, # F order\n", + "order = 'F'\n", + "d0 = np.zeros((514, 512), dtype=np.float32)\n", + "d0[:512] = ascent()\n", + "d0 = np.asarray(d0, order=order)\n", + "\n", + "d = to_gpu(d0)\n", + "\n", + "d = vkrfftn(d, d)\n", + "\n", + "dn = rfftn(d0[:-2], axes=(-1,-2)) # Need to force the Hermitian axis as the fast one\n", + "print(dn.shape)\n", + "print(np.allclose(d.get(), dn, rtol=1e-6, atol=dn.max() * 1e-6))\n", + "\n", + "plt.figure(figsize=(15,5), dpi=100)\n", + "plt.subplot(121)\n", + "plt.imshow(fftshift(abs(d.get()), axes=0), norm=LogNorm(vmin=100))\n", + "plt.colorbar()\n", + "plt.title(\"R->C\")\n", + "plt.subplot(122)\n", + "plt.imshow(fftshift(abs(dn), axes=0), norm=LogNorm(vmin=100))\n", + "plt.colorbar()\n", + "plt.title(\"R->C (numpy)\")\n", + "\n", + "d = vkirfftn(d, d)\n", + "print(np.allclose(d.get()[:-2], d0[:-2], rtol=1e-6, atol = 255e-6))\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "d.strides , dn.strides" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Real<->Complex (half-Hermitian) out-of-place, F-ordered, simple fft interface" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# single precision, R2C inplace, # F order\n", + "order = 'F'\n", + "d0 = ascent().astype(np.float32)\n", + "d0 = np.asarray(d0, order=order)\n", + "\n", + "d = to_gpu(d0)\n", + "\n", + "d = vkrfftn(d)\n", + "\n", + "dn = rfftn(d0, axes=(-1,-2)) # Need to force the Hermitian axis as the fast one\n", + "print(dn.shape, d.shape)\n", + "print(np.allclose(d.get(), dn, rtol=1e-6, atol=dn.max() * 1e-6))\n", + "\n", + "plt.figure(figsize=(15,5), dpi=100)\n", + "plt.subplot(121)\n", + "plt.imshow(fftshift(abs(d.get()), axes=0), norm=LogNorm(vmin=100))\n", + "plt.colorbar()\n", + "plt.title(\"R->C\")\n", + "plt.subplot(122)\n", + "plt.imshow(fftshift(abs(dn), axes=0), norm=LogNorm(vmin=100))\n", + "plt.colorbar()\n", + "plt.title(\"R->C (numpy)\")\n", + "\n", + "d = vkirfftn(d)\n", + "print(np.allclose(d.get(), d0, rtol=1e-6, atol = 255e-6))\n" + ] + }, { "cell_type": "code", "execution_count": null, @@ -499,9 +809,9 @@ ], "metadata": { "kernelspec": { - "display_name": "Python 3 (ipykernel)", + "display_name": "py39-pyvkfft-test-conda", "language": "python", - "name": "python3" + "name": "py39-pyvkfft-test-conda" }, "language_info": { "codemirror_mode": { @@ -513,7 +823,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.8.6" + "version": "3.9.15" } }, "nbformat": 4, diff --git a/examples/pyvkfft-tests-OpenCL.ipynb b/examples/pyvkfft-tests-OpenCL.ipynb index 7f2c46a..719b4ed 100644 --- a/examples/pyvkfft-tests-OpenCL.ipynb +++ b/examples/pyvkfft-tests-OpenCL.ipynb @@ -36,10 +36,15 @@ "import pyopencl as cl\n", "import pyopencl.array as cla\n", "import pyvkfft.opencl\n", + "from pyvkfft.fft import fftn as vkfftn, ifftn as vkifftn, rfftn as vkrfftn, irfftn as vkirfftn\n", "from pyvkfft.opencl import VkFFTApp\n", + "from pyvkfft.accuracy import l2, li\n", "import matplotlib.pyplot as plt\n", "from matplotlib.colors import LogNorm\n", - "from scipy.misc import ascent\n", + "try:\n", + " from scipy.datasets import ascent\n", + "except:\n", + " from scipy.misc import ascent\n", "import numpy as np\n", "from scipy.fft import fftn, ifftn, fftshift, rfftn, irfftn, dctn, idctn\n", "import timeit\n", @@ -480,19 +485,212 @@ " print()" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Complex<->Complex, inplace, F-ordering" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "\n", + "d0 = np.asarray(ascent().astype(np.complex64), order='F')\n", + "for axes in [None, (-1,), (-2,)]:\n", + " d = cla.to_device(cq, d0)\n", + " app = VkFFTApp(d.shape, d.dtype, cq, strides=d.strides, axes=axes)\n", + "\n", + " plt.figure(figsize=(15, 5))\n", + " plt.subplot(121)\n", + " plt.imshow(np.fft.fftshift(abs(np.fft.fftn(d0, axes=axes))), norm=LogNorm())\n", + " plt.colorbar()\n", + "\n", + " d = app.fft(d)\n", + "\n", + " plt.subplot(122)\n", + " plt.imshow(np.fft.fftshift(abs(d.get())), norm=LogNorm())\n", + " plt.colorbar()\n", + "\n", + " d = app.ifft(d)\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Complex<->Complex, out-of-place, F-ordering" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "d0 = np.asarray(ascent().astype(np.complex64), order='F')\n", + "axes = (-1,)\n", + "d1 = cla.to_device(cq, d0)\n", + "d2 = cla.empty_like(d1)\n", + "app = VkFFTApp(d1.shape, d1.dtype, cq, strides=d1.strides, axes=axes, inplace=False)\n", + "\n", + "plt.figure(figsize=(15, 5))\n", + "plt.subplot(121)\n", + "plt.imshow(np.fft.fftshift(abs(np.fft.fftn(d0, axes=axes))), norm=LogNorm())\n", + "plt.colorbar()\n", + "\n", + "print((abs(d.get()) ** 2).sum())\n", + "d2 = app.fft(d1, d2)\n", + "\n", + "plt.subplot(122)\n", + "plt.imshow(np.fft.fftshift(abs(d2.get())), norm=LogNorm())\n", + "plt.colorbar()\n", + "\n", + "d1 = app.ifft(d2, d1)\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Complex<->Complex, inplace, F-ordering, simple FFT interface" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "order = 'F'\n", + "for axes in [None, (-1,), (-2,)]:\n", + " d0 = np.asarray(ascent().astype(np.complex64), order=order)\n", + " d = cla.to_device(cq, d0)\n", + "\n", + " plt.figure(figsize=(15, 5))\n", + " plt.subplot(121)\n", + " dn = np.fft.fftshift(abs(np.fft.fftn(d0, axes=axes)))\n", + " plt.imshow(dn, norm=LogNorm())\n", + " plt.colorbar()\n", + "\n", + " d = vkfftn(d, d, axes=axes)\n", + "\n", + " plt.subplot(122)\n", + " plt.imshow(np.fft.fftshift(abs(d.get())), norm=LogNorm())\n", + " plt.colorbar()\n", + "\n", + " d = vkifftn(d, d, axes=axes)\n", + " \n", + " # Test accuracy on random array\n", + " shape, dtype = d0.shape, d0.dtype\n", + " d0 = (np.random.uniform(-0.5, 0.5, shape) + 1j * np.random.uniform(-0.5, 0.5, shape)).astype(dtype)\n", + " d0 = np.asarray(d0, order=order)\n", + " \n", + " d = cla.to_device(cq, d0)\n", + " d = vkfftn(d, d, axes=axes)\n", + " dn = np.fft.fftn(d0, axes=axes)\n", + " print(l2(dn, d.get()), li(dn, d.get()))\n", + " \n", + " d = cla.to_device(cq, d0)\n", + " d = vkifftn(d, d, axes=axes)\n", + " dn = np.fft.ifftn(d0, axes=axes)\n", + " print(l2(dn, d.get()), li(dn, d.get()))\n", + " print()\n", + "\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Real<->Complex (half-Hermitian) inplace transform, F-ordered, simple fft interface" + ] + }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], - "source": [] + "source": [ + "# single precision, R2C inplace, # F order\n", + "order = 'F'\n", + "d0 = np.zeros((514, 512), dtype=np.float32)\n", + "d0[:512] = ascent()\n", + "d0 = np.asarray(d0, order=order)\n", + "\n", + "d = cla.to_device(cq, d0)\n", + "\n", + "d = vkrfftn(d, d)\n", + "\n", + "dn = rfftn(d0[:-2], axes=(-1,-2)) # Need to force the Hermitian axis as the fast one\n", + "print(dn.shape)\n", + "print(np.allclose(d.get(), dn, rtol=1e-6, atol=dn.max() * 1e-6))\n", + "\n", + "plt.figure(figsize=(15,5), dpi=100)\n", + "plt.subplot(121)\n", + "plt.imshow(fftshift(abs(d.get()), axes=0), norm=LogNorm(vmin=100))\n", + "plt.colorbar()\n", + "plt.title(\"R->C\")\n", + "plt.subplot(122)\n", + "plt.imshow(fftshift(abs(dn), axes=0), norm=LogNorm(vmin=100))\n", + "plt.colorbar()\n", + "plt.title(\"R->C (numpy)\")\n", + "\n", + "d = vkirfftn(d, d)\n", + "print(np.allclose(d.get()[:-2], d0[:-2], rtol=1e-6, atol = 255e-6))\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Real<->Complex (half-Hermitian) out-of-place, F-ordered, simple fft interface" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "scrolled": false + }, + "outputs": [], + "source": [ + "# single precision, R2C inplace, # F order\n", + "order = 'F'\n", + "d0 = ascent().astype(np.float32)\n", + "d0 = np.asarray(d0, order=order)\n", + "\n", + "d = cla.to_device(cq, d0)\n", + "\n", + "d = vkrfftn(d)\n", + "\n", + "dn = rfftn(d0, axes=(-1,-2)) # Need to force the Hermitian axis as the fast one\n", + "print(dn.shape, d.shape)\n", + "print(np.allclose(d.get(), dn, rtol=1e-6, atol=dn.max() * 1e-6))\n", + "\n", + "plt.figure(figsize=(15,5), dpi=100)\n", + "plt.subplot(121)\n", + "plt.imshow(fftshift(abs(d.get()), axes=0), norm=LogNorm(vmin=100))\n", + "plt.colorbar()\n", + "plt.title(\"R->C\")\n", + "plt.subplot(122)\n", + "plt.imshow(fftshift(abs(dn), axes=0), norm=LogNorm(vmin=100))\n", + "plt.colorbar()\n", + "plt.title(\"R->C (numpy)\")\n", + "\n", + "d = vkirfftn(d)\n", + "print(np.allclose(d.get(), d0, rtol=1e-6, atol = 255e-6))\n" + ] } ], "metadata": { "kernelspec": { - "display_name": "Python 3 (ipykernel)", + "display_name": "py39-pyvkfft-test-conda", "language": "python", - "name": "python3" + "name": "py39-pyvkfft-test-conda" }, "language_info": { "codemirror_mode": { @@ -504,7 +702,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.8.6" + "version": "3.9.15" } }, "nbformat": 4,