From 2fcaa4c05eabd8005a2ebd41a4f939a28d3f595f Mon Sep 17 00:00:00 2001
From: Vincent Favre-Nicolin <favre@esrf.fr>
Date: Sat, 22 Oct 2022 20:46:30 +0200
Subject: [PATCH 01/10] During accuracy tests, make sure the array after an R2C
 transform still has the fastest axis along the last dimension (X) - since
 this is not the case after a 2D numpy rfftn transform. Fixes #19

---
 pyvkfft/accuracy.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/pyvkfft/accuracy.py b/pyvkfft/accuracy.py
index 636c9bd..e51f6dd 100644
--- a/pyvkfft/accuracy.py
+++ b/pyvkfft/accuracy.py
@@ -387,7 +387,9 @@ def test_accuracy(backend, shape, ndim, axes, dtype, inplace, norm, use_lut, r2c
             d0n = d0.astype(np.clongdouble)
         else:
             d0n = d0
-
+        if np.argmin(d0.strides) != d0.ndim - 1:
+            # np.fft.rfftn can change the fast axis
+            d0 = d0.copy(order='C')
     if 'opencl' in backend:
         d_gpu = cla.to_device(queue, d0)
     else:

From 4728252223a19725599731fcc528da1902dfd952 Mon Sep 17 00:00:00 2001
From: Vincent Favre-Nicolin <favre@esrf.fr>
Date: Sat, 22 Oct 2022 20:52:26 +0200
Subject: [PATCH 02/10] More elegant re-ordering array during accuracy test,
 when necessary

---
 pyvkfft/accuracy.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pyvkfft/accuracy.py b/pyvkfft/accuracy.py
index e51f6dd..192cb22 100644
--- a/pyvkfft/accuracy.py
+++ b/pyvkfft/accuracy.py
@@ -389,7 +389,7 @@ def test_accuracy(backend, shape, ndim, axes, dtype, inplace, norm, use_lut, r2c
             d0n = d0
         if np.argmin(d0.strides) != d0.ndim - 1:
             # np.fft.rfftn can change the fast axis
-            d0 = d0.copy(order='C')
+            d0 = np.asarray(d0, order='C')
     if 'opencl' in backend:
         d_gpu = cla.to_device(queue, d0)
     else:

From d75c620d9d1fdd91d06454439a7d731f7166de70 Mon Sep 17 00:00:00 2001
From: Vincent Favre-Nicolin <favre@esrf.fr>
Date: Sun, 23 Oct 2022 14:13:46 +0200
Subject: [PATCH 03/10] Update changelog

---
 CHANGELOG.rst | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/CHANGELOG.rst b/CHANGELOG.rst
index 0ceff86..9e96b51 100644
--- a/CHANGELOG.rst
+++ b/CHANGELOG.rst
@@ -1,3 +1,7 @@
+Version 2022.2 (2022-XX-XX)
+-----------------------------
+* Fix R2C tests when using numpy (scipy unavailable) [#19]
+
 Version 2022.1.1 (2022-02-14)
 -----------------------------
 * Correct the dtype of the returned array for fft.rfftn() and fft.irfftn()

From 59f9ebd9a453b8a205d6404372f187bdcd874270 Mon Sep 17 00:00:00 2001
From: Vincent Favre-Nicolin <favre@esrf.fr>
Date: Sun, 23 Oct 2022 14:41:45 +0200
Subject: [PATCH 04/10] Add support for non C-ordered arrays (F-order or
 different types of strides). Supports both C2C and R2C (as long as the fast
 axis is transformed). Adresses #20

---
 pyvkfft/accuracy.py      |  95 ++++++++++++++++++++++++----------
 pyvkfft/base.py          |  28 ++++++++--
 pyvkfft/cuda.py          |   9 ++--
 pyvkfft/opencl.py        |   6 ++-
 pyvkfft/test/test_fft.py | 107 +++++++++++++++++++++++----------------
 5 files changed, 164 insertions(+), 81 deletions(-)

diff --git a/pyvkfft/accuracy.py b/pyvkfft/accuracy.py
index 192cb22..b7bd2e1 100644
--- a/pyvkfft/accuracy.py
+++ b/pyvkfft/accuracy.py
@@ -154,12 +154,12 @@ def li(a, b):
 
 def test_accuracy(backend, shape, ndim, axes, dtype, inplace, norm, use_lut, r2c=False, dct=False,
                   gpu_name=None, stream=None, queue=None, return_array=False, init_array=None, verbose=False,
-                  colour_output=False, ref_long_double=True):
+                  colour_output=False, ref_long_double=True, order='C'):
     """
     Measure the
     :param backend: either 'pyopencl', 'pycuda' or 'cupy'
     :param shape: the shape of the array to test. If this is an inplace r2c, the
-        x-axis length must be even, and two extra values will be appended along x,
+        fast-axis length must be even, and two extra values will be appended along x,
         so the actual transform shape is the one supplied
     :param ndim: the number of FFT dimensions. Can be None if axes is given
     :param axes: the transform axes. Supersedes ndim
@@ -187,6 +187,10 @@ def test_accuracy(backend, shape, ndim, axes, dtype, inplace, norm, use_lut, r2c
     :param colour_output: if True, use some colour to tag the quality of the accuracy
     :param ref_long_double: if True and scipy is available, long double precision
         will be used for the reference transform. Otherwise, this is ignored.
+    :param order: either 'C' (default C-contiguous) or 'F' to test a different
+        stride. Note that for the latter, a 3D transform on a 4D array will not
+        be supported as the last transform axis would be on the 4th dimension
+        (once ordered by stride).
     :return: a dictionary with (l2_fft, li_fft, l2_ifft, li_ifft, tol, dt_array,
         dt_app, dt_fft, dt_ifft, src_unchanged_fft, src_unchanged_ifft, tol_test, str),
         with the L2 and Linf normalised norms comparing pyvkfft's result with either
@@ -204,6 +208,7 @@ def test_accuracy(backend, shape, ndim, axes, dtype, inplace, norm, use_lut, r2c
         All input parameters are also returned as key/values, except stream, queue,
         return_array, ini_array and verbose.
     """
+    ndims = len(shape)
     if backend == "cupy" and has_cupy:
         mempool = cp.get_default_memory_pool()
         if mempool is not None:  # Is that test necessary ?
@@ -231,10 +236,16 @@ def test_accuracy(backend, shape, ndim, axes, dtype, inplace, norm, use_lut, r2c
             # Add two extra columns in the source array
             # so the transform has the desired shape
             shape = list(shape)
-            shape[-1] += 2
+            if order == 'C':
+                shape[-1] += 2
+            else:
+                shape[0] += 2
         else:
             shapec = list(shape)
-            shapec[-1] = shapec[-1] // 2 + 1
+            if order == 'C':
+                shapec[-1] = shapec[-1] // 2 + 1
+            else:
+                shapec[0] = shapec[-1] // 2 + 1
             shapec = tuple(shapec)
     else:
         shapec = tuple(shape)
@@ -244,7 +255,10 @@ def test_accuracy(backend, shape, ndim, axes, dtype, inplace, norm, use_lut, r2c
         if r2c:
             if inplace:
                 d0 = np.empty(shape, dtype=dtypef)
-                d0[..., :-2] = init_array
+                if order == 'C':
+                    d0[..., :-2] = init_array
+                else:
+                    d0[:-2] = init_array
             else:
                 d0 = init_array.astype(dtypef)
         elif dct:
@@ -257,11 +271,14 @@ def test_accuracy(backend, shape, ndim, axes, dtype, inplace, norm, use_lut, r2c
         else:
             d0 = (np.random.uniform(-0.5, 0.5, shape) + 1j * np.random.uniform(-0.5, 0.5, shape)).astype(dtype)
 
+    if order != 'C':
+        d0 = np.asarray(d0, order=order)
+
     t1 = timeit.default_timer()
 
     if 'opencl' in backend:
         app = clVkFFTApp(d0.shape, d0.dtype, queue, ndim=ndim, norm=norm,
-                         axes=axes, useLUT=use_lut, inplace=inplace, r2c=r2c, dct=dct)
+                         axes=axes, useLUT=use_lut, inplace=inplace, r2c=r2c, dct=dct, strides=d0.strides)
         t2 = timeit.default_timer()
         d_gpu = cla.to_device(queue, d0)
     else:
@@ -276,13 +293,30 @@ def test_accuracy(backend, shape, ndim, axes, dtype, inplace, norm, use_lut, r2c
         d_gpu = to_gpu(d0)
 
     if axes is None:
-        axes_numpy = list(range(len(shape)))[-ndim:]
+        axes_numpy = list(range(ndims))[-ndim:]
     else:
-        axes_numpy = axes
+        # Make sure axes indices are >0
+        axes_numpy = [ax if ax >= 0 else ndims + ax for ax in axes]
+
+    # Need the fast axis for R2C (last for 'C' order, first for 'F')
+    fast_axis = np.argmin(d0.strides)
+
+    if r2c:
+        if fast_axis not in axes_numpy:
+            raise RuntimeError("The fast axis must be transformed for R2C")
+
+        # For R2C, we need the same fast axis as on the GPU, or the
+        # half-hermitian result won't look the same
+        if fast_axis != axes_numpy[-1]:
+            axes_numpy.remove(fast_axis)
+            axes_numpy.append(fast_axis)
+        # if order != 'C':
+        #     print("R2C", ndims, ndim, shape, axes, axes_numpy, fast_axis, inplace)
+
     # base FFT scale for numpy (not used for DCT)
     s = np.sqrt(np.prod([d0.shape[i] for i in axes_numpy]))
     if r2c and inplace:
-        s = np.sqrt(s ** 2 / d0.shape[-1] * (d0.shape[-1] - 2))
+        s = np.sqrt(s ** 2 / d0.shape[fast_axis] * (d0.shape[fast_axis] - 2))
 
     # Tolerance estimated from accuracy notebook
     if dtype in (np.complex64, np.float32):
@@ -301,11 +335,11 @@ def test_accuracy(backend, shape, ndim, axes, dtype, inplace, norm, use_lut, r2c
     else:
         if r2c:
             if backend == "pyopencl":
-                d1_gpu = cla.empty(queue, shapec, dtype=dtype)
+                d1_gpu = cla.empty(queue, shapec, dtype=dtype, order=order)
             elif backend == "pycuda":
-                d1_gpu = cua.empty(shapec, dtype=dtype)
+                d1_gpu = cua.empty(shapec, dtype=dtype, order=order)
             elif backend == "cupy":
-                d1_gpu = cp.empty(shapec, dtype=dtype)
+                d1_gpu = cp.empty(shapec, dtype=dtype, order=order)
         else:
             d1_gpu = d_gpu.copy()
 
@@ -324,7 +358,8 @@ def test_accuracy(backend, shape, ndim, axes, dtype, inplace, norm, use_lut, r2c
 
     if r2c:
         if inplace:
-            d = rfftn(d0n[..., :-2], axes=axes_numpy) / s
+            # Need to cut the fastest axis by 2
+            d = rfftn(np.take(d0n, range(d0n.shape[fast_axis] - 2), axis=fast_axis), axes=axes_numpy) / s
         else:
             d = rfftn(d0n, axes=axes_numpy) / s
     elif dct:
@@ -348,12 +383,17 @@ def test_accuracy(backend, shape, ndim, axes, dtype, inplace, norm, use_lut, r2c
         t = "C2C"
     if r2c and inplace:
         tmp = list(d0.shape)
-        tmp[-1] -= 2
-        shstr = str(tuple(tmp)).replace(" ", "")
-        if ",)" in shstr:
-            shstr = shstr.replace(",)", "+2)")
+        if order == 'C':
+            tmp[-1] -= 2
+            shstr = str(tuple(tmp)).replace(" ", "")
+            if ",)" in shstr:
+                shstr = shstr.replace(",)", "+2)")
+            else:
+                shstr = shstr.replace(")", "+2)")
         else:
-            shstr = shstr.replace(")", "+2)")
+            tmp[0] -= 2
+            shstr = "(%d+2," % tmp[0]
+            shstr += str(tuple(tmp)).replace(" ", "")[1:]
     else:
         shstr = str(d0.shape).replace(" ", "")
     shax = str(axes).replace(" ", "")
@@ -364,9 +404,9 @@ def test_accuracy(backend, shape, ndim, axes, dtype, inplace, norm, use_lut, r2c
         stol = "%6.2e < %6.2e (%5.3f)" % (ni, tol, ni / tol)
 
     verb_out = "%8s %4s %14s axes=%10s ndim=%4s %10s lut=%4s inplace=%d " \
-               " norm=%4s %5s: n2=%6.2e ninf=%s %d" % \
+               " norm=%4s %s %5s: n2=%6.2e ninf=%s %d" % \
                (backend, t, shstr, shax, str(ndim), str(d0.dtype),
-                str(use_lut), int(inplace), str(norm), "FFT", n2, stol, src_unchanged_fft)
+                str(use_lut), int(inplace), str(norm), order, "FFT", n2, stol, src_unchanged_fft)
 
     t3 = timeit.default_timer()
 
@@ -387,9 +427,11 @@ def test_accuracy(backend, shape, ndim, axes, dtype, inplace, norm, use_lut, r2c
             d0n = d0.astype(np.clongdouble)
         else:
             d0n = d0
-        if np.argmin(d0.strides) != d0.ndim - 1:
+        if (np.argmin(d0.strides) != d0.ndim - 1) and order == 'C':
             # np.fft.rfftn can change the fast axis
             d0 = np.asarray(d0, order='C')
+    if order != 'C':
+        d0 = np.asarray(d0, order=order)
     if 'opencl' in backend:
         d_gpu = cla.to_device(queue, d0)
     else:
@@ -400,11 +442,11 @@ def test_accuracy(backend, shape, ndim, axes, dtype, inplace, norm, use_lut, r2c
     else:
         if r2c:
             if backend == "pyopencl":
-                d1_gpu = cla.empty(queue, shape, dtype=dtypef)
+                d1_gpu = cla.empty(queue, shape, dtype=dtypef, order=order)
             elif backend == "pycuda":
-                d1_gpu = cua.empty(shape, dtype=dtypef)
+                d1_gpu = cua.empty(shape, dtype=dtypef, order=order)
             elif backend == "cupy":
-                d1_gpu = cp.empty(shape, dtype=dtypef)
+                d1_gpu = cp.empty(shape, dtype=dtypef, order=order)
         else:
             d1_gpu = d_gpu.copy()
 
@@ -426,7 +468,8 @@ def test_accuracy(backend, shape, ndim, axes, dtype, inplace, norm, use_lut, r2c
             assert d1_gpu.dtype == dtype, "The array type is incorrect after an inplace iFFT"
 
     if r2c and inplace:
-        n2i, nii = l2(d, d1_gpu.get()[..., :-2]), li(d, d1_gpu.get()[..., :-2])
+        tmp = np.take(d1_gpu.get(), range(d1_gpu.shape[fast_axis] - 2), axis=fast_axis)
+        n2i, nii = l2(d, tmp), li(d, tmp)
     else:
         n2i, nii = l2(d, d1_gpu.get()), li(d, d1_gpu.get())
 
@@ -463,7 +506,7 @@ def test_accuracy(backend, shape, ndim, axes, dtype, inplace, norm, use_lut, r2c
            "dt_fft": t3 - t2, "dt_ifft": t4 - t3, "src_unchanged_fft": src_unchanged_fft,
            "src_unchanged_ifft": src_unchanged_ifft, "tol_test": max(ni, nii) < tol, "str": verb_out,
            "backend": backend, "shape": shape0, "ndim": ndim, "axes": axes, "dtype": dtype0, "inplace": inplace,
-           "norm": norm, "use_lut": use_lut, "r2c": r2c, "dct": dct, "gpu_name": gpu_name}
+           "norm": norm, "use_lut": use_lut, "r2c": r2c, "dct": dct, "gpu_name": gpu_name, "order": order}
 
     if return_array:
         res["d0"] = d0
diff --git a/pyvkfft/base.py b/pyvkfft/base.py
index dfefabc..c066e94 100644
--- a/pyvkfft/base.py
+++ b/pyvkfft/base.py
@@ -276,7 +276,7 @@ def radix_gen_n(nmax, max_size, radix, ndim=None, even=False, exclude_one=True,
     return v
 
 
-def calc_transform_axes(shape, axes=None, ndim=None):
+def calc_transform_axes(shape, axes=None, ndim=None, strides=None):
     """ Compute the final shape of the array to be passed
     to VkFFT, and the axes for which the transform should
     be skipped.
@@ -290,7 +290,8 @@ def calc_transform_axes(shape, axes=None, ndim=None):
     - shape=(4,5,6,7) and axes=(2,3): the first two axes will be collapsed
       to a (20,6,7) axis
     - shape=(4,5,6,7,8,9) and axes=(2,3): the first two axes will be collapsed
-      to a (20,6,7) axis and a batch size of 8*9=81 will be used
+      to a (20,6,7) axis (the first axis is skipped) and a batch size
+      of 8*9=81 will be used
     Examples of impossible transforms:
     - shape=(4,5,6,7) with ndim=4: only 3D transforms are allowed
     - shape=(4,5,6,7) with axes=(1,2,3): the index of the last transformed
@@ -303,6 +304,8 @@ def calc_transform_axes(shape, axes=None, ndim=None):
         are transformed, or up to ndim.
     :param ndim: the number of dimensions for the transform. If None,
         the number of axes is used
+    :param strides: the array strides. If None, a C-order is assumed
+        with the fastest axes along the last dimensions (numpy default)
     :return: (shape, skip_axis, ndim) with the 4D shape after collapsing
         consecutive non-transformed axes (padded with ones if necessary,
         with the order adequate for VkFFT (nx, ny, nz, n_batch),
@@ -324,6 +327,16 @@ def calc_transform_axes(shape, axes=None, ndim=None):
         else:
             if ndim1 != len(axes):
                 raise RuntimeError("The number of transform axes does not match ndim:", axes, ndim)
+
+    if strides is not None and len(shape) > 1:
+        idx = np.argsort(strides)[::-1]
+        if not np.alltrue((idx[1:] - idx[:-1]) > 0):
+            # Array is not C-ordered, need to move axes
+            idxend = -len(idx) + idx
+            # print("Re-ordering:", shape1, np.take(shape1, idx), "axes:", axes, [idxend[i] for i in axes])
+            shape1 = np.take(shape1, idx).tolist()
+            axes = [idxend[i] for i in axes]
+
     # Collapse non-transform axes when possible
     skip_axis = [True for i in range(len(shape1))]
     for i in axes:
@@ -421,7 +434,7 @@ class VkFFTApp:
     """
 
     def __init__(self, shape, dtype: type, ndim=None, inplace=True, norm=1,
-                 r2c=False, dct=False, axes=None, **kwargs):
+                 r2c=False, dct=False, axes=None, strides=None, **kwargs):
         """
         Init function for the VkFFT application.
 
@@ -460,6 +473,7 @@ def __init__(self, shape, dtype: type, ndim=None, inplace=True, norm=1,
         :param axes: a list or tuple of axes along which the transform should be made.
             if None, the transform is done along the ndim fastest axes, or all
             axes if ndim is None. Not allowed for R2C transforms
+        :param strides: the array strides - needed if not C-ordered.
         :raises RuntimeError:  if the transform dimensions are not allowed by VkFFT.
         """
         self.app = None
@@ -470,9 +484,11 @@ def __init__(self, shape, dtype: type, ndim=None, inplace=True, norm=1,
             raise RuntimeError("R2C or DCT selected but input type is not real !")
         if r2c and axes is not None:
             raise RuntimeError("axes=... is not allowed for R2C transforms")
+        if strides is not None and dct:
+            raise RuntimeError("strides=... is not allowed for DCT transforms (needs a C-ordered array)")
         # Get the final shape passed to VkFFT, collapsing non-transform axes
-        # as necessary. The calculated shape has 4 dimensions (nx, ny, nz, n_batch)
-        self.shape, self.skip_axis, self.ndim = calc_transform_axes(shape, axes, ndim)
+        # as necessary. The calculated shape has 4 dimensions (nx, ny, nz, n_batch).
+        self.shape, self.skip_axis, self.ndim = calc_transform_axes(shape, axes, ndim, strides)
         self.inplace = inplace
         self.r2c = r2c
         if dct is False:
@@ -546,6 +562,8 @@ def _get_fft_scale(self, norm):
                 ndim_real += 1
         s = np.sqrt(s)
         if self.r2c and self.inplace:
+            # Note: this is still correct for non-C-ordered arrays since
+            # self.shape has been re-ordered
             s *= np.sqrt((self.shape[0] - 2) / self.shape[0])
         if self.dct:
             s *= 2 ** (0.5 * ndim_real)
diff --git a/pyvkfft/cuda.py b/pyvkfft/cuda.py
index e799eec..60f1c42 100644
--- a/pyvkfft/cuda.py
+++ b/pyvkfft/cuda.py
@@ -64,7 +64,7 @@ class VkFFTApp(VkFFTAppBase):
     """
 
     def __init__(self, shape, dtype: type, ndim=None, inplace=True, stream=None, norm=1,
-                 r2c=False, dct=False, axes=None, **kwargs):
+                 r2c=False, dct=False, axes=None, strides=None, **kwargs):
         """
 
         :param shape: the shape of the array to be transformed. The number
@@ -106,10 +106,13 @@ def __init__(self, shape, dtype: type, ndim=None, inplace=True, stream=None, nor
         :param axes: a list or tuple of axes along which the transform should be made.
             if None, the transform is done along the ndim fastest axes, or all
             axes if ndim is None. Not allowed for R2C transforms
+        :param strides: the array strides - needed if not C-ordered.
         :raises RuntimeError: if the initialisation fails, e.g. if the CUDA
-            driver has not been properly initialised.
+            driver has not been properly initialised, or if the transform dimensions
+            are not allowed by VkFFT.
         """
-        super().__init__(shape, dtype, ndim=ndim, inplace=inplace, norm=norm, r2c=r2c, dct=dct, axes=axes, **kwargs)
+        super().__init__(shape, dtype, ndim=ndim, inplace=inplace, norm=norm, r2c=r2c,
+                         dct=dct, axes=axes, strides=strides, **kwargs)
 
         self.stream = stream
 
diff --git a/pyvkfft/opencl.py b/pyvkfft/opencl.py
index daaedab..fc1193e 100644
--- a/pyvkfft/opencl.py
+++ b/pyvkfft/opencl.py
@@ -53,7 +53,7 @@ class VkFFTApp(VkFFTAppBase):
     """
 
     def __init__(self, shape, dtype: type, queue: cl.CommandQueue, ndim=None, inplace=True, norm=1,
-                 r2c=False, dct=False, axes=None, **kwargs):
+                 r2c=False, dct=False, axes=None, strides=None, **kwargs):
         """
         Init function for the VkFFT application.
 
@@ -93,11 +93,13 @@ def __init__(self, shape, dtype: type, queue: cl.CommandQueue, ndim=None, inplac
         :param axes: a list or tuple of axes along which the transform should be made.
             if None, the transform is done along the ndim fastest axes, or all
             axes if ndim is None. Not allowed for R2C transforms
+        :param strides: the array strides - needed if not C-ordered.
         :raises RuntimeError: if the initialisation fails, e.g. if the GPU
             driver has not been properly initialised, or if the transform dimensions
             are not allowed by VkFFT.
         """
-        super().__init__(shape, dtype, ndim=ndim, inplace=inplace, norm=norm, r2c=r2c, dct=dct, axes=axes, **kwargs)
+        super().__init__(shape, dtype, ndim=ndim, inplace=inplace, norm=norm, r2c=r2c,
+                         dct=dct, axes=axes, strides=strides, **kwargs)
 
         self.queue = queue
 
diff --git a/pyvkfft/test/test_fft.py b/pyvkfft/test/test_fft.py
index 1e43177..86d61e4 100644
--- a/pyvkfft/test/test_fft.py
+++ b/pyvkfft/test/test_fft.py
@@ -200,7 +200,7 @@ def run_fft(self, vbackend, vn, dims_max=4, ndim_max=3, shuffle_axes=True,
                                                       + 1j * np.random.uniform(-0.5, 0.5, sh)).astype(dtype)
                                         if vlut == "auto":
                                             if dtype in (np.float64, np.complex128):
-                                                # By default LUT is enabled for complex128, no need to test twice
+                                                # By default, LUT is enabled for complex128, no need to test twice
                                                 tmp = [None]
                                             else:
                                                 tmp = [None, True]
@@ -209,50 +209,67 @@ def run_fft(self, vbackend, vn, dims_max=4, ndim_max=3, shuffle_axes=True,
                                         for use_lut in tmp:
                                             for inplace in vinplace:
                                                 for norm in vnorm:
-                                                    with self.subTest(backend=backend, n=n, dims=dims, ndim=ndim,
-                                                                      axes=axes, dtype=np.dtype(dtype), norm=norm,
-                                                                      use_lut=use_lut, inplace=inplace,
-                                                                      r2c=r2c, dct=dct):
-                                                        ct += 1
-                                                        if not dry_run:
-                                                            res = test_accuracy(backend, sh, ndim, axes, dtype, inplace,
-                                                                                norm, use_lut, r2c=r2c, dct=dct,
-                                                                                gpu_name=self.gpu,
-                                                                                stream=None, queue=cq,
-                                                                                return_array=False, init_array=d0,
-                                                                                verbose=verbose)
-                                                            npr = primes(n)
-                                                            ni, n2 = res["ni"], res["n2"]
-                                                            nii, n2i = res["nii"], res["n2i"]
-                                                            tol = res["tol"]
-                                                            src1 = res["src_unchanged_fft"]
-                                                            src2 = res["src_unchanged_ifft"]
-                                                            self.assertTrue(ni < tol, "Accuracy mismatch after FFT, "
-                                                                                      "n2=%8e ni=%8e>%8e" %
-                                                                            (n2, ni, tol))
-                                                            self.assertTrue(nii < tol, "Accuracy mismatch after iFFT, "
-                                                                                       "n2=%8e ni=%8e>%8e" %
-                                                                            (n2, nii, tol))
-                                                            if not inplace:
-                                                                self.assertTrue(src1, "The source array was modified "
-                                                                                      "during the FFT")
-                                                                nmaxr2c1d = 3072 * (1 + int(
-                                                                    dtype in (np.float32, np.complex64)))
-                                                                if not r2c or (ndim == 1 and max(npr) <= 13) \
-                                                                        and n < nmaxr2c1d:
-                                                                    self.assertTrue(src2,
+                                                    vorder = ['C', 'F']
+                                                    if dims == 1 or dims > 3 or dct:
+                                                        vorder = ['C']
+                                                    if r2c:
+                                                        if ndim is not None:
+                                                            if dims != ndim:
+                                                                vorder = ['C']
+                                                        if axes is not None:
+                                                            # TODO : also test F-order when ndim<dims but fast axis
+                                                            #  is transformed
+                                                            if len(axes) != dims:
+                                                                vorder = ['C']
+                                                    for order in vorder:
+                                                        with self.subTest(backend=backend, n=n, dims=dims, ndim=ndim,
+                                                                          axes=axes, dtype=np.dtype(dtype), norm=norm,
+                                                                          use_lut=use_lut, inplace=inplace,
+                                                                          r2c=r2c, dct=dct, order=order):
+                                                            ct += 1
+                                                            if not dry_run:
+                                                                res = test_accuracy(backend, sh, ndim, axes, dtype,
+                                                                                    inplace,
+                                                                                    norm, use_lut, r2c=r2c, dct=dct,
+                                                                                    gpu_name=self.gpu,
+                                                                                    stream=None, queue=cq,
+                                                                                    return_array=False, init_array=d0,
+                                                                                    verbose=verbose, order=order)
+                                                                npr = primes(n)
+                                                                ni, n2 = res["ni"], res["n2"]
+                                                                nii, n2i = res["nii"], res["n2i"]
+                                                                tol = res["tol"]
+                                                                src1 = res["src_unchanged_fft"]
+                                                                src2 = res["src_unchanged_ifft"]
+                                                                self.assertTrue(ni < tol,
+                                                                                "Accuracy mismatch after FFT, "
+                                                                                "n2=%8e ni=%8e>%8e" %
+                                                                                (n2, ni, tol))
+                                                                self.assertTrue(nii < tol,
+                                                                                "Accuracy mismatch after iFFT, "
+                                                                                "n2=%8e ni=%8e>%8e" %
+                                                                                (n2, nii, tol))
+                                                                if not inplace:
+                                                                    self.assertTrue(src1,
                                                                                     "The source array was modified "
-                                                                                    "during the iFFT")
-                                                        else:
-                                                            kwargs = {"backend": backend, "shape": sh,
-                                                                      "ndim": ndim, "axes": axes,
-                                                                      "dtype": dtype, "inplace": inplace,
-                                                                      "norm": norm, "use_lut": use_lut,
-                                                                      "r2c": r2c, "dct": dct,
-                                                                      "gpu_name": self.gpu, "stream": None,
-                                                                      "verbose": False,
-                                                                      "colour_output": self.colour}
-                                                            vkwargs.append(kwargs)
+                                                                                    "during the FFT")
+                                                                    nmaxr2c1d = 3072 * (1 + int(
+                                                                        dtype in (np.float32, np.complex64)))
+                                                                    if not r2c or (ndim == 1 and max(npr) <= 13) \
+                                                                            and n < nmaxr2c1d:
+                                                                        self.assertTrue(src2,
+                                                                                        "The source array was modified "
+                                                                                        "during the iFFT")
+                                                            else:
+                                                                kwargs = {"backend": backend, "shape": sh,
+                                                                          "ndim": ndim, "axes": axes,
+                                                                          "dtype": dtype, "inplace": inplace,
+                                                                          "norm": norm, "use_lut": use_lut,
+                                                                          "r2c": r2c, "dct": dct,
+                                                                          "gpu_name": self.gpu, "stream": None,
+                                                                          "verbose": False, "order": order,
+                                                                          "colour_output": self.colour}
+                                                                vkwargs.append(kwargs)
 
         return ct, vkwargs
 
@@ -262,7 +279,7 @@ def run_fft_parallel(self, vkwargs):
             for res in pool.imap(test_accuracy_kwargs, vkwargs):
                 with self.subTest(backend=res['backend'], n=max(res['shape']), ndim=res['ndim'],
                                   dtype=np.dtype(res['dtype']), norm=res['norm'], use_lut=res['use_lut'],
-                                  inplace=res['inplace'], r2c=res['r2c'], dct=res['dct']):
+                                  inplace=res['inplace'], r2c=res['r2c'], dct=res['dct'], order=res['order']):
                     n = max(res['shape'])
                     npr = primes(n)
                     ni, n2 = res["ni"], res["n2"]

From 04028426eb4370632cfa8e7bef4b7350f0759736 Mon Sep 17 00:00:00 2001
From: Vincent Favre-Nicolin <favre@esrf.fr>
Date: Sun, 23 Oct 2022 16:56:23 +0200
Subject: [PATCH 05/10] Non-C-contiguous arrays: add systematic test, fix DCT
 check, upodate some doc and outputs

---
 pyvkfft/accuracy.py             |  9 ++++++---
 pyvkfft/base.py                 | 13 ++++++++++---
 pyvkfft/scripts/pyvkfft_test.py |  3 +++
 pyvkfft/test/test_fft.py        | 14 ++++++++------
 4 files changed, 27 insertions(+), 12 deletions(-)

diff --git a/pyvkfft/accuracy.py b/pyvkfft/accuracy.py
index b7bd2e1..b487f00 100644
--- a/pyvkfft/accuracy.py
+++ b/pyvkfft/accuracy.py
@@ -152,7 +152,7 @@ def li(a, b):
     return abs(a - b).max() / abs(a).max()
 
 
-def test_accuracy(backend, shape, ndim, axes, dtype, inplace, norm, use_lut, r2c=False, dct=False,
+def test_accuracy(backend, shape, ndim, axes, dtype, inplace, norm, use_lut, r2c=False, dct=False, fstride=False,
                   gpu_name=None, stream=None, queue=None, return_array=False, init_array=None, verbose=False,
                   colour_output=False, ref_long_double=True, order='C'):
     """
@@ -392,8 +392,11 @@ def test_accuracy(backend, shape, ndim, axes, dtype, inplace, norm, use_lut, r2c
                 shstr = shstr.replace(")", "+2)")
         else:
             tmp[0] -= 2
-            shstr = "(%d+2," % tmp[0]
-            shstr += str(tuple(tmp)).replace(" ", "")[1:]
+            if len(tmp) == 1:
+                shstr = "(%d+2)," % tmp[0]
+            else:
+                shstr = "(%d+2," % tmp[0]
+                shstr += str(tuple(tmp[1:])).replace(" ", "").replace(",)", ")")[1:]
     else:
         shstr = str(d0.shape).replace(" ", "")
     shax = str(axes).replace(" ", "")
diff --git a/pyvkfft/base.py b/pyvkfft/base.py
index c066e94..74ba9fb 100644
--- a/pyvkfft/base.py
+++ b/pyvkfft/base.py
@@ -465,7 +465,11 @@ def __init__(self, shape, dtype: type, ndim=None, inplace=True, norm=1,
             reinterpret the type) will have a shape (..., nx//2 + 1).
             For an out-of-place transform, if the input (real) shape is (..., nx),
             the output (complex) shape should be (..., nx//2+1).
-            Note that for C2R transforms with ndim>=2, the source (complex) array
+            Note 1: the above shape changes are true for C-contiguous arrays;
+            generally the axis which is halved by the R2C transform always is
+            the fast axis -with a stride of 1 element. For F-contiguous arrays
+            this will be the first dimension instead of the last.
+            Note 2:for C2R transforms with ndim>=2, the source (complex) array
             is modified.
         :param dct: used to perform a Direct Cosine Transform (DCT) aka a R2R transform.
             An integer can be given to specify the type of DCT (1, 2, 3 or 4).
@@ -484,8 +488,11 @@ def __init__(self, shape, dtype: type, ndim=None, inplace=True, norm=1,
             raise RuntimeError("R2C or DCT selected but input type is not real !")
         if r2c and axes is not None:
             raise RuntimeError("axes=... is not allowed for R2C transforms")
-        if strides is not None and dct:
-            raise RuntimeError("strides=... is not allowed for DCT transforms (needs a C-ordered array)")
+        if strides is not None and dct and len(shape) > 1:
+            # TODO: update support status for non-C-contiguous DCT transforms
+            s = np.array(strides)
+            if not np.alltrue((s[:-1] - s[1:]) > 0):
+                raise RuntimeError("A C-contiguous array is required for DCT transforms")
         # Get the final shape passed to VkFFT, collapsing non-transform axes
         # as necessary. The calculated shape has 4 dimensions (nx, ny, nz, n_batch).
         self.shape, self.skip_axis, self.ndim = calc_transform_axes(shape, axes, ndim, strides)
diff --git a/pyvkfft/scripts/pyvkfft_test.py b/pyvkfft/scripts/pyvkfft_test.py
index 4724af4..514202d 100755
--- a/pyvkfft/scripts/pyvkfft_test.py
+++ b/pyvkfft/scripts/pyvkfft_test.py
@@ -301,6 +301,8 @@ def main():
                              "can be slower (or much slower on some architectures).")
     sysgrp.add_argument('--r2c', action='store_true', help="Test real-to-complex transform "
                                                            "(default is c2c)")
+    sysgrp.add_argument('--fstride', action='store_true',
+                        help="Test F-ordered arrays (default is C-ordered). Not supported for DCT")
     sysgrp.add_argument('--radix', action='store', nargs='*', type=int,
                         help="Perform only radix transforms. If no value is given, all available "
                              "radix transforms are allowed. Alternatively a list can be given: "
@@ -372,6 +374,7 @@ def main():
         t.norm = args.norm[0]
         t.nproc = args.nproc[0]
         t.r2c = args.r2c
+        t.fstride = args.fstride
         t.radix = args.radix
         t.ref_long_double = args.ref_long_double
         t.max_pow = None if args.radix_max_pow is None else args.radix_max_pow[0]
diff --git a/pyvkfft/test/test_fft.py b/pyvkfft/test/test_fft.py
index 86d61e4..ac097ad 100644
--- a/pyvkfft/test/test_fft.py
+++ b/pyvkfft/test/test_fft.py
@@ -443,6 +443,7 @@ class TestFFTSystematic(unittest.TestCase):
     gpu = None
     inplace = False
     lut = False
+    fstride = False
     max_pow = None
     max_nb_tests = 1000
     nb_test = 0  # Number of tests actually run
@@ -515,7 +516,8 @@ def test_systematic(self):
                 kwargs = {"backend": backend, "shape": s, "ndim": len(s), "axes": self.axes,
                           "dtype": self.dtype, "inplace": self.inplace, "norm": self.norm, "use_lut": self.lut,
                           "r2c": self.r2c, "dct": self.dct, "gpu_name": self.gpu, "stream": None, "verbose": False,
-                          "colour_output": self.colour, "ref_long_double": self.ref_long_double}
+                          "colour_output": self.colour, "ref_long_double": self.ref_long_double,
+                          "order": 'F' if self.fstride else 'C'}
                 vkwargs.append(kwargs)
         if self.db is not None:
             # TODO secure the db with a context 'with'
@@ -523,7 +525,7 @@ def test_systematic(self):
             dbc = db.cursor()
             dbc.execute('CREATE TABLE IF NOT EXISTS pyvkfft_test (epoch int, hostname int,'
                         'backend text, language text, transform text, axes text, array_shape text,'
-                        'ndims int, ndim int, precision int, inplace int, norm int, lut int,'
+                        'ndims int, ndim int, precision int, inplace int, norm int, lut int, fstride int,'
                         'n int, n2_fft float, n2_ifft float, ni_fft float, ni_ifft float, tolerance float,'
                         'dt_app float, dt_fft float, dt_ifft float, src_unchanged_fft int, src_unchanged_ifft int,'
                         'gpu_name text, success int, error int, vkfft_error_code int)')
@@ -563,7 +565,7 @@ def test_systematic(self):
                     # as e.g. "float32" instead of "<class 'numpy.float32'>"
                     with self.subTest(backend=backend, shape=sh, ndim=ndim,
                                       dtype=np.dtype(self.dtype), norm=self.norm, use_lut=self.lut,
-                                      inplace=self.inplace, r2c=self.r2c, dct=self.dct):
+                                      inplace=self.inplace, r2c=self.r2c, dct=self.dct, fstride=self.fstride):
                         if self.serial:
                             res = test_accuracy_kwargs(v)
                         else:
@@ -600,12 +602,12 @@ def test_systematic(self):
                                 succ = False
                         if self.db is not None:
                             dbc.execute('INSERT INTO pyvkfft_test VALUES (?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,'
-                                        '?,?,?,?,?,?,?,?,?,?,?,?,?)',
+                                        '?,?,?,?,?,?,?,?,?,?,?,?,?,?)',
                                         (time.time(), hostname, backend, lang, transform,
                                          str(res['axes']).encode('ascii'), str(res['shape']).encode('ascii'),
                                          len(res['shape']), ndim, np.dtype(self.dtype).itemsize,
-                                         self.inplace, self.norm, self.lut, int(max(res['shape'])), float(n2),
-                                         float(n2i),
+                                         self.inplace, self.norm, self.lut, self.fstride,
+                                         int(max(res['shape'])), float(n2), float(n2i),
                                          float(ni), float(nii), float(tol), res["dt_app"], res["dt_fft"],
                                          res["dt_ifft"],
                                          int(src1), int(src2), res["gpu_name"].encode('ascii'), int(succ), 0, 0))

From fe7bafad7ffe5438f3a7a2c001acc049f350e1a9 Mon Sep 17 00:00:00 2001
From: Vincent Favre-Nicolin <favre@esrf.fr>
Date: Sat, 26 Nov 2022 15:10:02 +0100
Subject: [PATCH 06/10] Also display the platforma and device name along vkfft
 error check

---
 pyvkfft/opencl.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/pyvkfft/opencl.py b/pyvkfft/opencl.py
index fc1193e..1734d73 100644
--- a/pyvkfft/opencl.py
+++ b/pyvkfft/opencl.py
@@ -114,7 +114,8 @@ def __init__(self, shape, dtype: type, queue: cl.CommandQueue, ndim=None, inplac
             raise RuntimeError("Error creating VkFFTConfiguration. Was the OpenCL context properly initialised ?")
         res = ctypes.c_int(0)
         self.app = _vkfft_opencl.init_app(self.config, queue.int_ptr, ctypes.byref(res))
-        check_vkfft_result(res, shape, dtype, ndim, inplace, norm, r2c, dct, axes, "opencl")
+        check_vkfft_result(res, shape, dtype, ndim, inplace, norm, r2c, dct, axes, "opencl:%s:%s" %
+                           (queue.device.platform.name, queue.device.name))
         if self.app is None:
             raise RuntimeError("Error creating VkFFTApplication. Was the OpenCL context properly initialised ?")
 

From 66fe9443fa101e6073a98c9bb0b2097ee86ceb1f Mon Sep 17 00:00:00 2001
From: Vincent Favre-Nicolin <favre@esrf.fr>
Date: Sat, 26 Nov 2022 15:11:06 +0100
Subject: [PATCH 07/10] Check also for OSError in fftp.py. Should fix #21

---
 pyvkfft/fft.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/pyvkfft/fft.py b/pyvkfft/fft.py
index 3358dbd..b882766 100644
--- a/pyvkfft/fft.py
+++ b/pyvkfft/fft.py
@@ -21,14 +21,14 @@
         import pycuda.gpuarray as cua
     if has_cupy:
         import cupy as cp
-except ImportError:
+except (ImportError, OSError):
     has_cupy, has_pycuda = False, False
 
 try:
     from .opencl import VkFFTApp as VkFFTApp_cl, cla, vkfft_version
 
     has_opencl = True
-except ImportError:
+except (ImportError, OSError):
     has_opencl = False
 
 

From 3efa26ab67b35fbfe4a8ab95aa3497c8080b7848 Mon Sep 17 00:00:00 2001
From: Vincent Favre-Nicolin <favre@esrf.fr>
Date: Thu, 5 Jan 2023 14:44:29 +0100
Subject: [PATCH 08/10] Fix tests using F-strided arrays. Allow to select the
 backend also for the basic test, and add the option to select the opencl
 platform.

---
 pyvkfft/accuracy.py             | 31 +++++++-----
 pyvkfft/scripts/pyvkfft_test.py | 13 +++++
 pyvkfft/test/test_fft.py        | 84 ++++++++++++++-------------------
 3 files changed, 68 insertions(+), 60 deletions(-)

diff --git a/pyvkfft/accuracy.py b/pyvkfft/accuracy.py
index b487f00..483b702 100644
--- a/pyvkfft/accuracy.py
+++ b/pyvkfft/accuracy.py
@@ -66,7 +66,7 @@
 gpu_ctx_dic = {}
 
 
-def init_ctx(backend, gpu_name=None, verbose=False):
+def init_ctx(backend, gpu_name=None, opencl_platform=None, verbose=False):
     if backend in gpu_ctx_dic:
         return
     if backend == "pycuda":
@@ -96,6 +96,9 @@ def init_ctx(backend, gpu_name=None, verbose=False):
         for p in cl.get_platforms():
             if d is not None:
                 break
+            if opencl_platform is not None:
+                if opencl_platform.lower() not in p.name.lower():
+                    continue
             for d0 in p.get_devices():
                 if d0.type & cl.device_type.GPU:
                     if gpu_name is not None:
@@ -106,10 +109,8 @@ def init_ctx(backend, gpu_name=None, verbose=False):
                 if d is not None:
                     break
         if d is None:
-            if gpu_name is not None:
-                raise RuntimeError("Selected backend is pyopencl, but no device found (name=%s)" % gpu_name)
-            else:
-                raise RuntimeError("Selected backend is pyopencl, but no device found")
+            raise RuntimeError("Selected backend is pyopencl, but no device found (name=%s, platform=%s)" %
+                               (gpu_name, opencl_platform))
         cl_ctx = cl.Context([d])
         cq = cl.CommandQueue(cl_ctx)
         gpu_ctx_dic["pyopencl"] = d, cl_ctx, cq, 'cl_khr_fp64' in cq.device.extensions
@@ -152,9 +153,9 @@ def li(a, b):
     return abs(a - b).max() / abs(a).max()
 
 
-def test_accuracy(backend, shape, ndim, axes, dtype, inplace, norm, use_lut, r2c=False, dct=False, fstride=False,
-                  gpu_name=None, stream=None, queue=None, return_array=False, init_array=None, verbose=False,
-                  colour_output=False, ref_long_double=True, order='C'):
+def test_accuracy(backend, shape, ndim, axes, dtype, inplace, norm, use_lut, r2c=False, dct=False,
+                  gpu_name=None, opencl_platform=None, stream=None, queue=None, return_array=False,
+                  init_array=None, verbose=False, colour_output=False, ref_long_double=True, order='C'):
     """
     Measure the
     :param backend: either 'pyopencl', 'pycuda' or 'cupy'
@@ -175,6 +176,8 @@ def test_accuracy(backend, shape, ndim, axes, dtype, inplace, norm, use_lut, r2c
         tested (native scipy normalisation).
     :param gpu_name: the name of the gpu to use. If None, the first available
         for the backend will be used.
+    :param opencl_platform: the name of the OpenCL platform to use. If None, the first available
+        will be used.
     :param stream: the cuda stream to use, or None
     :param queue: the opencl queue to use (mandatory for the 'pyopencl' backend)
     :param return_array: if True, will return the generated random array so it can be
@@ -216,7 +219,7 @@ def test_accuracy(backend, shape, ndim, axes, dtype, inplace, norm, use_lut, r2c
             # N parallel process so memory management  must be done manually
             mempool.free_all_blocks()
     t0 = timeit.default_timer()
-    init_ctx(backend, gpu_name=gpu_name, verbose=False)
+    init_ctx(backend, gpu_name=gpu_name, opencl_platform=opencl_platform, verbose=False)
     if backend == "pyopencl" and queue is None:
         queue = gpu_ctx_dic["pyopencl"][2]
     shape0 = shape
@@ -281,14 +284,17 @@ def test_accuracy(backend, shape, ndim, axes, dtype, inplace, norm, use_lut, r2c
                          axes=axes, useLUT=use_lut, inplace=inplace, r2c=r2c, dct=dct, strides=d0.strides)
         t2 = timeit.default_timer()
         d_gpu = cla.to_device(queue, d0)
+        empty_like = cla.empty_like
     else:
         if backend == "pycuda":
             to_gpu = cua.to_gpu
+            empty_like = cua.empty_like
         else:
             to_gpu = cp.array
+            empty_like = cp.empty_like
 
         app = cuVkFFTApp(d0.shape, d0.dtype, ndim=ndim, norm=norm, axes=axes,
-                         useLUT=use_lut, inplace=inplace, r2c=r2c, dct=dct, stream=stream)
+                         useLUT=use_lut, inplace=inplace, r2c=r2c, dct=dct, strides=d0.strides, stream=stream)
         t2 = timeit.default_timer()
         d_gpu = to_gpu(d0)
 
@@ -341,7 +347,8 @@ def test_accuracy(backend, shape, ndim, axes, dtype, inplace, norm, use_lut, r2c
             elif backend == "cupy":
                 d1_gpu = cp.empty(shapec, dtype=dtype, order=order)
         else:
-            d1_gpu = d_gpu.copy()
+            # Note that pycuda's gpuarray.copy (as of 2022.2.2) does not copy strides
+            d1_gpu = empty_like(d_gpu)
 
     if has_scipy and ref_long_double:
         # Use long double precision
@@ -451,7 +458,7 @@ def test_accuracy(backend, shape, ndim, axes, dtype, inplace, norm, use_lut, r2c
             elif backend == "cupy":
                 d1_gpu = cp.empty(shape, dtype=dtypef, order=order)
         else:
-            d1_gpu = d_gpu.copy()
+            d1_gpu = empty_like(d_gpu)
 
     d1_gpu = app.ifft(d_gpu, d1_gpu)
     if not dct:
diff --git a/pyvkfft/scripts/pyvkfft_test.py b/pyvkfft/scripts/pyvkfft_test.py
index 514202d..3e6d8bf 100755
--- a/pyvkfft/scripts/pyvkfft_test.py
+++ b/pyvkfft/scripts/pyvkfft_test.py
@@ -216,6 +216,8 @@ def main():
                              "which can be displayed by clicking on the type of transform.")
     parser.add_argument('--gpu', action='store',
                         help="Name (or sub-string) of the GPU to use")
+    parser.add_argument('--opencl_platform', action='store',
+                        help="Name (or sub-string) of the opencl platform to use (case-insensitive)")
     parser.add_argument('--mailto', action='store',
                         help="Email address the results will be sent to")
     parser.add_argument('--mailto_fail', action='store',
@@ -365,6 +367,7 @@ def main():
         t.dry_run = args.dry_run
         t.dtype = np.float64 if args.double else np.float32
         t.gpu = args.gpu
+        t.opencl_platform = args.opencl_platform
         t.graph = args.graph
         t.inplace = args.inplace
         t.lut = args.lut
@@ -373,6 +376,7 @@ def main():
         # t.ndims = args.ndims
         t.norm = args.norm[0]
         t.nproc = args.nproc[0]
+        t.opencl_platform = args.opencl_platform
         t.r2c = args.r2c
         t.fstride = args.fstride
         t.radix = args.radix
@@ -407,6 +411,15 @@ def main():
         t.colour = args.colour
         t.gpu = args.gpu
         t.nproc = args.nproc[0]
+        t.opencl_platform = args.opencl_platform
+        t.vbackend = args.backend
+        # KLUDGE: remove cuda stream test if neither cupy or pycuda are used
+        test_cuda_stream = False
+        for b in args.backend:
+            if 'pycuda' in b.lower():
+                test_cuda_stream = True
+        if not test_cuda_stream:
+            del t.test_pycuda_streams
         suite = unittest.defaultTestLoader.loadTestsFromTestCase(t)
         if t.verbose:
             res = unittest.TextTestRunner(verbosity=2).run(suite)
diff --git a/pyvkfft/test/test_fft.py b/pyvkfft/test/test_fft.py
index ac097ad..e353704 100644
--- a/pyvkfft/test/test_fft.py
+++ b/pyvkfft/test/test_fft.py
@@ -68,6 +68,18 @@ class TestFFT(unittest.TestCase):
     nproc = 1
     verbose = False
     colour = False
+    opencl_platform = None
+    vbackend = None
+
+    def setUp(self) -> None:
+        if self.vbackend is None:
+            self.vbackend = []
+            if has_pycuda:
+                self.vbackend.append("pycuda")
+            if has_cupy:
+                self.vbackend.append("cupy")
+            if has_pyopencl:
+                self.vbackend.append("pyopencl")
 
     def test_backend(self):
         self.assertTrue(has_pycuda or has_pyopencl or has_cupy,
@@ -76,17 +88,9 @@ def test_backend(self):
     @unittest.skipIf(not (has_pycuda or has_cupy or has_pyopencl), "No OpenCL/CUDA backend is available")
     def test_simple_fft(self):
         """Test the simple fft API"""
-        vbackend = []
-        if has_pycuda:
-            vbackend.append("pycuda")
-        if has_cupy:
-            vbackend.append("cupy")
-        if has_pyopencl:
-            vbackend.append("pyopencl")
-
-        for backend in vbackend:
+        for backend in self.vbackend:
             with self.subTest(backend=backend):
-                init_ctx(backend, gpu_name=self.gpu, verbose=False)
+                init_ctx(backend, gpu_name=self.gpu, opencl_platform=self.opencl_platform, verbose=False)
                 if backend == "pycuda":
                     dc = cua.to_gpu(ascent().astype(np.complex64))
                     dr = cua.to_gpu(ascent().astype(np.float32))
@@ -162,7 +166,7 @@ def run_fft(self, vbackend, vn, dims_max=4, ndim_max=3, shuffle_axes=True,
         ct = 0
         vkwargs = []
         for backend in vbackend:
-            init_ctx(backend, gpu_name=self.gpu, verbose=False)
+            init_ctx(backend, gpu_name=self.gpu, opencl_platform=self.opencl_platform, verbose=False)
             cq = gpu_ctx_dic["pyopencl"][2] if backend == "pyopencl" else None
             for n in vn:
                 for dims in range(1, dims_max + 1):
@@ -228,13 +232,15 @@ def run_fft(self, vbackend, vn, dims_max=4, ndim_max=3, shuffle_axes=True,
                                                                           r2c=r2c, dct=dct, order=order):
                                                             ct += 1
                                                             if not dry_run:
-                                                                res = test_accuracy(backend, sh, ndim, axes, dtype,
-                                                                                    inplace,
-                                                                                    norm, use_lut, r2c=r2c, dct=dct,
-                                                                                    gpu_name=self.gpu,
-                                                                                    stream=None, queue=cq,
-                                                                                    return_array=False, init_array=d0,
-                                                                                    verbose=verbose, order=order)
+                                                                res = \
+                                                                    test_accuracy(backend, sh, ndim, axes, dtype,
+                                                                                  inplace,
+                                                                                  norm, use_lut, r2c=r2c, dct=dct,
+                                                                                  gpu_name=self.gpu,
+                                                                                  opencl_platform=self.opencl_platform,
+                                                                                  stream=None, queue=cq,
+                                                                                  return_array=False, init_array=d0,
+                                                                                  verbose=verbose, order=order)
                                                                 npr = primes(n)
                                                                 ni, n2 = res["ni"], res["n2"]
                                                                 nii, n2i = res["nii"], res["n2i"]
@@ -266,7 +272,9 @@ def run_fft(self, vbackend, vn, dims_max=4, ndim_max=3, shuffle_axes=True,
                                                                           "dtype": dtype, "inplace": inplace,
                                                                           "norm": norm, "use_lut": use_lut,
                                                                           "r2c": r2c, "dct": dct,
-                                                                          "gpu_name": self.gpu, "stream": None,
+                                                                          "gpu_name": self.gpu,
+                                                                          "opencl_platform": self.opencl_platform,
+                                                                          "stream": None,
                                                                           "verbose": False, "order": order,
                                                                           "colour_output": self.colour}
                                                                 vkwargs.append(kwargs)
@@ -302,15 +310,8 @@ def run_fft_parallel(self, vkwargs):
     @unittest.skipIf(not (has_pycuda or has_cupy or has_pyopencl), "No OpenCL/CUDA backend is available")
     def test_c2c(self):
         """Run C2C tests"""
-        vbackend = []
-        if has_pycuda:
-            vbackend.append("pycuda")
-        if has_cupy:
-            vbackend.append("cupy")
-        if has_pyopencl:
-            vbackend.append("pyopencl")
-        for backend in vbackend:
-            init_ctx(backend, gpu_name=self.gpu, verbose=False)
+        for backend in self.vbackend:
+            init_ctx(backend, gpu_name=self.gpu, opencl_platform=self.opencl_platform, verbose=False)
             has_cl_fp64 = gpu_ctx_dic["pyopencl"][3] if backend == "pyopencl" else True
             ct = 0
             vkwargs = []
@@ -335,15 +336,8 @@ def test_c2c(self):
     @unittest.skipIf(not (has_pycuda or has_cupy or has_pyopencl), "No OpenCL/CUDA backend is available")
     def test_r2c(self):
         """Run R2C tests"""
-        vbackend = []
-        if has_pycuda:
-            vbackend.append("pycuda")
-        if has_cupy:
-            vbackend.append("cupy")
-        if has_pyopencl:
-            vbackend.append("pyopencl")
-        for backend in vbackend:
-            init_ctx(backend, gpu_name=self.gpu, verbose=False)
+        for backend in self.vbackend:
+            init_ctx(backend, gpu_name=self.gpu, opencl_platform=self.opencl_platform, verbose=False)
             has_cl_fp64 = gpu_ctx_dic["pyopencl"][3] if backend == "pyopencl" else True
             ct = 0
             vkwargs = []
@@ -369,15 +363,8 @@ def test_r2c(self):
     @unittest.skipIf(not has_dct_ref, "scipy and pyfftw are not available - cannot test DCT")
     def test_dct(self):
         """Run DCT tests"""
-        vbackend = []
-        if has_pycuda:
-            vbackend.append("pycuda")
-        if has_cupy:
-            vbackend.append("cupy")
-        if has_pyopencl:
-            vbackend.append("pyopencl")
-        for backend in vbackend:
-            init_ctx(backend, gpu_name=self.gpu, verbose=False)
+        for backend in self.vbackend:
+            init_ctx(backend, gpu_name=self.gpu, opencl_platform=self.opencl_platform, verbose=False)
             has_cl_fp64 = gpu_ctx_dic["pyopencl"][3] if backend == "pyopencl" else True
             ct = 0
             vkwargs = []
@@ -403,7 +390,7 @@ def test_pycuda_streams(self):
         """
         for dtype in (np.complex64, np.complex128):
             with self.subTest(dtype=np.dtype(dtype)):
-                init_ctx("pycuda", gpu_name=self.gpu, verbose=False)
+                init_ctx("pycuda", gpu_name=self.gpu, opencl_platform=self.opencl_platform, verbose=False)
                 if dtype == np.complex64:
                     rtol = 1e-6
                 else:
@@ -452,6 +439,7 @@ class TestFFTSystematic(unittest.TestCase):
     # t.ndims = args.ndims
     norm = 1
     nproc = 1
+    opencl_platform = None
     r2c = False
     radix = None
     range = 2, 128
@@ -473,7 +461,7 @@ def setUp(self) -> None:
                 self.vbackend.append("cupy")
             if has_pyopencl:
                 self.vbackend.append("pyopencl")
-                init_ctx("pyopencl", gpu_name=self.gpu, verbose=False)
+                init_ctx("pyopencl", gpu_name=self.gpu, opencl_platform=self.opencl_platform, verbose=False)
                 self.cq, self.has_cl_fp64 = gpu_ctx_dic["pyopencl"][2:]
         self.assertTrue(not self.bluestein or self.radix is None, "Cannot select both Bluestein and radix")
         if not self.bluestein and self.radix is None:

From b4b195d2c51363ad486bdbb5acf507e3e586157f Mon Sep 17 00:00:00 2001
From: Vincent Favre-Nicolin <favre@esrf.fr>
Date: Thu, 5 Jan 2023 21:05:18 +0100
Subject: [PATCH 09/10] Add support for F-ordered array in the simple fft
 interface (C2C and R2C). Fox use of opencl_platform in tests

---
 CHANGELOG.rst                         |  6 +++-
 README.rst                            |  4 ++-
 pyvkfft/fft.py                        | 41 ++++++++++++++++-----------
 pyvkfft/scripts/pyvkfft_test_suite.py |  3 ++
 pyvkfft/test/test_fft.py              |  3 +-
 5 files changed, 38 insertions(+), 19 deletions(-)

diff --git a/CHANGELOG.rst b/CHANGELOG.rst
index 9e96b51..6dbfad9 100644
--- a/CHANGELOG.rst
+++ b/CHANGELOG.rst
@@ -1,6 +1,10 @@
-Version 2022.2 (2022-XX-XX)
+Version 2023.1 (2021-XX-XX)
 -----------------------------
 * Fix R2C tests when using numpy (scipy unavailable) [#19]
+* Add support for F-ordered arrays (C2C and R2C)
+* Allow selection of backend for non-systemetic pvkfft-test
+* Add parameter for the OpenCL platform in pyvkfft-test
+* Fix simple fft interface import when only pycuda is used
 
 Version 2022.1.1 (2022-02-14)
 -----------------------------
diff --git a/README.rst b/README.rst
index 5a20cd0..fda6eea 100644
--- a/README.rst
+++ b/README.rst
@@ -79,7 +79,8 @@ Features
 - array can be have more dimensions than the FFT (batch transforms).
 - arbitrary array size, using Bluestein algorithm for prime numbers>13 (note that in this case
   the performance can be significantly lower, up to ~4x, depending on the transform size,
-  see example performance plot below)
+  see example performance plot below). Now also uses Rader's FFT algorithm for primes from
+  17 up to max shared memory length (~10000, see VkFFT's doc for details)
 - transform along a given list of axes - this requires that after collapsing
   non-transformed axes, the last transformed axis is at most along the 3rd dimension,
   e.g. the following axes are allowed: (-2,-3), (-1,-3), (-1,-4), (-4,-5),...
@@ -87,6 +88,7 @@ Features
   This is not allowed for R2C transforms.
 - normalisation=0 (array L2 norm * array size on each transform) and 1 (the backward
   transform divides the L2 norm by the array size, so FFT*iFFT restores the original array)
+- Support for C (default) and F-ordered arrays, for C2C and R2C transforms
 - unit tests for all transforms: see test sub-directory. Note that these take a **long**
   time to finish due to the exhaustive number of sub-tests.
 - Note that out-of-place C2R transform currently destroys the complex array for FFT dimensions >=2
diff --git a/pyvkfft/fft.py b/pyvkfft/fft.py
index b882766..8f193d0 100644
--- a/pyvkfft/fft.py
+++ b/pyvkfft/fft.py
@@ -51,13 +51,18 @@ def _prepare_transform(src, dest, cl_queue, r2c=False):
     :param r2c: if True, this is for an R2C transform, so adapt the destination
         array accordingly.
     :return: a tuple (backend, inplace, dest, cl_queue), also appending the
-    destination dtype for an r2c transform.
+    destination dtype for a r2c transform.
     """
     backend = Backend.UNKNOWN
+    fastidx = np.argmin(src.strides)  # fast axis is the last only for C-ordered arrays
+    if fastidx == src.ndim - 1:
+        order = 'C'
+    else:
+        order = 'F'
     if r2c:
         if src.dtype in [np.float16, np.float32, np.float64]:
             sh = list(src.shape)
-            sh[-1] = sh[-1] // 2 + 1
+            sh[fastidx] = sh[fastidx] // 2 + 1
             dtype = np.complex64
             if src.dtype == np.float16:
                 dtype = complex32
@@ -65,7 +70,7 @@ def _prepare_transform(src, dest, cl_queue, r2c=False):
                 dtype = np.complex128
         else:
             sh = list(src.shape)
-            sh[-1] = (sh[-1] - 1) * 2
+            sh[fastidx] = (sh[fastidx] - 1) * 2
             dtype = np.float32
             if src.dtype == complex32:
                 dtype = np.float16
@@ -82,7 +87,7 @@ def _prepare_transform(src, dest, cl_queue, r2c=False):
             src_ptr = int(src.gpudata)
             if dest is None:
                 if r2c:
-                    dest = cua.empty(tuple(sh), dtype=dtype, allocator=src.allocator)
+                    dest = cua.empty(tuple(sh), dtype=dtype, allocator=src.allocator, order=order)
                 else:
                     dest = cua.empty_like(src)
             dest_ptr = int(dest.gpudata)
@@ -93,7 +98,7 @@ def _prepare_transform(src, dest, cl_queue, r2c=False):
             src_ptr = src.data.int_ptr
             if dest is None:
                 if r2c:
-                    dest = cla.empty(src.queue, tuple(sh), dtype=dtype, allocator=src.allocator)
+                    dest = cla.empty(src.queue, tuple(sh), dtype=dtype, allocator=src.allocator, order=order)
                 else:
                     dest = cla.empty_like(src)
             dest_ptr = dest.data.int_ptr
@@ -106,7 +111,7 @@ def _prepare_transform(src, dest, cl_queue, r2c=False):
             src_ptr = src.__cuda_array_interface__['data'][0]
             if dest is None:
                 if r2c:
-                    dest = cp.empty(tuple(sh), dtype=dtype)
+                    dest = cp.empty(tuple(sh), dtype=dtype, order=order)
                 else:
                     dest = cp.empty_like(src)
             dest_ptr = dest.__cuda_array_interface__['data'][0]
@@ -127,23 +132,23 @@ def _prepare_transform(src, dest, cl_queue, r2c=False):
 
 
 @lru_cache(maxsize=FFT_CACHE_NB)
-def _get_fft_app(backend, shape, dtype, inplace, ndim, axes, norm, cuda_stream, cl_queue):
+def _get_fft_app(backend, shape, dtype, inplace, ndim, axes, norm, cuda_stream, cl_queue, strides=None):
     if backend in [Backend.PYCUDA, Backend.CUPY]:
         return VkFFTApp_cuda(shape, dtype, ndim=ndim, inplace=inplace,
-                             stream=cuda_stream, norm=norm, axes=axes)
+                             stream=cuda_stream, norm=norm, axes=axes, strides=strides)
     elif backend == Backend.PYOPENCL:
         return VkFFTApp_cl(shape, dtype, cl_queue, ndim=ndim, inplace=inplace,
-                           norm=norm, axes=axes)
+                           norm=norm, axes=axes, strides=strides)
 
 
 @lru_cache(maxsize=FFT_CACHE_NB)
-def _get_rfft_app(backend, shape, dtype, inplace, ndim, norm, cuda_stream, cl_queue):
+def _get_rfft_app(backend, shape, dtype, inplace, ndim, norm, cuda_stream, cl_queue, strides=None):
     if backend in [Backend.PYCUDA, Backend.CUPY]:
         return VkFFTApp_cuda(shape, dtype, ndim=ndim, inplace=inplace,
-                             stream=cuda_stream, norm=norm, r2c=True)
+                             stream=cuda_stream, norm=norm, r2c=True, strides=strides)
     elif backend == Backend.PYOPENCL:
         return VkFFTApp_cl(shape, dtype, cl_queue, ndim=ndim, inplace=inplace,
-                           norm=norm, r2c=True)
+                           norm=norm, r2c=True, strides=strides)
 
 
 @lru_cache(maxsize=FFT_CACHE_NB)
@@ -192,7 +197,8 @@ def fftn(src, dest=None, ndim=None, norm=1, axes=None, cuda_stream=None, cl_queu
     :return: the destination array if return_scale is False, or (dest, scale)
     """
     backend, inplace, dest, cl_queue = _prepare_transform(src, dest, cl_queue, False)
-    app = _get_fft_app(backend, src.shape, src.dtype, inplace, ndim, axes, norm, cuda_stream, cl_queue)
+    app = _get_fft_app(backend, src.shape, src.dtype, inplace, ndim, axes, norm, cuda_stream, cl_queue,
+                       strides=src.strides)
     app.fft(src, dest)
     if return_scale:
         s = app.get_fft_scale()
@@ -235,7 +241,8 @@ def ifftn(src, dest=None, ndim=None, norm=1, axes=None, cuda_stream=None, cl_que
     :return: the destination array if return_scale is False, or (dest, scale)
     """
     backend, inplace, dest, cl_queue = _prepare_transform(src, dest, cl_queue, False)
-    app = _get_fft_app(backend, src.shape, src.dtype, inplace, ndim, axes, norm, cuda_stream, cl_queue)
+    app = _get_fft_app(backend, src.shape, src.dtype, inplace, ndim, axes, norm, cuda_stream, cl_queue,
+                       strides=src.strides)
     app.ifft(src, dest)
     if return_scale:
         s = app.get_fft_scale()
@@ -282,7 +289,8 @@ def rfftn(src, dest=None, ndim=None, norm=1, cuda_stream=None, cl_queue=None,
         with the appropriate type.
     """
     backend, inplace, dest, cl_queue, dtype = _prepare_transform(src, dest, cl_queue, True)
-    app = _get_rfft_app(backend, src.shape, src.dtype, inplace, ndim, norm, cuda_stream, cl_queue)
+    app = _get_rfft_app(backend, src.shape, src.dtype, inplace, ndim, norm, cuda_stream, cl_queue,
+                        strides=src.strides)
     app.fft(src, dest)
     if return_scale:
         s = app.get_fft_scale()
@@ -329,7 +337,8 @@ def irfftn(src, dest=None, ndim=None, norm=1, cuda_stream=None, cl_queue=None,
         with the appropriate type.
     """
     backend, inplace, dest, cl_queue, dtype = _prepare_transform(src, dest, cl_queue, True)
-    app = _get_rfft_app(backend, dest.shape, dest.dtype, inplace, ndim, norm, cuda_stream, cl_queue)
+    app = _get_rfft_app(backend, dest.shape, dest.dtype, inplace, ndim, norm, cuda_stream, cl_queue,
+                        strides=dest.strides)
     app.ifft(src, dest)
     if return_scale:
         s = app.get_fft_scale()
diff --git a/pyvkfft/scripts/pyvkfft_test_suite.py b/pyvkfft/scripts/pyvkfft_test_suite.py
index 71908ee..ac836ff 100644
--- a/pyvkfft/scripts/pyvkfft_test_suite.py
+++ b/pyvkfft/scripts/pyvkfft_test_suite.py
@@ -17,9 +17,12 @@ def main():
     gpu_gb = 32
     dry_run = False
     backend = 'cupy'
+    opencl_platform = None
 
     # Basic test
     com = "pyvkfft-test --nproc %d --html --range-mb 0 4100" % nproc0
+    if opencl_platform is not None:
+        com += ' --opencl_platform ' + opencl_platform
     if dry_run:
         print(com)
     else:
diff --git a/pyvkfft/test/test_fft.py b/pyvkfft/test/test_fft.py
index e353704..e41675d 100644
--- a/pyvkfft/test/test_fft.py
+++ b/pyvkfft/test/test_fft.py
@@ -503,7 +503,8 @@ def test_systematic(self):
             for s in self.vshape:
                 kwargs = {"backend": backend, "shape": s, "ndim": len(s), "axes": self.axes,
                           "dtype": self.dtype, "inplace": self.inplace, "norm": self.norm, "use_lut": self.lut,
-                          "r2c": self.r2c, "dct": self.dct, "gpu_name": self.gpu, "stream": None, "verbose": False,
+                          "r2c": self.r2c, "dct": self.dct, "gpu_name": self.gpu,
+                          "opencl_platform": self.opencl_platform, "stream": None, "verbose": False,
                           "colour_output": self.colour, "ref_long_double": self.ref_long_double,
                           "order": 'F' if self.fstride else 'C'}
                 vkwargs.append(kwargs)

From c0d333b261d4e74d81fc0efacd4616d478e6b1a0 Mon Sep 17 00:00:00 2001
From: Vincent Favre-Nicolin <favre@esrf.fr>
Date: Thu, 5 Jan 2023 21:24:15 +0100
Subject: [PATCH 10/10] Update notebooks

---
 examples/pyvkfft-tests-CUDA.ipynb   | 318 +++++++++++++++++++++++++++-
 examples/pyvkfft-tests-OpenCL.ipynb | 208 +++++++++++++++++-
 2 files changed, 517 insertions(+), 9 deletions(-)

diff --git a/examples/pyvkfft-tests-CUDA.ipynb b/examples/pyvkfft-tests-CUDA.ipynb
index e290be7..5a34f77 100644
--- a/examples/pyvkfft-tests-CUDA.ipynb
+++ b/examples/pyvkfft-tests-CUDA.ipynb
@@ -68,9 +68,14 @@
     "\n",
     "import pyvkfft.cuda\n",
     "from pyvkfft.cuda import VkFFTApp\n",
+    "from pyvkfft.fft import fftn as vkfftn, ifftn as vkifftn, rfftn as vkrfftn, irfftn as vkirfftn\n",
+    "from pyvkfft.accuracy import l2, li\n",
     "import matplotlib.pyplot as plt\n",
     "from matplotlib.colors import LogNorm\n",
-    "from scipy.misc import ascent\n",
+    "try:\n",
+    "    from scipy.datasets import ascent\n",
+    "except:\n",
+    "    from scipy.misc import ascent\n",
     "import numpy as np\n",
     "from scipy.fft import fftn, ifftn, fftshift, rfftn, irfftn, dctn, idctn\n",
     "import timeit\n"
@@ -489,6 +494,311 @@
     "    print()"
    ]
   },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Complex<->Complex, inplace, F-ordering"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "d0 = np.asarray(ascent().astype(np.complex64), order='F')\n",
+    "for axes in [None, (-1,), (-2,)]:\n",
+    "    d = to_gpu(d0)\n",
+    "    app = VkFFTApp(d.shape, d.dtype, strides=d.strides, axes=axes)\n",
+    "\n",
+    "    plt.figure(figsize=(15, 5))\n",
+    "    plt.subplot(121)\n",
+    "    plt.imshow(np.fft.fftshift(abs(np.fft.fftn(d0, axes=axes))), norm=LogNorm())\n",
+    "    plt.colorbar()\n",
+    "\n",
+    "    d = app.fft(d)\n",
+    "\n",
+    "    plt.subplot(122)\n",
+    "    plt.imshow(np.fft.fftshift(abs(d.get())), norm=LogNorm())\n",
+    "    plt.colorbar()\n",
+    "\n",
+    "    d = app.ifft(d)\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Complex<->Complex, out-of-place, F-ordering"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "d0 = np.asarray(ascent().astype(np.complex64), order='F')\n",
+    "axes = (-1,)\n",
+    "d1 = to_gpu(d0)\n",
+    "d2 = gpu_empty_like(d1)\n",
+    "app = VkFFTApp(d1.shape, d1.dtype, strides=d1.strides, axes=axes, inplace=False)\n",
+    "\n",
+    "plt.figure(figsize=(15, 5))\n",
+    "plt.subplot(121)\n",
+    "plt.imshow(np.fft.fftshift(abs(np.fft.fftn(d0, axes=axes))), norm=LogNorm())\n",
+    "plt.colorbar()\n",
+    "\n",
+    "print((abs(d.get()) ** 2).sum())\n",
+    "d2 = app.fft(d1, d2)\n",
+    "\n",
+    "plt.subplot(122)\n",
+    "plt.imshow(np.fft.fftshift(abs(d2.get())), norm=LogNorm())\n",
+    "plt.colorbar()\n",
+    "\n",
+    "d1 = app.ifft(d2, d1)\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Complex<->Complex, inplace, F-ordering, simple FFT interface"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "order = 'F'\n",
+    "for axes in [None, (-1,), (-2,)]:\n",
+    "    d0 = np.asarray(ascent().astype(np.complex64), order=order)\n",
+    "    d = to_gpu(d0)\n",
+    "\n",
+    "    plt.figure(figsize=(15, 5))\n",
+    "    plt.subplot(121)\n",
+    "    dn = np.fft.fftshift(abs(np.fft.fftn(d0, axes=axes)))\n",
+    "    plt.imshow(dn, norm=LogNorm())\n",
+    "    plt.colorbar()\n",
+    "\n",
+    "    d = vkfftn(d, d, axes=axes)\n",
+    "\n",
+    "    plt.subplot(122)\n",
+    "    plt.imshow(np.fft.fftshift(abs(d.get())), norm=LogNorm())\n",
+    "    plt.colorbar()\n",
+    "\n",
+    "    d = vkifftn(d, d, axes=axes)\n",
+    "    \n",
+    "    # Test accuracy on random array\n",
+    "    shape, dtype = d0.shape, d0.dtype\n",
+    "    d0 = (np.random.uniform(-0.5, 0.5, shape) + 1j * np.random.uniform(-0.5, 0.5, shape)).astype(dtype)\n",
+    "    d0 = np.asarray(d0, order=order)\n",
+    "    d = to_gpu(d0)\n",
+    "    d = vkfftn(d, d, axes=axes)\n",
+    "    dn = np.fft.fftn(d0, axes=axes)\n",
+    "    print(l2(dn, d.get()), li(dn, d.get()))\n",
+    "    d = to_gpu(d0)\n",
+    "    d = vkifftn(d, d, axes=axes)\n",
+    "    dn = np.fft.ifftn(d0, axes=axes)\n",
+    "    print(l2(dn, d.get()), li(dn, d.get()))\n",
+    "    print()\n",
+    "\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Complex<->Complex, out-of-place, F-ordering, simple FFT interface\n",
+    "The destination array is automatically allocated"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "order = 'F'\n",
+    "for axes in [None, (-1,), (-2,)]:\n",
+    "    d0 = np.asarray(ascent().astype(np.complex64), order=order)\n",
+    "    d = to_gpu(d0)\n",
+    "\n",
+    "    plt.figure(figsize=(15, 5))\n",
+    "    plt.subplot(121)\n",
+    "    dn = np.fft.fftshift(abs(np.fft.fftn(d0, axes=axes)))\n",
+    "    plt.imshow(dn, norm=LogNorm())\n",
+    "    plt.colorbar()\n",
+    "\n",
+    "    d = vkfftn(d, axes=axes)\n",
+    "\n",
+    "    plt.subplot(122)\n",
+    "    plt.imshow(np.fft.fftshift(abs(d.get())), norm=LogNorm())\n",
+    "    plt.colorbar()\n",
+    "\n",
+    "    d = vkifftn(d, axes=axes)\n",
+    "    \n",
+    "    # Test accuracy on random array\n",
+    "    shape, dtype = d0.shape, d0.dtype\n",
+    "    d0 = (np.random.uniform(-0.5, 0.5, shape) + 1j * np.random.uniform(-0.5, 0.5, shape)).astype(dtype)\n",
+    "    d0 = np.asarray(d0, order=order)\n",
+    "    d = to_gpu(d0)\n",
+    "    d = vkfftn(d, axes=axes)\n",
+    "    dn = np.fft.fftn(d0, axes=axes)\n",
+    "    print(l2(dn, d.get()), li(dn, d.get()))\n",
+    "    d = to_gpu(d0)\n",
+    "    d = vkifftn(d, axes=axes)\n",
+    "    dn = np.fft.ifftn(d0, axes=axes)\n",
+    "    print(l2(dn, d.get()), li(dn, d.get()))\n",
+    "    print()\n",
+    "\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Complex<->Complex, out-of-place pre-allocated, F-ordering, simple FFT interface\n",
+    "The destination array is pre-allocated here"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "order = 'F'\n",
+    "for axes in [None, (-1,), (-2,)]:\n",
+    "    d0 = np.asarray(ascent().astype(np.complex64), order=order)\n",
+    "    d = to_gpu(d0)\n",
+    "    dest = gpu_empty_like(d)\n",
+    "\n",
+    "    plt.figure(figsize=(15, 5))\n",
+    "    plt.subplot(121)\n",
+    "    dn = np.fft.fftshift(abs(np.fft.fftn(d0, axes=axes)))\n",
+    "    plt.imshow(dn, norm=LogNorm())\n",
+    "    plt.colorbar()\n",
+    "\n",
+    "    dest = vkfftn(d, dest, axes=axes)\n",
+    "\n",
+    "    plt.subplot(122)\n",
+    "    plt.imshow(np.fft.fftshift(abs(dest.get())), norm=LogNorm())\n",
+    "    plt.colorbar()\n",
+    "\n",
+    "    dest = vkifftn(d, axes=axes)\n",
+    "    \n",
+    "    # Test accuracy on random array\n",
+    "    shape, dtype = d0.shape, d0.dtype\n",
+    "    d0 = (np.random.uniform(-0.5, 0.5, shape) + 1j * np.random.uniform(-0.5, 0.5, shape)).astype(dtype)\n",
+    "    d0 = np.asarray(d0, order=order)\n",
+    "    \n",
+    "    d = to_gpu(d0)\n",
+    "    dest = vkfftn(d, dest, axes=axes)\n",
+    "    dn = np.fft.fftn(d0, axes=axes)\n",
+    "    print(l2(dn, dest.get()), li(dn, dest.get()))\n",
+    "    \n",
+    "    d = to_gpu(d0)\n",
+    "    dest = vkifftn(d, dest, axes=axes)\n",
+    "    dn = np.fft.ifftn(d0, axes=axes)\n",
+    "    print(l2(dn, dest.get()), li(dn, dest.get()))\n",
+    "    print()\n",
+    "\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Real<->Complex (half-Hermitian) inplace transform, F-ordered, simple fft interface"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# single precision, R2C inplace, # F order\n",
+    "order = 'F'\n",
+    "d0 = np.zeros((514, 512), dtype=np.float32)\n",
+    "d0[:512] = ascent()\n",
+    "d0 = np.asarray(d0, order=order)\n",
+    "\n",
+    "d = to_gpu(d0)\n",
+    "\n",
+    "d = vkrfftn(d, d)\n",
+    "\n",
+    "dn = rfftn(d0[:-2], axes=(-1,-2))  # Need to force the Hermitian axis as the fast one\n",
+    "print(dn.shape)\n",
+    "print(np.allclose(d.get(), dn, rtol=1e-6, atol=dn.max() * 1e-6))\n",
+    "\n",
+    "plt.figure(figsize=(15,5), dpi=100)\n",
+    "plt.subplot(121)\n",
+    "plt.imshow(fftshift(abs(d.get()), axes=0), norm=LogNorm(vmin=100))\n",
+    "plt.colorbar()\n",
+    "plt.title(\"R->C\")\n",
+    "plt.subplot(122)\n",
+    "plt.imshow(fftshift(abs(dn), axes=0), norm=LogNorm(vmin=100))\n",
+    "plt.colorbar()\n",
+    "plt.title(\"R->C (numpy)\")\n",
+    "\n",
+    "d = vkirfftn(d, d)\n",
+    "print(np.allclose(d.get()[:-2], d0[:-2], rtol=1e-6, atol = 255e-6))\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "d.strides , dn.strides"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Real<->Complex (half-Hermitian) out-of-place, F-ordered, simple fft interface"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# single precision, R2C inplace, # F order\n",
+    "order = 'F'\n",
+    "d0 = ascent().astype(np.float32)\n",
+    "d0 = np.asarray(d0, order=order)\n",
+    "\n",
+    "d = to_gpu(d0)\n",
+    "\n",
+    "d = vkrfftn(d)\n",
+    "\n",
+    "dn = rfftn(d0, axes=(-1,-2))  # Need to force the Hermitian axis as the fast one\n",
+    "print(dn.shape, d.shape)\n",
+    "print(np.allclose(d.get(), dn, rtol=1e-6, atol=dn.max() * 1e-6))\n",
+    "\n",
+    "plt.figure(figsize=(15,5), dpi=100)\n",
+    "plt.subplot(121)\n",
+    "plt.imshow(fftshift(abs(d.get()), axes=0), norm=LogNorm(vmin=100))\n",
+    "plt.colorbar()\n",
+    "plt.title(\"R->C\")\n",
+    "plt.subplot(122)\n",
+    "plt.imshow(fftshift(abs(dn), axes=0), norm=LogNorm(vmin=100))\n",
+    "plt.colorbar()\n",
+    "plt.title(\"R->C (numpy)\")\n",
+    "\n",
+    "d = vkirfftn(d)\n",
+    "print(np.allclose(d.get(), d0, rtol=1e-6, atol = 255e-6))\n"
+   ]
+  },
   {
    "cell_type": "code",
    "execution_count": null,
@@ -499,9 +809,9 @@
  ],
  "metadata": {
   "kernelspec": {
-   "display_name": "Python 3 (ipykernel)",
+   "display_name": "py39-pyvkfft-test-conda",
    "language": "python",
-   "name": "python3"
+   "name": "py39-pyvkfft-test-conda"
   },
   "language_info": {
    "codemirror_mode": {
@@ -513,7 +823,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.8.6"
+   "version": "3.9.15"
   }
  },
  "nbformat": 4,
diff --git a/examples/pyvkfft-tests-OpenCL.ipynb b/examples/pyvkfft-tests-OpenCL.ipynb
index 7f2c46a..719b4ed 100644
--- a/examples/pyvkfft-tests-OpenCL.ipynb
+++ b/examples/pyvkfft-tests-OpenCL.ipynb
@@ -36,10 +36,15 @@
     "import pyopencl as cl\n",
     "import pyopencl.array as cla\n",
     "import pyvkfft.opencl\n",
+    "from pyvkfft.fft import fftn as vkfftn, ifftn as vkifftn, rfftn as vkrfftn, irfftn as vkirfftn\n",
     "from pyvkfft.opencl import VkFFTApp\n",
+    "from pyvkfft.accuracy import l2, li\n",
     "import matplotlib.pyplot as plt\n",
     "from matplotlib.colors import LogNorm\n",
-    "from scipy.misc import ascent\n",
+    "try:\n",
+    "    from scipy.datasets import ascent\n",
+    "except:\n",
+    "    from scipy.misc import ascent\n",
     "import numpy as np\n",
     "from scipy.fft import fftn, ifftn, fftshift, rfftn, irfftn, dctn, idctn\n",
     "import timeit\n",
@@ -480,19 +485,212 @@
     "    print()"
    ]
   },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Complex<->Complex, inplace, F-ordering"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "\n",
+    "d0 = np.asarray(ascent().astype(np.complex64), order='F')\n",
+    "for axes in [None, (-1,), (-2,)]:\n",
+    "    d = cla.to_device(cq, d0)\n",
+    "    app = VkFFTApp(d.shape, d.dtype, cq, strides=d.strides, axes=axes)\n",
+    "\n",
+    "    plt.figure(figsize=(15, 5))\n",
+    "    plt.subplot(121)\n",
+    "    plt.imshow(np.fft.fftshift(abs(np.fft.fftn(d0, axes=axes))), norm=LogNorm())\n",
+    "    plt.colorbar()\n",
+    "\n",
+    "    d = app.fft(d)\n",
+    "\n",
+    "    plt.subplot(122)\n",
+    "    plt.imshow(np.fft.fftshift(abs(d.get())), norm=LogNorm())\n",
+    "    plt.colorbar()\n",
+    "\n",
+    "    d = app.ifft(d)\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Complex<->Complex, out-of-place, F-ordering"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "d0 = np.asarray(ascent().astype(np.complex64), order='F')\n",
+    "axes = (-1,)\n",
+    "d1 = cla.to_device(cq, d0)\n",
+    "d2 = cla.empty_like(d1)\n",
+    "app = VkFFTApp(d1.shape, d1.dtype, cq, strides=d1.strides, axes=axes, inplace=False)\n",
+    "\n",
+    "plt.figure(figsize=(15, 5))\n",
+    "plt.subplot(121)\n",
+    "plt.imshow(np.fft.fftshift(abs(np.fft.fftn(d0, axes=axes))), norm=LogNorm())\n",
+    "plt.colorbar()\n",
+    "\n",
+    "print((abs(d.get()) ** 2).sum())\n",
+    "d2 = app.fft(d1, d2)\n",
+    "\n",
+    "plt.subplot(122)\n",
+    "plt.imshow(np.fft.fftshift(abs(d2.get())), norm=LogNorm())\n",
+    "plt.colorbar()\n",
+    "\n",
+    "d1 = app.ifft(d2, d1)\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Complex<->Complex, inplace, F-ordering, simple FFT interface"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "order = 'F'\n",
+    "for axes in [None, (-1,), (-2,)]:\n",
+    "    d0 = np.asarray(ascent().astype(np.complex64), order=order)\n",
+    "    d = cla.to_device(cq, d0)\n",
+    "\n",
+    "    plt.figure(figsize=(15, 5))\n",
+    "    plt.subplot(121)\n",
+    "    dn = np.fft.fftshift(abs(np.fft.fftn(d0, axes=axes)))\n",
+    "    plt.imshow(dn, norm=LogNorm())\n",
+    "    plt.colorbar()\n",
+    "\n",
+    "    d = vkfftn(d, d, axes=axes)\n",
+    "\n",
+    "    plt.subplot(122)\n",
+    "    plt.imshow(np.fft.fftshift(abs(d.get())), norm=LogNorm())\n",
+    "    plt.colorbar()\n",
+    "\n",
+    "    d = vkifftn(d, d, axes=axes)\n",
+    "    \n",
+    "    # Test accuracy on random array\n",
+    "    shape, dtype = d0.shape, d0.dtype\n",
+    "    d0 = (np.random.uniform(-0.5, 0.5, shape) + 1j * np.random.uniform(-0.5, 0.5, shape)).astype(dtype)\n",
+    "    d0 = np.asarray(d0, order=order)\n",
+    "    \n",
+    "    d = cla.to_device(cq, d0)\n",
+    "    d = vkfftn(d, d, axes=axes)\n",
+    "    dn = np.fft.fftn(d0, axes=axes)\n",
+    "    print(l2(dn, d.get()), li(dn, d.get()))\n",
+    "    \n",
+    "    d = cla.to_device(cq, d0)\n",
+    "    d = vkifftn(d, d, axes=axes)\n",
+    "    dn = np.fft.ifftn(d0, axes=axes)\n",
+    "    print(l2(dn, d.get()), li(dn, d.get()))\n",
+    "    print()\n",
+    "\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Real<->Complex (half-Hermitian) inplace transform, F-ordered, simple fft interface"
+   ]
+  },
   {
    "cell_type": "code",
    "execution_count": null,
    "metadata": {},
    "outputs": [],
-   "source": []
+   "source": [
+    "# single precision, R2C inplace, # F order\n",
+    "order = 'F'\n",
+    "d0 = np.zeros((514, 512), dtype=np.float32)\n",
+    "d0[:512] = ascent()\n",
+    "d0 = np.asarray(d0, order=order)\n",
+    "\n",
+    "d = cla.to_device(cq, d0)\n",
+    "\n",
+    "d = vkrfftn(d, d)\n",
+    "\n",
+    "dn = rfftn(d0[:-2], axes=(-1,-2))  # Need to force the Hermitian axis as the fast one\n",
+    "print(dn.shape)\n",
+    "print(np.allclose(d.get(), dn, rtol=1e-6, atol=dn.max() * 1e-6))\n",
+    "\n",
+    "plt.figure(figsize=(15,5), dpi=100)\n",
+    "plt.subplot(121)\n",
+    "plt.imshow(fftshift(abs(d.get()), axes=0), norm=LogNorm(vmin=100))\n",
+    "plt.colorbar()\n",
+    "plt.title(\"R->C\")\n",
+    "plt.subplot(122)\n",
+    "plt.imshow(fftshift(abs(dn), axes=0), norm=LogNorm(vmin=100))\n",
+    "plt.colorbar()\n",
+    "plt.title(\"R->C (numpy)\")\n",
+    "\n",
+    "d = vkirfftn(d, d)\n",
+    "print(np.allclose(d.get()[:-2], d0[:-2], rtol=1e-6, atol = 255e-6))\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Real<->Complex (half-Hermitian) out-of-place, F-ordered, simple fft interface"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "scrolled": false
+   },
+   "outputs": [],
+   "source": [
+    "# single precision, R2C inplace, # F order\n",
+    "order = 'F'\n",
+    "d0 = ascent().astype(np.float32)\n",
+    "d0 = np.asarray(d0, order=order)\n",
+    "\n",
+    "d = cla.to_device(cq, d0)\n",
+    "\n",
+    "d = vkrfftn(d)\n",
+    "\n",
+    "dn = rfftn(d0, axes=(-1,-2))  # Need to force the Hermitian axis as the fast one\n",
+    "print(dn.shape, d.shape)\n",
+    "print(np.allclose(d.get(), dn, rtol=1e-6, atol=dn.max() * 1e-6))\n",
+    "\n",
+    "plt.figure(figsize=(15,5), dpi=100)\n",
+    "plt.subplot(121)\n",
+    "plt.imshow(fftshift(abs(d.get()), axes=0), norm=LogNorm(vmin=100))\n",
+    "plt.colorbar()\n",
+    "plt.title(\"R->C\")\n",
+    "plt.subplot(122)\n",
+    "plt.imshow(fftshift(abs(dn), axes=0), norm=LogNorm(vmin=100))\n",
+    "plt.colorbar()\n",
+    "plt.title(\"R->C (numpy)\")\n",
+    "\n",
+    "d = vkirfftn(d)\n",
+    "print(np.allclose(d.get(), d0, rtol=1e-6, atol = 255e-6))\n"
+   ]
   }
  ],
  "metadata": {
   "kernelspec": {
-   "display_name": "Python 3 (ipykernel)",
+   "display_name": "py39-pyvkfft-test-conda",
    "language": "python",
-   "name": "python3"
+   "name": "py39-pyvkfft-test-conda"
   },
   "language_info": {
    "codemirror_mode": {
@@ -504,7 +702,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.8.6"
+   "version": "3.9.15"
   }
  },
  "nbformat": 4,