diff --git a/.github/workflows/apps.yml b/.github/workflows/apps.yml
index 2ff10f92462..ca333285df8 100644
--- a/.github/workflows/apps.yml
+++ b/.github/workflows/apps.yml
@@ -95,6 +95,7 @@ jobs:
             -DWarpX_OPENPMD=OFF                   \
             -DCMAKE_VERBOSE_MAKEFILE=ON           \
             -DCMAKE_CXX_COMPILER_LAUNCHER=ccache  \
+            -DAMReX_FFT=ON                        \
             -DAMReX_LINEAR_SOLVER_INCFLO=OFF
         cmake --build WarpX/build -j 4
 
diff --git a/.github/workflows/clang.yml b/.github/workflows/clang.yml
index c84d19850a7..1ed5240164f 100644
--- a/.github/workflows/clang.yml
+++ b/.github/workflows/clang.yml
@@ -44,6 +44,7 @@ jobs:
             -DCMAKE_BUILD_TYPE=Debug    \
             -DCMAKE_VERBOSE_MAKEFILE=ON \
             -DCMAKE_INSTALL_PREFIX=/tmp/my-amrex      \
+            -DAMReX_FFT=ON                            \
             -DAMReX_EB=ON                             \
             -DAMReX_FORTRAN=ON                        \
             -DAMReX_MPI=OFF                           \
@@ -104,6 +105,7 @@ jobs:
         cmake ..                                      \
             -DCMAKE_BUILD_TYPE=Debug                  \
             -DCMAKE_VERBOSE_MAKEFILE=ON               \
+            -DAMReX_FFT=ON                            \
             -DAMReX_EB=ON                             \
             -DAMReX_ENABLE_TESTS=ON                   \
             -DAMReX_FORTRAN=ON                        \
@@ -158,6 +160,7 @@ jobs:
         cmake ..                                      \
             -DCMAKE_BUILD_TYPE=RelWithDebInfo         \
             -DCMAKE_VERBOSE_MAKEFILE=ON               \
+            -DAMReX_FFT=ON                            \
             -DAMReX_EB=ON                             \
             -DAMReX_ENABLE_TESTS=ON                   \
             -DAMReX_FORTRAN=OFF                       \
@@ -200,7 +203,7 @@ jobs:
         export CCACHE_LOGFILE=${{ github.workspace }}/ccache.log.txt
         ccache -z
 
-        ./configure --dim 2 --with-fortran no --comp llvm --with-mpi no
+        ./configure --dim 2 --with-fortran no --comp llvm --with-mpi no --enable-fft yes
         make -j4 WARN_ALL=TRUE WARN_ERROR=TRUE XTRA_CXXFLAGS="-fno-operator-names" \
             CCACHE=ccache
         make install
diff --git a/.github/workflows/cuda.yml b/.github/workflows/cuda.yml
index 9e96aefac5e..927e99ded40 100644
--- a/.github/workflows/cuda.yml
+++ b/.github/workflows/cuda.yml
@@ -38,6 +38,7 @@ jobs:
 
         cmake -S . -B build                              \
             -DCMAKE_VERBOSE_MAKEFILE=ON                  \
+            -DAMReX_FFT=ON                               \
             -DAMReX_EB=ON                                \
             -DAMReX_ENABLE_TESTS=ON                      \
             -DAMReX_FORTRAN=OFF                          \
@@ -97,6 +98,7 @@ jobs:
         cmake -S . -B build                              \
             -DCMAKE_VERBOSE_MAKEFILE=ON                  \
             -DAMReX_MPI=OFF                              \
+            -DAMReX_FFT=ON                               \
             -DAMReX_EB=ON                                \
             -DAMReX_ENABLE_TESTS=ON                      \
             -DAMReX_FORTRAN=OFF                          \
@@ -153,6 +155,7 @@ jobs:
             -DCMAKE_VERBOSE_MAKEFILE=ON                  \
             -DAMReX_ENABLE_TESTS=ON                      \
             -DAMReX_TEST_TYPE=Small                      \
+            -DAMReX_FFT=ON                               \
             -DAMReX_FORTRAN=ON                           \
             -DAMReX_FORTRAN_INTERFACES=ON                \
             -DAMReX_GPU_BACKEND=CUDA                     \
@@ -196,7 +199,7 @@ jobs:
         ccache -z
 
         export PATH=/usr/local/nvidia/bin:/usr/local/cuda/bin:${PATH}
-        ./configure --dim 3 --with-cuda yes --enable-eb yes --enable-xsdk-defaults yes --with-fortran no
+        ./configure --dim 3 --with-cuda yes --enable-eb yes --enable-xsdk-defaults yes --with-fortran no --enable-fft yes
         #
         # /home/runner/work/amrex/amrex/Src/Base/AMReX_GpuLaunchGlobal.H:16:41: error: unused parameter ‘f0’ [-Werror=unused-parameter]
         #    16 |     AMREX_GPU_GLOBAL void launch_global (L f0) { f0(); }
diff --git a/.github/workflows/dependencies/dependencies.sh b/.github/workflows/dependencies/dependencies.sh
index 07e461f577f..c7cde496519 100755
--- a/.github/workflows/dependencies/dependencies.sh
+++ b/.github/workflows/dependencies/dependencies.sh
@@ -16,6 +16,7 @@ sudo apt-get update
 
 sudo apt-get install -y --no-install-recommends\
     build-essential \
+    libfftw3-dev    \
     g++ gfortran    \
     libopenmpi-dev  \
     openmpi-bin
diff --git a/.github/workflows/dependencies/dependencies_clang.sh b/.github/workflows/dependencies/dependencies_clang.sh
index 2e96b5196d8..4c329321b68 100755
--- a/.github/workflows/dependencies/dependencies_clang.sh
+++ b/.github/workflows/dependencies/dependencies_clang.sh
@@ -16,5 +16,6 @@ sudo apt-get update
 
 sudo apt-get install -y --no-install-recommends \
     build-essential      \
+    libfftw3-dev         \
     gfortran             \
     clang-$1
diff --git a/.github/workflows/dependencies/dependencies_gcc.sh b/.github/workflows/dependencies/dependencies_gcc.sh
index 2a576c0b52d..93d9aa27ec4 100755
--- a/.github/workflows/dependencies/dependencies_gcc.sh
+++ b/.github/workflows/dependencies/dependencies_gcc.sh
@@ -17,6 +17,7 @@ sudo apt-get update
 
 sudo apt-get install -y --no-install-recommends \
     build-essential    \
+    libfftw3-dev       \
     g++-$1 gfortran-$1 \
     libopenmpi-dev     \
     openmpi-bin
diff --git a/.github/workflows/dependencies/dependencies_hip.sh b/.github/workflows/dependencies/dependencies_hip.sh
index ab5185ce0ae..df4f274ef36 100755
--- a/.github/workflows/dependencies/dependencies_hip.sh
+++ b/.github/workflows/dependencies/dependencies_hip.sh
@@ -56,6 +56,7 @@ sudo apt-get install -y --no-install-recommends \
     roctracer-dev   \
     rocprofiler-dev \
     rocrand-dev     \
+    rocfft-dev      \
     rocprim-dev
 
 # hiprand-dev is a new package that does not exist in old versions
diff --git a/.github/workflows/dependencies/dependencies_nvcc.sh b/.github/workflows/dependencies/dependencies_nvcc.sh
index abf95048013..2578bd33fe7 100755
--- a/.github/workflows/dependencies/dependencies_nvcc.sh
+++ b/.github/workflows/dependencies/dependencies_nvcc.sh
@@ -35,5 +35,6 @@ sudo apt-get install -y \
     cuda-minimal-build-$VERSION_DASHED      \
     cuda-nvml-dev-$VERSION_DASHED           \
     cuda-nvtx-$VERSION_DASHED               \
+    libcufft-dev-$VERSION_DASHED            \
     libcurand-dev-$VERSION_DASHED
 sudo ln -s cuda-$VERSION_DOTTED /usr/local/cuda
diff --git a/.github/workflows/gcc.yml b/.github/workflows/gcc.yml
index b64dbd3b5f0..88fe47c988c 100644
--- a/.github/workflows/gcc.yml
+++ b/.github/workflows/gcc.yml
@@ -42,6 +42,7 @@ jobs:
         mkdir build
         cd build
         cmake ..                                  \
+            -DAMReX_FFT=ON                        \
             -DAMReX_FORTRAN=ON                    \
             -DAMReX_PLOTFILE_TOOLS=ON             \
             -DCMAKE_VERBOSE_MAKEFILE=ON           \
@@ -99,6 +100,7 @@ jobs:
         cmake -S . -B build             \
             -DCMAKE_BUILD_TYPE=Debug    \
             -DCMAKE_VERBOSE_MAKEFILE=ON \
+            -DAMReX_FFT=ON              \
             -DAMReX_EB=ON               \
             -DAMReX_ENABLE_TESTS=ON     \
             -DAMReX_FORTRAN=ON          \
@@ -147,6 +149,7 @@ jobs:
         cmake -S . -B build             \
             -DCMAKE_BUILD_TYPE=Debug    \
             -DCMAKE_VERBOSE_MAKEFILE=ON \
+            -DAMReX_FFT=ON              \
             -DAMReX_EB=ON               \
             -DAMReX_ENABLE_TESTS=ON     \
             -DAMReX_FORTRAN=ON          \
@@ -196,6 +199,7 @@ jobs:
         cmake -S . -B build             \
             -DCMAKE_BUILD_TYPE=Debug    \
             -DCMAKE_VERBOSE_MAKEFILE=ON \
+            -DAMReX_FFT=ON              \
             -DAMReX_EB=OFF              \
             -DAMReX_ENABLE_TESTS=ON     \
             -DAMReX_FORTRAN=ON          \
@@ -248,6 +252,7 @@ jobs:
             -DCMAKE_VERBOSE_MAKEFILE=ON \
             -DAMReX_ASSERTIONS=ON       \
             -DAMReX_TESTING=ON          \
+            -DAMReX_FFT=ON              \
             -DAMReX_EB=OFF              \
             -DAMReX_ENABLE_TESTS=ON     \
             -DAMReX_BOUND_CHECK=ON      \
@@ -310,6 +315,7 @@ jobs:
             -DAMReX_TESTING=ON          \
             -DAMReX_BOUND_CHECK=ON      \
             -DAMReX_FPE=ON              \
+            -DAMReX_FFT=ON              \
             -DAMReX_EB=ON               \
             -DAMReX_ENABLE_TESTS=ON     \
             -DAMReX_FORTRAN=ON          \
@@ -371,6 +377,7 @@ jobs:
             -DAMReX_TESTING=ON          \
             -DAMReX_BOUND_CHECK=ON      \
             -DAMReX_FPE=ON              \
+            -DAMReX_FFT=ON              \
             -DAMReX_EB=ON               \
             -DAMReX_ENABLE_TESTS=ON     \
             -DAMReX_FORTRAN=OFF         \
@@ -457,7 +464,7 @@ jobs:
         export CCACHE_LOGFILE=${{ github.workspace }}/ccache.log.txt
         ccache -z
 
-        ./configure --dim 3 --enable-eb yes --enable-xsdk-defaults yes
+        ./configure --dim 3 --enable-eb yes --enable-xsdk-defaults yes --enable-fft yes
         make -j4 WARN_ALL=TRUE WARN_ERROR=TRUE XTRA_CXXFLAGS=-fno-operator-names \
             CCACHE=ccache
         make install
@@ -497,7 +504,8 @@ jobs:
         export CCACHE_LOGFILE=${{ github.workspace }}/ccache.log.txt
         ccache -z
 
-        ./configure --dim 3 --enable-eb no --enable-xsdk-defaults no --single-precision yes --single-precision-particles yes --enable-tiny-profile yes
+        ./configure --dim 3 --enable-eb no --enable-xsdk-defaults no --single-precision yes \
+                    --single-precision-particles yes --enable-tiny-profile yes --enable-fft yes
         make -j4 WARN_ALL=TRUE WARN_ERROR=TRUE XTRA_CXXFLAGS=-fno-operator-names \
             CCACHE=ccache
         make install
@@ -623,6 +631,7 @@ jobs:
             -DAMReX_OMP=ON              \
             -DCMAKE_VERBOSE_MAKEFILE=ON \
             -DAMReX_ENABLE_TESTS=ON     \
+            -DAMReX_FFT=ON              \
             -DCMAKE_CXX_COMPILER_LAUNCHER=ccache
         make -j 4
 
diff --git a/.github/workflows/hip.yml b/.github/workflows/hip.yml
index 345d7c468b5..22154d6b012 100644
--- a/.github/workflows/hip.yml
+++ b/.github/workflows/hip.yml
@@ -48,6 +48,7 @@ jobs:
 
         cmake -S . -B build                               \
             -DCMAKE_VERBOSE_MAKEFILE=ON                   \
+            -DAMReX_FFT=ON                                \
             -DAMReX_EB=ON                                 \
             -DAMReX_ENABLE_TESTS=ON                       \
             -DAMReX_FORTRAN=ON                            \
@@ -103,6 +104,7 @@ jobs:
 
         cmake -S . -B build_full_legacywrapper            \
             -DCMAKE_VERBOSE_MAKEFILE=ON                   \
+            -DAMReX_FFT=ON                                \
             -DAMReX_EB=OFF                                \
             -DAMReX_ENABLE_TESTS=ON                       \
             -DAMReX_FORTRAN=ON                            \
@@ -145,7 +147,9 @@ jobs:
         export CCACHE_MAXSIZE=100M
         ccache -z
 
-        ./configure --dim 2 --with-hip yes --enable-eb yes --enable-xsdk-defaults yes --with-mpi no --with-omp no --single-precision yes --single-precision-particles yes
+        ./configure --dim 2 --with-hip yes --enable-eb yes --enable-xsdk-defaults yes \
+                    --with-mpi no --with-omp no --single-precision yes \
+                    --single-precision-particles yes --enable-fft yes
         make -j4 WARN_ALL=TRUE AMD_ARCH=gfx90a CCACHE=ccache
         make install
 
diff --git a/.github/workflows/intel.yml b/.github/workflows/intel.yml
index 227b0f97380..15c7bbda587 100644
--- a/.github/workflows/intel.yml
+++ b/.github/workflows/intel.yml
@@ -41,6 +41,7 @@ jobs:
         set -e
         cmake -S . -B build                                \
             -DCMAKE_VERBOSE_MAKEFILE=ON                    \
+            -DAMReX_FFT=ON                                 \
             -DAMReX_EB=OFF                                 \
             -DAMReX_ENABLE_TESTS=ON                        \
             -DAMReX_FORTRAN=ON                             \
@@ -89,6 +90,7 @@ jobs:
         set -e
         cmake -S . -B build                                \
             -DCMAKE_VERBOSE_MAKEFILE=ON                    \
+            -DAMReX_FFT=ON                                 \
             -DAMReX_EB=ON                                  \
             -DAMReX_ENABLE_TESTS=ON                        \
             -DAMReX_FORTRAN=OFF                            \
diff --git a/Docs/sphinx_documentation/source/BuildingAMReX.rst b/Docs/sphinx_documentation/source/BuildingAMReX.rst
index 064d11e740a..831346765b9 100644
--- a/Docs/sphinx_documentation/source/BuildingAMReX.rst
+++ b/Docs/sphinx_documentation/source/BuildingAMReX.rst
@@ -475,6 +475,8 @@ The list of available options is reported in the :ref:`table <tab:cmakevar>` bel
    +------------------------------+-------------------------------------------------+-------------------------+-----------------------+
    | AMReX_EB                     |  Build Embedded Boundary support                | NO                      | YES, NO               |
    +------------------------------+-------------------------------------------------+-------------------------+-----------------------+
+   | AMReX_FFT                    |  Build FFT support                              | NO                      | YES, NO               |
+   +------------------------------+-------------------------------------------------+-------------------------+-----------------------+
    | AMReX_PARTICLES              |  Build particle classes                         | YES                     | YES, NO               |
    +------------------------------+-------------------------------------------------+-------------------------+-----------------------+
    | AMReX_PARTICLES_PRECISION    |  Set reals precision in particle classes        | Same as AMReX_PRECISION | DOUBLE, SINGLE        |
@@ -697,6 +699,8 @@ A list of AMReX component names and related configure options are shown in the t
    +------------------------------+-----------------+
    | AMReX_EB                     | EB              |
    +------------------------------+-----------------+
+   | AMReX_FFT                    | FFT             |
+   +------------------------------+-----------------+
    | AMReX_PARTICLES              | PARTICLES       |
    +------------------------------+-----------------+
    | AMReX_PARTICLES_PRECISION    | PDOUBLE, PSINGLE|
diff --git a/Docs/sphinx_documentation/source/FFT.rst b/Docs/sphinx_documentation/source/FFT.rst
new file mode 100644
index 00000000000..3fc24fcab86
--- /dev/null
+++ b/Docs/sphinx_documentation/source/FFT.rst
@@ -0,0 +1,71 @@
+.. role:: cpp(code)
+   :language: c++
+
+.. _sec:FFT:r2c:
+
+FFT::R2C Class
+==============
+
+Class template `FFT::R2C` supports discrete Fourier transforms between real
+and complex data. The name R2C indicates that the forward transform converts
+real data to complex data, while the backward transform converts complex
+data to real data. It should be noted that both directions of transformation
+are supported, not just from real to complex.
+
+The implementation utilizes cuFFT, rocFFT, oneMKL and FFTW, for CUDA, HIP,
+SYCL and CPU builds, respectively. Because the parallel communication is
+handled by AMReX, it does not need the parallel version of
+FFTW. Furthermore, there is no constraint on the domain decomposition such
+as one Box per process. This class performs parallel FFT on AMReX's parallel
+data containers (e.g., :cpp:`MultiFab` and
+:cpp:`FabArray<BaseFab<ComplexData<Real>>>`. For local FFT, the users can
+use FFTW, cuFFT, rocFFT, or oneMKL directly.
+
+Other than using column-majored order, AMReX follows the convention of
+FFTW. Applying the forward transform followed by the backward transform
+scales the original data by the size of the input array. The layout of the
+complex data also follows the FFTW convention, where the complex Hermitian
+output array has `(nx/2+1,ny,nz)` elements. Here `nx`, `ny` and `nz` are the
+sizes of the real array and the division is rounded down.
+
+Below are examples of using :cpp:`FFT:R2C`.
+
+.. highlight:: c++
+
+::
+
+    Geometry geom(...);
+    MultiFab mfin(...);
+    MultiFab mfout(...);
+
+    auto scaling = 1. / geom.Domain().d_numPts();
+
+    FFT::R2C r2c(geom.Domain());
+    r2c.forwardThenBackward(mfin, mfout,
+        [=] AMREX_GPU_DEVICE (int, int, int, auto& sp)
+        {
+            sp *= scaling;
+        });
+
+    cMultiFab cmf(...);
+    FFT::R2C<Real,FFT::Direction::forward> r2c_forward(geom.Domain());
+    r2c_forward(mfin, cmf);
+
+    FFT::R2C<Real,FFT::Direction::backward> r2c_backward(geom.Domain());
+    r2c_backward(cmf, mfout);
+
+Note that using :cpp:`forwardThenBackward` is expected to be more efficient
+than separate calls to :cpp:`forward` and :cpp:`backward` because some
+parallel communication can be avoided. It should also be noted that a lot of
+preparation works are done in the construction of an :cpp:`FFT::R2C`
+object. Therefore, one should cache it for reuse if possible.
+
+
+Poisson Solver
+==============
+
+AMReX provides FFT based Poisson solvers. :cpp:`FFT::Poisson` supports all
+periodic boundaries using purely FFT. :cpp:`FFT::PoissonHybrid` is a 3D only
+solver that supports periodic boundaries in the first two dimensions and
+Neumann boundary in the last dimension. Similar to :cpp:`FFT::R2C`, the
+Poisson solvers should be cached for reuse.
diff --git a/Docs/sphinx_documentation/source/FFT_Chapter.rst b/Docs/sphinx_documentation/source/FFT_Chapter.rst
new file mode 100644
index 00000000000..9d6e9505d4f
--- /dev/null
+++ b/Docs/sphinx_documentation/source/FFT_Chapter.rst
@@ -0,0 +1,16 @@
+.. _Chap:FFT:
+
+.. _sec:FFT:FFTOverview:
+
+Discrete Fourier Transform
+==========================
+
+AMReX provides support for parallel discrete Fourier transform. The
+implementation utilizes cuFFT, rocFFT, oneMKL and FFTW, for CUDA, HIP, SYCL
+and CPU builds, respectively. It also provides FFT based Poisson
+solvers.
+
+.. toctree::
+   :maxdepth: 1
+
+   FFT
diff --git a/Docs/sphinx_documentation/source/index.rst b/Docs/sphinx_documentation/source/index.rst
index 203545cf40a..09ffbb5c0b4 100644
--- a/Docs/sphinx_documentation/source/index.rst
+++ b/Docs/sphinx_documentation/source/index.rst
@@ -52,6 +52,7 @@ Documentation on migration from BoxLib is available in the AMReX repository at D
    Fortran_Chapter
    Python_Chapter
    EB_Chapter
+   FFT_Chapter
    TimeIntegration_Chapter
    GPU_Chapter
    Visualization_Chapter
diff --git a/GNUmakefile.in b/GNUmakefile.in
index b85c2e0c35e..67c789d97ca 100644
--- a/GNUmakefile.in
+++ b/GNUmakefile.in
@@ -26,6 +26,9 @@ ifeq ($(USE_LINEAR_SOLVERS),TRUE)
      Pdirs += F_Interfaces/LinearSolvers
    endif
 endif
+ifeq ($(USE_FFT),TRUE)
+   Pdirs += FFT
+endif
 ifeq ($(USE_EB),TRUE)
    Pdirs += EB
 endif
diff --git a/Src/Base/AMReX_FabArray.H b/Src/Base/AMReX_FabArray.H
index 69970d6401f..a67a72f0a38 100644
--- a/Src/Base/AMReX_FabArray.H
+++ b/Src/Base/AMReX_FabArray.H
@@ -13,6 +13,7 @@
 #include <AMReX_FabFactory.H>
 #include <AMReX_DistributionMapping.H>
 #include <AMReX_Geometry.H>
+#include <AMReX_GpuComplex.H>
 #include <AMReX_ParallelDescriptor.H>
 #include <AMReX_Utility.H>
 #include <AMReX_ccse-mpi.H>
@@ -3679,6 +3680,8 @@ FabArray<FAB>::norminf (FabArray<IFAB> const& mask, int comp, int ncomp,
     return nm0;
 }
 
+using cMultiFab = FabArray<BaseFab<GpuComplex<Real> > >;
+
 }
 
 #endif /*BL_FABARRAY_H*/
diff --git a/Src/Base/AMReX_GpuComplex.H b/Src/Base/AMReX_GpuComplex.H
index 274da82604d..42dfc7626e9 100644
--- a/Src/Base/AMReX_GpuComplex.H
+++ b/Src/Base/AMReX_GpuComplex.H
@@ -41,16 +41,16 @@ struct alignas(2*sizeof(T)) GpuComplex
     /**
      * \brief Return the real part.
      */
-    AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE
+    [[nodiscard]] AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE
     constexpr T real () const noexcept { return m_real; }
 
     /**
      * \brief Return the imaginary part.
      */
-    AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE
+    [[nodiscard]] AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE
     constexpr T imag () const noexcept { return m_imag; }
 
-   /**
+    /**
      * \brief Add a real number to this complex number.
      */
     AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE
diff --git a/Src/Base/AMReX_GpuError.H b/Src/Base/AMReX_GpuError.H
index ce3ac188a85..65457c8f4e7 100644
--- a/Src/Base/AMReX_GpuError.H
+++ b/Src/Base/AMReX_GpuError.H
@@ -84,6 +84,16 @@ namespace Gpu {
     std::string errStr(std::string("CURAND error in file ") + __FILE__  \
                        + " line " + std::to_string(__LINE__));          \
     amrex::Abort(errStr); }} while(0)
+
+#define AMREX_CUFFT_SAFE_CALL(call) { \
+    cufftResult_t amrex_i_err = call; \
+    if (CUFFT_SUCCESS != amrex_i_err) { \
+        std::string errStr(std::string("CUFFT error ")+std::to_string(amrex_i_err) \
+                           + std::string(" in file ") + __FILE__        \
+                           + " line " + std::to_string(__LINE__));      \
+        amrex::Abort(errStr); \
+    }}
+
 #endif
 
 #ifdef AMREX_USE_HIP
@@ -100,6 +110,16 @@ namespace Gpu {
     std::string errStr(std::string("HIPRAND error in file ") + __FILE__  \
                        + " line " + std::to_string(__LINE__));          \
     amrex::Abort(errStr); }} while(0)
+
+#define AMREX_ROCFFT_SAFE_CALL(call) { \
+    auto amrex_i_err = call; \
+    if (rocfft_status_success != amrex_i_err) { \
+        std::string errStr(std::string("rocFFT error ")+std::to_string(amrex_i_err) \
+                           + std::string(" in file ") + __FILE__        \
+                           + " line " + std::to_string(__LINE__));      \
+        amrex::Abort(errStr); \
+    }}
+
 #endif
 
 #define AMREX_GPU_ERROR_CHECK() amrex::Gpu::ErrorCheck(__FILE__, __LINE__)
diff --git a/Src/CMakeLists.txt b/Src/CMakeLists.txt
index 6e8af043e0d..25455d72636 100644
--- a/Src/CMakeLists.txt
+++ b/Src/CMakeLists.txt
@@ -136,6 +136,10 @@ if (AMReX_PARTICLES)
    add_subdirectory(Particle)
 endif ()
 
+if (AMReX_FFT)
+   add_subdirectory(FFT)
+endif ()
+
 #
 # Optional external components
 #
diff --git a/Src/FFT/AMReX_FFT.H b/Src/FFT/AMReX_FFT.H
new file mode 100644
index 00000000000..f8050fff933
--- /dev/null
+++ b/Src/FFT/AMReX_FFT.H
@@ -0,0 +1,969 @@
+#ifndef AMREX_FFT_H_
+#define AMREX_FFT_H_
+#include <AMReX_Config.H>
+
+#include <AMReX_MultiFab.H>
+#include <AMReX_FFT_Helper.H>
+#include <numeric>
+#include <tuple>
+#include <utility>
+
+#if defined(AMREX_USE_CUDA)
+#  include <cufft.h>
+#  include <cuComplex.h>
+#elif defined(AMREX_USE_HIP)
+#  if __has_include(<rocfft/rocfft.h>)  // ROCm 5.3+
+#    include <rocfft/rocfft.h>
+#  else
+#    include <rocfft.h>
+#  endif
+#  include <hip/hip_complex.h>
+#elif defined(AMREX_USE_SYCL)
+#  include <oneapi/mkl/dfti.hpp>
+#else
+#  include <fftw3.h>
+#endif
+
+namespace amrex::FFT
+{
+
+/**
+ * \brief Discrete Fourier Transform
+ *
+ * This class supports Fourier transforms between real and complex data. The
+ * name R2C indicates that the forward transform converts real data to
+ * complex data, while the backward transform converts complex data to real
+ * data. It should be noted that both directions of transformation are
+ * supported, not just from real to complex. The scaling follows the FFTW
+ * convention, where applying the forward transform followed by the backward
+ * transform scales the original data by the size of the input array.
+ *
+ * For more details, we refer the users to
+ * https://amrex-codes.github.io/amrex/docs_html/FFT_Chapter.html.
+ */
+template <typename T = Real, FFT::Direction D = FFT::Direction::both>
+class R2C
+{
+public:
+    using MF = std::conditional_t<std::is_same_v<T,Real>,
+                                  MultiFab, FabArray<BaseFab<T> > >;
+    using cMF = FabArray<BaseFab<GpuComplex<T> > >;
+
+    /**
+     * \brief Constructor
+     *
+     * \param domain the forward domain (i.e., the domain of the real data)
+     * \param info optional information
+     */
+    explicit R2C (Box const& domain, Info const& info = Info{});
+
+    ~R2C ();
+
+    R2C (R2C const&) = delete;
+    R2C (R2C &&) = delete;
+    R2C& operator= (R2C const&) = delete;
+    R2C& operator= (R2C &&) = delete;
+
+    /**
+     * \brief Forward and then backward transform
+     *
+     * This function is available only when this class template is
+     * instantiated for transforms in both directions. It's more efficient
+     * than calling the forward function that stores the spectral data in a
+     * caller provided container followed by the backward function, because
+     * this can avoid parallel communication between the internal data and
+     * the caller's data container.
+     *
+     * \param inmf         input data in MultiFab or FabArray<BaseFab<float>>
+     * \param outmf        output data in MultiFab or FabArray<BaseFab<float>>
+     * \param post_forward a callable object for processing the post-forward
+     *                     data before the backward transform. Its interface
+     *                     is `(int,int,int,GpuComplex<T>&)`, where the integers
+     *                     are indices in the spectral space, and the reference
+     *                     to the complex number allows for the modification of
+     *                     the spectral data at that location.
+     */
+    template <typename F, Direction DIR=D,
+              std::enable_if_t<DIR == Direction::both, int> = 0>
+    void forwardThenBackward (MF const& inmf, MF& outmf, F const& post_forward)
+    {
+        this->forward(inmf);
+        this->post_forward_doit(post_forward);
+        this->backward(outmf);
+    }
+
+    /**
+     * \brief Forward transform
+     *
+     * The output is stored in this object's internal data. This function is
+     * not available when this class template is instantiated for
+     * backward-only transform.
+     *
+     * \param inmf input data in MultiFab or FabArray<BaseFab<float>>
+     */
+    template <Direction DIR=D, std::enable_if_t<DIR == Direction::forward ||
+                                                DIR == Direction::both, int> = 0>
+    void forward (MF const& inmf);
+
+    /**
+     * \brief Forward transform
+     *
+     * This function is not available when this class template is
+     * instantiated for backward-only transform.
+     *
+     * \param inmf input data in MultiFab or FabArray<BaseFab<float>>
+     * \param outmf output data in FabArray<BaseFab<GpuComplex<T>>>
+     */
+    template <Direction DIR=D, std::enable_if_t<DIR == Direction::forward ||
+                                                DIR == Direction::both, int> = 0>
+    void forward (MF const& inmf, cMF& outmf);
+
+    /**
+     * \brief Backward transform
+     *
+     * This function is available only when this class template is
+     * instantiated for transforms in both directions.
+     *
+     * \param outmf output data in MultiFab or FabArray<BaseFab<float>>
+     */
+    template <Direction DIR=D, std::enable_if_t<DIR == Direction::both, int> = 0>
+    void backward (MF& outmf);
+
+    /**
+     * \brief Backward transform
+     *
+     * This function is not available when this class template is
+     * instantiated for forward-only transform.
+     *
+     * \param inmf input data in FabArray<BaseFab<GpuComplex<T>>>
+     * \param outmf output data in MultiFab or FabArray<BaseFab<float>>
+     */
+    template <Direction DIR=D, std::enable_if_t<DIR == Direction::backward ||
+                                                DIR == Direction::both, int> = 0>
+    void backward (cMF const& inmf, MF& outmf);
+
+    /**
+     * \brief Get the internal spectral data
+     *
+     * This function is not available when this class template is
+     * instantiated for backward-only transform. For performance reasons,
+     * the returned data array does not have the usual ordering of
+     * `(x,y,z)`. The order is specified in the second part of the return
+     * value.
+     */
+    template <Direction DIR=D, std::enable_if_t<DIR == Direction::forward ||
+                                                DIR == Direction::both, int> = 0>
+    std::pair<cMF*,IntVect> getSpectralData ();
+
+    struct Swap01
+    {
+        [[nodiscard]] AMREX_GPU_HOST_DEVICE Dim3 operator() (Dim3 i) const noexcept
+        {
+            return {i.y, i.x, i.z};
+        }
+
+        [[nodiscard]] AMREX_GPU_HOST_DEVICE Dim3 Inverse (Dim3 i) const noexcept
+        {
+            return {i.y, i.x, i.z};
+        }
+
+        [[nodiscard]] IndexType operator() (IndexType it) const noexcept
+        {
+            return it;
+        }
+
+        [[nodiscard]] IndexType Inverse (IndexType it) const noexcept
+        {
+            return it;
+        }
+    };
+
+    struct Swap02
+    {
+        [[nodiscard]] AMREX_GPU_HOST_DEVICE Dim3 operator() (Dim3 i) const noexcept
+        {
+            return {i.z, i.y, i.x};
+        }
+
+        [[nodiscard]] AMREX_GPU_HOST_DEVICE Dim3 Inverse (Dim3 i) const noexcept
+        {
+            return {i.z, i.y, i.x};
+        }
+
+        [[nodiscard]] IndexType operator() (IndexType it) const noexcept
+        {
+            return it;
+        }
+
+        [[nodiscard]] IndexType Inverse (IndexType it) const noexcept
+        {
+            return it;
+        }
+    };
+
+    struct RotateFwd
+    {
+        // dest -> src: (x,y,z) -> (y,z,x)
+        [[nodiscard]] AMREX_GPU_HOST_DEVICE Dim3 operator() (Dim3 i) const noexcept
+        {
+            return {i.y, i.z, i.x};
+        }
+
+        // src -> dest: (x,y,z) -> (z,x,y)
+        [[nodiscard]] AMREX_GPU_HOST_DEVICE Dim3 Inverse (Dim3 i) const noexcept
+        {
+            return {i.z, i.x, i.y};
+        }
+
+        [[nodiscard]] IndexType operator() (IndexType it) const noexcept
+        {
+            return it;
+        }
+
+        [[nodiscard]] IndexType Inverse (IndexType it) const noexcept
+        {
+            return it;
+        }
+    };
+
+    struct RotateBwd
+    {
+        // dest -> src: (x,y,z) -> (z,x,y)
+        [[nodiscard]] AMREX_GPU_HOST_DEVICE Dim3 operator() (Dim3 i) const noexcept
+        {
+            return {i.z, i.x, i.y};
+        }
+
+        // src -> dest: (x,y,z) -> (y,z,x)
+        [[nodiscard]] AMREX_GPU_HOST_DEVICE Dim3 Inverse (Dim3 i) const noexcept
+        {
+            return {i.y, i.z, i.x};
+        }
+
+        [[nodiscard]] IndexType operator() (IndexType it) const noexcept
+        {
+            return it;
+        }
+
+        [[nodiscard]] IndexType Inverse (IndexType it) const noexcept
+        {
+            return it;
+        }
+    };
+
+    // public for cuda
+    template <typename F>
+    void post_forward_doit (F const& post_forward);
+
+private:
+
+#if defined(AMREX_USE_CUDA)
+    using VendorPlan = cufftHandle;
+    using VendorPlan2 = VendorPlan;
+    using FFTComplex = std::conditional_t<std::is_same_v<float,T>,
+                                          cuComplex, cuDoubleComplex>;
+#elif defined(AMREX_USE_HIP)
+    using VendorPlan = rocfft_plan;
+    using VendorPlan2 = VendorPlan;
+    using FFTComplex = std::conditional_t<std::is_same_v<float,T>,
+                                          float2, double2>;
+#elif defined(AMREX_USE_SYCL)
+    using VendorPlan = oneapi::mkl::dft::descriptor<
+        std::is_same_v<float,T> ? oneapi::mkl::dft::precision::SINGLE
+                                : oneapi::mkl::dft::precision::DOUBLE,
+        oneapi::mkl::dft::domain::REAL> *;
+    using VendorPlan2 = oneapi::mkl::dft::descriptor<
+        std::is_same_v<float,T> ? oneapi::mkl::dft::precision::SINGLE
+                                : oneapi::mkl::dft::precision::DOUBLE,
+        oneapi::mkl::dft::domain::COMPLEX> *;
+    using FFTComplex = GpuComplex<T>;
+#else
+    using VendorPlan = std::conditional_t<std::is_same_v<float,T>,
+                                          fftwf_plan, fftw_plan>;
+    using VendorPlan2 = VendorPlan;
+    using FFTComplex = std::conditional_t<std::is_same_v<float,T>,
+                                          fftwf_complex, fftw_complex>;
+#endif
+
+    struct Plan {
+        bool defined = false;
+        VendorPlan plan = 0; // NOLINT
+    };
+
+    struct Plan2 {
+        bool defined = false;
+        VendorPlan2 plan = 0; // NOLINT
+    };
+
+    template <typename FA>
+    static typename FA::FABType::value_type *
+    get_fab (FA& fa) {
+        auto myproc = ParallelContext::MyProcSub();
+        if (myproc < fa.size()) {
+            return fa.fabPtr(myproc);
+        } else {
+            return nullptr;
+        }
+    }
+
+    static void exec_r2c (Plan plan, MF& in, cMF& out);
+    static void exec_c2r (Plan plan, cMF& in, MF& out);
+    template <Direction direction>
+    static void exec_c2c (Plan2 plan, cMF& inout);
+
+    template <typename P>
+    static void destroy_plan (P plan);
+    static std::pair<Plan2,Plan2> make_c2c_plans (cMF& inout);
+
+    void backward_doit (MF& outmf);
+
+    Plan m_fft_fwd_x{};
+    Plan m_fft_bwd_x{};
+    Plan2 m_fft_fwd_y{};
+    Plan2 m_fft_bwd_y{};
+    Plan2 m_fft_fwd_z{};
+    Plan2 m_fft_bwd_z{};
+
+    // Comm meta-data. In the forward phase, we start with (x,y,z),
+    // transpose to (y,x,z) and then (z,x,y). In the backward phase, we
+    // perform inverse transpose.
+    std::unique_ptr<MultiBlockCommMetaData> m_cmd_x2y; // (x,y,z) -> (y,x,z)
+    std::unique_ptr<MultiBlockCommMetaData> m_cmd_y2x; // (y,x,z) -> (x,y,z)
+    std::unique_ptr<MultiBlockCommMetaData> m_cmd_y2z; // (y,x,z) -> (z,x,y)
+    std::unique_ptr<MultiBlockCommMetaData> m_cmd_z2y; // (z,x,y) -> (y,x,z)
+    Swap01 m_dtos_x2y{};
+    Swap01 m_dtos_y2x{};
+    Swap02 m_dtos_y2z{};
+    Swap02 m_dtos_z2y{};
+
+    MF  m_rx;
+    cMF m_cx;
+    cMF m_cy;
+    cMF m_cz;
+
+    Box m_real_domain;
+    Box m_spectral_domain_x;
+    Box m_spectral_domain_y;
+    Box m_spectral_domain_z;
+
+    Info m_info;
+};
+
+template <typename T, Direction D>
+R2C<T,D>::R2C (Box const& domain, Info const& info)
+    : m_real_domain(domain),
+      m_spectral_domain_x(IntVect(0), IntVect(AMREX_D_DECL(domain.length(0)/2,
+                                                           domain.bigEnd(1),
+                                                           domain.bigEnd(2)))),
+#if (AMREX_SPACEDIM >= 2)
+      m_spectral_domain_y(IntVect(0), IntVect(AMREX_D_DECL(domain.bigEnd(1),
+                                                           domain.length(0)/2,
+                                                           domain.bigEnd(2)))),
+#if (AMREX_SPACEDIM == 3)
+      m_spectral_domain_z(IntVect(0), IntVect(AMREX_D_DECL(domain.bigEnd(2),
+                                                           domain.length(0)/2,
+                                                           domain.bigEnd(1)))),
+#endif
+#endif
+      m_info(info)
+{
+    static_assert(std::is_same_v<float,T> || std::is_same_v<double,T>);
+    AMREX_ALWAYS_ASSERT(m_real_domain.smallEnd() == 0 &&
+                        m_real_domain.length(0) > 1 &&
+                        m_real_domain.cellCentered());
+#if (AMREX_SPACEDIM == 3)
+    AMREX_ALWAYS_ASSERT(m_real_domain.length(2) > 1 || ! m_info.batch_mode);
+    AMREX_ALWAYS_ASSERT(m_real_domain.length(1) > 1 || m_real_domain.length(2) == 1);
+#else
+    AMREX_ALWAYS_ASSERT(! m_info.batch_mode);
+#endif
+
+    int myproc = ParallelContext::MyProcSub();
+    int nprocs = ParallelContext::NProcsSub();
+
+    auto bax = amrex::decompose(m_real_domain, nprocs, {AMREX_D_DECL(false,true,true)});
+    DistributionMapping dmx = detail::make_iota_distromap(bax.size());
+    m_rx.define(bax, dmx, 1, 0);
+
+    {
+        BoxList bl = bax.boxList();
+        for (auto & b : bl) {
+            b.setBig(0, m_spectral_domain_x.bigEnd(0));
+        }
+        BoxArray cbax(std::move(bl));
+        m_cx.define(cbax, dmx, 1, 0);
+    }
+
+    // plans for x-direction
+    if (myproc < m_rx.size())
+    {
+        Box const local_box = m_rx.boxArray()[myproc];
+        int n = local_box.length(0);
+        int howmany = AMREX_D_TERM(1, *local_box.length(1), *local_box.length(2));
+
+#if defined(AMREX_USE_CUDA)
+        if constexpr (D == Direction::both || D == Direction::forward) {
+            cufftType fwd_type = std::is_same_v<float,T> ? CUFFT_R2C : CUFFT_D2Z;
+            AMREX_CUFFT_SAFE_CALL
+                (cufftPlanMany(&m_fft_fwd_x.plan, 1, &n,
+                               nullptr, 1, m_real_domain.length(0),
+                               nullptr, 1, m_spectral_domain_x.length(0),
+                               fwd_type, howmany));
+            AMREX_CUFFT_SAFE_CALL(cufftSetStream(m_fft_fwd_x.plan, Gpu::gpuStream()));
+        }
+        if constexpr (D == Direction::both || D == Direction::backward) {
+            cufftType bwd_type = std::is_same_v<float,T> ? CUFFT_C2R : CUFFT_Z2D;
+            AMREX_CUFFT_SAFE_CALL
+                (cufftPlanMany(&m_fft_bwd_x.plan, 1, &n,
+                               nullptr, 1, m_spectral_domain_x.length(0),
+                               nullptr, 1, m_real_domain.length(0),
+                               bwd_type, howmany));
+            AMREX_CUFFT_SAFE_CALL(cufftSetStream(m_fft_bwd_x.plan, Gpu::gpuStream()));
+        }
+#elif defined(AMREX_USE_HIP)
+
+        auto prec = std::is_same_v<float,T> ? rocfft_precision_single : rocfft_precision_double;
+        const std::size_t length = n;
+        if constexpr (D == Direction::both || D == Direction::forward) {
+            AMREX_ROCFFT_SAFE_CALL
+                (rocfft_plan_create(&m_fft_fwd_x.plan, rocfft_placement_notinplace,
+                                    rocfft_transform_type_real_forward, prec, 1,
+                                    &length, howmany, nullptr));
+        }
+        if constexpr (D == Direction::both || D == Direction::backward) {
+            AMREX_ROCFFT_SAFE_CALL
+                (rocfft_plan_create(&m_fft_bwd_x.plan, rocfft_placement_notinplace,
+                                    rocfft_transform_type_real_inverse, prec, 1,
+                                    &length, howmany, nullptr));
+        }
+
+#elif defined(AMREX_USE_SYCL)
+
+        m_fft_fwd_x.plan = new std::remove_pointer_t<VendorPlan>(n);
+        m_fft_fwd_x.plan->set_value(oneapi::mkl::dft::config_param::PLACEMENT,
+                                    DFTI_NOT_INPLACE);
+        m_fft_fwd_x.plan->set_value(oneapi::mkl::dft::config_param::NUMBER_OF_TRANSFORMS,
+                                    howmany);
+        m_fft_fwd_x.plan->set_value(oneapi::mkl::dft::config_param::FWD_DISTANCE,
+                                    m_real_domain.length(0));
+        m_fft_fwd_x.plan->set_value(oneapi::mkl::dft::config_param::BWD_DISTANCE,
+                                    m_spectral_domain_x.length(0));
+        std::array<std::int64_t,2> strides{0,1};
+        m_fft_fwd_x.plan->set_value(oneapi::mkl::dft::config_param::FWD_STRIDES,
+                                    strides.data());
+        m_fft_fwd_x.plan->set_value(oneapi::mkl::dft::config_param::BWD_STRIDES,
+                                    strides.data());
+        m_fft_fwd_x.plan->set_value(oneapi::mkl::dft::config_param::WORKSPACE,
+                                    oneapi::mkl::dft::config_value::WORKSPACE_EXTERNAL);
+        m_fft_fwd_x.plan->commit(amrex::Gpu::Device::streamQueue());
+
+        m_fft_bwd_x.plan = m_fft_fwd_x.plan;
+
+#else /* FFTW */
+
+        auto* in = m_rx[myproc].dataPtr();
+        auto* out = (FFTComplex*)(m_cx[myproc].dataPtr());
+
+        if constexpr (std::is_same_v<float,T>) {
+            if constexpr (D == Direction::both || D == Direction::forward) {
+                m_fft_fwd_x.plan = fftwf_plan_many_dft_r2c
+                    (1, &n, howmany, in, nullptr, 1, m_real_domain.length(0),
+                     out, nullptr, 1, m_spectral_domain_x.length(0),
+                     FFTW_ESTIMATE | FFTW_DESTROY_INPUT);
+            }
+            if constexpr (D == Direction::both || D == Direction::backward) {
+                m_fft_bwd_x.plan = fftwf_plan_many_dft_c2r
+                    (1, &n, howmany, out, nullptr, 1, m_spectral_domain_x.length(0),
+                     in, nullptr, 1, m_real_domain.length(0),
+                     FFTW_ESTIMATE | FFTW_DESTROY_INPUT);
+            }
+        } else {
+            if constexpr (D == Direction::both || D == Direction::forward) {
+                m_fft_fwd_x.plan = fftw_plan_many_dft_r2c
+                    (1, &n, howmany, in, nullptr, 1, m_real_domain.length(0),
+                     out, nullptr, 1, m_spectral_domain_x.length(0),
+                     FFTW_ESTIMATE | FFTW_DESTROY_INPUT);
+            }
+            if constexpr (D == Direction::both || D == Direction::backward) {
+                m_fft_bwd_x.plan = fftw_plan_many_dft_c2r
+                    (1, &n, howmany, out, nullptr, 1, m_spectral_domain_x.length(0),
+                     in, nullptr, 1, m_real_domain.length(0),
+                     FFTW_ESTIMATE | FFTW_DESTROY_INPUT);
+            }
+        }
+#endif
+        if constexpr (D == Direction::both || D == Direction::forward) {
+            m_fft_fwd_x.defined = true;
+        }
+        if constexpr (D == Direction::both || D == Direction::backward) {
+            m_fft_bwd_x.defined = true;
+        }
+    }
+
+#if (AMREX_SPACEDIM >= 2)
+    DistributionMapping cdmy;
+    if (m_real_domain.length(1) > 1) {
+        auto cbay = amrex::decompose(m_spectral_domain_y, nprocs, {AMREX_D_DECL(false,true,true)});
+        if (cbay.size() == dmx.size()) {
+            cdmy = dmx;
+        } else {
+            cdmy = detail::make_iota_distromap(cbay.size());
+        }
+        m_cy.define(cbay, cdmy, 1, 0);
+
+        std::tie(m_fft_fwd_y, m_fft_bwd_y) = make_c2c_plans(m_cy);
+
+        // comm meta-data between x and y phases
+        m_cmd_x2y = std::make_unique<MultiBlockCommMetaData>
+            (m_cy, m_spectral_domain_y, m_cx, IntVect(0), m_dtos_x2y);
+        m_cmd_y2x = std::make_unique<MultiBlockCommMetaData>
+            (m_cx, m_spectral_domain_x, m_cy, IntVect(0), m_dtos_y2x);
+    }
+
+#if (AMREX_SPACEDIM == 3)
+    if (m_real_domain.length(1) > 1 &&
+        (! m_info.batch_mode && m_real_domain.length(2) > 1))
+    {
+        auto cbaz = amrex::decompose(m_spectral_domain_z, nprocs, {false,true,true});
+        DistributionMapping cdmz;
+        if (cbaz.size() == dmx.size()) {
+            cdmz = dmx;
+        } else if (cbaz.size() == cdmy.size()) {
+            cdmz = cdmy;
+        } else {
+            cdmz = detail::make_iota_distromap(cbaz.size());
+        }
+         m_cz.define(cbaz, cdmz, 1, 0);
+
+        std::tie(m_fft_fwd_z, m_fft_bwd_z) = make_c2c_plans(m_cz);
+
+        // comm meta-data between y and z phases
+        m_cmd_y2z = std::make_unique<MultiBlockCommMetaData>
+            (m_cz, m_spectral_domain_z, m_cy, IntVect(0), m_dtos_y2z);
+        m_cmd_z2y = std::make_unique<MultiBlockCommMetaData>
+            (m_cy, m_spectral_domain_y, m_cz, IntVect(0), m_dtos_z2y);
+    }
+#endif
+#endif
+}
+
+template <typename T, Direction D>
+template <typename P>
+void R2C<T,D>::destroy_plan (P plan)
+{
+    if (! plan.defined) { return; }
+
+#if defined(AMREX_USE_CUDA)
+    AMREX_CUFFT_SAFE_CALL(cufftDestroy(plan.plan));
+#elif defined(AMREX_USE_HIP)
+    AMREX_ROCFFT_SAFE_CALL(rocfft_plan_destroy(plan.plan));
+#elif defined(AMREX_USE_SYCL)
+    delete plan.plan;
+#else
+    if constexpr (std::is_same_v<float,T>) {
+        fftwf_destroy_plan(plan.plan);
+    } else {
+        fftw_destroy_plan(plan.plan);
+    }
+#endif
+
+    plan.defined = false;
+}
+
+template <typename T, Direction D>
+R2C<T,D>::~R2C<T,D> ()
+{
+#if defined(AMREX_USE_SYCL)
+    if constexpr (D == Direction::both || D == Direction::forward) {
+        destroy_plan(m_fft_fwd_x);
+        destroy_plan(m_fft_fwd_y);
+        destroy_plan(m_fft_fwd_z);
+    } else {
+        destroy_plan(m_fft_bwd_x);
+        destroy_plan(m_fft_bwd_y);
+        destroy_plan(m_fft_bwd_z);
+    }
+#else
+    destroy_plan(m_fft_fwd_x);
+    destroy_plan(m_fft_fwd_y);
+    destroy_plan(m_fft_fwd_z);
+    destroy_plan(m_fft_bwd_x);
+    destroy_plan(m_fft_bwd_y);
+    destroy_plan(m_fft_bwd_z);
+#endif
+}
+
+#ifdef AMREX_USE_HIP
+namespace detail { void hip_execute (rocfft_plan plan, void **in, void **out); }
+#endif
+
+#ifdef AMREX_USE_SYCL
+namespace detail
+{
+template <typename T, Direction direction, typename P, typename TI, typename TO>
+void sycl_execute (P plan, TI* in, TO* out)
+{
+    std::size_t workspaceSize = 0;
+    plan->get_value(oneapi::mkl::dft::config_param::WORKSPACE_BYTES,
+                    &workspaceSize);
+    auto* buffer = (T*)amrex::The_Arena()->alloc(workspaceSize);
+    plan->set_workspace(buffer);
+    sycl::event r;
+    if (std::is_same_v<TI,TO>) {
+        amrex::ignore_unused(in);
+        if constexpr (direction == Direction::forward) {
+            r = oneapi::mkl::dft::compute_forward(*plan, out);
+        } else {
+            r = oneapi::mkl::dft::compute_backward(*plan, out);
+        }
+    } else {
+        if constexpr (direction == Direction::forward) {
+            r = oneapi::mkl::dft::compute_forward(*plan, in, out);
+        } else {
+            r = oneapi::mkl::dft::compute_backward(*plan, in, out);
+        }
+    }
+    r.wait();
+    amrex::The_Arena()->free(buffer);
+}
+}
+#endif
+
+template <typename T, Direction D>
+void R2C<T,D>::exec_r2c (Plan plan, MF& in, cMF& out)
+{
+    if (! plan.defined) { return; }
+
+#if defined(AMREX_USE_GPU)
+    auto* pin = in[ParallelContext::MyProcSub()].dataPtr();
+    auto* pout = out[ParallelContext::MyProcSub()].dataPtr();
+#else
+    amrex::ignore_unused(in,out);
+#endif
+
+#if defined(AMREX_USE_CUDA)
+    if constexpr (std::is_same_v<float,T>) {
+        AMREX_CUFFT_SAFE_CALL(cufftExecR2C(plan.plan, pin, (FFTComplex*)pout));
+    } else {
+        AMREX_CUFFT_SAFE_CALL(cufftExecD2Z(plan.plan, pin, (FFTComplex*)pout));
+    }
+#elif defined(AMREX_USE_HIP)
+    detail::hip_execute(plan.plan, (void**)&pin, (void**)&pout);
+#elif defined(AMREX_USE_SYCL)
+    detail::sycl_execute<T,Direction::forward>(plan.plan, pin, (std::complex<T>*)pout);
+#else
+    if constexpr (std::is_same_v<float,T>) {
+        fftwf_execute(plan.plan);
+    } else {
+        fftw_execute(plan.plan);
+    }
+#endif
+}
+
+template <typename T, Direction D>
+void R2C<T,D>::exec_c2r (Plan plan, cMF& in, MF& out)
+{
+    if (! plan.defined) { return; }
+
+#if defined(AMREX_USE_GPU)
+    auto* pin = in[ParallelContext::MyProcSub()].dataPtr();
+    auto* pout = out[ParallelContext::MyProcSub()].dataPtr();
+#else
+    amrex::ignore_unused(in,out);
+#endif
+
+#if defined(AMREX_USE_CUDA)
+    if constexpr (std::is_same_v<float,T>) {
+        AMREX_CUFFT_SAFE_CALL(cufftExecC2R(plan.plan, (FFTComplex*)pin, pout));
+    } else {
+        AMREX_CUFFT_SAFE_CALL(cufftExecZ2D(plan.plan, (FFTComplex*)pin, pout));
+    }
+#elif defined(AMREX_USE_HIP)
+    detail::hip_execute(plan.plan, (void**)&pin, (void**)&pout);
+#elif defined(AMREX_USE_SYCL)
+    detail::sycl_execute<T,Direction::backward>(plan.plan, (std::complex<T>*)pin, pout);
+#else
+    if constexpr (std::is_same_v<float,T>) {
+        fftwf_execute(plan.plan);
+    } else {
+        fftw_execute(plan.plan);
+    }
+#endif
+}
+
+template <typename T, Direction D>
+template <Direction direction>
+void R2C<T,D>::exec_c2c (Plan2 plan, cMF& inout)
+{
+    if (! plan.defined) { return; }
+
+    amrex::ignore_unused(inout);
+#if defined(AMREX_USE_GPU)
+    auto* p = inout[ParallelContext::MyProcSub()].dataPtr();
+#endif
+
+#if defined(AMREX_USE_CUDA)
+    auto cufft_direction = (direction == Direction::forward) ? CUFFT_FORWARD : CUFFT_INVERSE;
+    if constexpr (std::is_same_v<float,T>) {
+        AMREX_CUFFT_SAFE_CALL(cufftExecC2C(plan.plan, (FFTComplex*)p, (FFTComplex*)p,
+                                           cufft_direction));
+    } else {
+        AMREX_CUFFT_SAFE_CALL(cufftExecZ2Z(plan.plan, (FFTComplex*)p, (FFTComplex*)p,
+                                           cufft_direction));
+    }
+#elif defined(AMREX_USE_HIP)
+    detail::hip_execute(plan.plan, (void**)&p, (void**)&p);
+#elif defined(AMREX_USE_SYCL)
+    detail::sycl_execute<T,direction>(plan.plan, (std::complex<T>*)p, (std::complex<T>*)p);
+#else
+    if constexpr (std::is_same_v<float,T>) {
+        fftwf_execute(plan.plan);
+    } else {
+        fftw_execute(plan.plan);
+    }
+#endif
+}
+
+template <typename T, Direction D>
+template <Direction DIR, std::enable_if_t<DIR == Direction::forward ||
+                                          DIR == Direction::both, int> >
+void R2C<T,D>::forward (MF const& inmf)
+{
+    m_rx.ParallelCopy(inmf, 0, 0, 1);
+    exec_r2c(m_fft_fwd_x, m_rx, m_cx);
+
+    if (                          m_cmd_x2y) {
+        ParallelCopy(m_cy, m_cx, *m_cmd_x2y, 0, 0, 1, m_dtos_x2y);
+    }
+    exec_c2c<Direction::forward>(m_fft_fwd_y, m_cy);
+
+    if (                          m_cmd_y2z) {
+        ParallelCopy(m_cz, m_cy, *m_cmd_y2z, 0, 0, 1, m_dtos_y2z);
+    }
+    exec_c2c<Direction::forward>(m_fft_fwd_z, m_cz);
+}
+
+template <typename T, Direction D>
+template <Direction DIR, std::enable_if_t<DIR == Direction::both, int> >
+void R2C<T,D>::backward (MF& outmf)
+{
+    backward_doit(outmf);
+}
+
+template <typename T, Direction D>
+void R2C<T,D>::backward_doit (MF& outmf)
+{
+    exec_c2c<Direction::backward>(m_fft_bwd_z, m_cz);
+    if (                          m_cmd_z2y) {
+        ParallelCopy(m_cy, m_cz, *m_cmd_z2y, 0, 0, 1, m_dtos_z2y);
+    }
+
+    exec_c2c<Direction::backward>(m_fft_bwd_y, m_cy);
+    if (                          m_cmd_y2x) {
+        ParallelCopy(m_cx, m_cy, *m_cmd_y2x, 0, 0, 1, m_dtos_y2x);
+    }
+
+    exec_c2r(m_fft_bwd_x, m_cx, m_rx);
+    outmf.ParallelCopy(m_rx, 0, 0, 1);
+}
+
+template <typename T, Direction D>
+std::pair<typename R2C<T,D>::Plan2, typename R2C<T,D>::Plan2>
+R2C<T,D>::make_c2c_plans (cMF& inout)
+{
+    Plan2 fwd;
+    Plan2 bwd;
+
+    auto* fab = get_fab(inout);
+    if (!fab) { return {fwd, bwd};}
+
+    Box const& local_box = fab->box();
+
+    int n = local_box.length(0);
+    int howmany = AMREX_D_TERM(1, *local_box.length(1), *local_box.length(2));
+
+#if defined(AMREX_USE_CUDA)
+
+    if constexpr (D == Direction::both || D == Direction::forward) {
+        cufftType fwd_type = std::is_same_v<float,T> ? CUFFT_C2C : CUFFT_Z2Z;
+        AMREX_CUFFT_SAFE_CALL
+            (cufftPlanMany(&fwd.plan, 1, &n, nullptr, 1, n, nullptr, 1, n,
+                           fwd_type, howmany));
+        AMREX_CUFFT_SAFE_CALL(cufftSetStream(fwd.plan, Gpu::gpuStream()));
+    }
+    if constexpr (D == Direction::both || D == Direction::backward) {
+        cufftType bwd_type = std::is_same_v<float,T> ? CUFFT_C2C : CUFFT_Z2Z;
+        AMREX_CUFFT_SAFE_CALL
+            (cufftPlanMany(&bwd.plan, 1, &n, nullptr, 1, n, nullptr, 1, n,
+                           bwd_type, howmany));
+        AMREX_CUFFT_SAFE_CALL(cufftSetStream(bwd.plan, Gpu::gpuStream()));
+    }
+
+#elif defined(AMREX_USE_HIP)
+
+    auto prec = std::is_same_v<float,T> ? rocfft_precision_single : rocfft_precision_double;
+    const std::size_t length = n;
+    if constexpr (D == Direction::both || D == Direction::forward) {
+        AMREX_ROCFFT_SAFE_CALL
+            (rocfft_plan_create(&fwd.plan, rocfft_placement_inplace,
+                                rocfft_transform_type_complex_forward, prec, 1,
+                                &length, howmany, nullptr));
+    }
+    if constexpr (D == Direction::both || D == Direction::backward) {
+        AMREX_ROCFFT_SAFE_CALL
+            (rocfft_plan_create(&bwd.plan, rocfft_placement_inplace,
+                                rocfft_transform_type_complex_inverse, prec, 1,
+                                &length, howmany, nullptr));
+    }
+
+#elif defined(AMREX_USE_SYCL)
+
+    fwd.plan = new std::remove_pointer_t<VendorPlan2>(n);
+    fwd.plan->set_value(oneapi::mkl::dft::config_param::PLACEMENT,
+                        DFTI_INPLACE);
+    fwd.plan->set_value(oneapi::mkl::dft::config_param::NUMBER_OF_TRANSFORMS,
+                        howmany);
+    fwd.plan->set_value(oneapi::mkl::dft::config_param::FWD_DISTANCE, n);
+    fwd.plan->set_value(oneapi::mkl::dft::config_param::BWD_DISTANCE, n);
+    std::array<std::int64_t,2> strides{0,1};
+    fwd.plan->set_value(oneapi::mkl::dft::config_param::FWD_STRIDES, strides.data());
+    fwd.plan->set_value(oneapi::mkl::dft::config_param::BWD_STRIDES, strides.data());
+    fwd.plan->set_value(oneapi::mkl::dft::config_param::WORKSPACE,
+                        oneapi::mkl::dft::config_value::WORKSPACE_EXTERNAL);
+    fwd.plan->commit(amrex::Gpu::Device::streamQueue());
+
+    bwd.plan = fwd.plan;
+
+#else
+    auto* pinout = (FFTComplex*)fab->dataPtr();
+
+    if constexpr (std::is_same_v<float,T>) {
+        if constexpr (D == Direction::both || D == Direction::forward) {
+            fwd.plan = fftwf_plan_many_dft(1, &n, howmany, pinout, nullptr, 1, n,
+                                           pinout, nullptr, 1, n, -1, FFTW_ESTIMATE);
+        }
+        if constexpr (D == Direction::both || D == Direction::backward) {
+            bwd.plan = fftwf_plan_many_dft(1, &n, howmany, pinout, nullptr, 1, n,
+                                           pinout, nullptr, 1, n, +1, FFTW_ESTIMATE);
+        }
+    } else {
+        if constexpr (D == Direction::both || D == Direction::forward) {
+            fwd.plan = fftw_plan_many_dft(1, &n, howmany, pinout, nullptr, 1, n,
+                                          pinout, nullptr, 1, n, -1, FFTW_ESTIMATE);
+        }
+        if constexpr (D == Direction::both || D == Direction::backward) {
+            bwd.plan = fftw_plan_many_dft(1, &n, howmany, pinout, nullptr, 1, n,
+                                          pinout, nullptr, 1, n, +1, FFTW_ESTIMATE);
+        }
+    }
+#endif
+
+    if constexpr (D == Direction::both || D == Direction::forward) {
+        fwd.defined = true;
+    }
+    if constexpr (D == Direction::both || D == Direction::backward) {
+        bwd.defined = true;
+    }
+
+    return {fwd,bwd};
+}
+
+template <typename T, Direction D>
+template <typename F>
+void R2C<T,D>::post_forward_doit (F const& post_forward)
+{
+    if (m_info.batch_mode) {
+        amrex::Abort("xxxxx todo: post_forward");
+    } else {
+        if (                           ! m_cz.empty()) {
+            auto* spectral_fab = get_fab(m_cz);
+            if (spectral_fab) {
+                auto const& a = spectral_fab->array(); // m_cz's ordering is z,x,y
+                ParallelFor(spectral_fab->box(),
+                [=] AMREX_GPU_DEVICE (int iz, int jx, int ky)
+                {
+                    post_forward(jx,ky,iz,a(iz,jx,ky));
+                });
+            }
+        } else if (                    ! m_cy.empty()) {
+            auto* spectral_fab = get_fab(m_cy);
+            if (spectral_fab) {
+                auto const& a = spectral_fab->array(); // m_cy's ordering is y,x,z
+                ParallelFor(spectral_fab->box(),
+                [=] AMREX_GPU_DEVICE (int iy, int jx, int k)
+                {
+                    post_forward(jx,iy,k,a(iy,jx,k));
+                });
+            }
+        } else {
+            auto* spectral_fab = get_fab(m_cx);
+            if (spectral_fab) {
+                auto const& a = spectral_fab->array();
+                ParallelFor(spectral_fab->box(),
+                [=] AMREX_GPU_DEVICE (int i, int j, int k)
+                {
+                    post_forward(i,j,k,a(i,j,k));
+                });
+            }
+        }
+    }
+}
+
+template <typename T, Direction D>
+template <Direction DIR, std::enable_if_t<DIR == Direction::forward ||
+                                          DIR == Direction::both, int> >
+std::pair<typename R2C<T,D>::cMF *, IntVect>
+R2C<T,D>::getSpectralData ()
+{
+    if (!m_cz.empty()) {
+        return std::make_pair(&m_cz, IntVect{AMREX_D_DECL(2,0,1)});
+    } else if (!m_cy.empty()) {
+        return std::make_pair(&m_cy, IntVect{AMREX_D_DECL(1,0,2)});
+    } else {
+        return std::make_pair(&m_cx, IntVect{AMREX_D_DECL(0,1,2)});
+    }
+}
+
+template <typename T, Direction D>
+template <Direction DIR, std::enable_if_t<DIR == Direction::forward ||
+                                          DIR == Direction::both, int> >
+void R2C<T,D>::forward (MF const& inmf, cMF& outmf)
+{
+    forward(inmf);
+    if (!m_cz.empty()) { // m_cz's order (z,x,y) -> (x,y,z)
+        RotateBwd dtos{};
+        MultiBlockCommMetaData cmd
+            (outmf, m_spectral_domain_x, m_cz, IntVect(0), dtos);
+        ParallelCopy(outmf, m_cz, cmd, 0, 0, 1, dtos);
+    } else if (!m_cy.empty()) { // m_cy's order (y,x,z) -> (x,y,z)
+        MultiBlockCommMetaData cmd
+            (outmf, m_spectral_domain_x, m_cy, IntVect(0), m_dtos_y2x);
+        ParallelCopy(outmf, m_cy, cmd, 0, 0, 1, m_dtos_y2x);
+    } else {
+        outmf.ParallelCopy(m_cx, 0, 0, 1);
+    }
+}
+
+template <typename T, Direction D>
+template <Direction DIR, std::enable_if_t<DIR == Direction::backward ||
+                                          DIR == Direction::both, int> >
+void R2C<T,D>::backward (cMF const& inmf, MF& outmf)
+{
+    if (!m_cz.empty()) { // (x,y,z) -> m_cz's order (z,x,y)
+        RotateFwd dtos{};
+        MultiBlockCommMetaData cmd
+            (m_cz, m_spectral_domain_z, inmf, IntVect(0), dtos);
+        ParallelCopy(m_cz, inmf, cmd, 0, 0, 1, dtos);
+    } else if (!m_cy.empty()) { // (x,y,z) -> m_cy's ordering (y,x,z)
+        MultiBlockCommMetaData cmd
+            (m_cy, m_spectral_domain_y, inmf, IntVect(0), m_dtos_x2y);
+        ParallelCopy(m_cy, inmf, cmd, 0, 0, 1, m_dtos_x2y);
+    } else {
+        m_cx.ParallelCopy(inmf, 0, 0, 1);
+    }
+    backward_doit(outmf);
+}
+
+}
+
+#endif
diff --git a/Src/FFT/AMReX_FFT.cpp b/Src/FFT/AMReX_FFT.cpp
new file mode 100644
index 00000000000..68984a8f24a
--- /dev/null
+++ b/Src/FFT/AMReX_FFT.cpp
@@ -0,0 +1,40 @@
+#include <AMReX_FFT.H>
+#include <algorithm>
+
+namespace amrex::FFT::detail
+{
+
+DistributionMapping make_iota_distromap (Long n)
+{
+    AMREX_ASSERT(n <= ParallelContext::NProcsSub());
+    Vector<int> pm(n);
+    for (int i = 0; i < n; ++i) {
+        pm[i] = ParallelContext::local_to_global_rank(i);
+    }
+    return DistributionMapping(std::move(pm));
+}
+
+#ifdef AMREX_USE_HIP
+void hip_execute (rocfft_plan plan, void **in, void **out)
+{
+    rocfft_execution_info execinfo = nullptr;
+    AMREX_ROCFFT_SAFE_CALL(rocfft_execution_info_create(&execinfo));
+
+    std::size_t buffersize = 0;
+    AMREX_ROCFFT_SAFE_CALL(rocfft_plan_get_work_buffer_size(plan, &buffersize));
+
+    auto* buffer = (void*)amrex::The_Arena()->alloc(buffersize);
+    AMREX_ROCFFT_SAFE_CALL(rocfft_execution_info_set_work_buffer(execinfo, buffer, buffersize));
+
+    AMREX_ROCFFT_SAFE_CALL(rocfft_execution_info_set_stream(execinfo, amrex::Gpu::gpuStream()));
+
+    AMREX_ROCFFT_SAFE_CALL(rocfft_execute(plan, in, out, execinfo));
+
+    amrex::Gpu::streamSynchronize();
+    amrex::The_Arena()->free(buffer);
+
+    AMREX_ROCFFT_SAFE_CALL(rocfft_execution_info_destroy(execinfo));
+}
+#endif
+
+}
diff --git a/Src/FFT/AMReX_FFT_Helper.H b/Src/FFT/AMReX_FFT_Helper.H
new file mode 100644
index 00000000000..c8ae2b74ea2
--- /dev/null
+++ b/Src/FFT/AMReX_FFT_Helper.H
@@ -0,0 +1,29 @@
+#ifndef AMREX_FFT_HELPER_H_
+#define AMREX_FFT_HELPER_H_
+#include <AMReX_Config.H>
+
+#include <AMReX_DistributionMapping.H>
+
+namespace amrex::FFT
+{
+
+enum struct Direction { forward, backward, both };
+
+struct Info
+{
+    //! Supported only in 3D. When batch_mode is true, FFT is performed on
+    //! the first two dimensions only and the third dimension size is the
+    //! batch size.
+    bool batch_mode = false;
+
+    Info& setBatchMode (bool x) { batch_mode = x; return *this; }
+};
+
+namespace detail
+{
+    DistributionMapping make_iota_distromap (Long n);
+}
+
+}
+
+#endif
diff --git a/Src/FFT/AMReX_FFT_Poisson.H b/Src/FFT/AMReX_FFT_Poisson.H
new file mode 100644
index 00000000000..62062062108
--- /dev/null
+++ b/Src/FFT/AMReX_FFT_Poisson.H
@@ -0,0 +1,259 @@
+#ifndef AMREX_FFT_POISSON_H_
+#define AMREX_FFT_POISSON_H_
+
+#include <AMReX_FFT.H>
+#include <AMReX_Geometry.H>
+
+namespace amrex::FFT
+{
+
+/**
+ * \brief Poisson solver for all periodic boundaries using FFT
+ */
+template <typename MF>
+class Poisson
+{
+public:
+
+    template <typename FA=MF, std::enable_if_t<IsFabArray_v<FA>,int> = 0>
+    explicit Poisson (Geometry const& geom)
+        : m_geom(geom), m_r2c(geom.Domain())
+    {
+        AMREX_ALWAYS_ASSERT(geom.isAllPeriodic());
+    }
+
+    void solve (MF& soln, MF const& rhs);
+
+private:
+    Geometry m_geom;
+    R2C<typename MF::value_type, Direction::both> m_r2c;
+};
+
+/**
+ * \brief 3D Poisson solver for periodic boundaries in the first two
+ * dimensions and Neumann in the last dimension.
+ */
+template <typename MF>
+class PoissonHybrid
+{
+public:
+
+    template <typename FA=MF, std::enable_if_t<IsFabArray_v<FA>,int> = 0>
+    explicit PoissonHybrid (Geometry const& geom)
+        : m_geom(geom), m_r2c(geom.Domain(), Info().setBatchMode(true))
+    {
+#if (AMREX_SPACEDIM == 3)
+        AMREX_ALWAYS_ASSERT(geom.isPeriodic(0) && geom.isPeriodic(1));
+#else
+        amrex::Abort("FFT::PoissonHybrid: 1D & 2D todo");
+#endif
+    }
+
+    void solve (MF& soln, MF const& rhs);
+
+private:
+    Geometry m_geom;
+    R2C<typename MF::value_type, Direction::both> m_r2c;
+};
+
+template <typename MF>
+void Poisson<MF>::solve (MF& soln, MF const& rhs)
+{
+    using T = typename MF::value_type;
+
+    GpuArray<T,AMREX_SPACEDIM> fac
+        {AMREX_D_DECL(T(2)*Math::pi<T>()/T(m_geom.ProbLength(0)),
+                      T(2)*Math::pi<T>()/T(m_geom.ProbLength(1)),
+                      T(2)*Math::pi<T>()/T(m_geom.ProbLength(2)))};
+    GpuArray<T,AMREX_SPACEDIM> dx
+        {AMREX_D_DECL(T(m_geom.CellSize(0)),
+                      T(m_geom.CellSize(1)),
+                      T(m_geom.CellSize(2)))};
+    auto scale = T(1.0/m_geom.Domain().d_numPts());
+#if (AMREX_SPACEDIM > 1)
+    auto const& len = m_geom.Domain().length();
+#endif
+
+    m_r2c.forwardThenBackward(rhs, soln,
+                              [=] AMREX_GPU_DEVICE (int i, int j, int k,
+                                                    GpuComplex<T>& spectral_data)
+    {
+        amrex::ignore_unused(i,j,k);
+        // the values in the upper-half of the spectral array in y and z
+        // are here interpreted as negative wavenumbers
+        AMREX_D_TERM(T a = fac[0]*i;,
+                     T b = (j < len[1]/2) ? fac[1]*j : fac[1]*(len[1]-j);,
+                     T c = (k < len[2]/2) ? fac[2]*k : fac[2]*(len[2]-k));
+        T k2 = AMREX_D_TERM(T(2)*(std::cos(a*dx[0])-T(1))/(dx[0]*dx[0]),
+                           +T(2)*(std::cos(b*dx[1])-T(1))/(dx[1]*dx[1]),
+                           +T(2)*(std::cos(c*dx[2])-T(1))/(dx[2]*dx[2]));
+        if (k2 != T(0)) {
+            spectral_data /= k2;
+        } else {
+            // interpretation here is that the average value of the
+            // solution is zero
+            spectral_data = 0;
+        }
+        spectral_data *= scale;
+    });
+}
+
+template <typename MF>
+void PoissonHybrid<MF>::solve (MF& soln, MF const& rhs)
+{
+#if (AMREX_SPACEDIM < 3)
+    amrex::ignore_unused(soln, rhs);
+#else
+    using T = typename MF::value_type;
+
+    auto facx = T(2)*Math::pi<T>()/T(m_geom.ProbLength(0));
+    auto facy = T(2)*Math::pi<T>()/T(m_geom.ProbLength(1));
+    auto dx = T(m_geom.CellSize(0));
+    auto dy = T(m_geom.CellSize(1));
+    auto scale = T(1.0)/(T(m_geom.Domain().length(0)) *
+                         T(m_geom.Domain().length(1)));
+    auto ny = m_geom.Domain().length(1);
+    auto nz = m_geom.Domain().length(2);
+
+    Gpu::DeviceVector<T> delzv(nz, T(m_geom.CellSize(2)));
+    auto const* delz = delzv.data();
+
+    Box cdomain = m_geom.Domain();
+    cdomain.setBig(0,cdomain.length(0)/2);
+    auto cba = amrex::decompose(cdomain, ParallelContext::NProcsSub(),
+                                {AMREX_D_DECL(true,true,false)});
+    DistributionMapping dm = detail::make_iota_distromap(cba.size());
+    FabArray<BaseFab<GpuComplex<T> > > spmf(cba, dm, 1, 0);
+
+    m_r2c.forward(rhs, spmf);
+
+    for (MFIter mfi(spmf); mfi.isValid(); ++mfi)
+    {
+        auto const& spectral = spmf.array(mfi);
+        auto const& box = mfi.validbox();
+        auto const& xybox = amrex::makeSlab(box, 2, 0);
+
+#ifdef AMREX_USE_GPU
+        // xxxxx TODO: We need to explore how to optimize this
+        // function. Maybe we can use cusparse. Maybe we should make
+        // z-direction to be the unit stride direction.
+
+        FArrayBox tridiag_workspace(box,4);
+        auto const& ald = tridiag_workspace.array(0);
+        auto const& bd = tridiag_workspace.array(1);
+        auto const& cud = tridiag_workspace.array(2);
+        auto const& scratch = tridiag_workspace.array(3);
+
+        amrex::ParallelFor(xybox, [=] AMREX_GPU_DEVICE (int i, int j, int)
+        {
+            T a = facx*i;
+            T b = (j < ny/2) ? facy*j : facy*(ny-j);
+
+            T k2 = T(2)*(std::cos(a*dx)-T(1))/(dx*dx)
+                +  T(2)*(std::cos(b*dy)-T(1))/(dy*dy);
+
+            // Tridiagonal solve with homogeneous Neumann
+            for(int k=0; k < nz; k++) {
+                if(k==0) {
+                    ald(i,j,k) = 0.;
+                    cud(i,j,k) = 2.0 /(delz[k]*(delz[k]+delz[k+1]));
+                    bd(i,j,k) = k2 -ald(i,j,k)-cud(i,j,k);
+                } else if (k == nz-1) {
+                    ald(i,j,k) = 2.0 /(delz[k]*(delz[k]+delz[k-1]));
+                    cud(i,j,k) = 0.;
+                    bd(i,j,k) = k2 -ald(i,j,k)-cud(i,j,k);
+                    if (i == 0 && j == 0) {
+                        bd(i,j,k) *= 2.0;
+                    }
+                } else {
+                    ald(i,j,k) = 2.0 /(delz[k]*(delz[k]+delz[k-1]));
+                    cud(i,j,k) = 2.0 /(delz[k]*(delz[k]+delz[k+1]));
+                    bd(i,j,k) = k2 -ald(i,j,k)-cud(i,j,k);
+                }
+            }
+
+            scratch(i,j,0) = cud(i,j,0)/bd(i,j,0);
+            spectral(i,j,0) = spectral(i,j,0)/bd(i,j,0);
+
+            for (int k = 1; k < nz; k++) {
+                if (k < nz-1) {
+                    scratch(i,j,k) = cud(i,j,k) / (bd(i,j,k) - ald(i,j,k) * scratch(i,j,k-1));
+                }
+                spectral(i,j,k) = (spectral(i,j,k) - ald(i,j,k) * spectral(i,j,k - 1))
+                    / (bd(i,j,k) - ald(i,j,k) * scratch(i,j,k-1));
+            }
+
+            for (int k = nz - 2; k >= 0; k--) {
+                spectral(i,j,k) -= scratch(i,j,k) * spectral(i,j,k + 1);
+            }
+
+            for (int k = 0; k < nz; ++k) {
+                spectral(i,j,k) *= scale;
+            }
+        });
+        Gpu::streamSynchronize();
+
+#else
+
+        Gpu::DeviceVector<GpuComplex<Real>> ald(nz);
+        Gpu::DeviceVector<GpuComplex<Real>> bd(nz);
+        Gpu::DeviceVector<GpuComplex<Real>> cud(nz);
+        Gpu::DeviceVector<GpuComplex<Real>> scratch(nz);
+
+        amrex::LoopOnCpu(xybox, [&] (int i, int j, int)
+        {
+            T a = facx*i;
+            T b = (j < ny/2) ? facy*j : facy*(ny-j);
+
+            T k2 = T(2)*(std::cos(a*dx)-T(1))/(dx*dx)
+                +  T(2)*(std::cos(b*dy)-T(1))/(dy*dy);
+
+            // Tridiagonal solve with homogeneous Neumann
+            for(int k=0; k < nz; k++) {
+                if(k==0) {
+                    ald[k] = 0.;
+                    cud[k] = 2.0 /(delz[k]*(delz[k]+delz[k+1]));
+                    bd[k] = k2 -ald[k]-cud[k];
+                } else if (k == nz-1) {
+                    ald[k] = 2.0 /(delz[k]*(delz[k]+delz[k-1]));
+                    cud[k] = 0.;
+                    bd[k] = k2 -ald[k]-cud[k];
+                    if (i == 0 && j == 0) {
+                        bd[k] *= 2.0;
+                    }
+                } else {
+                    ald[k] = 2.0 /(delz[k]*(delz[k]+delz[k-1]));
+                    cud[k] = 2.0 /(delz[k]*(delz[k]+delz[k+1]));
+                    bd[k] = k2 -ald[k]-cud[k];
+                }
+            }
+
+            scratch[0] = cud[0]/bd[0];
+            spectral(i,j,0) = spectral(i,j,0)/bd[0];
+
+            for (int k = 1; k < nz; k++) {
+                if (k < nz-1) {
+                    scratch[k] = cud[k] / (bd[k] - ald[k] * scratch[k-1]);
+                }
+                spectral(i,j,k) = (spectral(i,j,k) - ald[k] * spectral(i,j,k - 1))
+                    / (bd[k] - ald[k] * scratch[k-1]);
+            }
+
+            for (int k = nz - 2; k >= 0; k--) {
+                spectral(i,j,k) -= scratch[k] * spectral(i,j,k + 1);
+            }
+
+            for (int k = 0; k < nz; ++k) {
+                spectral(i,j,k) *= scale;
+            }
+        });
+#endif
+    }
+
+    m_r2c.backward(spmf, soln);
+#endif
+}
+
+}
+
+#endif
diff --git a/Src/FFT/CMakeLists.txt b/Src/FFT/CMakeLists.txt
new file mode 100644
index 00000000000..2c695a9aec3
--- /dev/null
+++ b/Src/FFT/CMakeLists.txt
@@ -0,0 +1,14 @@
+add_amrex_define(AMREX_USE_FFT NO_LEGACY)
+
+foreach(D IN LISTS AMReX_SPACEDIM)
+    target_include_directories(amrex_${D}d PUBLIC $<BUILD_INTERFACE:${CMAKE_CURRENT_LIST_DIR}>)
+
+    target_sources(amrex_${D}d
+       PRIVATE
+       AMReX_FFT.H
+       AMReX_FFT.cpp
+       AMReX_FFT_Helper.H
+       AMReX_FFT_Poisson.H
+       )
+
+endforeach()
diff --git a/Src/FFT/Make.package b/Src/FFT/Make.package
new file mode 100644
index 00000000000..1dcd714f644
--- /dev/null
+++ b/Src/FFT/Make.package
@@ -0,0 +1,10 @@
+ifndef AMREX_FFT_MAKE
+       AMREX_FFT_MAKE := 1
+
+CEXE_headers += AMReX_FFT.H AMReX_FFT_Helper.H AMReX_FFT_Poisson.H
+CEXE_sources += AMReX_FFT.cpp
+
+VPATH_LOCATIONS += $(AMREX_HOME)/Src/FFT
+INCLUDE_LOCATIONS += $(AMREX_HOME)/Src/FFT
+
+endif
diff --git a/Tests/CMakeLists.txt b/Tests/CMakeLists.txt
index 90730c24fb6..9d6afba857a 100644
--- a/Tests/CMakeLists.txt
+++ b/Tests/CMakeLists.txt
@@ -117,6 +117,10 @@ if (AMReX_TEST_TYPE STREQUAL "Small")
       add_subdirectory("LinearSolvers/ABecLaplacian_C")
    endif()
 
+   if (AMReX_FFT)
+      add_subdirectory("FFT/Poisson")
+   endif()
+
 else()
    #
    # List of subdirectories to search for CMakeLists.
@@ -137,6 +141,10 @@ else()
       list(APPEND AMREX_TESTS_SUBDIRS LinearSolvers)
    endif ()
 
+   if (AMReX_FFT)
+      list(APPEND AMREX_TESTS_SUBDIRS FFT)
+   endif ()
+
    if (AMReX_HDF5)
       list(APPEND AMREX_TESTS_SUBDIRS HDF5Benchmark)
    endif ()
diff --git a/Tests/FFT/Poisson/CMakeLists.txt b/Tests/FFT/Poisson/CMakeLists.txt
new file mode 100644
index 00000000000..21a9d3b2681
--- /dev/null
+++ b/Tests/FFT/Poisson/CMakeLists.txt
@@ -0,0 +1,10 @@
+foreach(D IN LISTS AMReX_SPACEDIM)
+    set(_sources  main.cpp)
+
+    set(_input_files)
+
+    setup_test(${D} _sources _input_files)
+
+    unset(_sources)
+    unset(_input_files)
+endforeach()
diff --git a/Tests/FFT/Poisson/GNUmakefile b/Tests/FFT/Poisson/GNUmakefile
new file mode 100644
index 00000000000..93376f44852
--- /dev/null
+++ b/Tests/FFT/Poisson/GNUmakefile
@@ -0,0 +1,26 @@
+AMREX_HOME := ../../..
+
+DEBUG	= FALSE
+
+DIM	= 3
+
+COMP    = gcc
+
+USE_MPI   = TRUE
+USE_OMP   = FALSE
+USE_CUDA  = FALSE
+USE_HIP   = FALSE
+USE_SYCL  = FALSE
+
+USE_FFT = TRUE
+
+BL_NO_FORT = TRUE
+
+TINY_PROFILE = FALSE
+
+include $(AMREX_HOME)/Tools/GNUMake/Make.defs
+
+include ./Make.package
+include $(AMREX_HOME)/Src/Base/Make.package
+
+include $(AMREX_HOME)/Tools/GNUMake/Make.rules
diff --git a/Tests/FFT/Poisson/Make.package b/Tests/FFT/Poisson/Make.package
new file mode 100644
index 00000000000..6b4b865e8fc
--- /dev/null
+++ b/Tests/FFT/Poisson/Make.package
@@ -0,0 +1 @@
+CEXE_sources += main.cpp
diff --git a/Tests/FFT/Poisson/main.cpp b/Tests/FFT/Poisson/main.cpp
new file mode 100644
index 00000000000..1286d80dad5
--- /dev/null
+++ b/Tests/FFT/Poisson/main.cpp
@@ -0,0 +1,148 @@
+#include <AMReX_FFT_Poisson.H> // Put this at the top for testing
+
+#include <AMReX.H>
+#include <AMReX_MultiFab.H>
+#include <AMReX_ParmParse.H>
+#include <AMReX_PlotFileUtil.H>
+
+using namespace amrex;
+
+int main (int argc, char* argv[])
+{
+    amrex::Initialize(argc, argv);
+    {
+        BL_PROFILE("main");
+
+        AMREX_D_TERM(int n_cell_x = 64;,
+                     int n_cell_y = 32;,
+                     int n_cell_z = 128);
+
+        AMREX_D_TERM(int max_grid_size_x = 32;,
+                     int max_grid_size_y = 32;,
+                     int max_grid_size_z = 32);
+
+        AMREX_D_TERM(Real prob_lo_x = 0.;,
+                     Real prob_lo_y = 0.;,
+                     Real prob_lo_z = 0.);
+        AMREX_D_TERM(Real prob_hi_x = 1.;,
+                     Real prob_hi_y = 1.;,
+                     Real prob_hi_z = 1.);
+
+        {
+            ParmParse pp;
+            AMREX_D_TERM(pp.query("n_cell_x", n_cell_x);,
+                         pp.query("n_cell_y", n_cell_y);,
+                         pp.query("n_cell_z", n_cell_z));
+            AMREX_D_TERM(pp.query("max_grid_size_x", max_grid_size_x);,
+                         pp.query("max_grid_size_y", max_grid_size_y);,
+                         pp.query("max_grid_size_z", max_grid_size_z));
+        }
+
+        Box domain(IntVect(0),IntVect(AMREX_D_DECL(n_cell_x-1,n_cell_y-1,n_cell_z-1)));
+        BoxArray ba(domain);
+        ba.maxSize(IntVect(AMREX_D_DECL(max_grid_size_x,
+                                        max_grid_size_y,
+                                        max_grid_size_z)));
+        DistributionMapping dm(ba);
+
+        Geometry geom;
+        {
+            geom.define(domain,
+                        RealBox(AMREX_D_DECL(prob_lo_x,prob_lo_y,prob_lo_z),
+                                AMREX_D_DECL(prob_hi_x,prob_hi_y,prob_hi_z)),
+                        CoordSys::cartesian, {AMREX_D_DECL(1,1,1)});
+        }
+        auto const& dx = geom.CellSizeArray();
+
+        MultiFab rhs(ba,dm,1,0);
+        MultiFab soln(ba,dm,1,0);
+        auto const& rhsma = rhs.arrays();
+        ParallelFor(rhs, [=] AMREX_GPU_DEVICE (int b, int i, int j, int k)
+        {
+            AMREX_D_TERM(Real x = (i+0.5_rt) * dx[0] - 0.5_rt;,
+                         Real y = (j+0.5_rt) * dx[1] - 0.5_rt;,
+                         Real z = (k+0.5_rt) * dx[2] - 0.5_rt);
+            rhsma[b](i,j,k) = std::exp(-10._rt*
+                (AMREX_D_TERM(x*x*1.05_rt, + y*y*0.90_rt, + z*z)));
+        });
+
+        // Shift rhs so that its sum is zero.
+        auto rhosum = rhs.sum(0);
+        rhs.plus(-rhosum/geom.Domain().d_numPts(), 0, 1);
+
+#if (AMREX_SPACEDIM == 3)
+        Array<int,2> solvers{0,1};
+#else
+        Array<int,2> solvers{0};
+#endif
+
+        for (int solver_type : solvers) {
+            double tsetup, tsolve;
+            if (solver_type == 0) {
+                auto t0 = amrex::second();
+                FFT::Poisson<MultiFab> fft_poisson(geom);
+                auto t1 = amrex::second();
+                tsetup = t1-t0;
+
+                for (int n = 0; n < 2; ++n) {
+                    auto ta = amrex::second();
+                    fft_poisson.solve(soln, rhs);
+                    auto tb = amrex::second();
+                    tsolve = tb-ta;
+                }
+            } else {
+                auto t0 = amrex::second();
+                FFT::PoissonHybrid<MultiFab> fft_poisson(geom);
+                auto t1 = amrex::second();
+                tsetup = t1-t0;
+
+                for (int n = 0; n < 2; ++n) {
+                    auto ta = amrex::second();
+                    fft_poisson.solve(soln, rhs);
+                    auto tb = amrex::second();
+                    tsolve = tb-ta;
+                }
+            }
+
+            amrex::Print() << "  AMReX FFT setup time: " << tsetup
+                           << ", solve time " << tsolve << "\n";
+
+            MultiFab phi(soln.boxArray(), soln.DistributionMap(), 1, 1);
+            MultiFab res(soln.boxArray(), soln.DistributionMap(), 1, 0);
+            MultiFab::Copy(phi, soln, 0, 0, 1, 0);
+            phi.FillBoundary(geom.periodicity());
+            auto const& res_ma = res.arrays();
+            auto const& phi_ma = phi.const_arrays();
+            auto const& rhs_ma = rhs.const_arrays();
+            ParallelFor(res, [=] AMREX_GPU_DEVICE (int b, int i, int j, int k)
+            {
+                auto const& phia = phi_ma[b];
+                auto lap = (phia(i-1,j,k)-2._rt*phia(i,j,k)+phia(i+1,j,k)) / (dx[0]*dx[0]);
+#if (AMREX_SPACEDIM >= 2)
+                lap += (phia(i,j-1,k)-2._rt*phia(i,j,k)+phia(i,j+1,k)) / (dx[1]*dx[1]);
+#endif
+#if (AMREX_SPACEDIM == 3)
+                if ((solver_type == 1) && (k == 0)) { // Neumann
+                    lap += (-phia(i,j,k)+phia(i,j,k+1)) / (dx[2]*dx[2]);
+                } else if ((solver_type == 1) && ((k+1) == n_cell_z)) { // Neumann
+                    lap += (phia(i,j,k-1)-phia(i,j,k)) / (dx[2]*dx[2]);
+                } else {
+                    lap += (phia(i,j,k-1)-2._rt*phia(i,j,k)+phia(i,j,k+1)) / (dx[2]*dx[2]);
+                }
+#endif
+                res_ma[b](i,j,k) = rhs_ma[b](i,j,k) - lap;
+            });
+            auto bnorm = rhs.norminf();
+            auto rnorm = res.norminf();
+            amrex::Print() << "  rhs inf norm " << bnorm << "\n"
+                           << "  res inf norm " << rnorm << "\n";
+#ifdef AMREX_USE_FLOAT
+            auto eps = 2.e-3f;
+#else
+            auto eps = 1.e-11;
+#endif
+            AMREX_ALWAYS_ASSERT(rnorm < eps*bnorm);
+        }
+    }
+    amrex::Finalize();
+}
diff --git a/Tests/FFT/R2C/CMakeLists.txt b/Tests/FFT/R2C/CMakeLists.txt
new file mode 100644
index 00000000000..21a9d3b2681
--- /dev/null
+++ b/Tests/FFT/R2C/CMakeLists.txt
@@ -0,0 +1,10 @@
+foreach(D IN LISTS AMReX_SPACEDIM)
+    set(_sources  main.cpp)
+
+    set(_input_files)
+
+    setup_test(${D} _sources _input_files)
+
+    unset(_sources)
+    unset(_input_files)
+endforeach()
diff --git a/Tests/FFT/R2C/GNUmakefile b/Tests/FFT/R2C/GNUmakefile
new file mode 100644
index 00000000000..93376f44852
--- /dev/null
+++ b/Tests/FFT/R2C/GNUmakefile
@@ -0,0 +1,26 @@
+AMREX_HOME := ../../..
+
+DEBUG	= FALSE
+
+DIM	= 3
+
+COMP    = gcc
+
+USE_MPI   = TRUE
+USE_OMP   = FALSE
+USE_CUDA  = FALSE
+USE_HIP   = FALSE
+USE_SYCL  = FALSE
+
+USE_FFT = TRUE
+
+BL_NO_FORT = TRUE
+
+TINY_PROFILE = FALSE
+
+include $(AMREX_HOME)/Tools/GNUMake/Make.defs
+
+include ./Make.package
+include $(AMREX_HOME)/Src/Base/Make.package
+
+include $(AMREX_HOME)/Tools/GNUMake/Make.rules
diff --git a/Tests/FFT/R2C/Make.package b/Tests/FFT/R2C/Make.package
new file mode 100644
index 00000000000..6b4b865e8fc
--- /dev/null
+++ b/Tests/FFT/R2C/Make.package
@@ -0,0 +1 @@
+CEXE_sources += main.cpp
diff --git a/Tests/FFT/R2C/main.cpp b/Tests/FFT/R2C/main.cpp
new file mode 100644
index 00000000000..71030385759
--- /dev/null
+++ b/Tests/FFT/R2C/main.cpp
@@ -0,0 +1,126 @@
+#include <AMReX_FFT.H> // Put this at the top for testing
+
+#include <AMReX.H>
+#include <AMReX_MultiFab.H>
+#include <AMReX_ParmParse.H>
+#include <AMReX_PlotFileUtil.H>
+
+using namespace amrex;
+
+int main (int argc, char* argv[])
+{
+    amrex::Initialize(argc, argv);
+    {
+        BL_PROFILE("main");
+
+        AMREX_D_TERM(int n_cell_x = 128;,
+                     int n_cell_y = 32;,
+                     int n_cell_z = 64);
+
+        AMREX_D_TERM(int max_grid_size_x = 32;,
+                     int max_grid_size_y = 32;,
+                     int max_grid_size_z = 32);
+
+        AMREX_D_TERM(Real prob_lo_x = 0.;,
+                     Real prob_lo_y = 0.;,
+                     Real prob_lo_z = 0.);
+        AMREX_D_TERM(Real prob_hi_x = 1.;,
+                     Real prob_hi_y = 1.;,
+                     Real prob_hi_z = 1.);
+
+        {
+            ParmParse pp;
+            AMREX_D_TERM(pp.query("n_cell_x", n_cell_x);,
+                         pp.query("n_cell_y", n_cell_y);,
+                         pp.query("n_cell_z", n_cell_z));
+            AMREX_D_TERM(pp.query("max_grid_size_x", max_grid_size_x);,
+                         pp.query("max_grid_size_y", max_grid_size_y);,
+                         pp.query("max_grid_size_z", max_grid_size_z));
+        }
+
+        Box domain(IntVect(0),IntVect(AMREX_D_DECL(n_cell_x-1,n_cell_y-1,n_cell_z-1)));
+        BoxArray ba(domain);
+        ba.maxSize(IntVect(AMREX_D_DECL(max_grid_size_x,
+                                        max_grid_size_y,
+                                        max_grid_size_z)));
+        DistributionMapping dm(ba);
+
+        Geometry geom;
+        {
+            geom.define(domain,
+                        RealBox(AMREX_D_DECL(prob_lo_x,prob_lo_y,prob_lo_z),
+                                AMREX_D_DECL(prob_hi_x,prob_hi_y,prob_hi_z)),
+                        CoordSys::cartesian, {AMREX_D_DECL(1,1,1)});
+        }
+        auto const& dx = geom.CellSizeArray();
+
+        MultiFab mf(ba,dm,1,0);
+        auto const& ma = mf.arrays();
+        ParallelFor(mf, [=] AMREX_GPU_DEVICE (int b, int i, int j, int k)
+        {
+            AMREX_D_TERM(Real x = (i+0.5_rt) * dx[0] - 0.5_rt;,
+                         Real y = (j+0.5_rt) * dx[1] - 0.5_rt;,
+                         Real z = (k+0.5_rt) * dx[2] - 0.5_rt);
+            ma[b](i,j,k) = std::exp(-10._rt*
+                (AMREX_D_TERM(x*x*1.05_rt, + y*y*0.90_rt, + z*z)));
+        });
+
+        MultiFab mf2(ba,dm,1,0);
+
+        auto scaling = Real(1) / Real(geom.Domain().d_numPts());
+
+        {
+            cMultiFab cmf(ba,dm,1,0);
+
+            // forward
+            {
+                FFT::R2C<Real,FFT::Direction::forward> r2c(geom.Domain());
+                r2c.forward(mf,cmf);
+            }
+
+            // backward
+            {
+                FFT::R2C<Real,FFT::Direction::backward> r2c(geom.Domain());
+                r2c.backward(cmf,mf2);
+            }
+
+            auto const& ma2 = mf2.arrays();
+            ParallelFor(mf2, [=] AMREX_GPU_DEVICE (int b, int i, int j, int k)
+            {
+                ma2[b](i,j,k) = ma[b](i,j,k) - ma2[b](i,j,k)*scaling;
+            });
+
+            auto error = mf2.norminf();
+            amrex::Print() << "  Expected to be close to zero: " << error << "\n";
+#ifdef AMREX_USE_FLOAT
+            auto eps = 1.e-6f;
+#else
+            auto eps = 1.e-13;
+#endif
+            AMREX_ALWAYS_ASSERT(error < eps);
+        }
+
+        mf2.setVal(std::numeric_limits<Real>::max());
+
+        { // forward and backward
+            FFT::R2C<Real,FFT::Direction::both> r2c(geom.Domain());
+            r2c.forwardThenBackward(mf, mf2,
+                                    [=] AMREX_GPU_DEVICE (int, int, int, auto& sp)
+            {
+                sp *= scaling;
+            });
+
+            MultiFab::Subtract(mf2, mf, 0, 0, 1, 0);
+
+            auto error = mf2.norminf();
+            amrex::Print() << "  Expected to be close to zero: " << error << "\n";
+#ifdef AMREX_USE_FLOAT
+            auto eps = 1.e-6f;
+#else
+            auto eps = 1.e-13;
+#endif
+            AMREX_ALWAYS_ASSERT(error < eps);
+        }
+    }
+    amrex::Finalize();
+}
diff --git a/Tools/CMake/AMReXConfig.cmake.in b/Tools/CMake/AMReXConfig.cmake.in
index 6ae09ba0a00..96fb12cbf77 100644
--- a/Tools/CMake/AMReXConfig.cmake.in
+++ b/Tools/CMake/AMReXConfig.cmake.in
@@ -76,6 +76,7 @@ set(AMReX_FINTERFACES_FOUND         @AMReX_FORTRAN_INTERFACES@)
 set(AMReX_LSOLVERS_FOUND            @AMReX_LINEAR_SOLVERS@)
 set(AMReX_LSOLVERS_INCFLO_FOUND     @AMReX_LINEAR_SOLVERS_INCFLO@)
 set(AMReX_LSOLVERS_EM_FOUND         @AMReX_LINEAR_SOLVERS_EM@)
+set(AMReX_FFT_FOUND                 @AMReX_FFT@)
 set(AMReX_AMRDATA_FOUND             @AMReX_AMRDATA@)
 set(AMReX_PARTICLES_FOUND           @AMReX_PARTICLES@)
 set(AMReX_P@AMReX_PARTICLES_PRECISION@_FOUND ON)
@@ -133,6 +134,7 @@ set(AMReX_FINTERFACES               @AMReX_FORTRAN_INTERFACES@)
 set(AMReX_LSOLVERS                  @AMReX_LINEAR_SOLVERS@)
 set(AMReX_LSOLVERS_INCFLO           @AMReX_LINEAR_SOLVERS_INCFLO@)
 set(AMReX_LSOLVERS_EM               @AMReX_LINEAR_SOLVERS_EM@)
+set(AMReX_FFT                       @AMReX_FFT@)
 set(AMReX_AMRDATA                   @AMReX_AMRDATA@)
 set(AMReX_PARTICLES                 @AMReX_PARTICLES@)
 set(AMReX_PARTICLES_PRECISION       @AMReX_PARTICLES_PRECISION@)
@@ -216,6 +218,12 @@ if (@AMReX_CONDUIT@)
    find_dependency(Conduit REQUIRED)
 endif ()
 
+if (@AMReX_FFT@)
+    if (@AMReX_GPU_BACKEND@ STREQUAL NONE)
+        find_dependency(AMReXFFTW REQUIRED)
+    endif()
+endif()
+
 if (@AMReX_HDF5@)
     find_dependency(HDF5 REQUIRED)
 endif ()
diff --git a/Tools/CMake/AMReXOptions.cmake b/Tools/CMake/AMReXOptions.cmake
index 382459e38f1..a7863f125e3 100644
--- a/Tools/CMake/AMReXOptions.cmake
+++ b/Tools/CMake/AMReXOptions.cmake
@@ -294,6 +294,9 @@ cmake_dependent_option( AMReX_LINEAR_SOLVERS_EM
     "AMReX_LINEAR_SOLVERS" OFF)
 print_option( AMReX_LINEAR_SOLVERS_EM )
 
+option( AMReX_FFT  "Build AMReX FFT" OFF )
+print_option( AMReX_FFT )
+
 option( AMReX_AMRDATA "Build data services" OFF )
 print_option( AMReX_AMRDATA )
 
diff --git a/Tools/CMake/AMReXThirdPartyLibraries.cmake b/Tools/CMake/AMReXThirdPartyLibraries.cmake
index abe62a2ebc9..b8ad503e83f 100644
--- a/Tools/CMake/AMReXThirdPartyLibraries.cmake
+++ b/Tools/CMake/AMReXThirdPartyLibraries.cmake
@@ -1,3 +1,27 @@
+#
+# FFT
+#
+if (AMReX_FFT)
+    if (AMReX_CUDA)
+        find_package(CUDAToolkit REQUIRED)
+        foreach(D IN LISTS AMReX_SPACEDIM)
+            target_link_libraries(amrex_${D}d PUBLIC CUDA::cufft)
+        endforeach()
+    elseif (AMReX_HIP)
+        find_package(rocfft REQUIRED)
+        foreach(D IN LISTS AMReX_SPACEDIM)
+            target_link_libraries(amrex_${D}d PUBLIC roc::rocfft)
+        endforeach()
+    elseif (AMReX_SYCL)
+        # nothing to do
+    else()
+        find_package(AMReXFFTW REQUIRED)
+        foreach(D IN LISTS AMReX_SPACEDIM)
+            target_link_libraries(amrex_${D}d PUBLIC AMReX::FFTW)
+        endforeach()
+    endif()
+endif()
+
 #
 # HDF5 -- here it would be best to create an imported target
 #
diff --git a/Tools/CMake/AMReX_Config_ND.H.in b/Tools/CMake/AMReX_Config_ND.H.in
index 3296a403ff0..07e3b7fd637 100644
--- a/Tools/CMake/AMReX_Config_ND.H.in
+++ b/Tools/CMake/AMReX_Config_ND.H.in
@@ -39,6 +39,7 @@
 #cmakedefine BL_FORT_USE_LOWERCASE
 #cmakedefine BL_FORT_USE_UPPERCASE
 #cmakedefine BL_NO_FORT
+#cmakedefine AMREX_USE_FFT
 #cmakedefine AMREX_USE_SENSEI_INSITU
 #cmakedefine AMREX_NO_SENSEI_AMR_INST
 #cmakedefine AMREX_USE_CONDUIT
diff --git a/Tools/CMake/FindAMReXFFTW.cmake b/Tools/CMake/FindAMReXFFTW.cmake
new file mode 100644
index 00000000000..678743a08bb
--- /dev/null
+++ b/Tools/CMake/FindAMReXFFTW.cmake
@@ -0,0 +1,51 @@
+#[=======================================================================[:
+FindAMReXFFTW
+-------
+
+Finds the FFTW library.
+
+Imported Targets
+^^^^^^^^^^^^^^^^
+
+This module provides the following imported target, if found:
+
+``FFTW``
+  The FFTW library
+
+Result Variables
+^^^^^^^^^^^^^^^^
+
+This will define the following variables:
+
+``AMReXFFTW_FOUND``
+  True if the hypre library has been found.
+``FFTW_INCLUDES``
+  Include directories needed to use FFTW.
+``FFTW_LIBRARIES``
+  Libraries needed to link to FFTW.
+
+This will also create an imported target, AMReX::FFTW.
+#]=======================================================================]
+
+if (NOT FFTW_INCLUDES)
+    find_path(FFTW_INCLUDES NAMES "fftw3.h" HINTS ${FFTW_ROOT}/include)
+endif()
+
+if (NOT FFTW_LIBRARIES)
+    find_library(FFTW_LIBRARY NAMES "fftw3" HINTS ${FFTW_ROOT}/lib)
+    find_library(FFTWF_LIBRARY NAMES "fftw3f" HINTS ${FFTW_ROOT}/lib)
+    set(FFTW_LIBRARIES ${FFTW_LIBRARY} ${FFTWF_LIBRARY})
+endif()
+
+include(FindPackageHandleStandardArgs)
+
+find_package_handle_standard_args(AMReXFFTW
+    REQUIRED_VARS FFTW_LIBRARIES FFTW_INCLUDES)
+
+mark_as_advanced(FFTW_LIBRARIES FFTW_INCLUDES)
+
+# Create imported target
+add_library(AMReX::FFTW INTERFACE IMPORTED GLOBAL)
+target_link_libraries(AMReX::FFTW INTERFACE ${FFTW_LIBRARIES})
+set_target_properties(AMReX::FFTW PROPERTIES
+	INTERFACE_INCLUDE_DIRECTORIES "${FFTW_INCLUDES}")
diff --git a/Tools/GNUMake/Make.defs b/Tools/GNUMake/Make.defs
index f33911ed3aa..5c678bdcc99 100644
--- a/Tools/GNUMake/Make.defs
+++ b/Tools/GNUMake/Make.defs
@@ -112,6 +112,12 @@ else
   DEBUG := FALSE
 endif
 
+ifdef USE_FFT
+  USE_FFT := $(strip $(USE_FFT))
+else
+  USE_FFT := FALSE
+endif
+
 ifdef PROFILE
   PROFILE := $(strip $(PROFILE))
 else
@@ -604,6 +610,28 @@ else
     DebugSuffix :=
 endif
 
+ifeq ($(USE_FFT),TRUE)
+  include $(AMREX_HOME)/Src/FFT/Make.package
+  ifeq ($(USE_CUDA),TRUE)
+    LIBRARIES += -lcufft
+  else ifeq ($(USE_HIP),TRUE)
+    # Use rocFFT.  ROC_PATH is defined in hip.mak
+    SYSTEM_INCLUDE_LOCATIONS += $(ROC_PATH)/rocfft/include
+    LIBRARY_LOCATIONS += $(ROC_PATH)/rocfft/lib
+    LIBRARIES += -Wl,--rpath=$(ROC_PATH)/rocfft/lib -lrocfft
+  else ifeq ($(USE_SYCL),TRUE)
+    # nothing
+  else
+    FFTW_HOME ?= NOT_SET
+    ifneq ($(FFTW_HOME),NOT_SET)
+      SYSTEM_INCLUDE_LOCATIONS += $(FFTW_HOME)/include
+      LIBRARY_LOCATIONS += $(FFTW_HOME)/lib
+      LIBRARIES += -Wl,--rpath=$(FFTW_HOME)/lib
+    endif
+    LIBRARIES += -lfftw3f -lfftw3
+  endif
+endif
+
 ifeq ($(USE_PROFPARSER),TRUE)
   PROFILE := TRUE
   TRACE_PROFILE := TRUE
@@ -918,6 +946,10 @@ ifeq ($(USE_PARTICLES),TRUE)
   DEFINES += -DAMREX_PARTICLES
 endif
 
+ifeq ($(USE_FFT),TRUE)
+    DEFINES += -DAMREX_USE_FFT
+endif
+
 ifeq ($(USE_EB),TRUE)
     DEFINES += -DAMREX_USE_EB
 endif
diff --git a/Tools/GNUMake/comps/hip.mak b/Tools/GNUMake/comps/hip.mak
index 87bb3e93f59..26dff7f94ff 100644
--- a/Tools/GNUMake/comps/hip.mak
+++ b/Tools/GNUMake/comps/hip.mak
@@ -119,8 +119,8 @@ ifeq ($(HIP_COMPILER),clang)
   endif
 
   # Generic HIP info
-  ROC_PATH=$(realpath $(dir $(HIP_PATH)))
-  SYSTEM_INCLUDE_LOCATIONS += $(ROC_PATH)/include $(HIP_PATH)/include
+  ROC_PATH=$(realpath $(HIP_PATH))
+  SYSTEM_INCLUDE_LOCATIONS += $(ROC_PATH)/include
 
   # rocRand
   SYSTEM_INCLUDE_LOCATIONS += $(ROC_PATH)/include/hiprand $(ROC_PATH)/include/rocrand
diff --git a/Tools/libamrex/configure.py b/Tools/libamrex/configure.py
index 8988c18b965..873b575fe4f 100755
--- a/Tools/libamrex/configure.py
+++ b/Tools/libamrex/configure.py
@@ -57,6 +57,10 @@ def configure(argv):
                         help="Enable AMReX Fortran API [default=yes]",
                         choices=["yes","no"],
                         default="yes")
+    parser.add_argument("--enable-fft",
+                        help="Enable AMReX FFT [default=no]",
+                        choices=["yes","no"],
+                        default="no")
     parser.add_argument("--enable-linear-solver",
                         help="Enable AMReX linear solvers [default=yes]",
                         choices=["yes","no"],
@@ -151,6 +155,7 @@ def configure(argv):
     f.write("DEBUG = {}\n".format("TRUE" if args.debug == "yes" else "FALSE"))
     f.write("USE_PARTICLES = {}\n".format("FALSE" if args.enable_particle == "no" else "TRUE"))
     f.write("USE_FORTRAN_INTERFACE = {}\n".format("FALSE" if args.enable_fortran_api == "no" else "TRUE"))
+    f.write("USE_FFT = {}\n".format("TRUE" if args.enable_fft == "yes" else "FALSE"))
     f.write("USE_LINEAR_SOLVERS = {}\n".format("FALSE" if args.enable_linear_solver == "no" else "TRUE"))
     f.write("USE_LINEAR_SOLVERS_INCFLO = {}\n".format("FALSE" if args.enable_linear_solver_incflo == "no" else "TRUE"))
     f.write("USE_LINEAR_SOLVERS_EM = {}\n".format("FALSE" if args.enable_linear_solver_em == "no" else "TRUE"))