diff --git a/.github/workflows/apps.yml b/.github/workflows/apps.yml index 2ff10f92462..ca333285df8 100644 --- a/.github/workflows/apps.yml +++ b/.github/workflows/apps.yml @@ -95,6 +95,7 @@ jobs: -DWarpX_OPENPMD=OFF \ -DCMAKE_VERBOSE_MAKEFILE=ON \ -DCMAKE_CXX_COMPILER_LAUNCHER=ccache \ + -DAMReX_FFT=ON \ -DAMReX_LINEAR_SOLVER_INCFLO=OFF cmake --build WarpX/build -j 4 diff --git a/.github/workflows/clang.yml b/.github/workflows/clang.yml index c84d19850a7..1ed5240164f 100644 --- a/.github/workflows/clang.yml +++ b/.github/workflows/clang.yml @@ -44,6 +44,7 @@ jobs: -DCMAKE_BUILD_TYPE=Debug \ -DCMAKE_VERBOSE_MAKEFILE=ON \ -DCMAKE_INSTALL_PREFIX=/tmp/my-amrex \ + -DAMReX_FFT=ON \ -DAMReX_EB=ON \ -DAMReX_FORTRAN=ON \ -DAMReX_MPI=OFF \ @@ -104,6 +105,7 @@ jobs: cmake .. \ -DCMAKE_BUILD_TYPE=Debug \ -DCMAKE_VERBOSE_MAKEFILE=ON \ + -DAMReX_FFT=ON \ -DAMReX_EB=ON \ -DAMReX_ENABLE_TESTS=ON \ -DAMReX_FORTRAN=ON \ @@ -158,6 +160,7 @@ jobs: cmake .. \ -DCMAKE_BUILD_TYPE=RelWithDebInfo \ -DCMAKE_VERBOSE_MAKEFILE=ON \ + -DAMReX_FFT=ON \ -DAMReX_EB=ON \ -DAMReX_ENABLE_TESTS=ON \ -DAMReX_FORTRAN=OFF \ @@ -200,7 +203,7 @@ jobs: export CCACHE_LOGFILE=${{ github.workspace }}/ccache.log.txt ccache -z - ./configure --dim 2 --with-fortran no --comp llvm --with-mpi no + ./configure --dim 2 --with-fortran no --comp llvm --with-mpi no --enable-fft yes make -j4 WARN_ALL=TRUE WARN_ERROR=TRUE XTRA_CXXFLAGS="-fno-operator-names" \ CCACHE=ccache make install diff --git a/.github/workflows/cuda.yml b/.github/workflows/cuda.yml index 9e96aefac5e..927e99ded40 100644 --- a/.github/workflows/cuda.yml +++ b/.github/workflows/cuda.yml @@ -38,6 +38,7 @@ jobs: cmake -S . -B build \ -DCMAKE_VERBOSE_MAKEFILE=ON \ + -DAMReX_FFT=ON \ -DAMReX_EB=ON \ -DAMReX_ENABLE_TESTS=ON \ -DAMReX_FORTRAN=OFF \ @@ -97,6 +98,7 @@ jobs: cmake -S . -B build \ -DCMAKE_VERBOSE_MAKEFILE=ON \ -DAMReX_MPI=OFF \ + -DAMReX_FFT=ON \ -DAMReX_EB=ON \ -DAMReX_ENABLE_TESTS=ON \ -DAMReX_FORTRAN=OFF \ @@ -153,6 +155,7 @@ jobs: -DCMAKE_VERBOSE_MAKEFILE=ON \ -DAMReX_ENABLE_TESTS=ON \ -DAMReX_TEST_TYPE=Small \ + -DAMReX_FFT=ON \ -DAMReX_FORTRAN=ON \ -DAMReX_FORTRAN_INTERFACES=ON \ -DAMReX_GPU_BACKEND=CUDA \ @@ -196,7 +199,7 @@ jobs: ccache -z export PATH=/usr/local/nvidia/bin:/usr/local/cuda/bin:${PATH} - ./configure --dim 3 --with-cuda yes --enable-eb yes --enable-xsdk-defaults yes --with-fortran no + ./configure --dim 3 --with-cuda yes --enable-eb yes --enable-xsdk-defaults yes --with-fortran no --enable-fft yes # # /home/runner/work/amrex/amrex/Src/Base/AMReX_GpuLaunchGlobal.H:16:41: error: unused parameter ‘f0’ [-Werror=unused-parameter] # 16 | AMREX_GPU_GLOBAL void launch_global (L f0) { f0(); } diff --git a/.github/workflows/dependencies/dependencies.sh b/.github/workflows/dependencies/dependencies.sh index 07e461f577f..c7cde496519 100755 --- a/.github/workflows/dependencies/dependencies.sh +++ b/.github/workflows/dependencies/dependencies.sh @@ -16,6 +16,7 @@ sudo apt-get update sudo apt-get install -y --no-install-recommends\ build-essential \ + libfftw3-dev \ g++ gfortran \ libopenmpi-dev \ openmpi-bin diff --git a/.github/workflows/dependencies/dependencies_clang.sh b/.github/workflows/dependencies/dependencies_clang.sh index 2e96b5196d8..4c329321b68 100755 --- a/.github/workflows/dependencies/dependencies_clang.sh +++ b/.github/workflows/dependencies/dependencies_clang.sh @@ -16,5 +16,6 @@ sudo apt-get update sudo apt-get install -y --no-install-recommends \ build-essential \ + libfftw3-dev \ gfortran \ clang-$1 diff --git a/.github/workflows/dependencies/dependencies_gcc.sh b/.github/workflows/dependencies/dependencies_gcc.sh index 2a576c0b52d..93d9aa27ec4 100755 --- a/.github/workflows/dependencies/dependencies_gcc.sh +++ b/.github/workflows/dependencies/dependencies_gcc.sh @@ -17,6 +17,7 @@ sudo apt-get update sudo apt-get install -y --no-install-recommends \ build-essential \ + libfftw3-dev \ g++-$1 gfortran-$1 \ libopenmpi-dev \ openmpi-bin diff --git a/.github/workflows/dependencies/dependencies_hip.sh b/.github/workflows/dependencies/dependencies_hip.sh index ab5185ce0ae..df4f274ef36 100755 --- a/.github/workflows/dependencies/dependencies_hip.sh +++ b/.github/workflows/dependencies/dependencies_hip.sh @@ -56,6 +56,7 @@ sudo apt-get install -y --no-install-recommends \ roctracer-dev \ rocprofiler-dev \ rocrand-dev \ + rocfft-dev \ rocprim-dev # hiprand-dev is a new package that does not exist in old versions diff --git a/.github/workflows/dependencies/dependencies_nvcc.sh b/.github/workflows/dependencies/dependencies_nvcc.sh index abf95048013..2578bd33fe7 100755 --- a/.github/workflows/dependencies/dependencies_nvcc.sh +++ b/.github/workflows/dependencies/dependencies_nvcc.sh @@ -35,5 +35,6 @@ sudo apt-get install -y \ cuda-minimal-build-$VERSION_DASHED \ cuda-nvml-dev-$VERSION_DASHED \ cuda-nvtx-$VERSION_DASHED \ + libcufft-dev-$VERSION_DASHED \ libcurand-dev-$VERSION_DASHED sudo ln -s cuda-$VERSION_DOTTED /usr/local/cuda diff --git a/.github/workflows/gcc.yml b/.github/workflows/gcc.yml index b64dbd3b5f0..88fe47c988c 100644 --- a/.github/workflows/gcc.yml +++ b/.github/workflows/gcc.yml @@ -42,6 +42,7 @@ jobs: mkdir build cd build cmake .. \ + -DAMReX_FFT=ON \ -DAMReX_FORTRAN=ON \ -DAMReX_PLOTFILE_TOOLS=ON \ -DCMAKE_VERBOSE_MAKEFILE=ON \ @@ -99,6 +100,7 @@ jobs: cmake -S . -B build \ -DCMAKE_BUILD_TYPE=Debug \ -DCMAKE_VERBOSE_MAKEFILE=ON \ + -DAMReX_FFT=ON \ -DAMReX_EB=ON \ -DAMReX_ENABLE_TESTS=ON \ -DAMReX_FORTRAN=ON \ @@ -147,6 +149,7 @@ jobs: cmake -S . -B build \ -DCMAKE_BUILD_TYPE=Debug \ -DCMAKE_VERBOSE_MAKEFILE=ON \ + -DAMReX_FFT=ON \ -DAMReX_EB=ON \ -DAMReX_ENABLE_TESTS=ON \ -DAMReX_FORTRAN=ON \ @@ -196,6 +199,7 @@ jobs: cmake -S . -B build \ -DCMAKE_BUILD_TYPE=Debug \ -DCMAKE_VERBOSE_MAKEFILE=ON \ + -DAMReX_FFT=ON \ -DAMReX_EB=OFF \ -DAMReX_ENABLE_TESTS=ON \ -DAMReX_FORTRAN=ON \ @@ -248,6 +252,7 @@ jobs: -DCMAKE_VERBOSE_MAKEFILE=ON \ -DAMReX_ASSERTIONS=ON \ -DAMReX_TESTING=ON \ + -DAMReX_FFT=ON \ -DAMReX_EB=OFF \ -DAMReX_ENABLE_TESTS=ON \ -DAMReX_BOUND_CHECK=ON \ @@ -310,6 +315,7 @@ jobs: -DAMReX_TESTING=ON \ -DAMReX_BOUND_CHECK=ON \ -DAMReX_FPE=ON \ + -DAMReX_FFT=ON \ -DAMReX_EB=ON \ -DAMReX_ENABLE_TESTS=ON \ -DAMReX_FORTRAN=ON \ @@ -371,6 +377,7 @@ jobs: -DAMReX_TESTING=ON \ -DAMReX_BOUND_CHECK=ON \ -DAMReX_FPE=ON \ + -DAMReX_FFT=ON \ -DAMReX_EB=ON \ -DAMReX_ENABLE_TESTS=ON \ -DAMReX_FORTRAN=OFF \ @@ -457,7 +464,7 @@ jobs: export CCACHE_LOGFILE=${{ github.workspace }}/ccache.log.txt ccache -z - ./configure --dim 3 --enable-eb yes --enable-xsdk-defaults yes + ./configure --dim 3 --enable-eb yes --enable-xsdk-defaults yes --enable-fft yes make -j4 WARN_ALL=TRUE WARN_ERROR=TRUE XTRA_CXXFLAGS=-fno-operator-names \ CCACHE=ccache make install @@ -497,7 +504,8 @@ jobs: export CCACHE_LOGFILE=${{ github.workspace }}/ccache.log.txt ccache -z - ./configure --dim 3 --enable-eb no --enable-xsdk-defaults no --single-precision yes --single-precision-particles yes --enable-tiny-profile yes + ./configure --dim 3 --enable-eb no --enable-xsdk-defaults no --single-precision yes \ + --single-precision-particles yes --enable-tiny-profile yes --enable-fft yes make -j4 WARN_ALL=TRUE WARN_ERROR=TRUE XTRA_CXXFLAGS=-fno-operator-names \ CCACHE=ccache make install @@ -623,6 +631,7 @@ jobs: -DAMReX_OMP=ON \ -DCMAKE_VERBOSE_MAKEFILE=ON \ -DAMReX_ENABLE_TESTS=ON \ + -DAMReX_FFT=ON \ -DCMAKE_CXX_COMPILER_LAUNCHER=ccache make -j 4 diff --git a/.github/workflows/hip.yml b/.github/workflows/hip.yml index 345d7c468b5..22154d6b012 100644 --- a/.github/workflows/hip.yml +++ b/.github/workflows/hip.yml @@ -48,6 +48,7 @@ jobs: cmake -S . -B build \ -DCMAKE_VERBOSE_MAKEFILE=ON \ + -DAMReX_FFT=ON \ -DAMReX_EB=ON \ -DAMReX_ENABLE_TESTS=ON \ -DAMReX_FORTRAN=ON \ @@ -103,6 +104,7 @@ jobs: cmake -S . -B build_full_legacywrapper \ -DCMAKE_VERBOSE_MAKEFILE=ON \ + -DAMReX_FFT=ON \ -DAMReX_EB=OFF \ -DAMReX_ENABLE_TESTS=ON \ -DAMReX_FORTRAN=ON \ @@ -145,7 +147,9 @@ jobs: export CCACHE_MAXSIZE=100M ccache -z - ./configure --dim 2 --with-hip yes --enable-eb yes --enable-xsdk-defaults yes --with-mpi no --with-omp no --single-precision yes --single-precision-particles yes + ./configure --dim 2 --with-hip yes --enable-eb yes --enable-xsdk-defaults yes \ + --with-mpi no --with-omp no --single-precision yes \ + --single-precision-particles yes --enable-fft yes make -j4 WARN_ALL=TRUE AMD_ARCH=gfx90a CCACHE=ccache make install diff --git a/.github/workflows/intel.yml b/.github/workflows/intel.yml index 227b0f97380..15c7bbda587 100644 --- a/.github/workflows/intel.yml +++ b/.github/workflows/intel.yml @@ -41,6 +41,7 @@ jobs: set -e cmake -S . -B build \ -DCMAKE_VERBOSE_MAKEFILE=ON \ + -DAMReX_FFT=ON \ -DAMReX_EB=OFF \ -DAMReX_ENABLE_TESTS=ON \ -DAMReX_FORTRAN=ON \ @@ -89,6 +90,7 @@ jobs: set -e cmake -S . -B build \ -DCMAKE_VERBOSE_MAKEFILE=ON \ + -DAMReX_FFT=ON \ -DAMReX_EB=ON \ -DAMReX_ENABLE_TESTS=ON \ -DAMReX_FORTRAN=OFF \ diff --git a/Docs/sphinx_documentation/source/BuildingAMReX.rst b/Docs/sphinx_documentation/source/BuildingAMReX.rst index 064d11e740a..831346765b9 100644 --- a/Docs/sphinx_documentation/source/BuildingAMReX.rst +++ b/Docs/sphinx_documentation/source/BuildingAMReX.rst @@ -475,6 +475,8 @@ The list of available options is reported in the :ref:`table ` bel +------------------------------+-------------------------------------------------+-------------------------+-----------------------+ | AMReX_EB | Build Embedded Boundary support | NO | YES, NO | +------------------------------+-------------------------------------------------+-------------------------+-----------------------+ + | AMReX_FFT | Build FFT support | NO | YES, NO | + +------------------------------+-------------------------------------------------+-------------------------+-----------------------+ | AMReX_PARTICLES | Build particle classes | YES | YES, NO | +------------------------------+-------------------------------------------------+-------------------------+-----------------------+ | AMReX_PARTICLES_PRECISION | Set reals precision in particle classes | Same as AMReX_PRECISION | DOUBLE, SINGLE | @@ -697,6 +699,8 @@ A list of AMReX component names and related configure options are shown in the t +------------------------------+-----------------+ | AMReX_EB | EB | +------------------------------+-----------------+ + | AMReX_FFT | FFT | + +------------------------------+-----------------+ | AMReX_PARTICLES | PARTICLES | +------------------------------+-----------------+ | AMReX_PARTICLES_PRECISION | PDOUBLE, PSINGLE| diff --git a/Docs/sphinx_documentation/source/FFT.rst b/Docs/sphinx_documentation/source/FFT.rst new file mode 100644 index 00000000000..3fc24fcab86 --- /dev/null +++ b/Docs/sphinx_documentation/source/FFT.rst @@ -0,0 +1,71 @@ +.. role:: cpp(code) + :language: c++ + +.. _sec:FFT:r2c: + +FFT::R2C Class +============== + +Class template `FFT::R2C` supports discrete Fourier transforms between real +and complex data. The name R2C indicates that the forward transform converts +real data to complex data, while the backward transform converts complex +data to real data. It should be noted that both directions of transformation +are supported, not just from real to complex. + +The implementation utilizes cuFFT, rocFFT, oneMKL and FFTW, for CUDA, HIP, +SYCL and CPU builds, respectively. Because the parallel communication is +handled by AMReX, it does not need the parallel version of +FFTW. Furthermore, there is no constraint on the domain decomposition such +as one Box per process. This class performs parallel FFT on AMReX's parallel +data containers (e.g., :cpp:`MultiFab` and +:cpp:`FabArray>>`. For local FFT, the users can +use FFTW, cuFFT, rocFFT, or oneMKL directly. + +Other than using column-majored order, AMReX follows the convention of +FFTW. Applying the forward transform followed by the backward transform +scales the original data by the size of the input array. The layout of the +complex data also follows the FFTW convention, where the complex Hermitian +output array has `(nx/2+1,ny,nz)` elements. Here `nx`, `ny` and `nz` are the +sizes of the real array and the division is rounded down. + +Below are examples of using :cpp:`FFT:R2C`. + +.. highlight:: c++ + +:: + + Geometry geom(...); + MultiFab mfin(...); + MultiFab mfout(...); + + auto scaling = 1. / geom.Domain().d_numPts(); + + FFT::R2C r2c(geom.Domain()); + r2c.forwardThenBackward(mfin, mfout, + [=] AMREX_GPU_DEVICE (int, int, int, auto& sp) + { + sp *= scaling; + }); + + cMultiFab cmf(...); + FFT::R2C r2c_forward(geom.Domain()); + r2c_forward(mfin, cmf); + + FFT::R2C r2c_backward(geom.Domain()); + r2c_backward(cmf, mfout); + +Note that using :cpp:`forwardThenBackward` is expected to be more efficient +than separate calls to :cpp:`forward` and :cpp:`backward` because some +parallel communication can be avoided. It should also be noted that a lot of +preparation works are done in the construction of an :cpp:`FFT::R2C` +object. Therefore, one should cache it for reuse if possible. + + +Poisson Solver +============== + +AMReX provides FFT based Poisson solvers. :cpp:`FFT::Poisson` supports all +periodic boundaries using purely FFT. :cpp:`FFT::PoissonHybrid` is a 3D only +solver that supports periodic boundaries in the first two dimensions and +Neumann boundary in the last dimension. Similar to :cpp:`FFT::R2C`, the +Poisson solvers should be cached for reuse. diff --git a/Docs/sphinx_documentation/source/FFT_Chapter.rst b/Docs/sphinx_documentation/source/FFT_Chapter.rst new file mode 100644 index 00000000000..9d6e9505d4f --- /dev/null +++ b/Docs/sphinx_documentation/source/FFT_Chapter.rst @@ -0,0 +1,16 @@ +.. _Chap:FFT: + +.. _sec:FFT:FFTOverview: + +Discrete Fourier Transform +========================== + +AMReX provides support for parallel discrete Fourier transform. The +implementation utilizes cuFFT, rocFFT, oneMKL and FFTW, for CUDA, HIP, SYCL +and CPU builds, respectively. It also provides FFT based Poisson +solvers. + +.. toctree:: + :maxdepth: 1 + + FFT diff --git a/Docs/sphinx_documentation/source/index.rst b/Docs/sphinx_documentation/source/index.rst index 203545cf40a..09ffbb5c0b4 100644 --- a/Docs/sphinx_documentation/source/index.rst +++ b/Docs/sphinx_documentation/source/index.rst @@ -52,6 +52,7 @@ Documentation on migration from BoxLib is available in the AMReX repository at D Fortran_Chapter Python_Chapter EB_Chapter + FFT_Chapter TimeIntegration_Chapter GPU_Chapter Visualization_Chapter diff --git a/GNUmakefile.in b/GNUmakefile.in index b85c2e0c35e..67c789d97ca 100644 --- a/GNUmakefile.in +++ b/GNUmakefile.in @@ -26,6 +26,9 @@ ifeq ($(USE_LINEAR_SOLVERS),TRUE) Pdirs += F_Interfaces/LinearSolvers endif endif +ifeq ($(USE_FFT),TRUE) + Pdirs += FFT +endif ifeq ($(USE_EB),TRUE) Pdirs += EB endif diff --git a/Src/Base/AMReX_FabArray.H b/Src/Base/AMReX_FabArray.H index 69970d6401f..a67a72f0a38 100644 --- a/Src/Base/AMReX_FabArray.H +++ b/Src/Base/AMReX_FabArray.H @@ -13,6 +13,7 @@ #include #include #include +#include #include #include #include @@ -3679,6 +3680,8 @@ FabArray::norminf (FabArray const& mask, int comp, int ncomp, return nm0; } +using cMultiFab = FabArray > >; + } #endif /*BL_FABARRAY_H*/ diff --git a/Src/Base/AMReX_GpuComplex.H b/Src/Base/AMReX_GpuComplex.H index 274da82604d..42dfc7626e9 100644 --- a/Src/Base/AMReX_GpuComplex.H +++ b/Src/Base/AMReX_GpuComplex.H @@ -41,16 +41,16 @@ struct alignas(2*sizeof(T)) GpuComplex /** * \brief Return the real part. */ - AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE + [[nodiscard]] AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE constexpr T real () const noexcept { return m_real; } /** * \brief Return the imaginary part. */ - AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE + [[nodiscard]] AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE constexpr T imag () const noexcept { return m_imag; } - /** + /** * \brief Add a real number to this complex number. */ AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE diff --git a/Src/Base/AMReX_GpuError.H b/Src/Base/AMReX_GpuError.H index ce3ac188a85..65457c8f4e7 100644 --- a/Src/Base/AMReX_GpuError.H +++ b/Src/Base/AMReX_GpuError.H @@ -84,6 +84,16 @@ namespace Gpu { std::string errStr(std::string("CURAND error in file ") + __FILE__ \ + " line " + std::to_string(__LINE__)); \ amrex::Abort(errStr); }} while(0) + +#define AMREX_CUFFT_SAFE_CALL(call) { \ + cufftResult_t amrex_i_err = call; \ + if (CUFFT_SUCCESS != amrex_i_err) { \ + std::string errStr(std::string("CUFFT error ")+std::to_string(amrex_i_err) \ + + std::string(" in file ") + __FILE__ \ + + " line " + std::to_string(__LINE__)); \ + amrex::Abort(errStr); \ + }} + #endif #ifdef AMREX_USE_HIP @@ -100,6 +110,16 @@ namespace Gpu { std::string errStr(std::string("HIPRAND error in file ") + __FILE__ \ + " line " + std::to_string(__LINE__)); \ amrex::Abort(errStr); }} while(0) + +#define AMREX_ROCFFT_SAFE_CALL(call) { \ + auto amrex_i_err = call; \ + if (rocfft_status_success != amrex_i_err) { \ + std::string errStr(std::string("rocFFT error ")+std::to_string(amrex_i_err) \ + + std::string(" in file ") + __FILE__ \ + + " line " + std::to_string(__LINE__)); \ + amrex::Abort(errStr); \ + }} + #endif #define AMREX_GPU_ERROR_CHECK() amrex::Gpu::ErrorCheck(__FILE__, __LINE__) diff --git a/Src/CMakeLists.txt b/Src/CMakeLists.txt index 6e8af043e0d..25455d72636 100644 --- a/Src/CMakeLists.txt +++ b/Src/CMakeLists.txt @@ -136,6 +136,10 @@ if (AMReX_PARTICLES) add_subdirectory(Particle) endif () +if (AMReX_FFT) + add_subdirectory(FFT) +endif () + # # Optional external components # diff --git a/Src/FFT/AMReX_FFT.H b/Src/FFT/AMReX_FFT.H new file mode 100644 index 00000000000..f8050fff933 --- /dev/null +++ b/Src/FFT/AMReX_FFT.H @@ -0,0 +1,969 @@ +#ifndef AMREX_FFT_H_ +#define AMREX_FFT_H_ +#include + +#include +#include +#include +#include +#include + +#if defined(AMREX_USE_CUDA) +# include +# include +#elif defined(AMREX_USE_HIP) +# if __has_include() // ROCm 5.3+ +# include +# else +# include +# endif +# include +#elif defined(AMREX_USE_SYCL) +# include +#else +# include +#endif + +namespace amrex::FFT +{ + +/** + * \brief Discrete Fourier Transform + * + * This class supports Fourier transforms between real and complex data. The + * name R2C indicates that the forward transform converts real data to + * complex data, while the backward transform converts complex data to real + * data. It should be noted that both directions of transformation are + * supported, not just from real to complex. The scaling follows the FFTW + * convention, where applying the forward transform followed by the backward + * transform scales the original data by the size of the input array. + * + * For more details, we refer the users to + * https://amrex-codes.github.io/amrex/docs_html/FFT_Chapter.html. + */ +template +class R2C +{ +public: + using MF = std::conditional_t, + MultiFab, FabArray > >; + using cMF = FabArray > >; + + /** + * \brief Constructor + * + * \param domain the forward domain (i.e., the domain of the real data) + * \param info optional information + */ + explicit R2C (Box const& domain, Info const& info = Info{}); + + ~R2C (); + + R2C (R2C const&) = delete; + R2C (R2C &&) = delete; + R2C& operator= (R2C const&) = delete; + R2C& operator= (R2C &&) = delete; + + /** + * \brief Forward and then backward transform + * + * This function is available only when this class template is + * instantiated for transforms in both directions. It's more efficient + * than calling the forward function that stores the spectral data in a + * caller provided container followed by the backward function, because + * this can avoid parallel communication between the internal data and + * the caller's data container. + * + * \param inmf input data in MultiFab or FabArray> + * \param outmf output data in MultiFab or FabArray> + * \param post_forward a callable object for processing the post-forward + * data before the backward transform. Its interface + * is `(int,int,int,GpuComplex&)`, where the integers + * are indices in the spectral space, and the reference + * to the complex number allows for the modification of + * the spectral data at that location. + */ + template = 0> + void forwardThenBackward (MF const& inmf, MF& outmf, F const& post_forward) + { + this->forward(inmf); + this->post_forward_doit(post_forward); + this->backward(outmf); + } + + /** + * \brief Forward transform + * + * The output is stored in this object's internal data. This function is + * not available when this class template is instantiated for + * backward-only transform. + * + * \param inmf input data in MultiFab or FabArray> + */ + template = 0> + void forward (MF const& inmf); + + /** + * \brief Forward transform + * + * This function is not available when this class template is + * instantiated for backward-only transform. + * + * \param inmf input data in MultiFab or FabArray> + * \param outmf output data in FabArray>> + */ + template = 0> + void forward (MF const& inmf, cMF& outmf); + + /** + * \brief Backward transform + * + * This function is available only when this class template is + * instantiated for transforms in both directions. + * + * \param outmf output data in MultiFab or FabArray> + */ + template = 0> + void backward (MF& outmf); + + /** + * \brief Backward transform + * + * This function is not available when this class template is + * instantiated for forward-only transform. + * + * \param inmf input data in FabArray>> + * \param outmf output data in MultiFab or FabArray> + */ + template = 0> + void backward (cMF const& inmf, MF& outmf); + + /** + * \brief Get the internal spectral data + * + * This function is not available when this class template is + * instantiated for backward-only transform. For performance reasons, + * the returned data array does not have the usual ordering of + * `(x,y,z)`. The order is specified in the second part of the return + * value. + */ + template = 0> + std::pair getSpectralData (); + + struct Swap01 + { + [[nodiscard]] AMREX_GPU_HOST_DEVICE Dim3 operator() (Dim3 i) const noexcept + { + return {i.y, i.x, i.z}; + } + + [[nodiscard]] AMREX_GPU_HOST_DEVICE Dim3 Inverse (Dim3 i) const noexcept + { + return {i.y, i.x, i.z}; + } + + [[nodiscard]] IndexType operator() (IndexType it) const noexcept + { + return it; + } + + [[nodiscard]] IndexType Inverse (IndexType it) const noexcept + { + return it; + } + }; + + struct Swap02 + { + [[nodiscard]] AMREX_GPU_HOST_DEVICE Dim3 operator() (Dim3 i) const noexcept + { + return {i.z, i.y, i.x}; + } + + [[nodiscard]] AMREX_GPU_HOST_DEVICE Dim3 Inverse (Dim3 i) const noexcept + { + return {i.z, i.y, i.x}; + } + + [[nodiscard]] IndexType operator() (IndexType it) const noexcept + { + return it; + } + + [[nodiscard]] IndexType Inverse (IndexType it) const noexcept + { + return it; + } + }; + + struct RotateFwd + { + // dest -> src: (x,y,z) -> (y,z,x) + [[nodiscard]] AMREX_GPU_HOST_DEVICE Dim3 operator() (Dim3 i) const noexcept + { + return {i.y, i.z, i.x}; + } + + // src -> dest: (x,y,z) -> (z,x,y) + [[nodiscard]] AMREX_GPU_HOST_DEVICE Dim3 Inverse (Dim3 i) const noexcept + { + return {i.z, i.x, i.y}; + } + + [[nodiscard]] IndexType operator() (IndexType it) const noexcept + { + return it; + } + + [[nodiscard]] IndexType Inverse (IndexType it) const noexcept + { + return it; + } + }; + + struct RotateBwd + { + // dest -> src: (x,y,z) -> (z,x,y) + [[nodiscard]] AMREX_GPU_HOST_DEVICE Dim3 operator() (Dim3 i) const noexcept + { + return {i.z, i.x, i.y}; + } + + // src -> dest: (x,y,z) -> (y,z,x) + [[nodiscard]] AMREX_GPU_HOST_DEVICE Dim3 Inverse (Dim3 i) const noexcept + { + return {i.y, i.z, i.x}; + } + + [[nodiscard]] IndexType operator() (IndexType it) const noexcept + { + return it; + } + + [[nodiscard]] IndexType Inverse (IndexType it) const noexcept + { + return it; + } + }; + + // public for cuda + template + void post_forward_doit (F const& post_forward); + +private: + +#if defined(AMREX_USE_CUDA) + using VendorPlan = cufftHandle; + using VendorPlan2 = VendorPlan; + using FFTComplex = std::conditional_t, + cuComplex, cuDoubleComplex>; +#elif defined(AMREX_USE_HIP) + using VendorPlan = rocfft_plan; + using VendorPlan2 = VendorPlan; + using FFTComplex = std::conditional_t, + float2, double2>; +#elif defined(AMREX_USE_SYCL) + using VendorPlan = oneapi::mkl::dft::descriptor< + std::is_same_v ? oneapi::mkl::dft::precision::SINGLE + : oneapi::mkl::dft::precision::DOUBLE, + oneapi::mkl::dft::domain::REAL> *; + using VendorPlan2 = oneapi::mkl::dft::descriptor< + std::is_same_v ? oneapi::mkl::dft::precision::SINGLE + : oneapi::mkl::dft::precision::DOUBLE, + oneapi::mkl::dft::domain::COMPLEX> *; + using FFTComplex = GpuComplex; +#else + using VendorPlan = std::conditional_t, + fftwf_plan, fftw_plan>; + using VendorPlan2 = VendorPlan; + using FFTComplex = std::conditional_t, + fftwf_complex, fftw_complex>; +#endif + + struct Plan { + bool defined = false; + VendorPlan plan = 0; // NOLINT + }; + + struct Plan2 { + bool defined = false; + VendorPlan2 plan = 0; // NOLINT + }; + + template + static typename FA::FABType::value_type * + get_fab (FA& fa) { + auto myproc = ParallelContext::MyProcSub(); + if (myproc < fa.size()) { + return fa.fabPtr(myproc); + } else { + return nullptr; + } + } + + static void exec_r2c (Plan plan, MF& in, cMF& out); + static void exec_c2r (Plan plan, cMF& in, MF& out); + template + static void exec_c2c (Plan2 plan, cMF& inout); + + template + static void destroy_plan (P plan); + static std::pair make_c2c_plans (cMF& inout); + + void backward_doit (MF& outmf); + + Plan m_fft_fwd_x{}; + Plan m_fft_bwd_x{}; + Plan2 m_fft_fwd_y{}; + Plan2 m_fft_bwd_y{}; + Plan2 m_fft_fwd_z{}; + Plan2 m_fft_bwd_z{}; + + // Comm meta-data. In the forward phase, we start with (x,y,z), + // transpose to (y,x,z) and then (z,x,y). In the backward phase, we + // perform inverse transpose. + std::unique_ptr m_cmd_x2y; // (x,y,z) -> (y,x,z) + std::unique_ptr m_cmd_y2x; // (y,x,z) -> (x,y,z) + std::unique_ptr m_cmd_y2z; // (y,x,z) -> (z,x,y) + std::unique_ptr m_cmd_z2y; // (z,x,y) -> (y,x,z) + Swap01 m_dtos_x2y{}; + Swap01 m_dtos_y2x{}; + Swap02 m_dtos_y2z{}; + Swap02 m_dtos_z2y{}; + + MF m_rx; + cMF m_cx; + cMF m_cy; + cMF m_cz; + + Box m_real_domain; + Box m_spectral_domain_x; + Box m_spectral_domain_y; + Box m_spectral_domain_z; + + Info m_info; +}; + +template +R2C::R2C (Box const& domain, Info const& info) + : m_real_domain(domain), + m_spectral_domain_x(IntVect(0), IntVect(AMREX_D_DECL(domain.length(0)/2, + domain.bigEnd(1), + domain.bigEnd(2)))), +#if (AMREX_SPACEDIM >= 2) + m_spectral_domain_y(IntVect(0), IntVect(AMREX_D_DECL(domain.bigEnd(1), + domain.length(0)/2, + domain.bigEnd(2)))), +#if (AMREX_SPACEDIM == 3) + m_spectral_domain_z(IntVect(0), IntVect(AMREX_D_DECL(domain.bigEnd(2), + domain.length(0)/2, + domain.bigEnd(1)))), +#endif +#endif + m_info(info) +{ + static_assert(std::is_same_v || std::is_same_v); + AMREX_ALWAYS_ASSERT(m_real_domain.smallEnd() == 0 && + m_real_domain.length(0) > 1 && + m_real_domain.cellCentered()); +#if (AMREX_SPACEDIM == 3) + AMREX_ALWAYS_ASSERT(m_real_domain.length(2) > 1 || ! m_info.batch_mode); + AMREX_ALWAYS_ASSERT(m_real_domain.length(1) > 1 || m_real_domain.length(2) == 1); +#else + AMREX_ALWAYS_ASSERT(! m_info.batch_mode); +#endif + + int myproc = ParallelContext::MyProcSub(); + int nprocs = ParallelContext::NProcsSub(); + + auto bax = amrex::decompose(m_real_domain, nprocs, {AMREX_D_DECL(false,true,true)}); + DistributionMapping dmx = detail::make_iota_distromap(bax.size()); + m_rx.define(bax, dmx, 1, 0); + + { + BoxList bl = bax.boxList(); + for (auto & b : bl) { + b.setBig(0, m_spectral_domain_x.bigEnd(0)); + } + BoxArray cbax(std::move(bl)); + m_cx.define(cbax, dmx, 1, 0); + } + + // plans for x-direction + if (myproc < m_rx.size()) + { + Box const local_box = m_rx.boxArray()[myproc]; + int n = local_box.length(0); + int howmany = AMREX_D_TERM(1, *local_box.length(1), *local_box.length(2)); + +#if defined(AMREX_USE_CUDA) + if constexpr (D == Direction::both || D == Direction::forward) { + cufftType fwd_type = std::is_same_v ? CUFFT_R2C : CUFFT_D2Z; + AMREX_CUFFT_SAFE_CALL + (cufftPlanMany(&m_fft_fwd_x.plan, 1, &n, + nullptr, 1, m_real_domain.length(0), + nullptr, 1, m_spectral_domain_x.length(0), + fwd_type, howmany)); + AMREX_CUFFT_SAFE_CALL(cufftSetStream(m_fft_fwd_x.plan, Gpu::gpuStream())); + } + if constexpr (D == Direction::both || D == Direction::backward) { + cufftType bwd_type = std::is_same_v ? CUFFT_C2R : CUFFT_Z2D; + AMREX_CUFFT_SAFE_CALL + (cufftPlanMany(&m_fft_bwd_x.plan, 1, &n, + nullptr, 1, m_spectral_domain_x.length(0), + nullptr, 1, m_real_domain.length(0), + bwd_type, howmany)); + AMREX_CUFFT_SAFE_CALL(cufftSetStream(m_fft_bwd_x.plan, Gpu::gpuStream())); + } +#elif defined(AMREX_USE_HIP) + + auto prec = std::is_same_v ? rocfft_precision_single : rocfft_precision_double; + const std::size_t length = n; + if constexpr (D == Direction::both || D == Direction::forward) { + AMREX_ROCFFT_SAFE_CALL + (rocfft_plan_create(&m_fft_fwd_x.plan, rocfft_placement_notinplace, + rocfft_transform_type_real_forward, prec, 1, + &length, howmany, nullptr)); + } + if constexpr (D == Direction::both || D == Direction::backward) { + AMREX_ROCFFT_SAFE_CALL + (rocfft_plan_create(&m_fft_bwd_x.plan, rocfft_placement_notinplace, + rocfft_transform_type_real_inverse, prec, 1, + &length, howmany, nullptr)); + } + +#elif defined(AMREX_USE_SYCL) + + m_fft_fwd_x.plan = new std::remove_pointer_t(n); + m_fft_fwd_x.plan->set_value(oneapi::mkl::dft::config_param::PLACEMENT, + DFTI_NOT_INPLACE); + m_fft_fwd_x.plan->set_value(oneapi::mkl::dft::config_param::NUMBER_OF_TRANSFORMS, + howmany); + m_fft_fwd_x.plan->set_value(oneapi::mkl::dft::config_param::FWD_DISTANCE, + m_real_domain.length(0)); + m_fft_fwd_x.plan->set_value(oneapi::mkl::dft::config_param::BWD_DISTANCE, + m_spectral_domain_x.length(0)); + std::array strides{0,1}; + m_fft_fwd_x.plan->set_value(oneapi::mkl::dft::config_param::FWD_STRIDES, + strides.data()); + m_fft_fwd_x.plan->set_value(oneapi::mkl::dft::config_param::BWD_STRIDES, + strides.data()); + m_fft_fwd_x.plan->set_value(oneapi::mkl::dft::config_param::WORKSPACE, + oneapi::mkl::dft::config_value::WORKSPACE_EXTERNAL); + m_fft_fwd_x.plan->commit(amrex::Gpu::Device::streamQueue()); + + m_fft_bwd_x.plan = m_fft_fwd_x.plan; + +#else /* FFTW */ + + auto* in = m_rx[myproc].dataPtr(); + auto* out = (FFTComplex*)(m_cx[myproc].dataPtr()); + + if constexpr (std::is_same_v) { + if constexpr (D == Direction::both || D == Direction::forward) { + m_fft_fwd_x.plan = fftwf_plan_many_dft_r2c + (1, &n, howmany, in, nullptr, 1, m_real_domain.length(0), + out, nullptr, 1, m_spectral_domain_x.length(0), + FFTW_ESTIMATE | FFTW_DESTROY_INPUT); + } + if constexpr (D == Direction::both || D == Direction::backward) { + m_fft_bwd_x.plan = fftwf_plan_many_dft_c2r + (1, &n, howmany, out, nullptr, 1, m_spectral_domain_x.length(0), + in, nullptr, 1, m_real_domain.length(0), + FFTW_ESTIMATE | FFTW_DESTROY_INPUT); + } + } else { + if constexpr (D == Direction::both || D == Direction::forward) { + m_fft_fwd_x.plan = fftw_plan_many_dft_r2c + (1, &n, howmany, in, nullptr, 1, m_real_domain.length(0), + out, nullptr, 1, m_spectral_domain_x.length(0), + FFTW_ESTIMATE | FFTW_DESTROY_INPUT); + } + if constexpr (D == Direction::both || D == Direction::backward) { + m_fft_bwd_x.plan = fftw_plan_many_dft_c2r + (1, &n, howmany, out, nullptr, 1, m_spectral_domain_x.length(0), + in, nullptr, 1, m_real_domain.length(0), + FFTW_ESTIMATE | FFTW_DESTROY_INPUT); + } + } +#endif + if constexpr (D == Direction::both || D == Direction::forward) { + m_fft_fwd_x.defined = true; + } + if constexpr (D == Direction::both || D == Direction::backward) { + m_fft_bwd_x.defined = true; + } + } + +#if (AMREX_SPACEDIM >= 2) + DistributionMapping cdmy; + if (m_real_domain.length(1) > 1) { + auto cbay = amrex::decompose(m_spectral_domain_y, nprocs, {AMREX_D_DECL(false,true,true)}); + if (cbay.size() == dmx.size()) { + cdmy = dmx; + } else { + cdmy = detail::make_iota_distromap(cbay.size()); + } + m_cy.define(cbay, cdmy, 1, 0); + + std::tie(m_fft_fwd_y, m_fft_bwd_y) = make_c2c_plans(m_cy); + + // comm meta-data between x and y phases + m_cmd_x2y = std::make_unique + (m_cy, m_spectral_domain_y, m_cx, IntVect(0), m_dtos_x2y); + m_cmd_y2x = std::make_unique + (m_cx, m_spectral_domain_x, m_cy, IntVect(0), m_dtos_y2x); + } + +#if (AMREX_SPACEDIM == 3) + if (m_real_domain.length(1) > 1 && + (! m_info.batch_mode && m_real_domain.length(2) > 1)) + { + auto cbaz = amrex::decompose(m_spectral_domain_z, nprocs, {false,true,true}); + DistributionMapping cdmz; + if (cbaz.size() == dmx.size()) { + cdmz = dmx; + } else if (cbaz.size() == cdmy.size()) { + cdmz = cdmy; + } else { + cdmz = detail::make_iota_distromap(cbaz.size()); + } + m_cz.define(cbaz, cdmz, 1, 0); + + std::tie(m_fft_fwd_z, m_fft_bwd_z) = make_c2c_plans(m_cz); + + // comm meta-data between y and z phases + m_cmd_y2z = std::make_unique + (m_cz, m_spectral_domain_z, m_cy, IntVect(0), m_dtos_y2z); + m_cmd_z2y = std::make_unique + (m_cy, m_spectral_domain_y, m_cz, IntVect(0), m_dtos_z2y); + } +#endif +#endif +} + +template +template +void R2C::destroy_plan (P plan) +{ + if (! plan.defined) { return; } + +#if defined(AMREX_USE_CUDA) + AMREX_CUFFT_SAFE_CALL(cufftDestroy(plan.plan)); +#elif defined(AMREX_USE_HIP) + AMREX_ROCFFT_SAFE_CALL(rocfft_plan_destroy(plan.plan)); +#elif defined(AMREX_USE_SYCL) + delete plan.plan; +#else + if constexpr (std::is_same_v) { + fftwf_destroy_plan(plan.plan); + } else { + fftw_destroy_plan(plan.plan); + } +#endif + + plan.defined = false; +} + +template +R2C::~R2C () +{ +#if defined(AMREX_USE_SYCL) + if constexpr (D == Direction::both || D == Direction::forward) { + destroy_plan(m_fft_fwd_x); + destroy_plan(m_fft_fwd_y); + destroy_plan(m_fft_fwd_z); + } else { + destroy_plan(m_fft_bwd_x); + destroy_plan(m_fft_bwd_y); + destroy_plan(m_fft_bwd_z); + } +#else + destroy_plan(m_fft_fwd_x); + destroy_plan(m_fft_fwd_y); + destroy_plan(m_fft_fwd_z); + destroy_plan(m_fft_bwd_x); + destroy_plan(m_fft_bwd_y); + destroy_plan(m_fft_bwd_z); +#endif +} + +#ifdef AMREX_USE_HIP +namespace detail { void hip_execute (rocfft_plan plan, void **in, void **out); } +#endif + +#ifdef AMREX_USE_SYCL +namespace detail +{ +template +void sycl_execute (P plan, TI* in, TO* out) +{ + std::size_t workspaceSize = 0; + plan->get_value(oneapi::mkl::dft::config_param::WORKSPACE_BYTES, + &workspaceSize); + auto* buffer = (T*)amrex::The_Arena()->alloc(workspaceSize); + plan->set_workspace(buffer); + sycl::event r; + if (std::is_same_v) { + amrex::ignore_unused(in); + if constexpr (direction == Direction::forward) { + r = oneapi::mkl::dft::compute_forward(*plan, out); + } else { + r = oneapi::mkl::dft::compute_backward(*plan, out); + } + } else { + if constexpr (direction == Direction::forward) { + r = oneapi::mkl::dft::compute_forward(*plan, in, out); + } else { + r = oneapi::mkl::dft::compute_backward(*plan, in, out); + } + } + r.wait(); + amrex::The_Arena()->free(buffer); +} +} +#endif + +template +void R2C::exec_r2c (Plan plan, MF& in, cMF& out) +{ + if (! plan.defined) { return; } + +#if defined(AMREX_USE_GPU) + auto* pin = in[ParallelContext::MyProcSub()].dataPtr(); + auto* pout = out[ParallelContext::MyProcSub()].dataPtr(); +#else + amrex::ignore_unused(in,out); +#endif + +#if defined(AMREX_USE_CUDA) + if constexpr (std::is_same_v) { + AMREX_CUFFT_SAFE_CALL(cufftExecR2C(plan.plan, pin, (FFTComplex*)pout)); + } else { + AMREX_CUFFT_SAFE_CALL(cufftExecD2Z(plan.plan, pin, (FFTComplex*)pout)); + } +#elif defined(AMREX_USE_HIP) + detail::hip_execute(plan.plan, (void**)&pin, (void**)&pout); +#elif defined(AMREX_USE_SYCL) + detail::sycl_execute(plan.plan, pin, (std::complex*)pout); +#else + if constexpr (std::is_same_v) { + fftwf_execute(plan.plan); + } else { + fftw_execute(plan.plan); + } +#endif +} + +template +void R2C::exec_c2r (Plan plan, cMF& in, MF& out) +{ + if (! plan.defined) { return; } + +#if defined(AMREX_USE_GPU) + auto* pin = in[ParallelContext::MyProcSub()].dataPtr(); + auto* pout = out[ParallelContext::MyProcSub()].dataPtr(); +#else + amrex::ignore_unused(in,out); +#endif + +#if defined(AMREX_USE_CUDA) + if constexpr (std::is_same_v) { + AMREX_CUFFT_SAFE_CALL(cufftExecC2R(plan.plan, (FFTComplex*)pin, pout)); + } else { + AMREX_CUFFT_SAFE_CALL(cufftExecZ2D(plan.plan, (FFTComplex*)pin, pout)); + } +#elif defined(AMREX_USE_HIP) + detail::hip_execute(plan.plan, (void**)&pin, (void**)&pout); +#elif defined(AMREX_USE_SYCL) + detail::sycl_execute(plan.plan, (std::complex*)pin, pout); +#else + if constexpr (std::is_same_v) { + fftwf_execute(plan.plan); + } else { + fftw_execute(plan.plan); + } +#endif +} + +template +template +void R2C::exec_c2c (Plan2 plan, cMF& inout) +{ + if (! plan.defined) { return; } + + amrex::ignore_unused(inout); +#if defined(AMREX_USE_GPU) + auto* p = inout[ParallelContext::MyProcSub()].dataPtr(); +#endif + +#if defined(AMREX_USE_CUDA) + auto cufft_direction = (direction == Direction::forward) ? CUFFT_FORWARD : CUFFT_INVERSE; + if constexpr (std::is_same_v) { + AMREX_CUFFT_SAFE_CALL(cufftExecC2C(plan.plan, (FFTComplex*)p, (FFTComplex*)p, + cufft_direction)); + } else { + AMREX_CUFFT_SAFE_CALL(cufftExecZ2Z(plan.plan, (FFTComplex*)p, (FFTComplex*)p, + cufft_direction)); + } +#elif defined(AMREX_USE_HIP) + detail::hip_execute(plan.plan, (void**)&p, (void**)&p); +#elif defined(AMREX_USE_SYCL) + detail::sycl_execute(plan.plan, (std::complex*)p, (std::complex*)p); +#else + if constexpr (std::is_same_v) { + fftwf_execute(plan.plan); + } else { + fftw_execute(plan.plan); + } +#endif +} + +template +template > +void R2C::forward (MF const& inmf) +{ + m_rx.ParallelCopy(inmf, 0, 0, 1); + exec_r2c(m_fft_fwd_x, m_rx, m_cx); + + if ( m_cmd_x2y) { + ParallelCopy(m_cy, m_cx, *m_cmd_x2y, 0, 0, 1, m_dtos_x2y); + } + exec_c2c(m_fft_fwd_y, m_cy); + + if ( m_cmd_y2z) { + ParallelCopy(m_cz, m_cy, *m_cmd_y2z, 0, 0, 1, m_dtos_y2z); + } + exec_c2c(m_fft_fwd_z, m_cz); +} + +template +template > +void R2C::backward (MF& outmf) +{ + backward_doit(outmf); +} + +template +void R2C::backward_doit (MF& outmf) +{ + exec_c2c(m_fft_bwd_z, m_cz); + if ( m_cmd_z2y) { + ParallelCopy(m_cy, m_cz, *m_cmd_z2y, 0, 0, 1, m_dtos_z2y); + } + + exec_c2c(m_fft_bwd_y, m_cy); + if ( m_cmd_y2x) { + ParallelCopy(m_cx, m_cy, *m_cmd_y2x, 0, 0, 1, m_dtos_y2x); + } + + exec_c2r(m_fft_bwd_x, m_cx, m_rx); + outmf.ParallelCopy(m_rx, 0, 0, 1); +} + +template +std::pair::Plan2, typename R2C::Plan2> +R2C::make_c2c_plans (cMF& inout) +{ + Plan2 fwd; + Plan2 bwd; + + auto* fab = get_fab(inout); + if (!fab) { return {fwd, bwd};} + + Box const& local_box = fab->box(); + + int n = local_box.length(0); + int howmany = AMREX_D_TERM(1, *local_box.length(1), *local_box.length(2)); + +#if defined(AMREX_USE_CUDA) + + if constexpr (D == Direction::both || D == Direction::forward) { + cufftType fwd_type = std::is_same_v ? CUFFT_C2C : CUFFT_Z2Z; + AMREX_CUFFT_SAFE_CALL + (cufftPlanMany(&fwd.plan, 1, &n, nullptr, 1, n, nullptr, 1, n, + fwd_type, howmany)); + AMREX_CUFFT_SAFE_CALL(cufftSetStream(fwd.plan, Gpu::gpuStream())); + } + if constexpr (D == Direction::both || D == Direction::backward) { + cufftType bwd_type = std::is_same_v ? CUFFT_C2C : CUFFT_Z2Z; + AMREX_CUFFT_SAFE_CALL + (cufftPlanMany(&bwd.plan, 1, &n, nullptr, 1, n, nullptr, 1, n, + bwd_type, howmany)); + AMREX_CUFFT_SAFE_CALL(cufftSetStream(bwd.plan, Gpu::gpuStream())); + } + +#elif defined(AMREX_USE_HIP) + + auto prec = std::is_same_v ? rocfft_precision_single : rocfft_precision_double; + const std::size_t length = n; + if constexpr (D == Direction::both || D == Direction::forward) { + AMREX_ROCFFT_SAFE_CALL + (rocfft_plan_create(&fwd.plan, rocfft_placement_inplace, + rocfft_transform_type_complex_forward, prec, 1, + &length, howmany, nullptr)); + } + if constexpr (D == Direction::both || D == Direction::backward) { + AMREX_ROCFFT_SAFE_CALL + (rocfft_plan_create(&bwd.plan, rocfft_placement_inplace, + rocfft_transform_type_complex_inverse, prec, 1, + &length, howmany, nullptr)); + } + +#elif defined(AMREX_USE_SYCL) + + fwd.plan = new std::remove_pointer_t(n); + fwd.plan->set_value(oneapi::mkl::dft::config_param::PLACEMENT, + DFTI_INPLACE); + fwd.plan->set_value(oneapi::mkl::dft::config_param::NUMBER_OF_TRANSFORMS, + howmany); + fwd.plan->set_value(oneapi::mkl::dft::config_param::FWD_DISTANCE, n); + fwd.plan->set_value(oneapi::mkl::dft::config_param::BWD_DISTANCE, n); + std::array strides{0,1}; + fwd.plan->set_value(oneapi::mkl::dft::config_param::FWD_STRIDES, strides.data()); + fwd.plan->set_value(oneapi::mkl::dft::config_param::BWD_STRIDES, strides.data()); + fwd.plan->set_value(oneapi::mkl::dft::config_param::WORKSPACE, + oneapi::mkl::dft::config_value::WORKSPACE_EXTERNAL); + fwd.plan->commit(amrex::Gpu::Device::streamQueue()); + + bwd.plan = fwd.plan; + +#else + auto* pinout = (FFTComplex*)fab->dataPtr(); + + if constexpr (std::is_same_v) { + if constexpr (D == Direction::both || D == Direction::forward) { + fwd.plan = fftwf_plan_many_dft(1, &n, howmany, pinout, nullptr, 1, n, + pinout, nullptr, 1, n, -1, FFTW_ESTIMATE); + } + if constexpr (D == Direction::both || D == Direction::backward) { + bwd.plan = fftwf_plan_many_dft(1, &n, howmany, pinout, nullptr, 1, n, + pinout, nullptr, 1, n, +1, FFTW_ESTIMATE); + } + } else { + if constexpr (D == Direction::both || D == Direction::forward) { + fwd.plan = fftw_plan_many_dft(1, &n, howmany, pinout, nullptr, 1, n, + pinout, nullptr, 1, n, -1, FFTW_ESTIMATE); + } + if constexpr (D == Direction::both || D == Direction::backward) { + bwd.plan = fftw_plan_many_dft(1, &n, howmany, pinout, nullptr, 1, n, + pinout, nullptr, 1, n, +1, FFTW_ESTIMATE); + } + } +#endif + + if constexpr (D == Direction::both || D == Direction::forward) { + fwd.defined = true; + } + if constexpr (D == Direction::both || D == Direction::backward) { + bwd.defined = true; + } + + return {fwd,bwd}; +} + +template +template +void R2C::post_forward_doit (F const& post_forward) +{ + if (m_info.batch_mode) { + amrex::Abort("xxxxx todo: post_forward"); + } else { + if ( ! m_cz.empty()) { + auto* spectral_fab = get_fab(m_cz); + if (spectral_fab) { + auto const& a = spectral_fab->array(); // m_cz's ordering is z,x,y + ParallelFor(spectral_fab->box(), + [=] AMREX_GPU_DEVICE (int iz, int jx, int ky) + { + post_forward(jx,ky,iz,a(iz,jx,ky)); + }); + } + } else if ( ! m_cy.empty()) { + auto* spectral_fab = get_fab(m_cy); + if (spectral_fab) { + auto const& a = spectral_fab->array(); // m_cy's ordering is y,x,z + ParallelFor(spectral_fab->box(), + [=] AMREX_GPU_DEVICE (int iy, int jx, int k) + { + post_forward(jx,iy,k,a(iy,jx,k)); + }); + } + } else { + auto* spectral_fab = get_fab(m_cx); + if (spectral_fab) { + auto const& a = spectral_fab->array(); + ParallelFor(spectral_fab->box(), + [=] AMREX_GPU_DEVICE (int i, int j, int k) + { + post_forward(i,j,k,a(i,j,k)); + }); + } + } + } +} + +template +template > +std::pair::cMF *, IntVect> +R2C::getSpectralData () +{ + if (!m_cz.empty()) { + return std::make_pair(&m_cz, IntVect{AMREX_D_DECL(2,0,1)}); + } else if (!m_cy.empty()) { + return std::make_pair(&m_cy, IntVect{AMREX_D_DECL(1,0,2)}); + } else { + return std::make_pair(&m_cx, IntVect{AMREX_D_DECL(0,1,2)}); + } +} + +template +template > +void R2C::forward (MF const& inmf, cMF& outmf) +{ + forward(inmf); + if (!m_cz.empty()) { // m_cz's order (z,x,y) -> (x,y,z) + RotateBwd dtos{}; + MultiBlockCommMetaData cmd + (outmf, m_spectral_domain_x, m_cz, IntVect(0), dtos); + ParallelCopy(outmf, m_cz, cmd, 0, 0, 1, dtos); + } else if (!m_cy.empty()) { // m_cy's order (y,x,z) -> (x,y,z) + MultiBlockCommMetaData cmd + (outmf, m_spectral_domain_x, m_cy, IntVect(0), m_dtos_y2x); + ParallelCopy(outmf, m_cy, cmd, 0, 0, 1, m_dtos_y2x); + } else { + outmf.ParallelCopy(m_cx, 0, 0, 1); + } +} + +template +template > +void R2C::backward (cMF const& inmf, MF& outmf) +{ + if (!m_cz.empty()) { // (x,y,z) -> m_cz's order (z,x,y) + RotateFwd dtos{}; + MultiBlockCommMetaData cmd + (m_cz, m_spectral_domain_z, inmf, IntVect(0), dtos); + ParallelCopy(m_cz, inmf, cmd, 0, 0, 1, dtos); + } else if (!m_cy.empty()) { // (x,y,z) -> m_cy's ordering (y,x,z) + MultiBlockCommMetaData cmd + (m_cy, m_spectral_domain_y, inmf, IntVect(0), m_dtos_x2y); + ParallelCopy(m_cy, inmf, cmd, 0, 0, 1, m_dtos_x2y); + } else { + m_cx.ParallelCopy(inmf, 0, 0, 1); + } + backward_doit(outmf); +} + +} + +#endif diff --git a/Src/FFT/AMReX_FFT.cpp b/Src/FFT/AMReX_FFT.cpp new file mode 100644 index 00000000000..68984a8f24a --- /dev/null +++ b/Src/FFT/AMReX_FFT.cpp @@ -0,0 +1,40 @@ +#include +#include + +namespace amrex::FFT::detail +{ + +DistributionMapping make_iota_distromap (Long n) +{ + AMREX_ASSERT(n <= ParallelContext::NProcsSub()); + Vector pm(n); + for (int i = 0; i < n; ++i) { + pm[i] = ParallelContext::local_to_global_rank(i); + } + return DistributionMapping(std::move(pm)); +} + +#ifdef AMREX_USE_HIP +void hip_execute (rocfft_plan plan, void **in, void **out) +{ + rocfft_execution_info execinfo = nullptr; + AMREX_ROCFFT_SAFE_CALL(rocfft_execution_info_create(&execinfo)); + + std::size_t buffersize = 0; + AMREX_ROCFFT_SAFE_CALL(rocfft_plan_get_work_buffer_size(plan, &buffersize)); + + auto* buffer = (void*)amrex::The_Arena()->alloc(buffersize); + AMREX_ROCFFT_SAFE_CALL(rocfft_execution_info_set_work_buffer(execinfo, buffer, buffersize)); + + AMREX_ROCFFT_SAFE_CALL(rocfft_execution_info_set_stream(execinfo, amrex::Gpu::gpuStream())); + + AMREX_ROCFFT_SAFE_CALL(rocfft_execute(plan, in, out, execinfo)); + + amrex::Gpu::streamSynchronize(); + amrex::The_Arena()->free(buffer); + + AMREX_ROCFFT_SAFE_CALL(rocfft_execution_info_destroy(execinfo)); +} +#endif + +} diff --git a/Src/FFT/AMReX_FFT_Helper.H b/Src/FFT/AMReX_FFT_Helper.H new file mode 100644 index 00000000000..c8ae2b74ea2 --- /dev/null +++ b/Src/FFT/AMReX_FFT_Helper.H @@ -0,0 +1,29 @@ +#ifndef AMREX_FFT_HELPER_H_ +#define AMREX_FFT_HELPER_H_ +#include + +#include + +namespace amrex::FFT +{ + +enum struct Direction { forward, backward, both }; + +struct Info +{ + //! Supported only in 3D. When batch_mode is true, FFT is performed on + //! the first two dimensions only and the third dimension size is the + //! batch size. + bool batch_mode = false; + + Info& setBatchMode (bool x) { batch_mode = x; return *this; } +}; + +namespace detail +{ + DistributionMapping make_iota_distromap (Long n); +} + +} + +#endif diff --git a/Src/FFT/AMReX_FFT_Poisson.H b/Src/FFT/AMReX_FFT_Poisson.H new file mode 100644 index 00000000000..62062062108 --- /dev/null +++ b/Src/FFT/AMReX_FFT_Poisson.H @@ -0,0 +1,259 @@ +#ifndef AMREX_FFT_POISSON_H_ +#define AMREX_FFT_POISSON_H_ + +#include +#include + +namespace amrex::FFT +{ + +/** + * \brief Poisson solver for all periodic boundaries using FFT + */ +template +class Poisson +{ +public: + + template ,int> = 0> + explicit Poisson (Geometry const& geom) + : m_geom(geom), m_r2c(geom.Domain()) + { + AMREX_ALWAYS_ASSERT(geom.isAllPeriodic()); + } + + void solve (MF& soln, MF const& rhs); + +private: + Geometry m_geom; + R2C m_r2c; +}; + +/** + * \brief 3D Poisson solver for periodic boundaries in the first two + * dimensions and Neumann in the last dimension. + */ +template +class PoissonHybrid +{ +public: + + template ,int> = 0> + explicit PoissonHybrid (Geometry const& geom) + : m_geom(geom), m_r2c(geom.Domain(), Info().setBatchMode(true)) + { +#if (AMREX_SPACEDIM == 3) + AMREX_ALWAYS_ASSERT(geom.isPeriodic(0) && geom.isPeriodic(1)); +#else + amrex::Abort("FFT::PoissonHybrid: 1D & 2D todo"); +#endif + } + + void solve (MF& soln, MF const& rhs); + +private: + Geometry m_geom; + R2C m_r2c; +}; + +template +void Poisson::solve (MF& soln, MF const& rhs) +{ + using T = typename MF::value_type; + + GpuArray fac + {AMREX_D_DECL(T(2)*Math::pi()/T(m_geom.ProbLength(0)), + T(2)*Math::pi()/T(m_geom.ProbLength(1)), + T(2)*Math::pi()/T(m_geom.ProbLength(2)))}; + GpuArray dx + {AMREX_D_DECL(T(m_geom.CellSize(0)), + T(m_geom.CellSize(1)), + T(m_geom.CellSize(2)))}; + auto scale = T(1.0/m_geom.Domain().d_numPts()); +#if (AMREX_SPACEDIM > 1) + auto const& len = m_geom.Domain().length(); +#endif + + m_r2c.forwardThenBackward(rhs, soln, + [=] AMREX_GPU_DEVICE (int i, int j, int k, + GpuComplex& spectral_data) + { + amrex::ignore_unused(i,j,k); + // the values in the upper-half of the spectral array in y and z + // are here interpreted as negative wavenumbers + AMREX_D_TERM(T a = fac[0]*i;, + T b = (j < len[1]/2) ? fac[1]*j : fac[1]*(len[1]-j);, + T c = (k < len[2]/2) ? fac[2]*k : fac[2]*(len[2]-k)); + T k2 = AMREX_D_TERM(T(2)*(std::cos(a*dx[0])-T(1))/(dx[0]*dx[0]), + +T(2)*(std::cos(b*dx[1])-T(1))/(dx[1]*dx[1]), + +T(2)*(std::cos(c*dx[2])-T(1))/(dx[2]*dx[2])); + if (k2 != T(0)) { + spectral_data /= k2; + } else { + // interpretation here is that the average value of the + // solution is zero + spectral_data = 0; + } + spectral_data *= scale; + }); +} + +template +void PoissonHybrid::solve (MF& soln, MF const& rhs) +{ +#if (AMREX_SPACEDIM < 3) + amrex::ignore_unused(soln, rhs); +#else + using T = typename MF::value_type; + + auto facx = T(2)*Math::pi()/T(m_geom.ProbLength(0)); + auto facy = T(2)*Math::pi()/T(m_geom.ProbLength(1)); + auto dx = T(m_geom.CellSize(0)); + auto dy = T(m_geom.CellSize(1)); + auto scale = T(1.0)/(T(m_geom.Domain().length(0)) * + T(m_geom.Domain().length(1))); + auto ny = m_geom.Domain().length(1); + auto nz = m_geom.Domain().length(2); + + Gpu::DeviceVector delzv(nz, T(m_geom.CellSize(2))); + auto const* delz = delzv.data(); + + Box cdomain = m_geom.Domain(); + cdomain.setBig(0,cdomain.length(0)/2); + auto cba = amrex::decompose(cdomain, ParallelContext::NProcsSub(), + {AMREX_D_DECL(true,true,false)}); + DistributionMapping dm = detail::make_iota_distromap(cba.size()); + FabArray > > spmf(cba, dm, 1, 0); + + m_r2c.forward(rhs, spmf); + + for (MFIter mfi(spmf); mfi.isValid(); ++mfi) + { + auto const& spectral = spmf.array(mfi); + auto const& box = mfi.validbox(); + auto const& xybox = amrex::makeSlab(box, 2, 0); + +#ifdef AMREX_USE_GPU + // xxxxx TODO: We need to explore how to optimize this + // function. Maybe we can use cusparse. Maybe we should make + // z-direction to be the unit stride direction. + + FArrayBox tridiag_workspace(box,4); + auto const& ald = tridiag_workspace.array(0); + auto const& bd = tridiag_workspace.array(1); + auto const& cud = tridiag_workspace.array(2); + auto const& scratch = tridiag_workspace.array(3); + + amrex::ParallelFor(xybox, [=] AMREX_GPU_DEVICE (int i, int j, int) + { + T a = facx*i; + T b = (j < ny/2) ? facy*j : facy*(ny-j); + + T k2 = T(2)*(std::cos(a*dx)-T(1))/(dx*dx) + + T(2)*(std::cos(b*dy)-T(1))/(dy*dy); + + // Tridiagonal solve with homogeneous Neumann + for(int k=0; k < nz; k++) { + if(k==0) { + ald(i,j,k) = 0.; + cud(i,j,k) = 2.0 /(delz[k]*(delz[k]+delz[k+1])); + bd(i,j,k) = k2 -ald(i,j,k)-cud(i,j,k); + } else if (k == nz-1) { + ald(i,j,k) = 2.0 /(delz[k]*(delz[k]+delz[k-1])); + cud(i,j,k) = 0.; + bd(i,j,k) = k2 -ald(i,j,k)-cud(i,j,k); + if (i == 0 && j == 0) { + bd(i,j,k) *= 2.0; + } + } else { + ald(i,j,k) = 2.0 /(delz[k]*(delz[k]+delz[k-1])); + cud(i,j,k) = 2.0 /(delz[k]*(delz[k]+delz[k+1])); + bd(i,j,k) = k2 -ald(i,j,k)-cud(i,j,k); + } + } + + scratch(i,j,0) = cud(i,j,0)/bd(i,j,0); + spectral(i,j,0) = spectral(i,j,0)/bd(i,j,0); + + for (int k = 1; k < nz; k++) { + if (k < nz-1) { + scratch(i,j,k) = cud(i,j,k) / (bd(i,j,k) - ald(i,j,k) * scratch(i,j,k-1)); + } + spectral(i,j,k) = (spectral(i,j,k) - ald(i,j,k) * spectral(i,j,k - 1)) + / (bd(i,j,k) - ald(i,j,k) * scratch(i,j,k-1)); + } + + for (int k = nz - 2; k >= 0; k--) { + spectral(i,j,k) -= scratch(i,j,k) * spectral(i,j,k + 1); + } + + for (int k = 0; k < nz; ++k) { + spectral(i,j,k) *= scale; + } + }); + Gpu::streamSynchronize(); + +#else + + Gpu::DeviceVector> ald(nz); + Gpu::DeviceVector> bd(nz); + Gpu::DeviceVector> cud(nz); + Gpu::DeviceVector> scratch(nz); + + amrex::LoopOnCpu(xybox, [&] (int i, int j, int) + { + T a = facx*i; + T b = (j < ny/2) ? facy*j : facy*(ny-j); + + T k2 = T(2)*(std::cos(a*dx)-T(1))/(dx*dx) + + T(2)*(std::cos(b*dy)-T(1))/(dy*dy); + + // Tridiagonal solve with homogeneous Neumann + for(int k=0; k < nz; k++) { + if(k==0) { + ald[k] = 0.; + cud[k] = 2.0 /(delz[k]*(delz[k]+delz[k+1])); + bd[k] = k2 -ald[k]-cud[k]; + } else if (k == nz-1) { + ald[k] = 2.0 /(delz[k]*(delz[k]+delz[k-1])); + cud[k] = 0.; + bd[k] = k2 -ald[k]-cud[k]; + if (i == 0 && j == 0) { + bd[k] *= 2.0; + } + } else { + ald[k] = 2.0 /(delz[k]*(delz[k]+delz[k-1])); + cud[k] = 2.0 /(delz[k]*(delz[k]+delz[k+1])); + bd[k] = k2 -ald[k]-cud[k]; + } + } + + scratch[0] = cud[0]/bd[0]; + spectral(i,j,0) = spectral(i,j,0)/bd[0]; + + for (int k = 1; k < nz; k++) { + if (k < nz-1) { + scratch[k] = cud[k] / (bd[k] - ald[k] * scratch[k-1]); + } + spectral(i,j,k) = (spectral(i,j,k) - ald[k] * spectral(i,j,k - 1)) + / (bd[k] - ald[k] * scratch[k-1]); + } + + for (int k = nz - 2; k >= 0; k--) { + spectral(i,j,k) -= scratch[k] * spectral(i,j,k + 1); + } + + for (int k = 0; k < nz; ++k) { + spectral(i,j,k) *= scale; + } + }); +#endif + } + + m_r2c.backward(spmf, soln); +#endif +} + +} + +#endif diff --git a/Src/FFT/CMakeLists.txt b/Src/FFT/CMakeLists.txt new file mode 100644 index 00000000000..2c695a9aec3 --- /dev/null +++ b/Src/FFT/CMakeLists.txt @@ -0,0 +1,14 @@ +add_amrex_define(AMREX_USE_FFT NO_LEGACY) + +foreach(D IN LISTS AMReX_SPACEDIM) + target_include_directories(amrex_${D}d PUBLIC $) + + target_sources(amrex_${D}d + PRIVATE + AMReX_FFT.H + AMReX_FFT.cpp + AMReX_FFT_Helper.H + AMReX_FFT_Poisson.H + ) + +endforeach() diff --git a/Src/FFT/Make.package b/Src/FFT/Make.package new file mode 100644 index 00000000000..1dcd714f644 --- /dev/null +++ b/Src/FFT/Make.package @@ -0,0 +1,10 @@ +ifndef AMREX_FFT_MAKE + AMREX_FFT_MAKE := 1 + +CEXE_headers += AMReX_FFT.H AMReX_FFT_Helper.H AMReX_FFT_Poisson.H +CEXE_sources += AMReX_FFT.cpp + +VPATH_LOCATIONS += $(AMREX_HOME)/Src/FFT +INCLUDE_LOCATIONS += $(AMREX_HOME)/Src/FFT + +endif diff --git a/Tests/CMakeLists.txt b/Tests/CMakeLists.txt index 90730c24fb6..9d6afba857a 100644 --- a/Tests/CMakeLists.txt +++ b/Tests/CMakeLists.txt @@ -117,6 +117,10 @@ if (AMReX_TEST_TYPE STREQUAL "Small") add_subdirectory("LinearSolvers/ABecLaplacian_C") endif() + if (AMReX_FFT) + add_subdirectory("FFT/Poisson") + endif() + else() # # List of subdirectories to search for CMakeLists. @@ -137,6 +141,10 @@ else() list(APPEND AMREX_TESTS_SUBDIRS LinearSolvers) endif () + if (AMReX_FFT) + list(APPEND AMREX_TESTS_SUBDIRS FFT) + endif () + if (AMReX_HDF5) list(APPEND AMREX_TESTS_SUBDIRS HDF5Benchmark) endif () diff --git a/Tests/FFT/Poisson/CMakeLists.txt b/Tests/FFT/Poisson/CMakeLists.txt new file mode 100644 index 00000000000..21a9d3b2681 --- /dev/null +++ b/Tests/FFT/Poisson/CMakeLists.txt @@ -0,0 +1,10 @@ +foreach(D IN LISTS AMReX_SPACEDIM) + set(_sources main.cpp) + + set(_input_files) + + setup_test(${D} _sources _input_files) + + unset(_sources) + unset(_input_files) +endforeach() diff --git a/Tests/FFT/Poisson/GNUmakefile b/Tests/FFT/Poisson/GNUmakefile new file mode 100644 index 00000000000..93376f44852 --- /dev/null +++ b/Tests/FFT/Poisson/GNUmakefile @@ -0,0 +1,26 @@ +AMREX_HOME := ../../.. + +DEBUG = FALSE + +DIM = 3 + +COMP = gcc + +USE_MPI = TRUE +USE_OMP = FALSE +USE_CUDA = FALSE +USE_HIP = FALSE +USE_SYCL = FALSE + +USE_FFT = TRUE + +BL_NO_FORT = TRUE + +TINY_PROFILE = FALSE + +include $(AMREX_HOME)/Tools/GNUMake/Make.defs + +include ./Make.package +include $(AMREX_HOME)/Src/Base/Make.package + +include $(AMREX_HOME)/Tools/GNUMake/Make.rules diff --git a/Tests/FFT/Poisson/Make.package b/Tests/FFT/Poisson/Make.package new file mode 100644 index 00000000000..6b4b865e8fc --- /dev/null +++ b/Tests/FFT/Poisson/Make.package @@ -0,0 +1 @@ +CEXE_sources += main.cpp diff --git a/Tests/FFT/Poisson/main.cpp b/Tests/FFT/Poisson/main.cpp new file mode 100644 index 00000000000..1286d80dad5 --- /dev/null +++ b/Tests/FFT/Poisson/main.cpp @@ -0,0 +1,148 @@ +#include // Put this at the top for testing + +#include +#include +#include +#include + +using namespace amrex; + +int main (int argc, char* argv[]) +{ + amrex::Initialize(argc, argv); + { + BL_PROFILE("main"); + + AMREX_D_TERM(int n_cell_x = 64;, + int n_cell_y = 32;, + int n_cell_z = 128); + + AMREX_D_TERM(int max_grid_size_x = 32;, + int max_grid_size_y = 32;, + int max_grid_size_z = 32); + + AMREX_D_TERM(Real prob_lo_x = 0.;, + Real prob_lo_y = 0.;, + Real prob_lo_z = 0.); + AMREX_D_TERM(Real prob_hi_x = 1.;, + Real prob_hi_y = 1.;, + Real prob_hi_z = 1.); + + { + ParmParse pp; + AMREX_D_TERM(pp.query("n_cell_x", n_cell_x);, + pp.query("n_cell_y", n_cell_y);, + pp.query("n_cell_z", n_cell_z)); + AMREX_D_TERM(pp.query("max_grid_size_x", max_grid_size_x);, + pp.query("max_grid_size_y", max_grid_size_y);, + pp.query("max_grid_size_z", max_grid_size_z)); + } + + Box domain(IntVect(0),IntVect(AMREX_D_DECL(n_cell_x-1,n_cell_y-1,n_cell_z-1))); + BoxArray ba(domain); + ba.maxSize(IntVect(AMREX_D_DECL(max_grid_size_x, + max_grid_size_y, + max_grid_size_z))); + DistributionMapping dm(ba); + + Geometry geom; + { + geom.define(domain, + RealBox(AMREX_D_DECL(prob_lo_x,prob_lo_y,prob_lo_z), + AMREX_D_DECL(prob_hi_x,prob_hi_y,prob_hi_z)), + CoordSys::cartesian, {AMREX_D_DECL(1,1,1)}); + } + auto const& dx = geom.CellSizeArray(); + + MultiFab rhs(ba,dm,1,0); + MultiFab soln(ba,dm,1,0); + auto const& rhsma = rhs.arrays(); + ParallelFor(rhs, [=] AMREX_GPU_DEVICE (int b, int i, int j, int k) + { + AMREX_D_TERM(Real x = (i+0.5_rt) * dx[0] - 0.5_rt;, + Real y = (j+0.5_rt) * dx[1] - 0.5_rt;, + Real z = (k+0.5_rt) * dx[2] - 0.5_rt); + rhsma[b](i,j,k) = std::exp(-10._rt* + (AMREX_D_TERM(x*x*1.05_rt, + y*y*0.90_rt, + z*z))); + }); + + // Shift rhs so that its sum is zero. + auto rhosum = rhs.sum(0); + rhs.plus(-rhosum/geom.Domain().d_numPts(), 0, 1); + +#if (AMREX_SPACEDIM == 3) + Array solvers{0,1}; +#else + Array solvers{0}; +#endif + + for (int solver_type : solvers) { + double tsetup, tsolve; + if (solver_type == 0) { + auto t0 = amrex::second(); + FFT::Poisson fft_poisson(geom); + auto t1 = amrex::second(); + tsetup = t1-t0; + + for (int n = 0; n < 2; ++n) { + auto ta = amrex::second(); + fft_poisson.solve(soln, rhs); + auto tb = amrex::second(); + tsolve = tb-ta; + } + } else { + auto t0 = amrex::second(); + FFT::PoissonHybrid fft_poisson(geom); + auto t1 = amrex::second(); + tsetup = t1-t0; + + for (int n = 0; n < 2; ++n) { + auto ta = amrex::second(); + fft_poisson.solve(soln, rhs); + auto tb = amrex::second(); + tsolve = tb-ta; + } + } + + amrex::Print() << " AMReX FFT setup time: " << tsetup + << ", solve time " << tsolve << "\n"; + + MultiFab phi(soln.boxArray(), soln.DistributionMap(), 1, 1); + MultiFab res(soln.boxArray(), soln.DistributionMap(), 1, 0); + MultiFab::Copy(phi, soln, 0, 0, 1, 0); + phi.FillBoundary(geom.periodicity()); + auto const& res_ma = res.arrays(); + auto const& phi_ma = phi.const_arrays(); + auto const& rhs_ma = rhs.const_arrays(); + ParallelFor(res, [=] AMREX_GPU_DEVICE (int b, int i, int j, int k) + { + auto const& phia = phi_ma[b]; + auto lap = (phia(i-1,j,k)-2._rt*phia(i,j,k)+phia(i+1,j,k)) / (dx[0]*dx[0]); +#if (AMREX_SPACEDIM >= 2) + lap += (phia(i,j-1,k)-2._rt*phia(i,j,k)+phia(i,j+1,k)) / (dx[1]*dx[1]); +#endif +#if (AMREX_SPACEDIM == 3) + if ((solver_type == 1) && (k == 0)) { // Neumann + lap += (-phia(i,j,k)+phia(i,j,k+1)) / (dx[2]*dx[2]); + } else if ((solver_type == 1) && ((k+1) == n_cell_z)) { // Neumann + lap += (phia(i,j,k-1)-phia(i,j,k)) / (dx[2]*dx[2]); + } else { + lap += (phia(i,j,k-1)-2._rt*phia(i,j,k)+phia(i,j,k+1)) / (dx[2]*dx[2]); + } +#endif + res_ma[b](i,j,k) = rhs_ma[b](i,j,k) - lap; + }); + auto bnorm = rhs.norminf(); + auto rnorm = res.norminf(); + amrex::Print() << " rhs inf norm " << bnorm << "\n" + << " res inf norm " << rnorm << "\n"; +#ifdef AMREX_USE_FLOAT + auto eps = 2.e-3f; +#else + auto eps = 1.e-11; +#endif + AMREX_ALWAYS_ASSERT(rnorm < eps*bnorm); + } + } + amrex::Finalize(); +} diff --git a/Tests/FFT/R2C/CMakeLists.txt b/Tests/FFT/R2C/CMakeLists.txt new file mode 100644 index 00000000000..21a9d3b2681 --- /dev/null +++ b/Tests/FFT/R2C/CMakeLists.txt @@ -0,0 +1,10 @@ +foreach(D IN LISTS AMReX_SPACEDIM) + set(_sources main.cpp) + + set(_input_files) + + setup_test(${D} _sources _input_files) + + unset(_sources) + unset(_input_files) +endforeach() diff --git a/Tests/FFT/R2C/GNUmakefile b/Tests/FFT/R2C/GNUmakefile new file mode 100644 index 00000000000..93376f44852 --- /dev/null +++ b/Tests/FFT/R2C/GNUmakefile @@ -0,0 +1,26 @@ +AMREX_HOME := ../../.. + +DEBUG = FALSE + +DIM = 3 + +COMP = gcc + +USE_MPI = TRUE +USE_OMP = FALSE +USE_CUDA = FALSE +USE_HIP = FALSE +USE_SYCL = FALSE + +USE_FFT = TRUE + +BL_NO_FORT = TRUE + +TINY_PROFILE = FALSE + +include $(AMREX_HOME)/Tools/GNUMake/Make.defs + +include ./Make.package +include $(AMREX_HOME)/Src/Base/Make.package + +include $(AMREX_HOME)/Tools/GNUMake/Make.rules diff --git a/Tests/FFT/R2C/Make.package b/Tests/FFT/R2C/Make.package new file mode 100644 index 00000000000..6b4b865e8fc --- /dev/null +++ b/Tests/FFT/R2C/Make.package @@ -0,0 +1 @@ +CEXE_sources += main.cpp diff --git a/Tests/FFT/R2C/main.cpp b/Tests/FFT/R2C/main.cpp new file mode 100644 index 00000000000..71030385759 --- /dev/null +++ b/Tests/FFT/R2C/main.cpp @@ -0,0 +1,126 @@ +#include // Put this at the top for testing + +#include +#include +#include +#include + +using namespace amrex; + +int main (int argc, char* argv[]) +{ + amrex::Initialize(argc, argv); + { + BL_PROFILE("main"); + + AMREX_D_TERM(int n_cell_x = 128;, + int n_cell_y = 32;, + int n_cell_z = 64); + + AMREX_D_TERM(int max_grid_size_x = 32;, + int max_grid_size_y = 32;, + int max_grid_size_z = 32); + + AMREX_D_TERM(Real prob_lo_x = 0.;, + Real prob_lo_y = 0.;, + Real prob_lo_z = 0.); + AMREX_D_TERM(Real prob_hi_x = 1.;, + Real prob_hi_y = 1.;, + Real prob_hi_z = 1.); + + { + ParmParse pp; + AMREX_D_TERM(pp.query("n_cell_x", n_cell_x);, + pp.query("n_cell_y", n_cell_y);, + pp.query("n_cell_z", n_cell_z)); + AMREX_D_TERM(pp.query("max_grid_size_x", max_grid_size_x);, + pp.query("max_grid_size_y", max_grid_size_y);, + pp.query("max_grid_size_z", max_grid_size_z)); + } + + Box domain(IntVect(0),IntVect(AMREX_D_DECL(n_cell_x-1,n_cell_y-1,n_cell_z-1))); + BoxArray ba(domain); + ba.maxSize(IntVect(AMREX_D_DECL(max_grid_size_x, + max_grid_size_y, + max_grid_size_z))); + DistributionMapping dm(ba); + + Geometry geom; + { + geom.define(domain, + RealBox(AMREX_D_DECL(prob_lo_x,prob_lo_y,prob_lo_z), + AMREX_D_DECL(prob_hi_x,prob_hi_y,prob_hi_z)), + CoordSys::cartesian, {AMREX_D_DECL(1,1,1)}); + } + auto const& dx = geom.CellSizeArray(); + + MultiFab mf(ba,dm,1,0); + auto const& ma = mf.arrays(); + ParallelFor(mf, [=] AMREX_GPU_DEVICE (int b, int i, int j, int k) + { + AMREX_D_TERM(Real x = (i+0.5_rt) * dx[0] - 0.5_rt;, + Real y = (j+0.5_rt) * dx[1] - 0.5_rt;, + Real z = (k+0.5_rt) * dx[2] - 0.5_rt); + ma[b](i,j,k) = std::exp(-10._rt* + (AMREX_D_TERM(x*x*1.05_rt, + y*y*0.90_rt, + z*z))); + }); + + MultiFab mf2(ba,dm,1,0); + + auto scaling = Real(1) / Real(geom.Domain().d_numPts()); + + { + cMultiFab cmf(ba,dm,1,0); + + // forward + { + FFT::R2C r2c(geom.Domain()); + r2c.forward(mf,cmf); + } + + // backward + { + FFT::R2C r2c(geom.Domain()); + r2c.backward(cmf,mf2); + } + + auto const& ma2 = mf2.arrays(); + ParallelFor(mf2, [=] AMREX_GPU_DEVICE (int b, int i, int j, int k) + { + ma2[b](i,j,k) = ma[b](i,j,k) - ma2[b](i,j,k)*scaling; + }); + + auto error = mf2.norminf(); + amrex::Print() << " Expected to be close to zero: " << error << "\n"; +#ifdef AMREX_USE_FLOAT + auto eps = 1.e-6f; +#else + auto eps = 1.e-13; +#endif + AMREX_ALWAYS_ASSERT(error < eps); + } + + mf2.setVal(std::numeric_limits::max()); + + { // forward and backward + FFT::R2C r2c(geom.Domain()); + r2c.forwardThenBackward(mf, mf2, + [=] AMREX_GPU_DEVICE (int, int, int, auto& sp) + { + sp *= scaling; + }); + + MultiFab::Subtract(mf2, mf, 0, 0, 1, 0); + + auto error = mf2.norminf(); + amrex::Print() << " Expected to be close to zero: " << error << "\n"; +#ifdef AMREX_USE_FLOAT + auto eps = 1.e-6f; +#else + auto eps = 1.e-13; +#endif + AMREX_ALWAYS_ASSERT(error < eps); + } + } + amrex::Finalize(); +} diff --git a/Tools/CMake/AMReXConfig.cmake.in b/Tools/CMake/AMReXConfig.cmake.in index 6ae09ba0a00..96fb12cbf77 100644 --- a/Tools/CMake/AMReXConfig.cmake.in +++ b/Tools/CMake/AMReXConfig.cmake.in @@ -76,6 +76,7 @@ set(AMReX_FINTERFACES_FOUND @AMReX_FORTRAN_INTERFACES@) set(AMReX_LSOLVERS_FOUND @AMReX_LINEAR_SOLVERS@) set(AMReX_LSOLVERS_INCFLO_FOUND @AMReX_LINEAR_SOLVERS_INCFLO@) set(AMReX_LSOLVERS_EM_FOUND @AMReX_LINEAR_SOLVERS_EM@) +set(AMReX_FFT_FOUND @AMReX_FFT@) set(AMReX_AMRDATA_FOUND @AMReX_AMRDATA@) set(AMReX_PARTICLES_FOUND @AMReX_PARTICLES@) set(AMReX_P@AMReX_PARTICLES_PRECISION@_FOUND ON) @@ -133,6 +134,7 @@ set(AMReX_FINTERFACES @AMReX_FORTRAN_INTERFACES@) set(AMReX_LSOLVERS @AMReX_LINEAR_SOLVERS@) set(AMReX_LSOLVERS_INCFLO @AMReX_LINEAR_SOLVERS_INCFLO@) set(AMReX_LSOLVERS_EM @AMReX_LINEAR_SOLVERS_EM@) +set(AMReX_FFT @AMReX_FFT@) set(AMReX_AMRDATA @AMReX_AMRDATA@) set(AMReX_PARTICLES @AMReX_PARTICLES@) set(AMReX_PARTICLES_PRECISION @AMReX_PARTICLES_PRECISION@) @@ -216,6 +218,12 @@ if (@AMReX_CONDUIT@) find_dependency(Conduit REQUIRED) endif () +if (@AMReX_FFT@) + if (@AMReX_GPU_BACKEND@ STREQUAL NONE) + find_dependency(AMReXFFTW REQUIRED) + endif() +endif() + if (@AMReX_HDF5@) find_dependency(HDF5 REQUIRED) endif () diff --git a/Tools/CMake/AMReXOptions.cmake b/Tools/CMake/AMReXOptions.cmake index 382459e38f1..a7863f125e3 100644 --- a/Tools/CMake/AMReXOptions.cmake +++ b/Tools/CMake/AMReXOptions.cmake @@ -294,6 +294,9 @@ cmake_dependent_option( AMReX_LINEAR_SOLVERS_EM "AMReX_LINEAR_SOLVERS" OFF) print_option( AMReX_LINEAR_SOLVERS_EM ) +option( AMReX_FFT "Build AMReX FFT" OFF ) +print_option( AMReX_FFT ) + option( AMReX_AMRDATA "Build data services" OFF ) print_option( AMReX_AMRDATA ) diff --git a/Tools/CMake/AMReXThirdPartyLibraries.cmake b/Tools/CMake/AMReXThirdPartyLibraries.cmake index abe62a2ebc9..b8ad503e83f 100644 --- a/Tools/CMake/AMReXThirdPartyLibraries.cmake +++ b/Tools/CMake/AMReXThirdPartyLibraries.cmake @@ -1,3 +1,27 @@ +# +# FFT +# +if (AMReX_FFT) + if (AMReX_CUDA) + find_package(CUDAToolkit REQUIRED) + foreach(D IN LISTS AMReX_SPACEDIM) + target_link_libraries(amrex_${D}d PUBLIC CUDA::cufft) + endforeach() + elseif (AMReX_HIP) + find_package(rocfft REQUIRED) + foreach(D IN LISTS AMReX_SPACEDIM) + target_link_libraries(amrex_${D}d PUBLIC roc::rocfft) + endforeach() + elseif (AMReX_SYCL) + # nothing to do + else() + find_package(AMReXFFTW REQUIRED) + foreach(D IN LISTS AMReX_SPACEDIM) + target_link_libraries(amrex_${D}d PUBLIC AMReX::FFTW) + endforeach() + endif() +endif() + # # HDF5 -- here it would be best to create an imported target # diff --git a/Tools/CMake/AMReX_Config_ND.H.in b/Tools/CMake/AMReX_Config_ND.H.in index 3296a403ff0..07e3b7fd637 100644 --- a/Tools/CMake/AMReX_Config_ND.H.in +++ b/Tools/CMake/AMReX_Config_ND.H.in @@ -39,6 +39,7 @@ #cmakedefine BL_FORT_USE_LOWERCASE #cmakedefine BL_FORT_USE_UPPERCASE #cmakedefine BL_NO_FORT +#cmakedefine AMREX_USE_FFT #cmakedefine AMREX_USE_SENSEI_INSITU #cmakedefine AMREX_NO_SENSEI_AMR_INST #cmakedefine AMREX_USE_CONDUIT diff --git a/Tools/CMake/FindAMReXFFTW.cmake b/Tools/CMake/FindAMReXFFTW.cmake new file mode 100644 index 00000000000..678743a08bb --- /dev/null +++ b/Tools/CMake/FindAMReXFFTW.cmake @@ -0,0 +1,51 @@ +#[=======================================================================[: +FindAMReXFFTW +------- + +Finds the FFTW library. + +Imported Targets +^^^^^^^^^^^^^^^^ + +This module provides the following imported target, if found: + +``FFTW`` + The FFTW library + +Result Variables +^^^^^^^^^^^^^^^^ + +This will define the following variables: + +``AMReXFFTW_FOUND`` + True if the hypre library has been found. +``FFTW_INCLUDES`` + Include directories needed to use FFTW. +``FFTW_LIBRARIES`` + Libraries needed to link to FFTW. + +This will also create an imported target, AMReX::FFTW. +#]=======================================================================] + +if (NOT FFTW_INCLUDES) + find_path(FFTW_INCLUDES NAMES "fftw3.h" HINTS ${FFTW_ROOT}/include) +endif() + +if (NOT FFTW_LIBRARIES) + find_library(FFTW_LIBRARY NAMES "fftw3" HINTS ${FFTW_ROOT}/lib) + find_library(FFTWF_LIBRARY NAMES "fftw3f" HINTS ${FFTW_ROOT}/lib) + set(FFTW_LIBRARIES ${FFTW_LIBRARY} ${FFTWF_LIBRARY}) +endif() + +include(FindPackageHandleStandardArgs) + +find_package_handle_standard_args(AMReXFFTW + REQUIRED_VARS FFTW_LIBRARIES FFTW_INCLUDES) + +mark_as_advanced(FFTW_LIBRARIES FFTW_INCLUDES) + +# Create imported target +add_library(AMReX::FFTW INTERFACE IMPORTED GLOBAL) +target_link_libraries(AMReX::FFTW INTERFACE ${FFTW_LIBRARIES}) +set_target_properties(AMReX::FFTW PROPERTIES + INTERFACE_INCLUDE_DIRECTORIES "${FFTW_INCLUDES}") diff --git a/Tools/GNUMake/Make.defs b/Tools/GNUMake/Make.defs index f33911ed3aa..5c678bdcc99 100644 --- a/Tools/GNUMake/Make.defs +++ b/Tools/GNUMake/Make.defs @@ -112,6 +112,12 @@ else DEBUG := FALSE endif +ifdef USE_FFT + USE_FFT := $(strip $(USE_FFT)) +else + USE_FFT := FALSE +endif + ifdef PROFILE PROFILE := $(strip $(PROFILE)) else @@ -604,6 +610,28 @@ else DebugSuffix := endif +ifeq ($(USE_FFT),TRUE) + include $(AMREX_HOME)/Src/FFT/Make.package + ifeq ($(USE_CUDA),TRUE) + LIBRARIES += -lcufft + else ifeq ($(USE_HIP),TRUE) + # Use rocFFT. ROC_PATH is defined in hip.mak + SYSTEM_INCLUDE_LOCATIONS += $(ROC_PATH)/rocfft/include + LIBRARY_LOCATIONS += $(ROC_PATH)/rocfft/lib + LIBRARIES += -Wl,--rpath=$(ROC_PATH)/rocfft/lib -lrocfft + else ifeq ($(USE_SYCL),TRUE) + # nothing + else + FFTW_HOME ?= NOT_SET + ifneq ($(FFTW_HOME),NOT_SET) + SYSTEM_INCLUDE_LOCATIONS += $(FFTW_HOME)/include + LIBRARY_LOCATIONS += $(FFTW_HOME)/lib + LIBRARIES += -Wl,--rpath=$(FFTW_HOME)/lib + endif + LIBRARIES += -lfftw3f -lfftw3 + endif +endif + ifeq ($(USE_PROFPARSER),TRUE) PROFILE := TRUE TRACE_PROFILE := TRUE @@ -918,6 +946,10 @@ ifeq ($(USE_PARTICLES),TRUE) DEFINES += -DAMREX_PARTICLES endif +ifeq ($(USE_FFT),TRUE) + DEFINES += -DAMREX_USE_FFT +endif + ifeq ($(USE_EB),TRUE) DEFINES += -DAMREX_USE_EB endif diff --git a/Tools/GNUMake/comps/hip.mak b/Tools/GNUMake/comps/hip.mak index 87bb3e93f59..26dff7f94ff 100644 --- a/Tools/GNUMake/comps/hip.mak +++ b/Tools/GNUMake/comps/hip.mak @@ -119,8 +119,8 @@ ifeq ($(HIP_COMPILER),clang) endif # Generic HIP info - ROC_PATH=$(realpath $(dir $(HIP_PATH))) - SYSTEM_INCLUDE_LOCATIONS += $(ROC_PATH)/include $(HIP_PATH)/include + ROC_PATH=$(realpath $(HIP_PATH)) + SYSTEM_INCLUDE_LOCATIONS += $(ROC_PATH)/include # rocRand SYSTEM_INCLUDE_LOCATIONS += $(ROC_PATH)/include/hiprand $(ROC_PATH)/include/rocrand diff --git a/Tools/libamrex/configure.py b/Tools/libamrex/configure.py index 8988c18b965..873b575fe4f 100755 --- a/Tools/libamrex/configure.py +++ b/Tools/libamrex/configure.py @@ -57,6 +57,10 @@ def configure(argv): help="Enable AMReX Fortran API [default=yes]", choices=["yes","no"], default="yes") + parser.add_argument("--enable-fft", + help="Enable AMReX FFT [default=no]", + choices=["yes","no"], + default="no") parser.add_argument("--enable-linear-solver", help="Enable AMReX linear solvers [default=yes]", choices=["yes","no"], @@ -151,6 +155,7 @@ def configure(argv): f.write("DEBUG = {}\n".format("TRUE" if args.debug == "yes" else "FALSE")) f.write("USE_PARTICLES = {}\n".format("FALSE" if args.enable_particle == "no" else "TRUE")) f.write("USE_FORTRAN_INTERFACE = {}\n".format("FALSE" if args.enable_fortran_api == "no" else "TRUE")) + f.write("USE_FFT = {}\n".format("TRUE" if args.enable_fft == "yes" else "FALSE")) f.write("USE_LINEAR_SOLVERS = {}\n".format("FALSE" if args.enable_linear_solver == "no" else "TRUE")) f.write("USE_LINEAR_SOLVERS_INCFLO = {}\n".format("FALSE" if args.enable_linear_solver_incflo == "no" else "TRUE")) f.write("USE_LINEAR_SOLVERS_EM = {}\n".format("FALSE" if args.enable_linear_solver_em == "no" else "TRUE"))