diff --git a/.github/workflows/clang.yml b/.github/workflows/clang.yml index bdd629ce11f..afd37544c12 100644 --- a/.github/workflows/clang.yml +++ b/.github/workflows/clang.yml @@ -12,13 +12,13 @@ jobs: # Build and install libamrex as AMReX CMake project # Note: this is an intentional "minimal" build that does not enable (many) options library_clang: - name: Clang@6.0 C++14 SP NOMPI Debug [lib] - runs-on: ubuntu-18.04 - env: {CXXFLAGS: "-fno-operator-names -Werror -Wall -Wextra -Wpedantic -Wnull-dereference -Wfloat-conversion -Wshadow -Woverloaded-virtual -Wextra-semi -Wunreachable-code -Wno-c++17-extensions"} + name: Clang@7.0 C++17 SP NOMPI Debug [lib] + runs-on: ubuntu-20.04 + env: {CXXFLAGS: "-fno-operator-names -Werror -Wall -Wextra -Wpedantic -Wnull-dereference -Wfloat-conversion -Wshadow -Woverloaded-virtual -Wextra-semi -Wunreachable-code -Wnon-virtual-dtor"} steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v3 - name: Dependencies - run: .github/workflows/dependencies/dependencies_clang6.sh + run: .github/workflows/dependencies/dependencies_clang7.sh - name: Build & Install run: | mkdir build @@ -34,7 +34,6 @@ jobs: -DAMReX_PLOTFILE_TOOLS=ON \ -DAMReX_PRECISION=SINGLE \ -DAMReX_PARTICLES_PRECISION=SINGLE \ - -DCMAKE_CXX_STANDARD=14 \ -DCMAKE_C_COMPILER=$(which clang) \ -DCMAKE_CXX_COMPILER=$(which clang++) \ -DCMAKE_Fortran_COMPILER=$(which gfortran) @@ -48,14 +47,14 @@ jobs: ctest --output-on-failure tests_clang: - name: Clang@6.0 C++14 SP Particles DP Mesh Debug [tests] - runs-on: ubuntu-18.04 - env: {CXXFLAGS: "-fno-operator-names -Werror -Wall -Wextra -Wpedantic -Wnull-dereference -Wfloat-conversion -Wshadow -Woverloaded-virtual -Wextra-semi -Wunreachable-code -Wno-c++17-extensions -O1"} + name: Clang@7.0 C++17 SP Particles DP Mesh Debug [tests] + runs-on: ubuntu-20.04 + env: {CXXFLAGS: "-fno-operator-names -Werror -Wall -Wextra -Wpedantic -Wnull-dereference -Wfloat-conversion -Wshadow -Woverloaded-virtual -Wextra-semi -Wunreachable-code -O1 -Wnon-virtual-dtor"} # It's too slow with -O0 steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v3 - name: Dependencies - run: .github/workflows/dependencies/dependencies_clang6.sh + run: .github/workflows/dependencies/dependencies_clang7.sh - name: Build & Install run: | mkdir build @@ -70,7 +69,6 @@ jobs: -DAMReX_PARTICLES=ON \ -DAMReX_PRECISION=DOUBLE \ -DAMReX_PARTICLES_PRECISION=SINGLE \ - -DCMAKE_CXX_STANDARD=14 \ -DCMAKE_C_COMPILER=$(which clang) \ -DCMAKE_CXX_COMPILER=$(which clang++) \ -DCMAKE_Fortran_COMPILER=$(which gfortran) @@ -80,14 +78,14 @@ jobs: # Build 2D libamrex with configure configure-2d: - name: Clang@6.0 NOMPI Release [configure 2D] - runs-on: ubuntu-18.04 + name: Clang@7.0 NOMPI Release [configure 2D] + runs-on: ubuntu-20.04 steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v3 - name: Dependencies - run: .github/workflows/dependencies/dependencies_clang6.sh + run: .github/workflows/dependencies/dependencies_clang7.sh - name: Build & Install run: | ./configure --dim 2 --with-fortran no --comp llvm --with-mpi no - make -j2 WARN_ALL=TRUE WARN_ERROR=TRUE XTRA_CXXFLAGS="-fno-operator-names -Wno-c++17-extensions" + make -j2 WARN_ALL=TRUE WARN_ERROR=TRUE XTRA_CXXFLAGS="-fno-operator-names" make install diff --git a/.github/workflows/cuda.yml b/.github/workflows/cuda.yml index c5fbceb5d7e..98a2b001760 100644 --- a/.github/workflows/cuda.yml +++ b/.github/workflows/cuda.yml @@ -7,44 +7,13 @@ concurrency: cancel-in-progress: true jobs: - # Build libamrex and all tests with CUDA 10.2 
- tests-cuda10: - name: CUDA@10.2 GNU@6.5.0 C++14 Release [tests] - runs-on: ubuntu-18.04 - env: {CXXFLAGS: "-fno-operator-names -Werror -Wall -Wextra -Wpedantic -Wnull-dereference -Wfloat-conversion -Wshadow -Woverloaded-virtual -Wunreachable-code"} - steps: - - uses: actions/checkout@v2 - - name: Dependencies - run: .github/workflows/dependencies/dependencies_nvcc10.sh - - name: Build & Install - run: | - export PATH=/usr/local/nvidia/bin:/usr/local/cuda/bin:${PATH} - export LD_LIBRARY_PATH=/usr/local/nvidia/lib:/usr/local/nvidia/lib64:/usr/local/cuda/lib64:${LD_LIBRARY_PATH} - which nvcc || echo "nvcc not in PATH!" - mkdir build - cd build - cmake .. \ - -DCMAKE_VERBOSE_MAKEFILE=ON \ - -DAMReX_EB=OFF \ - -DAMReX_ENABLE_TESTS=ON \ - -DAMReX_FORTRAN=OFF \ - -DAMReX_PARTICLES=ON \ - -DAMReX_GPU_BACKEND=CUDA \ - -DCMAKE_C_COMPILER=$(which gcc-6) \ - -DCMAKE_CXX_COMPILER=$(which g++-6) \ - -DCMAKE_CUDA_HOST_COMPILER=$(which g++-6) \ - -DCMAKE_Fortran_COMPILER=$(which gfortran-6) \ - -DAMReX_CUDA_ARCH=7.0 \ - -DAMReX_CUDA_ERROR_CROSS_EXECUTION_SPACE_CALL=ON - make -j 2 - # Build libamrex and all tests with CUDA 11.0.2 (recent supported) tests-cuda11: name: CUDA@11.2 GNU@9.3.0 C++17 Release [tests] runs-on: ubuntu-20.04 - env: {CXXFLAGS: "-fno-operator-names -Werror -Wall -Wextra -Wpedantic -Wnull-dereference -Wfloat-conversion -Wshadow -Woverloaded-virtual -Wextra-semi -Wunreachable-code"} + env: {CXXFLAGS: "-fno-operator-names -Werror -Wall -Wextra -Wpedantic -Wnull-dereference -Wfloat-conversion -Wshadow -Woverloaded-virtual -Wextra-semi -Wunreachable-code -Wnon-virtual-dtor -Wlogical-op -Wmisleading-indentation -Wduplicated-cond -Wduplicated-branches"} steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v3 - name: Dependencies run: .github/workflows/dependencies/dependencies_nvcc11.sh - name: Build & Install @@ -64,9 +33,7 @@ jobs: -DCMAKE_CXX_COMPILER=$(which g++) \ -DCMAKE_CUDA_HOST_COMPILER=$(which g++) \ -DCMAKE_Fortran_COMPILER=$(which gfortran) \ - -DCMAKE_CUDA_STANDARD=17 \ - -DCMAKE_CXX_STANDARD=17 \ - -DAMReX_CUDA_ARCH=8.0 \ + -DAMReX_CUDA_ARCH=7.0 \ -DAMReX_CUDA_ERROR_CROSS_EXECUTION_SPACE_CALL=ON \ -DAMReX_CUDA_ERROR_CAPTURE_THIS=ON @@ -78,7 +45,7 @@ jobs: runs-on: ubuntu-20.04 env: {CXXFLAGS: "-Werror -Wall -Wextra -Wpedantic -Wshadow"} steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v3 - name: Dependencies run: .github/workflows/dependencies/dependencies_nvhpc21-11.sh - name: Build & Install @@ -106,8 +73,6 @@ jobs: -DCMAKE_CXX_COMPILER=$(which nvc++) \ -DCMAKE_CUDA_HOST_COMPILER=$(which nvc++) \ -DCMAKE_Fortran_COMPILER=$(which nvfortran) \ - -DCMAKE_CUDA_STANDARD=17 \ - -DCMAKE_CXX_STANDARD=17 \ -DAMReX_CUDA_ARCH=8.0 \ -DAMReX_CUDA_ERROR_CROSS_EXECUTION_SPACE_CALL=ON \ -DAMReX_CUDA_ERROR_CAPTURE_THIS=ON @@ -119,12 +84,12 @@ jobs: name: CUDA@11.2 GNU@9.3.0 [configure 3D] runs-on: ubuntu-20.04 steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v3 - name: Dependencies run: .github/workflows/dependencies/dependencies_nvcc11.sh - name: Build & Install run: | export PATH=/usr/local/nvidia/bin:/usr/local/cuda/bin:${PATH} ./configure --dim 3 --with-cuda yes --enable-eb yes --enable-xsdk-defaults yes --with-fortran no - make -j2 WARN_ALL=TRUE WARN_ERROR=TRUE XTRA_CXXFLAGS=-fno-operator-names CXXSTD=c++17 + make -j2 WARN_ALL=TRUE WARN_ERROR=TRUE XTRA_CXXFLAGS=-fno-operator-names make install diff --git a/.github/workflows/dependencies/dependencies.sh b/.github/workflows/dependencies/dependencies.sh index d0e86e99c0a..c9bb080831c 100755 --- 
a/.github/workflows/dependencies/dependencies.sh +++ b/.github/workflows/dependencies/dependencies.sh @@ -1,6 +1,6 @@ #!/usr/bin/env bash # -# Copyright 2020 The AMReX Community +# Copyright 2020-2022 The AMReX Community # # License: BSD-3-Clause-LBNL # Authors: Axel Huebl diff --git a/.github/workflows/dependencies/dependencies_clang6.sh b/.github/workflows/dependencies/dependencies_clang7.sh similarity index 73% rename from .github/workflows/dependencies/dependencies_clang6.sh rename to .github/workflows/dependencies/dependencies_clang7.sh index 19b348b920b..85396a2f73c 100755 --- a/.github/workflows/dependencies/dependencies_clang6.sh +++ b/.github/workflows/dependencies/dependencies_clang7.sh @@ -1,6 +1,6 @@ #!/usr/bin/env bash # -# Copyright 2020 The AMReX Community +# Copyright 2020-2022 The AMReX Community # # License: BSD-3-Clause-LBNL # Authors: Axel Huebl @@ -11,4 +11,4 @@ sudo apt-get update sudo apt-get install -y \ build-essential \ - clang gfortran + clang-7 gfortran diff --git a/.github/workflows/dependencies/dependencies_gcc8.sh b/.github/workflows/dependencies/dependencies_gcc8.sh new file mode 100755 index 00000000000..c216e6a8c51 --- /dev/null +++ b/.github/workflows/dependencies/dependencies_gcc8.sh @@ -0,0 +1,17 @@ +#!/usr/bin/env bash +# +# Copyright 2020-2022 The AMReX Community +# +# License: BSD-3-Clause-LBNL +# Authors: Axel Huebl + +set -eu -o pipefail + +sudo add-apt-repository ppa:ubuntu-toolchain-r/test +sudo apt-get update + +sudo apt-get install -y --no-install-recommends \ + build-essential \ + g++-8 gfortran-8 \ + libopenmpi-dev \ + openmpi-bin diff --git a/.github/workflows/dependencies/dependencies_nofortran.sh b/.github/workflows/dependencies/dependencies_nofortran.sh index 36d759f66fa..61089ad8bf7 100755 --- a/.github/workflows/dependencies/dependencies_nofortran.sh +++ b/.github/workflows/dependencies/dependencies_nofortran.sh @@ -1,8 +1,9 @@ #!/usr/bin/env bash # -# Copyright 2020 Axel Huebl +# Copyright 2020-2022 The AMReX Community # # License: BSD-3-Clause-LBNL +# Authors: Axel Huebl # search recursive inside a folder if a file contains tabs # diff --git a/.github/workflows/dependencies/dependencies_nvcc10.sh b/.github/workflows/dependencies/dependencies_nvcc10.sh deleted file mode 100755 index 591dd04d79b..00000000000 --- a/.github/workflows/dependencies/dependencies_nvcc10.sh +++ /dev/null @@ -1,30 +0,0 @@ -#!/usr/bin/env bash -# -# Copyright 2020 Axel Huebl -# -# License: BSD-3-Clause-LBNL - -set -eu -o pipefail - -sudo apt-get update - -sudo apt-get install -y --no-install-recommends\ - build-essential \ - g++-6 \ - gfortran-6 \ - libopenmpi-dev \ - openmpi-bin - -sudo apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64/3bf863cc.pub -echo "deb https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64 /" \ - | sudo tee /etc/apt/sources.list.d/cuda.list -sudo apt-get update -sudo apt-get install -y \ - cuda-command-line-tools-10-2 \ - cuda-compiler-10-2 \ - cuda-cupti-dev-10-2 \ - cuda-minimal-build-10-2 \ - cuda-nvml-dev-10-2 \ - cuda-nvtx-10-2 \ - cuda-curand-dev-10-2 -sudo ln -s cuda-10.2 /usr/local/cuda diff --git a/.github/workflows/dependencies/dependencies_nvcc11.sh b/.github/workflows/dependencies/dependencies_nvcc11.sh index 79c8c6c31f6..a4b2f335a99 100755 --- a/.github/workflows/dependencies/dependencies_nvcc11.sh +++ b/.github/workflows/dependencies/dependencies_nvcc11.sh @@ -1,6 +1,6 @@ #!/usr/bin/env bash # -# Copyright 2020 Axel Huebl +# Copyright 2020-2022 Axel Huebl 
# # License: BSD-3-Clause-LBNL @@ -19,9 +19,8 @@ sudo apt-get install -y \ pkg-config \ wget -sudo apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64/3bf863cc.pub -echo "deb https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64 /" \ - | sudo tee /etc/apt/sources.list.d/cuda.list +curl -O https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/cuda-keyring_1.0-1_all.deb +sudo dpkg -i cuda-keyring_1.0-1_all.deb sudo apt-get update sudo apt-get install -y \ cuda-command-line-tools-11-2 \ diff --git a/.github/workflows/docs.yml b/.github/workflows/docs.yml index c0d50aa99e1..82e387cbff4 100644 --- a/.github/workflows/docs.yml +++ b/.github/workflows/docs.yml @@ -11,7 +11,7 @@ jobs: runs-on: ubuntu-latest steps: - name: Checkout - uses: actions/checkout@v2.3.1 # If you're using actions/checkout@v2 you must set persist-credentials to false in most cases for the deployment to work correctly. + uses: actions/checkout@v3 with: persist-credentials: false diff --git a/.github/workflows/gcc.yml b/.github/workflows/gcc.yml index 188d7d32f95..32726a4767a 100644 --- a/.github/workflows/gcc.yml +++ b/.github/workflows/gcc.yml @@ -13,13 +13,13 @@ jobs: # Build and install libamrex as AMReX CMake project # Note: this is an intentional "minimal" build that does not enable (many) options library: - name: GNU@7.5 C++17 Release [lib] - runs-on: ubuntu-18.04 - env: {CXXFLAGS: "-fno-operator-names -Werror -Wall -Wextra -Wpedantic -Wnull-dereference -Wfloat-conversion -Wshadow -Woverloaded-virtual"} + name: GNU@8.4 C++17 Release [lib] + runs-on: ubuntu-20.04 + env: {CXXFLAGS: "-fno-operator-names -Werror -Wall -Wextra -Wpedantic -Wnull-dereference -Wfloat-conversion -Wshadow -Woverloaded-virtual -Wnon-virtual-dtor -Wlogical-op -Wmisleading-indentation -Wduplicated-cond -Wduplicated-branches"} steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v3 - name: Dependencies - run: .github/workflows/dependencies/dependencies.sh + run: .github/workflows/dependencies/dependencies_gcc8.sh - name: Build & Install run: | mkdir build @@ -29,7 +29,9 @@ jobs: -DAMReX_PLOTFILE_TOOLS=ON \ -DCMAKE_VERBOSE_MAKEFILE=ON \ -DCMAKE_INSTALL_PREFIX=/tmp/my-amrex \ - -DCMAKE_CXX_STANDARD=17 + -DCMAKE_C_COMPILER=$(which gcc-8) \ + -DCMAKE_CXX_COMPILER=$(which g++-8) \ + -DCMAKE_Fortran_COMPILER=$(which gfortran-8) make -j 2 make install make test_install @@ -41,12 +43,12 @@ jobs: # Build libamrex and all tests tests_build_3D: - name: GNU@7.5 C++14 3D Debug Fortran [tests] - runs-on: ubuntu-18.04 - env: {CXXFLAGS: "-fno-operator-names -Werror -Wall -Wextra -Wpedantic -Wnull-dereference -Wfloat-conversion -Wshadow -Woverloaded-virtual -O1"} + name: GNU@9.3 C++17 3D Debug Fortran [tests] + runs-on: ubuntu-20.04 + env: {CXXFLAGS: "-fno-operator-names -Werror -Wall -Wextra -Wpedantic -Wnull-dereference -Wfloat-conversion -Wshadow -Woverloaded-virtual -O1 -Wnon-virtual-dtor -Wlogical-op -Wmisleading-indentation -Wduplicated-cond -Wduplicated-branches"} # It's too slow with -O0 steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v3 - name: Dependencies run: .github/workflows/dependencies/dependencies.sh - name: Build & Install @@ -64,12 +66,12 @@ jobs: ctest --test-dir build --output-on-failure tests_build_2D: - name: GNU@7.5 C++14 2D Debug Fortran [tests] - runs-on: ubuntu-18.04 - env: {CXXFLAGS: "-fno-operator-names -Werror -Wall -Wextra -Wpedantic -Wnull-dereference -Wfloat-conversion -Wshadow -Woverloaded-virtual -O1"} + name: 
GNU@9.3 C++17 2D Debug Fortran [tests] + runs-on: ubuntu-20.04 + env: {CXXFLAGS: "-fno-operator-names -Werror -Wall -Wextra -Wpedantic -Wnull-dereference -Wfloat-conversion -Wshadow -Woverloaded-virtual -O1 -Wnon-virtual-dtor -Wlogical-op -Wmisleading-indentation -Wduplicated-cond -Wduplicated-branches"} # It's too slow with -O0 steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v3 - name: Dependencies run: .github/workflows/dependencies/dependencies.sh - name: Build & Install @@ -87,13 +89,13 @@ jobs: ctest --test-dir build --output-on-failure tests_build_1D: - name: GNU@7.5 C++14 1D Debug Fortran [tests] - runs-on: ubuntu-18.04 - env: {CXXFLAGS: "-fno-operator-names -Werror -Wall -Wextra -Wpedantic -Wnull-dereference -Wfloat-conversion -Wshadow -Woverloaded-virtual -O1"} + name: GNU@9.3 C++17 1D Debug Fortran [tests] + runs-on: ubuntu-20.04 + env: {CXXFLAGS: "-fno-operator-names -Werror -Wall -Wextra -Wpedantic -Wnull-dereference -Wfloat-conversion -Wshadow -Woverloaded-virtual -O1 -Wnon-virtual-dtor -Wlogical-op -Wmisleading-indentation -Wduplicated-cond -Wduplicated-branches"} # -Werror temporarily skipped until we have functional testing established # It's too slow with -O0 steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v3 - name: Dependencies run: .github/workflows/dependencies/dependencies.sh - name: Build & Install @@ -113,10 +115,10 @@ jobs: # Build libamrex and all tests tests_cxx20: name: GNU@10.1 C++20 OMP [tests] - runs-on: ubuntu-18.04 - env: {CXXFLAGS: "-fno-operator-names -Werror -Wall -Wextra -Wpedantic -Wnull-dereference -Wfloat-conversion -Wshadow -Woverloaded-virtual -Wextra-semi"} + runs-on: ubuntu-20.04 + env: {CXXFLAGS: "-fno-operator-names -Werror -Wall -Wextra -Wpedantic -Wnull-dereference -Wfloat-conversion -Wshadow -Woverloaded-virtual -Wextra-semi -Wnon-virtual-dtor -Wlogical-op -Wmisleading-indentation -Wduplicated-cond -Wduplicated-branches"} steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v3 - name: Dependencies run: .github/workflows/dependencies/dependencies_gcc10.sh - name: Build & Install @@ -145,13 +147,13 @@ jobs: # Build libamrex and all tests w/o MPI tests-nonmpi: - name: GNU@7.5 C++14 NOMPI [tests] - runs-on: ubuntu-18.04 - env: {CXXFLAGS: "-fno-operator-names -Werror -Wall -Wextra -Wpedantic -Wnull-dereference -Wfloat-conversion -Wshadow -Woverloaded-virtual"} + name: GNU@8.4 C++17 NOMPI [tests] + runs-on: ubuntu-20.04 + env: {CXXFLAGS: "-fno-operator-names -Werror -Wall -Wextra -Wpedantic -Wnull-dereference -Wfloat-conversion -Wshadow -Woverloaded-virtual -Wnon-virtual-dtor -Wlogical-op -Wmisleading-indentation -Wduplicated-cond -Wduplicated-branches"} steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v3 - name: Dependencies - run: .github/workflows/dependencies/dependencies.sh + run: .github/workflows/dependencies/dependencies_gcc8.sh - name: Build & Install run: | mkdir build @@ -167,18 +169,21 @@ jobs: -DAMReX_ENABLE_TESTS=ON \ -DAMReX_FORTRAN=ON \ -DAMReX_MPI=OFF \ - -DAMReX_PARTICLES=ON + -DAMReX_PARTICLES=ON \ + -DCMAKE_C_COMPILER=$(which gcc-8) \ + -DCMAKE_CXX_COMPILER=$(which g++-8) \ + -DCMAKE_Fortran_COMPILER=$(which gfortran-8) make -j 2 ctest --output-on-failure # Build libamrex and all tests tests-nofortran: - name: GNU@7.5 C++14 w/o Fortran [tests] - runs-on: ubuntu-18.04 - env: {CXXFLAGS: "-fno-operator-names -Werror -Wall -Wextra -Wpedantic -Wnull-dereference -Wfloat-conversion -Wshadow -Woverloaded-virtual -Wunreachable-code"} + name: GNU@9.3 C++17 w/o Fortran [tests] + 
runs-on: ubuntu-20.04 + env: {CXXFLAGS: "-fno-operator-names -Werror -Wall -Wextra -Wpedantic -Wnull-dereference -Wfloat-conversion -Wshadow -Woverloaded-virtual -Wunreachable-code -Wnon-virtual-dtor -Wlogical-op -Wmisleading-indentation -Wduplicated-cond -Wduplicated-branches"} steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v3 - name: Dependencies run: .github/workflows/dependencies/dependencies_nofortran.sh - name: Build & Install @@ -203,10 +208,10 @@ jobs: # Build 1D libamrex with configure configure-1d: - name: GNU@7.5 Release [configure 1D] - runs-on: ubuntu-18.04 + name: GNU@9.3 Release [configure 1D] + runs-on: ubuntu-20.04 steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v3 - name: Dependencies run: .github/workflows/dependencies/dependencies.sh - name: Build & Install @@ -217,10 +222,10 @@ jobs: # Build 3D libamrex with configure configure-3d: - name: GNU@7.5 Release [configure 3D] - runs-on: ubuntu-18.04 + name: GNU@11.2 Release [configure 3D] + runs-on: ubuntu-22.04 steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v3 - name: Dependencies run: .github/workflows/dependencies/dependencies.sh - name: Build & Install @@ -231,10 +236,10 @@ jobs: # Build 3D libamrex with single precision and tiny profiler configure-3d-single-tprof: - name: GNU@7.5 Release [configure 3D] - runs-on: ubuntu-18.04 + name: GNU@9.3 Release [configure 3D] + runs-on: ubuntu-20.04 steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v3 - name: Dependencies run: .github/workflows/dependencies/dependencies.sh - name: Build & Install @@ -245,10 +250,10 @@ jobs: # Build 3D libamrex debug omp build with configure configure-3d-omp-debug: - name: GNU@7.5 OMP Debug [configure 3D] - runs-on: ubuntu-18.04 + name: GNU@9.3 OMP Debug [configure 3D] + runs-on: ubuntu-20.04 steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v3 - name: Dependencies run: .github/workflows/dependencies/dependencies.sh - name: Build & Install @@ -260,9 +265,9 @@ jobs: # Build Tools/Plotfile plotfile-tools: name: GNU Plotfile Tools [tools] - runs-on: ubuntu-18.04 + runs-on: ubuntu-20.04 steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v3 - name: Dependencies run: .github/workflows/dependencies/dependencies.sh - name: Build & Install @@ -272,11 +277,11 @@ jobs: # Build libamrex and run all tests tests_run: - name: GNU@7.5 C++14 [tests] - runs-on: ubuntu-18.04 - env: {CXXFLAGS: "-fno-operator-names -Werror -Wall -Wextra -Wpedantic -Wnull-dereference -Wfloat-conversion -Wshadow -Woverloaded-virtual -Wunreachable-code"} + name: GNU@9.3 C++17 [tests] + runs-on: ubuntu-20.04 + env: {CXXFLAGS: "-fno-operator-names -Werror -Wall -Wextra -Wpedantic -Wnull-dereference -Wfloat-conversion -Wshadow -Woverloaded-virtual -Wunreachable-code -Wnon-virtual-dtor -Wlogical-op -Wmisleading-indentation -Wduplicated-cond -Wduplicated-branches"} steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v3 - name: Dependencies run: .github/workflows/dependencies/dependencies.sh - name: Build & Install @@ -295,13 +300,13 @@ jobs: ctest --output-on-failure -R test_hdf5: - name: GNU@7.5 HDF5 I/O Test [tests] - runs-on: ubuntu-18.04 + name: GNU@9.3 HDF5 I/O Test [tests] + runs-on: ubuntu-20.04 env: CXX: h5pcc CC: h5cc steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v3 - name: Dependencies run: | .github/workflows/dependencies/dependencies.sh diff --git a/.github/workflows/hip.yml b/.github/workflows/hip.yml index a128eabf664..d542fb603a2 100644 --- 
a/.github/workflows/hip.yml +++ b/.github/workflows/hip.yml @@ -20,9 +20,9 @@ jobs: # ^ # /opt/rocm-4.1.1/hip/include/hip/hcc_detail/hip_runtime.h:176:9: note: macro 'select_impl_' defined here # #define select_impl_(_1, _2, impl_, ...) impl_ - env: {CXXFLAGS: "-fno-operator-names -Werror -Wall -Wextra -Wpedantic -Wnull-dereference -Wfloat-conversion -Wshadow -Woverloaded-virtual -Wextra-semi -Wunreachable-code -Wno-deprecated-declarations -Wno-gnu-zero-variadic-macro-arguments -Wno-pass-failed"} + env: {CXXFLAGS: "-fno-operator-names -Werror -Wall -Wextra -Wpedantic -Wnull-dereference -Wfloat-conversion -Wshadow -Woverloaded-virtual -Wextra-semi -Wunreachable-code -Wnon-virtual-dtor -Wno-deprecated-declarations -Wno-gnu-zero-variadic-macro-arguments"} steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v3 - name: Dependencies run: .github/workflows/dependencies/dependencies_hip.sh - name: Build & Install @@ -47,6 +47,7 @@ jobs: -DAMReX_LINEAR_SOLVERS=ON \ -DAMReX_GPU_BACKEND=HIP \ -DAMReX_AMD_ARCH=gfx908 \ + -DAMReX_ROCTX=ON \ -DCMAKE_C_COMPILER=$(which clang) \ -DCMAKE_CXX_COMPILER=$(which clang++) \ -DCMAKE_Fortran_COMPILER=$(which flang) \ @@ -66,9 +67,9 @@ jobs: # ^ # /opt/rocm-4.1.1/hip/include/hip/hcc_detail/hip_runtime.h:176:9: note: macro 'select_impl_' defined here # #define select_impl_(_1, _2, impl_, ...) impl_ - env: {CXXFLAGS: "-fno-operator-names -Werror -Wall -Wextra -Wpedantic -Wnull-dereference -Wfloat-conversion -Wshadow -Woverloaded-virtual -Wextra-semi -Wunreachable-code -Wno-deprecated-declarations -Wno-gnu-zero-variadic-macro-arguments -Wno-pass-failed"} + env: {CXXFLAGS: "-fno-operator-names -Werror -Wall -Wextra -Wpedantic -Wnull-dereference -Wfloat-conversion -Wshadow -Woverloaded-virtual -Wextra-semi -Wunreachable-code -Wnon-virtual-dtor -Wno-deprecated-declarations -Wno-gnu-zero-variadic-macro-arguments"} steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v3 - name: Dependencies run: .github/workflows/dependencies/dependencies_hip.sh - name: Build & Install @@ -91,6 +92,7 @@ jobs: -DAMReX_LINEAR_SOLVERS=ON \ -DAMReX_GPU_BACKEND=HIP \ -DAMReX_AMD_ARCH=gfx908 \ + -DAMReX_ROCTX=ON \ -DCMAKE_C_COMPILER=$(which clang) \ -DCMAKE_CXX_COMPILER=$(which hipcc) \ -DCMAKE_Fortran_COMPILER=$(which gfortran) \ @@ -102,7 +104,7 @@ jobs: name: HIP EB [configure 2D] runs-on: ubuntu-20.04 steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v3 - name: Dependencies run: .github/workflows/dependencies/dependencies_hip.sh - name: Build & Install diff --git a/.github/workflows/intel.yml b/.github/workflows/intel.yml index 6fef4fc0459..6e7d87a299e 100644 --- a/.github/workflows/intel.yml +++ b/.github/workflows/intel.yml @@ -11,9 +11,9 @@ jobs: name: DPCPP GFortran@7.5 C++17 [tests] runs-on: ubuntu-20.04 # mkl/rng/device/detail/mrg32k3a_impl.hpp has a number of sign-compare error - env: {CXXFLAGS: "-fno-operator-names -Werror -Wall -Wextra -Wpedantic -Wnull-dereference -Wfloat-conversion -Wshadow -Woverloaded-virtual -Wextra-semi -Wunreachable-code -Wno-sign-compare"} + env: {CXXFLAGS: "-fno-operator-names -Werror -Wall -Wextra -Wpedantic -Wnull-dereference -Wfloat-conversion -Wshadow -Woverloaded-virtual -Wextra-semi -Wunreachable-code -Wnon-virtual-dtor -Wno-sign-compare"} steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v3 - name: Dependencies run: .github/workflows/dependencies/dependencies_dpcpp.sh - name: Build & Install @@ -41,7 +41,7 @@ jobs: runs-on: ubuntu-20.04 env: {CXXFLAGS: "-Werror"} steps: - - uses: actions/checkout@v2 
+ - uses: actions/checkout@v3 - name: install dependencies run: | export DEBIAN_FRONTEND=noninteractive diff --git a/.github/workflows/macos.yml b/.github/workflows/macos.yml index e1446a038da..be5a1e738ca 100644 --- a/.github/workflows/macos.yml +++ b/.github/workflows/macos.yml @@ -14,10 +14,10 @@ jobs: env: # build universal binaries for M1 "Apple Silicon" and Intel CPUs CMAKE_OSX_ARCHITECTURES: "arm64;x86_64" - CXXFLAGS: "-fno-operator-names -Werror -Wall -Wextra -Wpedantic -Wnull-dereference -Wfloat-conversion -Wshadow -Woverloaded-virtual -Wextra-semi -Wunreachable-code -Wno-c++17-extensions -Wno-range-loop-analysis -Wno-pass-failed" + CXXFLAGS: "-fno-operator-names -Werror -Wall -Wextra -Wpedantic -Wnull-dereference -Wfloat-conversion -Wshadow -Woverloaded-virtual -Wextra-semi -Wunreachable-code -Wnon-virtual-dtor -Wno-c++17-extensions -Wno-range-loop-analysis" # -Wno-range-loop-analysis: Apple clang has a bug in range-loop-analysis steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v3 - name: Dependencies run: .github/workflows/dependencies/dependencies_mac.sh - name: Build & Install @@ -39,10 +39,10 @@ jobs: name: AppleClang@11.0 GFortran@9.3 [tests] runs-on: macos-latest env: - CXXFLAGS: "-fno-operator-names -Werror -Wall -Wextra -Wpedantic -Wnull-dereference -Wfloat-conversion -Wshadow -Woverloaded-virtual -Wextra-semi -Wunreachable-code -Wno-c++17-extensions -Wno-range-loop-analysis -Wno-pass-failed" + CXXFLAGS: "-fno-operator-names -Werror -Wall -Wextra -Wpedantic -Wnull-dereference -Wfloat-conversion -Wshadow -Woverloaded-virtual -Wextra-semi -Wunreachable-code -Wnon-virtual-dtor -Wno-c++17-extensions -Wno-range-loop-analysis" # -Wno-range-loop-analysis: Apple clang has a bug in range-loop-analysis steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v3 - name: Dependencies run: .github/workflows/dependencies/dependencies_mac.sh - name: Build & Install diff --git a/.github/workflows/sensei.yml b/.github/workflows/sensei.yml index 19121889d6b..163456a924a 100644 --- a/.github/workflows/sensei.yml +++ b/.github/workflows/sensei.yml @@ -17,17 +17,17 @@ jobs: CC: clang CXXFLAGS: "-Werror -Wshadow -Woverloaded-virtual -Wunreachable-code -fno-operator-names" CMAKE_GENERATOR: Ninja - CMAKE_PREFIX_PATH: /root/install/sensei/develop/lib/cmake + CMAKE_PREFIX_PATH: /root/install/sensei/v4.0.0/lib64/cmake container: - image: ryankrattiger/sensei:fedora33-vtk-mpi-20210616 + image: senseiinsitu/ci:fedora35-amrex-20220613 steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v3 - name: Setup run: mkdir build - name: Configure run: | cd build - cmake .. \ + cmake .. 
\ -DCMAKE_BUILD_TYPE=Debug \ -DAMReX_ENABLE_TESTS=ON \ -DAMReX_FORTRAN=OFF \ diff --git a/.github/workflows/style.yml b/.github/workflows/style.yml index b459865f587..9c32554218d 100644 --- a/.github/workflows/style.yml +++ b/.github/workflows/style.yml @@ -10,13 +10,13 @@ jobs: tabs: runs-on: ubuntu-latest steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v3 - name: Tabs run: .github/workflows/style/check_tabs.sh trailing_whitespaces: runs-on: ubuntu-latest steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v3 - name: Trailing Whitespaces run: .github/workflows/style/check_trailing_whitespaces.sh diff --git a/.github/workflows/windows.yml b/.github/workflows/windows.yml index b066ba6c98c..fba862d26dd 100644 --- a/.github/workflows/windows.yml +++ b/.github/workflows/windows.yml @@ -12,7 +12,7 @@ jobs: name: MSVC C++17 w/o Fortran w/o MPI runs-on: windows-latest steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v3 - name: Build & Install run: | cmake -S . -B build ` @@ -31,7 +31,7 @@ jobs: name: MSVC C++17 w/o Fortran w/o MPI static runs-on: windows-latest steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v3 - name: Build & Install run: | cmake -S . -B build ` @@ -49,7 +49,7 @@ jobs: name: Clang C++17 w/o Fortran w/o MPI runs-on: windows-latest steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v3 - uses: seanmiddleditch/gha-setup-ninja@master - name: Build & Install shell: cmd diff --git a/CHANGES b/CHANGES index 8104566abe2..648db385c07 100644 --- a/CHANGES +++ b/CHANGES @@ -1,3 +1,236 @@ +# 22.11 + + -- MPI Reduce for ValLocPair (#3003) + + -- `FabArray::isDefined` (#2997) + + -- Make The_Device_Arena non-managed (#2998) + + -- Add alias template Gpu::NonManagedDeviceVector (#2999) + + -- Pre- and Post-interpolation hook interface (#2991) + + -- Add user defined BC types (#2995) + + -- Add BCRec::set for convenience (#2993) + + -- ParallelFor with compile time optimization of kernels with run time parameters (#2954) + + -- 2D RZ solver for WarpX: Arbitrary coefficient (#2986) + + -- Runge-Kutta support for AMR (#2974) + + -- Fourth-order interpolation from fine to coarse level (#2987) + + -- Fix EB data inconsistency when fixing small cells and multiple cuts (#2943) + + -- MFIter::Finalize (#2983, #2985, #2988) + + -- Fix MLMG::getGradSolution & getFluxes for inhomogeneous Neumann and Robin BC (#2984) + + -- MLLinOp::postSolve (#2981) + + -- add templating for the cell bilinear interpolators (#2979) + + -- FillPatcher class (#2972) + + -- Remove sycl namespace alias (#2971) + + -- Fix Tensor Solver BC (#2930) + + -- Disable host device for macros for SYCL/DPC++ (#2969) + +# 22.10 + + -- Solve an issue with particles async IO when having runtime added variables (#2966) + + -- Fix int overflow in amrex::bisect (#2964) + + -- Fix MLEBNodeFDLaplacian bottom solver (#2963) + + -- make tagging routines EB_aware (#2962) + + -- Volume weighted sum (#2961) + + -- CellData: data in a single cell (#2959) + + -- Quartic interpolation for cell centered data (#2960) + + -- Add GPU-compatible upper bound and lower bound algorithms to AMReX_Algorithm (#2958) + + -- add option for makebuildsources to specify the style arguments for 'git describe'. 
(#2957) + + -- Add roundoff_lo corresponding to roundoff_hi for domains that don't start at 0 (#2950) + + -- Add template parameter to ParallelFor and launch specifying block size (#2947) + + -- Byte spread fixes (#2949) + + -- CMake: HIP_PATH from ROCM_PATH (#2948) + + -- Fix: Make Finalize->Initialize->F->I->... Work (#2944) + + -- Changes for Cray & Clang (#2941) + + -- Link to cublas when using CUDA and Hypre (#2933) + + -- HIP: use coarse grained host memory (#2932) + + -- EB checkpoint files (#2897) + + -- Fix: Loading Files Again (#2936) + + -- Check if boundary particles container has been created before clearance. (#2935) + + -- SYCL: Replace deprecated atomic types and operations (#2921) + +# 22.09 + + -- Preserve neighbor particles when sorting particles. (#2923) + + -- Scope of NonLocalBC::ParallelCopy (#2922) + + -- Open Boundary Poisson Solver (#2912) + Add hypre as an option for OpenBCSolver (#2931) + + -- Fix OOB access of ref ratio on HDF write header (#2919) + + -- Add Polaris to GNUMake (#2908) + + -- Export GpuDevice Globals (#2918) + + -- enable LinOp to use the right Factory (fixes moving geometry problem) (#2916) + + -- Use 1 atomic instead of two per item in DenseBins::build (#2911) + + -- [SYCL] Remove amrex::oneapi and update deprecated device descriptors (#2910) + + -- Add: `MultiFab::sum_unique` (#2909) + + -- In MLMG::mgFcycle, assert that for EB the linop is cell-centered. (#2905) + + -- EB: Add Fine Levels (#2881) + + -- Add rpath to lib64 for ZFP. (#2902) + + -- change data types from double to amrex::Real, and thus we can use single precision for the hypre IJ interface (#2896) + + -- MPMD Support (#2895) + + -- MLMG interface (#2858) + +# 22.08 + + -- Let `selectActualNeighbors` return right after starting if there are no + particles for communication. (#2886) + + -- Add Comm Sync to Redistribute (#2891) + + -- Multi-materials and derived variable output (#2888) + + -- Fix host / device sync bug in PODVector (#2890) + + -- MinLoc and MaxLoc Support (#2885) + + -- HIP: Remove the call to hipDeviceSetSharedMemConfig (#2884) + + -- Add Frontier to GNU Make (#2879) + + -- Add option to derefine to AMRErrorTag (#2875) + + -- Fix the segmentation fault in selecting actual neighbor particles. (#2877) + + -- Workaround to bypass issue observed at very large scale with Fujitsu MPI (#2874) + `TagBoxArray::collate`: Fujitsu Clang (#2889) + + -- Allow zero components MultiFab and BaseFab (#2873) + + -- New EB optimization parameter: eb2.num_coarsen_opt (#2872) + + -- SENSEI 4.0: Fix Build for Particles (#2869) + + -- Cache the neighbor comm tags for the CPU implementation of fillNeighbors. (#2862) + + -- Remove some hard checks in check_mvmc for 3D (#2864) + + -- Carry over fix for ngbxy.smallEnd typo (#2868) + +# 22.07 + + -- Adding control APIs and namespacing for core algorithm paths like SpGEMM, SpMV, and SpTrans. (#2859) + + -- update the SENSEI in situ coupling for SENSEI v4.0.0 (#2785) + + -- Write runtime attribs to checkpoints on GPUs (#2856) + + -- Fix gnu make on Crusher for mpi_gtl_hsa (#2857) + + -- CMake: FindDependency CUDAToolkit (#2849) + + -- NERSC Programming Environment prototype (#2848) + + -- GNU Make: No need to query mpif90 if Fortran is not used. 
(#2852) + + -- Remove f90doc (#2851) + + -- Explicitly invoke python3 (#2850) + + -- Maintain the high end of the 'roundoff domain' in both float and double precision (#2839) + + -- add Ok to coordsys (#2844) + + -- ParamParse: Add Files at Runtime (#2842) + + -- Fix a pathological case for 2d EB (#2840) + + -- add fvolumesum to GNUmakefile (#2836) + + -- Clamp particles shifted from plo boundary against rhi, rather than back to plo (#2814) + + -- Fix: CMake NVTX not only Hypre (#2837) + + -- Update sensei CI container for sensei v4.0 integration (#2834) + + -- HIP Memory Advise : Set managed memory to coarse grain (#2835) + + -- CMake: Fix `export` with `AMReX_INSTALL=OFF` (#2838) + + -- make PODVector work with PolymorphicArenaAllocator (#2829) + + -- Re-implement FaceLinear::interp() for InterpFromCoarseLevel (#2831) + + -- Make regrid method of Amr class public (#2833) + + -- amrex::Any (#2827) + + -- Fix line integral computation (#2830) + + -- Fix a bug in multigrid grids (#2823) + + -- Add html, additional sections to README.md (#2775) + + -- Allow StateDataPhysBCFunct to operate on face-centered data (#2819) + + -- Fix Parser ODR (#2820) + + -- CMake: Cleanup old nvToolsExt (#2817) + + -- Handle the case where we don't have enough device memory for the snd_buffer (#2705) + + -- CMake: 3.17+ (#2813) + + -- Landon/fix bug ghost particles (#2812) + + -- Follow-on to 2809; update selectActualNeighbors as well. (#2810) + + -- Generalize the type of callables that can be passed into the neighbor list build function (#2809) + + -- Add AVX2 instructions flag. (#2803) + + -- Avoid M_PI because it's not in the C++ standard (#2807) + + -- In the array version of FillPatchTwoLevels, allow specifying an (#2800) + # 22.06 -- Fix solvability issue in the nodal solver RAP approach (#2783, #2801) diff --git a/Docs/sphinx_documentation/source/Basics.rst index dd4e53d455e..dc3022f7e12 100644 --- a/Docs/sphinx_documentation/source/Basics.rst +++ b/Docs/sphinx_documentation/source/Basics.rst @@ -2549,7 +2549,11 @@ The basic idea behind physical boundary conditions is as follows: Reflection from interior cells with sign changed, :math:`q(-i) = -q(i)`. -- For external Dirichlet boundaries, the user needs to provide a + user_1, user_2 and user_3 + "User". It is the user's responsibility to write a routine + to fill ghost cells (more details below). + +- For external Dirichlet and user boundaries, the user needs to provide a callable object like below. .. highlight:: c++ @@ -2564,7 +2568,7 @@ The basic idea behind physical boundary conditions is as follows: const BCRec* bcr, const int bcomp, const int orig_comp) const { - // external Dirichlet for cell iv + // external Dirichlet or user BC for cell iv } };
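To make the new user_1/user_2/user_3 boundary types concrete, here is a minimal sketch of such a fill functor (hypothetical; the signature follows the snippet in the hunk above, and the constant inflow value is purely illustrative)::

    struct MyBCFill
    {
        AMREX_GPU_DEVICE
        void operator() (const amrex::IntVect& iv,
                         amrex::Array4<amrex::Real> const& dest,
                         const int dcomp, const int numcomp,
                         amrex::GeometryData const& geom,
                         const amrex::Real time,
                         const amrex::BCRec* bcr, const int bcomp,
                         const int orig_comp) const
        {
            amrex::ignore_unused(geom, time, bcr, bcomp, orig_comp);
            // Hypothetical: fill every requested ghost component with a fixed value.
            for (int n = dcomp; n < dcomp+numcomp; ++n) {
                dest(iv, n) = amrex::Real(1.0);
            }
        }
    };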
diff --git a/Docs/sphinx_documentation/source/BuildingAMReX.rst index 7b3273bf874..331f9b8c9f6 100644 --- a/Docs/sphinx_documentation/source/BuildingAMReX.rst +++ b/Docs/sphinx_documentation/source/BuildingAMReX.rst @@ -35,8 +35,8 @@ list of important variables. +-----------------+-------------------------------------+--------------------+ | COMP | gnu, cray, ibm, intel, llvm, or pgi | none | +-----------------+-------------------------------------+--------------------+ - | CXXSTD | C++ standard (``c++14``, ``c++17``, | compiler default, | - | | ``c++20``) | at least ``c++14`` | + | CXXSTD | C++ standard (``c++17``, ``c++20``) | compiler default, | + | | | at least ``c++17`` | +-----------------+-------------------------------------+--------------------+ | DEBUG | TRUE or FALSE | FALSE | +-----------------+-------------------------------------+--------------------+ @@ -584,7 +584,7 @@ the following line in the appropriate CMakeLists.txt file: :: - target_link_libraries( <target> AMReX::<component> ) + target_link_libraries( <target> PUBLIC AMReX::<component> ) In the above snippet, ``<component>`` is any of the targets listed in the table below. @@ -709,7 +709,7 @@ As an example, consider the following CMake code: :: find_package(AMReX REQUIRED 3D EB) - target_link_libraries( Foo AMReX::amrex AMReX::Flags_CXX ) + target_link_libraries( Foo PUBLIC AMReX::amrex ) The code in the snippet above checks whether an AMReX installation with 3D and Embedded Boundary support is available on the system. If so, AMReX is linked to target ``Foo`` and AMReX flags preset is used @@ -740,8 +740,8 @@ The AMReX team does development on Linux machines, from laptops to supercomputer We do not officially support AMReX on Windows, and many of us do not have access to any Windows machines. However, we believe there are no fundamental issues for it to work on Windows. -(1) AMReX mostly uses standard C++14, but for Windows C++17 is required. This is because we use - C++17 to support file system operations when POSIX I/O is not available. +(1) AMReX mostly uses standard C++17. +We run continuous integration tests on Windows with MSVC and Clang compilers. (2) We use POSIX signal handling when floating point exceptions, segmentation faults, etc. happen. This capability is not supported on Windows. diff --git a/Docs/sphinx_documentation/source/BuildingAMReX_Chapter.rst index dd61bb254d3..3ecbc775c17 100644 --- a/Docs/sphinx_documentation/source/BuildingAMReX_Chapter.rst +++ b/Docs/sphinx_documentation/source/BuildingAMReX_Chapter.rst @@ -18,7 +18,7 @@ an application code then uses its own build system and links to AMReX as an exte Finally, AMReX can also be built with CMake, as detailed in the section on :ref:`sec:build:cmake`. -AMReX requires a C++ compiler that supports the C++14 standard, a +AMReX requires a C++ compiler that supports the C++17 standard, a Fortran compiler that supports the Fortran 2003 standard, and a C compiler that supports the C99 standard. Prerequisites for building with GNU Make include Python (>= 2.7, including 3) and standard tools diff --git a/Docs/sphinx_documentation/source/GPU.rst index 4101c806be2..4984b839132 100644 --- a/Docs/sphinx_documentation/source/GPU.rst +++ b/Docs/sphinx_documentation/source/GPU.rst @@ -315,7 +315,7 @@ we provide the helper function ``setup_target_for_cuda_compilation()``: setup_target_for_cuda_compilation(my_target) # Link against amrex - target_link_libraries(my_target AMReX::amrex) + target_link_libraries(my_target PUBLIC AMReX::amrex) @@ -1001,7 +1001,7 @@ launch function. ``amrex::ParallelFor()`` expands into different variations of a quadruply-nested :cpp:`for` loop depending on dimensionality and whether it is being implemented on CPU or GPU.
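For orientation before the simplified implementation reproduced below, a typical call site looks like this (a minimal sketch; ``mf``, ``mfi``, ``bx``, and ``ncomp`` are illustrative names)::

    amrex::Array4<amrex::Real> const& fab = mf.array(mfi);
    amrex::ParallelFor(bx, ncomp,
        [=] AMREX_GPU_DEVICE (int i, int j, int k, int n) noexcept
        {
            // The same body runs as nested loops on the CPU
            // or as one GPU thread per (i,j,k,n) cell.
            fab(i,j,k,n) *= amrex::Real(2.0);
        });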
-The best way to understand this macro is to take a look at the 4D :cpp:`amrex::ParallelFor` +The best way to understand this function is to take a look at the 4D :cpp:`amrex::ParallelFor` that is implemented when ``USE_CUDA=FALSE``. A simplified version is reproduced here: .. highlight:: c++ @@ -1103,6 +1103,15 @@ bounds, a :cpp:`long` or :cpp:`int` number of elements is passed to bound the si passing the number of elements to work on and indexing the pointer to the starting element: :cpp:`p[idx + 15]`. +GPU block size +-------------- + +By default, :cpp:`ParallelFor` launches ``AMREX_GPU_MAX_THREADS`` threads +per GPU block, where ``AMREX_GPU_MAX_THREADS`` is a compile-time constant +with a default value of 256. Users can also explicitly specify the +number of threads per block with :cpp:`ParallelFor<MY_BLOCK_SIZE>(...)`, where +``MY_BLOCK_SIZE`` is a multiple of the warp size (e.g., 128). This allows +performance tuning of individual kernels. Launching general kernels ------------------------- diff --git a/Docs/sphinx_documentation/source/LinearSolvers.rst index c8743a3e8e2..d893859e7c2 100644 --- a/Docs/sphinx_documentation/source/LinearSolvers.rst +++ b/Docs/sphinx_documentation/source/LinearSolvers.rst @@ -209,8 +209,8 @@ function :: - void setDomainBC (const Array& lobc, // for lower ends - const Array& hibc); // for higher ends + void setDomainBC (const Array& lobc, // for lower ends + const Array& hibc); // for higher ends The supported BC types at the physical domain boundaries are @@ -222,6 +222,8 @@ The supported BC types at the physical domain boundaries are - :cpp:`LinOpBCType::inhomogNeumann` for inhomogeneous Neumann boundary condition. +- :cpp:`LinOpBCType::Robin` for Robin boundary conditions, :math:`a\phi + b\frac{\partial\phi}{\partial n} = f`. + - :cpp:`LinOpBCType::reflect_odd` for reflection with sign changed. 2) Cell-centered solvers only: @@ -255,12 +257,12 @@ before the solve one must always call the :cpp:`MLLinOp` member function :: virtual void setLevelBC (int amrlev, const MultiFab* levelbcdata, - const MultiFab* robinbc_a, - const MultiFab* robinbc_b, - const MultiFab* robinbc_f) = 0; + const MultiFab* robinbc_a = nullptr, + const MultiFab* robinbc_b = nullptr, + const MultiFab* robinbc_f = nullptr) = 0; -If we want to supply an inhomogeneous Dirichlet, inhomogeneous Neumann, or -Robin boundary conditions at the domain boundaries, we must supply those values +If we want to supply an inhomogeneous Dirichlet or inhomogeneous Neumann +boundary condition at the domain boundaries, we must supply those values in ``MultiFab* levelbcdata``, which must have at least one ghost cell. Note that the argument :cpp:`amrlev` is relative to the solve, not necessarily the full AMR hierarchy; amrlev = 0 refers to the coarsest @@ -286,6 +288,11 @@ Dirichlet or Neumann boundaries are assumed to be exactly on the face of the physical domain; storing these values in the ghost cell of a cell-centered array is a convenience of implementation. +For Robin boundary conditions, the ghost cells in +``MultiFab* robinbc_a``, ``MultiFab* robinbc_b``, and ``MultiFab* robinbc_f`` +store the numerical values in the condition, +:math:`a\phi + b\frac{\partial\phi}{\partial n} = f`. + .. _sec:linearsolver:pars: Parameters @@ -754,4 +761,3 @@ An example (implemented in the ``MultiComponent`` tutorial) might be: See ``amrex-tutorials/ExampleCodes/LinearSolvers/MultiComponent`` for a complete working example.
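A minimal sketch of the Robin boundary setup documented above, assuming a cell-centered operator ``mlabec`` (e.g., an :cpp:`MLABecLaplacian`) and MultiFabs ``robin_a``, ``robin_b``, ``robin_f`` whose ghost cells already hold the coefficients of :math:`a\phi + b\frac{\partial\phi}{\partial n} = f` (all names illustrative)::

    mlabec.setDomainBC({AMREX_D_DECL(amrex::LinOpBCType::Robin,
                                     amrex::LinOpBCType::Periodic,
                                     amrex::LinOpBCType::Periodic)},
                       {AMREX_D_DECL(amrex::LinOpBCType::Robin,
                                     amrex::LinOpBCType::Periodic,
                                     amrex::LinOpBCType::Periodic)});
    // Dirichlet/Neumann values would travel in the levelbcdata argument;
    // the Robin coefficients are read from the ghost cells of robin_a/b/f.
    mlabec.setLevelBC(0, &solution, &robin_a, &robin_b, &robin_f);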
.. solver reuse - diff --git a/Docs/sphinx_documentation/source/Post_Processing.rst index c2cce7fd7b2..fd707f221db 100644 --- a/Docs/sphinx_documentation/source/Post_Processing.rst +++ b/Docs/sphinx_documentation/source/Post_Processing.rst @@ -76,8 +76,8 @@ variable. **How to build and run** -In ``amrex/Tools/Plotfile``, type ``make`` and then ``./fextract.gnu.ex`` to run. -Typing ``./fextract.gnu.ex`` without inputs will bring up usage and options. +In ``amrex/Tools/Plotfile``, type ``make`` and then ``./fcompare.gnu.ex`` to run. +Typing ``./fcompare.gnu.ex`` without inputs will bring up usage and options. **Example** diff --git a/Docs/sphinx_documentation/source/SWFFT.rst index 3e886dcc2a8..9e6192ff048 100644 --- a/Docs/sphinx_documentation/source/SWFFT.rst +++ b/Docs/sphinx_documentation/source/SWFFT.rst @@ -98,7 +98,7 @@ AMReX contains two SWFFT tutorials, `SWFFT Poisson`_ and `SWFFT Simple`_: .. _`SWFFT Simple`: https://amrex-codes.github.io/amrex/tutorials_html/SWFFT_Tutorial.html#swfft-simple .. [1] - https://xgitlab.cels.anl.gov/hacc/SWFFT + https://git.cels.anl.gov/hacc/SWFFT .. [2] SWFFT source code directory in AMReX: amrex/Src/Extern/SWFFT diff --git a/Docs/sphinx_documentation/source/Testing.rst index b7e32c9477b..bbceae1d1ad 100644 --- a/Docs/sphinx_documentation/source/Testing.rst +++ b/Docs/sphinx_documentation/source/Testing.rst @@ -18,6 +18,7 @@ application codes that use it as a framework. We use an in-house test runner scr operation, originally developed by Michael Zingale for the Castro code, and later expanded to other application codes as well. The results for each night are collected and stored on a web page; see https://ccse.lbl.gov/pub/RegressionTesting/ for the latest set of results. +The runtime option ``amrex.abort_on_unused_inputs`` (``0`` or ``1``; default is ``0`` for false) is useful for keeping tests up to date with API changes: it aborts the application after the test run if any unused input parameters were detected. Running the test suite locally ============================== @@ -73,7 +74,7 @@ re-run the script without the :cpp:`--make_benchmarks` option: :: - python regtest.py --make_benchmarks 'generating initial benchmarks' AMReX-tests.ini + python regtest.py AMReX-tests.ini The script will generate a set of html pages in the directory specified in your :cpp:`AMReX-tests.ini` file that you can examine using the browser of your choice. diff --git a/Docs/sphinx_documentation/source/Visualization.rst index ea8b4ab8c0b..59f95f76090 100644 --- a/Docs/sphinx_documentation/source/Visualization.rst +++ b/Docs/sphinx_documentation/source/Visualization.rst @@ -873,9 +873,12 @@ and point to the CMake configuration installed with SENSEI. .. code-block:: bash - cmake -DAMReX_SENSEI=ON -DSENSEI_DIR=<sensei install prefix>/lib/cmake .. + cmake -DAMReX_SENSEI=ON -DSENSEI_DIR=<sensei install prefix>/<lib>/cmake .. -When CMake generates the make files proceed as usual. +When CMake generates the make files proceed as usual. Note: ``<lib>`` may be +`lib` or `lib64` or something else depending on what CMake decided to use for +your particular OS. See the CMake GNUInstallDirs documentation for more +information. .. code-block:: bash @@ -952,8 +955,7 @@ dataset.
Obtaining SENSEI ----------------- -SENSEI is hosted on Kitware's Gitlab site at https://gitlab.kitware.com/sensei/sensei -It's best to checkout the latest release rather than working on the master branch. +SENSEI is hosted on github at https://github.com/SENSEI-insitu/SENSEI.git To ease the burden of wrangling back end installs SENSEI provides two platforms with all dependencies pre-installed, a VirtualBox VM, and a NERSC Cori diff --git a/GNUmakefile.in b/GNUmakefile.in index 8a6ce69df09..ad6238543dc 100644 --- a/GNUmakefile.in +++ b/GNUmakefile.in @@ -19,6 +19,9 @@ ifeq ($(USE_FORTRAN_INTERFACE),TRUE) endif ifeq ($(USE_LINEAR_SOLVERS),TRUE) Pdirs += LinearSolvers/MLMG + ifeq ($(DIM),3) + Pdirs += LinearSolvers/OpenBC + endif ifeq ($(USE_FORTRAN_INTERFACE),TRUE) Pdirs += F_Interfaces/LinearSolvers endif diff --git a/INSTALL b/INSTALL index efb40fbdb2e..ed1e0dfb36e 100644 --- a/INSTALL +++ b/INSTALL @@ -10,7 +10,7 @@ There are three ways to use AMReX. Fortran modules via `./configure` followed by `make` and `make install`. Type `./configure -h` to show help message. An application code uses its build system to compile and link to the - AMReX library. Because AMReX uses C++14 and Fortran, the linker + AMReX library. Because AMReX uses C++17 and Fortran, the linker needs to link the libraries. See `Tutorials/Basic/Build_with_libamrex` for an example of this approach. Note that this approach relies the make system in diff --git a/README.md b/README.md index 72c182470e1..da3a1abcbd2 100644 --- a/README.md +++ b/README.md @@ -1,5 +1,5 @@
-AMReX Logo +AMReX Logo

@@ -71,7 +71,7 @@ in a wide variety of other scientific simulations, some of which, can be seen in our application [gallery](https://amrex-codes.github.io/amrex/gallery.html).

## Get Help diff --git a/Src/Amr/AMReX_Amr.cpp b/Src/Amr/AMReX_Amr.cpp index 66ec4664c5a..02f0452eac9 100644 --- a/Src/Amr/AMReX_Amr.cpp +++ b/Src/Amr/AMReX_Amr.cpp @@ -910,7 +910,7 @@ Amr::writeSmallPlotFile () // Don't continue if we have no variables to plot. - if (stateSmallPlotVars().size() == 0) { + if (stateSmallPlotVars().size() == 0 && deriveSmallPlotVars().size() == 0) { return; } diff --git a/Src/Amr/AMReX_AmrLevel.H b/Src/Amr/AMReX_AmrLevel.H index 0aaf7fc2620..5034df1b5e5 100644 --- a/Src/Amr/AMReX_AmrLevel.H +++ b/Src/Amr/AMReX_AmrLevel.H @@ -15,6 +15,8 @@ #include #include #include +#include +#include #ifdef AMREX_USE_EB #include #endif @@ -152,11 +154,10 @@ public: int ncycle) = 0; /** - * \brief Contains operations to be done after a timestep. This is a - * pure virtual function and hence MUST be implemented by derived - * classes. + * \brief Contains operations to be done after a timestep. If this + * function is overridden, don't forget to reset FillPatcher. */ - virtual void post_timestep (int iteration) = 0; + virtual void post_timestep (int iteration); /** * \brief Contains operations to be done only after a full coarse * timestep. The default implementation does nothing. @@ -243,12 +244,14 @@ public: Long countCells () const noexcept; //! Get the area not to tag. - const BoxArray& getAreaNotToTag() noexcept; - const Box& getAreaToTag() noexcept; + const BoxArray& getAreaNotToTag () noexcept; + const Box& getAreaToTag () noexcept; //! Construct the area not to tag. - void constructAreaNotToTag(); + void constructAreaNotToTag (); //! Set the area not to tag. - void setAreaNotToTag(BoxArray& ba) noexcept; + void setAreaNotToTag (BoxArray& ba) noexcept; + + void resetFillPatcher (); /** * \brief Error estimation for regridding. This is a pure virtual @@ -365,6 +368,20 @@ public: virtual void particle_redistribute (int /*lbase*/ = 0, bool /*a_init*/ = false) {;} #endif + /** + * \brief Fill with FillPatcher on level > 0 and AmrLevel::FillPatch on level 0. + * + * \param mf destination MultiFab + * \param dcomp starting component for the destination + * \param ncomp number of component to fill + * \param nghost number of ghost cells to fill + * \param time time + * \param state_index StateData index + * \param scomp starting component in the StateData + */ + void FillPatcherFill (amrex::MultiFab& mf, int dcomp, int ncomp, int nghost, + amrex::Real time, int state_index, int scomp); + static void FillPatch (AmrLevel& amrlevel, MultiFab& leveldata, int boxGrow, @@ -380,8 +397,33 @@ public: Real time, int index, int scomp, - int ncomp, - int dcomp=0); + int ncomp, + int dcomp=0); + + /** + * \brief Evolve one step with Runge-Kutta (2, 3, or 4) + * + * To use RK, the StateData must have all the ghost cells needed. See + * namespace RungeKutta for expected function signatures of the callable + * parameters. + * + * \param order order of RK + * \param state_type index of StateData + * \param time time at the beginning of the step. + * \param dt time step + * \param iteration iteration number on fine level during a coarse time + * step. For an AMR simulation with subcycling and a + * refinement ratio of 2, the number is either 1 or 2, + * denoting the first and second substep, respectively. + * \param ncycle number of subcyling steps. It's usually 2 or 4. + * Without subcycling, this will be 1. + * \param f computing right-hand side for evolving the StateData. + * One can also register data for flux registers in this. 
+ * \param p optionally post-processing RK stage results + */ + template <typename F, typename P = RungeKutta::PostStageNoOp> + void RK (int order, int state_type, Real time, Real dt, int iteration, + int ncycle, F&& f, P&& p = RungeKutta::PostStageNoOp()); #ifdef AMREX_USE_EB static void SetEBMaxGrowCells (int nbasic, int nvolume, int nfull) noexcept { @@ -425,7 +467,7 @@ protected: IntVect fine_ratio; // Refinement ratio to finer level. static DeriveList derive_lst; // List of derived quantities. static DescriptorList desc_lst; // List of state variables. - Vector<StateData> state; // Array of state data. + Vector<StateData> state; // Array of state data. BoxArray m_AreaNotToTag; //Area which shouldn't be tagged on this level. Box m_AreaToTag; //Area which is allowed to be tagged on this level. @@ -436,8 +478,18 @@ protected: std::unique_ptr<FabFactory<FArrayBox> > m_factory; + Vector<std::unique_ptr<FillPatcher<MultiFab>>> m_fillpatcher; + private: + template <std::size_t order> + void storeRKCoarseData (int state_type, Real time, Real dt, + MultiFab const& S_old, + Array<MultiFab,order> const& rkk); + + void FillRKPatch (int state_index, MultiFab& S, Real time, + int stage, int iteration, int ncycle); + mutable BoxArray edge_grids[AMREX_SPACEDIM]; // face-centered grids mutable BoxArray nodal_grids; // all nodal grids }; @@ -558,6 +610,74 @@ private: std::map< int,Vector< Vector< Vector<FillBoxId> > > > m_fbid; // [grid][level][fillablesubbox][oldnew] }; +template <typename F, typename P> +void AmrLevel::RK (int order, int state_type, Real time, Real dt, int iteration, + int ncycle, F&& f, P&& p) +{ + BL_PROFILE("AmrLevel::RK()"); + + AMREX_ASSERT(AmrLevel::desc_lst[state_type].nExtra() > 0); // Need ghost cells in StateData + + MultiFab& S_old = get_old_data(state_type); + MultiFab& S_new = get_new_data(state_type); + const Real t_old = state[state_type].prevTime(); + const Real t_new = state[state_type].curTime(); + AMREX_ALWAYS_ASSERT(amrex::almostEqual(time,t_old) && amrex::almostEqual(time+dt,t_new)); + + if (order == 2) { + RungeKutta::RK2(S_old, S_new, time, dt, std::forward<F>(f), + [&] (int /*stage*/, MultiFab& mf, Real t) { + FillPatcherFill(mf, 0, mf.nComp(), mf.nGrow(), t, + state_type, 0); }, + std::forward<P>
(p)); + } else if (order == 3) { + RungeKutta::RK3(S_old, S_new, time, dt, std::forward<F>(f), + [&] (int stage, MultiFab& mf, Real t) { + FillRKPatch(state_type, mf, t, stage, iteration, ncycle); + }, + [&] (Array<MultiFab,2> const& rkk) { + if (level < parent->finestLevel()) { + storeRKCoarseData(state_type, time, dt, S_old, rkk); + } + }, + std::forward<P>
(p)); + } else if (order == 4) { + RungeKutta::RK4(S_old, S_new, time, dt, std::forward<F>(f), + [&] (int stage, MultiFab& mf, Real t) { + FillRKPatch(state_type, mf, t, stage, iteration, ncycle); + }, + [&] (Array<MultiFab,3> const& rkk) { + if (level < parent->finestLevel()) { + storeRKCoarseData(state_type, time, dt, S_old, rkk); + } + }, + std::forward<P>
(p)); + } else { + amrex::Abort("AmrLevel::RK: order = "+std::to_string(order)+" is not supported"); + } +} + +template <std::size_t order> +void AmrLevel::storeRKCoarseData (int state_type, Real time, Real dt, + MultiFab const& S_old, + Array<MultiFab,order> const& rkk) +{ + if (level == parent->finestLevel()) { return; } + + const StateDescriptor& desc = AmrLevel::desc_lst[state_type]; + + auto& fillpatcher = parent->getLevel(level+1).m_fillpatcher[state_type]; + fillpatcher = std::make_unique<FillPatcher<MultiFab>> + (parent->boxArray(level+1), parent->DistributionMap(level+1), + parent->Geom(level+1), + parent->boxArray(level), parent->DistributionMap(level), + parent->Geom(level), + IntVect(desc.nExtra()), desc.nComp(), desc.interp(0)); + + fillpatcher->storeRKCoarseData(time, dt, S_old, rkk); +} + + } #endif /*_AmrLevel_H_*/
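As a usage illustration for the ``RK`` member defined above, a hedged sketch from inside an ``AmrLevel``-derived class's advance (``State_Type`` and ``compute_dSdt`` are hypothetical names; the right-hand-side callback signature follows the conventions of ``namespace RungeKutta``)::

    // Advance the state by dt with a 2nd-order Runge-Kutta scheme.
    // RK() fills ghost cells for each stage via FillPatcher/FillRKPatch.
    RK(2, State_Type, time, dt, iteration, ncycle,
       [&] (int /*stage*/, amrex::MultiFab& dSdt, amrex::MultiFab const& S,
            amrex::Real t, amrex::Real /*dtsub*/)
       {
           compute_dSdt(S, dSdt, t); // hypothetical RHS: dSdt = f(S, t)
       });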
Must increase blocking factor."); + } + + auto& fillpatcher = m_fillpatcher[state_index]; + if (fillpatcher == nullptr) { + fillpatcher = std::make_unique> + (parent->boxArray(level), parent->DistributionMap(level), geom_fine, + parent->boxArray(level-1), parent->DistributionMap(level-1), geom_crse, + IntVect(nghost), desc.nComp(), desc.interp(scomp)); + } + + fillpatcher->fill(mf, IntVect(nghost), time, + smf_crse, stime_crse, smf_fine, stime_fine, + scomp, dcomp, ncomp, + physbcf_crse, scomp, physbcf_fine, scomp, + desc.getBCs(), scomp); + } +} + void AmrLevel::FillPatch (AmrLevel& amrlevel, MultiFab& leveldata, @@ -2163,4 +2231,23 @@ AmrLevel::CreateLevelDirectory (const std::string &dir) levelDirectoryCreated = true; } +void +AmrLevel::FillRKPatch (int state_index, MultiFab& S, Real time, + int stage, int iteration, int ncycle) +{ + StateDataPhysBCFunct physbcf(state[state_index], 0, geom); + + if (level == 0) { + S.FillBoundary(geom.periodicity()); + physbcf(S, 0, S.nComp(), S.nGrowVect(), time, 0); + } else { + auto& crse_level = parent->getLevel(level-1); + StateDataPhysBCFunct physbcf_crse(crse_level.state[state_index], 0, + crse_level.geom); + auto& fillpatcher = m_fillpatcher[state_index]; + fillpatcher->fillRK(stage, iteration, ncycle, S, time, physbcf_crse, + physbcf, AmrLevel::desc_lst[state_index].getBCs()); + } +} + } diff --git a/Src/Amr/AMReX_Derive.H b/Src/Amr/AMReX_Derive.H index 2a7c2e26713..7d5b32d7aa6 100644 --- a/Src/Amr/AMReX_Derive.H +++ b/Src/Amr/AMReX_Derive.H @@ -84,9 +84,9 @@ extern "C" const int* level, const int* grid_no) ; } -typedef void (*DeriveFuncFab) (const amrex::Box& bx, amrex::FArrayBox& derfab, int dcomp, int ncomp, - const amrex::FArrayBox& datafab, const amrex::Geometry& geomdata, - amrex::Real time, const int* bcrec, int level); + typedef std::function DeriveFuncFab; class DescriptorList; diff --git a/Src/Amr/AMReX_StateDescriptor.cpp b/Src/Amr/AMReX_StateDescriptor.cpp index 932479feeb2..1910dcf7b3f 100644 --- a/Src/Amr/AMReX_StateDescriptor.cpp +++ b/Src/Amr/AMReX_StateDescriptor.cpp @@ -42,23 +42,31 @@ StateDescriptor::BndryFunc::operator () (Real* data,const int* lo,const int* hi, { BL_ASSERT(m_func != 0 || m_func3D != 0); +#ifdef AMREX_USE_OMP bool thread_safe = bf_thread_safety(lo, hi, dom_lo, dom_hi, a_bc, 1); if (thread_safe) { - if (m_func != 0) - m_func(data,AMREX_ARLIM(lo),AMREX_ARLIM(hi),dom_lo,dom_hi,dx,grd_lo,time,a_bc); - else - m_func3D(data,AMREX_ARLIM_3D(lo),AMREX_ARLIM_3D(hi),AMREX_ARLIM_3D(dom_lo),AMREX_ARLIM_3D(dom_hi), - AMREX_ZFILL(dx),AMREX_ZFILL(grd_lo),time,a_bc); - } else { +#endif + { + if (m_func != 0) { + m_func(data,AMREX_ARLIM(lo),AMREX_ARLIM(hi),dom_lo,dom_hi,dx,grd_lo,time,a_bc); + } else { + m_func3D(data,AMREX_ARLIM_3D(lo),AMREX_ARLIM_3D(hi),AMREX_ARLIM_3D(dom_lo),AMREX_ARLIM_3D(dom_hi), + AMREX_ZFILL(dx),AMREX_ZFILL(grd_lo),time,a_bc); + } + } #ifdef AMREX_USE_OMP + } else { #pragma omp critical (bndryfunc) -#endif - if (m_func != 0) - m_func(data,AMREX_ARLIM(lo),AMREX_ARLIM(hi),dom_lo,dom_hi,dx,grd_lo,time,a_bc); - else - m_func3D(data,AMREX_ARLIM_3D(lo),AMREX_ARLIM_3D(hi),AMREX_ARLIM_3D(dom_lo),AMREX_ARLIM_3D(dom_hi), - AMREX_ZFILL(dx),AMREX_ZFILL(grd_lo),time,a_bc); + { + if (m_func != 0) { + m_func(data,AMREX_ARLIM(lo),AMREX_ARLIM(hi),dom_lo,dom_hi,dx,grd_lo,time,a_bc); + } else { + m_func3D(data,AMREX_ARLIM_3D(lo),AMREX_ARLIM_3D(hi),AMREX_ARLIM_3D(dom_lo),AMREX_ARLIM_3D(dom_hi), + AMREX_ZFILL(dx),AMREX_ZFILL(grd_lo),time,a_bc); + } + } } +#endif } void @@ -69,23 +77,32 @@ 
StateDescriptor::BndryFunc::operator () (Real* data,const int* lo,const int* hi, { BL_ASSERT(m_gfunc != 0 || m_gfunc3D != 0); + amrex::ignore_unused(ng); +#ifdef AMREX_USE_OMP bool thread_safe = bf_thread_safety(lo, hi, dom_lo, dom_hi, a_bc, ng); if (thread_safe) { - if (m_gfunc != 0) - m_gfunc(data,AMREX_ARLIM(lo),AMREX_ARLIM(hi),dom_lo,dom_hi,dx,grd_lo,time,a_bc); - else - m_gfunc3D(data,AMREX_ARLIM_3D(lo),AMREX_ARLIM_3D(hi),AMREX_ARLIM_3D(dom_lo),AMREX_ARLIM_3D(dom_hi), - AMREX_ZFILL(dx),AMREX_ZFILL(grd_lo),time,a_bc); - } else { +#endif + { + if (m_gfunc != 0) { + m_gfunc(data,AMREX_ARLIM(lo),AMREX_ARLIM(hi),dom_lo,dom_hi,dx,grd_lo,time,a_bc); + } else { + m_gfunc3D(data,AMREX_ARLIM_3D(lo),AMREX_ARLIM_3D(hi),AMREX_ARLIM_3D(dom_lo),AMREX_ARLIM_3D(dom_hi), + AMREX_ZFILL(dx),AMREX_ZFILL(grd_lo),time,a_bc); + } + } #ifdef AMREX_USE_OMP + } else { #pragma omp critical (bndryfunc) -#endif - if (m_gfunc != 0) - m_gfunc(data,AMREX_ARLIM(lo),AMREX_ARLIM(hi),dom_lo,dom_hi,dx,grd_lo,time,a_bc); - else - m_gfunc3D(data,AMREX_ARLIM_3D(lo),AMREX_ARLIM_3D(hi),AMREX_ARLIM_3D(dom_lo),AMREX_ARLIM_3D(dom_hi), - AMREX_ZFILL(dx),AMREX_ZFILL(grd_lo),time,a_bc); + { + if (m_gfunc != 0) { + m_gfunc(data,AMREX_ARLIM(lo),AMREX_ARLIM(hi),dom_lo,dom_hi,dx,grd_lo,time,a_bc); + } else { + m_gfunc3D(data,AMREX_ARLIM_3D(lo),AMREX_ARLIM_3D(hi),AMREX_ARLIM_3D(dom_lo),AMREX_ARLIM_3D(dom_hi), + AMREX_ZFILL(dx),AMREX_ZFILL(grd_lo),time,a_bc); + } + } } +#endif } void diff --git a/Src/AmrCore/AMReX_ErrorList.H b/Src/AmrCore/AMReX_ErrorList.H index 90f49b02749..1cc8d61fd07 100644 --- a/Src/AmrCore/AMReX_ErrorList.H +++ b/Src/AmrCore/AMReX_ErrorList.H @@ -383,6 +383,7 @@ std::ostream& operator << (std::ostream& os, const ErrorList& elst); Real m_min_time = std::numeric_limits::lowest(); Real m_max_time = std::numeric_limits::max(); int m_volume_weighting = 0; + int m_derefine = 0; RealBox m_realbox; AMRErrorTagInfo& SetMaxLevel (int max_level) noexcept { @@ -405,6 +406,10 @@ std::ostream& operator << (std::ostream& os, const ErrorList& elst); m_volume_weighting = volume_weighting; return *this; } + AMRErrorTagInfo& SetDerefine (int derefine) noexcept { + m_derefine = derefine; + return *this; + } }; class AMRErrorTag @@ -415,6 +420,8 @@ std::ostream& operator << (std::ostream& os, const ErrorList& elst); struct UserFunc { + virtual ~UserFunc () {} + virtual void operator() (const amrex::Box& bx, amrex::Array4 const& dat, amrex::Array4 const& tag, @@ -465,6 +472,8 @@ std::ostream& operator << (std::ostream& os, const ErrorList& elst); const AMRErrorTagInfo& info = AMRErrorTagInfo()) noexcept : m_userfunc(userfunc), m_field(field), m_info(info), m_ngrow(ngrow) {} + virtual ~AMRErrorTag () {} + virtual void operator() (amrex::TagBoxArray& tb, const amrex::MultiFab* mf, char clearval, diff --git a/Src/AmrCore/AMReX_ErrorList.cpp b/Src/AmrCore/AMReX_ErrorList.cpp index 1594ba740a9..6dcb5565227 100644 --- a/Src/AmrCore/AMReX_ErrorList.cpp +++ b/Src/AmrCore/AMReX_ErrorList.cpp @@ -293,80 +293,225 @@ AMRErrorTag::operator() (TagBoxArray& tba, auto threshold = m_value[level]; auto const volume_weighting = m_info.m_volume_weighting; auto geomdata = geom.data(); + auto tag_update = tagval; + if (m_info.m_derefine) { + tag_update = clearval; + } + if (m_test == GRAD) { - ParallelFor(tba, [=] AMREX_GPU_DEVICE (int bi, int i, int j, int k) noexcept +#ifdef AMREX_USE_EB + if (mf->hasEBFabFactory()) { + auto const& ebfact = + dynamic_cast(mf->Factory()); + auto const& flags = ebfact.getMultiEBCellFlagFab().arrays(); + ParallelFor(tba, [=] 
AMREX_GPU_DEVICE (int bi, int i, int j, int k) noexcept + { + auto const& dat = datma[bi]; + auto const& flag = flags[bi]; + + Real ax = 0.; Real ay = 0.; + if (flag(i,j,k).isConnected(1,0,0)) { + ax = amrex::max(ax,amrex::Math::abs(dat(i+1,j,k) - dat(i,j,k))); + } + if (flag(i,j,k).isConnected(-1,0,0)) { + ax = amrex::max(ax,amrex::Math::abs(dat(i,j,k) - dat(i-1,j,k))); + } + if (flag(i,j,k).isConnected(0,1,0)) { + ay = amrex::max(ay,amrex::Math::abs(dat(i,j+1,k) - dat(i,j,k))); + } + if (flag(i,j,k).isConnected(0,-1,0)) { + ay = amrex::max(ay,amrex::Math::abs(dat(i,j,k) - dat(i,j-1,k))); + } +#if AMREX_SPACEDIM > 2 + Real az = 0.; + if (flag(i,j,k).isConnected(0,0,1)) { + az = amrex::max(az,amrex::Math::abs(dat(i,j,k+1) - dat(i,j,k))); + } + if (flag(i,j,k).isConnected(0,0,-1)) { + az = amrex::max(az,amrex::Math::abs(dat(i,j,k) - dat(i,j,k-1))); + } +#endif + if (amrex::max(AMREX_D_DECL(ax,ay,az)) >= threshold) { + tagma[bi](i,j,k) = tag_update; + } + }); + } else +#endif { - auto const& dat = datma[bi]; - auto ax = amrex::Math::abs(dat(i+1,j,k) - dat(i,j,k)); - ax = amrex::max(ax,amrex::Math::abs(dat(i,j,k) - dat(i-1,j,k))); + ParallelFor(tba, [=] AMREX_GPU_DEVICE (int bi, int i, int j, int k) noexcept + { + auto const& dat = datma[bi]; + + Real ax = 0.; + ax = amrex::Math::abs(dat(i+1,j,k) - dat(i,j,k)); + ax = amrex::max(ax,amrex::Math::abs(dat(i,j,k) - dat(i-1,j,k))); #if AMREX_SPACEDIM == 1 - if (ax >= threshold) { tagma[bi](i,j,k) = tagval;} + if (ax >= threshold) { tagma[bi](i,j,k) = tag_update;} #else - auto ay = amrex::Math::abs(dat(i,j+1,k) - dat(i,j,k)); - ay = amrex::max(ay,amrex::Math::abs(dat(i,j,k) - dat(i,j-1,k))); + Real ay = 0.; + ay = amrex::Math::abs(dat(i,j+1,k) - dat(i,j,k)); + ay = amrex::max(ay,amrex::Math::abs(dat(i,j,k) - dat(i,j-1,k))); #if AMREX_SPACEDIM > 2 - auto az = amrex::Math::abs(dat(i,j,k+1) - dat(i,j,k)); - az = amrex::max(az,amrex::Math::abs(dat(i,j,k) - dat(i,j,k-1))); -#endif - if (amrex::max(AMREX_D_DECL(ax,ay,az)) >= threshold) { - tagma[bi](i,j,k) = tagval; - } -#endif - }); + Real az = 0.; + az = amrex::Math::abs(dat(i,j,k+1) - dat(i,j,k)); + az = amrex::max(az,amrex::Math::abs(dat(i,j,k) - dat(i,j,k-1))); +#endif // DIM > 2 + if (amrex::max(AMREX_D_DECL(ax,ay,az)) >= threshold) { + tagma[bi](i,j,k) = tag_update; + } +#endif // DIM > 1 + }); + } } else if (m_test == RELGRAD) { - ParallelFor(tba, [=] AMREX_GPU_DEVICE (int bi, int i, int j, int k) noexcept +#ifdef AMREX_USE_EB + if (mf->hasEBFabFactory()) { + auto const& ebfact = + dynamic_cast(mf->Factory()); + auto const& flags = ebfact.getMultiEBCellFlagFab().arrays(); + ParallelFor(tba, [=] AMREX_GPU_DEVICE (int bi, int i, int j, int k) noexcept + { + auto const& dat = datma[bi]; + auto const& flag = flags[bi]; + + Real ax = 0.; Real ay = 0.; + + if (flag(i,j,k).isConnected(1,0,0)) { + ax = amrex::max(ax,amrex::Math::abs(dat(i+1,j,k) - dat(i,j,k))); + } + if (flag(i,j,k).isConnected(-1,0,0)) { + ax = amrex::max(ax,amrex::Math::abs(dat(i,j,k) - dat(i-1,j,k))); + } + if (flag(i,j,k).isConnected(0,1,0)) { + ay = amrex::max(ay,amrex::Math::abs(dat(i,j+1,k) - dat(i,j,k))); + } + if (flag(i,j,k).isConnected(0,-1,0)) { + ay = amrex::max(ay,amrex::Math::abs(dat(i,j,k) - dat(i,j-1,k))); + } +#if AMREX_SPACEDIM > 2 + Real az = 0.; + if (flag(i,j,k).isConnected(0,0,1)) { + az = amrex::max(az,amrex::Math::abs(dat(i,j,k+1) - dat(i,j,k))); + } + if (flag(i,j,k).isConnected(0,0,-1)) { + az = amrex::max(az,amrex::Math::abs(dat(i,j,k) - dat(i,j,k-1))); + } +#endif // DIM > 2 + if 
(amrex::max(AMREX_D_DECL(ax,ay,az)) + >= threshold * amrex::Math::abs(dat(i,j,k))) { + tagma[bi](i,j,k) = tag_update; + } + }); + } else +#endif { - auto const& dat = datma[bi]; - auto ax = amrex::Math::abs(dat(i+1,j,k) - dat(i,j,k)); - ax = amrex::max(ax,amrex::Math::abs(dat(i,j,k) - dat(i-1,j,k))); + ParallelFor(tba, [=] AMREX_GPU_DEVICE (int bi, int i, int j, int k) noexcept + { + auto const& dat = datma[bi]; + + Real ax = amrex::Math::abs(dat(i+1,j,k) - dat(i,j,k)); + ax = amrex::max(ax,amrex::Math::abs(dat(i,j,k) - dat(i-1,j,k))); #if AMREX_SPACEDIM == 1 - if (ax >= threshold * amrex::Math::abs(dat(i,j,k))) { tagma[bi](i,j,k) = tagval;} + if (ax >= threshold * amrex::Math::abs(dat(i,j,k))) { tagma[bi](i,j,k) = tag_update;} #else - auto ay = amrex::Math::abs(dat(i,j+1,k) - dat(i,j,k)); - ay = amrex::max(ay,amrex::Math::abs(dat(i,j,k) - dat(i,j-1,k))); + Real ay = amrex::Math::abs(dat(i,j+1,k) - dat(i,j,k)); + ay = amrex::max(ay,amrex::Math::abs(dat(i,j,k) - dat(i,j-1,k))); #if AMREX_SPACEDIM > 2 - auto az = amrex::Math::abs(dat(i,j,k+1) - dat(i,j,k)); - az = amrex::max(az,amrex::Math::abs(dat(i,j,k) - dat(i,j,k-1))); -#endif - if (amrex::max(AMREX_D_DECL(ax,ay,az)) - >= threshold * amrex::Math::abs(dat(i,j,k))) { - tagma[bi](i,j,k) = tagval; - } -#endif - }); + Real az = amrex::Math::abs(dat(i,j,k+1) - dat(i,j,k)); + az = amrex::max(az,amrex::Math::abs(dat(i,j,k) - dat(i,j,k-1))); +#endif // DIM > 2 + if (amrex::max(AMREX_D_DECL(ax,ay,az)) + >= threshold * amrex::Math::abs(dat(i,j,k))) { + tagma[bi](i,j,k) = tag_update; + } +#endif // DIM > 1 + }); + } } else if (m_test == LESS) { - ParallelFor(tba, [=] AMREX_GPU_DEVICE (int bi, int i, int j, int k) noexcept +#ifdef AMREX_USE_EB + if (mf->hasEBFabFactory()) { + auto const& ebfact = + dynamic_cast(mf->Factory()); + auto const& flags = ebfact.getMultiEBCellFlagFab().arrays(); + ParallelFor(tba, [=] AMREX_GPU_DEVICE (int bi, int i, int j, int k) noexcept + { + Real vol = volume_weighting ? Geometry::Volume(IntVect{AMREX_D_DECL(i,j,k)}, geomdata) : 1.0_rt; + auto const& flag = flags[bi]; + if (!flag(i,j,k).isCovered()) { + if (datma[bi](i,j,k) * vol <= threshold) { + tagma[bi](i,j,k) = tag_update; + } + } + }); + } else +#endif { + ParallelFor(tba, [=] AMREX_GPU_DEVICE (int bi, int i, int j, int k) noexcept { Real vol = volume_weighting ? Geometry::Volume(IntVect{AMREX_D_DECL(i,j,k)}, geomdata) : 1.0_rt; if (datma[bi](i,j,k) * vol <= threshold) { - tagma[bi](i,j,k) = tagval; + tagma[bi](i,j,k) = tag_update; } }); + } } else if (m_test == GREATER) { +#ifdef AMREX_USE_EB + if (mf->hasEBFabFactory()) { + auto const& ebfact = + dynamic_cast(mf->Factory()); + auto const& flags = ebfact.getMultiEBCellFlagFab().arrays(); + ParallelFor(tba, [=] AMREX_GPU_DEVICE (int bi, int i, int j, int k) noexcept + { + Real vol = volume_weighting ? Geometry::Volume(IntVect{AMREX_D_DECL(i,j,k)}, geomdata) : 1.0_rt; + auto const& flag = flags[bi]; + if (!flag(i,j,k).isCovered()) { + if (datma[bi](i,j,k) * vol >= threshold) { + tagma[bi](i,j,k) = tag_update; + } + } + }); + } else +#endif ParallelFor(tba, [=] AMREX_GPU_DEVICE (int bi, int i, int j, int k) noexcept { Real vol = volume_weighting ? 
Geometry::Volume(IntVect{AMREX_D_DECL(i,j,k)}, geomdata) : 1.0_rt; - if (datma[bi](i,j,k) * vol >= threshold) { - tagma[bi](i,j,k) = tagval; - } + if (datma[bi](i,j,k) * vol >= threshold) { + tagma[bi](i,j,k) = tag_update; + } }); } else if (m_test == VORT) { const Real fac = threshold * Real(std::pow(2,level)); - ParallelFor(tba, [=] AMREX_GPU_DEVICE (int bi, int i, int j, int k) noexcept +#ifdef AMREX_USE_EB + if (mf->hasEBFabFactory()) { + auto const& ebfact = + dynamic_cast(mf->Factory()); + auto const& flags = ebfact.getMultiEBCellFlagFab().arrays(); + ParallelFor(tba, [=] AMREX_GPU_DEVICE (int bi, int i, int j, int k) noexcept + { + auto const& flag = flags[bi]; + if (!flag(i,j,k).isCovered()) { + if (datma[bi](i,j,k) >= fac) { + tagma[bi](i,j,k) = tag_update; + } + } + }); + } else +#endif { - if (datma[bi](i,j,k) >= fac) { - tagma[bi](i,j,k) = tagval; - } - }); + ParallelFor(tba, [=] AMREX_GPU_DEVICE (int bi, int i, int j, int k) noexcept + { + if (datma[bi](i,j,k) >= fac) { + tagma[bi](i,j,k) = tag_update; + } + }); + } } else { diff --git a/Src/AmrCore/AMReX_FillPatchUtil.H b/Src/AmrCore/AMReX_FillPatchUtil.H index 51a5f457391..495cbc180b6 100644 --- a/Src/AmrCore/AMReX_FillPatchUtil.H +++ b/Src/AmrCore/AMReX_FillPatchUtil.H @@ -28,12 +28,17 @@ namespace amrex { - template + template struct NullInterpHook { - void operator() (FAB& /*fab*/, const Box& /*bx*/, int /*icomp*/, int /*ncomp*/) const {} + template ::value,int> = 0> + void operator() (MFFAB& /*fab*/, const Box& /*bx*/, int /*icomp*/, int /*ncomp*/) const {} - void operator() (Array /*fab*/, const Box& /*bx*/, int /*icomp*/, int /*ncomp*/) const {} + template ::value,int> = 0> + void operator() (Array /*fab*/, const Box& /*bx*/, int /*icomp*/, int /*ncomp*/) const {} + + template ::value,int> = 0> + void operator() (MFFAB& /*mf*/, int /*icomp*/, int /*ncomp*/) const {} }; template diff --git a/Src/AmrCore/AMReX_FillPatchUtil_I.H b/Src/AmrCore/AMReX_FillPatchUtil_I.H index 8d8f210a0fe..3e94abfad27 100644 --- a/Src/AmrCore/AMReX_FillPatchUtil_I.H +++ b/Src/AmrCore/AMReX_FillPatchUtil_I.H @@ -4,6 +4,31 @@ namespace amrex { +namespace detail { + +template +auto call_interp_hook (F const& f, MF& mf, int icomp, int ncomp) + -> decltype(f(mf[0],Box(),icomp,ncomp)) +{ +#ifdef AMREX_USE_OMP +#pragma omp parallel if (Gpu::notInLaunchRegion()) +#endif + for (MFIter mfi(mf); mfi.isValid(); ++mfi) { + auto& dfab = mf[mfi]; + const Box& dbx = dfab.box(); + f(dfab, dbx, icomp, ncomp); + } +} + +template +auto call_interp_hook (F const& f, MF& mf, int icomp, int ncomp) + -> decltype(f(mf,icomp,ncomp)) +{ + f(mf, icomp, ncomp); +} + +} + template bool ProperlyNested (const IntVect& ratio, const IntVect& blocking_factor, int ngrow, const IndexType& boxType, Interp* mapper) @@ -459,9 +484,6 @@ namespace { if ( ! fpc.ba_crse_patch.empty()) { - - using FAB = typename MF::FABType::value_type; - MF mf_crse_patch = make_mf_crse_patch (fpc, ncomp, mf.boxArray().ixType()); // Must make sure fine exists under needed coarse faces. // It stores values for the final (interior) interpolation, @@ -491,20 +513,12 @@ namespace { solve_mask.setVal(1); // Values to solve. solve_mask.setVal(0, mask_cpc, 0, 1); // Known values. 
- for (MFIter mfi(mf_refined_patch); mfi.isValid(); ++mfi) - { - FAB& sfab = mf_crse_patch[mfi]; - pre_interp(sfab, sfab.box(), 0, ncomp); - } + detail::call_interp_hook(pre_interp, mf_crse_patch, 0, ncomp); InterpFace(mapper, mf_crse_patch, 0, mf_refined_patch, 0, ncomp, ratio, solve_mask, cgeom, fgeom, bcscomp, RunOn::Gpu, bcs); - for (MFIter mfi(mf_refined_patch); mfi.isValid(); ++mfi) - { - FAB& dfab = mf_refined_patch[mfi]; - post_interp(dfab, dfab.box(), 0, ncomp); - } + detail::call_interp_hook(post_interp, mf_refined_patch, 0, ncomp); bool aliasing = false; for (auto const& fmf_a : fmf) { @@ -538,30 +552,14 @@ MF mf_fine_patch = make_mf_fine_patch(fpc, ncomp); -#ifdef AMREX_USE_OMP -#pragma omp parallel if (Gpu::notInLaunchRegion()) -#endif - for (MFIter mfi(mf_crse_patch); mfi.isValid(); ++mfi) - { - auto& sfab = mf_crse_patch[mfi]; - const Box& sbx = sfab.box(); - pre_interp(sfab, sbx, 0, ncomp); - } + detail::call_interp_hook(pre_interp, mf_crse_patch, 0, ncomp); FillPatchInterp(mf_fine_patch, 0, mf_crse_patch, 0, ncomp, IntVect(0), cgeom, fgeom, amrex::grow(amrex::convert(fgeom.Domain(),mf.ixType()),nghost), ratio, mapper, bcs, bcscomp); -#ifdef AMREX_USE_OMP -#pragma omp parallel if (Gpu::notInLaunchRegion()) -#endif - for (MFIter mfi(mf_fine_patch); mfi.isValid(); ++mfi) - { - auto& dfab = mf_fine_patch[mfi]; - const Box& dbx = dfab.box(); - post_interp(dfab, dbx, 0, ncomp); - } + detail::call_interp_hook(post_interp, mf_fine_patch, 0, ncomp); mf.ParallelCopy(mf_fine_patch, 0, dcomp, ncomp, IntVect{0}, nghost); } @@ -1024,14 +1022,7 @@ InterpFromCoarseLevel (MF& mf, IntVect const& nghost, Real time, cbc(mf_crse_patch, 0, ncomp, mf_crse_patch.nGrowVect(), time, cbccomp); -#ifdef AMREX_USE_OMP -#pragma omp parallel if (Gpu::notInLaunchRegion()) -#endif - for (MFIter mfi(mf_crse_patch); mfi.isValid(); ++mfi) - { - FAB& sfab = mf_crse_patch[mfi]; - pre_interp(sfab, sfab.box(), 0, ncomp); - } + detail::call_interp_hook(pre_interp, mf_crse_patch, 0, ncomp); FillPatchInterp(mf, dcomp, mf_crse_patch, 0, ncomp, nghost, cgeom, fgeom, fdomain_g, ratio, mapper, bcs, bcscomp); diff --git a/Src/AmrCore/AMReX_FillPatcher.H b/Src/AmrCore/AMReX_FillPatcher.H new file mode 100644 index 00000000000..d0e775416ee --- /dev/null +++ b/Src/AmrCore/AMReX_FillPatcher.H @@ -0,0 +1,585 @@ +#ifndef AMREX_FILLPATCHER_H_ +#define AMREX_FILLPATCHER_H_ +#include + +#include + +namespace amrex { + +/** + * \brief FillPatcher is for filling a fine level MultiFab/FabArray. + * + * This class is not as general as the FillPatchTwoLevels functions. It + * fills the fine ghost cells not overlapping any fine level valid cells + * with interpolation of the coarse data. Then it fills the fine ghost + * cells overlapping fine level valid cells with the fine level data. If + * the valid cells of the destination need to be filled, it will be done as + * well. Finally, it will fill the physical boundary using the user + * provided functor. The `fill` member function can be used to do the + * operations just described. Alternatively, one can also use the + * `fillCoarseFineBoundary` to fill the ghost cells at the coarse/fine + * boundary only. Then one can manually call FillBoundary to fill the other + * ghost cells, and use the physical BC functor to handle the physical + * boundary. + * + * The communication of the coarse data needed for spatial interpolation is + * optimized at the cost of being error-prone. One must follow these + * guidelines.
+ * + * (1) This class is for filling data during time stepping, not during + * regrid. The fine level data passed as input must have the same BoxArray + * and DistributionMapping as the destination. It's OK if they are the same + * MultiFab. For AmrLevel based codes, AmrLevel::FillPatcherFill will try to + * use FillPatcher if it can, and AmrLevel::FillPatch will use the fillpatch + * functions. + * + * (2) When to build? It is recommended that one uses `std::unique_ptr` to + * store the FillPatcher object, and build it only when it is needed and + * it's a nullptr. For AmrLevel based codes, the AmrLevel class will build + * it for you as needed when you call the AmrLevel::FillPatcherFill + * function. + * + * (3) When to destroy? Usually, we do time stepping on a coarse level + * first. Then we recursively do time stepping on fine levels. After the + * finer level finishes, we do reflux and average the fine data down to the + * coarse level. After that we should destroy the FillPatcher object + * associated with these two levels, because the coarse data stored in the + * object has become outdated. For AmrCore based codes, you could use + * Tests/Amr/Advection_AmrCore as an example. For AmrLevel based codes, you + * should do this in the post_timestep virtual function (see + * Tests/Amr/Advection_AmrLevel for an example). + * + * (4) The source MultiFabs/FabArrays (i.e., the crse_data and fine_data + * arguments of the fill function) need to have exactly the same number of + * components as the ncomp argument of the constructor, even though it's + * allowed to fill only some of the components with the fill function. + * + * (5) This only works for cell-centered and nodal data. + * + * This class also provides support for RungeKutta::RK3 and RungeKutta::RK4. + * The storeRKCoarseData function can be used to store coarse AMR level + * data that are needed for filling fine level data's ghost cells in this + * class. The `fillRK` function can be used to fill ghost cells for fine + * AMR levels. This operation at the coarse/fine boundary is non-trivial + * for RK orders higher than 2. Note that it is expected that time stepping + * on the coarse level is performed before any fine level time stepping, and + * it's the user's responsibility to properly create and destroy this object. + * See AmrLevel::RK for an example of using the RungeKutta functions and + * FillPatcher together. + */ + +template +class FillPatcher +{ +public: + + /** + * \brief Constructor of FillPatcher + * + * \param fba fine level BoxArray + * \param fdm fine level DistributionMapping + * \param fgeom fine level Geometry + * \param cba coarse level BoxArray + * \param cdm coarse level DistributionMapping + * \param cgeom coarse level Geometry + * \param nghost max number of ghost cells to be filled at coarse/fine boundary + * \param ncomp the number of components + * \param interp for spatial interpolation + * \param eb_index_space optional argument for specifying EB IndexSpace + */ + FillPatcher (BoxArray const& fba, DistributionMapping const& fdm, + Geometry const& fgeom, + BoxArray const& cba, DistributionMapping const& cdm, + Geometry const& cgeom, + IntVect const& nghost, int ncomp, InterpBase* interp, +#ifdef AMREX_USE_EB + EB2::IndexSpace const* eb_index_space = EB2::TopIndexSpaceIfPresent()); +#else + EB2::IndexSpace const* eb_index_space = nullptr); +#endif + + /** + * \brief Function to fill data + * + * \param mf destination MultiFab/FabArray + * \param nghost number of ghost cells to fill.
This must be <= what's + * provided to the constructor + * \param time time associated with the destination + * \param crse_data coarse level data + * \param crse_time time associated with the coarse data + * \param fine_data fine level data + * \param fine_time time associated with the fine data + * \param scomp starting component of the source + * \param dcomp starting component of the destination + * \param ncomp the number of components to fill + * \param cbc for filling coarse level physical BC + * \param cbccomp starting component of the coarse level BC functor + * \param fbc for filling fine level physical BC + * \param fbccomp starting component of the fine level BC functor + * \param bcs BCRec specifying physical boundary types + * \param bcscomp starting component of the BCRec Vector. + * \param pre_interp optional pre-interpolation hook for modifying the coarse data + * \param post_interp optional post-interpolation hook for modifying the fine data + */ + template , + typename PostInterpHook=NullInterpHook > + void fill (MF& mf, IntVect const& nghost, Real time, + Vector const& crse_data, Vector const& crse_time, + Vector const& fine_data, Vector const& fine_time, + int scomp, int dcomp, int ncomp, + BC& cbc, int cbccomp, BC& fbc, int fbccomp, + Vector const& bcs, int bcscomp, + PreInterpHook const& pre_interp = {}, + PostInterpHook const& post_interp = {}); + + /** + * \brief Function to fill data at coarse/fine boundary only + * + * \param mf destination MultiFab/FabArray + * \param nghost number of ghost cells to fill. This must be <= what's + * provided to the constructor + * \param time time associated with the destination + * \param crse_data coarse level data + * \param crse_time time associated with the coarse data + * \param scomp starting component of the source + * \param dcomp starting component of the destination + * \param ncomp the number of components to fill + * \param cbc for filling coarse level physical BC + * \param cbccomp starting component of the coarse level BC functor + * \param bcs BCRec specifying physical boundary types + * \param bcscomp starting component of the BCRec Vector. + * \param pre_interp optional pre-interpolation hook for modifying the coarse data + * \param post_interp optional post-interpolation hook for modifying the fine data + */ + template , + typename PostInterpHook=NullInterpHook > + void fillCoarseFineBoundary (MF& mf, IntVect const& nghost, Real time, + Vector const& crse_data, + Vector const& crse_time, + int scomp, int dcomp, int ncomp, + BC& cbc, int cbccomp, + Vector const& bcs, int bcscomp, + PreInterpHook const& pre_interp = {}, + PostInterpHook const& post_interp = {}); + + /** + * \brief Store coarse AMR level data for RK3 and RK4 + * + * \tparam order RK order. Must be 3 or 4. + * \param time time at the beginning of the step + * \param dt time step + * \param S_old data at time + * \param RK_k right-hand side at RK stages + */ + template + void storeRKCoarseData (Real time, Real dt, MF const& S_old, + Array const& RK_k); + + /** + * \brief Fill ghost cells of fine AMR level for RK3 and RK4 + * + * \param stage RK stage number starting from 1 + * \param iteration iteration number on fine level during a coarse time + * step. For an AMR simulation with subcycling and a + * refinement ratio of 2, the number is either 1 or 2, + * denoting the first and second substep, respectively. + * \param ncycle number of subcycling steps. It's usually 2 or 4. + * Without subcycling, this will be 1.
+ * \param cbc filling physical boundary on coarse level + * \param fbc filling physical boundary on fine level + * \param bcs physical BC types + */ + template + void fillRK (int stage, int iteration, int ncycle, MF& mf, Real time, + BC& cbc, BC& fbc, Vector const& bcs); + +private: + + BoxArray m_fba; + BoxArray m_cba; + DistributionMapping m_fdm; + DistributionMapping m_cdm; + Geometry m_fgeom; + Geometry m_cgeom; + IntVect m_nghost; + int m_ncomp; + InterpBase* m_interp; + EB2::IndexSpace const* m_eb_index_space = nullptr; + MF m_sfine; + IntVect m_ratio; + Vector>> m_cf_crse_data; + std::unique_ptr m_cf_crse_data_tmp; + std::unique_ptr m_cf_fine_data; + Real m_dt_coarse = std::numeric_limits::lowest(); + + FabArrayBase::FPinfo const& getFPinfo (); +}; + +template +FillPatcher::FillPatcher (BoxArray const& fba, DistributionMapping const& fdm, + Geometry const& fgeom, + BoxArray const& cba, DistributionMapping const& cdm, + Geometry const& cgeom, + IntVect const& nghost, int ncomp, InterpBase* interp, + EB2::IndexSpace const* eb_index_space) + : m_fba(fba), + m_cba(cba), + m_fdm(fdm), + m_cdm(cdm), + m_fgeom(fgeom), + m_cgeom(cgeom), + m_nghost(nghost), + m_ncomp(ncomp), + m_interp(interp), + m_eb_index_space(eb_index_space), + m_sfine(fba, fdm, 1, nghost, MFInfo().SetAlloc(false)) +{ + static_assert(IsFabArray::value, + "FillPatcher: MF must be FabArray type"); + AMREX_ALWAYS_ASSERT(m_fba.ixType().cellCentered() || m_fba.ixType().nodeCentered()); + + for (int idim = 0; idim < AMREX_SPACEDIM; ++idim) { + m_ratio[idim] = m_fgeom.Domain().length(idim) / m_cgeom.Domain().length(idim); + } + AMREX_ASSERT(m_fgeom.Domain() == amrex::refine(m_cgeom.Domain(),m_ratio)); +} + +template +template +void +FillPatcher::fill (MF& mf, IntVect const& nghost, Real time, + Vector const& cmf, Vector const& ct, + Vector const& fmf, Vector const& ft, + int scomp, int dcomp, int ncomp, + BC& cbc, int cbccomp, + BC& fbc, int fbccomp, + Vector const& bcs, int bcscomp, + PreInterpHook const& pre_interp, + PostInterpHook const& post_interp) +{ + BL_PROFILE("FillPatcher::fill()"); + + AMREX_ALWAYS_ASSERT(m_fba == fmf[0]->boxArray() && + m_fdm == fmf[0]->DistributionMap()); + + fillCoarseFineBoundary(mf, nghost, time, cmf, ct, scomp, dcomp, ncomp, + cbc, cbccomp, bcs, bcscomp, pre_interp, post_interp); + + FillPatchSingleLevel(mf, nghost, time, fmf, ft, scomp, dcomp, ncomp, + m_fgeom, fbc, fbccomp); +} + +template +FabArrayBase::FPinfo const& +FillPatcher::getFPinfo () +{ + const InterpolaterBoxCoarsener& coarsener = m_interp->BoxCoarsener(m_ratio); + return FabArrayBase::TheFPinfo(m_sfine, m_sfine, m_nghost, coarsener, + m_fgeom, m_cgeom, m_eb_index_space); +} + +template +template +void +FillPatcher::fillCoarseFineBoundary (MF& mf, IntVect const& nghost, Real time, + Vector const& cmf, + Vector const& ct, + int scomp, int dcomp, int ncomp, + BC& cbc, int cbccomp, + Vector const& bcs, int bcscomp, + PreInterpHook const& pre_interp, + PostInterpHook const& post_interp) +{ + BL_PROFILE("FillPatcher::fillCFB"); + + AMREX_ALWAYS_ASSERT(nghost.allLE(m_nghost) && + m_fba == mf.boxArray() && + m_fdm == mf.DistributionMap() && + m_cba == cmf[0]->boxArray() && + m_cdm == cmf[0]->DistributionMap() && + m_ncomp >= ncomp && + m_ncomp == cmf[0]->nComp()); + + auto const& fpc = getFPinfo(); + + if ( ! 
fpc.ba_crse_patch.empty()) + { + if (m_cf_fine_data == nullptr) { + m_cf_fine_data = std::make_unique + (make_mf_fine_patch(fpc, m_ncomp)); + } + + int ncmfs = cmf.size(); + for (int icmf = 0; icmf < ncmfs; ++icmf) { + Real t = ct[icmf]; + auto it = std::find_if(m_cf_crse_data.begin(), m_cf_crse_data.end(), + [=] (auto const& x) { + return amrex::almostEqual(x.first,t,5); + }); + + if (it == std::end(m_cf_crse_data)) { + MF mf_crse_patch = make_mf_crse_patch(fpc, m_ncomp); + mf_crse_patch.ParallelCopy(*cmf[icmf], m_cgeom.periodicity()); + + std::pair> tmp; + tmp.first = t; + tmp.second = std::make_unique(std::move(mf_crse_patch)); + m_cf_crse_data.push_back(std::move(tmp)); + } + } + + if (m_cf_crse_data_tmp == nullptr) { + m_cf_crse_data_tmp = std::make_unique + (make_mf_crse_patch(fpc, m_ncomp)); + } + + if (m_cf_crse_data.size() > 0 && + amrex::almostEqual(time, m_cf_crse_data[0].first,5)) + { + amrex::Copy(*m_cf_crse_data_tmp, *m_cf_crse_data[0].second, + scomp, 0, ncomp, 0); + } + else if (m_cf_crse_data.size() > 1 && + amrex::almostEqual(time, m_cf_crse_data[1].first,5)) + { + amrex::Copy(*m_cf_crse_data_tmp, *m_cf_crse_data[1].second, + scomp, 0, ncomp, 0); + } + else if (m_cf_crse_data.size() == 2) + { + int const ng_space_interp = 8; // Need to be big enough + Box domain = m_cgeom.growPeriodicDomain(ng_space_interp); + domain.convert(mf.ixType()); + Real t0 = m_cf_crse_data[0].first; + Real t1 = m_cf_crse_data[1].first; + Real alpha = (t1-time)/(t1-t0); + Real beta = (time-t0)/(t1-t0); + AMREX_ASSERT(alpha >= 0._rt && beta >= 0._rt); + auto const& a = m_cf_crse_data_tmp->arrays(); + auto const& a0 = m_cf_crse_data[0].second->const_arrays(); + auto const& a1 = m_cf_crse_data[1].second->const_arrays(); + amrex::ParallelFor(*m_cf_crse_data_tmp, IntVect(0), ncomp, + [=] AMREX_GPU_DEVICE (int bi, int i, int j, int k, int n) noexcept + { + if (domain.contains(i,j,k)) { + a[bi](i,j,k,n) + = alpha*a0[bi](i,j,k,scomp+n) + + beta*a1[bi](i,j,k,scomp+n); + } + }); + Gpu::streamSynchronize(); + } + else + { + amrex::Abort("FillPatcher: High order interpolation in time not supported. 
Or FillPatcher was not properly deleted."); + } + + cbc(*m_cf_crse_data_tmp, 0, ncomp, nghost, time, cbccomp); + + detail::call_interp_hook(pre_interp, *m_cf_crse_data_tmp, 0, ncomp); + + FillPatchInterp(*m_cf_fine_data, scomp, *m_cf_crse_data_tmp, 0, + ncomp, IntVect(0), m_cgeom, m_fgeom, + amrex::grow(amrex::convert(m_fgeom.Domain(), + mf.ixType()),nghost), + m_ratio, m_interp, bcs, bcscomp); + + detail::call_interp_hook(post_interp, *m_cf_fine_data, scomp, ncomp); + + mf.ParallelCopy(*m_cf_fine_data, scomp, dcomp, ncomp, IntVect{0}, nghost); + } +} + +template +template +void FillPatcher::storeRKCoarseData (Real /*time*/, Real dt, MF const& S_old, + Array const& RK_k) +{ + m_dt_coarse = dt; + m_cf_crse_data.resize(order+1); + + auto const& fpc = getFPinfo(); + + for (auto& tmf : m_cf_crse_data) { + tmf.first = std::numeric_limits::lowest(); // because we don't need it + tmf.second = std::make_unique(make_mf_crse_patch(fpc, m_ncomp)); + } + m_cf_crse_data[0].second->ParallelCopy(S_old, m_cgeom.periodicity()); + for (std::size_t i = 0; i < order; ++i) { + m_cf_crse_data[i+1].second->ParallelCopy(RK_k[i], m_cgeom.periodicity()); + } +} + +template +template +void FillPatcher::fillRK (int stage, int iteration, int ncycle, + MF& mf, Real time, BC& cbc, BC& fbc, + Vector const& bcs) +{ + int rk_order = m_cf_crse_data.size()-1; + if (rk_order != 3 && rk_order != 4) { + amrex::Abort("FillPatcher: unsupported RK order "+std::to_string(rk_order)); + return; + } + AMREX_ASSERT(stage > 0 && stage <= rk_order); + + auto const& fpc = getFPinfo(); + if (m_cf_crse_data_tmp == nullptr) { + m_cf_crse_data_tmp = std::make_unique + (make_mf_crse_patch(fpc, m_ncomp)); + } + + auto const& u = m_cf_crse_data_tmp->arrays(); + auto const& u0 = m_cf_crse_data[0].second->const_arrays(); + auto const& k1 = m_cf_crse_data[1].second->const_arrays(); + auto const& k2 = m_cf_crse_data[2].second->const_arrays(); + auto const& k3 = m_cf_crse_data[3].second->const_arrays(); + + Real dtc = m_dt_coarse; + Real r = Real(1) / Real(ncycle); + Real xsi = Real(iteration-1) / Real(ncycle); + + if (rk_order == 3) { + // coefficients for U + Real b1 = xsi - Real(5./6.)*xsi*xsi; + Real b2 = Real(1./6.)*xsi*xsi; + Real b3 = Real(2./3)*xsi*xsi; + // coefficients for Ut + Real c1 = Real(1.)
- Real(5./3.)*xsi; + Real c2 = Real(1./3.)*xsi; + Real c3 = Real(4./3.)*xsi; + // coefficients for Utt + constexpr Real d1 = Real(-5./3.); + constexpr Real d2 = Real(1./3.); + constexpr Real d3 = Real(4./3.); + if (stage == 1) { + amrex::ParallelFor(*m_cf_crse_data_tmp, IntVect(0), m_ncomp, + [=] AMREX_GPU_DEVICE (int bi, int i, int j, int k, int n) noexcept + { + Real kk1 = k1[bi](i,j,k,n); + Real kk2 = k2[bi](i,j,k,n); + Real kk3 = k3[bi](i,j,k,n); + Real uu = b1*kk1 + b2*kk2 + b3*kk3; + u[bi](i,j,k,n) = u0[bi](i,j,k,n) + dtc*uu; + }); + } else if (stage == 2) { + amrex::ParallelFor(*m_cf_crse_data_tmp, IntVect(0), m_ncomp, + [=] AMREX_GPU_DEVICE (int bi, int i, int j, int k, int n) noexcept + { + Real kk1 = k1[bi](i,j,k,n); + Real kk2 = k2[bi](i,j,k,n); + Real kk3 = k3[bi](i,j,k,n); + Real uu = b1*kk1 + b2*kk2 + b3*kk3; + Real ut = c1*kk1 + c2*kk2 + c3*kk3; + u[bi](i,j,k,n) = u0[bi](i,j,k,n) + dtc*(uu + r*ut); + }); + } else if (stage == 3) { + amrex::ParallelFor(*m_cf_crse_data_tmp, IntVect(0), m_ncomp, + [=] AMREX_GPU_DEVICE (int bi, int i, int j, int k, int n) noexcept + { + Real kk1 = k1[bi](i,j,k,n); + Real kk2 = k2[bi](i,j,k,n); + Real kk3 = k3[bi](i,j,k,n); + Real uu = b1*kk1 + b2*kk2 + b3*kk3; + Real ut = c1*kk1 + c2*kk2 + c3*kk3; + Real utt = d1*kk1 + d2*kk2 + d3*kk3; + u[bi](i,j,k,n) = u0[bi](i,j,k,n) + dtc* + (uu + Real(0.5)*r*ut + Real(0.25)*r*r*utt); + }); + } + } else if (rk_order == 4) { + auto const& k4 = m_cf_crse_data[4].second->const_arrays(); + Real xsi2 = xsi*xsi; + Real xsi3 = xsi2*xsi; + // coefficients for U + Real b1 = xsi - Real(1.5)*xsi2 + Real(2./3.)*xsi3; + Real b2 = xsi2 - Real(2./3.)*xsi3; + Real b3 = b2; + Real b4 = Real(-0.5)*xsi2 + Real(2./3.)*xsi3; + // coefficients for Ut + Real c1 = Real(1.) - Real(3.)*xsi + Real(2.)*xsi2; + Real c2 = Real(2.)*xsi - Real(2.)*xsi2; + Real c3 = c2; + Real c4 = -xsi + Real(2.)*xsi2; + // coefficients for Utt + Real d1 = Real(-3.) + Real(4.)*xsi; + Real d2 = Real( 2.) - Real(4.)*xsi; + Real d3 = d2; + Real d4 = Real(-1.) + Real(4.)*xsi; + // coefficients for Uttt + constexpr Real e1 = Real( 4.); + constexpr Real e2 = Real(-4.); + constexpr Real e3 = Real(-4.); + constexpr Real e4 = Real( 4.); + if (stage == 1) { + amrex::ParallelFor(*m_cf_crse_data_tmp, IntVect(0), m_ncomp, + [=] AMREX_GPU_DEVICE (int bi, int i, int j, int k, int n) noexcept + { + Real kk1 = k1[bi](i,j,k,n); + Real kk2 = k2[bi](i,j,k,n); + Real kk3 = k3[bi](i,j,k,n); + Real kk4 = k4[bi](i,j,k,n); + Real uu = b1*kk1 + b2*kk2 + b3*kk3 + b4*kk4; + u[bi](i,j,k,n) = u0[bi](i,j,k,n) + dtc*uu; + }); + } else if (stage == 2) { + amrex::ParallelFor(*m_cf_crse_data_tmp, IntVect(0), m_ncomp, + [=] AMREX_GPU_DEVICE (int bi, int i, int j, int k, int n) noexcept + { + Real kk1 = k1[bi](i,j,k,n); + Real kk2 = k2[bi](i,j,k,n); + Real kk3 = k3[bi](i,j,k,n); + Real kk4 = k4[bi](i,j,k,n); + Real uu = b1*kk1 + b2*kk2 + b3*kk3 + b4*kk4; + Real ut = c1*kk1 + c2*kk2 + c3*kk3 + c4*kk4; + u[bi](i,j,k,n) = u0[bi](i,j,k,n) + dtc*(uu + Real(0.5)*r*ut); + }); + } else if (stage == 3 || stage == 4) { + Real r2 = r*r; + Real r3 = r2*r; + Real at = (stage == 3) ? Real(0.5)*r : r; + Real att = (stage == 3) ? Real(0.25)*r2 : Real(0.5)*r2; + Real attt = (stage == 3) ? Real(0.0625)*r3 : Real(0.125)*r3; + Real akk = (stage == 3) ? Real(-4.) 
: Real(4.); + amrex::ParallelFor(*m_cf_crse_data_tmp, IntVect(0), m_ncomp, + [=] AMREX_GPU_DEVICE (int bi, int i, int j, int k, int n) noexcept + { + Real kk1 = k1[bi](i,j,k,n); + Real kk2 = k2[bi](i,j,k,n); + Real kk3 = k3[bi](i,j,k,n); + Real kk4 = k4[bi](i,j,k,n); + Real uu = b1*kk1 + b2*kk2 + b3*kk3 + b4*kk4; + Real ut = c1*kk1 + c2*kk2 + c3*kk3 + c4*kk4; + Real utt = d1*kk1 + d2*kk2 + d3*kk3 + d4*kk4; + Real uttt = e1*kk1 + e2*kk2 + e3*kk3 + e4*kk4; + u[bi](i,j,k,n) = u0[bi](i,j,k,n) + dtc * + (uu + at*ut + att*utt + attt*(uttt+akk*(kk3-kk2))); + }); + } + } + Gpu::streamSynchronize(); + + cbc(*m_cf_crse_data_tmp, 0, m_ncomp, m_nghost, time, 0); + + if (m_cf_fine_data == nullptr) { + m_cf_fine_data = std::make_unique(make_mf_fine_patch(fpc, m_ncomp)); + } + + FillPatchInterp(*m_cf_fine_data, 0, *m_cf_crse_data_tmp, 0, + m_ncomp, IntVect(0), m_cgeom, m_fgeom, + amrex::grow(amrex::convert(m_fgeom.Domain(), + mf.ixType()),m_nghost), + m_ratio, m_interp, bcs, 0); + + // xxxxx We can optimize away this ParallelCopy by making a special fpinfo. + mf.ParallelCopy(*m_cf_fine_data, 0, 0, m_ncomp, IntVect(0), m_nghost); + + mf.FillBoundary(m_fgeom.periodicity()); + fbc(mf, 0, m_ncomp, m_nghost, time, 0); +} + +} + +#endif diff --git a/Src/AmrCore/AMReX_Interp_C.H b/Src/AmrCore/AMReX_Interp_C.H index e12c4495fde..967d3aaa177 100644 --- a/Src/AmrCore/AMReX_Interp_C.H +++ b/Src/AmrCore/AMReX_Interp_C.H @@ -135,5 +135,53 @@ face_linear_interp_z (int i, int j, int k, int n, amrex::Array4 con } } +AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE +void cell_quartic_interp_x (int i, int j, int k, int n, Array4 const& fine, + Array4 const& crse) noexcept +{ + constexpr Array1D c = {Real(0.01708984), Real(-0.12304688), + Real(0.92285156), Real(0.20507812), + Real(-0.02197266)}; + int ii = amrex::coarsen(i,2); + int s = 2*(i-ii*2) - 1; // if i == ii*2, s = -1; if i == ii*2+1, s = 1; + fine(i,j,k,n) = c(-2*s)*crse(ii-2,j,k,n) + + c( -s)*crse(ii-1,j,k,n) + + c( 0)*crse(ii ,j,k,n) + + c( s)*crse(ii+1,j,k,n) + + c( 2*s)*crse(ii+2,j,k,n); +} + +AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE +void cell_quartic_interp_y (int i, int j, int k, int n, Array4 const& fine, + Array4 const& crse) noexcept +{ + constexpr Array1D c = {Real(0.01708984), Real(-0.12304688), + Real(0.92285156), Real(0.20507812), + Real(-0.02197266)}; + int jj = amrex::coarsen(j,2); + int s = 2*(j-jj*2) - 1; // if j == jj*2, s = -1; if j == jj*2+1, s = 1; + fine(i,j,k,n) = c(-2*s)*crse(i,jj-2,k,n) + + c( -s)*crse(i,jj-1,k,n) + + c( 0)*crse(i,jj ,k,n) + + c( s)*crse(i,jj+1,k,n) + + c( 2*s)*crse(i,jj+2,k,n); +} + +AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE +void cell_quartic_interp_z (int i, int j, int k, int n, Array4 const& fine, + Array4 const& crse) noexcept +{ + constexpr Array1D c = {Real(0.01708984), Real(-0.12304688), + Real(0.92285156), Real(0.20507812), + Real(-0.02197266)}; + int kk = amrex::coarsen(k,2); + int s = 2*(k-kk*2) - 1; // if k == kk*2, s = -1; if k == kk*2+1, s = 1; + fine(i,j,k,n) = c(-2*s)*crse(i,j,kk-2,n) + + c( -s)*crse(i,j,kk-1,n) + + c( 0)*crse(i,j,kk ,n) + + c( s)*crse(i,j,kk+1,n) + + c( 2*s)*crse(i,j,kk+2,n); +} + } #endif diff --git a/Src/AmrCore/AMReX_Interpolater.H b/Src/AmrCore/AMReX_Interpolater.H index 06398b73097..bdb6cf9d46b 100644 --- a/Src/AmrCore/AMReX_Interpolater.H +++ b/Src/AmrCore/AMReX_Interpolater.H @@ -844,6 +844,74 @@ public: }; +/** +* \brief Quartic interpolation on cell centered data. +* +* Quartic interpolation on cell centered data. 
+*/ + +class CellQuartic + : + public Interpolater +{ +public: + + /** + * \brief The constructor. + */ + explicit CellQuartic (); + + /** + * \brief The destructor. + */ + virtual ~CellQuartic () override; + + /** + * \brief Returns coarsened box given fine box and refinement ratio. + * + * \param fine + * \param ratio + */ + virtual Box CoarseBox (const Box& fine, int ratio) override; + + /** + * \brief Returns coarsened box given fine box and refinement ratio. + * + * \param fine + * \param ratio + */ + virtual Box CoarseBox (const Box& fine, const IntVect& ratio) override; + + /** + * \brief Coarse to fine interpolation in space. + * + * \param crse + * \param crse_comp + * \param fine + * \param fine_comp + * \param ncomp + * \param fine_region + * \param ratio + * \param crse_geom + * \param fine_geom + * \param bcr + * \param actual_comp + * \param actual_state + */ + virtual void interp (const FArrayBox& crse, + int crse_comp, + FArrayBox& fine, + int fine_comp, + int ncomp, + const Box& fine_region, + const IntVect& ratio, + const Geometry& crse_geom, + const Geometry& fine_geom, + Vector const& bcr, + int actual_comp, + int actual_state, + RunOn gpu_or_cpu) override; +}; //! CONSTRUCT A GLOBAL OBJECT OF EACH VERSION. extern AMREX_EXPORT PCInterp pc_interp; @@ -856,6 +924,7 @@ extern AMREX_EXPORT CellBilinear cell_bilinear_interp; extern AMREX_EXPORT CellConservativeProtected protected_interp; extern AMREX_EXPORT CellConservativeQuartic quartic_interp; extern AMREX_EXPORT CellQuadratic quadratic_interp; +extern AMREX_EXPORT CellQuartic cell_quartic_interp; } diff --git a/Src/AmrCore/AMReX_Interpolater.cpp b/Src/AmrCore/AMReX_Interpolater.cpp index a78eac89aa0..8042aa2f322 100644 --- a/Src/AmrCore/AMReX_Interpolater.cpp +++ b/Src/AmrCore/AMReX_Interpolater.cpp @@ -18,6 +18,8 @@ namespace amrex { * * CellQuadratic only works in 2D and 3D on cpu and gpu. * + * CellQuartic works in 1D, 2D and 3D on cpu and gpu with ref ratio of 2 + * * CellConservativeQuartic only works with ref ratio of 2 on cpu and gpu. * * FaceDivFree works in 2D and 3D on cpu and gpu. 
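The five coefficients hard-coded in cell_quartic_interp_{x,y,z} above are rounded forms of the exact dyadic weights {35, -252, 1890, 420, -45}/2048: for a refinement ratio of 2, the two fine cell centers sit at -1/4 and +1/4 of a coarse cell width from the coarse cell center, and these are the degree-4 Lagrange weights evaluated at those two points. The following standalone check is not part of the patch (all names in it are illustrative); it just reproduces the table:

```cpp
#include <cstdio>

// Degree-4 Lagrange weight for node xs[j] among five nodes, evaluated at x.
double lagrange_weight (const double* xs, int j, double x)
{
    double w = 1.0;
    for (int m = 0; m < 5; ++m) {
        if (m != j) { w *= (x - xs[m]) / (xs[j] - xs[m]); }
    }
    return w;
}

int main ()
{
    // Coarse cell centers ii-2 ... ii+2, in units of the coarse cell width.
    const double xs[5] = {-2.0, -1.0, 0.0, 1.0, 2.0};
    // With ratio 2, the fine cell centers are at -1/4 (s = -1 in the
    // kernels) and +1/4 (s = +1) relative to the coarse cell center.
    for (double x : {-0.25, 0.25}) {
        std::printf("x = %+5.2f:", x);
        for (int j = 0; j < 5; ++j) {
            std::printf(" % .8f", lagrange_weight(xs, j, x));
        }
        std::printf("\n");
    }
    // The x = +0.25 row prints 0.01708984 -0.12304688 0.92285156
    // 0.20507812 -0.02197266, i.e. c(-2)..c(2) applied to
    // crse(ii-2)..crse(ii+2); the x = -0.25 row is its mirror image.
    return 0;
}
```

Because the weights are applied dimension by dimension through the tmpz/tmpy staging FArrayBoxes, CellQuartic::interp realizes the full tensor-product quartic interpolant, which is also why its CoarseBox grows the coarsened box by two cells in every direction.
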
@@ -37,6 +39,7 @@ CellConservativeProtected protected_interp; CellConservativeQuartic quartic_interp; CellBilinear cell_bilinear_interp; CellQuadratic quadratic_interp; +CellQuartic cell_quartic_interp; NodeBilinear::~NodeBilinear () {} @@ -988,4 +991,94 @@ FaceDivFree::interp_arr (Array const& crse, }); } +CellQuartic::CellQuartic () {} + +CellQuartic::~CellQuartic () {} + +Box +CellQuartic::CoarseBox (const Box& fine, const IntVect& ratio) +{ + Box crse = amrex::coarsen(fine,ratio); + crse.grow(2); + return crse; +} + +Box +CellQuartic::CoarseBox (const Box& fine, int ratio) +{ + Box crse = amrex::coarsen(fine,ratio); + crse.grow(2); + return crse; +} + +void +CellQuartic::interp (const FArrayBox& crse, + int crse_comp, + FArrayBox& fine, + int fine_comp, + int ncomp, + const Box& fine_region, + const IntVect& ratio, + const Geometry& /*crse_geom*/, + const Geometry& /*fine_geom*/, + Vector const& /*bcr*/, + int /* actual_comp */, + int /* actual_state */, + RunOn runon) +{ + BL_PROFILE("CellQuartic::interp()"); + amrex::ignore_unused(ratio); + AMREX_ASSERT(ratio == 2); + + Box target_fine_region = fine_region & fine.box(); + + bool run_on_gpu = (runon == RunOn::Gpu && Gpu::inLaunchRegion()); + amrex::ignore_unused(run_on_gpu); + + Array4 const& crsearr = crse.const_array(crse_comp); + Array4 const& finearr = fine.array(fine_comp); + +#if (AMREX_SPACEDIM == 3) + Box bz = amrex::coarsen(target_fine_region, IntVect(2,2,1)); + bz.grow(IntVect(2,2,0)); + FArrayBox tmpz(bz, ncomp); + Elixir tmpz_eli; + if (run_on_gpu) tmpz_eli = tmpz.elixir(); + Array4 const& tmpzarr = tmpz.array(); + AMREX_HOST_DEVICE_PARALLEL_FOR_4D_FLAG(runon, bz, ncomp, i, j, k, n, + { + cell_quartic_interp_z(i,j,k,n,tmpzarr,crsearr); + }); +#endif + +#if (AMREX_SPACEDIM >= 2) + Box by = amrex::coarsen(target_fine_region, IntVect(AMREX_D_DECL(2,1,1))); + by.grow(IntVect(AMREX_D_DECL(2,0,0))); + FArrayBox tmpy(by, ncomp); + Elixir tmpy_eli; + if (run_on_gpu) tmpy_eli = tmpy.elixir(); + Array4 const& tmpyarr = tmpy.array(); +#if (AMREX_SPACEDIM == 2) + Array4 srcarr = crsearr; +#else + Array4 srcarr = tmpz.const_array(); +#endif + AMREX_HOST_DEVICE_PARALLEL_FOR_4D_FLAG(runon, by, ncomp, i, j, k, n, + { + cell_quartic_interp_y(i,j,k,n,tmpyarr,srcarr); + }); +#endif + +#if (AMREX_SPACEDIM == 1) + Array4 srcarr = crsearr; +#else + srcarr = tmpy.const_array(); +#endif + AMREX_HOST_DEVICE_PARALLEL_FOR_4D_FLAG(runon, target_fine_region, ncomp, + i, j, k, n, + { + cell_quartic_interp_x(i,j,k,n,finearr,srcarr); + }); +} + } diff --git a/Src/AmrCore/AMReX_MFInterp_1D_C.H b/Src/AmrCore/AMReX_MFInterp_1D_C.H index 37751acc3b9..8fcadec5794 100644 --- a/Src/AmrCore/AMReX_MFInterp_1D_C.H +++ b/Src/AmrCore/AMReX_MFInterp_1D_C.H @@ -149,9 +149,10 @@ void mf_cell_cons_lin_interp_sph (int i, int ns, Array4 const& fine, int f + xoff * slope(ic,0,0,ns); } +template AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE -void mf_cell_bilin_interp (int i, int, int, int n, Array4 const& fine, int fcomp, - Array4 const& crse, int ccomp, IntVect const& ratio) noexcept +void mf_cell_bilin_interp (int i, int, int, int n, Array4 const& fine, int fcomp, + Array4 const& crse, int ccomp, IntVect const& ratio) noexcept { int ic = amrex::coarsen(i,ratio[0]); int ioff = i - ic*ratio[0]; diff --git a/Src/AmrCore/AMReX_MFInterp_2D_C.H b/Src/AmrCore/AMReX_MFInterp_2D_C.H index c505ef2655c..e02084e2e8e 100644 --- a/Src/AmrCore/AMReX_MFInterp_2D_C.H +++ b/Src/AmrCore/AMReX_MFInterp_2D_C.H @@ -189,9 +189,10 @@ void mf_cell_cons_lin_interp_rz (int i, int j, int ns, Array4 
const& fine, + yoff * slope(ic,jc,0,ns+ncomp); } +template AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE -void mf_cell_bilin_interp (int i, int j, int, int n, Array4 const& fine, int fcomp, - Array4 const& crse, int ccomp, IntVect const& ratio) noexcept +void mf_cell_bilin_interp (int i, int j, int, int n, Array4 const& fine, int fcomp, + Array4 const& crse, int ccomp, IntVect const& ratio) noexcept { int ic = amrex::coarsen(i,ratio[0]); int jc = amrex::coarsen(j,ratio[1]); diff --git a/Src/AmrCore/AMReX_MFInterp_3D_C.H b/Src/AmrCore/AMReX_MFInterp_3D_C.H index dc0da5dba40..17d14ff689b 100644 --- a/Src/AmrCore/AMReX_MFInterp_3D_C.H +++ b/Src/AmrCore/AMReX_MFInterp_3D_C.H @@ -128,9 +128,10 @@ void mf_cell_cons_lin_interp (int i, int j, int k, int ns, Array4 const& f + zoff * slope(ic,jc,kc,ns+ncomp*2); } +template AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE -void mf_cell_bilin_interp (int i, int j, int k, int n, Array4 const& fine, int fcomp, - Array4 const& crse, int ccomp, IntVect const& ratio) noexcept +void mf_cell_bilin_interp (int i, int j, int k, int n, Array4 const& fine, int fcomp, + Array4 const& crse, int ccomp, IntVect const& ratio) noexcept { int ic = amrex::coarsen(i,ratio[0]); int jc = amrex::coarsen(j,ratio[1]); diff --git a/Src/AmrCore/AMReX_TagBox.cpp b/Src/AmrCore/AMReX_TagBox.cpp index 6a989ffbbf1..3ec7425e283 100644 --- a/Src/AmrCore/AMReX_TagBox.cpp +++ b/Src/AmrCore/AMReX_TagBox.cpp @@ -441,7 +441,7 @@ TagBoxArray::local_collate_gpu (Gpu::PinnedVector& v) const std::partial_sum(nblocks.begin(), nblocks.end(), blockoffset.begin()+1); int ntotblocks = blockoffset.back(); - PODVector > dv_ntags(ntotblocks); + Gpu::NonManagedDeviceVector dv_ntags(ntotblocks); for (MFIter fai(*this); fai.isValid(); ++fai) { @@ -491,21 +491,21 @@ TagBoxArray::local_collate_gpu (Gpu::PinnedVector& v) const #endif } - PODVector > hv_ntags(ntotblocks); + Gpu::PinnedVector hv_ntags(ntotblocks); Gpu::dtoh_memcpy(hv_ntags.data(), dv_ntags.data(), ntotblocks*sizeof(int)); - PODVector > hv_tags_offset(ntotblocks+1); + Gpu::PinnedVector hv_tags_offset(ntotblocks+1); hv_tags_offset[0] = 0; std::partial_sum(hv_ntags.begin(), hv_ntags.end(), hv_tags_offset.begin()+1); int ntotaltags = hv_tags_offset.back(); if (ntotaltags == 0) return; - PODVector > dv_tags_offset(ntotblocks); + Gpu::NonManagedDeviceVector dv_tags_offset(ntotblocks); int* dp_tags_offset = dv_tags_offset.data(); Gpu::htod_memcpy_async(dp_tags_offset, hv_tags_offset.data(), ntotblocks*sizeof(int)); - PODVector > dv_tags(ntotaltags); + Gpu::NonManagedDeviceVector dv_tags(ntotaltags); IntVect* dp_tags = dv_tags.data(); int iblock = 0; @@ -649,7 +649,24 @@ TagBoxArray::collate (Gpu::PinnedVector& TheGlobalCollateSpace) const // const IntVect* psend = (count > 0) ? TheLocalCollateSpace.data() : nullptr; IntVect* precv = TheGlobalCollateSpace.data(); + + // Issues have been observed with the following call at very large scale when using + // FujitsuMPI. The issue seems to be related to the use of MPI_Datatype. We can + // bypass the issue by exchanging simpler integer arrays.
+#if !(defined(__FUJITSU) || defined(__CLANG_FUJITSU)) ParallelDescriptor::Gatherv(psend, count, precv, countvec, offset, IOProcNumber); +#else + const int* psend_int = psend->begin(); + int* precv_int = precv->begin(); + Long count_int = count * AMREX_SPACEDIM; + auto countvec_int = std::vector(countvec.size()); + auto offset_int = std::vector(offset.size()); + const auto mul_funct = [](const auto el){return el*AMREX_SPACEDIM;}; + std::transform(countvec.begin(), countvec.end(), countvec_int.begin(), mul_funct); + std::transform(offset.begin(), offset.end(), offset_int.begin(), mul_funct); + ParallelDescriptor::Gatherv( + psend_int, count_int, precv_int, countvec_int, offset_int, IOProcNumber); +#endif #else TheGlobalCollateSpace = std::move(TheLocalCollateSpace); diff --git a/Src/AmrCore/CMakeLists.txt b/Src/AmrCore/CMakeLists.txt index f9ff24f243b..be7c87eee4f 100644 --- a/Src/AmrCore/CMakeLists.txt +++ b/Src/AmrCore/CMakeLists.txt @@ -12,6 +12,7 @@ target_sources(amrex AMReX_FluxRegister.cpp AMReX_FillPatchUtil.H AMReX_FillPatchUtil_I.H + AMReX_FillPatcher.H AMReX_FluxRegister.H AMReX_InterpBase.H AMReX_InterpBase.cpp diff --git a/Src/AmrCore/Make.package b/Src/AmrCore/Make.package index 5b3afa61ccb..df3c2e83d40 100644 --- a/Src/AmrCore/Make.package +++ b/Src/AmrCore/Make.package @@ -6,6 +6,8 @@ CEXE_sources += AMReX_AmrCore.cpp AMReX_Cluster.cpp AMReX_ErrorList.cpp AMReX_Fi AMReX_Interpolater.cpp AMReX_MFInterpolater.cpp AMReX_TagBox.cpp AMReX_AmrMesh.cpp \ AMReX_InterpBase.cpp +CEXE_headers += AMReX_FillPatcher.H + CEXE_headers += AMReX_Interp_C.H AMReX_Interp_$(DIM)D_C.H CEXE_headers += AMReX_MFInterp_C.H AMReX_MFInterp_$(DIM)D_C.H diff --git a/Src/Base/AMReX.H b/Src/Base/AMReX.H index e02280f3e3b..91f8fc43b7c 100644 --- a/Src/Base/AMReX.H +++ b/Src/Base/AMReX.H @@ -271,7 +271,7 @@ namespace amrex private: - static std::vector > m_instance; + static AMREX_EXPORT std::vector > m_instance; Geometry* m_geom = nullptr; }; diff --git a/Src/Base/AMReX.cpp b/Src/Base/AMReX.cpp index f06806babcd..76488bf81e0 100644 --- a/Src/Base/AMReX.cpp +++ b/Src/Base/AMReX.cpp @@ -123,6 +123,11 @@ namespace { #ifdef AMREX_USE_HYPRE namespace { int init_hypre = 1; +#if defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_HIP) + int hypre_spgemm_use_vendor = 0; + int hypre_spmv_use_vendor = 0; + int hypre_sptrans_use_vendor = 0; +#endif } #endif @@ -489,6 +494,11 @@ amrex::Initialize (int& argc, char**& argv, bool build_parm_parse, #ifdef AMREX_USE_HYPRE pp.queryAdd("init_hypre", init_hypre); +#if defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_HIP) + pp.queryAdd("hypre_spgemm_use_vendor", hypre_spgemm_use_vendor); + pp.queryAdd("hypre_spmv_use_vendor", hypre_spmv_use_vendor); + pp.queryAdd("hypre_sptrans_use_vendor", hypre_sptrans_use_vendor); +#endif #endif } @@ -526,7 +536,7 @@ amrex::Initialize (int& argc, char**& argv, bool build_parm_parse, #ifdef AMREX_USE_HYPRE if (init_hypre) { HYPRE_Init(); -#ifdef HYPRE_USING_CUDA +#if defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_HIP) #if defined(HYPRE_RELEASE_NUMBER) && (HYPRE_RELEASE_NUMBER >= 22400) @@ -541,9 +551,13 @@ amrex::Initialize (int& argc, char**& argv, bool build_parm_parse, HYPRE_SetGPUMemoryPoolSize( mempool_bin_growth, mempool_min_bin, mempool_max_bin, mempool_max_cached_bytes ); #endif - /* This API below used to be HYPRE_SetSpGemmUseCusparse(). 
This was changed in commit - Hypre master commit dfdd1cd12f */ - HYPRE_SetSpGemmUseVendor(false); +#if (HYPRE_RELEASE_NUMBER >= 22500) + HYPRE_SetSpGemmUseVendor(hypre_spgemm_use_vendor); + HYPRE_SetSpMVUseVendor(hypre_spmv_use_vendor); + HYPRE_SetSpTransUseVendor(hypre_sptrans_use_vendor); +#elif (HYPRE_USING_CUDA) + HYPRE_SetSpGemmUseCusparse(hypre_spgemm_use_vendor); +#endif HYPRE_SetMemoryLocation(HYPRE_MEMORY_DEVICE); HYPRE_SetExecutionPolicy(HYPRE_EXEC_DEVICE); HYPRE_SetUseGpuRand(true); diff --git a/Src/Base/AMReX_Algorithm.H b/Src/Base/AMReX_Algorithm.H index b5a5f4973c7..65a5f8cb763 100644 --- a/Src/Base/AMReX_Algorithm.H +++ b/Src/Base/AMReX_Algorithm.H @@ -145,7 +145,7 @@ namespace amrex AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE I bisect (T const* d, I lo, I hi, T const& v) { while (lo <= hi) { - int mid = (lo+hi)/2; + int mid = lo + (hi-lo)/2; if (v >= d[mid] && v < d[mid+1]) { return mid; } else if (v < d[mid]) { @@ -157,6 +157,57 @@ namespace amrex return hi; } + template + AMREX_GPU_HOST_DEVICE + ItType upper_bound (ItType first, ItType last, const ValType& val) + { +#if AMREX_DEVICE_COMPILE + std::ptrdiff_t count = last-first; + while(count>0){ + auto it = first; + const auto step = count/2; + it += step; + if (!(val < *it)){ + first = ++it; + count -= step + 1; + } + else{ + count = step; + } + } + + return first; +#else + return std::upper_bound(first, last, val); +#endif + } + + template + AMREX_GPU_HOST_DEVICE + ItType lower_bound (ItType first, ItType last, const ValType& val) + { +#ifdef AMREX_DEVICE_COMPILE + std::ptrdiff_t count = last-first; + while(count>0) + { + auto it = first; + const auto step = count/2; + it += step; + if (*it < val){ + first = ++it; + count -= step + 1; + } + else{ + count = step; + } + } + + return first; +#else + return std::lower_bound(first, last, val); +#endif + } + namespace detail { struct clzll_tag {}; diff --git a/Src/Base/AMReX_Any.H b/Src/Base/AMReX_Any.H index b57aa9a39ef..2c7d9688d36 100644 --- a/Src/Base/AMReX_Any.H +++ b/Src/Base/AMReX_Any.H @@ -48,11 +48,25 @@ public: //! Returns a reference to the contained object. template - MF& get () { return dynamic_cast&>(*m_ptr).m_mf; } + MF& get () { + if (auto p0 = dynamic_cast*>(m_ptr.get())) { + return p0->m_mf; + } else { + return dynamic_cast&>(*m_ptr).m_mf; + } + } //! Returns a const reference to the contained object. 
template - MF const& get () const { return dynamic_cast const&>(*m_ptr).m_mf; } + MF const& get () const { + if (auto p0 = dynamic_cast*>(m_ptr.get())) { + return p0->m_mf; + } else if (auto p1 = dynamic_cast*>(m_ptr.get())) { + return p1->m_mf; + } else { + return dynamic_cast const&>(*m_ptr).m_mf; + } + } template bool is () const { return m_ptr->Type() == typeid(MF); } @@ -60,15 +74,18 @@ public: private: struct innards_base { virtual const std::type_info& Type () const = 0; + virtual ~innards_base () = default; }; template struct innards : innards_base { - innards(MF && mf) + innards (MF && mf) : m_mf(std::forward(mf)) {} + virtual ~innards () = default; + virtual const std::type_info& Type () const override { return typeid(MF); } diff --git a/Src/Base/AMReX_Arena.cpp b/Src/Base/AMReX_Arena.cpp index c14fced3872..f7a46dc25c8 100644 --- a/Src/Base/AMReX_Arena.cpp +++ b/Src/Base/AMReX_Arena.cpp @@ -14,11 +14,11 @@ ///#include //#define AMREX_MLOCK(x,y) VirtualLock(x,y) //#define AMREX_MUNLOCK(x,y) VirtualUnlock(x,y) -#define AMREX_MLOCK(x,y) ((void)0) +//#define AMREX_MLOCK(x,y) ((void)0) #define AMREX_MUNLOCK(x,y) ((void)0) #else #include -#define AMREX_MLOCK(x,y) mlock(x,y) +//#define AMREX_MLOCK(x,y) mlock(x,y) #define AMREX_MUNLOCK(x,y) munlock(x,y) #endif @@ -132,19 +132,21 @@ Arena::allocate_system (std::size_t nbytes) if (arena_info.use_cpu_memory) { p = std::malloc(nbytes); +#ifndef _WIN32 #if defined(__GNUC__) && !defined(__clang__) #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wmaybe-uninitialized" #endif - if (p && arena_info.device_use_hostalloc) AMREX_MLOCK(p, nbytes); + if (p && (nbytes > 0) && arena_info.device_use_hostalloc) mlock(p, nbytes); #if defined(__GNUC__) && !defined(__clang__) #pragma GCC diagnostic pop +#endif #endif } else if (arena_info.device_use_hostalloc) { AMREX_HIP_OR_CUDA_OR_DPCPP( - AMREX_HIP_SAFE_CALL (hipHostMalloc(&p, nbytes, hipHostMallocMapped));, + AMREX_HIP_SAFE_CALL (hipHostMalloc(&p, nbytes, hipHostMallocMapped|hipHostMallocNonCoherent));, AMREX_CUDA_SAFE_CALL(cudaHostAlloc(&p, nbytes, cudaHostAllocMapped));, p = sycl::malloc_host(nbytes, Gpu::Device::syclContext())); } @@ -190,7 +192,16 @@ Arena::allocate_system (std::size_t nbytes) } #else p = std::malloc(nbytes); - if (p && arena_info.device_use_hostalloc) AMREX_MLOCK(p, nbytes); +#ifndef _WIN32 +#if defined(__GNUC__) && !defined(__clang__) +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" +#endif + if (p && (nbytes > 0) && arena_info.device_use_hostalloc) mlock(p, nbytes); +#if defined(__GNUC__) && !defined(__clang__) +#pragma GCC diagnostic pop +#endif +#endif #endif if (p == nullptr) amrex::Abort("Sorry, malloc failed"); return p; @@ -253,12 +264,13 @@ Arena::Initialize () if (initialized) return; initialized = true; - BL_ASSERT(the_arena == nullptr); + // see reason on allowed reuse of the default CPU BArena in Arena::Finalize + BL_ASSERT(the_arena == nullptr || the_arena == The_BArena()); BL_ASSERT(the_async_arena == nullptr); - BL_ASSERT(the_device_arena == nullptr); - BL_ASSERT(the_managed_arena == nullptr); + BL_ASSERT(the_device_arena == nullptr || the_device_arena == The_BArena()); + BL_ASSERT(the_managed_arena == nullptr || the_managed_arena == The_BArena()); BL_ASSERT(the_pinned_arena == nullptr); - BL_ASSERT(the_cpu_arena == nullptr); + BL_ASSERT(the_cpu_arena == nullptr || the_cpu_arena == The_BArena()); #ifdef AMREX_USE_GPU #ifdef AMREX_USE_DPCPP @@ -304,7 +316,7 @@ Arena::Initialize () the_async_arena = new 
PArena(the_async_arena_release_threshold); #ifdef AMREX_USE_GPU - if (the_arena->isDevice() || the_arena->isManaged()) { + if (the_arena->isDevice()) { the_device_arena = the_arena; } else { the_device_arena = new CArena(0, ArenaInfo{}.SetDeviceMemory().SetReleaseThreshold @@ -468,6 +480,13 @@ Arena::Finalize () initialized = false; + // we reset Arenas unless they are the default "CPU malloc/free" BArena + // this is because we want to allow users to free their UB objects + // that they forgot to destruct after amrex::Finalize(): + // amrex::Initialize(...); + // MultiFab mf(...); // this should be scoped in { ... } + // amrex::Finalize(); + // mf cannot be used now, but it can at least be freed without a segfault if (!dynamic_cast(the_device_arena)) { if (the_device_arena != the_arena) { delete the_device_arena; diff --git a/Src/Base/AMReX_Array4.H b/Src/Base/AMReX_Array4.H index 0fc4c049437..296762614d3 100644 --- a/Src/Base/AMReX_Array4.H +++ b/Src/Base/AMReX_Array4.H @@ -11,6 +11,50 @@ namespace amrex { + template + struct CellData // Data in a single cell + { + T* AMREX_RESTRICT p = nullptr; + Long stride = 0; + int ncomp = 0; + + AMREX_GPU_HOST_DEVICE + constexpr CellData (T* a_p, Long a_stride, int a_ncomp) + : p(a_p), stride(a_stride), ncomp(a_ncomp) + {} + + template ::value,int> = 0> + AMREX_GPU_HOST_DEVICE + constexpr CellData (CellData::type> const& rhs) noexcept + : p(rhs.p), stride(rhs.stride), ncomp(rhs.ncomp) + {} + + AMREX_GPU_HOST_DEVICE + explicit operator bool() const noexcept { return p != nullptr; } + + AMREX_GPU_HOST_DEVICE + int nComp() const noexcept { return ncomp; } + + template ::value,int> = 0> + AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE + U& operator[] (int n) const noexcept { +#if defined(AMREX_DEBUG) || defined(AMREX_BOUND_CHECK) + if (n < 0 || n >= ncomp) { +#if AMREX_DEVICE_COMPILE + AMREX_DEVICE_PRINTF(" %d is out of bound (0:%d)", n, ncomp-1); +#else + std::stringstream ss; + ss << " " << n << " is out of bound: (0:" << ncomp-1 << ")"; + amrex::Abort(ss.str()); +#endif + } +#endif + return p[n*stride]; + } + }; + template struct Array4 { @@ -207,6 +251,11 @@ namespace amrex { } } #endif + + AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE + CellData cellData (int i, int j, int k) const noexcept { + return CellData{this->ptr(i,j,k), nstride, ncomp}; + } }; template diff --git a/Src/Base/AMReX_BCRec.H b/Src/Base/AMReX_BCRec.H index 1980c727e81..d76760df9d9 100644 --- a/Src/Base/AMReX_BCRec.H +++ b/Src/Base/AMReX_BCRec.H @@ -74,6 +74,17 @@ public: AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE void setHi (int dir, int bc_val) noexcept { bc[AMREX_SPACEDIM+dir] = bc_val; } /** + * \brief Explicitly set bndry value for given face. + */ + AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE + void set (Orientation face, int bc_val) noexcept { + if (face.isLow()) { + setLo(face.coordDir(), bc_val); + } else { + setHi(face.coordDir(), bc_val); + } + } + /** * \brief Return bndry values (used in calls to FORTRAN). 
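The `BCRec::set` added above folds the low/high-face branch into a single call. A stand-alone model of that dispatch, assuming the usual amrex::Orientation encoding (low faces 0..SPACEDIM-1, high faces SPACEDIM..2*SPACEDIM-1); `BCRecSketch` is an illustrative stand-in, not the real class:

    #include <cassert>

    struct BCRecSketch {
        static constexpr int SPACEDIM = 3;
        int bc[2*SPACEDIM] = {};
        void setLo (int dir, int v) { bc[dir] = v; }
        void setHi (int dir, int v) { bc[SPACEDIM + dir] = v; }
        // face plays the role of Orientation: isLow() <=> face < SPACEDIM,
        // coordDir() <=> face % SPACEDIM
        void set (int face, int v) {
            const int dir = face % SPACEDIM;
            if (face < SPACEDIM) { setLo(dir, v); } else { setHi(dir, v); }
        }
    };

    int main ()
    {
        BCRecSketch b;
        b.set(1, 3);   // low face of direction 1
        b.set(5, 4);   // high face of direction 2
        assert(b.bc[1] == 3 && b.bc[5] == 4);
    }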
*/ AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE diff --git a/Src/Base/AMReX_BC_TYPES.H b/Src/Base/AMReX_BC_TYPES.H index ea24a64addf..b735da6fddb 100644 --- a/Src/Base/AMReX_BC_TYPES.H +++ b/Src/Base/AMReX_BC_TYPES.H @@ -73,7 +73,10 @@ enum mathematicalBndryTypes : int { foextrap = 2, ext_dir = 3, hoextrap = 4, - hoextrapcc = 5 + hoextrapcc = 5, + user_1 = 1001, + user_2 = 1002, + user_3 = 1003 }; } @@ -102,4 +105,3 @@ enum mathematicalBndryTypes : int { #endif #endif - diff --git a/Src/Base/AMReX_BLBackTrace.cpp b/Src/Base/AMReX_BLBackTrace.cpp index 477e0b6bac2..0c304d30011 100644 --- a/Src/Base/AMReX_BLBackTrace.cpp +++ b/Src/Base/AMReX_BLBackTrace.cpp @@ -5,6 +5,9 @@ #include #include #include +#ifdef AMREX_USE_MPI +#include +#endif #ifdef AMREX_TINY_PROFILING #include @@ -71,7 +74,15 @@ BLBackTrace::handler(int s) std::string errfilename; { std::ostringstream ss; - ss << "Backtrace." << ParallelDescriptor::MyProc(); +#ifdef AMREX_USE_MPI + if (MPMD::Initialized()) { + ss << "Backtrace.prog" << MPMD::MyProgId() << "."; + } else +#endif + { + ss << "Backtrace."; + } + ss << ParallelDescriptor::MyProc(); #ifdef AMREX_USE_OMP ss << "." << omp_get_thread_num(); #endif diff --git a/Src/Base/AMReX_BaseFab.H b/Src/Base/AMReX_BaseFab.H index 3a9f5eea018..f0e50ecac48 100644 --- a/Src/Base/AMReX_BaseFab.H +++ b/Src/Base/AMReX_BaseFab.H @@ -260,7 +260,7 @@ public: */ void clear () noexcept; - // Release ownership of memory + //! Release ownership of memory std::unique_ptr release () noexcept; //! Returns how many bytes used @@ -350,10 +350,22 @@ public: * order, with the component index coming last. In other words, * dataPtr returns a pointer to all the Nth components. */ - T* dataPtr (int n = 0) noexcept { AMREX_ASSERT(!(this->dptr == 0)); return &(this->dptr[n*this->domain.numPts()]); } + T* dataPtr (int n = 0) noexcept { + if (this->dptr) { + return &(this->dptr[n*this->domain.numPts()]); + } else { + return nullptr; + } + } //! Same as above except works on const FABs. 
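Stepping back to the AMReX_Array4.H hunk above: `CellData` is a view of the `ncomp` component values of a single cell, which sit `stride` elements apart in the underlying structure-of-arrays layout. A simplified stand-in (no bounds checking, fixed `double` type; `CellDataSketch` is illustrative, not the real class):

    #include <cassert>

    struct CellDataSketch {
        double* p; long stride; int ncomp;
        double& operator[] (int n) const { return p[n*stride]; }
    };

    int main ()
    {
        // 4 cells x 3 components; component n of cell i lives at data[n*4 + i],
        // so the distance between one cell's components is the stride 4.
        double data[12] = {};
        CellDataSketch c{&data[1], 4, 3};  // all components of cell i = 1
        c[2] = 7.0;                        // write component 2 of that cell
        assert(data[2*4 + 1] == 7.0);
    }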
- const T* dataPtr (int n = 0) const noexcept { AMREX_ASSERT(!(this->dptr == 0)); return &(this->dptr[n*this->domain.numPts()]); } + const T* dataPtr (int n = 0) const noexcept { + if (this->dptr) { + return &(this->dptr[n*this->domain.numPts()]); + } else { + return nullptr; + } + } T* dataPtr (const IntVect& iv, int n = 0) noexcept; @@ -1882,9 +1894,9 @@ BaseFab::define () { AMREX_ASSERT(this->dptr == 0); AMREX_ASSERT(this->domain.numPts() > 0); - AMREX_ASSERT(std::numeric_limits::max()/this->nvar > this->domain.numPts()); AMREX_ASSERT(this->nvar >= 0); if (this->nvar == 0) return; + AMREX_ASSERT(std::numeric_limits::max()/this->nvar > this->domain.numPts()); this->truesize = this->nvar*this->domain.numPts(); this->ptr_owner = true; diff --git a/Src/Base/AMReX_Box.cpp b/Src/Base/AMReX_Box.cpp index f93818e784d..e61942c2a48 100644 --- a/Src/Base/AMReX_Box.cpp +++ b/Src/Base/AMReX_Box.cpp @@ -126,7 +126,7 @@ AllGatherBoxes (Vector& bxs, int n_extra_reserve) if (count_tot == 0) return; if (count_tot > static_cast(std::numeric_limits::max())) { - amrex::Abort("AllGatherBoxes: not many boxes"); + amrex::Abort("AllGatherBoxes: too many boxes"); } Vector recv_buffer; @@ -161,7 +161,7 @@ AllGatherBoxes (Vector& bxs, int n_extra_reserve) if (count_tot == 0) return; if (count_tot > static_cast(std::numeric_limits::max())) { - amrex::Abort("AllGatherBoxes: not many boxes"); + amrex::Abort("AllGatherBoxes: too many boxes"); } Vector recv_buffer; diff --git a/Src/Base/AMReX_BoxList.H b/Src/Base/AMReX_BoxList.H index 04e93eab97e..1dc8f15c536 100644 --- a/Src/Base/AMReX_BoxList.H +++ b/Src/Base/AMReX_BoxList.H @@ -206,9 +206,9 @@ public: BoxList& convert (IndexType typ) noexcept; //! Returns a reference to the Vector. - Vector& data() noexcept { return m_lbox; } + Vector& data () noexcept { return m_lbox; } //! Returns a constant reference to the Vector. - const Vector& data() const noexcept { return m_lbox; } + const Vector& data () const noexcept { return m_lbox; } void swap (BoxList& rhs) { std::swap(m_lbox, rhs.m_lbox); diff --git a/Src/Base/AMReX_CTOParallelForImpl.H b/Src/Base/AMReX_CTOParallelForImpl.H new file mode 100644 index 00000000000..e79122de24d --- /dev/null +++ b/Src/Base/AMReX_CTOParallelForImpl.H @@ -0,0 +1,331 @@ +#ifndef AMREX_CTO_PARALLEL_FOR_H_ +#define AMREX_CTO_PARALLEL_FOR_H_ + +#include +#include +#include + +#include +#include + +/* This header is not for the users to include directly. It's meant to be + * included in AMReX_GpuLaunch.H, which has included the headers needed + * here. */ + +/* Thank Maikel Nadolski and Alex Sinn for the techniques used here! */ + +namespace amrex { + +template +struct CompileTimeOptions { + // TypeList is defined in AMReX_Tuple.H + using list_type = TypeList...>; +}; + +#if (__cplusplus >= 201703L) + +//namespace meta +//{ + template + constexpr auto operator+ (TypeList, TypeList) { + return TypeList{}; + } + + template + constexpr auto single_product (TypeList, A) { + return TypeList{})...>{}; + } + + template + constexpr auto operator* (LLs, TypeList) { + return (TypeList<>{} + ... + single_product(LLs{}, As{})); + } + + template + constexpr auto cartesian_product_n (TypeList) { + return (TypeList>{} * ... 
* Ls{}); + } +//} + +namespace detail +{ + template + std::enable_if_t::value || std::is_same::value, bool> + ParallelFor_helper2 (T const& N, F&& f, TypeList, + std::array const& runtime_options) + { + if (runtime_options == std::array{As::value...}) { + if constexpr (std::is_integral::value) { + ParallelFor(N, [f] AMREX_GPU_DEVICE (T i) noexcept + { + f(i, As{}...); + }); + } else { + ParallelFor(N, [f] AMREX_GPU_DEVICE (int i, int j, int k) noexcept + { + f(i, j, k, As{}...); + }); + } + return true; + } else { + return false; + } + } + + template + std::enable_if_t::value, bool> + ParallelFor_helper2 (Box const& box, T ncomp, F&& f, TypeList, + std::array const& runtime_options) + { + if (runtime_options == std::array{As::value...}) { + ParallelFor(box, ncomp, [f] AMREX_GPU_DEVICE (int i, int j, int k, T n) noexcept + { + f(i, j, k, n, As{}...); + }); + return true; + } else { + return false; + } + } + + template + std::enable_if_t::value || std::is_same::value> + ParallelFor_helper1 (T const& N, F&& f, TypeList, + RO const& runtime_options) + { + bool found_option = (false || ... || + ParallelFor_helper2(N, std::forward(f), + PPs{}, runtime_options)); + amrex::ignore_unused(found_option); + AMREX_ASSERT(found_option); + } + + template + std::enable_if_t::value> + ParallelFor_helper1 (Box const& box, T ncomp, F&& f, TypeList, + RO const& runtime_options) + { + bool found_option = (false || ... || + ParallelFor_helper2(box, ncomp, std::forward(f), + PPs{}, runtime_options)); + amrex::ignore_unused(found_option); + AMREX_ASSERT(found_option); + } +} + +#endif + +template +std::enable_if_t::value> +ParallelFor (TypeList /*list_of_compile_time_options*/, + std::array const& runtime_options, + T N, F&& f) +{ +#if (__cplusplus >= 201703L) + using OptionsListList = TypeList; + detail::ParallelFor_helper1(N, std::forward(f), + cartesian_product_n(OptionsListList{}), + runtime_options); +#else + amrex::ignore_unused(N, f, runtime_options); + static_assert(std::is_integral::value, "This requires C++17"); +#endif +} + +template +void ParallelFor (TypeList /*list_of_compile_time_options*/, + std::array const& runtime_options, + Box const& box, F&& f) +{ +#if (__cplusplus >= 201703L) + using OptionsListList = TypeList; + detail::ParallelFor_helper1(box, std::forward(f), + cartesian_product_n(OptionsListList{}), + runtime_options); +#else + amrex::ignore_unused(box, f, runtime_options); + static_assert(std::is_integral::value, "This requires C++17"); +#endif +} + +template +std::enable_if_t::value> +ParallelFor (TypeList /*list_of_compile_time_options*/, + std::array const& runtime_options, + Box const& box, T ncomp, F&& f) +{ +#if (__cplusplus >= 201703L) + using OptionsListList = TypeList; + detail::ParallelFor_helper1(box, ncomp, std::forward(f), + cartesian_product_n(OptionsListList{}), + runtime_options); +#else + amrex::ignore_unused(box, ncomp, f, runtime_options); + static_assert(std::is_integral::value, "This requires C++17"); +#endif +} + +/** + * \brief ParallelFor with compile time optimization of kernels with run time options. + * + * It uses fold expression to generate kernel launches for all combinations + * of the run time options. The kernel function can use constexpr if to + * discard unused code blocks for better run time performance. In the + * example below, the code will be expanded into 4*2=8 normal ParallelFors + * for all combinations of the run time parameters. 
+ \verbatim
+     int A_runtime_option = ...;
+     int B_runtime_option = ...;
+     enum A_options : int { A0, A1, A2, A3};
+     enum B_options : int { B0, B1 };
+     ParallelFor(TypeList,
+                 CompileTimeOptions>{},
+                 {A_runtime_option, B_runtime_option},
+                 N, [=] AMREX_GPU_DEVICE (int i, auto A_control, auto B_control)
+     {
+         ...
+         if constexpr (A_control.value == A0) {
+             ...
+         } else if constexpr (A_control.value == A1) {
+             ...
+         } else if constexpr (A_control.value == A2) {
+             ...
+         } else {
+             ...
+         }
+         if constexpr (A_control.value != A3 && B_control.value == B1) {
+             ...
+         }
+         ...
+     });
+ \endverbatim
+ * Note that due to a limitation of CUDA's extended device lambda, the
+ * constexpr if block cannot be the one that captures a variable first.
+ * If nvcc complains about it, you will have to manually capture it outside
+ * constexpr if. The data type for the parameters is int.
+ *
+ * \param ctos list of all possible values of the parameters.
+ * \param option the run time parameters.
+ * \param N an integer specifying the 1D for loop's range.
+ * \param f a callable object taking an integer and working on that iteration.
+ */
+template
+std::enable_if_t::value>
+ParallelFor (TypeList ctos,
+             std::array const& option,
+             T N, F&& f)
+{
+    ParallelFor(ctos, option, N, std::forward(f));
+}
+
+/**
+ * \brief ParallelFor with compile time optimization of kernels with run time options.
+ *
+ * It uses a fold expression to generate kernel launches for all combinations
+ * of the run time options. The kernel function can use constexpr if to
+ * discard unused code blocks for better run time performance. In the
+ * example below, the code will be expanded into 4*2=8 normal ParallelFors
+ * for all combinations of the run time parameters.
+ \verbatim
+     int A_runtime_option = ...;
+     int B_runtime_option = ...;
+     enum A_options : int { A0, A1, A2, A3};
+     enum B_options : int { B0, B1 };
+     ParallelFor(TypeList,
+                 CompileTimeOptions>{},
+                 {A_runtime_option, B_runtime_option},
+                 box, [=] AMREX_GPU_DEVICE (int i, int j, int k,
+                                            auto A_control, auto B_control)
+     {
+         ...
+         if constexpr (A_control.value == A0) {
+             ...
+         } else if constexpr (A_control.value == A1) {
+             ...
+         } else if constexpr (A_control.value == A2) {
+             ...
+         } else {
+             ...
+         }
+         if constexpr (A_control.value != A3 && B_control.value == B1) {
+             ...
+         }
+         ...
+     });
+ \endverbatim
+ * Note that due to a limitation of CUDA's extended device lambda, the
+ * constexpr if block cannot be the one that captures a variable first.
+ * If nvcc complains about it, you will have to manually capture it outside
+ * constexpr if. The data type for the parameters is int.
+ *
+ * \param ctos list of all possible values of the parameters.
+ * \param option the run time parameters.
+ * \param box a Box specifying the 3D for loop's range.
+ * \param f a callable object taking three integers and working on the given cell.
+ */
+template
+void ParallelFor (TypeList ctos,
+                  std::array const& option,
+                  Box const& box, F&& f)
+{
+    ParallelFor(ctos, option, box, std::forward(f));
+}
+
+/**
+ * \brief ParallelFor with compile time optimization of kernels with run time options.
+ *
+ * It uses a fold expression to generate kernel launches for all combinations
+ * of the run time options. The kernel function can use constexpr if to
+ * discard unused code blocks for better run time performance. In the
+ * example below, the code will be expanded into 4*2=8 normal ParallelFors
+ * for all combinations of the run time parameters.
+ \verbatim
+     int A_runtime_option = ...;
+     int B_runtime_option = ...;
+     enum A_options : int { A0, A1, A2, A3};
+     enum B_options : int { B0, B1 };
+     ParallelFor(TypeList,
+                 CompileTimeOptions>{},
+                 {A_runtime_option, B_runtime_option},
+                 box, ncomp, [=] AMREX_GPU_DEVICE (int i, int j, int k, int n,
+                                                   auto A_control, auto B_control)
+     {
+         ...
+         if constexpr (A_control.value == A0) {
+             ...
+         } else if constexpr (A_control.value == A1) {
+             ...
+         } else if constexpr (A_control.value == A2) {
+             ...
+         } else {
+             ...
+         }
+         if constexpr (A_control.value != A3 && B_control.value == B1) {
+             ...
+         }
+         ...
+     });
+ \endverbatim
+ * Note that due to a limitation of CUDA's extended device lambda, the
+ * constexpr if block cannot be the one that captures a variable first.
+ * If nvcc complains about it, you will have to manually capture it outside
+ * constexpr if. The data type for the parameters is int.
+ *
+ * \param ctos list of all possible values of the parameters.
+ * \param option the run time parameters.
+ * \param box a Box specifying the iteration in 3D space.
+ * \param ncomp an integer specifying the range for iteration over components.
+ * \param f a callable object taking four integers and working on the given cell and component.
+ */
+template
+std::enable_if_t::value>
+ParallelFor (TypeList ctos,
+             std::array const& option,
+             Box const& box, T ncomp, F&& f)
+{
+    ParallelFor(ctos, option, box, ncomp, std::forward(f));
+}
+
+}
+
+#endif
diff --git a/Src/Base/AMReX_DistributionMapping.cpp b/Src/Base/AMReX_DistributionMapping.cpp
index a61d5b2f591..6b4c0c8925c 100644
--- a/Src/Base/AMReX_DistributionMapping.cpp
+++ b/Src/Base/AMReX_DistributionMapping.cpp
@@ -1300,7 +1300,7 @@ DistributionMapping::SFCProcessorMap (const BoxArray& boxes,
     for (int i = 0, N = boxes.size(); i < N; ++i)
     {
-        wgts.push_back(boxes[i].volume());
+        wgts.push_back(boxes[i].numPts());
     }
     SFCProcessorMapDoIt(boxes,wgts,nprocs);
@@ -1769,7 +1769,7 @@ DistributionMapping::makeSFC (const BoxArray& ba, bool use_box_vol, const int np
     {
         const Box& bx = ba[i];
         tokens.push_back(makeSFCToken(i, bx.smallEnd()));
-        const Long v = use_box_vol ? bx.volume() : Long(1);
+        const Long v = use_box_vol ?
bx.numPts() : Long(1); vol_sum += v; wgts.push_back(v); } diff --git a/Src/Base/AMReX_Extension.H b/Src/Base/AMReX_Extension.H index a084777f1a0..753b43995f3 100644 --- a/Src/Base/AMReX_Extension.H +++ b/Src/Base/AMReX_Extension.H @@ -57,7 +57,7 @@ #elif defined(__INTEL_COMPILER) #define AMREX_PRAGMA_SIMD _Pragma("ivdep") -#elif defined(_CRAYC) +#elif defined(_CRAYC) || defined(__cray__) #define AMREX_PRAGMA_SIMD _Pragma("ivdep") #elif defined(__PGI) @@ -73,7 +73,7 @@ #define AMREX_PRAGMA_SIMD _Pragma("ibm independent_loop") #elif defined(__clang__) -#define AMREX_PRAGMA_SIMD _Pragma("clang loop vectorize(enable)") +#define AMREX_PRAGMA_SIMD #elif defined(__GNUC__) #define AMREX_PRAGMA_SIMD _Pragma("GCC ivdep") diff --git a/Src/Base/AMReX_FArrayBox.H b/Src/Base/AMReX_FArrayBox.H index 3d3cda3674b..b678986c0e9 100644 --- a/Src/Base/AMReX_FArrayBox.H +++ b/Src/Base/AMReX_FArrayBox.H @@ -272,7 +272,7 @@ public: virtual ~FArrayBox () noexcept override {} FArrayBox (FArrayBox&& rhs) noexcept = default; - FArrayBox& operator= (FArrayBox&&) = default; + FArrayBox& operator= (FArrayBox&&) noexcept = default; FArrayBox (const FArrayBox&) = delete; FArrayBox& operator= (const FArrayBox&) = delete; diff --git a/Src/Base/AMReX_FBI.H b/Src/Base/AMReX_FBI.H index 61ef452b601..cc0bfeecbce 100644 --- a/Src/Base/AMReX_FBI.H +++ b/Src/Base/AMReX_FBI.H @@ -924,7 +924,7 @@ FabArray::pack_send_buffer_cpu (FabArray const& src, int scomp, int nc amrex::LoopConcurrentOnCpu( bx, ncomp, [=] (int ii, int jj, int kk, int n) noexcept { - pfab(ii,jj,kk,n) = sfab(ii,jj,kk,n+scomp); + pfab(ii,jj,kk,n) = static_cast(sfab(ii,jj,kk,n+scomp)); }); dptr += (bx.numPts() * ncomp * sizeof(BUF)); } diff --git a/Src/Base/AMReX_FabArray.H b/Src/Base/AMReX_FabArray.H index 6eef7caa579..736c39567ae 100644 --- a/Src/Base/AMReX_FabArray.H +++ b/Src/Base/AMReX_FabArray.H @@ -438,6 +438,15 @@ public: */ bool ok () const; + /** Has define() been called on this rank? + * + * \return true if `define` has been called on this `FabArray`. Note that all constructors except `FabArray ()` + * and `FabArray(Arena*a)` call `define`, even if the `MFInfo` argument has `alloc=false`. One could + * also use `FabArrayBase::empty()` to find whether `define` is called or not, although they are not exactly + * the same. + */ + bool isDefined () const; + //! Return a constant reference to the FAB associated with mfi. const FAB& operator[] (const MFIter& mfi) const noexcept { return *(this->fabPtr(mfi)); } @@ -1128,6 +1137,7 @@ protected: std::unique_ptr > m_factory; DataAllocator m_dallocator; + //! has define() been called? bool define_function_called = false; // @@ -1768,6 +1778,13 @@ FabArray::ok () const return isok == 1; } +template +bool +FabArray::isDefined () const +{ + return define_function_called; +} + template void FabArray::define (const BoxArray& bxs, @@ -2848,7 +2865,7 @@ FabArray::SumBoundary_nowait (int scomp, int ncomp, IntVect const& src_ngho FabArray* tmp = new FabArray( boxArray(), DistributionMap(), ncomp, src_nghost, MFInfo(), Factory() ); amrex::Copy(*tmp, *this, scomp, 0, ncomp, src_nghost); - this->setVal(0.0, scomp, ncomp, dst_nghost); + this->setVal(typename FAB::value_type(0), scomp, ncomp, dst_nghost); this->ParallelCopy_nowait(*tmp,0,scomp,ncomp,src_nghost,dst_nghost,period,FabArrayBase::ADD); // All local. Operation complete. 
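The machinery added in AMReX_CTOParallelForImpl.H above boils down to this: compare the runtime option values against every compile-time combination in a fold expression, and invoke the kernel with `std::integral_constant` arguments so that `if constexpr` can prune dead branches in each instantiation. A stripped-down, host-only model of that dispatch (C++17; `dispatch` is an illustrative name, not the AMReX API):

    #include <cassert>
    #include <utility>

    template <class F, int... As>
    void dispatch (int opt, F&& f, std::integer_sequence<int, As...>)
    {
        // one term per candidate; the matching one runs f with a
        // compile-time constant and short-circuits the rest
        bool found = (false || ... || (opt == As
            ? (f(std::integral_constant<int, As>{}), true) : false));
        assert(found); // mirrors AMREX_ASSERT(found_option) in the patch
    }

    int main ()
    {
        int result = 0;
        dispatch(2, [&] (auto A_control) {
            if constexpr (A_control.value == 2) { result = 20; } // kept branch
            else                                { result = -1; } // pruned here
        }, std::integer_sequence<int, 0, 1, 2, 3>{});
        assert(result == 20);
    }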
diff --git a/Src/Base/AMReX_FabArrayCommI.H b/Src/Base/AMReX_FabArrayCommI.H index c894fe0b2c7..3d3fe1743a2 100644 --- a/Src/Base/AMReX_FabArrayCommI.H +++ b/Src/Base/AMReX_FabArrayCommI.H @@ -10,7 +10,7 @@ FabArray::FBEP_nowait (int scomp, int ncomp, const IntVect& nghost, bool enforce_periodicity_only, bool override_sync) { - BL_PROFILE_SYNC_START_TIMED("SyncBeforeComms"); + BL_PROFILE_SYNC_START_TIMED("SyncBeforeComms: FB"); BL_PROFILE("FillBoundary_nowait()"); AMREX_ASSERT_WITH_MESSAGE(!fbd, "FillBoundary_nowait() called when comm operation already in progress."); @@ -316,7 +316,7 @@ FabArray::ParallelCopy_nowait (const FabArray& src, const FabArrayBase::CPC * a_cpc, bool to_ghost_cells_only) { - BL_PROFILE_SYNC_START_TIMED("SyncBeforeComms"); + BL_PROFILE_SYNC_START_TIMED("SyncBeforeComms: PC"); BL_PROFILE("FabArray::ParallelCopy_nowait()"); AMREX_ASSERT_WITH_MESSAGE(!pcd, "ParallelCopy_nowait() called when comm operation already in progress."); diff --git a/Src/Base/AMReX_Geometry.H b/Src/Base/AMReX_Geometry.H index 54a8b8630d3..890ec2e0f7e 100644 --- a/Src/Base/AMReX_Geometry.H +++ b/Src/Base/AMReX_Geometry.H @@ -67,6 +67,56 @@ public: int coord; }; + namespace detail { + template + T bisect_prob_lo (amrex::Real plo, amrex::Real /*phi*/, amrex::Real dxinv, int ilo, int ihi, amrex::Real tol) { + T lo = static_cast(plo + tol); + bool safe; + { + int i = int(Math::floor((lo - plo)*dxinv)) + ilo; + safe = i >= ilo && i <= ihi; + } + if (safe) { + return lo; + } else { + // bisect the point at which the cell no longer maps to inside the domain + T hi = static_cast(plo + 0.5_rt/dxinv); + T mid = bisect(lo, hi, + [=] AMREX_GPU_HOST_DEVICE (T x) -> T + { + int i = int(Math::floor((x - plo)*dxinv)) + ilo; + bool inside = i >= ilo && i <= ihi; + return static_cast(inside) - T(0.5); + }, static_cast(tol)); + return mid - static_cast(tol); + } + } + + template + T bisect_prob_hi (amrex::Real plo, amrex::Real phi, amrex::Real dxinv, int ilo, int ihi, amrex::Real tol) { + T hi = static_cast(phi - tol); + bool safe; + { + int i = int(Math::floor((hi - plo)*dxinv)) + ilo; + safe = i >= ilo && i <= ihi; + } + if (safe) { + return hi; + } else { + // bisect the point at which the cell no longer maps to inside the domain + T lo = static_cast(phi - 0.5_rt/dxinv); + T mid = bisect(lo, hi, + [=] AMREX_GPU_HOST_DEVICE (T x) -> T + { + int i = int(Math::floor((x - plo)*dxinv)) + ilo; + bool inside = i >= ilo && i <= ihi; + return static_cast(inside) - T(0.5); + }, static_cast(tol)); + return mid - static_cast(tol); + } + } + } + class Geometry : public CoordSys @@ -168,8 +218,6 @@ public: //! Returns the problem domain. const RealBox& ProbDomain () const noexcept { return prob_domain; } - //! Returns the roundoff domain. - const RealBox& RoundoffDomain () const noexcept { return roundoff_domain; } //! Sets the problem domain. 
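The `detail::bisect_prob_lo`/`bisect_prob_hi` helpers introduced in the AMReX_Geometry.H hunk above search for the last floating point coordinate that still floors into a valid cell index, so any position passing a plain inequality test against the stored bound is guaranteed to map into the domain. A scalar model of that bisection (all constants are illustrative):

    #include <cassert>
    #include <cmath>

    int main ()
    {
        const double plo = 0.0, dxinv = 10.0;  // 10 cells of width 0.1 on [0,1]
        const int ilo = 0, ihi = 9;
        // bracket the upper domain edge and bisect for the last inside point
        double lo = 0.95, hi = 1.05;
        while (hi - lo > 1e-12) {
            const double mid = 0.5*(lo + hi);
            const int i = int(std::floor((mid - plo)*dxinv)) + ilo;
            if (i >= ilo && i <= ihi) { lo = mid; } else { hi = mid; }
        }
        const double roundoff_hi = lo;  // plays the role of roundoff_hi_d
        const int i = int(std::floor((roundoff_hi - plo)*dxinv)) + ilo;
        assert(i >= ilo && i <= ihi);   // still maps to a valid cell
    }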
void ProbDomain (const RealBox& rb) noexcept { @@ -193,12 +241,19 @@ public: return {{AMREX_D_DECL(prob_domain.hi(0),prob_domain.hi(1),prob_domain.hi(2))}}; } - GpuArray RoundoffLoArray () const noexcept { - return {{AMREX_D_DECL(roundoff_domain.lo(0),roundoff_domain.lo(1),roundoff_domain.lo(2))}}; + GpuArray ProbLoArrayInParticleReal () const noexcept { +#ifdef AMREX_SINGLE_PRECISION_PARTICLES + return roundoff_lo_f; +#else + return roundoff_lo_d; +#endif } - - GpuArray RoundoffHiArray () const noexcept { - return {{AMREX_D_DECL(roundoff_domain.hi(0),roundoff_domain.hi(1),roundoff_domain.hi(2))}}; + GpuArray ProbHiArrayInParticleReal () const noexcept { +#ifdef AMREX_SINGLE_PRECISION_PARTICLES + return roundoff_hi_f; +#else + return roundoff_hi_d; +#endif } //! Returns the overall size of the domain by multiplying the ProbLength's together @@ -365,9 +420,13 @@ public: const Box& src, Vector& out) const noexcept; + //! Return domain box with non-periodic directions grown by ngrow. + Box growNonPeriodicDomain (IntVect const& ngrow) const noexcept; //! Return domain box with non-periodic directions grown by ngrow. Box growNonPeriodicDomain (int ngrow) const noexcept; //! Return domain box with periodic directions grown by ngrow. + Box growPeriodicDomain (IntVect const& ngrow) const noexcept; + //! Return domain box with periodic directions grown by ngrow. Box growPeriodicDomain (int ngrow) const noexcept; //! Set periodicity flags and return the old flags. @@ -406,7 +465,7 @@ public: * are sure to be mapped to cells inside the Domain() box. Note that * the same need not be true for all points inside ProbDomain(). */ - bool outsideRoundoffDomain (AMREX_D_DECL(Real x, Real y, Real z)) const; + bool outsideRoundoffDomain (AMREX_D_DECL(ParticleReal x, ParticleReal y, ParticleReal z)) const; /** * \brief Returns true if a point is inside the roundoff domain. @@ -414,7 +473,7 @@ public: * are sure to be mapped to cells inside the Domain() box. Note that * the same need not be true for all points inside ProbDomain(). */ - bool insideRoundoffDomain (AMREX_D_DECL(Real x, Real y, Real z)) const; + bool insideRoundoffDomain (AMREX_D_DECL(ParticleReal x, ParticleReal y, ParticleReal z)) const; /** * \brief Compute the roundoff domain. Public because it contains an @@ -430,10 +489,11 @@ private: RealBox prob_domain; // Due to round-off errors, not all floating point numbers for which plo >= x < phi - // will map to a cell that is inside "domain". "roundoff_domain" stores a phi - // that is very close to that in prob_domain, and for which all floating point numbers - // inside it according to a naive inequality check will map to a cell inside domain. - RealBox roundoff_domain; + // will map to a cell that is inside "domain". "roundoff_{lo,hi}_{f,d}" each store + // a position that is very close to that in prob_domain, and for which all doubles and floats less than + // it will map to a cell inside domain. 
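The new `ProbLoArrayInParticleReal`/`ProbHiArrayInParticleReal` accessors above exist because a bound computed in `double` can be unsafe when compared against a `float` particle position: rounding the bound to `float` may land it outside the valid range. Keeping separate `float` and `double` bounds and handing out the one that matches `ParticleReal` avoids that. A sketch of the pattern, with `SINGLE_PRECISION_PARTICLES` standing in for `AMREX_SINGLE_PRECISION_PARTICLES` and `GeomSketch` an illustrative stand-in:

    #include <array>

    #ifdef SINGLE_PRECISION_PARTICLES
    using ParticleReal = float;
    #else
    using ParticleReal = double;
    #endif

    struct GeomSketch {
        std::array<double,3> roundoff_lo_d{}, roundoff_hi_d{};
        std::array<float,3>  roundoff_lo_f{}, roundoff_hi_f{};

        // hand out the bounds matching the particle precision, so callers
        // never compare a double bound against a float position
        std::array<ParticleReal,3> ProbLoArrayInParticleReal () const {
    #ifdef SINGLE_PRECISION_PARTICLES
            return {roundoff_lo_f[0], roundoff_lo_f[1], roundoff_lo_f[2]};
    #else
            return {roundoff_lo_d[0], roundoff_lo_d[1], roundoff_lo_d[2]};
    #endif
        }
    };

    int main ()
    {
        GeomSketch g{};
        return int(g.ProbLoArrayInParticleReal()[0]); // 0
    }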
+ GpuArray roundoff_lo_d, roundoff_hi_d; + GpuArray roundoff_lo_f, roundoff_hi_f; // Box domain; diff --git a/Src/Base/AMReX_Geometry.cpp b/Src/Base/AMReX_Geometry.cpp index 395f17e352b..235c7bb7674 100644 --- a/Src/Base/AMReX_Geometry.cpp +++ b/Src/Base/AMReX_Geometry.cpp @@ -473,29 +473,41 @@ Geometry::periodicShift (const Box& target, } Box -Geometry::growNonPeriodicDomain (int ngrow) const noexcept +Geometry::growNonPeriodicDomain (IntVect const& ngrow) const noexcept { Box b = Domain(); for (int idim = 0; idim < AMREX_SPACEDIM; ++idim) { if (!isPeriodic(idim)) { - b.grow(idim,ngrow); + b.grow(idim,ngrow[idim]); } } return b; } Box -Geometry::growPeriodicDomain (int ngrow) const noexcept +Geometry::growPeriodicDomain (IntVect const& ngrow) const noexcept { Box b = Domain(); for (int idim = 0; idim < AMREX_SPACEDIM; ++idim) { if (isPeriodic(idim)) { - b.grow(idim,ngrow); + b.grow(idim,ngrow[idim]); } } return b; } +Box +Geometry::growNonPeriodicDomain (int ngrow) const noexcept +{ + return growNonPeriodicDomain(IntVect(ngrow)); +} + +Box +Geometry::growPeriodicDomain (int ngrow) const noexcept +{ + return growPeriodicDomain(IntVect(ngrow)); +} + void Geometry::computeRoundoffDomain () { @@ -506,50 +518,48 @@ Geometry::computeRoundoffDomain () inv_dx[k] = 1.0_rt/dx[k]; } - roundoff_domain = prob_domain; for (int idim = 0; idim < AMREX_SPACEDIM; ++idim) { int ilo = Domain().smallEnd(idim); int ihi = Domain().bigEnd(idim); Real plo = ProbLo(idim); Real phi = ProbHi(idim); - Real idx = InvCellSize(idim); + Real dxinv = InvCellSize(idim); Real deltax = CellSize(idim); -#ifdef AMREX_SINGLE_PRECISION_PARTICLES - Real tolerance = std::max(1.e-4_rt*deltax, 2.e-7_rt*phi); -#else - Real tolerance = std::max(1.e-8_rt*deltax, 1.e-14_rt*phi); -#endif - // bisect the point at which the cell no longer maps to inside the domain - Real lo = static_cast(phi) - Real(0.5)*static_cast(deltax); - Real hi = static_cast(phi) + Real(0.5)*static_cast(deltax); - - Real mid = bisect(lo, hi, - [=] AMREX_GPU_HOST_DEVICE (Real x) -> Real - { - int i = int(Math::floor((x - plo)*idx)) + ilo; - bool inside = i >= ilo && i <= ihi; - return static_cast(inside) - Real(0.5); - }, tolerance); - roundoff_domain.setHi(idim, mid - tolerance); + Real ftol = std::max(1.e-4_rt*deltax, 2.e-7_rt*phi); + Real dtol = std::max(1.e-8_rt*deltax, 1.e-14_rt*phi); + + roundoff_lo_f[idim] = detail::bisect_prob_lo (plo, phi, dxinv, ilo, ihi, ftol); + roundoff_lo_d[idim] = detail::bisect_prob_lo(plo, phi, dxinv, ilo, ihi, dtol); + roundoff_hi_f[idim] = detail::bisect_prob_hi (plo, phi, dxinv, ilo, ihi, ftol); + roundoff_hi_d[idim] = detail::bisect_prob_hi(plo, phi, dxinv, ilo, ihi, dtol); } } bool -Geometry::outsideRoundoffDomain (AMREX_D_DECL(Real x, Real y, Real z)) const +Geometry::outsideRoundoffDomain (AMREX_D_DECL(ParticleReal x, ParticleReal y, ParticleReal z)) const { - bool outside = AMREX_D_TERM(x < roundoff_domain.lo(0) - || x >= roundoff_domain.hi(0), - || y < roundoff_domain.lo(1) - || y >= roundoff_domain.hi(1), - || z < roundoff_domain.lo(2) - || z >= roundoff_domain.hi(2)); +#ifdef AMREX_SINGLE_PRECISION_PARTICLES + bool outside = AMREX_D_TERM(x < roundoff_lo_f[0] + || x >= roundoff_hi_f[0], + || y < roundoff_lo_f[1] + || y >= roundoff_hi_f[1], + || z < roundoff_lo_f[2] + || z >= roundoff_hi_f[2]); +#else + bool outside = AMREX_D_TERM(x < roundoff_lo_d[0] + || x >= roundoff_hi_d[0], + || y < roundoff_lo_d[1] + || y >= roundoff_hi_d[1], + || z < roundoff_lo_d[2] + || z >= roundoff_hi_d[2]); +#endif return outside; } bool 
-Geometry::insideRoundoffDomain (AMREX_D_DECL(Real x, Real y, Real z)) const +Geometry::insideRoundoffDomain (AMREX_D_DECL(ParticleReal x, ParticleReal y, ParticleReal z)) const { return !outsideRoundoffDomain(AMREX_D_DECL(x, y, z)); } diff --git a/Src/Base/AMReX_GpuAtomic.H b/Src/Base/AMReX_GpuAtomic.H index e6b2780abe0..a07704cb86b 100644 --- a/Src/Base/AMReX_GpuAtomic.H +++ b/Src/Base/AMReX_GpuAtomic.H @@ -30,15 +30,16 @@ namespace detail { { #if defined(__SYCL_DEVICE_ONLY__) constexpr auto mo = sycl::memory_order::relaxed; + constexpr auto ms = sycl::memory_scope::device; constexpr auto as = sycl::access::address_space::global_space; static_assert(sizeof(R) == sizeof(I), "sizeof R != sizeof I"); I* const add_as_I = reinterpret_cast(address); - sycl::atomic a{sycl::multi_ptr(add_as_I)}; - I old_I = a.load(mo), new_I; + sycl::atomic_ref a{*add_as_I}; + I old_I = a.load(), new_I; do { R const new_R = f(*(reinterpret_cast(&old_I)), val); new_I = *(reinterpret_cast(&new_R)); - } while (! a.compare_exchange_strong(old_I, new_I, mo)); + } while (! a.compare_exchange_strong(old_I, new_I)); return *(reinterpret_cast(&old_I)); #else R old = *address; @@ -53,17 +54,18 @@ namespace detail { { #if defined(__SYCL_DEVICE_ONLY__) constexpr auto mo = sycl::memory_order::relaxed; + constexpr auto ms = sycl::memory_scope::device; constexpr auto as = sycl::access::address_space::global_space; static_assert(sizeof(R) == sizeof(I), "sizeof R != sizeof I"); I* const add_as_I = reinterpret_cast(address); - sycl::atomic a{sycl::multi_ptr(add_as_I)}; - I old_I = a.load(mo), new_I; + sycl::atomic_ref a{*add_as_I}; + I old_I = a.load(), new_I; bool test_success; do { R const tmp = op(*(reinterpret_cast(&old_I)), val); new_I = *(reinterpret_cast(&tmp)); test_success = cond(tmp); - } while (test_success && ! a.compare_exchange_strong(old_I, new_I, mo)); + } while (test_success && ! 
a.compare_exchange_strong(old_I, new_I)); return test_success; #else R old = *address; @@ -131,9 +133,10 @@ namespace detail { return atomicAdd(sum, value); #elif defined(__SYCL_DEVICE_ONLY__) constexpr auto mo = sycl::memory_order::relaxed; + constexpr auto ms = sycl::memory_scope::device; constexpr auto as = sycl::access::address_space::global_space; - sycl::atomic a{sycl::multi_ptr(sum)}; - return a.fetch_add(value, mo); + sycl::atomic_ref a{*sum}; + return a.fetch_add(value); #else amrex::ignore_unused(sum, value); return T(); // should never get here, but have to return something @@ -313,9 +316,10 @@ namespace detail { return atomicMin(m, value); #elif defined(__SYCL_DEVICE_ONLY__) constexpr auto mo = sycl::memory_order::relaxed; + constexpr auto ms = sycl::memory_scope::device; constexpr auto as = sycl::access::address_space::global_space; - sycl::atomic a{sycl::multi_ptr(m)}; - return a.fetch_min(value, mo); + sycl::atomic_ref a{*m}; + return a.fetch_min(value); #else amrex::ignore_unused(m,value); return T(); // should never get here, but have to return something @@ -373,9 +377,10 @@ namespace detail { return atomicMax(m, value); #elif defined(__SYCL_DEVICE_ONLY__) constexpr auto mo = sycl::memory_order::relaxed; + constexpr auto ms = sycl::memory_scope::device; constexpr auto as = sycl::access::address_space::global_space; - sycl::atomic a{sycl::multi_ptr(m)}; - return a.fetch_max(value, mo); + sycl::atomic_ref a{*m}; + return a.fetch_max(value); #else amrex::ignore_unused(m,value); return T(); // should never get here, but have to return something @@ -430,9 +435,10 @@ namespace detail { return atomicOr(m, value); #elif defined(__SYCL_DEVICE_ONLY__) constexpr auto mo = sycl::memory_order::relaxed; + constexpr auto ms = sycl::memory_scope::device; constexpr auto as = sycl::access::address_space::global_space; - sycl::atomic a{sycl::multi_ptr(m)}; - return a.fetch_or(value, mo); + sycl::atomic_ref a{*m}; + return a.fetch_or(value); #else int const old = *m; *m = (*m) || value; @@ -451,9 +457,10 @@ namespace detail { return atomicAnd(m, value ? ~0x0 : 0); #elif defined(__SYCL_DEVICE_ONLY__) constexpr auto mo = sycl::memory_order::relaxed; + constexpr auto ms = sycl::memory_scope::device; constexpr auto as = sycl::access::address_space::global_space; - sycl::atomic a{sycl::multi_ptr(m)}; - return a.fetch_and(value ? ~0x0 : 0, mo); + sycl::atomic_ref a{*m}; + return a.fetch_and(value ? ~0x0 : 0); #else int const old = *m; *m = (*m) && value; @@ -472,11 +479,12 @@ namespace detail { { #if defined(__SYCL_DEVICE_ONLY__) constexpr auto mo = sycl::memory_order::relaxed; - sycl::atomic a{sycl::multi_ptr(m)}; - unsigned int oldi = a.load(mo), newi; + constexpr auto ms = sycl::memory_scope::device; + sycl::atomic_ref a{*m}; + unsigned int oldi = a.load(), newi; do { newi = (oldi >= value) ? 0u : (oldi+1u); - } while (! a.compare_exchange_strong(oldi, newi, mo)); + } while (! a.compare_exchange_strong(oldi, newi)); return oldi; #else auto const old = *m; @@ -509,12 +517,13 @@ namespace detail { return atomicDec(m, value); #elif defined(__SYCL_DEVICE_ONLY__) constexpr auto mo = sycl::memory_order::relaxed; + constexpr auto ms = sycl::memory_scope::device; constexpr auto as = sycl::access::address_space::global_space; - sycl::atomic a{sycl::multi_ptr(m)}; - unsigned int oldi = a.load(mo), newi; + sycl::atomic_ref a{*m}; + unsigned int oldi = a.load(), newi; do { newi = ((oldi == 0u) || (oldi > value)) ? value : (oldi-1u); - } while (! a.compare_exchange_strong(oldi, newi, mo)); + } while (! 
a.compare_exchange_strong(oldi, newi)); return oldi; #else auto const old = *m; @@ -535,9 +544,10 @@ namespace detail { return atomicExch(address, val); #elif defined(__SYCL_DEVICE_ONLY__) constexpr auto mo = sycl::memory_order::relaxed; + constexpr auto ms = sycl::memory_scope::device; constexpr auto as = sycl::access::address_space::global_space; - sycl::atomic a{sycl::multi_ptr(address)}; - return sycl::atomic_exchange(a, val, mo); + sycl::atomic_ref a{*address}; + return a.exchange(val); #else auto const old = *address; *address = val; @@ -557,9 +567,10 @@ namespace detail { return atomicCAS(address, compare, val); #elif defined(__SYCL_DEVICE_ONLY__) constexpr auto mo = sycl::memory_order::relaxed; + constexpr auto ms = sycl::memory_scope::device; constexpr auto as = sycl::access::address_space::global_space; - sycl::atomic a{sycl::multi_ptr(address)}; - a.compare_exchange_strong(compare, val, mo); + sycl::atomic_ref a{*address}; + a.compare_exchange_strong(compare, val); return compare; #else auto const old = *address; diff --git a/Src/Base/AMReX_GpuContainers.H b/Src/Base/AMReX_GpuContainers.H index cc68770ff3f..faccec1d2ef 100644 --- a/Src/Base/AMReX_GpuContainers.H +++ b/Src/Base/AMReX_GpuContainers.H @@ -19,13 +19,19 @@ namespace Gpu { /** * \brief A PODVector that uses the standard memory Arena. - * Note that, on NVIDIA architectures, this Arena is actually - * managed. - * + * Note that the memory might or might not be managed depending + * on the amrex.the_arena_is_managed ParmParse parameter. */ template using DeviceVector = PODVector >; + /** + * \brief A PODVector that uses the non-managed device memory arena. + * + */ + template + using NonManagedDeviceVector = PODVector >; + /** * \brief A PODVector that uses the managed memory arena. * @@ -83,6 +89,9 @@ namespace Gpu { template using HostVector = PODVector; + template + using NonManagedVector = PODVector; + template using ManagedVector = PODVector; diff --git a/Src/Base/AMReX_GpuDevice.H b/Src/Base/AMReX_GpuDevice.H index 8a327704a1d..a61ab4fe406 100644 --- a/Src/Base/AMReX_GpuDevice.H +++ b/Src/Base/AMReX_GpuDevice.H @@ -3,6 +3,7 @@ #include #include +#include #include #include #include @@ -148,9 +149,9 @@ public: // definition: https://github.com/llvm/llvm-project/blob/62ec4ac90738a5f2d209ed28c822223e58aaaeb7/clang/lib/Basic/Targets/AMDGPU.cpp#L400 // overview wavefront size: https://github.com/llvm/llvm-project/blob/efc063b621ea0c4d1e452bcade62f7fc7e1cc937/clang/test/Driver/amdgpu-macros.cl#L70-L115 // gfx10XX has 32 threads per wavefront else 64 - static constexpr int warp_size = __AMDGCN_WAVEFRONT_SIZE; + static AMREX_EXPORT constexpr int warp_size = __AMDGCN_WAVEFRONT_SIZE; # else - static constexpr int warp_size = AMREX_HIP_OR_CUDA_OR_DPCPP(64,32,16); + static AMREX_EXPORT constexpr int warp_size = AMREX_HIP_OR_CUDA_OR_DPCPP(64,32,16); # endif static unsigned int maxBlocksPerLaunch () noexcept { return max_blocks_per_launch; } @@ -166,28 +167,28 @@ private: static void initialize_gpu (); - static int device_id; - static int num_devices_used; - static int verbose; - static int max_gpu_streams; + static AMREX_EXPORT int device_id; + static AMREX_EXPORT int num_devices_used; + static AMREX_EXPORT int verbose; + static AMREX_EXPORT int max_gpu_streams; #ifdef AMREX_USE_GPU - static dim3 numThreadsMin; - static dim3 numBlocksOverride, numThreadsOverride; + static AMREX_EXPORT dim3 numThreadsMin; + static AMREX_EXPORT dim3 numBlocksOverride, numThreadsOverride; // We build gpu_default_stream and gpu_stream_pool. 
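One more note on the AMReX_GpuAtomic.H hunks above: they port the SYCL side from the deprecated `sycl::atomic` to `sycl::atomic_ref`, but the retry loop they wrap is unchanged, the standard compare-and-swap pattern for applying an arbitrary operation atomically. The same loop in portable `std::atomic` form (`fetch_op` is an illustrative name):

    #include <atomic>
    #include <cassert>

    template <class T, class Op>
    T fetch_op (std::atomic<T>& a, T val, Op op)
    {
        T old = a.load(std::memory_order_relaxed);
        T desired;
        do {
            desired = op(old, val);  // recompute from the latest observed value
        } while (!a.compare_exchange_strong(old, desired,
                                            std::memory_order_relaxed));
        return old;                  // value before the update, as the GPU APIs do
    }

    int main ()
    {
        std::atomic<double> x{2.0};
        double prev = fetch_op(x, 3.0, [] (double a, double b) { return a*b; });
        assert(prev == 2.0 && x.load() == 6.0);
    }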
// The non-owning gpu_stream is used to store the current stream that will be used. // gpu_stream is a vector so that it's thread safe to write to it. - static gpuStream_t gpu_default_stream; - static Vector gpu_stream_pool; // The size of this is max_gpu_stream - static Vector gpu_stream; // The size of this is omp_max_threads - static gpuDeviceProp_t device_prop; - static int memory_pools_supported; - static unsigned int max_blocks_per_launch; + static AMREX_EXPORT gpuStream_t gpu_default_stream; + static AMREX_EXPORT Vector gpu_stream_pool; // The size of this is max_gpu_stream + static AMREX_EXPORT Vector gpu_stream; // The size of this is omp_max_threads + static AMREX_EXPORT gpuDeviceProp_t device_prop; + static AMREX_EXPORT int memory_pools_supported; + static AMREX_EXPORT unsigned int max_blocks_per_launch; #ifdef AMREX_USE_DPCPP - static std::unique_ptr sycl_context; - static std::unique_ptr sycl_device; + static AMREX_EXPORT std::unique_ptr sycl_context; + static AMREX_EXPORT std::unique_ptr sycl_device; #endif #endif }; diff --git a/Src/Base/AMReX_GpuDevice.cpp b/Src/Base/AMReX_GpuDevice.cpp index 8d42363f0a7..fe7257ea971 100644 --- a/Src/Base/AMReX_GpuDevice.cpp +++ b/Src/Base/AMReX_GpuDevice.cpp @@ -22,9 +22,9 @@ #if defined(AMREX_USE_HIP) #include #if defined(AMREX_USE_ROCTX) -#include +#include #if defined(AMREX_PROFILING) || defined (AMREX_TINY_PROFILING) -#include +#include #endif #endif #endif @@ -397,11 +397,7 @@ Device::initialize_gpu () // check compute capability - if (sizeof(Real) == 8) { - AMREX_HIP_SAFE_CALL(hipDeviceSetSharedMemConfig(hipSharedMemBankSizeEightByte)); - } else if (sizeof(Real) == 4) { - AMREX_HIP_SAFE_CALL(hipDeviceSetSharedMemConfig(hipSharedMemBankSizeFourByte)); - } + // AMD devices do not support shared cache banking. AMREX_HIP_SAFE_CALL(hipStreamCreate(&gpu_default_stream)); for (int i = 0; i < max_gpu_streams; ++i) { @@ -467,8 +463,8 @@ Device::initialize_gpu () device_prop.warpSize = warp_size; auto sgss = d.get_info(); device_prop.maxMemAllocSize = d.get_info(); - device_prop.managedMemory = d.get_info(); - device_prop.concurrentManagedAccess = d.get_info(); + device_prop.managedMemory = d.has(sycl::aspect::usm_host_allocations); + device_prop.concurrentManagedAccess = d.has(sycl::aspect::usm_shared_allocations); device_prop.maxParameterSize = d.get_info(); { amrex::Print() << "Device Properties:\n" diff --git a/Src/Base/AMReX_GpuLaunch.H b/Src/Base/AMReX_GpuLaunch.H index d31bae568c1..7e877140629 100644 --- a/Src/Base/AMReX_GpuLaunch.H +++ b/Src/Base/AMReX_GpuLaunch.H @@ -30,11 +30,11 @@ #define AMREX_GPU_Z_STRIDE 1 #ifdef AMREX_USE_CUDA -# define AMREX_LAUNCH_KERNEL(blocks, threads, sharedMem, stream, ... ) \ - amrex::launch_global<<>>(__VA_ARGS__); +# define AMREX_LAUNCH_KERNEL(MT, blocks, threads, sharedMem, stream, ... ) \ + amrex::launch_global<<>>(__VA_ARGS__) #elif defined(AMREX_USE_HIP) -# define AMREX_LAUNCH_KERNEL(blocks, threads, sharedMem, stream, ... ) \ - hipLaunchKernelGGL(launch_global, blocks, threads, sharedMem, stream, __VA_ARGS__); +# define AMREX_LAUNCH_KERNEL(MT, blocks, threads, sharedMem, stream, ... 
) \ + hipLaunchKernelGGL(launch_global, blocks, threads, sharedMem, stream, __VA_ARGS__) #endif @@ -151,6 +151,28 @@ namespace Gpu { dim3 numThreads; std::size_t sharedMem = 0; }; + + template + ExecutionConfig + makeExecutionConfig (Long N) noexcept + { + ExecutionConfig ec(dim3{}, dim3{}); + ec.numBlocks.x = (std::max(N,Long(1)) + MT - 1) / MT; + ec.numThreads.x = MT; + AMREX_ASSERT(MT % Gpu::Device::warp_size == 0); + return ec; + } + + template + ExecutionConfig + makeExecutionConfig (const Box& box) noexcept + { + ExecutionConfig ec(dim3{}, dim3{}); + ec.numBlocks.x = (std::max(box.numPts(),Long(1)) + MT - 1) / MT; + ec.numThreads.x = MT; + AMREX_ASSERT(MT % Gpu::Device::warp_size == 0); + return ec; + } #endif } @@ -221,6 +243,8 @@ namespace Gpu { #ifdef AMREX_USE_GPU +#ifndef AMREX_USE_DPCPP + #define AMREX_HOST_DEVICE_PARALLEL_FOR_1D_FLAG(where_to_run,n,i,block) \ { using amrex_i_inttype = typename std::remove_const::type; \ if ((where_to_run == RunOn::Device) && (Gpu::inLaunchRegion())) \ @@ -344,6 +368,111 @@ namespace Gpu { block3; \ } +#else +// xxxxx DPCPP todo: host disabled in host device + +#define AMREX_HOST_DEVICE_PARALLEL_FOR_1D_FLAG(where_to_run,n,i,block) \ + { using amrex_i_inttype = typename std::remove_const::type; \ + if ((where_to_run == RunOn::Device) && (Gpu::inLaunchRegion())) \ + { \ + amrex::ParallelFor(n, [=] AMREX_GPU_DEVICE (amrex_i_inttype i) noexcept \ + block \ + ); \ + } \ + else { \ + amrex::Abort("amrex:: HOST_DEVICE disabled for Intel. It takes too long to compile"); \ + }} + +#define AMREX_HOST_DEVICE_PARALLEL_FOR_3D_FLAG(where_to_run,box,i,j,k,block) \ + if ((where_to_run == RunOn::Device) && (Gpu::inLaunchRegion())) \ + { \ + amrex::ParallelFor(box, [=] AMREX_GPU_DEVICE (int i, int j, int k) noexcept \ + block \ + ); \ + } \ + else { \ + amrex::Abort("amrex:: HOST_DEVICE disabled for Intel. It takes too long to compile"); \ + } + +#define AMREX_HOST_DEVICE_PARALLEL_FOR_4D_FLAG(where_to_run,box,nc,i,j,k,n,block) \ + if ((where_to_run == RunOn::Device) && (Gpu::inLaunchRegion())) \ + { \ + amrex::ParallelFor(box, nc, [=] AMREX_GPU_DEVICE (int i, int j, int k, int n) noexcept \ + block \ + ); \ + } \ + else { \ + amrex::Abort("amrex:: HOST_DEVICE disabled for Intel. It takes too long to compile"); \ + } + +#define AMREX_HOST_DEVICE_FOR_1D_FLAG(where_to_run,n,i,block) \ + { using amrex_i_inttype = typename std::remove_const::type; \ + if ((where_to_run == RunOn::Device) && (Gpu::inLaunchRegion())) \ + { \ + amrex::ParallelFor(n, [=] AMREX_GPU_DEVICE (amrex_i_inttype i) noexcept \ + block \ + ); \ + } \ + else { \ + amrex::Abort("amrex:: HOST_DEVICE disabled for Intel. It takes too long to compile"); \ + }} + +#define AMREX_HOST_DEVICE_FOR_3D_FLAG(where_to_run,box,i,j,k,block) \ + if ((where_to_run == RunOn::Device) && (Gpu::inLaunchRegion())) \ + { \ + amrex::ParallelFor(box, [=] AMREX_GPU_DEVICE (int i, int j, int k) noexcept \ + block \ + ); \ + } \ + else { \ + amrex::Abort("amrex:: HOST_DEVICE disabled for Intel. It takes too long to compile"); \ + } + +#define AMREX_HOST_DEVICE_FOR_4D_FLAG(where_to_run,box,nc,i,j,k,n,block) \ + if ((where_to_run == RunOn::Device) && (Gpu::inLaunchRegion())) \ + { \ + amrex::ParallelFor(box, nc, [=] AMREX_GPU_DEVICE (int i, int j, int k, int n) noexcept \ + block \ + ); \ + } \ + else { \ + amrex::Abort("amrex:: HOST_DEVICE disabled for Intel. 
It takes too long to compile"); \ + } + +#define AMREX_LAUNCH_HOST_DEVICE_LAMBDA_FLAG(where_to_run,box,tbox,block) \ + if ((where_to_run == RunOn::Device) && (Gpu::inLaunchRegion())) \ + { \ + AMREX_LAUNCH_DEVICE_LAMBDA(box,tbox,block); \ + } else { \ + amrex::Abort("amrex:: HOST_DEVICE disabled for Intel. It takes too long to compile"); \ + } + +#define AMREX_LAUNCH_HOST_DEVICE_LAMBDA_RANGE_FLAG(where_to_run,bx1,tbx1,block1) \ + if ((where_to_run == RunOn::Device) && (Gpu::inLaunchRegion())) \ + { \ + AMREX_LAUNCH_DEVICE_LAMBDA(bx1,tbx1,block1); \ + } else { \ + amrex::Abort("amrex:: HOST_DEVICE disabled for Intel. It takes too long to compile"); \ + } + +#define AMREX_LAUNCH_HOST_DEVICE_LAMBDA_RANGE_2_FLAG(where_to_run,bx1,tbx1,block1,bx2,tbx2,block2) \ + if ((where_to_run == RunOn::Device) && (Gpu::inLaunchRegion())) \ + { \ + AMREX_LAUNCH_DEVICE_LAMBDA(bx1,tbx1,block1,bx2,tbx2,block2); \ + } else { \ + amrex::Abort("amrex:: HOST_DEVICE disabled for Intel. It takes too long to compile"); \ + } + +#define AMREX_LAUNCH_HOST_DEVICE_LAMBDA_RANGE_3_FLAG(where_to_run,bx1,tbx1,block1,bx2,tbx2,block2,bx3,tbx3,block3) \ + if ((where_to_run == RunOn::Device) && (Gpu::inLaunchRegion())) \ + { \ + AMREX_LAUNCH_DEVICE_LAMBDA(bx1,tbx1,block1,bx2,tbx2,block2,bx3,tbx3,block3); \ + } else { \ + amrex::Abort("amrex:: HOST_DEVICE disabled for Intel. It takes too long to compile"); \ + } + +#endif + #else #define AMREX_HOST_DEVICE_PARALLEL_FOR_1D_FLAG(where_to_run,n,i,block) \ @@ -421,4 +550,6 @@ namespace Gpu { #endif +#include + #endif diff --git a/Src/Base/AMReX_GpuLaunchFunctsC.H b/Src/Base/AMReX_GpuLaunchFunctsC.H index 025b43fec0a..6ce9cca0f3a 100644 --- a/Src/Base/AMReX_GpuLaunchFunctsC.H +++ b/Src/Base/AMReX_GpuLaunchFunctsC.H @@ -55,11 +55,18 @@ namespace detail { } template -void launch (T const& n, L&& f, std::size_t /*shared_mem_bytes*/=0) noexcept +void launch (T const& n, L&& f) noexcept { f(n); } +template +void launch (T const& n, L&& f) noexcept +{ + amrex::ignore_unused(MT); + f(n); +} + template ::value> > void For (T n, L&& f) noexcept { @@ -68,12 +75,26 @@ void For (T n, L&& f) noexcept } } +template ::value> > +void For (T n, L&& f) noexcept +{ + amrex::ignore_unused(MT); + For(n, std::forward(f)); +} + template ::value> > void For (Gpu::KernelInfo const&, T n, L&& f) noexcept { For(n, std::forward(f)); } +template ::value> > +void For (Gpu::KernelInfo const&, T n, L&& f) noexcept +{ + amrex::ignore_unused(MT); + For(n, std::forward(f)); +} + template ::value> > void ParallelFor (T n, L&& f) noexcept { @@ -83,12 +104,26 @@ void ParallelFor (T n, L&& f) noexcept } } +template ::value> > +void ParallelFor (T n, L&& f) noexcept +{ + amrex::ignore_unused(MT); + ParallelFor(n, std::forward(f)); +} + template ::value> > void ParallelFor (Gpu::KernelInfo const&, T n, L&& f) noexcept { ParallelFor(n, std::forward(f)); } +template ::value> > +void ParallelFor (Gpu::KernelInfo const&, T n, L&& f) noexcept +{ + amrex::ignore_unused(MT); + ParallelFor(n, std::forward(f)); +} + template void For (Box const& box, L&& f) noexcept { @@ -101,12 +136,26 @@ void For (Box const& box, L&& f) noexcept }}} } +template +void For (Box const& box, L&& f) noexcept +{ + amrex::ignore_unused(MT); + For(box, std::forward(f)); +} + template void For (Gpu::KernelInfo const&, Box const& box, L&& f) noexcept { For(box, std::forward(f)); } +template +void For (Gpu::KernelInfo const&, Box const& box, L&& f) noexcept +{ + amrex::ignore_unused(MT); + For(box, std::forward(f)); +} + template void ParallelFor (Box const& 
box, L&& f) noexcept { @@ -120,12 +169,26 @@ void ParallelFor (Box const& box, L&& f) noexcept }}} } +template +void ParallelFor (Box const& box, L&& f) noexcept +{ + amrex::ignore_unused(MT); + ParallelFor(box, std::forward(f)); +} + template void ParallelFor (Gpu::KernelInfo const&, Box const& box, L&& f) noexcept { ParallelFor(box, std::forward(f)); } +template +void ParallelFor (Gpu::KernelInfo const&, Box const& box, L&& f) noexcept +{ + amrex::ignore_unused(MT); + ParallelFor(box, std::forward(f)); +} + template ::value> > void For (Box const& box, T ncomp, L&& f) noexcept { @@ -140,12 +203,26 @@ void For (Box const& box, T ncomp, L&& f) noexcept } } +template ::value> > +void For (Box const& box, T ncomp, L&& f) noexcept +{ + amrex::ignore_unused(MT); + For(box, ncomp, std::forward(f)); +} + template ::value> > void For (Gpu::KernelInfo const&, Box const& box, T ncomp, L&& f) noexcept { For(box, ncomp, std::forward(f)); } +template ::value> > +void For (Gpu::KernelInfo const&, Box const& box, T ncomp, L&& f) noexcept +{ + amrex::ignore_unused(MT); + For(box, ncomp, std::forward(f)); +} + template ::value> > void ParallelFor (Box const& box, T ncomp, L&& f) noexcept { @@ -161,12 +238,26 @@ void ParallelFor (Box const& box, T ncomp, L&& f) noexcept } } +template ::value> > +void ParallelFor (Box const& box, T ncomp, L&& f) noexcept +{ + amrex::ignore_unused(MT); + ParallelFor(box, ncomp, std::forward(f)); +} + template ::value> > void ParallelFor (Gpu::KernelInfo const&, Box const& box, T ncomp, L&& f) noexcept { ParallelFor(box, ncomp, std::forward(f)); } +template ::value> > +void ParallelFor (Gpu::KernelInfo const&, Box const& box, T ncomp, L&& f) noexcept +{ + amrex::ignore_unused(MT); + ParallelFor(box, ncomp, std::forward(f)); +} + template void For (Box const& box1, Box const& box2, L1&& f1, L2&& f2) noexcept { @@ -174,12 +265,27 @@ void For (Box const& box1, Box const& box2, L1&& f1, L2&& f2) noexcept For(box2, std::forward(f2)); } +template +void For (Box const& box1, Box const& box2, L1&& f1, L2&& f2) noexcept +{ + amrex::ignore_unused(MT); + For(box1, std::forward(f1)); + For(box2, std::forward(f2)); +} + template void For (Gpu::KernelInfo const&, Box const& box1, Box const& box2, L1&& f1, L2&& f2) noexcept { For (box1, box2, std::forward(f1), std::forward(f2)); } +template +void For (Gpu::KernelInfo const&, Box const& box1, Box const& box2, L1&& f1, L2&& f2) noexcept +{ + amrex::ignore_unused(MT); + For (box1, box2, std::forward(f1), std::forward(f2)); +} + template void For (Box const& box1, Box const& box2, Box const& box3, L1&& f1, L2&& f2, L3&& f3) noexcept { @@ -188,12 +294,28 @@ void For (Box const& box1, Box const& box2, Box const& box3, L1&& f1, L2&& f2, L For(box3, std::forward(f3)); } +template +void For (Box const& box1, Box const& box2, Box const& box3, L1&& f1, L2&& f2, L3&& f3) noexcept +{ + amrex::ignore_unused(MT); + For(box1, std::forward(f1)); + For(box2, std::forward(f2)); + For(box3, std::forward(f3)); +} + template void For (Gpu::KernelInfo const&, Box const& box1, Box const& box2, Box const& box3, L1&& f1, L2&& f2, L3&& f3) noexcept { For(box1, box2, box3, std::forward(f1), std::forward(f2), std::forward(f3)); } +template +void For (Gpu::KernelInfo const&, Box const& box1, Box const& box2, Box const& box3, L1&& f1, L2&& f2, L3&& f3) noexcept +{ + amrex::ignore_unused(MT); + For(box1, box2, box3, std::forward(f1), std::forward(f2), std::forward(f3)); +} + template ::value>, typename M2=std::enable_if_t::value> > @@ -204,6 +326,17 @@ void For (Box 
const& box1, T1 ncomp1, L1&& f1, For(box2, ncomp2, std::forward(f2)); } +template ::value>, + typename M2=std::enable_if_t::value> > +void For (Box const& box1, T1 ncomp1, L1&& f1, + Box const& box2, T2 ncomp2, L2&& f2) noexcept +{ + amrex::ignore_unused(MT); + For(box1, ncomp1, std::forward(f1)); + For(box2, ncomp2, std::forward(f2)); +} + template ::value>, typename M2=std::enable_if_t::value> > @@ -214,6 +347,17 @@ void For (Gpu::KernelInfo const&, For(box1,ncomp1,std::forward(f1),box2,ncomp2,std::forward(f2)); } +template ::value>, + typename M2=std::enable_if_t::value> > +void For (Gpu::KernelInfo const&, + Box const& box1, T1 ncomp1, L1&& f1, + Box const& box2, T2 ncomp2, L2&& f2) noexcept +{ + amrex::ignore_unused(MT); + For(box1,ncomp1,std::forward(f1),box2,ncomp2,std::forward(f2)); +} + template ::value>, typename M2=std::enable_if_t::value>, @@ -227,6 +371,20 @@ void For (Box const& box1, T1 ncomp1, L1&& f1, For(box3, ncomp3, std::forward(f3)); } +template ::value>, + typename M2=std::enable_if_t::value>, + typename M3=std::enable_if_t::value> > +void For (Box const& box1, T1 ncomp1, L1&& f1, + Box const& box2, T2 ncomp2, L2&& f2, + Box const& box3, T3 ncomp3, L3&& f3) noexcept +{ + amrex::ignore_unused(MT); + For(box1, ncomp1, std::forward(f1)); + For(box2, ncomp2, std::forward(f2)); + For(box3, ncomp3, std::forward(f3)); +} + template ::value>, typename M2=std::enable_if_t::value>, @@ -241,6 +399,21 @@ void For (Gpu::KernelInfo const&, box3,ncomp3,std::forward(f3)); } +template ::value>, + typename M2=std::enable_if_t::value>, + typename M3=std::enable_if_t::value> > +void For (Gpu::KernelInfo const&, + Box const& box1, T1 ncomp1, L1&& f1, + Box const& box2, T2 ncomp2, L2&& f2, + Box const& box3, T3 ncomp3, L3&& f3) noexcept +{ + amrex::ignore_unused(MT); + For(box1,ncomp1,std::forward(f1), + box2,ncomp2,std::forward(f2), + box3,ncomp3,std::forward(f3)); +} + template void ParallelFor (Box const& box1, Box const& box2, L1&& f1, L2&& f2) noexcept { @@ -248,12 +421,27 @@ void ParallelFor (Box const& box1, Box const& box2, L1&& f1, L2&& f2) noexcept ParallelFor(box2, std::forward(f2)); } +template +void ParallelFor (Box const& box1, Box const& box2, L1&& f1, L2&& f2) noexcept +{ + amrex::ignore_unused(MT); + ParallelFor(box1, std::forward(f1)); + ParallelFor(box2, std::forward(f2)); +} + template void ParallelFor (Gpu::KernelInfo const&, Box const& box1, Box const& box2, L1&& f1, L2&& f2) noexcept { ParallelFor(box1,box2,f1,f2); } +template +void ParallelFor (Gpu::KernelInfo const&, Box const& box1, Box const& box2, L1&& f1, L2&& f2) noexcept +{ + amrex::ignore_unused(MT); + ParallelFor(box1,box2,f1,f2); +} + template void ParallelFor (Box const& box1, Box const& box2, Box const& box3, L1&& f1, L2&& f2, L3&& f3) noexcept { @@ -262,12 +450,28 @@ void ParallelFor (Box const& box1, Box const& box2, Box const& box3, L1&& f1, L2 ParallelFor(box3, std::forward(f3)); } +template +void ParallelFor (Box const& box1, Box const& box2, Box const& box3, L1&& f1, L2&& f2, L3&& f3) noexcept +{ + amrex::ignore_unused(MT); + ParallelFor(box1, std::forward(f1)); + ParallelFor(box2, std::forward(f2)); + ParallelFor(box3, std::forward(f3)); +} + template void ParallelFor (Gpu::KernelInfo const&, Box const& box1, Box const& box2, Box const& box3, L1&& f1, L2&& f2, L3&& f3) noexcept { ParallelFor(box1,box2,box3,std::forward(f1),std::forward(f2),std::forward(f3)); } +template +void ParallelFor (Gpu::KernelInfo const&, Box const& box1, Box const& box2, Box const& box3, L1&& f1, L2&& f2, L3&& f3) noexcept 
+{ + amrex::ignore_unused(MT); + ParallelFor(box1,box2,box3,std::forward(f1),std::forward(f2),std::forward(f3)); +} + template ::value>, typename M2=std::enable_if_t::value> > @@ -278,6 +482,17 @@ void ParallelFor (Box const& box1, T1 ncomp1, L1&& f1, ParallelFor(box2, ncomp2, std::forward(f2)); } +template ::value>, + typename M2=std::enable_if_t::value> > +void ParallelFor (Box const& box1, T1 ncomp1, L1&& f1, + Box const& box2, T2 ncomp2, L2&& f2) noexcept +{ + amrex::ignore_unused(MT); + ParallelFor(box1, ncomp1, std::forward(f1)); + ParallelFor(box2, ncomp2, std::forward(f2)); +} + template ::value>, typename M2=std::enable_if_t::value> > @@ -289,6 +504,18 @@ void ParallelFor (Gpu::KernelInfo const&, box2,ncomp2,std::forward(f2)); } +template ::value>, + typename M2=std::enable_if_t::value> > +void ParallelFor (Gpu::KernelInfo const&, + Box const& box1, T1 ncomp1, L1&& f1, + Box const& box2, T2 ncomp2, L2&& f2) noexcept +{ + amrex::ignore_unused(MT); + ParallelFor(box1,ncomp1,std::forward(f1), + box2,ncomp2,std::forward(f2)); +} + template ::value>, typename M2=std::enable_if_t::value>, @@ -302,6 +529,20 @@ void ParallelFor (Box const& box1, T1 ncomp1, L1&& f1, ParallelFor(box3, ncomp3, std::forward(f3)); } +template ::value>, + typename M2=std::enable_if_t::value>, + typename M3=std::enable_if_t::value> > +void ParallelFor (Box const& box1, T1 ncomp1, L1&& f1, + Box const& box2, T2 ncomp2, L2&& f2, + Box const& box3, T3 ncomp3, L3&& f3) noexcept +{ + amrex::ignore_unused(MT); + ParallelFor(box1, ncomp1, std::forward(f1)); + ParallelFor(box2, ncomp2, std::forward(f2)); + ParallelFor(box3, ncomp3, std::forward(f3)); +} + template ::value>, typename M2=std::enable_if_t::value>, @@ -316,30 +557,73 @@ void ParallelFor (Gpu::KernelInfo const&, box3, ncomp3, std::forward(f3)); } +template ::value>, + typename M2=std::enable_if_t::value>, + typename M3=std::enable_if_t::value> > +void ParallelFor (Gpu::KernelInfo const&, + Box const& box1, T1 ncomp1, L1&& f1, + Box const& box2, T2 ncomp2, L2&& f2, + Box const& box3, T3 ncomp3, L3&& f3) noexcept +{ + amrex::ignore_unused(MT); + ParallelFor(box1, ncomp1, std::forward(f1), + box2, ncomp2, std::forward(f2), + box3, ncomp3, std::forward(f3)); +} + template ::value> > void HostDeviceParallelFor (T n, L&& f) noexcept { ParallelFor(n,std::forward(f)); } +template ::value> > +void HostDeviceParallelFor (T n, L&& f) noexcept +{ + amrex::ignore_unused(MT); + ParallelFor(n,std::forward(f)); +} + template void HostDeviceParallelFor (Box const& box, L&& f) noexcept { ParallelFor(box,std::forward(f)); } +template +void HostDeviceParallelFor (Box const& box, L&& f) noexcept +{ + amrex::ignore_unused(MT); + ParallelFor(box,std::forward(f)); +} + template ::value> > void HostDeviceParallelFor (Box const& box, T ncomp, L&& f) noexcept { ParallelFor(box,ncomp,std::forward(f)); } +template ::value> > +void HostDeviceParallelFor (Box const& box, T ncomp, L&& f) noexcept +{ + amrex::ignore_unused(MT); + ParallelFor(box,ncomp,std::forward(f)); +} + template void HostDeviceParallelFor (Box const& box1, Box const& box2, L1&& f1, L2&& f2) noexcept { ParallelFor(box1,box2,std::forward(f1),std::forward(f2)); } +template +void HostDeviceParallelFor (Box const& box1, Box const& box2, L1&& f1, L2&& f2) noexcept +{ + amrex::ignore_unused(MT); + ParallelFor(box1,box2,std::forward(f1),std::forward(f2)); +} + template void HostDeviceParallelFor (Box const& box1, Box const& box2, Box const& box3, L1&& f1, L2&& f2, L3&& f3) noexcept @@ -347,6 +631,14 @@ void 
HostDeviceParallelFor (Box const& box1, Box const& box2, Box const& box3, ParallelFor(box1,box2,box3,std::forward(f1),std::forward(f2),std::forward(f3)); } +template +void HostDeviceParallelFor (Box const& box1, Box const& box2, Box const& box3, + L1&& f1, L2&& f2, L3&& f3) noexcept +{ + amrex::ignore_unused(MT); + ParallelFor(box1,box2,box3,std::forward(f1),std::forward(f2),std::forward(f3)); +} + template ::value>, typename M2=std::enable_if_t::value> > @@ -356,6 +648,16 @@ void HostDeviceParallelFor (Box const& box1, T1 ncomp1, L1&& f1, ParallelFor(box1,ncomp1,std::forward(f1),box2,ncomp2,std::forward(f2)); } +template ::value>, + typename M2=std::enable_if_t::value> > +void HostDeviceParallelFor (Box const& box1, T1 ncomp1, L1&& f1, + Box const& box2, T2 ncomp2, L2&& f2) noexcept +{ + amrex::ignore_unused(MT); + ParallelFor(box1,ncomp1,std::forward(f1),box2,ncomp2,std::forward(f2)); +} + template ::value>, typename M2=std::enable_if_t::value>, @@ -369,30 +671,72 @@ void HostDeviceParallelFor (Box const& box1, T1 ncomp1, L1&& f1, box3,ncomp3,std::forward(f3)); } +template ::value>, + typename M2=std::enable_if_t::value>, + typename M3=std::enable_if_t::value> > +void HostDeviceParallelFor (Box const& box1, T1 ncomp1, L1&& f1, + Box const& box2, T2 ncomp2, L2&& f2, + Box const& box3, T3 ncomp3, L3&& f3) noexcept +{ + amrex::ignore_unused(MT); + ParallelFor(box1,ncomp1,std::forward(f1), + box2,ncomp2,std::forward(f2), + box3,ncomp3,std::forward(f3)); +} + template ::value> > void HostDeviceFor (T n, L&& f) noexcept { For(n,std::forward(f)); } +template ::value> > +void HostDeviceFor (T n, L&& f) noexcept +{ + amrex::ignore_unused(MT); + For(n,std::forward(f)); +} + template void HostDeviceFor (Box const& box, L&& f) noexcept { For(box,std::forward(f)); } +template +void HostDeviceFor (Box const& box, L&& f) noexcept +{ + amrex::ignore_unused(MT); + For(box,std::forward(f)); +} + template ::value> > void HostDeviceFor (Box const& box, T ncomp, L&& f) noexcept { For(box,ncomp,std::forward(f)); } +template ::value> > +void HostDeviceFor (Box const& box, T ncomp, L&& f) noexcept +{ + amrex::ignore_unused(MT); + For(box,ncomp,std::forward(f)); +} + template void HostDeviceFor (Box const& box1, Box const& box2, L1&& f1, L2&& f2) noexcept { For(box1,box2,std::forward(f1),std::forward(f2)); } +template +void HostDeviceFor (Box const& box1, Box const& box2, L1&& f1, L2&& f2) noexcept +{ + amrex::ignore_unused(MT); + For(box1,box2,std::forward(f1),std::forward(f2)); +} + template void HostDeviceFor (Box const& box1, Box const& box2, Box const& box3, L1&& f1, L2&& f2, L3&& f3) noexcept @@ -400,6 +744,14 @@ void HostDeviceFor (Box const& box1, Box const& box2, Box const& box3, For(box1,box2,box3,std::forward(f1),std::forward(f2),std::forward(f3)); } +template +void HostDeviceFor (Box const& box1, Box const& box2, Box const& box3, + L1&& f1, L2&& f2, L3&& f3) noexcept +{ + amrex::ignore_unused(MT); + For(box1,box2,box3,std::forward(f1),std::forward(f2),std::forward(f3)); +} + template ::value>, typename M2=std::enable_if_t::value> > @@ -409,6 +761,16 @@ void HostDeviceFor (Box const& box1, T1 ncomp1, L1&& f1, For(box1,ncomp1,std::forward(f1),box2,ncomp2,std::forward(f2)); } +template ::value>, + typename M2=std::enable_if_t::value> > +void HostDeviceFor (Box const& box1, T1 ncomp1, L1&& f1, + Box const& box2, T2 ncomp2, L2&& f2) noexcept +{ + amrex::ignore_unused(MT); + For(box1,ncomp1,std::forward(f1),box2,ncomp2,std::forward(f2)); +} + template ::value>, typename M2=std::enable_if_t::value>, @@ 
-422,30 +784,72 @@ void HostDeviceFor (Box const& box1, T1 ncomp1, L1&& f1, box3,ncomp3,std::forward(f3)); } +template ::value>, + typename M2=std::enable_if_t::value>, + typename M3=std::enable_if_t::value> > +void HostDeviceFor (Box const& box1, T1 ncomp1, L1&& f1, + Box const& box2, T2 ncomp2, L2&& f2, + Box const& box3, T3 ncomp3, L3&& f3) noexcept +{ + amrex::ignore_unused(MT); + For(box1,ncomp1,std::forward(f1), + box2,ncomp2,std::forward(f2), + box3,ncomp3,std::forward(f3)); +} + template ::value> > void HostDeviceParallelFor (Gpu::KernelInfo const&, T n, L&& f) noexcept { ParallelFor(n,std::forward(f)); } +template ::value> > +void HostDeviceParallelFor (Gpu::KernelInfo const&, T n, L&& f) noexcept +{ + amrex::ignore_unused(MT); + ParallelFor(n,std::forward(f)); +} + template void HostDeviceParallelFor (Gpu::KernelInfo const&, Box const& box, L&& f) noexcept { ParallelFor(box,std::forward(f)); } +template +void HostDeviceParallelFor (Gpu::KernelInfo const&, Box const& box, L&& f) noexcept +{ + amrex::ignore_unused(MT); + ParallelFor(box,std::forward(f)); +} + template ::value> > void HostDeviceParallelFor (Gpu::KernelInfo const&, Box const& box, T ncomp, L&& f) noexcept { ParallelFor(box,ncomp,std::forward(f)); } +template ::value> > +void HostDeviceParallelFor (Gpu::KernelInfo const&, Box const& box, T ncomp, L&& f) noexcept +{ + amrex::ignore_unused(MT); + ParallelFor(box,ncomp,std::forward(f)); +} + template void HostDeviceParallelFor (Gpu::KernelInfo const&, Box const& box1, Box const& box2, L1&& f1, L2&& f2) noexcept { ParallelFor(box1,box2,std::forward(f1),std::forward(f2)); } +template +void HostDeviceParallelFor (Gpu::KernelInfo const&, Box const& box1, Box const& box2, L1&& f1, L2&& f2) noexcept +{ + amrex::ignore_unused(MT); + ParallelFor(box1,box2,std::forward(f1),std::forward(f2)); +} + template void HostDeviceParallelFor (Gpu::KernelInfo const&, Box const& box1, Box const& box2, Box const& box3, @@ -454,6 +858,15 @@ void HostDeviceParallelFor (Gpu::KernelInfo const&, ParallelFor(box1,box2,box3,std::forward(f1),std::forward(f2),std::forward(f3)); } +template +void HostDeviceParallelFor (Gpu::KernelInfo const&, + Box const& box1, Box const& box2, Box const& box3, + L1&& f1, L2&& f2, L3&& f3) noexcept +{ + amrex::ignore_unused(MT); + ParallelFor(box1,box2,box3,std::forward(f1),std::forward(f2),std::forward(f3)); +} + template ::value>, typename M2=std::enable_if_t::value> > @@ -464,6 +877,17 @@ void HostDeviceParallelFor (Gpu::KernelInfo const&, ParallelFor(box1,ncomp1,std::forward(f1),box2,ncomp2,std::forward(f2)); } +template ::value>, + typename M2=std::enable_if_t::value> > +void HostDeviceParallelFor (Gpu::KernelInfo const&, + Box const& box1, T1 ncomp1, L1&& f1, + Box const& box2, T2 ncomp2, L2&& f2) noexcept +{ + amrex::ignore_unused(MT); + ParallelFor(box1,ncomp1,std::forward(f1),box2,ncomp2,std::forward(f2)); +} + template ::value>, typename M2=std::enable_if_t::value>, @@ -478,30 +902,73 @@ void HostDeviceParallelFor (Gpu::KernelInfo const&, box3,ncomp3,std::forward(f3)); } +template ::value>, + typename M2=std::enable_if_t::value>, + typename M3=std::enable_if_t::value> > +void HostDeviceParallelFor (Gpu::KernelInfo const&, + Box const& box1, T1 ncomp1, L1&& f1, + Box const& box2, T2 ncomp2, L2&& f2, + Box const& box3, T3 ncomp3, L3&& f3) noexcept +{ + amrex::ignore_unused(MT); + ParallelFor(box1,ncomp1,std::forward(f1), + box2,ncomp2,std::forward(f2), + box3,ncomp3,std::forward(f3)); +} + template ::value> > void HostDeviceFor (Gpu::KernelInfo const&, T n, L&& 
f) noexcept { For(n,std::forward(f)); } +template ::value> > +void HostDeviceFor (Gpu::KernelInfo const&, T n, L&& f) noexcept +{ + amrex::ignore_unused(MT); + For(n,std::forward(f)); +} + template void HostDeviceFor (Gpu::KernelInfo const&, Box const& box, L&& f) noexcept { For(box,std::forward(f)); } +template +void HostDeviceFor (Gpu::KernelInfo const&, Box const& box, L&& f) noexcept +{ + amrex::ignore_unused(MT); + For(box,std::forward(f)); +} + template ::value> > void HostDeviceFor (Gpu::KernelInfo const&, Box const& box, T ncomp, L&& f) noexcept { For(box,ncomp,std::forward(f)); } +template ::value> > +void HostDeviceFor (Gpu::KernelInfo const&, Box const& box, T ncomp, L&& f) noexcept +{ + amrex::ignore_unused(MT); + For(box,ncomp,std::forward(f)); +} + template void HostDeviceFor (Gpu::KernelInfo const&, Box const& box1, Box const& box2, L1&& f1, L2&& f2) noexcept { For(box1,box2,std::forward(f1),std::forward(f2)); } +template +void HostDeviceFor (Gpu::KernelInfo const&, Box const& box1, Box const& box2, L1&& f1, L2&& f2) noexcept +{ + amrex::ignore_unused(MT); + For(box1,box2,std::forward(f1),std::forward(f2)); +} + template void HostDeviceFor (Gpu::KernelInfo const&, Box const& box1, Box const& box2, Box const& box3, @@ -510,6 +977,15 @@ void HostDeviceFor (Gpu::KernelInfo const&, For(box1,box2,box3,std::forward(f1),std::forward(f2),std::forward(f3)); } +template +void HostDeviceFor (Gpu::KernelInfo const&, + Box const& box1, Box const& box2, Box const& box3, + L1&& f1, L2&& f2, L3&& f3) noexcept +{ + amrex::ignore_unused(MT); + For(box1,box2,box3,std::forward(f1),std::forward(f2),std::forward(f3)); +} + template ::value>, typename M2=std::enable_if_t::value> > @@ -520,6 +996,17 @@ void HostDeviceFor (Gpu::KernelInfo const&, For(box1,ncomp1,std::forward(f1),box2,ncomp2,std::forward(f2)); } +template ::value>, + typename M2=std::enable_if_t::value> > +void HostDeviceFor (Gpu::KernelInfo const&, + Box const& box1, T1 ncomp1, L1&& f1, + Box const& box2, T2 ncomp2, L2&& f2) noexcept +{ + amrex::ignore_unused(MT); + For(box1,ncomp1,std::forward(f1),box2,ncomp2,std::forward(f2)); +} + template ::value>, typename M2=std::enable_if_t::value>, @@ -534,6 +1021,21 @@ void HostDeviceFor (Gpu::KernelInfo const&, box3,ncomp3,std::forward(f3)); } +template ::value>, + typename M2=std::enable_if_t::value>, + typename M3=std::enable_if_t::value> > +void HostDeviceFor (Gpu::KernelInfo const&, + Box const& box1, T1 ncomp1, L1&& f1, + Box const& box2, T2 ncomp2, L2&& f2, + Box const& box3, T3 ncomp3, L3&& f3) noexcept +{ + amrex::ignore_unused(MT); + For(box1,ncomp1,std::forward(f1), + box2,ncomp2,std::forward(f2), + box3,ncomp3,std::forward(f3)); +} + template ::value> > void ParallelForRNG (T n, L&& f) noexcept { diff --git a/Src/Base/AMReX_GpuLaunchFunctsG.H b/Src/Base/AMReX_GpuLaunchFunctsG.H index 12206f69b70..7940b5589a0 100644 --- a/Src/Base/AMReX_GpuLaunchFunctsG.H +++ b/Src/Base/AMReX_GpuLaunchFunctsG.H @@ -64,11 +64,24 @@ void launch (int nblocks, int nthreads_per_block, gpuStream_t stream, L&& f) noe } } -template +template +void launch (int nblocks, std::size_t shared_mem_bytes, gpuStream_t stream, + L&& f) noexcept +{ + launch(nblocks, MT, shared_mem_bytes, stream, std::forward(f)); +} + +template +void launch (int nblocks, gpuStream_t stream, L&& f) noexcept +{ + launch(nblocks, MT, stream, std::forward(f)); +} + +template void launch (T const& n, L&& f) noexcept { if (amrex::isEmpty(n)) return; - const auto ec = Gpu::ExecutionConfig(n); + const auto ec = Gpu::makeExecutionConfig(n); 
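// ============================================================================
// Sketch: what Gpu::makeExecutionConfig<MT>(n), introduced above in place of
// Gpu::ExecutionConfig(n), plausibly computes: a block count derived from the
// element count and the compile-time max block size MT. SketchConfig is a
// made-up stand-in; the real helper also honors device limits, so treat the
// arithmetic as an assumption.
#include <cstdio>

template <int MT>
struct SketchConfig {
    long numBlocks;
    int  numThreads;
    explicit SketchConfig (long n)
        : numBlocks((n + MT - 1) / MT),   // ceil(n / MT) blocks
          numThreads(MT) {}
};

int main ()
{
    SketchConfig<128> ec(1000);
    std::printf("%ld blocks x %d threads\n", ec.numBlocks, ec.numThreads);
}
// ============================================================================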
int nthreads_per_block = ec.numThreads.x; int nthreads_total = nthreads_per_block * ec.numBlocks.x; auto& q = Gpu::Device::streamQueue(); @@ -139,11 +152,11 @@ namespace detail { } } -template ::value> > +template ::value> > void ParallelFor (Gpu::KernelInfo const& info, T n, L&& f) noexcept { if (amrex::isEmpty(n)) return; - const auto ec = Gpu::ExecutionConfig(n); + const auto ec = Gpu::makeExecutionConfig(n); int nthreads_per_block = ec.numThreads.x; int nthreads_total = nthreads_per_block * ec.numBlocks.x; auto& q = Gpu::Device::streamQueue(); @@ -186,7 +199,7 @@ void ParallelFor (Gpu::KernelInfo const& info, T n, L&& f) noexcept } } -template +template void ParallelFor (Gpu::KernelInfo const& info, Box const& box, L&& f) noexcept { if (amrex::isEmpty(box)) return; @@ -195,7 +208,7 @@ void ParallelFor (Gpu::KernelInfo const& info, Box const& box, L&& f) noexcept const auto len = amrex::length(box); const auto lenxy = len.x*len.y; const auto lenx = len.x; - const auto ec = Gpu::ExecutionConfig(ncells); + const auto ec = Gpu::makeExecutionConfig(ncells); int nthreads_per_block = ec.numThreads.x; int nthreads_total = nthreads_per_block * ec.numBlocks.x; auto& q = Gpu::Device::streamQueue(); @@ -250,7 +263,7 @@ void ParallelFor (Gpu::KernelInfo const& info, Box const& box, L&& f) noexcept } } -template ::value> > +template ::value> > void ParallelFor (Gpu::KernelInfo const& info, Box const& box, T ncomp, L&& f) noexcept { if (amrex::isEmpty(box)) return; @@ -259,7 +272,7 @@ void ParallelFor (Gpu::KernelInfo const& info, Box const& box, T ncomp, L&& f) n const auto len = amrex::length(box); const auto lenxy = len.x*len.y; const auto lenx = len.x; - const auto ec = Gpu::ExecutionConfig(ncells); + const auto ec = Gpu::makeExecutionConfig(ncells); int nthreads_per_block = ec.numThreads.x; int nthreads_total = nthreads_per_block * ec.numBlocks.x; auto& q = Gpu::Device::streamQueue(); @@ -437,7 +450,7 @@ void ParallelForRNG (Box const& box, T ncomp, L&& f) noexcept } } -template +template void ParallelFor (Gpu::KernelInfo const& /*info*/, Box const& box1, Box const& box2, L1&& f1, L2&& f2) noexcept { if (amrex::isEmpty(box1) && amrex::isEmpty(box2)) return; @@ -452,7 +465,7 @@ void ParallelFor (Gpu::KernelInfo const& /*info*/, Box const& box1, Box const& b const auto len2xy = len2.x*len2.y; const auto len1x = len1.x; const auto len2x = len2.x; - const auto ec = Gpu::ExecutionConfig(ncells); + const auto ec = Gpu::makeExecutionConfig(ncells); int nthreads_per_block = ec.numThreads.x; int nthreads_total = nthreads_per_block * ec.numBlocks.x; auto& q = Gpu::Device::streamQueue(); @@ -491,7 +504,7 @@ void ParallelFor (Gpu::KernelInfo const& /*info*/, Box const& box1, Box const& b } } -template +template void ParallelFor (Gpu::KernelInfo const& /*info*/, Box const& box1, Box const& box2, Box const& box3, L1&& f1, L2&& f2, L3&& f3) noexcept @@ -513,7 +526,7 @@ void ParallelFor (Gpu::KernelInfo const& /*info*/, const auto len1x = len1.x; const auto len2x = len2.x; const auto len3x = len3.x; - const auto ec = Gpu::ExecutionConfig(ncells); + const auto ec = Gpu::makeExecutionConfig(ncells); int nthreads_per_block = ec.numThreads.x; int nthreads_total = nthreads_per_block * ec.numBlocks.x; auto& q = Gpu::Device::streamQueue(); @@ -561,7 +574,7 @@ void ParallelFor (Gpu::KernelInfo const& /*info*/, } } -template ::value>, typename M2=std::enable_if_t::value> > void ParallelFor (Gpu::KernelInfo const& /*info*/, @@ -580,7 +593,7 @@ void ParallelFor (Gpu::KernelInfo const& /*info*/, const auto len2xy = 
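// ============================================================================
// Sketch: the lenx/lenxy strides computed above exist so the kernel can turn
// a flat cell index back into (i,j,k). Host-side check of that arithmetic,
// reconstructed from the strides; the lo[]/len[] values are illustrative.
#include <cassert>

int main ()
{
    const int lo[3]  = {4, -2, 7};   // box lower corner
    const int len[3] = {8, 5, 3};    // box extents
    const int lenx = len[0], lenxy = len[0]*len[1];
    const int ncells = len[0]*len[1]*len[2];
    for (int icell = 0; icell < ncells; ++icell) {
        int k =  icell /  lenxy;
        int j = (icell - k*lenxy) / lenx;
        int i = (icell - k*lenxy) - j*lenx;
        i += lo[0]; j += lo[1]; k += lo[2];
        // mapping back must reproduce the flat index
        assert((k-lo[2])*lenxy + (j-lo[1])*lenx + (i-lo[0]) == icell);
    }
}
// ============================================================================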
len2.x*len2.y; const auto len1x = len1.x; const auto len2x = len2.x; - const auto ec = Gpu::ExecutionConfig(ncells); + const auto ec = Gpu::makeExecutionConfig(ncells); int nthreads_per_block = ec.numThreads.x; int nthreads_total = nthreads_per_block * ec.numBlocks.x; auto& q = Gpu::Device::streamQueue(); @@ -623,7 +636,7 @@ void ParallelFor (Gpu::KernelInfo const& /*info*/, } } -template ::value>, typename M2=std::enable_if_t::value>, typename M3=std::enable_if_t::value> > @@ -649,7 +662,7 @@ void ParallelFor (Gpu::KernelInfo const& /*info*/, const auto len1x = len1.x; const auto len2x = len2.x; const auto len3x = len3.x; - const auto ec = Gpu::ExecutionConfig(ncells); + const auto ec = Gpu::makeExecutionConfig(ncells); int nthreads_per_block = ec.numThreads.x; int nthreads_total = nthreads_per_block * ec.numBlocks.x; auto& q = Gpu::Device::streamQueue(); @@ -709,16 +722,34 @@ void ParallelFor (Gpu::KernelInfo const& /*info*/, template void single_task (gpuStream_t stream, L&& f) noexcept { - AMREX_LAUNCH_KERNEL(1, 1, 0, stream, + AMREX_LAUNCH_KERNEL(Gpu::Device::warp_size, 1, 1, 0, stream, [=] AMREX_GPU_DEVICE () noexcept {f();}); AMREX_GPU_ERROR_CHECK(); } +template +void launch (int nblocks, std::size_t shared_mem_bytes, gpuStream_t stream, + L&& f) noexcept +{ + AMREX_LAUNCH_KERNEL(MT, nblocks, MT, shared_mem_bytes, stream, + [=] AMREX_GPU_DEVICE () noexcept { f(); }); + AMREX_GPU_ERROR_CHECK(); +} + +template +void launch (int nblocks, gpuStream_t stream, L&& f) noexcept +{ + AMREX_LAUNCH_KERNEL(MT, nblocks, MT, 0, stream, + [=] AMREX_GPU_DEVICE () noexcept { f(); }); + AMREX_GPU_ERROR_CHECK(); +} + template void launch (int nblocks, int nthreads_per_block, std::size_t shared_mem_bytes, gpuStream_t stream, L&& f) noexcept { - AMREX_LAUNCH_KERNEL(nblocks, nthreads_per_block, shared_mem_bytes, + AMREX_ASSERT(nthreads_per_block <= AMREX_GPU_MAX_THREADS); + AMREX_LAUNCH_KERNEL(AMREX_GPU_MAX_THREADS, nblocks, nthreads_per_block, shared_mem_bytes, stream, [=] AMREX_GPU_DEVICE () noexcept { f(); }); AMREX_GPU_ERROR_CHECK(); } @@ -729,12 +760,12 @@ void launch (int nblocks, int nthreads_per_block, gpuStream_t stream, L&& f) noe launch(nblocks, nthreads_per_block, 0, stream, std::forward(f)); } -template +template void launch (T const& n, L&& f) noexcept { if (amrex::isEmpty(n)) return; - const auto ec = Gpu::ExecutionConfig(n); - AMREX_LAUNCH_KERNEL(ec.numBlocks, ec.numThreads, 0, Gpu::gpuStream(), + const auto ec = Gpu::makeExecutionConfig(n); + AMREX_LAUNCH_KERNEL(MT, ec.numBlocks, ec.numThreads, 0, Gpu::gpuStream(), [=] AMREX_GPU_DEVICE () noexcept { for (auto const i : Gpu::Range(n)) { f(i); @@ -793,13 +824,13 @@ namespace detail { } } -template ::value> > +template ::value> > std::enable_if_t::value> ParallelFor (Gpu::KernelInfo const&, T n, L&& f) noexcept { if (amrex::isEmpty(n)) return; - const auto ec = Gpu::ExecutionConfig(n); - AMREX_LAUNCH_KERNEL(ec.numBlocks, ec.numThreads, 0, Gpu::gpuStream(), + const auto ec = Gpu::makeExecutionConfig(n); + AMREX_LAUNCH_KERNEL(MT, ec.numBlocks, ec.numThreads, 0, Gpu::gpuStream(), [=] AMREX_GPU_DEVICE () noexcept { for (T i = blockDim.x*blockIdx.x+threadIdx.x, stride = blockDim.x*gridDim.x; i < n; i += stride) { @@ -809,7 +840,7 @@ ParallelFor (Gpu::KernelInfo const&, T n, L&& f) noexcept AMREX_GPU_ERROR_CHECK(); } -template +template std::enable_if_t::value> ParallelFor (Gpu::KernelInfo const&, Box const& box, L&& f) noexcept { @@ -819,8 +850,8 @@ ParallelFor (Gpu::KernelInfo const&, Box const& box, L&& f) noexcept const auto len = 
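// ============================================================================
// Sketch: every kernel body in these hunks is a grid-stride loop, so a launch
// of nthreads_total threads covers any ncells. Host analogue with the
// blockDim/blockIdx arithmetic collapsed into a single thread id:
#include <cassert>
#include <vector>

int main ()
{
    const int ncells = 1000, nthreads_total = 96;   // fewer threads than cells
    std::vector<int> visits(ncells, 0);
    for (int tid = 0; tid < nthreads_total; ++tid) {          // each "thread"
        for (int icell = tid; icell < ncells; icell += nthreads_total) {
            ++visits[icell];                                  // the body f(icell)
        }
    }
    for (int v : visits) { assert(v == 1); }   // full, non-overlapping coverage
}
// ============================================================================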
amrex::length(box); const auto lenxy = len.x*len.y; const auto lenx = len.x; - const auto ec = Gpu::ExecutionConfig(ncells); - AMREX_LAUNCH_KERNEL(ec.numBlocks, ec.numThreads, 0, Gpu::gpuStream(), + const auto ec = Gpu::makeExecutionConfig(ncells); + AMREX_LAUNCH_KERNEL(MT, ec.numBlocks, ec.numThreads, 0, Gpu::gpuStream(), [=] AMREX_GPU_DEVICE () noexcept { for (int icell = blockDim.x*blockIdx.x+threadIdx.x, stride = blockDim.x*gridDim.x; icell < ncells; icell += stride) @@ -837,7 +868,7 @@ ParallelFor (Gpu::KernelInfo const&, Box const& box, L&& f) noexcept AMREX_GPU_ERROR_CHECK(); } -template ::value> > +template ::value> > std::enable_if_t::value> ParallelFor (Gpu::KernelInfo const&, Box const& box, T ncomp, L&& f) noexcept { @@ -847,8 +878,8 @@ ParallelFor (Gpu::KernelInfo const&, Box const& box, T ncomp, L&& f) noexcept const auto len = amrex::length(box); const auto lenxy = len.x*len.y; const auto lenx = len.x; - const auto ec = Gpu::ExecutionConfig(ncells); - AMREX_LAUNCH_KERNEL(ec.numBlocks, ec.numThreads, 0, Gpu::gpuStream(), + const auto ec = Gpu::makeExecutionConfig(ncells); + AMREX_LAUNCH_KERNEL(MT, ec.numBlocks, ec.numThreads, 0, Gpu::gpuStream(), [=] AMREX_GPU_DEVICE () noexcept { for (int icell = blockDim.x*blockIdx.x+threadIdx.x, stride = blockDim.x*gridDim.x; icell < ncells; icell += stride) { @@ -871,7 +902,8 @@ ParallelForRNG (T n, L&& f) noexcept if (amrex::isEmpty(n)) return; randState_t* rand_state = getRandState(); const auto ec = Gpu::ExecutionConfig(n); - AMREX_LAUNCH_KERNEL(amrex::min(ec.numBlocks.x, Gpu::Device::maxBlocksPerLaunch()), + AMREX_LAUNCH_KERNEL(AMREX_GPU_MAX_THREADS, + amrex::min(ec.numBlocks.x, Gpu::Device::maxBlocksPerLaunch()), ec.numThreads, 0, Gpu::gpuStream(), [=] AMREX_GPU_DEVICE () noexcept { int tid = blockDim.x*blockIdx.x+threadIdx.x; @@ -896,7 +928,8 @@ ParallelForRNG (Box const& box, L&& f) noexcept const auto lenxy = len.x*len.y; const auto lenx = len.x; const auto ec = Gpu::ExecutionConfig(ncells); - AMREX_LAUNCH_KERNEL(amrex::min(ec.numBlocks.x, Gpu::Device::maxBlocksPerLaunch()), + AMREX_LAUNCH_KERNEL(AMREX_GPU_MAX_THREADS, + amrex::min(ec.numBlocks.x, Gpu::Device::maxBlocksPerLaunch()), ec.numThreads, 0, Gpu::gpuStream(), [=] AMREX_GPU_DEVICE () noexcept { int tid = blockDim.x*blockIdx.x+threadIdx.x; @@ -927,7 +960,8 @@ ParallelForRNG (Box const& box, T ncomp, L&& f) noexcept const auto lenxy = len.x*len.y; const auto lenx = len.x; const auto ec = Gpu::ExecutionConfig(ncells); - AMREX_LAUNCH_KERNEL(amrex::min(ec.numBlocks.x, Gpu::Device::maxBlocksPerLaunch()), + AMREX_LAUNCH_KERNEL(AMREX_GPU_MAX_THREADS, + amrex::min(ec.numBlocks.x, Gpu::Device::maxBlocksPerLaunch()), ec.numThreads, 0, Gpu::gpuStream(), [=] AMREX_GPU_DEVICE () noexcept { int tid = blockDim.x*blockIdx.x+threadIdx.x; @@ -948,7 +982,7 @@ ParallelForRNG (Box const& box, T ncomp, L&& f) noexcept AMREX_GPU_ERROR_CHECK(); } -template +template std::enable_if_t::value && MaybeDeviceRunnable::value> ParallelFor (Gpu::KernelInfo const&, Box const& box1, Box const& box2, L1&& f1, L2&& f2) noexcept @@ -965,8 +999,8 @@ ParallelFor (Gpu::KernelInfo const&, const auto len2xy = len2.x*len2.y; const auto len1x = len1.x; const auto len2x = len2.x; - const auto ec = Gpu::ExecutionConfig(ncells); - AMREX_LAUNCH_KERNEL(ec.numBlocks, ec.numThreads, 0, Gpu::gpuStream(), + const auto ec = Gpu::makeExecutionConfig(ncells); + AMREX_LAUNCH_KERNEL(MT, ec.numBlocks, ec.numThreads, 0, Gpu::gpuStream(), [=] AMREX_GPU_DEVICE () noexcept { for (int icell = blockDim.x*blockIdx.x+threadIdx.x, stride = 
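// ============================================================================
// Sketch: why ParallelForRNG above keeps AMREX_GPU_MAX_THREADS and clamps the
// block count with maxBlocksPerLaunch(): each launched thread owns one
// persistent RNG state, and the grid-stride loop reuses those few states
// across all cells. Toy model; the LCG constants are illustrative, not
// amrex's RNG.
#include <cstdint>
#include <cstdio>
#include <vector>

int main ()
{
    const int nstates = 8, ncells = 20;            // few states, many cells
    std::vector<std::uint64_t> state(nstates, 12345u);
    for (int tid = 0; tid < nstates; ++tid) {
        for (int i = tid; i < ncells; i += nstates) {
            // advance this thread's private engine, then run f(i, engine)
            state[tid] = state[tid]*6364136223846793005ULL + 1442695040888963407ULL;
        }
    }
    std::printf("state[0] = %llu\n", (unsigned long long) state[0]);
}
// ============================================================================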
blockDim.x*gridDim.x; icell < ncells; icell += stride) { @@ -993,7 +1027,7 @@ ParallelFor (Gpu::KernelInfo const&, AMREX_GPU_ERROR_CHECK(); } -template +template std::enable_if_t::value && MaybeDeviceRunnable::value && MaybeDeviceRunnable::value> ParallelFor (Gpu::KernelInfo const&, Box const& box1, Box const& box2, Box const& box3, @@ -1016,8 +1050,8 @@ ParallelFor (Gpu::KernelInfo const&, const auto len1x = len1.x; const auto len2x = len2.x; const auto len3x = len3.x; - const auto ec = Gpu::ExecutionConfig(ncells); - AMREX_LAUNCH_KERNEL(ec.numBlocks, ec.numThreads, 0, Gpu::gpuStream(), + const auto ec = Gpu::makeExecutionConfig(ncells); + AMREX_LAUNCH_KERNEL(MT, ec.numBlocks, ec.numThreads, 0, Gpu::gpuStream(), [=] AMREX_GPU_DEVICE () noexcept { for (int icell = blockDim.x*blockIdx.x+threadIdx.x, stride = blockDim.x*gridDim.x; icell < ncells; icell += stride) { @@ -1053,7 +1087,7 @@ ParallelFor (Gpu::KernelInfo const&, AMREX_GPU_ERROR_CHECK(); } -template ::value>, typename M2=std::enable_if_t::value> > std::enable_if_t::value && MaybeDeviceRunnable::value> @@ -1073,8 +1107,8 @@ ParallelFor (Gpu::KernelInfo const&, const auto len2xy = len2.x*len2.y; const auto len1x = len1.x; const auto len2x = len2.x; - const auto ec = Gpu::ExecutionConfig(ncells); - AMREX_LAUNCH_KERNEL(ec.numBlocks, ec.numThreads, 0, Gpu::gpuStream(), + const auto ec = Gpu::makeExecutionConfig(ncells); + AMREX_LAUNCH_KERNEL(MT, ec.numBlocks, ec.numThreads, 0, Gpu::gpuStream(), [=] AMREX_GPU_DEVICE () noexcept { for (int icell = blockDim.x*blockIdx.x+threadIdx.x, stride = blockDim.x*gridDim.x; icell < ncells; icell += stride) { @@ -1105,7 +1139,7 @@ ParallelFor (Gpu::KernelInfo const&, AMREX_GPU_ERROR_CHECK(); } -template ::value>, typename M2=std::enable_if_t::value>, typename M3=std::enable_if_t::value> > @@ -1132,8 +1166,8 @@ ParallelFor (Gpu::KernelInfo const&, const auto len1x = len1.x; const auto len2x = len2.x; const auto len3x = len3.x; - const auto ec = Gpu::ExecutionConfig(ncells); - AMREX_LAUNCH_KERNEL(ec.numBlocks, ec.numThreads, 0, Gpu::gpuStream(), + const auto ec = Gpu::makeExecutionConfig(ncells); + AMREX_LAUNCH_KERNEL(MT, ec.numBlocks, ec.numThreads, 0, Gpu::gpuStream(), [=] AMREX_GPU_DEVICE () noexcept { for (int icell = blockDim.x*blockIdx.x+threadIdx.x, stride = blockDim.x*gridDim.x; icell < ncells; icell += stride) { @@ -1183,29 +1217,127 @@ void single_task (L&& f) noexcept single_task(Gpu::gpuStream(), std::forward(f)); } +template +void launch (T const& n, L&& f) noexcept +{ + launch(n, std::forward(f)); +} + +template ::value> > +std::enable_if_t::value> +ParallelFor (Gpu::KernelInfo const& info, T n, L&& f) noexcept +{ + ParallelFor(info, n, std::forward(f)); +} + +template +std::enable_if_t::value> +ParallelFor (Gpu::KernelInfo const& info, Box const& box, L&& f) noexcept +{ + ParallelFor(info, box, std::forward(f)); +} + +template ::value> > +std::enable_if_t::value> +ParallelFor (Gpu::KernelInfo const& info, Box const& box, T ncomp, L&& f) noexcept +{ + ParallelFor(info, box, ncomp, std::forward(f)); +} + +template +std::enable_if_t::value && MaybeDeviceRunnable::value> +ParallelFor (Gpu::KernelInfo const& info, + Box const& box1, Box const& box2, L1&& f1, L2&& f2) noexcept +{ + ParallelFor(info, box1, box2, std::forward(f1), + std::forward(f2)); +} + +template +std::enable_if_t::value && MaybeDeviceRunnable::value && MaybeDeviceRunnable::value> +ParallelFor (Gpu::KernelInfo const& info, + Box const& box1, Box const& box2, Box const& box3, + L1&& f1, L2&& f2, L3&& f3) noexcept +{ + 
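// ============================================================================
// Sketch: the fused two- and three-box kernels above size one launch for all
// boxes and let each thread run f1 and/or f2 depending on which cell counts
// its flat index still falls under. CPU rendering of that predication,
// assumed from the shared icell loop; n1/n2 stand for the boxes' cell counts.
#include <algorithm>
#include <cstdio>

int main ()
{
    const int n1 = 5, n2 = 9;
    for (int icell = 0; icell < std::max(n1, n2); ++icell) {
        if (icell < n1) { std::printf("f1(%d) ", icell); }
        if (icell < n2) { std::printf("f2(%d) ", icell); }
    }
    std::printf("\n");
}
// ============================================================================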
ParallelFor(info, box1, box2, box3, std::forward(f1), + std::forward(f2), std::forward(f3)); +} + +template ::value>, + typename M2=std::enable_if_t::value> > +std::enable_if_t::value && MaybeDeviceRunnable::value> +ParallelFor (Gpu::KernelInfo const& info, + Box const& box1, T1 ncomp1, L1&& f1, + Box const& box2, T2 ncomp2, L2&& f2) noexcept +{ + ParallelFor(info, box1, ncomp1, std::forward(f1), + box2, ncomp2, std::forward(f2)); +} + +template ::value>, + typename M2=std::enable_if_t::value>, + typename M3=std::enable_if_t::value> > +std::enable_if_t::value && MaybeDeviceRunnable::value && MaybeDeviceRunnable::value> +ParallelFor (Gpu::KernelInfo const& info, + Box const& box1, T1 ncomp1, L1&& f1, + Box const& box2, T2 ncomp2, L2&& f2, + Box const& box3, T3 ncomp3, L3&& f3) noexcept +{ + ParallelFor(info, box1, ncomp1, std::forward(f1), + box2, ncomp2, std::forward(f2), + box3, ncomp3, std::forward(f3)); +} + template ::value> > void For (Gpu::KernelInfo const& info, T n, L&& f) noexcept { - ParallelFor(info, n,std::forward(f)); + ParallelFor(info, n,std::forward(f)); +} + +template ::value> > +void For (Gpu::KernelInfo const& info, T n, L&& f) noexcept +{ + ParallelFor(info, n,std::forward(f)); } template void For (Gpu::KernelInfo const& info, Box const& box, L&& f) noexcept { - ParallelFor(info, box,std::forward(f)); + ParallelFor(info, box,std::forward(f)); +} + +template +void For (Gpu::KernelInfo const& info, Box const& box, L&& f) noexcept +{ + ParallelFor(info, box,std::forward(f)); } template ::value> > void For (Gpu::KernelInfo const& info, Box const& box, T ncomp, L&& f) noexcept { - ParallelFor(info,box,ncomp,std::forward(f)); + ParallelFor(info,box,ncomp,std::forward(f)); +} + +template ::value> > +void For (Gpu::KernelInfo const& info, Box const& box, T ncomp, L&& f) noexcept +{ + ParallelFor(info,box,ncomp,std::forward(f)); } template void For (Gpu::KernelInfo const& info, Box const& box1, Box const& box2, L1&& f1, L2&& f2) noexcept { - ParallelFor(info,box1,box2,std::forward(f1),std::forward(f2)); + ParallelFor(info,box1,box2,std::forward(f1),std::forward(f2)); +} + +template +void For (Gpu::KernelInfo const& info, + Box const& box1, Box const& box2, L1&& f1, L2&& f2) noexcept +{ + ParallelFor(info,box1,box2,std::forward(f1),std::forward(f2)); } template @@ -1213,7 +1345,15 @@ void For (Gpu::KernelInfo const& info, Box const& box1, Box const& box2, Box const& box3, L1&& f1, L2&& f2, L3&& f3) noexcept { - ParallelFor(info,box1,box2,box3,std::forward(f1),std::forward(f2),std::forward(f3)); + ParallelFor(info,box1,box2,box3,std::forward(f1),std::forward(f2),std::forward(f3)); +} + +template +void For (Gpu::KernelInfo const& info, + Box const& box1, Box const& box2, Box const& box3, + L1&& f1, L2&& f2, L3&& f3) noexcept +{ + ParallelFor(info,box1,box2,box3,std::forward(f1),std::forward(f2),std::forward(f3)); } template (f1),box2,ncomp2,std::forward(f2)); + ParallelFor(info,box1,ncomp1,std::forward(f1),box2,ncomp2,std::forward(f2)); +} + +template ::value>, + typename M2=std::enable_if_t::value> > +void For (Gpu::KernelInfo const& info, + Box const& box1, T1 ncomp1, L1&& f1, + Box const& box2, T2 ncomp2, L2&& f2) noexcept +{ + ParallelFor(info,box1,ncomp1,std::forward(f1),box2,ncomp2,std::forward(f2)); } template (info, + box1,ncomp1,std::forward(f1), + box2,ncomp2,std::forward(f2), + box3,ncomp3,std::forward(f3)); +} + +template ::value>, + typename M2=std::enable_if_t::value>, + typename M3=std::enable_if_t::value> > +void For (Gpu::KernelInfo const& info, + Box const& box1, 
T1 ncomp1, L1&& f1, + Box const& box2, T2 ncomp2, L2&& f2, + Box const& box3, T3 ncomp3, L3&& f3) noexcept +{ + ParallelFor(info, box1,ncomp1,std::forward(f1), box2,ncomp2,std::forward(f2), box3,ncomp3,std::forward(f3)); @@ -1244,32 +1409,63 @@ void For (Gpu::KernelInfo const& info, template ::value> > void ParallelFor (T n, L&& f) noexcept { - ParallelFor(Gpu::KernelInfo{}, n, std::forward(f)); + ParallelFor(Gpu::KernelInfo{}, n, std::forward(f)); +} + +template ::value> > +void ParallelFor (T n, L&& f) noexcept +{ + ParallelFor(Gpu::KernelInfo{}, n, std::forward(f)); } template void ParallelFor (Box const& box, L&& f) noexcept { - ParallelFor(Gpu::KernelInfo{}, box, std::forward(f)); + ParallelFor(Gpu::KernelInfo{}, box, std::forward(f)); +} + +template +void ParallelFor (Box const& box, L&& f) noexcept +{ + ParallelFor(Gpu::KernelInfo{}, box, std::forward(f)); } template ::value> > void ParallelFor (Box const& box, T ncomp, L&& f) noexcept { - ParallelFor(Gpu::KernelInfo{},box,ncomp,std::forward(f)); + ParallelFor(Gpu::KernelInfo{},box,ncomp,std::forward(f)); +} + +template ::value> > +void ParallelFor (Box const& box, T ncomp, L&& f) noexcept +{ + ParallelFor(Gpu::KernelInfo{},box,ncomp,std::forward(f)); } template void ParallelFor (Box const& box1, Box const& box2, L1&& f1, L2&& f2) noexcept { - ParallelFor(Gpu::KernelInfo{},box1,box2,std::forward(f1),std::forward(f2)); + ParallelFor(Gpu::KernelInfo{},box1,box2,std::forward(f1),std::forward(f2)); +} + +template +void ParallelFor (Box const& box1, Box const& box2, L1&& f1, L2&& f2) noexcept +{ + ParallelFor(Gpu::KernelInfo{},box1,box2,std::forward(f1),std::forward(f2)); } template void ParallelFor (Box const& box1, Box const& box2, Box const& box3, L1&& f1, L2&& f2, L3&& f3) noexcept { - ParallelFor(Gpu::KernelInfo{},box1,box2,box3,std::forward(f1),std::forward(f2),std::forward(f3)); + ParallelFor(Gpu::KernelInfo{},box1,box2,box3,std::forward(f1),std::forward(f2),std::forward(f3)); +} + +template +void ParallelFor (Box const& box1, Box const& box2, Box const& box3, + L1&& f1, L2&& f2, L3&& f3) noexcept +{ + ParallelFor(Gpu::KernelInfo{},box1,box2,box3,std::forward(f1),std::forward(f2),std::forward(f3)); } template (f1),box2,ncomp2,std::forward(f2)); + ParallelFor(Gpu::KernelInfo{},box1,ncomp1,std::forward(f1),box2,ncomp2,std::forward(f2)); +} + +template ::value>, + typename M2=std::enable_if_t::value> > +void ParallelFor (Box const& box1, T1 ncomp1, L1&& f1, + Box const& box2, T2 ncomp2, L2&& f2) noexcept +{ + ParallelFor(Gpu::KernelInfo{},box1,ncomp1,std::forward(f1),box2,ncomp2,std::forward(f2)); } template (Gpu::KernelInfo{}, + box1,ncomp1,std::forward(f1), + box2,ncomp2,std::forward(f2), + box3,ncomp3,std::forward(f3)); +} + +template ::value>, + typename M2=std::enable_if_t::value>, + typename M3=std::enable_if_t::value> > +void ParallelFor (Box const& box1, T1 ncomp1, L1&& f1, + Box const& box2, T2 ncomp2, L2&& f2, + Box const& box3, T3 ncomp3, L3&& f3) noexcept +{ + ParallelFor(Gpu::KernelInfo{}, box1,ncomp1,std::forward(f1), box2,ncomp2,std::forward(f2), box3,ncomp3,std::forward(f3)); @@ -1298,32 +1517,63 @@ void ParallelFor (Box const& box1, T1 ncomp1, L1&& f1, template ::value> > void For (T n, L&& f) noexcept { - ParallelFor(Gpu::KernelInfo{}, n,std::forward(f)); + ParallelFor(Gpu::KernelInfo{}, n,std::forward(f)); +} + +template ::value> > +void For (T n, L&& f) noexcept +{ + ParallelFor(Gpu::KernelInfo{}, n,std::forward(f)); } template void For (Box const& box, L&& f) noexcept { - ParallelFor(Gpu::KernelInfo{}, 
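// ============================================================================
// Sketch: the '-/+' pairs above keep every legacy signature and reroute its
// body through the MT form with the compile-time default, so old call sites
// compile unchanged. Stand-alone analogue; DEFAULT_MT stands in for
// AMREX_GPU_MAX_THREADS.
#include <cstdio>
#include <utility>

constexpr int DEFAULT_MT = 256;

template <int MT, typename L>
void ParallelFor (int n, L&& f)
{
    std::printf("launch with max %d threads/block\n", MT);
    for (int i = 0; i < n; ++i) { f(i); }
}

template <typename L>
void ParallelFor (int n, L&& f)   // legacy entry point
{
    ParallelFor<DEFAULT_MT>(n, std::forward<L>(f));
}

int main ()
{
    ParallelFor(2, [] (int) {});        // old call site, new default
    ParallelFor<128>(2, [] (int) {});   // explicit hint
}
// ============================================================================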
box,std::forward(f)); + ParallelFor(Gpu::KernelInfo{}, box,std::forward(f)); +} + +template +void For (Box const& box, L&& f) noexcept +{ + ParallelFor(Gpu::KernelInfo{}, box,std::forward(f)); } template ::value> > void For (Box const& box, T ncomp, L&& f) noexcept { - ParallelFor(Gpu::KernelInfo{},box,ncomp,std::forward(f)); + ParallelFor(Gpu::KernelInfo{},box,ncomp,std::forward(f)); +} + +template ::value> > +void For (Box const& box, T ncomp, L&& f) noexcept +{ + ParallelFor(Gpu::KernelInfo{},box,ncomp,std::forward(f)); } template void For (Box const& box1, Box const& box2, L1&& f1, L2&& f2) noexcept { - ParallelFor(Gpu::KernelInfo{},box1,box2,std::forward(f1),std::forward(f2)); + ParallelFor(Gpu::KernelInfo{},box1,box2,std::forward(f1),std::forward(f2)); +} + +template +void For (Box const& box1, Box const& box2, L1&& f1, L2&& f2) noexcept +{ + ParallelFor(Gpu::KernelInfo{},box1,box2,std::forward(f1),std::forward(f2)); } template void For (Box const& box1, Box const& box2, Box const& box3, L1&& f1, L2&& f2, L3&& f3) noexcept { - ParallelFor(Gpu::KernelInfo{},box1,box2,box3,std::forward(f1),std::forward(f2),std::forward(f3)); + ParallelFor(Gpu::KernelInfo{},box1,box2,box3,std::forward(f1),std::forward(f2),std::forward(f3)); +} + +template +void For (Box const& box1, Box const& box2, Box const& box3, + L1&& f1, L2&& f2, L3&& f3) noexcept +{ + ParallelFor(Gpu::KernelInfo{},box1,box2,box3,std::forward(f1),std::forward(f2),std::forward(f3)); } template (f1),box2,ncomp2,std::forward(f2)); + ParallelFor(Gpu::KernelInfo{},box1,ncomp1,std::forward(f1),box2,ncomp2,std::forward(f2)); +} + +template ::value>, + typename M2=std::enable_if_t::value> > +void For (Box const& box1, T1 ncomp1, L1&& f1, + Box const& box2, T2 ncomp2, L2&& f2) noexcept +{ + ParallelFor(Gpu::KernelInfo{},box1,ncomp1,std::forward(f1),box2,ncomp2,std::forward(f2)); } template (Gpu::KernelInfo{}, + box1,ncomp1,std::forward(f1), + box2,ncomp2,std::forward(f2), + box3,ncomp3,std::forward(f3)); +} + +template ::value>, + typename M2=std::enable_if_t::value>, + typename M3=std::enable_if_t::value> > +void For (Box const& box1, T1 ncomp1, L1&& f1, + Box const& box2, T2 ncomp2, L2&& f2, + Box const& box3, T3 ncomp3, L3&& f3) noexcept +{ + ParallelFor(Gpu::KernelInfo{}, box1,ncomp1,std::forward(f1), box2,ncomp2,std::forward(f2), box3,ncomp3,std::forward(f3)); @@ -1354,10 +1627,30 @@ std::enable_if_t::value> HostDeviceParallelFor (Gpu::KernelInfo const& info, T n, L&& f) noexcept { if (Gpu::inLaunchRegion()) { - ParallelFor(info,n,std::forward(f)); + ParallelFor(info,n,std::forward(f)); } else { +#ifdef AMREX_USE_DPCPP + amrex::Abort("amrex:: HOST_DEVICE disabled for Intel. It takes too long to compile"); +#else AMREX_PRAGMA_SIMD for (T i = 0; i < n; ++i) f(i); +#endif + } +} + +template ::value> > +std::enable_if_t::value> +HostDeviceParallelFor (Gpu::KernelInfo const& info, T n, L&& f) noexcept +{ + if (Gpu::inLaunchRegion()) { + ParallelFor(info,n,std::forward(f)); + } else { +#ifdef AMREX_USE_DPCPP + amrex::Abort("amrex:: HOST_DEVICE disabled for Intel. 
It takes too long to compile"); +#else + AMREX_PRAGMA_SIMD + for (T i = 0; i < n; ++i) f(i); +#endif } } @@ -1365,7 +1658,14 @@ template ::value> HostDeviceParallelFor (T n, L&& f) noexcept { - HostDeviceParallelFor(Gpu::KernelInfo{}, n, std::forward(f)); + HostDeviceParallelFor(Gpu::KernelInfo{}, n, std::forward(f)); +} + +template ::value> > +std::enable_if_t::value> +HostDeviceParallelFor (T n, L&& f) noexcept +{ + HostDeviceParallelFor(Gpu::KernelInfo{}, n, std::forward(f)); } template @@ -1373,9 +1673,28 @@ std::enable_if_t::value> HostDeviceParallelFor (Gpu::KernelInfo const& info, Box const& box, L&& f) noexcept { if (Gpu::inLaunchRegion()) { - ParallelFor(info, box,std::forward(f)); + ParallelFor(info, box,std::forward(f)); } else { +#ifdef AMREX_USE_DPCPP + amrex::Abort("amrex:: HOST_DEVICE disabled for Intel. It takes too long to compile"); +#else LoopConcurrentOnCpu(box,std::forward(f)); +#endif + } +} + +template +std::enable_if_t::value> +HostDeviceParallelFor (Gpu::KernelInfo const& info, Box const& box, L&& f) noexcept +{ + if (Gpu::inLaunchRegion()) { + ParallelFor(info, box,std::forward(f)); + } else { +#ifdef AMREX_USE_DPCPP + amrex::Abort("amrex:: HOST_DEVICE disabled for Intel. It takes too long to compile"); +#else + LoopConcurrentOnCpu(box,std::forward(f)); +#endif } } @@ -1384,9 +1703,28 @@ std::enable_if_t::value> HostDeviceParallelFor (Gpu::KernelInfo const& info, Box const& box, T ncomp, L&& f) noexcept { if (Gpu::inLaunchRegion()) { - ParallelFor(info, box,ncomp,std::forward(f)); + ParallelFor(info, box,ncomp,std::forward(f)); } else { +#ifdef AMREX_USE_DPCPP + amrex::Abort("amrex:: HOST_DEVICE disabled for Intel. It takes too long to compile"); +#else LoopConcurrentOnCpu(box,ncomp,std::forward(f)); +#endif + } +} + +template ::value> > +std::enable_if_t::value> +HostDeviceParallelFor (Gpu::KernelInfo const& info, Box const& box, T ncomp, L&& f) noexcept +{ + if (Gpu::inLaunchRegion()) { + ParallelFor(info, box,ncomp,std::forward(f)); + } else { +#ifdef AMREX_USE_DPCPP + amrex::Abort("amrex:: HOST_DEVICE disabled for Intel. It takes too long to compile"); +#else + LoopConcurrentOnCpu(box,ncomp,std::forward(f)); +#endif } } @@ -1396,26 +1734,51 @@ HostDeviceParallelFor (Gpu::KernelInfo const& info, Box const& box1, Box const& box2, L1&& f1, L2&& f2) noexcept { if (Gpu::inLaunchRegion()) { - ParallelFor(info,box1,box2,std::forward(f1),std::forward(f2)); + ParallelFor(info,box1,box2,std::forward(f1),std::forward(f2)); } else { +#ifdef AMREX_USE_DPCPP + amrex::Abort("amrex:: HOST_DEVICE disabled for Intel. It takes too long to compile"); +#else LoopConcurrentOnCpu(box1,std::forward(f1)); LoopConcurrentOnCpu(box2,std::forward(f2)); +#endif } } -template +template +std::enable_if_t::value && MaybeHostDeviceRunnable::value> +HostDeviceParallelFor (Gpu::KernelInfo const& info, + Box const& box1, Box const& box2, L1&& f1, L2&& f2) noexcept +{ + if (Gpu::inLaunchRegion()) { + ParallelFor(info,box1,box2,std::forward(f1),std::forward(f2)); + } else { +#ifdef AMREX_USE_DPCPP + amrex::Abort("amrex:: HOST_DEVICE disabled for Intel. 
It takes too long to compile"); +#else + LoopConcurrentOnCpu(box1,std::forward(f1)); + LoopConcurrentOnCpu(box2,std::forward(f2)); +#endif + } +} + +template std::enable_if_t::value && MaybeHostDeviceRunnable::value && MaybeHostDeviceRunnable::value> HostDeviceParallelFor (Gpu::KernelInfo const& info, Box const& box1, Box const& box2, Box const& box3, L1&& f1, L2&& f2, L3&& f3) noexcept { if (Gpu::inLaunchRegion()) { - ParallelFor(info,box1,box2,box3, + ParallelFor(info,box1,box2,box3, std::forward(f1),std::forward(f2),std::forward(f3)); } else { +#ifdef AMREX_USE_DPCPP + amrex::Abort("amrex:: HOST_DEVICE disabled for Intel. It takes too long to compile"); +#else LoopConcurrentOnCpu(box1,std::forward(f1)); LoopConcurrentOnCpu(box2,std::forward(f2)); LoopConcurrentOnCpu(box3,std::forward(f3)); +#endif } } @@ -1428,10 +1791,34 @@ HostDeviceParallelFor (Gpu::KernelInfo const& info, Box const& box2, T2 ncomp2, L2&& f2) noexcept { if (Gpu::inLaunchRegion()) { - ParallelFor(info,box1,ncomp1,std::forward(f1),box2,ncomp2,std::forward(f2)); + ParallelFor(info,box1,ncomp1,std::forward(f1),box2,ncomp2,std::forward(f2)); } else { +#ifdef AMREX_USE_DPCPP + amrex::Abort("amrex:: HOST_DEVICE disabled for Intel. It takes too long to compile"); +#else + LoopConcurrentOnCpu(box1,ncomp1,std::forward(f1)); + LoopConcurrentOnCpu(box2,ncomp2,std::forward(f2)); +#endif + } +} + +template ::value>, + typename M2=std::enable_if_t::value> > +std::enable_if_t::value && MaybeHostDeviceRunnable::value> +HostDeviceParallelFor (Gpu::KernelInfo const& info, + Box const& box1, T1 ncomp1, L1&& f1, + Box const& box2, T2 ncomp2, L2&& f2) noexcept +{ + if (Gpu::inLaunchRegion()) { + ParallelFor(info,box1,ncomp1,std::forward(f1),box2,ncomp2,std::forward(f2)); + } else { +#ifdef AMREX_USE_DPCPP + amrex::Abort("amrex:: HOST_DEVICE disabled for Intel. It takes too long to compile"); +#else LoopConcurrentOnCpu(box1,ncomp1,std::forward(f1)); LoopConcurrentOnCpu(box2,ncomp2,std::forward(f2)); +#endif } } @@ -1446,40 +1833,95 @@ HostDeviceParallelFor (Gpu::KernelInfo const& info, Box const& box3, T3 ncomp3, L3&& f3) noexcept { if (Gpu::inLaunchRegion()) { - ParallelFor(info, + ParallelFor(info, box1,ncomp1,std::forward(f1), box2,ncomp2,std::forward(f2), box3,ncomp3,std::forward(f3)); } else { +#ifdef AMREX_USE_DPCPP + amrex::Abort("amrex:: HOST_DEVICE disabled for Intel. It takes too long to compile"); +#else LoopConcurrentOnCpu(box1,ncomp1,std::forward(f1)); LoopConcurrentOnCpu(box2,ncomp2,std::forward(f2)); LoopConcurrentOnCpu(box3,ncomp3,std::forward(f3)); +#endif + } +} + +template ::value>, + typename M2=std::enable_if_t::value>, + typename M3=std::enable_if_t::value> > +std::enable_if_t::value && MaybeHostDeviceRunnable::value && MaybeHostDeviceRunnable::value> +HostDeviceParallelFor (Gpu::KernelInfo const& info, + Box const& box1, T1 ncomp1, L1&& f1, + Box const& box2, T2 ncomp2, L2&& f2, + Box const& box3, T3 ncomp3, L3&& f3) noexcept +{ + if (Gpu::inLaunchRegion()) { + ParallelFor(info, + box1,ncomp1,std::forward(f1), + box2,ncomp2,std::forward(f2), + box3,ncomp3,std::forward(f3)); + } else { +#ifdef AMREX_USE_DPCPP + amrex::Abort("amrex:: HOST_DEVICE disabled for Intel. 
It takes too long to compile"); +#else + LoopConcurrentOnCpu(box1,ncomp1,std::forward(f1)); + LoopConcurrentOnCpu(box2,ncomp2,std::forward(f2)); + LoopConcurrentOnCpu(box3,ncomp3,std::forward(f3)); +#endif } } template ::value> > void HostDeviceFor (Gpu::KernelInfo const& info, T n, L&& f) noexcept { - HostDeviceParallelFor(info,n,std::forward(f)); + HostDeviceParallelFor(info,n,std::forward(f)); +} + +template ::value> > +void HostDeviceFor (Gpu::KernelInfo const& info, T n, L&& f) noexcept +{ + HostDeviceParallelFor(info,n,std::forward(f)); } template void HostDeviceFor (Gpu::KernelInfo const& info, Box const& box, L&& f) noexcept { - HostDeviceParallelFor(info,box,std::forward(f)); + HostDeviceParallelFor(info,box,std::forward(f)); +} + +template +void HostDeviceFor (Gpu::KernelInfo const& info, Box const& box, L&& f) noexcept +{ + HostDeviceParallelFor(info,box,std::forward(f)); } template ::value> > void HostDeviceFor (Gpu::KernelInfo const& info, Box const& box, T ncomp, L&& f) noexcept { - HostDeviceParallelFor(info,box,ncomp,std::forward(f)); + HostDeviceParallelFor(info,box,ncomp,std::forward(f)); +} + +template ::value> > +void HostDeviceFor (Gpu::KernelInfo const& info, Box const& box, T ncomp, L&& f) noexcept +{ + HostDeviceParallelFor(info,box,ncomp,std::forward(f)); } template void HostDeviceFor (Gpu::KernelInfo const& info, Box const& box1, Box const& box2, L1&& f1, L2&& f2) noexcept { - HostDeviceParallelFor(info,box1,box2,std::forward(f1),std::forward(f2)); + HostDeviceParallelFor(info,box1,box2,std::forward(f1),std::forward(f2)); +} + +template +void HostDeviceFor (Gpu::KernelInfo const& info, + Box const& box1, Box const& box2, L1&& f1, L2&& f2) noexcept +{ + HostDeviceParallelFor(info,box1,box2,std::forward(f1),std::forward(f2)); } template @@ -1487,7 +1929,16 @@ void HostDeviceFor (Gpu::KernelInfo const& info, Box const& box1, Box const& box2, Box const& box3, L1&& f1, L2&& f2, L3&& f3) noexcept { - HostDeviceParallelFor(info, box1,box2,box3, + HostDeviceParallelFor(info, box1,box2,box3, + std::forward(f1),std::forward(f2),std::forward(f3)); +} + +template +void HostDeviceFor (Gpu::KernelInfo const& info, + Box const& box1, Box const& box2, Box const& box3, + L1&& f1, L2&& f2, L3&& f3) noexcept +{ + HostDeviceParallelFor(info, box1,box2,box3, std::forward(f1),std::forward(f2),std::forward(f3)); } @@ -1498,7 +1949,17 @@ void HostDeviceFor (Gpu::KernelInfo const& info, Box const& box1, T1 ncomp1, L1&& f1, Box const& box2, T2 ncomp2, L2&& f2) noexcept { - HostDeviceParallelFor(info,box1,ncomp1,std::forward(f1),box2,ncomp2,std::forward(f2)); + HostDeviceParallelFor(info,box1,ncomp1,std::forward(f1),box2,ncomp2,std::forward(f2)); +} + +template ::value>, + typename M2=std::enable_if_t::value> > +void HostDeviceFor (Gpu::KernelInfo const& info, + Box const& box1, T1 ncomp1, L1&& f1, + Box const& box2, T2 ncomp2, L2&& f2) noexcept +{ + HostDeviceParallelFor(info,box1,ncomp1,std::forward(f1),box2,ncomp2,std::forward(f2)); } template (info, + box1,ncomp1,std::forward(f1), + box2,ncomp2,std::forward(f2), + box3,ncomp3,std::forward(f3)); +} + +template ::value>, + typename M2=std::enable_if_t::value>, + typename M3=std::enable_if_t::value> > +void HostDeviceFor (Gpu::KernelInfo const& info, + Box const& box1, T1 ncomp1, L1&& f1, + Box const& box2, T2 ncomp2, L2&& f2, + Box const& box3, T3 ncomp3, L3&& f3) noexcept +{ + HostDeviceParallelFor(info, box1,ncomp1,std::forward(f1), box2,ncomp2,std::forward(f2), box3,ncomp3,std::forward(f3)); @@ -1519,32 +1995,64 @@ void HostDeviceFor 
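// ============================================================================
// Sketch: the HostDeviceParallelFor/HostDeviceFor family branches at run
// time: device path inside a launch region, otherwise a plain SIMD-hinted CPU
// loop (and, under AMREX_USE_DPCPP, an Abort instead of the host fallback).
// Reduced analogue; inLaunchRegion/deviceLoop are stand-ins.
#include <cstdio>
#include <utility>

namespace sketch {
    bool inLaunchRegion () { return false; }   // flipped by the runtime in amrex

    template <typename L>
    void deviceLoop (int n, L&& f) { for (int i = 0; i < n; ++i) { f(i); } }

    template <typename L>
    void HostDeviceParallelFor (int n, L&& f)
    {
        if (inLaunchRegion()) {
            deviceLoop(n, std::forward<L>(f));   // ParallelFor<MT> in amrex
        } else {
            // AMREX_PRAGMA_SIMD in the real header
            for (int i = 0; i < n; ++i) { f(i); }
        }
    }
}

int main ()
{
    sketch::HostDeviceParallelFor(3, [] (int i) { std::printf("%d\n", i); });
}
// ============================================================================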
(Gpu::KernelInfo const& info, template ::value> > void HostDeviceParallelFor (T n, L&& f) noexcept { - HostDeviceParallelFor(Gpu::KernelInfo{},n,std::forward(f)); + HostDeviceParallelFor(Gpu::KernelInfo{},n,std::forward(f)); +} + +template ::value> > +void HostDeviceParallelFor (T n, L&& f) noexcept +{ + HostDeviceParallelFor(Gpu::KernelInfo{},n,std::forward(f)); } template void HostDeviceParallelFor (Box const& box, L&& f) noexcept { - HostDeviceParallelFor(Gpu::KernelInfo{},box,std::forward(f)); + HostDeviceParallelFor(Gpu::KernelInfo{},box,std::forward(f)); +} + +template +void HostDeviceParallelFor (Box const& box, L&& f) noexcept +{ + HostDeviceParallelFor(Gpu::KernelInfo{},box,std::forward(f)); } template ::value> > void HostDeviceParallelFor (Box const& box, T ncomp, L&& f) noexcept { - HostDeviceParallelFor(Gpu::KernelInfo{},box,ncomp,std::forward(f)); + HostDeviceParallelFor(Gpu::KernelInfo{},box,ncomp,std::forward(f)); +} + +template ::value> > +void HostDeviceParallelFor (Box const& box, T ncomp, L&& f) noexcept +{ + HostDeviceParallelFor(Gpu::KernelInfo{},box,ncomp,std::forward(f)); } template void HostDeviceParallelFor (Box const& box1, Box const& box2, L1&& f1, L2&& f2) noexcept { - HostDeviceParallelFor(Gpu::KernelInfo{},box1,box2,std::forward(f1),std::forward(f2)); + HostDeviceParallelFor(Gpu::KernelInfo{},box1,box2,std::forward(f1),std::forward(f2)); +} + +template +void HostDeviceParallelFor (Box const& box1, Box const& box2, L1&& f1, L2&& f2) noexcept +{ + HostDeviceParallelFor(Gpu::KernelInfo{},box1,box2,std::forward(f1),std::forward(f2)); } template void HostDeviceParallelFor (Box const& box1, Box const& box2, Box const& box3, L1&& f1, L2&& f2, L3&& f3) noexcept { - HostDeviceParallelFor(Gpu::KernelInfo{}, box1,box2,box3, + HostDeviceParallelFor(Gpu::KernelInfo{}, box1,box2,box3, + std::forward(f1),std::forward(f2),std::forward(f3)); +} + +template +void HostDeviceParallelFor (Box const& box1, Box const& box2, Box const& box3, + L1&& f1, L2&& f2, L3&& f3) noexcept +{ + HostDeviceParallelFor(Gpu::KernelInfo{}, box1,box2,box3, std::forward(f1),std::forward(f2),std::forward(f3)); } @@ -1554,7 +2062,16 @@ template (f1),box2,ncomp2,std::forward(f2)); + HostDeviceParallelFor(Gpu::KernelInfo{},box1,ncomp1,std::forward(f1),box2,ncomp2,std::forward(f2)); +} + +template ::value>, + typename M2=std::enable_if_t::value> > +void HostDeviceParallelFor (Box const& box1, T1 ncomp1, L1&& f1, + Box const& box2, T2 ncomp2, L2&& f2) noexcept +{ + HostDeviceParallelFor(Gpu::KernelInfo{},box1,ncomp1,std::forward(f1),box2,ncomp2,std::forward(f2)); } template (Gpu::KernelInfo{}, + box1,ncomp1,std::forward(f1), + box2,ncomp2,std::forward(f2), + box3,ncomp3,std::forward(f3)); +} + +template ::value>, + typename M2=std::enable_if_t::value>, + typename M3=std::enable_if_t::value> > +void HostDeviceParallelFor (Box const& box1, T1 ncomp1, L1&& f1, + Box const& box2, T2 ncomp2, L2&& f2, + Box const& box3, T3 ncomp3, L3&& f3) noexcept +{ + HostDeviceParallelFor(Gpu::KernelInfo{}, box1,ncomp1,std::forward(f1), box2,ncomp2,std::forward(f2), box3,ncomp3,std::forward(f3)); diff --git a/Src/Base/AMReX_GpuLaunchMacrosG.H b/Src/Base/AMReX_GpuLaunchMacrosG.H index 89aa1f24bc9..e1c643454bc 100644 --- a/Src/Base/AMReX_GpuLaunchMacrosG.H +++ b/Src/Base/AMReX_GpuLaunchMacrosG.H @@ -29,10 +29,16 @@ } \ } \ else { \ + amrex::Abort("amrex:: HOST_DEVICE disabled for Intel. 
It takes too long to compile"); \ + }}} + +#if 0 for (auto const TI : amrex::Gpu::Range(amrex_i_tn)) { \ block \ } \ }}} +#endif + #else #define AMREX_GPU_LAUNCH_HOST_DEVICE_LAMBDA_RANGE(TN,TI,block) \ { auto const& amrex_i_tn = TN; \ @@ -40,7 +46,7 @@ if (amrex::Gpu::inLaunchRegion()) \ { \ const auto amrex_i_ec = amrex::Gpu::ExecutionConfig(amrex_i_tn); \ - AMREX_LAUNCH_KERNEL(amrex_i_ec.numBlocks, amrex_i_ec.numThreads, amrex_i_ec.sharedMem, amrex::Gpu::gpuStream(), \ + AMREX_LAUNCH_KERNEL(AMREX_GPU_MAX_THREADS, amrex_i_ec.numBlocks, amrex_i_ec.numThreads, amrex_i_ec.sharedMem, amrex::Gpu::gpuStream(), \ [=] AMREX_GPU_DEVICE () noexcept { \ for (auto const TI : amrex::Gpu::Range(amrex_i_tn)) { \ block \ @@ -93,6 +99,10 @@ } \ } \ else { \ + amrex::Abort("amrex:: HOST_DEVICE disabled for Intel. It takes too long to compile"); \ + }}} + +#if 0 for (auto const TI1 : amrex::Gpu::Range(amrex_i_tn1)) { \ block1 \ } \ @@ -100,6 +110,8 @@ block2 \ } \ }}} +#endif + #else #define AMREX_GPU_LAUNCH_HOST_DEVICE_LAMBDA_RANGE_2(TN1,TI1,block1,TN2,TI2,block2) \ { auto const& amrex_i_tn1 = TN1; auto const& amrex_i_tn2 = TN2; \ @@ -111,7 +123,7 @@ dim3 amrex_i_nblocks = amrex::max(amrex_i_ec1.numBlocks.x, \ amrex_i_ec2.numBlocks.x); \ amrex_i_nblocks.y = 2; \ - AMREX_LAUNCH_KERNEL(amrex_i_nblocks, amrex_i_ec1.numThreads, 0, amrex::Gpu::gpuStream(), \ + AMREX_LAUNCH_KERNEL(AMREX_GPU_MAX_THREADS, amrex_i_nblocks, amrex_i_ec1.numThreads, 0, amrex::Gpu::gpuStream(), \ [=] AMREX_GPU_DEVICE () noexcept { \ switch (blockIdx.y) { \ case 0: for (auto const TI1 : amrex::Gpu::Range(amrex_i_tn1)) { \ @@ -179,6 +191,10 @@ } \ } \ else { \ + amrex::Abort("amrex:: HOST_DEVICE disabled for Intel. It takes too long to compile"); \ + }}} + +#if 0 for (auto const TI1 : amrex::Gpu::Range(amrex_i_tn1)) { \ block1 \ } \ @@ -189,6 +205,8 @@ block3 \ } \ }}} +#endif + #else #define AMREX_GPU_LAUNCH_HOST_DEVICE_LAMBDA_RANGE_3(TN1,TI1,block1,TN2,TI2,block2,TN3,TI3,block3) \ { auto const& amrex_i_tn1 = TN1; auto const& amrex_i_tn2 = TN2; auto const& amrex_i_tn3 = TN3; \ @@ -202,7 +220,7 @@ amrex_i_ec2.numBlocks.x), \ amrex_i_ec3.numBlocks.x); \ amrex_i_nblocks.y = 3; \ - AMREX_LAUNCH_KERNEL(amrex_i_nblocks, amrex_i_ec1.numThreads, 0, amrex::Gpu::gpuStream(), \ + AMREX_LAUNCH_KERNEL(AMREX_GPU_MAX_THREADS, amrex_i_nblocks, amrex_i_ec1.numThreads, 0, amrex::Gpu::gpuStream(), \ [=] AMREX_GPU_DEVICE () noexcept { \ switch (blockIdx.y) { \ case 0: for (auto const TI1 : amrex::Gpu::Range(amrex_i_tn1)) { \ @@ -269,7 +287,7 @@ if (amrex::Gpu::inLaunchRegion()) \ { \ auto amrex_i_ec = amrex::Gpu::ExecutionConfig(amrex_i_tn); \ - AMREX_LAUNCH_KERNEL(amrex_i_ec.numBlocks, amrex_i_ec.numThreads, amrex_i_ec.sharedMem, amrex::Gpu::gpuStream(), \ + AMREX_LAUNCH_KERNEL(AMREX_GPU_MAX_THREADS, amrex_i_ec.numBlocks, amrex_i_ec.numThreads, amrex_i_ec.sharedMem, amrex::Gpu::gpuStream(), \ [=] AMREX_GPU_DEVICE () noexcept { \ for (auto const TI : amrex::Gpu::Range(amrex_i_tn)) { \ block \ @@ -333,7 +351,7 @@ dim3 amrex_i_nblocks = amrex::max(amrex_i_ec1.numBlocks.x, \ amrex_i_ec2.numBlocks.x); \ amrex_i_nblocks.y = 2; \ - AMREX_LAUNCH_KERNEL(amrex_i_nblocks, amrex_i_ec1.numThreads, 0, amrex::Gpu::gpuStream(), \ + AMREX_LAUNCH_KERNEL(AMREX_GPU_MAX_THREADS, amrex_i_nblocks, amrex_i_ec1.numThreads, 0, amrex::Gpu::gpuStream(), \ [=] AMREX_GPU_DEVICE () noexcept { \ switch (blockIdx.y) { \ case 0: for (auto const TI1 : amrex::Gpu::Range(amrex_i_tn1)) { \ @@ -410,7 +428,7 @@ amrex_i_ec2.numBlocks.x), \ amrex_i_ec3.numBlocks.x); \ amrex_i_nblocks.y = 3; \ - 
AMREX_LAUNCH_KERNEL(amrex_i_nblocks, amrex_i_ec1.numThreads, 0, amrex::Gpu::gpuStream(), \ + AMREX_LAUNCH_KERNEL(AMREX_GPU_MAX_THREADS, amrex_i_nblocks, amrex_i_ec1.numThreads, 0, amrex::Gpu::gpuStream(), \ [=] AMREX_GPU_DEVICE () noexcept { \ switch (blockIdx.y) { \ case 0: for (auto const TI1 : amrex::Gpu::Range(amrex_i_tn1)) { \ @@ -434,6 +452,18 @@ // FOR_1D +#ifdef AMREX_USE_DPCPP +#define AMREX_GPU_HOST_DEVICE_FOR_1D(n,i,block) \ +{ \ + auto const& amrex_i_n = n; \ + using amrex_i_inttype = typename std::remove_const::type; \ + if (amrex::Gpu::inLaunchRegion()) { \ + amrex::ParallelFor(amrex_i_n,[=] AMREX_GPU_DEVICE (amrex_i_inttype i) noexcept block); \ + } else { \ + amrex::Abort("amrex:: HOST_DEVICE disabled for Intel. It takes too long to compile"); \ + } \ +} +#else #define AMREX_GPU_HOST_DEVICE_FOR_1D(n,i,block) \ { \ auto const& amrex_i_n = n; \ @@ -446,6 +476,7 @@ for (amrex_i_inttype i = 0; i < amrex_i_n; ++i) amrex_i_lambda(i); \ } \ } +#endif #define AMREX_GPU_DEVICE_FOR_1D(n,i,block) \ { \ @@ -455,6 +486,17 @@ // FOR_3D +#ifdef AMREX_USE_DPCPP +#define AMREX_GPU_HOST_DEVICE_FOR_3D(box,i,j,k,block) \ +{ \ + auto const& amrex_i_box = box; \ + if (amrex::Gpu::inLaunchRegion()) { \ + amrex::ParallelFor(amrex_i_box,[=] AMREX_GPU_DEVICE (int i, int j, int k) noexcept block); \ + } else { \ + amrex::Abort("amrex:: HOST_DEVICE disabled for Intel. It takes too long to compile"); \ + } \ +} +#else #define AMREX_GPU_HOST_DEVICE_FOR_3D(box,i,j,k,block) \ { \ auto const& amrex_i_box = box; \ @@ -464,6 +506,7 @@ amrex::LoopConcurrentOnCpu(amrex_i_box,[=] (int i, int j, int k) noexcept block); \ } \ } +#endif #define AMREX_GPU_DEVICE_FOR_3D(box,i,j,k,block) \ { \ @@ -472,6 +515,18 @@ // FOR_4D +#ifdef AMREX_USE_DPCPP +#define AMREX_GPU_HOST_DEVICE_FOR_4D(box,ncomp,i,j,k,n,block) \ +{ \ + auto const& amrex_i_box = box; \ + auto const& amrex_i_ncomp = ncomp; \ + if (amrex::Gpu::inLaunchRegion()) { \ + amrex::ParallelFor(amrex_i_box,amrex_i_ncomp,[=] AMREX_GPU_DEVICE (int i, int j, int k, int n) noexcept block); \ + } else { \ + amrex::Abort("amrex:: HOST_DEVICE disabled for Intel. 
It takes too long to compile"); \ + } \ +} +#else #define AMREX_GPU_HOST_DEVICE_FOR_4D(box,ncomp,i,j,k,n,block) \ { \ auto const& amrex_i_box = box; \ @@ -482,6 +537,7 @@ amrex::LoopConcurrentOnCpu(amrex_i_box,amrex_i_ncomp,[=] (int i, int j, int k, int n) noexcept block); \ } \ } +#endif #define AMREX_GPU_DEVICE_FOR_4D(box,ncomp,i,j,k,n,block) \ { \ diff --git a/Src/Base/AMReX_GpuQualifiers.H b/Src/Base/AMReX_GpuQualifiers.H index ce07a3e52c2..b5d5ea58fbd 100644 --- a/Src/Base/AMReX_GpuQualifiers.H +++ b/Src/Base/AMReX_GpuQualifiers.H @@ -41,10 +41,6 @@ # include -namespace amrex { - namespace oneapi = sycl::ext::oneapi; -} - # define AMREX_REQUIRE_SUBGROUP_SIZE(x) \ _Pragma("clang diagnostic push") \ _Pragma("clang diagnostic ignored \"-Wattributes\"") \ diff --git a/Src/Base/AMReX_GpuReduce.H b/Src/Base/AMReX_GpuReduce.H index 9b48138940c..7b9b0e42355 100644 --- a/Src/Base/AMReX_GpuReduce.H +++ b/Src/Base/AMReX_GpuReduce.H @@ -8,6 +8,7 @@ #include #include #include +#include #if !defined(AMREX_USE_CUB) && defined(AMREX_USE_CUDA) && defined(__CUDACC__) && (__CUDACC_VER_MAJOR__ >= 11) #define AMREX_USE_CUB 1 @@ -54,10 +55,10 @@ template struct warpReduce { AMREX_GPU_DEVICE AMREX_FORCE_INLINE - T operator() (T x, amrex::oneapi::sub_group const& sg) const noexcept + T operator() (T x, sycl::sub_group const& sg) const noexcept { for (int offset = warpSize/2; offset > 0; offset /= 2) { - T y = sg.shuffle_down(x, offset); + T y = sycl::shift_group_left(sg, x, offset); x = F()(x,y); } return x; @@ -70,7 +71,7 @@ T blockReduce (T x, WARPREDUCE && warp_reduce, T x0, Gpu::Handler const& h) { T* shared = (T*)h.local; int tid = h.item->get_local_id(0); - amrex::oneapi::sub_group const& sg = h.item->get_sub_group(); + sycl::sub_group const& sg = h.item->get_sub_group(); int lane = sg.get_local_id()[0]; int wid = sg.get_group_id()[0]; int numwarps = sg.get_group_range()[0]; @@ -93,7 +94,7 @@ AMREX_GPU_DEVICE AMREX_FORCE_INLINE void blockReduce_partial (T* dest, T x, WARPREDUCE && warp_reduce, ATOMICOP && atomic_op, Gpu::Handler const& handler) { - amrex::oneapi::sub_group const& sg = handler.item->get_sub_group(); + sycl::sub_group const& sg = handler.item->get_sub_group(); int wid = sg.get_group_id()[0]; if ((wid+1)*warpSize <= handler.numActiveThreads) { x = warp_reduce(x, sg); // full warp @@ -249,15 +250,54 @@ void deviceReduceLogicalOr (int * dest, int source, Gpu::Handler const& h) noexc #elif defined(AMREX_USE_CUDA) || defined(AMREX_USE_HIP) +namespace detail { + +template +AMREX_GPU_DEVICE AMREX_FORCE_INLINE +T shuffle_down (T x, int offset) noexcept +{ + return AMREX_HIP_OR_CUDA(__shfl_down(x, offset), + __shfl_down_sync(0xffffffff, x, offset)); +} + +// If other sizeof is needed, we can implement it later. +template = 0> +AMREX_GPU_DEVICE AMREX_FORCE_INLINE +T multi_shuffle_down (T x, int offset) noexcept +{ + constexpr int nwords = (sizeof(T) + sizeof(unsigned int) - 1) / sizeof(unsigned int); + T y; + auto py = reinterpret_cast(&y); + auto px = reinterpret_cast(&x); + for (int i = 0; i < nwords; ++i) { + py[i] = shuffle_down(px[i],offset); + } + return y; +} + +} + template struct warpReduce { + // Not all arithmetic types can be taken by shuffle_down, but it's good enough. 
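// ============================================================================
// Sketch: both the SYCL path (sycl::shift_group_left) and the CUDA/HIP path
// (detail::shuffle_down) implement the same butterfly reduction: halve the
// offset each round, each lane combining its value with the lane `offset`
// above it. Host emulation over an 8-lane "warp" with F = plus:
#include <array>
#include <cstdio>

int main ()
{
    constexpr int warpSize = 8;
    std::array<int,warpSize> lane = {1,2,3,4,5,6,7,8};
    for (int offset = warpSize/2; offset > 0; offset /= 2) {
        for (int i = 0; i < warpSize; ++i) {
            // out-of-range shuffles return the lane's own value
            int y = (i + offset < warpSize) ? lane[i + offset] : lane[i];
            lane[i] += y;
        }
    }
    std::printf("lane 0 holds the warp sum: %d\n", lane[0]);   // 36
}
// ============================================================================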
+ template ::value,int> = 0> + AMREX_GPU_DEVICE AMREX_FORCE_INLINE + T operator() (T x) const noexcept + { + for (int offset = warpSize/2; offset > 0; offset /= 2) { + T y = detail::shuffle_down(x, offset); + x = F()(x,y); + } + return x; + } + + template ::value,int> = 0> AMREX_GPU_DEVICE AMREX_FORCE_INLINE T operator() (T x) const noexcept { for (int offset = warpSize/2; offset > 0; offset /= 2) { - AMREX_HIP_OR_CUDA(T y = __shfl_down(x, offset);, - T y = __shfl_down_sync(0xffffffff, x, offset); ) + T y = detail::multi_shuffle_down(x, offset); x = F()(x,y); } return x; diff --git a/Src/Base/AMReX_GpuTypes.H b/Src/Base/AMReX_GpuTypes.H index 737a47e665c..12b8fbc1829 100644 --- a/Src/Base/AMReX_GpuTypes.H +++ b/Src/Base/AMReX_GpuTypes.H @@ -8,7 +8,6 @@ #ifdef AMREX_USE_DPCPP #include -namespace sycl = cl::sycl; #endif namespace amrex { diff --git a/Src/Base/AMReX_MFIter.H b/Src/Base/AMReX_MFIter.H index eb259ac7b6d..9c01e38b138 100644 --- a/Src/Base/AMReX_MFIter.H +++ b/Src/Base/AMReX_MFIter.H @@ -164,6 +164,8 @@ public: static int allowMultipleMFIters (int allow); + void Finalize (); + protected: std::unique_ptr m_fa; //!< This must be the first member! @@ -180,6 +182,7 @@ protected: IndexType typ; bool dynamic; + bool finalized = false; struct DeviceSync { DeviceSync () = default; diff --git a/Src/Base/AMReX_MFIter.cpp b/Src/Base/AMReX_MFIter.cpp index e8a97256d3d..c761c466449 100644 --- a/Src/Base/AMReX_MFIter.cpp +++ b/Src/Base/AMReX_MFIter.cpp @@ -209,6 +209,19 @@ MFIter::MFIter (const FabArrayBase& fabarray_, const MFItInfo& info) MFIter::~MFIter () { + Finalize(); +} + +void +MFIter::Finalize () +{ + // avoid double finalize + if (finalized) return; + finalized = true; + + // mark as invalid + currentIndex = endIndex; + #ifdef AMREX_USE_OMP #pragma omp master #endif @@ -237,6 +250,9 @@ MFIter::~MFIter () #endif m_fa->clearThisBD(); } + if (m_fa) { + m_fa.reset(nullptr); + } } void diff --git a/Src/Base/AMReX_MPMD.H b/Src/Base/AMReX_MPMD.H new file mode 100644 index 00000000000..2b8ef399866 --- /dev/null +++ b/Src/Base/AMReX_MPMD.H @@ -0,0 +1,178 @@ +#ifndef AMREX_MPMD_H_ +#define AMREX_MPMD_H_ +#include + +#ifdef AMREX_USE_MPI + +#include + +#include + +namespace amrex { namespace MPMD { + +MPI_Comm Initialize (int argc, char* argv[]); + +void Finalize (); + +bool Initialized (); + +int MyProc (); //! Process ID in MPI_COMM_WORLD +int NProcs (); //! Number of processes in MPI_COMM_WORLD +int MyProgId (); //! 
Program ID + +class Copier +{ +public: + Copier (BoxArray const& ba, DistributionMapping const& dm); + + template + void send (FabArray const& fa, int icomp, int ncomp) const; + + template + void recv (FabArray& fa, int icomp, int ncomp) const; + +private: + std::map m_SndTags; + std::map m_RcvTags; +}; + +template +void Copier::send (FabArray const& mf, int icomp, int ncomp) const +{ + const int N_snds = m_SndTags.size(); + + if (N_snds == 0) return; + + // Prepare buffer + + Vector send_data; + Vector send_size; + Vector send_rank; + Vector send_reqs; + Vector send_cctc; + + Vector offset; + std::size_t total_volume = 0; + for (auto const& kv : m_SndTags) { + auto const& cctc = kv.second; + + std::size_t nbytes = 0; + for (auto const& cct : cctc) { + nbytes += cct.sbox.numPts() * ncomp * sizeof(typename FAB::value_type); + } + + std::size_t acd = ParallelDescriptor::alignof_comm_data(nbytes); + nbytes = amrex::aligned_size(acd, nbytes); // so that bytes are aligned + + // Also need to align the offset properly + total_volume = amrex::aligned_size(std::max(alignof(typename FAB::value_type), + acd), total_volume); + + offset.push_back(total_volume); + total_volume += nbytes; + + send_data.push_back(nullptr); + send_size.push_back(nbytes); + send_rank.push_back(kv.first); + send_reqs.push_back(MPI_REQUEST_NULL); + send_cctc.push_back(&cctc); + } + + Gpu::PinnedVector send_buffer(total_volume); + char* the_send_data = send_buffer.data(); + for (int i = 0; i < N_snds; ++i) { + send_data[i] = the_send_data + offset[i]; + } + + // Pack buffer +#ifdef AMREX_USE_GPU + if (Gpu::inLaunchRegion() && (mf.arena()->isDevice() || mf.arena()->isManaged())) { + mf.pack_send_buffer_gpu(mf, icomp, ncomp, send_data, send_size, send_cctc); + } else +#endif + { + mf.pack_send_buffer_cpu(mf, icomp, ncomp, send_data, send_size, send_cctc); + } + + // Send + for (int i = 0; i < N_snds; ++i) { + send_reqs[i] = ParallelDescriptor::Asend + (send_data[i], send_size[i], send_rank[i], 100, MPI_COMM_WORLD).req(); + } + Vector stats(N_snds); + ParallelDescriptor::Waitall(send_reqs, stats); +} + +template +void Copier::recv (FabArray& mf, int icomp, int ncomp) const +{ + const int N_rcvs = m_RcvTags.size(); + + if (N_rcvs == 0) return; + + // Prepare buffer + + Vector recv_data; + Vector recv_size; + Vector recv_from; + Vector recv_reqs; + + Vector offset; + std::size_t TotalRcvsVolume = 0; + for (auto const& kv : m_RcvTags) { + std::size_t nbytes = 0; + for (auto const& cct : kv.second) { + nbytes += cct.dbox.numPts() * ncomp * sizeof(typename FAB::value_type); + } + + std::size_t acd = ParallelDescriptor::alignof_comm_data(nbytes); + nbytes = amrex::aligned_size(acd, nbytes); // so that nbytes are aligned + + // Also need to align the offset properly + TotalRcvsVolume = amrex::aligned_size(std::max(alignof(typename FAB::value_type), + acd), TotalRcvsVolume); + + offset.push_back(TotalRcvsVolume); + TotalRcvsVolume += nbytes; + + recv_data.push_back(nullptr); + recv_size.push_back(nbytes); + recv_from.push_back(kv.first); + recv_reqs.push_back(MPI_REQUEST_NULL); + } + + Gpu::PinnedVector recv_buffer(TotalRcvsVolume); + char* the_recv_data = recv_buffer.data(); + + // Recv + for (int i = 0; i < N_rcvs; ++i) { + recv_data[i] = the_recv_data + offset[i]; + recv_reqs[i] = ParallelDescriptor::Arecv + (recv_data[i], recv_size[i], recv_from[i], 100, MPI_COMM_WORLD).req(); + } + + Vector recv_cctc(N_rcvs, nullptr); + for (int i = 0; i < N_rcvs; ++i) { + recv_cctc[i] = &(m_RcvTags.at(recv_from[i])); + } + + Vector stats(N_rcvs); 
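+    // Block until every receive posted above has completed; only then is it
+    // safe to unpack the buffers.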
+ ParallelDescriptor::Waitall(recv_reqs, stats); + + // Unpack buffer +#ifdef AMREX_USE_GPU + if (Gpu::inLaunchRegion() && (mf.arena()->isDevice() || mf.arena()->isManaged())) { + mf.unpack_recv_buffer_gpu(mf, icomp, ncomp, recv_data, recv_size, recv_cctc, + FabArrayBase::COPY, true); + } else +#endif + { + mf.unpack_recv_buffer_cpu(mf, icomp, ncomp, recv_data, recv_size, recv_cctc, + FabArrayBase::COPY, true); + } +} + +}} + +#endif +#endif diff --git a/Src/Base/AMReX_MPMD.cpp b/Src/Base/AMReX_MPMD.cpp new file mode 100644 index 00000000000..917c741c2a6 --- /dev/null +++ b/Src/Base/AMReX_MPMD.cpp @@ -0,0 +1,225 @@ +#include +#include + +#include +#include +#include +#include +#include + +#ifdef AMREX_USE_MPI + +namespace amrex { namespace MPMD { + +namespace { + bool initialized = false; + bool mpi_initialized_by_us = false; + MPI_Comm app_comm = MPI_COMM_NULL; + int myproc; + int nprocs; +} + +namespace { + +template +int num_unique_elements (std::vector& v) +{ + std::sort(v.begin(), v.end()); + auto last = std::unique(v.begin(), v.end()); + return last - v.begin(); +} + +} + +MPI_Comm Initialize (int argc, char* argv[]) +{ + initialized = true; + int flag; + MPI_Initialized(&flag); + if (!flag) { + MPI_Init(&argc, &argv); + mpi_initialized_by_us = true; + } + + MPI_Comm_rank(MPI_COMM_WORLD, &myproc); + MPI_Comm_size(MPI_COMM_WORLD, &nprocs); + + int* p; + MPI_Comm_get_attr(MPI_COMM_WORLD, MPI_APPNUM, &p, &flag); + int appnum = *p; + + std::vector all_appnum(nprocs); + MPI_Allgather(&appnum, 1, MPI_INT, all_appnum.data(), 1, MPI_INT, MPI_COMM_WORLD); + int napps = num_unique_elements(all_appnum); + + // MPI_APPNUM does not appear to work with slurm on some systems. + if (napps != 2) { + std::vector all_argc(nprocs); + MPI_Allgather(&argc, 1, MPI_INT, all_argc.data(), 1, MPI_INT, MPI_COMM_WORLD); + napps = num_unique_elements(all_argc); + if (napps == 2) { + appnum = static_cast(argc != all_argc[0]); + } + } + + if (napps != 2) { + std::string exename; + if (argc > 0) { + exename = std::string(argv[0]); + } + unsigned long long hexe = std::hash{}(exename); + std::vector all_hexe(nprocs); + MPI_Allgather(&hexe, 1, MPI_UNSIGNED_LONG_LONG, + all_hexe.data(), 1, MPI_UNSIGNED_LONG_LONG, MPI_COMM_WORLD); + napps = num_unique_elements(all_hexe); + if (napps == 2) { + appnum = static_cast(hexe != all_hexe[0]); + } + } + + if (napps == 2) { + MPI_Comm_split(MPI_COMM_WORLD, appnum, myproc, &app_comm); + } else { + std::cout << "amrex::MPMD only supports two programs." << std::endl; + MPI_Abort(MPI_COMM_WORLD, 1); + } + + return app_comm; +} + +void Finalize () +{ + MPI_Comm_free(&app_comm); + if (mpi_initialized_by_us) { + MPI_Finalize(); + mpi_initialized_by_us = false; + } + initialized = false; +} + +bool Initialized () { return initialized; } + +int MyProc () +{ + return myproc; +} + +int NProcs () +{ + return nprocs; +} + +int MyProgId () +{ + return (myproc == ParallelDescriptor::MyProc()) ? 
0 : 1; +} + +Copier::Copier (BoxArray const& ba, DistributionMapping const& dm) +{ + int rank_offset = myproc - ParallelDescriptor::MyProc(); + int this_root, other_root; + if (rank_offset == 0) { // First program + this_root = 0; + other_root = ParallelDescriptor::NProcs(); + } else { + this_root = rank_offset; + other_root = 0; + } + + Vector bv = ba.boxList().data(); + + int this_nboxes = ba.size(); + Vector procs = dm.ProcessorMap(); + if (rank_offset != 0) { + for (int i = 0; i < this_nboxes; ++i) { + procs[i] += rank_offset; + } + } + + Vector obv; + Vector oprocs; + int other_nboxes; + if (myproc == this_root) { + if (rank_offset == 0) // the first program + { + MPI_Send(&this_nboxes, 1, MPI_INT, other_root, 0, MPI_COMM_WORLD); + MPI_Recv(&other_nboxes, 1, MPI_INT, other_root, 1, MPI_COMM_WORLD, + MPI_STATUS_IGNORE); + obv.resize(other_nboxes); + MPI_Send(bv.data(), this_nboxes, + ParallelDescriptor::Mpi_typemap::type(), + other_root, 2, MPI_COMM_WORLD); + MPI_Recv(obv.data(), other_nboxes, + ParallelDescriptor::Mpi_typemap::type(), + other_root, 3, MPI_COMM_WORLD, MPI_STATUS_IGNORE); + oprocs.resize(other_nboxes); + MPI_Send(procs.data(), this_nboxes, MPI_INT, other_root, 4, MPI_COMM_WORLD); + MPI_Recv(oprocs.data(), other_nboxes, MPI_INT, other_root, 5, MPI_COMM_WORLD, + MPI_STATUS_IGNORE); + } + else // the second program + { + MPI_Recv(&other_nboxes, 1, MPI_INT, other_root, 0, MPI_COMM_WORLD, + MPI_STATUS_IGNORE); + MPI_Send(&this_nboxes, 1, MPI_INT, other_root, 1, MPI_COMM_WORLD); + obv.resize(other_nboxes); + MPI_Recv(obv.data(), other_nboxes, + ParallelDescriptor::Mpi_typemap::type(), + other_root, 2, MPI_COMM_WORLD, MPI_STATUS_IGNORE); + MPI_Send(bv.data(), this_nboxes, + ParallelDescriptor::Mpi_typemap::type(), + other_root, 3, MPI_COMM_WORLD); + oprocs.resize(other_nboxes); + MPI_Recv(oprocs.data(), other_nboxes, MPI_INT, other_root, 4, MPI_COMM_WORLD, + MPI_STATUS_IGNORE); + MPI_Send(procs.data(), this_nboxes, MPI_INT, other_root, 5, MPI_COMM_WORLD); + } + } + + ParallelDescriptor::Bcast(&other_nboxes, 1); + if (obv.empty()) { + obv.resize(other_nboxes); + oprocs.resize(other_nboxes); + } + ParallelDescriptor::Bcast(obv.data(), obv.size()); + ParallelDescriptor::Bcast(oprocs.data(), oprocs.size()); + + BoxArray oba(BoxList(std::move(obv))); + + // At this point, ba and bv hold our boxes, and oba holds the other + // program's boxes. procs holds mpi ranks of our boxes, and oprocs holds + // mpi ranks of the other program's boxes. All mpi ranks are in + // MPI_COMM_WORLD. 
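+    // For orientation, a minimal MPMD driver could look like the following
+    // sketch (hypothetical application code; ba, dm and mf are assumed to be
+    // built consistently by each of the two coupled programs):
+    //
+    //     MPI_Comm comm = amrex::MPMD::Initialize(argc, argv);
+    //     amrex::Initialize(argc, argv, true, comm);
+    //     {
+    //         amrex::MPMD::Copier copier(ba, dm);
+    //         if (amrex::MPMD::MyProgId() == 0) { copier.send(mf, 0, mf.nComp()); }
+    //         else                              { copier.recv(mf, 0, mf.nComp()); }
+    //     }
+    //     amrex::Finalize();
+    //     amrex::MPMD::Finalize();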
+ + // Build communication meta-data + + AMREX_ALWAYS_ASSERT(ba.ixType().cellCentered()); + + std::vector<std::pair<int,Box> > isects; + + for (int i = 0; i < this_nboxes; ++i) { + if (procs[i] == myproc) { + oba.intersections(bv[i], isects); + for (auto const& isec : isects) { + const int oi = isec.first; + const Box& bx = isec.second; + const int orank = oprocs[oi]; + m_SndTags[orank].push_back + (FabArrayBase::CopyComTag(bx, bx, oi, i)); + m_RcvTags[orank].push_back + (FabArrayBase::CopyComTag(bx, bx, i, oi)); + } + } + } + + for (auto& kv : m_SndTags) { + std::sort(kv.second.begin(), kv.second.end()); + } + for (auto& kv : m_RcvTags) { + std::sort(kv.second.begin(), kv.second.end()); + } +} + +}} + +#endif diff --git a/Src/Base/AMReX_Math.H b/Src/Base/AMReX_Math.H index 7996830d534..3eed941fb00 100644 --- a/Src/Base/AMReX_Math.H +++ b/Src/Base/AMReX_Math.H @@ -9,7 +9,6 @@ #ifdef AMREX_USE_DPCPP #include -namespace sycl = cl::sycl; #endif namespace amrex { inline namespace disabled { diff --git a/Src/Base/AMReX_MultiFab.H b/Src/Base/AMReX_MultiFab.H index dfb75dacbf9..70e6facaee7 100644 --- a/Src/Base/AMReX_MultiFab.H +++ b/Src/Base/AMReX_MultiFab.H @@ -8,6 +8,7 @@ #include #include #include +#include #ifdef AMREX_USE_EB #include @@ -190,7 +191,7 @@ public: /** * \brief Returns the maximum *absolute* values contained in - * each component of "comps" of the MultiFab. No ghost cells are used. + * each component of "comps" of the MultiFab. "nghost" ghost cells are used. */ Vector<Real> norm0 (const Vector<int>& comps, int nghost = 0, bool local = false, bool ignore_covered = false ) const; Vector<Real> norminf (const Vector<int>& comps, int nghost = 0, bool local = false, bool ignore_covered = false) const { @@ -232,6 +233,13 @@ public: */ Real sum (int comp = 0, bool local = false) const; /** + * \brief Same as sum with local=false, but for non-cell-centered data, this + * skips non-unique points that are owned by multiple boxes. + */ + Real sum_unique (int comp = 0, + bool local = false, + const Periodicity& period = Periodicity::NonPeriodic()) const; + /** * \brief Adds the scalar value val to the value of each cell in the * specified subregion of the MultiFab. The subregion consists * of the num_comp components starting at component comp. diff --git a/Src/Base/AMReX_MultiFab.cpp b/Src/Base/AMReX_MultiFab.cpp index 9e2f37adf37..83664b307d4 100644 --- a/Src/Base/AMReX_MultiFab.cpp +++ b/Src/Base/AMReX_MultiFab.cpp @@ -5,6 +5,7 @@ #include #include #include +#include #ifdef AMREX_MEM_PROFILING #include @@ -1586,6 +1587,58 @@ MultiFab::sum (int comp, bool local) const return sm; } +Real +MultiFab::sum_unique (int comp, + bool local, + const Periodicity& period) const +{ + BL_PROFILE("MultiFab::sum_unique()"); + + // no duplicately distributed points if cell centered + if (ixType().cellCentered()) + return this->sum(comp, local); + + // Owner is the grid with the lowest grid number containing the data + std::unique_ptr<iMultiFab> owner_mask = OwnerMask(period); + + Real sm = Real(0.0); +#ifdef AMREX_USE_GPU + if (Gpu::inLaunchRegion()) { + auto const& ma = this->const_arrays(); + auto const& msk = owner_mask->const_arrays(); + sm = ParReduce(TypeList<ReduceOpSum>{}, TypeList<Real>{}, *this, IntVect(0), + [=] AMREX_GPU_DEVICE (int box_no, int i, int j, int k) noexcept + -> GpuTuple<Real> + { + return msk[box_no](i,j,k) ?
ma[box_no](i,j,k,comp) : 0.0_rt; + }); + } else +#endif + { +#ifdef AMREX_USE_OMP +#pragma omp parallel if (!system::regtest_reduction) reduction(+:sm) +#endif + for (MFIter mfi(*this,true); mfi.isValid(); ++mfi) + { + Box const& bx = mfi.tilebox(); + Array4<Real const> const& a = this->const_array(mfi); + Array4<int const> const& msk = owner_mask->const_array(mfi); + Real tmp = 0.0_rt; + AMREX_LOOP_3D(bx, i, j, k, + { + tmp += msk(i,j,k) ? a(i,j,k,comp) : 0.0_rt; + }); + sm += tmp; // Do it this way so that it does not break regression tests. + } + } + + if (!local) { + ParallelAllReduce::Sum(sm, ParallelContext::CommunicatorSub()); + } + + return sm; +} + void MultiFab::minus (const MultiFab& mf, int strt_comp, int num_comp, int nghost) { diff --git a/Src/Base/AMReX_MultiFabUtil.H b/Src/Base/AMReX_MultiFabUtil.H index 1444bb90484..21f89c8ed6c 100644 --- a/Src/Base/AMReX_MultiFabUtil.H +++ b/Src/Base/AMReX_MultiFabUtil.H @@ -231,6 +231,35 @@ namespace amrex */ Gpu::HostVector<Real> sumToLine (MultiFab const& mf, int icomp, int ncomp, Box const& domain, int direction, bool local = false); + + /** \brief Volume weighted sum for a vector of MultiFabs + * + * Return a volume weighted sum of MultiFabs of AMR data. The sum is + * performed on a single component of the data. If the MultiFabs are + * built with EB Factories, the cut cell volume fraction will be + * included in the weight. + */ + Real volumeWeightedSum (Vector<MultiFab const*> const& mf, int icomp, + Vector<Geometry> const& geom, + Vector<IntVect> const& ratio, + bool local = false); + + /** + * \brief Fourth-order interpolation from fine to coarse level. + * + * This is for high-order "average-down" of finite-difference data. If + * ghost cell data are used, it's the caller's responsibility to fill + * the ghost cells before calling this function. + * + * \param cmf coarse data + * \param scomp starting component + * \param ncomp number of components + * \param fmf fine data + * \param ratio refinement ratio. + */ + void FourthOrderInterpFromFineToCoarse (MultiFab& cmf, int scomp, int ncomp, + MultiFab const& fmf, + IntVect const& ratio); } namespace amrex { diff --git a/Src/Base/AMReX_MultiFabUtil.cpp b/Src/Base/AMReX_MultiFabUtil.cpp index 26a7242e89d..3ae4aa91b9f 100644 --- a/Src/Base/AMReX_MultiFabUtil.cpp +++ b/Src/Base/AMReX_MultiFabUtil.cpp @@ -1226,4 +1226,245 @@ namespace amrex } return hv; } + + Real volumeWeightedSum (Vector<MultiFab const*> const& mf, int icomp, + Vector<Geometry> const& geom, + Vector<IntVect> const& ratio, + bool local) + { + ReduceOps<ReduceOpSum> reduce_op; + ReduceData<Real> reduce_data(reduce_op); + +#ifdef AMREX_USE_EB + bool has_eb = !(mf[0]->isAllRegular()); +#endif + + int nlevels = mf.size(); + for (int ilev = 0; ilev < nlevels-1; ++ilev) { + iMultiFab mask = makeFineMask(*mf[ilev], *mf[ilev+1], IntVect(0), + ratio[ilev],Periodicity::NonPeriodic(), + 0, 1); + auto const& m = mask.const_arrays(); + auto const& a = mf[ilev]->const_arrays(); + auto const dx = geom[ilev].CellSizeArray(); + Real dv = AMREX_D_TERM(dx[0],*dx[1],*dx[2]); +#ifdef AMREX_USE_EB + if (has_eb) { + AMREX_ASSERT(mf[ilev]->hasEBFabFactory()); + auto const& f = dynamic_cast<EBFArrayBoxFactory const&> + (mf[ilev]->Factory()); + auto const& vfrac = f.getVolFrac(); + auto const& va = vfrac.const_arrays(); + reduce_op.eval(*mf[ilev], IntVect(0), reduce_data, + [=] AMREX_GPU_DEVICE (int box_no, int i, int j, int k) noexcept + -> Real + { + return m[box_no](i,j,k) ? Real(0.)
+ : dv*a[box_no](i,j,k,icomp)*va[box_no](i,j,k); + }); + } else +#endif + { +#if (AMREX_SPACEDIM == 1) + if (geom[ilev].IsSPHERICAL()) { + const auto rlo = geom[ilev].CellSize(0); + reduce_op.eval(*mf[ilev], IntVect(0), reduce_data, + [=] AMREX_GPU_DEVICE (int box_no, int i, int j, int k) + noexcept -> Real + { + if (m[box_no](i,j,k)) { + return Real(0.); + } else { + constexpr Real pi = Real(3.1415926535897932); + Real ri = rlo + dx[0]*i; + Real ro = ri + dx[0]; + return Real(4./3.)*pi*(ro-ri)*(ro*ro+ro*ri+ri*ri) + * a[box_no](i,j,k,icomp); + } + }); + } else +#elif (AMREX_SPACEDIM == 2) + if (geom[ilev].IsRZ()) { + const auto rlo = geom[ilev].CellSize(0); + reduce_op.eval(*mf[ilev], IntVect(0), reduce_data, + [=] AMREX_GPU_DEVICE (int box_no, int i, int j, int k) + noexcept -> Real + { + if (m[box_no](i,j,k)) { + return Real(0.); + } else { + Real ri = rlo + dx[0]*i; + Real ro = ri + dx[0]; + constexpr Real pi = Real(3.1415926535897932); + return pi*dx[1]*dx[0]*(ro+ri) + * a[box_no](i,j,k,icomp); + } + }); + } else +#endif + { + reduce_op.eval(*mf[ilev], IntVect(0), reduce_data, + [=] AMREX_GPU_DEVICE (int box_no, int i, int j, int k) + noexcept -> Real + { + return m[box_no](i,j,k) ? Real(0.) + : dv*a[box_no](i,j,k,icomp); + }); + } + } + Gpu::streamSynchronize(); + } + + auto const& a = mf.back()->const_arrays(); + auto const dx = geom[nlevels-1].CellSizeArray(); + Real dv = AMREX_D_TERM(dx[0],*dx[1],*dx[2]); +#ifdef AMREX_USE_EB + if (has_eb) { + AMREX_ASSERT(mf.back()->hasEBFabFactory()); + auto const& f = dynamic_cast + (mf.back()->Factory()); + auto const& vfrac = f.getVolFrac(); + auto const& va = vfrac.const_arrays(); + reduce_op.eval(*mf.back(), IntVect(0), reduce_data, + [=] AMREX_GPU_DEVICE (int box_no, int i, int j, int k) noexcept + -> Real + { + return dv*a[box_no](i,j,k,icomp)*va[box_no](i,j,k); + }); + } else +#endif + { +#if (AMREX_SPACEDIM == 1) + if (geom[nlevels-1].IsSPHERICAL()) { + const auto rlo = geom[nlevels-1].CellSize(0); + reduce_op.eval(*mf.back(), IntVect(0), reduce_data, + [=] AMREX_GPU_DEVICE (int box_no, int i, int j, int k) + noexcept -> Real + { + constexpr Real pi = Real(3.1415926535897932); + Real ri = rlo + dx[0]*i; + Real ro = ri + dx[0]; + return Real(4./3.)*pi*(ro-ri)*(ro*ro+ro*ri+ri*ri) + * a[box_no](i,j,k,icomp); + }); + } else +#elif (AMREX_SPACEDIM == 2) + if (geom[nlevels-1].IsRZ()) { + const auto rlo = geom[nlevels-1].CellSize(0); + reduce_op.eval(*mf.back(), IntVect(0), reduce_data, + [=] AMREX_GPU_DEVICE (int box_no, int i, int j, int k) + noexcept -> Real + { + Real ri = rlo + dx[0]*i; + Real ro = ri + dx[0]; + constexpr Real pi = Real(3.1415926535897932); + return pi*dx[1]*dx[0]*(ro+ri) + * a[box_no](i,j,k,icomp); + }); + } else +#endif + { + reduce_op.eval(*mf.back(), IntVect(0), reduce_data, + [=] AMREX_GPU_DEVICE (int box_no, int i, int j, int k) noexcept + { + return dv*a[box_no](i,j,k,icomp); + }); + } + } + + auto const& hv = reduce_data.value(reduce_op); + Real r = amrex::get<0>(hv); + + if (!local) { + ParallelAllReduce::Sum(r, ParallelContext::CommunicatorSub()); + } + return r; + } + + void FourthOrderInterpFromFineToCoarse (MultiFab& cmf, int scomp, int ncomp, + MultiFab const& fmf, + IntVect const& ratio) + { + AMREX_ASSERT(AMREX_D_TERM( (ratio[0] == 2 || ratio[0] == 4), + && (ratio[1] == 2 || ratio[1] == 4), + && (ratio[2] == 2 || ratio[2] == 4))); + + MultiFab tmp(amrex::coarsen(fmf.boxArray(), ratio), fmf.DistributionMap(), + ncomp, 0); + +#ifdef AMREX_USE_OMP +#pragma omp parallel if (Gpu::notInLaunchRegion()) +#endif + { 
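+        // Scratch FABs for the direction-by-direction interpolation passes;
+        // declared inside the parallel region so that each OpenMP thread
+        // gets its own copy.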
+#if (AMREX_SPACEDIM > 1) + FArrayBox xtmp; +#if (AMREX_SPACEDIM > 2) + FArrayBox ytmp; +#endif +#endif + for (MFIter mfi(tmp,TilingIfNotGPU()); mfi.isValid(); ++mfi) { + Box const& bx = mfi.tilebox(); + auto const& fa = fmf.const_array(mfi,scomp); + + Box xbx = bx; +#if (AMREX_SPACEDIM == 1) + auto const& xa = tmp.array(mfi); +#else + xbx.refine(IntVect(AMREX_D_DECL(1,ratio[1],ratio[2]))); + if (ratio[1] == 2) { xbx.grow(1,1); } +#if (AMREX_SPACEDIM == 3) + if (ratio[2] == 2) { xbx.grow(2,1); } +#endif + xtmp.resize(xbx,ncomp); + Elixir eli = xtmp.elixir(); + auto const& xa = xtmp.array(); +#endif + AMREX_HOST_DEVICE_PARALLEL_FOR_4D(xbx, ncomp, i, j, k, n, + { + int ii = 2*i; + xa(i,j,k,n) = Real(1./16)*(Real(9.)*(fa(ii ,j,k,n) + + fa(ii+1,j,k,n)) + - fa(ii-1,j,k,n) + - fa(ii+2,j,k,n)); + }); + +#if (AMREX_SPACEDIM > 1) + Box ybx = bx; + auto const& xca = xtmp.const_array(); +#if (AMREX_SPACEDIM == 2) + auto const& ya = tmp.array(mfi); +#else + ybx.refine(IntVect(AMREX_D_DECL(1,1,ratio[2]))); + if (ratio[2] == 2) { ybx.grow(2,1); } + ytmp.resize(ybx,ncomp); + eli.append(ytmp.elixir()); + auto const& ya = ytmp.array(); +#endif + AMREX_HOST_DEVICE_PARALLEL_FOR_4D(ybx, ncomp, i, j, k, n, + { + int jj = 2*j; + ya(i,j,k,n) = Real(1./16)*(Real(9.)*(xca(i,jj ,k,n) + + xca(i,jj+1,k,n)) + - xca(i,jj-1,k,n) + - xca(i,jj+2,k,n)); + }); + +#if (AMREX_SPACEDIM == 3) + auto const& yca = ytmp.const_array(); + auto const& ca = tmp.array(mfi); + AMREX_HOST_DEVICE_PARALLEL_FOR_4D(bx, ncomp, i, j, k, n, + { + int kk = 2*k; + ca(i,j,k,n) = Real(1./16)*(Real(9.)*(yca(i,j,kk ,n) + + yca(i,j,kk+1,n)) + - yca(i,j,kk-1,n) + - yca(i,j,kk+2,n)); + }); +#endif +#endif + } + } + + cmf.ParallelCopy(tmp, 0, scomp, ncomp); + } } diff --git a/Src/Base/AMReX_NonLocalBC.H b/Src/Base/AMReX_NonLocalBC.H index 7613a35de5b..fd534685a7b 100644 --- a/Src/Base/AMReX_NonLocalBC.H +++ b/Src/Base/AMReX_NonLocalBC.H @@ -1038,4 +1038,13 @@ FillPolar (FabArray& mf, Box const& domain); #include +namespace amrex { + using NonLocalBC::ParallelCopy; + using NonLocalBC::ParallelCopy_nowait; + using NonLocalBC::ParallelCopy_finish; + using NonLocalBC::MultiBlockIndexMapping; + using NonLocalBC::MultiBlockCommMetaData; + using NonLocalBC::CommHandler; +} + #endif diff --git a/Src/Base/AMReX_Orientation.H b/Src/Base/AMReX_Orientation.H index 064344cafd4..de9c54a1b6c 100644 --- a/Src/Base/AMReX_Orientation.H +++ b/Src/Base/AMReX_Orientation.H @@ -75,7 +75,7 @@ public: * according to the above ordering. */ AMREX_GPU_HOST_DEVICE - operator int () const noexcept { return val; } + constexpr operator int () const noexcept { return val; } //! Return opposite orientation. AMREX_GPU_HOST_DEVICE Orientation flip () const noexcept @@ -97,6 +97,30 @@ public: //! Read from an istream. friend std::istream& operator>> (std::istream& os, Orientation& o); + //! Int value of the x-lo-face + AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE + static constexpr int xlo () noexcept { return 0; } + + //! Int value of the x-hi-face + AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE + static constexpr int xhi () noexcept { return AMREX_SPACEDIM; } + + //! Int value of the y-lo-face + AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE + static constexpr int ylo () noexcept { return 1; } + + //! Int value of the y-hi-face + AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE + static constexpr int yhi () noexcept { return 1+AMREX_SPACEDIM; } + + //! Int value of the z-lo-face + AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE + static constexpr int zlo () noexcept { return 2; } + + //! 
Int value of the z-hi-face + AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE + static constexpr int zhi () noexcept { return 2+AMREX_SPACEDIM; } + private: //! Used internally. AMREX_GPU_HOST_DEVICE diff --git a/Src/Base/AMReX_PODVector.H b/Src/Base/AMReX_PODVector.H index 7217b4e814e..bfae2c01627 100644 --- a/Src/Base/AMReX_PODVector.H +++ b/Src/Base/AMReX_PODVector.H @@ -608,7 +608,10 @@ namespace amrex void AllocateBuffer (size_type a_capacity) noexcept { pointer new_data = allocate(a_capacity); - if (m_data) detail::memCopyImpl(new_data, m_data, size() * sizeof(T), *this); + if (m_data) { + detail::memCopyImpl(new_data, m_data, size() * sizeof(T), *this); + amrex::Gpu::streamSynchronize(); + } deallocate(m_data, capacity()); m_data = new_data; m_capacity = a_capacity; @@ -621,9 +624,10 @@ namespace amrex pointer new_data = allocate(a_capacity); if (m_data) { - memCopyImpl(new_data, m_data, a_index * sizeof(T), *this); + memCopyImpl(new_data, m_data, a_index * sizeof(T), *this); memCopyImpl(new_data + a_index + a_count, m_data + a_index, (size() - a_index)*sizeof(T), *this); + amrex::Gpu::streamSynchronize(); } deallocate(m_data, capacity()); m_data = new_data; diff --git a/Src/Base/AMReX_ParallelDescriptor.H b/Src/Base/AMReX_ParallelDescriptor.H index 38cd4cdf167..03c431d135a 100644 --- a/Src/Base/AMReX_ParallelDescriptor.H +++ b/Src/Base/AMReX_ParallelDescriptor.H @@ -12,6 +12,7 @@ #include #include #include +#include #ifndef BL_AMRPROF #include @@ -211,6 +212,11 @@ while ( false ) extern AMREX_EXPORT MPI_Comm m_comm; inline MPI_Comm Communicator () noexcept { return m_comm; } +#ifdef AMREX_USE_MPI + extern Vector m_mpi_types; + extern Vector m_mpi_ops; +#endif + //! return the number of MPI ranks local to the current Parallel Context inline int NProcs () noexcept @@ -1479,6 +1485,73 @@ void DoReduce (T* r, MPI_Op op, int cnt, int cpu) #endif } +#ifdef AMREX_USE_MPI +namespace ParallelDescriptor { + +template +struct Mpi_typemap> +{ + static MPI_Datatype type () + { + static MPI_Datatype mpi_type = MPI_DATATYPE_NULL; + if (mpi_type == MPI_DATATYPE_NULL) { + using T = ValLocPair; + static_assert(std::is_trivially_copyable::value, + "To communicate with MPI, ValLocPair must be trivially copyable."); + static_assert(std::is_standard_layout::value, + "To communicate with MPI, ValLocPair must be standard layout"); + + T vlp[2]; + MPI_Datatype types[] = { + Mpi_typemap::type(), + Mpi_typemap::type(), + }; + int blocklens[] = { 1, 1 }; + MPI_Aint disp[2]; + BL_MPI_REQUIRE( MPI_Get_address(&vlp[0].value, &disp[0]) ); + BL_MPI_REQUIRE( MPI_Get_address(&vlp[0].index, &disp[1]) ); + disp[1] -= disp[0]; + disp[0] = 0; + BL_MPI_REQUIRE( MPI_Type_create_struct(2, blocklens, disp, types, + &mpi_type) ); + MPI_Aint lb, extent; + BL_MPI_REQUIRE( MPI_Type_get_extent(mpi_type, &lb, &extent) ); + if (extent != sizeof(T)) { + MPI_Datatype tmp = mpi_type; + BL_MPI_REQUIRE( MPI_Type_create_resized(tmp, 0, sizeof(vlp[0]), &mpi_type) ); + BL_MPI_REQUIRE( MPI_Type_free(&tmp) ); + } + BL_MPI_REQUIRE( MPI_Type_commit( &mpi_type ) ); + + m_mpi_types.push_back(&mpi_type); + } + return mpi_type; + } +}; + +template +MPI_Op Mpi_op () +{ + static MPI_Op mpi_op = MPI_OP_NULL; + if (mpi_op == MPI_OP_NULL) { + static auto user_fn = [] (void *invec, void *inoutvec, int* len, + MPI_Datatype * /*datatype*/) + { + auto in = static_cast(invec); + auto out = static_cast(inoutvec); + for (int i = 0; i < *len; ++i) { + out[i] = F()(in[i],out[i]); + } + }; + BL_MPI_REQUIRE( MPI_Op_create(user_fn, 1, &mpi_op) ); + 
m_mpi_ops.push_back(&mpi_op); + } + return mpi_op; +} + +} +#endif + } #endif /*BL_PARALLELDESCRIPTOR_H*/ diff --git a/Src/Base/AMReX_ParallelDescriptor.cpp b/Src/Base/AMReX_ParallelDescriptor.cpp index 6d457d28398..3ea202d9b50 100644 --- a/Src/Base/AMReX_ParallelDescriptor.cpp +++ b/Src/Base/AMReX_ParallelDescriptor.cpp @@ -65,6 +65,11 @@ namespace amrex { namespace ParallelDescriptor { MPI_Comm m_comm = MPI_COMM_NULL; // communicator for all ranks, probably MPI_COMM_WORLD +#ifdef AMREX_USE_MPI + Vector m_mpi_types; + Vector m_mpi_ops; +#endif + int m_MinTag = 1000, m_MaxTag = -1; const int ioProcessor = 0; @@ -357,10 +362,20 @@ EndParallel () BL_MPI_REQUIRE( MPI_Type_free(&mpi_type_indextype) ); BL_MPI_REQUIRE( MPI_Type_free(&mpi_type_box) ); BL_MPI_REQUIRE( MPI_Type_free(&mpi_type_lull_t) ); + for (auto t : m_mpi_types) { + BL_MPI_REQUIRE( MPI_Type_free(t) ); + *t = MPI_DATATYPE_NULL; + } + for (auto op : m_mpi_ops) { + BL_MPI_REQUIRE( MPI_Op_free(op) ); + *op = MPI_OP_NULL; + } mpi_type_intvect = MPI_DATATYPE_NULL; mpi_type_indextype = MPI_DATATYPE_NULL; mpi_type_box = MPI_DATATYPE_NULL; mpi_type_lull_t = MPI_DATATYPE_NULL; + m_mpi_types.clear(); + m_mpi_ops.clear(); } if (!call_mpi_finalize) { diff --git a/Src/Base/AMReX_ParallelReduce.H b/Src/Base/AMReX_ParallelReduce.H index e0e1e98b66e..3a6db500a2a 100644 --- a/Src/Base/AMReX_ParallelReduce.H +++ b/Src/Base/AMReX_ParallelReduce.H @@ -3,6 +3,7 @@ #include #include +#include #include #include #include @@ -120,6 +121,32 @@ namespace ParallelGather { namespace ParallelAllReduce { + template + void Max (ValLocPair& vi, MPI_Comm comm) { +#ifdef AMREX_USE_MPI + auto tmp = vi; + using T = ValLocPair; + MPI_Allreduce(&tmp, &vi, 1, + ParallelDescriptor::Mpi_typemap::type(), + ParallelDescriptor::Mpi_op>(), comm); +#else + amrex::ignore_unused(vi, comm); +#endif + } + + template + void Min (ValLocPair& vi, MPI_Comm comm) { +#ifdef AMREX_USE_MPI + auto tmp = vi; + using T = ValLocPair; + MPI_Allreduce(&tmp, &vi, 1, + ParallelDescriptor::Mpi_typemap::type(), + ParallelDescriptor::Mpi_op>(), comm); +#else + amrex::ignore_unused(vi, comm); +#endif + } + template void Max (T& v, MPI_Comm comm) { detail::Reduce(detail::ReduceOp::max, v, -1, comm); @@ -174,6 +201,34 @@ namespace ParallelAllReduce { namespace ParallelReduce { + template + void Max (ValLocPair& vi, int root, MPI_Comm comm) { +#ifdef AMREX_USE_MPI + auto tmp = vi; + using T = ValLocPair; + MPI_Reduce(&tmp, &vi, 1, + ParallelDescriptor::Mpi_typemap::type(), + ParallelDescriptor::Mpi_op>(), + root, comm); +#else + amrex::ignore_unused(vi, root, comm); +#endif + } + + template + void Min (ValLocPair& vi, int root, MPI_Comm comm) { +#ifdef AMREX_USE_MPI + auto tmp = vi; + using T = ValLocPair; + MPI_Reduce(&tmp, &vi, 1, + ParallelDescriptor::Mpi_typemap::type(), + ParallelDescriptor::Mpi_op>(), + root, comm); +#else + amrex::ignore_unused(vi, root, comm); +#endif + } + template void Max (T& v, int root, MPI_Comm comm) { detail::Reduce(detail::ReduceOp::max, v, root, comm); diff --git a/Src/Base/AMReX_ParmParse.H b/Src/Base/AMReX_ParmParse.H index 6555ee5aec0..504aaa4f256 100644 --- a/Src/Base/AMReX_ParmParse.H +++ b/Src/Base/AMReX_ParmParse.H @@ -554,7 +554,7 @@ public: const std::string& val); //! keyword for files to load - static std::string FileKeyword; + static std::string const FileKeyword; //! Add keys and values from a file to the end of the PP table. 
static void addfile (std::string const filename); diff --git a/Src/Base/AMReX_ParmParse.cpp b/Src/Base/AMReX_ParmParse.cpp index 79e80fbb8bd..253ad0e37e0 100644 --- a/Src/Base/AMReX_ParmParse.cpp +++ b/Src/Base/AMReX_ParmParse.cpp @@ -34,7 +34,7 @@ static bool finalize_verbose = false; static bool finalize_verbose = true; #endif -std::string ParmParse::FileKeyword = "FILE"; +std::string const ParmParse::FileKeyword = "FILE"; // // Used by constructor to build table. // @@ -609,7 +609,8 @@ addDefn (std::string& def, tab.push_back(ParmParse::PP_entry(def,val)); } val.clear(); - def = std::string(); + if ( def != ParmParse::FileKeyword ) + def = std::string(); } void @@ -991,7 +992,8 @@ ParmParse::prefixedName (const std::string& str) const void ParmParse::addfile (std::string const filename) { auto l = std::list<std::string>{filename}; - addDefn(FileKeyword, + auto file = FileKeyword; + addDefn(file, l, g_table); } diff --git a/Src/Base/AMReX_RandomEngine.H b/Src/Base/AMReX_RandomEngine.H index a639e4731d7..967b9e66569 100644 --- a/Src/Base/AMReX_RandomEngine.H +++ b/Src/Base/AMReX_RandomEngine.H @@ -15,7 +15,6 @@ #include #elif defined(AMREX_USE_DPCPP) #include -namespace sycl = cl::sycl; #include namespace mkl = oneapi::mkl; #endif diff --git a/Src/Base/AMReX_Reduce.H b/Src/Base/AMReX_Reduce.H index 9c07b7b4a2a..9076e984828 100644 --- a/Src/Base/AMReX_Reduce.H +++ b/Src/Base/AMReX_Reduce.H @@ -6,9 +6,11 @@ #include #include #include +#include #include #include +#include namespace amrex { @@ -133,7 +135,12 @@ struct ReduceOpMin void local_update (T& d, T const& s) const noexcept { d = amrex::min(d,s); } template <typename T> - constexpr void init (T& t) const noexcept { t = std::numeric_limits<T>::max(); } + constexpr std::enable_if_t<std::numeric_limits<T>::is_specialized> + init (T& t) const noexcept { t = std::numeric_limits<T>::max(); } + + template <typename T> + constexpr std::enable_if_t<!std::numeric_limits<T>::is_specialized> + init (T& t) const noexcept { t = T::max(); } }; struct ReduceOpMax @@ -161,7 +168,12 @@ struct ReduceOpMax void local_update (T& d, T const& s) const noexcept { d = amrex::max(d,s); } template <typename T> - constexpr void init (T& t) const noexcept { t = std::numeric_limits<T>::lowest(); } + constexpr std::enable_if_t<std::numeric_limits<T>::is_specialized> + init (T& t) const noexcept { t = std::numeric_limits<T>::lowest(); } + + template <typename T> + constexpr std::enable_if_t<!std::numeric_limits<T>::is_specialized> + init (T& t) const noexcept { t = T::lowest(); } }; struct ReduceOpLogicalAnd @@ -899,7 +911,8 @@ bool AnyOf (Box const& box, P&& pred) } }); #else - AMREX_LAUNCH_KERNEL(ec.numBlocks, ec.numThreads, 0, Gpu::gpuStream(), + AMREX_LAUNCH_KERNEL(AMREX_GPU_MAX_THREADS, ec.numBlocks, ec.numThreads, 0, + Gpu::gpuStream(), [=] AMREX_GPU_DEVICE () noexcept { __shared__ int has_any; if (threadIdx.x == 0) has_any = *dp; diff --git a/Src/Base/AMReX_RungeKutta.H b/Src/Base/AMReX_RungeKutta.H new file mode 100644 index 00000000000..b5e35f783c5 --- /dev/null +++ b/Src/Base/AMReX_RungeKutta.H @@ -0,0 +1,293 @@ +#ifndef AMREX_RUNGE_KUTTA_H_ +#define AMREX_RUNGE_KUTTA_H_ +#include + +#include + +namespace amrex { + +/** + * \brief Functions for Runge-Kutta methods + * + * This namespace RungeKutta has functions for a number of RK methods, RK2, RK3 + * and RK4. Here, RK2 refers to the explicit trapezoid rule, RK3 refers to + * the SSPRK3 + * (https://en.wikipedia.org/wiki/List_of_Runge%E2%80%93Kutta_methods#Third-order_Strong_Stability_Preserving_Runge-Kutta_(SSPRK3)), + * and RK4 is the classical fourth-order method + * (https://en.wikipedia.org/wiki/List_of_Runge%E2%80%93Kutta_methods#Classic_fourth-order_method).
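+ *
+ * As a quick illustration of the interface described below, a single RK2
+ * step for a MultiFab might look like this sketch, where compute_rhs and
+ * geom are assumed application-side objects:
+ *
+ *     RungeKutta::RK2(Sold, Snew, time, dt,
+ *         [&] (int stage, MultiFab& dudt, MultiFab const& u,
+ *              Real t, Real dtsub) { compute_rhs(dudt, u, t); },
+ *         [&] (int stage, MultiFab& u, Real t) {
+ *             u.FillBoundary(geom.periodicity()); });
+ *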
+ * The function templates take the old data in FabArray/MultiFab as input, + * and evolve the system for one time step. The result is stored in another + * FabArray/MultiFab. These two FabArrays must have ghost cells if they are + * needed for evaluating the right-hand side. The functions take three + * callable objects for computing the right-hand side, filling ghost cells, + * and optionally post-processing RK stage results. For RK3 and RK4, they + * also need a callable object for storing the data needed for filling + * coarse/fine boundaries in AMR simulations. + * + * The callable object for right-hand side has the signature of `void(int + * stage, MF& dudt, MF const& u, Real t, Real dt)`, where `stage` is the RK + * stage number starting from 1, `dudt` is the output, `u` is the input, `t` + * is the first-order approximate time of the stage, and `dt` is the + * sub-time step, which can be used for reflux operations in AMR + * simulations. + * + * The callable object for filling ghost cells has the signature of + * `void(int stage, MF& u, Real t)`, where `stage` is the RK stage number + * starting from 1, `u` is a FabArray/MultiFab whose ghost cells need to be + * filled, and `t` is the first-order approximate time of the data at that + * stage. The FillPatcher class can be useful for implementing such a + * callable. See AmrLevel::RK for an example. + * + * The callable object for post-processing stage results is optional. It's + * a no-op by default. Its function signature is `void(int stage, MF& u)`, + * where `stage` is the RK stage number and `u` is the result of that stage. + * + * For RK3 and RK4, one must also provide a callable object with the + * signature of `void(Array<MF,order> const& rkk)`, where `order` is the RK + * order and `rkk` contains the right-hand side at all the RK stages. The + * FillPatcher class can be useful for implementing such a callable. See + * AmrLevel::RK for an example. + */ +namespace RungeKutta { + +struct PostStageNoOp { + template <typename MF> + std::enable_if_t<IsFabArray<MF>::value> operator() (int, MF&) const {} +}; + +namespace detail { +//! Unew = Uold + dUdt * dt +template <typename MF> +void rk_update (MF& Unew, MF const& Uold, MF const& dUdt, Real dt) +{ + auto const& snew = Unew.arrays(); + auto const& sold = Uold.const_arrays(); + auto const& sdot = dUdt.const_arrays(); + amrex::ParallelFor(Unew, IntVect(0), Unew.nComp(), [=] AMREX_GPU_DEVICE + (int bi, int i, int j, int k, int n) noexcept + { + snew[bi](i,j,k,n) = sold[bi](i,j,k,n) + dt*sdot[bi](i,j,k,n); + }); + Gpu::streamSynchronize(); +} + +//! Unew = Uold + (dUdt1 + dUdt2) * dt +template <typename MF> +void rk_update (MF& Unew, MF const& Uold, MF const& dUdt1, MF const& dUdt2, Real dt) +{ + auto const& snew = Unew.arrays(); + auto const& sold = Uold.const_arrays(); + auto const& sdot1 = dUdt1.const_arrays(); + auto const& sdot2 = dUdt2.const_arrays(); + amrex::ParallelFor(Unew, IntVect(0), Unew.nComp(), [=] AMREX_GPU_DEVICE + (int bi, int i, int j, int k, int n) noexcept + { + snew[bi](i,j,k,n) = sold[bi](i,j,k,n) + dt*(sdot1[bi](i,j,k,n) + + sdot2[bi](i,j,k,n)); + }); + Gpu::streamSynchronize(); +} + +//!
Unew = (Uold+Unew)/2 + dUdt * dt/2 +template +void rk2_update_2 (MF& Unew, MF const& Uold, MF const& dUdt, Real dt) +{ + auto const& snew = Unew.arrays(); + auto const& sold = Uold.const_arrays(); + auto const& sdot = dUdt.const_arrays(); + amrex::ParallelFor(Unew, IntVect(0), Unew.nComp(), [=] AMREX_GPU_DEVICE + (int bi, int i, int j, int k, int n) noexcept + { + snew[bi](i,j,k,n) = Real(0.5)*(snew[bi](i,j,k,n) + + sold[bi](i,j,k,n) + + sdot[bi](i,j,k,n) * dt); + }); + Gpu::streamSynchronize(); +} + +//! Unew = Uold + (k1 + k2 + 4*k3) * dt6, where dt6 = dt/6 +template +void rk3_update_3 (MF& Unew, MF const& Uold, Array const& rkk, Real dt6) +{ + auto const& snew = Unew.arrays(); + auto const& sold = Uold.const_arrays(); + auto const& k1 = rkk[0].const_arrays(); + auto const& k2 = rkk[1].const_arrays(); + auto const& k3 = rkk[2].const_arrays(); + amrex::ParallelFor(Unew, IntVect(0), Unew.nComp(), [=] AMREX_GPU_DEVICE + (int bi, int i, int j, int k, int n) noexcept + { + snew[bi](i,j,k,n) = sold[bi](i,j,k,n) + + dt6 * (k1[bi](i,j,k,n) + k2[bi](i,j,k,n) + + Real(4.) * k3[bi](i,j,k,n)); + }); + Gpu::streamSynchronize(); +} + +//! Unew = Uold + (k1+k4+2*(k2+k3))*dt6, where dt6 = dt/6 +template +void rk4_update_4 (MF& Unew, MF const& Uold, Array const& rkk, Real dt6) +{ + auto const& snew = Unew.arrays(); + auto const& sold = Uold.const_arrays(); + auto const& k1 = rkk[0].const_arrays(); + auto const& k2 = rkk[1].const_arrays(); + auto const& k3 = rkk[2].const_arrays(); + auto const& k4 = rkk[3].const_arrays(); + amrex::ParallelFor(Unew, IntVect(0), Unew.nComp(), [=] AMREX_GPU_DEVICE + (int bi, int i, int j, int k, int n) noexcept + { + snew[bi](i,j,k,n) = sold[bi](i,j,k,n) + + dt6 * ( k1[bi](i,j,k,n) + k4[bi](i,j,k,n) + + Real(2.)*(k2[bi](i,j,k,n) + k3[bi](i,j,k,n))); + }); + Gpu::streamSynchronize(); +} +} + +/** + * \brief Time stepping with RK2 + * + * \param Uold input FabArray/MultiFab data at time + * \param Unew output FabArray/MultiFab data at time+dt + * \param time time at the beginning of the step + * \param dt time step + * \param frhs computing the right-hand side + * \param fillbndry filling ghost cells + * \param post_stage post-processing stage results + */ +template +void RK2 (MF& Uold, MF& Unew, Real time, Real dt, F&& frhs, FB&& fillbndry, + P&& post_stage = PostStageNoOp()) +{ + BL_PROFILE("RungeKutta2"); + + MF dUdt(Unew.boxArray(), Unew.DistributionMap(), Unew.nComp(), 0, + MFInfo(), Unew.Factory()); + + // RK2 stage 1 + fillbndry(1, Uold, time); + frhs(1, dUdt, Uold, time, Real(0.5)*dt); + // Unew = Uold + dt * dUdt + detail::rk_update(Unew, Uold, dUdt, dt); + post_stage(1, Unew); + + // RK2 stage 2 + fillbndry(2, Unew, time+dt); + frhs(2, dUdt, Unew, time, Real(0.5)*dt); + // Unew = (Uold+Unew)/2 + dUdt_2 * dt/2, + // which is Unew = Uold + dt/2 * (dUdt_1 + dUdt_2) + detail::rk2_update_2(Unew, Uold, dUdt, dt); + post_stage(2, Unew); +} + +/** + * \brief Time stepping with RK3 + * + * \param Uold input FabArray/MultiFab data at time + * \param Unew output FabArray/MultiFab data at time+dt + * \param time time at the beginning of the step + * \param dt time step + * \param frhs computing the right-hand side + * \param fillbndry filling ghost cells + * \param store_crse_data storing right-hand side data for AMR + * \param post_stage post-processing stage results + */ +template +void RK3 (MF& Uold, MF& Unew, Real time, Real dt, F&& frhs, FB&& fillbndry, + R&& store_crse_data, P&& post_stage = PostStageNoOp()) +{ + BL_PROFILE("RungeKutta3"); + + Array rkk; + for (auto& mf : 
rkk) { + mf.define(Unew.boxArray(), Unew.DistributionMap(), Unew.nComp(), 0, + MFInfo(), Unew.Factory()); + } + + // RK3 stage 1 + fillbndry(1, Uold, time); + frhs(1, rkk[0], Uold, time, dt/Real(6.)); + // Unew = Uold + k1 * dt + detail::rk_update(Unew, Uold, rkk[0], dt); + post_stage(1, Unew); + + // RK3 stage 2 + fillbndry(2, Unew, time+dt); + frhs(2, rkk[1], Unew, time+dt, dt/Real(6.)); + // Unew = Uold + (k1+k2) * dt/4 + detail::rk_update(Unew, Uold, rkk[0], rkk[1], Real(0.25)*dt); + post_stage(2, Unew); + + // RK3 stage 3 + Real t_half = time + Real(0.5)*dt; + fillbndry(3, Unew, t_half); + frhs(3, rkk[2], Unew, t_half, dt*Real(2./3.)); + // Unew = Uold + (k1/6 + k2/6 + k3*(2/3)) * dt + detail::rk3_update_3(Unew, Uold, rkk, Real(1./6.)*dt); + post_stage(3, Unew); + + store_crse_data(rkk); +} + +/** + * \brief Time stepping with RK4 + * + * \param Uold input FabArray/MultiFab data at time + * \param Unew output FabArray/MultiFab data at time+dt + * \param time time at the beginning of the step + * \param dt time step + * \param frhs computing the right-hand side + * \param fillbndry filling ghost cells + * \param store_crse_data storing right-hand side data for AMR + * \param post_stage post-processing stage results + */ +template +void RK4 (MF& Uold, MF& Unew, Real time, Real dt, F&& frhs, FB&& fillbndry, + R&& store_crse_data, P&& post_stage = PostStageNoOp()) +{ + BL_PROFILE("RungeKutta4"); + + Array rkk; + for (auto& mf : rkk) { + mf.define(Unew.boxArray(), Unew.DistributionMap(), Unew.nComp(), 0, + MFInfo(), Unew.Factory()); + } + + // RK4 stage 1 + fillbndry(1, Uold, time); + frhs(1, rkk[0], Uold, time, dt/Real(6.)); + // Unew = Uold + k1 * dt/2 + detail::rk_update(Unew, Uold, rkk[0], Real(0.5)*dt); + post_stage(1, Unew); + + // RK4 stage 2 + Real t_half = time + Real(0.5)*dt; + fillbndry(2, Unew, t_half); + frhs(2, rkk[1], Unew, t_half, dt/Real(3.)); + // Unew = Uold + k2 * dt/2 + detail::rk_update(Unew, Uold, rkk[1], Real(0.5)*dt); + post_stage(2, Unew); + + // RK4 stage 3 + fillbndry(3, Unew, t_half); + frhs(3, rkk[2], Unew, t_half, dt/Real(3.)); + // Unew = Uold + k3 * dt; + detail::rk_update(Unew, Uold, rkk[2], dt); + post_stage(3, Unew); + + // RK4 stage 4 + fillbndry(4, Unew, time+dt); + frhs(4, rkk[3], Unew, time+dt, dt/Real(6.)); + // Unew = Uold + (k1/6 + k2/3 + k3/3 + k4/6) * dt + detail::rk4_update_4(Unew, Uold, rkk, Real(1./6.)*dt); + post_stage(4, Unew); + + store_crse_data(rkk); +} + +}} + +#endif diff --git a/Src/Base/AMReX_Scan.H b/Src/Base/AMReX_Scan.H index 96aefb870b6..3dc5cb98f9a 100644 --- a/Src/Base/AMReX_Scan.H +++ b/Src/Base/AMReX_Scan.H @@ -197,7 +197,7 @@ T PrefixSum_mp (N n, FIN && fin, FOUT && fout, TYPE, RetSum a_ret_sum) amrex::launch(nblocks, nthreads, sm, stream, [=] AMREX_GPU_DEVICE (Gpu::Handler const& gh) noexcept { - amrex::oneapi::sub_group const& sg = gh.item->get_sub_group(); + sycl::sub_group const& sg = gh.item->get_sub_group(); int lane = sg.get_local_id()[0]; int warp = sg.get_group_id()[0]; int nwarps = sg.get_group_range()[0]; @@ -226,7 +226,7 @@ T PrefixSum_mp (N n, FIN && fin, FOUT && fout, TYPE, RetSum a_ret_sum) T x = x0; // Scan within a warp for (int i = 1; i <= Gpu::Device::warp_size; i *= 2) { - T s = sg.shuffle_up(x, i); + T s = sycl::shift_group_right(sg, x, i); if (lane >= i) x += s; } @@ -244,7 +244,7 @@ T PrefixSum_mp (N n, FIN && fin, FOUT && fout, TYPE, RetSum a_ret_sum) if (warp == 0) { T y = (lane < nwarps) ? 
shared[lane] : 0; for (int i = 1; i <= Gpu::Device::warp_size; i *= 2) { - T s = sg.shuffle_up(y, i); + T s = sycl::shift_group_right(sg, y, i); if (lane >= i) y += s; } @@ -277,7 +277,7 @@ T PrefixSum_mp (N n, FIN && fin, FOUT && fout, TYPE, RetSum a_ret_sum) amrex::launch(1, nthreads, sm, stream, [=] AMREX_GPU_DEVICE (Gpu::Handler const& gh) noexcept { - amrex::oneapi::sub_group const& sg = gh.item->get_sub_group(); + sycl::sub_group const& sg = gh.item->get_sub_group(); int lane = sg.get_local_id()[0]; int warp = sg.get_group_id()[0]; int nwarps = sg.get_group_range()[0]; @@ -293,7 +293,7 @@ T PrefixSum_mp (N n, FIN && fin, FOUT && fout, TYPE, RetSum a_ret_sum) T x = (offset < nblocks) ? blocksum_p[offset] : 0; // Scan within a warp for (int i = 1; i <= Gpu::Device::warp_size; i *= 2) { - T s = sg.shuffle_up(x, i); + T s = sycl::shift_group_right(sg, x, i); if (lane >= i) x += s; } @@ -311,7 +311,7 @@ T PrefixSum_mp (N n, FIN && fin, FOUT && fout, TYPE, RetSum a_ret_sum) if (warp == 0) { T y = (lane < nwarps) ? shared[lane] : 0; for (int i = 1; i <= Gpu::Device::warp_size; i *= 2) { - T s = sg.shuffle_up(y, i); + T s = sycl::shift_group_right(sg, y, i); if (lane >= i) y += s; } @@ -417,7 +417,7 @@ T PrefixSum (N n, FIN && fin, FOUT && fout, TYPE type, RetSum a_ret_sum = retSum amrex::launch(nblocks, nthreads, sm, stream, [=] AMREX_GPU_DEVICE (Gpu::Handler const& gh) noexcept { - amrex::oneapi::sub_group const& sg = gh.item->get_sub_group(); + sycl::sub_group const& sg = gh.item->get_sub_group(); int lane = sg.get_local_id()[0]; int warp = sg.get_group_id()[0]; int nwarps = sg.get_group_range()[0]; @@ -472,7 +472,7 @@ T PrefixSum (N n, FIN && fin, FOUT && fout, TYPE type, RetSum a_ret_sum = retSum T x = x0; // Scan within a warp for (int i = 1; i <= Gpu::Device::warp_size; i *= 2) { - T s = sg.shuffle_up(x, i); + T s = sycl::shift_group_right(sg, x, i); if (lane >= i) x += s; } @@ -490,7 +490,7 @@ T PrefixSum (N n, FIN && fin, FOUT && fout, TYPE type, RetSum a_ret_sum = retSum if (warp == 0) { T y = (lane < nwarps) ? shared[lane] : 0; for (int i = 1; i <= Gpu::Device::warp_size; i *= 2) { - T s = sg.shuffle_up(y, i); + T s = sycl::shift_group_right(sg, y, i); if (lane >= i) y += s; } @@ -543,7 +543,7 @@ T PrefixSum (N n, FIN && fin, FOUT && fout, TYPE type, RetSum a_ret_sum = retSum // implement our own __ballot unsigned status_bf = (stva.status == 'p') ? 
(0x1u << lane) : 0; for (int i = 1; i < Gpu::Device::warp_size; i *= 2) { - status_bf |= sg.shuffle_xor(status_bf, i); + status_bf |= sycl::permute_group_by_xor(sg, status_bf, i); } bool stop_lookback = status_bf & 0x1u; @@ -563,7 +563,7 @@ T PrefixSum (N n, FIN && fin, FOUT && fout, TYPE type, RetSum a_ret_sum = retSum } for (int i = Gpu::Device::warp_size/2; i > 0; i /= 2) { - x += sg.shuffle_down(x,i); + x += sycl::shift_group_left(sg, x,i); } } diff --git a/Src/Base/AMReX_TableData.H b/Src/Base/AMReX_TableData.H index e44758bde6d..f44157160a7 100644 --- a/Src/Base/AMReX_TableData.H +++ b/Src/Base/AMReX_TableData.H @@ -77,8 +77,8 @@ struct Table2D { T* AMREX_RESTRICT p = nullptr; Long jstride = 0; - GpuArray begin{1,1}; - GpuArray end{0,0}; + GpuArray begin{{1,1}}; + GpuArray end{{0,0}}; AMREX_GPU_HOST_DEVICE constexpr Table2D () noexcept {} @@ -142,8 +142,8 @@ struct Table3D T* AMREX_RESTRICT p = nullptr; Long jstride = 0; Long kstride = 0; - GpuArray begin{1,1,1}; - GpuArray end{0,0,0}; + GpuArray begin{{1,1,1}}; + GpuArray end{{0,0,0}}; AMREX_GPU_HOST_DEVICE constexpr Table3D () noexcept {} @@ -213,8 +213,8 @@ struct Table4D Long jstride = 0; Long kstride = 0; Long nstride = 0; - GpuArray begin{1,1,1,1}; - GpuArray end{0,0,0,0}; + GpuArray begin{{1,1,1,1}}; + GpuArray end{{0,0,0,0}}; AMREX_GPU_HOST_DEVICE constexpr Table4D () noexcept {} diff --git a/Src/Base/AMReX_TinyProfiler.H b/Src/Base/AMReX_TinyProfiler.H index 677b4448d3b..57c9ea0479c 100644 --- a/Src/Base/AMReX_TinyProfiler.H +++ b/Src/Base/AMReX_TinyProfiler.H @@ -10,7 +10,7 @@ #endif #if defined(AMREX_USE_HIP) && defined(AMREX_USE_ROCTX) -#include +#include #endif #include diff --git a/Src/Base/AMReX_ValLocPair.H b/Src/Base/AMReX_ValLocPair.H new file mode 100644 index 00000000000..b7b480b1dba --- /dev/null +++ b/Src/Base/AMReX_ValLocPair.H @@ -0,0 +1,35 @@ +#ifndef AMREX_VALLOCPAIR_H_ +#define AMREX_VALLOCPAIR_H_ + +#include + +namespace amrex { + +template +struct ValLocPair +{ + TV value; + TI index; + + static constexpr ValLocPair max () { + return ValLocPair{std::numeric_limits::max(), TI()}; + } + + static constexpr ValLocPair lowest () { + return ValLocPair{std::numeric_limits::lowest(), TI()}; + } + + friend constexpr bool operator< (ValLocPair const& a, ValLocPair const& b) + { + return a.value < b.value; + } + + friend constexpr bool operator> (ValLocPair const& a, ValLocPair const& b) + { + return a.value > b.value; + } +}; + +} + +#endif diff --git a/Src/Base/AMReX_VisMF.H b/Src/Base/AMReX_VisMF.H index 12777a08307..bfab54abf8d 100644 --- a/Src/Base/AMReX_VisMF.H +++ b/Src/Base/AMReX_VisMF.H @@ -638,7 +638,6 @@ Read (FabArray& fa, const std::string& name) } int totalioreqs = nboxes; - int messtotal = 0; int reqspending = 0; int iopfileindex; std::deque iopreads; @@ -669,7 +668,6 @@ Read (FabArray& fa, const std::string& name) } } else { ParallelDescriptor::Send(vreads, tryproc, readtag); - ++messtotal; ++reqspending; } availablefiles.erase(afilesiter); diff --git a/Src/Base/AMReX_bc_types_mod.F90 b/Src/Base/AMReX_bc_types_mod.F90 index c326d49e419..c1c6f237ba8 100644 --- a/Src/Base/AMReX_bc_types_mod.F90 +++ b/Src/Base/AMReX_bc_types_mod.F90 @@ -15,6 +15,9 @@ module amrex_bc_types_module integer, parameter, public :: amrex_bc_ext_dir = 3 integer, parameter, public :: amrex_bc_hoextrap = 4 integer, parameter, public :: amrex_bc_hoextrapcc = 5 + integer, parameter, public :: amrex_bc_user_1 = 1001 + integer, parameter, public :: amrex_bc_user_2 = 1002 + integer, parameter, public :: amrex_bc_user_3 = 1003 integer, 
parameter, public :: amrex_pbc_interior = 0 integer, parameter, public :: amrex_pbc_inflow = 1 diff --git a/Src/Base/CMakeLists.txt b/Src/Base/CMakeLists.txt index 6a2db4526cd..7af11a24b5a 100644 --- a/Src/Base/CMakeLists.txt +++ b/Src/Base/CMakeLists.txt @@ -30,6 +30,7 @@ target_sources( amrex AMReX_Utility.cpp AMReX_FileSystem.H AMReX_FileSystem.cpp + AMReX_ValLocPair.H AMReX_Reduce.H AMReX_Scan.H AMReX_Partition.H @@ -71,6 +72,7 @@ target_sources( amrex AMReX_DataAllocator.H AMReX_BLProfiler.H AMReX_BLBackTrace.H + AMReX_BLBackTrace.cpp AMReX_BLFort.H AMReX_NFiles.H AMReX_NFiles.cpp @@ -187,6 +189,7 @@ target_sources( amrex AMReX_IntegratorBase.H AMReX_RKIntegrator.H AMReX_TimeIntegrator.H + AMReX_RungeKutta.H # GPU -------------------------------------------------------------------- AMReX_Gpu.H AMReX_GpuQualifiers.H @@ -222,6 +225,7 @@ target_sources( amrex AMReX_MFParallelForC.H AMReX_MFParallelForG.H AMReX_TagParallelFor.H + AMReX_CTOParallelForImpl.H AMReX_ParReduce.H # CUDA -------------------------------------------------------------------- AMReX_CudaGraph.H @@ -231,8 +235,6 @@ target_sources( amrex # Memory pool ------------------------------------------------------------- AMReX_MemPool.cpp AMReX_MemPool.H - # Profiling --------------------------------------------------------------- - AMReX_BLBackTrace.cpp # Parser --------------------------------------------------------------- Parser/AMReX_Parser.cpp Parser/AMReX_Parser.H @@ -305,3 +307,8 @@ endif () if (AMReX_TINY_PROFILE) target_sources(amrex PRIVATE AMReX_TinyProfiler.cpp AMReX_TinyProfiler.H ) endif () + +# MPMD +if (AMReX_MPI) + target_sources(amrex PRIVATE AMReX_MPMD.cpp AMReX_MPMD.H ) +endif () diff --git a/Src/Base/Make.package b/Src/Base/Make.package index d7c4e520e7b..9dd615b3251 100644 --- a/Src/Base/Make.package +++ b/Src/Base/Make.package @@ -22,6 +22,7 @@ C$(AMREX_BASE)_sources += AMReX_BlockMutex.cpp C$(AMREX_BASE)_sources += AMReX_ParmParse.cpp AMReX_parmparse_fi.cpp AMReX_Utility.cpp C$(AMREX_BASE)_headers += AMReX_ParmParse.H AMReX_Utility.H AMReX_BLassert.H AMReX_ArrayLim.H C$(AMREX_BASE)_headers += AMReX_Functional.H AMReX_Reduce.H AMReX_Scan.H AMReX_Partition.H +C$(AMREX_BASE)_headers += AMReX_ValLocPair.H C$(AMREX_BASE)_headers += AMReX_FileSystem.H C$(AMREX_BASE)_sources += AMReX_FileSystem.cpp @@ -100,6 +101,7 @@ C$(AMREX_BASE)_headers += AMReX_MFParallelForC.H C$(AMREX_BASE)_headers += AMReX_MFParallelForG.H C$(AMREX_BASE)_headers += AMReX_TagParallelFor.H +C$(AMREX_BASE)_headers += AMReX_CTOParallelForImpl.H C$(AMREX_BASE)_headers += AMReX_ParReduce.H @@ -203,7 +205,7 @@ C$(AMREX_BASE)_headers += AMReX_FEIntegrator.H C$(AMREX_BASE)_headers += AMReX_IntegratorBase.H C$(AMREX_BASE)_headers += AMReX_RKIntegrator.H C$(AMREX_BASE)_headers += AMReX_TimeIntegrator.H - +C$(AMREX_BASE)_headers += AMReX_RungeKutta.H # # Fortran interface routines. 
@@ -271,6 +273,10 @@ CEXE_sources += AMReX_Machine.cpp # Forward declaration CEXE_headers += AMReX_BaseFwd.H +ifeq ($(USE_MPI),TRUE) + CEXE_headers += AMReX_MPMD.H + CEXE_sources += AMReX_MPMD.cpp +endif VPATH_LOCATIONS += $(AMREX_HOME)/Src/Base INCLUDE_LOCATIONS += $(AMREX_HOME)/Src/Base diff --git a/Src/Boundary/AMReX_LOUtil_K.H b/Src/Boundary/AMReX_LOUtil_K.H index b8fdb2a37ce..71bb1dd41d1 100644 --- a/Src/Boundary/AMReX_LOUtil_K.H +++ b/Src/Boundary/AMReX_LOUtil_K.H @@ -34,6 +34,22 @@ void poly_interp_coeff (Real xInt, Real const* AMREX_RESTRICT x, int N, Real* AM } } +template +AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE +void poly_interp_coeff (Real xInt, Real const* AMREX_RESTRICT x, Real* AMREX_RESTRICT c) noexcept +{ + for (int j = 0; j < N; ++j) { + Real num = 1.0, den = 1.0; + for (int i = 0; i < N; ++i) { + if (i != j) { + num *= xInt-x[i]; + den *= x[j]-x[i]; + } + } + c[j] = num / den; + } +} + } #endif diff --git a/Src/EB/AMReX_EB2.H b/Src/EB/AMReX_EB2.H index ad56d532520..def8d2de9e0 100644 --- a/Src/EB/AMReX_EB2.H +++ b/Src/EB/AMReX_EB2.H @@ -49,6 +49,7 @@ public: virtual const Level& getLevel (const Geometry & geom) const = 0; virtual const Geometry& getGeometry (const Box& domain) const = 0; virtual const Box& coarsestDomain () const = 0; + virtual void addFineLevels (int num_new_fine_levels) = 0; protected: static AMREX_EXPORT Vector > m_instance; @@ -66,7 +67,7 @@ public: IndexSpaceImp (const G& gshop, const Geometry& geom, int required_coarsening_level, int max_coarsening_level, int ngrow, bool build_coarse_level_by_coarsening, - bool extend_domain_face); + bool extend_domain_face, int num_coarsen_opt); IndexSpaceImp (IndexSpaceImp const&) = delete; IndexSpaceImp (IndexSpaceImp &&) = delete; @@ -80,46 +81,67 @@ public: virtual const Box& coarsestDomain () const final { return m_geom.back().Domain(); } + virtual void addFineLevels (int num_new_fine_levels) final; using F = typename G::FunctionType; private: + G m_gshop; + bool m_build_coarse_level_by_coarsening; + bool m_extend_domain_face; + int m_num_coarsen_opt; + Vector > m_gslevel; Vector m_geom; Vector m_domain; Vector m_ngrow; - std::unique_ptr m_impfunc; }; #include bool ExtendDomainFace (); +int NumCoarsenOpt (); template void Build (const G& gshop, const Geometry& geom, int required_coarsening_level, int max_coarsening_level, int ngrow = 4, bool build_coarse_level_by_coarsening = true, - bool extend_domain_face = ExtendDomainFace()) + bool extend_domain_face = ExtendDomainFace(), + int num_coarsen_opt = NumCoarsenOpt()) { BL_PROFILE("EB2::Initialize()"); IndexSpace::push(new IndexSpaceImp(gshop, geom, required_coarsening_level, max_coarsening_level, ngrow, build_coarse_level_by_coarsening, - extend_domain_face)); + extend_domain_face, + num_coarsen_opt)); } void Build (const Geometry& geom, int required_coarsening_level, int max_coarsening_level, int ngrow = 4, - bool build_coarse_level_by_coarsening = true); + bool build_coarse_level_by_coarsening = true, + bool extend_domain_face = ExtendDomainFace(), + int num_coarsen_opt = NumCoarsenOpt()); + + +void BuildFromChkptFile (std::string const& fname, + const Geometry& geom, + int required_coarsening_level, + int max_coarsening_level, + int ngrow = 4, + bool build_coarse_level_by_coarsening = true, + bool extend_domain_face = ExtendDomainFace()); int maxCoarseningLevel (const Geometry& geom); int maxCoarseningLevel (IndexSpace const* ebis, const Geometry& geom); +void addFineLevels (int num_new_fine_levels); + }} #endif diff --git a/Src/EB/AMReX_EB2.cpp 
b/Src/EB/AMReX_EB2.cpp index 3bdf44ee4e9..fc2d75e0a01 100644 --- a/Src/EB/AMReX_EB2.cpp +++ b/Src/EB/AMReX_EB2.cpp @@ -11,6 +11,7 @@ #include #include #include +#include #include #include #include @@ -21,12 +22,14 @@ AMREX_EXPORT Vector > IndexSpace::m_instance; AMREX_EXPORT int max_grid_size = 64; AMREX_EXPORT bool extend_domain_face = true; +AMREX_EXPORT int num_coarsen_opt = 0; void Initialize () { ParmParse pp("eb2"); pp.queryAdd("max_grid_size", max_grid_size); pp.queryAdd("extend_domain_face", extend_domain_face); + pp.queryAdd("num_coarsen_opt", num_coarsen_opt); amrex::ExecOnFinalize(Finalize); } @@ -41,6 +44,11 @@ bool ExtendDomainFace () return extend_domain_face; } +int NumCoarsenOpt () +{ + return num_coarsen_opt; +} + void IndexSpace::push (IndexSpace* ispace) { @@ -74,7 +82,8 @@ const IndexSpace* TopIndexSpaceIfPresent() noexcept { void Build (const Geometry& geom, int required_coarsening_level, - int max_coarsening_level, int ngrow, bool build_coarse_level_by_coarsening) + int max_coarsening_level, int ngrow, bool build_coarse_level_by_coarsening, + bool a_extend_domain_face, int a_num_coarsen_opt) { ParmParse pp("eb2"); std::string geom_type; @@ -85,7 +94,8 @@ Build (const Geometry& geom, int required_coarsening_level, EB2::AllRegularIF rif; EB2::GeometryShop gshop(rif); EB2::Build(gshop, geom, required_coarsening_level, - max_coarsening_level, ngrow, build_coarse_level_by_coarsening); + max_coarsening_level, ngrow, build_coarse_level_by_coarsening, + a_extend_domain_face, a_num_coarsen_opt); } else if (geom_type == "box") { @@ -102,7 +112,8 @@ Build (const Geometry& geom, int required_coarsening_level, EB2::GeometryShop gshop(bf); EB2::Build(gshop, geom, required_coarsening_level, - max_coarsening_level, ngrow, build_coarse_level_by_coarsening); + max_coarsening_level, ngrow, build_coarse_level_by_coarsening, + a_extend_domain_face, a_num_coarsen_opt); } else if (geom_type == "cylinder") { @@ -127,7 +138,8 @@ Build (const Geometry& geom, int required_coarsening_level, EB2::GeometryShop gshop(cf); EB2::Build(gshop, geom, required_coarsening_level, - max_coarsening_level, ngrow, build_coarse_level_by_coarsening); + max_coarsening_level, ngrow, build_coarse_level_by_coarsening, + a_extend_domain_face, a_num_coarsen_opt); } else if (geom_type == "plane") { @@ -141,7 +153,8 @@ Build (const Geometry& geom, int required_coarsening_level, EB2::GeometryShop gshop(pf); EB2::Build(gshop, geom, required_coarsening_level, - max_coarsening_level, ngrow, build_coarse_level_by_coarsening); + max_coarsening_level, ngrow, build_coarse_level_by_coarsening, + a_extend_domain_face, a_num_coarsen_opt); } else if (geom_type == "sphere") { @@ -158,7 +171,8 @@ Build (const Geometry& geom, int required_coarsening_level, EB2::GeometryShop gshop(sf); EB2::Build(gshop, geom, required_coarsening_level, - max_coarsening_level, ngrow, build_coarse_level_by_coarsening); + max_coarsening_level, ngrow, build_coarse_level_by_coarsening, + a_extend_domain_face, a_num_coarsen_opt); } else if (geom_type == "torus") { @@ -177,7 +191,8 @@ Build (const Geometry& geom, int required_coarsening_level, EB2::GeometryShop gshop(sf); EB2::Build(gshop, geom, required_coarsening_level, - max_coarsening_level, ngrow, build_coarse_level_by_coarsening); + max_coarsening_level, ngrow, build_coarse_level_by_coarsening, + a_extend_domain_face, a_num_coarsen_opt); } else if (geom_type == "parser") { @@ -188,7 +203,8 @@ Build (const Geometry& geom, int required_coarsening_level, EB2::ParserIF pif(parser.compile<3>()); 
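Every `geom_type` branch of `EB2::Build` in the hunk that follows forwards the two new trailing arguments, so the runtime defaults reach `IndexSpaceImp` unchanged. From an application's point of view the new knob can be set either in an inputs file (`eb2.num_coarsen_opt = 2`, picked up by `Initialize()`) or passed explicitly. A hedged usage sketch, assuming an initialized AMReX application in which `geom` is the finest-level `Geometry`; the sphere radius and center are illustrative values only:

    #include <AMReX_EB2.H>
    #include <AMReX_EB2_IF_Sphere.H>

    void build_eb (amrex::Geometry const& geom)
    {
        // Solid sphere (fluid outside); values are made up for illustration.
        amrex::EB2::SphereIF sphere(0.25, {AMREX_D_DECL(0.5, 0.5, 0.5)}, false);
        auto gshop = amrex::EB2::makeShop(sphere);
        amrex::EB2::Build(gshop, geom,
                          0,     // required_coarsening_level
                          4,     // max_coarsening_level
                          4,     // ngrow
                          true,  // build_coarse_level_by_coarsening
                          amrex::EB2::ExtendDomainFace(),  // keep the global default
                          2);    // num_coarsen_opt: start the box search 4x coarser
    }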
EB2::GeometryShop gshop(pif,parser); EB2::Build(gshop, geom, required_coarsening_level, - max_coarsening_level, ngrow, build_coarse_level_by_coarsening); + max_coarsening_level, ngrow, build_coarse_level_by_coarsening, + a_extend_domain_face, a_num_coarsen_opt); } else if (geom_type == "stl") { @@ -206,7 +222,8 @@ Build (const Geometry& geom, int required_coarsening_level, geom, required_coarsening_level, max_coarsening_level, ngrow, build_coarse_level_by_coarsening, - extend_domain_face)); + a_extend_domain_face, + a_num_coarsen_opt)); } else { @@ -214,6 +231,29 @@ Build (const Geometry& geom, int required_coarsening_level, } } +void addFineLevels (int num_new_fine_levels) +{ + BL_PROFILE("EB2::addFineLevels()"); + auto p = const_cast(TopIndexSpace()); + if (p) { + p->addFineLevels(num_new_fine_levels); + } +} + +void +BuildFromChkptFile (std::string const& fname, + const Geometry& geom, int required_coarsening_level, + int max_coarsening_level, int ngrow, bool build_coarse_level_by_coarsening, + bool a_extend_domain_face) +{ + ChkptFile chkpt_file(fname); + IndexSpace::push(new IndexSpaceChkptFile(chkpt_file, + geom, required_coarsening_level, + max_coarsening_level, ngrow, + build_coarse_level_by_coarsening, + a_extend_domain_face)); +} + namespace { static int comp_max_crse_level (Box cdomain, const Box& domain) { diff --git a/Src/EB/AMReX_EB2_2D_C.cpp b/Src/EB/AMReX_EB2_2D_C.cpp index bf17844658c..060ed8f4df4 100644 --- a/Src/EB/AMReX_EB2_2D_C.cpp +++ b/Src/EB/AMReX_EB2_2D_C.cpp @@ -391,6 +391,13 @@ void build_cells (Box const& bx, Array4 const& cell, }); } + set_connection_flags(bxg1, cell, fx, fy); +} + +void set_connection_flags (Box const& bxg1, + Array4 const& cell, + Array4 const& fx, Array4 const& fy) noexcept +{ // Build neighbors. By default, all neighbors are already set. AMREX_HOST_DEVICE_FOR_3D ( bxg1, i, j, k, { diff --git a/Src/EB/AMReX_EB2_3D_C.H b/Src/EB/AMReX_EB2_3D_C.H index 14543f81d25..3ea77f149fe 100644 --- a/Src/EB/AMReX_EB2_3D_C.H +++ b/Src/EB/AMReX_EB2_3D_C.H @@ -200,11 +200,8 @@ int check_mvmc (int i, int j, int k, Array4 const& fine) nxm = 0; } else if (n == 2) { nxm = 1; - } else if (n == 4) { - ierr = 1; } else { ierr = 1; - amrex::Abort("amrex::check_mvmc: how did this happen? wrong number of cuts on xlo-face"); } int nxp = -1; @@ -213,11 +210,8 @@ int check_mvmc (int i, int j, int k, Array4 const& fine) nxp = 0; } else if (n == 2) { nxp = 1; - } else if (n == 4) { - ierr = 1; } else { ierr = 1; - amrex::Abort("amrex::check_mvmc: how did this happen? wrong number of cuts on xhi-face"); } // y-faces @@ -227,11 +221,8 @@ int check_mvmc (int i, int j, int k, Array4 const& fine) nym = 0; } else if (n == 2) { nym = 1; - } else if (n == 4) { - ierr = 1; } else { ierr = 1; - amrex::Abort("amrex::check_mvmc: how did this happen? wrong number of cuts on ylo-face"); } int nyp = -1; @@ -240,11 +231,8 @@ int check_mvmc (int i, int j, int k, Array4 const& fine) nyp = 0; } else if (n == 2) { nyp = 1; - } else if (n == 4) { - ierr = 1; } else { ierr = 1; - amrex::Abort("amrex::check_mvmc: how did this happen? wrong number of cuts on yhi-face"); } // z-faces @@ -254,11 +242,8 @@ int check_mvmc (int i, int j, int k, Array4 const& fine) nzm = 0; } else if (n == 2) { nzm = 1; - } else if (n == 4) { - ierr = 1; } else { ierr = 1; - amrex::Abort("amrex::check_mvmc: how did this happen? 
wrong number of cuts on zlo-face"); } int nzp = -1; @@ -267,11 +252,8 @@ int check_mvmc (int i, int j, int k, Array4 const& fine) nzp = 0; } else if (n == 2) { nzp = 1; - } else if (n == 4) { - ierr = 1; } else { ierr = 1; - amrex::Abort("amrex::check_mvmc: how did this happen? wrong number of cuts on zhi-face"); } if (nxm == 1 && nym == 1 && nzm == 1 && nxp == 1 && nyp == 1 && nzp == 1) { diff --git a/Src/EB/AMReX_EB2_3D_C.cpp b/Src/EB/AMReX_EB2_3D_C.cpp index 0077d817ae4..767626eb9e9 100644 --- a/Src/EB/AMReX_EB2_3D_C.cpp +++ b/Src/EB/AMReX_EB2_3D_C.cpp @@ -853,89 +853,96 @@ void build_cells (Box const& bx, Array4 const& cell, nsmallcells += hp[0]; nmulticuts += hp[1]; + Box const& nbxg1 = amrex::surroundingNodes(bxg1); + Box const& bxg1x = amrex::surroundingNodes(bxg1,0); + Box const& bxg1y = amrex::surroundingNodes(bxg1,1); + Box const& bxg1z = amrex::surroundingNodes(bxg1,2); + AMREX_HOST_DEVICE_FOR_3D(nbxg1, i, j, k, + { + if (levset(i,j,k) < Real(0.0)) { + bool zero_levset = false; + if (bxg1.contains(i-1,j-1,k-1) + && cell(i-1,j-1,k-1).isCovered()) { + zero_levset = true; + } else if (bxg1.contains(i ,j-1,k-1) + && cell(i ,j-1,k-1).isCovered()) { + zero_levset = true; + } else if (bxg1.contains(i-1,j ,k-1) + && cell(i-1,j ,k-1).isCovered()) { + zero_levset = true; + } else if (bxg1.contains(i ,j ,k-1) + && cell(i ,j ,k-1).isCovered()) { + zero_levset = true; + } else if (bxg1.contains(i-1,j-1,k ) + && cell(i-1,j-1,k ).isCovered()) { + zero_levset = true; + } else if (bxg1.contains(i ,j-1,k ) + && cell(i ,j-1,k ).isCovered()) { + zero_levset = true; + } else if (bxg1.contains(i-1,j ,k ) + && cell(i-1,j ,k ).isCovered()) { + zero_levset = true; + } else if (bxg1.contains(i ,j ,k ) + && cell(i ,j ,k ).isCovered()) { + zero_levset = true; + } else if (bxg1x.contains(i ,j-1,k-1) + && fx(i ,j-1,k-1) == Type::covered) { + zero_levset = true; + } else if (bxg1x.contains(i ,j ,k-1) + && fx(i ,j ,k-1) == Type::covered) { + zero_levset = true; + } else if (bxg1x.contains(i ,j-1,k ) + && fx(i ,j-1,k ) == Type::covered) { + zero_levset = true; + } else if (bxg1x.contains(i ,j ,k ) + && fx(i ,j ,k ) == Type::covered) { + zero_levset = true; + } else if (bxg1y.contains(i-1,j ,k-1) + && fy(i-1,j ,k-1) == Type::covered) { + zero_levset = true; + } else if (bxg1y.contains(i ,j ,k-1) + && fy(i ,j ,k-1) == Type::covered) { + zero_levset = true; + } else if (bxg1y.contains(i-1,j ,k ) + && fy(i-1,j ,k ) == Type::covered) { + zero_levset = true; + } else if (bxg1y.contains(i ,j ,k ) + && fy(i ,j ,k ) == Type::covered) { + zero_levset = true; + } else if (bxg1z.contains(i-1,j-1,k ) + && fz(i-1,j-1,k ) == Type::covered) { + zero_levset = true; + } else if (bxg1z.contains(i ,j-1,k ) + && fz(i ,j-1,k ) == Type::covered) { + zero_levset = true; + } else if (bxg1z.contains(i-1,j ,k ) + && fz(i-1,j ,k ) == Type::covered) { + zero_levset = true; + } else if (bxg1z.contains(i ,j ,k ) + && fz(i ,j ,k ) == Type::covered) { + zero_levset = true; + } + if (zero_levset) { + levset(i,j,k) = Real(0.0); + } + } + }); + if (nsmallcells > 0 || nmulticuts > 0) { if (!cover_multiple_cuts && nmulticuts > 0) { amrex::Abort("amrex::EB2::build_cells: multi-cuts not supported"); } - Box const& nbxg1 = amrex::surroundingNodes(bxg1); - Box const& bxg1x = amrex::surroundingNodes(bxg1,0); - Box const& bxg1y = amrex::surroundingNodes(bxg1,1); - Box const& bxg1z = amrex::surroundingNodes(bxg1,2); - AMREX_HOST_DEVICE_FOR_3D(nbxg1, i, j, k, - { - if (levset(i,j,k) < Real(0.0)) { - bool zero_levset = false; - if 
(bxg1.contains(i-1,j-1,k-1) - && cell(i-1,j-1,k-1).isCovered()) { - zero_levset = true; - } else if (bxg1.contains(i ,j-1,k-1) - && cell(i ,j-1,k-1).isCovered()) { - zero_levset = true; - } else if (bxg1.contains(i-1,j ,k-1) - && cell(i-1,j ,k-1).isCovered()) { - zero_levset = true; - } else if (bxg1.contains(i ,j ,k-1) - && cell(i ,j ,k-1).isCovered()) { - zero_levset = true; - } else if (bxg1.contains(i-1,j-1,k ) - && cell(i-1,j-1,k ).isCovered()) { - zero_levset = true; - } else if (bxg1.contains(i ,j-1,k ) - && cell(i ,j-1,k ).isCovered()) { - zero_levset = true; - } else if (bxg1.contains(i-1,j ,k ) - && cell(i-1,j ,k ).isCovered()) { - zero_levset = true; - } else if (bxg1.contains(i ,j ,k ) - && cell(i ,j ,k ).isCovered()) { - zero_levset = true; - } else if (cover_multiple_cuts) { - if (bxg1x.contains(i ,j-1,k-1) - && fx(i ,j-1,k-1) == Type::covered) { - zero_levset = true; - } else if (bxg1x.contains(i ,j ,k-1) - && fx(i ,j ,k-1) == Type::covered) { - zero_levset = true; - } else if (bxg1x.contains(i ,j-1,k ) - && fx(i ,j-1,k ) == Type::covered) { - zero_levset = true; - } else if (bxg1x.contains(i ,j ,k ) - && fx(i ,j ,k ) == Type::covered) { - zero_levset = true; - } else if (bxg1y.contains(i-1,j ,k-1) - && fy(i-1,j ,k-1) == Type::covered) { - zero_levset = true; - } else if (bxg1y.contains(i ,j ,k-1) - && fy(i ,j ,k-1) == Type::covered) { - zero_levset = true; - } else if (bxg1y.contains(i-1,j ,k ) - && fy(i-1,j ,k ) == Type::covered) { - zero_levset = true; - } else if (bxg1y.contains(i ,j ,k ) - && fy(i ,j ,k ) == Type::covered) { - zero_levset = true; - } else if (bxg1z.contains(i-1,j-1,k ) - && fz(i-1,j-1,k ) == Type::covered) { - zero_levset = true; - } else if (bxg1z.contains(i ,j-1,k ) - && fz(i ,j-1,k ) == Type::covered) { - zero_levset = true; - } else if (bxg1z.contains(i-1,j ,k ) - && fz(i-1,j ,k ) == Type::covered) { - zero_levset = true; - } else if (bxg1z.contains(i ,j ,k ) - && fz(i ,j ,k ) == Type::covered) { - zero_levset = true; - } - } - if (zero_levset) { - levset(i,j,k) = Real(0.0); - } - } - }); return; + } else { + set_connection_flags(bx, bxg1, cell, ctmp, fx, fy, fz); } +} +void set_connection_flags (Box const& bx, + Box const& bxg1, Array4 const& cell, + Array4 const& ctmp, Array4 const& fx, + Array4 const& fy, Array4 const& fz) noexcept +{ // Build neighbors. By default all 26 neighbors are already set. 
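Two things happen in this 3-D refactor: the loop that snaps the nodal level set to zero next to covered cells and faces now runs unconditionally, before the small-cell/multi-cut early return and no longer gated on `cover_multiple_cuts`; and the neighbor-connection pass is factored out into the free function `set_connection_flags` (mirroring the 2-D split above) so the new checkpoint-file level can reuse it. The companion change to `check_mvmc` replaces `Abort()` calls in device-callable code with an error flag that the caller accumulates and acts on from the host. A minimal sketch of that flag-and-reduce pattern, with a toy stand-in for the per-face cut count:

    #include <cstdio>
    #include <cstdlib>

    // Device-callable helpers should not Abort(); they return an error
    // flag instead (cf. check_mvmc returning ierr above). A face with
    // 0 or 2 cut edges is fine; anything else is a multi-cut error.
    int check_face_cuts (int ncuts)
    {
        return (ncuts == 0 || ncuts == 2) ? 0 : 1;
    }

    int main ()
    {
        int cuts_per_face[] = {0, 2, 2, 4, 0, 2};
        int nmulticuts = 0;
        for (int n : cuts_per_face) { nmulticuts += check_face_cuts(n); } // reduction
        const bool cover_multiple_cuts = false;   // runtime option, as above
        if (nmulticuts > 0 && !cover_multiple_cuts) {
            std::fprintf(stderr, "build_cells: multi-cuts not supported\n");
            return EXIT_FAILURE;
        }
        std::printf("%d multi-cut face(s) flagged for covering\n", nmulticuts);
        return 0;
    }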
AMREX_HOST_DEVICE_FOR_3D ( bxg1, i, j, k, { diff --git a/Src/EB/AMReX_EB2_C.H b/Src/EB/AMReX_EB2_C.H index 7e752f3d051..0be84fdc913 100644 --- a/Src/EB/AMReX_EB2_C.H +++ b/Src/EB/AMReX_EB2_C.H @@ -36,6 +36,9 @@ void build_cells (Box const& bx, Array4 const& cell, Real small_volfrac, Geometry const& geom, bool extend_domain_face, int& nsmallcells, int const nmulticuts) noexcept; +void set_connection_flags(Box const& bxg1, Array4 const& cell, + Array4 const& fx, Array4 const& fy) noexcept; + #elif (AMREX_SPACEDIM == 3) int build_faces (Box const& bx, Array4 const& cell, @@ -67,6 +70,11 @@ void build_cells (Box const& bx, Array4 const& cell, bool extend_domain_face, bool cover_multiple_cuts, int& nsmallcells, int& nmulticuts) noexcept; +void set_connection_flags(Box const& bx, Box const& bxg1, + Array4 const& cell, Array4 const& ctmp, + Array4 const& fx, Array4 const& fy, + Array4 const& fz) noexcept; + #endif void intercept_to_edge_centroid (AMREX_D_DECL(Array4 const& excent, diff --git a/Src/EB/AMReX_EB2_GeometryShop.H b/Src/EB/AMReX_EB2_GeometryShop.H index ff80dd20593..2a7565abad2 100644 --- a/Src/EB/AMReX_EB2_GeometryShop.H +++ b/Src/EB/AMReX_EB2_GeometryShop.H @@ -244,6 +244,7 @@ public: } } } + amrex::ignore_unused(nzero); if (nbody == 0) { return allregular; diff --git a/Src/EB/AMReX_EB2_IndexSpaceI.H b/Src/EB/AMReX_EB2_IndexSpaceI.H index 192df9f43a0..e7db810b03b 100644 --- a/Src/EB/AMReX_EB2_IndexSpaceI.H +++ b/Src/EB/AMReX_EB2_IndexSpaceI.H @@ -4,7 +4,11 @@ IndexSpaceImp::IndexSpaceImp (const G& gshop, const Geometry& geom, int required_coarsening_level, int max_coarsening_level, int ngrow, bool build_coarse_level_by_coarsening, - bool extend_domain_face) + bool extend_domain_face, int num_coarsen_opt) + : m_gshop(gshop), + m_build_coarse_level_by_coarsening(build_coarse_level_by_coarsening), + m_extend_domain_face(extend_domain_face), + m_num_coarsen_opt(num_coarsen_opt) { // build finest level (i.e., level 0) first AMREX_ALWAYS_ASSERT(required_coarsening_level >= 0 && required_coarsening_level <= 30); @@ -20,7 +24,8 @@ IndexSpaceImp::IndexSpaceImp (const G& gshop, const Geometry& geom, m_domain.push_back(geom.Domain()); m_ngrow.push_back(ngrow_finest); m_gslevel.reserve(max_coarsening_level+1); - m_gslevel.emplace_back(this, gshop, geom, EB2::max_grid_size, ngrow_finest, extend_domain_face); + m_gslevel.emplace_back(this, gshop, geom, EB2::max_grid_size, ngrow_finest, extend_domain_face, + num_coarsen_opt); for (int ilev = 1; ilev <= max_coarsening_level; ++ilev) { @@ -44,7 +49,8 @@ IndexSpaceImp::IndexSpaceImp (const G& gshop, const Geometry& geom, if (build_coarse_level_by_coarsening) { amrex::Abort("Failed to build required coarse EB level "+std::to_string(ilev)); } else { - m_gslevel.emplace_back(this, gshop, cgeom, EB2::max_grid_size, ng, extend_domain_face); + m_gslevel.emplace_back(this, gshop, cgeom, EB2::max_grid_size, ng, extend_domain_face, + num_coarsen_opt-ilev); } } else { break; @@ -54,8 +60,6 @@ IndexSpaceImp::IndexSpaceImp (const G& gshop, const Geometry& geom, m_domain.push_back(cdomain); m_ngrow.push_back(ng); } - - m_impfunc = std::make_unique(gshop.GetImpFunc()); } @@ -76,3 +80,29 @@ IndexSpaceImp::getGeometry (const Box& dom) const int i = std::distance(m_domain.begin(), it); return m_geom[i]; } + +template +void +IndexSpaceImp::addFineLevels (int num_new_fine_levels) +{ + if (num_new_fine_levels <= 0) { return; } + + if (m_num_coarsen_opt > 0) { + m_num_coarsen_opt += num_new_fine_levels; + } + + IndexSpaceImp fine_isp(m_gshop, amrex::refine(m_geom[0], 1< + 
+#include +#include + +#include + +namespace amrex { namespace EB2 { + +class IndexSpaceChkptFile + : public IndexSpace +{ +public: + + IndexSpaceChkptFile (const ChkptFile& chkptfile, + const Geometry& geom, int required_coarsening_level, + int max_coarsening_level, int ngrow, + bool build_coarse_level_by_coarsening, + bool extend_domain_face); + + IndexSpaceChkptFile (IndexSpaceChkptFile const&) = delete; + IndexSpaceChkptFile (IndexSpaceChkptFile &&) = delete; + void operator= (IndexSpaceChkptFile const&) = delete; + void operator= (IndexSpaceChkptFile &&) = delete; + + virtual ~IndexSpaceChkptFile () {} + + virtual const Level& getLevel (const Geometry& geom) const final; + virtual const Geometry& getGeometry (const Box& dom) const final; + virtual const Box& coarsestDomain () const final { + return m_geom.back().Domain(); + } + virtual void addFineLevels (int num_new_fine_levels) final; + +private: + + Vector m_chkpt_file_level; + Vector m_geom; + Vector m_domain; + Vector m_ngrow; +}; + +}} + +#endif diff --git a/Src/EB/AMReX_EB2_IndexSpace_chkpt_file.cpp b/Src/EB/AMReX_EB2_IndexSpace_chkpt_file.cpp new file mode 100644 index 00000000000..b0318dd402c --- /dev/null +++ b/Src/EB/AMReX_EB2_IndexSpace_chkpt_file.cpp @@ -0,0 +1,86 @@ +#include + +namespace amrex { namespace EB2 { + +IndexSpaceChkptFile::IndexSpaceChkptFile (const ChkptFile& chkpt_file, + const Geometry& geom, int required_coarsening_level, + int max_coarsening_level, int ngrow, + bool build_coarse_level_by_coarsening, + bool extend_domain_face) +{ + Gpu::LaunchSafeGuard lsg(true); // Always use GPU + + // build finest level (i.e., level 0) first + AMREX_ALWAYS_ASSERT(required_coarsening_level >= 0 && required_coarsening_level <= 30); + max_coarsening_level = std::max(required_coarsening_level,max_coarsening_level); + max_coarsening_level = std::min(30,max_coarsening_level); + + int ngrow_finest = std::max(ngrow,0); + for (int i = 1; i <= required_coarsening_level; ++i) { + ngrow_finest *= 2; + } + + m_geom.push_back(geom); + m_domain.push_back(geom.Domain()); + m_ngrow.push_back(ngrow_finest); + m_chkpt_file_level.reserve(max_coarsening_level+1); + m_chkpt_file_level.emplace_back(this, chkpt_file, geom, EB2::max_grid_size, ngrow_finest, + extend_domain_face); + + for (int ilev = 1; ilev <= max_coarsening_level; ++ilev) + { + bool coarsenable = m_geom.back().Domain().coarsenable(2,2); + if (!coarsenable) { + if (ilev <= required_coarsening_level) { + amrex::Abort("IndexSpaceImp: domain is not coarsenable at level "+std::to_string(ilev)); + } else { + break; + } + } + + int ng = (ilev > required_coarsening_level) ? 0 : m_ngrow.back()/2; + + Box cdomain = amrex::coarsen(m_geom.back().Domain(),2); + Geometry cgeom = amrex::coarsen(m_geom.back(),2); + m_chkpt_file_level.emplace_back(this, ilev, EB2::max_grid_size, ng, cgeom, m_chkpt_file_level[ilev-1]); + if (!m_chkpt_file_level.back().isOK()) { + m_chkpt_file_level.pop_back(); + if (ilev <= required_coarsening_level) { + if (build_coarse_level_by_coarsening) { + amrex::Abort("Failed to build required coarse EB level "+std::to_string(ilev)); + } else { + amrex::Abort("Chkptfile only stored for finest level. 
Failed to build "+std::to_string(ilev)); + } + } else { + break; + } + } + m_geom.push_back(cgeom); + m_domain.push_back(cdomain); + m_ngrow.push_back(ng); + } +} + +const Level& +IndexSpaceChkptFile::getLevel (const Geometry& geom) const +{ + auto it = std::find(std::begin(m_domain), std::end(m_domain), geom.Domain()); + int i = std::distance(m_domain.begin(), it); + return m_chkpt_file_level[i]; +} + +const Geometry& +IndexSpaceChkptFile::getGeometry (const Box& dom) const +{ + auto it = std::find(std::begin(m_domain), std::end(m_domain), dom); + int i = std::distance(m_domain.begin(), it); + return m_geom[i]; +} + +void +IndexSpaceChkptFile::addFineLevels (int /*num_new_fine_levels*/) +{ + amrex::Abort("IndexSpaceChkptFile::addFineLevels: not supported"); +} + +}} diff --git a/Src/EB/AMReX_EB2_Level.H b/Src/EB/AMReX_EB2_Level.H index d47917328c5..8ebc864b903 100644 --- a/Src/EB/AMReX_EB2_Level.H +++ b/Src/EB/AMReX_EB2_Level.H @@ -60,6 +60,8 @@ public: const Geometry& Geom () const noexcept { return m_geom; } IndexSpace const* getEBIndexSpace () const noexcept { return m_parent; } + void write_to_chkpt_file (const std::string& fname, bool extend_domain_face, int max_grid_size) const; + protected: Level (Level && rhs) = default; @@ -98,12 +100,13 @@ class GShopLevel : public Level { public: - GShopLevel (IndexSpace const* is, G const& gshop, const Geometry& geom, int max_grid_size, int ngrow, bool extend_domain_face); + GShopLevel (IndexSpace const* is, G const& gshop, const Geometry& geom, int max_grid_size, + int ngrow, bool extend_domain_face, int num_crse_opt); GShopLevel (IndexSpace const* is, int ilev, int max_grid_size, int ngrow, const Geometry& geom, GShopLevel& fineLevel); GShopLevel (IndexSpace const* is, const Geometry& geom); void define_fine (G const& gshop, const Geometry& geom, - int max_grid_size, int ngrow, bool extend_domain_face); + int max_grid_size, int ngrow, bool extend_domain_face, int num_crse_opt); }; template @@ -113,7 +116,7 @@ GShopLevel::GShopLevel (IndexSpace const* is, const Geometry& geom) template GShopLevel::GShopLevel (IndexSpace const* is, G const& gshop, const Geometry& geom, - int max_grid_size, int ngrow, bool extend_domain_face) + int max_grid_size, int ngrow, bool extend_domain_face, int num_crse_opt) : Level(is, geom) { if (std::is_same::value) { @@ -122,13 +125,13 @@ GShopLevel::GShopLevel (IndexSpace const* is, G const& gshop, const Geometry& return; } - define_fine(gshop, geom, max_grid_size, ngrow, extend_domain_face); + define_fine(gshop, geom, max_grid_size, ngrow, extend_domain_face, num_crse_opt); } template void GShopLevel::define_fine (G const& gshop, const Geometry& geom, - int max_grid_size, int ngrow, bool extend_domain_face) + int max_grid_size, int ngrow, bool extend_domain_face, int num_crse_opt) { if (amrex::Verbose() > 0 && extend_domain_face == false) { amrex::Print() << "AMReX WARNING: extend_domain_face=false is not recommended!\n"; @@ -166,57 +169,84 @@ GShopLevel::define_fine (G const& gshop, const Geometry& geom, Box bounding_box = (extend_domain_face) ? 
domain : domain_grown; bounding_box.surroundingNodes(); - BoxList bl(domain); - bl.maxSize(max_grid_size); - if (m_ngrow != 0) { - const IntVect& domlo = domain.smallEnd(); - const IntVect& domhi = domain.bigEnd(); - for (auto& b : bl) { - for (int idim = 0; idim < AMREX_SPACEDIM; ++idim) { - if (m_ngrow[idim] != 0) { - if (b.smallEnd(idim) == domlo[idim]) { - b.growLo(idim,m_ngrow[idim]); - } - if (b.bigEnd(idim) == domhi[idim]) { - b.growHi(idim,m_ngrow[idim]); - } + BoxList cut_boxes; + BoxList covered_boxes; + + const int nprocs = ParallelDescriptor::NProcs(); + const int iproc = ParallelDescriptor::MyProc(); + + num_crse_opt = std::max(0,std::min(8,num_crse_opt)); + for (int clev = num_crse_opt; clev >= 0; --clev) { + IntVect crse_ratio(1 << clev); + if (domain.coarsenable(crse_ratio)) { + Box const& crse_bounding_box = amrex::coarsen(bounding_box, crse_ratio); + Geometry const& crse_geom = amrex::coarsen(geom, crse_ratio); + BoxList test_boxes; + if (cut_boxes.isEmpty()) { + covered_boxes.clear(); + test_boxes = BoxList(crse_geom.Domain()); + test_boxes.maxSize(max_grid_size); + } else { + test_boxes.swap(cut_boxes); + test_boxes.coarsen(crse_ratio); + test_boxes.maxSize(max_grid_size); + } + + const Long nboxes = test_boxes.size(); + const auto& boxes = test_boxes.data(); + for (Long i = iproc; i < nboxes; i += nprocs) { + const Box& vbx = boxes[i]; + const Box& gbx = amrex::surroundingNodes(amrex::grow(vbx,1)); + auto box_type = gshop.getBoxType(gbx&crse_bounding_box,crse_geom,RunOn::Gpu); + if (box_type == gshop.allcovered) { + covered_boxes.push_back(amrex::refine(vbx, crse_ratio)); + } else if (box_type == gshop.mixedcells) { + cut_boxes.push_back(amrex::refine(vbx, crse_ratio)); } } + + amrex::AllGatherBoxes(cut_boxes.data()); } } - m_grids.define(std::move(bl)); - m_dmap.define(m_grids); - - Vector cut_boxes; - Vector covered_boxes; + amrex::AllGatherBoxes(covered_boxes.data()); - for (MFIter mfi(m_grids, m_dmap); mfi.isValid(); ++mfi) - { - const Box& vbx = mfi.validbox(); - const Box& gbx = amrex::surroundingNodes(amrex::grow(vbx,1)); - int box_type = gshop.getBoxType(gbx & bounding_box, geom, RunOn::Gpu); - if (box_type == gshop.allcovered) { - covered_boxes.push_back(vbx); - } else if (box_type == gshop.mixedcells) { - cut_boxes.push_back(vbx); - } + if (m_ngrow != 0) { + auto grow_at_domain_boundary = [&] (BoxList& bl) + { + const IntVect& domlo = domain.smallEnd(); + const IntVect& domhi = domain.bigEnd(); + for (auto& b : bl) { + for (int idim = 0; idim < AMREX_SPACEDIM; ++idim) { + if (m_ngrow[idim] != 0) { + if (b.smallEnd(idim) == domlo[idim]) { + b.growLo(idim,m_ngrow[idim]); + } + if (b.bigEnd(idim) == domhi[idim]) { + b.growHi(idim,m_ngrow[idim]); + } + } + } + } + }; + grow_at_domain_boundary(covered_boxes); + grow_at_domain_boundary(cut_boxes); } - amrex::AllGatherBoxes(cut_boxes); - amrex::AllGatherBoxes(covered_boxes); - - if ( cut_boxes.empty() && - !covered_boxes.empty()) + if ( cut_boxes.isEmpty() && + !covered_boxes.isEmpty()) { amrex::Abort("AMReX_EB2_Level.H: Domain is completely covered"); } - if (!covered_boxes.empty()) { - m_covered_grids = BoxArray(BoxList(std::move(covered_boxes))); + if (!covered_boxes.isEmpty()) { + if (num_crse_opt > 2) { // don't want the box too big + covered_boxes.maxSize(max_grid_size*4); + } + m_covered_grids = BoxArray(std::move(covered_boxes)); } - if (cut_boxes.empty()) { + if (cut_boxes.isEmpty()) { m_grids = BoxArray(); m_dmap = DistributionMapping(); m_allregular = true; @@ -224,7 +254,7 @@ GShopLevel::define_fine (G 
const& gshop, const Geometry& geom, return; } - m_grids = BoxArray(BoxList(std::move(cut_boxes))); + m_grids = BoxArray(std::move(cut_boxes)); m_dmap = DistributionMapping(m_grids); m_mgf.define(m_grids, m_dmap); diff --git a/Src/EB/AMReX_EB2_Level.cpp b/Src/EB/AMReX_EB2_Level.cpp index 46277b59ab1..09b6db4a54c 100644 --- a/Src/EB/AMReX_EB2_Level.cpp +++ b/Src/EB/AMReX_EB2_Level.cpp @@ -1,6 +1,7 @@ #include #include +#include #include namespace amrex { namespace EB2 { @@ -916,4 +917,14 @@ Level::fillLevelSet (MultiFab& levelset, const Geometry& geom) const } } +void +Level::write_to_chkpt_file (const std::string& fname, bool extend_domain_face, int max_grid_size) const +{ + ChkptFile chkptFile(fname); + chkptFile.write_to_chkpt_file(m_grids, m_covered_grids, + m_volfrac, m_centroid, m_bndryarea, m_bndrycent, + m_bndrynorm, m_areafrac, m_facecent, m_edgecent, m_levelset, + m_geom, m_ngrow, extend_domain_face, max_grid_size); +} + }} diff --git a/Src/EB/AMReX_EB2_Level_STL.H b/Src/EB/AMReX_EB2_Level_STL.H index f29460d7a92..19cb31ef93b 100644 --- a/Src/EB/AMReX_EB2_Level_STL.H +++ b/Src/EB/AMReX_EB2_Level_STL.H @@ -13,7 +13,7 @@ class STLLevel public: STLLevel (IndexSpace const* is, STLtools const& stl_tools, const Geometry& geom, - int max_grid_size, int ngrow, bool extend_domain_face); + int max_grid_size, int ngrow, bool extend_domain_face, int num_crse_opt); STLLevel (IndexSpace const* is, int ilev, int max_grid_size, int ngrow, const Geometry& geom, STLLevel& fineLevel); diff --git a/Src/EB/AMReX_EB2_Level_STL.cpp b/Src/EB/AMReX_EB2_Level_STL.cpp index 00f29958714..53243cd754a 100644 --- a/Src/EB/AMReX_EB2_Level_STL.cpp +++ b/Src/EB/AMReX_EB2_Level_STL.cpp @@ -3,12 +3,12 @@ namespace amrex { namespace EB2 { STLLevel::STLLevel (IndexSpace const* is, STLtools const& stl_tools, const Geometry& geom, - int max_grid_size, int ngrow, bool extend_domain_face) + int max_grid_size, int ngrow, bool extend_domain_face, int num_crse_opt) : GShopLevel(is, geom) { BL_PROFILE("EB2::STLLevel()-fine"); - define_fine(stl_tools, geom, max_grid_size, ngrow, extend_domain_face); + define_fine(stl_tools, geom, max_grid_size, ngrow, extend_domain_face, num_crse_opt); } STLLevel::STLLevel (IndexSpace const* is, int ilev, int max_grid_size, int ngrow, diff --git a/Src/EB/AMReX_EB2_Level_chkpt_file.H b/Src/EB/AMReX_EB2_Level_chkpt_file.H new file mode 100644 index 00000000000..881dd8f22f0 --- /dev/null +++ b/Src/EB/AMReX_EB2_Level_chkpt_file.H @@ -0,0 +1,31 @@ +#ifndef AMREX_EB2_LEVEL_CHKPT_FILE_H_ +#define AMREX_EB2_LEVEL_CHKPT_FILE_H_ +#include + +#include +#include + +namespace amrex { namespace EB2 { + +class ChkptFileLevel + : public GShopLevel +{ +public: + + ChkptFileLevel (IndexSpace const* is, ChkptFile const& chkpt_file, const Geometry& geom, + int max_grid_size, int ngrow, bool extend_domain_face); + + ChkptFileLevel (IndexSpace const* is, int ilev, int max_grid_size, int ngrow, + const Geometry& geom, ChkptFileLevel& fineLevel); + +// for cuda support + void define_fine_chkpt_file (ChkptFile const& chkpt_file, + Geometry const& geom, int max_grid_size, int ngrow, + bool extend_domain_face); + + void finalize_cell_flags (); //sets the connection flags and adjustments to cellflags +}; + +}} + +#endif diff --git a/Src/EB/AMReX_EB2_Level_chkpt_file.cpp b/Src/EB/AMReX_EB2_Level_chkpt_file.cpp new file mode 100644 index 00000000000..0b2d88e828f --- /dev/null +++ b/Src/EB/AMReX_EB2_Level_chkpt_file.cpp @@ -0,0 +1,203 @@ +#include +#include + +#include + +namespace amrex { namespace EB2 { + 
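The rewritten `GShopLevel::define_fine` above is the heart of the `num_crse_opt` optimization: rather than classifying every finest-level box against the implicit function, it starts on a domain coarsened by up to 2^num_crse_opt, keeps the boxes found fully covered there, and re-examines only the cut boxes on each finer pass (the work is also round-robined across MPI ranks and merged with `AllGatherBoxes`). A standalone sketch of the search in one dimension, with half-open integer intervals standing in for boxes and a half-space standing in for the geometry:

    #include <cstdio>
    #include <utility>
    #include <vector>

    enum class BoxType { covered, regular, mixed };

    constexpr double body_hi = 0.37;   // toy geometry: solid occupies [0, 0.37)

    BoxType classify (double lo, double hi)
    {
        if (hi <= body_hi) { return BoxType::covered; }
        if (lo >= body_hi) { return BoxType::regular; }
        return BoxType::mixed;
    }

    int main ()
    {
        const int num_crse_opt = 3;    // first pass is 2^3 = 8x coarser
        const int nfine = 64;          // fine cells covering [0,1)
        std::vector<std::pair<int,int>> cut_boxes, covered_boxes; // fine indices

        for (int clev = num_crse_opt; clev >= 0; --clev) {
            const int r = 1 << clev;   // coarsening ratio of this pass
            std::vector<std::pair<int,int>> test;  // coarse index space
            if (cut_boxes.empty()) {   // first pass: tile the whole domain
                covered_boxes.clear();
                for (int i = 0; i < nfine/r; ++i) { test.push_back({i, i+1}); }
            } else {                   // later passes: revisit cut boxes only
                for (auto const& b : cut_boxes) {
                    for (int i = b.first/r; i < b.second/r; ++i) {
                        test.push_back({i, i+1});  // the maxSize() analogue
                    }
                }
                cut_boxes.clear();
            }
            for (auto const& b : test) {
                BoxType t = classify(double(b.first)*r/nfine,
                                     double(b.second)*r/nfine);
                if (t == BoxType::covered) {       // refine back to fine indices
                    covered_boxes.push_back({b.first*r, b.second*r});
                } else if (t == BoxType::mixed) {
                    cut_boxes.push_back({b.first*r, b.second*r});
                }   // regular boxes drop out entirely
            }
        }
        std::printf("cut: %zu box(es), covered: %zu box(es)\n",
                    cut_boxes.size(), covered_boxes.size());
        return 0;
    }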
+ChkptFileLevel::ChkptFileLevel (IndexSpace const* is, ChkptFile const& chkpt_file, + Geometry const& geom, int max_grid_size, int ngrow, bool extend_domain_face) + : GShopLevel(is, geom) +{ + BL_PROFILE("EB2::ChkptFileLevel()-fine"); + + define_fine_chkpt_file(chkpt_file, geom, max_grid_size, ngrow, extend_domain_face); +} + +void +ChkptFileLevel::define_fine_chkpt_file (ChkptFile const& chkpt_file, + Geometry const& geom, int max_grid_size, + int ngrow, bool extend_domain_face) +{ + BL_PROFILE("EB2::ChkptFileLevel()-define-fine-chkptfile"); + + m_ngrow = IntVect{static_cast(std::ceil(ngrow/16.)) * 16}; + + Box const& domain = geom.Domain(); + Box domain_grown = domain; + for (int idim = 0; idim < AMREX_SPACEDIM; ++idim) { + if (geom.isPeriodic(idim)) { + m_ngrow[idim] = 0; + } else { + m_ngrow[idim] = std::min(m_ngrow[idim], domain_grown.length(idim)); + } + } + + const int ng = GFab::ng; + chkpt_file.read_from_chkpt_file(m_grids, m_covered_grids, + m_dmap, m_volfrac, m_centroid, m_bndryarea, + m_bndrycent, m_bndrynorm, m_areafrac, m_facecent, + m_edgecent, m_levelset, ng, geom, m_ngrow, + extend_domain_face, max_grid_size); + + + if ( m_grids.empty() && + !m_covered_grids.empty()) + { + Abort("AMReX_EB2_Level.H: Domain is completely covered"); + } + + if (m_grids.empty()) { + m_allregular = true; + m_ok = true; + return; + } + + + m_mgf.define(m_grids, m_dmap); + MFInfo mf_info; + m_cellflag.define(m_grids, m_dmap, 1, ng, mf_info); + +#ifdef AMREX_USE_OMP +#pragma omp parallel if (Gpu::notInLaunchRegion()) +#endif + for (MFIter mfi(m_mgf); mfi.isValid(); ++mfi) + { + auto& gfab = m_mgf[mfi]; + + const auto& levelset = m_levelset.const_array(mfi); + const Box& bxg2 = amrex::grow(gfab.validbox(),ng); + const Box& nodal_box = amrex::surroundingNodes(bxg2); + const auto& ls = gfab.getLevelSet().array(); + + AMREX_HOST_DEVICE_PARALLEL_FOR_3D(nodal_box, i, j, k, + { + ls(i,j,k) = levelset(i,j,k); + }); + + auto& cellflag = m_cellflag[mfi]; + gfab.buildTypes(cellflag); + } + + finalize_cell_flags(); +} + +void +ChkptFileLevel::finalize_cell_flags () +{ + +#ifdef AMREX_USE_OMP +#pragma omp parallel if (Gpu::notInLaunchRegion()) +#endif + { + EBCellFlagFab cellflagtmp; + for (MFIter mfi(m_mgf); mfi.isValid(); ++mfi) + { + auto& gfab = m_mgf[mfi]; + const Box& vbx = mfi.validbox(); + const Box& bxg1 = amrex::grow(vbx,1); + Array4 const& cell = m_cellflag.array(mfi); + + cellflagtmp.resize(m_cellflag[mfi].box()); + Elixir cellflagtmp_eli = cellflagtmp.elixir(); + Array4 const& ctmp = cellflagtmp.array(); + + auto& facetype = gfab.getFaceType(); + AMREX_D_TERM(Array4 const& fx = facetype[0].array();, + Array4 const& fy = facetype[1].array();, + Array4 const& fz = facetype[2].array();); + + + AMREX_D_TERM(Array4 const& apx = m_areafrac[0].const_array(mfi);, + Array4 const& apy = m_areafrac[1].const_array(mfi);, + Array4 const& apz = m_areafrac[2].const_array(mfi);); + + const Box& xbx = amrex::grow(amrex::surroundingNodes(vbx,0),1); + AMREX_HOST_DEVICE_FOR_3D ( xbx, i, j, k, + { + if (apx(i,j,k) == 0.0_rt) { + fx(i,j,k) = Type::covered; + } else if (apx(i,j,k) == 1.0_rt) { + fx(i,j,k) = Type::regular; + } + }); + + const Box& ybx = amrex::grow(amrex::surroundingNodes(vbx,1),1); + AMREX_HOST_DEVICE_FOR_3D ( ybx, i, j, k, + { + if (apy(i,j,k) == 0.0_rt) { + fy(i,j,k) = Type::covered; + } else if (apy(i,j,k) == 1.0_rt) { + fy(i,j,k) = Type::regular; + } + }); + + #if (AMREX_SPACEDIM == 3) + const Box& zbx = amrex::grow(amrex::surroundingNodes(vbx,2),1); + AMREX_HOST_DEVICE_FOR_3D ( zbx, i, j, k, + { + 
if (apz(i,j,k) == 0.0_rt) { + fz(i,j,k) = Type::covered; + } else if (apz(i,j,k) == 1.0_rt) { + fz(i,j,k) = Type::regular; + } + }); + #endif + + + #if (AMREX_SPACEDIM == 2) + ignore_unused(ctmp); + AMREX_HOST_DEVICE_FOR_3D ( bxg1, i, j, k, + { + ignore_unused(k); + if (cell(i,j,0).isSingleValued()) { + if (fx(i,j,0) == Type::regular && fx(i+1,j,0) == Type::regular && + fy(i,j,0) == Type::regular && fy(i,j+1,0) == Type::regular) + { + cell(i,j,0).setRegular(); + } + else if (fx(i,j,0) == Type::covered && fx(i+1,j,0) == Type::covered && + fy(i,j,0) == Type::covered && fy(i,j+1,0) == Type::covered) + { + cell(i,j,0).setCovered(); + } + } + }); + + set_connection_flags(bxg1, cell, fx, fy); + + #else + AMREX_HOST_DEVICE_FOR_3D ( bxg1, i, j, k, + { + if (cell(i,j,k).isSingleValued()) { + if (fx(i,j,k) == Type::covered && fx(i+1,j,k) == Type::covered && + fy(i,j,k) == Type::covered && fy(i,j+1,k) == Type::covered && + fz(i,j,k) == Type::covered && fz(i,j,k+1) == Type::covered) + { + cell(i,j,k).setCovered(); + } + else if (fx(i,j,k) == Type::regular && fx(i+1,j,k) == Type::regular && + fy(i,j,k) == Type::regular && fy(i,j+1,k) == Type::regular && + fz(i,j,k) == Type::regular && fz(i,j,k+1) == Type::regular) + { + cell(i,j,k).setRegular(); + } + } + }); + + set_connection_flags(vbx, bxg1, cell, ctmp, fx, fy, fz); + + #endif + + } + + m_ok = true; + } +} + +ChkptFileLevel::ChkptFileLevel (IndexSpace const* is, int ilev, int max_grid_size, int ngrow, + const Geometry& geom, ChkptFileLevel& fineLevel) +: GShopLevel(is, ilev, max_grid_size, ngrow, geom, fineLevel) +{} + +}} diff --git a/Src/EB/AMReX_EB_chkpt_file.H b/Src/EB/AMReX_EB_chkpt_file.H new file mode 100644 index 00000000000..781db55a1d8 --- /dev/null +++ b/Src/EB/AMReX_EB_chkpt_file.H @@ -0,0 +1,60 @@ +#ifndef AMREX_EB_CHKPT_FILE_H_ +#define AMREX_EB_CHKPT_FILE_H_ + +#include + +namespace amrex { namespace EB2 { + +class ChkptFile +{ +private: + std::string m_restart_file = ""; + + const std::string m_volfrac_name = "volfrac"; + const std::string m_centroid_name = "centroid"; + const std::string m_bndryarea_name = "bndryarea"; + const std::string m_bndrycent_name = "bndrycent"; + const std::string m_bndrynorm_name = "bndrynorm"; + const std::string m_levelset_name = "levelset"; + + const amrex::Vector m_areafrac_name + = {AMREX_D_DECL("areafrac_x", "areafrac_y", "areafrac_z")}; + const amrex::Vector m_facecent_name + = {AMREX_D_DECL("facecent_x", "facecent_y", "facecent_z")}; + const amrex::Vector m_edgecent_name + = {AMREX_D_DECL("edgecent_x", "edgecent_y", "edgecent_z")}; + + void writeHeader (const BoxArray& cut_ba, const BoxArray& covered_ba, const Geometry& geom, + const IntVect& ngrow, bool extend_domain_face, int max_grid_size) const; + + void writeToFile (const MultiFab& mf, const std::string& mf_name) const; + + +public: + ChkptFile (const std::string &fname); + + void read_from_chkpt_file (BoxArray& cut_grids, BoxArray& covered_grids, + DistributionMapping& dmap, + MultiFab& volfrac, MultiFab& centroid, MultiFab& bndryarea, + MultiFab& bndrycent, MultiFab& bndrynorm, + Array& areafrac, + Array& facecent, + Array& edgecent, + MultiFab& levelset, int ng_gfab, const Geometry& geom, + const IntVect& ngrow_finest, bool extend_domain_face, int max_grid_size) const; + + void write_to_chkpt_file (const BoxArray& cut_grids, + const BoxArray& covered_grids, + const MultiFab& volfrac, + const MultiFab& centroid, const MultiFab& bndryarea, + const MultiFab& bndrycent, const MultiFab& bndrynorm, + const Array& areafrac, + const Array& 
facecent, + const Array& edgecent, + const MultiFab& levelset, const Geometry& geom, + const IntVect& ngrow, bool extend_domain_face, int max_grid_size) const; +}; + +}} + +#endif diff --git a/Src/EB/AMReX_EB_chkpt_file.cpp b/Src/EB/AMReX_EB_chkpt_file.cpp new file mode 100644 index 00000000000..cd1c00e9ee5 --- /dev/null +++ b/Src/EB/AMReX_EB_chkpt_file.cpp @@ -0,0 +1,324 @@ +#include + +#include +#include +#include // amrex::VisMF::Write(MultiFab) +#include // amrex::[read,write]IntData(array_of_ints) + +namespace { + +const std::string level_prefix = "Level_"; + +void gotoNextLine (std::istream& is) +{ + constexpr std::streamsize bl_ignore_max { 100000 }; + is.ignore(bl_ignore_max, '\n'); +} + +} + +namespace amrex { namespace EB2 { + +// Header information includes the cut and covered boxes (if any) +// Checkpoint file contains data for cut boxes +void +ChkptFile::writeHeader (const BoxArray& cut_ba, const BoxArray& covered_ba, + const Geometry& geom, + const IntVect& ngrow, bool extend_domain_face, + int max_grid_size) const +{ + if (ParallelDescriptor::IOProcessor()) + { + std::string HeaderFileName(m_restart_file + "/Header"); + VisMF::IO_Buffer io_buffer(VisMF::IO_Buffer_Size); + std::ofstream HeaderFile; + + HeaderFile.rdbuf()->pubsetbuf(io_buffer.dataPtr(), io_buffer.size()); + + HeaderFile.open(HeaderFileName.c_str(), std::ofstream::out | + std::ofstream::trunc | + std::ofstream::binary); + + if ( ! HeaderFile.good() ) + FileOpenFailed(HeaderFileName); + + HeaderFile.precision(17); + + HeaderFile << "Checkpoint version: 1\n"; + + const int nlevels = 1; + HeaderFile << nlevels << "\n"; + + // Geometry + for (int i = 0; i < AMREX_SPACEDIM; ++i) + HeaderFile << geom.ProbLo(i) << ' '; + HeaderFile << '\n'; + + for (int i = 0; i < AMREX_SPACEDIM; ++i) + HeaderFile << geom.ProbHi(i) << ' '; + HeaderFile << '\n'; + + // ngrow + for (int i = 0; i < AMREX_SPACEDIM; ++i) + HeaderFile << ngrow[i] << ' '; + HeaderFile << '\n'; + + // extend domain face + HeaderFile << extend_domain_face << "\n"; + + // max grid size + HeaderFile << max_grid_size << "\n"; + + // BoxArray + for (int lev = 0; lev < nlevels; ++lev) + { + cut_ba.writeOn(HeaderFile); + HeaderFile << '\n'; + + if (! 
covered_ba.empty()) { + covered_ba.writeOn(HeaderFile); + HeaderFile << '\n'; + } + } + } +} + +void +ChkptFile::writeToFile (const MultiFab& mf, const std::string& mf_name) const +{ + VisMF::Write(mf, MultiFabFileFullPrefix(0, m_restart_file, + level_prefix, mf_name)); +} + + +ChkptFile::ChkptFile (const std::string &fname) + : m_restart_file(fname) +{} + +void +ChkptFile::read_from_chkpt_file (BoxArray& cut_grids, BoxArray& covered_grids, + DistributionMapping& dmap, + MultiFab& volfrac, MultiFab& centroid, + MultiFab& bndryarea, MultiFab& bndrycent, + MultiFab& bndrynorm, Array& areafrac, + Array& facecent, + Array& edgecent, + MultiFab& levelset, int ng_gfab, const Geometry& geom, + const IntVect& ngrow_finest, bool extend_domain_face, + int max_grid_size) const +{ + Real prob_lo[AMREX_SPACEDIM]; + Real prob_hi[AMREX_SPACEDIM]; + + std::string File(m_restart_file + "/Header"); + + if (amrex::Verbose()) amrex::Print() << "file=" << File << std::endl; + + VisMF::IO_Buffer io_buffer(VisMF::GetIOBufferSize()); + + Vector fileCharPtr; + ParallelDescriptor::ReadAndBcastFile(File, fileCharPtr); + std::string fileCharPtrString(fileCharPtr.dataPtr()); + std::istringstream is(fileCharPtrString, std::istringstream::in); + + std::string line, word; + + std::getline(is, line); + + int nlevs; + is >> nlevs; + gotoNextLine(is); + AMREX_ASSERT(nlevs == 1); + + std::getline(is, line); + { + std::istringstream lis(line); + int i = 0; + while (lis >> word) { + prob_lo[i++] = std::stod(word); + } + } + + std::getline(is, line); + { + std::istringstream lis(line); + int i = 0; + while (lis >> word) { + prob_hi[i++] = std::stod(word); + } + } + + for (int idim = 0; idim < AMREX_SPACEDIM; ++idim) { + AMREX_ALWAYS_ASSERT_WITH_MESSAGE(Math::abs(prob_lo[idim] - geom.ProbLo()[idim]) < std::numeric_limits::epsilon(), + "EB2::ChkptFile cannot read from a different problem domain"); + AMREX_ALWAYS_ASSERT_WITH_MESSAGE(Math::abs(prob_hi[idim] - geom.ProbHi()[idim]) < std::numeric_limits::epsilon(), + "EB2::ChkptFile cannot read from a different problem domain"); + } + + IntVect ngrow_chkptfile; + std::getline(is, line); + { + std::istringstream lis(line); + int i = 0; + while (lis >> word) { + ngrow_chkptfile[i++] = std::stoi(word); + } + } + AMREX_ALWAYS_ASSERT_WITH_MESSAGE(ngrow_chkptfile == ngrow_finest, "EB2::ChkptFile cannot read from different ngrow"); + + bool edf_chkptfile; + is >> edf_chkptfile; + gotoNextLine(is); + AMREX_ALWAYS_ASSERT_WITH_MESSAGE(extend_domain_face == edf_chkptfile, + "EB2::ChkptFile cannot read from different extend_domain_face"); + + int mgs_chkptfile; + is >> mgs_chkptfile; + gotoNextLine(is); + AMREX_ALWAYS_ASSERT_WITH_MESSAGE(max_grid_size == mgs_chkptfile, + "EB2::ChkptFile cannot read from different max_grid_size"); + + if (amrex::Verbose()) amrex::Print() << "Loading cut_grids\n"; + cut_grids.readFrom(is); + gotoNextLine(is); + + if (is.peek() != EOF) { + if (amrex::Verbose()) amrex::Print() << "Loading covered_grids\n"; + covered_grids.readFrom(is); + gotoNextLine(is); + } + + dmap.define(cut_grids, ParallelDescriptor::NProcs()); + + // volfrac + { + if (amrex::Verbose()) amrex::Print() << " Loading " << m_volfrac_name << std::endl; + + volfrac.define(cut_grids, dmap, 1, ng_gfab); + + auto prefix = MultiFabFileFullPrefix(0, m_restart_file, level_prefix, m_volfrac_name); + VisMF::Read(volfrac, prefix); + } + + // centroid + { + if (amrex::Verbose()) amrex::Print() << " Loading " << m_centroid_name << std::endl; + + centroid.define(cut_grids, dmap, AMREX_SPACEDIM, ng_gfab); + + 
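Each field block in `read_from_chkpt_file` follows the same define-then-read pattern: size the `MultiFab` from the `BoxArray` and `DistributionMapping` recovered from the header, then let `VisMF` fill it from the matching per-level file. A condensed sketch of one such block (the prefix helper and the "Level_"/"volfrac" names mirror the code above; error handling is omitted and the signature is illustrative):

    #include <string>
    #include <AMReX_MultiFab.H>
    #include <AMReX_PlotFileUtil.H>
    #include <AMReX_VisMF.H>

    // cut_grids, dmap, ng_gfab and restart_file play the same roles as in
    // the surrounding function; returns the volume-fraction field.
    amrex::MultiFab read_field (amrex::BoxArray const& cut_grids,
                                amrex::DistributionMapping const& dmap,
                                int ng_gfab, std::string const& restart_file)
    {
        amrex::MultiFab volfrac(cut_grids, dmap, 1, ng_gfab);  // define...
        amrex::VisMF::Read(volfrac,                            // ...then read
            amrex::MultiFabFileFullPrefix(0, restart_file, "Level_", "volfrac"));
        return volfrac;
    }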
auto prefix = MultiFabFileFullPrefix(0, m_restart_file, level_prefix, m_centroid_name); + VisMF::Read(centroid, prefix); + } + + // bndryarea + { + if (amrex::Verbose()) amrex::Print() << " Loading " << m_bndryarea_name << std::endl; + + bndryarea.define(cut_grids, dmap, 1, ng_gfab); + + auto prefix = MultiFabFileFullPrefix(0, m_restart_file, level_prefix, m_bndryarea_name); + VisMF::Read(bndryarea, prefix); + } + + // bndrycent + { + if (amrex::Verbose()) amrex::Print() << " Loading " << m_bndrycent_name << std::endl; + + bndrycent.define(cut_grids, dmap, AMREX_SPACEDIM, ng_gfab); + + auto prefix = MultiFabFileFullPrefix(0, m_restart_file, level_prefix, m_bndrycent_name); + VisMF::Read(bndrycent, prefix); + } + + // bndrynorm + { + if (amrex::Verbose()) amrex::Print() << " Loading " << m_bndrynorm_name << std::endl; + + bndrynorm.define(cut_grids, dmap, AMREX_SPACEDIM, ng_gfab); + + auto prefix = MultiFabFileFullPrefix(0, m_restart_file, level_prefix, m_bndrynorm_name); + VisMF::Read(bndrynorm, prefix); + } + + for (int idim = 0; idim < AMREX_SPACEDIM; ++idim) { + // areafrac + { + if (amrex::Verbose()) amrex::Print() << " Loading " << m_areafrac_name[idim] << std::endl; + + areafrac[idim].define(convert(cut_grids, IntVect::TheDimensionVector(idim)), dmap, 1, ng_gfab); + + auto prefix = MultiFabFileFullPrefix(0, m_restart_file, level_prefix, m_areafrac_name[idim]); + VisMF::Read(areafrac[idim], prefix); + } + + // facecent + { + if (amrex::Verbose()) amrex::Print() << " Loading " << m_facecent_name[idim] << std::endl; + + facecent[idim].define(convert(cut_grids, IntVect::TheDimensionVector(idim)), dmap, AMREX_SPACEDIM-1, ng_gfab); + + auto prefix = MultiFabFileFullPrefix(0, m_restart_file, level_prefix, m_facecent_name[idim]); + VisMF::Read(facecent[idim], prefix); + } + + // edgecent + { + if (amrex::Verbose()) amrex::Print() << " Loading " << m_edgecent_name[idim] << std::endl; + + IntVect edge_type{1}; edge_type[idim] = 0; + edgecent[idim].define(convert(cut_grids, edge_type), dmap, 1, ng_gfab); + + auto prefix = MultiFabFileFullPrefix(0, m_restart_file, level_prefix, m_edgecent_name[idim]); + VisMF::Read(edgecent[idim], prefix); + } + } + + // levelset + { + if (amrex::Verbose()) amrex::Print() << " Loading " << m_levelset_name << std::endl; + + levelset.define(convert(cut_grids,IntVect::TheNodeVector()), dmap, 1, ng_gfab); + + auto prefix = MultiFabFileFullPrefix(0, m_restart_file, level_prefix, m_levelset_name); + VisMF::Read(levelset, prefix); + } +} + +void +ChkptFile::write_to_chkpt_file (const BoxArray& cut_grids, + const BoxArray& covered_grids, + const MultiFab& volfrac, + const MultiFab& centroid, const MultiFab& bndryarea, + const MultiFab& bndrycent, const MultiFab& bndrynorm, + const Array& areafrac, + const Array& facecent, + const Array& edgecent, + const MultiFab& levelset, const Geometry& geom, + const IntVect& ngrow, bool extend_domain_face, + int max_grid_size) const +{ + + if (ParallelDescriptor::IOProcessor()) { + std::cout << "\n\t Writing checkpoint " << m_restart_file << std::endl; + } + + const int nlevels = 1; + PreBuildDirectorHierarchy(m_restart_file, level_prefix, nlevels, true); + + writeHeader(cut_grids, covered_grids, geom, ngrow, extend_domain_face, max_grid_size); + + writeToFile(volfrac, m_volfrac_name); + writeToFile(centroid, m_centroid_name); + writeToFile(bndryarea, m_bndryarea_name); + writeToFile(bndrycent, m_bndrycent_name); + writeToFile(bndrynorm, m_bndrynorm_name); + writeToFile(levelset, m_levelset_name); + + for (int idim = 0; idim < 
AMREX_SPACEDIM; ++idim) { + writeToFile(areafrac[idim], m_areafrac_name[idim]); + writeToFile(facecent[idim], m_facecent_name[idim]); + writeToFile(edgecent[idim], m_edgecent_name[idim]); + } +} + +}} diff --git a/Src/EB/AMReX_distFcnElement.H b/Src/EB/AMReX_distFcnElement.H index f839bdb5747..2a9c7a0c2f4 100644 --- a/Src/EB/AMReX_distFcnElement.H +++ b/Src/EB/AMReX_distFcnElement.H @@ -12,7 +12,7 @@ class distFcnElement2d { public: //! Constructor distFcnElement2d() {} - ~distFcnElement2d() {} + virtual ~distFcnElement2d() {} virtual distFcnElement2d* newDistFcnElement2d() const = 0; @@ -29,7 +29,7 @@ class distFcnElement2d { class LineDistFcnElement2d: public distFcnElement2d { public: LineDistFcnElement2d() {} - ~LineDistFcnElement2d() {} + virtual ~LineDistFcnElement2d() {} virtual distFcnElement2d* newDistFcnElement2d() const override; @@ -58,7 +58,7 @@ class LineDistFcnElement2d: public distFcnElement2d { class SplineDistFcnElement2d: public distFcnElement2d { public: SplineDistFcnElement2d() {} - ~SplineDistFcnElement2d() {} + virtual ~SplineDistFcnElement2d() {} virtual distFcnElement2d* newDistFcnElement2d() const override; diff --git a/Src/EB/CMakeLists.txt b/Src/EB/CMakeLists.txt index 8ceb433e159..017e4d783a8 100644 --- a/Src/EB/CMakeLists.txt +++ b/Src/EB/CMakeLists.txt @@ -70,11 +70,17 @@ target_sources(amrex AMReX_EB2_${AMReX_SPACEDIM}D_C.H AMReX_EB_STL_utils.H AMReX_EB_STL_utils.cpp + AMReX_EB_chkpt_file.H + AMReX_EB_chkpt_file.cpp AMReX_EB_triGeomOps_K.H AMReX_EB2_Level_STL.H AMReX_EB2_Level_STL.cpp AMReX_EB2_IndexSpace_STL.H AMReX_EB2_IndexSpace_STL.cpp + AMReX_EB2_Level_chkpt_file.H + AMReX_EB2_Level_chkpt_file.cpp + AMReX_EB2_IndexSpace_chkpt_file.H + AMReX_EB2_IndexSpace_chkpt_file.cpp ) if (AMReX_SPACEDIM EQUAL 3) diff --git a/Src/EB/Make.package b/Src/EB/Make.package index 5865a2da982..b684523924f 100644 --- a/Src/EB/Make.package +++ b/Src/EB/Make.package @@ -79,6 +79,12 @@ CEXE_headers += AMReX_EB_triGeomOps_K.H CEXE_headers += AMReX_EB2_Level_STL.H AMReX_EB2_IndexSpace_STL.H CEXE_sources += AMReX_EB2_Level_STL.cpp AMReX_EB2_IndexSpace_STL.cpp +CEXE_sources += AMReX_EB_chkpt_file.cpp +CEXE_headers += AMReX_EB_chkpt_file.H + +CEXE_headers += AMReX_EB2_Level_chkpt_file.H AMReX_EB2_IndexSpace_chkpt_file.H +CEXE_sources += AMReX_EB2_Level_chkpt_file.cpp AMReX_EB2_IndexSpace_chkpt_file.cpp + ifeq ($(DIM),3) CEXE_sources += AMReX_WriteEBSurface.cpp AMReX_EBToPVD.cpp CEXE_headers += AMReX_WriteEBSurface.H AMReX_EBToPVD.H diff --git a/Src/Extern/HDF5/AMReX_PlotFileUtilHDF5.cpp b/Src/Extern/HDF5/AMReX_PlotFileUtilHDF5.cpp index 021ed8c4f60..49a761da801 100644 --- a/Src/Extern/HDF5/AMReX_PlotFileUtilHDF5.cpp +++ b/Src/Extern/HDF5/AMReX_PlotFileUtilHDF5.cpp @@ -232,11 +232,8 @@ WriteGenericPlotfileHeaderHDF5 (hid_t fid, int ratio = 1; if (ref_ratio.size() > 0) - ratio = ref_ratio[level][0]; + ratio = (level == finest_level)? 
1: ref_ratio[level][0]; - if (level == finest_level) { - ratio = 1; - } CreateWriteHDF5AttrInt(grp, "ref_ratio", 1, &ratio); for (int k = 0; k < AMREX_SPACEDIM; ++k) { diff --git a/Src/Extern/HYPRE/AMReX_HypreIJIface.H b/Src/Extern/HYPRE/AMReX_HypreIJIface.H index 6d0dbacd95f..2ac96748b24 100644 --- a/Src/Extern/HYPRE/AMReX_HypreIJIface.H +++ b/Src/Extern/HYPRE/AMReX_HypreIJIface.H @@ -93,11 +93,11 @@ private: HypreIntType (*m_precondSolvePtr)( HYPRE_Solver, HYPRE_ParCSRMatrix, HYPRE_ParVector, HYPRE_ParVector){nullptr}; - HypreIntType (*m_solverSetTolPtr)(HYPRE_Solver, double){nullptr}; - HypreIntType (*m_solverSetAbsTolPtr)(HYPRE_Solver, double){nullptr}; + HypreIntType (*m_solverSetTolPtr)(HYPRE_Solver, amrex::Real){nullptr}; + HypreIntType (*m_solverSetAbsTolPtr)(HYPRE_Solver, amrex::Real){nullptr}; HypreIntType (*m_solverSetMaxIterPtr)(HYPRE_Solver, HypreIntType){nullptr}; HypreIntType (*m_solverNumItersPtr)(HYPRE_Solver, HypreIntType*){nullptr}; - HypreIntType (*m_solverFinalResidualNormPtr)(HYPRE_Solver, double*){nullptr}; + HypreIntType (*m_solverFinalResidualNormPtr)(HYPRE_Solver, amrex::Real*){nullptr}; HypreIntType m_ilower{0}; HypreIntType m_iupper{0}; diff --git a/Src/Extern/HYPRE/AMReX_HypreIJIface.cpp b/Src/Extern/HYPRE/AMReX_HypreIJIface.cpp index 9e7a42dbb5b..c2e4f126252 100644 --- a/Src/Extern/HYPRE/AMReX_HypreIJIface.cpp +++ b/Src/Extern/HYPRE/AMReX_HypreIJIface.cpp @@ -275,7 +275,7 @@ void HypreIJIface::boomeramg_precond_configure (const std::string& prefix) if (hpp.pp.contains("bamg_non_galerkin_level_tols")) { std::vector levels; - std::vector tols; + std::vector tols; hpp.pp.getarr("bamg_non_galerkin_level_levels", levels); hpp.pp.getarr("bamg_non_galerkin_level_tols", tols); diff --git a/Src/Extern/PETSc/AMReX_PETSc.cpp b/Src/Extern/PETSc/AMReX_PETSc.cpp index bf0bf68a99c..7d8cd79b582 100644 --- a/Src/Extern/PETSc/AMReX_PETSc.cpp +++ b/Src/Extern/PETSc/AMReX_PETSc.cpp @@ -1,7 +1,4 @@ -#include -#include - #ifdef AMREX_USE_EB #include #include @@ -9,6 +6,9 @@ #include +#include +#include + #include #include #include diff --git a/Src/Extern/SENSEI/AMReX_AmrDataAdaptor.H b/Src/Extern/SENSEI/AMReX_AmrDataAdaptor.H index 061ff14c301..602a6298126 100644 --- a/Src/Extern/SENSEI/AMReX_AmrDataAdaptor.H +++ b/Src/Extern/SENSEI/AMReX_AmrDataAdaptor.H @@ -30,10 +30,10 @@ public: int GetNumberOfArrays(const std::string &meshName, int association, unsigned int &numberOfArrays) override; int GetArrayName(const std::string &meshName, int association, unsigned int index, std::string &arrayName) override; #endif - int GetMesh(const std::string &meshName, bool structureOnly, vtkDataObject *&mesh) override; - int AddGhostNodesArray(vtkDataObject* mesh, const std::string &meshName) override; - int AddGhostCellsArray(vtkDataObject* mesh, const std::string &meshName) override; - int AddArray(vtkDataObject* mesh, const std::string &meshName, int association, const std::string &arrayName) override; + int GetMesh(const std::string &meshName, bool structureOnly, svtkDataObject *&mesh) override; + int AddGhostNodesArray(svtkDataObject* mesh, const std::string &meshName) override; + int AddGhostCellsArray(svtkDataObject* mesh, const std::string &meshName) override; + int AddArray(svtkDataObject* mesh, const std::string &meshName, int association, const std::string &arrayName) override; int ReleaseData() override; protected: diff --git a/Src/Extern/SENSEI/AMReX_AmrDataAdaptor.cpp b/Src/Extern/SENSEI/AMReX_AmrDataAdaptor.cpp index 135c21ef0e2..aa801eb0993 100644 --- 
a/Src/Extern/SENSEI/AMReX_AmrDataAdaptor.cpp +++ b/Src/Extern/SENSEI/AMReX_AmrDataAdaptor.cpp @@ -1,22 +1,22 @@ #include "AMReX_AmrDataAdaptor.H" +#include "senseiConfig.h" #include "MPIUtils.h" #include "STLUtils.h" -#include "VTKUtils.h" +#include "SVTKUtils.h" #include "Profiler.h" #include "Error.h" -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include #include #include @@ -62,15 +62,15 @@ int DescriptorMap::Initialize(const DescriptorList &descriptors) if (itype.cellCentered()) { - this->Map[vtkDataObject::CELL][arrayName] = std::make_pair(i,j); + this->Map[svtkDataObject::CELL][arrayName] = std::make_pair(i,j); } else if (itype.nodeCentered()) { - this->Map[vtkDataObject::POINT][arrayName] = std::make_pair(i,j); + this->Map[svtkDataObject::POINT][arrayName] = std::make_pair(i,j); } else { - this->Map[vtkDataObject::FIELD][arrayName] = std::make_pair(i,j); + this->Map[svtkDataObject::FIELD][arrayName] = std::make_pair(i,j); } } } @@ -156,7 +156,7 @@ struct AmrDataAdaptor::InternalsType int PinMesh; amrex::InSituUtils::DescriptorMap SimMetadata; #if SENSEI_VERSION_MAJOR < 3 - std::vector ManagedObjects; + std::vector ManagedObjects; #endif std::vector> Masks; }; @@ -225,11 +225,11 @@ int AmrDataAdaptor::GetMeshMetadata(unsigned int id, metadata->GlobalView = true; metadata->MeshName = "mesh"; - metadata->MeshType = VTK_OVERLAPPING_AMR; - metadata->BlockType = VTK_UNIFORM_GRID; + metadata->MeshType = SVTK_OVERLAPPING_AMR; + metadata->BlockType = SVTK_UNIFORM_GRID; metadata->NumBlocks = 0; metadata->NumBlocksLocal = {-1}; - metadata->CoordinateType = InSituUtils::amrex_tt::vtk_type_enum(); + metadata->CoordinateType = InSituUtils::amrex_tt::svtk_type_enum(); metadata->StaticMesh = 0; // TODO @@ -318,14 +318,14 @@ int AmrDataAdaptor::GetMeshMetadata(unsigned int id, std::string arrayName = desc.name(j); metadata->ArrayName.push_back(arrayName); metadata->ArrayComponents.push_back(1); - metadata->ArrayType.push_back(InSituUtils::amrex_tt::vtk_type_enum()); + metadata->ArrayType.push_back(InSituUtils::amrex_tt::svtk_type_enum()); if (itype.cellCentered()) - metadata->ArrayCentering.push_back(vtkDataObject::CELL); + metadata->ArrayCentering.push_back(svtkDataObject::CELL); else if (itype.nodeCentered()) - metadata->ArrayCentering.push_back(vtkDataObject::POINT); + metadata->ArrayCentering.push_back(svtkDataObject::POINT); else - metadata->ArrayCentering.push_back(vtkDataObject::FIELD); + metadata->ArrayCentering.push_back(svtkDataObject::FIELD); } } @@ -557,8 +557,8 @@ int AmrDataAdaptor::GetNumberOfArrays(const std::string &meshName, return -1; } - if ((association != vtkDataObject::POINT) && - (association != vtkDataObject::CELL)) + if ((association != svtkDataObject::POINT) && + (association != svtkDataObject::CELL)) { SENSEI_ERROR("Invalid association " << association) return -1; @@ -590,7 +590,7 @@ int AmrDataAdaptor::GetArrayName(const std::string &meshName, if (this->Internals->SimMetadata.GetName(association, index, arrayName)) { SENSEI_ERROR("No array named \"" << arrayName << "\" in " - << sensei::VTKUtils::GetAttributesName(association) + << sensei::SVTKUtils::GetAttributesName(association) << " data") return -1; } @@ -603,7 +603,7 @@ int AmrDataAdaptor::GetArrayName(const std::string &meshName, //----------------------------------------------------------------------------- int AmrDataAdaptor::GetMesh(const std::string 
&meshName, - bool structureOnly, vtkDataObject *&mesh) + bool structureOnly, svtkDataObject *&mesh) { amrex::ignore_unused(structureOnly); @@ -626,8 +626,8 @@ int AmrDataAdaptor::GetMesh(const std::string &meshName, unsigned int nLevels = InSituUtils::NumActiveLevels(levels); - // initialize new vtk datasets - vtkOverlappingAMR *amrMesh = vtkOverlappingAMR::New(); + // initialize new svtk datasets + svtkOverlappingAMR *amrMesh = svtkOverlappingAMR::New(); #if SENSEI_VERSION_MAJOR < 3 Internals->ManagedObjects.push_back(amrMesh); #endif @@ -685,12 +685,12 @@ int AmrDataAdaptor::GetMesh(const std::string &meshName, int cboxLo[3] = {AMREX_ARLIM(cbox.loVect())}; int cboxHi[3] = {AMREX_ARLIM(cbox.hiVect())}; - // vtk's representation of box metadata - vtkAMRBox block(cboxLo, cboxHi); + // svtk's representation of box metadata + svtkAMRBox block(cboxLo, cboxHi); amrMesh->SetAMRBox(i, j, block); amrMesh->SetAMRBlockSourceIndex(i, j, gid++); - // skip building a vtk amrMesh for the non local boxes + // skip building a svtk amrMesh for the non local boxes if (dmap[j] != rank) continue; @@ -705,14 +705,14 @@ int AmrDataAdaptor::GetMesh(const std::string &meshName, int nboxLo[3] = {AMREX_ARLIM(nbox.loVect())}; int nboxHi[3] = {AMREX_ARLIM(nbox.hiVect())}; - // new vtk uniform amrMesh, node centered - vtkUniformGrid *ug = vtkUniformGrid::New(); + // new svtk uniform amrMesh, node centered + svtkUniformGrid *ug = svtkUniformGrid::New(); ug->SetOrigin(origin); ug->SetSpacing(spacing); ug->SetExtent(nboxLo[0], nboxHi[0], nboxLo[1], nboxHi[1], nboxLo[2], nboxHi[2]); - // pass the block into vtk + // pass the block into svtk amrMesh->SetDataSet(i, j, ug); ug->Delete(); } @@ -722,7 +722,7 @@ int AmrDataAdaptor::GetMesh(const std::string &meshName, } //----------------------------------------------------------------------------- -int AmrDataAdaptor::AddGhostCellsArray(vtkDataObject* mesh, +int AmrDataAdaptor::AddGhostCellsArray(svtkDataObject* mesh, const std::string &meshName) { sensei::TimeEvent<64> event("AmrDataAdaptor::AddGhostCellsArray"); @@ -733,7 +733,7 @@ int AmrDataAdaptor::AddGhostCellsArray(vtkDataObject* mesh, return -1; } - vtkOverlappingAMR *amrMesh = dynamic_cast(mesh); + svtkOverlappingAMR *amrMesh = dynamic_cast(mesh); if (!amrMesh) { SENSEI_ERROR("Invalid mesh type " @@ -780,7 +780,7 @@ int AmrDataAdaptor::AddGhostCellsArray(vtkDataObject* mesh, if (dMap[j] != rank) continue; - vtkUniformGrid *blockMesh = amrMesh->GetDataSet(i, j); + svtkUniformGrid *blockMesh = amrMesh->GetDataSet(i, j); if (!blockMesh) { @@ -790,24 +790,24 @@ int AmrDataAdaptor::AddGhostCellsArray(vtkDataObject* mesh, long nCells = blockMesh->GetNumberOfCells(); - // transfer mask array into vtk - vtkUnsignedCharArray *ga = vtkUnsignedCharArray::New(); - ga->SetName("vtkGhostType"); + // transfer mask array into svtk + svtkUnsignedCharArray *ga = svtkUnsignedCharArray::New(); + ga->SetName("svtkGhostType"); ga->SetArray(mask[j], nCells, 0); blockMesh->GetCellData()->AddArray(ga); ga->Delete(); // for debug can visualize the ghost cells // FIXME -- a bug in Catalyst ignores internal ghost zones - // when using the VTK writrer. Until that bug gets fixed, one + // when using the SVTK writrer. 
Until that bug gets fixed, one // can manually inject this copy using a PV Python filter - ga = vtkUnsignedCharArray::New(); + ga = svtkUnsignedCharArray::New(); ga->SetName("GhostType"); ga->SetArray(mask[j], nCells, 1); blockMesh->GetCellData()->AddArray(ga); ga->Delete(); - // because VTK takes ownership + // because SVTK takes ownership mask[j] = nullptr; } } @@ -816,7 +816,7 @@ int AmrDataAdaptor::AddGhostCellsArray(vtkDataObject* mesh, } //----------------------------------------------------------------------------- -int AmrDataAdaptor::AddGhostNodesArray(vtkDataObject *mesh, +int AmrDataAdaptor::AddGhostNodesArray(svtkDataObject *mesh, const std::string &meshName) { amrex::ignore_unused(mesh); @@ -834,7 +834,7 @@ int AmrDataAdaptor::AddGhostNodesArray(vtkDataObject *mesh, } //----------------------------------------------------------------------------- -int AmrDataAdaptor::AddArray(vtkDataObject* mesh, const std::string &meshName, +int AmrDataAdaptor::AddArray(svtkDataObject* mesh, const std::string &meshName, int association, const std::string &arrayName) { sensei::TimeEvent<64> event("AmrDataAdaptor::AddArray"); @@ -848,7 +848,7 @@ int AmrDataAdaptor::AddArray(vtkDataObject* mesh, const std::string &meshName, return -1; } - vtkOverlappingAMR *amrMesh = dynamic_cast(mesh); + svtkOverlappingAMR *amrMesh = dynamic_cast(mesh); if (!amrMesh) { SENSEI_ERROR("Invalid mesh type " @@ -861,8 +861,8 @@ int AmrDataAdaptor::AddArray(vtkDataObject* mesh, const std::string &meshName, return -1; } - if ((association != vtkDataObject::CELL) && - (association != vtkDataObject::POINT)) + if ((association != svtkDataObject::CELL) && + (association != svtkDataObject::POINT)) { SENSEI_ERROR("Invalid association " << association) return -1; @@ -878,7 +878,7 @@ int AmrDataAdaptor::AddArray(vtkDataObject* mesh, const std::string &meshName, if (this->Internals->SimMetadata.GetIndex(arrayName, association, fab, comp)) { SENSEI_ERROR("Failed to locate descriptor for " - << sensei::VTKUtils::GetAttributesName(association) + << sensei::SVTKUtils::GetAttributesName(association) << " data array \"" << arrayName << "\"") return -1; } @@ -894,8 +894,8 @@ int AmrDataAdaptor::AddArray(vtkDataObject* mesh, const std::string &meshName, amrex::MultiFab& state = levels[i]->get_new_data(fab); unsigned int ng = state.nGrow(); - if (!((association == vtkDataObject::CELL) && state.is_cell_centered()) && - !((association == vtkDataObject::POINT) && state.is_nodal())) + if (!((association == svtkDataObject::CELL) && state.is_cell_centered()) && + !((association == svtkDataObject::POINT) && state.is_nodal())) { SENSEI_ERROR("association does not match MultiFAB centering") return -1; @@ -926,7 +926,7 @@ int AmrDataAdaptor::AddArray(vtkDataObject* mesh, const std::string &meshName, int cboxLo[3] = {AMREX_ARLIM(cbox.loVect())}; int cboxHi[3] = {AMREX_ARLIM(cbox.hiVect())}; - // skip building a vtk mesh for the non local boxes + // skip building a svtk mesh for the non local boxes if (dmap[j] != rank) continue; @@ -938,7 +938,7 @@ int AmrDataAdaptor::AddArray(vtkDataObject* mesh, const std::string &meshName, int nboxHi[3] = {AMREX_ARLIM(nbox.hiVect())}; // get the block mesh - vtkUniformGrid *ug = amrMesh->GetDataSet(i, j); + svtkUniformGrid *ug = amrMesh->GetDataSet(i, j); // node centered size long nlen = 1; @@ -953,9 +953,9 @@ int AmrDataAdaptor::AddArray(vtkDataObject* mesh, const std::string &meshName, // pointer to the data amrex_real *pcd = state[j].dataPtr(comp); - // allocate vtk array - InSituUtils::amrex_tt::vtk_type 
*da = - InSituUtils::amrex_tt::vtk_type::New(); + // allocate svtk array + InSituUtils::amrex_tt::svtk_type *da = + InSituUtils::amrex_tt::svtk_type::New(); // set component name da->SetName(arrayName.c_str()); @@ -981,7 +981,7 @@ int AmrDataAdaptor::AddArray(vtkDataObject* mesh, const std::string &meshName, #if defined(SENSEI_DEBUG) // mark level id - vtkFloatArray *la = vtkFloatArray::New(); + svtkFloatArray *la = svtkFloatArray::New(); la->SetName("amrex_level_id"); la->SetNumberOfTuples(clen); la->Fill(i); @@ -989,7 +989,7 @@ int AmrDataAdaptor::AddArray(vtkDataObject* mesh, const std::string &meshName, la->Delete(); // mark mpi rank - vtkFloatArray *ra = vtkFloatArray::New(); + svtkFloatArray *ra = svtkFloatArray::New(); ra->SetName("amrex_mpi_rank"); ra->SetNumberOfTuples(clen); ra->Fill(rank); diff --git a/Src/Extern/SENSEI/AMReX_AmrInSituBridge.cpp b/Src/Extern/SENSEI/AMReX_AmrInSituBridge.cpp index 26f63d2a101..018669a4bfc 100644 --- a/Src/Extern/SENSEI/AMReX_AmrInSituBridge.cpp +++ b/Src/Extern/SENSEI/AMReX_AmrInSituBridge.cpp @@ -29,7 +29,7 @@ AmrInSituBridge::update(Amr *dataSource) data_adaptor->SetDataSource(dataSource); data_adaptor->SetDataTime(dataSource->cumTime()); data_adaptor->SetDataTimeStep(dataSource->levelSteps(0)); - ret = analysis_adaptor->Execute(data_adaptor) ? 0 : -1; + ret = analysis_adaptor->Execute(data_adaptor, nullptr) ? 0 : -1; data_adaptor->ReleaseData(); data_adaptor->Delete(); diff --git a/Src/Extern/SENSEI/AMReX_AmrMeshDataAdaptor.H b/Src/Extern/SENSEI/AMReX_AmrMeshDataAdaptor.H index 5a8a88552af..54277505bd4 100644 --- a/Src/Extern/SENSEI/AMReX_AmrMeshDataAdaptor.H +++ b/Src/Extern/SENSEI/AMReX_AmrMeshDataAdaptor.H @@ -34,10 +34,10 @@ public: int GetNumberOfArrays(const std::string &meshName, int association, unsigned int &numberOfArrays) override; int GetArrayName(const std::string &meshName, int association, unsigned int index, std::string &arrayName) override; #endif - int GetMesh(const std::string &meshName, bool structureOnly, vtkDataObject *&mesh) override; - int AddGhostNodesArray(vtkDataObject* mesh, const std::string &meshName) override; - int AddGhostCellsArray(vtkDataObject* mesh, const std::string &meshName) override; - int AddArray(vtkDataObject* mesh, const std::string &meshName, int association, const std::string &arrayName) override; + int GetMesh(const std::string &meshName, bool structureOnly, svtkDataObject *&mesh) override; + int AddGhostNodesArray(svtkDataObject* mesh, const std::string &meshName) override; + int AddGhostCellsArray(svtkDataObject* mesh, const std::string &meshName) override; + int AddArray(svtkDataObject* mesh, const std::string &meshName, int association, const std::string &arrayName) override; int ReleaseData() override; protected: diff --git a/Src/Extern/SENSEI/AMReX_AmrMeshDataAdaptor.cpp b/Src/Extern/SENSEI/AMReX_AmrMeshDataAdaptor.cpp index 2e4968cc8b2..34b92c1d25d 100644 --- a/Src/Extern/SENSEI/AMReX_AmrMeshDataAdaptor.cpp +++ b/Src/Extern/SENSEI/AMReX_AmrMeshDataAdaptor.cpp @@ -2,18 +2,18 @@ #include "Profiler.h" #include "Error.h" -#include "VTKUtils.h" - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include +#include "SVTKUtils.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include #include #include @@ -58,11 +58,11 @@ int MeshStateMap::Initialize( if (state.is_cell_centered()) { - this->Map[vtkDataObject::CELL][arrayName] = std::make_pair(i,j); + this->Map[svtkDataObject::CELL][arrayName] = 
std::make_pair(i,j); } else if (state.is_nodal()) { - this->Map[vtkDataObject::POINT][arrayName] = std::make_pair(i,j); + this->Map[svtkDataObject::POINT][arrayName] = std::make_pair(i,j); } } } @@ -83,7 +83,7 @@ struct AmrMeshDataAdaptor::InternalsType std::vector> Names; amrex::InSituUtils::MeshStateMap StateMetadata; #if SENSEI_VERSION_MAJOR < 3 - std::vector ManagedObjects; + std::vector ManagedObjects; #endif }; @@ -149,13 +149,13 @@ int AmrMeshDataAdaptor::GetMeshMetadata(unsigned int id, metadata->GlobalView = true; metadata->MeshName = "mesh"; - metadata->MeshType = VTK_OVERLAPPING_AMR; - metadata->BlockType = VTK_UNIFORM_GRID; + metadata->MeshType = SVTK_OVERLAPPING_AMR; + metadata->BlockType = SVTK_UNIFORM_GRID; metadata->NumBlocks = 0; metadata->NumCells = 0; metadata->NumPoints = 0; metadata->NumBlocksLocal = {-1}; - metadata->CoordinateType = InSituUtils::amrex_tt::vtk_type_enum(); + metadata->CoordinateType = InSituUtils::amrex_tt::svtk_type_enum(); metadata->StaticMesh = 0; // num levels @@ -224,7 +224,7 @@ int AmrMeshDataAdaptor::GetMeshMetadata(unsigned int id, {pdLo[0], pdHi[0], pdLo[1], pdHi[1], pdLo[2], pdHi[2]}); } - // global extent (note: VTK uses point centered indexing) + // global extent (note: SVTK uses point centered indexing) const amrex::Box& cdom = this->Internals->Mesh->Geom(0).Domain(); amrex::Box ndom = surroundingNodes(cdom); @@ -261,19 +261,19 @@ int AmrMeshDataAdaptor::GetMeshMetadata(unsigned int id, // scalar, vector, tensor metadata->ArrayComponents[j] = 1; // POD type - metadata->ArrayType[j] = InSituUtils::amrex_tt::vtk_type_enum(); + metadata->ArrayType[j] = InSituUtils::amrex_tt::svtk_type_enum(); // mesh centering if (state0.is_cell_centered()) { - metadata->ArrayCentering[j] = vtkDataObject::CELL; + metadata->ArrayCentering[j] = svtkDataObject::CELL; } else if (state0.is_nodal()) { - metadata->ArrayCentering[j] = vtkDataObject::POINT; + metadata->ArrayCentering[j] = svtkDataObject::POINT; } else { - metadata->ArrayCentering[j] = vtkDataObject::FIELD; + metadata->ArrayCentering[j] = svtkDataObject::FIELD; } } @@ -396,8 +396,8 @@ int AmrMeshDataAdaptor::GetNumberOfArrays(const std::string &meshName, return -1; } - if ((association != vtkDataObject::POINT) && - (association != vtkDataObject::CELL)) + if ((association != svtkDataObject::POINT) && + (association != svtkDataObject::CELL)) { SENSEI_ERROR("Invalid association " << association) return -1; @@ -427,7 +427,7 @@ int AmrMeshDataAdaptor::GetArrayName(const std::string &meshName, if (this->Internals->StateMetadata.GetName(association, index, arrayName)) { SENSEI_ERROR("No array named \"" << arrayName << "\" in " - << sensei::VTKUtils::GetAttributesName(association) + << sensei::SVTKUtils::GetAttributesName(association) << " data") return -1; } @@ -475,7 +475,7 @@ int AmrMeshDataAdaptor::GetMeshHasGhostCells(const std::string &meshName, int &n //----------------------------------------------------------------------------- int AmrMeshDataAdaptor::GetMesh(const std::string &meshName, - bool structureOnly, vtkDataObject *&mesh) + bool structureOnly, svtkDataObject *&mesh) { amrex::ignore_unused(structureOnly); @@ -498,8 +498,8 @@ int AmrMeshDataAdaptor::GetMesh(const std::string &meshName, int nLevels = this->Internals->Mesh->finestLevel() + 1; - // initialize new vtk datasets - vtkOverlappingAMR *amrMesh = vtkOverlappingAMR::New(); + // initialize new svtk datasets + svtkOverlappingAMR *amrMesh = svtkOverlappingAMR::New(); #if SENSEI_VERSION_MAJOR < 3 Internals->ManagedObjects.push_back(amrMesh); 
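
Both data adaptors assemble the same structure in GetMesh: an svtkOverlappingAMR container holding box metadata for every block globally, with an svtkUniformGrid allocated only for rank-local blocks. The sketch below condenses that assembly for a single level and a single block. It assumes SVTK (SENSEI's vendored fork of VTK, with classes renamed under an "s" prefix) mirrors the VTK API one-for-one; the header names, the Initialize() call, and the buildLevel0() helper are inferred for illustration and do not appear in this patch.

```c++
#include <svtkOverlappingAMR.h>  // assumed header names, mirroring VTK's
#include <svtkUniformGrid.h>
#include <svtkAMRBox.h>

// hypothetical helper: one level, one block, following the calls used above
svtkOverlappingAMR* buildLevel0(const double origin[3], const double spacing[3],
                               const int lo[3], const int hi[3])
{
    svtkOverlappingAMR* amr = svtkOverlappingAMR::New();
    int blocksPerLevel[1] = {1};
    amr->Initialize(1, blocksPerLevel);          // assumed from VTK's API

    svtkAMRBox block(lo, hi);                    // cell-centered box metadata
    amr->SetAMRBox(0, 0, block);
    amr->SetAMRBlockSourceIndex(0, 0, 0);

    svtkUniformGrid* ug = svtkUniformGrid::New(); // node-centered extents,
    ug->SetOrigin(origin);                       // hence the +1 below
    ug->SetSpacing(spacing);
    ug->SetExtent(lo[0], hi[0]+1, lo[1], hi[1]+1, lo[2], hi[2]+1);

    amr->SetDataSet(0, 0, ug);
    ug->Delete();   // the container now holds the only needed reference
    return amr;     // caller must Delete(); with SENSEI < 3 the adaptor
}                   // tracks it in ManagedObjects and frees it in ReleaseData
```

The New()/Delete() pairs are SVTK's manual reference counting, which is why the pre-SENSEI-3 code path pushes every created object onto ManagedObjects instead of either leaking it or freeing it while the analysis still holds a pointer.
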
#endif @@ -560,12 +560,12 @@ int AmrMeshDataAdaptor::GetMesh(const std::string &meshName, int cboxLo[3] = {AMREX_ARLIM(cbox.loVect())}; int cboxHi[3] = {AMREX_ARLIM(cbox.hiVect())}; - // vtk's representation of box metadata - vtkAMRBox block(cboxLo, cboxHi); + // svtk's representation of box metadata + svtkAMRBox block(cboxLo, cboxHi); amrMesh->SetAMRBox(i, j, block); amrMesh->SetAMRBlockSourceIndex(i, j, gid++); - // skip building a vtk amrMesh for the non local boxes + // skip building a svtk amrMesh for the non local boxes if (dmap[j] != rank) continue; @@ -580,14 +580,14 @@ int AmrMeshDataAdaptor::GetMesh(const std::string &meshName, int nboxLo[3] = {AMREX_ARLIM(nbox.loVect())}; int nboxHi[3] = {AMREX_ARLIM(nbox.hiVect())}; - // new vtk uniform amrMesh, node centered - vtkUniformGrid *ug = vtkUniformGrid::New(); + // new svtk uniform amrMesh, node centered + svtkUniformGrid *ug = svtkUniformGrid::New(); ug->SetOrigin(origin); ug->SetSpacing(spacing); ug->SetExtent(nboxLo[0], nboxHi[0], nboxLo[1], nboxHi[1], nboxLo[2], nboxHi[2]); - // pass the block into vtk + // pass the block into svtk amrMesh->SetDataSet(i, j, ug); ug->Delete(); } @@ -597,7 +597,7 @@ int AmrMeshDataAdaptor::GetMesh(const std::string &meshName, } //----------------------------------------------------------------------------- -int AmrMeshDataAdaptor::AddGhostNodesArray(vtkDataObject *mesh, +int AmrMeshDataAdaptor::AddGhostNodesArray(svtkDataObject *mesh, const std::string &meshName) { amrex::ignore_unused(mesh); @@ -613,7 +613,7 @@ int AmrMeshDataAdaptor::AddGhostNodesArray(vtkDataObject *mesh, } //----------------------------------------------------------------------------- -int AmrMeshDataAdaptor::AddGhostCellsArray(vtkDataObject* mesh, +int AmrMeshDataAdaptor::AddGhostCellsArray(svtkDataObject* mesh, const std::string &meshName) { if (meshName != "mesh") @@ -622,7 +622,7 @@ int AmrMeshDataAdaptor::AddGhostCellsArray(vtkDataObject* mesh, return -1; } - vtkOverlappingAMR *amrMesh = dynamic_cast(mesh); + svtkOverlappingAMR *amrMesh = dynamic_cast(mesh); if (!amrMesh) { SENSEI_ERROR("Invalid mesh type " @@ -701,7 +701,7 @@ int AmrMeshDataAdaptor::AddGhostCellsArray(vtkDataObject* mesh, if (dmap[j] != rank) continue; - vtkUniformGrid *blockMesh = amrMesh->GetDataSet(i, j); + svtkUniformGrid *blockMesh = amrMesh->GetDataSet(i, j); if (!blockMesh) { @@ -711,18 +711,18 @@ int AmrMeshDataAdaptor::AddGhostCellsArray(vtkDataObject* mesh, long nCells = blockMesh->GetNumberOfCells(); - // transfer mask array into vtk - vtkUnsignedCharArray *ga = vtkUnsignedCharArray::New(); - ga->SetName("vtkGhostType"); + // transfer mask array into svtk + svtkUnsignedCharArray *ga = svtkUnsignedCharArray::New(); + ga->SetName("svtkGhostType"); ga->SetArray(mask[j], nCells, 0); blockMesh->GetCellData()->AddArray(ga); ga->Delete(); // for debug can visualize the ghost cells // FIXME -- a bug in Catalyst ignores internal ghost zones - // when using the VTK writrer. Until that bug gets fixed, one + // when using the SVTK writer. 
Until that bug gets fixed, one // can manually inject this copy using a PV Python filter - ga = vtkUnsignedCharArray::New(); + ga = svtkUnsignedCharArray::New(); ga->SetName("GhostType"); ga->SetArray(mask[j], nCells, 1); blockMesh->GetCellData()->AddArray(ga); @@ -734,7 +734,7 @@ int AmrMeshDataAdaptor::AddGhostCellsArray(vtkDataObject* mesh, } //----------------------------------------------------------------------------- -int AmrMeshDataAdaptor::AddArray(vtkDataObject* mesh, +int AmrMeshDataAdaptor::AddArray(svtkDataObject* mesh, const std::string &meshName, int association, const std::string &arrayName) { @@ -747,7 +747,7 @@ int AmrMeshDataAdaptor::AddArray(vtkDataObject* mesh, return -1; } - vtkOverlappingAMR *amrMesh = dynamic_cast(mesh); + svtkOverlappingAMR *amrMesh = dynamic_cast(mesh); if (!amrMesh) { SENSEI_ERROR("Invalid mesh type " @@ -760,8 +760,8 @@ int AmrMeshDataAdaptor::AddArray(vtkDataObject* mesh, return -1; } - if ((association != vtkDataObject::CELL) && - (association != vtkDataObject::CELL)) + if ((association != svtkDataObject::CELL) && + (association != svtkDataObject::CELL)) { SENSEI_ERROR("Invalid association " << association) return -1; @@ -774,7 +774,7 @@ int AmrMeshDataAdaptor::AddArray(vtkDataObject* mesh, if (this->Internals->StateMetadata.GetIndex(arrayName, association, fab, comp)) { SENSEI_ERROR("Failed to locate descriptor for " - << sensei::VTKUtils::GetAttributesName(association) + << sensei::SVTKUtils::GetAttributesName(association) << " data array \"" << arrayName << "\"") return -1; } @@ -792,8 +792,8 @@ int AmrMeshDataAdaptor::AddArray(vtkDataObject* mesh, unsigned int ng = state.nGrow(); // check centering - if (!((association == vtkDataObject::CELL) && state.is_cell_centered()) && - !((association == vtkDataObject::POINT) && state.is_nodal())) + if (!((association == svtkDataObject::CELL) && state.is_cell_centered()) && + !((association == svtkDataObject::POINT) && state.is_nodal())) { SENSEI_ERROR("association does not match MultiFab centering") return -1; @@ -824,7 +824,7 @@ int AmrMeshDataAdaptor::AddArray(vtkDataObject* mesh, int cboxLo[3] = {AMREX_ARLIM(cbox.loVect())}; int cboxHi[3] = {AMREX_ARLIM(cbox.hiVect())}; - // skip building a vtk mesh for the non local boxes + // skip building a svtk mesh for the non local boxes if (dmap[j] != rank) continue; @@ -836,7 +836,7 @@ int AmrMeshDataAdaptor::AddArray(vtkDataObject* mesh, int nboxHi[3] = {AMREX_ARLIM(nbox.hiVect())}; // get the block mesh - vtkUniformGrid *ug = amrMesh->GetDataSet(i, j); + svtkUniformGrid *ug = amrMesh->GetDataSet(i, j); // node centered size long nlen = 1; @@ -851,9 +851,9 @@ int AmrMeshDataAdaptor::AddArray(vtkDataObject* mesh, // pointer to the data amrex_real *pcd = state[j].dataPtr(comp); - // allocate vtk array - InSituUtils::amrex_tt::vtk_type *da = - InSituUtils::amrex_tt::vtk_type::New(); + // allocate svtk array + InSituUtils::amrex_tt::svtk_type *da = + InSituUtils::amrex_tt::svtk_type::New(); // set component name da->SetName(arrayName.c_str()); @@ -879,7 +879,7 @@ int AmrMeshDataAdaptor::AddArray(vtkDataObject* mesh, #if defined(SENSEI_DEBUG) // mark level id - vtkFloatArray *la = vtkFloatArray::New(); + svtkFloatArray *la = svtkFloatArray::New(); la->SetName("amrex_level_id"); la->SetNumberOfTuples(clen); la->Fill(i); @@ -887,7 +887,7 @@ int AmrMeshDataAdaptor::AddArray(vtkDataObject* mesh, la->Delete(); // mark mpi rank - vtkFloatArray *ra = vtkFloatArray::New(); + svtkFloatArray *ra = svtkFloatArray::New(); ra->SetName("amrex_mpi_rank"); 
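
Two notes on the AmrMeshDataAdaptor hunks above. First, the association guard in AddArray tests svtkDataObject::CELL twice, on both sides of the rename; the AmrDataAdaptor::AddArray version earlier in this patch tests CELL and then POINT, which is presumably what was intended here as well. Second, the SENSEI_DEBUG block being renamed here tags every local block with constant cell arrays recording its AMR level and owning MPI rank, which makes the domain decomposition directly visible in ParaView. A minimal sketch of that tagging pattern follows; tagBlock and the header names are illustrative assumptions, and the svtkFloatArray/svtkCellData API is assumed to mirror VTK's.

```c++
#include <svtkUniformGrid.h>  // assumed header names
#include <svtkFloatArray.h>
#include <svtkCellData.h>

// hypothetical helper condensing the SENSEI_DEBUG pattern above
void tagBlock(svtkUniformGrid* ug, int level, int rank)
{
    long nCells = ug->GetNumberOfCells();

    svtkFloatArray* la = svtkFloatArray::New();
    la->SetName("amrex_level_id");
    la->SetNumberOfTuples(nCells);
    la->Fill(level);                   // one constant value per cell
    ug->GetCellData()->AddArray(la);
    la->Delete();

    svtkFloatArray* ra = svtkFloatArray::New();
    ra->SetName("amrex_mpi_rank");
    ra->SetNumberOfTuples(nCells);
    ra->Fill(rank);
    ug->GetCellData()->AddArray(ra);
    ra->Delete();
}
```
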
ra->SetNumberOfTuples(clen); ra->Fill(rank); diff --git a/Src/Extern/SENSEI/AMReX_AmrMeshInSituBridge.cpp b/Src/Extern/SENSEI/AMReX_AmrMeshInSituBridge.cpp index 55adb1b5c59..cd6b6794171 100644 --- a/Src/Extern/SENSEI/AMReX_AmrMeshInSituBridge.cpp +++ b/Src/Extern/SENSEI/AMReX_AmrMeshInSituBridge.cpp @@ -35,7 +35,7 @@ AmrMeshInSituBridge::update(unsigned int step, double time, data_adaptor->SetDataSource(mesh, states, names); data_adaptor->SetDataTime(time); data_adaptor->SetDataTimeStep(step); - ret = analysis_adaptor->Execute(data_adaptor) ? 0 : -1; + ret = analysis_adaptor->Execute(data_adaptor, nullptr) ? 0 : -1; data_adaptor->ReleaseData(); data_adaptor->Delete(); diff --git a/Src/Extern/SENSEI/AMReX_AmrMeshParticleDataAdaptor.H b/Src/Extern/SENSEI/AMReX_AmrMeshParticleDataAdaptor.H index 61e4d510745..fbd5227824f 100644 --- a/Src/Extern/SENSEI/AMReX_AmrMeshParticleDataAdaptor.H +++ b/Src/Extern/SENSEI/AMReX_AmrMeshParticleDataAdaptor.H @@ -45,10 +45,10 @@ public: int GetArrayName(const std::string &meshName, int association, unsigned int index, std::string &arrayName) override; #endif int GetNumberOfMeshes(unsigned int &numMeshes) override; - int GetMesh(const std::string &meshName, bool structureOnly, vtkDataObject *&mesh) override; - int AddGhostNodesArray(vtkDataObject* mesh, const std::string &meshName) override; - int AddGhostCellsArray(vtkDataObject* mesh, const std::string &meshName) override; - int AddArray(vtkDataObject* mesh, const std::string &meshName, int association, const std::string &arrayName) override; + int GetMesh(const std::string &meshName, bool structureOnly, svtkDataObject *&mesh) override; + int AddGhostNodesArray(svtkDataObject* mesh, const std::string &meshName) override; + int AddGhostCellsArray(svtkDataObject* mesh, const std::string &meshName) override; + int AddArray(svtkDataObject* mesh, const std::string &meshName, int association, const std::string &arrayName) override; int ReleaseData() override; protected: diff --git a/Src/Extern/SENSEI/AMReX_AmrMeshParticleDataAdaptorI.H b/Src/Extern/SENSEI/AMReX_AmrMeshParticleDataAdaptorI.H index a93357d5043..4cbb53203b6 100644 --- a/Src/Extern/SENSEI/AMReX_AmrMeshParticleDataAdaptorI.H +++ b/Src/Extern/SENSEI/AMReX_AmrMeshParticleDataAdaptorI.H @@ -148,7 +148,7 @@ template int AmrMeshParticleDataAdaptor::GetMesh( const std::string &meshName, bool structureOnly, - vtkDataObject *&mesh) + svtkDataObject *&mesh) { if(meshName == m_meshName) { @@ -164,7 +164,7 @@ int AmrMeshParticleDataAdaptor:: template int AmrMeshParticleDataAdaptor::AddGhostNodesArray( - vtkDataObject* mesh, + svtkDataObject* mesh, const std::string &meshName) { if(meshName == m_meshName) @@ -181,7 +181,7 @@ int AmrMeshParticleDataAdaptor:: template int AmrMeshParticleDataAdaptor::AddGhostCellsArray( - vtkDataObject* mesh, + svtkDataObject* mesh, const std::string &meshName) { if(meshName == m_meshName) @@ -198,7 +198,7 @@ int AmrMeshParticleDataAdaptor:: template int AmrMeshParticleDataAdaptor::AddArray( - vtkDataObject* mesh, + svtkDataObject* mesh, const std::string &meshName, int association, const std::string &arrayName) diff --git a/Src/Extern/SENSEI/AMReX_AmrMeshParticleInSituBridge.H b/Src/Extern/SENSEI/AMReX_AmrMeshParticleInSituBridge.H index bede5908cdc..9208c8a753b 100644 --- a/Src/Extern/SENSEI/AMReX_AmrMeshParticleInSituBridge.H +++ b/Src/Extern/SENSEI/AMReX_AmrMeshParticleInSituBridge.H @@ -86,7 +86,7 @@ int AmrMeshParticleInSituBridge::update( data_adaptor->SetDataTime(time); data_adaptor->SetDataTimeStep(step); - ret = 
analysis_adaptor->Execute(data_adaptor) ? 0 : -1; + ret = analysis_adaptor->Execute(data_adaptor, nullptr) ? 0 : -1; data_adaptor->ReleaseData(); data_adaptor->Delete(); diff --git a/Src/Extern/SENSEI/AMReX_AmrParticleDataAdaptor.H b/Src/Extern/SENSEI/AMReX_AmrParticleDataAdaptor.H index 886a7df6d18..3f7a945e019 100644 --- a/Src/Extern/SENSEI/AMReX_AmrParticleDataAdaptor.H +++ b/Src/Extern/SENSEI/AMReX_AmrParticleDataAdaptor.H @@ -43,10 +43,10 @@ public: int GetArrayName(const std::string &meshName, int association, unsigned int index, std::string &arrayName) override; #endif int GetNumberOfMeshes(unsigned int &numMeshes) override; - int GetMesh(const std::string &meshName, bool structureOnly, vtkDataObject *&mesh) override; - int AddGhostNodesArray(vtkDataObject* mesh, const std::string &meshName) override; - int AddGhostCellsArray(vtkDataObject* mesh, const std::string &meshName) override; - int AddArray(vtkDataObject* mesh, const std::string &meshName, int association, const std::string &arrayName) override; + int GetMesh(const std::string &meshName, bool structureOnly, svtkDataObject *&mesh) override; + int AddGhostNodesArray(svtkDataObject* mesh, const std::string &meshName) override; + int AddGhostCellsArray(svtkDataObject* mesh, const std::string &meshName) override; + int AddArray(svtkDataObject* mesh, const std::string &meshName, int association, const std::string &arrayName) override; int ReleaseData() override; protected: diff --git a/Src/Extern/SENSEI/AMReX_AmrParticleDataAdaptorI.H b/Src/Extern/SENSEI/AMReX_AmrParticleDataAdaptorI.H index 813466fc0f8..9035cd0c39c 100644 --- a/Src/Extern/SENSEI/AMReX_AmrParticleDataAdaptorI.H +++ b/Src/Extern/SENSEI/AMReX_AmrParticleDataAdaptorI.H @@ -146,7 +146,7 @@ template int AmrParticleDataAdaptor::GetMesh( const std::string &meshName, bool structureOnly, - vtkDataObject *&mesh) + svtkDataObject *&mesh) { if(meshName == m_meshName) { @@ -162,7 +162,7 @@ int AmrParticleDataAdaptor::GetM template int AmrParticleDataAdaptor::AddGhostNodesArray( - vtkDataObject* mesh, + svtkDataObject* mesh, const std::string &meshName) { if(meshName == m_meshName) @@ -179,7 +179,7 @@ int AmrParticleDataAdaptor::AddG template int AmrParticleDataAdaptor::AddGhostCellsArray( - vtkDataObject* mesh, + svtkDataObject* mesh, const std::string &meshName) { if(meshName == m_meshName) @@ -196,7 +196,7 @@ int AmrParticleDataAdaptor::AddG template int AmrParticleDataAdaptor::AddArray( - vtkDataObject* mesh, + svtkDataObject* mesh, const std::string &meshName, int association, const std::string &arrayName) diff --git a/Src/Extern/SENSEI/AMReX_InSituUtils.H b/Src/Extern/SENSEI/AMReX_InSituUtils.H index e7c212d7b4e..2799e21b367 100644 --- a/Src/Extern/SENSEI/AMReX_InSituUtils.H +++ b/Src/Extern/SENSEI/AMReX_InSituUtils.H @@ -2,10 +2,10 @@ #define AMReX_InSituUtils_H #include -#include -#include -#include -#include +#include +#include +#include +#include #include #include @@ -47,21 +47,21 @@ enum PointGhostTypes -// traits helper for mapping between amrex_real and vtkDataArray +// traits helper for mapping between amrex_real and svtkDataArray template struct amrex_tt {}; -#define amrex_tt_specialize(cpp_t, vtk_t, vtk_t_e) \ +#define amrex_tt_specialize(cpp_t, svtk_t, svtk_t_e) \ template <> \ struct amrex_tt \ { \ - using vtk_type = vtk_t; \ + using svtk_type = svtk_t; \ \ static \ - constexpr int vtk_type_enum() { return vtk_t_e; } \ + constexpr int svtk_type_enum() { return svtk_t_e; } \ }; -amrex_tt_specialize(float, vtkFloatArray, VTK_FLOAT) -amrex_tt_specialize(double, 
vtkDoubleArray, VTK_DOUBLE) +amrex_tt_specialize(float, svtkFloatArray, SVTK_FLOAT) +amrex_tt_specialize(double, svtkDoubleArray, SVTK_DOUBLE) // helpers to modify values diff --git a/Src/Extern/SENSEI/AMReX_InSituUtils.cpp b/Src/Extern/SENSEI/AMReX_InSituUtils.cpp index 64c429e8bb6..d13c8738aeb 100644 --- a/Src/Extern/SENSEI/AMReX_InSituUtils.cpp +++ b/Src/Extern/SENSEI/AMReX_InSituUtils.cpp @@ -1,7 +1,7 @@ #include "AMReX_InSituUtils.H" #include "Error.h" -#include "VTKUtils.h" +#include "SVTKUtils.h" namespace amrex { namespace InSituUtils { @@ -14,7 +14,7 @@ int StateMap::GetIndex(const std::string &name, int centering, if (cit == this->Map.end()) { - SENSEI_ERROR("No " << sensei::VTKUtils::GetAttributesName(centering) + SENSEI_ERROR("No " << sensei::SVTKUtils::GetAttributesName(centering) << " arrays") return -1; } @@ -23,7 +23,7 @@ int StateMap::GetIndex(const std::string &name, int centering, if (nit == cit->second.end()) { SENSEI_ERROR("No array named \"" << name << "\" in " - << sensei::VTKUtils::GetAttributesName(centering) + << sensei::SVTKUtils::GetAttributesName(centering) << " centered data") return -1; } @@ -41,7 +41,7 @@ int StateMap::GetName(int centering, int id, std::string &name) if (cit == this->Map.end()) { - SENSEI_ERROR("No " << sensei::VTKUtils::GetAttributesName(centering) + SENSEI_ERROR("No " << sensei::SVTKUtils::GetAttributesName(centering) << " arrays") return -1; } diff --git a/Src/Extern/SENSEI/AMReX_ParticleDataAdaptor.H b/Src/Extern/SENSEI/AMReX_ParticleDataAdaptor.H index 73ca142ec0b..f284b15831b 100644 --- a/Src/Extern/SENSEI/AMReX_ParticleDataAdaptor.H +++ b/Src/Extern/SENSEI/AMReX_ParticleDataAdaptor.H @@ -8,7 +8,7 @@ #include #include -class vtkPolyData; +class svtkPolyData; namespace amrex { @@ -40,22 +40,22 @@ public: void SetPinMesh(int val); // get particle id numbers - int AddParticlesIDArray(vtkDataObject* mesh); + int AddParticlesIDArray(svtkDataObject* mesh); // get particle cpu numbers (process each particle was generated on) - int AddParticlesCPUArray(vtkDataObject* mesh); + int AddParticlesCPUArray(svtkDataObject* mesh); // get particle integer arrays in Structs of Arrays format - int AddParticlesSOAIntArray(const std::string &arrayName, vtkDataObject* mesh); + int AddParticlesSOAIntArray(const std::string &arrayName, svtkDataObject* mesh); // get particle real arrays in Structs of Arrays format - int AddParticlesSOARealArray(const std::string &arrayName, vtkDataObject* mesh); + int AddParticlesSOARealArray(const std::string &arrayName, svtkDataObject* mesh); // get particle integer arrays in Array Of Structs format - int AddParticlesAOSIntArray(const std::string &arrayName, vtkDataObject* mesh); + int AddParticlesAOSIntArray(const std::string &arrayName, svtkDataObject* mesh); // get particle real arrays in Array Of Structs format - int AddParticlesAOSRealArray(const std::string &arrayName, vtkDataObject* mesh); + int AddParticlesAOSRealArray(const std::string &arrayName, svtkDataObject* mesh); // SENSEI API #if SENSEI_VERSION_MAJOR >= 3 @@ -68,10 +68,10 @@ public: int GetArrayName(const std::string &meshName, int association, unsigned int index, std::string &arrayName) override; #endif int GetNumberOfMeshes(unsigned int &numMeshes) override; - int GetMesh(const std::string &meshName, bool structureOnly, vtkDataObject *&mesh) override; - int AddGhostNodesArray(vtkDataObject* mesh, const std::string &meshName) override; - int AddGhostCellsArray(vtkDataObject* mesh, const std::string &meshName) override; - int AddArray(vtkDataObject* mesh, 
const std::string &meshName, int association, const std::string &arrayName) override; + int GetMesh(const std::string &meshName, bool structureOnly, svtkDataObject *&mesh) override; + int AddGhostNodesArray(svtkDataObject* mesh, const std::string &meshName) override; + int AddGhostCellsArray(svtkDataObject* mesh, const std::string &meshName) override; + int AddArray(svtkDataObject* mesh, const std::string &meshName, int association, const std::string &arrayName) override; int ReleaseData() override; protected: @@ -79,7 +79,7 @@ protected: ~ParticleDataAdaptor() = default; private: - vtkPolyData* BuildParticles(); + svtkPolyData* BuildParticles(); const std::string m_particlesName = "particles"; diff --git a/Src/Extern/SENSEI/AMReX_ParticleDataAdaptorI.H b/Src/Extern/SENSEI/AMReX_ParticleDataAdaptorI.H index 26174f83e1b..8a2d15562d3 100644 --- a/Src/Extern/SENSEI/AMReX_ParticleDataAdaptorI.H +++ b/Src/Extern/SENSEI/AMReX_ParticleDataAdaptorI.H @@ -1,13 +1,13 @@ #include "Profiler.h" #include "Error.h" -#include "VTKUtils.h" +#include "SVTKUtils.h" #include "MeshMetadata.h" -// vtk includes -#include -#include -#include -#include -#include +// svtk includes +#include +#include +#include +#include +#include @@ -194,7 +194,7 @@ int ParticleDataAdaptor::GetNumb unsigned int &numberOfArrays) { numberOfArrays = 0; - if(association == vtkDataObject::POINT) + if(association == svtkDataObject::POINT) { numberOfArrays = m_realStructs.size() + m_intStructs.size() @@ -213,7 +213,7 @@ int ParticleDataAdaptor::GetArra unsigned int index, std::string &arrayName) { - if(association == vtkDataObject::POINT) + if(association == svtkDataObject::POINT) { if(index < m_realStructs.size()) { @@ -253,7 +253,7 @@ template int ParticleDataAdaptor::GetMesh( const std::string &meshName, bool structureOnly, - vtkDataObject *&mesh) + svtkDataObject *&mesh) { mesh = nullptr; int nprocs = 1; @@ -266,7 +266,7 @@ int ParticleDataAdaptor::GetMesh SENSEI_ERROR("No mesh named \"" << meshName << "\"") return -1; } - vtkMultiBlockDataSet* mb = vtkMultiBlockDataSet::New(); + svtkMultiBlockDataSet* mb = svtkMultiBlockDataSet::New(); if (structureOnly) { @@ -275,7 +275,7 @@ int ParticleDataAdaptor::GetMesh } mb->SetNumberOfBlocks(nprocs); - vtkPolyData *pd = BuildParticles(); + svtkPolyData *pd = BuildParticles(); mb->SetBlock(rank, pd); pd->Delete(); mesh = mb; @@ -286,7 +286,7 @@ int ParticleDataAdaptor::GetMesh //----------------------------------------------------------------------------- template int ParticleDataAdaptor::AddGhostNodesArray( - vtkDataObject*, + svtkDataObject*, const std::string &meshName) { if (meshName != m_particlesName) @@ -300,7 +300,7 @@ int ParticleDataAdaptor::AddGhos //----------------------------------------------------------------------------- template int ParticleDataAdaptor::AddGhostCellsArray( - vtkDataObject*, + svtkDataObject*, const std::string &meshName) { if (meshName != m_particlesName) @@ -314,7 +314,7 @@ int ParticleDataAdaptor::AddGhos //----------------------------------------------------------------------------- template int ParticleDataAdaptor::AddArray( - vtkDataObject* mesh, + svtkDataObject* mesh, const std::string &meshName, int association, const std::string &arrayName) @@ -325,7 +325,7 @@ int ParticleDataAdaptor::AddArra return -1; } - if (association != vtkDataObject::POINT) + if (association != svtkDataObject::POINT) { SENSEI_ERROR("Invalid association " << association); return -1; @@ -393,10 +393,10 @@ int ParticleDataAdaptor::GetMesh metadata->MeshName = m_particlesName; // 
container mesh type (all) - metadata->MeshType = VTK_MULTIBLOCK_DATA_SET; + metadata->MeshType = SVTK_MULTIBLOCK_DATA_SET; // block mesh type (all) - metadata->BlockType = VTK_POLY_DATA; + metadata->BlockType = SVTK_POLY_DATA; // global number of blocks (all) metadata->NumBlocks = nprocs; @@ -412,9 +412,9 @@ int ParticleDataAdaptor::GetMesh // type enum of point data (unstructured, optional) #ifdef AMREX_SINGLE_PRECISION_PARTICLES - metadata->CoordinateType = VTK_FLOAT; + metadata->CoordinateType = SVTK_FLOAT; #else - metadata->CoordinateType = VTK_DOUBLE; + metadata->CoordinateType = SVTK_DOUBLE; #endif // total number of points in all blocks (all, optional) @@ -467,19 +467,19 @@ int ParticleDataAdaptor::GetMesh metadata->ArrayCentering = {}; for(auto s : m_realStructs) { - metadata->ArrayCentering.push_back(vtkDataObject::POINT); + metadata->ArrayCentering.push_back(svtkDataObject::POINT); } for(auto s : m_intStructs) { - metadata->ArrayCentering.push_back(vtkDataObject::POINT); + metadata->ArrayCentering.push_back(svtkDataObject::POINT); } for(auto s : m_realArrays) { - metadata->ArrayCentering.push_back(vtkDataObject::POINT); + metadata->ArrayCentering.push_back(svtkDataObject::POINT); } for(auto s : m_intArrays) { - metadata->ArrayCentering.push_back(vtkDataObject::POINT); + metadata->ArrayCentering.push_back(svtkDataObject::POINT); } // number of components of each array (all) @@ -506,26 +506,26 @@ int ParticleDataAdaptor::GetMesh for(auto s : m_realStructs) { #ifdef AMREX_SINGLE_PRECISION_PARTICLES - metadata->ArrayType.push_back(VTK_FLOAT); + metadata->ArrayType.push_back(SVTK_FLOAT); #else - metadata->ArrayType.push_back(VTK_DOUBLE); + metadata->ArrayType.push_back(SVTK_DOUBLE); #endif } for(auto s : m_intStructs) { - metadata->ArrayType.push_back(VTK_INT); + metadata->ArrayType.push_back(SVTK_INT); } for(auto s : m_realArrays) { #ifdef AMREX_SINGLE_PRECISION_PARTICLES - metadata->ArrayType.push_back(VTK_FLOAT); + metadata->ArrayType.push_back(SVTK_FLOAT); #else - metadata->ArrayType.push_back(VTK_DOUBLE); + metadata->ArrayType.push_back(SVTK_DOUBLE); #endif } for(auto s : m_intArrays) { - metadata->ArrayType.push_back(VTK_INT); + metadata->ArrayType.push_back(SVTK_INT); } // global min,max of each array (all, optional) @@ -646,19 +646,19 @@ int ParticleDataAdaptor::GetMesh //----------------------------------------------------------------------------- template -vtkPolyData* ParticleDataAdaptor::BuildParticles() +svtkPolyData* ParticleDataAdaptor::BuildParticles() { // return particle data pd - vtkPolyData* pd = vtkPolyData::New(); + svtkPolyData* pd = svtkPolyData::New(); const auto& particles = this->m_particles->GetParticles(); long long numParticles = this->m_particles->TotalNumberOfParticles(true, true); // allocate vertex storage for particles #ifdef AMREX_SINGLE_PRECISION_PARTICLES - vtkNew coords; + svtkNew coords; #else - vtkNew coords; + svtkNew coords; #endif coords->SetName("coords"); coords->SetNumberOfComponents(3); @@ -669,12 +669,12 @@ vtkPolyData* ParticleDataAdaptor double *pCoords = coords->GetPointer(0); #endif - // use this to index into the VTK array as we copy level by level and tile by + // use this to index into the SVTK array as we copy level by level and tile by // tile long long ptId = 0; // allocate connectivity array for particles - vtkNew vertex; + svtkNew vertex; vertex->AllocateExact(numParticles, 1); // points->SetNumberOfPoints(numParticles); @@ -717,8 +717,8 @@ vtkPolyData* ParticleDataAdaptor } } - // pass the particle coordinates into VTK's 
point data structure. - vtkNew points; + // pass the particle coordinates into SVTK's point data structure. + svtkNew points; points->SetData(coords); // add point and vertex data to output mesh @@ -731,14 +731,14 @@ vtkPolyData* ParticleDataAdaptor //----------------------------------------------------------------------------- template int ParticleDataAdaptor::AddParticlesIDArray( - vtkDataObject* mesh) + svtkDataObject* mesh) { - auto vtk_particles = dynamic_cast(mesh); + auto svtk_particles = dynamic_cast(mesh); const auto& particles = this->m_particles->GetParticles(); auto nptsOnProc = this->m_particles->TotalNumberOfParticles(true, true); - // allocate a VTK array for the data - vtkNew idArray; + // allocate a SVTK array for the data + svtkNew idArray; idArray->SetName("id"); idArray->SetNumberOfComponents(1); idArray->SetNumberOfValues(nptsOnProc); @@ -767,8 +767,8 @@ int ParticleDataAdaptor::AddPart } } - // the association for this array is vtkDataObject::POINT - vtk_particles->GetPointData()->AddArray(idArray); + // the association for this array is svtkDataObject::POINT + svtk_particles->GetPointData()->AddArray(idArray); return 0; } @@ -776,14 +776,14 @@ int ParticleDataAdaptor::AddPart //----------------------------------------------------------------------------- template int ParticleDataAdaptor::AddParticlesCPUArray( - vtkDataObject* mesh) + svtkDataObject* mesh) { - auto vtk_particles = dynamic_cast(mesh); + auto svtk_particles = dynamic_cast(mesh); const auto& particles = this->m_particles->GetParticles(); auto nptsOnProc = this->m_particles->TotalNumberOfParticles(true, true); - // allocate a VTK array for the data - vtkNew cpuArray; + // allocate a SVTK array for the data + svtkNew cpuArray; cpuArray->SetName("cpu"); cpuArray->SetNumberOfComponents(1); cpuArray->SetNumberOfValues(nptsOnProc); @@ -811,8 +811,8 @@ int ParticleDataAdaptor::AddPart } } - // the association for this array is vtkDataObject::POINT - vtk_particles->GetPointData()->AddArray(cpuArray); + // the association for this array is svtkDataObject::POINT + svtk_particles->GetPointData()->AddArray(cpuArray); return 0; } @@ -821,7 +821,7 @@ int ParticleDataAdaptor::AddPart template int ParticleDataAdaptor::AddParticlesSOARealArray( const std::string &arrayName, - vtkDataObject* mesh) + svtkDataObject* mesh) { const long nParticles = this->m_particles->TotalNumberOfParticles(true, true); @@ -847,11 +847,11 @@ int ParticleDataAdaptor::AddPart } } - // allocate the vtkArray + // allocate the svtkArray #ifdef AMREX_SINGLE_PRECISION_PARTICLES - vtkNew data; + svtkNew data; #else - vtkNew data; + svtkNew data; #endif data->SetName(arrayName.c_str()); data->SetNumberOfComponents(nComps); @@ -896,9 +896,9 @@ int ParticleDataAdaptor::AddPart int rank = 0; MPI_Comm_rank(this->GetCommunicator(), &rank); - auto blocks = dynamic_cast(mesh); + auto blocks = dynamic_cast(mesh); - auto block = dynamic_cast(blocks->GetBlock(rank)); + auto block = dynamic_cast(blocks->GetBlock(rank)); block->GetPointData()->AddArray(data); return 0; @@ -908,7 +908,7 @@ int ParticleDataAdaptor::AddPart template int ParticleDataAdaptor::AddParticlesSOAIntArray( const std::string &arrayName, - vtkDataObject* mesh) + svtkDataObject* mesh) { // get the particles from the particle container auto nptsOnProc = this->m_particles->TotalNumberOfParticles(true, true); @@ -931,7 +931,7 @@ int ParticleDataAdaptor::AddPart return -1; } - vtkNew data; + svtkNew data; data->SetName(arrayName.c_str()); data->SetNumberOfComponents(1); 
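
BuildParticles() lays the particles out as an svtkPolyData point cloud: a single 3-component coordinate array (float or double depending on AMREX_SINGLE_PRECISION_PARTICLES), one vertex cell per particle, and per-particle quantities attached afterwards as point-data arrays. A condensed sketch of that layout follows; the template arguments (svtkNew<svtkFloatArray> and friends), the header names, and the makeParticleCloud helper are assumptions inferred from context rather than text taken from the patch, and the zero-filled coordinate loop stands in for the level-by-level, tile-by-tile copy.

```c++
#include <svtkPolyData.h>   // assumed header names, mirroring VTK's
#include <svtkPoints.h>
#include <svtkFloatArray.h>
#include <svtkCellArray.h>
#include <svtkNew.h>

// hypothetical helper showing the BuildParticles() data layout
svtkPolyData* makeParticleCloud(long n)
{
    svtkNew<svtkFloatArray> coords;   // double with double-precision particles
    coords->SetName("coords");
    coords->SetNumberOfComponents(3);
    coords->SetNumberOfTuples(n);
    float* p = coords->GetPointer(0);
    for (long i = 0; i < 3*n; ++i) { p[i] = 0.f; } // stand-in for the real copy

    svtkNew<svtkCellArray> vertex;    // one vertex cell per particle
    vertex->AllocateExact(n, 1);
    for (long i = 0; i < n; ++i) {
        vertex->InsertNextCell(1);
        vertex->InsertCellPoint(i);
    }

    svtkNew<svtkPoints> points;       // wrap the coordinates without copying
    points->SetData(coords);

    svtkPolyData* pd = svtkPolyData::New();
    pd->SetPoints(points);
    pd->SetVerts(vertex);
    return pd;  // handed to the multiblock via SetBlock(rank, pd), then Delete()d
}
```
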
data->SetNumberOfValues(nptsOnProc); @@ -967,9 +967,9 @@ int ParticleDataAdaptor::AddPart int rank = 0; MPI_Comm_rank(this->GetCommunicator(), &rank); - auto blocks = dynamic_cast(mesh); + auto blocks = dynamic_cast(mesh); - auto block = dynamic_cast(blocks->GetBlock(rank)); + auto block = dynamic_cast(blocks->GetBlock(rank)); block->GetPointData()->AddArray(data); return 0; @@ -979,7 +979,7 @@ int ParticleDataAdaptor::AddPart template int ParticleDataAdaptor::AddParticlesAOSRealArray( const std::string &arrayName, - vtkDataObject* mesh) + svtkDataObject* mesh) { // get the particles from the particle container const auto& particles = this->m_particles->GetParticles(); @@ -1007,11 +1007,11 @@ int ParticleDataAdaptor::AddPart } } - // allocate the vtk array + // allocate the svtk array #ifdef AMREX_SINGLE_PRECISION_PARTICLES - vtkNew data; + svtkNew data; #else - vtkNew data; + svtkNew data; #endif data->SetName(arrayName.c_str()); @@ -1053,9 +1053,9 @@ int ParticleDataAdaptor::AddPart int rank = 0; MPI_Comm_rank(this->GetCommunicator(), &rank); - auto blocks = dynamic_cast(mesh); + auto blocks = dynamic_cast(mesh); - auto block = dynamic_cast(blocks->GetBlock(rank)); + auto block = dynamic_cast(blocks->GetBlock(rank)); block->GetPointData()->AddArray(data); return 0; @@ -1065,7 +1065,7 @@ int ParticleDataAdaptor::AddPart template int ParticleDataAdaptor::AddParticlesAOSIntArray( const std::string &arrayName, - vtkDataObject* mesh) + svtkDataObject* mesh) { // get the particles from the particle container const auto& particles = this->m_particles->GetParticles(); @@ -1090,8 +1090,8 @@ int ParticleDataAdaptor::AddPart return -1; } - // allocate vtkArray - vtkNew data; + // allocate svtkArray + svtkNew data; data->SetName(arrayName.c_str()); data->SetNumberOfComponents(1); data->SetNumberOfValues(nptsOnProc); @@ -1121,9 +1121,9 @@ int ParticleDataAdaptor::AddPart int rank = 0; MPI_Comm_rank(this->GetCommunicator(), &rank); - auto blocks = dynamic_cast(mesh); + auto blocks = dynamic_cast(mesh); - auto block = dynamic_cast(blocks->GetBlock(rank)); + auto block = dynamic_cast(blocks->GetBlock(rank)); block->GetPointData()->AddArray(data); diff --git a/Src/LinearSolvers/CMakeLists.txt b/Src/LinearSolvers/CMakeLists.txt index bbefab67999..63de2af0113 100644 --- a/Src/LinearSolvers/CMakeLists.txt +++ b/Src/LinearSolvers/CMakeLists.txt @@ -98,3 +98,15 @@ if (AMReX_HYPRE) MLMG/AMReX_MLNodeLaplacian_hypre.cpp ) endif () + +if (AMReX_SPACEDIM EQUAL 3) + + target_include_directories(amrex PUBLIC $) + + target_sources(amrex + PRIVATE + OpenBC/AMReX_OpenBC.H + OpenBC/AMReX_OpenBC_K.H + OpenBC/AMReX_OpenBC.cpp + ) +endif () diff --git a/Src/LinearSolvers/MLMG/AMReX_MLABecLaplacian.cpp b/Src/LinearSolvers/MLMG/AMReX_MLABecLaplacian.cpp index 89dbb268e10..e5a9b0b31af 100644 --- a/Src/LinearSolvers/MLMG/AMReX_MLABecLaplacian.cpp +++ b/Src/LinearSolvers/MLMG/AMReX_MLABecLaplacian.cpp @@ -323,10 +323,10 @@ MLABecLaplacian::applyMetricTermsCoeffs () for (int alev = 0; alev < m_num_amr_levels; ++alev) { const int mglev = 0; - applyMetricTerm(alev, mglev, m_a_coeffs[alev][mglev]); + applyMetricTermToMF(alev, mglev, m_a_coeffs[alev][mglev]); for (int idim = 0; idim < AMREX_SPACEDIM; ++idim) { - applyMetricTerm(alev, mglev, m_b_coeffs[alev][mglev][idim]); + applyMetricTermToMF(alev, mglev, m_b_coeffs[alev][mglev][idim]); } } #endif diff --git a/Src/LinearSolvers/MLMG/AMReX_MLCGSolver.H b/Src/LinearSolvers/MLMG/AMReX_MLCGSolver.H index 45464bbeb9c..a33d70b4771 100644 --- a/Src/LinearSolvers/MLMG/AMReX_MLCGSolver.H 
+++ b/Src/LinearSolvers/MLMG/AMReX_MLCGSolver.H @@ -40,6 +40,11 @@ public: Real eps_rel, Real eps_abs); + int solve (Any& solnL, + const Any& rhsL, + Real eps_rel, + Real eps_abs); + void setVerbose (int _verbose) { verbose = _verbose; } int getVerbose () const { return verbose; } diff --git a/Src/LinearSolvers/MLMG/AMReX_MLCGSolver.cpp b/Src/LinearSolvers/MLMG/AMReX_MLCGSolver.cpp index c32b0d6199d..76144e6d42f 100644 --- a/Src/LinearSolvers/MLMG/AMReX_MLCGSolver.cpp +++ b/Src/LinearSolvers/MLMG/AMReX_MLCGSolver.cpp @@ -78,6 +78,13 @@ MLCGSolver::solve (MultiFab& sol, } } +int +MLCGSolver::solve (Any& sol, const Any& rhs, Real eps_rel, Real eps_abs) +{ + AMREX_ASSERT(sol.is()); // xxxxx TODO: MLCGSolver Any + return solve(sol.get(), rhs.get(), eps_rel, eps_abs); +} + int MLCGSolver::solve_bicgstab (MultiFab& sol, const MultiFab& rhs, diff --git a/Src/LinearSolvers/MLMG/AMReX_MLCellABecLap.H b/Src/LinearSolvers/MLMG/AMReX_MLCellABecLap.H index 985bc9855b4..0cc6456b7c8 100644 --- a/Src/LinearSolvers/MLMG/AMReX_MLCellABecLap.H +++ b/Src/LinearSolvers/MLMG/AMReX_MLCellABecLap.H @@ -59,9 +59,13 @@ public: virtual MultiFab const* getACoeffs (int amrlev, int mglev) const = 0; virtual Array getBCoeffs (int amrlev, int mglev) const = 0; - virtual void applyInhomogNeumannTerm (int amrlev, MultiFab& rhs) const final override; + virtual void applyInhomogNeumannTerm (int amrlev, Any& rhs) const final override; - virtual void applyOverset (int amlev, MultiFab& rhs) const override; + virtual void addInhomogNeumannFlux ( + int amrlev, const Array& grad, + MultiFab const& sol, bool mult_bcoef) const final override; + + virtual void applyOverset (int amlev, Any& rhs) const override; #if defined(AMREX_USE_HYPRE) && (AMREX_SPACEDIM > 1) virtual std::unique_ptr makeHypre (Hypre::Interface hypre_interface) const override; diff --git a/Src/LinearSolvers/MLMG/AMReX_MLCellABecLap.cpp b/Src/LinearSolvers/MLMG/AMReX_MLCellABecLap.cpp index b5580b3c15c..db57162c21f 100644 --- a/Src/LinearSolvers/MLMG/AMReX_MLCellABecLap.cpp +++ b/Src/LinearSolvers/MLMG/AMReX_MLCellABecLap.cpp @@ -108,7 +108,7 @@ MLCellABecLap::define (const Vector& a_geom, amrlev = 0; for (int mglev = 1; mglev < m_num_mg_levels[amrlev]; ++mglev) { MultiFab foo(m_grids[amrlev][mglev], m_dmap[amrlev][mglev], 1, 0, MFInfo().SetAlloc(false)); - if (! isMFIterSafe(*m_overset_mask[amrlev][mglev], foo)) { + if (! 
amrex::isMFIterSafe(*m_overset_mask[amrlev][mglev], foo)) { auto osm = std::make_unique(m_grids[amrlev][mglev], m_dmap[amrlev][mglev], 1, 1); osm->ParallelCopy(*m_overset_mask[amrlev][mglev]); @@ -189,17 +189,21 @@ MLCellABecLap::getFluxes (const Vector >& a_flux a_flux[alev][idim]->mult(betainv); } } + addInhomogNeumannFlux(alev, a_flux[alev], *a_sol[alev], true); } } void -MLCellABecLap::applyInhomogNeumannTerm (int amrlev, MultiFab& rhs) const +MLCellABecLap::applyInhomogNeumannTerm (int amrlev, Any& a_rhs) const { bool has_inhomog_neumann = hasInhomogNeumannBC(); bool has_robin = hasRobinBC(); if (!has_inhomog_neumann && !has_robin) return; + AMREX_ASSERT(a_rhs.is()); + MultiFab& rhs = a_rhs.get(); + int ncomp = getNComp(); const int mglev = 0; @@ -414,9 +418,121 @@ MLCellABecLap::applyInhomogNeumannTerm (int amrlev, MultiFab& rhs) const } void -MLCellABecLap::applyOverset (int amrlev, MultiFab& rhs) const +MLCellABecLap::addInhomogNeumannFlux ( + int amrlev, const Array& grad, MultiFab const& sol, + bool mult_bcoef) const +{ + /* + * if (mult_bcoef == true) + * grad is -bceof*grad phi + * else + * grad is grad phi + */ + Real fac = mult_bcoef ? Real(-1.0) : Real(1.0); + + bool has_inhomog_neumann = hasInhomogNeumannBC(); + bool has_robin = hasRobinBC(); + + if (!has_inhomog_neumann && !has_robin) return; + + int ncomp = getNComp(); + const int mglev = 0; + + const auto dxinv = m_geom[amrlev][mglev].InvCellSize(); + const Box domain = m_geom[amrlev][mglev].growPeriodicDomain(1); + + Array bcoef = {AMREX_D_DECL(nullptr,nullptr,nullptr)}; + if (mult_bcoef) { + bcoef = getBCoeffs(amrlev,mglev); + } + + const auto& bndry = *m_bndry_sol[amrlev]; + + MFItInfo mfi_info; + if (Gpu::notInLaunchRegion()) mfi_info.SetDynamic(true); + +#ifdef AMREX_USE_OMP +#pragma omp parallel if (Gpu::notInLaunchRegion()) +#endif + for (MFIter mfi(sol, mfi_info); mfi.isValid(); ++mfi) + { + Box const& vbx = mfi.validbox(); + for (OrientationIter orit; orit.isValid(); ++orit) { + const Orientation ori = orit(); + const int idim = ori.coordDir(); + const Box& ccb = amrex::adjCell(vbx, ori); + const Dim3 os = IntVect::TheDimensionVector(idim).dim3(); + const Real dxi = dxinv[idim]; + if (! domain.contains(ccb)) { + for (int icomp = 0; icomp < ncomp; ++icomp) { + auto const& phi = sol.const_array(mfi,icomp); + auto const bv = bndry.bndryValues(ori).multiFab().const_array(mfi,icomp); + auto const bc = bcoef[idim] ? bcoef[idim]->const_array(mfi,icomp) + : Array4{}; + auto const& f = grad[idim]->array(mfi,icomp); + if (ori.isLow()) { + if (m_lobc_orig[icomp][idim] == + LinOpBCType::inhomogNeumann) { + AMREX_HOST_DEVICE_FOR_3D(ccb, i, j, k, + { + int ii = i+os.x; + int jj = j+os.y; + int kk = k+os.z; + Real b = bc ? bc(ii,jj,kk) : Real(1.0); + f(ii,jj,kk) = fac*b*bv(i,j,k); + }); + } else if (m_lobc_orig[icomp][idim] == + LinOpBCType::Robin) { + Array4 const& rbc = (*m_robin_bcval[amrlev])[mfi].const_array(icomp*3); + AMREX_HOST_DEVICE_FOR_3D(ccb, i, j, k, + { + int ii = i+os.x; + int jj = j+os.y; + int kk = k+os.z; + Real tmp = Real(1.0) / + (rbc(i,j,k,1)*dxi + rbc(i,j,k,0)*Real(0.5)); + Real RA = rbc(i,j,k,2) * tmp; + Real RB = (rbc(i,j,k,1)*dxi - rbc(i,j,k,0)*Real(0.5)) * tmp; + Real b = bc ? bc(ii,jj,kk) : Real(1.0); + f(ii,jj,kk) = fac*b*dxi*((Real(1.0)-RB)*phi(ii,jj,kk)-RA); + }); + } + } else { + if (m_hibc_orig[icomp][idim] == + LinOpBCType::inhomogNeumann) { + AMREX_HOST_DEVICE_FOR_3D(ccb, i, j, k, + { + Real b = bc ? 
bc(i,j,k) : Real(1.0); + f(i,j,k) = fac*b*bv(i,j,k); + }); + } else if (m_hibc_orig[icomp][idim] == + LinOpBCType::Robin) { + Array4 const& rbc = (*m_robin_bcval[amrlev])[mfi].const_array(icomp*3); + AMREX_HOST_DEVICE_FOR_3D(ccb, i, j, k, + { + Real tmp = Real(1.0) / + (rbc(i,j,k,1)*dxi + rbc(i,j,k,0)*Real(0.5)); + Real RA = rbc(i,j,k,2) * tmp; + Real RB = (rbc(i,j,k,1)*dxi - rbc(i,j,k,0)*Real(0.5)) * tmp; + Real b = bc ? bc(i,j,k) : Real(1.0); + f(i,j,k) = fac*b*dxi*(RA+(RB-Real(1.0))* + phi(i-os.x,j-os.y,k-os.z)); + }); + } + } + } + } + } + } +} + + +void +MLCellABecLap::applyOverset (int amrlev, Any& a_rhs) const { if (m_overset_mask[amrlev][0]) { + AMREX_ASSERT(a_rhs.is()); + auto& rhs = a_rhs.get(); const int ncomp = getNComp(); #ifdef AMREX_USE_OMP #pragma omp parallel if (Gpu::notInLaunchRegion()) diff --git a/Src/LinearSolvers/MLMG/AMReX_MLCellLinOp.H b/Src/LinearSolvers/MLMG/AMReX_MLCellLinOp.H index f1168e5c41e..9a6bb222113 100644 --- a/Src/LinearSolvers/MLMG/AMReX_MLCellLinOp.H +++ b/Src/LinearSolvers/MLMG/AMReX_MLCellLinOp.H @@ -3,6 +3,7 @@ #include #include +#include namespace amrex { @@ -109,6 +110,8 @@ public: virtual void interpolation (int amrlev, int fmglev, MultiFab& fine, const MultiFab& crse) const override; + virtual void interpAssign (int amrlev, int fmglev, MultiFab& fine, MultiFab& crse) const override; + virtual void averageDownSolutionRHS (int camrlev, MultiFab& crse_sol, MultiFab& crse_rhs, const MultiFab& fine_sol, const MultiFab& fine_rhs) override; @@ -132,9 +135,12 @@ public: virtual void compGrad (int amrlev, const Array& grad, MultiFab& sol, Location loc) const override; - virtual void applyMetricTerm (int amrlev, int mglev, MultiFab& rhs) const final override; + virtual void applyMetricTerm (int amrlev, int mglev, Any& rhs) const final override; virtual void unapplyMetricTerm (int amrlev, int mglev, MultiFab& rhs) const final override; - virtual void fillSolutionBC (int amrlev, MultiFab& sol, const MultiFab* crse_bcdata=nullptr) final override; + virtual Vector getSolvabilityOffset (int amrlev, int mglev, + Any const& rhs) const override; + virtual void fixSolvabilityByOffset (int amrlev, int mglev, Any& rhs, + Vector const& offset) const override; virtual void prepareForSolve () override; @@ -146,6 +152,23 @@ public: const Array& flux, const FArrayBox& sol, Location loc, const int face_only=0) const = 0; + // This could be turned into template if needed. 
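
The Robin branches above are worth unpacking. The boundary condition a·φ + b·∂φ/∂n = c is stored per component as the triple rbc = (a, b, c). Writing g for the ghost value and p for the first interior value, and taking n as the outward normal, the face-centered discretization a·(g + p)/2 + b·(g − p)·dxi = c solves to g = RA + RB·p with RA = c/(b·dxi + a/2) and RB = (b·dxi − a/2)/(b·dxi + a/2), which is exactly the tmp/RA/RB computation in both kernels. Expanding g shows that the low-side expression fac·b·dxi·((1 − RB)·p − RA) equals fac·b_coef·dxi·(p − g) and the high-side expression fac·b·dxi·(RA + (RB − 1)·p) equals fac·b_coef·dxi·(g − p): in both branches the result is fac·b_coef times the one-sided difference across the face taken in the positive coordinate direction. A scalar check of that identity (robinGhost and the sample numbers are illustrative only):

```c++
#include <cstdio>
#include <cmath>

// ghost value implied by the Robin condition a*phi + b*dphi/dn = c,
// discretized with ghost cell g and first interior cell p, spacing 1/dxi
double robinGhost(double a, double b, double c, double dxi, double p)
{
    double tmp = 1.0 / (b*dxi + 0.5*a);
    double RA = c * tmp;
    double RB = (b*dxi - 0.5*a) * tmp;
    return RA + RB*p;
}

int main()
{
    double a = 2.0, b = 0.5, c = 1.0, dxi = 16.0, p = 3.0;
    double g = robinGhost(a, b, c, dxi, p);

    // the low-side kernel expression ...
    double tmp = 1.0 / (b*dxi + 0.5*a);
    double kernel = dxi*((1.0 - (b*dxi - 0.5*a)*tmp)*p - c*tmp);

    // ... equals dxi*(p - g), the one-sided difference across the face
    std::printf("kernel = %.12g, dxi*(p - g) = %.12g\n", kernel, dxi*(p - g));
    return std::fabs(kernel - dxi*(p - g)) < 1e-12 ? 0 : 1;
}
```

The same identity explains the two call sites: getFluxes() passes mult_bcoef = true, so with fac = −1 the correction matches the −b∇φ convention of the fluxes, while compGrad() passes mult_bcoef = false, so with fac = 1 and b_coef = 1 the correction is the plain gradient at inhomogeneous Neumann and Robin faces.
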
+ void applyMetricTermToMF (int amrlev, int mglev, MultiFab& rhs) const; + + virtual Real AnyNormInfMask (int amrlev, Any const& a, bool local) const override; + + virtual void AnyAvgDownResAmr (int clev, Any& cres, Any const& fres) const override; + + virtual void AnyInterpolationAmr (int famrlev, Any& fine, const Any& crse, + IntVect const& /*nghost*/) const override; + + virtual void AnyAverageDownAndSync (Vector& sol) const override; + + virtual void addInhomogNeumannFlux (int /*amrlev*/, + const Array& /*grad*/, + MultiFab const& /*sol*/, + bool /*mult_bcoef*/) const {} + struct BCTL { BoundCond type; Real location; @@ -210,12 +233,17 @@ protected: // boundary cell flags for covered, not_covered, outside_domain Vector > > m_maskvals; + Vector > m_norm_fine_mask; + mutable Vector m_fluxreg; private: void defineAuxData (); void defineBC (); + + void computeVolInv () const; + mutable Vector > m_volinv; // used by solvability fix }; } diff --git a/Src/LinearSolvers/MLMG/AMReX_MLCellLinOp.cpp b/Src/LinearSolvers/MLMG/AMReX_MLCellLinOp.cpp index 8f6921950e7..5c8edcbb1a6 100644 --- a/Src/LinearSolvers/MLMG/AMReX_MLCellLinOp.cpp +++ b/Src/LinearSolvers/MLMG/AMReX_MLCellLinOp.cpp @@ -1,6 +1,7 @@ #include #include +#include #include #ifndef BL_NO_FORT @@ -9,6 +10,11 @@ namespace amrex { +#ifdef AMREX_SOFT_PERF_COUNTERS +// perf_counters +MLCellLinOp::Counters MLCellLinOp::perf_counters; +#endif + namespace { // Have to put it here due to CUDA extended lambda limitation struct ABCTag { @@ -97,6 +103,7 @@ MLCellLinOp::defineAuxData () m_undrrelxr.resize(m_num_amr_levels); m_maskvals.resize(m_num_amr_levels); m_fluxreg.resize(m_num_amr_levels-1); + m_norm_fine_mask.resize(m_num_amr_levels-1); const int ncomp = getNComp(); @@ -136,6 +143,9 @@ MLCellLinOp::defineAuxData () m_dmap[amrlev+1][0], m_dmap[amrlev][0], m_geom[amrlev+1][0], m_geom[amrlev][0], ratio, amrlev+1, ncomp); + m_norm_fine_mask[amrlev] = std::make_unique + (makeFineMask(m_grids[amrlev][0], m_dmap[amrlev][0], m_grids[amrlev+1][0], + ratio, 1, 0)); } #if (AMREX_SPACEDIM != 3) @@ -530,18 +540,6 @@ MLCellLinOp::solutionResidual (int amrlev, MultiFab& resid, MultiFab& x, const M MultiFab::Xpay(resid, Real(-1.0), b, 0, 0, ncomp, 0); } -void -MLCellLinOp::fillSolutionBC (int amrlev, MultiFab& sol, const MultiFab* crse_bcdata) -{ - BL_PROFILE("MLCellLinOp::fillSolutionBC()"); - if (crse_bcdata != nullptr) { - updateSolBC(amrlev, *crse_bcdata); - } - const int mglev = 0; - applyBC(amrlev, mglev, sol, BCMode::Inhomogeneous, StateMode::Solution, - m_bndry_sol[amrlev].get()); -} - void MLCellLinOp::correctionResidual (int amrlev, int mglev, MultiFab& resid, MultiFab& x, const MultiFab& b, BCMode bc_mode, const MultiFab* crse_bcdata) @@ -940,6 +938,8 @@ MLCellLinOp::compGrad (int amrlev, const Array& grad, }); #endif } + + addInhomogNeumannFlux(amrlev, grad, sol, false); } void @@ -1316,7 +1316,20 @@ MLCellLinOp::BndryCondLoc::setLOBndryConds (const Geometry& geom, const Real* dx } void -MLCellLinOp::applyMetricTerm (int amrlev, int mglev, MultiFab& rhs) const +MLCellLinOp::applyMetricTerm (int amrlev, int mglev, Any& rhs) const +{ + amrex::ignore_unused(amrlev,mglev,rhs); +#if (AMREX_SPACEDIM != 3) + + if (!m_has_metric_term) return; + + AMREX_ASSERT(rhs.is()); + applyMetricTermToMF(amrlev, mglev, rhs.get()); +#endif +} + +void +MLCellLinOp::applyMetricTermToMF (int amrlev, int mglev, MultiFab& rhs) const { amrex::ignore_unused(amrlev,mglev,rhs); #if (AMREX_SPACEDIM != 3) @@ -1435,9 +1448,417 @@ MLCellLinOp::update () if 
(MLLinOp::needsUpdate()) MLLinOp::update(); } -#ifdef AMREX_SOFT_PERF_COUNTERS -// perf_counters -MLCellLinOp::Counters MLCellLinOp::perf_counters; +void +MLCellLinOp::computeVolInv () const +{ + if (!m_volinv.empty()) return; + + m_volinv.resize(m_num_amr_levels); + for (int amrlev = 0; amrlev < m_num_amr_levels; ++amrlev) { + m_volinv[amrlev].resize(NMGLevels(amrlev)); + } + + // We don't need to compute for every level + + auto f = [&] (int amrlev, int mglev) { +#ifdef AMREX_USE_EB + auto factory = dynamic_cast(Factory(amrlev,mglev)); + if (factory) + { + const MultiFab& vfrac = factory->getVolFrac(); + m_volinv[amrlev][mglev] = vfrac.sum(0,true); + } + else +#endif + { + m_volinv[amrlev][mglev] + = Real(1.0 / compactify(Geom(amrlev,mglev).Domain()).d_numPts()); + } + }; + + // amrlev = 0, mglev = 0 + f(0,0); + + int mgbottom = NMGLevels(0)-1; + f(0,mgbottom); + +#ifdef AMREX_USE_EB + Real temp1, temp2; + auto factory = dynamic_cast(Factory(0,0)); + if (factory) + { + ParallelAllReduce::Sum({m_volinv[0][0], m_volinv[0][mgbottom]}, + ParallelContext::CommunicatorSub()); + temp1 = Real(1.0)/m_volinv[0][0]; + temp2 = Real(1.0)/m_volinv[0][mgbottom]; + } + else + { + temp1 = m_volinv[0][0]; + temp2 = m_volinv[0][mgbottom]; + } + m_volinv[0][0] = temp1; + m_volinv[0][mgbottom] = temp2; +#endif +} + +Vector +MLCellLinOp::getSolvabilityOffset (int amrlev, int mglev, Any const& a_rhs) const +{ + AMREX_ASSERT(a_rhs.is()); + auto const& rhs = a_rhs.get(); + + computeVolInv(); + + const int ncomp = getNComp(); + Vector offset(ncomp); + +#ifdef AMREX_USE_EB + auto factory = dynamic_cast(Factory(amrlev,mglev)); + if (factory) + { + const MultiFab& vfrac = factory->getVolFrac(); + for (int c = 0; c < ncomp; ++c) { + offset[c] = MultiFab::Dot(rhs, c, vfrac, 0, 1, 0, true) * m_volinv[amrlev][mglev]; + } + } + else +#endif + { + for (int c = 0; c < ncomp; ++c) { + offset[c] = rhs.sum(c,true) * m_volinv[amrlev][mglev]; + } + } + + ParallelAllReduce::Sum(offset.data(), ncomp, ParallelContext::CommunicatorSub()); + + return offset; +} + +Real +MLCellLinOp::AnyNormInfMask (int amrlev, Any const& a, bool local) const +{ + AMREX_ASSERT(a.is()); + auto& mf = a.get(); + + const int finest_level = NAMRLevels() - 1; + Real norm = 0._rt; +#ifdef AMREX_USE_EB + const int ncomp = getNComp(); + if (! 
mf.isAllRegular()) { + auto factory = dynamic_cast(Factory(amrlev)); + const MultiFab& vfrac = factory->getVolFrac(); + if (amrlev == finest_level) { +#ifdef AMREX_USE_GPU + if (Gpu::inLaunchRegion()) { + auto const& ma = mf.const_arrays(); + auto const& vfrac_ma = vfrac.const_arrays(); + norm = ParReduce(TypeList{}, TypeList{}, + mf, IntVect(0), ncomp, + [=] AMREX_GPU_DEVICE (int box_no, int i, int j, int k, int n) + -> GpuTuple + { + return amrex::Math::abs(ma[box_no](i,j,k,n) + *vfrac_ma[box_no](i,j,k)); + }); + } else +#endif + { +#ifdef AMREX_USE_OMP +#pragma omp parallel reduction(max:norm) +#endif + for (MFIter mfi(mf,true); mfi.isValid(); ++mfi) { + Box const& bx = mfi.tilebox(); + auto const& fab = mf.const_array(mfi); + auto const& v = vfrac.const_array(mfi); + AMREX_LOOP_4D(bx, ncomp, i, j, k, n, + { + norm = std::max(norm, amrex::Math::abs(fab(i,j,k,n)*v(i,j,k))); + }); + } + } + } else { +#ifdef AMREX_USE_GPU + if (Gpu::inLaunchRegion()) { + auto const& ma = mf.const_arrays(); + auto const& mask_ma = m_norm_fine_mask[amrlev]->const_arrays(); + auto const& vfrac_ma = vfrac.const_arrays(); + norm = ParReduce(TypeList{}, TypeList{}, + mf, IntVect(0), ncomp, + [=] AMREX_GPU_DEVICE (int box_no, int i, int j, int k, int n) + -> GpuTuple + { + if (mask_ma[box_no](i,j,k)) { + return amrex::Math::abs(ma[box_no](i,j,k,n) + *vfrac_ma[box_no](i,j,k)); + } else { + return Real(0.0); + } + }); + } else +#endif + { +#ifdef AMREX_USE_OMP +#pragma omp parallel reduction(max:norm) +#endif + for (MFIter mfi(mf,true); mfi.isValid(); ++mfi) { + Box const& bx = mfi.tilebox(); + auto const& fab = mf.const_array(mfi); + auto const& mask = m_norm_fine_mask[amrlev]->const_array(mfi); + auto const& v = vfrac.const_array(mfi); + AMREX_LOOP_4D(bx, ncomp, i, j, k, n, + { + if (mask(i,j,k)) { + norm = std::max(norm, amrex::Math::abs(fab(i,j,k,n)*v(i,j,k))); + } + }); + } + } + } + } else +#endif + { + iMultiFab const* fine_mask = (amrlev == finest_level) + ? nullptr : m_norm_fine_mask[amrlev].get(); + norm = MFNormInf(mf, fine_mask, true); + } + + if (!local) ParallelAllReduce::Max(norm, ParallelContext::CommunicatorSub()); + return norm; +} + +void +MLCellLinOp::AnyAvgDownResAmr (int clev, Any& cres, Any const& fres) const +{ + AMREX_ASSERT(cres.is() && fres.is()); +#ifdef AMREX_USE_EB + amrex::EB_average_down +#else + amrex::average_down +#endif + (fres.get(), cres.get(), 0, getNComp(), AMRRefRatio(clev)); +} + +void +MLCellLinOp::AnyInterpolationAmr (int famrlev, Any& a_fine, const Any& a_crse, + IntVect const& /*nghost*/) const +{ + AMREX_ASSERT(a_fine.is()); + MultiFab& fine = a_fine.get(); + MultiFab const& crse = a_crse.get(); + + const int ncomp = getNComp(); + const int refratio = AMRRefRatio(famrlev-1); + +#ifdef AMREX_USE_EB + auto factory = dynamic_cast(Factory(famrlev)); + const FabArray* flags = (factory) ? 
&(factory->getMultiEBCellFlagFab()) : nullptr; +#endif + + MFItInfo mfi_info; + if (Gpu::notInLaunchRegion()) mfi_info.EnableTiling().SetDynamic(true); +#ifdef AMREX_USE_OMP +#pragma omp parallel if (Gpu::notInLaunchRegion()) +#endif + for (MFIter mfi(fine, mfi_info); mfi.isValid(); ++mfi) + { + const Box& bx = mfi.tilebox(); + Array4 const& ff = fine.array(mfi); + Array4 const& cc = crse.const_array(mfi); +#ifdef AMREX_USE_EB + bool call_lincc; + if (factory) + { + const auto& flag = (*flags)[mfi]; + if (flag.getType(amrex::grow(bx,1)) == FabType::regular) { + call_lincc = true; + } else { + Array4 const& flg = flag.const_array(); + switch(refratio) { + case 2: + { + AMREX_LAUNCH_HOST_DEVICE_LAMBDA (bx, tbx, + { + mlmg_eb_cc_interp_r<2>(tbx, ff, cc, flg, ncomp); + }); + break; + } + case 4: + { + AMREX_LAUNCH_HOST_DEVICE_LAMBDA (bx, tbx, + { + mlmg_eb_cc_interp_r<4>(tbx, ff, cc, flg, ncomp); + }); + break; + } + default: + amrex::Abort("mlmg_eb_cc_interp: only refratio 2 and 4 are supported"); + } + + call_lincc = false; + } + } + else + { + call_lincc = true; + } +#else + const bool call_lincc = true; +#endif + if (call_lincc) + { + switch(refratio) { + case 2: + { + AMREX_LAUNCH_HOST_DEVICE_LAMBDA (bx, tbx, + { + mlmg_lin_cc_interp_r2(tbx, ff, cc, ncomp); + }); + break; + } + case 4: + { + AMREX_LAUNCH_HOST_DEVICE_LAMBDA (bx, tbx, + { + mlmg_lin_cc_interp_r4(tbx, ff, cc, ncomp); + }); + break; + } + default: + amrex::Abort("mlmg_lin_cc_interp: only refratio 2 and 4 are supported"); + } + } + } +} + +void +MLCellLinOp::interpAssign (int amrlev, int fmglev, MultiFab& fine, MultiFab& crse) const +{ + const int ncomp = getNComp(); + + const Geometry& crse_geom = Geom(amrlev,fmglev+1); + const IntVect refratio = (amrlev > 0) ? IntVect(2) : mg_coarsen_ratio_vec[fmglev]; + const IntVect ng = crse.nGrowVect(); + + MultiFab cfine; + const MultiFab* cmf; + + if (amrex::isMFIterSafe(crse, fine)) + { + crse.FillBoundary(crse_geom.periodicity()); + cmf = &crse; + } + else + { + BoxArray cba = fine.boxArray(); + cba.coarsen(refratio); + cfine.define(cba, fine.DistributionMap(), ncomp, ng); + cfine.setVal(0.0); + cfine.ParallelCopy(crse, 0, 0, ncomp, IntVect(0), ng, crse_geom.periodicity()); + cmf = & cfine; + } + + bool isEB = fine.hasEBFabFactory(); + ignore_unused(isEB); + +#ifdef AMREX_USE_EB + auto factory = dynamic_cast(&(fine.Factory())); + const FabArray* flags = (factory) ? 
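// The interpolation above instantiates the kernel once per supported
// refinement ratio at compile time and selects at run time, aborting for
// anything other than 2 or 4. Bare-bones shape of that dispatch (kernel body
// elided):

template <int R>
void interp_kernel ()
{
    // R is a compile-time constant, so ratio-dependent indexing unrolls.
}

void interp_dispatch (int refratio)
{
    switch (refratio) {
    case 2:  interp_kernel<2>(); break;
    case 4:  interp_kernel<4>(); break;
    default: break; // the real code calls amrex::Abort here
    }
}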
&(factory->getMultiEBCellFlagFab()) : nullptr; +#endif + + MFItInfo mfi_info; + if (Gpu::notInLaunchRegion()) mfi_info.EnableTiling().SetDynamic(true); +#ifdef AMREX_USE_OMP +#pragma omp parallel if (Gpu::notInLaunchRegion()) +#endif + for (MFIter mfi(fine, mfi_info); mfi.isValid(); ++mfi) + { + const Box& bx = mfi.tilebox(); + const auto& ff = fine.array(mfi); + const auto& cc = cmf->array(mfi); +#ifdef AMREX_USE_EB + bool call_lincc; + if (isEB) + { + const auto& flag = (*flags)[mfi]; + if (flag.getType(amrex::grow(bx,1)) == FabType::regular) { + call_lincc = true; + } else { + Array4 const& flg = flag.const_array(); + AMREX_LAUNCH_HOST_DEVICE_LAMBDA (bx, tbx, + { + mlmg_eb_cc_interp_r<2>(tbx, ff, cc, flg, ncomp); + }); + + call_lincc = false; + } + } + else + { + call_lincc = true; + } +#else + const bool call_lincc = true; +#endif + if (call_lincc) + { +#if (AMREX_SPACEDIM == 3) + if (hasHiddenDimension()) { + Box const& bx_2d = compactify(bx); + auto const& ff_2d = compactify(ff); + auto const& cc_2d = compactify(cc); + AMREX_LAUNCH_HOST_DEVICE_LAMBDA (bx_2d, tbx, + { + TwoD::mlmg_lin_cc_interp_r2(tbx, ff_2d, cc_2d, ncomp); + }); + } else #endif + { + AMREX_LAUNCH_HOST_DEVICE_LAMBDA (bx, tbx, + { + mlmg_lin_cc_interp_r2(tbx, ff, cc, ncomp); + }); + } + } + } +} + +void +MLCellLinOp::AnyAverageDownAndSync (Vector& sol) const +{ + AMREX_ASSERT(sol[0].is()); + + int ncomp = getNComp(); + for (int falev = NAMRLevels()-1; falev > 0; --falev) + { +#ifdef AMREX_USE_EB + amrex::EB_average_down(sol[falev ].get(), + sol[falev-1].get(), 0, ncomp, AMRRefRatio(falev-1)); +#else + amrex::average_down(sol[falev ].get(), + sol[falev-1].get(), 0, ncomp, AMRRefRatio(falev-1)); +#endif + } +} + +void +MLCellLinOp::fixSolvabilityByOffset (int amrlev, int mglev, Any& a_rhs, + Vector const& offset) const +{ + amrex::ignore_unused(amrlev, mglev); + AMREX_ASSERT(a_rhs.is()); + auto& rhs = a_rhs.get(); + + const int ncomp = getNComp(); + for (int c = 0; c < ncomp; ++c) { + rhs.plus(-offset[c], c, 1); + } +#ifdef AMREX_USE_EB + if (rhs.hasEBFabFactory()) { + Vector val(ncomp, 0.0_rt); + amrex::EB_set_covered(rhs, 0, ncomp, val); + } +#endif +} } diff --git a/Src/LinearSolvers/MLMG/AMReX_MLEBABecLap.cpp b/Src/LinearSolvers/MLMG/AMReX_MLEBABecLap.cpp index a006976dc08..c8bea8dd2d2 100644 --- a/Src/LinearSolvers/MLMG/AMReX_MLEBABecLap.cpp +++ b/Src/LinearSolvers/MLMG/AMReX_MLEBABecLap.cpp @@ -34,7 +34,8 @@ MLEBABecLap::MLEBABecLap (const Vector& a_geom, std::unique_ptr > MLEBABecLap::makeFactory (int amrlev, int mglev) const { - return makeEBFabFactory(m_geom[amrlev][mglev], + return makeEBFabFactory(static_cast(Factory(0,0))->getEBIndexSpace(), + m_geom[amrlev][mglev], m_grids[amrlev][mglev], m_dmap[amrlev][mglev], {1,1,1}, EBSupport::full); diff --git a/Src/LinearSolvers/MLMG/AMReX_MLEBNodeFDLap_2D_K.H b/Src/LinearSolvers/MLMG/AMReX_MLEBNodeFDLap_2D_K.H index 1b490726405..08439f9f99b 100644 --- a/Src/LinearSolvers/MLMG/AMReX_MLEBNodeFDLap_2D_K.H +++ b/Src/LinearSolvers/MLMG/AMReX_MLEBNodeFDLap_2D_K.H @@ -200,7 +200,7 @@ AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE void mlebndfdlap_adotx_rz_eb_doit (int i, int j, int k, Array4 const& y, Array4 const& x, Array4 const& dmsk, Array4 const& ecx, Array4 const& ecy, - F && xeb, Real dr, Real dz, Real rlo) noexcept + F && xeb, Real sigr, Real dr, Real dz, Real rlo) noexcept { if (dmsk(i,j,k)) { y(i,j,k) = Real(0.0); @@ -211,11 +211,11 @@ void mlebndfdlap_adotx_rz_eb_doit (int i, int j, int k, Array4 const& y, Real const r = rlo + Real(i) * dr; if (r == Real(0.0)) { if 
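// Where the r == 0 branch below comes from: the radial part of the operator
// is (1/r) d/dr (sigr * r * dphi/dr). On the axis, symmetry forces
// dphi/dr = 0, and the term limits (L'Hopital) to 2*sigr*d2phi/dr2. With the
// symmetric ghost value phi(-1) = phi(1), the centered second difference is
// 2*(phi(1) - phi(0))/dr^2, giving
//     4 * sigr * (x(i+1,j,k) - x(i,j,k)) / (dr*dr)
// exactly as coded. Sanity check with phi = r^2 (so d2phi/dr2 = 2): the
// discrete formula yields 4*sigr*(dr^2 - 0)/dr^2 = 4*sigr = 2*sigr*2.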
(ecx(i,j,k) == Real(1.0)) { // regular - out = Real(4.0) * (x(i+1,j,k)-x(i,j,k)) / (dr*dr); + out = Real(4.0) * sigr * (x(i+1,j,k)-x(i,j,k)) / (dr*dr); scale = Real(1.0); } else { hp = Real(1.0) + Real(2.) * ecx(i,j,k); - out = Real(4.0) * (xeb(i+1,j,k)-x(i,j,k)) / (dr*dr*hp*hp); + out = Real(4.0) * sigr * (xeb(i+1,j,k)-x(i,j,k)) / (dr*dr*hp*hp); scale = hp; } } else { @@ -235,7 +235,7 @@ void mlebndfdlap_adotx_rz_eb_doit (int i, int j, int k, Array4 const& y, tmp += (xeb(i-1,j,k) - x(i,j,k)) / hm * (r - Real(0.5) * hp * dr); } - out = tmp * Real(2.0) / ((hp+hm) * r * dr * dr); + out = tmp * Real(2.0) * sigr / ((hp+hm) * r * dr * dr); scale = amrex::min(hm, hp); } @@ -266,29 +266,29 @@ AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE void mlebndfdlap_adotx_rz_eb (int i, int j, int k, Array4 const& y, Array4 const& x, Array4 const& dmsk, Array4 const& ecx, Array4 const& ecy, - Real xeb, Real dr, Real dz, Real rlo) noexcept + Real xeb, Real sigr, Real dr, Real dz, Real rlo) noexcept { mlebndfdlap_adotx_rz_eb_doit(i, j, k, y, x, dmsk, ecx, ecy, [=] (int, int, int) -> Real { return xeb; }, - dr, dz, rlo); + sigr, dr, dz, rlo); } AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE void mlebndfdlap_adotx_rz_eb (int i, int j, int k, Array4 const& y, Array4 const& x, Array4 const& dmsk, Array4 const& ecx, Array4 const& ecy, - Array4 const& xeb, Real dr, Real dz, Real rlo) noexcept + Array4 const& xeb, Real sigr, Real dr, Real dz, Real rlo) noexcept { mlebndfdlap_adotx_rz_eb_doit(i, j, k, y, x, dmsk, ecx, ecy, [=] (int i1, int i2, int i3) -> Real { return xeb(i1,i2,i3); }, - dr, dz, rlo); + sigr, dr, dz, rlo); } AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE void mlebndfdlap_adotx_rz (int i, int j, int k, Array4 const& y, Array4 const& x, Array4 const& dmsk, - Real dr, Real dz, Real rlo) noexcept + Real sigr, Real dr, Real dz, Real rlo) noexcept { if (dmsk(i,j,k)) { y(i,j,k) = Real(0.0); @@ -296,11 +296,11 @@ void mlebndfdlap_adotx_rz (int i, int j, int k, Array4 const& y, Real Ax = (x(i,j-1,k) - Real(2.0)*x(i,j,k) + x(i,j+1,k)) / (dz*dz); Real const r = rlo + Real(i)*dr; if (r == Real(0.0)) { - Ax += Real(4.0) * (x(i+1,j,k)-x(i,j,k)) / (dr*dr); + Ax += Real(4.0) * sigr * (x(i+1,j,k)-x(i,j,k)) / (dr*dr); } else { Real const rp = r + Real(0.5)*dr; Real const rm = r - Real(0.5)*dr; - Ax += (rp*x(i+1,j,k) - (rp+rm)*x(i,j,k) + rm*x(i-1,j,k)) / (r*dr*dr); + Ax += sigr * (rp*x(i+1,j,k) - (rp+rm)*x(i,j,k) + rm*x(i-1,j,k)) / (r*dr*dr); } y(i,j,k) = Ax; } @@ -310,7 +310,7 @@ AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE void mlebndfdlap_gsrb_rz_eb (int i, int j, int k, Array4 const& x, Array4 const& rhs, Array4 const& dmsk, Array4 const& ecx, Array4 const& ecy, - Real dr, Real dz, Real rlo, int redblack) noexcept + Real sigr, Real dr, Real dz, Real rlo, int redblack) noexcept { if ((i+j+k+redblack)%2 == 0) { if (dmsk(i,j,k)) { @@ -322,12 +322,12 @@ void mlebndfdlap_gsrb_rz_eb (int i, int j, int k, Array4 const& x, Real const r = rlo + Real(i) * dr; if (r == Real(0.0)) { if (ecx(i,j,k) == Real(1.0)) { // regular - Ax = (Real(4.0) / (dr*dr)) * (x(i+1,j,k)-x(i,j,k)); - gamma = -(Real(4.0) / (dr*dr)); + Ax = (Real(4.0) * sigr / (dr*dr)) * (x(i+1,j,k)-x(i,j,k)); + gamma = -(Real(4.0) * sigr / (dr*dr)); scale = Real(1.0); } else { hp = Real(1.0) + Real(2.) 
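// The smoother being modified here is weighted red-black Gauss-Seidel: gamma
// is the operator's diagonal entry at (i,j,k), Ax is the stencil applied to
// the current iterate, and cells of one parity per sweep take the
// over-relaxed update x += omega*(rhs - Ax)/gamma with omega = 1.25.
// 1D sketch of the same pattern for d2/dx2 (illustrative, not AMReX code):

void gsrb_1d (double* x, double const* rhs, int n, int redblack, double dxi2)
{
    for (int i = 1; i < n-1; ++i) {
        if ((i + redblack) % 2 == 0) {
            double gamma = -2.0 * dxi2;                      // diagonal entry
            double Ax = (x[i-1] - 2.0*x[i] + x[i+1]) * dxi2; // full stencil
            constexpr double omega = 1.25;                   // over-relaxation
            x[i] += (rhs[i] - Ax) * (omega / gamma);
        }
    }
}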
* ecx(i,j,k); - gamma = -(Real(4.0) / (dr*dr*hp*hp)); + gamma = -(Real(4.0) * sigr / (dr*dr*hp*hp)); Ax = gamma * x(i,j,k); scale = hp; } @@ -352,8 +352,8 @@ void mlebndfdlap_gsrb_rz_eb (int i, int j, int k, Array4 const& x, tmp0 += Real(-1.0) / hm * (r - Real(0.5) * hp * dr); } - Ax = tmp * Real(2.0) / ((hp+hm) * r * dr * dr); - gamma = tmp0 * Real(2.0) / ((hp+hm) * r * dr * dr); + Ax = tmp * Real(2.0) * sigr / ((hp+hm) * r * dr * dr); + gamma = tmp0 * Real(2.0) * sigr / ((hp+hm) * r * dr * dr); scale = amrex::min(hm, hp); } @@ -390,7 +390,7 @@ void mlebndfdlap_gsrb_rz_eb (int i, int j, int k, Array4 const& x, AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE void mlebndfdlap_gsrb_rz (int i, int j, int k, Array4 const& x, Array4 const& rhs, Array4 const& dmsk, - Real dr, Real dz, Real rlo, int redblack) noexcept + Real sigr, Real dr, Real dz, Real rlo, int redblack) noexcept { if ((i+j+k+redblack)%2 == 0) { if (dmsk(i,j,k)) { @@ -400,13 +400,13 @@ void mlebndfdlap_gsrb_rz (int i, int j, int k, Array4 const& x, Real gamma = -Real(2.0) / (dz*dz); Real const r = rlo + Real(i)*dr; if (r == Real(0.0)) { - Ax += (Real(4.0)/(dr*dr)) * (x(i+1,j,k)-x(i,j,k)); - gamma += -(Real(4.0)/(dr*dr)); + Ax += (Real(4.0)*sigr/(dr*dr)) * (x(i+1,j,k)-x(i,j,k)); + gamma += -(Real(4.0)*sigr/(dr*dr)); } else { Real const rp = r + Real(0.5)*dr; Real const rm = r - Real(0.5)*dr; - Ax += (rp*x(i+1,j,k) - (rp+rm)*x(i,j,k) + rm*x(i-1,j,k)) / (r*dr*dr); - gamma += -(rp+rm) / (r*dr*dr); + Ax += sigr*(rp*x(i+1,j,k) - (rp+rm)*x(i,j,k) + rm*x(i-1,j,k)) / (r*dr*dr); + gamma += -sigr*(rp+rm) / (r*dr*dr); } constexpr Real omega = Real(1.25); x(i,j,k) += (rhs(i,j,k) - Ax) * (omega / gamma); diff --git a/Src/LinearSolvers/MLMG/AMReX_MLEBNodeFDLaplacian.H b/Src/LinearSolvers/MLMG/AMReX_MLEBNodeFDLaplacian.H index 1215eda1f6c..404aefc8c0b 100644 --- a/Src/LinearSolvers/MLMG/AMReX_MLEBNodeFDLaplacian.H +++ b/Src/LinearSolvers/MLMG/AMReX_MLEBNodeFDLaplacian.H @@ -19,8 +19,8 @@ namespace amrex { // with only diagonal components. The EB is assumed to be Dirichlet. // // del dot (sigma grad phi) - alpha/r^2 phi = rhs, for RZ where alpha is a -// scalar constant that is zero by default. sigma is non-zero in -// z-direction only. For now the `alpha` term has not been implemented yet. +// scalar constant that is zero by default. 
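// The operator documented in this header, written out: in RZ coordinates
//     del . (sigma grad phi) - (alpha / r^2) phi = rhs
// expands to
//     (1/r) d/dr ( sigma_r r dphi/dr ) + d/dz ( sigma_z dphi/dz )
//         - (alpha / r^2) phi = rhs,
// which is the form the mlebndfdlap_*_rz kernels above discretize; the sigr
// factor threaded through this patch is the radial coefficient sigma_r.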
For now the `alpha` term has +// not been implemented yet class MLEBNodeFDLaplacian : public MLNodeLinOp @@ -72,7 +72,7 @@ public: virtual std::unique_ptr > makeFactory (int amrlev, int mglev) const final override; - virtual void scaleRHS (int amrlev, MultiFab& rhs) const final; + virtual void scaleRHS (int amrlev, Any& rhs) const final; #endif @@ -100,6 +100,7 @@ public: virtual void fixUpResidualMask (int amrlev, iMultiFab& resmsk) final override; virtual bool isSingular (int) const final override { return false; } + virtual bool isBottomSingular () const final override { return false; } virtual void compGrad (int amrlev, const Array& grad, MultiFab& sol, Location /*loc*/) const override; @@ -118,8 +119,10 @@ public: Array4 const& bfab) const override; #endif + virtual void postSolve (Vector& sol) const override; + private: - GpuArray m_sigma{AMREX_D_DECL(1_rt,1_rt,1_rt)}; + GpuArray m_sigma{{AMREX_D_DECL(1_rt,1_rt,1_rt)}}; Real m_s_phi_eb = std::numeric_limits::lowest(); Vector m_phi_eb; int m_rz = false; diff --git a/Src/LinearSolvers/MLMG/AMReX_MLEBNodeFDLaplacian.cpp b/Src/LinearSolvers/MLMG/AMReX_MLEBNodeFDLaplacian.cpp index cfa7595b515..920e8540200 100644 --- a/Src/LinearSolvers/MLMG/AMReX_MLEBNodeFDLaplacian.cpp +++ b/Src/LinearSolvers/MLMG/AMReX_MLEBNodeFDLaplacian.cpp @@ -310,16 +310,20 @@ MLEBNodeFDLaplacian::prepareForSolve () AMREX_ALWAYS_ASSERT_WITH_MESSAGE(m_lobc[0][0] == BCType::Neumann, "The lo-x BC must be Neumann for 2d RZ"); } - AMREX_ALWAYS_ASSERT_WITH_MESSAGE(m_sigma[0] == 0._rt, - "r-direction sigma must be zero"); + if (m_sigma[0] == 0._rt) { + m_sigma[0] = 1._rt; // For backward compatibility + } } #endif } #ifdef AMREX_USE_EB void -MLEBNodeFDLaplacian::scaleRHS (int amrlev, MultiFab& rhs) const +MLEBNodeFDLaplacian::scaleRHS (int amrlev, Any& a_rhs) const { + AMREX_ASSERT(a_rhs.is()); + auto& rhs = a_rhs.get(); + auto const& dmask = *m_dirichlet_mask[amrlev][0]; auto factory = dynamic_cast(m_factory[amrlev][0].get()); auto const& edgecent = factory->getEdgeCent(); @@ -353,6 +357,7 @@ MLEBNodeFDLaplacian::Fapply (int amrlev, int mglev, MultiFab& out, const MultiFa const auto dxinv = m_geom[amrlev][mglev].InvCellSizeArray(); #if (AMREX_SPACEDIM == 2) + const auto sig0 = m_sigma[0]; const auto dx0 = m_geom[amrlev][mglev].CellSize(0); const auto dx1 = m_geom[amrlev][mglev].CellSize(1)/std::sqrt(m_sigma[1]); const auto xlo = m_geom[amrlev][mglev].ProbLo(0); @@ -393,7 +398,7 @@ MLEBNodeFDLaplacian::Fapply (int amrlev, int mglev, MultiFab& out, const MultiFa AMREX_HOST_DEVICE_FOR_3D(box, i, j, k, { mlebndfdlap_adotx_rz_eb(i,j,k,yarr,xarr,dmarr,ecx,ecy, - phiebarr, dx0, dx1, xlo); + phiebarr, sig0, dx0, dx1, xlo); }); } else #endif @@ -410,7 +415,7 @@ MLEBNodeFDLaplacian::Fapply (int amrlev, int mglev, MultiFab& out, const MultiFa AMREX_HOST_DEVICE_FOR_3D(box, i, j, k, { mlebndfdlap_adotx_rz_eb(i,j,k,yarr,xarr,dmarr,ecx,ecy, - phieb, dx0, dx1, xlo); + phieb, sig0, dx0, dx1, xlo); }); } else #endif @@ -429,7 +434,7 @@ MLEBNodeFDLaplacian::Fapply (int amrlev, int mglev, MultiFab& out, const MultiFa if (m_rz) { AMREX_HOST_DEVICE_FOR_3D(box, i, j, k, { - mlebndfdlap_adotx_rz(i,j,k,yarr,xarr,dmarr,dx0,dx1,xlo); + mlebndfdlap_adotx_rz(i,j,k,yarr,xarr,dmarr,sig0,dx0,dx1,xlo); }); } else #endif @@ -450,6 +455,7 @@ MLEBNodeFDLaplacian::Fsmooth (int amrlev, int mglev, MultiFab& sol, const MultiF const auto dxinv = m_geom[amrlev][mglev].InvCellSizeArray(); #if (AMREX_SPACEDIM == 2) + const auto sig0 = m_sigma[0]; const auto dx0 = m_geom[amrlev][mglev].CellSize(0); const auto dx1 = 
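// scaleRHS now takes the type-erased Any instead of a concrete MultiFab,
// following the assert-then-unwrap pattern visible in the call sites above.
// A stand-in sketch of that pattern using std::any (amrex::Any plays a
// similar role in the solver); MultiFabStandIn and scale_rhs are
// illustrative names only:

#include <any>
#include <cassert>
#include <typeinfo>

struct MultiFabStandIn { /* field data */ };

void scale_rhs (std::any& a_rhs)
{
    assert(a_rhs.type() == typeid(MultiFabStandIn));     // like a_rhs.is<...>()
    auto& rhs = *std::any_cast<MultiFabStandIn>(&a_rhs); // like a_rhs.get<...>()
    // ... scale rhs in place ...
    (void) rhs;
}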
m_geom[amrlev][mglev].CellSize(1)/std::sqrt(m_sigma[1]); const auto xlo = m_geom[amrlev][mglev].ProbLo(0); @@ -492,7 +498,7 @@ MLEBNodeFDLaplacian::Fsmooth (int amrlev, int mglev, MultiFab& sol, const MultiF AMREX_HOST_DEVICE_FOR_3D(box, i, j, k, { mlebndfdlap_gsrb_rz_eb(i,j,k,solarr,rhsarr,dmskarr,ecx,ecy, - dx0, dx1, xlo, redblack); + sig0, dx0, dx1, xlo, redblack); }); } else #endif @@ -511,7 +517,7 @@ MLEBNodeFDLaplacian::Fsmooth (int amrlev, int mglev, MultiFab& sol, const MultiF AMREX_HOST_DEVICE_FOR_3D(box, i, j, k, { mlebndfdlap_gsrb_rz(i,j,k,solarr,rhsarr,dmskarr, - dx0, dx1, xlo, redblack); + sig0, dx0, dx1, xlo, redblack); }); } else #endif @@ -634,22 +640,57 @@ MLEBNodeFDLaplacian::compGrad (int amrlev, const Array #if defined(AMREX_USE_HYPRE) && (AMREX_SPACEDIM > 1) void -MLEBNodeFDLaplacian::fillIJMatrix (MFIter const& mfi, - Array4 const& gid, - Array4 const& lid, - HypreNodeLap::Int* const ncols, - HypreNodeLap::Int* const cols, - Real* const mat) const +MLEBNodeFDLaplacian::fillIJMatrix (MFIter const& /*mfi*/, + Array4 const& /*gid*/, + Array4 const& /*lid*/, + HypreNodeLap::Int* const /*ncols*/, + HypreNodeLap::Int* const /*cols*/, + Real* const /*mat*/) const { amrex::Abort("MLEBNodeFDLaplacian::fillIJMatrix: todo"); } void -MLEBNodeFDLaplacian::fillRHS (MFIter const& mfi, Array4 const& lid, - Real* const rhs, Array4 const& bfab) const +MLEBNodeFDLaplacian::fillRHS (MFIter const& /*mfi*/, Array4 const& /*lid*/, + Real* const /*rhs*/, Array4 const& /*bfab*/) const { amrex::Abort("MLEBNodeFDLaplacian::fillRHS: todo"); } #endif +void +MLEBNodeFDLaplacian::postSolve (Vector& sol) const +{ +#ifdef AMREX_USE_EB + for (int amrlev = 0; amrlev < m_num_amr_levels; ++amrlev) { + const auto phieb = m_s_phi_eb; + auto factory = dynamic_cast(m_factory[amrlev][0].get()); + auto const& levset_mf = factory->getLevelSet(); + auto const& levset_ar = levset_mf.const_arrays(); + MultiFab& mf = sol[amrlev].get(); + auto const& sol_ar = mf.arrays(); + if (phieb == std::numeric_limits::lowest()) { + auto const& phieb_ar = m_phi_eb[amrlev].const_arrays(); + amrex::ParallelFor(mf, IntVect(1), + [=] AMREX_GPU_DEVICE (int bi, int i, int j, int k) noexcept + { + if (levset_ar[bi](i,j,k) >= Real(0.0)) { + sol_ar[bi](i,j,k) = phieb_ar[bi](i,j,k); + } + }); + } else { + amrex::ParallelFor(mf, IntVect(1), + [=] AMREX_GPU_DEVICE (int bi, int i, int j, int k) noexcept + { + if (levset_ar[bi](i,j,k) >= Real(0.0)) { + sol_ar[bi](i,j,k) = phieb; + } + }); + } + } +#else + amrex::ignore_unused(sol); +#endif +} + } diff --git a/Src/LinearSolvers/MLMG/AMReX_MLEBTensorOp.H b/Src/LinearSolvers/MLMG/AMReX_MLEBTensorOp.H index a522d5aa927..1ed29a84801 100644 --- a/Src/LinearSolvers/MLMG/AMReX_MLEBTensorOp.H +++ b/Src/LinearSolvers/MLMG/AMReX_MLEBTensorOp.H @@ -105,7 +105,8 @@ public: // for cuda void applyBCTensor (int amrlev, int mglev, MultiFab& vel, BCMode bc_mode, StateMode s_mode, const MLMGBndry* bndry) const; - void compCrossTerms(int amrlev, int mglev, MultiFab const& mf) const; + void compCrossTerms(int amrlev, int mglev, MultiFab const& mf, + const MLMGBndry* bndry) const; }; } diff --git a/Src/LinearSolvers/MLMG/AMReX_MLEBTensorOp.cpp b/Src/LinearSolvers/MLMG/AMReX_MLEBTensorOp.cpp index 247e0fb292e..87bb78da730 100644 --- a/Src/LinearSolvers/MLMG/AMReX_MLEBTensorOp.cpp +++ b/Src/LinearSolvers/MLMG/AMReX_MLEBTensorOp.cpp @@ -226,7 +226,7 @@ MLEBTensorOp::apply (int amrlev, int mglev, MultiFab& out, MultiFab& in, BCMode MultiFab const& kapebmf = m_eb_kappa[amrlev][mglev]; Real bscalar = m_b_scalar; - 
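// postSolve above stamps the EB Dirichlet value back into nodes inside the
// body, identified by a non-negative signed level set, so the returned
// solution is well defined everywhere rather than only in the fluid region.
// Flat-array sketch of the idea (illustrative names):

void post_solve (double* sol, double const* levset, long n, double phi_eb)
{
    for (long i = 0; i < n; ++i) {
        if (levset[i] >= 0.0) {  // on or inside the embedded boundary
            sol[i] = phi_eb;     // or the spatially varying phi_eb field
        }
    }
}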
compCrossTerms(amrlev, mglev, in); + compCrossTerms(amrlev, mglev, in, bndry); MFItInfo mfi_info; if (Gpu::notInLaunchRegion()) mfi_info.EnableTiling().SetDynamic(true); @@ -289,15 +289,23 @@ MLEBTensorOp::apply (int amrlev, int mglev, MultiFab& out, MultiFab& in, BCMode } void -MLEBTensorOp::compCrossTerms(int amrlev, int mglev, MultiFab const& mf) const +MLEBTensorOp::compCrossTerms(int amrlev, int mglev, MultiFab const& mf, + const MLMGBndry* bndry) const { auto factory = dynamic_cast(m_factory[amrlev][mglev].get()); const FabArray* flags = (factory) ? &(factory->getMultiEBCellFlagFab()) : nullptr; auto area = (factory) ? factory->getAreaFrac() : Array{AMREX_D_DECL(nullptr,nullptr,nullptr)}; + const auto& bcondloc = *m_bcondloc[amrlev][mglev]; + + Array4 foo; + const Geometry& geom = m_geom[amrlev][mglev]; const auto dxinv = geom.InvCellSizeArray(); + const Box& domain = geom.growPeriodicDomain(1); + const auto dlo = amrex::lbound(domain); + const auto dhi = amrex::ubound(domain); Array const& etamf = m_b_coeffs[amrlev][mglev]; Array const& kapmf = m_kappa[amrlev][mglev]; @@ -346,56 +354,143 @@ MLEBTensorOp::compCrossTerms(int amrlev, int mglev, MultiFab const& mf) const } ); } else { - AMREX_D_TERM(Array4 const fxfab = fluxmf[0].array(mfi);, - Array4 const fyfab = fluxmf[1].array(mfi);, - Array4 const fzfab = fluxmf[2].array(mfi);); - Array4 const vfab = mf.const_array(mfi); - AMREX_D_TERM(Array4 const etaxfab = etamf[0].const_array(mfi);, - Array4 const etayfab = etamf[1].const_array(mfi);, - Array4 const etazfab = etamf[2].const_array(mfi);); - AMREX_D_TERM(Array4 const kapxfab = kapmf[0].const_array(mfi);, - Array4 const kapyfab = kapmf[1].const_array(mfi);, - Array4 const kapzfab = kapmf[2].const_array(mfi);); - - if (fabtyp == FabType::regular) - { - AMREX_LAUNCH_HOST_DEVICE_LAMBDA_DIM - ( xbx, txbx, - { - mltensor_cross_terms_fx(txbx,fxfab,vfab,etaxfab,kapxfab,dxinv); - } - , ybx, tybx, - { - mltensor_cross_terms_fy(tybx,fyfab,vfab,etayfab,kapyfab,dxinv); - } - , zbx, tzbx, - { - mltensor_cross_terms_fz(tzbx,fzfab,vfab,etazfab,kapzfab,dxinv); + AMREX_D_TERM(Array4 const fxfab = fluxmf[0].array(mfi);, + Array4 const fyfab = fluxmf[1].array(mfi);, + Array4 const fzfab = fluxmf[2].array(mfi);); + Array4 const vfab = mf.const_array(mfi); + AMREX_D_TERM(Array4 const etaxfab = etamf[0].const_array(mfi);, + Array4 const etayfab = etamf[1].const_array(mfi);, + Array4 const etazfab = etamf[2].const_array(mfi);); + AMREX_D_TERM(Array4 const kapxfab = kapmf[0].const_array(mfi);, + Array4 const kapyfab = kapmf[1].const_array(mfi);, + Array4 const kapzfab = kapmf[2].const_array(mfi);); + + if (fabtyp == FabType::regular) + { + if (domain.strictly_contains(bx)) { + AMREX_LAUNCH_HOST_DEVICE_LAMBDA_DIM + ( xbx, txbx, + { + mltensor_cross_terms_fx(txbx,fxfab,vfab,etaxfab,kapxfab,dxinv); + } + , ybx, tybx, + { + mltensor_cross_terms_fy(tybx,fyfab,vfab,etayfab,kapyfab,dxinv); + } + , zbx, tzbx, + { + mltensor_cross_terms_fz(tzbx,fzfab,vfab,etazfab,kapzfab,dxinv); + } + ); + } else { + const auto & bdcv = bcondloc.bndryConds(mfi); + + Array2D bct; + for (int icomp = 0; icomp < AMREX_SPACEDIM; ++icomp) { + for (OrientationIter face; face; ++face) { + Orientation ori = face(); + bct(ori,icomp) = bdcv[icomp][ori]; + } + } + + const auto& bvxlo = (bndry != nullptr) ? + (*bndry)[Orientation(0,Orientation::low )].array(mfi) : foo; + const auto& bvylo = (bndry != nullptr) ? + (*bndry)[Orientation(1,Orientation::low )].array(mfi) : foo; + const auto& bvxhi = (bndry != nullptr) ? 
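// Two things to note in the compCrossTerms rework that follows: interior
// tiles (domain.strictly_contains(bx)) keep the original cheap kernels, and
// boundary tiles first snapshot the boundary-condition type of every
// (face, component) pair into a dense Array2D so the kernels can branch with
// O(1) lookups. Reduced sketch of that table build, with plain enums
// standing in for BoundCond and Orientation:

enum class BC { Dirichlet, Neumann, ReflectOdd };
constexpr int NFACES = 6; // 2*AMREX_SPACEDIM in 3D

struct BCTable { BC t[NFACES][3]; };

BCTable gather_bc (BC const src[NFACES][3])
{
    BCTable bct{};
    for (int comp = 0; comp < 3; ++comp) {
        for (int face = 0; face < NFACES; ++face) { // mirrors OrientationIter
            bct.t[face][comp] = src[face][comp];
        }
    }
    return bct;
}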
+ (*bndry)[Orientation(0,Orientation::high)].array(mfi) : foo; + const auto& bvyhi = (bndry != nullptr) ? + (*bndry)[Orientation(1,Orientation::high)].array(mfi) : foo; +#if (AMREX_SPACEDIM == 3) + const auto& bvzlo = (bndry != nullptr) ? + (*bndry)[Orientation(2,Orientation::low )].array(mfi) : foo; + const auto& bvzhi = (bndry != nullptr) ? + (*bndry)[Orientation(2,Orientation::high)].array(mfi) : foo; +#endif + + AMREX_LAUNCH_HOST_DEVICE_LAMBDA_DIM + ( xbx, txbx, + { + mltensor_cross_terms_fx(txbx,fxfab,vfab,etaxfab,kapxfab,dxinv, + bvxlo, bvxhi, bct, dlo, dhi); + } + , ybx, tybx, + { + mltensor_cross_terms_fy(tybx,fyfab,vfab,etayfab,kapyfab,dxinv, + bvylo, bvyhi, bct, dlo, dhi); + } + , zbx, tzbx, + { + mltensor_cross_terms_fz(tzbx,fzfab,vfab,etazfab,kapzfab,dxinv, + bvzlo, bvzhi, bct, dlo, dhi); + } + ); } - ); - } - else - { - AMREX_D_TERM(Array4 const& apx = area[0]->const_array(mfi);, - Array4 const& apy = area[1]->const_array(mfi);, - Array4 const& apz = area[2]->const_array(mfi);); - Array4 const& flag = flags->const_array(mfi); + } + else + { + AMREX_D_TERM(Array4 const& apx = area[0]->const_array(mfi);, + Array4 const& apy = area[1]->const_array(mfi);, + Array4 const& apz = area[2]->const_array(mfi);); + Array4 const& flag = flags->const_array(mfi); + + if (domain.strictly_contains(bx)) { + AMREX_LAUNCH_HOST_DEVICE_LAMBDA_DIM + ( xbx, txbx, + { + mlebtensor_cross_terms_fx(txbx,fxfab,vfab,etaxfab,kapxfab,apx,flag,dxinv); + } + , ybx, tybx, + { + mlebtensor_cross_terms_fy(tybx,fyfab,vfab,etayfab,kapyfab,apy,flag,dxinv); + } + , zbx, tzbx, + { + mlebtensor_cross_terms_fz(tzbx,fzfab,vfab,etazfab,kapzfab,apz,flag,dxinv); + } + ); + } else { + const auto & bdcv = bcondloc.bndryConds(mfi); + + Array2D bct; + for (int icomp = 0; icomp < AMREX_SPACEDIM; ++icomp) { + for (OrientationIter face; face; ++face) { + Orientation ori = face(); + bct(ori,icomp) = bdcv[icomp][ori]; + } + } + + const auto& bvxlo = (bndry != nullptr) ? + (*bndry)[Orientation(0,Orientation::low )].array(mfi) : foo; + const auto& bvylo = (bndry != nullptr) ? + (*bndry)[Orientation(1,Orientation::low )].array(mfi) : foo; + const auto& bvxhi = (bndry != nullptr) ? + (*bndry)[Orientation(0,Orientation::high)].array(mfi) : foo; + const auto& bvyhi = (bndry != nullptr) ? + (*bndry)[Orientation(1,Orientation::high)].array(mfi) : foo; +#if (AMREX_SPACEDIM == 3) + const auto& bvzlo = (bndry != nullptr) ? + (*bndry)[Orientation(2,Orientation::low )].array(mfi) : foo; + const auto& bvzhi = (bndry != nullptr) ? 
+ (*bndry)[Orientation(2,Orientation::high)].array(mfi) : foo; +#endif - AMREX_LAUNCH_HOST_DEVICE_LAMBDA_DIM - ( xbx, txbx, - { - mlebtensor_cross_terms_fx(txbx,fxfab,vfab,etaxfab,kapxfab,apx,flag,dxinv); - } - , ybx, tybx, - { - mlebtensor_cross_terms_fy(tybx,fyfab,vfab,etayfab,kapyfab,apy,flag,dxinv); - } - , zbx, tzbx, - { - mlebtensor_cross_terms_fz(tzbx,fzfab,vfab,etazfab,kapzfab,apz,flag,dxinv); - } - ); - } + AMREX_LAUNCH_HOST_DEVICE_LAMBDA_DIM + ( xbx, txbx, + { + mlebtensor_cross_terms_fx(txbx,fxfab,vfab,etaxfab,kapxfab,apx,flag,dxinv, bvxlo, bvxhi, bct, dlo, dhi); + } + , ybx, tybx, + { + mlebtensor_cross_terms_fy(tybx,fyfab,vfab,etayfab,kapyfab,apy,flag,dxinv, bvylo, bvyhi, bct, dlo, dhi); + } + , zbx, tzbx, + { + mlebtensor_cross_terms_fz(tzbx,fzfab,vfab,etazfab,kapzfab,apz,flag,dxinv, bvzlo, bvzhi, bct, dlo, dhi); + } + ); + } + } } } @@ -411,7 +506,7 @@ MLEBTensorOp::compFlux (int amrlev, const Array& fluxe BL_PROFILE("MLEBTensorOp::compFlux()"); if ( !(loc==Location::FaceCenter || loc==Location::FaceCentroid) ) - amrex::Abort("MLEBTensorOp::compFlux() unknown location for fluxes."); + amrex::Abort("MLEBTensorOp::compFlux() unknown location for fluxes."); const int mglev = 0; const int ncomp = getNComp(); @@ -429,7 +524,7 @@ MLEBTensorOp::compFlux (int amrlev, const Array& fluxe Array& fluxmf = m_tauflux[amrlev][mglev]; Real bscalar = m_b_scalar; - compCrossTerms(amrlev, mglev, sol); + compCrossTerms(amrlev, mglev, sol, m_bndry_sol[amrlev].get()); MFItInfo mfi_info; if (Gpu::notInLaunchRegion()) mfi_info.EnableTiling().SetDynamic(true); @@ -515,104 +610,11 @@ MLEBTensorOp::compFlux (int amrlev, const Array& fluxe } void -MLEBTensorOp::compVelGrad (int amrlev, const Array& fluxes, - MultiFab& sol, Location loc) const +MLEBTensorOp::compVelGrad (int /*amrlev*/, + const Array& /*fluxes*/, + MultiFab& /*sol*/, Location /*loc*/) const { - BL_PROFILE("MLEBTensorOp::compVelGrad()"); - - if ( !(loc==Location::FaceCenter || loc==Location::FaceCentroid) ) - amrex::Abort("MLEBTensorOp::compVelGrad() unknown location for VelGradients."); - - const int mglev = 0; - - applyBCTensor(amrlev, mglev, sol, BCMode::Inhomogeneous, StateMode::Solution, m_bndry_sol[amrlev].get()); - - auto factory = dynamic_cast(m_factory[amrlev][mglev].get()); - const FabArray* flags = (factory) ? &(factory->getMultiEBCellFlagFab()) : nullptr; - - const Geometry& geom = m_geom[amrlev][mglev]; - const auto dxinv = geom.InvCellSizeArray(); - - const int dim_fluxes = AMREX_SPACEDIM*AMREX_SPACEDIM; - - MFItInfo mfi_info; - if (Gpu::notInLaunchRegion()) mfi_info.EnableTiling().SetDynamic(true); -#ifdef AMREX_USE_OMP -#pragma omp parallel if (Gpu::notInLaunchRegion()) -#endif - { - Array fluxfab_tmp; - for (MFIter mfi(sol, mfi_info); mfi.isValid(); ++mfi) - { - const Box& bx = mfi.tilebox(); - - auto fabtyp = (flags) ? 
(*flags)[mfi].getType(bx) : FabType::regular; - if (fabtyp == FabType::covered) continue; - - if (fabtyp == FabType::regular) - { - - Array4 const vfab = sol.const_array(mfi); - AMREX_D_TERM(Box const xbx = mfi.nodaltilebox(0);, - Box const ybx = mfi.nodaltilebox(1);, - Box const zbx = mfi.nodaltilebox(2);); - AMREX_D_TERM(fluxfab_tmp[0].resize(xbx,dim_fluxes);, - fluxfab_tmp[1].resize(ybx,dim_fluxes);, - fluxfab_tmp[2].resize(zbx,dim_fluxes);); - AMREX_D_TERM(Elixir fxeli = fluxfab_tmp[0].elixir();, - Elixir fyeli = fluxfab_tmp[1].elixir();, - Elixir fzeli = fluxfab_tmp[2].elixir();); - AMREX_D_TERM(Array4 const fxfab = fluxfab_tmp[0].array();, - Array4 const fyfab = fluxfab_tmp[1].array();, - Array4 const fzfab = fluxfab_tmp[2].array();); - AMREX_LAUNCH_HOST_DEVICE_LAMBDA_DIM - ( xbx, txbx, - { - mltensor_vel_grads_fx(txbx,fxfab,vfab,dxinv); - } - , ybx, tybx, - { - mltensor_vel_grads_fy(tybx,fyfab,vfab,dxinv); - } - , zbx, tzbx, - { - mltensor_vel_grads_fz(tzbx,fzfab,vfab,dxinv); - } - ); - -// The derivatives are put in the array with the following order: -// component: 0 , 1 , 2 , 3 , 4 , 5 , 6 , 7 , 8 -// in 2D: dU/dx, dV/dx, dU/dy, dV/dy -// in 3D: dU/dx, dV/dx, dW/dx, dU/dy, dV/dy, dW/dy, dU/dz, dV/dz, dW/dz - - - for (int idim = 0; idim < AMREX_SPACEDIM; ++idim) { - const Box& nbx = mfi.nodaltilebox(idim); - Array4 dst = fluxes[idim]->array(mfi); - Array4 src = fluxfab_tmp[idim].const_array(); - AMREX_HOST_DEVICE_PARALLEL_FOR_4D (nbx, dim_fluxes, i, j, k, n, - { - dst(i,j,k,n) = src(i,j,k,n); - }); - } - - - } - else if ( loc==Location::FaceCenter ) - { - - amrex::Abort("compVelGrad not yet implemented for cut-cells "); - - } - else // loc==Location::FaceCentroid - { - - amrex::Abort("compVelGrad not yet implemented for cut-cells "); - - } - - } - } + amrex::Abort("compVelGrad not yet implemented for EB."); } } diff --git a/Src/LinearSolvers/MLMG/AMReX_MLEBTensorOp_bc.cpp b/Src/LinearSolvers/MLMG/AMReX_MLEBTensorOp_bc.cpp index c9c6eb232bb..98beecf01df 100644 --- a/Src/LinearSolvers/MLMG/AMReX_MLEBTensorOp_bc.cpp +++ b/Src/LinearSolvers/MLMG/AMReX_MLEBTensorOp_bc.cpp @@ -13,11 +13,12 @@ MLEBTensorOp::applyBCTensor (int amrlev, int mglev, MultiFab& vel, const auto& bcondloc = *m_bcondloc[amrlev][mglev]; const auto& maskvals = m_maskvals[amrlev][mglev]; - FArrayBox foofab(Box::TheUnitBox(),3); - const auto& foo = foofab.array(); + Array4 foo; const auto dxinv = m_geom[amrlev][mglev].InvCellSizeArray(); const Box& domain = m_geom[amrlev][mglev].growPeriodicDomain(1); + const auto dlo = amrex::lbound(domain); + const auto dhi = amrex::ubound(domain); auto factory = dynamic_cast(m_factory[amrlev][mglev].get()); const FabArray* flags = (factory) ? 
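// The dummy boundary holder also changes here: instead of a one-cell
// FArrayBox ("foofab"), a default-constructed Array4 is passed when bndry is
// null. A default Array4 holds a null pointer and converts to false, which
// is what lets the kernels guard with `... == AMREX_LO_DIRICHLET && bvxlo`.
// Minimal shape of that idiom (View is a stand-in for amrex::Array4):

template <class T>
struct View {
    T* p = nullptr;
    explicit operator bool () const { return p != nullptr; }
};

double sample (View<double const> bv)
{
    return bv ? *bv.p : 0.0; // safe whether or not data was attached
}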
&(factory->getMultiEBCellFlagFab()) : nullptr; @@ -39,14 +40,13 @@ MLEBTensorOp::applyBCTensor (int amrlev, int mglev, MultiFab& vel, const auto & bdlv = bcondloc.bndryLocs(mfi); const auto & bdcv = bcondloc.bndryConds(mfi); - GpuArray bct; - GpuArray bcl; - for (OrientationIter face; face; ++face) { - Orientation ori = face(); - const int iface = ori; - for (int icomp = 0; icomp < AMREX_SPACEDIM; ++icomp) { - bct[iface*AMREX_SPACEDIM+icomp] = bdcv[icomp][ori]; - bcl[iface*AMREX_SPACEDIM+icomp] = bdlv[icomp][ori]; + Array2D bct; + Array2D bcl; + for (int icomp = 0; icomp < AMREX_SPACEDIM; ++icomp) { + for (OrientationIter face; face; ++face) { + Orientation ori = face(); + bct(ori,icomp) = bdcv[icomp][ori]; + bcl(ori,icomp) = bdlv[icomp][ori]; } } @@ -72,7 +72,7 @@ MLEBTensorOp::applyBCTensor (int amrlev, int mglev, MultiFab& vel, mxlo, mylo, mxhi, myhi, bvxlo, bvylo, bvxhi, bvyhi, bct, bcl, inhomog, imaxorder, - dxinv, domain); + dxinv, dlo, dhi); }); #else const auto& mzlo = maskvals[Orientation(2,Orientation::low )].array(mfi); @@ -83,14 +83,37 @@ MLEBTensorOp::applyBCTensor (int amrlev, int mglev, MultiFab& vel, const auto& bvzhi = (bndry != nullptr) ? (*bndry)[Orientation(2,Orientation::high)].array(mfi) : foo; - AMREX_HOST_DEVICE_FOR_1D ( 12, iedge, +#ifdef AMREX_USE_GPU + if (Gpu::inLaunchRegion()) { + amrex::launch(12, 64, Gpu::gpuStream(), +#ifdef AMREX_USE_DPCPP + [=] AMREX_GPU_DEVICE (sycl::nd_item<1> const& item) + { + int bid = item.get_group_linear_id(); + int tid = item.get_local_linear_id(); + int bdim = item.get_local_range(0); +#else + [=] AMREX_GPU_DEVICE () + { + int bid = blockIdx.x; + int tid = threadIdx.x; + int bdim = blockDim.x; +#endif + mltensor_fill_edges(bid, tid, bdim, vbx, velfab, + mxlo, mylo, mzlo, mxhi, myhi, mzhi, + bvxlo, bvylo, bvzlo, bvxhi, bvyhi, bvzhi, + bct, bcl, inhomog, imaxorder, + dxinv, dlo, dhi); + }); + } else +#endif { - mltensor_fill_edges(iedge, vbx, velfab, + mltensor_fill_edges(vbx, velfab, mxlo, mylo, mzlo, mxhi, myhi, mzhi, bvxlo, bvylo, bvzlo, bvxhi, bvyhi, bvzhi, bct, bcl, inhomog, imaxorder, - dxinv, domain); - }); + dxinv, dlo, dhi); + } AMREX_HOST_DEVICE_FOR_1D ( 8, icorner, { @@ -98,13 +121,12 @@ MLEBTensorOp::applyBCTensor (int amrlev, int mglev, MultiFab& vel, mxlo, mylo, mzlo, mxhi, myhi, mzhi, bvxlo, bvylo, bvzlo, bvxhi, bvyhi, bvzhi, bct, bcl, inhomog, imaxorder, - dxinv, domain); + dxinv, dlo, dhi); }); + #endif } } - - // Notet that it is incorrect to call EnforcePeriodicity on vel. } } diff --git a/Src/LinearSolvers/MLMG/AMReX_MLEBTensor_2D_K.H b/Src/LinearSolvers/MLMG/AMReX_MLEBTensor_2D_K.H index 165497d1a20..d93ea3a5d1a 100644 --- a/Src/LinearSolvers/MLMG/AMReX_MLEBTensor_2D_K.H +++ b/Src/LinearSolvers/MLMG/AMReX_MLEBTensor_2D_K.H @@ -6,10 +6,95 @@ namespace amrex { -namespace { - AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE - Real mlebtensor_weight (int d) { - return (d==2) ? 0.5 : ((d==1) ? 
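// The 3D edge fill above switches from one GPU thread per edge to one block
// per edge: amrex::launch(12, 64, ...) starts 12 blocks (a box has 12 edges)
// of 64 threads, and mltensor_fill_edges receives (bid, tid, bdim) so the
// threads of block bid can cooperatively cover that edge. Presumed shape of
// the decomposition (illustrative, not the AMReX kernel):

void fill_edge_blocked (int bid, int tid, int bdim, int edge_len)
{
    // block bid owns edge bid; threads stride along its cells
    for (int m = tid; m < edge_len; m += bdim) {
        // fill ghost cell m of edge bid ...
    }
}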
1.0 : 0.0); +AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE +void mlebtensor_cross_terms_fx (Box const& box, Array4 const& fx, + Array4 const& vel, + Array4 const& etax, + Array4 const& kapx, + Array4 const& apx, + Array4 const& flag, + GpuArray const& dxinv) noexcept +{ + const Real dyi = dxinv[1]; + const auto lo = amrex::lbound(box); + const auto hi = amrex::ubound(box); + constexpr Real twoThirds = 2./3.; + + int k = 0; + for (int j = lo.y; j <= hi.y; ++j) { + AMREX_PRAGMA_SIMD + for (int i = lo.x; i <= hi.x; ++i) { + if (apx(i,j,0) == 0.0) + { + fx(i,j,0,0) = 0.0; + fx(i,j,0,1) = 0.0; + } + else + { + int jhip = j + flag(i ,j,0).isConnected(0, 1,0); + int jhim = j - flag(i ,j,0).isConnected(0,-1,0); + int jlop = j + flag(i-1,j,0).isConnected(0, 1,0); + int jlom = j - flag(i-1,j,0).isConnected(0,-1,0); + Real whi = mlebtensor_weight(jhip-jhim); + Real wlo = mlebtensor_weight(jlop-jlom); + Real dudy = mlebtensor_dy_on_xface(i,j,k,0,vel,dyi, + whi,wlo,jhip,jhim,jlop,jlom); + Real dvdy = mlebtensor_dy_on_xface(i,j,k,1,vel,dyi, + whi,wlo,jhip,jhim,jlop,jlom); + Real divu = dvdy; + Real xif = kapx(i,j,0); + Real mun = Real(0.75)*(etax(i,j,0,0)-xif);// restore the original eta + Real mut = etax(i,j,0,1); + fx(i,j,0,0) = -mun*(-twoThirds*divu) - xif*divu; + fx(i,j,0,1) = -mut*dudy; + } + } + } +} + +AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE +void mlebtensor_cross_terms_fy (Box const& box, Array4 const& fy, + Array4 const& vel, + Array4 const& etay, + Array4 const& kapy, + Array4 const& apy, + Array4 const& flag, + GpuArray const& dxinv) noexcept +{ + const Real dxi = dxinv[0]; + const auto lo = amrex::lbound(box); + const auto hi = amrex::ubound(box); + constexpr Real twoThirds = 2./3.; + + int k = 0; + for (int j = lo.y; j <= hi.y; ++j) { + AMREX_PRAGMA_SIMD + for (int i = lo.x; i <= hi.x; ++i) { + if (apy(i,j,0) == 0.0) + { + fy(i,j,0,0) = 0.0; + fy(i,j,0,1) = 0.0; + } + else + { + int ihip = i + flag(i,j ,0).isConnected( 1,0,0); + int ihim = i - flag(i,j ,0).isConnected(-1,0,0); + int ilop = i + flag(i,j-1,0).isConnected( 1,0,0); + int ilom = i - flag(i,j-1,0).isConnected(-1,0,0); + Real whi = mlebtensor_weight(ihip-ihim); + Real wlo = mlebtensor_weight(ilop-ilom); + Real dudx = mlebtensor_dx_on_yface(i,j,k,0,vel,dxi, + whi,wlo,ihip,ihim,ilop,ilom); + Real dvdx = mlebtensor_dx_on_yface(i,j,k,1,vel,dxi, + whi,wlo,ihip,ihim,ilop,ilom); + Real divu = dudx; + Real xif = kapy(i,j,0); + Real mun = Real(0.75)*(etay(i,j,0,1)-xif);// restore the original eta + Real mut = etay(i,j,0,0); + fy(i,j,0,0) = -mut*dvdx; + fy(i,j,0,1) = -mun*(-twoThirds*divu) - xif*divu; + } + } } } @@ -20,13 +105,20 @@ void mlebtensor_cross_terms_fx (Box const& box, Array4 const& fx, Array4 const& kapx, Array4 const& apx, Array4 const& flag, - GpuArray const& dxinv) noexcept + GpuArray const& dxinv, + Array4 const& bvxlo, + Array4 const& bvxhi, + Array2D const& bct, + Dim3 const& dlo, Dim3 const& dhi) noexcept { const Real dyi = dxinv[1]; const auto lo = amrex::lbound(box); const auto hi = amrex::ubound(box); constexpr Real twoThirds = 2./3.; + int k = 0; for (int j = lo.y; j <= hi.y; ++j) { AMREX_PRAGMA_SIMD for (int i = lo.x; i <= hi.x; ++i) { @@ -43,13 +135,15 @@ void mlebtensor_cross_terms_fx (Box const& box, Array4 const& fx, int jlom = j - flag(i-1,j,0).isConnected(0,-1,0); Real whi = mlebtensor_weight(jhip-jhim); Real wlo = mlebtensor_weight(jlop-jlom); - Real dudy = (0.5*dyi) * ((vel(i ,jhip,0,0)-vel(i ,jhim,0,0))*whi - +(vel(i-1,jlop,0,0)-vel(i-1,jlom,0,0))*wlo); - Real dvdy = (0.5*dyi) * ((vel(i ,jhip,0,1)-vel(i 
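// mlebtensor_weight, used throughout these kernels, keys off the stencil
// span d = jhip - jhim (or ihip - ihim): the isConnected calls make d equal
// to 2 when both transverse neighbors are reachable, 1 when the EB cuts one
// off, and 0 when the face is isolated:
//
//   d   neighbors connected   difference used     weight
//   2   both                  central (span 2h)   0.5
//   1   one                   one-sided (span h)  1.0
//   0   none                  none                0.0
//
// so weight * dyi * (vel(jhip) - vel(jhim)) is a consistent first derivative
// regardless of the local cut-cell connectivity.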
,jhim,0,1))*whi - +(vel(i-1,jlop,0,1)-vel(i-1,jlom,0,1))*wlo); + Real dudy = mlebtensor_dy_on_xface(i,j,k,0,vel,dyi, + bvxlo,bvxhi,bct,dlo,dhi, + whi,wlo,jhip,jhim,jlop,jlom); + Real dvdy = mlebtensor_dy_on_xface(i,j,k,1,vel,dyi, + bvxlo,bvxhi,bct,dlo,dhi, + whi,wlo,jhip,jhim,jlop,jlom); Real divu = dvdy; Real xif = kapx(i,j,0); - Real mun = 0.75*(etax(i,j,0,0)-xif); // restore the original eta + Real mun = Real(0.75)*(etax(i,j,0,0)-xif);// restore the original eta Real mut = etax(i,j,0,1); fx(i,j,0,0) = -mun*(-twoThirds*divu) - xif*divu; fx(i,j,0,1) = -mut*dudy; @@ -65,13 +159,20 @@ void mlebtensor_cross_terms_fy (Box const& box, Array4 const& fy, Array4 const& kapy, Array4 const& apy, Array4 const& flag, - GpuArray const& dxinv) noexcept + GpuArray const& dxinv, + Array4 const& bvylo, + Array4 const& bvyhi, + Array2D const& bct, + Dim3 const& dlo, Dim3 const& dhi) noexcept { const Real dxi = dxinv[0]; const auto lo = amrex::lbound(box); const auto hi = amrex::ubound(box); constexpr Real twoThirds = 2./3.; + int k = 0; for (int j = lo.y; j <= hi.y; ++j) { AMREX_PRAGMA_SIMD for (int i = lo.x; i <= hi.x; ++i) { @@ -88,15 +189,16 @@ void mlebtensor_cross_terms_fy (Box const& box, Array4 const& fy, int ilom = i - flag(i,j-1,0).isConnected(-1,0,0); Real whi = mlebtensor_weight(ihip-ihim); Real wlo = mlebtensor_weight(ilop-ilom); - Real dudx = (0.5*dxi) * ((vel(ihip,j ,0,0)-vel(ihim,j ,0,0))*whi - +(vel(ilop,j-1,0,0)-vel(ilom,j-1,0,0))*wlo); - Real dvdx = (0.5*dxi) * ((vel(ihip,j ,0,1)-vel(ihim,j ,0,1))*whi - +(vel(ilop,j-1,0,1)-vel(ilom,j-1,0,1))*wlo); - + Real dudx = mlebtensor_dx_on_yface(i,j,k,0,vel,dxi, + bvylo,bvyhi,bct,dlo,dhi, + whi,wlo,ihip,ihim,ilop,ilom); + Real dvdx = mlebtensor_dx_on_yface(i,j,k,1,vel,dxi, + bvylo,bvyhi,bct,dlo,dhi, + whi,wlo,ihip,ihim,ilop,ilom); Real divu = dudx; Real xif = kapy(i,j,0); - Real mun = 0.75*(etay(i,j,0,1)-xif); // restore the original eta - Real mut = etay(i,j,0,0); + Real mun = Real(0.75)*(etay(i,j,0,1)-xif);// restore the original eta + Real mut = etay(i,j,0,0); fy(i,j,0,0) = -mut*dvdx; fy(i,j,0,1) = -mun*(-twoThirds*divu) - xif*divu; } diff --git a/Src/LinearSolvers/MLMG/AMReX_MLEBTensor_3D_K.H b/Src/LinearSolvers/MLMG/AMReX_MLEBTensor_3D_K.H index 3c26566e7ac..2651addee2c 100644 --- a/Src/LinearSolvers/MLMG/AMReX_MLEBTensor_3D_K.H +++ b/Src/LinearSolvers/MLMG/AMReX_MLEBTensor_3D_K.H @@ -6,11 +6,44 @@ namespace amrex { -namespace { - AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE - Real mlebtensor_weight (int d) { - return (d==2) ? 0.5 : ((d==1) ? 
1.0 : 0.0); - } +AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE +Real mlebtensor_dz_on_xface (int i, int j, int, int n, + Array4 const& vel, Real dzi, + Real whi, Real wlo, + int khip, int khim, int klop, int klom) noexcept +{ + return Real(0.5)*dzi * ((vel(i ,j,khip,n)-vel(i ,j,khim,n))*whi + + (vel(i-1,j,klop,n)-vel(i-1,j,klom,n))*wlo); +} + +AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE +Real mlebtensor_dz_on_yface (int i, int j, int, int n, + Array4 const& vel, Real dzi, + Real whi, Real wlo, + int khip, int khim, int klop, int klom) noexcept +{ + return Real(0.5)*dzi * ((vel(i,j ,khip,n)-vel(i,j ,khim,n))*whi + + (vel(i,j-1,klop,n)-vel(i,j-1,klom,n))*wlo); +} + +AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE +Real mlebtensor_dx_on_zface (int, int j, int k, int n, + Array4 const& vel, Real dxi, + Real whi, Real wlo, + int ihip, int ihim, int ilop, int ilom) noexcept +{ + return Real(0.5)*dxi * ((vel(ihip,j,k ,n)-vel(ihim,j,k ,n))*whi + + (vel(ilop,j,k-1,n)-vel(ilom,j,k-1,n))*wlo); +} + +AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE +Real mlebtensor_dy_on_zface (int i, int, int k, int n, + Array4 const& vel, Real dyi, + Real whi, Real wlo, + int jhip, int jhim, int jlop, int jlom) noexcept +{ + return Real(0.5)*dyi * ((vel(i,jhip,k ,n)-vel(i,jhim,k ,n))*whi + + (vel(i,jlop,k-1,n)-vel(i,jlom,k-1,n))*wlo); } AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE @@ -46,26 +79,24 @@ void mlebtensor_cross_terms_fx (Box const& box, Array4 const& fx, int jlom = j - flag(i-1,j,k).isConnected(0,-1,0); Real whi = mlebtensor_weight(jhip-jhim); Real wlo = mlebtensor_weight(jlop-jlom); - Real dudy = (0.5*dyi) * ((vel(i ,jhip,k,0)-vel(i ,jhim,k,0))*whi - +(vel(i-1,jlop,k,0)-vel(i-1,jlom,k,0))*wlo); - Real dvdy = (0.5*dyi) * ((vel(i ,jhip,k,1)-vel(i ,jhim,k,1))*whi - +(vel(i-1,jlop,k,1)-vel(i-1,jlom,k,1))*wlo); - + Real dudy = mlebtensor_dy_on_xface(i,j,k,0,vel,dyi, + whi,wlo,jhip,jhim,jlop,jlom); + Real dvdy = mlebtensor_dy_on_xface(i,j,k,1,vel,dyi, + whi,wlo,jhip,jhim,jlop,jlom); int khip = k + flag(i ,j,k).isConnected(0,0, 1); int khim = k - flag(i ,j,k).isConnected(0,0,-1); int klop = k + flag(i-1,j,k).isConnected(0,0, 1); int klom = k - flag(i-1,j,k).isConnected(0,0,-1); whi = mlebtensor_weight(khip-khim); wlo = mlebtensor_weight(klop-klom); - Real dudz = (0.5*dzi) * ((vel(i ,j,khip,0)-vel(i ,j,khim,0))*whi - +(vel(i-1,j,klop,0)-vel(i-1,j,klom,0))*wlo); - Real dwdz = (0.5*dzi) * ((vel(i ,j,khip,2)-vel(i ,j,khim,2))*whi - +(vel(i-1,j,klop,2)-vel(i-1,j,klom,2))*wlo); - + Real dudz = mlebtensor_dz_on_xface(i,j,k,0,vel,dzi, + whi,wlo,khip,khim,klop,klom); + Real dwdz = mlebtensor_dz_on_xface(i,j,k,2,vel,dzi, + whi,wlo,khip,khim,klop,klom); Real divu = dvdy + dwdz; Real xif = kapx(i,j,k); - Real mun = 0.75*(etax(i,j,k,0)-xif); // restore the original eta - Real mut = etax(i,j,k,1); + Real mun = Real(0.75)*(etax(i,j,k,0)-xif);// restore the original eta + Real mut = etax(i,j,k,1); fx(i,j,k,0) = -mun*(-twoThirds*divu) - xif*divu; fx(i,j,k,1) = -mut*dudy; fx(i,j,k,2) = -mut*dudz; @@ -108,26 +139,24 @@ void mlebtensor_cross_terms_fy (Box const& box, Array4 const& fy, int ilom = i - flag(i,j-1,k).isConnected(-1,0,0); Real whi = mlebtensor_weight(ihip-ihim); Real wlo = mlebtensor_weight(ilop-ilom); - Real dudx = (0.5*dxi) * ((vel(ihip,j ,k,0)-vel(ihim,j ,k,0))*whi - +(vel(ilop,j-1,k,0)-vel(ilom,j-1,k,0))*wlo); - Real dvdx = (0.5*dxi) * ((vel(ihip,j ,k,1)-vel(ihim,j ,k,1))*whi - +(vel(ilop,j-1,k,1)-vel(ilom,j-1,k,1))*wlo); - + Real dudx = mlebtensor_dx_on_yface(i,j,k,0,vel,dxi, + whi,wlo,ihip,ihim,ilop,ilom); + Real dvdx = 
mlebtensor_dx_on_yface(i,j,k,1,vel,dxi, + whi,wlo,ihip,ihim,ilop,ilom); int khip = k + flag(i,j ,k).isConnected(0,0, 1); int khim = k - flag(i,j ,k).isConnected(0,0,-1); int klop = k + flag(i,j-1,k).isConnected(0,0, 1); int klom = k - flag(i,j-1,k).isConnected(0,0,-1); whi = mlebtensor_weight(khip-khim); wlo = mlebtensor_weight(klop-klom); - Real dvdz = (0.5*dzi) * ((vel(i,j ,khip,1)-vel(i,j ,khim,1))*whi - +(vel(i,j-1,klop,1)-vel(i,j-1,klom,1))*wlo); - Real dwdz = (0.5*dzi) * ((vel(i,j ,khip,2)-vel(i,j ,khim,2))*whi - +(vel(i,j-1,klop,2)-vel(i,j-1,klom,2))*wlo); - + Real dvdz = mlebtensor_dz_on_yface(i,j,k,1,vel,dzi, + whi,wlo,khip,khim,klop,klom); + Real dwdz = mlebtensor_dz_on_yface(i,j,k,2,vel,dzi, + whi,wlo,khip,khim,klop,klom); Real divu = dudx + dwdz; Real xif = kapy(i,j,k); - Real mun = 0.75*(etay(i,j,k,1)-xif); // restore the original eta - Real mut = etay(i,j,k,0); + Real mun = Real(0.75)*(etay(i,j,k,1)-xif);// restore the original eta + Real mut = etay(i,j,k,0); fy(i,j,k,0) = -mut*dvdx; fy(i,j,k,1) = -mun*(-twoThirds*divu) - xif*divu; fy(i,j,k,2) = -mut*dvdz; @@ -170,27 +199,457 @@ void mlebtensor_cross_terms_fz (Box const& box, Array4 const& fz, int ilom = i - flag(i,j,k-1).isConnected(-1,0,0); Real whi = mlebtensor_weight(ihip-ihim); Real wlo = mlebtensor_weight(ilop-ilom); + Real dudx = mlebtensor_dx_on_zface(i,j,k,0,vel,dxi, + whi,wlo,ihip,ihim,ilop,ilom); + Real dwdx = mlebtensor_dx_on_zface(i,j,k,2,vel,dxi, + whi,wlo,ihip,ihim,ilop,ilom); + int jhip = j + flag(i,j,k ).isConnected(0, 1,0); + int jhim = j - flag(i,j,k ).isConnected(0,-1,0); + int jlop = j + flag(i,j,k-1).isConnected(0, 1,0); + int jlom = j - flag(i,j,k-1).isConnected(0,-1,0); + whi = mlebtensor_weight(jhip-jhim); + wlo = mlebtensor_weight(jlop-jlom); + Real dvdy = mlebtensor_dy_on_zface(i,j,k,1,vel,dyi, + whi,wlo,jhip,jhim,jlop,jlom); + Real dwdy = mlebtensor_dy_on_zface(i,j,k,2,vel,dyi, + whi,wlo,jhip,jhim,jlop,jlom); + Real divu = dudx + dvdy; + Real xif = kapz(i,j,k); + Real mun = Real(0.75)*(etaz(i,j,k,2)-xif);// restore the original eta + Real mut = etaz(i,j,k,0); + + fz(i,j,k,0) = -mut*dwdx; + fz(i,j,k,1) = -mut*dwdy; + fz(i,j,k,2) = -mun*(-twoThirds*divu) - xif*divu; + } + } + } + } +} + +AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE +Real mlebtensor_dz_on_xface (int i, int j, int k, int n, + Array4 const& vel, Real dzi, + Array4 const& bvxlo, + Array4 const& bvxhi, + Array2D const& bct, + Dim3 const& dlo, Dim3 const& dhi, + Real whi, Real wlo, + int khip, int khim, int klop, int klom) noexcept +{ + Real ddz; + if (i == dlo.x) { + if (bct(Orientation::xlo(),n) == AMREX_LO_DIRICHLET && bvxlo) { + if (k == dlo.z) { + ddz = (bvxlo(i-1,j,k ,n) * Real(-1.5) + + bvxlo(i-1,j,k+1,n) * Real(2.) + + bvxlo(i-1,j,k+2,n) * Real(-0.5)) * dzi; + } else if (k == dhi.z) { + ddz = -(bvxlo(i-1,j,k ,n) * Real(-1.5) + + bvxlo(i-1,j,k-1,n) * Real(2.) + + bvxlo(i-1,j,k-2,n) * Real(-0.5)) * dzi; + } else { + ddz = whi*dzi*(bvxlo(i-1,j,khip,n)-bvxlo(i-1,j,khim,n)); + } + } else if (bct(Orientation::xlo(),n) == AMREX_LO_NEUMANN) { + ddz = whi*dzi*(vel(i,j,khip,n)-vel(i,j,khim,n)); + } else { // AMREX_LO_REFLECT_ODD or homogeneous Dirichlet + ddz = Real(0.); + } + } else if (i == dhi.x+1) { + if (bct(Orientation::xhi(),n) == AMREX_LO_DIRICHLET && bvxhi) { + if (k == dlo.z) { + ddz = (bvxhi(i,j,k ,n) * Real(-1.5) + + bvxhi(i,j,k+1,n) * Real(2.) + + bvxhi(i,j,k+2,n) * Real(-0.5)) * dzi; + } else if (k == dhi.z) { + ddz = -(bvxhi(i,j,k ,n) * Real(-1.5) + + bvxhi(i,j,k-1,n) * Real(2.) 
+ + bvxhi(i,j,k-2,n) * Real(-0.5)) * dzi; + } else { + ddz = wlo*dzi*(bvxhi(i,j,klop,n)-bvxhi(i,j,klom,n)); + } + } else if (bct(Orientation::xhi(),n) == AMREX_LO_NEUMANN) { + ddz = wlo*dzi*(vel(i-1,j,klop,n)-vel(i-1,j,klom,n)); + } else { // AMREX_LO_REFLECT_ODD or homogeneous Dirichlet + ddz = Real(0.); + } + } else { + ddz = mlebtensor_dz_on_xface(i,j,k,n,vel,dzi,whi,wlo,khip,khim,klop,klom); + } + return ddz; +} + +AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE +Real mlebtensor_dz_on_yface (int i, int j, int k, int n, + Array4 const& vel, Real dzi, + Array4 const& bvylo, + Array4 const& bvyhi, + Array2D const& bct, + Dim3 const& dlo, Dim3 const& dhi, + Real whi, Real wlo, + int khip, int khim, int klop, int klom) noexcept +{ + Real ddz; + if (j == dlo.y) { + if (bct(Orientation::ylo(),n) == AMREX_LO_DIRICHLET && bvylo) { + if (k == dlo.z) { + ddz = (bvylo(i,j-1,k ,n) * Real(-1.5) + + bvylo(i,j-1,k+1,n) * Real(2.) + + bvylo(i,j-1,k+2,n) * Real(-0.5)) * dzi; + } else if (k == dhi.z) { + ddz = -(bvylo(i,j-1,k ,n) * Real(-1.5) + + bvylo(i,j-1,k-1,n) * Real(2.) + + bvylo(i,j-1,k-2,n) * Real(-0.5)) * dzi; + } else { + ddz = whi*dzi*(bvylo(i,j-1,khip,n)-bvylo(i,j-1,khim,n)); + } + } else if (bct(Orientation::ylo(),n) == AMREX_LO_NEUMANN) { + ddz = whi*dzi*(vel(i,j,khip,n)-vel(i,j,khim,n)); + } else { // AMREX_LO_REFLECT_ODD or homogeneous Dirichlet + ddz = Real(0.); + } + } else if (j == dhi.y+1) { + if (bct(Orientation::yhi(),n) == AMREX_LO_DIRICHLET && bvyhi) { + if (k == dlo.z) { + ddz = (bvyhi(i,j,k ,n) * Real(-1.5) + + bvyhi(i,j,k+1,n) * Real(2.) + + bvyhi(i,j,k+2,n) * Real(-0.5)) * dzi; + } else if (k == dhi.z) { + ddz = -(bvyhi(i,j,k ,n) * Real(-1.5) + + bvyhi(i,j,k-1,n) * Real(2.) + + bvyhi(i,j,k-2,n) * Real(-0.5)) * dzi; + } else { + ddz = wlo*dzi*(bvyhi(i,j,klop,n)-bvyhi(i,j,klom,n)); + } + } else if (bct(Orientation::yhi(),n) == AMREX_LO_NEUMANN) { + ddz = wlo*dzi*(vel(i,j-1,klop,n)-vel(i,j-1,klom,n)); + } else { // AMREX_LO_REFLECT_ODD or homogeneous Dirichlet + ddz = Real(0.); + } + } else { + ddz = mlebtensor_dz_on_yface(i,j,k,n,vel,dzi,whi,wlo,khip,khim,klop,klom); + } + return ddz; +} + +AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE +Real mlebtensor_dx_on_zface (int i, int j, int k, int n, + Array4 const& vel, Real dxi, + Array4 const& bvzlo, + Array4 const& bvzhi, + Array2D const& bct, + Dim3 const& dlo, Dim3 const& dhi, + Real whi, Real wlo, + int ihip, int ihim, int ilop, int ilom) noexcept +{ + Real ddx; + if (k == dlo.z) { + if (bct(Orientation::zlo(),n) == AMREX_LO_DIRICHLET && bvzlo) { + if (i == dlo.x) { + ddx = (bvzlo(i ,j,k-1,n) * Real(-1.5) + + bvzlo(i+1,j,k-1,n) * Real(2.) + + bvzlo(i+2,j,k-1,n) * Real(-0.5)) * dxi; + } else if (i == dhi.x) { + ddx = -(bvzlo(i ,j,k-1,n) * Real(-1.5) + + bvzlo(i-1,j,k-1,n) * Real(2.) + + bvzlo(i-2,j,k-1,n) * Real(-0.5)) * dxi; + } else { + ddx = whi*dxi*(bvzlo(ihip,j,k-1,n)-bvzlo(ihim,j,k-1,n)); + } + } else if (bct(Orientation::zlo(),n) == AMREX_LO_NEUMANN) { + ddx = whi*dxi*(vel(ihip,j,k,n)-vel(ihim,j,k,n)); + } else { // AMREX_LO_REFLECT_ODD or homogeneous Dirichlet + ddx = Real(0.); + } + } else if (k == dhi.z+1) { + if (bct(Orientation::zhi(),n) == AMREX_LO_DIRICHLET && bvzhi) { + if (i == dlo.x) { + ddx = (bvzhi(i ,j,k,n) * Real(-1.5) + + bvzhi(i+1,j,k,n) * Real(2.) + + bvzhi(i+2,j,k,n) * Real(-0.5)) * dxi; + } else if (i == dhi.x) { + ddx = -(bvzhi(i ,j,k,n) * Real(-1.5) + + bvzhi(i-1,j,k,n) * Real(2.) 
+ + bvzhi(i-2,j,k,n) * Real(-0.5)) * dxi; + } else { + ddx = wlo*dxi*(bvzhi(ilop,j,k,n)-bvzhi(ilom,j,k,n)); + } + } else if (bct(Orientation::zhi(),n) == AMREX_LO_NEUMANN) { + ddx = wlo*dxi*(vel(ilop,j,k-1,n)-vel(ilom,j,k-1,n)); + } else { // AMREX_LO_REFLECT_ODD or homogeneous Dirichlet + ddx = Real(0.); + } + } else { + ddx = mlebtensor_dx_on_zface(i,j,k,n,vel,dxi,whi,wlo,ihip,ihim,ilop,ilom); + + } + return ddx; +} + +AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE +Real mlebtensor_dy_on_zface (int i, int j, int k, int n, + Array4 const& vel, Real dyi, + Array4 const& bvzlo, + Array4 const& bvzhi, + Array2D const& bct, + Dim3 const& dlo, Dim3 const& dhi, + Real whi, Real wlo, + int jhip, int jhim, int jlop, int jlom) noexcept +{ + Real ddy; + if (k == dlo.z) { + if (bct(Orientation::zlo(),n) == AMREX_LO_DIRICHLET && bvzlo) { + if (j == dlo.y) { + ddy = (bvzlo(i,j ,k-1,n) * Real(-1.5) + + bvzlo(i,j+1,k-1,n) * Real(2.) + + bvzlo(i,j+2,k-1,n) * Real(-0.5)) * dyi; + } else if (j == dhi.y) { + ddy = -(bvzlo(i,j ,k-1,n) * Real(-1.5) + + bvzlo(i,j-1,k-1,n) * Real(2.) + + bvzlo(i,j-2,k-1,n) * Real(-0.5)) * dyi; + } else { + ddy = whi*dyi*(bvzlo(i,jhip,k-1,n)-bvzlo(i,jhim,k-1,n)); + } + } else if (bct(Orientation::zlo(),n) == AMREX_LO_NEUMANN) { + ddy = whi*dyi*(vel(i,jhip,k,n)-vel(i,jhim,k,n)); + } else { // AMREX_LO_REFLECT_ODD or homogeneous Dirichlet + ddy = Real(0.); + } + } else if (k == dhi.z+1) { + if (bct(Orientation::zhi(),n) == AMREX_LO_DIRICHLET && bvzhi) { + if (j == dlo.y) { + ddy = (bvzhi(i,j ,k,n) * Real(-1.5) + + bvzhi(i,j+1,k,n) * Real(2.) + + bvzhi(i,j+2,k,n) * Real(-0.5)) * dyi; + } else if (j == dhi.y) { + ddy = -(bvzhi(i,j ,k,n) * Real(-1.5) + + bvzhi(i,j-1,k,n) * Real(2.) + + bvzhi(i,j-2,k,n) * Real(-0.5)) * dyi; + } else { + ddy = wlo*dyi*(bvzhi(i,jlop,k,n)-bvzhi(i,jlom,k,n)); + } + } else if (bct(Orientation::zhi(),n) == AMREX_LO_NEUMANN) { + ddy = wlo*dyi*(vel(i,jlop,k-1,n)-vel(i,jlom,k-1,n)); + } else { // AMREX_LO_REFLECT_ODD or homogeneous Dirichlet + ddy = Real(0.); + } + } else { + ddy = mlebtensor_dy_on_zface(i,j,k,n,vel,dyi,whi,wlo,jhip,jhim,jlop,jlom); + } + return ddy; +} + +AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE +void mlebtensor_cross_terms_fx (Box const& box, Array4 const& fx, + Array4 const& vel, + Array4 const& etax, + Array4 const& kapx, + Array4 const& apx, + Array4 const& flag, + GpuArray const& dxinv, + Array4 const& bvxlo, + Array4 const& bvxhi, + Array2D const& bct, + Dim3 const& dlo, Dim3 const& dhi) noexcept + +{ + const Real dyi = dxinv[1]; + const Real dzi = dxinv[2]; + const auto lo = amrex::lbound(box); + const auto hi = amrex::ubound(box); + constexpr Real twoThirds = 2./3.; + + for (int k = lo.z; k <= hi.z; ++k) { + for (int j = lo.y; j <= hi.y; ++j) { + AMREX_PRAGMA_SIMD + for (int i = lo.x; i <= hi.x; ++i) { + if (apx(i,j,k) == 0.0) + { + fx(i,j,k,0) = 0.0; + fx(i,j,k,1) = 0.0; + fx(i,j,k,2) = 0.0; + } + else + { + int jhip = j + flag(i ,j,k).isConnected(0, 1,0); + int jhim = j - flag(i ,j,k).isConnected(0,-1,0); + int jlop = j + flag(i-1,j,k).isConnected(0, 1,0); + int jlom = j - flag(i-1,j,k).isConnected(0,-1,0); + Real whi = mlebtensor_weight(jhip-jhim); + Real wlo = mlebtensor_weight(jlop-jlom); + Real dudy = mlebtensor_dy_on_xface(i,j,k,0,vel,dyi, + bvxlo,bvxhi,bct,dlo,dhi, + whi,wlo,jhip,jhim,jlop,jlom); + Real dvdy = mlebtensor_dy_on_xface(i,j,k,1,vel,dyi, + bvxlo,bvxhi,bct,dlo,dhi, + whi,wlo,jhip,jhim,jlop,jlom); + int khip = k + flag(i ,j,k).isConnected(0,0, 1); + int khim = k - flag(i ,j,k).isConnected(0,0,-1); + int klop = k + 
flag(i-1,j,k).isConnected(0,0, 1); + int klom = k - flag(i-1,j,k).isConnected(0,0,-1); + whi = mlebtensor_weight(khip-khim); + wlo = mlebtensor_weight(klop-klom); + Real dudz = mlebtensor_dz_on_xface(i,j,k,0,vel,dzi, + bvxlo,bvxhi,bct,dlo,dhi, + whi,wlo,khip,khim,klop,klom); + Real dwdz = mlebtensor_dz_on_xface(i,j,k,2,vel,dzi, + bvxlo,bvxhi,bct,dlo,dhi, + whi,wlo,khip,khim,klop,klom); + Real divu = dvdy + dwdz; + Real xif = kapx(i,j,k); + Real mun = Real(0.75)*(etax(i,j,k,0)-xif);// restore the original eta + Real mut = etax(i,j,k,1); + fx(i,j,k,0) = -mun*(-twoThirds*divu) - xif*divu; + fx(i,j,k,1) = -mut*dudy; + fx(i,j,k,2) = -mut*dudz; + } + } + } + } +} + +AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE +void mlebtensor_cross_terms_fy (Box const& box, Array4 const& fy, + Array4 const& vel, + Array4 const& etay, + Array4 const& kapy, + Array4 const& apy, + Array4 const& flag, + GpuArray const& dxinv, + Array4 const& bvylo, + Array4 const& bvyhi, + Array2D const& bct, + Dim3 const& dlo, Dim3 const& dhi) noexcept +{ + const Real dxi = dxinv[0]; + const Real dzi = dxinv[2]; + const auto lo = amrex::lbound(box); + const auto hi = amrex::ubound(box); + constexpr Real twoThirds = 2./3.; + + for (int k = lo.z; k <= hi.z; ++k) { + for (int j = lo.y; j <= hi.y; ++j) { + AMREX_PRAGMA_SIMD + for (int i = lo.x; i <= hi.x; ++i) { + if (apy(i,j,k) == 0.0) + { + fy(i,j,k,0) = 0.0; + fy(i,j,k,1) = 0.0; + fy(i,j,k,2) = 0.0; + } + else + { + int ihip = i + flag(i,j ,k).isConnected( 1,0,0); + int ihim = i - flag(i,j ,k).isConnected(-1,0,0); + int ilop = i + flag(i,j-1,k).isConnected( 1,0,0); + int ilom = i - flag(i,j-1,k).isConnected(-1,0,0); + Real whi = mlebtensor_weight(ihip-ihim); + Real wlo = mlebtensor_weight(ilop-ilom); + Real dudx = mlebtensor_dx_on_yface(i,j,k,0,vel,dxi, + bvylo,bvyhi,bct,dlo,dhi, + whi,wlo,ihip,ihim,ilop,ilom); + Real dvdx = mlebtensor_dx_on_yface(i,j,k,1,vel,dxi, + bvylo,bvyhi,bct,dlo,dhi, + whi,wlo,ihip,ihim,ilop,ilom); + int khip = k + flag(i,j ,k).isConnected(0,0, 1); + int khim = k - flag(i,j ,k).isConnected(0,0,-1); + int klop = k + flag(i,j-1,k).isConnected(0,0, 1); + int klom = k - flag(i,j-1,k).isConnected(0,0,-1); + whi = mlebtensor_weight(khip-khim); + wlo = mlebtensor_weight(klop-klom); + Real dvdz = mlebtensor_dz_on_yface(i,j,k,1,vel,dzi, + bvylo,bvyhi,bct,dlo,dhi, + whi,wlo,khip,khim,klop,klom); + Real dwdz = mlebtensor_dz_on_yface(i,j,k,2,vel,dzi, + bvylo,bvyhi,bct,dlo,dhi, + whi,wlo,khip,khim,klop,klom); + Real divu = dudx + dwdz; + Real xif = kapy(i,j,k); + Real mun = Real(0.75)*(etay(i,j,k,1)-xif);// restore the original eta + Real mut = etay(i,j,k,0); + fy(i,j,k,0) = -mut*dvdx; + fy(i,j,k,1) = -mun*(-twoThirds*divu) - xif*divu; + fy(i,j,k,2) = -mut*dvdz; + } + } + } + } +} - Real dudx = (0.5*dxi) * ((vel(ihip,j,k ,0)-vel(ihim,j,k ,0))*whi - +(vel(ilop,j,k-1,0)-vel(ilom,j,k-1,0))*wlo); - Real dwdx = (0.5*dxi) * ((vel(ihip,j,k ,2)-vel(ihim,j,k ,2))*whi - +(vel(ilop,j,k-1,2)-vel(ilom,j,k-1,2))*wlo); +AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE +void mlebtensor_cross_terms_fz (Box const& box, Array4 const& fz, + Array4 const& vel, + Array4 const& etaz, + Array4 const& kapz, + Array4 const& apz, + Array4 const& flag, + GpuArray const& dxinv, + Array4 const& bvzlo, + Array4 const& bvzhi, + Array2D const& bct, + Dim3 const& dlo, Dim3 const& dhi) noexcept +{ + const Real dxi = dxinv[0]; + const Real dyi = dxinv[1]; + const auto lo = amrex::lbound(box); + const auto hi = amrex::ubound(box); + constexpr Real twoThirds = 2./3.; + for (int k = lo.z; k <= hi.z; ++k) { + for (int j = 
lo.y; j <= hi.y; ++j) { + AMREX_PRAGMA_SIMD + for (int i = lo.x; i <= hi.x; ++i) { + if (apz(i,j,k) == 0.0) + { + fz(i,j,k,0) = 0.0; + fz(i,j,k,1) = 0.0; + fz(i,j,k,2) = 0.0; + } + else + { + int ihip = i + flag(i,j,k ).isConnected( 1,0,0); + int ihim = i - flag(i,j,k ).isConnected(-1,0,0); + int ilop = i + flag(i,j,k-1).isConnected( 1,0,0); + int ilom = i - flag(i,j,k-1).isConnected(-1,0,0); + Real whi = mlebtensor_weight(ihip-ihim); + Real wlo = mlebtensor_weight(ilop-ilom); + Real dudx = mlebtensor_dx_on_zface(i,j,k,0,vel,dxi, + bvzlo,bvzhi,bct,dlo,dhi, + whi,wlo,ihip,ihim,ilop,ilom); + Real dwdx = mlebtensor_dx_on_zface(i,j,k,2,vel,dxi, + bvzlo,bvzhi,bct,dlo,dhi, + whi,wlo,ihip,ihim,ilop,ilom); int jhip = j + flag(i,j,k ).isConnected(0, 1,0); int jhim = j - flag(i,j,k ).isConnected(0,-1,0); int jlop = j + flag(i,j,k-1).isConnected(0, 1,0); int jlom = j - flag(i,j,k-1).isConnected(0,-1,0); whi = mlebtensor_weight(jhip-jhim); wlo = mlebtensor_weight(jlop-jlom); - Real dvdy = (0.5*dyi) * ((vel(i,jhip,k ,1)-vel(i,jhim,k ,1))*whi - +(vel(i,jlop,k-1,1)-vel(i,jlom,k-1,1))*wlo); - Real dwdy = (0.5*dyi) * ((vel(i,jhip,k ,2)-vel(i,jhim,k ,2))*whi - +(vel(i,jlop,k-1,2)-vel(i,jlom,k-1,2))*wlo); - + Real dvdy = mlebtensor_dy_on_zface(i,j,k,1,vel,dyi, + bvzlo,bvzhi,bct,dlo,dhi, + whi,wlo,jhip,jhim,jlop,jlom); + Real dwdy = mlebtensor_dy_on_zface(i,j,k,2,vel,dyi, + bvzlo,bvzhi,bct,dlo,dhi, + whi,wlo,jhip,jhim,jlop,jlom); Real divu = dudx + dvdy; Real xif = kapz(i,j,k); - Real mun = 0.75*(etaz(i,j,k,2)-xif); // restore the original eta - Real mut = etaz(i,j,k,0); + Real mun = Real(0.75)*(etaz(i,j,k,2)-xif);// restore the original eta + Real mut = etaz(i,j,k,0); fz(i,j,k,0) = -mut*dwdx; fz(i,j,k,1) = -mut*dwdy; diff --git a/Src/LinearSolvers/MLMG/AMReX_MLEBTensor_K.H b/Src/LinearSolvers/MLMG/AMReX_MLEBTensor_K.H index c814b3b8e41..8abdde8a7c0 100644 --- a/Src/LinearSolvers/MLMG/AMReX_MLEBTensor_K.H +++ b/Src/LinearSolvers/MLMG/AMReX_MLEBTensor_K.H @@ -4,6 +4,145 @@ #include +namespace amrex { + +AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE +Real mlebtensor_weight (int d) { + return (d==2) ? 0.5 : ((d==1) ? 1.0 : 0.0); +} + +AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE +Real mlebtensor_dy_on_xface (int i, int, int k, int n, + Array4 const& vel, Real dyi, + Real whi, Real wlo, + int jhip, int jhim, int jlop, int jlom) noexcept +{ + return Real(0.5)*dyi * ((vel(i ,jhip,k,n)-vel(i ,jhim,k,n))*whi + + (vel(i-1,jlop,k,n)-vel(i-1,jlom,k,n))*wlo); +} + +AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE +Real mlebtensor_dx_on_yface (int, int j, int k, int n, + Array4 const& vel, Real dxi, + Real whi, Real wlo, + int ihip, int ihim, int ilop, int ilom) noexcept +{ + return Real(0.5)*dxi * ((vel(ihip,j ,k,n)-vel(ihim,j ,k,n))*whi + + (vel(ilop,j-1,k,n)-vel(ilom,j-1,k,n))*wlo); +} + +AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE +Real mlebtensor_dy_on_xface (int i, int j, int k, int n, + Array4 const& vel, Real dyi, + Array4 const& bvxlo, + Array4 const& bvxhi, + Array2D const& bct, + Dim3 const& dlo, Dim3 const& dhi, + Real whi, Real wlo, + int jhip, int jhim, int jlop, int jlom) noexcept +{ + Real ddy; + if (i == dlo.x) { + if (bct(Orientation::xlo(),n) == AMREX_LO_DIRICHLET && bvxlo) { + if (j == dlo.y) { + ddy = (bvxlo(i-1,j ,k,n) * Real(-1.5) + + bvxlo(i-1,j+1,k,n) * Real(2.) + + bvxlo(i-1,j+2,k,n) * Real(-0.5)) * dyi; + } else if (j == dhi.y) { + ddy = -(bvxlo(i-1,j ,k,n) * Real(-1.5) + + bvxlo(i-1,j-1,k,n) * Real(2.) 
+ + bvxlo(i-1,j-2,k,n) * Real(-0.5)) * dyi; + } else { + ddy = whi*dyi*(bvxlo(i-1,jhip,k,n)-bvxlo(i-1,jhim,k,n)); + } + } else if (bct(Orientation::xlo(),n) == AMREX_LO_NEUMANN) { + ddy = whi*dyi*(vel(i,jhip,k,n)-vel(i,jhim,k,n)); + } else { // AMREX_LO_REFLECT_ODD or homogeneous Dirichlet + ddy = Real(0.); + } + } else if (i == dhi.x+1) { + if (bct(Orientation::xhi(),n) == AMREX_LO_DIRICHLET && bvxhi) { + if (j == dlo.y) { + ddy = (bvxhi(i,j ,k,n) * Real(-1.5) + + bvxhi(i,j+1,k,n) * Real(2.) + + bvxhi(i,j+2,k,n) * Real(-0.5)) * dyi; + } else if (j == dhi.y) { + ddy = -(bvxhi(i,j ,k,n) * Real(-1.5) + + bvxhi(i,j-1,k,n) * Real(2.) + + bvxhi(i,j-2,k,n) * Real(-0.5)) * dyi; + } else { + ddy = wlo*dyi*(bvxhi(i,jlop,k,n)-bvxhi(i,jlom,k,n)); + } + } else if (bct(Orientation::xhi(),n) == AMREX_LO_NEUMANN) { + ddy = wlo*dyi*(vel(i-1,jlop,k,n)-vel(i-1,jlom,k,n)); + } else { // AMREX_LO_REFLECT_ODD or homogeneous Dirichlet + ddy = Real(0.); + } + } else { + ddy = mlebtensor_dy_on_xface(i,j,k,n,vel,dyi,whi,wlo,jhip,jhim,jlop,jlom); + } + return ddy; +} + +AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE +Real mlebtensor_dx_on_yface (int i, int j, int k, int n, + Array4 const& vel, Real dxi, + Array4 const& bvylo, + Array4 const& bvyhi, + Array2D const& bct, + Dim3 const& dlo, Dim3 const& dhi, + Real whi, Real wlo, + int ihip, int ihim, int ilop, int ilom) noexcept +{ + Real ddx; + if (j == dlo.y) { + if (bct(Orientation::ylo(),n) == AMREX_LO_DIRICHLET && bvylo) { + if (i == dlo.x) { + ddx = (bvylo(i ,j-1,k,n) * Real(-1.5) + + bvylo(i+1,j-1,k,n) * Real(2.) + + bvylo(i+2,j-1,k,n) * Real(-0.5)) * dxi; + } else if (i == dhi.x) { + ddx = -(bvylo(i ,j-1,k,n) * Real(-1.5) + + bvylo(i-1,j-1,k,n) * Real(2.) + + bvylo(i-2,j-1,k,n) * Real(-0.5)) * dxi; + } else { + ddx = whi*dxi*(bvylo(ihip,j-1,k,n)-bvylo(ihim,j-1,k,n)); + } + } else if (bct(Orientation::ylo(),n) == AMREX_LO_NEUMANN) { + ddx = whi*dxi*(vel(ihip,j,k,n)-vel(ihim,j,k,n)); + } else { // AMREX_LO_REFLECT_ODD or homogeneous Dirichlet + ddx = Real(0.); + } + } else if (j == dhi.y+1) { + if (bct(Orientation::yhi(),n) == AMREX_LO_DIRICHLET && bvyhi) { + if (i == dlo.x) { + ddx = (bvyhi(i ,j,k,n) * Real(-1.5) + + bvyhi(i+1,j,k,n) * Real(2.) + + bvyhi(i+2,j,k,n) * Real(-0.5)) * dxi; + } else if (i == dhi.x) { + ddx = -(bvyhi(i ,j,k,n) * Real(-1.5) + + bvyhi(i-1,j,k,n) * Real(2.) + + bvyhi(i-2,j,k,n) * Real(-0.5)) * dxi; + } else { + ddx = wlo*dxi*(bvyhi(ilop,j,k,n)-bvyhi(ilom,j,k,n)); + } + } else if (bct(Orientation::yhi(),n) == AMREX_LO_NEUMANN) { + ddx = wlo*dxi*(vel(ilop,j-1,k,n)-vel(ilom,j-1,k,n)); + } else { // AMREX_LO_REFLECT_ODD or homogeneous Dirichlet + ddx = Real(0.); + } + } else { + ddx = mlebtensor_dx_on_yface(i,j,k,n,vel,dxi,whi,wlo,ihip,ihim,ilop,ilom); + } + return ddx; +} + +} + #if (AMREX_SPACEDIM == 1) #elif (AMREX_SPACEDIM == 2) #include diff --git a/Src/LinearSolvers/MLMG/AMReX_MLLinOp.H b/Src/LinearSolvers/MLMG/AMReX_MLLinOp.H index f744c96e059..09d835d8b86 100644 --- a/Src/LinearSolvers/MLMG/AMReX_MLLinOp.H +++ b/Src/LinearSolvers/MLMG/AMReX_MLLinOp.H @@ -2,6 +2,7 @@ #define AMREX_ML_LINOP_H_ #include +#include #include #include #include @@ -177,10 +178,10 @@ public: * inhomogeneous Neumann BC, the value in leveldata is assumed to be * `d./dx`. 
*/ - virtual void setLevelBC (int amrlev, const MultiFab* levelbcdata, - const MultiFab* robinbc_a = nullptr, - const MultiFab* robinbc_b = nullptr, - const MultiFab* robinbc_f = nullptr) = 0; + virtual void setLevelBC (int /*amrlev*/, const MultiFab* /*levelbcdata*/, + const MultiFab* /*robinbc_a*/ = nullptr, + const MultiFab* /*robinbc_b*/ = nullptr, + const MultiFab* /*robinbc_f*/ = nullptr) {} void setVerbose (int v) noexcept { verbose = v; } @@ -197,52 +198,51 @@ public: virtual bool needsUpdate () const { return false; } virtual void update () {} - virtual void restriction (int amrlev, int cmglev, MultiFab& crse, MultiFab& fine) const = 0; - virtual void interpolation (int amrlev, int fmglev, MultiFab& fine, const MultiFab& crse) const = 0; - virtual void averageDownSolutionRHS (int camrlev, MultiFab& crse_sol, MultiFab& crse_rhs, - const MultiFab& fine_sol, const MultiFab& fine_rhs) = 0; + virtual void restriction (int /*amrlev*/, int /*cmglev*/, MultiFab& /*crse*/, MultiFab& /*fine*/) const {} + virtual void interpolation (int /*amrlev*/, int /*fmglev*/, MultiFab& /*fine*/, const MultiFab& /*crse*/) const {} + virtual void interpAssign (int /*amrlev*/, int /*fmglev*/, MultiFab& /*fine*/, MultiFab& /*crse*/) const {} + virtual void averageDownSolutionRHS (int /*camrlev*/, MultiFab& /*crse_sol*/, MultiFab& /*crse_rhs*/, + const MultiFab& /*fine_sol*/, const MultiFab& /*fine_rhs*/) {} - virtual void apply (int amrlev, int mglev, MultiFab& out, MultiFab& in, BCMode bc_mode, - StateMode s_mode, const MLMGBndry* bndry=nullptr) const = 0; - virtual void smooth (int amrlev, int mglev, MultiFab& sol, const MultiFab& rhs, - bool skip_fillboundary=false) const = 0; + virtual void apply (int /*amrlev*/, int /*mglev*/, MultiFab& /*out*/, MultiFab& /*in*/, BCMode /*bc_mode*/, + StateMode /*s_mode*/, const MLMGBndry* /*bndry*/=nullptr) const {} + virtual void smooth (int /*amrlev*/, int /*mglev*/, MultiFab& /*sol*/, const MultiFab& /*rhs*/, + bool /*skip_fillboundary*/=false) const {} // Divide mf by the diagonal component of the operator. Used by bicgstab. 
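The hunk above relaxes the MLLinOp contract: most of the MultiFab-based virtuals stop being pure and gain no-op default bodies, so an operator whose unknowns do not live in a single MultiFab no longer has to stub out methods it never uses. A minimal sketch of a derived operator under that relaxed contract — the class name is hypothetical, and the remaining pure virtuals (AnyNormInfMask, AnyInterpolationAmr, AnyAvgDownResAmr, AnyAverageDownAndSync) are elided:

#include <AMReX_MLLinOp.H>

// Sketch only. prepareForSolve() stays pure virtual; everything else
// shown here is now optional to override.
class MyOp : public amrex::MLLinOp
{
public:
    void prepareForSolve () override { /* cache coefficients, masks, ... */ }

    // A MultiFab-based operator may still supply the classic interface:
    void smooth (int amrlev, int mglev, amrex::MultiFab& sol,
                 const amrex::MultiFab& rhs,
                 bool skip_fillboundary = false) const override
    {
        amrex::ignore_unused(amrlev, mglev, sol, rhs, skip_fillboundary);
        // one relaxation sweep would go here
    }
};

The normalize declaration that follows already had a default body before this patch and is unchanged.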
virtual void normalize (int /*amrlev*/, int /*mglev*/, MultiFab& /*mf*/) const {} - virtual void solutionResidual (int amrlev, MultiFab& resid, MultiFab& x, const MultiFab& b, - const MultiFab* crse_bcdata=nullptr) = 0; - virtual void correctionResidual (int amrlev, int mglev, MultiFab& resid, MultiFab& x, const MultiFab& b, - BCMode bc_mode, const MultiFab* crse_bcdata=nullptr) = 0; - - virtual void reflux (int crse_amrlev, - MultiFab& res, const MultiFab& crse_sol, const MultiFab& crse_rhs, - MultiFab& fine_res, MultiFab& fine_sol, const MultiFab& fine_rhs) const = 0; - virtual void compFlux (int amrlev, const Array& fluxes, - MultiFab& sol, Location loc) const = 0; - virtual void compGrad (int amrlev, const Array& grad, - MultiFab& sol, Location loc) const = 0; - - virtual void applyMetricTerm (int amrlev, int mglev, MultiFab& rhs) const = 0; - virtual void unapplyMetricTerm (int amrlev, int mglev, MultiFab& rhs) const = 0; - virtual void fillSolutionBC (int amrlev, MultiFab& sol, const MultiFab* crse_bcdata=nullptr) = 0; - - virtual void unimposeNeumannBC (int /*amrlev*/, MultiFab& /*rhs*/) const {} // only nodal solver might need it - virtual void applyInhomogNeumannTerm (int /*amrlev*/, MultiFab& /*rhs*/) const {} - virtual void applyOverset (int /*amlev*/, MultiFab& /*rhs*/) const {} - virtual void scaleRHS (int /*amrlev*/, MultiFab& /*rhs*/) const {} - virtual Real getSolvabilityOffset (int /*amrlev*/, int /*mglev*/, MultiFab const& /*rhs*/) const { return 0._rt; } // Only nodal solvers need it - virtual void fixSolvabilityByOffset (int /*amrlev*/, int /*mglev*/, MultiFab& /*rhs*/, Real /*offset*/) const {} // Only nodal solvers need it + virtual void solutionResidual (int /*amrlev*/, MultiFab& /*resid*/, MultiFab& /*x*/, const MultiFab& /*b*/, + const MultiFab* /*crse_bcdata*/=nullptr) {} + virtual void correctionResidual (int /*amrlev*/, int /*mglev*/, MultiFab& /*resid*/, MultiFab& /*x*/, const MultiFab& /*b*/, + BCMode /*bc_mode*/, const MultiFab* /*crse_bcdata*/=nullptr) {} + + virtual void reflux (int /*crse_amrlev*/, + MultiFab& /*res*/, const MultiFab& /*crse_sol*/, const MultiFab& /*crse_rhs*/, + MultiFab& /*fine_res*/, MultiFab& /*fine_sol*/, const MultiFab& /*fine_rhs*/) const {} + virtual void compFlux (int /*amrlev*/, const Array& /*fluxes*/, + MultiFab& /*sol*/, Location /*loc*/) const {} + virtual void compGrad (int /*amrlev*/, const Array& /*grad*/, + MultiFab& /*sol*/, Location /*loc*/) const {} + + virtual void applyMetricTerm (int /*amrlev*/, int /*mglev*/, Any& /*rhs*/) const {} + virtual void unapplyMetricTerm (int /*amrlev*/, int /*mglev*/, MultiFab& /*rhs*/) const {} + + virtual void unimposeNeumannBC (int /*amrlev*/, Any& /*rhs*/) const {} // only nodal solver might need it + virtual void applyInhomogNeumannTerm (int /*amrlev*/, Any& /*rhs*/) const {} + virtual void applyOverset (int /*amlev*/, Any& /*rhs*/) const {} + virtual void scaleRHS (int /*amrlev*/, Any& /*rhs*/) const {} + virtual Vector getSolvabilityOffset (int /*amrlev*/, int /*mglev*/, + Any const& /*rhs*/) const { return {}; } + virtual void fixSolvabilityByOffset (int /*amrlev*/, int /*mglev*/, Any& /*rhs*/, + Vector const& /*offset*/) const {} virtual void prepareForSolve () = 0; - virtual bool isSingular (int amrlev) const = 0; - virtual bool isBottomSingular () const = 0; - virtual Real xdoty (int amrlev, int mglev, const MultiFab& x, const MultiFab& y, bool local) const = 0; + virtual bool isSingular (int /*amrlev*/) const { return false; } + virtual bool isBottomSingular () const { return 
false; } + virtual Real xdoty (int /*amrlev*/, int /*mglev*/, const MultiFab& /*x*/, const MultiFab& /*y*/, bool /*local*/) const { return 0._rt; } - virtual void fixUpResidualMask (int /*amrlev*/, iMultiFab& /*resmsk*/) { } - virtual void nodalSync (int /*amrlev*/, int /*mglev*/, MultiFab& /*mf*/) const {} - - virtual std::unique_ptr makeNLinOp (int grid_size) const = 0; + virtual std::unique_ptr makeNLinOp (int /*grid_size*/) const { return {nullptr}; } virtual void getFluxes (const Vector >& /*a_flux*/, const Vector& /*a_sol*/, @@ -283,6 +283,59 @@ public: virtual void copyNSolveSolution (MultiFab&, MultiFab const&) const {} + virtual Any AnyMake (int amrlev, int mglev, IntVect const& ng) const; + virtual Any AnyMakeCoarseMG (int amrlev, int mglev, IntVect const& ng) const; + virtual Any AnyMakeCoarseAmr (int famrlev, IntVect const& ng) const; + virtual Any AnyMakeAlias (Any const& a) const; + virtual IntVect AnyGrowVect (Any const& a) const; + virtual void AnyCopy (Any& dst, Any const& src, IntVect const& ng) const; + virtual void AnyAdd (Any& dst, Any const& src, IntVect const& ng) const; + virtual void AnySetToZero (Any& a) const; + virtual void AnySetBndryToZero (Any& a) const; +#ifdef AMREX_USE_EB + virtual void AnySetCoveredToZero (Any& a) const; +#endif + virtual void AnyParallelCopy (Any& dst, Any const& src, + IntVect const& src_nghost, IntVect const& dst_nghost, + Periodicity const& period = Periodicity::NonPeriodic()) const; + + virtual Real AnyNormInf (Any& a) const; + + virtual Real AnyNormInfMask (int amrlev, Any const& a, bool local) const = 0; + + virtual void AnySolutionResidual (int amrlev, Any& resid, Any& x, Any const& b, + Any const* crse_bcdata = nullptr); + virtual void AnyCorrectionResidual (int amrlev, int mglev, Any& resid, Any& x, + const Any& b, BCMode bc_mode, + const Any* crse_bcdata=nullptr); + virtual void AnyReflux (int crse_amrlev, + Any& res, const Any& crse_sol, const Any& crse_rhs, + Any& fine_res, Any& fine_sol, const Any& fine_rhs); + + virtual void AnyAvgDownResAmr (int clev, Any& cres, Any const& fres) const = 0; + virtual void AnyAvgDownResMG (int clev, Any& cres, Any const& fres) const; + + virtual void AnySmooth (int amrlev, int mglev, Any& sol, const Any& rhs, + bool skip_fillboundary=false) const; + + virtual void AnyRestriction (int amrlev, int cmglev, Any& crse, Any& fine) const; + + virtual void AnyInterpolationMG (int amrlev, int fmglev, Any& fine, const Any& crse) const; + virtual void AnyInterpAssignMG (int amrlev, int fmglev, Any& fine, Any& crse) const; + virtual void AnyInterpolationAmr (int famrlev, Any& fine, const Any& crse, + IntVect const& /*nghost*/) const = 0; + + virtual void AnyAverageDownSolutionRHS (int camrlev, Any& crse_sol, Any& crse_rhs, + const Any& fine_sol, const Any& fine_rhs); + + virtual void AnyAverageDownAndSync (Vector& sol) const = 0; + + virtual void postSolve (Vector& sol) const; + + Real MFNormInf (MultiFab const& mf, iMultiFab const* fine_mask, bool local) const; + + bool isMFIterSafe (int amrlev, int mglev1, int mglev2) const; + protected: static constexpr int mg_coarsen_ratio = 2; @@ -401,7 +454,7 @@ protected: bool isCellCentered () const noexcept { return m_ixtype == 0; } - virtual void make (Vector >& mf, int nc, IntVect const& ng) const; + void make (Vector >& mf, IntVect const& ng) const; virtual std::unique_ptr > makeFactory (int /*amrlev*/, int /*mglev*/) const { return std::make_unique(); diff --git a/Src/LinearSolvers/MLMG/AMReX_MLLinOp.cpp b/Src/LinearSolvers/MLMG/AMReX_MLLinOp.cpp index 
9c6ccc8ce05..e53ed376d97 100644 --- a/Src/LinearSolvers/MLMG/AMReX_MLLinOp.cpp +++ b/Src/LinearSolvers/MLMG/AMReX_MLLinOp.cpp @@ -4,10 +4,12 @@ #include #include #include +#include #ifdef AMREX_USE_EB #include #include +#include #endif #ifdef AMREX_USE_PETSC @@ -544,7 +546,7 @@ MLLinOp::defineBC () } void -MLLinOp::make (Vector >& mf, int nc, IntVect const& ng) const +MLLinOp::make (Vector >& mf, IntVect const& ng) const { mf.clear(); mf.resize(m_num_amr_levels); @@ -553,8 +555,7 @@ MLLinOp::make (Vector >& mf, int nc, IntVect const& ng) const mf[alev].resize(m_num_mg_levels[alev]); for (int mlev = 0; mlev < m_num_mg_levels[alev]; ++mlev) { - const auto& ba = amrex::convert(m_grids[alev][mlev], m_ixtype); - mf[alev][mlev].define(ba, m_dmap[alev][mlev], nc, ng, MFInfo(), *m_factory[alev][mlev]); + mf[alev][mlev] = AnyMake(alev, mlev, ng); } } } @@ -895,6 +896,279 @@ MLLinOp::resizeMultiGrid (int new_size) } } +Any +MLLinOp::AnyMake (int amrlev, int mglev, IntVect const& ng) const +{ + return Any(MultiFab(amrex::convert(m_grids[amrlev][mglev], m_ixtype), + m_dmap[amrlev][mglev], getNComp(), ng, MFInfo(), + *m_factory[amrlev][mglev])); +} + +Any +MLLinOp::AnyMakeCoarseMG (int amrlev, int mglev, IntVect const& ng) const +{ + BoxArray cba = m_grids[amrlev][mglev]; + IntVect ratio = (amrlev > 0) ? IntVect(2) : mg_coarsen_ratio_vec[mglev]; + cba.coarsen(ratio); + cba.convert(m_ixtype); + return Any(MultiFab(cba, m_dmap[amrlev][mglev], getNComp(), ng)); +} + +Any +MLLinOp::AnyMakeCoarseAmr (int famrlev, IntVect const& ng) const +{ + BoxArray cba = m_grids[famrlev][0]; + IntVect ratio(AMRRefRatio(famrlev-1)); + cba.coarsen(ratio); + cba.convert(m_ixtype); + return Any(MultiFab(cba, m_dmap[famrlev][0], getNComp(), ng)); +} + +Any +MLLinOp::AnyMakeAlias (Any const& a) const +{ + AMREX_ASSERT(a.is()); + MultiFab const& mf = a.get(); + return Any(MultiFab(mf, amrex::make_alias, 0, mf.nComp())); +} + +IntVect +MLLinOp::AnyGrowVect (Any const& a) const +{ + AMREX_ASSERT(a.is()); + MultiFab const& mf = a.get(); + return mf.nGrowVect(); +} + +void +MLLinOp::AnySetToZero (Any& a) const +{ + AMREX_ASSERT(a.is()); + MultiFab& mf = a.get(); + mf.setVal(0._rt); +} + +void +MLLinOp::AnySetBndryToZero (Any& a) const +{ + AMREX_ASSERT(a.is()); + MultiFab& mf = a.get(); + mf.setBndry(0._rt, 0, getNComp()); +} + +#ifdef AMREX_USE_EB +void +MLLinOp::AnySetCoveredToZero (Any& a) const +{ + AMREX_ASSERT(a.is()); + auto& mf = a.get(); + EB_set_covered(mf, 0, getNComp(), 0, 0._rt); +} +#endif + +void +MLLinOp::AnyCopy (Any& dst, Any const& src, IntVect const& ng) const +{ + AMREX_ASSERT(dst.is() && src.is()); + MultiFab& dmf = dst.get(); + MultiFab const& smf = src.get(); + MultiFab::Copy(dmf, smf, 0, 0, getNComp(), ng); +} + +void +MLLinOp::AnyAdd (Any& dst, Any const& src, IntVect const& ng) const +{ + AMREX_ASSERT(dst.is() && src.is()); + MultiFab& dmf = dst.get(); + MultiFab const& smf = src.get(); + MultiFab::Add(dmf, smf, 0, 0, getNComp(), ng); +} + +void +MLLinOp::AnyAverageDownSolutionRHS (int camrlev, Any& a_crse_sol, Any& a_crse_rhs, + const Any& a_fine_sol, const Any& a_fine_rhs) +{ + AMREX_ASSERT(a_crse_sol.is() && + a_crse_rhs.is() && + a_fine_sol.is() && + a_fine_rhs.is()); + auto& crse_sol = a_crse_sol.get(); + auto& crse_rhs = a_crse_rhs.get(); + auto& fine_sol = a_fine_sol.get(); + auto& fine_rhs = a_fine_rhs.get(); + averageDownSolutionRHS(camrlev, crse_sol, crse_rhs, fine_sol, fine_rhs); +} + +void +MLLinOp::AnyParallelCopy (Any& dst, Any const& src, + IntVect const& src_nghost, IntVect const& 
dst_nghost, + Periodicity const& period) const +{ + AMREX_ASSERT(dst.is()); + MultiFab& dmf = dst.get(); + MultiFab const& smf = src.get(); + dmf.ParallelCopy(smf, 0, 0, getNComp(), src_nghost, dst_nghost, period); +} + +Real +MLLinOp::AnyNormInf (Any& a) const +{ + AMREX_ASSERT(a.is()); + return a.get().norminf(); +} + +void +MLLinOp::AnySolutionResidual (int amrlev, Any& resid, Any& x, Any const& b, + Any const* crse_bcdata) +{ + AMREX_ASSERT(x.is()); + solutionResidual(amrlev, resid.get(), x.get(), b.get(), + (crse_bcdata) ? &(crse_bcdata->get()) : nullptr); +} + +void +MLLinOp::AnyCorrectionResidual (int amrlev, int mglev, Any& resid, Any& x, const Any& b, + BCMode bc_mode, const Any* crse_bcdata) +{ + AMREX_ASSERT(x.is()); + correctionResidual(amrlev, mglev, resid.get(), x.get(), + b.get(), bc_mode, + (crse_bcdata) ? &(crse_bcdata->get()) : nullptr); +} + +void +MLLinOp::AnyReflux (int clev, Any& res, const Any& crse_sol, const Any& crse_rhs, + Any& fine_res, Any& fine_sol, const Any& fine_rhs) +{ + AMREX_ASSERT(res.is()); + reflux(clev,res.get(), crse_sol.get(), crse_rhs.get(), + fine_res.get(), fine_sol.get(), fine_rhs.get()); +} + +Real +MLLinOp::MFNormInf (MultiFab const& mf, iMultiFab const* fine_mask, bool local) const +{ + const int ncomp = getNComp(); + Real norm = 0._rt; + + if (fine_mask == nullptr) { +#ifdef AMREX_USE_GPU + if (Gpu::inLaunchRegion()) { + auto const& ma = mf.const_arrays(); + norm = ParReduce(TypeList{}, TypeList{}, + mf, IntVect(0), ncomp, + [=] AMREX_GPU_DEVICE (int box_no, int i, int j, int k, int n) + -> GpuTuple + { + return amrex::Math::abs(ma[box_no](i,j,k,n)); + }); + } else +#endif + { +#ifdef AMREX_USE_OMP +#pragma omp parallel reduction(max:norm) +#endif + for (MFIter mfi(mf,true); mfi.isValid(); ++mfi) { + Box const& bx = mfi.tilebox(); + auto const& fab = mf.const_array(mfi); + AMREX_LOOP_4D(bx, ncomp, i, j, k, n, + { + norm = std::max(norm, amrex::Math::abs(fab(i,j,k,n))); + }); + } + } + } else { +#ifdef AMREX_USE_GPU + if (Gpu::inLaunchRegion()) { + auto const& ma = mf.const_arrays(); + auto const& mask_ma = fine_mask->const_arrays(); + norm = ParReduce(TypeList{}, TypeList{}, + mf, IntVect(0), ncomp, + [=] AMREX_GPU_DEVICE (int box_no, int i, int j, int k, int n) + -> GpuTuple + { + if (mask_ma[box_no](i,j,k)) { + return amrex::Math::abs(ma[box_no](i,j,k,n)); + } else { + return Real(0.0); + } + }); + } else +#endif + { +#ifdef AMREX_USE_OMP +#pragma omp parallel reduction(max:norm) +#endif + for (MFIter mfi(mf,true); mfi.isValid(); ++mfi) { + Box const& bx = mfi.tilebox(); + auto const& fab = mf.const_array(mfi); + auto const& mask = fine_mask->const_array(mfi); + AMREX_LOOP_4D(bx, ncomp, i, j, k, n, + { + if (mask(i,j,k)) { + norm = std::max(norm, amrex::Math::abs(fab(i,j,k,n))); + } + }); + } + } + } + + if (!local) ParallelAllReduce::Max(norm, ParallelContext::CommunicatorSub()); + return norm; +} + +void +MLLinOp::AnyAvgDownResMG (int clev, Any& cres, Any const& fres) const +{ + AMREX_ASSERT(cres.is()); +#ifdef AMREX_USE_EB + amrex::EB_average_down +#else + amrex::average_down +#endif + (fres.get(), cres.get(), 0, getNComp(), + mg_coarsen_ratio_vec[clev-1]); +} + +void +MLLinOp::AnySmooth (int amrlev, int mglev, Any& sol, const Any& rhs, + bool skip_fillboundary) const +{ + AMREX_ASSERT(sol.is() && rhs.is()); + smooth(amrlev, mglev, sol.get(), rhs.get(), skip_fillboundary); +} + +void +MLLinOp::AnyRestriction (int amrlev, int cmglev, Any& crse, Any& fine) const +{ + AMREX_ASSERT(crse.is() && fine.is()); + restriction(amrlev, cmglev, 
crse.get(), fine.get()); +} + +void +MLLinOp::AnyInterpolationMG (int amrlev, int fmglev, Any& fine, const Any& crse) const +{ + AMREX_ASSERT(crse.is() && fine.is()); + interpolation(amrlev, fmglev, fine.get(), crse.get()); +} + +void +MLLinOp::AnyInterpAssignMG (int amrlev, int fmglev, Any& fine, Any& crse) const +{ + AMREX_ASSERT(crse.is() && fine.is()); + interpAssign(amrlev, fmglev, fine.get(), crse.get()); +} + +void +MLLinOp::postSolve (Vector& /* sol */) const {} + +bool +MLLinOp::isMFIterSafe (int amrlev, int mglev1, int mglev2) const +{ + return m_dmap[amrlev][mglev1] == m_dmap[amrlev][mglev2] + && BoxArray::SameRefs(m_grids[amrlev][mglev1], m_grids[amrlev][mglev2]); +} + #ifdef AMREX_USE_PETSC std::unique_ptr MLLinOp::makePETSc () const diff --git a/Src/LinearSolvers/MLMG/AMReX_MLLinOp_temp.H b/Src/LinearSolvers/MLMG/AMReX_MLLinOp_temp.H new file mode 100644 index 00000000000..68d7c836ba5 --- /dev/null +++ b/Src/LinearSolvers/MLMG/AMReX_MLLinOp_temp.H @@ -0,0 +1,486 @@ +#ifndef AMREX_MLLINOP_TEMP_H_ +#define AMREX_MLLINOP_TEMP_H_ + +//! This is a template for writing your own linear operator class for Ax=b. + +#include + +namespace amrex_temp +{ + +class MLLinOpTemp + : public amrex::MLLinOp +{ +public: + + //! In this example, there are 3 edge-based MultiFabs. + using Container = amrex::Array; + + MLLinOpTemp () {} + + virtual ~MLLinOpTemp () {} + + MLLinOpTemp (const MLLinOpTemp&) = delete; + MLLinOpTemp (MLLinOpTemp&&) = delete; + MLLinOpTemp& operator= (const MLLinOpTemp&) = delete; + MLLinOpTemp& operator= (MLLinOpTemp&&) = delete; + + MLLinOpTemp (const amrex::Vector& a_geom, + const amrex::Vector& a_grids, + const amrex::Vector& a_dmap, + const amrex::LPInfo& a_info = amrex::LPInfo(), + const amrex::Vector const*>& a_factory = {}) + { + define(a_geom, a_grids, a_dmap, a_info, a_factory); + } + + void define (const amrex::Vector& a_geom, + const amrex::Vector& a_grids, + const amrex::Vector& a_dmap, + const amrex::LPInfo& a_info = amrex::LPInfo(), + const amrex::Vector const*>& a_factory = {}) + { + amrex::MLLinOp::define(a_geom, a_grids, a_dmap, a_info, a_factory); + } + + /** + * \brief Return the default solver at the bottom of MG cycles. By + * default, MLLinOp uses a BiCGStab solver implemented in + * AMReX::MLCGSolver. However, it only supports a single MultiFab. + * Since our data type is different, we use a smoother instead. In the + * future we can try to generalize MLCGSolver. + */ + virtual amrex::BottomSolver getDefaultBottomSolver () const override { + return amrex::BottomSolver::smoother; + } + + /** + * \brief Make data container (e.g., MultiFabs stored in Any) for the given level. + * + * \param amrlev AMR level. Note that the lowest level is always 0. + * \param mglev MG level. Note that mglev+1 is one level coarser than mglev. + * \param ng number of ghost cells. + */ + virtual amrex::Any AnyMake (int amrlev, int mglev, amrex::IntVect const& ng) const override + { + auto const& ba = m_grids[amrlev][mglev]; + auto const& dm = m_dmap [amrlev][mglev]; + auto const& fc = *m_factory[amrlev][mglev]; + return amrex::Any(Container{amrex::MultiFab(amrex::convert(ba,amrex::IntVect(0,1,1)), + dm, 1, ng, amrex::MFInfo(), fc), + amrex::MultiFab(amrex::convert(ba,amrex::IntVect(1,0,1)), + dm, 1, ng, amrex::MFInfo(), fc), + amrex::MultiFab(amrex::convert(ba,amrex::IntVect(1,1,0)), + dm, 1, ng, amrex::MFInfo(), fc)}); + } + + /** + * \brief Make data container with coarsened BoxArray and + * DistributionMapping of the given MG level. + * + * \param amrlev AMR level.
Note that the lowest level is always 0. + * \param mglev MG level. The coarser level is mglev+1. + * \param ng number of ghost cells. + */ + virtual amrex::Any AnyMakeCoarseMG (int amrlev, int mglev, amrex::IntVect const& ng) const override + { + auto ratio = (amrlev > 0) ? amrex::IntVect(2) : this->mg_coarsen_ratio_vec[mglev]; + auto const& ba = amrex::coarsen(m_grids[amrlev][mglev], ratio); + auto const& dm = m_dmap[amrlev][mglev]; + return amrex::Any(Container{amrex::MultiFab(amrex::convert(ba,amrex::IntVect(0,1,1)), + dm, 1, ng), + amrex::MultiFab(amrex::convert(ba,amrex::IntVect(1,0,1)), + dm, 1, ng), + amrex::MultiFab(amrex::convert(ba,amrex::IntVect(1,1,0)), + dm, 1, ng)}); + } + + /** + * \brief Make data container with coarsened BoxArray and + * DistributionMapping of the given AMR level. + * + * \param famrlev AMR level. The coarser AMR level is famrlev-1. + * \param ng number of ghost cells. + */ + virtual amrex::Any AnyMakeCoarseAmr (int famrlev, amrex::IntVect const& ng) const override + { + amrex::IntVect ratio(this->AMRRefRatio(famrlev-1)); + auto const& ba = amrex::coarsen(m_grids[famrlev][0], ratio); + auto const& dm = m_dmap[famrlev][0]; + return amrex::Any(Container{amrex::MultiFab(amrex::convert(ba,amrex::IntVect(0,1,1)), + dm, 1, ng), + amrex::MultiFab(amrex::convert(ba,amrex::IntVect(1,0,1)), + dm, 1, ng), + amrex::MultiFab(amrex::convert(ba,amrex::IntVect(1,1,0)), + dm, 1, ng)}); + } + + /** + * \brief Make an alias of the given Any without deep copying. + * + * \param a an Any object. + */ + virtual amrex::Any AnyMakeAlias (amrex::Any const& a) const override + { + auto const& rhs = a.get(); + return amrex::Any(Container{amrex::MultiFab(rhs[0], amrex::make_alias, 0, 1), + amrex::MultiFab(rhs[1], amrex::make_alias, 0, 1), + amrex::MultiFab(rhs[2], amrex::make_alias, 0, 1)}); + } + + /** + * \brief Return the number of ghost cells in the given Any. + * + * \param a an Any object. + */ + virtual amrex::IntVect AnyGrowVect (amrex::Any const& a) const override + { + auto const& mfs = a.get(); + return mfs[0].nGrowVect(); + } + + /** + * \brief Copy data from source Any to destination Any. + * + * \param dst destination Any. + * \param src source Any. + * \param ng number of ghost cells included in the operation. + */ + virtual void AnyCopy (amrex::Any& dst, amrex::Any const& src, amrex::IntVect const& ng) const override + { + auto& dmf = dst.get(); + auto const& smf = src.get(); + for (int idim=0; idim < 3; ++idim) { + amrex::MultiFab::Copy(dmf[idim], smf[idim], 0, 0, 1, ng); + } + } + + /** + * \brief Add data from source Any to destination Any. + * + * \param dst destination Any. + * \param src source Any. + * \param ng number of ghost cells included in the operation. + */ + virtual void AnyAdd (amrex::Any& dst, amrex::Any const& src, amrex::IntVect const& ng) const override + { + auto& dmf = dst.get(); + auto const& smf = src.get(); + for (int idim=0; idim < 3; ++idim) { + amrex::MultiFab::Add(dmf[idim], smf[idim], 0, 0, 1, ng); + } + } + + /** + * \brief Set the given Any to zero. + * + * \param a an Any object. + */ + virtual void AnySetToZero (amrex::Any& a) const override + { + auto& mfs = a.get(); + for (int idim=0; idim < 3; ++idim) { + mfs[idim].setVal(amrex::Real(0.0)); + } + } + + /** + * \brief Set boundary (i.e., ghost cells) of the given Any to zero. + * + * \param a an Any object.
+ */ + virtual void AnySetBndryToZero (amrex::Any& a) const override + { + auto& mfs = a.get(); + for (int idim=0; idim < 3; ++idim) { + mfs[idim].setBndry(amrex::Real(0.0), 0, 1); + } + } + +#ifdef AMREX_USE_EB + /** + * \brief Set covered region of the given Any to zero. + * + * \param a an Any object. + */ + virtual void AnySetCoveredToZero (amrex::Any& a) const override + { + auto& mfs = a.get(); + for (int idim=0; idim < 3; ++idim) { + amrex::EB_set_covered(mfs[idim], 0, 1, 0, amrex::Real(0.0)); + } + } +#endif + + /** + * \brief ParallelCopy from source Any to destination Any. + * + * \param dst destination Any. + * \param src source Any. + * \param src_nghost number of ghost cells in the source included in the operation. + * \param dst_nghost number of ghost cells in the destination included in the operation. + * \param period Periodicity. + */ + virtual void AnyParallelCopy (amrex::Any& dst, amrex::Any const& src, + amrex::IntVect const& src_nghost, amrex::IntVect const& dst_nghost, + amrex::Periodicity const& period = amrex::Periodicity::NonPeriodic()) const override + { + auto& dmf = dst.get(); + auto const& smf = src.get(); + for (int idim=0; idim < 3; ++idim) { + dmf[idim].ParallelCopy_nowait(smf[idim], 0, 0, 1, src_nghost, dst_nghost, period); + } + for (int idim=0; idim < 3; ++idim) { + dmf[idim].ParallelCopy_finish(); + } + } + + /** + * \brief Return the infinity norm of the given Any. + * + * \param a an Any object. + */ + virtual amrex::Real AnyNormInf (amrex::Any& a) const override + { + auto& mfs = a.get(); + amrex::Real r = amrex::Real(0.0); + for (int idim=0; idim < 3; ++idim) { + auto tmp = mfs[idim].norminf(0, 0, true); + r = std::max(r, tmp); + } + amrex::ParallelAllReduce::Max(r, amrex::ParallelContext::CommunicatorSub()); + return r; + } + + /** + * \brief Return the infinity norm of the masked region of the given Any. + * + * For a composite solve with multiple AMR levels, the region covered by + * finer AMR levels is not included in the operation. + * + * \param amrlev AMR level. + * \param a an Any object. + * \param local determines if the reduction is local (i.e., no MPI communication) or not. + */ + virtual amrex::Real AnyNormInfMask (int amrlev, amrex::Any const& a, bool local) const override + { + amrex::ignore_unused(amrlev, a, local); + amrex::Abort("TODO: AnyNormInfMask"); + // This is only needed for multi-level composite solve + return amrex::Real(0.0); + } + + /** + * \brief Compute residual of the original form, r = b - Ax. + * + * \param amrlev AMR level + * \param resid residual + * \param x the solution x + * \param b the RHS b + * \param crse_bcdata provides Dirichlet BC at AMR coarse/fine interface. + * It's a nullptr for single level solve. + */ + virtual void AnySolutionResidual (int amrlev, amrex::Any& resid, amrex::Any& x, amrex::Any const& b, + amrex::Any const* crse_bcdata = nullptr) override + { + amrex::ignore_unused(amrlev, resid, x, b, crse_bcdata); + amrex::Abort("TODO: AnySolutionResidual"); + } + + /** + * \brief Compute residual of the residual correction form, r = b - Ax. + * + * \param amrlev AMR level. + * \param resid residual of the residual correction form. + * \param x the correction. + * \param b the RHS for the residual correction form (i.e., the residual of the original form). + * \param bc_mode is either Homogeneous or Inhomogeneous. + * \param crse_bcdata provides inhomogeneous Dirichlet BC at AMR coarse/fine interface. + * It's ignored for homogeneous Dirichlet BC.
+ */ + virtual void AnyCorrectionResidual (int amrlev, int mglev, amrex::Any& resid, amrex::Any& x, + const amrex::Any& b, MLLinOp::BCMode bc_mode, + const amrex::Any* crse_bcdata=nullptr) override + { + amrex::ignore_unused(amrlev, mglev, resid, x, b, bc_mode, crse_bcdata); + amrex::Abort("TODO: AnyCorrectionResidual"); + } + + /** + * \brief Reflux + * + * This modifies the coarse level residual at the coarse/fine interface. + * + * \param crse_amrlev coarse AMR level. + * \param res coarse level residual. + * \param crse_sol coarse level x. + * \param crse_rhs coarse level b. + * \param fine_res fine level residual. This may not be needed depending on the coarse/fine stencil. + * \param fine_sol fine level x. + * \param fine_rhs fine level b. + */ + virtual void AnyReflux (int crse_amrlev, + amrex::Any& res, const amrex::Any& crse_sol, const amrex::Any& crse_rhs, + amrex::Any& fine_res, amrex::Any& fine_sol, const amrex::Any& fine_rhs) override + { + amrex::ignore_unused(crse_amrlev, res, crse_sol, crse_rhs, fine_res, fine_sol, fine_rhs); + amrex::Abort("TODO: AnyReflux"); + // This is only needed for multi-level composite solve + } + + /** + * \brief Average down residual from fine to coarse AMR level. + * + * \param clev coarse AMR level. + * \param cres coarse level residual. + * \param fres fine level residual. + */ + virtual void AnyAvgDownResAmr (int clev, amrex::Any& cres, amrex::Any const& fres) const override + { + amrex::ignore_unused(clev, cres, fres); + amrex::Abort("TODO: AnyAvgDownResAmr"); + // This is only needed for multi-level composite solve. + // And maybe nothing needs to be done here, like in the nodal projection solver. + } + + /** + * \brief Average down residual from fine to coarse MG level. + * + * This is only needed for MG F-cycle, and we don't need to implement this for V-cycle. + * + * \param clev coarse MG level. + * \param cres coarse level residual. + * \param fres fine level residual. + */ + virtual void AnyAvgDownResMG (int clev, amrex::Any& cres, amrex::Any const& fres) const override + { + amrex::ignore_unused(clev, cres, fres); + amrex::Abort("TODO: AnyAvgDownResMG"); // Not needed for V-cycle. + } + + /** + * \brief Smooth the given level. + * + * \param amrlev AMR level. Note that the lowest level is always 0. + * \param mglev MG level. Note that mglev+1 is one level coarser than mglev. + * \param sol x + * \param rhs b + * \param skip_fillboundary a flag for whether we need to fill ghost cells in this function. + */ + virtual void AnySmooth (int amrlev, int mglev, amrex::Any& sol, const amrex::Any& rhs, + bool skip_fillboundary=false) const override + { + amrex::ignore_unused(amrlev, mglev, sol, rhs, skip_fillboundary); + amrex::Abort("TODO: AnySmooth"); + } + + /** + * \brief Restriction from fine to coarse MG level. + * + * \param amrlev AMR level. + * \param cmglev coarse MG level. The fine MG level is cmglev-1. + * \param crse coarse data. + * \param fine fine data. This is not const& because we may need to fill its ghost cells. + */ + virtual void AnyRestriction (int amrlev, int cmglev, amrex::Any& crse, amrex::Any& fine) const override + { + amrex::ignore_unused(amrlev, cmglev, crse, fine); + amrex::Abort("TODO: AnyRestriction"); + } + + /** + * \brief Add interpolated coarse data onto the fine MG level. + * + * Note that it's an ADD operation. + * + * \param amrlev AMR level. + * \param fmglev fine MG level. The coarse MG level is fmglev+1. + * \param fine fine MG level data. + * \param crse coarse MG level data.
+ */ + virtual void AnyInterpolationMG (int amrlev, int fmglev, amrex::Any& fine, const amrex::Any& crse) const override + { + amrex::ignore_unused(amrlev, fmglev, fine, crse); + amrex::Abort("TODO: AnyInterpolationMG"); + } + + /** + * \brief Assign (i.e., copy) interpolated coarse data onto the fine MG level. + * + * Note that it's an ASSIGN operation. This is used in MG F-cycle, and + * does not need to be implemented for V-cycle. + * + * \param amrlev AMR level. + * \param fmglev fine MG level. The coarse MG level is fmglev+1. + * \param fine fine MG level data. + * \param crse coarse MG level data. + */ + virtual void AnyInterpAssignMG (int amrlev, int fmglev, amrex::Any& fine, amrex::Any& crse) const override + { + amrex::ignore_unused(amrlev, fmglev, fine, crse); + amrex::Abort("TODO: AnyInterpAssignMG"); // not needed for V-cycle. + } + + /** + * \brief Interpolate data from coarse to fine AMR level. + * + * \param famrlev fine AMR level. The coarse AMR level is famrlev-1. + * \param fine data on fine AMR level. + * \param crse data on coarse AMR level. + */ + virtual void AnyInterpolationAmr (int famrlev, amrex::Any& fine, const amrex::Any& crse, + amrex::IntVect const& /*nghost*/) const override + { + amrex::ignore_unused(famrlev, fine, crse); + // This is only needed for multi-level composite solve + amrex::Abort("TODO: AnyInterpolationAmr"); + } + + /** + * \brief Average down x and b from fine to coarse AMR level. + * + * This is called before V-cycle to make data on AMR levels consistent. + * + * \param camrlev coarse AMR level. The fine level is camrlev+1. + * \param crse_sol x on coarse level. + * \param crse_rhs b on coarse level. + * \param fine_sol x on fine level. + * \param fine_rhs b on fine level. + */ + virtual void AnyAverageDownSolutionRHS (int camrlev, amrex::Any& crse_sol, amrex::Any& crse_rhs, + const amrex::Any& fine_sol, const amrex::Any& fine_rhs) override + { + amrex::ignore_unused(camrlev, crse_sol, crse_rhs, fine_sol, fine_rhs); + // This is only needed for multi-level composite solve + amrex::Abort("AnyAverageDownSolutionRHS"); + } + + /** + * \brief Average down and synchronize AMR data. + * + * Synchronize the data on each level. That is, the nodal data in the + * same MultiFab needs to be synchronized. This function also needs to + * average down the data from fine to coarse AMR levels. + * + * \param sol data on all AMR levels. + */ + virtual void AnyAverageDownAndSync (amrex::Vector& sol) const override + { + amrex::ignore_unused(sol); + // Even for single level, we should synchronize the data on level 0. + amrex::Abort("TODO: AnyAverageDownAndSync"); + } + + /** + * \brief Prepare the solver for the MG cycle. + */ + virtual void prepareForSolve () override + { + amrex::Abort("TODO: prepareForSolve"); + } +}; + +} + + +#endif diff --git a/Src/LinearSolvers/MLMG/AMReX_MLMG.H b/Src/LinearSolvers/MLMG/AMReX_MLMG.H index 32980d74c45..e884f877fbc 100644 --- a/Src/LinearSolvers/MLMG/AMReX_MLMG.H +++ b/Src/LinearSolvers/MLMG/AMReX_MLMG.H @@ -36,6 +36,10 @@ public: Real solve (const Vector& a_sol, const Vector& a_rhs, Real a_tol_rel, Real a_tol_abs, const char* checkpoint_file = nullptr); + // For this version of solve, Any holds MultiFab-like objects.
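The overload declared just below accepts Vector<Any> directly. As a rough caller-side sketch — names such as nlevs, ncomp, mlmg, sol_mf, rhs_mf, reltol, and abstol are assumed to exist in the caller, and the aliasing mirrors how the MultiFab overload forwards in AMReX_MLMG.cpp later in this patch:

// Hypothetical usage sketch: wrap existing MultiFabs as non-owning
// aliases inside Any, then call the Any-based solve.
amrex::Vector<amrex::Any> any_sol(nlevs);
amrex::Vector<amrex::Any> any_rhs(nlevs);
for (int lev = 0; lev < nlevs; ++lev) {
    any_sol[lev] = amrex::MultiFab(*sol_mf[lev], amrex::make_alias, 0, ncomp);
    any_rhs[lev] = amrex::MultiFab(*rhs_mf[lev], amrex::make_alias, 0, ncomp);
}
amrex::Real resid = mlmg.solve(any_sol, any_rhs, reltol, abstol);

An operator with a custom container (like the MLLinOpTemp template above) would instead fill the Anys with whatever its AnyMake produces.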
+ Real solve (Vector& a_sol, const Vector& a_rhs, + Real a_tol_rel, Real a_tol_abs, const char* checkpoint_file = nullptr); + void getGradSolution (const Vector >& a_grad_sol, Location a_loc = Location::FaceCenter); @@ -121,7 +125,7 @@ public: void setHypreStrongThreshold (Real t) noexcept {hypre_strong_threshold = t;} #endif - void prepareForSolve (const Vector& a_sol, const Vector& a_rhs); + void prepareForSolve (Vector& a_sol, const Vector& a_rhs); void prepareForNSolve (); @@ -151,19 +155,16 @@ public: Real MLRhsNormInf (bool local = false); void buildFineMask (); - void averageDownAndSync (); - - void computeVolInv (); void makeSolvable (); - void makeSolvable (int amrlev, int mglev, MultiFab& mf); + void makeSolvable (int amrlev, int mglev, Any& mf); #if defined(AMREX_USE_HYPRE) && (AMREX_SPACEDIM > 1) - void bottomSolveWithHypre (MultiFab& x, const MultiFab& b); + void bottomSolveWithHypre (Any& x, const Any& b); #endif - void bottomSolveWithPETSc (MultiFab& x, const MultiFab& b); + void bottomSolveWithPETSc (Any& x, const Any& b); - int bottomSolveWithCG (MultiFab& x, const MultiFab& b, MLCGSolver::Type type); + int bottomSolveWithCG (Any& x, const Any& b, MLCGSolver::Type type); Real getInitRHS () const noexcept { return m_rhsnorm0; } // Initial composite residual @@ -242,26 +243,21 @@ private: * \brief To avoid confusion, terms like sol, cor, rhs, res, ... etc. are * in the frame of the original equation, not the correction form */ - Vector > sol_raii; - Vector sol; //!< alias to argument a_sol - Vector rhs; //!< Copy of original rhs - //! L(sol) = rhs + Vector sol; //!< Might be alias to argument a_sol + Vector rhs; //!< Copy of original rhs + //! L(sol) = rhs + + Vector sol_is_alias; /** + * \brief First Vector: AMR levels. 0 is the coarsest level * Second Vector: MG levels. 0 is the finest level */ - Vector > res; //! = rhs - L(sol) - Vector > > cor; //!< L(cor) = res - Vector > > cor_hold; - Vector > rescor; //!< = res - L(cor) - //! Residual of the correction form - - Vector > fine_mask; - - Vector > volinv; //!< used by makeSolvable - - Vector > scratch; + Vector > res; //! = rhs - L(sol) + Vector > cor; //!< L(cor) = res + Vector > cor_hold; + Vector > rescor; //!< = res - L(cor) + //!
Residual of the correction form enum timer_types { solve_time=0, iter_time, bottom_time, ntimers }; Vector timer; diff --git a/Src/LinearSolvers/MLMG/AMReX_MLMG.cpp b/Src/LinearSolvers/MLMG/AMReX_MLMG.cpp index 2bdb9222b4b..28c833397b4 100644 --- a/Src/LinearSolvers/MLMG/AMReX_MLMG.cpp +++ b/Src/LinearSolvers/MLMG/AMReX_MLMG.cpp @@ -2,7 +2,6 @@ #include #include #include -#include #include #ifdef AMREX_USE_PETSC @@ -51,25 +50,52 @@ MLMG::~MLMG () Real MLMG::solve (const Vector& a_sol, const Vector& a_rhs, Real a_tol_rel, Real a_tol_abs, const char* checkpoint_file) +{ + Vector any_sol(namrlevs); + Vector any_rhs(namrlevs); + for (int lev = 0; lev < namrlevs; ++lev) { + any_sol[lev] = MultiFab(*a_sol[lev], amrex::make_alias, 0, a_sol[lev]->nComp()); + any_rhs[lev] = MultiFab(*a_rhs[lev], amrex::make_alias, 0, a_rhs[lev]->nComp()); + } + return solve(any_sol, any_rhs, a_tol_rel, a_tol_abs, checkpoint_file); +} + +Real +MLMG::solve (Vector& a_sol, const Vector& a_rhs, + Real a_tol_rel, Real a_tol_abs, const char* checkpoint_file) { BL_PROFILE("MLMG::solve()"); if (checkpoint_file != nullptr) { - checkPoint(a_sol, a_rhs, a_tol_rel, a_tol_abs, checkpoint_file); + if (a_sol[0].is()) { + Vector mf_sol(namrlevs); + Vector mf_rhs(namrlevs); + for (int lev = 0; lev < namrlevs; ++lev) { + mf_sol[lev] = &(a_sol[lev].get()); + mf_rhs[lev] = &(a_rhs[lev].get()); + } + checkPoint(mf_sol, mf_rhs, a_tol_rel, a_tol_abs, checkpoint_file); + } else { + amrex::Abort("MLMG::solve: checkpoint not supported for non-MultiFab type"); + } } if (bottom_solver == BottomSolver::Default) { bottom_solver = linop.getDefaultBottomSolver(); } +#if defined(AMREX_USE_HYPRE) || defined(AMREX_USE_PETSC) if (bottom_solver == BottomSolver::hypre || bottom_solver == BottomSolver::petsc) { + AMREX_ALWAYS_ASSERT_WITH_MESSAGE(a_sol[0].is(), + "Non-MultiFab type not supported for hypre and petsc"); int mo = linop.getMaxOrder(); - if (a_sol[0]->hasEBFabFactory()) { + if (a_sol[0].get().hasEBFabFactory()) { linop.setMaxOrder(2); } else { linop.setMaxOrder(std::min(3,mo)); // maxorder = 4 not supported } } +#endif bool is_nsolve = linop.m_parent; @@ -84,8 +110,6 @@ MLMG::solve (const Vector& a_sol, const Vector& a_rh computeMLResidual(finest_amr_lev); - int ncomp = linop.getNComp(); - bool local = true; Real resnorm0 = MLResNormInf(finest_amr_lev, local); Real rhsnorm0 = MLRhsNormInf(local); @@ -194,15 +218,16 @@ MLMG::solve (const Vector& a_sol, const Vector& a_rh timer[iter_time] = amrex::second() - iter_start_time; } + linop.postSolve(sol); + IntVect ng_back = final_fill_bc ? 
IntVect(1) : IntVect(0); if (linop.hasHiddenDimension()) { ng_back[linop.hiddenDirection()] = 0; } for (int alev = 0; alev < namrlevs; ++alev) { - if (a_sol[alev] != sol[alev]) - { - MultiFab::Copy(*a_sol[alev], *sol[alev], 0, 0, ncomp, ng_back); + if (!sol_is_alias[alev]) { + linop.AnyCopy(a_sol[alev], sol[alev], ng_back); } } @@ -229,16 +254,13 @@ void MLMG::oneIter (int iter) { BL_PROFILE("MLMG::oneIter()"); - int ncomp = linop.getNComp(); - int nghost = 0; - if (cf_strategy == CFStrategy::ghostnodes) nghost = linop.getNGrow(); - for (int alev = finest_amr_lev; alev > 0; --alev) { - if (cf_strategy == CFStrategy::ghostnodes) nghost = linop.getNGrow(alev); miniCycle(alev); - MultiFab::Add(*sol[alev], *cor[alev][0], 0, 0, ncomp, nghost); + IntVect nghost(0); + if (cf_strategy == CFStrategy::ghostnodes) nghost = IntVect(linop.getNGrow(alev)); + linop.AnyAdd(sol[alev], cor[alev][0], nghost); // compute residual for the coarse AMR level computeResWithCrseSolFineCor(alev-1,alev); @@ -250,7 +272,6 @@ void MLMG::oneIter (int iter) // coarsest amr level { - if (cf_strategy == CFStrategy::ghostnodes) nghost = linop.getNGrow(0); // enforce solvability if appropriate if (linop.isSingular(0) && linop.getEnforceSingularSolvable()) { @@ -258,24 +279,27 @@ void MLMG::oneIter (int iter) } if (iter < max_fmg_iters) { - mgFcycle (); + mgFcycle(); } else { - mgVcycle (0, 0); + mgVcycle(0, 0); } - MultiFab::Add(*sol[0], *cor[0][0], 0, 0, ncomp, nghost); + IntVect nghost(0); + if (cf_strategy == CFStrategy::ghostnodes) nghost = IntVect(linop.getNGrow(0)); + linop.AnyAdd(sol[0], cor[0][0], nghost); } for (int alev = 1; alev <= finest_amr_lev; ++alev) { - if (cf_strategy == CFStrategy::ghostnodes) nghost = linop.getNGrow(alev); // (Fine AMR correction) = I(Coarse AMR correction) interpCorrection(alev); - MultiFab::Add(*sol[alev], *cor[alev][0], 0, 0, ncomp, nghost); + IntVect nghost(0); + if (cf_strategy == CFStrategy::ghostnodes) nghost = IntVect(linop.getNGrow(alev)); + linop.AnyAdd(sol[alev], cor[alev][0], nghost); if (alev != finest_amr_lev) { - MultiFab::Add(*cor_hold[alev][0], *cor[alev][0], 0, 0, ncomp, nghost); + linop.AnyAdd(cor_hold[alev][0], cor[alev][0], nghost); } // Update fine AMR level correction @@ -283,14 +307,14 @@ void MLMG::oneIter (int iter) miniCycle(alev); - MultiFab::Add(*sol[alev], *cor[alev][0], 0, 0, ncomp, nghost); + linop.AnyAdd(sol[alev], cor[alev][0], nghost); if (alev != finest_amr_lev) { - MultiFab::Add(*cor[alev][0], *cor_hold[alev][0], 0, 0, ncomp, nghost); + linop.AnyAdd(cor[alev][0], cor_hold[alev][0], nghost); } } - averageDownAndSync(); + linop.AnyAverageDownAndSync(sol); } // Compute multi-level Residual (res) up to amrlevmax. @@ -301,11 +325,11 @@ MLMG::computeMLResidual (int amrlevmax) const int mglev = 0; for (int alev = amrlevmax; alev >= 0; --alev) { - const MultiFab* crse_bcdata = (alev > 0) ? sol[alev-1] : nullptr; - linop.solutionResidual(alev, res[alev][mglev], *sol[alev], rhs[alev], crse_bcdata); + const Any* crse_bcdata = (alev > 0) ? 
&(sol[alev-1]) : nullptr; + linop.AnySolutionResidual(alev, res[alev][mglev], sol[alev], rhs[alev], crse_bcdata); if (alev < finest_amr_lev) { - linop.reflux(alev, res[alev][mglev], *sol[alev], rhs[alev], - res[alev+1][mglev], *sol[alev+1], rhs[alev+1]); + linop.AnyReflux(alev, res[alev][mglev], sol[alev], rhs[alev], + res[alev+1][mglev], sol[alev+1], rhs[alev+1]); } } } @@ -315,16 +339,8 @@ void MLMG::computeResidual (int alev) { BL_PROFILE("MLMG::computeResidual()"); - - MultiFab& x = *sol[alev]; - const MultiFab& b = rhs[alev]; - MultiFab& r = res[alev][0]; - - const MultiFab* crse_bcdata = nullptr; - if (alev > 0) { - crse_bcdata = sol[alev-1]; - } - linop.solutionResidual(alev, r, x, b, crse_bcdata); + const Any* crse_bcdata = (alev > 0) ? &(sol[alev-1]) : nullptr; + linop.AnySolutionResidual(alev, res[alev][0], sol[alev], rhs[alev], crse_bcdata); } // Compute coarse AMR level composite residual with coarse solution and fine correction @@ -333,39 +349,28 @@ MLMG::computeResWithCrseSolFineCor (int calev, int falev) { BL_PROFILE("MLMG::computeResWithCrseSolFineCor()"); - int ncomp = linop.getNComp(); - int nghost = 0; - if (cf_strategy == CFStrategy::ghostnodes) nghost = std::min(linop.getNGrow(falev),linop.getNGrow(calev)); + IntVect nghost(0); + if (cf_strategy == CFStrategy::ghostnodes) nghost = IntVect(std::min(linop.getNGrow(falev),linop.getNGrow(calev))); - MultiFab& crse_sol = *sol[calev]; - const MultiFab& crse_rhs = rhs[calev]; - MultiFab& crse_res = res[calev][0]; + Any& crse_sol = sol[calev]; + const Any& crse_rhs = rhs[calev]; + Any& crse_res = res[calev][0]; - MultiFab& fine_sol = *sol[falev]; - const MultiFab& fine_rhs = rhs[falev]; - MultiFab& fine_cor = *cor[falev][0]; - MultiFab& fine_res = res[falev][0]; - MultiFab& fine_rescor = rescor[falev][0]; + Any& fine_sol = sol[falev]; + const Any& fine_rhs = rhs[falev]; + Any& fine_cor = cor[falev][0]; + Any& fine_res = res[falev][0]; + Any& fine_rescor = rescor[falev][0]; - const MultiFab* crse_bcdata = nullptr; - if (calev > 0) { - crse_bcdata = sol[calev-1]; - } - linop.solutionResidual(calev, crse_res, crse_sol, crse_rhs, crse_bcdata); + const Any* crse_bcdata = (calev > 0) ? &(sol[calev-1]) : nullptr; + linop.AnySolutionResidual(calev, crse_res, crse_sol, crse_rhs, crse_bcdata); - linop.correctionResidual(falev, 0, fine_rescor, fine_cor, fine_res, BCMode::Homogeneous); - MultiFab::Copy(fine_res, fine_rescor, 0, 0, ncomp, nghost); + linop.AnyCorrectionResidual(falev, 0, fine_rescor, fine_cor, fine_res, BCMode::Homogeneous); + linop.AnyCopy(fine_res, fine_rescor, nghost); - linop.reflux(calev, crse_res, crse_sol, crse_rhs, fine_res, fine_sol, fine_rhs); + linop.AnyReflux(calev, crse_res, crse_sol, crse_rhs, fine_res, fine_sol, fine_rhs); - if (linop.isCellCentered()) { - const int amrrr = linop.AMRRefRatio(calev); -#ifdef AMREX_USE_EB - amrex::EB_average_down(fine_res, crse_res, 0, ncomp, amrrr); -#else - amrex::average_down(fine_res, crse_res, 0, ncomp, amrrr); -#endif - } + linop.AnyAvgDownResAmr(calev, crse_res, fine_res); } // Compute fine AMR level residual fine_res = fine_res - L(fine_cor) with coarse providing BC. 
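A short note on the bookkeeping these hunks preserve: the solver tracks two residual forms per level, and this patch changes only their storage type from MultiFab to Any, not the algebra. A compact summary, as pseudocode in comments matching the member docs in AMReX_MLMG.H above:

// res    = rhs - L(sol)   -> AnySolutionResidual, plus AnyReflux at c/f interfaces
// rescor = res - L(cor)   -> AnyCorrectionResidual with homogeneous BC
// After each mini-cycle: sol += cor (AnyAdd), then res is refreshed from
// rescor, which is what computeResWithCrseCorFineCor below does on the
// fine level.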
@@ -374,20 +379,19 @@ MLMG::computeResWithCrseCorFineCor (int falev) { BL_PROFILE("MLMG::computeResWithCrseCorFineCor()"); - int ncomp = linop.getNComp(); - int nghost = 0; - if (cf_strategy == CFStrategy::ghostnodes) nghost = linop.getNGrow(falev); + IntVect nghost(0); + if (cf_strategy == CFStrategy::ghostnodes) nghost = IntVect(linop.getNGrow(falev)); - const MultiFab& crse_cor = *cor[falev-1][0]; + const Any& crse_cor = cor[falev-1][0]; - MultiFab& fine_cor = *cor[falev][0]; - MultiFab& fine_res = res[falev][0]; - MultiFab& fine_rescor = rescor[falev][0]; + Any& fine_cor = cor [falev][0]; + Any& fine_res = res [falev][0]; + Any& fine_rescor = rescor[falev][0]; // fine_rescor = fine_res - L(fine_cor) - linop.correctionResidual(falev, 0, fine_rescor, fine_cor, fine_res, - BCMode::Inhomogeneous, &crse_cor); - MultiFab::Copy(fine_res, fine_rescor, 0, 0, ncomp, nghost); + linop.AnyCorrectionResidual(falev, 0, fine_rescor, fine_cor, fine_res, + BCMode::Inhomogeneous, &crse_cor); + linop.AnyCopy(fine_res, fine_rescor, nghost); } void @@ -413,16 +417,16 @@ MLMG::mgVcycle (int amrlev, int mglev_top) if (verbose >= 4) { - Real norm = res[amrlev][mglev].norm0(); + Real norm = linop.AnyNormInf(res[amrlev][mglev]); amrex::Print() << "AT LEVEL " << amrlev << " " << mglev << " DN: Norm before smooth " << norm << "\n"; } - cor[amrlev][mglev]->setVal(0.0); + linop.AnySetToZero(cor[amrlev][mglev]); bool skip_fillboundary = true; for (int i = 0; i < nu1; ++i) { - linop.smooth(amrlev, mglev, *cor[amrlev][mglev], res[amrlev][mglev], - skip_fillboundary); + linop.AnySmooth(amrlev, mglev, cor[amrlev][mglev], res[amrlev][mglev], + skip_fillboundary); skip_fillboundary = false; } @@ -431,14 +435,13 @@ MLMG::mgVcycle (int amrlev, int mglev_top) if (verbose >= 4) { - Real norm = rescor[amrlev][mglev].norm0(); + Real norm = linop.AnyNormInf(rescor[amrlev][mglev]); amrex::Print() << "AT LEVEL " << amrlev << " " << mglev << " DN: Norm after smooth " << norm << "\n"; } // res_crse = R(rescor_fine); this provides res/b to the level below - linop.restriction(amrlev, mglev+1, res[amrlev][mglev+1], rescor[amrlev][mglev]); - + linop.AnyRestriction(amrlev, mglev+1, res[amrlev][mglev+1], rescor[amrlev][mglev]); } BL_PROFILE_VAR("MLMG::mgVcycle_bottom", blp_bottom); @@ -446,7 +449,7 @@ MLMG::mgVcycle (int amrlev, int mglev_top) { if (verbose >= 4) { - Real norm = res[amrlev][mglev_bottom].norm0(); + Real norm = linop.AnyNormInf(res[amrlev][mglev_bottom]); amrex::Print() << "AT LEVEL " << amrlev << " " << mglev_bottom << " DN: Norm before bottom " << norm << "\n"; } @@ -454,7 +457,7 @@ MLMG::mgVcycle (int amrlev, int mglev_top) if (verbose >= 4) { computeResOfCorrection(amrlev, mglev_bottom); - Real norm = rescor[amrlev][mglev_bottom].norm0(); + Real norm = linop.AnyNormInf(rescor[amrlev][mglev_bottom]); amrex::Print() << "AT LEVEL " << amrlev << " " << mglev_bottom << " UP: Norm after bottom " << norm << "\n"; @@ -464,21 +467,21 @@ MLMG::mgVcycle (int amrlev, int mglev_top) { if (verbose >= 4) { - Real norm = res[amrlev][mglev_bottom].norm0(); + Real norm = linop.AnyNormInf(res[amrlev][mglev_bottom]); amrex::Print() << "AT LEVEL " << amrlev << " " << mglev_bottom << " Norm before smooth " << norm << "\n"; } - cor[amrlev][mglev_bottom]->setVal(0.0); + linop.AnySetToZero(cor[amrlev][mglev_bottom]); bool skip_fillboundary = true; for (int i = 0; i < nu1; ++i) { - linop.smooth(amrlev, mglev_bottom, *cor[amrlev][mglev_bottom], res[amrlev][mglev_bottom], - skip_fillboundary); + linop.AnySmooth(amrlev, mglev_bottom, 
cor[amrlev][mglev_bottom], + res[amrlev][mglev_bottom], skip_fillboundary); skip_fillboundary = false; } if (verbose >= 4) { computeResOfCorrection(amrlev, mglev_bottom); - Real norm = rescor[amrlev][mglev_bottom].norm0(); + Real norm = linop.AnyNormInf(rescor[amrlev][mglev_bottom]); amrex::Print() << "AT LEVEL " << amrlev << " " << mglev_bottom << " Norm after smooth " << norm << "\n"; } @@ -493,12 +496,12 @@ MLMG::mgVcycle (int amrlev, int mglev_top) if (verbose >= 4) { computeResOfCorrection(amrlev, mglev); - Real norm = rescor[amrlev][mglev].norm0(); + Real norm = linop.AnyNormInf(rescor[amrlev][mglev]); amrex::Print() << "AT LEVEL " << amrlev << " " << mglev << " UP: Norm before smooth " << norm << "\n"; } for (int i = 0; i < nu2; ++i) { - linop.smooth(amrlev, mglev, *cor[amrlev][mglev], res[amrlev][mglev]); + linop.AnySmooth(amrlev, mglev, cor[amrlev][mglev], res[amrlev][mglev]); } if (cf_strategy == CFStrategy::ghostnodes) computeResOfCorrection(amrlev, mglev); @@ -506,7 +509,7 @@ MLMG::mgVcycle (int amrlev, int mglev_top) if (verbose >= 4) { computeResOfCorrection(amrlev, mglev); - Real norm = rescor[amrlev][mglev].norm0(); + Real norm = linop.AnyNormInf(rescor[amrlev][mglev]); amrex::Print() << "AT LEVEL " << amrlev << " " << mglev << " UP: Norm after smooth " << norm << "\n"; } @@ -521,21 +524,18 @@ MLMG::mgFcycle () { BL_PROFILE("MLMG::mgFcycle()"); +#ifdef AMREX_USE_EB + AMREX_ASSERT(linop.isCellCentered()); +#endif + const int amrlev = 0; const int mg_bottom_lev = linop.NMGLevels(amrlev) - 1; - const int ncomp = linop.getNComp(); - int nghost = 0; - if (cf_strategy == CFStrategy::ghostnodes) nghost = linop.getNGrow(amrlev); + IntVect nghost(0); + if (cf_strategy == CFStrategy::ghostnodes) nghost = IntVect(linop.getNGrow(amrlev)); for (int mglev = 1; mglev <= mg_bottom_lev; ++mglev) { -#ifdef AMREX_USE_EB - amrex::EB_average_down(res[amrlev][mglev-1], res[amrlev][mglev], 0, ncomp, - linop.mg_coarsen_ratio_vec[mglev-1]); -#else - amrex::average_down(res[amrlev][mglev-1], res[amrlev][mglev], 0, ncomp, - linop.mg_coarsen_ratio_vec[mglev-1]); -#endif + linop.AnyAvgDownResMG(mglev, res[amrlev][mglev], res[amrlev][mglev-1]); } bottomSolve(); @@ -543,17 +543,17 @@ MLMG::mgFcycle () for (int mglev = mg_bottom_lev-1; mglev >= 0; --mglev) { // cor_fine = I(cor_crse) - interpCorrection (amrlev, mglev); + interpCorrection(amrlev, mglev); // rescor = res - L(cor) computeResOfCorrection(amrlev, mglev); // res = rescor; this provides b to the vcycle below - MultiFab::Copy(res[amrlev][mglev], rescor[amrlev][mglev], 0,0,ncomp,nghost); + linop.AnyCopy(res[amrlev][mglev], rescor[amrlev][mglev], nghost); // save cor; do v-cycle; add the saved to cor std::swap(cor[amrlev][mglev], cor_hold[amrlev][mglev]); mgVcycle(amrlev, mglev); - MultiFab::Add(*cor[amrlev][mglev], *cor_hold[amrlev][mglev], 0, 0, ncomp, nghost); + linop.AnyAdd(cor[amrlev][mglev], cor_hold[amrlev][mglev], nghost); } } @@ -563,17 +563,11 @@ MLMG::interpCorrection (int alev) { BL_PROFILE("MLMG::interpCorrection_1"); - const int ncomp = linop.getNComp(); - int nghost = 0; - if (cf_strategy == CFStrategy::ghostnodes) nghost = linop.getNGrow(alev); - - const MultiFab& crse_cor = *cor[alev-1][0]; - MultiFab& fine_cor = *cor[alev][0]; + IntVect nghost(0); + if (cf_strategy == CFStrategy::ghostnodes) nghost = IntVect(linop.getNGrow(alev)); - BoxArray ba = fine_cor.boxArray(); - const int amrrr = linop.AMRRefRatio(alev-1); - IntVect refratio{amrrr}; - ba.coarsen(refratio); + Any const& crse_cor = cor[alev-1][0]; + Any & fine_cor = cor[alev 
][0]; const Geometry& crse_geom = linop.Geom(alev-1,0); @@ -584,121 +578,12 @@ MLMG::interpCorrection (int alev) ng_src = linop.getNGrow(alev-1); ng_dst = linop.getNGrow(alev-1); } - MultiFab cfine(ba, fine_cor.DistributionMap(), ncomp, ng_dst); - cfine.setVal(0.0); - cfine.ParallelCopy(crse_cor, 0, 0, ncomp, ng_src, ng_dst, crse_geom.periodicity()); - - bool isEB = fine_cor.hasEBFabFactory(); - ignore_unused(isEB); -#ifdef AMREX_USE_EB - auto factory = dynamic_cast(&(fine_cor.Factory())); - const FabArray* flags = (factory) ? &(factory->getMultiEBCellFlagFab()) : nullptr; -#endif - - if (linop.isCellCentered()) - { - MFItInfo mfi_info; - if (Gpu::notInLaunchRegion()) mfi_info.EnableTiling().SetDynamic(true); -#ifdef AMREX_USE_OMP -#pragma omp parallel if (Gpu::notInLaunchRegion()) -#endif - for (MFIter mfi(fine_cor, mfi_info); mfi.isValid(); ++mfi) - { - const Box& bx = mfi.tilebox(); - Array4 const& ff = fine_cor.array(mfi); - Array4 const& cc = cfine.const_array(mfi); -#ifdef AMREX_USE_EB - bool call_lincc; - if (isEB) - { - const auto& flag = (*flags)[mfi]; - if (flag.getType(amrex::grow(bx,1)) == FabType::regular) { - call_lincc = true; - } else { - Array4 const& flg = flag.const_array(); - switch(refratio[0]) { - case 2: - { - AMREX_LAUNCH_HOST_DEVICE_LAMBDA (bx, tbx, - { - mlmg_eb_cc_interp_r<2>(tbx, ff, cc, flg, ncomp); - }); - break; - } - case 4: - { - AMREX_LAUNCH_HOST_DEVICE_LAMBDA (bx, tbx, - { - mlmg_eb_cc_interp_r<4>(tbx, ff, cc, flg, ncomp); - }); - break; - } - default: - amrex::Abort("mlmg_eb_cc_interp: only refratio 2 and 4 are supported"); - } + Any cfine = linop.AnyMakeCoarseAmr(alev, IntVect(ng_dst)); + linop.AnySetToZero(cfine); + linop.AnyParallelCopy(cfine, crse_cor, IntVect(ng_src), IntVect(ng_dst), crse_geom.periodicity()); - call_lincc = false; - } - } - else - { - call_lincc = true; - } -#else - const bool call_lincc = true; -#endif - if (call_lincc) - { - switch(refratio[0]) { - case 2: - { - AMREX_LAUNCH_HOST_DEVICE_LAMBDA (bx, tbx, - { - mlmg_lin_cc_interp_r2(tbx, ff, cc, ncomp); - }); - break; - } - case 4: - { - AMREX_LAUNCH_HOST_DEVICE_LAMBDA (bx, tbx, - { - mlmg_lin_cc_interp_r4(tbx, ff, cc, ncomp); - }); - break; - } - default: - amrex::Abort("mlmg_lin_cc_interp: only refratio 2 and 4 are supported"); - } - } - } - } - else - { - AMREX_ALWAYS_ASSERT(amrrr == 2 || amrrr == 4); -#ifdef AMREX_USE_OMP -#pragma omp parallel if (Gpu::notInLaunchRegion()) -#endif - for (MFIter mfi(fine_cor, TilingIfNotGPU()); mfi.isValid(); ++mfi) - { - Box fbx = mfi.tilebox(); - if (cf_strategy == CFStrategy::ghostnodes && nghost >1) fbx.grow(nghost); - Array4 const& ffab = fine_cor.array(mfi); - Array4 const& cfab = cfine.const_array(mfi); - - if (amrrr == 2) { - AMREX_HOST_DEVICE_FOR_4D ( fbx, ncomp, i, j, k, n, - { - mlmg_lin_nd_interp_r2(i,j,k,n,ffab,cfab); - }); - } else { - AMREX_HOST_DEVICE_FOR_4D ( fbx, ncomp, i, j, k, n, - { - mlmg_lin_nd_interp_r4(i,j,k,n,ffab,cfab); - }); - } - } - } + linop.AnyInterpolationAmr(alev, fine_cor, cfine, nghost); } // Interpolate correction between MG levels @@ -709,119 +594,9 @@ MLMG::interpCorrection (int alev, int mglev) { BL_PROFILE("MLMG::interpCorrection_2"); - MultiFab& crse_cor = *cor[alev][mglev+1]; - MultiFab& fine_cor = *cor[alev][mglev ]; - - const int ncomp = linop.getNComp(); - int nghost = 0; - if (cf_strategy == CFStrategy::ghostnodes) nghost = linop.getNGrow(alev); - - const Geometry& crse_geom = linop.Geom(alev,mglev+1); - const IntVect refratio = (alev > 0) ? 
IntVect(2) : linop.mg_coarsen_ratio_vec[mglev]; - - MultiFab cfine; - const MultiFab* cmf; - - if (amrex::isMFIterSafe(crse_cor, fine_cor)) - { - crse_cor.FillBoundary(crse_geom.periodicity()); - cmf = &crse_cor; - } - else - { - BoxArray cba = fine_cor.boxArray(); - cba.coarsen(refratio); - IntVect ng = linop.isCellCentered() ? crse_cor.nGrowVect() : IntVect(0); - if (cf_strategy == CFStrategy::ghostnodes) ng = IntVect(nghost); - cfine.define(cba, fine_cor.DistributionMap(), ncomp, ng); - cfine.setVal(0.0); - cfine.ParallelCopy(crse_cor, 0, 0, ncomp, IntVect(0), ng, crse_geom.periodicity()); - cmf = & cfine; - } - - bool isEB = fine_cor.hasEBFabFactory(); - ignore_unused(isEB); - -#ifdef AMREX_USE_EB - auto factory = dynamic_cast(&(fine_cor.Factory())); - const FabArray* flags = (factory) ? &(factory->getMultiEBCellFlagFab()) : nullptr; -#endif - - if (linop.isCellCentered()) - { - MFItInfo mfi_info; - if (Gpu::notInLaunchRegion()) mfi_info.EnableTiling().SetDynamic(true); -#ifdef AMREX_USE_OMP -#pragma omp parallel if (Gpu::notInLaunchRegion()) -#endif - for (MFIter mfi(fine_cor, mfi_info); mfi.isValid(); ++mfi) - { - const Box& bx = mfi.tilebox(); - const auto& ff = fine_cor.array(mfi); - const auto& cc = cmf->array(mfi); -#ifdef AMREX_USE_EB - bool call_lincc; - if (isEB) - { - const auto& flag = (*flags)[mfi]; - if (flag.getType(amrex::grow(bx,1)) == FabType::regular) { - call_lincc = true; - } else { - Array4 const& flg = flag.const_array(); - AMREX_LAUNCH_HOST_DEVICE_LAMBDA (bx, tbx, - { - mlmg_eb_cc_interp_r<2>(tbx, ff, cc, flg, ncomp); - }); - - call_lincc = false; - } - } - else - { - call_lincc = true; - } -#else - const bool call_lincc = true; -#endif - if (call_lincc) - { -#if (AMREX_SPACEDIM == 3) - if (linop.hasHiddenDimension()) { - Box const& bx_2d = linop.compactify(bx); - auto const& ff_2d = linop.compactify(ff); - auto const& cc_2d = linop.compactify(cc); - AMREX_LAUNCH_HOST_DEVICE_LAMBDA (bx_2d, tbx, - { - TwoD::mlmg_lin_cc_interp_r2(tbx, ff_2d, cc_2d, ncomp); - }); - } else -#endif - { - AMREX_LAUNCH_HOST_DEVICE_LAMBDA (bx, tbx, - { - mlmg_lin_cc_interp_r2(tbx, ff, cc, ncomp); - }); - } - } - } - } - else - { -#ifdef AMREX_USE_OMP -#pragma omp parallel if (Gpu::notInLaunchRegion()) -#endif - for (MFIter mfi(fine_cor, TilingIfNotGPU()); mfi.isValid(); ++mfi) - { - const Box& fbx = mfi.tilebox(); - Array4 const& ffab = fine_cor.array(mfi); - Array4 const& cfab = cmf->const_array(mfi); - - AMREX_HOST_DEVICE_FOR_4D ( fbx, ncomp, i, j, k, n, - { - mlmg_lin_nd_interp_r2(i,j,k,n,ffab,cfab); - }); - } - } + Any& crse_cor = cor[alev][mglev+1]; + Any& fine_cor = cor[alev][mglev ]; + linop.AnyInterpAssignMG(alev, mglev, fine_cor, crse_cor); } // (Fine MG level correction) += I(Coarse MG level correction) @@ -830,31 +605,24 @@ MLMG::addInterpCorrection (int alev, int mglev) { BL_PROFILE("MLMG::addInterpCorrection()"); - const int ncomp = linop.getNComp(); - - const MultiFab& crse_cor = *cor[alev][mglev+1]; - MultiFab& fine_cor = *cor[alev][mglev ]; + const Any& crse_cor = cor[alev][mglev+1]; + Any& fine_cor = cor[alev][mglev ]; - MultiFab cfine; - const MultiFab* cmf; + Any cfine; + const Any* cany; - if (amrex::isMFIterSafe(crse_cor, fine_cor)) + if (linop.isMFIterSafe(alev, mglev, mglev+1)) { - cmf = &crse_cor; + cany = &crse_cor; } else { - BoxArray cba = fine_cor.boxArray(); - IntVect ratio = (alev > 0) ? 
IntVect(2) : linop.mg_coarsen_ratio_vec[mglev]; - - cba.coarsen(ratio); - const int ng = 0; - cfine.define(cba, fine_cor.DistributionMap(), ncomp, ng); - cfine.ParallelCopy(crse_cor); - cmf = &cfine; + cfine = linop.AnyMakeCoarseMG(alev, mglev, IntVect(0)); + linop.AnyParallelCopy(cfine,crse_cor,IntVect(0),IntVect(0)); + cany = &cfine; } - linop.interpolation(alev, mglev, fine_cor, *cmf); + linop.AnyInterpolationMG(alev, mglev, fine_cor, *cany); } // Compute rescor = res - L(cor) @@ -865,10 +633,10 @@ void MLMG::computeResOfCorrection (int amrlev, int mglev) { BL_PROFILE("MLMG:computeResOfCorrection()"); - MultiFab& x = *cor[amrlev][mglev]; - const MultiFab& b = res[amrlev][mglev]; - MultiFab& r = rescor[amrlev][mglev]; - linop.correctionResidual(amrlev, mglev, r, x, b, BCMode::Homogeneous); + Any & x = cor[amrlev][mglev]; + const Any& b = res[amrlev][mglev]; + Any & r = rescor[amrlev][mglev]; + linop.AnyCorrectionResidual(amrlev, mglev, r, x, b, BCMode::Homogeneous); } // At the true bottom of the coarsest AMR level. @@ -894,7 +662,7 @@ MLMG::NSolve (MLMG& a_solver, MultiFab& a_sol, MultiFab& a_rhs) a_sol.setVal(0.0); - MultiFab const& res_bottom = res[0].back(); + MultiFab const& res_bottom = res[0].back().get(); if (BoxArray::SameRefs(a_rhs.boxArray(),res_bottom.boxArray()) && DistributionMapping::SameRefs(a_rhs.DistributionMap(),res_bottom.DistributionMap())) { @@ -906,7 +674,7 @@ MLMG::NSolve (MLMG& a_solver, MultiFab& a_sol, MultiFab& a_rhs) a_solver.solve({&a_sol}, {&a_rhs}, Real(-1.0), Real(-1.0)); - linop.copyNSolveSolution(*cor[0].back(), a_sol); + linop.copyNSolveSolution(cor[0].back().get(), a_sol); } void @@ -914,8 +682,6 @@ MLMG::actualBottomSolve () { BL_PROFILE("MLMG::actualBottomSolve()"); - const int ncomp = linop.getNComp(); - if (!linop.isBottomActive()) return; auto bottom_start_time = amrex::second(); @@ -924,28 +690,28 @@ MLMG::actualBottomSolve () const int amrlev = 0; const int mglev = linop.NMGLevels(amrlev) - 1; - MultiFab& x = *cor[amrlev][mglev]; - MultiFab& b = res[amrlev][mglev]; + auto& x = cor[amrlev][mglev]; + auto& b = res[amrlev][mglev]; - x.setVal(0.0); + linop.AnySetToZero(x); if (bottom_solver == BottomSolver::smoother) { bool skip_fillboundary = true; for (int i = 0; i < nuf; ++i) { - linop.smooth(amrlev, mglev, x, b, skip_fillboundary); + linop.AnySmooth(amrlev, mglev, x, b, skip_fillboundary); skip_fillboundary = false; } } else { - MultiFab* bottom_b = &b; - MultiFab raii_b; + Any* bottom_b = &b; + Any raii_b; if (linop.isBottomSingular() && linop.getEnforceSingularSolvable()) { - raii_b.define(b.boxArray(), b.DistributionMap(), ncomp, b.nGrowVect(), - MFInfo(), *linop.Factory(amrlev,mglev)); - MultiFab::Copy(raii_b,b,0,0,ncomp,b.nGrowVect()); + const IntVect ng = linop.AnyGrowVect(b); + raii_b = linop.AnyMake(amrlev, mglev, ng); + linop.AnyCopy(raii_b, b, ng); bottom_b = &raii_b; makeSolvable(amrlev,mglev,*bottom_b); @@ -973,7 +739,7 @@ MLMG::actualBottomSolve () int ret = bottomSolveWithCG(x, *bottom_b, cg_type); // If the MLMG solve failed then set the correction to zero if (ret != 0) { - cor[amrlev][mglev]->setVal(0.0); + linop.AnySetToZero(cor[amrlev][mglev]); if (bottom_solver == BottomSolver::cgbicg || bottom_solver == BottomSolver::bicgcg) { if (bottom_solver == BottomSolver::cgbicg) { @@ -983,7 +749,7 @@ MLMG::actualBottomSolve () } ret = bottomSolveWithCG(x, *bottom_b, cg_type); if (ret != 0) { - cor[amrlev][mglev]->setVal(0.0); + linop.AnySetToZero(cor[amrlev][mglev]); } else { // switch permanently if (cg_type == MLCGSolver::Type::CG) {
bottom_solver = BottomSolver::cg; @@ -995,7 +761,7 @@ MLMG::actualBottomSolve () } const int n = (ret==0) ? nub : nuf; for (int i = 0; i < n; ++i) { - linop.smooth(amrlev, mglev, x, b); + linop.AnySmooth(amrlev, mglev, x, b); } } } @@ -1006,7 +772,7 @@ MLMG::actualBottomSolve () } int -MLMG::bottomSolveWithCG (MultiFab& x, const MultiFab& b, MLCGSolver::Type type) +MLMG::bottomSolveWithCG (Any& x, const Any& b, MLCGSolver::Type type) { MLCGSolver cg_solver(this, linop); cg_solver.setSolver(type); @@ -1027,37 +793,7 @@ Real MLMG::ResNormInf (int alev, bool local) { BL_PROFILE("MLMG::ResNormInf()"); - const int ncomp = linop.getNComp(); - const int mglev = 0; - Real norm = 0.0; - MultiFab* pmf = &(res[alev][mglev]); -#ifdef AMREX_USE_EB - if (linop.isCellCentered() && scratch[alev]) { - pmf = scratch[alev].get(); - MultiFab::Copy(*pmf, res[alev][mglev], 0, 0, ncomp, 0); - auto factory = dynamic_cast(linop.Factory(alev)); - if (factory) { - const MultiFab& vfrac = factory->getVolFrac(); - for (int n=0; n < ncomp; ++n) { - MultiFab::Multiply(*pmf, vfrac, 0, n, 1, 0); - } - } else { - amrex::Abort("MLMG::ResNormInf: not EB Factory"); - } - } -#endif - for (int n = 0; n < ncomp; n++) - { - Real newnorm = 0.0; - if (fine_mask[alev]) { - newnorm = pmf->norm0(*fine_mask[alev],n,0,true); - } else { - newnorm = pmf->norm0(n,0,true); - } - norm = std::max(norm, newnorm); - } - if (!local) ParallelAllReduce::Max(norm, ParallelContext::CommunicatorSub()); - return norm; + return linop.AnyNormInfMask(alev, res[alev][0], local); } // Computes multi-level masked inf-norm of Residual (res). @@ -1079,66 +815,17 @@ Real MLMG::MLRhsNormInf (bool local) { BL_PROFILE("MLMG::MLRhsNormInf()"); - const int ncomp = linop.getNComp(); - Real r = 0.0; - for (int alev = 0; alev <= finest_amr_lev; ++alev) - { - MultiFab* pmf = &(rhs[alev]); -#ifdef AMREX_USE_EB - if (linop.isCellCentered() && scratch[alev]) { - pmf = scratch[alev].get(); - MultiFab::Copy(*pmf, rhs[alev], 0, 0, ncomp, 0); - auto factory = dynamic_cast(linop.Factory(alev)); - if (factory) { - const MultiFab& vfrac = factory->getVolFrac(); - for (int n=0; n < ncomp; ++n) { - MultiFab::Multiply(*pmf, vfrac, 0, n, 1, 0); - } - } else { - amrex::Abort("MLMG::MLRhsNormInf: not EB Factory"); - } - } -#endif - for (int n=0; nnorm0(*fine_mask[alev],n,0,true)); - } else { - r = std::max(r, pmf->norm0(n,0,true)); - } - } + Real r = 0.0_rt; + for (int alev = 0; alev <= finest_amr_lev; ++alev) { + auto t = linop.AnyNormInfMask(alev, rhs[alev], true); + r = std::max(r, t); } if (!local) ParallelAllReduce::Max(r, ParallelContext::CommunicatorSub()); return r; } void -MLMG::buildFineMask () -{ - BL_PROFILE("MLMG::buildFineMask()"); - - if (!fine_mask.empty()) return; - - fine_mask.clear(); - fine_mask.resize(namrlevs); - - const auto& amrrr = linop.AMRRefRatio(); - for (int alev = 0; alev < finest_amr_lev; ++alev) - { - fine_mask[alev] = std::make_unique - (makeFineMask(rhs[alev], rhs[alev+1], IntVect(0), IntVect(amrrr[alev]), - Periodicity::NonPeriodic(), 1, 0)); - } - - if (!linop.isCellCentered()) { - for (int alev = 0; alev < finest_amr_lev; ++alev) { - linop.fixUpResidualMask(alev, *fine_mask[alev]); - } - } -} - -void -MLMG::prepareForSolve (const Vector& a_sol, const Vector& a_rhs) +MLMG::prepareForSolve (Vector& a_sol, const Vector& a_rhs) { BL_PROFILE("MLMG::prepareForSolve()"); @@ -1147,7 +834,6 @@ MLMG::prepareForSolve (const Vector& a_sol, const Vector& a_sol, const VectornGrowVect() == ng_sol) + else if (linop.AnyGrowVect(a_sol[alev]) == ng_sol) { - 
sol[alev] = a_sol[alev]; - sol[alev]->setBndry(0.0); + sol[alev] = linop.AnyMakeAlias(a_sol[alev]); + linop.AnySetBndryToZero(sol[alev]); + sol_is_alias[alev] = true; } else { if (!solve_called) { - sol_raii[alev] = std::make_unique(a_sol[alev]->boxArray(), - a_sol[alev]->DistributionMap(), - ncomp, ng_sol, MFInfo(), - *linop.Factory(alev)); + sol[alev] = linop.AnyMake(alev, 0, ng_sol); } - MultiFab::Copy(*sol_raii[alev], *a_sol[alev], 0, 0, ncomp, 0); - sol_raii[alev]->setBndry(0.0); - sol[alev] = sol_raii[alev].get(); + linop.AnyCopy(sol[alev], a_sol[alev], IntVect(0)); + linop.AnySetBndryToZero(sol[alev]); + sol_is_alias[alev] = false; } } @@ -1202,10 +887,9 @@ MLMG::prepareForSolve (const Vector& a_sol, const VectorboxArray(), a_rhs[alev]->DistributionMap(), ncomp, ng_rhs, - MFInfo(), *linop.Factory(alev)); + rhs[alev] = linop.AnyMake(alev, 0, ng_rhs); } - MultiFab::Copy(rhs[alev], *a_rhs[alev], 0, 0, ncomp, ng_rhs); + linop.AnyCopy(rhs[alev], a_rhs[alev], ng_rhs); linop.applyMetricTerm(alev, 0, rhs[alev]); linop.unimposeNeumannBC(alev, rhs[alev]); linop.applyInhomogNeumannTerm(alev, rhs[alev]); @@ -1215,38 +899,37 @@ MLMG::prepareForSolve (const Vector& a_sol, const Vector(linop.Factory(alev)); if (factory) { - Vector val(ncomp, 0.0); - amrex::EB_set_covered(rhs[alev], 0, ncomp, val); - amrex::EB_set_covered(*sol[alev], 0, ncomp, val); + linop.AnySetCoveredToZero(rhs[alev]); + linop.AnySetCoveredToZero(sol[alev]); } #endif } for (int falev = finest_amr_lev; falev > 0; --falev) { - linop.averageDownSolutionRHS(falev-1, *sol[falev-1], rhs[falev-1], *sol[falev], rhs[falev]); + linop.AnyAverageDownSolutionRHS(falev-1, sol[falev-1], rhs[falev-1], + sol[falev], rhs[falev]); } // enforce solvability if appropriate if (linop.isSingular(0) && linop.getEnforceSingularSolvable()) { - computeVolInv(); makeSolvable(); } IntVect ng = linop.isCellCentered() ? 
IntVect(0) : IntVect(1); if (cf_strategy == CFStrategy::ghostnodes) ng = ng_rhs; if (!solve_called) { - linop.make(res, ncomp, ng); - linop.make(rescor, ncomp, ng); + linop.make(res, ng); + linop.make(rescor, ng); } for (int alev = 0; alev <= finest_amr_lev; ++alev) { const int nmglevs = linop.NMGLevels(alev); for (int mglev = 0; mglev < nmglevs; ++mglev) { - res[alev][mglev].setVal(0.0); - rescor[alev][mglev].setVal(0.0); + linop.AnySetToZero(res [alev][mglev]); + linop.AnySetToZero(rescor[alev][mglev]); } } @@ -1261,12 +944,9 @@ MLMG::prepareForSolve (const Vector& a_sol, const Vector(res[alev][mglev].boxArray(), - res[alev][mglev].DistributionMap(), - ncomp, _ng, MFInfo(), - *linop.Factory(alev,mglev)); + cor[alev][mglev] = linop.AnyMake(alev, mglev, _ng); } - cor[alev][mglev]->setVal(0.0); + linop.AnySetToZero(cor[alev][mglev]); } } @@ -1280,12 +960,9 @@ MLMG::prepareForSolve (const Vector& a_sol, const Vector(cor[alev][mglev]->boxArray(), - cor[alev][mglev]->DistributionMap(), - ncomp, _ng, MFInfo(), - *linop.Factory(alev,mglev)); + cor_hold[alev][mglev] = linop.AnyMake(alev, mglev, _ng); } - cor_hold[alev][mglev]->setVal(0.0); + linop.AnySetToZero(cor_hold[alev][mglev]); } } for (int alev = 1; alev < finest_amr_lev; ++alev) @@ -1294,31 +971,9 @@ MLMG::prepareForSolve (const Vector& a_sol, const Vector(cor[alev][0]->boxArray(), - cor[alev][0]->DistributionMap(), - ncomp, _ng, MFInfo(), - *linop.Factory(alev,0)); - } - cor_hold[alev][0]->setVal(0.0); - } - - buildFineMask(); - - if (!solve_called) - { - scratch.resize(namrlevs); -#ifdef AMREX_USE_EB - if (linop.isCellCentered()) { - for (int alev=0; alev < namrlevs; ++alev) { - if (rhs[alev].hasEBFabFactory()) { - scratch[alev] = std::make_unique(rhs[alev].boxArray(), - rhs[alev].DistributionMap(), - ncomp, 0, MFInfo(), - *linop.Factory(alev)); - } - } + cor_hold[alev][0] = linop.AnyMake(alev, 0, _ng); } -#endif + linop.AnySetToZero(cor_hold[alev][0]); } if (linop.m_parent) { @@ -1379,7 +1034,7 @@ MLMG::getGradSolution (const Vector >& a_grad_so { BL_PROFILE("MLMG::getGradSolution()"); for (int alev = 0; alev <= finest_amr_lev; ++alev) { - linop.compGrad(alev, a_grad_sol[alev], *sol[alev], a_loc); + linop.compGrad(alev, a_grad_sol[alev], sol[alev].get(), a_loc); } } @@ -1392,7 +1047,11 @@ MLMG::getFluxes (const Vector >& a_flux, } AMREX_ASSERT(sol.size() == a_flux.size()); - getFluxes(a_flux, sol, a_loc); + Vector solmf; + for (auto & s : sol) { + solmf.push_back(&(s.get())); + } + getFluxes(a_flux, solmf, a_loc); } void @@ -1413,7 +1072,11 @@ void MLMG::getFluxes (const Vector & a_flux, Location a_loc) { AMREX_ASSERT(sol.size() == a_flux.size()); - getFluxes(a_flux, sol, a_loc); + Vector solmf; + for (auto & s : sol) { + solmf.push_back(&(s.get())); + } + getFluxes(a_flux, solmf, a_loc); } void @@ -1459,7 +1122,11 @@ MLMG::getEBFluxes (const Vector& a_eb_flux) } AMREX_ASSERT(sol.size() == a_eb_flux.size()); - getEBFluxes(a_eb_flux, sol); + Vector solmf; + for (auto & s : sol) { + solmf.push_back(&(s.get())); + } + getEBFluxes(a_eb_flux, solmf); } void @@ -1486,28 +1153,21 @@ MLMG::compResidual (const Vector& a_res, const Vector& a_s if (linop.hasHiddenDimension()) ng_sol[linop.hiddenDirection()] = 0; sol.resize(namrlevs); - sol_raii.resize(namrlevs); + sol_is_alias.resize(namrlevs,true); for (int alev = 0; alev < namrlevs; ++alev) { - if (cf_strategy == CFStrategy::ghostnodes) + if (cf_strategy == CFStrategy::ghostnodes || a_sol[alev]->nGrowVect() == ng_sol) { - sol[alev] = a_sol[alev]; - } - else if (a_sol[alev]->nGrowVect() == 
ng_sol) - { - sol[alev] = a_sol[alev]; + sol[alev] = linop.AnyMakeAlias(*a_sol[alev]); + sol_is_alias[alev] = true; } else { - if (sol_raii[alev] == nullptr) + if (sol_is_alias[alev]) { - sol_raii[alev] = std::make_unique(a_sol[alev]->boxArray(), - a_sol[alev]->DistributionMap(), - ncomp, ng_sol, MFInfo(), - *linop.Factory(alev)); + sol[alev] = linop.AnyMake(alev, 0, ng_sol); } - MultiFab::Copy(*sol_raii[alev], *a_sol[alev], 0, 0, ncomp, 0); - sol[alev] = sol_raii[alev].get(); + MultiFab::Copy(sol[alev].get(), *a_sol[alev], 0, 0, ncomp, 0); } } @@ -1521,22 +1181,23 @@ MLMG::compResidual (const Vector& a_res, const Vector& a_s const auto& amrrr = linop.AMRRefRatio(); for (int alev = finest_amr_lev; alev >= 0; --alev) { - const MultiFab* crse_bcdata = (alev > 0) ? sol[alev-1] : nullptr; + const MultiFab* crse_bcdata = (alev > 0) ? &(sol[alev-1].get()) : nullptr; const MultiFab* prhs = a_rhs[alev]; #if (AMREX_SPACEDIM != 3) int nghost = (cf_strategy == CFStrategy::ghostnodes) ? linop.getNGrow(alev) : 0; - MultiFab rhstmp(prhs->boxArray(), prhs->DistributionMap(), ncomp, nghost, - MFInfo(), *linop.Factory(alev)); + Any rhstmp_a(MultiFab(prhs->boxArray(), prhs->DistributionMap(), ncomp, nghost, + MFInfo(), *linop.Factory(alev))); + MultiFab& rhstmp = rhstmp_a.get(); MultiFab::Copy(rhstmp, *prhs, 0, 0, ncomp, nghost); - linop.applyMetricTerm(alev, 0, rhstmp); - linop.unimposeNeumannBC(alev, rhstmp); - linop.applyInhomogNeumannTerm(alev, rhstmp); + linop.applyMetricTerm(alev, 0, rhstmp_a); + linop.unimposeNeumannBC(alev, rhstmp_a); + linop.applyInhomogNeumannTerm(alev, rhstmp_a); prhs = &rhstmp; #endif - linop.solutionResidual(alev, *a_res[alev], *sol[alev], *prhs, crse_bcdata); + linop.solutionResidual(alev, *a_res[alev], sol[alev].get(), *prhs, crse_bcdata); if (alev < finest_amr_lev) { - linop.reflux(alev, *a_res[alev], *sol[alev], *prhs, - *a_res[alev+1], *sol[alev+1], *a_rhs[alev+1]); + linop.reflux(alev, *a_res[alev], sol[alev].get(), *prhs, + *a_res[alev+1], sol[alev+1].get(), *a_rhs[alev+1]); if (linop.isCellCentered()) { #ifdef AMREX_USE_EB amrex::EB_average_down(*a_res[alev+1], *a_res[alev], 0, ncomp, amrrr[alev]); @@ -1604,7 +1265,8 @@ MLMG::apply (const Vector& out, const Vector& a_in) } for (int alev = 0; alev < namrlevs; ++alev) { - linop.applyInhomogNeumannTerm(alev, rh[alev]); + Any a(MultiFab(rh[alev], amrex::make_alias, 0, rh[alev].nComp())); + linop.applyInhomogNeumannTerm(alev, a); } const auto& amrrr = linop.AMRRefRatio(); @@ -1637,215 +1299,45 @@ MLMG::apply (const Vector& out, const Vector& a_in) } } -void -MLMG::averageDownAndSync () -{ - const auto& amrrr = linop.AMRRefRatio(); - - int ncomp = linop.getNComp(); - int nghost = 0; - if (cf_strategy == CFStrategy::ghostnodes) nghost = linop.getNGrow(); - - if (linop.isCellCentered()) - { - for (int falev = finest_amr_lev; falev > 0; --falev) - { -#ifdef AMREX_USE_EB - amrex::EB_average_down(*sol[falev], *sol[falev-1], 0, ncomp, amrrr[falev-1]); -#else - amrex::average_down(*sol[falev], *sol[falev-1], 0, ncomp, amrrr[falev-1]); -#endif - } - } - else - { - linop.nodalSync(finest_amr_lev, 0, *sol[finest_amr_lev]); - - for (int falev = finest_amr_lev; falev > 0; --falev) - { - const auto& fmf = *sol[falev]; - auto& cmf = *sol[falev-1]; - - MultiFab tmpmf(amrex::coarsen(fmf.boxArray(), amrrr[falev-1]), fmf.DistributionMap(), ncomp, nghost); - amrex::average_down(fmf, tmpmf, 0, ncomp, amrrr[falev-1]); - cmf.ParallelCopy(tmpmf, 0, 0, ncomp); - linop.nodalSync(falev-1, 0, cmf); - } - } -} - -void -MLMG::computeVolInv () -{ - if 
(solve_called) return; - - if (linop.isCellCentered()) - { - volinv.resize(namrlevs); - for (int amrlev = 0; amrlev < namrlevs; ++amrlev) { - volinv[amrlev].resize(linop.NMGLevels(amrlev)); - } - - // We don't need to compute for every level - - auto f = [&] (int amrlev, int mglev) { -#ifdef AMREX_USE_EB - auto factory = dynamic_cast(linop.Factory(amrlev,mglev)); - if (factory) - { - const MultiFab& vfrac = factory->getVolFrac(); - volinv[amrlev][mglev] = vfrac.sum(0,true); - } - else -#endif - { - volinv[amrlev][mglev] - = Real(1.0 / linop.compactify(linop.Geom(amrlev,mglev).Domain()).d_numPts()); - } - }; - - // amrlev = 0, mglev = 0 - f(0,0); - - int mgbottom = linop.NMGLevels(0)-1; - f(0,mgbottom); - -#ifdef AMREX_USE_EB - Real temp1, temp2; - if (rhs[0].hasEBFabFactory()) - { - ParallelAllReduce::Sum({volinv[0][0], volinv[0][mgbottom]}, - ParallelContext::CommunicatorSub()); - temp1 = Real(1.0)/volinv[0][0]; - temp2 = Real(1.0)/volinv[0][mgbottom]; - } - else - { - temp1 = volinv[0][0]; - temp2 = volinv[0][mgbottom]; - } - volinv[0][0] = temp1; - volinv[0][mgbottom] = temp2; -#endif - } -} - void MLMG::makeSolvable () { - const int ncomp = linop.getNComp(); - - if (linop.isCellCentered()) - { - Vector offset(ncomp); -#ifdef AMREX_USE_EB - auto factory = dynamic_cast(linop.Factory(0)); - if (factory) - { - const MultiFab& vfrac = factory->getVolFrac(); - for (int c = 0; c < ncomp; ++c) { - offset[c] = MultiFab::Dot(rhs[0], c, vfrac, 0, 1, 0, true) * volinv[0][0]; - } - } - else -#endif - { - for (int c = 0; c < ncomp; ++c) { - offset[c] = rhs[0].sum(c,true) * volinv[0][0]; - } - } - ParallelAllReduce::Sum(offset.data(), ncomp, ParallelContext::CommunicatorSub()); - if (verbose >= 4) { - for (int c = 0; c < ncomp; ++c) { - amrex::Print() << "MLMG: Subtracting " << offset[c] - << " from rhs component " << c << "\n"; - } - } - for (int alev = 0; alev < namrlevs; ++alev) { - for (int c = 0; c < ncomp; ++c) { - rhs[alev].plus(-offset[c], c, 1); - } -#ifdef AMREX_USE_EB - if (rhs[alev].hasEBFabFactory()) { - Vector val(ncomp, 0.0); - amrex::EB_set_covered(rhs[alev], 0, ncomp, val); - } -#endif + auto const& offset = linop.getSolvabilityOffset(0, 0, rhs[0]); + if (verbose >= 4) { + const int ncomp = offset.size(); + for (int c = 0; c < ncomp; ++c) { + amrex::Print() << "MLMG: Subtracting " << offset[c] << " from rhs component " + << c << "\n"; } } - else - { - AMREX_ASSERT_WITH_MESSAGE(ncomp==1, "ncomp > 1 not supported for singular nodal problem"); - Real offset = linop.getSolvabilityOffset(0, 0, rhs[0]); - if (verbose >= 4) { - amrex::Print() << "MLMG: Subtracting " << offset << " from rhs\n"; - } - for (int alev = 0; alev < namrlevs; ++alev) { - linop.fixSolvabilityByOffset(alev, 0, rhs[alev], offset); - } + for (int alev = 0; alev < namrlevs; ++alev) { + linop.fixSolvabilityByOffset(alev, 0, rhs[alev], offset); } } void -MLMG::makeSolvable (int amrlev, int mglev, MultiFab& mf) +MLMG::makeSolvable (int amrlev, int mglev, Any& mf) { - const int ncomp = linop.getNComp(); - - if (linop.isCellCentered()) - { - Vector offset(ncomp); -#ifdef AMREX_USE_EB - auto factory = dynamic_cast(linop.Factory(amrlev,mglev)); - if (factory) - { - const MultiFab& vfrac = factory->getVolFrac(); - for (int c = 0; c < ncomp; ++c) { - offset[c] = MultiFab::Dot(mf, c, vfrac, 0, 1, 0, true) * volinv[amrlev][mglev]; - } - } - else -#endif - { - for (int c = 0; c < ncomp; ++c) { - offset[c] = mf.sum(c,true) * volinv[amrlev][mglev]; - } - } - - ParallelAllReduce::Sum(offset.data(), ncomp, 
ParallelContext::CommunicatorSub()); - - if (verbose >= 4) { - for (int c = 0; c < ncomp; ++c) { - amrex::Print() << "MLMG: Subtracting " << offset[c] - << " from mf component c = " << c << "\n"; - } - } - + auto const& offset = linop.getSolvabilityOffset(amrlev, mglev, mf); + if (verbose >= 4) { + const int ncomp = offset.size(); for (int c = 0; c < ncomp; ++c) { - mf.plus(-offset[c], c, 1); + amrex::Print() << "MLMG: Subtracting " << offset[c] + << " from mf component c = " << c + << " on level (" << amrlev << ", " << mglev << ")\n"; } -#ifdef AMREX_USE_EB - if (mf.hasEBFabFactory()) { - Vector val(ncomp, 0.0); - amrex::EB_set_covered(mf, 0, ncomp, val); - } -#endif - } - else - { - AMREX_ASSERT_WITH_MESSAGE(ncomp==1, "ncomp > 1 not supported for singular nodal problem"); - Real offset = linop.getSolvabilityOffset(amrlev, mglev, mf); - if (verbose >= 4) { - amrex::Print() << "MLMG: Subtracting " << offset << " on level (" << amrlev << ", " - << mglev << ")\n"; - } - linop.fixSolvabilityByOffset(amrlev, mglev, mf, offset); } + linop.fixSolvabilityByOffset(amrlev, mglev, mf, offset); } #if defined(AMREX_USE_HYPRE) && (AMREX_SPACEDIM > 1) void -MLMG::bottomSolveWithHypre (MultiFab& x, const MultiFab& b) +MLMG::bottomSolveWithHypre (Any& a_x, const Any& a_b) { + AMREX_ASSERT(a_x.is()); + MultiFab& x = a_x.get(); + MultiFab const& b = a_b.get(); + const int amrlev = 0; const int mglev = linop.NMGLevels(amrlev) - 1; @@ -1905,18 +1397,21 @@ MLMG::bottomSolveWithHypre (MultiFab& x, const MultiFab& b) // For precision reasons we enforce that the average of the correction from hypre is 0 if (linop.isSingular(amrlev) && linop.getEnforceSingularSolvable()) { - makeSolvable(amrlev, mglev, x); + makeSolvable(amrlev, mglev, a_x); } } #endif void -MLMG::bottomSolveWithPETSc (MultiFab& x, const MultiFab& b) +MLMG::bottomSolveWithPETSc (Any& a_x, const Any& a_b) { #if !defined(AMREX_USE_PETSC) - amrex::ignore_unused(x,b); + amrex::ignore_unused(a_x,a_b); amrex::Abort("bottomSolveWithPETSc is called without building with PETSc"); #else + AMREX_ASSERT(a_x.is()); + MultiFab& x = a_x.get(); + MultiFab const& b = a_b.get(); const int ncomp = linop.getNComp(); AMREX_ALWAYS_ASSERT_WITH_MESSAGE(ncomp == 1, "bottomSolveWithPETSc doesn't work with ncomp > 1"); diff --git a/Src/LinearSolvers/MLMG/AMReX_MLNodeLaplacian.H b/Src/LinearSolvers/MLMG/AMReX_MLNodeLaplacian.H index affe4c73eaf..50f20e22915 100644 --- a/Src/LinearSolvers/MLMG/AMReX_MLNodeLaplacian.H +++ b/Src/LinearSolvers/MLMG/AMReX_MLNodeLaplacian.H @@ -116,9 +116,11 @@ public : } virtual void getFluxes (const Vector& a_flux, const Vector& a_sol) const final override; - virtual void unimposeNeumannBC (int amrlev, MultiFab& rhs) const final override; - virtual Real getSolvabilityOffset (int amrlev, int mglev, MultiFab const& rhs) const override; - virtual void fixSolvabilityByOffset (int amrlev, int mglev, MultiFab& rhs, Real offset) const override; + virtual void unimposeNeumannBC (int amrlev, Any& rhs) const final override; + virtual Vector getSolvabilityOffset (int amrlev, int mglev, + Any const& rhs) const override; + virtual void fixSolvabilityByOffset (int amrlev, int mglev, Any& rhs, + Vector const& offset) const override; virtual void compGrad (int /*amrlev*/, const Array& /*grad*/, MultiFab& /*sol*/, Location /*loc*/) const final override { diff --git a/Src/LinearSolvers/MLMG/AMReX_MLNodeLaplacian.cpp b/Src/LinearSolvers/MLMG/AMReX_MLNodeLaplacian.cpp index 79358b58898..c0efaed25d6 100644 --- a/Src/LinearSolvers/MLMG/AMReX_MLNodeLaplacian.cpp +++ 
b/Src/LinearSolvers/MLMG/AMReX_MLNodeLaplacian.cpp @@ -150,13 +150,16 @@ MLNodeLaplacian::resizeMultiGrid (int new_size) } void -MLNodeLaplacian::unimposeNeumannBC (int amrlev, MultiFab& rhs) const +MLNodeLaplacian::unimposeNeumannBC (int amrlev, Any& a_rhs) const { if (m_coarsening_strategy == CoarseningStrategy::RAP) { const Box& nddom = amrex::surroundingNodes(Geom(amrlev).Domain()); const auto lobc = LoBC(); const auto hibc = HiBC(); + AMREX_ASSERT(a_rhs.is()); + MultiFab& rhs = a_rhs.get(); + MFItInfo mfi_info; if (Gpu::notInLaunchRegion()) mfi_info.EnableTiling().SetDynamic(true); #ifdef AMREX_USE_OMP @@ -171,14 +174,17 @@ MLNodeLaplacian::unimposeNeumannBC (int amrlev, MultiFab& rhs) const } } -Real -MLNodeLaplacian::getSolvabilityOffset (int amrlev, int mglev, MultiFab const& rhs) const +Vector +MLNodeLaplacian::getSolvabilityOffset (int amrlev, int mglev, Any const& a_rhs) const { amrex::ignore_unused(amrlev); - AMREX_ASSERT(amrlev==0); - AMREX_ASSERT(mglev+1==m_num_mg_levels[0] || mglev==0); + AMREX_ASSERT(amrlev==0 && (mglev+1==m_num_mg_levels[0] || mglev==0)); + AMREX_ASSERT(getNComp() == 1); if (m_coarsening_strategy == CoarseningStrategy::RAP) { + AMREX_ASSERT(a_rhs.is()); + auto const& rhs = a_rhs.get(); + #ifdef AMREX_USE_EB auto factory = dynamic_cast(m_factory[amrlev][0].get()); if (mglev == 0 && factory && !factory->isAllRegular()) { @@ -229,7 +235,7 @@ MLNodeLaplacian::getSolvabilityOffset (int amrlev, int mglev, MultiFab const& rh Real s1 = amrex::get<0>(r); Real s2 = amrex::get<1>(r); ParallelAllReduce::Sum({s1,s2}, ParallelContext::CommunicatorSub()); - return s1/s2; + return {s1/s2}; } else #endif { @@ -279,16 +285,21 @@ MLNodeLaplacian::getSolvabilityOffset (int amrlev, int mglev, MultiFab const& rh Real s1 = amrex::get<0>(r); Real s2 = amrex::get<1>(r); ParallelAllReduce::Sum({s1,s2}, ParallelContext::CommunicatorSub()); - return s1/s2; + return {s1/s2}; } } else { - return MLNodeLinOp::getSolvabilityOffset(amrlev, mglev, rhs); + return MLNodeLinOp::getSolvabilityOffset(amrlev, mglev, a_rhs); } } void -MLNodeLaplacian::fixSolvabilityByOffset (int amrlev, int mglev, MultiFab& rhs, Real offset) const +MLNodeLaplacian::fixSolvabilityByOffset (int amrlev, int mglev, Any& a_rhs, + Vector const& a_offset) const { + AMREX_ASSERT(a_rhs.is()); + auto& rhs = a_rhs.get(); + Real offset = a_offset[0]; + if (m_coarsening_strategy == CoarseningStrategy::RAP) { #ifdef AMREX_USE_EB auto factory = dynamic_cast(m_factory[amrlev][0].get()); diff --git a/Src/LinearSolvers/MLMG/AMReX_MLNodeLaplacian_misc.cpp b/Src/LinearSolvers/MLMG/AMReX_MLNodeLaplacian_misc.cpp index df5ab489d2f..339ca98e072 100644 --- a/Src/LinearSolvers/MLMG/AMReX_MLNodeLaplacian_misc.cpp +++ b/Src/LinearSolvers/MLMG/AMReX_MLNodeLaplacian_misc.cpp @@ -26,7 +26,11 @@ MLNodeLaplacian::averageDownCoeffs () { for (int mglev = 0; mglev < m_num_mg_levels[amrlev]; ++mglev) { +#if (AMREX_SPACEDIM == 1) + int ndims = 1; +#else int ndims = (m_use_harmonic_average || m_use_mapped) ? AMREX_SPACEDIM : 1; +#endif for (int idim = 0; idim < ndims; ++idim) { if (m_sigma[amrlev][mglev][idim] == nullptr) { @@ -101,7 +105,11 @@ MLNodeLaplacian::averageDownCoeffsSameAmrLevel (int amrlev) if (m_coarsening_strategy != CoarseningStrategy::Sigma) return; +#if (AMREX_SPACEDIM == 1) + const int nsigma = 1; +#else const int nsigma = (m_use_harmonic_average || m_use_mapped) ? 
AMREX_SPACEDIM : 1; +#endif for (int mglev = 1; mglev < m_num_mg_levels[amrlev]; ++mglev) { diff --git a/Src/LinearSolvers/MLMG/AMReX_MLNodeLinOp.H b/Src/LinearSolvers/MLMG/AMReX_MLNodeLinOp.H index c46f4a250f2..1935be89f1d 100644 --- a/Src/LinearSolvers/MLMG/AMReX_MLNodeLinOp.H +++ b/Src/LinearSolvers/MLMG/AMReX_MLNodeLinOp.H @@ -36,10 +36,6 @@ public: const Vector const*>& a_factory = {}, int a_eb_limit_coarsening = -1); - virtual void setLevelBC (int /*amrlev*/, const MultiFab* /*levelbcdata*/, - const MultiFab* = nullptr, const MultiFab* = nullptr, - const MultiFab* = nullptr) final override {} - virtual void apply (int amrlev, int mglev, MultiFab& out, MultiFab& in, BCMode bc_mode, StateMode s_mode, const MLMGBndry* bndry=nullptr) const final override; @@ -59,20 +55,15 @@ public: amrex::Abort("AMReX_MLNodeLinOp::compGrad::How did we get here?"); } - virtual void applyMetricTerm (int /*amrlev*/, int /*mglev*/, MultiFab& /*rhs*/) const final override {} + virtual void applyMetricTerm (int /*amrlev*/, int /*mglev*/, Any& /*rhs*/) const final override {} virtual void unapplyMetricTerm (int /*amrlev*/, int /*mglev*/, MultiFab& /*rhs*/) const final override {} - virtual void fillSolutionBC (int /*amrlev*/, MultiFab& /*sol*/, - const MultiFab* /*crse_bcdata*/=nullptr) final override { - amrex::Abort("AMReX_MLNodeLinOp::fillSolutionBC::How did we get here?"); - } - - virtual void applyInhomogNeumannTerm (int amrlev, MultiFab& rhs) const override; + virtual Vector getSolvabilityOffset (int amrlev, int mglev, + Any const& rhs) const override; + virtual void fixSolvabilityByOffset (int amrlev, int mglev, Any& rhs, + Vector const& offset) const override; - virtual Real getSolvabilityOffset (int amrlev, int mglev, MultiFab const& rhs) const override; - virtual void fixSolvabilityByOffset (int amrlev, int mglev, MultiFab& rhs, Real offset) const override; - - virtual void prepareForSolve () override {} + virtual void prepareForSolve () override; virtual bool isSingular (int amrlev) const override { return (amrlev == 0) ? m_is_bottom_singular : false; } @@ -86,7 +77,7 @@ public: virtual void Fapply (int amrlev, int mglev, MultiFab& out, const MultiFab& in) const = 0; virtual void Fsmooth (int amrlev, int mglev, MultiFab& sol, const MultiFab& rsh) const = 0; - virtual void nodalSync (int amrlev, int mglev, MultiFab& mf) const final override; + void nodalSync (int amrlev, int mglev, MultiFab& mf) const; virtual std::unique_ptr makeNLinOp (int /*grid_size*/) const final override { amrex::Abort("MLNodeLinOp::makeNLinOp: N-Solve not supported"); @@ -102,6 +93,19 @@ public: // omask is either 0 or 1. 1 means the node is an unknown. 0 means it's known. 
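+    // A minimal usage sketch (illustration only, not part of this patch;
+    // `nba`, `dm`, and `linop` below are hypothetical): mark every node as
+    // an unknown, then zero the nodes whose values are known before handing
+    // the mask to the solver.
+    //
+    //   iMultiFab omask(nba, dm, 1, 0); // nodal BoxArray, DistributionMapping
+    //   omask.setVal(1);                // every node starts as an unknown
+    //   // ... set omask to 0 on nodes whose values are known ...
+    //   linop.setOversetMask(0, omask); // amrlev = 0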
void setOversetMask (int amrlev, const iMultiFab& a_omask); + virtual void fixUpResidualMask (int /*amrlev*/, iMultiFab& /*resmsk*/) { } + + virtual Real AnyNormInfMask (int amrlev, Any const& a, bool local) const override; + + virtual void AnyAvgDownResAmr (int, Any&, Any const&) const final override { } + + virtual void AnyInterpolationAmr (int famrlev, Any& fine, const Any& crse, + IntVect const& nghost) const override; + + virtual void AnyAverageDownAndSync (Vector& sol) const override; + + virtual void interpAssign (int amrlev, int fmglev, MultiFab& fine, MultiFab& crse) const override; + #if defined(AMREX_USE_HYPRE) && (AMREX_SPACEDIM > 1) virtual std::unique_ptr makeHypreNodeLap( int bottom_verbose, @@ -139,6 +143,8 @@ protected: MultiFab m_bottom_dot_mask; MultiFab m_coarse_dot_mask; + Vector > m_norm_fine_mask; + #ifdef AMREX_USE_EB CoarseningStrategy m_coarsening_strategy = CoarseningStrategy::RAP; #else diff --git a/Src/LinearSolvers/MLMG/AMReX_MLNodeLinOp.cpp b/Src/LinearSolvers/MLMG/AMReX_MLNodeLinOp.cpp index baf0f5edb42..b5173b71f5f 100644 --- a/Src/LinearSolvers/MLMG/AMReX_MLNodeLinOp.cpp +++ b/Src/LinearSolvers/MLMG/AMReX_MLNodeLinOp.cpp @@ -1,6 +1,7 @@ #include #include +#include #include #ifdef AMREX_USE_OMP @@ -83,6 +84,22 @@ MLNodeLinOp::define (const Vector& a_geom, m_has_fine_bndry[amrlev] = std::make_unique >(m_grids[amrlev][0], m_dmap[amrlev][0]); } + + m_norm_fine_mask.resize(m_num_amr_levels-1); + for (int amrlev = 0; amrlev < m_num_amr_levels-1; ++amrlev) { + m_norm_fine_mask[amrlev] = std::make_unique + (makeFineMask(amrex::convert(m_grids[amrlev][0], IntVect(1)), m_dmap[amrlev][0], + amrex::convert(m_grids[amrlev+1][0], IntVect(1)), + IntVect(m_amr_ref_ratio[amrlev]), 1, 0)); + } +} + +void +MLNodeLinOp::prepareForSolve () +{ + for (int amrlev = 0; amrlev < m_num_amr_levels-1; ++amrlev) { + fixUpResidualMask(amrlev, *m_norm_fine_mask[amrlev]); + } } std::unique_ptr @@ -177,17 +194,16 @@ MLNodeLinOp::xdoty (int amrlev, int mglev, const MultiFab& x, const MultiFab& y, return result; } -void -MLNodeLinOp::applyInhomogNeumannTerm (int /*amrlev*/, MultiFab& /*rhs*/) const -{ -} - -Real -MLNodeLinOp::getSolvabilityOffset (int amrlev, int mglev, MultiFab const& rhs) const +Vector +MLNodeLinOp::getSolvabilityOffset (int amrlev, int mglev, Any const& a_rhs) const { amrex::ignore_unused(amrlev); - AMREX_ASSERT(amrlev==0); - AMREX_ASSERT(mglev+1==m_num_mg_levels[0] || mglev==0); + AMREX_ASSERT(amrlev==0 && (mglev+1==m_num_mg_levels[0] || mglev==0)); + AMREX_ASSERT(getNComp() == 1); + + AMREX_ASSERT(a_rhs.is()); + auto const& rhs = a_rhs.get(); + const auto& mask = (mglev+1 == m_num_mg_levels[0]) ? 
m_bottom_dot_mask : m_coarse_dot_mask; const auto& mask_ma = mask.const_arrays(); const auto& rhs_ma = rhs.const_arrays(); @@ -203,13 +219,16 @@ MLNodeLinOp::getSolvabilityOffset (int amrlev, int mglev, MultiFab const& rhs) c Real s1 = amrex::get<0>(r); Real s2 = amrex::get<1>(r); ParallelAllReduce::Sum({s1,s2}, ParallelContext::CommunicatorSub()); - return s1/s2; + return {s1/s2}; } void -MLNodeLinOp::fixSolvabilityByOffset (int /*amrlev*/, int /*mglev*/, MultiFab& rhs, Real offset) const +MLNodeLinOp::fixSolvabilityByOffset (int /*amrlev*/, int /*mglev*/, Any& a_rhs, + Vector const& offset) const { - rhs.plus(-offset, 0, 1); + AMREX_ASSERT(a_rhs.is()); + auto& rhs = a_rhs.get(); + rhs.plus(-offset[0], 0, 1); } namespace { @@ -448,6 +467,119 @@ MLNodeLinOp::resizeMultiGrid (int new_size) MLLinOp::resizeMultiGrid(new_size); } +Real +MLNodeLinOp::AnyNormInfMask (int amrlev, Any const& a, bool local) const +{ + AMREX_ASSERT(a.is()); + auto& mf = a.get(); + + const int finest_level = NAMRLevels() - 1; + iMultiFab const* fine_mask = (amrlev == finest_level) + ? nullptr : m_norm_fine_mask[amrlev].get(); + return MFNormInf(mf, fine_mask, local); +} + +void +MLNodeLinOp::AnyInterpolationAmr (int famrlev, Any& a_fine, const Any& a_crse, + IntVect const& nghost) const +{ + AMREX_ASSERT(a_fine.is()); + MultiFab& fine = a_fine.get(); + MultiFab const& crse = a_crse.get(); + + const int ncomp = getNComp(); + const int refratio = AMRRefRatio(famrlev-1); + + AMREX_ALWAYS_ASSERT(refratio == 2 || refratio == 4); +#ifdef AMREX_USE_OMP +#pragma omp parallel if (Gpu::notInLaunchRegion()) +#endif + for (MFIter mfi(fine, TilingIfNotGPU()); mfi.isValid(); ++mfi) + { + Box fbx = mfi.tilebox(); + fbx.grow(nghost); + Array4 const& ffab = fine.array(mfi); + Array4 const& cfab = crse.const_array(mfi); + + if (refratio == 2) { + AMREX_HOST_DEVICE_FOR_4D ( fbx, ncomp, i, j, k, n, + { + mlmg_lin_nd_interp_r2(i,j,k,n,ffab,cfab); + }); + } else { + AMREX_HOST_DEVICE_FOR_4D ( fbx, ncomp, i, j, k, n, + { + mlmg_lin_nd_interp_r4(i,j,k,n,ffab,cfab); + }); + } + } +} + +void +MLNodeLinOp::AnyAverageDownAndSync (Vector& sol) const +{ + AMREX_ASSERT(sol[0].is()); + + const int ncomp = getNComp(); + const int finest_amr_lev = NAMRLevels() - 1; + + nodalSync(finest_amr_lev, 0, sol[finest_amr_lev].get()); + + for (int falev = finest_amr_lev; falev > 0; --falev) + { + const auto& fmf = sol[falev ].get(); + auto& cmf = sol[falev-1].get(); + + auto rr = AMRRefRatio(falev-1); + MultiFab tmpmf(amrex::coarsen(fmf.boxArray(), rr), fmf.DistributionMap(), ncomp, 0); + amrex::average_down(fmf, tmpmf, 0, ncomp, rr); + cmf.ParallelCopy(tmpmf, 0, 0, ncomp); + nodalSync(falev-1, 0, cmf); + } +} + +void +MLNodeLinOp::interpAssign (int amrlev, int fmglev, MultiFab& fine, MultiFab& crse) const +{ + const int ncomp = getNComp(); + + const Geometry& crse_geom = Geom(amrlev,fmglev+1); + const IntVect refratio = (amrlev > 0) ? 
IntVect(2) : mg_coarsen_ratio_vec[fmglev]; + AMREX_ALWAYS_ASSERT(refratio == 2); + + MultiFab cfine; + const MultiFab* cmf; + + if (amrex::isMFIterSafe(crse, fine)) + { + crse.FillBoundary(crse_geom.periodicity()); + cmf = &crse; + } + else + { + BoxArray cba = fine.boxArray(); + cba.coarsen(refratio); + cfine.define(cba, fine.DistributionMap(), ncomp, 0); + cfine.ParallelCopy(crse, 0, 0, ncomp, 0, 0, crse_geom.periodicity()); + cmf = & cfine; + } + +#ifdef AMREX_USE_OMP +#pragma omp parallel if (Gpu::notInLaunchRegion()) +#endif + for (MFIter mfi(fine, TilingIfNotGPU()); mfi.isValid(); ++mfi) + { + const Box& fbx = mfi.tilebox(); + Array4 const& ffab = fine.array(mfi); + Array4 const& cfab = cmf->const_array(mfi); + + AMREX_HOST_DEVICE_FOR_4D ( fbx, ncomp, i, j, k, n, + { + mlmg_lin_nd_interp_r2(i,j,k,n,ffab,cfab); + }); + } +} + #if defined(AMREX_USE_HYPRE) && (AMREX_SPACEDIM > 1) std::unique_ptr MLNodeLinOp::makeHypreNodeLap (int bottom_verbose, const std::string& options_namespace) const diff --git a/Src/LinearSolvers/MLMG/AMReX_MLPoisson.H b/Src/LinearSolvers/MLMG/AMReX_MLPoisson.H index 81dd431d953..41f8fbf1cae 100644 --- a/Src/LinearSolvers/MLMG/AMReX_MLPoisson.H +++ b/Src/LinearSolvers/MLMG/AMReX_MLPoisson.H @@ -70,6 +70,10 @@ public: virtual void copyNSolveSolution (MultiFab& dst, MultiFab const& src) const final override; + //! Compute dphi/dn on domain faces after the solver has converged. + void get_dpdn_on_domain_faces (Array const& dpdn, + MultiFab const& phi); + private: Vector m_is_singular; diff --git a/Src/LinearSolvers/MLMG/AMReX_MLPoisson.cpp b/Src/LinearSolvers/MLMG/AMReX_MLPoisson.cpp index ce27eb936fd..15ee75e961a 100644 --- a/Src/LinearSolvers/MLMG/AMReX_MLPoisson.cpp +++ b/Src/LinearSolvers/MLMG/AMReX_MLPoisson.cpp @@ -702,4 +702,63 @@ MLPoisson::copyNSolveSolution (MultiFab& dst, MultiFab const& src) const dst.ParallelCopy(src); } +void +MLPoisson::get_dpdn_on_domain_faces (Array const& dpdn, + MultiFab const& phi) +{ + BL_PROFILE("MLPoisson::dpdn_faces()"); + + // We do not need to call applyBC because this function is used by the + // OpenBC solver after the solver has converged. That means the BC has + // already been filled to check the residual. + + Box const& domain0 = m_geom[0][0].Domain(); + AMREX_D_TERM(const Real dxi = m_geom[0][0].InvCellSize(0);, + const Real dyi = m_geom[0][0].InvCellSize(1);, + const Real dzi = m_geom[0][0].InvCellSize(2);) + +#ifdef AMREX_USE_OMP +#pragma omp parallel if (Gpu::notInLaunchRegion()) +#endif + for (MFIter mfi(phi); mfi.isValid(); ++mfi) + { + Box const& vbx = mfi.validbox(); + for (OrientationIter oit; oit.isValid(); ++oit) { + Orientation face = oit(); + if (vbx[face] == domain0[face]) { + int dir = face.coordDir(); + Array4 const& p = phi.const_array(mfi); + Array4 const& gp = dpdn[dir]->array(mfi); + Box const& b2d = amrex::bdryNode(vbx,face); + if (dir == 0) { + // because it's dphi/dn, not dphi/dx. + Real fac = dxi * (face.isLow() ? -1.0_rt : 1._rt); + AMREX_HOST_DEVICE_PARALLEL_FOR_3D(b2d, i, j, k, + { + gp(i,j,k) = fac * (p(i,j,k) - p(i-1,j,k)); + }); + } +#if (AMREX_SPACEDIM > 1) + else if (dir == 1) { + Real fac = dyi * (face.isLow() ? -1.0_rt : 1._rt); + AMREX_HOST_DEVICE_PARALLEL_FOR_3D(b2d, i, j, k, + { + gp(i,j,k) = fac * (p(i,j,k) - p(i,j-1,k)); + }); + } +#if (AMREX_SPACEDIM > 2) + else { + Real fac = dzi * (face.isLow() ?
-1.0_rt : 1._rt); + AMREX_HOST_DEVICE_PARALLEL_FOR_3D(b2d, i, j, k, + { + gp(i,j,k) = fac * (p(i,j,k) - p(i,j,k-1)); + }); + } +#endif +#endif + } + } + } +} + } diff --git a/Src/LinearSolvers/MLMG/AMReX_MLTensorOp.cpp b/Src/LinearSolvers/MLMG/AMReX_MLTensorOp.cpp index d4e77f312dc..0750ffdd969 100644 --- a/Src/LinearSolvers/MLMG/AMReX_MLTensorOp.cpp +++ b/Src/LinearSolvers/MLMG/AMReX_MLTensorOp.cpp @@ -210,9 +210,16 @@ MLTensorOp::apply (int amrlev, int mglev, MultiFab& out, MultiFab& in, BCMode bc if (mglev >= m_kappa[amrlev].size()) return; - applyBCTensor(amrlev, mglev, in, bc_mode, s_mode, bndry ); + applyBCTensor(amrlev, mglev, in, bc_mode, s_mode, bndry); + + const auto& bcondloc = *m_bcondloc[amrlev][mglev]; + + Array4 foo; const auto dxinv = m_geom[amrlev][mglev].InvCellSizeArray(); + const Box& domain = m_geom[amrlev][mglev].growPeriodicDomain(1); + const auto dlo = amrex::lbound(domain); + const auto dhi = amrex::ubound(domain); Array const& etamf = m_b_coeffs[amrlev][mglev]; Array const& kapmf = m_kappa[amrlev][mglev]; @@ -247,20 +254,65 @@ MLTensorOp::apply (int amrlev, int mglev, MultiFab& out, MultiFab& in, BCMode bc Array4 const fyfab = fluxfab_tmp[1].array();, Array4 const fzfab = fluxfab_tmp[2].array();); - AMREX_LAUNCH_HOST_DEVICE_LAMBDA_DIM - ( xbx, txbx, - { - mltensor_cross_terms_fx(txbx,fxfab,vfab,etaxfab,kapxfab,dxinv); - } - , ybx, tybx, - { - mltensor_cross_terms_fy(tybx,fyfab,vfab,etayfab,kapyfab,dxinv); - } - , zbx, tzbx, - { - mltensor_cross_terms_fz(tzbx,fzfab,vfab,etazfab,kapzfab,dxinv); - } - ); + if (domain.strictly_contains(bx)) { + AMREX_LAUNCH_HOST_DEVICE_LAMBDA_DIM + ( xbx, txbx, + { + mltensor_cross_terms_fx(txbx,fxfab,vfab,etaxfab,kapxfab,dxinv); + } + , ybx, tybx, + { + mltensor_cross_terms_fy(tybx,fyfab,vfab,etayfab,kapyfab,dxinv); + } + , zbx, tzbx, + { + mltensor_cross_terms_fz(tzbx,fzfab,vfab,etazfab,kapzfab,dxinv); + } + ); + } else { + const auto & bdcv = bcondloc.bndryConds(mfi); + + Array2D bct; + for (int icomp = 0; icomp < AMREX_SPACEDIM; ++icomp) { + for (OrientationIter face; face; ++face) { + Orientation ori = face(); + bct(ori,icomp) = bdcv[icomp][ori]; + } + } + + const auto& bvxlo = (bndry != nullptr) ? + (*bndry)[Orientation(0,Orientation::low )].array(mfi) : foo; + const auto& bvylo = (bndry != nullptr) ? + (*bndry)[Orientation(1,Orientation::low )].array(mfi) : foo; + const auto& bvxhi = (bndry != nullptr) ? + (*bndry)[Orientation(0,Orientation::high)].array(mfi) : foo; + const auto& bvyhi = (bndry != nullptr) ? + (*bndry)[Orientation(1,Orientation::high)].array(mfi) : foo; +#if (AMREX_SPACEDIM == 3) + const auto& bvzlo = (bndry != nullptr) ? + (*bndry)[Orientation(2,Orientation::low )].array(mfi) : foo; + const auto& bvzhi = (bndry != nullptr) ? 
+ (*bndry)[Orientation(2,Orientation::high)].array(mfi) : foo; +#endif + + AMREX_LAUNCH_HOST_DEVICE_LAMBDA_DIM + ( xbx, txbx, + { + mltensor_cross_terms_fx(txbx,fxfab,vfab,etaxfab,kapxfab,dxinv, + bvxlo, bvxhi, bct, dlo, dhi); + } + , ybx, tybx, + { + mltensor_cross_terms_fy(tybx,fyfab,vfab,etayfab,kapyfab,dxinv, + bvylo, bvyhi, bct, dlo, dhi); + } + , zbx, tzbx, + { + mltensor_cross_terms_fz(tzbx,fzfab,vfab,etazfab,kapzfab,dxinv, + bvzlo, bvzhi, bct, dlo, dhi); + } + ); + } if (m_overset_mask[amrlev][mglev]) { const auto& osm = m_overset_mask[amrlev][mglev]->array(mfi); @@ -288,18 +340,18 @@ MLTensorOp::applyBCTensor (int amrlev, int mglev, MultiFab& vel, #if (AMREX_SPACEDIM == 1) amrex::ignore_unused(amrlev,mglev,vel,bc_mode,bndry); #else + const int inhomog = bc_mode == BCMode::Inhomogeneous; const int imaxorder = maxorder; const auto& bcondloc = *m_bcondloc[amrlev][mglev]; const auto& maskvals = m_maskvals[amrlev][mglev]; - FArrayBox foofab(Box::TheUnitBox(),3); - const auto& foo = foofab.array(); + Array4 foo; const auto dxinv = m_geom[amrlev][mglev].InvCellSizeArray(); const Box& domain = m_geom[amrlev][mglev].growPeriodicDomain(1); - - // Domain and coarse-fine boundaries are handled below. + const auto dlo = amrex::lbound(domain); + const auto dhi = amrex::ubound(domain); MFItInfo mfi_info; if (Gpu::notInLaunchRegion()) mfi_info.SetDynamic(true); @@ -315,14 +367,13 @@ MLTensorOp::applyBCTensor (int amrlev, int mglev, MultiFab& vel, const auto & bdlv = bcondloc.bndryLocs(mfi); const auto & bdcv = bcondloc.bndryConds(mfi); - GpuArray bct; - GpuArray bcl; - for (OrientationIter face; face; ++face) { - Orientation ori = face(); - const int iface = ori; - for (int icomp = 0; icomp < AMREX_SPACEDIM; ++icomp) { - bct[iface*AMREX_SPACEDIM+icomp] = bdcv[icomp][ori]; - bcl[iface*AMREX_SPACEDIM+icomp] = bdlv[icomp][ori]; + Array2D bct; + Array2D bcl; + for (int icomp = 0; icomp < AMREX_SPACEDIM; ++icomp) { + for (OrientationIter face; face; ++face) { + Orientation ori = face(); + bct(ori,icomp) = bdcv[icomp][ori]; + bcl(ori,icomp) = bdlv[icomp][ori]; } } @@ -341,14 +392,13 @@ MLTensorOp::applyBCTensor (int amrlev, int mglev, MultiFab& vel, (*bndry)[Orientation(1,Orientation::high)].array(mfi) : foo; #if (AMREX_SPACEDIM == 2) - AMREX_HOST_DEVICE_FOR_1D ( 4, icorner, { mltensor_fill_corners(icorner, vbx, velfab, mxlo, mylo, mxhi, myhi, bvxlo, bvylo, bvxhi, bvyhi, bct, bcl, inhomog, imaxorder, - dxinv, domain); + dxinv, dlo, dhi); }); #else const auto& mzlo = maskvals[Orientation(2,Orientation::low )].array(mfi); @@ -360,18 +410,40 @@ MLTensorOp::applyBCTensor (int amrlev, int mglev, MultiFab& vel, (*bndry)[Orientation(2,Orientation::high)].array(mfi) : foo; // only edge vals used in 3D stencil - AMREX_HOST_DEVICE_FOR_1D ( 12, iedge, +#ifdef AMREX_USE_GPU + if (Gpu::inLaunchRegion()) { + amrex::launch(12, 64, Gpu::gpuStream(), +#ifdef AMREX_USE_DPCPP + [=] AMREX_GPU_DEVICE (sycl::nd_item<1> const& item) + { + int bid = item.get_group_linear_id(); + int tid = item.get_local_linear_id(); + int bdim = item.get_local_range(0); +#else + [=] AMREX_GPU_DEVICE () + { + int bid = blockIdx.x; + int tid = threadIdx.x; + int bdim = blockDim.x; +#endif + mltensor_fill_edges(bid, tid, bdim, vbx, velfab, + mxlo, mylo, mzlo, mxhi, myhi, mzhi, + bvxlo, bvylo, bvzlo, bvxhi, bvyhi, bvzhi, + bct, bcl, inhomog, imaxorder, + dxinv, dlo, dhi); + }); + } else +#endif { - mltensor_fill_edges(iedge, vbx, velfab, + mltensor_fill_edges(vbx, velfab, mxlo, mylo, mzlo, mxhi, myhi, mzhi, bvxlo, bvylo, bvzlo, bvxhi, bvyhi, 
bvzhi, bct, bcl, inhomog, imaxorder, - dxinv, domain); - }); + dxinv, dlo, dhi); + } #endif } - // Notet that it is incorrect to call EnforcePeriodicity on vel. #endif } diff --git a/Src/LinearSolvers/MLMG/AMReX_MLTensorOp_grad.cpp b/Src/LinearSolvers/MLMG/AMReX_MLTensorOp_grad.cpp index 705f38052d1..d395ecdac13 100644 --- a/Src/LinearSolvers/MLMG/AMReX_MLTensorOp_grad.cpp +++ b/Src/LinearSolvers/MLMG/AMReX_MLTensorOp_grad.cpp @@ -16,9 +16,15 @@ MLTensorOp::compFlux (int amrlev, const Array& fluxes, const int ncomp = getNComp(); MLABecLaplacian::compFlux(amrlev, fluxes, sol, loc); - applyBCTensor(amrlev, mglev, sol, BCMode::Inhomogeneous, StateMode::Solution, m_bndry_sol[amrlev].get()); + MLMGBndry const* bndry = m_bndry_sol[amrlev].get(); + applyBCTensor(amrlev, mglev, sol, BCMode::Inhomogeneous, StateMode::Solution, bndry); + + const auto& bcondloc = *m_bcondloc[amrlev][mglev]; const auto dxinv = m_geom[amrlev][mglev].InvCellSizeArray(); + const Box& domain = m_geom[amrlev][mglev].growPeriodicDomain(1); + const auto dlo = amrex::lbound(domain); + const auto dhi = amrex::ubound(domain); Array const& etamf = m_b_coeffs[amrlev][mglev]; Array const& kapmf = m_kappa[amrlev][mglev]; @@ -52,20 +58,59 @@ MLTensorOp::compFlux (int amrlev, const Array& fluxes, Array4 const fyfab = fluxfab_tmp[1].array();, Array4 const fzfab = fluxfab_tmp[2].array();); - AMREX_LAUNCH_HOST_DEVICE_LAMBDA_DIM - ( xbx, txbx, - { - mltensor_cross_terms_fx(txbx,fxfab,vfab,etaxfab,kapxfab,dxinv); - } - , ybx, tybx, - { - mltensor_cross_terms_fy(tybx,fyfab,vfab,etayfab,kapyfab,dxinv); - } - , zbx, tzbx, - { - mltensor_cross_terms_fz(tzbx,fzfab,vfab,etazfab,kapzfab,dxinv); - } - ); + if (domain.strictly_contains(mfi.tilebox())) { + AMREX_LAUNCH_HOST_DEVICE_LAMBDA_DIM + ( xbx, txbx, + { + mltensor_cross_terms_fx(txbx,fxfab,vfab,etaxfab,kapxfab,dxinv); + } + , ybx, tybx, + { + mltensor_cross_terms_fy(tybx,fyfab,vfab,etayfab,kapyfab,dxinv); + } + , zbx, tzbx, + { + mltensor_cross_terms_fz(tzbx,fzfab,vfab,etazfab,kapzfab,dxinv); + } + ); + } else { + const auto & bdcv = bcondloc.bndryConds(mfi); + + Array2D bct; + for (int icomp = 0; icomp < AMREX_SPACEDIM; ++icomp) { + for (OrientationIter face; face; ++face) { + Orientation ori = face(); + bct(ori,icomp) = bdcv[icomp][ori]; + } + } + + const auto& bvxlo = (*bndry)[Orientation(0,Orientation::low )].array(mfi); + const auto& bvylo = (*bndry)[Orientation(1,Orientation::low )].array(mfi); + const auto& bvxhi = (*bndry)[Orientation(0,Orientation::high)].array(mfi); + const auto& bvyhi = (*bndry)[Orientation(1,Orientation::high)].array(mfi); +#if (AMREX_SPACEDIM == 3) + const auto& bvzlo = (*bndry)[Orientation(2,Orientation::low )].array(mfi); + const auto& bvzhi = (*bndry)[Orientation(2,Orientation::high)].array(mfi); +#endif + + AMREX_LAUNCH_HOST_DEVICE_LAMBDA_DIM + ( xbx, txbx, + { + mltensor_cross_terms_fx(txbx,fxfab,vfab,etaxfab,kapxfab,dxinv, + bvxlo, bvxhi, bct, dlo, dhi); + } + , ybx, tybx, + { + mltensor_cross_terms_fy(tybx,fyfab,vfab,etayfab,kapyfab,dxinv, + bvylo, bvyhi, bct, dlo, dhi); + } + , zbx, tzbx, + { + mltensor_cross_terms_fz(tzbx,fzfab,vfab,etazfab,kapzfab,dxinv, + bvzlo, bvzhi, bct, dlo, dhi); + } + ); + } for (int idim = 0; idim < AMREX_SPACEDIM; ++idim) { const Box& nbx = mfi.nodaltilebox(idim); @@ -95,33 +140,36 @@ MLTensorOp::compVelGrad (int amrlev, const Array& flux const int mglev = 0; - applyBCTensor(amrlev, mglev, sol, BCMode::Inhomogeneous, StateMode::Solution, m_bndry_sol[amrlev].get()); + MLMGBndry const* bndry = m_bndry_sol[amrlev].get(); + 
applyBC(amrlev, mglev, sol, BCMode::Inhomogeneous, StateMode::Solution, bndry); + applyBCTensor(amrlev, mglev, sol, BCMode::Inhomogeneous, StateMode::Solution, bndry); + + const auto& bcondloc = *m_bcondloc[amrlev][mglev]; const auto dxinv = m_geom[amrlev][mglev].InvCellSizeArray(); - const int dim_fluxes = AMREX_SPACEDIM*AMREX_SPACEDIM; + const Box& domain = m_geom[amrlev][mglev].growPeriodicDomain(1); + const auto dlo = amrex::lbound(domain); + const auto dhi = amrex::ubound(domain); #ifdef AMREX_USE_OMP #pragma omp parallel if (Gpu::notInLaunchRegion()) #endif + for (MFIter mfi(sol, TilingIfNotGPU()); mfi.isValid(); ++mfi) { - Array fluxfab_tmp; + Array4 const vfab = sol.const_array(mfi); + AMREX_D_TERM(Box const xbx = mfi.nodaltilebox(0);, + Box const ybx = mfi.nodaltilebox(1);, + Box const zbx = mfi.nodaltilebox(2);) + AMREX_D_TERM(Array4 const fxfab = fluxes[0]->array(mfi);, + Array4 const fyfab = fluxes[1]->array(mfi);, + Array4 const fzfab = fluxes[2]->array(mfi);) - for (MFIter mfi(sol, TilingIfNotGPU()); mfi.isValid(); ++mfi) - { - Array4 const vfab = sol.const_array(mfi); - AMREX_D_TERM(Box const xbx = mfi.nodaltilebox(0);, - Box const ybx = mfi.nodaltilebox(1);, - Box const zbx = mfi.nodaltilebox(2);); - AMREX_D_TERM(fluxfab_tmp[0].resize(xbx,dim_fluxes);, - fluxfab_tmp[1].resize(ybx,dim_fluxes);, - fluxfab_tmp[2].resize(zbx,dim_fluxes);); - AMREX_D_TERM(Elixir fxeli = fluxfab_tmp[0].elixir();, - Elixir fyeli = fluxfab_tmp[1].elixir();, - Elixir fzeli = fluxfab_tmp[2].elixir();); - AMREX_D_TERM(Array4 const fxfab = fluxfab_tmp[0].array();, - Array4 const fyfab = fluxfab_tmp[1].array();, - Array4 const fzfab = fluxfab_tmp[2].array();); +// The derivatives are put in the array in the following order: +// component: 0 , 1 , 2 , 3 , 4 , 5 , 6 , 7 , 8 +// in 2D: dU/dx, dV/dx, dU/dy, dV/dy +// in 3D: dU/dx, dV/dx, dW/dx, dU/dy, dV/dy, dW/dy, dU/dz, dV/dz, dW/dz + if (domain.strictly_contains(mfi.tilebox())) { AMREX_LAUNCH_HOST_DEVICE_LAMBDA_DIM ( xbx, txbx, { @@ -136,23 +184,39 @@ MLTensorOp::compVelGrad (int amrlev, const Array& flux mltensor_vel_grads_fz(tzbx,fzfab,vfab,dxinv); } ); - -// The derivatives are put in the array with the following order: -// component: 0 , 1 , 2 , 3 , 4 , 5 , 6 , 7 , 8 -// in 2D: dU/dx, dV/dx, dU/dy, dV/dy -// in 3D: dU/dx, dV/dx, dW/dx, dU/dy, dV/dy, dW/dy, dU/dz, dV/dz, dW/dz - - - for (int idim = 0; idim < AMREX_SPACEDIM; ++idim) { - const Box& nbx = mfi.nodaltilebox(idim); - Array4 dst = fluxes[idim]->array(mfi); - Array4 src = fluxfab_tmp[idim].const_array(); - AMREX_HOST_DEVICE_PARALLEL_FOR_4D (nbx, dim_fluxes, i, j, k, n, - { - dst(i,j,k,n) = src(i,j,k,n); - }); + } else { + const auto & bdcv = bcondloc.bndryConds(mfi); + + Array2D bct; + for (int icomp = 0; icomp < AMREX_SPACEDIM; ++icomp) { + for (OrientationIter face; face; ++face) { + Orientation ori = face(); + bct(ori,icomp) = bdcv[icomp][ori]; + } } + const auto& bvxlo = (*bndry)[Orientation(0,Orientation::low )].array(mfi); + const auto& bvylo = (*bndry)[Orientation(1,Orientation::low )].array(mfi); + const auto& bvxhi = (*bndry)[Orientation(0,Orientation::high)].array(mfi); + const auto& bvyhi = (*bndry)[Orientation(1,Orientation::high)].array(mfi); +#if (AMREX_SPACEDIM == 3) + const auto& bvzlo = (*bndry)[Orientation(2,Orientation::low )].array(mfi); + const auto& bvzhi = (*bndry)[Orientation(2,Orientation::high)].array(mfi); +#endif + AMREX_LAUNCH_HOST_DEVICE_LAMBDA_DIM + ( xbx, txbx, + { + mltensor_vel_grads_fx(txbx,fxfab,vfab,dxinv,bvxlo,bvxhi,bct,dlo,dhi); + } + , ybx, tybx, + {
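+                // Note (hedged reading of this patch): near the domain
+                // boundary the gradient kernel is given the boundary values
+                // (bvylo/bvyhi), the BC types in bct, and the domain bounds
+                // (dlo/dhi), so it can use the prescribed boundary data at
+                // domain faces instead of reading ghost cells.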
+ mltensor_vel_grads_fy(tybx,fyfab,vfab,dxinv,bvylo,bvyhi,bct,dlo,dhi); + } + , zbx, tzbx, + { + mltensor_vel_grads_fz(tzbx,fzfab,vfab,dxinv,bvzlo,bvzhi,bct,dlo,dhi); + } + ); } } #endif diff --git a/Src/LinearSolvers/MLMG/AMReX_MLTensor_2D_K.H b/Src/LinearSolvers/MLMG/AMReX_MLTensor_2D_K.H index 8f10f08ec58..a40fa4611a8 100644 --- a/Src/LinearSolvers/MLMG/AMReX_MLTensor_2D_K.H +++ b/Src/LinearSolvers/MLMG/AMReX_MLTensor_2D_K.H @@ -17,110 +17,168 @@ void mltensor_fill_corners (int icorner, Box const& vbox, // vbox: the valid box Array4 const& bcvalylo, Array4 const& bcvalxhi, Array4 const& bcvalyhi, - GpuArray const& bct, - GpuArray const& bcl, + Array2D const& bct, + Array2D const& bcl, int inhomog, int maxorder, - GpuArray const& dxinv, Box const& domain) noexcept + GpuArray const& dxinv, + Dim3 const& dlo, Dim3 const& dhi) noexcept { - constexpr int oxlo = 0; - constexpr int oylo = 1; - constexpr int oxhi = 2; - constexpr int oyhi = 3; - constexpr int xdir = 0; - constexpr int ydir = 1; + constexpr int k = 0; const auto blen = amrex::length(vbox); const auto vlo = amrex::lbound(vbox); const auto vhi = amrex::ubound(vbox); - const auto dlo = amrex::lbound(domain); - const auto dhi = amrex::ubound(domain); - for (int icomp = 0; icomp < AMREX_SPACEDIM; ++icomp) { - switch (icorner) { - case 0: { - // xlo & ylo - if (mxlo(vlo.x-1,vlo.y-1,0) != BndryData::covered) { - Box bx = amrex::adjCellLo(amrex::adjCellLo(vbox,xdir,1),ydir,1); - if (vlo.x == dlo.x && vlo.y == dlo.y) { - vel(vlo.x-1,vlo.y-1,0,icomp) = vel(vlo.x-1,vlo.y,0,icomp) - + vel(vlo.x,vlo.y-1,0,icomp) - vel(vlo.x,vlo.y,0,icomp); - } else if (vlo.x == dlo.x || mylo(vlo.x,vlo.y-1,0) == BndryData::covered) { - int offset = AMREX_SPACEDIM * oxlo; - mllinop_apply_bc_x(Orientation::low, bx, blen.x, - vel, mxlo, bct[offset+icomp], bcl[offset+icomp], - bcvalxlo, maxorder, dxinv[xdir], inhomog, icomp); - } else { - int offset = AMREX_SPACEDIM * oylo; - mllinop_apply_bc_y(Orientation::low, bx, blen.y, - vel, mylo, bct[offset+icomp], bcl[offset+icomp], - bcvalylo, maxorder, dxinv[ydir], inhomog, icomp); + if (icorner == 0) { // xlo & ylo + int const i = vlo.x-1; + int const j = vlo.y-1; + if (mxlo(i,j,k) != BndryData::covered && (dlo.x != vlo.x || dlo.y != vlo.y)) { + bool x_interior = mylo(i+1,j ,k) == BndryData::covered; // i+1,j is a valid cell inside domain + bool x_exterior = mylo(i+1,j ,k) == BndryData::not_covered; // i+1,j is a ghost cell inside domain + bool y_interior = mxlo(i ,j+1,k) == BndryData::covered; + bool y_exterior = mxlo(i ,j+1,k) == BndryData::not_covered; + if ((x_interior && y_interior) || (x_exterior && y_exterior)) { + for (int icomp = 0; icomp < AMREX_SPACEDIM; ++icomp) { + mllinop_apply_bc_x(Orientation::low, i,j,k, blen.x, vel, mxlo, + bct(Orientation::xlo(), icomp), + bcl(Orientation::xlo(), icomp), + bcvalxlo, maxorder, dxinv[0], inhomog, icomp); + Real tmp = vel(i,j,k,icomp); + mllinop_apply_bc_y(Orientation::low, i,j,k, blen.y, vel, mylo, + bct(Orientation::ylo(), icomp), + bcl(Orientation::ylo(), icomp), + bcvalylo, maxorder, dxinv[1], inhomog, icomp); + vel(i,j,k,icomp) = 0.5_rt*(tmp+vel(i,j,k,icomp)); + } + } else if (x_interior || dlo.x == vlo.x) { + for (int icomp = 0; icomp < AMREX_SPACEDIM; ++icomp) { + mllinop_apply_bc_x(Orientation::low, i,j,k, blen.x, vel, mxlo, + bct(Orientation::xlo(), icomp), + bcl(Orientation::xlo(), icomp), + bcvalxlo, maxorder, dxinv[0], inhomog, icomp); + } + } else if (y_interior || dlo.y == vlo.y) { + for (int icomp = 0; icomp < AMREX_SPACEDIM; ++icomp) { + 
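+                // Only the ylo boundary condition is applied at this corner:
+                // per the masks above, (i,j+1) is a valid cell inside the
+                // domain (or the corner lies on the y domain face), so the
+                // ghost value is obtained by extrapolation in y alone.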
mllinop_apply_bc_y(Orientation::low, i,j,k, blen.y, vel, mylo, + bct(Orientation::ylo(), icomp), + bcl(Orientation::ylo(), icomp), + bcvalylo, maxorder, dxinv[1], inhomog, icomp); } } - break; } - case 1: { - // xhi & ylo - if (mxhi(vhi.x+1,vlo.y-1,0) != BndryData::covered) { - Box bx = amrex::adjCellLo(amrex::adjCellHi(vbox,xdir,1),ydir,1); - if (vhi.x == dhi.x && vlo.y == dlo.y) { - vel(vhi.x+1,vlo.y-1,0,icomp) = vel(vhi.x+1,vlo.y,0,icomp) - + vel(vhi.x,vlo.y-1,0,icomp) - vel(vhi.x,vlo.y,0,icomp); - } else if (vhi.x == dhi.x || mylo(vhi.x,vlo.y-1,0) == BndryData::covered) { - int offset = AMREX_SPACEDIM * oxhi; - mllinop_apply_bc_x(Orientation::high, bx, blen.x, - vel, mxhi, bct[offset+icomp], bcl[offset+icomp], - bcvalxhi, maxorder, dxinv[xdir], inhomog, icomp); - } else { - int offset = AMREX_SPACEDIM * oylo; - mllinop_apply_bc_y(Orientation::low, bx, blen.y, - vel, mylo, bct[offset+icomp], bcl[offset+icomp], - bcvalylo, maxorder, dxinv[ydir], inhomog, icomp); + } else if (icorner == 1) { // xhi & ylo + int const i = vhi.x+1; + int const j = vlo.y-1; + if (mxhi(i,j,k) != BndryData::covered && (dhi.x != vhi.x || dlo.y != vlo.y)) { + bool x_interior = mylo(i-1,j ,k) == BndryData::covered; + bool x_exterior = mylo(i-1,j ,k) == BndryData::not_covered; + bool y_interior = mxhi(i ,j+1,k) == BndryData::covered; + bool y_exterior = mxhi(i ,j+1,k) == BndryData::not_covered; + if ((x_interior && y_interior) || (x_exterior && y_exterior)) { + for (int icomp = 0; icomp < AMREX_SPACEDIM; ++icomp) { + mllinop_apply_bc_x(Orientation::high, i,j,k, blen.x, vel, mxhi, + bct(Orientation::xhi(), icomp), + bcl(Orientation::xhi(), icomp), + bcvalxhi, maxorder, dxinv[0], inhomog, icomp); + Real tmp = vel(i,j,k,icomp); + mllinop_apply_bc_y(Orientation::low, i,j,k, blen.y, vel, mylo, + bct(Orientation::ylo(), icomp), + bcl(Orientation::ylo(), icomp), + bcvalylo, maxorder, dxinv[1], inhomog, icomp); + vel(i,j,k,icomp) = 0.5_rt*(tmp+vel(i,j,k,icomp)); + } + } else if (x_interior || dhi.x == vhi.x) { + for (int icomp = 0; icomp < AMREX_SPACEDIM; ++icomp) { + mllinop_apply_bc_x(Orientation::high, i,j,k, blen.x, vel, mxhi, + bct(Orientation::xhi(), icomp), + bcl(Orientation::xhi(), icomp), + bcvalxhi, maxorder, dxinv[0], inhomog, icomp); + } + } else if (y_interior || dlo.y == vlo.y) { + for (int icomp = 0; icomp < AMREX_SPACEDIM; ++icomp) { + mllinop_apply_bc_y(Orientation::low, i,j,k, blen.y, vel, mylo, + bct(Orientation::ylo(), icomp), + bcl(Orientation::ylo(), icomp), + bcvalylo, maxorder, dxinv[1], inhomog, icomp); } } - break; } - case 2: { - // xlo & yhi - if (mxlo(vlo.x-1,vhi.y+1,0) != BndryData::covered) { - Box bx = amrex::adjCellHi(amrex::adjCellLo(vbox,xdir,1),ydir,1); - if (vlo.x == dlo.x && vhi.y == dhi.y) { - vel(vlo.x-1,vhi.y+1,0,icomp) = vel(vlo.x-1,vhi.y,0,icomp) - + vel(vlo.x,vhi.y+1,0,icomp) - vel(vlo.x,vhi.y,0,icomp); - } else if (vlo.x == dlo.x || myhi(vlo.x,vhi.y+1,0) == BndryData::covered) { - int offset = AMREX_SPACEDIM * oxlo; - mllinop_apply_bc_x(Orientation::low, bx, blen.x, - vel, mxlo, bct[offset+icomp], bcl[offset+icomp], - bcvalxlo, maxorder, dxinv[xdir], inhomog, icomp); - } else { - int offset = AMREX_SPACEDIM * oyhi; - mllinop_apply_bc_y(Orientation::high, bx, blen.y, - vel, myhi, bct[offset+icomp], bcl[offset+icomp], - bcvalyhi, maxorder, dxinv[ydir], inhomog, icomp); + } else if (icorner == 2) { // xlo & yhi + int const i = vlo.x-1; + int const j = vhi.y+1; + if (mxlo(i,j,k) != BndryData::covered && (dlo.x != vlo.x || dhi.y != vhi.y)) { + bool x_interior = myhi(i+1,j ,k) == 
BndryData::covered; + bool x_exterior = myhi(i+1,j ,k) == BndryData::not_covered; + bool y_interior = mxlo(i ,j-1,k) == BndryData::covered; + bool y_exterior = mxlo(i ,j-1,k) == BndryData::not_covered; + if ((x_interior && y_interior) || (x_exterior && y_exterior)) { + for (int icomp = 0; icomp < AMREX_SPACEDIM; ++icomp) { + mllinop_apply_bc_x(Orientation::low, i,j,k, blen.x, vel, mxlo, + bct(Orientation::xlo(), icomp), + bcl(Orientation::xlo(), icomp), + bcvalxlo, maxorder, dxinv[0], inhomog, icomp); + Real tmp = vel(i,j,k,icomp); + mllinop_apply_bc_y(Orientation::high, i,j,k, blen.y, vel, myhi, + bct(Orientation::yhi(), icomp), + bcl(Orientation::yhi(), icomp), + bcvalyhi, maxorder, dxinv[1], inhomog, icomp); + vel(i,j,k,icomp) = 0.5_rt*(tmp+vel(i,j,k,icomp)); + } + } else if (x_interior || dlo.x == vlo.x) { + for (int icomp = 0; icomp < AMREX_SPACEDIM; ++icomp) { + mllinop_apply_bc_x(Orientation::low, i,j,k, blen.x, vel, mxlo, + bct(Orientation::xlo(), icomp), + bcl(Orientation::xlo(), icomp), + bcvalxlo, maxorder, dxinv[0], inhomog, icomp); + } + } else if (y_interior || dhi.y == vhi.y) { + for (int icomp = 0; icomp < AMREX_SPACEDIM; ++icomp) { + mllinop_apply_bc_y(Orientation::high, i,j,k, blen.y, vel, myhi, + bct(Orientation::yhi(), icomp), + bcl(Orientation::yhi(), icomp), + bcvalyhi, maxorder, dxinv[1], inhomog, icomp); } } - break; } - case 3: { - // xhi & yhi - if (mxhi(vhi.x+1,vhi.y+1,0) != BndryData::covered) { - Box bx = amrex::adjCellHi(amrex::adjCellHi(vbox,xdir,1),ydir,1); - if (vhi.x == dhi.x && vhi.y == dhi.y) { - vel(vhi.x+1,vhi.y+1,0,icomp) = vel(vhi.x+1,vhi.y,0,icomp) - + vel(vhi.x,vhi.y+1,0,icomp) - vel(vhi.x,vhi.y,0,icomp); - } else if (vhi.x == dhi.x || myhi(vhi.x,vhi.y+1,0) == BndryData::covered) { - int offset = AMREX_SPACEDIM * oxhi; - mllinop_apply_bc_x(Orientation::high, bx, blen.x, - vel, mxhi, bct[offset+icomp], bcl[offset+icomp], - bcvalxhi, maxorder, dxinv[xdir], inhomog, icomp); - } else { - int offset = AMREX_SPACEDIM * oyhi; - mllinop_apply_bc_y(Orientation::high, bx, blen.y, - vel, myhi, bct[offset+icomp], bcl[offset+icomp], - bcvalyhi, maxorder, dxinv[ydir], inhomog, icomp); + } else if (icorner == 3) { // xhi & yhi + int const i = vhi.x+1; + int const j = vhi.y+1; + if (mxhi(i,j,k) != BndryData::covered && (dhi.x != vhi.x || dhi.y != vhi.y)) { + bool x_interior = myhi(i-1,j ,k) == BndryData::covered; + bool x_exterior = myhi(i-1,j ,k) == BndryData::not_covered; + bool y_interior = mxhi(i ,j-1,k) == BndryData::covered; + bool y_exterior = mxhi(i ,j-1,k) == BndryData::not_covered; + if ((x_interior && y_interior) || (x_exterior && y_exterior)) { + for (int icomp = 0; icomp < AMREX_SPACEDIM; ++icomp) { + mllinop_apply_bc_x(Orientation::high, i,j,k, blen.x, vel, mxhi, + bct(Orientation::xhi(), icomp), + bcl(Orientation::xhi(), icomp), + bcvalxhi, maxorder, dxinv[0], inhomog, icomp); + Real tmp = vel(i,j,k,icomp); + mllinop_apply_bc_y(Orientation::high, i,j,k, blen.y, vel, myhi, + bct(Orientation::yhi(), icomp), + bcl(Orientation::yhi(), icomp), + bcvalyhi, maxorder, dxinv[1], inhomog, icomp); + vel(i,j,k,icomp) = 0.5_rt*(tmp+vel(i,j,k,icomp)); + } + } else if (x_interior || dhi.x == vhi.x) { + for (int icomp = 0; icomp < AMREX_SPACEDIM; ++icomp) { + mllinop_apply_bc_x(Orientation::high, i,j,k, blen.x, vel, mxhi, + bct(Orientation::xhi(), icomp), + bcl(Orientation::xhi(), icomp), + bcvalxhi, maxorder, dxinv[0], inhomog, icomp); + } + } else if (y_interior || dhi.y == vhi.y) { + for (int icomp = 0; icomp < AMREX_SPACEDIM; ++icomp) { + 
mllinop_apply_bc_y(Orientation::high, i,j,k, blen.y, vel, myhi, + bct(Orientation::yhi(), icomp), + bcl(Orientation::yhi(), icomp), + bcvalyhi, maxorder, dxinv[1], inhomog, icomp); } } - break; - } - default: {} } } } @@ -137,11 +195,12 @@ void mltensor_cross_terms_fx (Box const& box, Array4 const& fx, const auto hi = amrex::ubound(box); constexpr Real twoThirds = Real(2./3.); + int k = 0; for (int j = lo.y; j <= hi.y; ++j) { AMREX_PRAGMA_SIMD for (int i = lo.x; i <= hi.x; ++i) { - Real dudy = (vel(i,j+1,0,0)+vel(i-1,j+1,0,0)-vel(i,j-1,0,0)-vel(i-1,j-1,0,0))*(Real(0.25)*dyi); - Real dvdy = (vel(i,j+1,0,1)+vel(i-1,j+1,0,1)-vel(i,j-1,0,1)-vel(i-1,j-1,0,1))*(Real(0.25)*dyi); + Real dudy = mltensor_dy_on_xface(i,j,k,0,vel,dyi); + Real dvdy = mltensor_dy_on_xface(i,j,k,1,vel,dyi); Real divu = dvdy; Real xif = kapx(i,j,0); Real mun = Real(0.75)*(etax(i,j,0,0)-xif); // restore the original eta @@ -164,11 +223,80 @@ void mltensor_cross_terms_fy (Box const& box, Array4 const& fy, const auto hi = amrex::ubound(box); constexpr Real twoThirds = Real(2./3.); + int k = 0; for (int j = lo.y; j <= hi.y; ++j) { AMREX_PRAGMA_SIMD for (int i = lo.x; i <= hi.x; ++i) { - Real dudx = (vel(i+1,j,0,0)+vel(i+1,j-1,0,0)-vel(i-1,j,0,0)-vel(i-1,j-1,0,0))*(Real(0.25)*dxi); - Real dvdx = (vel(i+1,j,0,1)+vel(i+1,j-1,0,1)-vel(i-1,j,0,1)-vel(i-1,j-1,0,1))*(Real(0.25)*dxi); + Real dudx = mltensor_dx_on_yface(i,j,k,0,vel,dxi); + Real dvdx = mltensor_dx_on_yface(i,j,k,1,vel,dxi); + Real divu = dudx; + Real xif = kapy(i,j,0); + Real mun = Real(0.75)*(etay(i,j,0,1)-xif); // restore the original eta + Real mut = etay(i,j,0,0); + fy(i,j,0,0) = -mut*dvdx; + fy(i,j,0,1) = -mun*(-twoThirds*divu) - xif*divu; + } + } +} + +AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE +void mltensor_cross_terms_fx (Box const& box, Array4 const& fx, + Array4 const& vel, + Array4 const& etax, + Array4 const& kapx, + GpuArray const& dxinv, + Array4 const& bvxlo, + Array4 const& bvxhi, + Array2D const& bct, + Dim3 const& dlo, Dim3 const& dhi) noexcept +{ + const Real dyi = dxinv[1]; + const auto lo = amrex::lbound(box); + const auto hi = amrex::ubound(box); + constexpr Real twoThirds = Real(2./3.); + + // Three BC types: reflect odd, neumann, and dirichlet + + int k = 0; + for (int j = lo.y; j <= hi.y; ++j) { + for (int i = lo.x; i <= hi.x; ++i) { + Real dudy = mltensor_dy_on_xface(i,j,k,0,vel,dyi,bvxlo,bvxhi,bct,dlo,dhi); + Real dvdy = mltensor_dy_on_xface(i,j,k,1,vel,dyi,bvxlo,bvxhi,bct,dlo,dhi); + Real divu = dvdy; + Real xif = kapx(i,j,0); + Real mun = Real(0.75)*(etax(i,j,0,0)-xif); // restore the original eta + Real mut = etax(i,j,0,1); + fx(i,j,0,0) = -mun*(-twoThirds*divu) - xif*divu; + fx(i,j,0,1) = -mut*dudy; + } + } +} + +AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE +void mltensor_cross_terms_fy (Box const& box, Array4 const& fy, + Array4 const& vel, + Array4 const& etay, + Array4 const& kapy, + GpuArray const& dxinv, + Array4 const& bvylo, + Array4 const& bvyhi, + Array2D const& bct, + Dim3 const& dlo, Dim3 const& dhi) noexcept +{ + const Real dxi = dxinv[0]; + const auto lo = amrex::lbound(box); + const auto hi = amrex::ubound(box); + constexpr Real twoThirds = Real(2./3.); + + int k = 0; + for (int j = lo.y; j <= hi.y; ++j) { + for (int i = lo.x; i <= hi.x; ++i) { + Real dudx = mltensor_dx_on_yface(i,j,k,0,vel,dxi,bvylo,bvyhi,bct,dlo,dhi); + Real dvdx = mltensor_dx_on_yface(i,j,k,1,vel,dxi,bvylo,bvyhi,bct,dlo,dhi); Real divu = dudx; Real xif = kapy(i,j,0); Real mun = Real(0.75)*(etay(i,j,0,1)-xif); // restore the original eta @@ -241,13 +369,14 @@ 
void mltensor_vel_grads_fx (Box const& box, Array4 const& fx, const auto lo = amrex::lbound(box); const auto hi = amrex::ubound(box); + int k = 0; for (int j = lo.y; j <= hi.y; ++j) { AMREX_PRAGMA_SIMD for (int i = lo.x; i <= hi.x; ++i) { Real dudx = (vel(i,j,0,0) - vel(i-1,j,0,0))*dxi; Real dvdx = (vel(i,j,0,1) - vel(i-1,j,0,1))*dxi; - Real dudy = (vel(i,j+1,0,0)+vel(i-1,j+1,0,0)-vel(i,j-1,0,0)-vel(i-1,j-1,0,0))*(Real(0.25)*dyi); - Real dvdy = (vel(i,j+1,0,1)+vel(i-1,j+1,0,1)-vel(i,j-1,0,1)-vel(i-1,j-1,0,1))*(Real(0.25)*dyi); + Real dudy = mltensor_dy_on_xface(i,j,k,0,vel,dyi); + Real dvdy = mltensor_dy_on_xface(i,j,k,1,vel,dyi); fx(i,j,0,0) = dudx; fx(i,j,0,1) = dvdx; fx(i,j,0,2) = dudy; @@ -266,11 +395,74 @@ void mltensor_vel_grads_fy (Box const& box, Array4 const& fy, const auto lo = amrex::lbound(box); const auto hi = amrex::ubound(box); + int k = 0; for (int j = lo.y; j <= hi.y; ++j) { AMREX_PRAGMA_SIMD for (int i = lo.x; i <= hi.x; ++i) { - Real dudx = (vel(i+1,j,0,0)+vel(i+1,j-1,0,0)-vel(i-1,j,0,0)-vel(i-1,j-1,0,0))*(Real(0.25)*dxi); - Real dvdx = (vel(i+1,j,0,1)+vel(i+1,j-1,0,1)-vel(i-1,j,0,1)-vel(i-1,j-1,0,1))*(Real(0.25)*dxi); + Real dudx = mltensor_dx_on_yface(i,j,k,0,vel,dxi); + Real dvdx = mltensor_dx_on_yface(i,j,k,1,vel,dxi); + Real dudy = (vel(i,j,0,0) - vel(i,j-1,0,0))*dyi; + Real dvdy = (vel(i,j,0,1) - vel(i,j-1,0,1))*dyi; + fy(i,j,0,0) = dudx; + fy(i,j,0,1) = dvdx; + fy(i,j,0,2) = dudy; + fy(i,j,0,3) = dvdy; + } + } +} + +AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE +void mltensor_vel_grads_fx (Box const& box, Array4 const& fx, + Array4 const& vel, + GpuArray const& dxinv, + Array4 const& bvxlo, + Array4 const& bvxhi, + Array2D const& bct, + Dim3 const& dlo, Dim3 const& dhi) noexcept +{ + const Real dxi = dxinv[0]; + const Real dyi = dxinv[1]; + const auto lo = amrex::lbound(box); + const auto hi = amrex::ubound(box); + + int k = 0; + for (int j = lo.y; j <= hi.y; ++j) { + for (int i = lo.x; i <= hi.x; ++i) { + Real dudx = (vel(i,j,0,0) - vel(i-1,j,0,0))*dxi; + Real dvdx = (vel(i,j,0,1) - vel(i-1,j,0,1))*dxi; + Real dudy = mltensor_dy_on_xface(i,j,k,0,vel,dyi,bvxlo,bvxhi,bct,dlo,dhi); + Real dvdy = mltensor_dy_on_xface(i,j,k,1,vel,dyi,bvxlo,bvxhi,bct,dlo,dhi); + fx(i,j,0,0) = dudx; + fx(i,j,0,1) = dvdx; + fx(i,j,0,2) = dudy; + fx(i,j,0,3) = dvdy; + } + } +} + +AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE +void mltensor_vel_grads_fy (Box const& box, Array4 const& fy, + Array4 const& vel, + GpuArray const& dxinv, + Array4 const& bvylo, + Array4 const& bvyhi, + Array2D const& bct, + Dim3 const& dlo, Dim3 const& dhi) noexcept +{ + const Real dxi = dxinv[0]; + const Real dyi = dxinv[1]; + const auto lo = amrex::lbound(box); + const auto hi = amrex::ubound(box); + + int k = 0; + for (int j = lo.y; j <= hi.y; ++j) { + for (int i = lo.x; i <= hi.x; ++i) { + Real dudx = mltensor_dx_on_yface(i,j,k,0,vel,dxi,bvylo,bvyhi,bct,dlo,dhi); + Real dvdx = mltensor_dx_on_yface(i,j,k,1,vel,dxi,bvylo,bvyhi,bct,dlo,dhi); Real dudy = (vel(i,j,0,0) - vel(i,j-1,0,0))*dyi; Real dvdy = (vel(i,j,0,1) - vel(i,j-1,0,1))*dyi; fy(i,j,0,0) = dudx; diff --git a/Src/LinearSolvers/MLMG/AMReX_MLTensor_3D_K.H b/Src/LinearSolvers/MLMG/AMReX_MLTensor_3D_K.H index a4a4c7df9ef..a5de05a385e 100644 --- a/Src/LinearSolvers/MLMG/AMReX_MLTensor_3D_K.H +++ b/Src/LinearSolvers/MLMG/AMReX_MLTensor_3D_K.H @@ -6,6 +6,643 @@ namespace amrex { +AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE +void mltensor_fill_edges_xlo_ylo (int const i, int const j, int const k, Dim3 const& blen, + Array4 const& vel, + Array4 const& mxlo, + Array4 
const& mylo, + Array4 const& bcvalxlo, + Array4 const& bcvalylo, + Array2D const& bct, + Array2D const& bcl, + int inhomog, int maxorder, + GpuArray const& dxinv, + bool xlo_domain, bool ylo_domain) noexcept +{ + if (mxlo(i,j,k) != BndryData::covered && (!xlo_domain || !ylo_domain)) { + bool x_interior = mylo(i+1,j ,k) == BndryData::covered; + bool x_exterior = mylo(i+1,j ,k) == BndryData::not_covered; + bool y_interior = mxlo(i ,j+1,k) == BndryData::covered; + bool y_exterior = mxlo(i ,j+1,k) == BndryData::not_covered; + if ((x_interior && y_interior) || (x_exterior && y_exterior)) { + for (int icomp = 0; icomp < AMREX_SPACEDIM; ++icomp) { + mllinop_apply_bc_x(Orientation::low, i,j,k, blen.x, vel, mxlo, + bct(Orientation::xlo(), icomp), + bcl(Orientation::xlo(), icomp), + bcvalxlo, maxorder, dxinv[0], inhomog, icomp); + Real tmp = vel(i,j,k,icomp); + mllinop_apply_bc_y(Orientation::low, i,j,k, blen.y, vel, mylo, + bct(Orientation::ylo(), icomp), + bcl(Orientation::ylo(), icomp), + bcvalylo, maxorder, dxinv[1], inhomog, icomp); + vel(i,j,k,icomp) = 0.5_rt*(tmp+vel(i,j,k,icomp)); + } + } else if (x_interior || xlo_domain) { + for (int icomp = 0; icomp < AMREX_SPACEDIM; ++icomp) { + mllinop_apply_bc_x(Orientation::low, i,j,k, blen.x, vel, mxlo, + bct(Orientation::xlo(), icomp), + bcl(Orientation::xlo(), icomp), + bcvalxlo, maxorder, dxinv[0], inhomog, icomp); + } + } else if (y_interior || ylo_domain) { + for (int icomp = 0; icomp < AMREX_SPACEDIM; ++icomp) { + mllinop_apply_bc_y(Orientation::low, i,j,k, blen.y, vel, mylo, + bct(Orientation::ylo(), icomp), + bcl(Orientation::ylo(), icomp), + bcvalylo, maxorder, dxinv[1], inhomog, icomp); + } + } + } +} + +AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE +void mltensor_fill_edges_xhi_ylo (int const i, int const j, int const k, Dim3 const& blen, + Array4 const& vel, + Array4 const& mxhi, + Array4 const& mylo, + Array4 const& bcvalxhi, + Array4 const& bcvalylo, + Array2D const& bct, + Array2D const& bcl, + int inhomog, int maxorder, + GpuArray const& dxinv, + bool xhi_domain, bool ylo_domain) noexcept +{ + if (mxhi(i,j,k) != BndryData::covered && (!xhi_domain || !ylo_domain)) { + bool x_interior = mylo(i-1,j ,k) == BndryData::covered; + bool x_exterior = mylo(i-1,j ,k) == BndryData::not_covered; + bool y_interior = mxhi(i ,j+1,k) == BndryData::covered; + bool y_exterior = mxhi(i ,j+1,k) == BndryData::not_covered; + if ((x_interior && y_interior) || (x_exterior && y_exterior)) { + for (int icomp = 0; icomp < AMREX_SPACEDIM; ++icomp) { + mllinop_apply_bc_x(Orientation::high, i,j,k, blen.x, vel, mxhi, + bct(Orientation::xhi(), icomp), + bcl(Orientation::xhi(), icomp), + bcvalxhi, maxorder, dxinv[0], inhomog, icomp); + Real tmp = vel(i,j,k,icomp); + mllinop_apply_bc_y(Orientation::low, i,j,k, blen.y, vel, mylo, + bct(Orientation::ylo(), icomp), + bcl(Orientation::ylo(), icomp), + bcvalylo, maxorder, dxinv[1], inhomog, icomp); + vel(i,j,k,icomp) = 0.5_rt*(tmp+vel(i,j,k,icomp)); + } + } else if (x_interior || xhi_domain) { + for (int icomp = 0; icomp < AMREX_SPACEDIM; ++icomp) { + mllinop_apply_bc_x(Orientation::high, i,j,k, blen.x, vel, mxhi, + bct(Orientation::xhi(), icomp), + bcl(Orientation::xhi(), icomp), + bcvalxhi, maxorder, dxinv[0], inhomog, icomp); + } + } else if (y_interior || ylo_domain) { + for (int icomp = 0; icomp < AMREX_SPACEDIM; ++icomp) { + mllinop_apply_bc_y(Orientation::low, i,j,k, blen.y, vel, mylo, + bct(Orientation::ylo(), icomp), + bcl(Orientation::ylo(), icomp), + bcvalylo, maxorder, dxinv[1], inhomog, icomp); + } + } + } +} + 
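
All twelve mltensor_fill_edges_* helpers in this header follow the same decision rule, specialized per edge: skip the ghost cell when its own mask says it is covered by valid data; fill it from one face's one-sided mllinop_apply_bc_* extrapolation when only that direction qualifies (its mask probe one cell inward is interior, or the valid box touches that domain face); and when both directions qualify equally (both probes interior, or both exterior), apply both extrapolations independently and average them. The sketch below is a minimal, self-contained restatement of that rule for a single ghost value; fill_from_x, fill_from_y, and Mask are hypothetical stand-ins, not AMReX API.

// A standalone sketch (hypothetical names, not AMReX code) of the edge-fill
// rule: one ghost value g at the meeting of an x face and a y face.
#include <cstdio>

enum class Mask { covered, not_covered };

// Stand-ins for the one-sided mllinop_apply_bc_x/_y extrapolations.
double fill_from_x (double g) { return 2.0 - g; } // pretend Dirichlet u = 1 on the x face
double fill_from_y (double g) { return g; }       // pretend homogeneous Neumann on the y face

// x_probe/y_probe play the role of the mask probes one cell inward;
// x_domain/y_domain say whether the valid box touches that domain face.
double fill_edge_ghost (double g, Mask x_probe, Mask y_probe,
                        bool x_domain, bool y_domain)
{
    bool x_interior = (x_probe == Mask::covered);
    bool x_exterior = (x_probe == Mask::not_covered);
    bool y_interior = (y_probe == Mask::covered);
    bool y_exterior = (y_probe == Mask::not_covered);
    if ((x_interior && y_interior) || (x_exterior && y_exterior)) {
        // Both one-sided fills are equally valid; averaging mirrors
        // vel(i,j,k,icomp) = 0.5_rt*(tmp+vel(i,j,k,icomp)) above and makes
        // the result independent of which face is handled first.
        return 0.5*(fill_from_x(g) + fill_from_y(g));
    } else if (x_interior || x_domain) {
        return fill_from_x(g);    // only the x-face BC applies
    } else if (y_interior || y_domain) {
        return fill_from_y(g);    // only the y-face BC applies
    }
    return g;                     // leave the ghost cell untouched
}

int main ()
{
    // Both probes exterior (e.g. a box corner away from the domain
    // boundary): the two one-sided fills are averaged.
    std::printf("%g\n", fill_edge_ghost(0.5, Mask::not_covered, Mask::not_covered,
                                        false, false));
}

Note also the gate at the top of each helper: nothing is done when the ghost cell is already covered by valid data, and edges where the box touches both domain faces at once are excluded here.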
+AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE +void mltensor_fill_edges_xlo_yhi (int const i, int const j, int const k, Dim3 const& blen, + Array4 const& vel, + Array4 const& mxlo, + Array4 const& myhi, + Array4 const& bcvalxlo, + Array4 const& bcvalyhi, + Array2D const& bct, + Array2D const& bcl, + int inhomog, int maxorder, + GpuArray const& dxinv, + bool xlo_domain, bool yhi_domain) noexcept +{ + if (mxlo(i,j,k) != BndryData::covered && (!xlo_domain || !yhi_domain)) { + bool x_interior = myhi(i+1,j ,k) == BndryData::covered; + bool x_exterior = myhi(i+1,j ,k) == BndryData::not_covered; + bool y_interior = mxlo(i ,j-1,k) == BndryData::covered; + bool y_exterior = mxlo(i ,j-1,k) == BndryData::not_covered; + if ((x_interior && y_interior) || (x_exterior && y_exterior)) { + for (int icomp = 0; icomp < AMREX_SPACEDIM; ++icomp) { + mllinop_apply_bc_x(Orientation::low, i,j,k, blen.x, vel, mxlo, + bct(Orientation::xlo(), icomp), + bcl(Orientation::xlo(), icomp), + bcvalxlo, maxorder, dxinv[0], inhomog, icomp); + Real tmp = vel(i,j,k,icomp); + mllinop_apply_bc_y(Orientation::high, i,j,k, blen.y, vel, myhi, + bct(Orientation::yhi(), icomp), + bcl(Orientation::yhi(), icomp), + bcvalyhi, maxorder, dxinv[1], inhomog, icomp); + vel(i,j,k,icomp) = 0.5_rt*(tmp+vel(i,j,k,icomp)); + } + } else if (x_interior || xlo_domain) { + for (int icomp = 0; icomp < AMREX_SPACEDIM; ++icomp) { + mllinop_apply_bc_x(Orientation::low, i,j,k, blen.x, vel, mxlo, + bct(Orientation::xlo(), icomp), + bcl(Orientation::xlo(), icomp), + bcvalxlo, maxorder, dxinv[0], inhomog, icomp); + } + } else if (y_interior || yhi_domain) { + for (int icomp = 0; icomp < AMREX_SPACEDIM; ++icomp) { + mllinop_apply_bc_y(Orientation::high, i,j,k, blen.y, vel, myhi, + bct(Orientation::yhi(), icomp), + bcl(Orientation::yhi(), icomp), + bcvalyhi, maxorder, dxinv[1], inhomog, icomp); + } + } + } +} + +AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE +void mltensor_fill_edges_xhi_yhi (int const i, int const j, int const k, Dim3 const& blen, + Array4 const& vel, + Array4 const& mxhi, + Array4 const& myhi, + Array4 const& bcvalxhi, + Array4 const& bcvalyhi, + Array2D const& bct, + Array2D const& bcl, + int inhomog, int maxorder, + GpuArray const& dxinv, + bool xhi_domain, bool yhi_domain) noexcept +{ + if (mxhi(i,j,k) != BndryData::covered && (!xhi_domain || !yhi_domain)) { + bool x_interior = myhi(i-1,j ,k) == BndryData::covered; + bool x_exterior = myhi(i-1,j ,k) == BndryData::not_covered; + bool y_interior = mxhi(i ,j-1,k) == BndryData::covered; + bool y_exterior = mxhi(i ,j-1,k) == BndryData::not_covered; + if ((x_interior && y_interior) || (x_exterior && y_exterior)) { + for (int icomp = 0; icomp < AMREX_SPACEDIM; ++icomp) { + mllinop_apply_bc_x(Orientation::high, i,j,k, blen.x, vel, mxhi, + bct(Orientation::xhi(), icomp), + bcl(Orientation::xhi(), icomp), + bcvalxhi, maxorder, dxinv[0], inhomog, icomp); + Real tmp = vel(i,j,k,icomp); + mllinop_apply_bc_y(Orientation::high, i,j,k, blen.y, vel, myhi, + bct(Orientation::yhi(), icomp), + bcl(Orientation::yhi(), icomp), + bcvalyhi, maxorder, dxinv[1], inhomog, icomp); + vel(i,j,k,icomp) = 0.5_rt*(tmp+vel(i,j,k,icomp)); + } + } else if (x_interior || xhi_domain) { + for (int icomp = 0; icomp < AMREX_SPACEDIM; ++icomp) { + mllinop_apply_bc_x(Orientation::high, i,j,k, blen.x, vel, mxhi, + bct(Orientation::xhi(), icomp), + bcl(Orientation::xhi(), icomp), + bcvalxhi, maxorder, dxinv[0], inhomog, icomp); + } + } else if (y_interior || yhi_domain) { + for (int icomp = 0; icomp < AMREX_SPACEDIM; ++icomp) { + 
mllinop_apply_bc_y(Orientation::high, i,j,k, blen.y, vel, myhi, + bct(Orientation::yhi(), icomp), + bcl(Orientation::yhi(), icomp), + bcvalyhi, maxorder, dxinv[1], inhomog, icomp); + } + } + } +} + +AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE +void mltensor_fill_edges_xlo_zlo (int const i, int const j, int const k, Dim3 const& blen, + Array4 const& vel, + Array4 const& mxlo, + Array4 const& mzlo, + Array4 const& bcvalxlo, + Array4 const& bcvalzlo, + Array2D const& bct, + Array2D const& bcl, + int inhomog, int maxorder, + GpuArray const& dxinv, + bool xlo_domain, bool zlo_domain) noexcept +{ + if (mxlo(i,j,k) != BndryData::covered && (!xlo_domain || !zlo_domain)) { + bool x_interior = mzlo(i+1,j,k ) == BndryData::covered; + bool x_exterior = mzlo(i+1,j,k ) == BndryData::not_covered; + bool z_interior = mxlo(i ,j,k+1) == BndryData::covered; + bool z_exterior = mxlo(i ,j,k+1) == BndryData::not_covered; + if ((x_interior && z_interior) || (x_exterior && z_exterior)) { + for (int icomp = 0; icomp < AMREX_SPACEDIM; ++icomp) { + mllinop_apply_bc_x(Orientation::low, i,j,k, blen.x, vel, mxlo, + bct(Orientation::xlo(), icomp), + bcl(Orientation::xlo(), icomp), + bcvalxlo, maxorder, dxinv[0], inhomog, icomp); + Real tmp = vel(i,j,k,icomp); + mllinop_apply_bc_z(Orientation::low, i,j,k, blen.z, vel, mzlo, + bct(Orientation::zlo(), icomp), + bcl(Orientation::zlo(), icomp), + bcvalzlo, maxorder, dxinv[2], inhomog, icomp); + vel(i,j,k,icomp) = 0.5_rt*(tmp+vel(i,j,k,icomp)); + } + } else if (x_interior || xlo_domain) { + for (int icomp = 0; icomp < AMREX_SPACEDIM; ++icomp) { + mllinop_apply_bc_x(Orientation::low, i,j,k, blen.x, vel, mxlo, + bct(Orientation::xlo(), icomp), + bcl(Orientation::xlo(), icomp), + bcvalxlo, maxorder, dxinv[0], inhomog, icomp); + } + } else if (z_interior || zlo_domain) { + for (int icomp = 0; icomp < AMREX_SPACEDIM; ++icomp) { + mllinop_apply_bc_z(Orientation::low, i,j,k, blen.z, vel, mzlo, + bct(Orientation::zlo(), icomp), + bcl(Orientation::zlo(), icomp), + bcvalzlo, maxorder, dxinv[2], inhomog, icomp); + } + } + } +} + +AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE +void mltensor_fill_edges_xhi_zlo (int const i, int const j, int const k, Dim3 const& blen, + Array4 const& vel, + Array4 const& mxhi, + Array4 const& mzlo, + Array4 const& bcvalxhi, + Array4 const& bcvalzlo, + Array2D const& bct, + Array2D const& bcl, + int inhomog, int maxorder, + GpuArray const& dxinv, + bool xhi_domain, bool zlo_domain) noexcept +{ + if (mxhi(i,j,k) != BndryData::covered && (!xhi_domain || !zlo_domain)) { + bool x_interior = mzlo(i-1,j,k ) == BndryData::covered; + bool x_exterior = mzlo(i-1,j,k ) == BndryData::not_covered; + bool z_interior = mxhi(i ,j,k+1) == BndryData::covered; + bool z_exterior = mxhi(i ,j,k+1) == BndryData::not_covered; + if ((x_interior && z_interior) || (x_exterior && z_exterior)) { + for (int icomp = 0; icomp < AMREX_SPACEDIM; ++icomp) { + mllinop_apply_bc_x(Orientation::high, i,j,k, blen.x, vel, mxhi, + bct(Orientation::xhi(), icomp), + bcl(Orientation::xhi(), icomp), + bcvalxhi, maxorder, dxinv[0], inhomog, icomp); + Real tmp = vel(i,j,k,icomp); + mllinop_apply_bc_z(Orientation::low, i,j,k, blen.z, vel, mzlo, + bct(Orientation::zlo(), icomp), + bcl(Orientation::zlo(), icomp), + bcvalzlo, maxorder, dxinv[2], inhomog, icomp); + vel(i,j,k,icomp) = 0.5_rt*(tmp+vel(i,j,k,icomp)); + } + } else if (x_interior || xhi_domain) { + for (int icomp = 0; icomp < AMREX_SPACEDIM; ++icomp) { + mllinop_apply_bc_x(Orientation::high, i,j,k, blen.x, vel, mxhi, + bct(Orientation::xhi(), icomp), + 
bcl(Orientation::xhi(), icomp), + bcvalxhi, maxorder, dxinv[0], inhomog, icomp); + } + } else if (z_interior || zlo_domain) { + for (int icomp = 0; icomp < AMREX_SPACEDIM; ++icomp) { + mllinop_apply_bc_z(Orientation::low, i,j,k, blen.z, vel, mzlo, + bct(Orientation::zlo(), icomp), + bcl(Orientation::zlo(), icomp), + bcvalzlo, maxorder, dxinv[2], inhomog, icomp); + } + } + } +} + +AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE +void mltensor_fill_edges_xlo_zhi (int const i, int const j, int const k, Dim3 const& blen, + Array4 const& vel, + Array4 const& mxlo, + Array4 const& mzhi, + Array4 const& bcvalxlo, + Array4 const& bcvalzhi, + Array2D const& bct, + Array2D const& bcl, + int inhomog, int maxorder, + GpuArray const& dxinv, + bool xlo_domain, bool zhi_domain) noexcept +{ + if (mxlo(i,j,k) != BndryData::covered && (!xlo_domain || !zhi_domain)) { + bool x_interior = mzhi(i+1,j,k ) == BndryData::covered; + bool x_exterior = mzhi(i+1,j,k ) == BndryData::not_covered; + bool z_interior = mxlo(i ,j,k-1) == BndryData::covered; + bool z_exterior = mxlo(i ,j,k-1) == BndryData::not_covered; + if ((x_interior && z_interior) || (x_exterior && z_exterior)) { + for (int icomp = 0; icomp < AMREX_SPACEDIM; ++icomp) { + mllinop_apply_bc_x(Orientation::low, i,j,k, blen.x, vel, mxlo, + bct(Orientation::xlo(), icomp), + bcl(Orientation::xlo(), icomp), + bcvalxlo, maxorder, dxinv[0], inhomog, icomp); + Real tmp = vel(i,j,k,icomp); + mllinop_apply_bc_z(Orientation::high, i,j,k, blen.z, vel, mzhi, + bct(Orientation::zhi(), icomp), + bcl(Orientation::zhi(), icomp), + bcvalzhi, maxorder, dxinv[2], inhomog, icomp); + vel(i,j,k,icomp) = 0.5_rt*(tmp+vel(i,j,k,icomp)); + } + } else if (x_interior || xlo_domain) { + for (int icomp = 0; icomp < AMREX_SPACEDIM; ++icomp) { + mllinop_apply_bc_x(Orientation::low, i,j,k, blen.x, vel, mxlo, + bct(Orientation::xlo(), icomp), + bcl(Orientation::xlo(), icomp), + bcvalxlo, maxorder, dxinv[0], inhomog, icomp); + } + } else if (z_interior || zhi_domain) { + for (int icomp = 0; icomp < AMREX_SPACEDIM; ++icomp) { + mllinop_apply_bc_z(Orientation::high, i,j,k, blen.z, vel, mzhi, + bct(Orientation::zhi(), icomp), + bcl(Orientation::zhi(), icomp), + bcvalzhi, maxorder, dxinv[2], inhomog, icomp); + } + } + } +} + +AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE +void mltensor_fill_edges_xhi_zhi (int const i, int const j, int const k, Dim3 const& blen, + Array4 const& vel, + Array4 const& mxhi, + Array4 const& mzhi, + Array4 const& bcvalxhi, + Array4 const& bcvalzhi, + Array2D const& bct, + Array2D const& bcl, + int inhomog, int maxorder, + GpuArray const& dxinv, + bool xhi_domain, bool zhi_domain) noexcept +{ + if (mxhi(i,j,k) != BndryData::covered && (!xhi_domain || !zhi_domain)) { + bool x_interior = mzhi(i-1,j,k ) == BndryData::covered; + bool x_exterior = mzhi(i-1,j,k ) == BndryData::not_covered; + bool z_interior = mxhi(i ,j,k-1) == BndryData::covered; + bool z_exterior = mxhi(i ,j,k-1) == BndryData::not_covered; + if ((x_interior && z_interior) || (x_exterior && z_exterior)) { + for (int icomp = 0; icomp < AMREX_SPACEDIM; ++icomp) { + mllinop_apply_bc_x(Orientation::high, i,j,k, blen.x, vel, mxhi, + bct(Orientation::xhi(), icomp), + bcl(Orientation::xhi(), icomp), + bcvalxhi, maxorder, dxinv[0], inhomog, icomp); + Real tmp = vel(i,j,k,icomp); + mllinop_apply_bc_z(Orientation::high, i,j,k, blen.z, vel, mzhi, + bct(Orientation::zhi(), icomp), + bcl(Orientation::zhi(), icomp), + bcvalzhi, maxorder, dxinv[2], inhomog, icomp); + vel(i,j,k,icomp) = 0.5_rt*(tmp+vel(i,j,k,icomp)); + } + } else if 
(x_interior || xhi_domain) { + for (int icomp = 0; icomp < AMREX_SPACEDIM; ++icomp) { + mllinop_apply_bc_x(Orientation::high, i,j,k, blen.x, vel, mxhi, + bct(Orientation::xhi(), icomp), + bcl(Orientation::xhi(), icomp), + bcvalxhi, maxorder, dxinv[0], inhomog, icomp); + } + } else if (z_interior || zhi_domain) { + for (int icomp = 0; icomp < AMREX_SPACEDIM; ++icomp) { + mllinop_apply_bc_z(Orientation::high, i,j,k, blen.z, vel, mzhi, + bct(Orientation::zhi(), icomp), + bcl(Orientation::zhi(), icomp), + bcvalzhi, maxorder, dxinv[2], inhomog, icomp); + } + } + } +} + +AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE +void mltensor_fill_edges_ylo_zlo (int const i, int const j, int const k, Dim3 const& blen, + Array4 const& vel, + Array4 const& mylo, + Array4 const& mzlo, + Array4 const& bcvalylo, + Array4 const& bcvalzlo, + Array2D const& bct, + Array2D const& bcl, + int inhomog, int maxorder, + GpuArray const& dxinv, + bool ylo_domain, bool zlo_domain) noexcept +{ + if (mylo(i,j,k) != BndryData::covered && (!ylo_domain || !zlo_domain)) { + bool y_interior = mzlo(i,j+1,k ) == BndryData::covered; + bool y_exterior = mzlo(i,j+1,k ) == BndryData::not_covered; + bool z_interior = mylo(i,j ,k+1) == BndryData::covered; + bool z_exterior = mylo(i,j ,k+1) == BndryData::not_covered; + if ((y_interior && z_interior) || (y_exterior && z_exterior)) { + for (int icomp = 0; icomp < AMREX_SPACEDIM; ++icomp) { + mllinop_apply_bc_y(Orientation::low, i,j,k, blen.y, vel, mylo, + bct(Orientation::ylo(), icomp), + bcl(Orientation::ylo(), icomp), + bcvalylo, maxorder, dxinv[1], inhomog, icomp); + Real tmp = vel(i,j,k,icomp); + mllinop_apply_bc_z(Orientation::low, i,j,k, blen.z, vel, mzlo, + bct(Orientation::zlo(), icomp), + bcl(Orientation::zlo(), icomp), + bcvalzlo, maxorder, dxinv[2], inhomog, icomp); + vel(i,j,k,icomp) = 0.5_rt*(tmp+vel(i,j,k,icomp)); + } + } else if (y_interior || ylo_domain) { + for (int icomp = 0; icomp < AMREX_SPACEDIM; ++icomp) { + mllinop_apply_bc_y(Orientation::low, i,j,k, blen.y, vel, mylo, + bct(Orientation::ylo(), icomp), + bcl(Orientation::ylo(), icomp), + bcvalylo, maxorder, dxinv[1], inhomog, icomp); + } + } else if (z_interior || zlo_domain) { + for (int icomp = 0; icomp < AMREX_SPACEDIM; ++icomp) { + mllinop_apply_bc_z(Orientation::low, i,j,k, blen.z, vel, mzlo, + bct(Orientation::zlo(), icomp), + bcl(Orientation::zlo(), icomp), + bcvalzlo, maxorder, dxinv[2], inhomog, icomp); + } + } + } +} + +AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE +void mltensor_fill_edges_yhi_zlo (int const i, int const j, int const k, Dim3 const& blen, + Array4 const& vel, + Array4 const& myhi, + Array4 const& mzlo, + Array4 const& bcvalyhi, + Array4 const& bcvalzlo, + Array2D const& bct, + Array2D const& bcl, + int inhomog, int maxorder, + GpuArray const& dxinv, + bool yhi_domain, bool zlo_domain) noexcept +{ + if (myhi(i,j,k) != BndryData::covered && (!yhi_domain || !zlo_domain)) { + bool y_interior = mzlo(i,j-1,k ) == BndryData::covered; + bool y_exterior = mzlo(i,j-1,k ) == BndryData::not_covered; + bool z_interior = myhi(i,j ,k+1) == BndryData::covered; + bool z_exterior = myhi(i,j ,k+1) == BndryData::not_covered; + if ((y_interior && z_interior) || (y_exterior && z_exterior)) { + for (int icomp = 0; icomp < AMREX_SPACEDIM; ++icomp) { + mllinop_apply_bc_y(Orientation::high, i,j,k, blen.y, vel, myhi, + bct(Orientation::yhi(), icomp), + bcl(Orientation::yhi(), icomp), + bcvalyhi, maxorder, dxinv[1], inhomog, icomp); + Real tmp = vel(i,j,k,icomp); + mllinop_apply_bc_z(Orientation::low, i,j,k, blen.z, vel, mzlo, + 
bct(Orientation::zlo(), icomp), + bcl(Orientation::zlo(), icomp), + bcvalzlo, maxorder, dxinv[2], inhomog, icomp); + vel(i,j,k,icomp) = 0.5_rt*(tmp+vel(i,j,k,icomp)); + } + } else if (y_interior || yhi_domain) { + for (int icomp = 0; icomp < AMREX_SPACEDIM; ++icomp) { + mllinop_apply_bc_y(Orientation::high, i,j,k, blen.y, vel, myhi, + bct(Orientation::yhi(), icomp), + bcl(Orientation::yhi(), icomp), + bcvalyhi, maxorder, dxinv[1], inhomog, icomp); + } + } else if (z_interior || zlo_domain) { + for (int icomp = 0; icomp < AMREX_SPACEDIM; ++icomp) { + mllinop_apply_bc_z(Orientation::low, i,j,k, blen.z, vel, mzlo, + bct(Orientation::zlo(), icomp), + bcl(Orientation::zlo(), icomp), + bcvalzlo, maxorder, dxinv[2], inhomog, icomp); + } + } + } +} + +AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE +void mltensor_fill_edges_ylo_zhi (int const i, int const j, int const k, Dim3 const& blen, + Array4 const& vel, + Array4 const& mylo, + Array4 const& mzhi, + Array4 const& bcvalylo, + Array4 const& bcvalzhi, + Array2D const& bct, + Array2D const& bcl, + int inhomog, int maxorder, + GpuArray const& dxinv, + bool ylo_domain, bool zhi_domain) noexcept +{ + if (mylo(i,j,k) != BndryData::covered && (!ylo_domain || !zhi_domain)) { + bool y_interior = mzhi(i,j+1,k ) == BndryData::covered; + bool y_exterior = mzhi(i,j+1,k ) == BndryData::not_covered; + bool z_interior = mylo(i,j ,k-1) == BndryData::covered; + bool z_exterior = mylo(i,j ,k-1) == BndryData::not_covered; + if ((y_interior && z_interior) || (y_exterior && z_exterior)) { + for (int icomp = 0; icomp < AMREX_SPACEDIM; ++icomp) { + mllinop_apply_bc_y(Orientation::low, i,j,k, blen.y, vel, mylo, + bct(Orientation::ylo(), icomp), + bcl(Orientation::ylo(), icomp), + bcvalylo, maxorder, dxinv[1], inhomog, icomp); + Real tmp = vel(i,j,k,icomp); + mllinop_apply_bc_z(Orientation::high, i,j,k, blen.z, vel, mzhi, + bct(Orientation::zhi(), icomp), + bcl(Orientation::zhi(), icomp), + bcvalzhi, maxorder, dxinv[2], inhomog, icomp); + vel(i,j,k,icomp) = 0.5_rt*(tmp+vel(i,j,k,icomp)); + } + } else if (y_interior || ylo_domain) { + for (int icomp = 0; icomp < AMREX_SPACEDIM; ++icomp) { + mllinop_apply_bc_y(Orientation::low, i,j,k, blen.y, vel, mylo, + bct(Orientation::ylo(), icomp), + bcl(Orientation::ylo(), icomp), + bcvalylo, maxorder, dxinv[1], inhomog, icomp); + } + } else if (z_interior || zhi_domain) { + for (int icomp = 0; icomp < AMREX_SPACEDIM; ++icomp) { + mllinop_apply_bc_z(Orientation::high, i,j,k, blen.z, vel, mzhi, + bct(Orientation::zhi(), icomp), + bcl(Orientation::zhi(), icomp), + bcvalzhi, maxorder, dxinv[2], inhomog, icomp); + } + } + } +} + +AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE +void mltensor_fill_edges_yhi_zhi (int const i, int const j, int const k, Dim3 const& blen, + Array4 const& vel, + Array4 const& myhi, + Array4 const& mzhi, + Array4 const& bcvalyhi, + Array4 const& bcvalzhi, + Array2D const& bct, + Array2D const& bcl, + int inhomog, int maxorder, + GpuArray const& dxinv, + bool yhi_domain, bool zhi_domain) noexcept +{ + if (myhi(i,j,k) != BndryData::covered && (!yhi_domain || !zhi_domain)) { + bool y_interior = mzhi(i,j-1,k ) == BndryData::covered; + bool y_exterior = mzhi(i,j-1,k ) == BndryData::not_covered; + bool z_interior = myhi(i,j ,k-1) == BndryData::covered; + bool z_exterior = myhi(i,j ,k-1) == BndryData::not_covered; + if ((y_interior && z_interior) || (y_exterior && z_exterior)) { + for (int icomp = 0; icomp < AMREX_SPACEDIM; ++icomp) { + mllinop_apply_bc_y(Orientation::high, i,j,k, blen.y, vel, myhi, + bct(Orientation::yhi(), 
icomp), + bcl(Orientation::yhi(), icomp), + bcvalyhi, maxorder, dxinv[1], inhomog, icomp); + Real tmp = vel(i,j,k,icomp); + mllinop_apply_bc_z(Orientation::high, i,j,k, blen.z, vel, mzhi, + bct(Orientation::zhi(), icomp), + bcl(Orientation::zhi(), icomp), + bcvalzhi, maxorder, dxinv[2], inhomog, icomp); + vel(i,j,k,icomp) = 0.5_rt*(tmp+vel(i,j,k,icomp)); + } + } else if (y_interior || yhi_domain) { + for (int icomp = 0; icomp < AMREX_SPACEDIM; ++icomp) { + mllinop_apply_bc_y(Orientation::high, i,j,k, blen.y, vel, myhi, + bct(Orientation::yhi(), icomp), + bcl(Orientation::yhi(), icomp), + bcvalyhi, maxorder, dxinv[1], inhomog, icomp); + } + } else if (z_interior || zhi_domain) { + for (int icomp = 0; icomp < AMREX_SPACEDIM; ++icomp) { + mllinop_apply_bc_z(Orientation::high, i,j,k, blen.z, vel, mzhi, + bct(Orientation::zhi(), icomp), + bcl(Orientation::zhi(), icomp), + bcvalzhi, maxorder, dxinv[2], inhomog, icomp); + } + } + } +} + +#ifdef AMREX_USE_EB AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE void mltensor_fill_corners (int icorner, Box const& vbox, // vbox: the valid box Array4 const& vel, @@ -21,495 +658,680 @@ void mltensor_fill_corners (int icorner, Box const& vbox, // vbox: the valid box Array4 const& bcvalxhi, Array4 const& bcvalyhi, Array4 const& bcvalzhi, - GpuArray const& bct, - GpuArray const& bcl, + Array2D const& bct, + Array2D const& bcl, int inhomog, int maxorder, - GpuArray const& dxinv, Box const& domain) noexcept + GpuArray const& dxinv, + Dim3 const& dlo, Dim3 const& dhi) noexcept { - constexpr int oxlo = 0; - constexpr int oylo = 1; - constexpr int ozlo = 2; - constexpr int oxhi = 3; - constexpr int oyhi = 4; - constexpr int ozhi = 5; - constexpr int xdir = 0; - constexpr int ydir = 1; - constexpr int zdir = 2; const auto blen = amrex::length(vbox); const auto vlo = amrex::lbound(vbox); const auto vhi = amrex::ubound(vbox); - const auto dlo = amrex::lbound(domain); - const auto dhi = amrex::ubound(domain); + bool xlo_domain = (vlo.x == dlo.x); + bool ylo_domain = (vlo.y == dlo.y); + bool zlo_domain = (vlo.z == dlo.z); + bool xhi_domain = (vhi.x == dhi.x); + bool yhi_domain = (vhi.y == dhi.y); + bool zhi_domain = (vhi.z == dhi.z); + for (int icomp = 0; icomp < AMREX_SPACEDIM; ++icomp) { switch (icorner) { case 0: { // xlo & ylo & zlo - Box bx = amrex::adjCellLo(amrex::adjCellLo(amrex::adjCellLo(vbox,xdir,1),ydir,1),zdir,1); - if (vlo.x == dlo.x && vlo.y == dlo.y && vlo.z == dlo.z) { - vel (vlo.x-1,vlo.y-1,vlo.z-1,icomp) - = vel(vlo.x-1,vlo.y ,vlo.z ,icomp) - + vel(vlo.x ,vlo.y-1,vlo.z ,icomp) - + vel(vlo.x ,vlo.y ,vlo.z-1,icomp) - - vel(vlo.x ,vlo.y ,vlo.z ,icomp) * Real(2.0); - } else if (vlo.x == dlo.x && vlo.y == dlo.y) { - vel (vlo.x-1,vlo.y-1,vlo.z-1,icomp) - = vel(vlo.x-1,vlo.y ,vlo.z-1,icomp) - + vel(vlo.x ,vlo.y-1,vlo.z-1,icomp) - - vel(vlo.x ,vlo.y ,vlo.z-1,icomp); - } else if (vlo.x == dlo.x && vlo.z == dlo.z) { - vel (vlo.x-1,vlo.y-1,vlo.z-1,icomp) - = vel(vlo.x-1,vlo.y-1,vlo.z ,icomp) - + vel(vlo.x ,vlo.y-1,vlo.z-1,icomp) - - vel(vlo.x ,vlo.y-1,vlo.z ,icomp); - } else if (vlo.y == dlo.y && vlo.z == dlo.z) { - vel (vlo.x-1,vlo.y-1,vlo.z-1,icomp) - = vel(vlo.x-1,vlo.y-1,vlo.z ,icomp) - + vel(vlo.x-1,vlo.y ,vlo.z-1,icomp) - - vel(vlo.x-1,vlo.y ,vlo.z ,icomp); - } else if (vlo.x == dlo.x) { - int offset = AMREX_SPACEDIM * oxlo; - mllinop_apply_bc_x(Orientation::low, bx, blen.x, - vel, mxlo, bct[offset+icomp], bcl[offset+icomp], - bcvalxlo, maxorder, dxinv[xdir], inhomog, icomp); - } else if (vlo.y == dlo.y) { - int offset = AMREX_SPACEDIM * oylo; - 
mllinop_apply_bc_y(Orientation::low, bx, blen.y, - vel, mylo, bct[offset+icomp], bcl[offset+icomp], - bcvalylo, maxorder, dxinv[ydir], inhomog, icomp); - } else if (vlo.z == dlo.z) { - int offset = AMREX_SPACEDIM * ozlo; - mllinop_apply_bc_z(Orientation::low, bx, blen.z, - vel, mzlo, bct[offset+icomp], bcl[offset+icomp], - bcvalzlo, maxorder, dxinv[zdir], inhomog, icomp); - } else if (mxlo(vlo.x-1,vlo.y-1,vlo.z-1) != BndryData::covered) { - if (mylo(vlo.x,vlo.y-1,vlo.z-1) == BndryData::covered) { - int offset = AMREX_SPACEDIM * oxlo; - mllinop_apply_bc_x(Orientation::low, bx, blen.x, - vel, mxlo, bct[offset+icomp], bcl[offset+icomp], - bcvalxlo, maxorder, dxinv[xdir], inhomog, icomp); - } else if (mxlo(vlo.x-1,vlo.y,vlo.z-1) == BndryData::covered) { - int offset = AMREX_SPACEDIM * oylo; - mllinop_apply_bc_y(Orientation::low, bx, blen.y, - vel, mylo, bct[offset+icomp], bcl[offset+icomp], - bcvalylo, maxorder, dxinv[ydir], inhomog, icomp); - } else { - int offset = AMREX_SPACEDIM * ozlo; - mllinop_apply_bc_z(Orientation::low, bx, blen.z, - vel, mzlo, bct[offset+icomp], bcl[offset+icomp], - bcvalzlo, maxorder, dxinv[zdir], inhomog, icomp); + int i = vlo.x-1; + int j = vlo.y-1; + int k = vlo.z-1; + if (mxlo(i,j,k) != BndryData::covered && + (!xlo_domain || !ylo_domain || !zlo_domain)) { + bool x_interior = mylo(i+1,j ,k ) == BndryData::covered; + bool x_exterior = mylo(i+1,j ,k ) == BndryData::not_covered; + bool y_interior = mxlo(i ,j+1,k ) == BndryData::covered; + bool y_exterior = mxlo(i ,j+1,k ) == BndryData::not_covered; + bool z_interior = mxlo(i ,j ,k+1) == BndryData::covered; + bool z_exterior = mxlo(i ,j ,k+1) == BndryData::not_covered; + if ((x_interior && y_interior && z_interior) || + (x_exterior && y_exterior && z_exterior)) { + mllinop_apply_bc_x(Orientation::low, i,j,k, blen.x, vel, mxlo, + bct(Orientation::xlo(), icomp), + bcl(Orientation::xlo(), icomp), + bcvalxlo, maxorder, dxinv[0], inhomog, icomp); + Real tmp = vel(i,j,k,icomp); + mllinop_apply_bc_y(Orientation::low, i,j,k, blen.y, vel, mylo, + bct(Orientation::ylo(), icomp), + bcl(Orientation::ylo(), icomp), + bcvalylo, maxorder, dxinv[1], inhomog, icomp); + tmp += vel(i,j,k,icomp); + mllinop_apply_bc_z(Orientation::low, i,j,k, blen.z, vel, mzlo, + bct(Orientation::zlo(), icomp), + bcl(Orientation::zlo(), icomp), + bcvalzlo, maxorder, dxinv[2], inhomog, icomp); + vel(i,j,k,icomp) = Real(1./3.)*(tmp+vel(i,j,k,icomp)); + } else if (x_interior && y_interior) { + mllinop_apply_bc_x(Orientation::low, i,j,k, blen.x, vel, mxlo, + bct(Orientation::xlo(), icomp), + bcl(Orientation::xlo(), icomp), + bcvalxlo, maxorder, dxinv[0], inhomog, icomp); + Real tmp = vel(i,j,k,icomp); + mllinop_apply_bc_y(Orientation::low, i,j,k, blen.y, vel, mylo, + bct(Orientation::ylo(), icomp), + bcl(Orientation::ylo(), icomp), + bcvalylo, maxorder, dxinv[1], inhomog, icomp); + vel(i,j,k,icomp) = 0.5_rt*(tmp+vel(i,j,k,icomp)); + } else if (x_interior && z_interior) { + mllinop_apply_bc_x(Orientation::low, i,j,k, blen.x, vel, mxlo, + bct(Orientation::xlo(), icomp), + bcl(Orientation::xlo(), icomp), + bcvalxlo, maxorder, dxinv[0], inhomog, icomp); + Real tmp = vel(i,j,k,icomp); + mllinop_apply_bc_z(Orientation::low, i,j,k, blen.z, vel, mzlo, + bct(Orientation::zlo(), icomp), + bcl(Orientation::zlo(), icomp), + bcvalzlo, maxorder, dxinv[2], inhomog, icomp); + vel(i,j,k,icomp) = 0.5_rt*(tmp+vel(i,j,k,icomp)); + } else if (y_interior && z_interior) { + mllinop_apply_bc_y(Orientation::low, i,j,k, blen.y, vel, mylo, + bct(Orientation::ylo(), icomp), + 
bcl(Orientation::ylo(), icomp), + bcvalylo, maxorder, dxinv[1], inhomog, icomp); + Real tmp = vel(i,j,k,icomp); + mllinop_apply_bc_z(Orientation::low, i,j,k, blen.z, vel, mzlo, + bct(Orientation::zlo(), icomp), + bcl(Orientation::zlo(), icomp), + bcvalzlo, maxorder, dxinv[2], inhomog, icomp); + vel(i,j,k,icomp) = 0.5_rt*(tmp+vel(i,j,k,icomp)); + } else if (x_interior) { + mllinop_apply_bc_x(Orientation::low, i,j,k, blen.x, vel, mxlo, + bct(Orientation::xlo(), icomp), + bcl(Orientation::xlo(), icomp), + bcvalxlo, maxorder, dxinv[0], inhomog, icomp); + } else if (y_interior) { + mllinop_apply_bc_y(Orientation::low, i,j,k, blen.y, vel, mylo, + bct(Orientation::ylo(), icomp), + bcl(Orientation::ylo(), icomp), + bcvalylo, maxorder, dxinv[1], inhomog, icomp); + } else if (z_interior) { + mllinop_apply_bc_z(Orientation::low, i,j,k, blen.z, vel, mzlo, + bct(Orientation::zlo(), icomp), + bcl(Orientation::zlo(), icomp), + bcvalzlo, maxorder, dxinv[2], inhomog, icomp); } } break; } case 1: { // xhi & ylo & zlo - Box bx = amrex::adjCellLo(amrex::adjCellLo(amrex::adjCellHi(vbox,xdir,1),ydir,1),zdir,1); - if (vhi.x == dhi.x && vlo.y == dlo.y && vlo.z == dlo.z) { - vel (vhi.x+1,vlo.y-1,vlo.z-1,icomp) - = vel(vhi.x+1,vlo.y ,vlo.z ,icomp) - + vel(vhi.x ,vlo.y-1,vlo.z ,icomp) - + vel(vhi.x ,vlo.y ,vlo.z-1,icomp) - - vel(vhi.x ,vlo.y ,vlo.z ,icomp) * Real(2.0); - } else if (vhi.x == dhi.x && vlo.y == dlo.y) { - vel (vhi.x+1,vlo.y-1,vlo.z-1,icomp) - = vel(vhi.x+1,vlo.y ,vlo.z-1,icomp) - + vel(vhi.x ,vlo.y-1,vlo.z-1,icomp) - - vel(vhi.x ,vlo.y ,vlo.z-1,icomp); - } else if (vhi.x == dhi.x && vlo.z == dlo.z) { - vel (vhi.x+1,vlo.y-1,vlo.z-1,icomp) - = vel(vhi.x+1,vlo.y-1,vlo.z ,icomp) - + vel(vhi.x ,vlo.y-1,vlo.z-1,icomp) - - vel(vhi.x ,vlo.y-1,vlo.z ,icomp); - } else if (vlo.y == dlo.y && vlo.z == dlo.z) { - vel (vhi.x+1,vlo.y-1,vlo.z-1,icomp) - = vel(vhi.x+1,vlo.y-1,vlo.z ,icomp) - + vel(vhi.x+1,vlo.y ,vlo.z-1,icomp) - - vel(vhi.x+1,vlo.y ,vlo.z ,icomp); - } else if (vhi.x == dhi.x) { - int offset = AMREX_SPACEDIM * oxhi; - mllinop_apply_bc_x(Orientation::high, bx, blen.x, - vel, mxhi, bct[offset+icomp], bcl[offset+icomp], - bcvalxhi, maxorder, dxinv[xdir], inhomog, icomp); - } else if (vlo.y == dlo.y) { - int offset = AMREX_SPACEDIM * oylo; - mllinop_apply_bc_y(Orientation::low, bx, blen.y, - vel, mylo, bct[offset+icomp], bcl[offset+icomp], - bcvalylo, maxorder, dxinv[ydir], inhomog, icomp); - } else if (vlo.z == dlo.z) { - int offset = AMREX_SPACEDIM * ozlo; - mllinop_apply_bc_z(Orientation::low, bx, blen.z, - vel, mzlo, bct[offset+icomp], bcl[offset+icomp], - bcvalzlo, maxorder, dxinv[zdir], inhomog, icomp); - } else if (mxhi(vhi.x+1,vlo.y-1,vlo.z-1) != BndryData::covered) { - if (mylo(vhi.x,vlo.y-1,vlo.z-1) == BndryData::covered) { - int offset = AMREX_SPACEDIM * oxhi; - mllinop_apply_bc_x(Orientation::high, bx, blen.x, - vel, mxhi, bct[offset+icomp], bcl[offset+icomp], - bcvalxhi, maxorder, dxinv[xdir], inhomog, icomp); - } else if (mxhi(vhi.x+1,vlo.y,vlo.z-1) == BndryData::covered) { - int offset = AMREX_SPACEDIM * oylo; - mllinop_apply_bc_y(Orientation::low, bx, blen.y, - vel, mylo, bct[offset+icomp], bcl[offset+icomp], - bcvalylo, maxorder, dxinv[ydir], inhomog, icomp); - } else { - int offset = AMREX_SPACEDIM * ozlo; - mllinop_apply_bc_z(Orientation::low, bx, blen.z, - vel, mzlo, bct[offset+icomp], bcl[offset+icomp], - bcvalzlo, maxorder, dxinv[zdir], inhomog, icomp); + int i = vhi.x+1; + int j = vlo.y-1; + int k = vlo.z-1; + bool x_interior = mylo(i-1,j ,k ) == BndryData::covered; + bool x_exterior = 
mylo(i-1,j ,k ) == BndryData::not_covered; + bool y_interior = mxhi(i ,j+1,k ) == BndryData::covered; + bool y_exterior = mxhi(i ,j+1,k ) == BndryData::not_covered; + bool z_interior = mxhi(i ,j ,k+1) == BndryData::covered; + bool z_exterior = mxhi(i ,j ,k+1) == BndryData::not_covered; + if (mxhi(i,j,k) != BndryData::covered && + (!xhi_domain || !ylo_domain || !zlo_domain)) { + if ((x_interior && y_interior && z_interior) || + (x_exterior && y_exterior && z_exterior)) { + mllinop_apply_bc_x(Orientation::high, i,j,k, blen.x, vel, mxhi, + bct(Orientation::xhi(), icomp), + bcl(Orientation::xhi(), icomp), + bcvalxhi, maxorder, dxinv[0], inhomog, icomp); + Real tmp = vel(i,j,k,icomp); + mllinop_apply_bc_y(Orientation::low, i,j,k, blen.y, vel, mylo, + bct(Orientation::ylo(), icomp), + bcl(Orientation::ylo(), icomp), + bcvalylo, maxorder, dxinv[1], inhomog, icomp); + tmp += vel(i,j,k,icomp); + mllinop_apply_bc_z(Orientation::low, i,j,k, blen.z, vel, mzlo, + bct(Orientation::zlo(), icomp), + bcl(Orientation::zlo(), icomp), + bcvalzlo, maxorder, dxinv[2], inhomog, icomp); + vel(i,j,k,icomp) = Real(1./3.)*(tmp+vel(i,j,k,icomp)); + } else if (x_interior && y_interior) { + mllinop_apply_bc_x(Orientation::high, i,j,k, blen.x, vel, mxhi, + bct(Orientation::xhi(), icomp), + bcl(Orientation::xhi(), icomp), + bcvalxhi, maxorder, dxinv[0], inhomog, icomp); + Real tmp = vel(i,j,k,icomp); + mllinop_apply_bc_y(Orientation::low, i,j,k, blen.y, vel, mylo, + bct(Orientation::ylo(), icomp), + bcl(Orientation::ylo(), icomp), + bcvalylo, maxorder, dxinv[1], inhomog, icomp); + vel(i,j,k,icomp) = 0.5_rt*(tmp+vel(i,j,k,icomp)); + } else if (x_interior && z_interior) { + mllinop_apply_bc_x(Orientation::high, i,j,k, blen.x, vel, mxhi, + bct(Orientation::xhi(), icomp), + bcl(Orientation::xhi(), icomp), + bcvalxhi, maxorder, dxinv[0], inhomog, icomp); + Real tmp = vel(i,j,k,icomp); + mllinop_apply_bc_z(Orientation::low, i,j,k, blen.z, vel, mzlo, + bct(Orientation::zlo(), icomp), + bcl(Orientation::zlo(), icomp), + bcvalzlo, maxorder, dxinv[2], inhomog, icomp); + vel(i,j,k,icomp) = 0.5_rt*(tmp+vel(i,j,k,icomp)); + } else if (y_interior && z_interior) { + mllinop_apply_bc_y(Orientation::low, i,j,k, blen.y, vel, mylo, + bct(Orientation::ylo(), icomp), + bcl(Orientation::ylo(), icomp), + bcvalylo, maxorder, dxinv[1], inhomog, icomp); + Real tmp = vel(i,j,k,icomp); + mllinop_apply_bc_z(Orientation::low, i,j,k, blen.z, vel, mzlo, + bct(Orientation::zlo(), icomp), + bcl(Orientation::zlo(), icomp), + bcvalzlo, maxorder, dxinv[2], inhomog, icomp); + vel(i,j,k,icomp) = 0.5_rt*(tmp+vel(i,j,k,icomp)); + } else if (x_interior) { + mllinop_apply_bc_x(Orientation::high, i,j,k, blen.x, vel, mxhi, + bct(Orientation::xhi(), icomp), + bcl(Orientation::xhi(), icomp), + bcvalxhi, maxorder, dxinv[0], inhomog, icomp); + } else if (y_interior) { + mllinop_apply_bc_y(Orientation::low, i,j,k, blen.y, vel, mylo, + bct(Orientation::ylo(), icomp), + bcl(Orientation::ylo(), icomp), + bcvalylo, maxorder, dxinv[1], inhomog, icomp); + } else if (z_interior) { + mllinop_apply_bc_z(Orientation::low, i,j,k, blen.z, vel, mzlo, + bct(Orientation::zlo(), icomp), + bcl(Orientation::zlo(), icomp), + bcvalzlo, maxorder, dxinv[2], inhomog, icomp); } } break; } case 2: { // xlo & yhi & zlo - Box bx = amrex::adjCellLo(amrex::adjCellHi(amrex::adjCellLo(vbox,xdir,1),ydir,1),zdir,1); - if (vlo.x == dlo.x && vhi.y == dhi.y && vlo.z == dlo.z) { - vel (vlo.x-1,vhi.y+1,vlo.z-1,icomp) - = vel(vlo.x-1,vhi.y ,vlo.z ,icomp) - + vel(vlo.x ,vhi.y+1,vlo.z ,icomp) - + vel(vlo.x 
,vhi.y ,vlo.z-1,icomp) - - vel(vlo.x ,vhi.y ,vlo.z ,icomp) * Real(2.0); - } else if (vlo.x == dlo.x && vhi.y == dhi.y) { - vel (vlo.x-1,vhi.y+1,vlo.z-1,icomp) - = vel(vlo.x-1,vhi.y ,vlo.z-1,icomp) - + vel(vlo.x ,vhi.y+1,vlo.z-1,icomp) - - vel(vlo.x ,vhi.y ,vlo.z-1,icomp); - } else if (vlo.x == dlo.x && vlo.z == dlo.z) { - vel (vlo.x-1,vhi.y+1,vlo.z-1,icomp) - = vel(vlo.x-1,vhi.y+1,vlo.z ,icomp) - + vel(vlo.x ,vhi.y+1,vlo.z-1,icomp) - - vel(vlo.x ,vhi.y+1,vlo.z ,icomp); - } else if (vhi.y == dhi.y && vlo.z == dlo.z) { - vel (vlo.x-1,vhi.y+1,vlo.z-1,icomp) - = vel(vlo.x-1,vhi.y+1,vlo.z ,icomp) - + vel(vlo.x-1,vhi.y ,vlo.z-1,icomp) - - vel(vlo.x-1,vhi.y ,vlo.z ,icomp); - } else if (vlo.x == dlo.x) { - int offset = AMREX_SPACEDIM * oxlo; - mllinop_apply_bc_x(Orientation::low, bx, blen.x, - vel, mxlo, bct[offset+icomp], bcl[offset+icomp], - bcvalxlo, maxorder, dxinv[xdir], inhomog, icomp); - } else if (vhi.y == dhi.y) { - int offset = AMREX_SPACEDIM * oyhi; - mllinop_apply_bc_y(Orientation::high, bx, blen.y, - vel, myhi, bct[offset+icomp], bcl[offset+icomp], - bcvalyhi, maxorder, dxinv[ydir], inhomog, icomp); - } else if (vlo.z == dlo.z) { - int offset = AMREX_SPACEDIM * ozlo; - mllinop_apply_bc_z(Orientation::low, bx, blen.z, - vel, mzlo, bct[offset+icomp], bcl[offset+icomp], - bcvalzlo, maxorder, dxinv[zdir], inhomog, icomp); - } else if (mxlo(vlo.x-1,vhi.y+1,vlo.z-1) != BndryData::covered) { - if (myhi(vlo.x,vhi.y+1,vlo.z-1) == BndryData::covered) { - int offset = AMREX_SPACEDIM * oxlo; - mllinop_apply_bc_x(Orientation::low, bx, blen.x, - vel, mxlo, bct[offset+icomp], bcl[offset+icomp], - bcvalxlo, maxorder, dxinv[xdir], inhomog, icomp); - } else if (mxlo(vlo.x-1,vhi.y,vlo.z-1) == BndryData::covered) { - int offset = AMREX_SPACEDIM * oyhi; - mllinop_apply_bc_y(Orientation::high, bx, blen.y, - vel, myhi, bct[offset+icomp], bcl[offset+icomp], - bcvalyhi, maxorder, dxinv[ydir], inhomog, icomp); - } else { - int offset = AMREX_SPACEDIM * ozlo; - mllinop_apply_bc_z(Orientation::low, bx, blen.z, - vel, mzlo, bct[offset+icomp], bcl[offset+icomp], - bcvalzlo, maxorder, dxinv[zdir], inhomog, icomp); + int i = vlo.x-1; + int j = vhi.y+1; + int k = vlo.z-1; + bool x_interior = myhi(i+1,j ,k ) == BndryData::covered; + bool x_exterior = myhi(i+1,j ,k ) == BndryData::not_covered; + bool y_interior = mxlo(i ,j-1,k ) == BndryData::covered; + bool y_exterior = mxlo(i ,j-1,k ) == BndryData::not_covered; + bool z_interior = mxlo(i ,j ,k+1) == BndryData::covered; + bool z_exterior = mxlo(i ,j ,k+1) == BndryData::not_covered; + if (mxlo(i,j,k) != BndryData::covered && + (!xlo_domain || !yhi_domain || !zlo_domain)) { + if ((x_interior && y_interior && z_interior) || + (x_exterior && y_exterior && z_exterior)) { + mllinop_apply_bc_x(Orientation::low, i,j,k, blen.x, vel, mxlo, + bct(Orientation::xlo(), icomp), + bcl(Orientation::xlo(), icomp), + bcvalxlo, maxorder, dxinv[0], inhomog, icomp); + Real tmp = vel(i,j,k,icomp); + mllinop_apply_bc_y(Orientation::high, i,j,k, blen.y, vel, myhi, + bct(Orientation::yhi(), icomp), + bcl(Orientation::yhi(), icomp), + bcvalyhi, maxorder, dxinv[1], inhomog, icomp); + tmp += vel(i,j,k,icomp); + mllinop_apply_bc_z(Orientation::low, i,j,k, blen.z, vel, mzlo, + bct(Orientation::zlo(), icomp), + bcl(Orientation::zlo(), icomp), + bcvalzlo, maxorder, dxinv[2], inhomog, icomp); + vel(i,j,k,icomp) = Real(1./3.)*(tmp+vel(i,j,k,icomp)); + } else if (x_interior && y_interior) { + mllinop_apply_bc_x(Orientation::low, i,j,k, blen.x, vel, mxlo, + bct(Orientation::xlo(), icomp), + 
bcl(Orientation::xlo(), icomp), + bcvalxlo, maxorder, dxinv[0], inhomog, icomp); + Real tmp = vel(i,j,k,icomp); + mllinop_apply_bc_y(Orientation::high, i,j,k, blen.y, vel, myhi, + bct(Orientation::yhi(), icomp), + bcl(Orientation::yhi(), icomp), + bcvalyhi, maxorder, dxinv[1], inhomog, icomp); + vel(i,j,k,icomp) = 0.5_rt*(tmp+vel(i,j,k,icomp)); + } else if (x_interior && z_interior) { + mllinop_apply_bc_x(Orientation::low, i,j,k, blen.x, vel, mxlo, + bct(Orientation::xlo(), icomp), + bcl(Orientation::xlo(), icomp), + bcvalxlo, maxorder, dxinv[0], inhomog, icomp); + Real tmp = vel(i,j,k,icomp); + mllinop_apply_bc_z(Orientation::low, i,j,k, blen.z, vel, mzlo, + bct(Orientation::zlo(), icomp), + bcl(Orientation::zlo(), icomp), + bcvalzlo, maxorder, dxinv[2], inhomog, icomp); + vel(i,j,k,icomp) = 0.5_rt*(tmp+vel(i,j,k,icomp)); + } else if (y_interior && z_interior) { + mllinop_apply_bc_y(Orientation::high, i,j,k, blen.y, vel, myhi, + bct(Orientation::yhi(), icomp), + bcl(Orientation::yhi(), icomp), + bcvalyhi, maxorder, dxinv[1], inhomog, icomp); + Real tmp = vel(i,j,k,icomp); + mllinop_apply_bc_z(Orientation::low, i,j,k, blen.z, vel, mzlo, + bct(Orientation::zlo(), icomp), + bcl(Orientation::zlo(), icomp), + bcvalzlo, maxorder, dxinv[2], inhomog, icomp); + vel(i,j,k,icomp) = 0.5_rt*(tmp+vel(i,j,k,icomp)); + } else if (x_interior) { + mllinop_apply_bc_x(Orientation::low, i,j,k, blen.x, vel, mxlo, + bct(Orientation::xlo(), icomp), + bcl(Orientation::xlo(), icomp), + bcvalxlo, maxorder, dxinv[0], inhomog, icomp); + } else if (y_interior) { + mllinop_apply_bc_y(Orientation::high, i,j,k, blen.y, vel, myhi, + bct(Orientation::yhi(), icomp), + bcl(Orientation::yhi(), icomp), + bcvalyhi, maxorder, dxinv[1], inhomog, icomp); + } else if (z_interior) { + mllinop_apply_bc_z(Orientation::low, i,j,k, blen.z, vel, mzlo, + bct(Orientation::zlo(), icomp), + bcl(Orientation::zlo(), icomp), + bcvalzlo, maxorder, dxinv[2], inhomog, icomp); } } break; } case 3: { // xhi & yhi & zlo - Box bx = amrex::adjCellLo(amrex::adjCellHi(amrex::adjCellHi(vbox,xdir,1),ydir,1),zdir,1); - if (vhi.x == dhi.x && vhi.y == dhi.y && vlo.z == dlo.z) { - vel (vhi.x+1,vhi.y+1,vlo.z-1,icomp) - = vel(vhi.x+1,vhi.y ,vlo.z ,icomp) - + vel(vhi.x ,vhi.y+1,vlo.z ,icomp) - + vel(vhi.x ,vhi.y ,vlo.z-1,icomp) - - vel(vhi.x ,vhi.y ,vlo.z ,icomp) * Real(2.0); - } else if (vhi.x == dhi.x && vhi.y == dhi.y) { - vel (vhi.x+1,vhi.y+1,vlo.z-1,icomp) - = vel(vhi.x+1,vhi.y ,vlo.z-1,icomp) - + vel(vhi.x ,vhi.y+1,vlo.z-1,icomp) - - vel(vhi.x ,vhi.y ,vlo.z-1,icomp); - } else if (vhi.x == dhi.x && vlo.z == dlo.z) { - vel (vhi.x+1,vhi.y+1,vlo.z-1,icomp) - = vel(vhi.x+1,vhi.y+1,vlo.z ,icomp) - + vel(vhi.x ,vhi.y+1,vlo.z-1,icomp) - - vel(vhi.x ,vhi.y+1,vlo.z ,icomp); - } else if (vhi.y == dhi.y && vlo.z == dlo.z) { - vel (vhi.x+1,vhi.y+1,vlo.z-1,icomp) - = vel(vhi.x+1,vhi.y+1,vlo.z ,icomp) - + vel(vhi.x+1,vhi.y ,vlo.z-1,icomp) - - vel(vhi.x+1,vhi.y ,vlo.z ,icomp); - } else if (vhi.x == dhi.x) { - int offset = AMREX_SPACEDIM * oxhi; - mllinop_apply_bc_x(Orientation::high, bx, blen.x, - vel, mxhi, bct[offset+icomp], bcl[offset+icomp], - bcvalxhi, maxorder, dxinv[xdir], inhomog, icomp); - } else if (vhi.y == dhi.y) { - int offset = AMREX_SPACEDIM * oyhi; - mllinop_apply_bc_y(Orientation::high, bx, blen.y, - vel, myhi, bct[offset+icomp], bcl[offset+icomp], - bcvalyhi, maxorder, dxinv[ydir], inhomog, icomp); - } else if (vlo.z == dlo.z) { - int offset = AMREX_SPACEDIM * ozlo; - mllinop_apply_bc_z(Orientation::low, bx, blen.z, - vel, mzlo, bct[offset+icomp], 
bcl[offset+icomp], - bcvalzlo, maxorder, dxinv[zdir], inhomog, icomp); - } else if (mxhi(vhi.x+1,vhi.y+1,vlo.z-1) != BndryData::covered) { - if (myhi(vhi.x,vhi.y+1,vlo.z-1) == BndryData::covered) { - int offset = AMREX_SPACEDIM * oxhi; - mllinop_apply_bc_x(Orientation::high, bx, blen.x, - vel, mxhi, bct[offset+icomp], bcl[offset+icomp], - bcvalxhi, maxorder, dxinv[xdir], inhomog, icomp); - } else if (mxhi(vhi.x+1,vhi.y,vlo.z-1) == BndryData::covered) { - int offset = AMREX_SPACEDIM * oyhi; - mllinop_apply_bc_y(Orientation::high, bx, blen.y, - vel, myhi, bct[offset+icomp], bcl[offset+icomp], - bcvalyhi, maxorder, dxinv[ydir], inhomog, icomp); - } else { - int offset = AMREX_SPACEDIM * ozlo; - mllinop_apply_bc_z(Orientation::low, bx, blen.z, - vel, mzlo, bct[offset+icomp], bcl[offset+icomp], - bcvalzlo, maxorder, dxinv[zdir], inhomog, icomp); + int i = vhi.x+1; + int j = vhi.y+1; + int k = vlo.z-1; + bool x_interior = myhi(i-1,j ,k ) == BndryData::covered; + bool x_exterior = myhi(i-1,j ,k ) == BndryData::not_covered; + bool y_interior = mxhi(i ,j-1,k ) == BndryData::covered; + bool y_exterior = mxhi(i ,j-1,k ) == BndryData::not_covered; + bool z_interior = mxhi(i ,j ,k+1) == BndryData::covered; + bool z_exterior = mxhi(i ,j ,k+1) == BndryData::not_covered; + if (mxhi(i,j,k) != BndryData::covered && + (!xhi_domain || !yhi_domain || !zlo_domain)) { + if ((x_interior && y_interior && z_interior) || + (x_exterior && y_exterior && z_exterior)) { + mllinop_apply_bc_x(Orientation::high, i,j,k, blen.x, vel, mxhi, + bct(Orientation::xhi(), icomp), + bcl(Orientation::xhi(), icomp), + bcvalxhi, maxorder, dxinv[0], inhomog, icomp); + Real tmp = vel(i,j,k,icomp); + mllinop_apply_bc_y(Orientation::high, i,j,k, blen.y, vel, myhi, + bct(Orientation::yhi(), icomp), + bcl(Orientation::yhi(), icomp), + bcvalyhi, maxorder, dxinv[1], inhomog, icomp); + tmp += vel(i,j,k,icomp); + mllinop_apply_bc_z(Orientation::low, i,j,k, blen.z, vel, mzlo, + bct(Orientation::zlo(), icomp), + bcl(Orientation::zlo(), icomp), + bcvalzlo, maxorder, dxinv[2], inhomog, icomp); + vel(i,j,k,icomp) = Real(1./3.)*(tmp+vel(i,j,k,icomp)); + } else if (x_interior && y_interior) { + mllinop_apply_bc_x(Orientation::high, i,j,k, blen.x, vel, mxhi, + bct(Orientation::xhi(), icomp), + bcl(Orientation::xhi(), icomp), + bcvalxhi, maxorder, dxinv[0], inhomog, icomp); + Real tmp = vel(i,j,k,icomp); + mllinop_apply_bc_y(Orientation::high, i,j,k, blen.y, vel, myhi, + bct(Orientation::yhi(), icomp), + bcl(Orientation::yhi(), icomp), + bcvalyhi, maxorder, dxinv[1], inhomog, icomp); + vel(i,j,k,icomp) = 0.5_rt*(tmp+vel(i,j,k,icomp)); + } else if (x_interior && z_interior) { + mllinop_apply_bc_x(Orientation::high, i,j,k, blen.x, vel, mxhi, + bct(Orientation::xhi(), icomp), + bcl(Orientation::xhi(), icomp), + bcvalxhi, maxorder, dxinv[0], inhomog, icomp); + Real tmp = vel(i,j,k,icomp); + mllinop_apply_bc_z(Orientation::low, i,j,k, blen.z, vel, mzlo, + bct(Orientation::zlo(), icomp), + bcl(Orientation::zlo(), icomp), + bcvalzlo, maxorder, dxinv[2], inhomog, icomp); + vel(i,j,k,icomp) = 0.5_rt*(tmp+vel(i,j,k,icomp)); + } else if (y_interior && z_interior) { + mllinop_apply_bc_y(Orientation::high, i,j,k, blen.y, vel, myhi, + bct(Orientation::yhi(), icomp), + bcl(Orientation::yhi(), icomp), + bcvalyhi, maxorder, dxinv[1], inhomog, icomp); + Real tmp = vel(i,j,k,icomp); + mllinop_apply_bc_z(Orientation::low, i,j,k, blen.z, vel, mzlo, + bct(Orientation::zlo(), icomp), + bcl(Orientation::zlo(), icomp), + bcvalzlo, maxorder, dxinv[2], inhomog, icomp); + 
vel(i,j,k,icomp) = 0.5_rt*(tmp+vel(i,j,k,icomp)); + } else if (x_interior) { + mllinop_apply_bc_x(Orientation::high, i,j,k, blen.x, vel, mxhi, + bct(Orientation::xhi(), icomp), + bcl(Orientation::xhi(), icomp), + bcvalxhi, maxorder, dxinv[0], inhomog, icomp); + } else if (y_interior) { + mllinop_apply_bc_y(Orientation::high, i,j,k, blen.y, vel, myhi, + bct(Orientation::yhi(), icomp), + bcl(Orientation::yhi(), icomp), + bcvalyhi, maxorder, dxinv[1], inhomog, icomp); + } else if (z_interior) { + mllinop_apply_bc_z(Orientation::low, i,j,k, blen.z, vel, mzlo, + bct(Orientation::zlo(), icomp), + bcl(Orientation::zlo(), icomp), + bcvalzlo, maxorder, dxinv[2], inhomog, icomp); } } break; } case 4: { // xlo & ylo & zhi - Box bx = amrex::adjCellHi(amrex::adjCellLo(amrex::adjCellLo(vbox,xdir,1),ydir,1),zdir,1); - if (vlo.x == dlo.x && vlo.y == dlo.y && vhi.z == dhi.z) { - vel (vlo.x-1, vlo.y-1, vhi.z+1,icomp) - = vel(vlo.x-1, vlo.y , vhi.z ,icomp) - + vel(vlo.x , vlo.y-1, vhi.z ,icomp) - + vel(vlo.x , vlo.y , vhi.z+1,icomp) - - vel(vlo.x , vlo.y , vhi.z ,icomp) * Real(2.0); - } else if (vlo.x == dlo.x && vlo.y == dlo.y) { - vel (vlo.x-1, vlo.y-1, vhi.z+1,icomp) - = vel(vlo.x-1, vlo.y , vhi.z+1,icomp) - + vel(vlo.x , vlo.y-1, vhi.z+1,icomp) - - vel(vlo.x , vlo.y , vhi.z+1,icomp); - } else if (vlo.x == dlo.x && vhi.z == dhi.z) { - vel (vlo.x-1, vlo.y-1, vhi.z+1,icomp) - = vel(vlo.x-1, vlo.y-1, vhi.z ,icomp) - + vel(vlo.x , vlo.y-1, vhi.z+1,icomp) - - vel(vlo.x , vlo.y-1, vhi.z ,icomp); - } else if (vlo.y == dlo.y && vhi.z == dhi.z) { - vel (vlo.x-1, vlo.y-1, vhi.z+1,icomp) - = vel(vlo.x-1, vlo.y-1, vhi.z ,icomp) - + vel(vlo.x-1, vlo.y , vhi.z+1,icomp) - - vel(vlo.x-1, vlo.y , vhi.z ,icomp); - } else if (vlo.x == dlo.x) { - int offset = AMREX_SPACEDIM * oxlo; - mllinop_apply_bc_x(Orientation::low, bx, blen.x, - vel, mxlo, bct[offset+icomp], bcl[offset+icomp], - bcvalxlo, maxorder, dxinv[xdir], inhomog, icomp); - } else if (vlo.y == dlo.y) { - int offset = AMREX_SPACEDIM * oylo; - mllinop_apply_bc_y(Orientation::low, bx, blen.y, - vel, mylo, bct[offset+icomp], bcl[offset+icomp], - bcvalylo, maxorder, dxinv[ydir], inhomog, icomp); - } else if (vhi.z == dhi.z) { - int offset = AMREX_SPACEDIM * ozhi; - mllinop_apply_bc_z(Orientation::high, bx, blen.z, - vel, mzhi, bct[offset+icomp], bcl[offset+icomp], - bcvalzhi, maxorder, dxinv[zdir], inhomog, icomp); - } else if (mxlo(vlo.x-1,vlo.y-1,vhi.z+1) != BndryData::covered) { - if (mylo(vlo.x,vlo.y-1,vhi.z+1) == BndryData::covered) { - int offset = AMREX_SPACEDIM * oxlo; - mllinop_apply_bc_x(Orientation::low, bx, blen.x, - vel, mxlo, bct[offset+icomp], bcl[offset+icomp], - bcvalxlo, maxorder, dxinv[xdir], inhomog, icomp); - } else if (mxlo(vlo.x-1,vlo.y,vhi.z+1) == BndryData::covered) { - int offset = AMREX_SPACEDIM * oylo; - mllinop_apply_bc_y(Orientation::low, bx, blen.y, - vel, mylo, bct[offset+icomp], bcl[offset+icomp], - bcvalylo, maxorder, dxinv[ydir], inhomog, icomp); - } else { - int offset = AMREX_SPACEDIM * ozhi; - mllinop_apply_bc_z(Orientation::high, bx, blen.z, - vel, mzhi, bct[offset+icomp], bcl[offset+icomp], - bcvalzhi, maxorder, dxinv[zdir], inhomog, icomp); + int i = vlo.x-1; + int j = vlo.y-1; + int k = vhi.z+1; + bool x_interior = mylo(i+1,j ,k ) == BndryData::covered; + bool x_exterior = mylo(i+1,j ,k ) == BndryData::not_covered; + bool y_interior = mxlo(i ,j+1,k ) == BndryData::covered; + bool y_exterior = mxlo(i ,j+1,k ) == BndryData::not_covered; + bool z_interior = mxlo(i ,j ,k-1) == BndryData::covered; + bool z_exterior = mxlo(i ,j ,k-1) 
== BndryData::not_covered; + if (mxlo(i,j,k) != BndryData::covered && + (!xlo_domain || !ylo_domain || !zhi_domain)) { + if ((x_interior && y_interior && z_interior) || + (x_exterior && y_exterior && z_exterior)) { + mllinop_apply_bc_x(Orientation::low, i,j,k, blen.x, vel, mxlo, + bct(Orientation::xlo(), icomp), + bcl(Orientation::xlo(), icomp), + bcvalxlo, maxorder, dxinv[0], inhomog, icomp); + Real tmp = vel(i,j,k,icomp); + mllinop_apply_bc_y(Orientation::low, i,j,k, blen.y, vel, mylo, + bct(Orientation::ylo(), icomp), + bcl(Orientation::ylo(), icomp), + bcvalylo, maxorder, dxinv[1], inhomog, icomp); + tmp += vel(i,j,k,icomp); + mllinop_apply_bc_z(Orientation::high, i,j,k, blen.z, vel, mzhi, + bct(Orientation::zhi(), icomp), + bcl(Orientation::zhi(), icomp), + bcvalzhi, maxorder, dxinv[2], inhomog, icomp); + vel(i,j,k,icomp) = Real(1./3.)*(tmp+vel(i,j,k,icomp)); + } else if (x_interior && y_interior) { + mllinop_apply_bc_x(Orientation::low, i,j,k, blen.x, vel, mxlo, + bct(Orientation::xlo(), icomp), + bcl(Orientation::xlo(), icomp), + bcvalxlo, maxorder, dxinv[0], inhomog, icomp); + Real tmp = vel(i,j,k,icomp); + mllinop_apply_bc_y(Orientation::low, i,j,k, blen.y, vel, mylo, + bct(Orientation::ylo(), icomp), + bcl(Orientation::ylo(), icomp), + bcvalylo, maxorder, dxinv[1], inhomog, icomp); + vel(i,j,k,icomp) = 0.5_rt*(tmp+vel(i,j,k,icomp)); + } else if (x_interior && z_interior) { + mllinop_apply_bc_x(Orientation::low, i,j,k, blen.x, vel, mxlo, + bct(Orientation::xlo(), icomp), + bcl(Orientation::xlo(), icomp), + bcvalxlo, maxorder, dxinv[0], inhomog, icomp); + Real tmp = vel(i,j,k,icomp); + mllinop_apply_bc_z(Orientation::high, i,j,k, blen.z, vel, mzhi, + bct(Orientation::zhi(), icomp), + bcl(Orientation::zhi(), icomp), + bcvalzhi, maxorder, dxinv[2], inhomog, icomp); + vel(i,j,k,icomp) = 0.5_rt*(tmp+vel(i,j,k,icomp)); + } else if (y_interior && z_interior) { + mllinop_apply_bc_y(Orientation::low, i,j,k, blen.y, vel, mylo, + bct(Orientation::ylo(), icomp), + bcl(Orientation::ylo(), icomp), + bcvalylo, maxorder, dxinv[1], inhomog, icomp); + Real tmp = vel(i,j,k,icomp); + mllinop_apply_bc_z(Orientation::high, i,j,k, blen.z, vel, mzhi, + bct(Orientation::zhi(), icomp), + bcl(Orientation::zhi(), icomp), + bcvalzhi, maxorder, dxinv[2], inhomog, icomp); + vel(i,j,k,icomp) = 0.5_rt*(tmp+vel(i,j,k,icomp)); + } else if (x_interior) { + mllinop_apply_bc_x(Orientation::low, i,j,k, blen.x, vel, mxlo, + bct(Orientation::xlo(), icomp), + bcl(Orientation::xlo(), icomp), + bcvalxlo, maxorder, dxinv[0], inhomog, icomp); + } else if (y_interior) { + mllinop_apply_bc_y(Orientation::low, i,j,k, blen.y, vel, mylo, + bct(Orientation::ylo(), icomp), + bcl(Orientation::ylo(), icomp), + bcvalylo, maxorder, dxinv[1], inhomog, icomp); + } else if (z_interior) { + mllinop_apply_bc_z(Orientation::high, i,j,k, blen.z, vel, mzhi, + bct(Orientation::zhi(), icomp), + bcl(Orientation::zhi(), icomp), + bcvalzhi, maxorder, dxinv[2], inhomog, icomp); } } break; } case 5: { // xhi & ylo & zhi - Box bx = amrex::adjCellHi(amrex::adjCellLo(amrex::adjCellHi(vbox,xdir,1),ydir,1),zdir,1); - if (vhi.x == dhi.x && vlo.y == dlo.y && vhi.z == dhi.z) { - vel (vhi.x+1,vlo.y-1,vhi.z+1,icomp) - = vel(vhi.x+1,vlo.y ,vhi.z ,icomp) - + vel(vhi.x ,vlo.y-1,vhi.z ,icomp) - + vel(vhi.x ,vlo.y ,vhi.z+1,icomp) - - vel(vhi.x ,vlo.y ,vhi.z ,icomp) * Real(2.0); - } else if (vhi.x == dhi.x && vlo.y == dlo.y) { - vel (vhi.x+1,vlo.y-1,vhi.z+1,icomp) - = vel(vhi.x+1,vlo.y ,vhi.z+1,icomp) - + vel(vhi.x ,vlo.y-1,vhi.z+1,icomp) - - vel(vhi.x ,vlo.y 
,vhi.z+1,icomp); - } else if (vhi.x == dhi.x && vhi.z == dhi.z) { - vel (vhi.x+1,vlo.y-1,vhi.z+1,icomp) - = vel(vhi.x+1,vlo.y-1,vhi.z ,icomp) - + vel(vhi.x ,vlo.y-1,vhi.z+1,icomp) - - vel(vhi.x ,vlo.y-1,vhi.z ,icomp); - } else if (vlo.y == dlo.y && vhi.z == dhi.z) { - vel (vhi.x+1,vlo.y-1,vhi.z+1,icomp) - = vel(vhi.x+1,vlo.y-1,vhi.z ,icomp) - + vel(vhi.x+1,vlo.y ,vhi.z+1,icomp) - - vel(vhi.x+1,vlo.y ,vhi.z ,icomp); - } else if (vhi.x == dhi.x) { - int offset = AMREX_SPACEDIM * oxhi; - mllinop_apply_bc_x(Orientation::high, bx, blen.x, - vel, mxhi, bct[offset+icomp], bcl[offset+icomp], - bcvalxhi, maxorder, dxinv[xdir], inhomog, icomp); - } else if (vlo.y == dlo.y) { - int offset = AMREX_SPACEDIM * oylo; - mllinop_apply_bc_y(Orientation::low, bx, blen.y, - vel, mylo, bct[offset+icomp], bcl[offset+icomp], - bcvalylo, maxorder, dxinv[ydir], inhomog, icomp); - } else if (vhi.z == dhi.z) { - int offset = AMREX_SPACEDIM * ozhi; - mllinop_apply_bc_z(Orientation::high, bx, blen.z, - vel, mzhi, bct[offset+icomp], bcl[offset+icomp], - bcvalzhi, maxorder, dxinv[zdir], inhomog, icomp); - } else if (mxhi(vhi.x+1,vlo.y-1,vhi.z+1) != BndryData::covered) { - if (mylo(vhi.x,vlo.y-1,vhi.z+1) == BndryData::covered) { - int offset = AMREX_SPACEDIM * oxhi; - mllinop_apply_bc_x(Orientation::high, bx, blen.x, - vel, mxhi, bct[offset+icomp], bcl[offset+icomp], - bcvalxhi, maxorder, dxinv[xdir], inhomog, icomp); - } else if (mxhi(vhi.x+1,vlo.y,vhi.z+1) == BndryData::covered) { - int offset = AMREX_SPACEDIM * oylo; - mllinop_apply_bc_y(Orientation::low, bx, blen.y, - vel, mylo, bct[offset+icomp], bcl[offset+icomp], - bcvalylo, maxorder, dxinv[ydir], inhomog, icomp); - } else { - int offset = AMREX_SPACEDIM * ozhi; - mllinop_apply_bc_z(Orientation::high, bx, blen.z, - vel, mzhi, bct[offset+icomp], bcl[offset+icomp], - bcvalzhi, maxorder, dxinv[zdir], inhomog, icomp); + int i = vhi.x+1; + int j = vlo.y-1; + int k = vhi.z+1; + bool x_interior = mylo(i-1,j ,k ) == BndryData::covered; + bool x_exterior = mylo(i-1,j ,k ) == BndryData::not_covered; + bool y_interior = mxhi(i ,j+1,k ) == BndryData::covered; + bool y_exterior = mxhi(i ,j+1,k ) == BndryData::not_covered; + bool z_interior = mxhi(i ,j ,k-1) == BndryData::covered; + bool z_exterior = mxhi(i ,j ,k-1) == BndryData::not_covered; + if (mxhi(i,j,k) != BndryData::covered && + (!xhi_domain || !ylo_domain || !zhi_domain)) { + if ((x_interior && y_interior && z_interior) || + (x_exterior && y_exterior && z_exterior)) { + mllinop_apply_bc_x(Orientation::high, i,j,k, blen.x, vel, mxhi, + bct(Orientation::xhi(), icomp), + bcl(Orientation::xhi(), icomp), + bcvalxhi, maxorder, dxinv[0], inhomog, icomp); + Real tmp = vel(i,j,k,icomp); + mllinop_apply_bc_y(Orientation::low, i,j,k, blen.y, vel, mylo, + bct(Orientation::ylo(), icomp), + bcl(Orientation::ylo(), icomp), + bcvalylo, maxorder, dxinv[1], inhomog, icomp); + tmp += vel(i,j,k,icomp); + mllinop_apply_bc_z(Orientation::high, i,j,k, blen.z, vel, mzhi, + bct(Orientation::zhi(), icomp), + bcl(Orientation::zhi(), icomp), + bcvalzhi, maxorder, dxinv[2], inhomog, icomp); + vel(i,j,k,icomp) = Real(1./3.)*(tmp+vel(i,j,k,icomp)); + } else if (x_interior && y_interior) { + mllinop_apply_bc_x(Orientation::high, i,j,k, blen.x, vel, mxhi, + bct(Orientation::xhi(), icomp), + bcl(Orientation::xhi(), icomp), + bcvalxhi, maxorder, dxinv[0], inhomog, icomp); + Real tmp = vel(i,j,k,icomp); + mllinop_apply_bc_y(Orientation::low, i,j,k, blen.y, vel, mylo, + bct(Orientation::ylo(), icomp), + bcl(Orientation::ylo(), icomp), + bcvalylo, 
maxorder, dxinv[1], inhomog, icomp); + vel(i,j,k,icomp) = 0.5_rt*(tmp+vel(i,j,k,icomp)); + } else if (x_interior && z_interior) { + mllinop_apply_bc_x(Orientation::high, i,j,k, blen.x, vel, mxhi, + bct(Orientation::xhi(), icomp), + bcl(Orientation::xhi(), icomp), + bcvalxhi, maxorder, dxinv[0], inhomog, icomp); + Real tmp = vel(i,j,k,icomp); + mllinop_apply_bc_z(Orientation::high, i,j,k, blen.z, vel, mzhi, + bct(Orientation::zhi(), icomp), + bcl(Orientation::zhi(), icomp), + bcvalzhi, maxorder, dxinv[2], inhomog, icomp); + vel(i,j,k,icomp) = 0.5_rt*(tmp+vel(i,j,k,icomp)); + } else if (y_interior && z_interior) { + mllinop_apply_bc_y(Orientation::low, i,j,k, blen.y, vel, mylo, + bct(Orientation::ylo(), icomp), + bcl(Orientation::ylo(), icomp), + bcvalylo, maxorder, dxinv[1], inhomog, icomp); + Real tmp = vel(i,j,k,icomp); + mllinop_apply_bc_z(Orientation::high, i,j,k, blen.z, vel, mzhi, + bct(Orientation::zhi(), icomp), + bcl(Orientation::zhi(), icomp), + bcvalzhi, maxorder, dxinv[2], inhomog, icomp); + vel(i,j,k,icomp) = 0.5_rt*(tmp+vel(i,j,k,icomp)); + } else if (x_interior) { + mllinop_apply_bc_x(Orientation::high, i,j,k, blen.x, vel, mxhi, + bct(Orientation::xhi(), icomp), + bcl(Orientation::xhi(), icomp), + bcvalxhi, maxorder, dxinv[0], inhomog, icomp); + } else if (y_interior) { + mllinop_apply_bc_y(Orientation::low, i,j,k, blen.y, vel, mylo, + bct(Orientation::ylo(), icomp), + bcl(Orientation::ylo(), icomp), + bcvalylo, maxorder, dxinv[1], inhomog, icomp); + } else if (z_interior) { + mllinop_apply_bc_z(Orientation::high, i,j,k, blen.z, vel, mzhi, + bct(Orientation::zhi(), icomp), + bcl(Orientation::zhi(), icomp), + bcvalzhi, maxorder, dxinv[2], inhomog, icomp); } } break; } case 6: { // xlo & yhi & zhi - Box bx = amrex::adjCellHi(amrex::adjCellHi(amrex::adjCellLo(vbox,xdir,1),ydir,1),zdir,1); - if (vlo.x == dlo.x && vhi.y == dhi.y && vhi.z == dhi.z) { - vel (vlo.x-1,vhi.y+1,vhi.z+1,icomp) - = vel(vlo.x-1,vhi.y ,vhi.z ,icomp) - + vel(vlo.x ,vhi.y+1,vhi.z ,icomp) - + vel(vlo.x ,vhi.y ,vhi.z+1,icomp) - - vel(vlo.x ,vhi.y ,vhi.z ,icomp) * Real(2.0); - } else if (vlo.x == dlo.x && vhi.y == dhi.y) { - vel (vlo.x-1,vhi.y+1,vhi.z+1,icomp) - = vel(vlo.x-1,vhi.y ,vhi.z+1,icomp) - + vel(vlo.x ,vhi.y+1,vhi.z+1,icomp) - - vel(vlo.x ,vhi.y ,vhi.z+1,icomp); - } else if (vlo.x == dlo.x && vhi.z == dhi.z) { - vel (vlo.x-1,vhi.y+1,vhi.z+1,icomp) - = vel(vlo.x-1,vhi.y+1,vhi.z ,icomp) - + vel(vlo.x ,vhi.y+1,vhi.z+1,icomp) - - vel(vlo.x ,vhi.y+1,vhi.z ,icomp); - } else if (vhi.y == dhi.y && vhi.z == dhi.z) { - vel (vlo.x-1,vhi.y+1,vhi.z+1,icomp) - = vel(vlo.x-1,vhi.y+1,vhi.z ,icomp) - + vel(vlo.x-1,vhi.y ,vhi.z+1,icomp) - - vel(vlo.x-1,vhi.y ,vhi.z ,icomp); - } else if (vlo.x == dlo.x) { - int offset = AMREX_SPACEDIM * oxlo; - mllinop_apply_bc_x(Orientation::low, bx, blen.x, - vel, mxlo, bct[offset+icomp], bcl[offset+icomp], - bcvalxlo, maxorder, dxinv[xdir], inhomog, icomp); - } else if (vhi.y == dhi.y) { - int offset = AMREX_SPACEDIM * oyhi; - mllinop_apply_bc_y(Orientation::high, bx, blen.y, - vel, myhi, bct[offset+icomp], bcl[offset+icomp], - bcvalyhi, maxorder, dxinv[ydir], inhomog, icomp); - } else if (vhi.z == dhi.z) { - int offset = AMREX_SPACEDIM * ozhi; - mllinop_apply_bc_z(Orientation::high, bx, blen.z, - vel, mzhi, bct[offset+icomp], bcl[offset+icomp], - bcvalzhi, maxorder, dxinv[zdir], inhomog, icomp); - } else if (mxlo(vlo.x-1,vhi.y+1,vhi.z+1) != BndryData::covered) { - if (myhi(vlo.x,vhi.y+1,vhi.z+1) == BndryData::covered) { - int offset = AMREX_SPACEDIM * oxlo; - 
mllinop_apply_bc_x(Orientation::low, bx, blen.x, - vel, mxlo, bct[offset+icomp], bcl[offset+icomp], - bcvalxlo, maxorder, dxinv[xdir], inhomog, icomp); - } else if (mxlo(vlo.x-1,vhi.y,vhi.z+1) == BndryData::covered) { - int offset = AMREX_SPACEDIM * oyhi; - mllinop_apply_bc_y(Orientation::high, bx, blen.y, - vel, myhi, bct[offset+icomp], bcl[offset+icomp], - bcvalyhi, maxorder, dxinv[ydir], inhomog, icomp); - } else { - int offset = AMREX_SPACEDIM * ozhi; - mllinop_apply_bc_z(Orientation::high, bx, blen.z, - vel, mzhi, bct[offset+icomp], bcl[offset+icomp], - bcvalzhi, maxorder, dxinv[zdir], inhomog, icomp); + int i = vlo.x-1; + int j = vhi.y+1; + int k = vhi.z+1; + bool x_interior = myhi(i+1,j ,k ) == BndryData::covered; + bool x_exterior = myhi(i+1,j ,k ) == BndryData::not_covered; + bool y_interior = mxlo(i ,j-1,k ) == BndryData::covered; + bool y_exterior = mxlo(i ,j-1,k ) == BndryData::not_covered; + bool z_interior = mxlo(i ,j ,k-1) == BndryData::covered; + bool z_exterior = mxlo(i ,j ,k-1) == BndryData::not_covered; + if (mxlo(i,j,k) != BndryData::covered && + (!xlo_domain || !yhi_domain || !zhi_domain)) { + if ((x_interior && y_interior && z_interior) || + (x_exterior && y_exterior && z_exterior)) { + mllinop_apply_bc_x(Orientation::low, i,j,k, blen.x, vel, mxlo, + bct(Orientation::xlo(), icomp), + bcl(Orientation::xlo(), icomp), + bcvalxlo, maxorder, dxinv[0], inhomog, icomp); + Real tmp = vel(i,j,k,icomp); + mllinop_apply_bc_y(Orientation::high, i,j,k, blen.y, vel, myhi, + bct(Orientation::yhi(), icomp), + bcl(Orientation::yhi(), icomp), + bcvalyhi, maxorder, dxinv[1], inhomog, icomp); + tmp += vel(i,j,k,icomp); + mllinop_apply_bc_z(Orientation::high, i,j,k, blen.z, vel, mzhi, + bct(Orientation::zhi(), icomp), + bcl(Orientation::zhi(), icomp), + bcvalzhi, maxorder, dxinv[2], inhomog, icomp); + vel(i,j,k,icomp) = Real(1./3.)*(tmp+vel(i,j,k,icomp)); + } else if (x_interior && y_interior) { + mllinop_apply_bc_x(Orientation::low, i,j,k, blen.x, vel, mxlo, + bct(Orientation::xlo(), icomp), + bcl(Orientation::xlo(), icomp), + bcvalxlo, maxorder, dxinv[0], inhomog, icomp); + Real tmp = vel(i,j,k,icomp); + mllinop_apply_bc_y(Orientation::high, i,j,k, blen.y, vel, myhi, + bct(Orientation::yhi(), icomp), + bcl(Orientation::yhi(), icomp), + bcvalyhi, maxorder, dxinv[1], inhomog, icomp); + vel(i,j,k,icomp) = 0.5_rt*(tmp+vel(i,j,k,icomp)); + } else if (x_interior && z_interior) { + mllinop_apply_bc_x(Orientation::low, i,j,k, blen.x, vel, mxlo, + bct(Orientation::xlo(), icomp), + bcl(Orientation::xlo(), icomp), + bcvalxlo, maxorder, dxinv[0], inhomog, icomp); + Real tmp = vel(i,j,k,icomp); + mllinop_apply_bc_z(Orientation::high, i,j,k, blen.z, vel, mzhi, + bct(Orientation::zhi(), icomp), + bcl(Orientation::zhi(), icomp), + bcvalzhi, maxorder, dxinv[2], inhomog, icomp); + vel(i,j,k,icomp) = 0.5_rt*(tmp+vel(i,j,k,icomp)); + } else if (y_interior && z_interior) { + mllinop_apply_bc_y(Orientation::high, i,j,k, blen.y, vel, myhi, + bct(Orientation::yhi(), icomp), + bcl(Orientation::yhi(), icomp), + bcvalyhi, maxorder, dxinv[1], inhomog, icomp); + Real tmp = vel(i,j,k,icomp); + mllinop_apply_bc_z(Orientation::high, i,j,k, blen.z, vel, mzhi, + bct(Orientation::zhi(), icomp), + bcl(Orientation::zhi(), icomp), + bcvalzhi, maxorder, dxinv[2], inhomog, icomp); + vel(i,j,k,icomp) = 0.5_rt*(tmp+vel(i,j,k,icomp)); + } else if (x_interior) { + mllinop_apply_bc_x(Orientation::low, i,j,k, blen.x, vel, mxlo, + bct(Orientation::xlo(), icomp), + bcl(Orientation::xlo(), icomp), + bcvalxlo, maxorder, dxinv[0], 
inhomog, icomp); + } else if (y_interior) { + mllinop_apply_bc_y(Orientation::high, i,j,k, blen.y, vel, myhi, + bct(Orientation::yhi(), icomp), + bcl(Orientation::yhi(), icomp), + bcvalyhi, maxorder, dxinv[1], inhomog, icomp); + } else if (z_interior) { + mllinop_apply_bc_z(Orientation::high, i,j,k, blen.z, vel, mzhi, + bct(Orientation::zhi(), icomp), + bcl(Orientation::zhi(), icomp), + bcvalzhi, maxorder, dxinv[2], inhomog, icomp); } } break; } case 7: { // xhi & yhi & zhi - Box bx = amrex::adjCellHi(amrex::adjCellHi(amrex::adjCellHi(vbox,xdir,1),ydir,1),zdir,1); - if (vhi.x == dhi.x && vhi.y == dhi.y && vhi.z == dhi.z) { - vel (vhi.x+1,vhi.y+1,vhi.z+1,icomp) - = vel(vhi.x+1,vhi.y ,vhi.z ,icomp) - + vel(vhi.x ,vhi.y+1,vhi.z ,icomp) - + vel(vhi.x ,vhi.y ,vhi.z+1,icomp) - - vel(vhi.x ,vhi.y ,vhi.z ,icomp) * Real(2.0); - } else if (vhi.x == dhi.x && vhi.y == dhi.y) { - vel (vhi.x+1,vhi.y+1,vhi.z+1,icomp) - = vel(vhi.x+1,vhi.y ,vhi.z+1,icomp) - + vel(vhi.x ,vhi.y+1,vhi.z+1,icomp) - - vel(vhi.x ,vhi.y ,vhi.z+1,icomp); - } else if (vhi.x == dhi.x && vhi.z == dhi.z) { - vel (vhi.x+1,vhi.y+1,vhi.z+1,icomp) - = vel(vhi.x+1,vhi.y+1,vhi.z ,icomp) - + vel(vhi.x ,vhi.y+1,vhi.z+1,icomp) - - vel(vhi.x ,vhi.y+1,vhi.z ,icomp); - } else if (vhi.y == dhi.y && vhi.z == dhi.z) { - vel (vhi.x+1,vhi.y+1,vhi.z+1,icomp) - = vel(vhi.x+1,vhi.y+1,vhi.z ,icomp) - + vel(vhi.x+1,vhi.y ,vhi.z+1,icomp) - - vel(vhi.x+1,vhi.y ,vhi.z ,icomp); - } else if (vhi.x == dhi.x) { - int offset = AMREX_SPACEDIM * oxhi; - mllinop_apply_bc_x(Orientation::high, bx, blen.x, - vel, mxhi, bct[offset+icomp], bcl[offset+icomp], - bcvalxhi, maxorder, dxinv[xdir], inhomog, icomp); - } else if (vhi.y == dhi.y) { - int offset = AMREX_SPACEDIM * oyhi; - mllinop_apply_bc_y(Orientation::high, bx, blen.y, - vel, myhi, bct[offset+icomp], bcl[offset+icomp], - bcvalyhi, maxorder, dxinv[ydir], inhomog, icomp); - } else if (vhi.z == dhi.z) { - int offset = AMREX_SPACEDIM * ozhi; - mllinop_apply_bc_z(Orientation::high, bx, blen.z, - vel, mzhi, bct[offset+icomp], bcl[offset+icomp], - bcvalzhi, maxorder, dxinv[zdir], inhomog, icomp); - } else if (mxhi(vhi.x+1,vhi.y+1,vhi.z+1) != BndryData::covered) { - if (myhi(vhi.x,vhi.y+1,vhi.z+1) == BndryData::covered) { - int offset = AMREX_SPACEDIM * oxhi; - mllinop_apply_bc_x(Orientation::high, bx, blen.x, - vel, mxhi, bct[offset+icomp], bcl[offset+icomp], - bcvalxhi, maxorder, dxinv[xdir], inhomog, icomp); - } else if (mxhi(vhi.x+1,vhi.y,vhi.z+1) == BndryData::covered) { - int offset = AMREX_SPACEDIM * oyhi; - mllinop_apply_bc_y(Orientation::high, bx, blen.y, - vel, myhi, bct[offset+icomp], bcl[offset+icomp], - bcvalyhi, maxorder, dxinv[ydir], inhomog, icomp); - } else { - int offset = AMREX_SPACEDIM * ozhi; - mllinop_apply_bc_z(Orientation::high, bx, blen.z, - vel, mzhi, bct[offset+icomp], bcl[offset+icomp], - bcvalzhi, maxorder, dxinv[zdir], inhomog, icomp); + int i = vhi.x+1; + int j = vhi.y+1; + int k = vhi.z+1; + bool x_interior = myhi(i-1,j ,k ) == BndryData::covered; + bool x_exterior = myhi(i-1,j ,k ) == BndryData::not_covered; + bool y_interior = mxhi(i ,j-1,k ) == BndryData::covered; + bool y_exterior = mxhi(i ,j-1,k ) == BndryData::not_covered; + bool z_interior = mxhi(i ,j ,k-1) == BndryData::covered; + bool z_exterior = mxhi(i ,j ,k-1) == BndryData::not_covered; + if (mxhi(i,j,k) != BndryData::covered && + (!xhi_domain || !yhi_domain || !zhi_domain)) { + if ((x_interior && y_interior && z_interior) || + (x_exterior && y_exterior && z_exterior)) { + mllinop_apply_bc_x(Orientation::high, i,j,k, blen.x, 
vel, mxhi, + bct(Orientation::xhi(), icomp), + bcl(Orientation::xhi(), icomp), + bcvalxhi, maxorder, dxinv[0], inhomog, icomp); + Real tmp = vel(i,j,k,icomp); + mllinop_apply_bc_y(Orientation::high, i,j,k, blen.y, vel, myhi, + bct(Orientation::yhi(), icomp), + bcl(Orientation::yhi(), icomp), + bcvalyhi, maxorder, dxinv[1], inhomog, icomp); + tmp += vel(i,j,k,icomp); + mllinop_apply_bc_z(Orientation::high, i,j,k, blen.z, vel, mzhi, + bct(Orientation::zhi(), icomp), + bcl(Orientation::zhi(), icomp), + bcvalzhi, maxorder, dxinv[2], inhomog, icomp); + vel(i,j,k,icomp) = Real(1./3.)*(tmp+vel(i,j,k,icomp)); + } else if (x_interior && y_interior) { + mllinop_apply_bc_x(Orientation::high, i,j,k, blen.x, vel, mxhi, + bct(Orientation::xhi(), icomp), + bcl(Orientation::xhi(), icomp), + bcvalxhi, maxorder, dxinv[0], inhomog, icomp); + Real tmp = vel(i,j,k,icomp); + mllinop_apply_bc_y(Orientation::high, i,j,k, blen.y, vel, myhi, + bct(Orientation::yhi(), icomp), + bcl(Orientation::yhi(), icomp), + bcvalyhi, maxorder, dxinv[1], inhomog, icomp); + vel(i,j,k,icomp) = 0.5_rt*(tmp+vel(i,j,k,icomp)); + } else if (x_interior && z_interior) { + mllinop_apply_bc_x(Orientation::high, i,j,k, blen.x, vel, mxhi, + bct(Orientation::xhi(), icomp), + bcl(Orientation::xhi(), icomp), + bcvalxhi, maxorder, dxinv[0], inhomog, icomp); + Real tmp = vel(i,j,k,icomp); + mllinop_apply_bc_z(Orientation::high, i,j,k, blen.z, vel, mzhi, + bct(Orientation::zhi(), icomp), + bcl(Orientation::zhi(), icomp), + bcvalzhi, maxorder, dxinv[2], inhomog, icomp); + vel(i,j,k,icomp) = 0.5_rt*(tmp+vel(i,j,k,icomp)); + } else if (y_interior && z_interior) { + mllinop_apply_bc_y(Orientation::high, i,j,k, blen.y, vel, myhi, + bct(Orientation::yhi(), icomp), + bcl(Orientation::yhi(), icomp), + bcvalyhi, maxorder, dxinv[1], inhomog, icomp); + Real tmp = vel(i,j,k,icomp); + mllinop_apply_bc_z(Orientation::high, i,j,k, blen.z, vel, mzhi, + bct(Orientation::zhi(), icomp), + bcl(Orientation::zhi(), icomp), + bcvalzhi, maxorder, dxinv[2], inhomog, icomp); + vel(i,j,k,icomp) = 0.5_rt*(tmp+vel(i,j,k,icomp)); + } else if (x_interior) { + mllinop_apply_bc_x(Orientation::high, i,j,k, blen.x, vel, mxhi, + bct(Orientation::xhi(), icomp), + bcl(Orientation::xhi(), icomp), + bcvalxhi, maxorder, dxinv[0], inhomog, icomp); + } else if (y_interior) { + mllinop_apply_bc_y(Orientation::high, i,j,k, blen.y, vel, myhi, + bct(Orientation::yhi(), icomp), + bcl(Orientation::yhi(), icomp), + bcvalyhi, maxorder, dxinv[1], inhomog, icomp); + } else if (z_interior) { + mllinop_apply_bc_z(Orientation::high, i,j,k, blen.z, vel, mzhi, + bct(Orientation::zhi(), icomp), + bcl(Orientation::zhi(), icomp), + bcvalzhi, maxorder, dxinv[2], inhomog, icomp); } } break; @@ -518,9 +1340,10 @@ void mltensor_fill_corners (int icorner, Box const& vbox, // vbox: the valid box } } } +#endif -AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE -void mltensor_fill_edges (int iedge, Box const& vbox, // vbox: the valid box +inline +void mltensor_fill_edges (Box const& vbox, // vbox: the valid box Array4 const& vel, Array4 const& mxlo, Array4 const& mylo, @@ -534,522 +1357,486 @@ void mltensor_fill_edges (int iedge, Box const& vbox, // vbox: the valid box Array4 const& bcvalxhi, Array4 const& bcvalyhi, Array4 const& bcvalzhi, - GpuArray const& bct, - GpuArray const& bcl, + Array2D const& bct, + Array2D const& bcl, int inhomog, int maxorder, - GpuArray const& dxinv, Box const& domain) noexcept + GpuArray const& dxinv, + Dim3 const& dlo, Dim3 const& dhi) noexcept + { - constexpr int oxlo = 0; - constexpr int 
oylo = 1; - constexpr int ozlo = 2; - constexpr int oxhi = 3; - constexpr int oyhi = 4; - constexpr int ozhi = 5; - constexpr int xdir = 0; - constexpr int ydir = 1; - constexpr int zdir = 2; const auto blen = amrex::length(vbox); const auto vlo = amrex::lbound(vbox); const auto vhi = amrex::ubound(vbox); - const auto dlo = amrex::lbound(domain); - const auto dhi = amrex::ubound(domain); - for (int icomp = 0; icomp < AMREX_SPACEDIM; ++icomp) { - switch (iedge) { - case 0: { - // xlo & ylo - if (vlo.x == dlo.x && vlo.y == dlo.y) { - for (int k = vlo.z; k <= vhi.z; ++k) { - vel (vlo.x-1,vlo.y-1,k,icomp) - = vel(vlo.x ,vlo.y-1,k,icomp) - + vel(vlo.x-1,vlo.y ,k,icomp) - - vel(vlo.x ,vlo.y ,k,icomp); - } - } else if (vlo.x == dlo.x) { - Box bx = amrex::adjCellLo(amrex::adjCellLo(vbox,xdir,1),ydir,1); - int offset = AMREX_SPACEDIM * oxlo; - mllinop_apply_bc_x(Orientation::low, bx, blen.x, - vel, mxlo, bct[offset+icomp], bcl[offset+icomp], - bcvalxlo, maxorder, dxinv[xdir], inhomog, icomp); - } else if (vlo.y == dlo.y) { - Box bx = amrex::adjCellLo(amrex::adjCellLo(vbox,xdir,1),ydir,1); - int offset = AMREX_SPACEDIM * oylo; - mllinop_apply_bc_y(Orientation::low, bx, blen.y, - vel, mylo, bct[offset+icomp], bcl[offset+icomp], - bcvalylo, maxorder, dxinv[ydir], inhomog, icomp); - } else { - for (int k = vlo.z; k <= vhi.z; ++k) { - if (mxlo(vlo.x-1,vlo.y-1,k) != BndryData::covered) { - Box bx(IntVect(vlo.x-1,vlo.y-1,k),IntVect(vlo.x-1,vlo.y-1,k)); - if (mylo(vlo.x,vlo.y-1,k) == BndryData::covered) { - int offset = AMREX_SPACEDIM * oxlo; - mllinop_apply_bc_x(Orientation::low, bx, blen.x, - vel, mxlo, bct[offset+icomp], bcl[offset+icomp], - bcvalxlo, maxorder, dxinv[xdir], inhomog, icomp); - } else { - int offset = AMREX_SPACEDIM * oylo; - mllinop_apply_bc_y(Orientation::low, bx, blen.y, - vel, mylo, bct[offset+icomp], bcl[offset+icomp], - bcvalylo, maxorder, dxinv[ydir], inhomog, icomp); - } - } - } - } - break; + bool xlo_domain = (vlo.x == dlo.x); + bool ylo_domain = (vlo.y == dlo.y); + bool zlo_domain = (vlo.z == dlo.z); + bool xhi_domain = (vhi.x == dhi.x); + bool yhi_domain = (vhi.y == dhi.y); + bool zhi_domain = (vhi.z == dhi.z); + + for (int k = vlo.z; k <= vhi.z; ++k) { + mltensor_fill_edges_xlo_ylo(vlo.x-1, vlo.y-1, k, blen, vel, mxlo, mylo, bcvalxlo, bcvalylo, + bct, bcl, inhomog, maxorder, dxinv, xlo_domain, ylo_domain); + mltensor_fill_edges_xhi_ylo(vhi.x+1, vlo.y-1, k, blen, vel, mxhi, mylo, bcvalxhi, bcvalylo, + bct, bcl, inhomog, maxorder, dxinv, xhi_domain, ylo_domain); + mltensor_fill_edges_xlo_yhi(vlo.x-1, vhi.y+1, k, blen, vel, mxlo, myhi, bcvalxlo, bcvalyhi, + bct, bcl, inhomog, maxorder, dxinv, xlo_domain, yhi_domain); + mltensor_fill_edges_xhi_yhi(vhi.x+1, vhi.y+1, k, blen, vel, mxhi, myhi, bcvalxhi, bcvalyhi, + bct, bcl, inhomog, maxorder, dxinv, xhi_domain, yhi_domain); + } + + for (int j = vlo.y; j <= vhi.y; ++j) { + mltensor_fill_edges_xlo_zlo(vlo.x-1, j, vlo.z-1, blen, vel, mxlo, mzlo, bcvalxlo, bcvalzlo, + bct, bcl, inhomog, maxorder, dxinv, xlo_domain, zlo_domain); + mltensor_fill_edges_xhi_zlo(vhi.x+1, j, vlo.z-1, blen, vel, mxhi, mzlo, bcvalxhi, bcvalzlo, + bct, bcl, inhomog, maxorder, dxinv, xhi_domain, zlo_domain); + mltensor_fill_edges_xlo_zhi(vlo.x-1, j, vhi.z+1, blen, vel, mxlo, mzhi, bcvalxlo, bcvalzhi, + bct, bcl, inhomog, maxorder, dxinv, xlo_domain, zhi_domain); + mltensor_fill_edges_xhi_zhi(vhi.x+1, j, vhi.z+1, blen, vel, mxhi, mzhi, bcvalxhi, bcvalzhi, + bct, bcl, inhomog, maxorder, dxinv, xhi_domain, zhi_domain); + } + + for (int i = vlo.x; i <= vhi.x; ++i) { 
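+            // Editor's note: the y-z edges below stride along x. Each
+            // mltensor_fill_edges_* helper (defined earlier in this diff) is
+            // assumed to mirror the corner logic above: apply the one-sided
+            // mllinop_apply_bc_* stencil for each transverse side whose mask
+            // is covered (interior), averaging the two results with weight
+            // 0.5 when both apply.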
+ mltensor_fill_edges_ylo_zlo(i, vlo.y-1, vlo.z-1, blen, vel, mylo, mzlo, bcvalylo, bcvalzlo, + bct, bcl, inhomog, maxorder, dxinv, ylo_domain, zlo_domain); + mltensor_fill_edges_yhi_zlo(i, vhi.y+1, vlo.z-1, blen, vel, myhi, mzlo, bcvalyhi, bcvalzlo, + bct, bcl, inhomog, maxorder, dxinv, yhi_domain, zlo_domain); + mltensor_fill_edges_ylo_zhi(i, vlo.y-1, vhi.z+1, blen, vel, mylo, mzhi, bcvalylo, bcvalzhi, + bct, bcl, inhomog, maxorder, dxinv, ylo_domain, zhi_domain); + mltensor_fill_edges_yhi_zhi(i, vhi.y+1, vhi.z+1, blen, vel, myhi, mzhi, bcvalyhi, bcvalzhi, + bct, bcl, inhomog, maxorder, dxinv, yhi_domain, zhi_domain); + } +} + +#ifdef AMREX_USE_GPU +AMREX_GPU_DEVICE AMREX_FORCE_INLINE +void mltensor_fill_edges (int const bid, int const tid, int const bdim, + Box const& vbox, // vbox: the valid box + Array4 const& vel, + Array4 const& mxlo, + Array4 const& mylo, + Array4 const& mzlo, + Array4 const& mxhi, + Array4 const& myhi, + Array4 const& mzhi, + Array4 const& bcvalxlo, + Array4 const& bcvalylo, + Array4 const& bcvalzlo, + Array4 const& bcvalxhi, + Array4 const& bcvalyhi, + Array4 const& bcvalzhi, + Array2D const& bct, + Array2D const& bcl, + int inhomog, int maxorder, + GpuArray const& dxinv, + Dim3 const& dlo, Dim3 const& dhi) noexcept +{ + const auto blen = amrex::length(vbox); + const auto vlo = amrex::lbound(vbox); + const auto vhi = amrex::ubound(vbox); + bool xlo_domain = (vlo.x == dlo.x); + bool ylo_domain = (vlo.y == dlo.y); + bool zlo_domain = (vlo.z == dlo.z); + bool xhi_domain = (vhi.x == dhi.x); + bool yhi_domain = (vhi.y == dhi.y); + bool zhi_domain = (vhi.z == dhi.z); + if (bid == 0) { + for (int k = vlo.z + tid; k <= vhi.z; k += bdim) { + mltensor_fill_edges_xlo_ylo(vlo.x-1, vlo.y-1, k, blen, vel, mxlo, mylo, bcvalxlo, bcvalylo, + bct, bcl, inhomog, maxorder, dxinv, xlo_domain, ylo_domain); } - case 1: { - // xhi & ylo - if (vhi.x == dhi.x && vlo.y == dlo.y) { - for (int k = vlo.z; k <= vhi.z; ++k) { - vel (vhi.x+1,vlo.y-1,k,icomp) - = vel(vhi.x ,vlo.y-1,k,icomp) - + vel(vhi.x+1,vlo.y ,k,icomp) - - vel(vhi.x ,vlo.y ,k,icomp); - } - } else if (vhi.x == dhi.x) { - Box bx = amrex::adjCellLo(amrex::adjCellHi(vbox,xdir,1),ydir,1); - int offset = AMREX_SPACEDIM * oxhi; - mllinop_apply_bc_x(Orientation::high, bx, blen.x, - vel, mxhi, bct[offset+icomp], bcl[offset+icomp], - bcvalxhi, maxorder, dxinv[xdir], inhomog, icomp); - } else if (vlo.y == dlo.y) { - Box bx = amrex::adjCellLo(amrex::adjCellHi(vbox,xdir,1),ydir,1); - int offset = AMREX_SPACEDIM * oylo; - mllinop_apply_bc_y(Orientation::low, bx, blen.y, - vel, mylo, bct[offset+icomp], bcl[offset+icomp], - bcvalylo, maxorder, dxinv[ydir], inhomog, icomp); - } else { - for (int k = vlo.z; k <= vhi.z; ++k) { - if (mxhi(vhi.x+1,vlo.y-1,k) != BndryData::covered) { - Box bx(IntVect(vhi.x+1,vlo.y-1,k),IntVect(vhi.x+1,vlo.y-1,k)); - if (mylo(vhi.x,vlo.y-1,k) == BndryData::covered) { - int offset = AMREX_SPACEDIM * oxhi; - mllinop_apply_bc_x(Orientation::high, bx, blen.x, - vel, mxhi, bct[offset+icomp], bcl[offset+icomp], - bcvalxhi, maxorder, dxinv[xdir], inhomog, icomp); - } else { - int offset = AMREX_SPACEDIM * oylo; - mllinop_apply_bc_y(Orientation::low, bx, blen.y, - vel, mylo, bct[offset+icomp], bcl[offset+icomp], - bcvalylo, maxorder, dxinv[ydir], inhomog, icomp); - } - } - } + } else if (bid == 1) { + for (int k = vlo.z + tid; k <= vhi.z; k += bdim) { + mltensor_fill_edges_xhi_ylo(vhi.x+1, vlo.y-1, k, blen, vel, mxhi, mylo, bcvalxhi, bcvalylo, + bct, bcl, inhomog, maxorder, dxinv, xhi_domain, ylo_domain); + } + } else if 
(bid == 2) { + for (int k = vlo.z + tid; k <= vhi.z; k += bdim) { + mltensor_fill_edges_xlo_yhi(vlo.x-1, vhi.y+1, k, blen, vel, mxlo, myhi, bcvalxlo, bcvalyhi, + bct, bcl, inhomog, maxorder, dxinv, xlo_domain, yhi_domain); + } + } else if (bid == 3) { + for (int k = vlo.z + tid; k <= vhi.z; k += bdim) { + mltensor_fill_edges_xhi_yhi(vhi.x+1, vhi.y+1, k, blen, vel, mxhi, myhi, bcvalxhi, bcvalyhi, + bct, bcl, inhomog, maxorder, dxinv, xhi_domain, yhi_domain); + } + } else if (bid == 4) { + for (int j = vlo.y + tid; j <= vhi.y; j += bdim) { + mltensor_fill_edges_xlo_zlo(vlo.x-1, j, vlo.z-1, blen, vel, mxlo, mzlo, bcvalxlo, bcvalzlo, + bct, bcl, inhomog, maxorder, dxinv, xlo_domain, zlo_domain); + } + } else if (bid == 5) { + for (int j = vlo.y + tid; j <= vhi.y; j += bdim) { + mltensor_fill_edges_xhi_zlo(vhi.x+1, j, vlo.z-1, blen, vel, mxhi, mzlo, bcvalxhi, bcvalzlo, + bct, bcl, inhomog, maxorder, dxinv, xhi_domain, zlo_domain); + } + } else if (bid == 6) { + for (int j = vlo.y + tid; j <= vhi.y; j += bdim) { + mltensor_fill_edges_xlo_zhi(vlo.x-1, j, vhi.z+1, blen, vel, mxlo, mzhi, bcvalxlo, bcvalzhi, + bct, bcl, inhomog, maxorder, dxinv, xlo_domain, zhi_domain); + } + } else if (bid == 7) { + for (int j = vlo.y + tid; j <= vhi.y; j += bdim) { + mltensor_fill_edges_xhi_zhi(vhi.x+1, j, vhi.z+1, blen, vel, mxhi, mzhi, bcvalxhi, bcvalzhi, + bct, bcl, inhomog, maxorder, dxinv, xhi_domain, zhi_domain); + } + } else if (bid == 8) { + for (int i = vlo.x + tid; i <= vhi.x; i += bdim) { + mltensor_fill_edges_ylo_zlo(i, vlo.y-1, vlo.z-1, blen, vel, mylo, mzlo, bcvalylo, bcvalzlo, + bct, bcl, inhomog, maxorder, dxinv, ylo_domain, zlo_domain); + } + } else if (bid == 9) { + for (int i = vlo.x + tid; i <= vhi.x; i += bdim) { + mltensor_fill_edges_yhi_zlo(i, vhi.y+1, vlo.z-1, blen, vel, myhi, mzlo, bcvalyhi, bcvalzlo, + bct, bcl, inhomog, maxorder, dxinv, yhi_domain, zlo_domain); + } + } else if (bid == 10) { + for (int i = vlo.x + tid; i <= vhi.x; i += bdim) { + mltensor_fill_edges_ylo_zhi(i, vlo.y-1, vhi.z+1, blen, vel, mylo, mzhi, bcvalylo, bcvalzhi, + bct, bcl, inhomog, maxorder, dxinv, ylo_domain, zhi_domain); + } + } else if (bid == 11) { + for (int i = vlo.x + tid; i <= vhi.x; i += bdim) { + mltensor_fill_edges_yhi_zhi(i, vhi.y+1, vhi.z+1, blen, vel, myhi, mzhi, bcvalyhi, bcvalzhi, + bct, bcl, inhomog, maxorder, dxinv, yhi_domain, zhi_domain); + } + } +} +#endif + +AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE +Real mltensor_dz_on_xface (int i, int j, int k, int n, Array4 const& vel, Real dzi) noexcept +{ + return (vel(i,j,k+1,n)+vel(i-1,j,k+1,n)-vel(i,j,k-1,n)-vel(i-1,j,k-1,n))*(Real(0.25)*dzi); +} + +AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE +Real mltensor_dz_on_yface (int i, int j, int k, int n, Array4 const& vel, Real dzi) noexcept +{ + return (vel(i,j,k+1,n)+vel(i,j-1,k+1,n)-vel(i,j,k-1,n)-vel(i,j-1,k-1,n))*(Real(0.25)*dzi); +} + +AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE +Real mltensor_dx_on_zface (int i, int j, int k, int n, Array4 const& vel, Real dxi) noexcept +{ + return (vel(i+1,j,k,n)+vel(i+1,j,k-1,n)-vel(i-1,j,k,n)-vel(i-1,j,k-1,n))*(Real(0.25)*dxi); +} + +AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE +Real mltensor_dy_on_zface (int i, int j, int k, int n, Array4 const& vel, Real dyi) noexcept +{ + return (vel(i,j+1,k,n)+vel(i,j+1,k-1,n)-vel(i,j-1,k,n)-vel(i,j-1,k-1,n))*(Real(0.25)*dyi); +} + +AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE +void mltensor_cross_terms_fx (Box const& box, Array4 const& fx, + Array4 const& vel, + Array4 const& etax, + Array4 const& kapx, + GpuArray const& dxinv) noexcept +{ 
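+    // Editor's note: only the transverse derivatives (d/dy and d/dz) enter
+    // the x-face cross terms below; the normal d/dx part of the flux is not
+    // computed here. The arithmetic below implies etax component 0 stores
+    // (4./3.)*eta + kappa, since mun = 0.75*(etax(i,j,k,0)-kappa) recovers
+    // the original eta; component 1 carries eta itself (mut).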
+ const Real dyi = dxinv[1]; + const Real dzi = dxinv[2]; + const auto lo = amrex::lbound(box); + const auto hi = amrex::ubound(box); + constexpr Real twoThirds = Real(2./3.); + + for (int k = lo.z; k <= hi.z; ++k) { + for (int j = lo.y; j <= hi.y; ++j) { + AMREX_PRAGMA_SIMD + for (int i = lo.x; i <= hi.x; ++i) { + Real dudy = mltensor_dy_on_xface(i,j,k,0,vel,dyi); + Real dvdy = mltensor_dy_on_xface(i,j,k,1,vel,dyi); + Real dudz = mltensor_dz_on_xface(i,j,k,0,vel,dzi); + Real dwdz = mltensor_dz_on_xface(i,j,k,2,vel,dzi); + Real divu = dvdy + dwdz; + Real xif = kapx(i,j,k); + Real mun = Real(0.75)*(etax(i,j,k,0)-xif); // restore the original eta + Real mut = etax(i,j,k,1); + fx(i,j,k,0) = -mun*(-twoThirds*divu) - xif*divu; + fx(i,j,k,1) = -mut*(dudy); + fx(i,j,k,2) = -mut*(dudz); } - break; } - case 2: { - // xlo & yhi - if (vlo.x == dlo.x && vhi.y == dhi.y) { - for (int k = vlo.z; k <= vhi.z; ++k) { - vel (vlo.x-1,vhi.y+1,k,icomp) - = vel(vlo.x ,vhi.y+1,k,icomp) - + vel(vlo.x-1,vhi.y ,k,icomp) - - vel(vlo.x ,vhi.y ,k,icomp); - } - } else if (vlo.x == dlo.x) { - Box bx = amrex::adjCellHi(amrex::adjCellLo(vbox,xdir,1),ydir,1); - int offset = AMREX_SPACEDIM * oxlo; - mllinop_apply_bc_x(Orientation::low, bx, blen.x, - vel, mxlo, bct[offset+icomp], bcl[offset+icomp], - bcvalxlo, maxorder, dxinv[xdir], inhomog, icomp); - } else if (vhi.y == dhi.y) { - Box bx = amrex::adjCellHi(amrex::adjCellLo(vbox,xdir,1),ydir,1); - int offset = AMREX_SPACEDIM * oyhi; - mllinop_apply_bc_y(Orientation::high, bx, blen.y, - vel, myhi, bct[offset+icomp], bcl[offset+icomp], - bcvalyhi, maxorder, dxinv[ydir], inhomog, icomp); - } else { - for (int k = vlo.z; k <= vhi.z; ++k) { - if (mxlo(vlo.x-1,vhi.y+1,k) != BndryData::covered) { - Box bx(IntVect(vlo.x-1,vhi.y+1,k),IntVect(vlo.x-1,vhi.y+1,k)); - if (myhi(vlo.x,vhi.y+1,k) == BndryData::covered) { - int offset = AMREX_SPACEDIM * oxlo; - mllinop_apply_bc_x(Orientation::low, bx, blen.x, - vel, mxlo, bct[offset+icomp], bcl[offset+icomp], - bcvalxlo, maxorder, dxinv[xdir], inhomog, icomp); - } else { - int offset = AMREX_SPACEDIM * oyhi; - mllinop_apply_bc_y(Orientation::high, bx, blen.y, - vel, myhi, bct[offset+icomp], bcl[offset+icomp], - bcvalyhi, maxorder, dxinv[ydir], inhomog, icomp); - } - } - } + } +} + +AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE +void mltensor_cross_terms_fy (Box const& box, Array4 const& fy, + Array4 const& vel, + Array4 const& etay, + Array4 const& kapy, + GpuArray const& dxinv) noexcept +{ + const Real dxi = dxinv[0]; + const Real dzi = dxinv[2]; + const auto lo = amrex::lbound(box); + const auto hi = amrex::ubound(box); + constexpr Real twoThirds = Real(2./3.); + + for (int k = lo.z; k <= hi.z; ++k) { + for (int j = lo.y; j <= hi.y; ++j) { + AMREX_PRAGMA_SIMD + for (int i = lo.x; i <= hi.x; ++i) { + Real dudx = mltensor_dx_on_yface(i,j,k,0,vel,dxi); + Real dvdx = mltensor_dx_on_yface(i,j,k,1,vel,dxi); + Real dvdz = mltensor_dz_on_yface(i,j,k,1,vel,dzi); + Real dwdz = mltensor_dz_on_yface(i,j,k,2,vel,dzi); + Real divu = dudx + dwdz; + Real xif = kapy(i,j,k); + Real mun = Real(0.75)*(etay(i,j,k,1)-xif); // restore the original eta + Real mut = etay(i,j,k,0); + fy(i,j,k,0) = -mut*(dvdx); + fy(i,j,k,1) = -mun*(-twoThirds*divu) - xif*divu; + fy(i,j,k,2) = -mut*(dvdz); } - break; } - case 3: { - // xhi & yhi - if (vhi.x == dhi.x && vhi.y == dhi.y) { - for (int k = vlo.z; k <= vhi.z; ++k) { - vel (vhi.x+1,vhi.y+1,k,icomp) - = vel(vhi.x ,vhi.y+1,k,icomp) - + vel(vhi.x+1,vhi.y ,k,icomp) - - vel(vhi.x ,vhi.y ,k,icomp); - } - } else if (vhi.x == dhi.x) { - 
Box bx = amrex::adjCellHi(amrex::adjCellHi(vbox,xdir,1),ydir,1); - int offset = AMREX_SPACEDIM * oxhi; - mllinop_apply_bc_x(Orientation::high, bx, blen.x, - vel, mxhi, bct[offset+icomp], bcl[offset+icomp], - bcvalxhi, maxorder, dxinv[xdir], inhomog, icomp); - } else if (vhi.y == dhi.y) { - Box bx = amrex::adjCellHi(amrex::adjCellHi(vbox,xdir,1),ydir,1); - int offset = AMREX_SPACEDIM * oyhi; - mllinop_apply_bc_y(Orientation::high, bx, blen.y, - vel, myhi, bct[offset+icomp], bcl[offset+icomp], - bcvalyhi, maxorder, dxinv[ydir], inhomog, icomp); - } else { - for (int k = vlo.z; k <= vhi.z; ++k) { - if (mxhi(vhi.x+1,vhi.y+1,k) != BndryData::covered) { - Box bx(IntVect(vhi.x+1,vhi.y+1,k),IntVect(vhi.x+1,vhi.y+1,k)); - if (myhi(vhi.x,vhi.y+1,k) == BndryData::covered) { - int offset = AMREX_SPACEDIM * oxhi; - mllinop_apply_bc_x(Orientation::high, bx, blen.x, - vel, mxhi, bct[offset+icomp], bcl[offset+icomp], - bcvalxhi, maxorder, dxinv[xdir], inhomog, icomp); - } else { - int offset = AMREX_SPACEDIM * oyhi; - mllinop_apply_bc_y(Orientation::high, bx, blen.y, - vel, myhi, bct[offset+icomp], bcl[offset+icomp], - bcvalyhi, maxorder, dxinv[ydir], inhomog, icomp); - } - } - } + } +} + +AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE +void mltensor_cross_terms_fz (Box const& box, Array4 const& fz, + Array4 const& vel, + Array4 const& etaz, + Array4 const& kapz, + GpuArray const& dxinv) noexcept +{ + const Real dxi = dxinv[0]; + const Real dyi = dxinv[1]; + const auto lo = amrex::lbound(box); + const auto hi = amrex::ubound(box); + constexpr Real twoThirds = Real(2./3.); + + for (int k = lo.z; k <= hi.z; ++k) { + for (int j = lo.y; j <= hi.y; ++j) { + AMREX_PRAGMA_SIMD + for (int i = lo.x; i <= hi.x; ++i) { + Real dudx = mltensor_dx_on_zface(i,j,k,0,vel,dxi); + Real dwdx = mltensor_dx_on_zface(i,j,k,2,vel,dxi); + Real dvdy = mltensor_dy_on_zface(i,j,k,1,vel,dyi); + Real dwdy = mltensor_dy_on_zface(i,j,k,2,vel,dyi); + Real divu = dudx + dvdy; + Real xif = kapz(i,j,k); + Real mun = Real(0.75)*(etaz(i,j,k,2)-xif); // restore the original eta + Real mut = etaz(i,j,k,0); + fz(i,j,k,0) = -mut*(dwdx); + fz(i,j,k,1) = -mut*(dwdy); + fz(i,j,k,2) = -mun*(-twoThirds*divu) - xif*divu; } - break; } - case 4: { - // xlo & zlo - if (vlo.x == dlo.x && vlo.z == dlo.z) { - for (int j = vlo.y; j <= vhi.y; ++j) { - vel (vlo.x-1,j,vlo.z-1,icomp) - = vel(vlo.x ,j,vlo.z-1,icomp) - + vel(vlo.x-1,j,vlo.z ,icomp) - - vel(vlo.x ,j,vlo.z ,icomp); - } - } else if (vlo.x == dlo.x) { - Box bx = amrex::adjCellLo(amrex::adjCellLo(vbox,xdir,1),zdir,1); - int offset = AMREX_SPACEDIM * oxlo; - mllinop_apply_bc_x(Orientation::low, bx, blen.x, - vel, mxlo, bct[offset+icomp], bcl[offset+icomp], - bcvalxlo, maxorder, dxinv[xdir], inhomog, icomp); - } else if (vlo.z == dlo.z) { - Box bx = amrex::adjCellLo(amrex::adjCellLo(vbox,xdir,1),zdir,1); - int offset = AMREX_SPACEDIM * ozlo; - mllinop_apply_bc_z(Orientation::low, bx, blen.z, - vel, mzlo, bct[offset+icomp], bcl[offset+icomp], - bcvalzlo, maxorder, dxinv[zdir], inhomog, icomp); + } +} + +AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE +Real mltensor_dz_on_xface (int i, int j, int k, int n, Array4 const& vel, Real dzi, + Array4 const& bvxlo, Array4 const& bvxhi, + Array2D const& bct, + Dim3 const& dlo, Dim3 const& dhi) noexcept +{ + Real ddz; + if (i == dlo.x) { + if (bct(Orientation::xlo(),n) == AMREX_LO_DIRICHLET && bvxlo) { + if (k == dlo.z) { + ddz = (bvxlo(i-1,j,k ,n) * Real(-1.5) + + bvxlo(i-1,j,k+1,n) * Real(2.) 
+ + bvxlo(i-1,j,k+2,n) * Real(-0.5)) * dzi; + } else if (k == dhi.z) { + ddz = -(bvxlo(i-1,j,k ,n) * Real(-1.5) + + bvxlo(i-1,j,k-1,n) * Real(2.) + + bvxlo(i-1,j,k-2,n) * Real(-0.5)) * dzi; } else { - for (int j = vlo.y; j <= vhi.y; ++j) { - if (mxlo(vlo.x-1,j,vlo.z-1) != BndryData::covered) { - Box bx(IntVect(vlo.x-1,j,vlo.z-1),IntVect(vlo.x-1,j,vlo.z-1)); - if (mzlo(vlo.x,j,vlo.z-1) == BndryData::covered) { - int offset = AMREX_SPACEDIM * oxlo; - mllinop_apply_bc_x(Orientation::low, bx, blen.x, - vel, mxlo, bct[offset+icomp], bcl[offset+icomp], - bcvalxlo, maxorder, dxinv[xdir], inhomog, icomp); - } else { - int offset = AMREX_SPACEDIM * ozlo; - mllinop_apply_bc_z(Orientation::low, bx, blen.z, - vel, mzlo, bct[offset+icomp], bcl[offset+icomp], - bcvalzlo, maxorder, dxinv[zdir], inhomog, icomp); - } - } - } + ddz = (bvxlo(i-1,j,k+1,n)-bvxlo(i-1,j,k-1,n))*(Real(0.5)*dzi); } - break; + } else if (bct(Orientation::xlo(),n) == AMREX_LO_NEUMANN) { + ddz = (vel(i,j,k+1,n)-vel(i,j,k-1,n))*(Real(0.5)*dzi); + } else { // AMREX_LO_REFLECT_ODD or homogeneous Dirichlet + ddz = Real(0.); } - case 5: { - // xhi & zlo - if (vhi.x == dhi.x && vlo.z == dlo.z) { - for (int j = vlo.y; j <= vhi.y; ++j) { - vel (vhi.x+1,j,vlo.z-1,icomp) - = vel(vhi.x ,j,vlo.z-1,icomp) - + vel(vhi.x+1,j,vlo.z ,icomp) - - vel(vhi.x ,j,vlo.z ,icomp); - } - } else if (vhi.x == dhi.x) { - Box bx = amrex::adjCellLo(amrex::adjCellHi(vbox,xdir,1),zdir,1); - int offset = AMREX_SPACEDIM * oxhi; - mllinop_apply_bc_x(Orientation::high, bx, blen.x, - vel, mxhi, bct[offset+icomp], bcl[offset+icomp], - bcvalxhi, maxorder, dxinv[xdir], inhomog, icomp); - } else if (vlo.z == dlo.z) { - Box bx = amrex::adjCellLo(amrex::adjCellHi(vbox,xdir,1),zdir,1); - int offset = AMREX_SPACEDIM * ozlo; - mllinop_apply_bc_z(Orientation::low, bx, blen.z, - vel, mzlo, bct[offset+icomp], bcl[offset+icomp], - bcvalzlo, maxorder, dxinv[zdir], inhomog, icomp); + } else if (i == dhi.x+1) { + if (bct(Orientation::xhi(),n) == AMREX_LO_DIRICHLET && bvxhi) { + if (k == dlo.z) { + ddz = (bvxhi(i,j,k ,n) * Real(-1.5) + + bvxhi(i,j,k+1,n) * Real(2.) + + bvxhi(i,j,k+2,n) * Real(-0.5)) * dzi; + } else if (k == dhi.z) { + ddz = -(bvxhi(i,j,k ,n) * Real(-1.5) + + bvxhi(i,j,k-1,n) * Real(2.) 
+ + bvxhi(i,j,k-2,n) * Real(-0.5)) * dzi; } else { - for (int j = vlo.y; j <= vhi.y; ++j) { - if (mxhi(vhi.x+1,j,vlo.z-1) != BndryData::covered) { - Box bx(IntVect(vhi.x+1,j,vlo.z-1),IntVect(vhi.x+1,j,vlo.z-1)); - if (mzlo(vhi.x,j,vlo.z-1) == BndryData::covered) { - int offset = AMREX_SPACEDIM * oxhi; - mllinop_apply_bc_x(Orientation::high, bx, blen.x, - vel, mxhi, bct[offset+icomp], bcl[offset+icomp], - bcvalxhi, maxorder, dxinv[xdir], inhomog, icomp); - } else { - int offset = AMREX_SPACEDIM * ozlo; - mllinop_apply_bc_z(Orientation::low, bx, blen.z, - vel, mzlo, bct[offset+icomp], bcl[offset+icomp], - bcvalzlo, maxorder, dxinv[zdir], inhomog, icomp); - } - } - } + ddz = (bvxhi(i,j,k+1,n)-bvxhi(i,j,k-1,n))*(Real(0.5)*dzi); } - break; + } else if (bct(Orientation::xhi(),n) == AMREX_LO_NEUMANN) { + ddz = (vel(i-1,j,k+1,n)-vel(i-1,j,k-1,n))*(Real(0.5)*dzi); + } else { // AMREX_LO_REFLECT_ODD or homogeneous Dirichlet + ddz = Real(0.); } - case 6: { - // xlo & zhi - if (vlo.x == dlo.x && vhi.z == dhi.z) { - for (int j = vlo.y; j <= vhi.y; ++j) { - vel (vlo.x-1,j,vhi.z+1,icomp) - = vel(vlo.x ,j,vhi.z+1,icomp) - + vel(vlo.x-1,j,vhi.z ,icomp) - - vel(vlo.x ,j,vhi.z ,icomp); - } - } else if (vlo.x == dlo.x) { - Box bx = amrex::adjCellHi(amrex::adjCellLo(vbox,xdir,1),zdir,1); - int offset = AMREX_SPACEDIM * oxlo; - mllinop_apply_bc_x(Orientation::low, bx, blen.x, - vel, mxlo, bct[offset+icomp], bcl[offset+icomp], - bcvalxlo, maxorder, dxinv[xdir], inhomog, icomp); - } else if (vhi.z == dhi.z) { - Box bx = amrex::adjCellHi(amrex::adjCellLo(vbox,xdir,1),zdir,1); - int offset = AMREX_SPACEDIM * ozhi; - mllinop_apply_bc_z(Orientation::high, bx, blen.z, - vel, mzhi, bct[offset+icomp], bcl[offset+icomp], - bcvalzhi, maxorder, dxinv[zdir], inhomog, icomp); + } else { + ddz = mltensor_dz_on_xface(i,j,k,n,vel,dzi); + } + return ddz; +} + +AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE +Real mltensor_dz_on_yface (int i, int j, int k, int n, Array4 const& vel, Real dzi, + Array4 const& bvylo, Array4 const& bvyhi, + Array2D const& bct, + Dim3 const& dlo, Dim3 const& dhi) noexcept +{ + Real ddz; + if (j == dlo.y) { + if (bct(Orientation::ylo(),n) == AMREX_LO_DIRICHLET && bvylo) { + if (k == dlo.z) { + ddz = (bvylo(i,j-1,k ,n) * Real(-1.5) + + bvylo(i,j-1,k+1,n) * Real(2.) + + bvylo(i,j-1,k+2,n) * Real(-0.5)) * dzi; + } else if (k == dhi.z) { + ddz = -(bvylo(i,j-1,k ,n) * Real(-1.5) + + bvylo(i,j-1,k-1,n) * Real(2.) 
+ + bvylo(i,j-1,k-2,n) * Real(-0.5)) * dzi; } else { - for (int j = vlo.y; j <= vhi.y; ++j) { - if (mxlo(vlo.x-1,j,vhi.z+1) != BndryData::covered) { - Box bx(IntVect(vlo.x-1,j,vhi.z+1),IntVect(vlo.x-1,j,vhi.z+1)); - if (mzhi(vlo.x,j,vhi.z+1) == BndryData::covered) { - int offset = AMREX_SPACEDIM * oxlo; - mllinop_apply_bc_x(Orientation::low, bx, blen.x, - vel, mxlo, bct[offset+icomp], bcl[offset+icomp], - bcvalxlo, maxorder, dxinv[xdir], inhomog, icomp); - } else { - int offset = AMREX_SPACEDIM * ozhi; - mllinop_apply_bc_z(Orientation::high, bx, blen.z, - vel, mzhi, bct[offset+icomp], bcl[offset+icomp], - bcvalzhi, maxorder, dxinv[zdir], inhomog, icomp); - } - } - } + ddz = (bvylo(i,j-1,k+1,n)-bvylo(i,j-1,k-1,n))*(Real(0.5)*dzi); } - break; + } else if (bct(Orientation::ylo(),n) == AMREX_LO_NEUMANN) { + ddz = (vel(i,j,k+1,n)-vel(i,j,k-1,n))*(Real(0.5)*dzi); + } else { // AMREX_LO_REFLECT_ODD or homogeneous Dirichlet + ddz = Real(0.); } - case 7: { - // xhi & zhi - if (vhi.x == dhi.x && vhi.z == dhi.z) { - for (int j = vlo.y; j <= vhi.y; ++j) { - vel (vhi.x+1,j,vhi.z+1,icomp) - = vel(vhi.x ,j,vhi.z+1,icomp) - + vel(vhi.x+1,j,vhi.z ,icomp) - - vel(vhi.x ,j,vhi.z ,icomp); - } - } else if (vhi.x == dhi.x) { - Box bx = amrex::adjCellHi(amrex::adjCellHi(vbox,xdir,1),zdir,1); - int offset = AMREX_SPACEDIM * oxhi; - mllinop_apply_bc_x(Orientation::high, bx, blen.x, - vel, mxhi, bct[offset+icomp], bcl[offset+icomp], - bcvalxhi, maxorder, dxinv[xdir], inhomog, icomp); - } else if (vhi.z == dhi.z) { - Box bx = amrex::adjCellHi(amrex::adjCellHi(vbox,xdir,1),zdir,1); - int offset = AMREX_SPACEDIM * ozhi; - mllinop_apply_bc_z(Orientation::high, bx, blen.z, - vel, mzhi, bct[offset+icomp], bcl[offset+icomp], - bcvalzhi, maxorder, dxinv[zdir], inhomog, icomp); + } else if (j == dhi.y+1) { + if (bct(Orientation::yhi(),n) == AMREX_LO_DIRICHLET && bvyhi) { + if (k == dlo.z) { + ddz = (bvyhi(i,j,k ,n) * Real(-1.5) + + bvyhi(i,j,k+1,n) * Real(2.) + + bvyhi(i,j,k+2,n) * Real(-0.5)) * dzi; + } else if (k == dhi.z) { + ddz = -(bvyhi(i,j,k ,n) * Real(-1.5) + + bvyhi(i,j,k-1,n) * Real(2.) 
+ + bvyhi(i,j,k-2,n) * Real(-0.5)) * dzi; } else { - for (int j = vlo.y; j <= vhi.y; ++j) { - if (mxhi(vhi.x+1,j,vhi.z+1) != BndryData::covered) { - Box bx(IntVect(vhi.x+1,j,vhi.z+1),IntVect(vhi.x+1,j,vhi.z+1)); - if (mzhi(vhi.x,j,vhi.z+1) == BndryData::covered) { - int offset = AMREX_SPACEDIM * oxhi; - mllinop_apply_bc_x(Orientation::high, bx, blen.x, - vel, mxhi, bct[offset+icomp], bcl[offset+icomp], - bcvalxhi, maxorder, dxinv[xdir], inhomog, icomp); - } else { - int offset = AMREX_SPACEDIM * ozhi; - mllinop_apply_bc_z(Orientation::high, bx, blen.z, - vel, mzhi, bct[offset+icomp], bcl[offset+icomp], - bcvalzhi, maxorder, dxinv[zdir], inhomog, icomp); - } - } - } + ddz = (bvyhi(i,j,k+1,n)-bvyhi(i,j,k-1,n))*(Real(0.5)*dzi); } - break; + } else if (bct(Orientation::yhi(),n) == AMREX_LO_NEUMANN) { + ddz = (vel(i,j-1,k+1,n)-vel(i,j-1,k-1,n))*(Real(0.5)*dzi); + } else { // AMREX_LO_REFLECT_ODD or homogeneous Dirichlet + ddz = Real(0.); } - case 8: { - // ylo & zlo - if (vlo.y == dlo.y && vlo.z == dlo.z) { - for (int i = vlo.x; i <= vhi.x; ++i) { - vel (i,vlo.y-1,vlo.z-1,icomp) - = vel(i,vlo.y ,vlo.z-1,icomp) - + vel(i,vlo.y-1,vlo.z ,icomp) - - vel(i,vlo.y ,vlo.z ,icomp); - } - } else if (vlo.y == dlo.y) { - Box bx = amrex::adjCellLo(amrex::adjCellLo(vbox,ydir,1),zdir,1); - int offset = AMREX_SPACEDIM * oylo; - mllinop_apply_bc_y(Orientation::low, bx, blen.y, - vel, mylo, bct[offset+icomp], bcl[offset+icomp], - bcvalylo, maxorder, dxinv[ydir], inhomog, icomp); - } else if (vlo.z == dlo.z) { - Box bx = amrex::adjCellLo(amrex::adjCellLo(vbox,ydir,1),zdir,1); - int offset = AMREX_SPACEDIM * ozlo; - mllinop_apply_bc_z(Orientation::low, bx, blen.z, - vel, mzlo, bct[offset+icomp], bcl[offset+icomp], - bcvalzlo, maxorder, dxinv[zdir], inhomog, icomp); + } else { + ddz = mltensor_dz_on_yface(i,j,k,n,vel,dzi); + } + return ddz; +} + +AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE +Real mltensor_dx_on_zface (int i, int j, int k, int n, Array4 const& vel, Real dxi, + Array4 const& bvzlo, Array4 const& bvzhi, + Array2D const& bct, + Dim3 const& dlo, Dim3 const& dhi) noexcept +{ + Real ddx; + if (k == dlo.z) { + if (bct(Orientation::zlo(),n) == AMREX_LO_DIRICHLET && bvzlo) { + if (i == dlo.x) { + ddx = (bvzlo(i ,j,k-1,n) * Real(-1.5) + + bvzlo(i+1,j,k-1,n) * Real(2.) + + bvzlo(i+2,j,k-1,n) * Real(-0.5)) * dxi; + } else if (i == dhi.x) { + ddx = -(bvzlo(i ,j,k-1,n) * Real(-1.5) + + bvzlo(i-1,j,k-1,n) * Real(2.) 
+ + bvzlo(i-2,j,k-1,n) * Real(-0.5)) * dxi; } else { - for (int i = vlo.x; i <= vhi.x; ++i) { - if (mylo(i,vlo.y-1,vlo.z-1) != BndryData::covered) { - Box bx(IntVect(i,vlo.y-1,vlo.z-1),IntVect(i,vlo.y-1,vlo.z-1)); - if (mzlo(i,vlo.y,vlo.z-1) == BndryData::covered) { - int offset = AMREX_SPACEDIM * oylo; - mllinop_apply_bc_y(Orientation::low, bx, blen.y, - vel, mylo, bct[offset+icomp], bcl[offset+icomp], - bcvalylo, maxorder, dxinv[ydir], inhomog, icomp); - } else { - int offset = AMREX_SPACEDIM * ozlo; - mllinop_apply_bc_z(Orientation::low, bx, blen.z, - vel, mzlo, bct[offset+icomp], bcl[offset+icomp], - bcvalzlo, maxorder, dxinv[zdir], inhomog, icomp); - } - } - } + ddx = (bvzlo(i+1,j,k-1,n)-bvzlo(i-1,j,k-1,n))*(Real(0.5)*dxi); } - break; + } else if (bct(Orientation::zlo(),n) == AMREX_LO_NEUMANN) { + ddx = (vel(i+1,j,k,n)-vel(i-1,j,k,n))*(Real(0.5)*dxi); + } else { // AMREX_LO_REFLECT_ODD or homogeneous Dirichlet + ddx = Real(0.); } - case 9: { - // yhi & zlo - if (vhi.y == dhi.y && vlo.z == dlo.z) { - for (int i = vlo.x; i <= vhi.x; ++i) { - vel (i,vhi.y+1,vlo.z-1,icomp) - = vel(i,vhi.y ,vlo.z-1,icomp) - + vel(i,vhi.y+1,vlo.z ,icomp) - - vel(i,vhi.y ,vlo.z ,icomp); - } - } else if (vhi.y == dhi.y) { - Box bx = amrex::adjCellLo(amrex::adjCellHi(vbox,ydir,1),zdir,1); - int offset = AMREX_SPACEDIM * oyhi; - mllinop_apply_bc_y(Orientation::high, bx, blen.y, - vel, myhi, bct[offset+icomp], bcl[offset+icomp], - bcvalyhi, maxorder, dxinv[ydir], inhomog, icomp); - } else if (vlo.z == dlo.z) { - Box bx = amrex::adjCellLo(amrex::adjCellHi(vbox,ydir,1),zdir,1); - int offset = AMREX_SPACEDIM * ozlo; - mllinop_apply_bc_z(Orientation::low, bx, blen.z, - vel, mzlo, bct[offset+icomp], bcl[offset+icomp], - bcvalzlo, maxorder, dxinv[zdir], inhomog, icomp); + } else if (k == dhi.z+1) { + if (bct(Orientation::zhi(),n) == AMREX_LO_DIRICHLET && bvzhi) { + if (i == dlo.x) { + ddx = (bvzhi(i ,j,k,n) * Real(-1.5) + + bvzhi(i+1,j,k,n) * Real(2.) + + bvzhi(i+2,j,k,n) * Real(-0.5)) * dxi; + } else if (i == dhi.x) { + ddx = -(bvzhi(i ,j,k,n) * Real(-1.5) + + bvzhi(i-1,j,k,n) * Real(2.) 
+ + bvzhi(i-2,j,k,n) * Real(-0.5)) * dxi; } else { - for (int i = vlo.x; i <= vhi.x; ++i) { - if (myhi(i,vhi.y+1,vlo.z-1) != BndryData::covered) { - Box bx(IntVect(i,vhi.y+1,vlo.z-1),IntVect(i,vhi.y+1,vlo.z-1)); - if (mzlo(i,vhi.y,vlo.z-1) == BndryData::covered) { - int offset = AMREX_SPACEDIM * oyhi; - mllinop_apply_bc_y(Orientation::high, bx, blen.y, - vel, myhi, bct[offset+icomp], bcl[offset+icomp], - bcvalyhi, maxorder, dxinv[ydir], inhomog, icomp); - } else { - int offset = AMREX_SPACEDIM * ozlo; - mllinop_apply_bc_z(Orientation::low, bx, blen.z, - vel, mzlo, bct[offset+icomp], bcl[offset+icomp], - bcvalzlo, maxorder, dxinv[zdir], inhomog, icomp); - } - } - } + ddx = (bvzhi(i+1,j,k,n)-bvzhi(i-1,j,k,n))*(Real(0.5)*dxi); } - break; + } else if (bct(Orientation::zhi(),n) == AMREX_LO_NEUMANN) { + ddx = (vel(i+1,j,k-1,n)-vel(i-1,j,k-1,n))*(Real(0.5)*dxi); + } else { // AMREX_LO_REFLECT_ODD or homogeneous Dirichlet + ddx = Real(0.); } - case 10: { - // ylo & zhi - if (vlo.y == dlo.y && vhi.z == dhi.z) { - for (int i = vlo.x; i <= vhi.x; ++i) { - vel (i,vlo.y-1,vhi.z+1,icomp) - = vel(i,vlo.y ,vhi.z+1,icomp) - + vel(i,vlo.y-1,vhi.z ,icomp) - - vel(i,vlo.y ,vhi.z ,icomp); - } - } else if (vlo.y == dlo.y) { - Box bx = amrex::adjCellHi(amrex::adjCellLo(vbox,ydir,1),zdir,1); - int offset = AMREX_SPACEDIM * oylo; - mllinop_apply_bc_y(Orientation::low, bx, blen.y, - vel, mylo, bct[offset+icomp], bcl[offset+icomp], - bcvalylo, maxorder, dxinv[ydir], inhomog, icomp); - } else if (vhi.z == dhi.z) { - Box bx = amrex::adjCellHi(amrex::adjCellLo(vbox,ydir,1),zdir,1); - int offset = AMREX_SPACEDIM * ozhi; - mllinop_apply_bc_z(Orientation::high, bx, blen.z, - vel, mzhi, bct[offset+icomp], bcl[offset+icomp], - bcvalzhi, maxorder, dxinv[zdir], inhomog, icomp); + } else { + ddx = mltensor_dx_on_zface(i,j,k,n,vel,dxi); + } + return ddx; +} + +AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE +Real mltensor_dy_on_zface (int i, int j, int k, int n, Array4 const& vel, Real dyi, + Array4 const& bvzlo, Array4 const& bvzhi, + Array2D const& bct, + Dim3 const& dlo, Dim3 const& dhi) noexcept +{ + Real ddy; + if (k == dlo.z) { + if (bct(Orientation::zlo(),n) == AMREX_LO_DIRICHLET && bvzlo) { + if (j == dlo.y) { + ddy = (bvzlo(i,j ,k-1,n) * Real(-1.5) + + bvzlo(i,j+1,k-1,n) * Real(2.) + + bvzlo(i,j+2,k-1,n) * Real(-0.5)) * dyi; + } else if (j == dhi.y) { + ddy = -(bvzlo(i,j ,k-1,n) * Real(-1.5) + + bvzlo(i,j-1,k-1,n) * Real(2.) 
+ + bvzlo(i,j-2,k-1,n) * Real(-0.5)) * dyi; } else { - for (int i = vlo.x; i <= vhi.x; ++i) { - if (mylo(i,vlo.y-1,vhi.z+1) != BndryData::covered) { - Box bx(IntVect(i,vlo.y-1,vhi.z+1),IntVect(i,vlo.y-1,vhi.z+1)); - if (mzhi(i,vlo.y,vhi.z+1) == BndryData::covered) { - int offset = AMREX_SPACEDIM * oylo; - mllinop_apply_bc_y(Orientation::low, bx, blen.y, - vel, mylo, bct[offset+icomp], bcl[offset+icomp], - bcvalylo, maxorder, dxinv[ydir], inhomog, icomp); - } else { - int offset = AMREX_SPACEDIM * ozhi; - mllinop_apply_bc_z(Orientation::high, bx, blen.z, - vel, mzhi, bct[offset+icomp], bcl[offset+icomp], - bcvalzhi, maxorder, dxinv[zdir], inhomog, icomp); - } - } - } + ddy = (bvzlo(i,j+1,k-1,n)-bvzlo(i,j-1,k-1,n))*(Real(0.5)*dyi); } - break; + } else if (bct(Orientation::zlo(),n) == AMREX_LO_NEUMANN) { + ddy = (vel(i,j+1,k,n)-vel(i,j-1,k,n))*(Real(0.5)*dyi); + } else { // AMREX_LO_REFLECT_ODD or homogeneous Dirichlet + ddy = Real(0.); } - case 11: { - // yhi & zhi - if (vhi.y == dhi.y && vhi.z == dhi.z) { - for (int i = vlo.x; i <= vhi.x; ++i) { - vel (i,vhi.y+1,vhi.z+1,icomp) - = vel(i,vhi.y ,vhi.z+1,icomp) - + vel(i,vhi.y+1,vhi.z ,icomp) - - vel(i,vhi.y ,vhi.z ,icomp); - } - } else if (vhi.y == dhi.y) { - Box bx = amrex::adjCellHi(amrex::adjCellHi(vbox,ydir,1),zdir,1); - int offset = AMREX_SPACEDIM * oyhi; - mllinop_apply_bc_y(Orientation::high, bx, blen.y, - vel, myhi, bct[offset+icomp], bcl[offset+icomp], - bcvalyhi, maxorder, dxinv[ydir], inhomog, icomp); - } else if (vhi.z == dhi.z) { - Box bx = amrex::adjCellHi(amrex::adjCellHi(vbox,ydir,1),zdir,1); - int offset = AMREX_SPACEDIM * ozhi; - mllinop_apply_bc_z(Orientation::high, bx, blen.z, - vel, mzhi, bct[offset+icomp], bcl[offset+icomp], - bcvalzhi, maxorder, dxinv[zdir], inhomog, icomp); + } else if (k == dhi.z+1) { + if (bct(Orientation::zhi(),n) == AMREX_LO_DIRICHLET && bvzhi) { + if (j == dlo.y) { + ddy = (bvzhi(i,j ,k,n) * Real(-1.5) + + bvzhi(i,j+1,k,n) * Real(2.) + + bvzhi(i,j+2,k,n) * Real(-0.5)) * dyi; + } else if (j == dhi.y) { + ddy = -(bvzhi(i,j ,k,n) * Real(-1.5) + + bvzhi(i,j-1,k,n) * Real(2.) 
+ + bvzhi(i,j-2,k,n) * Real(-0.5)) * dyi; } else { - for (int i = vlo.x; i <= vhi.x; ++i) { - if (myhi(i,vhi.y+1,vhi.z+1) != BndryData::covered) { - Box bx(IntVect(i,vhi.y+1,vhi.z+1),IntVect(i,vhi.y+1,vhi.z+1)); - if (mzhi(i,vhi.y,vhi.z+1) == BndryData::covered) { - int offset = AMREX_SPACEDIM * oyhi; - mllinop_apply_bc_y(Orientation::high, bx, blen.y, - vel, myhi, bct[offset+icomp], bcl[offset+icomp], - bcvalyhi, maxorder, dxinv[ydir], inhomog, icomp); - } else { - int offset = AMREX_SPACEDIM * ozhi; - mllinop_apply_bc_z(Orientation::high, bx, blen.z, - vel, mzhi, bct[offset+icomp], bcl[offset+icomp], - bcvalzhi, maxorder, dxinv[zdir], inhomog, icomp); - } - } - } + ddy = (bvzhi(i,j+1,k,n)-bvzhi(i,j-1,k,n))*(Real(0.5)*dyi); } - break; - } - default: {} + } else if (bct(Orientation::zhi(),n) == AMREX_LO_NEUMANN) { + ddy = (vel(i,j+1,k-1,n)-vel(i,j-1,k-1,n))*(Real(0.5)*dyi); + } else { // AMREX_LO_REFLECT_ODD or homogeneous Dirichlet + ddy = Real(0.); } + } else { + ddy = mltensor_dy_on_zface(i,j,k,n,vel,dyi); } + return ddy; } AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE @@ -1057,7 +1844,13 @@ void mltensor_cross_terms_fx (Box const& box, Array4 const& fx, Array4 const& vel, Array4 const& etax, Array4 const& kapx, - GpuArray const& dxinv) noexcept + GpuArray const& dxinv, + Array4 const& bvxlo, + Array4 const& bvxhi, + Array2D const& bct, + Dim3 const& dlo, Dim3 const& dhi) noexcept { const Real dyi = dxinv[1]; const Real dzi = dxinv[2]; @@ -1067,12 +1860,11 @@ void mltensor_cross_terms_fx (Box const& box, Array4 const& fx, for (int k = lo.z; k <= hi.z; ++k) { for (int j = lo.y; j <= hi.y; ++j) { - AMREX_PRAGMA_SIMD for (int i = lo.x; i <= hi.x; ++i) { - Real dudy = (vel(i,j+1,k,0)+vel(i-1,j+1,k,0)-vel(i,j-1,k,0)-vel(i-1,j-1,k,0))*(Real(0.25)*dyi); - Real dvdy = (vel(i,j+1,k,1)+vel(i-1,j+1,k,1)-vel(i,j-1,k,1)-vel(i-1,j-1,k,1))*(Real(0.25)*dyi); - Real dudz = (vel(i,j,k+1,0)+vel(i-1,j,k+1,0)-vel(i,j,k-1,0)-vel(i-1,j,k-1,0))*(Real(0.25)*dzi); - Real dwdz = (vel(i,j,k+1,2)+vel(i-1,j,k+1,2)-vel(i,j,k-1,2)-vel(i-1,j,k-1,2))*(Real(0.25)*dzi); + Real dudy = mltensor_dy_on_xface(i,j,k,0,vel,dyi,bvxlo,bvxhi,bct,dlo,dhi); + Real dvdy = mltensor_dy_on_xface(i,j,k,1,vel,dyi,bvxlo,bvxhi,bct,dlo,dhi); + Real dudz = mltensor_dz_on_xface(i,j,k,0,vel,dzi,bvxlo,bvxhi,bct,dlo,dhi); + Real dwdz = mltensor_dz_on_xface(i,j,k,2,vel,dzi,bvxlo,bvxhi,bct,dlo,dhi); Real divu = dvdy + dwdz; Real xif = kapx(i,j,k); Real mun = Real(0.75)*(etax(i,j,k,0)-xif); // restore the original eta @@ -1090,7 +1882,13 @@ void mltensor_cross_terms_fy (Box const& box, Array4 const& fy, Array4 const& vel, Array4 const& etay, Array4 const& kapy, - GpuArray const& dxinv) noexcept + GpuArray const& dxinv, + Array4 const& bvylo, + Array4 const& bvyhi, + Array2D const& bct, + Dim3 const& dlo, Dim3 const& dhi) noexcept { const Real dxi = dxinv[0]; const Real dzi = dxinv[2]; @@ -1100,12 +1898,11 @@ void mltensor_cross_terms_fy (Box const& box, Array4 const& fy, for (int k = lo.z; k <= hi.z; ++k) { for (int j = lo.y; j <= hi.y; ++j) { - AMREX_PRAGMA_SIMD for (int i = lo.x; i <= hi.x; ++i) { - Real dudx = (vel(i+1,j,k,0)+vel(i+1,j-1,k,0)-vel(i-1,j,k,0)-vel(i-1,j-1,k,0))*(Real(0.25)*dxi); - Real dvdx = (vel(i+1,j,k,1)+vel(i+1,j-1,k,1)-vel(i-1,j,k,1)-vel(i-1,j-1,k,1))*(Real(0.25)*dxi); - Real dvdz = (vel(i,j,k+1,1)+vel(i,j-1,k+1,1)-vel(i,j,k-1,1)-vel(i,j-1,k-1,1))*(Real(0.25)*dzi); - Real dwdz = (vel(i,j,k+1,2)+vel(i,j-1,k+1,2)-vel(i,j,k-1,2)-vel(i,j-1,k-1,2))*(Real(0.25)*dzi); + Real dudx = 
mltensor_dx_on_yface(i,j,k,0,vel,dxi,bvylo,bvyhi,bct,dlo,dhi); + Real dvdx = mltensor_dx_on_yface(i,j,k,1,vel,dxi,bvylo,bvyhi,bct,dlo,dhi); + Real dvdz = mltensor_dz_on_yface(i,j,k,1,vel,dzi,bvylo,bvyhi,bct,dlo,dhi); + Real dwdz = mltensor_dz_on_yface(i,j,k,2,vel,dzi,bvylo,bvyhi,bct,dlo,dhi); Real divu = dudx + dwdz; Real xif = kapy(i,j,k); Real mun = Real(0.75)*(etay(i,j,k,1)-xif); // restore the original eta @@ -1123,7 +1920,13 @@ void mltensor_cross_terms_fz (Box const& box, Array4 const& fz, Array4 const& vel, Array4 const& etaz, Array4 const& kapz, - GpuArray const& dxinv) noexcept + GpuArray const& dxinv, + Array4 const& bvzlo, + Array4 const& bvzhi, + Array2D const& bct, + Dim3 const& dlo, Dim3 const& dhi) noexcept { const Real dxi = dxinv[0]; const Real dyi = dxinv[1]; @@ -1133,12 +1936,11 @@ void mltensor_cross_terms_fz (Box const& box, Array4 const& fz, for (int k = lo.z; k <= hi.z; ++k) { for (int j = lo.y; j <= hi.y; ++j) { - AMREX_PRAGMA_SIMD for (int i = lo.x; i <= hi.x; ++i) { - Real dudx = (vel(i+1,j,k,0)+vel(i+1,j,k-1,0)-vel(i-1,j,k,0)-vel(i-1,j,k-1,0))*(Real(0.25)*dxi); - Real dwdx = (vel(i+1,j,k,2)+vel(i+1,j,k-1,2)-vel(i-1,j,k,2)-vel(i-1,j,k-1,2))*(Real(0.25)*dxi); - Real dvdy = (vel(i,j+1,k,1)+vel(i,j+1,k-1,1)-vel(i,j-1,k,1)-vel(i,j-1,k-1,1))*(Real(0.25)*dyi); - Real dwdy = (vel(i,j+1,k,2)+vel(i,j+1,k-1,2)-vel(i,j-1,k,2)-vel(i,j-1,k-1,2))*(Real(0.25)*dyi); + Real dudx = mltensor_dx_on_zface(i,j,k,0,vel,dxi,bvzlo,bvzhi,bct,dlo,dhi); + Real dwdx = mltensor_dx_on_zface(i,j,k,2,vel,dxi,bvzlo,bvzhi,bct,dlo,dhi); + Real dvdy = mltensor_dy_on_zface(i,j,k,1,vel,dyi,bvzlo,bvzhi,bct,dlo,dhi); + Real dwdy = mltensor_dy_on_zface(i,j,k,2,vel,dyi,bvzlo,bvzhi,bct,dlo,dhi); Real divu = dudx + dvdy; Real xif = kapz(i,j,k); Real mun = Real(0.75)*(etaz(i,j,k,2)-xif); // restore the original eta @@ -1242,13 +2044,13 @@ void mltensor_vel_grads_fx (Box const& box, Array4 const& fx, Real dvdx = (vel(i,j,k,1) - vel(i-1,j,k,1))*dxi; Real dwdx = (vel(i,j,k,2) - vel(i-1,j,k,2))*dxi; - Real dudy = (vel(i,j+1,k,0)+vel(i-1,j+1,k,0)-vel(i,j-1,k,0)-vel(i-1,j-1,k,0))*(Real(0.25)*dyi); - Real dvdy = (vel(i,j+1,k,1)+vel(i-1,j+1,k,1)-vel(i,j-1,k,1)-vel(i-1,j-1,k,1))*(Real(0.25)*dyi); - Real dwdy = (vel(i,j+1,k,2)+vel(i-1,j+1,k,2)-vel(i,j-1,k,2)-vel(i-1,j-1,k,2))*(Real(0.25)*dyi); + Real dudy = mltensor_dy_on_xface(i,j,k,0,vel,dyi); + Real dvdy = mltensor_dy_on_xface(i,j,k,1,vel,dyi); + Real dwdy = mltensor_dy_on_xface(i,j,k,2,vel,dyi); - Real dudz = (vel(i,j,k+1,0)+vel(i-1,j,k+1,0)-vel(i,j,k-1,0)-vel(i-1,j,k-1,0))*(Real(0.25)*dzi); - Real dvdz = (vel(i,j,k+1,1)+vel(i-1,j,k+1,1)-vel(i,j,k-1,1)-vel(i-1,j,k-1,1))*(Real(0.25)*dzi); - Real dwdz = (vel(i,j,k+1,2)+vel(i-1,j,k+1,2)-vel(i,j,k-1,2)-vel(i-1,j,k-1,2))*(Real(0.25)*dzi); + Real dudz = mltensor_dz_on_xface(i,j,k,0,vel,dzi); + Real dvdz = mltensor_dz_on_xface(i,j,k,1,vel,dzi); + Real dwdz = mltensor_dz_on_xface(i,j,k,2,vel,dzi); fx(i,j,k,0) = dudx; fx(i,j,k,1) = dvdx; @@ -1281,17 +2083,17 @@ void mltensor_vel_grads_fy (Box const& box, Array4 const& fy, AMREX_PRAGMA_SIMD for (int i = lo.x; i <= hi.x; ++i) { - Real dudx = (vel(i+1,j,k,0)+vel(i+1,j-1,k,0)-vel(i-1,j,k,0)-vel(i-1,j-1,k,0))*(Real(0.25)*dxi); - Real dvdx = (vel(i+1,j,k,1)+vel(i+1,j-1,k,1)-vel(i-1,j,k,1)-vel(i-1,j-1,k,1))*(Real(0.25)*dxi); - Real dwdx = (vel(i+1,j,k,2)+vel(i+1,j-1,k,2)-vel(i-1,j,k,2)-vel(i-1,j-1,k,2))*(Real(0.25)*dxi); + Real dudx = mltensor_dx_on_yface(i,j,k,0,vel,dxi); + Real dvdx = mltensor_dx_on_yface(i,j,k,1,vel,dxi); + Real dwdx = mltensor_dx_on_yface(i,j,k,2,vel,dxi); 
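+                    // These helpers (defined in AMReX_MLTensor_K.H) reproduce the inline
+                    // stencils they replace: a centered difference in the transverse
+                    // direction averaged over the two cells sharing the face, e.g. on a
+                    // y-face dudx = (vel(i+1,j,k,0)+vel(i+1,j-1,k,0)
+                    //               -vel(i-1,j,k,0)-vel(i-1,j-1,k,0))*(0.25*dxi).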
Real dudy = (vel(i,j,k,0) - vel(i,j-1,k,0))*dyi; Real dvdy = (vel(i,j,k,1) - vel(i,j-1,k,1))*dyi; Real dwdy = (vel(i,j,k,2) - vel(i,j-1,k,2))*dyi; - Real dudz = (vel(i,j,k+1,0)+vel(i,j-1,k+1,0)-vel(i,j,k-1,0)-vel(i,j-1,k-1,0))*(Real(0.25)*dzi); - Real dvdz = (vel(i,j,k+1,1)+vel(i,j-1,k+1,1)-vel(i,j,k-1,1)-vel(i,j-1,k-1,1))*(Real(0.25)*dzi); - Real dwdz = (vel(i,j,k+1,2)+vel(i,j-1,k+1,2)-vel(i,j,k-1,2)-vel(i,j-1,k-1,2))*(Real(0.25)*dzi); + Real dudz = mltensor_dz_on_yface(i,j,k,0,vel,dzi); + Real dvdz = mltensor_dz_on_yface(i,j,k,1,vel,dzi); + Real dwdz = mltensor_dz_on_yface(i,j,k,2,vel,dzi); fy(i,j,k,0) = dudx; fy(i,j,k,1) = dvdx; @@ -1324,13 +2126,13 @@ void mltensor_vel_grads_fz (Box const& box, Array4 const& fz, AMREX_PRAGMA_SIMD for (int i = lo.x; i <= hi.x; ++i) { - Real dudx = (vel(i+1,j,k,0)+vel(i+1,j,k-1,0)-vel(i-1,j,k,0)-vel(i-1,j,k-1,0))*(Real(0.25)*dxi); - Real dvdx = (vel(i+1,j,k,1)+vel(i+1,j,k-1,1)-vel(i-1,j,k,1)-vel(i-1,j,k-1,1))*(Real(0.25)*dxi); - Real dwdx = (vel(i+1,j,k,2)+vel(i+1,j,k-1,2)-vel(i-1,j,k,2)-vel(i-1,j,k-1,2))*(Real(0.25)*dxi); + Real dudx = mltensor_dx_on_zface(i,j,k,0,vel,dxi); + Real dvdx = mltensor_dx_on_zface(i,j,k,1,vel,dxi); + Real dwdx = mltensor_dx_on_zface(i,j,k,2,vel,dxi); - Real dudy = (vel(i,j+1,k,0)+vel(i,j+1,k-1,0)-vel(i,j-1,k,0)-vel(i,j-1,k-1,0))*(Real(0.25)*dyi); - Real dvdy = (vel(i,j+1,k,1)+vel(i,j+1,k-1,1)-vel(i,j-1,k,1)-vel(i,j-1,k-1,1))*(Real(0.25)*dyi); - Real dwdy = (vel(i,j+1,k,2)+vel(i,j+1,k-1,2)-vel(i,j-1,k,2)-vel(i,j-1,k-1,2))*(Real(0.25)*dyi); + Real dudy = mltensor_dy_on_zface(i,j,k,0,vel,dyi); + Real dvdy = mltensor_dy_on_zface(i,j,k,1,vel,dyi); + Real dwdy = mltensor_dy_on_zface(i,j,k,2,vel,dyi); Real dudz = (vel(i,j,k,0) - vel(i,j,k-1,0))*dzi; Real dvdz = (vel(i,j,k,1) - vel(i,j,k-1,1))*dzi; @@ -1351,6 +2153,138 @@ void mltensor_vel_grads_fz (Box const& box, Array4 const& fz, } } +AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE +void mltensor_vel_grads_fx (Box const& box, Array4 const& fx, + Array4 const& vel, + GpuArray const& dxinv, + Array4 const& bvxlo, + Array4 const& bvxhi, + Array2D const& bct, + Dim3 const& dlo, Dim3 const& dhi) noexcept +{ + const Real dxi = dxinv[0]; + const Real dyi = dxinv[1]; + const Real dzi = dxinv[2]; + const auto lo = amrex::lbound(box); + const auto hi = amrex::ubound(box); + + for (int k = lo.z; k <= hi.z; ++k) { + for (int j = lo.y; j <= hi.y; ++j) { + for (int i = lo.x; i <= hi.x; ++i) { + Real dudx = (vel(i,j,k,0) - vel(i-1,j,k,0))*dxi; + Real dvdx = (vel(i,j,k,1) - vel(i-1,j,k,1))*dxi; + Real dwdx = (vel(i,j,k,2) - vel(i-1,j,k,2))*dxi; + Real dudy = mltensor_dy_on_xface(i,j,k,0,vel,dyi,bvxlo,bvxhi,bct,dlo,dhi); + Real dvdy = mltensor_dy_on_xface(i,j,k,1,vel,dyi,bvxlo,bvxhi,bct,dlo,dhi); + Real dwdy = mltensor_dy_on_xface(i,j,k,2,vel,dyi,bvxlo,bvxhi,bct,dlo,dhi); + Real dudz = mltensor_dz_on_xface(i,j,k,0,vel,dzi,bvxlo,bvxhi,bct,dlo,dhi); + Real dvdz = mltensor_dz_on_xface(i,j,k,1,vel,dzi,bvxlo,bvxhi,bct,dlo,dhi); + Real dwdz = mltensor_dz_on_xface(i,j,k,2,vel,dzi,bvxlo,bvxhi,bct,dlo,dhi); + fx(i,j,k,0) = dudx; + fx(i,j,k,1) = dvdx; + fx(i,j,k,2) = dwdx; + fx(i,j,k,3) = dudy; + fx(i,j,k,4) = dvdy; + fx(i,j,k,5) = dwdy; + fx(i,j,k,6) = dudz; + fx(i,j,k,7) = dvdz; + fx(i,j,k,8) = dwdz; + + } + } + } +} + +AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE +void mltensor_vel_grads_fy (Box const& box, Array4 const& fy, + Array4 const& vel, + GpuArray const& dxinv, + Array4 const& bvylo, + Array4 const& bvyhi, + Array2D const& bct, + Dim3 const& dlo, Dim3 const& dhi) noexcept +{ + const Real dxi = dxinv[0]; + 
const Real dyi = dxinv[1]; + const Real dzi = dxinv[2]; + const auto lo = amrex::lbound(box); + const auto hi = amrex::ubound(box); + + for (int k = lo.z; k <= hi.z; ++k) { + for (int j = lo.y; j <= hi.y; ++j) { + for (int i = lo.x; i <= hi.x; ++i) { + Real dudx = mltensor_dx_on_yface(i,j,k,0,vel,dxi,bvylo,bvyhi,bct,dlo,dhi); + Real dvdx = mltensor_dx_on_yface(i,j,k,1,vel,dxi,bvylo,bvyhi,bct,dlo,dhi); + Real dwdx = mltensor_dx_on_yface(i,j,k,2,vel,dxi,bvylo,bvyhi,bct,dlo,dhi); + Real dudy = (vel(i,j,k,0) - vel(i,j-1,k,0))*dyi; + Real dvdy = (vel(i,j,k,1) - vel(i,j-1,k,1))*dyi; + Real dwdy = (vel(i,j,k,2) - vel(i,j-1,k,2))*dyi; + Real dudz = mltensor_dz_on_yface(i,j,k,0,vel,dzi,bvylo,bvyhi,bct,dlo,dhi); + Real dvdz = mltensor_dz_on_yface(i,j,k,1,vel,dzi,bvylo,bvyhi,bct,dlo,dhi); + Real dwdz = mltensor_dz_on_yface(i,j,k,2,vel,dzi,bvylo,bvyhi,bct,dlo,dhi); + fy(i,j,k,0) = dudx; + fy(i,j,k,1) = dvdx; + fy(i,j,k,2) = dwdx; + fy(i,j,k,3) = dudy; + fy(i,j,k,4) = dvdy; + fy(i,j,k,5) = dwdy; + fy(i,j,k,6) = dudz; + fy(i,j,k,7) = dvdz; + fy(i,j,k,8) = dwdz; + + } + } + } +} + +AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE +void mltensor_vel_grads_fz (Box const& box, Array4 const& fz, + Array4 const& vel, + GpuArray const& dxinv, + Array4 const& bvzlo, + Array4 const& bvzhi, + Array2D const& bct, + Dim3 const& dlo, Dim3 const& dhi) noexcept +{ + const Real dxi = dxinv[0]; + const Real dyi = dxinv[1]; + const Real dzi = dxinv[2]; + const auto lo = amrex::lbound(box); + const auto hi = amrex::ubound(box); + + for (int k = lo.z; k <= hi.z; ++k) { + for (int j = lo.y; j <= hi.y; ++j) { + for (int i = lo.x; i <= hi.x; ++i) { + Real dudx = mltensor_dx_on_zface(i,j,k,0,vel,dxi,bvzlo,bvzhi,bct,dlo,dhi); + Real dvdx = mltensor_dx_on_zface(i,j,k,1,vel,dxi,bvzlo,bvzhi,bct,dlo,dhi); + Real dwdx = mltensor_dx_on_zface(i,j,k,2,vel,dxi,bvzlo,bvzhi,bct,dlo,dhi); + Real dudy = mltensor_dy_on_zface(i,j,k,0,vel,dyi,bvzlo,bvzhi,bct,dlo,dhi); + Real dvdy = mltensor_dy_on_zface(i,j,k,1,vel,dyi,bvzlo,bvzhi,bct,dlo,dhi); + Real dwdy = mltensor_dy_on_zface(i,j,k,2,vel,dyi,bvzlo,bvzhi,bct,dlo,dhi); + Real dudz = (vel(i,j,k,0) - vel(i,j,k-1,0))*dzi; + Real dvdz = (vel(i,j,k,1) - vel(i,j,k-1,1))*dzi; + Real dwdz = (vel(i,j,k,2) - vel(i,j,k-1,2))*dzi; + fz(i,j,k,0) = dudx; + fz(i,j,k,1) = dvdx; + fz(i,j,k,2) = dwdx; + fz(i,j,k,3) = dudy; + fz(i,j,k,4) = dvdy; + fz(i,j,k,5) = dwdy; + fz(i,j,k,6) = dudz; + fz(i,j,k,7) = dvdz; + fz(i,j,k,8) = dwdz; + + } + } + } +} + } #endif diff --git a/Src/LinearSolvers/MLMG/AMReX_MLTensor_K.H b/Src/LinearSolvers/MLMG/AMReX_MLTensor_K.H index 4440f57e7a8..33457ec1ced 100644 --- a/Src/LinearSolvers/MLMG/AMReX_MLTensor_K.H +++ b/Src/LinearSolvers/MLMG/AMReX_MLTensor_K.H @@ -5,6 +5,123 @@ #include #include +namespace amrex { + +AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE +Real mltensor_dy_on_xface (int i, int j, int k, int n, Array4 const& vel, Real dyi) noexcept +{ + return (vel(i,j+1,k,n)+vel(i-1,j+1,k,n)-vel(i,j-1,k,n)-vel(i-1,j-1,k,n))*(Real(0.25)*dyi); +} + +AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE +Real mltensor_dx_on_yface (int i, int j, int k, int n, Array4 const& vel, Real dxi) noexcept +{ + return (vel(i+1,j,k,n)+vel(i+1,j-1,k,n)-vel(i-1,j,k,n)-vel(i-1,j-1,k,n))*(Real(0.25)*dxi); +} + +AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE +Real mltensor_dy_on_xface (int i, int j, int k, int n, Array4 const& vel, Real dyi, + Array4 const& bvxlo, Array4 const& bvxhi, + Array2D const& bct, + Dim3 const& dlo, Dim3 const& dhi) noexcept +{ + Real ddy; + if (i == dlo.x) { + if (bct(Orientation::xlo(),n) == 
AMREX_LO_DIRICHLET && bvxlo) { + if (j == dlo.y) { + ddy = (bvxlo(i-1,j ,k,n) * Real(-1.5) + + bvxlo(i-1,j+1,k,n) * Real(2.) + + bvxlo(i-1,j+2,k,n) * Real(-0.5)) * dyi; + } else if (j == dhi.y) { + ddy = -(bvxlo(i-1,j ,k,n) * Real(-1.5) + + bvxlo(i-1,j-1,k,n) * Real(2.) + + bvxlo(i-1,j-2,k,n) * Real(-0.5)) * dyi; + } else { + ddy = (bvxlo(i-1,j+1,k,n)-bvxlo(i-1,j-1,k,n))*(Real(0.5)*dyi); + } + } else if (bct(Orientation::xlo(),n) == AMREX_LO_NEUMANN) { + ddy = (vel(i,j+1,k,n)-vel(i,j-1,k,n))*(Real(0.5)*dyi); + } else { // AMREX_LO_REFLECT_ODD or homogeneous Dirichlet + ddy = Real(0.); + } + } else if (i == dhi.x+1) { + if (bct(Orientation::xhi(),n) == AMREX_LO_DIRICHLET && bvxhi) { + if (j == dlo.y) { + ddy = (bvxhi(i,j ,k,n) * Real(-1.5) + + bvxhi(i,j+1,k,n) * Real(2.) + + bvxhi(i,j+2,k,n) * Real(-0.5)) * dyi; + } else if (j == dhi.y) { + ddy = -(bvxhi(i,j ,k,n) * Real(-1.5) + + bvxhi(i,j-1,k,n) * Real(2.) + + bvxhi(i,j-2,k,n) * Real(-0.5)) * dyi; + } else { + ddy = (bvxhi(i,j+1,k,n)-bvxhi(i,j-1,k,n))*(Real(0.5)*dyi); + } + } else if (bct(Orientation::xhi(),n) == AMREX_LO_NEUMANN) { + ddy = (vel(i-1,j+1,k,n)-vel(i-1,j-1,k,n))*(Real(0.5)*dyi); + } else { // AMREX_LO_REFLECT_ODD or homogeneous Dirichlet + ddy = Real(0.); + } + } else { + ddy = mltensor_dy_on_xface(i,j,k,n,vel,dyi); + } + return ddy; +} + +AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE +Real mltensor_dx_on_yface (int i, int j, int k, int n, Array4 const& vel, Real dxi, + Array4 const& bvylo, Array4 const& bvyhi, + Array2D const& bct, + Dim3 const& dlo, Dim3 const& dhi) noexcept +{ + Real ddx; + if (j == dlo.y) { + if (bct(Orientation::ylo(),n) == AMREX_LO_DIRICHLET && bvylo) { + if (i == dlo.x) { + ddx = (bvylo(i ,j-1,k,n) * Real(-1.5) + + bvylo(i+1,j-1,k,n) * Real(2.) + + bvylo(i+2,j-1,k,n) * Real(-0.5)) * dxi; + } else if (i == dhi.x) { + ddx = -(bvylo(i ,j-1,k,n) * Real(-1.5) + + bvylo(i-1,j-1,k,n) * Real(2.) + + bvylo(i-2,j-1,k,n) * Real(-0.5)) * dxi; + } else { + ddx = (bvylo(i+1,j-1,k,n)-bvylo(i-1,j-1,k,n))*(Real(0.5)*dxi); + } + } else if (bct(Orientation::ylo(),n) == AMREX_LO_NEUMANN) { + ddx = (vel(i+1,j,k,n)-vel(i-1,j,k,n))*(Real(0.5)*dxi); + } else { // AMREX_LO_REFLECT_ODD or homogeneous Dirichlet + ddx = Real(0.); + } + } else if (j == dhi.y+1) { + if (bct(Orientation::yhi(),n) == AMREX_LO_DIRICHLET && bvyhi) { + if (i == dlo.x) { + ddx = (bvyhi(i ,j,k,n) * Real(-1.5) + + bvyhi(i+1,j,k,n) * Real(2.) + + bvyhi(i+2,j,k,n) * Real(-0.5)) * dxi; + } else if (i == dhi.x) { + ddx = -(bvyhi(i ,j,k,n) * Real(-1.5) + + bvyhi(i-1,j,k,n) * Real(2.) 
+ + bvyhi(i-2,j,k,n) * Real(-0.5)) * dxi; + } else { + ddx = (bvyhi(i+1,j,k,n)-bvyhi(i-1,j,k,n))*(Real(0.5)*dxi); + } + } else if (bct(Orientation::yhi(),n) == AMREX_LO_NEUMANN) { + ddx = (vel(i+1,j-1,k,n)-vel(i-1,j-1,k,n))*(Real(0.5)*dxi); + } else { // AMREX_LO_REFLECT_ODD or homogeneous Dirichlet + ddx = Real(0.); + } + } else { + ddx = mltensor_dx_on_yface(i,j,k,n,vel,dxi); + } + return ddx; +} +} + #if (AMREX_SPACEDIM == 1) #include #elif (AMREX_SPACEDIM == 2) diff --git a/Src/LinearSolvers/OpenBC/AMReX_OpenBC.H b/Src/LinearSolvers/OpenBC/AMReX_OpenBC.H new file mode 100644 index 00000000000..00d589e34b4 --- /dev/null +++ b/Src/LinearSolvers/OpenBC/AMReX_OpenBC.H @@ -0,0 +1,141 @@ +#ifndef AMREX_OPENBC_H_ +#define AMREX_OPENBC_H_ +#include + +#include +#include + +namespace amrex +{ + +namespace openbc { + + static constexpr int M = 7; // highest order of moments + static constexpr int P = 3; + + struct Moments + { + typedef GpuArray array_type; + array_type mom; + Real x, y, z; + Orientation face; + }; + + struct MomTag + { + Array4 gp; + Box b2d; + Orientation face; + int offset; + }; + + std::ostream& operator<< (std::ostream& os, Moments const& mom); +} + +#if defined(AMREX_USE_CUDA) || defined(AMREX_USE_HIP) +template<> +struct Gpu::SharedMemory +{ + AMREX_GPU_DEVICE openbc::Moments::array_type* dataPtr () noexcept { + AMREX_HIP_OR_CUDA(HIP_DYNAMIC_SHARED(openbc::Moments::array_type,amrex_openbc_momarray);, + extern __shared__ openbc::Moments::array_type amrex_openbc_momarray[];) + return amrex_openbc_momarray; + } +}; +#endif + +/** + * \brief Open Boundary Poisson Solver + * + * References: + * (1) The Solution of Poisson's Equation for Isolated Source + * Distributions, R. A. James, 1977, JCP 25, 71 + * (2) A Local Corrections Algorithm for Solving Poisson's Equation in Three + * Dimensions, P. McCorquodale, P. Colella, G. T. Balls, & S. B. 
Baden, + 2007, Communications in Applied Mathematics and Computational Science, + 2, 1, 57-81 + */ +class OpenBCSolver +{ +public: + OpenBCSolver (); + + OpenBCSolver (const Vector& a_geom, + const Vector& a_grids, + const Vector& a_dmap, + const LPInfo& a_info = LPInfo()); + + ~OpenBCSolver (); + + OpenBCSolver (const OpenBCSolver&) = delete; + OpenBCSolver (OpenBCSolver&&) = delete; + OpenBCSolver& operator= (const OpenBCSolver&) = delete; + OpenBCSolver& operator= (OpenBCSolver&&) = delete; + + void define (const Vector& a_geom, + const Vector& a_grids, + const Vector& a_dmap, + const LPInfo& a_info = LPInfo()); + + void setVerbose (int v) noexcept; + void setBottomVerbose (int v) noexcept; + + void useHypre (bool use_hypre) noexcept; + + Real solve (const Vector& a_sol, const Vector& a_rhs, + Real a_tol_rel, Real a_tol_abs); + +public: // public for cuda + + void compute_moments (Gpu::DeviceVector& moments); + void compute_potential (Gpu::DeviceVector const& moments); + void interpolate_potential (MultiFab& solg); + +private: + +#ifdef AMREX_USE_MPI + void bcast_moments (Gpu::DeviceVector& moments); +#endif + + int m_verbose = 0; + int m_bottom_verbose = 0; + Vector m_geom; + Vector m_grids; + Vector m_dmap; + LPInfo m_info; + std::unique_ptr m_poisson_1; + std::unique_ptr m_poisson_2; + std::unique_ptr m_mlmg_1; + std::unique_ptr m_mlmg_2; + BottomSolver m_bottom_solver_type = BottomSolver::bicgstab; + + int m_coarsen_ratio = 0; + Array m_dpdn; + Gpu::PinnedVector m_momtags_h; +#ifdef AMREX_USE_GPU + Gpu::DeviceVector m_momtags_d; + Gpu::PinnedVector m_ngpublocks_h; + Gpu::DeviceVector m_ngpublocks_d; + int m_nthreads_momtag; +#endif + + int m_nblocks_local = 0; + int m_nblocks = 0; +#ifdef AMREX_USE_MPI + Vector m_countvec; + Vector m_offset; +#endif + + IntVect m_ngrowdomain; + MultiFab m_crse_grown_faces_phi; + MultiFab m_phind; + BoxArray m_bag; + + BoxArray m_ba_all; + DistributionMapping m_dm_all; + Geometry m_geom_all; +}; + +} + +#endif diff --git a/Src/LinearSolvers/OpenBC/AMReX_OpenBC.cpp b/Src/LinearSolvers/OpenBC/AMReX_OpenBC.cpp new file mode 100644 index 00000000000..9e320d7a55f --- /dev/null +++ b/Src/LinearSolvers/OpenBC/AMReX_OpenBC.cpp @@ -0,0 +1,864 @@ +#include +#include +#include + +namespace amrex +{ + +OpenBCSolver::OpenBCSolver () {} + +OpenBCSolver::OpenBCSolver (const Vector& a_geom, + const Vector& a_grids, + const Vector& a_dmap, + const LPInfo& a_info) +{ + define(a_geom, a_grids, a_dmap, a_info); +} + +OpenBCSolver::~OpenBCSolver () {} + +void OpenBCSolver::define (const Vector& a_geom, + const Vector& a_grids, + const Vector& a_dmap, + const LPInfo& a_info) +{ + BL_PROFILE("OpenBCSolver::define()"); + + m_geom = a_geom; + m_grids = a_grids; + m_dmap = a_dmap; + m_info = a_info; + for (auto& grids : m_grids) { + grids.enclosedCells(); + } + + Box const domain0 = m_geom[0].Domain(); + m_coarsen_ratio = 8; + AMREX_ALWAYS_ASSERT(domain0.coarsenable(m_coarsen_ratio)); + int N1d = static_cast(std::round(std::pow(domain0.d_numPts(),1./3.))); + while (domain0.coarsenable(m_coarsen_ratio*2) + && 4*m_coarsen_ratio*m_coarsen_ratio <= N1d) { + m_coarsen_ratio *= 2; + } + + int ntags = 0; + for (int idim = 0; idim < AMREX_SPACEDIM; ++idim) { + Box lo = amrex::coarsen(amrex::bdryLo(domain0, idim), m_coarsen_ratio); + Box hi = amrex::coarsen(amrex::bdryHi(domain0, idim), m_coarsen_ratio); + BoxList bl({lo,hi}); + IntVect chunk = lo.length(); + while (bl.size() < ParallelContext::NProcsSub()) { + IntVect chunk_prev = chunk; + for (int jdim = AMREX_SPACEDIM-1; jdim >=
0; --jdim) { + if (jdim != idim) { + int new_chunk_size = chunk[jdim] / 2; + if (bl.size() < ParallelContext::NProcsSub() + && new_chunk_size > 0) { + chunk[jdim] = new_chunk_size; + bl.maxSize(chunk); + } + } + } + if (chunk == chunk_prev) { + break; + } + } + int mgs = std::max(1, 256/m_coarsen_ratio); + bl.maxSize(mgs); + bl.refine(m_coarsen_ratio); + BoxArray ba2d(std::move(bl)); + DistributionMapping dm2d{ba2d}; + m_dpdn[idim].define(ba2d, dm2d, 1, 0); + ntags += m_dpdn[idim].local_size(); + } + + m_momtags_h.reserve(ntags); + int nblocks = 0; + for (int idim = 0; idim < AMREX_SPACEDIM; ++idim) { + for (MFIter mfi(m_dpdn[idim]); mfi.isValid(); ++mfi) { + Box const& b2d = mfi.validbox(); + Orientation::Side side = (b2d.smallEnd(idim) == domain0.smallEnd(idim)) + ? Orientation::low : Orientation::high; + Orientation face(idim, side); + m_momtags_h.push_back({m_dpdn[idim].const_array(mfi), b2d, face, + nblocks}); + nblocks += static_cast(b2d.numPts()) + / (m_coarsen_ratio*m_coarsen_ratio); + } + } + m_nblocks_local = nblocks; + +#ifdef AMREX_USE_GPU + if (ntags > 0) { + m_momtags_d.resize(ntags); + Gpu::copyAsync(Gpu::hostToDevice, m_momtags_h.begin(), m_momtags_h.end(), m_momtags_d.begin()); + + m_nthreads_momtag = (m_coarsen_ratio == 8) ? 64 : 128; + int ntotgpublocks = 0; + m_ngpublocks_h.reserve(ntags+1); + for (auto const& tag : m_momtags_h) { + m_ngpublocks_h.push_back(ntotgpublocks); + Box cb2d = amrex::coarsen(tag.b2d, m_coarsen_ratio); + ntotgpublocks += static_cast(cb2d.numPts()); + } + m_ngpublocks_h.push_back(ntotgpublocks); + m_ngpublocks_d.resize(m_ngpublocks_h.size()); + Gpu::copyAsync(Gpu::hostToDevice, m_ngpublocks_h.begin(), m_ngpublocks_h.end(), + m_ngpublocks_d.begin()); + } +#endif + + auto const dx = m_geom[0].CellSize(); + Real dmax = amrex::max(std::sqrt(dx[0]*dx[0]+dx[1]*dx[1]), + std::sqrt(dx[0]*dx[0]+dx[2]*dx[2]), + std::sqrt(dx[1]*dx[1]+dx[2]*dx[2])); + m_ngrowdomain[0] = static_cast(std::ceil(dmax/dx[0])) * m_coarsen_ratio; + m_ngrowdomain[1] = static_cast(std::ceil(dmax/dx[1])) * m_coarsen_ratio; + m_ngrowdomain[2] = static_cast(std::ceil(dmax/dx[2])) * m_coarsen_ratio; + // This is the minimal size we need to embiggen the domain. + + Box const domain1 = amrex::grow(domain0, m_ngrowdomain); + BoxList bl_crse_grown_faces(IndexType::TheNodeType()); + for (OrientationIter oit; oit.isValid(); ++oit) { + Orientation face = oit(); + Box face_box = amrex::surroundingNodes(amrex::bdryNode(domain1,face)); + face_box.coarsen(m_coarsen_ratio); + for (int idim = 0; idim < AMREX_SPACEDIM; ++idim) { + if (idim != face.coordDir()) { + face_box.grow(idim,openbc::P); + } + } + bl_crse_grown_faces.push_back(face_box); + } + + bl_crse_grown_faces.maxSize(16); // xxxxx make this a parameter? + BoxArray ba_crse_grown_faces(std::move(bl_crse_grown_faces)); + DistributionMapping dm_crse_grown_faces(ba_crse_grown_faces); + m_crse_grown_faces_phi.define(ba_crse_grown_faces, dm_crse_grown_faces, 1, 0); + + BoxList blg = amrex::boxDiff(domain1, domain0); + blg.maxSize(std::max(64,m_coarsen_ratio)); // xxxxx make this a parameter? 
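+    // blg covers the shell between the original domain (domain0) and the
+    // grown domain (domain1). These boxes become m_bag: they carry a zero
+    // right-hand side in the second solve, and m_phind (defined on their
+    // coarsened nodes below) holds the multipole potential that is later
+    // interpolated onto the faces of domain1 as Dirichlet boundary values.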
+ m_bag = BoxArray(std::move(blg)); + DistributionMapping dmg(m_bag); + m_phind.define(amrex::coarsen(amrex::convert(m_bag,IntVect(1)),m_coarsen_ratio), + dmg, 1, openbc::P); + + BoxList bl0 = m_grids[0].boxList(); + BoxList bl1 = m_bag.boxList(); + Vector p0 = m_dmap[0].ProcessorMap(); + Vector p1 = dmg.ProcessorMap(); + bl0.join(bl1); + p0.insert(p0.end(), p1.begin(), p1.end()); + IntVect const offset = -domain1.smallEnd(); + for (auto& b : bl0) { + b.shift(offset); + } + m_ba_all = BoxArray(std::move(bl0)); + m_dm_all = DistributionMapping(std::move(p0)); + + auto const problo = m_geom[0].ProbLo(); + auto const probhi = m_geom[0].ProbHi(); + std::array problo_all, probhi_all; + for (int idim = 0; idim < AMREX_SPACEDIM; ++idim) { + problo_all[idim] = problo[idim] - m_ngrowdomain[idim]*dx[idim]; + probhi_all[idim] = probhi[idim] + m_ngrowdomain[idim]*dx[idim]; + } + m_geom_all = Geometry(amrex::shift(domain1,offset), + RealBox(problo_all,probhi_all), + m_geom[0].Coord(), m_geom[0].isPeriodic()); +} + +void OpenBCSolver::setVerbose (int v) noexcept +{ + m_verbose = v; +} + +void OpenBCSolver::setBottomVerbose (int v) noexcept +{ + m_bottom_verbose = v; +} + +void OpenBCSolver::useHypre (bool use_hypre) noexcept +{ + if (use_hypre) { + m_bottom_solver_type = BottomSolver::hypre; + m_info.setMaxCoarseningLevel(0); +#ifndef AMREX_USE_HYPRE + amrex::Abort("OpenBCSolver: Must enable Hypre support to use it."); +#endif + } +} + +Real OpenBCSolver::solve (const Vector& a_sol, + const Vector& a_rhs, + Real a_tol_rel, Real a_tol_abs) +{ + BL_PROFILE("OpenBCSolver::solve()"); + + auto solve_start_time = amrex::second(); + + int nlevels = m_geom.size(); + + BL_PROFILE_VAR("OpenBCSolver::MG1", blp_mg1); + + if (m_poisson_1 == nullptr) { + m_poisson_1 = std::make_unique(m_geom, m_grids, m_dmap, m_info); + m_poisson_1->setVerbose(m_verbose); + m_poisson_1->setMaxOrder(4); + m_poisson_1->setDomainBC({AMREX_D_DECL(LinOpBCType::Dirichlet, + LinOpBCType::Dirichlet, + LinOpBCType::Dirichlet)}, + {AMREX_D_DECL(LinOpBCType::Dirichlet, + LinOpBCType::Dirichlet, + LinOpBCType::Dirichlet)}); + for (int ilev = 0; ilev < nlevels; ++ilev) { + m_poisson_1->setLevelBC(ilev, nullptr); + } + + m_mlmg_1 = std::make_unique(*m_poisson_1); + m_mlmg_1->setVerbose(m_verbose); + m_mlmg_1->setBottomVerbose(m_bottom_verbose); + m_mlmg_1->setBottomSolver(m_bottom_solver_type); +#ifdef AMREX_USE_HYPRE + if (m_bottom_solver_type == BottomSolver::hypre) { + m_mlmg_1->setHypreInterface(Hypre::Interface::structed); + } +#endif + } + m_mlmg_1->solve(a_sol, a_rhs, a_tol_rel, a_tol_abs); + + BL_PROFILE_VAR_STOP(blp_mg1); + + Array dpdn_tmp; + for (int idim = 0; idim < AMREX_SPACEDIM; ++idim) { + dpdn_tmp[idim].define(amrex::convert(m_grids[0], + IntVect::TheDimensionVector(idim)), + m_dmap[0], 1, 0); + } + m_poisson_1->get_dpdn_on_domain_faces(GetArrOfPtrs(dpdn_tmp), *a_sol[0]); + + for (int idim = 0; idim < AMREX_SPACEDIM; ++idim) { + m_dpdn[idim].ParallelCopy(dpdn_tmp[idim]); + } + + { + Gpu::DeviceVector moments(m_nblocks_local); + compute_moments(moments); + compute_potential(moments); + } + + MultiFab rhsg(m_bag, m_phind.DistributionMap(), 1, a_rhs[0]->nGrowVect()); + rhsg.setVal(0._rt); + + MultiFab solg(m_bag, m_phind.DistributionMap(), 1, 1); + solg.setVal(0._rt); + interpolate_potential(solg); + + const int nboxes0 = m_grids[0].size(); + MultiFab sol_all(m_ba_all, m_dm_all, 1, solg.nGrowVect(), + MFInfo().SetAlloc(false)); + MultiFab rhs_all(m_ba_all, m_dm_all, 1, rhsg.nGrowVect(), + MFInfo().SetAlloc(false)); + + Box const 
domain1 = amrex::grow(m_geom[0].Domain(), m_ngrowdomain); + IntVect const offset = -domain1.smallEnd(); + for (MFIter mfi(sol_all); mfi.isValid(); ++mfi) { + const int index = mfi.index(); + FArrayBox solfab, rhsfab; + if (index < nboxes0) { + FArrayBox& sfab0 = (*a_sol[0])[index]; + if (sol_all.nGrowVect() == a_sol[0]->nGrowVect()) { + solfab = FArrayBox(sfab0, amrex::make_alias, 0, 1); + } else { + Box b = sfab0.box(); + b.grow(sol_all.nGrowVect()-a_sol[0]->nGrowVect()); + solfab.resize(b,1); + solfab.template setVal(0._rt); + } + rhsfab = FArrayBox((*a_rhs[0])[index], amrex::make_alias, 0, 1); + } else { + solfab = FArrayBox(solg[index-nboxes0], amrex::make_alias, 0, 1); + rhsfab = FArrayBox(rhsg[index-nboxes0], amrex::make_alias, 0, 1); + } + solfab.shift(offset); + rhsfab.shift(offset); + sol_all.setFab(index, std::move(solfab)); + rhs_all.setFab(index, std::move(rhsfab)); + } + + BL_PROFILE_VAR("OpenBCSolver::MG2", blp_mg2); + + if (m_poisson_2 == nullptr) { + Vector geom_all = m_geom; + Vector grids_all = m_grids; + Vector dmap_all = m_dmap; + geom_all[0] = m_geom_all; + grids_all[0] = m_ba_all; + dmap_all[0] = m_dm_all; + m_poisson_2 = std::make_unique(geom_all, grids_all, dmap_all, + m_info); + m_poisson_2->setVerbose(m_verbose); + m_poisson_2->setMaxOrder(4); + m_poisson_2->setDomainBC({AMREX_D_DECL(LinOpBCType::Dirichlet, + LinOpBCType::Dirichlet, + LinOpBCType::Dirichlet)}, + {AMREX_D_DECL(LinOpBCType::Dirichlet, + LinOpBCType::Dirichlet, + LinOpBCType::Dirichlet)}); + m_poisson_2->setLevelBC(0, &sol_all); + for (int ilev = 1; ilev < nlevels; ++ilev) { + m_poisson_2->setLevelBC(ilev, nullptr); + } + + m_mlmg_2 = std::make_unique(*m_poisson_2); + m_mlmg_2->setVerbose(m_verbose); + m_mlmg_2->setBottomVerbose(m_bottom_verbose); + m_mlmg_2->setBottomSolver(m_bottom_solver_type); +#ifdef AMREX_USE_HYPRE + if (m_bottom_solver_type == BottomSolver::hypre) { + m_mlmg_2->setHypreInterface(Hypre::Interface::structed); + } +#endif + } + Vector solv_all = a_sol; + Vector rhsv_all = a_rhs; + solv_all[0] = &sol_all; + rhsv_all[0] = &rhs_all; + Real err = m_mlmg_2->solve(solv_all, rhsv_all, a_tol_rel, a_tol_abs); + + BL_PROFILE_VAR_STOP(blp_mg2); + + if (sol_all.nGrowVect() != a_sol[0]->nGrowVect()) { +#ifdef AMREX_USE_OMP +#pragma omp parallel if (Gpu::notInLaunchRegion()) +#endif + for (MFIter mfi(*a_sol[0], TilingIfNotGPU()); mfi.isValid(); ++mfi) + { + Box const& bx = mfi.tilebox(); + Array4 const& sall = sol_all.const_array(mfi.index()); + Array4 const& s = a_sol[0]->array(mfi); + AMREX_HOST_DEVICE_PARALLEL_FOR_3D(bx, i, j, k, + { + s(i,j,k) = sall(i,j,k); + }); + } + } + + auto solve_stop_time = amrex::second(); + if (m_verbose >= 1) { + amrex::Print() << "OpenBCSolver time = " + << solve_stop_time - solve_start_time << "\n"; + } + + return err; +} + +void OpenBCSolver::compute_moments (Gpu::DeviceVector& moments) +{ + BL_PROFILE("OpenBCSolver::comp_mom()"); + + auto const problo = m_geom[0].ProbLoArray(); + auto const probhi = m_geom[0].ProbHiArray(); + auto const dx = m_geom[0].CellSizeArray(); + +#ifdef AMREX_USE_GPU + if (m_momtags_h.size() > 0) + { + int crse_ratio = m_coarsen_ratio; + int ntags = m_momtags_h.size(); + openbc::Moments* pm = moments.data(); + openbc::MomTag const* ptag = m_momtags_d.data(); + int const* pnblks = m_ngpublocks_d.data(); + std::size_t shared_mem_bytes = m_nthreads_momtag * sizeof(openbc::Moments::array_type); + +#ifdef AMREX_USE_DPCPP + amrex::ignore_unused(problo,probhi,dx,crse_ratio,ntags,pm,ptag,pnblks, + shared_mem_bytes); + amrex::Abort("xxxx 
DPCPP todo: openbc compute_moments"); +#else + amrex::launch(m_ngpublocks_h.back(), m_nthreads_momtag, shared_mem_bytes, Gpu::gpuStream(), + [=] AMREX_GPU_DEVICE () noexcept + { + Gpu::SharedMemory gsm; + openbc::Moments::array_type* const shared = gsm.dataPtr(); + openbc::Moments::array_type& tmom = shared[threadIdx.x]; + for (int i = 0; i < (openbc::M+1)*(openbc::M+2)/2; ++i) { + tmom[i] = Real(0.); + } + + int tag_id = amrex::bisect(pnblks, 0, ntags, static_cast(blockIdx.x)); + int iblock = blockIdx.x - pnblks[tag_id]; // iblock'th gpublock on this box. + auto const& tag = ptag[tag_id]; + openbc::Moments& mom = pm[tag.offset+iblock]; + if (tag.face.coordDir() == 0) { + int const nby = tag.b2d.length(1) / crse_ratio; + int const kb = iblock / nby; + int const jb = iblock - kb*nby; + int const i = tag.b2d.smallEnd(0); + int const jlo = tag.b2d.smallEnd(1) + jb*crse_ratio; + int const klo = tag.b2d.smallEnd(2) + kb*crse_ratio; + Real const fac = dx[1]*dx[2]; + Real const xc = tag.face.isLow() ? problo[0] : probhi[0]; + for (int icell = threadIdx.x; icell < crse_ratio*crse_ratio; icell += blockDim.x) { + int k = icell/crse_ratio; + int j = icell - k*crse_ratio; + Real const yy = (j-crse_ratio/2+Real(0.5))*dx[1]; + Real const zz = (k-crse_ratio/2+Real(0.5))*dx[2]; + j += jlo; + k += klo; + Real const charge = tag.gp(i,j,k) * fac; + Real zpow = Real(1.); + int m = 0; + for (int q = 0; q <= openbc::M; ++q) { + Real ypow = Real(1.); + for (int p = 0; p <= openbc::M-q; ++p) { + tmom[m++] += charge*ypow*zpow; + ypow *= yy; + } + zpow *= zz; + } + } + if (threadIdx.x == 0) { + mom.x = xc; + mom.y = problo[1] + dx[1]*(jlo + crse_ratio/2); + mom.z = problo[2] + dx[2]*(klo + crse_ratio/2); + mom.face = tag.face; + } + } else if (tag.face.coordDir() == 1) { + int const nbx = tag.b2d.length(0) / crse_ratio; + int const kb = iblock / nbx; + int const ib = iblock - kb*nbx; + int const j = tag.b2d.smallEnd(1); + int const ilo = tag.b2d.smallEnd(0) + ib*crse_ratio; + int const klo = tag.b2d.smallEnd(2) + kb*crse_ratio; + Real const fac = dx[0]*dx[2]; + Real const yc = tag.face.isLow() ? problo[1] : probhi[1]; + for (int icell = threadIdx.x; icell < crse_ratio*crse_ratio; icell += blockDim.x) { + int k = icell/crse_ratio; + int i = icell - k*crse_ratio; + Real const xx = (i-crse_ratio/2+Real(0.5))*dx[0]; + Real const zz = (k-crse_ratio/2+Real(0.5))*dx[2]; + i += ilo; + k += klo; + Real const charge = tag.gp(i,j,k) * fac; + Real zpow = Real(1.); + int m = 0; + for (int q = 0; q <= openbc::M; ++q) { + Real xpow = Real(1.); + for (int p = 0; p <= openbc::M-q; ++p) { + tmom[m++] += charge*xpow*zpow; + xpow *= xx; + } + zpow *= zz; + } + } + if (threadIdx.x == 0) { + mom.x = problo[0] + dx[0]*(ilo + crse_ratio/2); + mom.y = yc; + mom.z = problo[2] + dx[2]*(klo + crse_ratio/2); + mom.face = tag.face; + } + } else { + int const nbx = tag.b2d.length(0) / crse_ratio; + int const jb = iblock / nbx; + int const ib = iblock - jb*nbx; + int const k = tag.b2d.smallEnd(2); + int const ilo = tag.b2d.smallEnd(0) + ib*crse_ratio; + int const jlo = tag.b2d.smallEnd(1) + jb*crse_ratio; + Real const fac = dx[0]*dx[1]; + Real const zc = tag.face.isLow() ? 
problo[2] : probhi[2]; + for (int icell = threadIdx.x; icell < crse_ratio*crse_ratio; icell += blockDim.x) { + int j = icell/crse_ratio; + int i = icell - j*crse_ratio; + Real const xx = (i-crse_ratio/2+Real(0.5))*dx[0]; + Real const yy = (j-crse_ratio/2+Real(0.5))*dx[1]; + i += ilo; + j += jlo; + Real const charge = tag.gp(i,j,k) * fac; + Real ypow = Real(1.); + int m = 0; + for (int q=0; q <= openbc::M; ++q) { + Real xpow = Real(1.); + for (int p = 0; p <= openbc::M-q; ++p) { + tmom[m++] += charge*xpow*ypow; + xpow *= xx; + } + ypow *= yy; + } + } + if (threadIdx.x == 0) { + mom.x = problo[0] + dx[0]*(ilo + crse_ratio/2); + mom.y = problo[1] + dx[1]*(jlo + crse_ratio/2); + mom.z = zc; + mom.face = tag.face; + } + } + openbc::scale_moments(tmom); + + __syncthreads(); + + if (threadIdx.x < (openbc::M+1)*(openbc::M+2)/2) { + mom.mom[threadIdx.x] = Real(0.); + for (unsigned int i = 0; i < blockDim.x; ++i) { + mom.mom[threadIdx.x] += shared[i][threadIdx.x]; + } + } + }); +#endif + } +#else + for (auto const& tag : m_momtags_h) { + if (tag.face.coordDir() == 0) { + int nby = tag.b2d.length(1) / m_coarsen_ratio; + int nbz = tag.b2d.length(2) / m_coarsen_ratio; + int i = tag.b2d.smallEnd(0); + int jlo = tag.b2d.smallEnd(1); + int klo = tag.b2d.smallEnd(2); + Real fac = dx[1]*dx[2]; + Real xc = tag.face.isLow() ? problo[0] : probhi[0]; + for (int kb = 0; kb < nbz; ++kb) { + for (int jb = 0; jb < nby; ++jb) { + openbc::Moments& mom = moments[tag.offset+jb+kb*nby]; + for (auto& m : mom.mom) { + m = 0._rt; + } + for (int kk = 0; kk < m_coarsen_ratio; ++kk) { + for (int jj = 0; jj < m_coarsen_ratio; ++jj) { + Real charge = tag.gp(i, jlo+jb*m_coarsen_ratio+jj, + klo+kb*m_coarsen_ratio+kk) * fac; + Real yy = (jj-m_coarsen_ratio/2+0.5_rt)*dx[1]; + Real zz = (kk-m_coarsen_ratio/2+0.5_rt)*dx[2]; + Real zpow = 1._rt; + int m = 0; + for (int q = 0; q <= openbc::M; ++q) { + Real ypow = 1._rt; + for (int p = 0; p <= openbc::M-q; ++p) { + mom.mom[m++] += charge*ypow*zpow; + ypow *= yy; + } + zpow *= zz; + } + }} + openbc::scale_moments(mom.mom); + // center of the block + mom.x = xc; + mom.y = problo[1] + dx[1]*(tag.b2d.smallEnd(1) + + jb*m_coarsen_ratio + + m_coarsen_ratio/2); + mom.z = problo[2] + dx[2]*(tag.b2d.smallEnd(2) + + kb*m_coarsen_ratio + + m_coarsen_ratio/2); + mom.face = tag.face; + }} + } else if (tag.face.coordDir() == 1) { + int nbx = tag.b2d.length(0) / m_coarsen_ratio; + int nbz = tag.b2d.length(2) / m_coarsen_ratio; + int j = tag.b2d.smallEnd(1); + int ilo = tag.b2d.smallEnd(0); + int klo = tag.b2d.smallEnd(2); + Real fac = dx[0]*dx[2]; + Real yc = tag.face.isLow() ? 
problo[1] : probhi[1]; + for (int kb = 0; kb < nbz; ++kb) { + for (int ib = 0; ib < nbx; ++ib) { + openbc::Moments& mom = moments[tag.offset+ib+kb*nbx]; + for (auto& m : mom.mom) { + m = 0._rt; + } + for (int kk = 0; kk < m_coarsen_ratio; ++kk) { + for (int ii = 0; ii < m_coarsen_ratio; ++ii) { + Real charge = tag.gp(ilo+ib*m_coarsen_ratio+ii, j, + klo+kb*m_coarsen_ratio+kk) * fac; + Real xx = (ii-m_coarsen_ratio/2+0.5_rt)*dx[0]; + Real zz = (kk-m_coarsen_ratio/2+0.5_rt)*dx[2]; + Real zpow = 1._rt; + int m = 0; + for (int q = 0; q <= openbc::M; ++q) { + Real xpow = 1._rt; + for (int p = 0; p <= openbc::M-q; ++p) { + mom.mom[m++] += charge*xpow*zpow; + xpow *= xx; + } + zpow *= zz; + } + }} + openbc::scale_moments(mom.mom); + mom.x = problo[0] + dx[0]*(tag.b2d.smallEnd(0) + + ib*m_coarsen_ratio + + m_coarsen_ratio/2); + mom.y = yc; + mom.z = problo[2] + dx[2]*(tag.b2d.smallEnd(2) + + kb*m_coarsen_ratio + + m_coarsen_ratio/2); + mom.face = tag.face; + }} + } else { + int nbx = tag.b2d.length(0) / m_coarsen_ratio; + int nby = tag.b2d.length(1) / m_coarsen_ratio; + int k = tag.b2d.smallEnd(2); + int ilo = tag.b2d.smallEnd(0); + int jlo = tag.b2d.smallEnd(1); + Real fac = dx[0]*dx[1]; + Real zc = tag.face.isLow() ? problo[2] : probhi[2]; + for (int jb = 0; jb < nby; ++jb) { + for (int ib = 0; ib < nbx; ++ib) { + openbc::Moments& mom = moments[tag.offset+ib+jb*nbx]; + for (auto& m : mom.mom) { + m = 0._rt; + } + for (int jj = 0; jj < m_coarsen_ratio; ++jj) { + for (int ii = 0; ii < m_coarsen_ratio; ++ii) { + Real charge = tag.gp(ilo+ib*m_coarsen_ratio+ii, + jlo+jb*m_coarsen_ratio+jj, k) * fac; + Real xx = (ii-m_coarsen_ratio/2+0.5_rt)*dx[0]; + Real yy = (jj-m_coarsen_ratio/2+0.5_rt)*dx[1]; + Real ypow = 1._rt; + int m = 0; + for (int q = 0; q <= openbc::M; ++q) { + Real xpow = 1._rt; + for (int p = 0; p <= openbc::M-q; ++p) { + mom.mom[m++] += charge*xpow*ypow; + xpow *= xx; + } + ypow *= yy; + } + }} + openbc::scale_moments(mom.mom); + mom.x = problo[0] + dx[0]*(tag.b2d.smallEnd(0) + + ib*m_coarsen_ratio + + m_coarsen_ratio/2); + mom.y = problo[1] + dx[1]*(tag.b2d.smallEnd(1) + + jb*m_coarsen_ratio + + m_coarsen_ratio/2); + mom.z = zc; + mom.face = tag.face; + }} + } + } +#endif + +#ifdef AMREX_USE_MPI + bcast_moments(moments); +#endif + m_nblocks = moments.size(); +} + +#ifdef AMREX_USE_MPI +void OpenBCSolver::bcast_moments (Gpu::DeviceVector& moments) +{ + if (ParallelContext::NProcsSub() > 1) + { + MPI_Comm comm = ParallelContext::CommunicatorSub(); + if (m_nblocks == 0) { + int count = moments.size(); + count *= static_cast(sizeof(openbc::Moments)); + m_countvec.resize(ParallelContext::NProcsSub()); + MPI_Allgather(&count, 1, MPI_INT, m_countvec.data(), 1, MPI_INT, comm); + + m_offset.resize(m_countvec.size(), 0); + Long count_tot = m_countvec[0]; + for (int i = 1, N = m_offset.size(); i < N; ++i) { + m_offset[i] = m_offset[i-1] + m_countvec[i-1]; + count_tot += m_countvec[i]; + } + + if (count_tot > static_cast(std::numeric_limits::max())) { + amrex::Abort("OpenBC: integer overflow. 
Let us know and we will fix this."); + } + + m_nblocks = count_tot/sizeof(openbc::Moments); + } + + Gpu::DeviceVector moments_all(m_nblocks); + +#ifdef AMREX_USE_GPU + Gpu::PinnedVector h_moments(moments.size()); + Gpu::PinnedVector h_moments_all(moments_all.size()); + Gpu::copyAsync(Gpu::deviceToHost, moments.begin(), moments.end(), + h_moments.begin()); + Gpu::streamSynchronize(); +#else + auto const& h_moments = moments; + auto& h_moments_all = moments_all; +#endif + + int count = m_nblocks_local*static_cast(sizeof(openbc::Moments)); + MPI_Allgatherv(h_moments.data(), count, MPI_CHAR, h_moments_all.data(), + m_countvec.data(), m_offset.data(), MPI_CHAR, comm); + +#ifdef AMREX_USE_GPU + Gpu::copyAsync(Gpu::hostToDevice, h_moments_all.begin(), h_moments_all.end(), + moments_all.begin()); + Gpu::streamSynchronize(); +#endif + + std::swap(moments, moments_all); + } +} +#endif + +void OpenBCSolver::compute_potential (Gpu::DeviceVector const& moments) +{ + BL_PROFILE("OpenBCSolver::comp_phi()"); + + auto const problo = m_geom[0].ProbLoArray(); + auto const dx = m_geom[0].CellSizeArray(); + + int crse_ratio = m_coarsen_ratio; + int nblocks = m_nblocks; + openbc::Moments const* pmom = moments.data(); + for (MFIter mfi(m_crse_grown_faces_phi); mfi.isValid(); ++mfi) { + Box const& b = mfi.validbox(); + Array4 const& phi_arr = m_crse_grown_faces_phi.array(mfi); +#if defined(AMREX_USE_GPU) + const auto lo = amrex::lbound(b); + const auto len = amrex::length(b); + const auto lenxy = len.x*len.y; + const auto lenx = len.x; +#ifdef AMREX_USE_DPCPP + amrex::ignore_unused(problo,dx,crse_ratio,nblocks,pmom,b,phi_arr,lo, + lenxy,lenx); + amrex::Abort("xxxxx DPCPP todo: openbc compute_potential"); +#else + amrex::launch(b.numPts(), AMREX_GPU_MAX_THREADS, Gpu::gpuStream(), + [=] AMREX_GPU_DEVICE () noexcept + { + int icell = blockIdx.x; + int k = icell / lenxy; + int j = (icell - k*lenxy) / lenx; + int i = (icell - k*lenxy) - j*lenx; + i += lo.x; + j += lo.y; + k += lo.z; + Real xb = problo[0] + i*crse_ratio*dx[0]; + Real yb = problo[1] + j*crse_ratio*dx[1]; + Real zb = problo[2] + k*crse_ratio*dx[2]; + Real phi = Real(0.); + for (int iblock = threadIdx.x; iblock < nblocks; iblock += blockDim.x) { + phi += openbc::block_potential(pmom[iblock], xb, yb, zb); + } + Real phitot = Gpu::blockReduceSum(phi); + if (threadIdx.x == 0) { + phi_arr(i,j,k) = phitot; + } + }); +#endif +#else + amrex::LoopOnCpu(b, [&] (int i, int j, int k) noexcept + { + Real xb = problo[0] + i*crse_ratio*dx[0]; + Real yb = problo[1] + j*crse_ratio*dx[1]; + Real zb = problo[2] + k*crse_ratio*dx[2]; + Real phi = 0._rt; + for (int iblock = 0; iblock < nblocks; ++iblock) { + phi += openbc::block_potential(pmom[iblock], xb, yb, zb); + } + phi_arr(i,j,k) = phi; + }); +#endif + } + + m_phind.ParallelCopy(m_crse_grown_faces_phi, 0, 0, 1, IntVect(0), + m_phind.nGrowVect()); +} + +void OpenBCSolver::interpolate_potential (MultiFab& solg) +{ + BL_PROFILE("OpenBCSolver::interp_phi"); + + Box const domain1 = amrex::grow(m_geom[0].Domain(), m_ngrowdomain); + int crse_ratio = m_coarsen_ratio; + + for (MFIter mfi(solg); mfi.isValid(); ++mfi) { + Box const& vbx = mfi.validbox(); + for (OrientationIter oit; oit.isValid(); ++oit) { + Orientation face = oit(); + if (vbx[face] == domain1[face]) { + Array4 const& solg_arr = solg.array(mfi); + Array4 const& phi_arr = m_phind.const_array(mfi); + Box const& b2d = amrex::bdryNode(vbx, face); + int offset = face.isLow() ? 
-1 : 0; + if (face.coordDir() == 0) { + Box b = amrex::coarsen(b2d,IntVect(crse_ratio,crse_ratio,1)); + b.grow(1,openbc::P).surroundingNodes(1); + FArrayBox tmpfab(b,1,The_Async_Arena()); + Array4 const& tmp = tmpfab.array(); + Array4 const& ctmp = tmpfab.const_array(); + amrex::ParallelFor(b, + [=] AMREX_GPU_DEVICE (int ic, int jc, int k) noexcept + { + tmp(ic,jc,k) = openbc::interpccz(ic,jc,k,phi_arr,crse_ratio); + }); + b = amrex::coarsen(b2d,IntVect(crse_ratio,1,1)); + amrex::ParallelFor(b, + [=] AMREX_GPU_DEVICE (int ic, int j, int k) noexcept + { + int i = ic*crse_ratio+offset; + solg_arr(i,j,k) = openbc::interpccy(ic,j,k,ctmp,crse_ratio); + }); + } else if (face.coordDir() == 1) { + Box b = amrex::coarsen(b2d,IntVect(crse_ratio,crse_ratio,1)); + b.grow(0,openbc::P).surroundingNodes(0); + FArrayBox tmpfab(b,1,The_Async_Arena()); + Array4 const& tmp = tmpfab.array(); + Array4 const& ctmp = tmpfab.const_array(); + amrex::ParallelFor(b, + [=] AMREX_GPU_DEVICE (int ic, int jc, int k) noexcept + { + tmp(ic,jc,k) = openbc::interpccz(ic,jc,k,phi_arr,crse_ratio); + }); + b = amrex::coarsen(b2d,IntVect(1,crse_ratio,1)); + amrex::ParallelFor(b, + [=] AMREX_GPU_DEVICE (int i, int jc, int k) noexcept + { + int j = jc*crse_ratio+offset; + solg_arr(i,j,k) = openbc::interpccx(i,jc,k,ctmp,crse_ratio); + }); + } else { + Box b = amrex::coarsen(b2d,IntVect(crse_ratio,1,crse_ratio)); + b.grow(0,openbc::P).surroundingNodes(0); + FArrayBox tmpfab(b,1,The_Async_Arena()); + Array4 const& tmp = tmpfab.array(); + Array4 const& ctmp = tmpfab.const_array(); + amrex::ParallelFor(b, + [=] AMREX_GPU_DEVICE (int ic, int j, int kc) noexcept + { + tmp(ic,j,kc) = openbc::interpccy(ic,j,kc,phi_arr,crse_ratio); + }); + b = amrex::coarsen(b2d,IntVect(1,1,crse_ratio)); + amrex::ParallelFor(b, + [=] AMREX_GPU_DEVICE (int i, int j, int kc) noexcept + { + int k = kc*crse_ratio+offset; + solg_arr(i,j,k) = openbc::interpccx(i,j,kc,ctmp,crse_ratio); + }); + } + } + } + } +} + +namespace openbc { +std::ostream& operator<< (std::ostream& os, Moments const& mom) +{ + os << "Face " << mom.face << ", x = " << mom.x << ", y = " << mom.y + << ", z = " << mom.z << "\n" + << " " << mom.mom[0] << "\n" + << " " << mom.mom[1] << ", " << mom.mom[8] << "\n" + << " " << mom.mom[2] << ", " << mom.mom[9] << ", " << mom.mom[15] << "\n" + << " " << mom.mom[3] << ", " << mom.mom[10] << ", " << mom.mom[16] + << ", " << mom.mom[21] << "\n" + << " " << mom.mom[4] << ", " << mom.mom[11] << ", " << mom.mom[17] + << ", " << mom.mom[22] << ", " << mom.mom[26] << "\n" + << " " << mom.mom[5] << ", " << mom.mom[12] << ", " << mom.mom[18] + << ", " << mom.mom[23] << ", " << mom.mom[27] << ", " << mom.mom[30] << "\n" + << " " << mom.mom[6] << ", " << mom.mom[13] << ", " << mom.mom[19] + << ", " << mom.mom[24] << ", " << mom.mom[28] << ", " << mom.mom[31] + << ", " << mom.mom[33] << "\n" + << " " << mom.mom[7] << ", " << mom.mom[14] << ", " << mom.mom[20] + << ", " << mom.mom[25] << ", " << mom.mom[29] << ", " << mom.mom[32] + << ", " << mom.mom[34] << ", " << mom.mom[35] << "\n"; + return os; +} +} + +} diff --git a/Src/LinearSolvers/OpenBC/AMReX_OpenBC_K.H b/Src/LinearSolvers/OpenBC/AMReX_OpenBC_K.H new file mode 100644 index 00000000000..7a6b2643b68 --- /dev/null +++ b/Src/LinearSolvers/OpenBC/AMReX_OpenBC_K.H @@ -0,0 +1,166 @@ +#ifndef AMREX_OPENBC_K_H_ +#define AMREX_OPENBC_K_H_ + +#include +#include + +namespace amrex { namespace openbc { + +AMREX_GPU_DEVICE AMREX_FORCE_INLINE +void scale_moments (openbc::Moments::array_type& mom) +{ // p!*q! 
in the order of 0!*0!, 1!*0!, ..., 7!*0!, 0!*1!, 1!*1!, 2!*1!, ..., 6!*1!, 0!*2!, ..., 0!*7!. + mom[ 2] *= Real(0.5); + mom[ 3] *= Real(1./6.); + mom[ 4] *= Real(1./24.); + mom[ 5] *= Real(1./120.); + mom[ 6] *= Real(1./720.); + mom[ 7] *= Real(1./5040.); + mom[10] *= Real(0.5); + mom[11] *= Real(1./6.); + mom[12] *= Real(1./24.); + mom[13] *= Real(1./120.); + mom[14] *= Real(1./720.); + mom[15] *= Real(0.5); + mom[16] *= Real(0.5); + mom[17] *= Real(0.25); + mom[18] *= Real(1./12.); + mom[19] *= Real(1./48.); + mom[20] *= Real(1./240.); + mom[21] *= Real(1./6.); + mom[22] *= Real(1./6.); + mom[23] *= Real(1./12.); + mom[24] *= Real(1./36.); + mom[25] *= Real(1./144.); + mom[26] *= Real(1./24.); + mom[27] *= Real(1./24.); + mom[28] *= Real(1./48.); + mom[29] *= Real(1./144.); + mom[30] *= Real(1./120.); + mom[31] *= Real(1./120.); + mom[32] *= Real(1./240.); + mom[33] *= Real(1./720.); + mom[34] *= Real(1./720.); + mom[35] *= Real(1./5040.); +} + +AMREX_GPU_DEVICE AMREX_FORCE_INLINE +Real block_potential (openbc::Moments const& mom, Real xb, Real yb, Real zb) +{ + constexpr Real oneover4pi = Real(1.)/Real(4.*3.1415926535897932); + + xb -= mom.x; + yb -= mom.y; + zb -= mom.z; + Real ri = Real(1.)/std::sqrt(xb*xb+yb*yb+zb*zb); + Real ri2 = ri*ri; + Real ri3 = ri2*ri; + Real ri4 = ri3*ri; + Real xr, yr; + if (mom.face.coordDir() == 0) { + xr = yb*ri; + yr = zb*ri; + } else if (mom.face.coordDir() == 1) { + xr = xb*ri; + yr = zb*ri; + } else { + xr = xb*ri; + yr = yb*ri; + } + Real xr2 = xr *xr; + Real xr4 = xr2*xr2; + Real xr6 = xr4*xr2; + Real yr2 = yr *yr; + Real yr4 = yr2*yr2; + Real yr6 = yr4*yr2; + Real phi = ri * mom.mom[0] + + ri2*(xr*mom.mom[1] + yr*mom.mom[8]) + + ri3*((Real(3.) * xr2 - Real(1.)) * mom.mom[2] + + (Real(3.) * xr * yr ) * mom.mom[9] + + (Real(3.) * yr2 - Real(1.)) * mom.mom[15]) + + ri4 * (xr * (Real(15.) * xr2 - Real(9.)) * mom.mom[3] + + yr * (Real(15.) * xr2 - Real(3.)) * mom.mom[10] + + xr * (Real(15.) * yr2 - Real(3.)) * mom.mom[16] + + yr * (Real(15.) * yr2 - Real(9.)) * mom.mom[21]) + + ri4*ri * ((Real(105.) * xr4 - Real(90.) * xr2 + Real(9.)) * mom.mom[4] + + (xr * yr * (Real(105.) * xr2 - Real(45.))) * mom.mom[11] + + (Real(105.) * xr2 * yr2 - Real(15.) * xr2 - Real(15.) * yr2 + Real(3.)) * mom.mom[17] + + (xr * yr * (Real(105.) * yr2 - Real(45.))) * mom.mom[22] + + (Real(105.) * yr4 - Real(90.) * yr2 + Real(9.)) * mom.mom[26]) + + ri4*ri2 * (xr * (Real(945.)*xr4 - Real(1050.)*xr2 + Real(225.)) * mom.mom[5] + + yr * (Real(945.)*xr4 - Real(630.)*xr2 + Real(45.)) * mom.mom[12] + + xr * (Real(945.)*xr2*yr2 - Real(105.)*xr2 - Real(315.)*yr2 + Real(45.)) * mom.mom[18] + + yr * (Real(945.)*xr2*yr2 - Real(315.)*xr2 - Real(105.)*yr2 + Real(45.)) * mom.mom[23] + + xr * (Real(945.)*yr4 - Real(630.)*yr2 + Real(45.)) * mom.mom[27] + + yr * (Real(945.)*yr4 - Real(1050.)*yr2 + Real(225.)) * mom.mom[30]) + + ri4*ri3 * (Real(45.) * (Real(231.)*xr6 - Real(315.)*xr4 + Real(105.)*xr2 - Real(5.)) * mom.mom[6] + + Real(315.)*xr*yr * (Real(33.)*xr4 - Real(30.)*xr2 + Real(5.)) * mom.mom[13] + + Real(45.) * (Real(231.)*xr4*yr2 - Real(21.)*xr4 - Real(126.)*xr2*yr2 + Real(14.)*xr2 + Real(7.)*yr2 - Real(1.)) * mom.mom[19] + + Real(945.)*xr*yr * (Real(11.)*xr2*yr2 - Real(3.)*xr2 - Real(3.)*yr2 + Real(1.)) * mom.mom[24] + + Real(45.) * (Real(231.)*xr2*yr4 - Real(126.)*xr2*yr2 + Real(7.)*xr2 - Real(21.)*yr4 + Real(14.)*yr2 - Real(1.)) * mom.mom[28] + + Real(315.)*xr*yr * (Real(33.)*yr4 - Real(30.)*yr2 + Real(5.)) * mom.mom[31] + + Real(45.) 
* (Real(231.)*yr6 - Real(315.)*yr4 + Real(105.)*yr2 - Real(5.)) * mom.mom[33]) + + ri4*ri4*(Real(315.)*xr*(Real(429.)*xr6 - Real(693.)*xr4 + Real(315.)*xr2 - Real(35.)) * mom.mom[7] + + Real(315.)*yr*(Real(429.)*xr6 - Real(495.)*xr4 + Real(135.)*xr2 - Real(5.)) * mom.mom[14] + + Real(315.)*xr*(Real(429.)*xr4*yr2 - Real(33.)*xr4 - Real(330.)*xr2*yr2 + Real(30.)*xr2 + Real(45.)*yr2 - Real(5.)) * mom.mom[20] + + Real(945.)*yr*(Real(143.)*xr4*yr2 - Real(33.)*xr4 - Real(66.)*xr2*yr2 + Real(18.)*xr2 + Real(3.)*yr2 - Real(1.)) * mom.mom[25] + + Real(945.)*xr*(Real(143.)*xr2*yr4 - Real(66.)*xr2*yr2 + Real(3.)*xr2 - Real(33.)*yr4 + Real(18.)*yr2 - Real(1.)) * mom.mom[29] + + Real(315.)*yr*(Real(429.)*xr2*yr4 - Real(330.)*xr2*yr2 + Real(45.)*xr2 - Real(33.)*yr4 + Real(30.)*yr2 - Real(5.)) * mom.mom[32] + + Real(315.)*xr*(Real(429.)*yr6 - Real(495.)*yr4 + Real(135.)*yr2 - Real(5.)) * mom.mom[34] + + Real(315.)*yr*(Real(429.)*yr6 - Real(693.)*yr4 + Real(315.)*yr2 - Real(35.)) * mom.mom[35]); + return phi*(-oneover4pi); +} + +AMREX_GPU_DEVICE AMREX_FORCE_INLINE +void interp_coef (int i, int ii, Real* AMREX_RESTRICT c, int crse_ratio) +{ + static_assert(openbc::P == 3, "openbc::P is assumed to be 3 here"); + Real xint = (ii-i*crse_ratio + Real(0.5))/static_cast(crse_ratio); + constexpr Real x[] = {-3._rt, -2._rt, -1._rt, 0._rt, 1._rt, 2._rt, 3._rt, 4._rt}; + poly_interp_coeff<8>(xint, x, c); +} + +AMREX_GPU_DEVICE AMREX_FORCE_INLINE +Real interpccx (int ii, int j, int k, Array4 const& phi, int crse_ratio) +{ + int i = amrex::coarsen(ii,crse_ratio); + Real c[8]; + interp_coef(i,ii,c,crse_ratio); + + Real p = Real(0.); + for (int n = 0; n < 8; ++n) { + p += c[n] * phi(i-3+n,j,k); + } + return p; +} + +AMREX_GPU_DEVICE AMREX_FORCE_INLINE +Real interpccy (int i, int jj, int k, Array4 const& phi, int crse_ratio) +{ + int j = amrex::coarsen(jj,crse_ratio); + Real c[8]; + interp_coef(j,jj,c,crse_ratio); + + Real p = Real(0.); + for (int n = 0; n < 8; ++n) { + p += c[n] * phi(i,j-3+n,k); + } + return p; +} + +AMREX_GPU_DEVICE AMREX_FORCE_INLINE +Real interpccz (int i, int j, int kk, Array4 const& phi, int crse_ratio) +{ + int k = amrex::coarsen(kk,crse_ratio); + Real c[8]; + interp_coef(k,kk,c,crse_ratio); + + Real p = Real(0.); + for (int n = 0; n < 8; ++n) { + p += c[n] * phi(i,j,k-3+n); + } + return p; +} + +}} + +#endif diff --git a/Src/LinearSolvers/OpenBC/Make.package b/Src/LinearSolvers/OpenBC/Make.package new file mode 100644 index 00000000000..5fc39f69371 --- /dev/null +++ b/Src/LinearSolvers/OpenBC/Make.package @@ -0,0 +1,6 @@ + +CEXE_headers += AMReX_OpenBC.H AMReX_OpenBC_K.H +CEXE_sources += AMReX_OpenBC.cpp + +VPATH_LOCATIONS += $(AMREX_HOME)/Src/LinearSolvers/OpenBC +INCLUDE_LOCATIONS += $(AMREX_HOME)/Src/LinearSolvers/OpenBC diff --git a/Src/Particle/AMReX_DenseBins.H b/Src/Particle/AMReX_DenseBins.H index 93c9415ad25..0f1e94bb176 100644 --- a/Src/Particle/AMReX_DenseBins.H +++ b/Src/Particle/AMReX_DenseBins.H @@ -200,6 +200,7 @@ public: m_bins.resize(nitems); m_perm.resize(nitems); + m_local_offsets.resize(nitems); m_counts.resize(0); m_counts.resize(nbins+1, 0); @@ -209,21 +210,21 @@ public: index_type* pbins = m_bins.dataPtr(); index_type* pcount = m_counts.dataPtr(); + index_type* plocal_offsets = m_local_offsets.dataPtr(); amrex::ParallelFor(nitems, [=] AMREX_GPU_DEVICE (int i) noexcept { pbins[i] = f(v[i]); - Gpu::Atomic::AddNoRet(&pcount[pbins[i]], index_type{ 1 }); + index_type off = Gpu::Atomic::Add(&pcount[pbins[i]], index_type{ 1 }); + plocal_offsets[i] = off; }); 
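+        // First pass (above): each item atomically increments its bin's count
+        // and keeps the pre-increment value returned by Gpu::Atomic::Add as
+        // its offset within that bin. The exclusive scan below turns the
+        // counts into global bin offsets, so each item's permutation slot is
+        // simply poffsets[pbins[i]] + plocal_offsets[i]. This removes the old
+        // second atomic pass (Gpu::Atomic::Inc) and the copy of m_offsets
+        // back into m_counts.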
Gpu::exclusive_scan(m_counts.begin(), m_counts.end(), m_offsets.begin()); - Gpu::copyAsync(Gpu::deviceToDevice, m_offsets.begin(), m_offsets.end(), m_counts.begin()); - index_type* pperm = m_perm.dataPtr(); - constexpr index_type max_index = std::numeric_limits::max(); + index_type* poffsets = m_offsets.dataPtr(); amrex::ParallelFor(nitems, [=] AMREX_GPU_DEVICE (int i) noexcept { - index_type index = Gpu::Atomic::Inc(&pcount[pbins[i]], max_index); + index_type index = poffsets[pbins[i]] + plocal_offsets[i]; pperm[index] = i; }); @@ -503,6 +504,7 @@ private: Gpu::DeviceVector m_bins; Gpu::DeviceVector m_counts; + Gpu::DeviceVector m_local_offsets; Gpu::DeviceVector m_offsets; Gpu::DeviceVector m_perm; }; diff --git a/Src/Particle/AMReX_NeighborParticles.H b/Src/Particle/AMReX_NeighborParticles.H index 36d2c5351d7..344d39f778e 100644 --- a/Src/Particle/AMReX_NeighborParticles.H +++ b/Src/Particle/AMReX_NeighborParticles.H @@ -348,6 +348,11 @@ protected: /// void BuildMasks (); + /// + /// Are the masks computed by the above function still valid? + /// + bool areMasksValid (); + void GetNeighborCommTags (); void GetCommTagsBox (Vector& tags, const int lev, const Box& in_box); diff --git a/Src/Particle/AMReX_NeighborParticlesCPUImpl.H b/Src/Particle/AMReX_NeighborParticlesCPUImpl.H index d5fb9fc40ee..4d5ecb4fcc8 100644 --- a/Src/Particle/AMReX_NeighborParticlesCPUImpl.H +++ b/Src/Particle/AMReX_NeighborParticlesCPUImpl.H @@ -7,8 +7,10 @@ void NeighborParticleContainer ::fillNeighborsCPU () { BL_PROFILE("NeighborParticleContainer::fillNeighborsCPU"); - BuildMasks(); - GetNeighborCommTags(); + if (!areMasksValid()) { + BuildMasks(); + GetNeighborCommTags(); + } cacheNeighborInfo(); updateNeighborsCPU(false); } diff --git a/Src/Particle/AMReX_NeighborParticlesGPUImpl.H b/Src/Particle/AMReX_NeighborParticlesGPUImpl.H index 6e112318757..81bef1302e2 100644 --- a/Src/Particle/AMReX_NeighborParticlesGPUImpl.H +++ b/Src/Particle/AMReX_NeighborParticlesGPUImpl.H @@ -121,7 +121,7 @@ buildNeighborCopyOp (bool use_boundary_neighbor) { BL_PROFILE("NeighborParticleContainer::buildNeighborCopyOp()"); - AMREX_ASSERT(hasNeighbors() == false); + AMREX_ASSERT(!hasNeighbors() || use_boundary_neighbor); const int lev = 0; const auto& geom = this->Geom(lev); diff --git a/Src/Particle/AMReX_NeighborParticlesI.H b/Src/Particle/AMReX_NeighborParticlesI.H index a07cfab92c4..202f41f87f3 100644 --- a/Src/Particle/AMReX_NeighborParticlesI.H +++ b/Src/Particle/AMReX_NeighborParticlesI.H @@ -119,6 +119,30 @@ NeighborParticleContainer this->Redistribute(); } +template +bool +NeighborParticleContainer +::areMasksValid () { + + BL_PROFILE("NeighborParticleContainer::areMasksValid"); + + resizeContainers(this->numLevels()); + + for (int lev = 0; lev < this->numLevels(); ++lev) + { + BoxArray ba = this->ParticleBoxArray(lev); + const DistributionMapping& dmap = this->ParticleDistributionMap(lev); + + if (mask_ptr[lev] == nullptr || + ! BoxArray::SameRefs(mask_ptr[lev]->boxArray(), ba) || + ! DistributionMapping::SameRefs(mask_ptr[lev]->DistributionMap(), dmap)) + { + return false; + } + } + return true; +} + template void NeighborParticleContainer @@ -136,30 +160,25 @@ NeighborParticleContainer BoxArray ba = this->ParticleBoxArray(lev); const DistributionMapping& dmap = this->ParticleDistributionMap(lev); - if (mask_ptr[lev] == nullptr || - ! BoxArray::SameRefs(mask_ptr[lev]->boxArray(), ba) || - ! 
DistributionMapping::SameRefs(mask_ptr[lev]->DistributionMap(), dmap)) - { - const Geometry& geom = this->Geom(lev); + const Geometry& geom = this->Geom(lev); - mask_ptr[lev] = std::make_unique(ba, dmap, int(num_mask_comps), m_num_neighbor_cells); - mask_ptr[lev]->setVal(-1, m_num_neighbor_cells); + mask_ptr[lev] = std::make_unique(ba, dmap, int(num_mask_comps), m_num_neighbor_cells); + mask_ptr[lev]->setVal(-1, m_num_neighbor_cells); #ifdef AMREX_USE_OMP #pragma omp parallel #endif - for (MFIter mfi(*mask_ptr[lev],this->do_tiling ? this->tile_size : IntVect::TheZeroVector()); - mfi.isValid(); ++mfi) { - const Box& box = mfi.tilebox(); - const int grid_id = mfi.index(); - const int tile_id = mfi.LocalTileIndex(); - (*mask_ptr[lev])[mfi].template setVal(grid_id, box, MaskComps::grid, 1); - (*mask_ptr[lev])[mfi].template setVal(tile_id, box, MaskComps::tile, 1); - (*mask_ptr[lev])[mfi].template setVal(lev , box, MaskComps::level, 1); - } - - mask_ptr[lev]->FillBoundary(geom.periodicity()); + for (MFIter mfi(*mask_ptr[lev],this->do_tiling ? this->tile_size : IntVect::TheZeroVector()); + mfi.isValid(); ++mfi) { + const Box& box = mfi.tilebox(); + const int grid_id = mfi.index(); + const int tile_id = mfi.LocalTileIndex(); + (*mask_ptr[lev])[mfi].template setVal(grid_id, box, MaskComps::grid, 1); + (*mask_ptr[lev])[mfi].template setVal(tile_id, box, MaskComps::tile, 1); + (*mask_ptr[lev])[mfi].template setVal(lev , box, MaskComps::level, 1); } + + mask_ptr[lev]->FillBoundary(geom.periodicity()); } } @@ -794,9 +813,21 @@ NeighborParticleContainer:: selectActualNeighbors (CheckPair&& check_pair, int num_cells) { BL_PROFILE("NeighborParticleContainer::selectActualNeighbors"); + const auto& geom_fine = this->Geom(0); + const auto& ba_fine = this->ParticleBoxArray(0); + if (ba_fine.size() == 1 && !geom_fine.isAnyPeriodic()) { + return; + } for (int lev = 0; lev < this->numLevels(); ++lev) { + // clear previous neighbor particle ids + if (!m_boundary_particle_ids.empty()) { + for (auto& keyval: m_boundary_particle_ids[lev]) { + keyval.second.clear(); + } + } + for (MyParIter pti(*this, lev); pti.isValid(); ++pti) { PairIndex index(pti.index(), pti.LocalTileIndex()); @@ -838,8 +869,8 @@ selectActualNeighbors (CheckPair&& check_pair, int num_cells) auto pperm = bins.permutationPtr(); auto poffset = bins.offsetsPtr(); - unsigned int np_boundary = 0; - unsigned int* p_np_boundary = &np_boundary; + Gpu::Buffer np_boundary({0}); + unsigned int* p_np_boundary = np_boundary.data(); constexpr unsigned int max_unsigned_int = std::numeric_limits::max(); AMREX_FOR_1D ( np_real, i, @@ -880,9 +911,9 @@ selectActualNeighbors (CheckPair&& check_pair, int num_cells) } } });// end amrex_for_1d - Gpu::streamSynchronize(); - m_boundary_particle_ids[lev][index].resize(np_boundary); + unsigned int* p_np_boundary_h = np_boundary.copyToHost(); + m_boundary_particle_ids[lev][index].resize(*p_np_boundary_h); }// end mypariter }// end lev diff --git a/Src/Particle/AMReX_ParticleContainer.H b/Src/Particle/AMReX_ParticleContainer.H index c164e7214d3..d604a36c896 100644 --- a/Src/Particle/AMReX_ParticleContainer.H +++ b/Src/Particle/AMReX_ParticleContainer.H @@ -531,9 +531,9 @@ public: */ bool OK (int lev_min = 0, int lev_max = -1, int nGrow = 0) const; - void ByteSpread () const; + std::array ByteSpread () const; - void PrintCapacity () const; + std::array PrintCapacity () const; void ShrinkToFit (); diff --git a/Src/Particle/AMReX_ParticleContainerI.H b/Src/Particle/AMReX_ParticleContainerI.H index c7ee56c3f68..f257ff17ddd 
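
In selectActualNeighbors above, the boundary-particle counter moves off the host stack (which device code cannot legally write) into device memory owned by a Gpu::Buffer (presumably Gpu::Buffer<unsigned int>, matching the unsigned int* it hands out), and the host reads the count back with copyToHost() once the kernel has finished. A host-only sketch of the pattern, with std::atomic standing in for device memory and atomic intrinsics:

```cpp
#include <atomic>
#include <cstdio>
#include <vector>

int main ()
{
    // 1 marks a particle that has an actual neighbor on another grid.
    std::vector<int> is_boundary = {0, 1, 0, 1, 1};

    // Stand-in for the Gpu::Buffer: storage the parallel workers may
    // legally update. Each worker reserves an output slot with an atomic
    // add, like Gpu::Atomic in the AMREX_FOR_1D kernel above.
    std::atomic<unsigned int> np_boundary{0};
    std::vector<unsigned int> ids(is_boundary.size());

    for (std::size_t i = 0; i < is_boundary.size(); ++i) {
        if (is_boundary[i]) {
            unsigned int slot = np_boundary.fetch_add(1);
            ids[slot] = static_cast<unsigned int>(i);
        }
    }

    // Only after the parallel region is the count valid on the host
    // (copyToHost in the real code, replacing the bare streamSynchronize).
    std::printf("found %u boundary particles\n", np_boundary.load());
}
```
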
100644 --- a/Src/Particle/AMReX_ParticleContainerI.H +++ b/Src/Particle/AMReX_ParticleContainerI.H @@ -241,10 +241,11 @@ ParticleContainer_impl const auto& geom = Geom(0); const auto plo = geom.ProbLoArray(); const auto phi = geom.ProbHiArray(); - const auto rhi = geom.RoundoffHiArray(); + const auto rlo = geom.ProbLoArrayInParticleReal(); + const auto rhi = geom.ProbHiArrayInParticleReal(); const auto is_per = geom.isPeriodicArray(); - return enforcePeriodic(p, plo, phi, rhi, is_per); + return enforcePeriodic(p, plo, phi, rlo, rhi, is_per); } template ::locatePa if (! outside) { - if (Geom(0).outsideRoundoffDomain(AMREX_D_DECL(Real(p.pos(0)), Real(p.pos(1)), Real(p.pos(2))))) + if (Geom(0).outsideRoundoffDomain(AMREX_D_DECL(p.pos(0), p.pos(1), p.pos(2)))) { - RealBox roundoff_domain = Geom(0).RoundoffDomain(); + GpuArray rhi = Geom(0).ProbHiArrayInParticleReal(); + GpuArray rlo = Geom(0).ProbLoArrayInParticleReal(); for (int idim=0; idim < AMREX_SPACEDIM; ++idim) { - if (p.pos(idim) <= roundoff_domain.lo(idim)) { - p.pos(idim) = std::nextafter((ParticleReal) roundoff_domain.lo(idim), (ParticleReal) roundoff_domain.hi(idim)); + if (p.pos(idim) <= rlo[idim]) { + p.pos(idim) = std::nextafter(rlo[idim], rhi[idim]); } - if (p.pos(idim) >= roundoff_domain.hi(idim)) { - p.pos(idim) = std::nextafter((ParticleReal) roundoff_domain.hi(idim), (ParticleReal) roundoff_domain.lo(idim)); + if (p.pos(idim) >= rhi[idim]) { + p.pos(idim) = std::nextafter(rhi[idim], rlo[idim]); } } - AMREX_ASSERT(! Geom(0).outsideRoundoffDomain(AMREX_D_DECL(Real(p.pos(0)), Real(p.pos(1)), Real(p.pos(2))))); + AMREX_ASSERT(! Geom(0).outsideRoundoffDomain(AMREX_D_DECL(p.pos(0), p.pos(1), p.pos(2)))); } } @@ -517,8 +519,9 @@ Long ParticleContainer_impl::Num template class Allocator> -void -ParticleContainer_impl::ByteSpread () const +std::array +ParticleContainer_impl +::ByteSpread () const { Long cnt = 0; @@ -533,7 +536,7 @@ ParticleContainer_impl::ByteSpre Long mn = cnt, mx = mn; const int IOProc = ParallelContext::IOProcessorNumberSub(); - const std::size_t sz = sizeof(ParticleType)+NumRealComps()*sizeof(Real)+NumIntComps()*sizeof(int); + const Long sz = sizeof(ParticleType)+NumRealComps()*sizeof(ParticleReal)+NumIntComps()*sizeof(int); #ifdef AMREX_LAZY Lazy::QueueReduction( [=] () mutable { @@ -542,22 +545,27 @@ ParticleContainer_impl::ByteSpre ParallelReduce::Max(mx, IOProc, ParallelContext::CommunicatorSub()); ParallelReduce::Sum(cnt, IOProc, ParallelContext::CommunicatorSub()); - amrex::Print() << "ParticleContainer byte spread across MPI nodes: [" + amrex::Print() << "ParticleContainer spread across MPI nodes - bytes (num particles): [Min: " << mn*sz << " (" << mn << ")" - << " ... " + << ", Max: " << mx*sz << " (" << mx << ")" - << "] total particles: (" << cnt << ")\n"; + << ", Total: " + << cnt*sz + << " (" << cnt << ")]\n"; #ifdef AMREX_LAZY }); #endif + + return {mn*sz, mx*sz, cnt*sz}; } template class Allocator> -void -ParticleContainer_impl::PrintCapacity () const +std::array +ParticleContainer_impl +::PrintCapacity () const { Long cnt = 0; @@ -580,16 +588,18 @@ ParticleContainer_impl::PrintCap ParallelReduce::Max(mx, IOProc, ParallelContext::CommunicatorSub()); ParallelReduce::Sum(cnt, IOProc, ParallelContext::CommunicatorSub()); - amrex::Print() << "ParticleContainer byte spread across MPI nodes: [" + amrex::Print() << "ParticleContainer spread across MPI nodes - bytes: [Min: " << mn - << " (" << mn << ")" - << " ... 
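
The locateParticle hunk above also stops round-tripping positions through Real: the roundoff bounds are fetched directly in particle precision (the GpuArray declarations are presumably GpuArray<ParticleReal, AMREX_SPACEDIM>), and a position on or beyond a domain edge is nudged to the nearest representable value strictly inside the domain. A compilable sketch of that std::nextafter clamp, using float as a stand-in for a single-precision ParticleReal:

```cpp
#include <cassert>
#include <cmath>

// Move a position that landed on or past a domain edge to the closest
// floating-point value strictly inside [lo, hi), in the particle's own
// precision so the test and the clamp cannot disagree.
float clamp_inside (float pos, float lo, float hi)
{
    if (pos <= lo) { pos = std::nextafter(lo, hi); } // smallest value > lo
    if (pos >= hi) { pos = std::nextafter(hi, lo); } // largest value < hi
    return pos;
}

int main ()
{
    float lo = 0.0f, hi = 1.0f;
    float p = clamp_inside(1.0f, lo, hi);
    assert(p > lo && p < hi);
    (void) p;
}
```
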
" + << ", Max: " << mx - << " (" << mx << ")" - << "] total memory: (" << cnt << ")\n"; + << ", Total: " + << cnt + << "]\n"; #ifdef AMREX_LAZY }); #endif + + return {mn, mx, cnt}; } template ::Redistribute (int lev_min, int lev_max, int nGrow, int local, bool remove_negative) { + BL_PROFILE_SYNC_START_TIMED("SyncBeforeComms: Redist"); + #ifdef AMREX_USE_GPU if ( Gpu::inLaunchRegion() ) { @@ -1122,6 +1134,8 @@ ParticleContainer_impl #else RedistributeCPU(lev_min, lev_max, nGrow, local, remove_negative); #endif + + BL_PROFILE_SYNC_STOP(); } template ::SortPart for(MFIter mfi = MakeMFIter(lev); mfi.isValid(); ++mfi) { - auto& ptile = ParticlesAt(lev, mfi); - auto& aos = ptile.GetArrayOfStructs(); - const size_t np = aos.numParticles(); - auto pstruct_ptr = aos().dataPtr(); + auto& ptile = ParticlesAt(lev, mfi); + auto& aos = ptile.GetArrayOfStructs(); + auto pstruct_ptr = aos().dataPtr(); + const size_t np = aos.numParticles(); + const size_t np_total = np + aos.numNeighborParticles(); const Box& box = mfi.validbox(); @@ -1164,26 +1179,26 @@ ParticleContainer_impl::SortPart if (memEfficientSort) { { - ParticleVector tmp_particles(np); + ParticleVector tmp_particles(np_total); auto src = ptile.getParticleTileData(); ParticleType* dst = tmp_particles.data(); - AMREX_HOST_DEVICE_FOR_1D( np, i, + AMREX_HOST_DEVICE_FOR_1D( np_total, i, { - dst[i] = src.m_aos[inds[i]]; + dst[i] = i < np ? src.m_aos[inds[i]] : src.m_aos[i]; }); Gpu::streamSynchronize(); ptile.GetArrayOfStructs()().swap(tmp_particles); } - RealVector tmp_real(np); + RealVector tmp_real(np_total); for (int comp = 0; comp < NArrayReal + m_num_runtime_real; ++comp) { auto src = ptile.GetStructOfArrays().GetRealData(comp).data(); ParticleReal* dst = tmp_real.data(); - AMREX_HOST_DEVICE_FOR_1D( np, i, + AMREX_HOST_DEVICE_FOR_1D( np_total, i, { - dst[i] = src[inds[i]]; + dst[i] = i < np ? src[inds[i]] : src[i]; }); Gpu::streamSynchronize(); @@ -1191,13 +1206,13 @@ ParticleContainer_impl::SortPart ptile.GetStructOfArrays().GetRealData(comp).swap(tmp_real); } - IntVector tmp_int(np); + IntVector tmp_int(np_total); for (int comp = 0; comp < NArrayInt + m_num_runtime_int; ++comp) { auto src = ptile.GetStructOfArrays().GetIntData(comp).data(); int* dst = tmp_int.data(); - AMREX_HOST_DEVICE_FOR_1D( np, i, + AMREX_HOST_DEVICE_FOR_1D( np_total , i, { - dst[i] = src[inds[i]]; + dst[i] = i < np ? 
src[inds[i]] : src[i]; }); Gpu::streamSynchronize(); @@ -1207,8 +1222,11 @@ ParticleContainer_impl::SortPart } else { ParticleTileType ptile_tmp; ptile_tmp.define(m_num_runtime_real, m_num_runtime_int); - ptile_tmp.resize(np); + ptile_tmp.resize(np_total); + // copy re-ordered particles gatherParticles(ptile_tmp, ptile, np, m_bins.permutationPtr()); + // copy neighbor particles + amrex::copyParticles(ptile_tmp, ptile, np, np, np_total-np); ptile.swap(ptile_tmp); } } @@ -1271,7 +1289,8 @@ ParticleContainer_impl Vector > new_sizes(num_levels); const auto plo = Geom(0).ProbLoArray(); const auto phi = Geom(0).ProbHiArray(); - const auto rhi = Geom(0).RoundoffHiArray(); + const auto rlo = Geom(0).ProbLoArrayInParticleReal(); + const auto rhi = Geom(0).ProbHiArrayInParticleReal(); const auto is_per = Geom(0).isPeriodicArray(); for (int lev = lev_min; lev <= finest_lev_particles; ++lev) { @@ -1292,7 +1311,7 @@ ParticleContainer_impl "perhaps particles have not been initialized correctly?"); int num_stay = partitionParticlesByDest(src_tile, assign_grid, BufferMap(), - plo, phi, rhi, is_per, lev, gid, tid, + plo, phi, rlo, rhi, is_per, lev, gid, tid, lev_min, lev_max, nGrow, remove_negative); int num_move = np - num_stay; diff --git a/Src/Particle/AMReX_ParticleInit.H b/Src/Particle/AMReX_ParticleInit.H index 7aa2141c0b0..c21d0ea3da7 100644 --- a/Src/Particle/AMReX_ParticleInit.H +++ b/Src/Particle/AMReX_ParticleInit.H @@ -1022,8 +1022,6 @@ InitRandom (Long icount, ParticleLocData pld; - int cnt = 0; - Vector, Gpu::HostVector > > host_particles; host_particles.reserve(15); host_particles.resize(finestLevel()+1); @@ -1079,8 +1077,6 @@ InitRandom (Long icount, for (int i = 0; i < NArrayInt; i++) { host_int_attribs[pld.m_lev][ind][i].push_back(pdata.int_array_data[i]); } - - cnt++; } } diff --git a/Src/Particle/AMReX_ParticleUtil.H b/Src/Particle/AMReX_ParticleUtil.H index 7a4df446e56..e0ec8944361 100644 --- a/Src/Particle/AMReX_ParticleUtil.H +++ b/Src/Particle/AMReX_ParticleUtil.H @@ -556,7 +556,8 @@ AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE bool enforcePeriodic (P& p, amrex::GpuArray const& plo, amrex::GpuArray const& phi, - amrex::GpuArray const& rhi, + amrex::GpuArray const& rlo, + amrex::GpuArray const& rhi, amrex::GpuArray const& is_per) noexcept { bool shifted = false; @@ -568,7 +569,9 @@ bool enforcePeriodic (P& p, p.pos(idim) -= static_cast(phi[idim] - plo[idim]); } // clamp to avoid precision issues; - if (p.pos(idim) < plo[idim]) p.pos(idim) = static_cast(plo[idim]); + if (p.pos(idim) < rlo[idim]) { + p.pos(idim) = rlo[idim]; + } shifted = true; } else if (p.pos(idim) < plo[idim]) { @@ -576,8 +579,8 @@ bool enforcePeriodic (P& p, p.pos(idim) += static_cast(phi[idim] - plo[idim]); } // clamp to avoid precision issues; - if (p.pos(idim) >= rhi[idim]) { - p.pos(idim) = static_cast(rhi[idim]); + if (p.pos(idim) > rhi[idim]) { + p.pos(idim) = rhi[idim]; } shifted = true; } @@ -594,7 +597,8 @@ int partitionParticlesByDest (PTile& ptile, const PLocator& ploc, const ParticleBufferMap& pmap, const GpuArray& plo, const GpuArray& phi, - const GpuArray& rhi, + const GpuArray& rlo, + const GpuArray& rhi, const GpuArray& is_per, int lev, int gid, int /*tid*/, int lev_min, int lev_max, int nGrow, bool remove_negative) @@ -641,7 +645,7 @@ partitionParticlesByDest (PTile& ptile, const PLocator& ploc, const ParticleBuff else { auto p_prime = p; - enforcePeriodic(p_prime, plo, phi, rhi, is_per); + enforcePeriodic(p_prime, plo, phi, rlo, rhi, is_per); auto tup_prime = ploc(p_prime, lev_min, lev_max, nGrow); 
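
SortParticlesByBin above now sizes its temporaries to np_total = np + numNeighborParticles and permutes only the first np entries; the neighbor tail is copied through unchanged, either via the i < np test in the kernels or via amrex::copyParticles in the tile-swap branch, so swapping the buffers no longer drops the neighbors. A serial sketch of that gather:

```cpp
#include <cstddef>
#include <vector>

// Gather used when a tile also holds neighbor particles: only the first
// np ("real") entries are reordered by the sort permutation inds; the
// tail [np, src.size()) is copied through in place.
template <typename T>
std::vector<T> gather_with_tail (const std::vector<T>& src,
                                 const std::vector<std::size_t>& inds,
                                 std::size_t np)
{
    std::vector<T> dst(src.size());
    for (std::size_t i = 0; i < src.size(); ++i) {
        dst[i] = (i < np) ? src[inds[i]] : src[i];
    }
    return dst;
}
```
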
assigned_grid = amrex::get<0>(tup_prime); assigned_lev = amrex::get<1>(tup_prime); diff --git a/Src/Particle/AMReX_WriteBinaryParticleData.H b/Src/Particle/AMReX_WriteBinaryParticleData.H index 6c9494f88c5..b5c59174ae3 100644 --- a/Src/Particle/AMReX_WriteBinaryParticleData.H +++ b/Src/Particle/AMReX_WriteBinaryParticleData.H @@ -231,12 +231,26 @@ packIOData (Vector& idata, Vector& rdata, const PC& pc, int l } } + for (int j = 0; j < ptd.m_num_runtime_int; j++) { + if (write_int_comp_d_ptr[PC::SuperParticleType::NInt + j]) { + idata_d_ptr[iout_index] = ptd.m_runtime_idata[j][pindex]; + iout_index++; + } + } + for (int j = 0; j < PC::SuperParticleType::NReal; j++) { if (write_real_comp_d_ptr[j]) { rdata_d_ptr[rout_index] = p.rdata(j); rout_index++; } } + + for (int j = 0; j < ptd.m_num_runtime_real; j++) { + if (write_real_comp_d_ptr[PC::SuperParticleType::NReal + j]) { + rdata_d_ptr[rout_index] = ptd.m_runtime_rdata[j][pindex]; + rout_index++; + } + } } }); @@ -752,7 +766,25 @@ void WriteBinaryParticleDataAsync (PC const& pc, if (np_per_grid_local[lev][mfi.index()] > 0) { const auto& ptile = pc.ParticlesAt(lev, mfi); - new_ptile.resize(np_per_grid_local[lev][mfi.index()]); + + const auto np = np_per_grid_local[lev][mfi.index()]; + + new_ptile.resize(np); + + const auto runtime_real_comps = ptile.NumRuntimeRealComps(); + const auto runtime_int_comps = ptile.NumRuntimeIntComps(); + + constexpr auto NReal = NArrayReal + NStructReal; + constexpr auto NInt = NArrayInt + NStructInt; + + new_ptile.define(runtime_real_comps, runtime_int_comps); + + for (auto comp(0); comp < runtime_real_comps; ++comp) + new_ptile.push_back_real(NReal+comp, np, 0.); + + for (auto comp(0); comp < runtime_int_comps; ++comp) + new_ptile.push_back_int(NInt+comp, np, 0); + amrex::filterParticles(new_ptile, ptile, KeepValidFilter()); } } diff --git a/Tests/Amr/Advection_AmrCore/Source/AdvancePhiAllLevels.cpp b/Tests/Amr/Advection_AmrCore/Source/AdvancePhiAllLevels.cpp index b5e48e6e409..4f97cbf3184 100644 --- a/Tests/Amr/Advection_AmrCore/Source/AdvancePhiAllLevels.cpp +++ b/Tests/Amr/Advection_AmrCore/Source/AdvancePhiAllLevels.cpp @@ -35,7 +35,8 @@ AmrCoreAdv::AdvancePhiAllLevels (Real time, Real dt_lev, int /*iteration*/) // State with ghost cells MultiFab Sborder(grids[lev], dmap[lev], phi_new[lev].nComp(), num_grow); - FillPatch(lev, time, Sborder, 0, Sborder.nComp()); + FillPatch(lev, time, Sborder, 0, Sborder.nComp(), + FillPatchType::fillpatch_function); #ifdef AMREX_USE_OMP #pragma omp parallel if (Gpu::notInLaunchRegion()) diff --git a/Tests/Amr/Advection_AmrCore/Source/AdvancePhiAtLevel.cpp b/Tests/Amr/Advection_AmrCore/Source/AdvancePhiAtLevel.cpp index 3ddd055eda0..7a5e1abbaa7 100644 --- a/Tests/Amr/Advection_AmrCore/Source/AdvancePhiAtLevel.cpp +++ b/Tests/Amr/Advection_AmrCore/Source/AdvancePhiAtLevel.cpp @@ -33,7 +33,8 @@ AmrCoreAdv::AdvancePhiAtLevel (int lev, Real time, Real dt_lev, int /*iteration* // State with ghost cells MultiFab Sborder(grids[lev], dmap[lev], S_new.nComp(), num_grow); - FillPatch(lev, time, Sborder, 0, Sborder.nComp()); + FillPatch(lev, time, Sborder, 0, Sborder.nComp(), + FillPatchType::fillpatch_class); #ifdef AMREX_USE_OMP #pragma omp parallel if (Gpu::notInLaunchRegion()) diff --git a/Tests/Amr/Advection_AmrCore/Source/AmrCoreAdv.H b/Tests/Amr/Advection_AmrCore/Source/AmrCoreAdv.H index e330d30e740..1b6832d8663 100644 --- a/Tests/Amr/Advection_AmrCore/Source/AmrCoreAdv.H +++ b/Tests/Amr/Advection_AmrCore/Source/AmrCoreAdv.H @@ -10,6 +10,7 @@ #include #include #include +#include 
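
The AMReX_WriteBinaryParticleData.H hunks above extend particle I/O to runtime-added components: they are indexed after the compiled-in components, gated by the same write_*_comp flag arrays, and the async path now defines and zero-fills them on the temporary tile before filtering. A serial sketch of the masked packing layout (names hypothetical):

```cpp
#include <cstddef>
#include <vector>

// Fixed ("compiled-in") components come first, runtime components after,
// and a component is emitted only if its flag is set. Reader and writer
// share the flag array, so the output offsets stay consistent.
std::vector<double> pack_real (const std::vector<double>& fixed_comps,
                               const std::vector<double>& runtime_comps,
                               const std::vector<int>& write_flag)
{
    std::vector<double> out;
    std::size_t nfixed = fixed_comps.size();
    for (std::size_t j = 0; j < nfixed; ++j) {
        if (write_flag[j]) { out.push_back(fixed_comps[j]); }
    }
    for (std::size_t j = 0; j < runtime_comps.size(); ++j) {
        if (write_flag[nfixed + j]) { out.push_back(runtime_comps[j]); }
    }
    return out;
}
```
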
 #ifdef AMREX_USE_OMP
 #   include <omp.h>
@@ -98,15 +99,18 @@ private:
     // more flexible version of AverageDown() that lets you average down across multiple levels
     void AverageDownTo (int crse_lev);
 
+    enum class FillPatchType { fillpatch_class, fillpatch_function };
+
     // compute a new multifab by copying in phi from valid region and filling ghost cells
     // works for single level and 2-level cases (fill fine grid ghost by interpolating from coarse)
-    void FillPatch (int lev, amrex::Real time, amrex::MultiFab& mf, int icomp, int ncomp);
+    void FillPatch (int lev, amrex::Real time, amrex::MultiFab& mf, int icomp,
+                    int ncomp, FillPatchType fptype);
 
     // fill an entire multifab by interpolating from the coarser level
     // this comes into play when a new level of refinement appears
     void FillCoarsePatch (int lev, amrex::Real time, amrex::MultiFab& mf, int icomp, int ncomp);
 
-    // utility to copy in data from phi_old and/or phi_new into another multifab
+    // Pack pointers to phi_old and/or phi_new and associated times.
     void GetData (int lev, amrex::Real time, amrex::Vector<amrex::MultiFab*>& data,
                   amrex::Vector<amrex::Real>& datatime);
@@ -165,6 +169,9 @@ private:
     // used in the reflux operation
     amrex::Vector<std::unique_ptr<amrex::FluxRegister> > flux_reg;
 
+    // This is for fillpatch during timestepping, but not for regridding.
+    amrex::Vector<std::unique_ptr<amrex::FillPatcher<amrex::MultiFab>>> fillpatcher;
+
     // Velocity on all faces at all levels
     amrex::Vector< amrex::Array<amrex::MultiFab, AMREX_SPACEDIM> > facevel;
diff --git a/Tests/Amr/Advection_AmrCore/Source/AmrCoreAdv.cpp b/Tests/Amr/Advection_AmrCore/Source/AmrCoreAdv.cpp
index 62c9dc7417e..3300e4622cc 100644
--- a/Tests/Amr/Advection_AmrCore/Source/AmrCoreAdv.cpp
+++ b/Tests/Amr/Advection_AmrCore/Source/AmrCoreAdv.cpp
@@ -2,7 +2,6 @@
 #include
 #include
 #include
-#include
 #include
 #include
 #include
@@ -95,6 +94,10 @@ AmrCoreAdv::AmrCoreAdv ()
     // with the lev/lev-1 interface (and has grid spacing associated with lev-1)
     // therefore flux_reg[0] is never actually used in the reflux operation
     flux_reg.resize(nlevs_max+1);
+
+    // fillpatcher[lev] is for filling data on level lev using the data on
+    // lev-1 and lev.
+    fillpatcher.resize(nlevs_max+1);
 }
 
 AmrCoreAdv::~AmrCoreAdv ()
@@ -230,7 +233,8 @@ AmrCoreAdv::RemakeLevel (int lev, Real time, const BoxArray& ba,
     MultiFab new_state(ba, dm, ncomp, ng);
     MultiFab old_state(ba, dm, ncomp, ng);
 
-    FillPatch(lev, time, new_state, 0, ncomp);
+    // Must use fillpatch_function
+    FillPatch(lev, time, new_state, 0, ncomp, FillPatchType::fillpatch_function);
 
     std::swap(new_state, phi_new[lev]);
     std::swap(old_state, phi_old[lev]);
@@ -257,6 +261,7 @@ AmrCoreAdv::ClearLevel (int lev)
     phi_new[lev].clear();
     phi_old[lev].clear();
     flux_reg[lev].reset(nullptr);
+    fillpatcher[lev].reset(nullptr);
 }
 
 // Make a new level from scratch using provided BoxArray and DistributionMapping.
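
The fillpatcher member declared above is a cached helper: it is created lazily the first time a level is filled during timestepping, reused while the data it caches stay valid, and reset by the later hunks whenever the underlying level data change (after a coarse advance, an average-down, or a regrid). A sketch of that lifecycle with a stand-in for FillPatcher<MultiFab>:

```cpp
#include <memory>

// Stand-in for FillPatcher<MultiFab>: caches coarse data and stencils
// used to fill fine ghost cells between regrids.
struct LevelFillPatcher { /* cached coarse/fine data ... */ };

struct Hierarchy {
    std::unique_ptr<LevelFillPatcher> fillpatcher;

    LevelFillPatcher& get_fillpatcher () {
        if (!fillpatcher) {                 // lazy creation on first use
            fillpatcher = std::make_unique<LevelFillPatcher>();
        }
        return *fillpatcher;
    }
    void on_coarse_data_changed () {        // mirrors fillpatcher[lev].reset()
        fillpatcher.reset();
    }
};
```
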
@@ -418,7 +423,8 @@ AmrCoreAdv::AverageDownTo (int crse_lev) // compute a new multifab by coping in phi from valid region and filling ghost cells // works for single level and 2-level cases (fill fine grid ghost by interpolating from coarse) void -AmrCoreAdv::FillPatch (int lev, Real time, MultiFab& mf, int icomp, int ncomp) +AmrCoreAdv::FillPatch (int lev, Real time, MultiFab& mf, int icomp, int ncomp, + FillPatchType fptype) { if (lev == 0) { @@ -450,16 +456,31 @@ AmrCoreAdv::FillPatch (int lev, Real time, MultiFab& mf, int icomp, int ncomp) Interpolater* mapper = &cell_cons_interp; + if (fptype == FillPatchType::fillpatch_class) { + if (fillpatcher[lev] == nullptr) { + fillpatcher[lev] = std::make_unique> + (boxArray(lev ), DistributionMap(lev ), Geom(lev ), + boxArray(lev-1), DistributionMap(lev-1), Geom(lev-1), + mf.nGrowVect(), mf.nComp(), mapper); + } + } + if(Gpu::inLaunchRegion()) { GpuBndryFuncFab gpu_bndry_func(AmrCoreFill{}); PhysBCFunct > cphysbc(geom[lev-1],bcs,gpu_bndry_func); PhysBCFunct > fphysbc(geom[lev],bcs,gpu_bndry_func); - amrex::FillPatchTwoLevels(mf, time, cmf, ctime, fmf, ftime, - 0, icomp, ncomp, geom[lev-1], geom[lev], - cphysbc, 0, fphysbc, 0, refRatio(lev-1), - mapper, bcs, 0); + if (fptype == FillPatchType::fillpatch_class) { + fillpatcher[lev]->fill(mf, mf.nGrowVect(), time, + cmf, ctime, fmf, ftime, 0, icomp, ncomp, + cphysbc, 0, fphysbc, 0, bcs, 0); + } else { + amrex::FillPatchTwoLevels(mf, time, cmf, ctime, fmf, ftime, + 0, icomp, ncomp, geom[lev-1], geom[lev], + cphysbc, 0, fphysbc, 0, refRatio(lev-1), + mapper, bcs, 0); + } } else { @@ -467,10 +488,16 @@ AmrCoreAdv::FillPatch (int lev, Real time, MultiFab& mf, int icomp, int ncomp) PhysBCFunct cphysbc(geom[lev-1],bcs,bndry_func); PhysBCFunct fphysbc(geom[lev],bcs,bndry_func); - amrex::FillPatchTwoLevels(mf, time, cmf, ctime, fmf, ftime, - 0, icomp, ncomp, geom[lev-1], geom[lev], - cphysbc, 0, fphysbc, 0, refRatio(lev-1), - mapper, bcs, 0); + if (fptype == FillPatchType::fillpatch_class) { + fillpatcher[lev]->fill(mf, mf.nGrowVect(), time, + cmf, ctime, fmf, ftime, 0, icomp, ncomp, + cphysbc, 0, fphysbc, 0, bcs, 0); + } else { + amrex::FillPatchTwoLevels(mf, time, cmf, ctime, fmf, ftime, + 0, icomp, ncomp, geom[lev-1], geom[lev], + cphysbc, 0, fphysbc, 0, refRatio(lev-1), + mapper, bcs, 0); + } } } } @@ -513,21 +540,18 @@ AmrCoreAdv::FillCoarsePatch (int lev, Real time, MultiFab& mf, int icomp, int nc } } -// utility to copy in data from phi_old and/or phi_new into another multifab void AmrCoreAdv::GetData (int lev, Real time, Vector& data, Vector& datatime) { data.clear(); datatime.clear(); - const Real teps = (t_new[lev] - t_old[lev]) * 1.e-3; - - if (time > t_new[lev] - teps && time < t_new[lev] + teps) + if (amrex::almostEqual(time, t_new[lev], 5)) { data.push_back(&phi_new[lev]); datatime.push_back(t_new[lev]); } - else if (time > t_old[lev] - teps && time < t_old[lev] + teps) + else if (amrex::almostEqual(time, t_old[lev], 5)) { data.push_back(&phi_old[lev]); datatime.push_back(t_old[lev]); @@ -631,6 +655,8 @@ AmrCoreAdv::timeStepWithSubcycling (int lev, Real time, int iteration) } AverageDownTo(lev); // average lev+1 down to lev + + fillpatcher[lev+1].reset(); // Because the data on lev have changed. } @@ -694,6 +720,10 @@ AmrCoreAdv::timeStepNoSubcycling (Real time, int iteration) // Make sure the coarser levels are consistent with the finer levels AverageDown (); + for (auto& fp : fillpatcher) { + fp.reset(); // Because the data have changed. 
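
GetData above drops the hand-rolled teps window, which degenerates when t_new equals t_old (teps is then zero and even an exact time match fails the strict inequalities), in favor of amrex::almostEqual(time, t, 5), a units-in-the-last-place comparison. A sketch of the idea behind such a comparison; this illustrates the concept rather than reproducing the AMReX implementation:

```cpp
#include <algorithm>
#include <cmath>
#include <limits>

// Two times match if they differ by at most `ulp` units in the last
// place, scaled to the magnitude of the operands; near zero, fall back
// to an absolute test against the smallest normal value.
template <typename T>
bool almost_equal (T x, T y, int ulp = 5)
{
    T diff  = std::abs(x - y);
    T scale = std::max(std::abs(x), std::abs(y));
    return diff <= std::numeric_limits<T>::epsilon() * scale * ulp
        || diff <  std::numeric_limits<T>::min();
}
```

Unlike the old window, this test is scale-free: it works whether the simulation time is 1e-9 or 1e+9, and it never rejects an exactly equal time.
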
+ } + for (int lev = 0; lev <= finest_level; lev++) ++istep[lev]; diff --git a/Tests/Amr/Advection_AmrCore/Source/DefineVelocity.cpp b/Tests/Amr/Advection_AmrCore/Source/DefineVelocity.cpp index 995393e05f8..4dc1076dec8 100644 --- a/Tests/Amr/Advection_AmrCore/Source/DefineVelocity.cpp +++ b/Tests/Amr/Advection_AmrCore/Source/DefineVelocity.cpp @@ -39,7 +39,7 @@ AmrCoreAdv::DefineVelocityAtLevel (int lev, Real time) facevel[lev][2].array(mfi)) }; const Box& psibox = Box(IntVect(AMREX_D_DECL(std::min(ngbxx.smallEnd(0)-1, ngbxy.smallEnd(0)-1), - std::min(ngbxx.smallEnd(1)-1, ngbxy.smallEnd(0)-1), + std::min(ngbxx.smallEnd(1)-1, ngbxy.smallEnd(1)-1), 0)), IntVect(AMREX_D_DECL(std::max(ngbxx.bigEnd(0), ngbxy.bigEnd(0)+1), std::max(ngbxx.bigEnd(1)+1, ngbxy.bigEnd(1)), diff --git a/Tests/Amr/Advection_AmrCore/Source/Src_K/Make.package b/Tests/Amr/Advection_AmrCore/Source/Src_K/Make.package index e98f493727c..5254ff6f63f 100644 --- a/Tests/Amr/Advection_AmrCore/Source/Src_K/Make.package +++ b/Tests/Amr/Advection_AmrCore/Source/Src_K/Make.package @@ -1,3 +1,3 @@ CEXE_headers += Adv_K.H -CEXE_headers += compute_flux_K_$(DIM).H +CEXE_headers += compute_flux_$(DIM)D_K.H CEXE_headers += slope_K.H diff --git a/Tests/Amr/Advection_AmrLevel/Source/AmrLevelAdv.H b/Tests/Amr/Advection_AmrLevel/Source/AmrLevelAdv.H index 1e5bacbc497..faf56357e29 100644 --- a/Tests/Amr/Advection_AmrLevel/Source/AmrLevelAdv.H +++ b/Tests/Amr/Advection_AmrLevel/Source/AmrLevelAdv.H @@ -231,7 +231,7 @@ protected: /* * The data. */ - amrex::FluxRegister* flux_reg; + std::unique_ptr flux_reg; /* * Static data members. diff --git a/Tests/Amr/Advection_AmrLevel/Source/AmrLevelAdv.cpp b/Tests/Amr/Advection_AmrLevel/Source/AmrLevelAdv.cpp index db69749a85f..7fae3038f72 100644 --- a/Tests/Amr/Advection_AmrLevel/Source/AmrLevelAdv.cpp +++ b/Tests/Amr/Advection_AmrLevel/Source/AmrLevelAdv.cpp @@ -36,7 +36,6 @@ int AmrLevelAdv::do_tracers = 0; */ AmrLevelAdv::AmrLevelAdv () { - flux_reg = 0; } /** @@ -51,9 +50,9 @@ AmrLevelAdv::AmrLevelAdv (Amr& papa, : AmrLevel(papa,lev,level_geom,bl,dm,time) { - flux_reg = 0; - if (level > 0 && do_reflux) - flux_reg = new FluxRegister(grids,dmap,crse_ratio,level,NUM_STATE); + if (level > 0 && do_reflux) { + flux_reg = std::make_unique(grids,dmap,crse_ratio,level,NUM_STATE); + } } /** @@ -61,7 +60,6 @@ AmrLevelAdv::AmrLevelAdv (Amr& papa, */ AmrLevelAdv::~AmrLevelAdv () { - delete flux_reg; } /** @@ -74,9 +72,9 @@ AmrLevelAdv::restart (Amr& papa, { AmrLevel::restart(papa,is,bReadSpecial); - BL_ASSERT(flux_reg == 0); - if (level > 0 && do_reflux) - flux_reg = new FluxRegister(grids,dmap,crse_ratio,level,NUM_STATE); + if (level > 0 && do_reflux) { + flux_reg = std::make_unique(grids,dmap,crse_ratio,level,NUM_STATE); + } } /** @@ -88,11 +86,11 @@ AmrLevelAdv::checkPoint (const std::string& dir, VisMF::How how, bool dump_old) { - AmrLevel::checkPoint(dir, os, how, dump_old); + AmrLevel::checkPoint(dir, os, how, dump_old); #ifdef AMREX_PARTICLES - if (do_tracers && level == 0) { - TracerPC->WritePlotFile(dir, "Tracer"); - } + if (do_tracers && level == 0) { + TracerPC->WritePlotFile(dir, "Tracer"); + } #endif } @@ -285,7 +283,8 @@ AmrLevelAdv::advance (Real time, // State with ghost cells MultiFab Sborder(grids, dmap, NUM_STATE, NUM_GROW); - FillPatch(*this, Sborder, NUM_GROW, time, Phi_Type, 0, NUM_STATE); + // We use FillPatcher to do fillpatch here if we can + FillPatcherFill(Sborder, 0, NUM_STATE, NUM_GROW, time, Phi_Type, 0); // MF to hold the mac velocity MultiFab Umac[BL_SPACEDIM]; @@ -601,11 +600,19 
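
AmrLevelAdv above also converts flux_reg from a raw pointer to owned storage (the stripped declaration is presumably std::unique_ptr<amrex::FluxRegister>): the delete in the destructor and the null initializations disappear, and restart simply assigns a fresh std::make_unique result, releasing any previous object. The shape of that cleanup with a stand-in type:

```cpp
#include <memory>

struct Resource { /* stand-in for FluxRegister */ };

struct Level {
    std::unique_ptr<Resource> flux_reg;     // was: Resource* flux_reg = 0

    void init (bool needs_reflux) {
        if (needs_reflux) {
            flux_reg = std::make_unique<Resource>(); // replaces `new`
        }
    }
    // no user-defined destructor needed: unique_ptr frees automatically
};
```
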
@@ AmrLevelAdv::post_timestep (int iteration) // int finest_level = parent->finestLevel(); - if (do_reflux && level < finest_level) + if (do_reflux && level < finest_level) { reflux(); + } - if (level < finest_level) + if (level < finest_level) { avgDown(); + } + + if (level < finest_level) { + // fillpatcher on level+1 needs to be reset because data on this + // level have changed. + getLevel(level+1).resetFillPatcher(); + } #ifdef AMREX_PARTICLES if (TracerPC) diff --git a/Tests/CMakeLists.txt b/Tests/CMakeLists.txt index 50cc2bb8cb2..8d318f918b8 100644 --- a/Tests/CMakeLists.txt +++ b/Tests/CMakeLists.txt @@ -1,7 +1,7 @@ # # List of subdirectories to search for CMakeLists. # -set( AMREX_TESTS_SUBDIRS AsyncOut MultiBlock Amr CLZ Parser) +set( AMREX_TESTS_SUBDIRS AsyncOut MultiBlock Amr CLZ Parser CTOParFor) if (AMReX_PARTICLES) list(APPEND AMREX_TESTS_SUBDIRS Particles) diff --git a/Tests/CTOParFor/CMakeLists.txt b/Tests/CTOParFor/CMakeLists.txt new file mode 100644 index 00000000000..57c1e7715e2 --- /dev/null +++ b/Tests/CTOParFor/CMakeLists.txt @@ -0,0 +1,7 @@ +set(_sources main.cpp) +set(_input_files) + +setup_test(_sources _input_files) + +unset(_sources) +unset(_input_files) diff --git a/Tests/CTOParFor/GNUmakefile b/Tests/CTOParFor/GNUmakefile new file mode 100644 index 00000000000..0dbc65578af --- /dev/null +++ b/Tests/CTOParFor/GNUmakefile @@ -0,0 +1,20 @@ +AMREX_HOME = ../../ + +DEBUG = FALSE +DIM = 3 +COMP = gcc + +USE_MPI = FALSE +USE_OMP = FALSE +USE_CUDA = FALSE + +TINY_PROFILE = FALSE + +CXXSTD = c++17 + +include $(AMREX_HOME)/Tools/GNUMake/Make.defs + +include ./Make.package +include $(AMREX_HOME)/Src/Base/Make.package + +include $(AMREX_HOME)/Tools/GNUMake/Make.rules diff --git a/Tests/CTOParFor/Make.package b/Tests/CTOParFor/Make.package new file mode 100644 index 00000000000..4497b0e25b9 --- /dev/null +++ b/Tests/CTOParFor/Make.package @@ -0,0 +1,4 @@ +CEXE_sources += main.cpp + + + diff --git a/Tests/CTOParFor/main.cpp b/Tests/CTOParFor/main.cpp new file mode 100644 index 00000000000..0cf1d7ea35a --- /dev/null +++ b/Tests/CTOParFor/main.cpp @@ -0,0 +1,64 @@ +#include +#include + +using namespace amrex; + +int main (int argc, char* argv[]) +{ + amrex::Initialize(argc,argv); +#if (__cplusplus >= 201703L) + { + enum A_options: int { + A0 = 0, A1 + }; + + enum B_options: int { + B0 = 0, B1, B2 + }; + + Box box(IntVect(0),IntVect(7)); + IArrayBox fab(box,2); + fab.setVal(-10); + + auto const& arr = fab.array(); + + for (int ia = 0; ia < 2; ++ia) { + for (int ib = 0; ib < 3; ++ib) { + ParallelFor(TypeList, + CompileTimeOptions>{}, + {ia, ib}, + box, [=] AMREX_GPU_DEVICE (int i, int j, int k, + auto A_control, + auto B_control) + { + auto const& larr = arr; + int a, b; + if constexpr (A_control.value == 0) { + a = 0; + } else if constexpr (A_control.value == 1) { + a = 1; + } else { + a = -1; + } + if constexpr (B_control.value == 0) { + b = 0; + } else if constexpr (B_control.value == 1) { + b = 1; + } else if constexpr (B_control.value == 2) { + b = 2; + } else if constexpr (B_control.value == 3) { + b = 3; + } + larr(i,j,k) = a*10 + b; + }); + + auto s = fab.sum(0); + AMREX_ALWAYS_ASSERT(s == box.numPts()*(ia*10+ib)); + } + } + } +#else + amrex::Print() << "This test requires C++17." 
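
The new CTOParFor test drives the compile-time-options ParallelFor overload: a TypeList of CompileTimeOptions (presumably TypeList<CompileTimeOptions<A0,A1>, CompileTimeOptions<B0,B1,B2>>) plus the runtime pair {ia, ib} select one kernel instantiation, inside which A_control.value and B_control.value are constant expressions, so the if constexpr branches compile away rather than being tested per cell. A pure-C++17 sketch of the underlying dispatch, with a hypothetical dispatch helper:

```cpp
#include <cassert>
#include <utility>

// Map a runtime option value onto a compile-time constant: f is invoked
// with std::integral_constant<int, A> for whichever A matches `a`, so the
// body may branch with `if constexpr`. The AMReX overload generalizes
// this to several option lists and fuses it with the kernel launch.
template <typename F, int... As>
void dispatch (int a, F&& f, std::integer_sequence<int, As...>)
{
    ( (a == As ? (f(std::integral_constant<int, As>{}), void()) : void()), ... );
}

int main ()
{
    for (int ia = 0; ia < 2; ++ia) {
        int result = -1;
        dispatch(ia, [&] (auto A_control) {
            if constexpr (A_control.value == 0) { result = 10; }
            else                                { result = 20; }
        }, std::integer_sequence<int, 0, 1>{});
        assert(result == (ia == 0 ? 10 : 20));
    }
    return 0;
}
```
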
<< std::endl; +#endif + amrex::Finalize(); +} diff --git a/Tests/EB/CNS/Source/main.cpp b/Tests/EB/CNS/Source/main.cpp index aa851c47956..393431f8f79 100644 --- a/Tests/EB/CNS/Source/main.cpp +++ b/Tests/EB/CNS/Source/main.cpp @@ -53,7 +53,13 @@ int main (int argc, char* argv[]) AmrLevel::SetEBSupportLevel(EBSupport::full); AmrLevel::SetEBMaxGrowCells(CNS::numGrow(),4,2); - initialize_EB2(amr.Geom(amr.maxLevel()), amr.maxLevel(), amr.maxLevel()); + int max_eb_level = amr.maxLevel(); + ParmParse pp("amr"); + pp.query("max_eb_level", max_eb_level); + initialize_EB2(amr.Geom(max_eb_level), max_eb_level, max_eb_level); + if (max_eb_level < amr.maxLevel()) { + EB2::addFineLevels(amr.maxLevel() - max_eb_level); + } amr.init(strt_time,stop_time); diff --git a/Tests/GPU/CNS/Source/CNS.H b/Tests/GPU/CNS/Source/CNS.H index 877f0b523da..eedb7d486ba 100644 --- a/Tests/GPU/CNS/Source/CNS.H +++ b/Tests/GPU/CNS/Source/CNS.H @@ -157,6 +157,8 @@ protected: static int do_reflux; + static int rk_order; + static bool do_visc; static bool use_const_visc; diff --git a/Tests/GPU/CNS/Source/CNS.cpp b/Tests/GPU/CNS/Source/CNS.cpp index c3b5e2fb600..1a073c68c8a 100644 --- a/Tests/GPU/CNS/Source/CNS.cpp +++ b/Tests/GPU/CNS/Source/CNS.cpp @@ -19,6 +19,7 @@ int CNS::verbose = 0; IntVect CNS::hydro_tile_size {AMREX_D_DECL(1024,16,16)}; Real CNS::cfl = 0.3; int CNS::do_reflux = 1; +int CNS::rk_order = 2; int CNS::refine_max_dengrad_lev = -1; Real CNS::refine_dengrad = 1.0e10; @@ -241,6 +242,9 @@ CNS::post_timestep (int /*iteration*/) if (level < parent->finestLevel()) { avgDown(); + // fillpatcher on level+1 needs to be reset because data on this + // level have changed. + getLevel(level+1).resetFillPatcher(); } } @@ -354,6 +358,7 @@ CNS::read_params () } pp.query("do_reflux", do_reflux); + pp.query("rk_order", rk_order); pp.query("do_visc", do_visc); diff --git a/Tests/GPU/CNS/Source/CNS_advance.cpp b/Tests/GPU/CNS/Source/CNS_advance.cpp index c086cac0e9f..99749dded19 100644 --- a/Tests/GPU/CNS/Source/CNS_advance.cpp +++ b/Tests/GPU/CNS/Source/CNS_advance.cpp @@ -7,7 +7,7 @@ using namespace amrex; Real -CNS::advance (Real time, Real dt, int /*iteration*/, int /*ncycle*/) +CNS::advance (Real time, Real dt, int iteration, int ncycle) { BL_PROFILE("CNS::advance()"); @@ -16,11 +16,6 @@ CNS::advance (Real time, Real dt, int /*iteration*/, int /*ncycle*/) state[i].swapTimeLevels(dt); } - MultiFab& S_new = get_new_data(State_Type); - MultiFab& S_old = get_old_data(State_Type); - MultiFab dSdt(grids,dmap,NUM_STATE,0,MFInfo(),Factory()); - MultiFab Sborder(grids,dmap,NUM_STATE,NUM_GROW,MFInfo(),Factory()); - FluxRegister* fr_as_crse = nullptr; if (do_reflux && level < parent->finestLevel()) { CNS& fine_level = getLevel(level+1); @@ -36,23 +31,14 @@ CNS::advance (Real time, Real dt, int /*iteration*/, int /*ncycle*/) fr_as_crse->setVal(Real(0.0)); } - // RK2 stage 1 - FillPatch(*this, Sborder, NUM_GROW, time, State_Type, 0, NUM_STATE); - compute_dSdt(Sborder, dSdt, Real(0.5)*dt, fr_as_crse, fr_as_fine); - // U^* = U^n + dt*dUdt^n - MultiFab::LinComb(S_new, Real(1.0), Sborder, 0, dt, dSdt, 0, 0, NUM_STATE, 0); - computeTemp(S_new,0); - - // RK2 stage 2 - // After fillpatch Sborder = U^n+dt*dUdt^n - FillPatch(*this, Sborder, NUM_GROW, time+dt, State_Type, 0, NUM_STATE); - compute_dSdt(Sborder, dSdt, Real(0.5)*dt, fr_as_crse, fr_as_fine); - // S_new = 0.5*(Sborder+S_old) = U^n + 0.5*dt*dUdt^n - MultiFab::LinComb(S_new, Real(0.5), Sborder, 0, Real(0.5), S_old, 0, 0, NUM_STATE, 0); - // S_new += 0.5*dt*dSdt - MultiFab::Saxpy(S_new, 
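
The Tests/EB/CNS change above makes the finest EB-geometry level a runtime knob: amr.max_eb_level is read with ParmParse::query, which leaves the default untouched when the key is absent from the inputs, and EB2::addFineLevels generates the remaining levels afterwards. The read-with-default pattern in isolation:

```cpp
#include <AMReX_ParmParse.H>

// Returns the finest level on which to build the EB geometry. query()
// only overwrites max_eb_level when "amr.max_eb_level" appears in the
// inputs file, so the AMR max level stays the default.
int read_max_eb_level (int amr_max_level)
{
    int max_eb_level = amr_max_level;
    amrex::ParmParse pp("amr");
    pp.query("max_eb_level", max_eb_level);
    return max_eb_level;
}
```
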
Real(0.5)*dt, dSdt, 0, 0, NUM_STATE, 0); - // We now have S_new = U^{n+1} = (U^n+0.5*dt*dUdt^n) + 0.5*dt*dUdt^* - computeTemp(S_new,0); + RK(rk_order, State_Type, time, dt, iteration, ncycle, + // Given state S, compute dSdt. dtsub is needed for flux register operations + [&] (int /*stage*/, MultiFab& dSdt, MultiFab const& S, + Real /*t*/, Real dtsub) { + compute_dSdt(S, dSdt, dtsub, fr_as_crse, fr_as_fine); + }, + // Optional. In case if there is anything needed after each RK substep. + [&] (int /*stage*/, MultiFab& S) { computeTemp(S,0); }); return dt; } @@ -254,5 +240,3 @@ CNS::compute_dSdt (const MultiFab& S, MultiFab& dSdt, Real dt, } } } - - diff --git a/Tests/GPU/CNS/Source/diffusion/CNS_diffusion_K.H b/Tests/GPU/CNS/Source/diffusion/CNS_diffusion_K.H index b9bf5a18f78..75f4f784fad 100644 --- a/Tests/GPU/CNS/Source/diffusion/CNS_diffusion_K.H +++ b/Tests/GPU/CNS/Source/diffusion/CNS_diffusion_K.H @@ -17,24 +17,24 @@ cns_diffcoef (int i, int j, int k, { using amrex::Real; - coefs(i,j,k,CETA) = parm.C_S * std::sqrt(q(i,j,k,QTEMP)) * q(i,j,k,QTEMP) / (q(i,j,k,QTEMP)+parm.T_S); - coefs(i,j,k,CXI) = Real(0.0); - coefs(i,j,k,CLAM) = coefs(i,j,k,CETA)*parm.cp/parm.Pr; + coefs(i,j,k,CETA) = parm.C_S * std::sqrt(q(i,j,k,QTEMP)) * q(i,j,k,QTEMP) / (q(i,j,k,QTEMP)+parm.T_S); + coefs(i,j,k,CXI) = Real(0.0); + coefs(i,j,k,CLAM) = coefs(i,j,k,CETA)*parm.cp/parm.Pr; } AMREX_GPU_DEVICE inline void cns_constcoef (int i, int j, int k, - amrex::Array4 const& q, + amrex::Array4 const& /*q*/, amrex::Array4 const& coefs, Parm const& parm) noexcept { using amrex::Real; - coefs(i,j,k,CETA) = parm.const_visc_mu; - coefs(i,j,k,CXI) = parm.const_visc_ki; - coefs(i,j,k,CLAM) = parm.const_lambda; + coefs(i,j,k,CETA) = parm.const_visc_mu; + coefs(i,j,k,CXI) = parm.const_visc_ki; + coefs(i,j,k,CLAM) = parm.const_lambda; } AMREX_GPU_DEVICE @@ -45,7 +45,7 @@ cns_diff_x (int i, int j, int k, amrex::Array4 const& coeffs, amrex::GpuArray const& dxinv, amrex::Array4 const& fx, - Parm const& parm) noexcept + Parm const& /*parm*/) noexcept { using amrex::Real; @@ -81,7 +81,7 @@ cns_diff_y (int i, int j, int k, amrex::Array4 const& q, amrex::Array4 const& coeffs, amrex::GpuArray const& dxinv, amrex::Array4 const& fy, - Parm const& parm) noexcept + Parm const& /*parm*/) noexcept { using amrex::Real; @@ -119,7 +119,7 @@ cns_diff_z (int i, int j, int k, amrex::Array4 const& coeffs, amrex::GpuArray const& dxinv, amrex::Array4 const& fz, - Parm const& parm) noexcept + Parm const& /*parm*/) noexcept { using amrex::Real; diff --git a/Tests/LinearSolvers/CellEB2/inputs.rt.2d b/Tests/LinearSolvers/CellEB2/inputs.rt.2d index 8dfd8a7bb3f..4afdf526259 100644 --- a/Tests/LinearSolvers/CellEB2/inputs.rt.2d +++ b/Tests/LinearSolvers/CellEB2/inputs.rt.2d @@ -11,6 +11,7 @@ max_level = 1 n_cell = 128 max_grid_size = 64 eb2.max_grid_size = 32 +eb2.num_coarsen_opt=3 eb2.geom_type = sphere eb2.sphere_center = 0.5 0.5 0.5 diff --git a/Tests/LinearSolvers/CellEB2/inputs.rt.3d b/Tests/LinearSolvers/CellEB2/inputs.rt.3d index 9a8037a68c0..64fcef6281b 100644 --- a/Tests/LinearSolvers/CellEB2/inputs.rt.3d +++ b/Tests/LinearSolvers/CellEB2/inputs.rt.3d @@ -11,6 +11,7 @@ max_level = 1 n_cell = 128 max_grid_size = 64 eb2.max_grid_size = 32 +eb2.num_coarsen_opt=3 eb2.geom_type = sphere eb2.sphere_center = 0.5 0.5 0.5 diff --git a/Tools/AMRProfParser/GNUmakefile b/Tools/AMRProfParser/GNUmakefile index 619d67a557a..59fd2a54b0c 100644 --- a/Tools/AMRProfParser/GNUmakefile +++ b/Tools/AMRProfParser/GNUmakefile @@ -23,7 +23,6 @@ USE_MPI = FALSE USE_OMP = 
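
CNS::advance above now hands stage bookkeeping to an RK driver parameterized by rk_order and two callbacks: one computes dSdt (dtsub is forwarded for the flux-register scaling) and an optional post-stage hook applies computeTemp. For rk_order == 2 this reproduces the Heun-type scheme the removed lines spelled out by hand; a standalone sketch of that scheme with plain vectors standing in for MultiFabs:

```cpp
#include <cstddef>
#include <functional>
#include <vector>

using State = std::vector<double>;

// Lambda-driven RK2 (Heun): the driver owns the stage arithmetic and
// calls back into user code for the RHS and the per-stage fixup.
void rk2 (State& S, double t, double dt,
          const std::function<void(State&, const State&, double)>& dSdt,
          const std::function<void(State&)>& post_stage)
{
    State S_old = S, k(S.size());
    dSdt(k, S_old, t);                          // stage 1: k1 = f(U^n)
    for (std::size_t i = 0; i < S.size(); ++i) {
        S[i] = S_old[i] + dt * k[i];            // U* = U^n + dt*k1
    }
    post_stage(S);                              // computeTemp in CNS
    dSdt(k, S, t + dt);                         // stage 2: k2 = f(U*)
    for (std::size_t i = 0; i < S.size(); ++i) {
        S[i] = 0.5 * (S_old[i] + S[i]) + 0.5 * dt * k[i];
        // = U^n + 0.5*dt*(k1 + k2), the average the removed code computed
    }
    post_stage(S);
}
```

Pulling the stages into a driver is what lets rk_order become an inputs-file parameter instead of a property hard-wired into each advance() implementation.
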
FALSE EBASE = amrprofparser BL_NO_FORT = FALSE -USE_CXX11 = TRUE include $(AMREX_HOME)/Tools/GNUMake/Make.defs include $(AMREX_HOME)/Src/Base/Make.package diff --git a/Tools/Backtrace/parse_bt.py b/Tools/Backtrace/parse_bt.py index ce4a6684911..dd0234f9120 100755 --- a/Tools/Backtrace/parse_bt.py +++ b/Tools/Backtrace/parse_bt.py @@ -1,4 +1,4 @@ -#!/usr/bin/env python +#!/usr/bin/env python3 import sys import re diff --git a/Tools/CMake/AMReXConfig.cmake.in b/Tools/CMake/AMReXConfig.cmake.in index 6b0cdd3fd74..64a112da181 100644 --- a/Tools/CMake/AMReXConfig.cmake.in +++ b/Tools/CMake/AMReXConfig.cmake.in @@ -223,10 +223,12 @@ endif () # CUDA # # AMReX 21.06+ supports CUDA_ARCHITECTURES -if(CMAKE_VERSION VERSION_LESS 3.20) - if (@AMReX_CUDA@) - include(AMReX_SetupCUDA) - endif () +if (@AMReX_CUDA@) + if (CMAKE_VERSION VERSION_LESS 3.20) + include(AMReX_SetupCUDA) + else () + find_dependency(CUDAToolkit REQUIRED) + endif () endif () include( "${CMAKE_CURRENT_LIST_DIR}/AMReXTargets.cmake" ) diff --git a/Tools/CMake/AMReXFlagsTargets.cmake b/Tools/CMake/AMReXFlagsTargets.cmake index 64dcf3f3a5f..2e89c32fddc 100644 --- a/Tools/CMake/AMReXFlagsTargets.cmake +++ b/Tools/CMake/AMReXFlagsTargets.cmake @@ -82,15 +82,15 @@ target_compile_options( Flags_CXX $<${_cxx_cray_dbg}:-O0> $<${_cxx_cray_rwdbg}:> $<${_cxx_cray_rel}:> - $<${_cxx_clang_dbg}:-O0 -Wall -Wextra -Wno-sign-compare -Wno-unused-parameter -Wno-unused-variable -Wno-pass-failed> - $<${_cxx_clang_rwdbg}:-Wno-pass-failed> - $<${_cxx_clang_rel}:-Wno-pass-failed> - $<${_cxx_appleclang_dbg}:-O0 -Wall -Wextra -Wno-sign-compare -Wno-unused-parameter -Wno-unused-variable -Wno-pass-failed> - $<${_cxx_appleclang_rwdbg}:-Wno-pass-failed> - $<${_cxx_appleclang_rel}:-Wno-pass-failed> - $<${_cxx_intelllvm_dbg}:-O0 -Wall -Wextra -Wno-sign-compare -Wno-unused-parameter -Wno-unused-variable -Wno-pass-failed> - $<${_cxx_intelllvm_rwdbg}:-Wno-pass-failed> - $<${_cxx_intelllvm_rel}:-Wno-pass-failed> + $<${_cxx_clang_dbg}:-O0 -Wall -Wextra -Wno-sign-compare -Wno-unused-parameter -Wno-unused-variable> + $<${_cxx_clang_rwdbg}:> + $<${_cxx_clang_rel}:> + $<${_cxx_appleclang_dbg}:-O0 -Wall -Wextra -Wno-sign-compare -Wno-unused-parameter -Wno-unused-variable> + $<${_cxx_appleclang_rwdbg}:> + $<${_cxx_appleclang_rel}:> + $<${_cxx_intelllvm_dbg}:-O0 -Wall -Wextra -Wno-sign-compare -Wno-unused-parameter -Wno-unused-variable> + $<${_cxx_intelllvm_rwdbg}:> + $<${_cxx_intelllvm_rel}:> ) # diff --git a/Tools/CMake/AMReXParallelBackends.cmake b/Tools/CMake/AMReXParallelBackends.cmake index ebf397266f8..61b563f7c51 100644 --- a/Tools/CMake/AMReXParallelBackends.cmake +++ b/Tools/CMake/AMReXParallelBackends.cmake @@ -198,10 +198,12 @@ if (AMReX_HIP) unset(_valid_hip_compilers) if(NOT DEFINED HIP_PATH) - if(NOT DEFINED ENV{HIP_PATH}) - set(HIP_PATH "/opt/rocm/hip" CACHE PATH "Path to which HIP has been installed") - else() + if(DEFINED ENV{HIP_PATH}) set(HIP_PATH $ENV{HIP_PATH} CACHE PATH "Path to which HIP has been installed") + elseif(DEFINED ENV{ROCM_PATH}) + set(HIP_PATH "$ENV{ROCM_PATH}/hip" CACHE PATH "Path to which HIP has been installed") + else() + set(HIP_PATH "/opt/rocm/hip" CACHE PATH "Path to which HIP has been installed") endif() endif() @@ -255,9 +257,15 @@ if (AMReX_HIP) if(AMReX_ROCTX) # To be modernized in the future, please see: # https://github.com/ROCm-Developer-Tools/roctracer/issues/56 - target_include_directories(amrex PUBLIC ${HIP_PATH}/../roctracer/include ${HIP_PATH}/../rocprofiler/include) - target_link_libraries(amrex PUBLIC 
"-L${HIP_PATH}/../roctracer/lib/ -lroctracer64" "-L${HIP_PATH}/../roctracer/lib -lroctx64") - endif () + target_include_directories(amrex SYSTEM PUBLIC + ${HIP_PATH}/../roctracer/include + ${HIP_PATH}/../rocprofiler/include + ) + target_link_libraries(amrex PUBLIC + "-L${HIP_PATH}/../roctracer/lib -lroctracer64" + "-L${HIP_PATH}/../roctracer/lib -lroctx64" + ) + endif() target_link_libraries(amrex PUBLIC hip::hiprand roc::rocrand roc::rocprim) # avoid forcing the rocm LLVM flags on a gfortran @@ -271,7 +279,7 @@ if (AMReX_HIP) # else there will be a runtime issue (cannot find # missing gpu devices) target_compile_options(amrex PUBLIC - $<$:--amdgpu-target=${AMReX_AMD_ARCH_HIPCC} -Wno-pass-failed>) + $<$:--amdgpu-target=${AMReX_AMD_ARCH_HIPCC}>) endif() target_compile_options(amrex PUBLIC $<$:-m64>) diff --git a/Tools/CMake/AMReXSYCL.cmake b/Tools/CMake/AMReXSYCL.cmake index 8e6c7f2f4d5..007b5f321fe 100644 --- a/Tools/CMake/AMReXSYCL.cmake +++ b/Tools/CMake/AMReXSYCL.cmake @@ -45,7 +45,7 @@ target_compile_features(SYCL INTERFACE cxx_std_17) # target_compile_options( SYCL INTERFACE - $<${_cxx_dpcpp}:-Wno-error=sycl-strict -Wno-pass-failed -fsycl> + $<${_cxx_dpcpp}:-Wno-error=sycl-strict -fsycl> $<${_cxx_dpcpp}:$<$:-fsycl-device-code-split=per_kernel>>) # temporary work-around for DPC++ beta08 bug diff --git a/Tools/CMake/AMReXThirdPartyLibraries.cmake b/Tools/CMake/AMReXThirdPartyLibraries.cmake index 1afbcac4ee2..2b0a90febe1 100644 --- a/Tools/CMake/AMReXThirdPartyLibraries.cmake +++ b/Tools/CMake/AMReXThirdPartyLibraries.cmake @@ -45,7 +45,7 @@ endif () # Sensei # if (AMReX_SENSEI) - find_package(SENSEI REQUIRED) + find_package( SENSEI 4.0.0 REQUIRED ) target_link_libraries( amrex PUBLIC sensei ) endif () diff --git a/Tools/CMake/AMReXTypecheck.cmake b/Tools/CMake/AMReXTypecheck.cmake index 926fcda9daf..0b68fb8c274 100644 --- a/Tools/CMake/AMReXTypecheck.cmake +++ b/Tools/CMake/AMReXTypecheck.cmake @@ -250,7 +250,7 @@ function( add_typecheck_target _target) add_custom_command( OUTPUT ${_cppd_file} COMMAND ${CMAKE_C_COMPILER} - ARGS ${_cxx_defines} ${_includes} -E -P -x c -std=c99 ${_fullname} > ${_cppd_file} + ARGS ${_cxx_defines} ${_includes} -E -P -x c -std=c11 ${_fullname} > ${_cppd_file} COMMAND sed ARGS -i -e 's/amrex::Real/${AMREX_REAL}/g' ${_cppd_file} COMMAND sed diff --git a/Tools/CMake/AMReX_Config.cmake b/Tools/CMake/AMReX_Config.cmake index 1754b339094..c842db1e136 100644 --- a/Tools/CMake/AMReX_Config.cmake +++ b/Tools/CMake/AMReX_Config.cmake @@ -37,22 +37,18 @@ function (configure_amrex) # # Setup compilers # - # Set C++ standard and disable compiler-specific extensions, like "-std=gnu++14" for GNU + # Set C++ standard and disable compiler-specific extensions, like "-std=gnu++17" for GNU # This will also enforce the same standard with the CUDA compiler # Moreover, it will also enforce such standard on all the consuming targets # set_target_properties(amrex PROPERTIES CXX_EXTENSIONS OFF) - # minimum: C++14 on Linux, C++17 on Windows, C++17 for dpc++ and hip - if (AMReX_DPCPP OR AMReX_HIP) - target_compile_features(amrex PUBLIC cxx_std_17) - else () - target_compile_features(amrex PUBLIC $,Windows>,cxx_std_17,cxx_std_14>) - endif () + # minimum: C++17 + target_compile_features(amrex PUBLIC cxx_std_17) if (AMReX_CUDA) set_target_properties(amrex PROPERTIES CUDA_EXTENSIONS OFF) - # minimum: C++14 on Linux, C++17 on Windows - target_compile_features(amrex PUBLIC $,Windows>,cuda_std_17,cuda_std_14>) + # minimum: C++17 + target_compile_features(amrex PUBLIC cuda_std_17) endif() # 
diff --git a/Tools/C_scripts/describe_sources.py b/Tools/C_scripts/describe_sources.py index c49d16694a9..97cfe5e1e1c 100755 --- a/Tools/C_scripts/describe_sources.py +++ b/Tools/C_scripts/describe_sources.py @@ -1,10 +1,6 @@ -#!/usr/bin/env python +#!/usr/bin/env python3 import sys - -if sys.version_info < (2, 7): - sys.exit("ERROR: need python 2.7 or later for dep.py") - import argparse import os import subprocess diff --git a/Tools/C_scripts/gatherbuildtime.py b/Tools/C_scripts/gatherbuildtime.py index 082ec766c28..b0b1740847f 100755 --- a/Tools/C_scripts/gatherbuildtime.py +++ b/Tools/C_scripts/gatherbuildtime.py @@ -1,11 +1,7 @@ -#!/usr/bin/env python +#!/usr/bin/env python3 -from __future__ import print_function import sys, os, glob, operator, time -if sys.version_info < (2, 7): - sys.exit("ERROR: need python 2.7 or later for dep.py") - if __name__ == "__main__": dt = float(sys.argv[3])-float(sys.argv[2]) hours, rem = divmod(dt, 3600) diff --git a/Tools/C_scripts/makebuildinfo_C.py b/Tools/C_scripts/makebuildinfo_C.py index 8a05cd3f75d..07f31c0585a 100755 --- a/Tools/C_scripts/makebuildinfo_C.py +++ b/Tools/C_scripts/makebuildinfo_C.py @@ -186,11 +186,11 @@ def runcommand(command): out = p.stdout.read() return out.strip().decode("ascii") -def get_git_hash(d): +def get_git_hash(d, git_style): cwd = os.getcwd() os.chdir(d) try: - ghash = runcommand("git describe --always --tags --dirty") + ghash = runcommand("git describe " + git_style) except: ghash = "" os.chdir(cwd) @@ -259,6 +259,10 @@ def get_git_hash(d): help="the full path to the build directory that corresponds to build_git_name", type=str, default="") + parser.add_argument("--GIT_STYLE", + help="style options for the 'git describe' command used to construct hash strings", + type=str, default="--always --tags --dirty") + # parse and convert to a dictionary args = parser.parse_args() @@ -281,7 +285,7 @@ def get_git_hash(d): git_hashes = [] for d in GIT: if d and os.path.isdir(d): - git_hashes.append(get_git_hash(d)) + git_hashes.append(get_git_hash(d, args.GIT_STYLE)) else: git_hashes.append("") @@ -291,7 +295,7 @@ def get_git_hash(d): except: build_git_hash = "directory not valid" else: - build_git_hash = get_git_hash(args.build_git_dir) + build_git_hash = get_git_hash(args.build_git_dir, args.GIT_STYLE) os.chdir(running_dir) else: build_git_hash = "" diff --git a/Tools/CompileTesting/compiletesting.py b/Tools/CompileTesting/compiletesting.py index 129e83ca960..9cb5f59bac5 100755 --- a/Tools/CompileTesting/compiletesting.py +++ b/Tools/CompileTesting/compiletesting.py @@ -1,6 +1,5 @@ -#!/usr/bin/env python +#!/usr/bin/env python3 -from __future__ import print_function import sys import os import shlex @@ -148,4 +147,3 @@ def run(command, outfile=None): if __name__ == "__main__": compiletesting(sys.argv[1:]) - diff --git a/Tools/F_scripts/dep.py b/Tools/F_scripts/dep.py index 894dcdb65e6..24bd8318fb8 100755 --- a/Tools/F_scripts/dep.py +++ b/Tools/F_scripts/dep.py @@ -1,4 +1,4 @@ -#!/usr/bin/env python +#!/usr/bin/env python3 # automatically generate Makefile dependencies for Fortran 90 source. # @@ -20,18 +20,7 @@ # (e.g. iso_c_binding). 
Add any system-provided modules to the # `IGNORES` list below -from __future__ import print_function - import sys - -if sys.version_info < (2, 7): - sys.exit("ERROR: need python 2.7 or later for dep.py") - -if sys.version[0] == "2": - reload(sys) - sys.setdefaultencoding('latin-1') - - import io import re import os diff --git a/Tools/F_scripts/f90doc/README b/Tools/F_scripts/f90doc/README deleted file mode 100644 index 6edb2de011f..00000000000 --- a/Tools/F_scripts/f90doc/README +++ /dev/null @@ -1,36 +0,0 @@ -This is f90doc version 0.3.4, a documentation tool for Fortran 90. For -more information (e.g., documentation), see - - http://theory.lcs.mit.edu/~edemaine/f90doc - -or contact Erik Demaine (edemaine@mit.edu). Comments, suggestions, -criticisms, and bug reports go to this e-mail address. If you modify f90doc or -use it in a serious way, please contact me (I'd be interested). - -COPYRIGHT - -f90doc is freeware. If you use it in a research or commercial project, you -must acknowledge the software and its author. I would also appreciate it if -you contact me -- I'd like to know how f90doc is used. If you base code on -f90doc, you must acknowledge this. Again, please let me know if you think your -changes would be at all useful to the rest of the world (even if you are not -willing to share it, the ideas may be useful). - -This information must accompany any copy of f90doc. - -INSTALLATION - -You shouldn't have to compile anything. You can put the file f90doc in -a more accessible place, but the .pl files have to be in the same directory. -Alternatively, you can create a symlink to the real f90doc, where the .pl -files are held. For example, - - ln -s /usr/local/lib/f90doc-0.3.4/f90doc /usr/local/bin/f90doc - -If you don't have a command /usr/bin/env, you'll need to replace the first line -of f90doc with - - #!/path/to/perl5/bin/perl -w - -Otherwise, Perl version 5.003 or higher must be the first program called "perl" -in your path. diff --git a/Tools/F_scripts/f90doc/expr_parse.pl b/Tools/F_scripts/f90doc/expr_parse.pl deleted file mode 100644 index 3e831337041..00000000000 --- a/Tools/F_scripts/f90doc/expr_parse.pl +++ /dev/null @@ -1,793 +0,0 @@ -$yysccsid = "@(#)yaccpar 1.8 (Berkeley) 01/20/91 (Perl 2.0 12/31/92)"; -#define YYBYACC 1 -#line 2 "expr_parse.y" -package expr_parse; - -;# On failure, print out this as the line we were working on. 
-$expr_parse::line = ""; - -;# Portion of line left to parse -$expr_parse::left = ""; -#line 12 "y.tab.pl" -$COMMA=257; -$LPAREN=258; -$RPAREN=259; -$NOT=260; -$OR=261; -$AND=262; -$EQV=263; -$NEQV=264; -$COMPARISON=265; -$DBLSLASH=266; -$PERCENT=267; -$PLUS=268; -$MINUS=269; -$UPLUS=270; -$UMINUS=271; -$ASTERIK=272; -$SLASH=273; -$DBLASTERIK=274; -$CONST=275; -$NAME=276; -$COLON=277; -$LARRAY=278; -$RARRAY=279; -$EQUALS=280; -$YYERRCODE=256; -@yylhs = ( -1, - 0, 0, 1, 1, 2, 2, 2, 2, 2, 2, - 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, - 2, 5, 5, 5, 5, 5, 4, 4, 7, 6, - 6, 3, 3, 3, 8, 8, 9, 9, 10, 10, - 10, 12, 11, 11, 11, 11, -); -@yylen = ( 2, - 1, 2, 1, 1, 1, 3, 2, 2, 2, 3, - 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, - 3, 1, 3, 1, 3, 3, 3, 1, 1, 5, - 7, 1, 3, 4, 0, 1, 3, 1, 1, 1, - 1, 3, 1, 2, 2, 3, -); -@yydefred = ( 0, - 0, 0, 0, 0, 3, 32, 0, 0, 0, 4, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 28, 2, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 10, 0, 6, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 38, 40, 41, 33, - 23, 0, 26, 25, 27, 0, 0, 0, 34, 0, - 0, 0, 0, 37, 0, 0, 0, 0, 0, -); -@yydgoto = ( 8, - 19, 10, 11, 20, 15, 63, 21, 55, 56, 57, - 58, 59, -); -@yysindex = ( -212, - -157, -212, -212, -212, 0, 0, -212, 0, -137, 0, - -246, -241, -29, -234, -235, -19, -223, -223, -29, -257, - 0, 0, -212, -212, -212, -212, -212, -212, -212, -212, - -212, -212, -212, -216, -229, -267, -222, 0, -212, 0, - -255, -19, 227, 227, 236, -164, -223, -223, -233, -233, - -233, -205, -212, -76, -174, -162, 0, 0, 0, 0, - 0, -180, 0, 0, 0, -212, -29, -212, 0, -216, - -212, -29, -29, 0, -118, -212, -95, -212, -29, -); -@yyrindex = ( 0, - 0, 0, 0, 0, 0, 0, 0, 0, 106, 0, - 1, -59, 0, -43, 0, 163, 77, 96, -242, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, -152, 0, 0, 0, 0, 0, 0, - 191, 172, 199, 208, 182, 153, 115, 134, 20, 39, - 58, -175, -219, -214, 0, -146, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, -192, -188, 0, 0, - 0, -183, -178, 0, 0, 0, -145, 0, -143, -); -@yygindex = ( 0, - 2, 116, 0, 0, 0, 85, 84, 0, 0, 60, - 0, 0, -); -$YYTABLESIZE=510; -@yytable = ( 39, - 5, 9, 13, 16, 17, 18, 24, 61, 62, 27, - 28, 34, 29, 30, 29, 36, 31, 32, 33, 12, - 35, 40, 37, 38, 41, 42, 43, 44, 45, 46, - 47, 48, 49, 50, 51, 54, 29, 43, 13, 43, - 33, 1, 39, 2, 39, 1, 60, 2, 31, 32, - 33, 3, 4, 62, 67, 3, 4, 11, 5, 52, - 53, 7, 5, 6, 45, 7, 45, 72, 44, 73, - 44, 54, 75, 42, 66, 42, 7, 77, 46, 79, - 46, 32, 32, 32, 69, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 70, 8, 32, 32, 32, 71, - 1, 32, 2, 29, 30, 1, 35, 31, 32, 33, - 3, 4, 36, 30, 14, 31, 14, 12, 6, 22, - 7, 64, 65, 23, 24, 25, 26, 27, 28, 74, - 29, 30, 0, 15, 31, 32, 33, 0, 76, 0, - 0, 0, 23, 24, 25, 26, 27, 28, 0, 29, - 30, 0, 16, 31, 32, 33, 0, 0, 0, 0, - 0, 78, 9, 0, 0, 23, 24, 25, 26, 27, - 28, 18, 29, 30, 0, 0, 31, 32, 33, 0, - 0, 17, 0, 0, 23, 24, 25, 26, 27, 28, - 19, 29, 30, 0, 0, 31, 32, 33, 20, 22, - 68, 3, 3, 3, 3, 3, 3, 21, 3, 3, - 0, 0, 3, 3, 3, 24, 0, 4, 4, 4, - 4, 4, 4, 0, 4, 4, 0, 0, 4, 4, - 4, 23, 24, 25, 26, 27, 28, 0, 29, 30, - 0, 0, 31, 32, 33, 27, 28, 0, 29, 30, - 0, 0, 31, 32, 33, 0, 0, 5, 0, 5, - 0, 5, 5, 5, 5, 5, 5, 0, 5, 5, - 0, 0, 5, 5, 5, 0, 12, 5, 12, 5, - 12, 12, 12, 12, 12, 12, 0, 12, 12, 0, - 0, 12, 12, 0, 0, 13, 12, 13, 12, 13, - 13, 13, 13, 13, 13, 0, 13, 13, 0, 0, - 13, 13, 0, 0, 11, 13, 11, 13, 11, 11, - 11, 11, 11, 11, 0, 11, 11, 0, 0, 11, - 11, 0, 0, 7, 11, 7, 11, 7, 7, 7, - 7, 7, 7, 0, 7, 7, 0, 0, 0, 0, - 0, 0, 8, 7, 8, 7, 8, 8, 8, 8, - 8, 8, 0, 8, 8, 0, 0, 0, 0, 0, - 0, 14, 8, 14, 8, 14, 14, 14, 14, 14, - 14, 0, 14, 14, 0, 0, 0, 0, 0, 0, - 15, 14, 15, 14, 15, 15, 15, 15, 15, 
15, - 0, 15, 15, 0, 0, 0, 0, 0, 0, 16, - 15, 16, 15, 16, 16, 16, 16, 16, 16, 9, - 0, 9, 0, 9, 9, 9, 9, 0, 18, 16, - 18, 16, 18, 18, 18, 18, 0, 0, 17, 9, - 17, 9, 17, 17, 17, 17, 0, 19, 18, 19, - 18, 19, 0, 19, 19, 20, 0, 20, 17, 0, - 17, 20, 20, 0, 21, 0, 21, 19, 0, 19, - 21, 21, 0, 0, 0, 20, 0, 20, 0, 0, - 0, 0, 0, 0, 21, 0, 21, 23, 24, 0, - 0, 27, 28, 0, 29, 30, 0, 0, 31, 32, - 33, 28, 0, 29, 30, 0, 0, 31, 32, 33, -); -@yycheck = ( 257, - 0, 0, 1, 2, 3, 4, 262, 275, 276, 265, - 266, 258, 268, 269, 257, 257, 272, 273, 274, 0, - 267, 279, 257, 259, 23, 24, 25, 26, 27, 28, - 29, 30, 31, 32, 33, 34, 279, 257, 0, 259, - 274, 258, 257, 260, 259, 258, 276, 260, 272, 273, - 274, 268, 269, 276, 53, 268, 269, 0, 275, 276, - 277, 278, 275, 276, 257, 278, 259, 66, 257, 68, - 259, 70, 71, 257, 280, 259, 0, 76, 257, 78, - 259, 257, 258, 259, 259, 261, 262, 263, 264, 265, - 266, 267, 268, 269, 257, 0, 272, 273, 274, 280, - 258, 277, 260, 268, 269, 0, 259, 272, 273, 274, - 268, 269, 259, 259, 0, 259, 1, 275, 276, 257, - 278, 37, 39, 261, 262, 263, 264, 265, 266, 70, - 268, 269, -1, 0, 272, 273, 274, -1, 257, -1, - -1, -1, 261, 262, 263, 264, 265, 266, -1, 268, - 269, -1, 0, 272, 273, 274, -1, -1, -1, -1, - -1, 257, 0, -1, -1, 261, 262, 263, 264, 265, - 266, 0, 268, 269, -1, -1, 272, 273, 274, -1, - -1, 0, -1, -1, 261, 262, 263, 264, 265, 266, - 0, 268, 269, -1, -1, 272, 273, 274, 0, 259, - 277, 261, 262, 263, 264, 265, 266, 0, 268, 269, - -1, -1, 272, 273, 274, 259, -1, 261, 262, 263, - 264, 265, 266, -1, 268, 269, -1, -1, 272, 273, - 274, 261, 262, 263, 264, 265, 266, -1, 268, 269, - -1, -1, 272, 273, 274, 265, 266, -1, 268, 269, - -1, -1, 272, 273, 274, -1, -1, 257, -1, 259, - -1, 261, 262, 263, 264, 265, 266, -1, 268, 269, - -1, -1, 272, 273, 274, -1, 257, 277, 259, 279, - 261, 262, 263, 264, 265, 266, -1, 268, 269, -1, - -1, 272, 273, -1, -1, 257, 277, 259, 279, 261, - 262, 263, 264, 265, 266, -1, 268, 269, -1, -1, - 272, 273, -1, -1, 257, 277, 259, 279, 261, 262, - 263, 264, 265, 266, -1, 268, 269, -1, -1, 272, - 273, -1, -1, 257, 277, 259, 279, 261, 262, 263, - 264, 265, 266, -1, 268, 269, -1, -1, -1, -1, - -1, -1, 257, 277, 259, 279, 261, 262, 263, 264, - 265, 266, -1, 268, 269, -1, -1, -1, -1, -1, - -1, 257, 277, 259, 279, 261, 262, 263, 264, 265, - 266, -1, 268, 269, -1, -1, -1, -1, -1, -1, - 257, 277, 259, 279, 261, 262, 263, 264, 265, 266, - -1, 268, 269, -1, -1, -1, -1, -1, -1, 257, - 277, 259, 279, 261, 262, 263, 264, 265, 266, 257, - -1, 259, -1, 261, 262, 263, 264, -1, 257, 277, - 259, 279, 261, 262, 263, 264, -1, -1, 257, 277, - 259, 279, 261, 262, 263, 264, -1, 257, 277, 259, - 279, 261, -1, 263, 264, 257, -1, 259, 277, -1, - 279, 263, 264, -1, 257, -1, 259, 277, -1, 279, - 263, 264, -1, -1, -1, 277, -1, 279, -1, -1, - -1, -1, -1, -1, 277, -1, 279, 261, 262, -1, - -1, 265, 266, -1, 268, 269, -1, -1, 272, 273, - 274, 266, -1, 268, 269, -1, -1, 272, 273, 274, -); -$YYFINAL=8; -#ifndef YYDEBUG -#define YYDEBUG 0 -#endif -$YYMAXTOKEN=280; -#if YYDEBUG -@yyname = ( -"end-of-file",'','','','','','','','','','','','','','','','','','','','','','','','','','','','','','','','','', -'','','','','','','','','','','','','','','','','','','','','','','','','','','','','','','','','','','','','','','','', -'','','','','','','','','','','','','','','','','','','','','','','','','','','','','','','','','','','','','','','','', -'','','','','','','','','','','','','','','','','','','','','','','','','','','','','','','','','','','','','','','','', 
-'','','','','','','','','','','','','','','','','','','','','','','','','','','','','','','','','','','','','','','','', -'','','','','','','','','','','','','','','','','','','','','','','','','','','','','','','','','','','','','','','','', -'','','','','','','','','','','','','','','','','','','','','','','',"COMMA","LPAREN","RPAREN","NOT", -"OR","AND","EQV","NEQV","COMPARISON","DBLSLASH","PERCENT","PLUS","MINUS", -"UPLUS","UMINUS","ASTERIK","SLASH","DBLASTERIK","CONST","NAME","COLON","LARRAY", -"RARRAY","EQUALS", -); -@yyrule = ( -"\$accept : expr_with_abort", -"expr_with_abort : expr", -"expr_with_abort : expr COMMA", -"expr : CONST", -"expr : expr_without_const", -"expr_without_const : chain", -"expr_without_const : LARRAY array RARRAY", -"expr_without_const : PLUS expr", -"expr_without_const : MINUS expr", -"expr_without_const : NOT expr", -"expr_without_const : LPAREN potential_complex_or_implied_do RPAREN", -"expr_without_const : expr DBLASTERIK expr", -"expr_without_const : expr ASTERIK expr", -"expr_without_const : expr SLASH expr", -"expr_without_const : expr PLUS expr", -"expr_without_const : expr MINUS expr", -"expr_without_const : expr DBLSLASH expr", -"expr_without_const : expr COMPARISON expr", -"expr_without_const : expr AND expr", -"expr_without_const : expr OR expr", -"expr_without_const : expr EQV expr", -"expr_without_const : expr NEQV expr", -"potential_complex_or_implied_do : CONST", -"potential_complex_or_implied_do : CONST COMMA CONST", -"potential_complex_or_implied_do : expr_without_const", -"potential_complex_or_implied_do : expr_without_const COMMA do_args", -"potential_complex_or_implied_do : CONST COMMA do_args", -"array : array COMMA array_piece", -"array : array_piece", -"array_piece : expr", -"do_args : NAME EQUALS expr COMMA expr", -"do_args : NAME EQUALS expr COMMA expr COMMA expr", -"chain : NAME", -"chain : chain PERCENT NAME", -"chain : chain LPAREN exprlist RPAREN", -"exprlist :", -"exprlist : exprlist_ne", -"exprlist_ne : exprlist_ne COMMA argument", -"exprlist_ne : argument", -"argument : expr", -"argument : colonexpr", -"argument : namedargument", -"namedargument : NAME EQUALS expr", -"colonexpr : COLON", -"colonexpr : expr COLON", -"colonexpr : COLON expr", -"colonexpr : expr COLON expr", -); -#endif -sub yyclearin { $yychar = -1; } -sub yyerrok { $yyerrflag = 0; } -$YYSTACKSIZE = $YYSTACKSIZE || $YYMAXDEPTH || 500; -$YYMAXDEPTH = $YYMAXDEPTH || $YYSTACKSIZE || 500; -$yyss[$YYSTACKSIZE] = 0; -$yyvs[$YYSTACKSIZE] = 0; -sub YYERROR { ++$yynerrs; &yy_err_recover; } -sub yy_err_recover -{ - if ($yyerrflag < 3) - { - $yyerrflag = 3; - while (1) - { - if (($yyn = $yysindex[$yyss[$yyssp]]) && - ($yyn += $YYERRCODE) >= 0 && - $yycheck[$yyn] == $YYERRCODE) - { -#if YYDEBUG - print "yydebug: state $yyss[$yyssp], error recovery shifting", - " to state $yytable[$yyn]\n" if $yydebug; -#endif - $yyss[++$yyssp] = $yystate = $yytable[$yyn]; - $yyvs[++$yyvsp] = $yylval; - next yyloop; - } - else - { -#if YYDEBUG - print "yydebug: error recovery discarding state ", - $yyss[$yyssp], "\n" if $yydebug; -#endif - return(1) if $yyssp <= 0; - --$yyssp; - --$yyvsp; - } - } - } - else - { - return (1) if $yychar == 0; -#if YYDEBUG - if ($yydebug) - { - $yys = ''; - if ($yychar <= $YYMAXTOKEN) { $yys = $yyname[$yychar]; } - if (!$yys) { $yys = 'illegal-symbol'; } - print "yydebug: state $yystate, error recovery discards ", - "token $yychar ($yys)\n"; - } -#endif - $yychar = -1; - next yyloop; - } -0; -} # yy_err_recover - -sub yyparse -{ -#ifdef YYDEBUG - if ($yys = 
$ENV{'YYDEBUG'}) - { - $yydebug = int($1) if $yys =~ /^(\d)/; - } -#endif - - $yynerrs = 0; - $yyerrflag = 0; - $yychar = (-1); - - $yyssp = 0; - $yyvsp = 0; - $yyss[$yyssp] = $yystate = 0; - -yyloop: while(1) - { - yyreduce: { - last yyreduce if ($yyn = $yydefred[$yystate]); - if ($yychar < 0) - { - if (($yychar = &yylex) < 0) { $yychar = 0; } -#if YYDEBUG - if ($yydebug) - { - $yys = ''; - if ($yychar <= $#yyname) { $yys = $yyname[$yychar]; } - if (!$yys) { $yys = 'illegal-symbol'; }; - print "yydebug: state $yystate, reading $yychar ($yys)\n"; - } -#endif - } - if (($yyn = $yysindex[$yystate]) && ($yyn += $yychar) >= 0 && - $yycheck[$yyn] == $yychar) - { -#if YYDEBUG - print "yydebug: state $yystate, shifting to state ", - $yytable[$yyn], "\n" if $yydebug; -#endif - $yyss[++$yyssp] = $yystate = $yytable[$yyn]; - $yyvs[++$yyvsp] = $yylval; - $yychar = (-1); - --$yyerrflag if $yyerrflag > 0; - next yyloop; - } - if (($yyn = $yyrindex[$yystate]) && ($yyn += $yychar) >= 0 && - $yycheck[$yyn] == $yychar) - { - $yyn = $yytable[$yyn]; - last yyreduce; - } - if (! $yyerrflag) { - &yyerror('syntax error'); - ++$yynerrs; - } - return(1) if &yy_err_recover; - } # yyreduce -#if YYDEBUG - print "yydebug: state $yystate, reducing by rule ", - "$yyn ($yyrule[$yyn])\n" if $yydebug; -#endif - $yym = $yylen[$yyn]; - $yyval = $yyvs[$yyvsp+1-$yym]; - switch: - { -if ($yyn == 1) { -#line 29 "expr_parse.y" -{ $yyval = $yyvs[$yyvsp-0]; return 1; -last switch; -} } -if ($yyn == 2) { -#line 30 "expr_parse.y" -{ $yyval = $yyvs[$yyvsp-1]; return "s,"; -last switch; -} } -if ($yyn == 3) { -#line 33 "expr_parse.y" -{ $yyval = [ "%const", @{$yyvs[$yyvsp-0]} ]; -last switch; -} } -if ($yyn == 4) { -#line 34 "expr_parse.y" -{ $yyval = $yyvs[$yyvsp-0]; -last switch; -} } -if ($yyn == 5) { -#line 37 "expr_parse.y" -{ $yyval = $yyvs[$yyvsp-0]; -last switch; -} } -if ($yyn == 6) { -#line 38 "expr_parse.y" -{ $yyval = [ "%array", @{$yyvs[$yyvsp-1]} ]; -last switch; -} } -if ($yyn == 7) { -#line 39 "expr_parse.y" -{ $yyval = [ "u+", $yyvs[$yyvsp-0] ]; -last switch; -} } -if ($yyn == 8) { -#line 40 "expr_parse.y" -{ $yyval = [ "u-", $yyvs[$yyvsp-0] ]; -last switch; -} } -if ($yyn == 9) { -#line 41 "expr_parse.y" -{ $yyval = [ $yyvs[$yyvsp-1], $yyvs[$yyvsp-0] ]; -last switch; -} } -if ($yyn == 10) { -#line 43 "expr_parse.y" -{ $yyval = $yyvs[$yyvsp-1]; -last switch; -} } -if ($yyn == 11) { -#line 44 "expr_parse.y" -{ $yyval = [ $yyvs[$yyvsp-1], $yyvs[$yyvsp-2], $yyvs[$yyvsp-0] ]; -last switch; -} } -if ($yyn == 12) { -#line 45 "expr_parse.y" -{ $yyval = [ $yyvs[$yyvsp-1], $yyvs[$yyvsp-2], $yyvs[$yyvsp-0] ]; -last switch; -} } -if ($yyn == 13) { -#line 46 "expr_parse.y" -{ $yyval = [ $yyvs[$yyvsp-1], $yyvs[$yyvsp-2], $yyvs[$yyvsp-0] ]; -last switch; -} } -if ($yyn == 14) { -#line 47 "expr_parse.y" -{ $yyval = [ $yyvs[$yyvsp-1], $yyvs[$yyvsp-2], $yyvs[$yyvsp-0] ]; -last switch; -} } -if ($yyn == 15) { -#line 48 "expr_parse.y" -{ $yyval = [ $yyvs[$yyvsp-1], $yyvs[$yyvsp-2], $yyvs[$yyvsp-0] ]; -last switch; -} } -if ($yyn == 16) { -#line 49 "expr_parse.y" -{ $yyval = [ $yyvs[$yyvsp-1], $yyvs[$yyvsp-2], $yyvs[$yyvsp-0] ]; -last switch; -} } -if ($yyn == 17) { -#line 50 "expr_parse.y" -{ $yyval = [ $yyvs[$yyvsp-1], $yyvs[$yyvsp-2], $yyvs[$yyvsp-0] ]; -last switch; -} } -if ($yyn == 18) { -#line 51 "expr_parse.y" -{ $yyval = [ $yyvs[$yyvsp-1], $yyvs[$yyvsp-2], $yyvs[$yyvsp-0] ]; -last switch; -} } -if ($yyn == 19) { -#line 52 "expr_parse.y" -{ $yyval = [ $yyvs[$yyvsp-1], $yyvs[$yyvsp-2], $yyvs[$yyvsp-0] ]; -last switch; -} } -if 
($yyn == 20) { -#line 53 "expr_parse.y" -{ $yyval = [ $yyvs[$yyvsp-1], $yyvs[$yyvsp-2], $yyvs[$yyvsp-0] ]; -last switch; -} } -if ($yyn == 21) { -#line 54 "expr_parse.y" -{ $yyval = [ $yyvs[$yyvsp-1], $yyvs[$yyvsp-2], $yyvs[$yyvsp-0] ]; -last switch; -} } -if ($yyn == 22) { -#line 57 "expr_parse.y" -{ $yyval = [ "%const", @{$yyvs[$yyvsp-0]} ]; -last switch; -} } -if ($yyn == 23) { -#line 59 "expr_parse.y" -{ my ($type1, $val1) = @{$yyvs[$yyvsp-2]}; - my ($type2, $val2) = @{$yyvs[$yyvsp-0]}; - $yyval = ["%const", typing::make_complex_type ($type1, $type2), - [$val1, $val2]]; - -last switch; -} } -if ($yyn == 24) { -#line 64 "expr_parse.y" -{ $yyval = $yyvs[$yyvsp-0]; -last switch; -} } -if ($yyn == 25) { -#line 66 "expr_parse.y" -{ $yyval = [ "%do", $yyvs[$yyvsp-2], @{$yyvs[$yyvsp-0]} ]; -last switch; -} } -if ($yyn == 26) { -#line 68 "expr_parse.y" -{ $yyval = [ "%do", [ "%const", @{$yyvs[$yyvsp-2]} ], @{$yyvs[$yyvsp-0]} ]; - -last switch; -} } -if ($yyn == 27) { -#line 72 "expr_parse.y" -{ $yyval = [ @{$yyvs[$yyvsp-2]}, $yyvs[$yyvsp-0] ]; -last switch; -} } -if ($yyn == 28) { -#line 73 "expr_parse.y" -{ $yyval = [ $yyvs[$yyvsp-0] ]; -last switch; -} } -if ($yyn == 29) { -#line 76 "expr_parse.y" -{ $yyval = $yyvs[$yyvsp-0]; -last switch; -} } -if ($yyn == 30) { -#line 80 "expr_parse.y" -{ $yyval = [ $yyvs[$yyvsp-4], $yyvs[$yyvsp-2], $yyvs[$yyvsp-0] ]; -last switch; -} } -if ($yyn == 31) { -#line 82 "expr_parse.y" -{ $yyval = [ $yyvs[$yyvsp-6], $yyvs[$yyvsp-4], $yyvs[$yyvsp-2], $yyvs[$yyvsp-0] ]; -last switch; -} } -if ($yyn == 32) { -#line 85 "expr_parse.y" -{ $yyval = [ "%var", $yyvs[$yyvsp-0] ]; -last switch; -} } -if ($yyn == 33) { -#line 86 "expr_parse.y" -{ $yyval = [ $yyvs[$yyvsp-1], $yyvs[$yyvsp-2], $yyvs[$yyvsp-0] ]; -last switch; -} } -if ($yyn == 34) { -#line 87 "expr_parse.y" -{ $yyval = [ "%call", $yyvs[$yyvsp-3], @{$yyvs[$yyvsp-1]} ]; -last switch; -} } -if ($yyn == 35) { -#line 90 "expr_parse.y" -{ $yyval = []; -last switch; -} } -if ($yyn == 36) { -#line 91 "expr_parse.y" -{ $yyval = $yyvs[$yyvsp-0]; -last switch; -} } -if ($yyn == 37) { -#line 94 "expr_parse.y" -{ $yyval = [ @{$yyvs[$yyvsp-2]}, $yyvs[$yyvsp-0] ]; -last switch; -} } -if ($yyn == 38) { -#line 95 "expr_parse.y" -{ $yyval = [ $yyvs[$yyvsp-0] ]; -last switch; -} } -if ($yyn == 39) { -#line 98 "expr_parse.y" -{ $yyval = $yyvs[$yyvsp-0]; -last switch; -} } -if ($yyn == 40) { -#line 99 "expr_parse.y" -{ $yyval = $yyvs[$yyvsp-0]; -last switch; -} } -if ($yyn == 41) { -#line 100 "expr_parse.y" -{ $yyval = $yyvs[$yyvsp-0]; -last switch; -} } -if ($yyn == 42) { -#line 103 "expr_parse.y" -{ $yyval = [ "%namedarg", $yyvs[$yyvsp-2], $yyvs[$yyvsp-0] ]; -last switch; -} } -if ($yyn == 43) { -#line 106 "expr_parse.y" -{ $yyval = [ "%colon", "", "" ]; -last switch; -} } -if ($yyn == 44) { -#line 107 "expr_parse.y" -{ $yyval = [ "%colon", $yyvs[$yyvsp-1], "" ]; -last switch; -} } -if ($yyn == 45) { -#line 108 "expr_parse.y" -{ $yyval = [ "%colon", "", $yyvs[$yyvsp-0] ]; -last switch; -} } -if ($yyn == 46) { -#line 109 "expr_parse.y" -{ $yyval = [ "%colon", $yyvs[$yyvsp-2], $yyvs[$yyvsp-1] ]; -last switch; -} } -#line 624 "y.tab.pl" - } # switch - $yyssp -= $yym; - $yystate = $yyss[$yyssp]; - $yyvsp -= $yym; - $yym = $yylhs[$yyn]; - if ($yystate == 0 && $yym == 0) - { -#if YYDEBUG - print "yydebug: after reduction, shifting from state 0 ", - "to state $YYFINAL\n" if $yydebug; -#endif - $yystate = $YYFINAL; - $yyss[++$yyssp] = $YYFINAL; - $yyvs[++$yyvsp] = $yyval; - if ($yychar < 0) - { - if (($yychar = &yylex) < 0) { $yychar = 
0; } -#if YYDEBUG - if ($yydebug) - { - $yys = ''; - if ($yychar <= $#yyname) { $yys = $yyname[$yychar]; } - if (!$yys) { $yys = 'illegal-symbol'; } - print "yydebug: state $YYFINAL, reading $yychar ($yys)\n"; - } -#endif - } - return(0) if $yychar == 0; - next yyloop; - } - if (($yyn = $yygindex[$yym]) && ($yyn += $yystate) >= 0 && - $yyn <= $#yycheck && $yycheck[$yyn] == $yystate) - { - $yystate = $yytable[$yyn]; - } else { - $yystate = $yydgoto[$yym]; - } -#if YYDEBUG - print "yydebug: after reduction, shifting from state ", - "$yyss[$yyssp] to state $yystate\n" if $yydebug; -#endif - $yyss[++$yyssp] = $yystate; - $yyvs[++$yyvsp] = $yyval; - } # yyloop -} # yyparse -#line 112 "expr_parse.y" - -sub yylex { - $expr_parse::left =~ s/^\s*//; - return 0 if $expr_parse::left eq ""; - my ($ncharsread, $token, $value) = expr_parse::good_yylex ($expr_parse::left); - # print "yylex: token eof\n" unless $ncharsread; - return 0 unless $ncharsread; - # print "yylex: token $token (" . substr ($expr_parse::left, 0, $ncharsread) . ") with value $value\n"; - # print join (";", @$value) . "\n"; - $expr_parse::left = substr ($expr_parse::left, $ncharsread); - $yylval = $value; - return $token; -} - -# returns (ncharsread, token, value) -sub good_yylex { - my ($s) = @_; - my ($c) = substr ($s, 0, 1); - - if ($c eq "") { - return 0; - } elsif ($s =~ /^(\d+(?:\.\d*)?|\.\d+)D[+-]?\d+/i) { - return (length ($&), $CONST, [$typing::double_precision, $&]); - } elsif ($s =~ /^(\d+E[+-]?\d+|(?:\d+\.\d*|\.\d+)(?:E[+-]?\d+)?)(_\w+)?/i) { - if (defined $2) { - return (length ($&), $CONST, [typing::make_type ('real', substr ($2, 1)), $1]); - } else { - return (length ($&), $CONST, [$typing::default_type{'real'}, $1]); - } - } elsif ($s =~ /^(\d+)(_\w+)?/) { - if ($2) { - return (length ($&), $CONST, [typing::make_type ('integer', substr ($2, 1)), $1]); - } else { - return (length ($&), $CONST, [$typing::default_type{'integer'}, $1]); - } - } elsif ($s =~ /^(\.true\.|\.false\.)(_\w+)?/i) { - if (defined $2) { - return (length ($&), $CONST, [typing::make_type ('logical', substr ($2, 1)), $1]); - } else { - return (length ($&), $CONST, [$typing::default_type{'logical'}, $1]); - } - } elsif ($s =~ /^'(\d+)'(_\w+)?/) { - # Interior of string is digits because it has been grabbed already. 
-    my ($str) = stmts::get_string ($1);
-    if (defined $2) {
-      return (length ($&), $CONST, [typing::make_character_type (substr ($2, 1), length ($str)), $str]);
-    } else {
-      return (length ($&), $CONST, [typing::make_character_type ($typing::default_character_kind, length ($str)), $str]);
-    }
-  } elsif ($s =~ /^\w+/) {
-    return (length ($&), $NAME, $&);
-  } else {
-    switch: {
-      $s =~ /^==/ && return (2, $COMPARISON, "==");
-      $s =~ /^<=/ && return (2, $COMPARISON, "<=");
-      $s =~ /^>=/ && return (2, $COMPARISON, ">=");
-      $s =~ /^</ && return (1, $COMPARISON, "<");
-      $s =~ /^>/ && return (1, $COMPARISON, ">");
-      $s =~ /^\/=/ && return (2, $COMPARISON, "/=");
-      $s =~ /^=/ && return (1, $EQUALS, "=");
-      $s =~ /^\.eq\./i && return (4, $COMPARISON, "==");
-      $s =~ /^\.le\./i && return (4, $COMPARISON, "<=");
-      $s =~ /^\.ge\./i && return (4, $COMPARISON, ">=");
-      $s =~ /^\.lt\./i && return (4, $COMPARISON, "<");
-      $s =~ /^\.gt\./i && return (4, $COMPARISON, ">");
-      $s =~ /^\.ne\./i && return (4, $COMPARISON, "/=");
-      $s =~ /^\.neqv\./i && return (6, $NEQV, ".neqv.");
-      $s =~ /^\.eqv\./i && return (5, $EQV, ".eqv.");
-      $s =~ /^\.and\./i && return (5, $AND, ".and.");
-      $s =~ /^\.or\./i && return (4, $OR, ".or.");
-      $s =~ /^\.not\./i && return (5, $NOT, ".not.");
-      $s =~ /^\*\*/ && return (2, $DBLASTERIK, "**");
-      $s =~ /^\/\// && return (2, $DBLSLASH, "//");
-      $s =~ /^\(\// && return (2, $LARRAY, "(/");
-      $s =~ /^\/\)/ && return (2, $RARRAY, "/)");
-      $c eq "," && return (1, $COMMA, ",");
-      $c eq "+" && return (1, $PLUS, "+");
-      $c eq "-" && return (1, $MINUS, "-");
-      $c eq "*" && return (1, $ASTERIK, "*");
-      $c eq "/" && return (1, $SLASH, "/");
-      $c eq "(" && return (1, $LPAREN, "(");
-      $c eq ")" && return (1, $RPAREN, ")");
-      $c eq "%" && return (1, $PERCENT, "%");
-      $c eq ":" && return (1, $COLON, ":");
-    }
-    die "Lexer failed on `$s'";
-  }
-}
-
-#####
-# Takes a string that consists entirely of an expression, and returns a
-# reference to the parse tree it defines.
-#####
-sub parse_expr {
-  my ($s) = @_;
-  # print "parsing string: $s.\n";
-  $expr_parse::left = $expr_parse::line = $s;
-  die "Expression `$expr_parse::line' has trailing garbage `$1$expr_parse::left'"
-    if yyparse () =~ /^s(.*)$/;
-  return $yyval;
-}
-
-#####
-# Takes a string that consists partly of an expression.  (The first part
-# is an expression.)  Returns (parse tree ref, rest string, separator string).
-#####
-sub parse_part_as_expr {
-  my ($s) = @_;
-  # print "parsing part of string: $s.\n";
-  $expr_parse::left = $expr_parse::line = $s;
-  if (yyparse () =~ /^s(.*)$/) {
-    return ($yyval, $expr_parse::left, $1);
-  } else {
-    return ($yyval);
-  }
-}
-
-sub yyerror {
-  my ($s) = @_;
-  die "yyerror: $s during parsing of F90 code `$expr_parse::line'";
-}
-
-1;
-#line 794 "y.tab.pl"
diff --git a/Tools/F_scripts/f90doc/expr_parse.y b/Tools/F_scripts/f90doc/expr_parse.y
deleted file mode 100644
index 94070cfc768..00000000000
--- a/Tools/F_scripts/f90doc/expr_parse.y
+++ /dev/null
@@ -1,234 +0,0 @@
-%{
-package expr_parse;
-
-# On failure, print out this as the line we were working on.
-$expr_parse::line = ""; - -# Portion of line left to parse -$expr_parse::left = ""; -%} - -%token COMMA LPAREN RPAREN NOT OR AND EQV NEQV COMPARISON DBLSLASH PERCENT -%token PLUS MINUS UPLUS UMINUS ASTERIK SLASH DBLASTERIK CONST NAME COLON -%token LARRAY RARRAY EQUALS - -%left EQV NEQV -%left OR -%left AND -%nonassoc NOT -%nonassoc COMPARISON -%left DBLSLASH -%left PLUS MINUS -%nonassoc UPLUS UMINUS -%left ASTERIK SLASH -%right DBLASTERIK -%left PERCENT - -%% - -expr_with_abort: expr { $$ = $1; return 1; } - | expr COMMA { $$ = $1; return "s,"; } - -expr: - CONST { $$ = [ "%const", @{$1} ]; } - | expr_without_const { $$ = $1; } - -expr_without_const: - chain { $$ = $1; } - | LARRAY array RARRAY { $$ = [ "%array", @{$2} ]; } - | PLUS expr %prec UPLUS { $$ = [ "u+", $2 ]; } - | MINUS expr %prec UMINUS { $$ = [ "u-", $2 ]; } - | NOT expr { $$ = [ $1, $2 ]; } - | LPAREN potential_complex_or_implied_do RPAREN - { $$ = $2; } - | expr DBLASTERIK expr { $$ = [ $2, $1, $3 ]; } - | expr ASTERIK expr { $$ = [ $2, $1, $3 ]; } - | expr SLASH expr { $$ = [ $2, $1, $3 ]; } - | expr PLUS expr { $$ = [ $2, $1, $3 ]; } - | expr MINUS expr { $$ = [ $2, $1, $3 ]; } - | expr DBLSLASH expr { $$ = [ $2, $1, $3 ]; } - | expr COMPARISON expr { $$ = [ $2, $1, $3 ]; } - | expr AND expr { $$ = [ $2, $1, $3 ]; } - | expr OR expr { $$ = [ $2, $1, $3 ]; } - | expr EQV expr { $$ = [ $2, $1, $3 ]; } - | expr NEQV expr { $$ = [ $2, $1, $3 ]; } - -potential_complex_or_implied_do: - CONST { $$ = [ "%const", @{$1} ]; } - | CONST COMMA CONST - { my ($type1, $val1) = @{$1}; - my ($type2, $val2) = @{$3}; - $$ = ["%const", typing::make_complex_type ($type1, $type2), - [$val1, $val2]]; - } - | expr_without_const { $$ = $1; } - | expr_without_const COMMA do_args - { $$ = [ "%do", $1, @{$3} ]; } - | CONST COMMA do_args - { $$ = [ "%do", [ "%const", @{$1} ], @{$3} ]; - } - -array: - array COMMA array_piece { $$ = [ @{$1}, $3 ]; } - | array_piece { $$ = [ $1 ]; } - -array_piece: - expr { $$ = $1; } -# | implied_do is handled within expr - -do_args: - NAME EQUALS expr COMMA expr { $$ = [ $1, $3, $5 ]; } - | NAME EQUALS expr COMMA expr COMMA expr - { $$ = [ $1, $3, $5, $7 ]; } - -chain: - NAME { $$ = [ "%var", $1 ]; } - | chain PERCENT NAME { $$ = [ $2, $1, $3 ]; } - | chain LPAREN exprlist RPAREN { $$ = [ "%call", $1, @{$3} ]; } - -exprlist: - { $$ = []; } - | exprlist_ne { $$ = $1; } - -exprlist_ne: - exprlist_ne COMMA argument { $$ = [ @{$1}, $3 ]; } - | argument { $$ = [ $1 ]; } - -argument: - expr { $$ = $1; } - | colonexpr { $$ = $1; } - | namedargument { $$ = $1; } - -namedargument: - NAME EQUALS expr { $$ = [ "%namedarg", $1, $3 ]; } - -colonexpr: - COLON { $$ = [ "%colon", "", "" ]; } - | expr COLON { $$ = [ "%colon", $1, "" ]; } - | COLON expr { $$ = [ "%colon", "", $2 ]; } - | expr COLON expr { $$ = [ "%colon", $1, $2 ]; } - -%% - -sub yylex { - $expr_parse::left =~ s/^\s*//; - return 0 if $expr_parse::left eq ""; - my ($ncharsread, $token, $value) = expr_parse::good_yylex ($expr_parse::left); - # print "yylex: token eof\n" unless $ncharsread; - return 0 unless $ncharsread; - # print "yylex: token $token (" . substr ($expr_parse::left, 0, $ncharsread) . ") with value $value\n"; - # print join (";", @$value) . 
"\n"; - $expr_parse::left = substr ($expr_parse::left, $ncharsread); - $yylval = $value; - return $token; -} - -# returns (ncharsread, token, value) -sub good_yylex { - my ($s) = @_; - my ($c) = substr ($s, 0, 1); - - if ($c eq "") { - return 0; - } elsif ($s =~ /^(\d+(?:\.\d*)?|\.\d+)D[+-]?\d+/i) { - return (length ($&), $CONST, [$typing::double_precision, $&]); - } elsif ($s =~ /^(\d+E[+-]?\d+|(?:\d+\.\d*|\.\d+)(?:E[+-]?\d+)?)(_\w+)?/i) { - if (defined $2) { - return (length ($&), $CONST, [typing::make_type ('real', substr ($2, 1)), $1]); - } else { - return (length ($&), $CONST, [$typing::default_type{'real'}, $1]); - } - } elsif ($s =~ /^(\d+)(_\w+)?/) { - if ($2) { - return (length ($&), $CONST, [typing::make_type ('integer', substr ($2, 1)), $1]); - } else { - return (length ($&), $CONST, [$typing::default_type{'integer'}, $1]); - } - } elsif ($s =~ /^(\.true\.|\.false\.)(_\w+)?/i) { - if (defined $2) { - return (length ($&), $CONST, [typing::make_type ('logical', substr ($2, 1)), $1]); - } else { - return (length ($&), $CONST, [$typing::default_type{'logical'}, $1]); - } - } elsif ($s =~ /^'(\d+)'(_\w+)?/) { - # Interior of string is digits because it has been grabbed already. - my ($str) = stmts::get_string ($1); - if (defined $2) { - return (length ($&), $CONST, [typing::make_character_type (substr ($2, 1), length ($str)), $str]); - } else { - return (length ($&), $CONST, [typing::make_character_type ($typing::default_character_kind, length ($str)), $str]); - } - } elsif ($s =~ /^\w+/) { - return (length ($&), $NAME, $&); - } else { - switch: { - $s =~ /^==/ && return (2, $COMPARISON, "=="); - $s =~ /^<=/ && return (2, $COMPARISON, "<="); - $s =~ /^>=/ && return (2, $COMPARISON, ">="); - $s =~ /^/ && return (1, $COMPARISON, ">"); - $s =~ /^\/=/ && return (2, $COMPARISON, "/="); - $s =~ /^=/ && return (1, $EQUALS, "="); - $s =~ /^\.eq\./i && return (4, $COMPARISON, "=="); - $s =~ /^\.le\./i && return (4, $COMPARISON, "<="); - $s =~ /^\.ge\./i && return (4, $COMPARISON, ">="); - $s =~ /^\.lt\./i && return (4, $COMPARISON, "<"); - $s =~ /^\.gt\./i && return (4, $COMPARISON, ">"); - $s =~ /^\.ne\./i && return (4, $COMPARISON, "/="); - $s =~ /^\.neqv\./i && return (6, $NEQV, ".neqv."); - $s =~ /^\.eqv\./i && return (5, $EQV, ".eqv."); - $s =~ /^\.and\./i && return (5, $AND, ".and."); - $s =~ /^\.or\./i && return (4, $OR, ".or."); - $s =~ /^\.not\./i && return (5, $NOT, ".not."); - $s =~ /^\*\*/ && return (2, $DBLASTERIK, "**"); - $s =~ /^\/\// && return (2, $DBLSLASH, "//"); - $s =~ /^\(\// && return (2, $LARRAY, "(/"); - $s =~ /^\/\)/ && return (2, $RARRAY, "/)"); - $c eq "," && return (1, $COMMA, ","); - $c eq "+" && return (1, $PLUS, "+"); - $c eq "-" && return (1, $MINUS, "-"); - $c eq "*" && return (1, $ASTERIK, "*"); - $c eq "/" && return (1, $SLASH, "/"); - $c eq "(" && return (1, $LPAREN, "("); - $c eq ")" && return (1, $RPAREN, ")"); - $c eq "%" && return (1, $PERCENT, "%"); - $c eq ":" && return (1, $COLON, ":"); - } - die "Lexer failed on `$s'"; - } -} - -##### -# Takes a string that consists entirely of an expression, and returns a -# reference to the parse tree it defines. -##### -sub parse_expr { - my ($s) = @_; - # print "parsing string: $s.\n"; - $expr_parse::left = $expr_parse::line = $s; - die "Expression `$expr_parse::line' has trailing garbage `$1$expr_parse::left'" - if yyparse () =~ /^s(.*)$/; - return $yyval; -} - -##### -# Takes a string that consists partly of an expression. (The first part -# is an expression.) 
Returns (parse tree ref, rest string, separator string). -##### -sub parse_part_as_expr { - my ($s) = @_; - # print "parsing part of string: $s.\n"; - $expr_parse::left = $expr_parse::line = $s; - if (yyparse () =~ /^s(.*)$/) { - return ($yyval, $expr_parse::left, $1); - } else { - return ($yyval); - } -} - -sub yyerror { - my ($s) = @_; - die "yyerror: $s during parsing of F90 code `$expr_parse::line'"; -} - -1; diff --git a/Tools/F_scripts/f90doc/f90doc b/Tools/F_scripts/f90doc/f90doc deleted file mode 100755 index 0afe6dafe73..00000000000 --- a/Tools/F_scripts/f90doc/f90doc +++ /dev/null @@ -1,160 +0,0 @@ -#!/usr/bin/env perl -eval 'exec perl $0 ${1+"$@"}' - if 0; -warn ("Perl 5 not detected, likely a big problem") if $] < 5.0; -warn "Less than Perl 5.003. You may witness mysterious segmentation faults." - if $] < 5.003; - -use strict; - -BEGIN { - my $zero = $0; - while (-l $zero) { - my $nextzero = readlink $zero; - if (substr ($nextzero, 0, 1) eq "/") { - $zero = $nextzero; - } elsif ($zero =~ m#^(.*)/#) { - $zero = "$1/$nextzero"; - } else { - $zero = $nextzero; - } - } - if ($zero =~ m#(.*)/\w+#) { - push @INC, "$1/../common/", $1; - } else { - push @INC, "../common/", "."; - } -} - -require "htmling.pl"; -require "stmts.pl"; -require "utils.pl"; -#require "expr_parse.pl"; -#require "typing.pl"; - -#################### - -if (! @ARGV) { - print <$part in module $1"); - } else { - push (@::see_list, "module $1"); - } - } elsif ($macro =~ /^author\s+/i) { - push (@::authors, $'); - } elsif ($macro =~ /^version\s+/i) { - die "Two versions in a single !! block" if $::version_num; - $::version_num = $'; - } else { - die "Unrecognized macro $macro"; - } -} diff --git a/Tools/F_scripts/f90doc/htmling.pl b/Tools/F_scripts/f90doc/htmling.pl deleted file mode 100644 index 956513244d9..00000000000 --- a/Tools/F_scripts/f90doc/htmling.pl +++ /dev/null @@ -1,376 +0,0 @@ -package htmling; - -use strict; - -### CONSTANTS -$htmling::dblspace = " "; -$htmling::indentspace = $htmling::dblspace x 2; -$htmling::headerspace = $htmling::indentspace; -$htmling::comment_indent = $htmling::indentspace x 2; - -### PUBLIC GLOBALS -$htmling::comments_type = "smart"; -$htmling::suppress_calls = 0; -$htmling::calls_make_links = 0; -$htmling::html_filenames_original_case = 0; - -### GLOBALS -$htmling::htmlfile = ""; -$htmling::indent = 0; - -# Return the name of the HTML file for the specified PROGRAM or MODULE -sub html_filename { - my ($name) = @_; - $name = lc $name unless $htmling::html_filenames_original_case; - return $name . ".html"; -} - -# This is the main calling point from f90doc. -# Takes all top-level objects: programs, subroutines, functions, and modules. -# Warns if given something else. -sub do_toplevel { - my ($top, $outfile) = @_; - - my $type = $top->{'type'}; - unless ($type eq 'module' || $type eq 'subroutine' || $type eq 'function' || - $type eq 'program') { - warn "Warning: Unrecognized top-level object $type will not be documented.\n"; - return; - } - - # A positive-length name. Necessary because programs may not have names. - if (defined $outfile) { - $htmling::htmlfile = $outfile; - } else { - $htmling::htmlfile = html_filename ( - ($top->{'name'} eq '' ? $type : $top->{'name'})); - } - print "Generating $htmling::htmlfile...\n"; - open OUT, ">$htmling::htmlfile"; - - print OUT "\n"; - print OUT "\n"; - print OUT " $type $top->{'name'} (generated by f90doc) \n"; - print OUT "\n"; - print OUT "
<h2>", ucfirst ($type), " $top->{'name'}</h2>\n";
-  print OUT "<pre>$type $top->{'name'}\n";
-
-  list_uses (@{$top->{'uses'}});
-  list_calls (1, keys %{$top->{'calls'}}) if exists $top->{'calls'};
-  list_html ("Types", map (($_->{'type'} eq "type" ? ($_) : ()), @{$top->{'ocontains'}}));
-  list_html ("Variables", map (($_->{'type'} eq "var" ? ($_) : ()), @{$top->{'ocontains'}}));
-  list_html ("Interfaces", map (($_->{'type'} eq "interface" ? ($_) : ()), @{$top->{'ocontains'}}));
-  list_html ("Subroutines and functions", map (($_->{'type'} eq "subroutine" || $_->{'type'} eq "function" ? ($_) : ()), @{$top->{'ocontains'}}));
-
-  print OUT "\nend $type $top->{'name'}\n";
-  do_comments ($top->{'comments'}, 1);
-
-  my @list;
-  @list = map (($_->{'type'} eq "type" ? ($_) : ()), @{$top->{'ocontains'}});
<hr><h2>Description of Types</h2>\n" if @list;
-  do_html (@list);
-  @list = map (($_->{'type'} eq "var" ? ($_) : ()), @{$top->{'ocontains'}});
-  print OUT "\n<hr><h2>Description of Variables</h2>\n" if @list;
-  do_html (@list);
-  @list = map (($_->{'type'} eq "interface" ? ($_) : ()), @{$top->{'ocontains'}});
-  print OUT "\n<hr><h2>Description of Interfaces</h2>\n" if @list;
-  do_html (@list);
-  @list = map (($_->{'type'} eq "subroutine" || $_->{'type'} eq "function" ? ($_) : ()), @{$top->{'ocontains'}});
-  print OUT "\n<hr><h2>Description of Subroutines and Functions</h2>\n" if @list;
-  do_html (@list);
-
-  print OUT "\n";
-  close OUT;
-}
-
-sub list_uses {
-   if (@_) {
-      print OUT "\n${htmling::indentspace}${htmling::headerspace}! Uses\n";
-      my ($use);
-      foreach $use (@_) {
-         my ($module, $extra) = @$use;
-         $extra = defined $extra ? ", $extra" : "";
-         print OUT "${htmling::indentspace}",
-            "use $module$extra\n";
-      }
-   }
-}
-
-sub list_calls {
-   return if $htmling::suppress_calls;
-   my ($big, @calls) = (@_);
-   if (@calls) {
-      @calls = sort @calls;
-      @calls = map { "<a href=\"" . html_filename ($_) . "\">$_</a>" } @calls
-         if $htmling::calls_make_links;
-      if ($big) {
-         print OUT join ("\n",
-            "\n${htmling::indentspace}${htmling::headerspace}! Calls",
-            (map { "${htmling::indentspace}call $_" } @calls), "");
-      } else {
-         print OUT "${htmling::indentspace}! Calls: ", join (", ", @calls), "\n";
-      }
-   }
-}
-
-sub list_html {
-   my ($title) = shift;
-
-   if (@_) {
-      print OUT "\n${htmling::indentspace}${htmling::headerspace}! $title\n";
-      my ($struct);
-      foreach $struct (@_) {
-         my ($name, $type) = (txt2html ($struct->{'name'}), $struct->{'type'});
-         my ($href) = "<a href=\"#${type}_" . lc ($name) . "\">$name</a>";
-         print OUT $htmling::indentspace;
-         if ($type eq "var") {
-            print OUT var2str ($struct, $href) . "\n";
-         } elsif ($type eq "subroutine" ||
-                  $type eq "function") {
-            print OUT join (" ", attriblist ($struct), "");
-            print OUT typing::type_to_f90 ($struct->{'rtype'}) . " "
-               if exists $struct->{'rtype'};
-            my $flag;
-            for $flag ('recursive', 'elemental', 'pure') {
-               print OUT "$flag " if $struct->{$flag};
-            }
-            print OUT "$type $href";
-            print OUT " (" . join (", ", @{$struct->{'parms'}}) . ")";
-            print OUT " result ($struct->{'result'})"
-               if exists $struct->{'result'} && !exists $struct->{'rtype'};
-            print OUT "\n";
-         } else {
-            print OUT join (" ", attriblist ($struct), "");
-            print OUT "$type $href\n";
-         }
-      }
-   }
-}
-
-sub do_html {
-   if (@_) {
-      my ($struct);
-
-      foreach $struct (@_) {
-         my ($name, $type) = (txt2html ($struct->{'name'}), $struct->{'type'});
-         if (! $htmling::indent) {
-            print OUT "<hr><h2><a name=\"${type}_" . lc ($name) . "\">$name</a></h2>\n";
-            print OUT "<pre>";
-         }
-
-         print OUT $htmling::indentspace x $htmling::indent;
-         if ($type eq "var") {
-             print OUT var2str ($struct) . "\n";
-         } elsif ($type eq "mprocedure") {
-             die "do_html: bare module procedure $struct->{'name'} (no enclosing module)"
-                 unless exists $struct->{'bind'};
-             print OUT
-                 "module procedure <a href=\"#$struct->{'bind'}->{'type'}_" .
-                 lc ($struct->{'name'}) . "\">$name</a>\n";
-         } elsif ($type eq "subroutine" || $type eq "function") {
-             print OUT join (" ", attriblist ($struct), "");
-             print OUT typing::type_to_f90 ($struct->{'rtype'}) . " "
-                 if exists $struct->{'rtype'} && !exists $struct->{'result'};
-             my $flag;
-             for $flag ('recursive', 'elemental', 'pure') {
-               print OUT "$flag " if $struct->{$flag};
-             }
-             print OUT "$type $name";
-             print OUT " (" . join (", ", @{$struct->{'parms'}}) . ")";
-             print OUT " result ($struct->{'result'})"
-               if exists $struct->{'result'};
-             print OUT "\n";
-         } else {
-             print OUT join (" ", attriblist ($struct), "");
-             print OUT "$type $name\n";
-         }
-
-         $htmling::indent++;
-
-         if ($type eq "var" || $type eq "mprocedure") {
-         } elsif ($type eq "type") {
-           print OUT $htmling::indentspace x $htmling::indent, "private\n"
-             if exists $struct->{'privatetype'};
-           print OUT $htmling::indentspace x $htmling::indent, "sequence\n"
-             if exists $struct->{'sequencetype'};
-           do_html (@{$struct->{'ocontains'}});
-         } elsif ($type eq "interface") {
-           do_html (@{$struct->{'ocontains'}});
-         } elsif ($type eq "subroutine" || $type eq "function") {
-           my @interest = @{$struct->{'parms'}};
-           push @interest, $struct->{'result'} if exists $struct->{'result'};
-           push @interest, $name
-             if $type eq "function" && !exists $struct->{'result'} &&
-               !exists $struct->{'rtype'};
-           my $arg;
-           foreach $arg (@interest) {
-             my (@things) = values %{$struct->{'contains'}->{lc $arg}};
-             die "Confused by/no declaration for parameter $arg of $type $name"
-               if scalar @things != 1;
-             do_html ($things[0]);
-           }
-         } else {
-           die "do: I don't know what a $type is";
-         }
-
-         list_calls (0, keys %{$struct->{'calls'}}) if exists $struct->{'calls'};
-
-         $htmling::indent--;
-
-         if ($type ne "var" && $type ne "mprocedure") {
-            print OUT $htmling::indentspace x $htmling::indent . "end $type $name\n";
-         }
-
-         do_comments ($struct->{'comments'}, ! $htmling::indent);
-      }
-   }
-}
-
-# Pass comments and a flag saying if you want to end the current <pre> block.
-sub do_comments {
-   my ($comments, $endpre) = @_;
-   if ($comments eq "") {
-      print OUT "</pre>\n" if $endpre;
-      return;
-   }
-
-   #print OUT "\n" unless $htmling::indent;
-
-   if ($htmling::comments_type eq "preformatted") {
-      my ($s) = $htmling::indentspace x $htmling::indent . $htmling::comment_indent;
-      $comments =~ s/^/$s/m if $htmling::indent;
-      $comments =~ s/^\n*//s;
-      $comments =~ s/\n*$//s;
-      print OUT $comments, "\n";
-      print OUT "</pre>\n" if $endpre;
-   } else {
-      print OUT "</pre>\n";
-      print OUT "<ul>\n" if $htmling::indent;
-      if ($htmling::comments_type eq "html") {
-      } elsif ($htmling::comments_type eq "smart") {
-         my @newcomments = ();
-         my $verbmode = 0;
-         my @listmode = ();
-         my $line;
-         foreach $line (split ("\n", $comments)) {
-            if ($verbmode) {
-               if ($line =~ /^>/) {
-                  warn "`$line' found while already in verbatim mode";
-                  substr ($line, 0, 1) = " ";
-                  push @newcomments, $line;
-               } elsif ($line =~ /^</) {
-                  $verbmode = 0;
-                  push @newcomments, "</pre>";
-               } elsif ($line =~ /^v/) {
-                  warn "`$line' found while already in verbatim mode";
-                  substr ($line, 0, 1) = " ";
-                  push @newcomments, $line;
-               } else {
-                  push @newcomments, $line;
-               }
-               next;
-            }
-
-            # _italic_ and *bold*
-            while ($line =~ /(\A|\W)_(\w|\w.*?\w)_(\Z|\W)/) {
-               my ($left, $mid, $right) = ("$`$1<em>", $2, "</em>$3$'");
-               $mid =~ s/_/ /g;
-               $line = $left . $mid . $right;
-            }
-            while ($line =~ /(\A|\W)\*(\w|\w.*?\w)\*(\Z|\W)/) {
-               my ($left, $mid, $right) = ("$`$1<strong>", $2, "</strong>$3$'");
-               $mid =~ s/\*/ /g;
-               $line = $left . $mid . $right;
-            }
-
-            # Lists
-            if ($line =~ /^( *)-/) {
-               if (! @listmode || length ($1) > $listmode[$#listmode]) {
-                  push @listmode, length $1;
-                  push @newcomments, $1 . "<ul>";
-               } else {
-                  while ($listmode[$#listmode] != length ($1)) {
-                     push @newcomments, " " x $listmode[$#listmode] . "</ul>";
-                     pop @listmode;
-                     die "Unindented to invalid position in `$line'"
-                        unless @listmode;
-                  }
-               }
-               push @newcomments, $1 . "<li>" . substr ($line, length ($&));
-            } elsif ($line =~ /^>/) {
-               #warn "Verbatim mode started in list mode" if @listmode;
-               $verbmode = 1;
-               substr ($line, 0, 1) = " ";
-               push @newcomments, "<pre>" . $line;
-               # Ignore $line =~ /^$line
-            } elsif ($line =~ /^\s*$/) {
-               push @newcomments, "<p>";
-            } elsif (@listmode) {
-               $line =~ /^( *)(\t?)/;
-               warn "Tabs have strange effects on indentation detection"
-                  if length ($2) > 0;
-               while (@listmode && $listmode[$#listmode] > length ($1)) {
-                  push @newcomments, " " x $listmode[$#listmode] . "</ul>";
-                  pop @listmode;
-               }
-               push @newcomments, $line;
-            } else {
-               push @newcomments, $line;
-            }
-         }
-         my $list;
-         foreach $list (@listmode) {
-            push @newcomments, " " x $list . "</ul>";
-         }
-         $comments = join ("\n", @newcomments);
-      } else {
-         die "Unsupported comments type `$htmling::comments_type'";
-      }
-      $comments =~ s/<p>\n(<p>\n)+/<p>\n/g;
-      $comments =~ s/<p>\n$//;
-      $comments =~ s/^<p>\n//;
-      $comments =~ s/<p>/<br>/g if $htmling::indent;
-      print OUT $comments . "\n";
-      print OUT "</ul>\n" if $htmling::indent;
-      print OUT "<pre>" unless $endpre;
    -   }
    -}
    -
    -sub var2str {
    -    my ($var, $href) = @_;
    -
    -    my ($typestr) = typing::type_to_f90 ($var->{'vartype'});
    -    my ($initial) = (!exists $var->{'initial'} ? ""
    -          : " $var->{'initop'} " . typing::expr_to_f90 ($var->{'initial'}));
    -    $href = txt2html ($var->{'name'}) unless $href;
    -    return $typestr . join (", ", "", attriblist ($var)) . " :: $href$initial";
    -}
    -
    -sub txt2html {
    -    my ($txt) = @_;
-    $txt =~ s/</&lt;/g;
-    $txt =~ s/>/&gt;/g;
    -    return $txt;
    -}
    -
    -sub attriblist {
    -    my ($struct) = @_;
    -    my @attribs = ();
    -
    -    push @attribs, $struct->{'vis'} if exists $struct->{'vis'};
    -    push @attribs, "optional" if exists $struct->{'optional'};
    -    push @attribs, @{$struct->{'tempattribs'}}
    -        if exists $struct->{'tempattribs'};
    -
    -    return @attribs;
    -}
    -
    -1;
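The "smart" comment markup that htmling.pl's do_comments implements is easiest to see in isolation. The following sketch is new code, not part of the deleted file: it applies the same emphasis rewriting loop, where the <em>/<strong> output tags are assumptions reconstructed from the stripped markup above.

    #!/usr/bin/env perl
    # Hedged sketch: the _italic_ / *bold* rewriting from do_comments,
    # reduced to a single helper.  Tag names are assumed, not confirmed.
    use strict;
    use warnings;

    sub markup_line {
        my ($line) = @_;
        while ($line =~ /(\A|\W)_(\w|\w.*?\w)_(\Z|\W)/) {
            my ($left, $mid, $right) = ("$`$1<em>", $2, "</em>$3$'");
            $mid =~ s/_/ /g;               # interior underscores become spaces
            $line = $left . $mid . $right;
        }
        while ($line =~ /(\A|\W)\*(\w|\w.*?\w)\*(\Z|\W)/) {
            my ($left, $mid, $right) = ("$`$1<strong>", $2, "</strong>$3$'");
            $mid =~ s/\*/ /g;
            $line = $left . $mid . $right;
        }
        return $line;
    }

    print markup_line("this is _really_ a *bold* claim"), "\n";
    # prints: this is <em>really</em> a <strong>bold</strong> claim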
    diff --git a/Tools/F_scripts/f90doc/stmts.pl b/Tools/F_scripts/f90doc/stmts.pl
    deleted file mode 100644
    index 83d20a300af..00000000000
    --- a/Tools/F_scripts/f90doc/stmts.pl
    +++ /dev/null
    @@ -1,891 +0,0 @@
    -package stmts;
    -
    -use strict;
    -
    -require "expr_parse.pl";
    -require "typing.pl";
    -require "utils.pl";
    -
    -#########################################################################
    -# PUBLIC GLOBALS
    -
    -# Set to a reference to a routine to take !! comments if !! comments are
    -# to be caught.
    -$stmts::bangbang = "";
    -
    -# Set to a reference to a routine to return accumulated comments if !! comments
    -# are caught.  You should reset them after each time you call read_line or
    -# read_stmt.
    -$stmts::comments = "";
    -
    -# Set this to disable warnings.  Don't use this for a compiler!  Suitable for
    -# something like f90doc though.  This shouldn't be used once stmts supports
    -# all Fortran 90 statements and attributes; until then, it's pretty much
-# needed; afterwards, it should be removed.
-$stmts::disable_warns = 0;
    -
    -# Set this to use fixed-form Fortran, like good old Fortran 77.
    -$stmts::fixed_form = 0;
    -
    -#########################################################################
    -# PRIVATE GLOBALS
    -
    -# A "left-over" piece of a statement is stored here when semi-colons are
    -# encountered.
    -$stmts::leftover = "";
    -
    -# Number of opened files.
    -$stmts::nfile = 0;
    -
    -# List of string's values.
    -@stmts::strings = ();
    -
    -# List of structure pointers that we're currently nested in.
    -# topnest stores the top of the stack.
    -@stmts::nesting = ();
    -$stmts::topnest = undef;
    -
    -# List of structure pointers that we're currently nested in, but for a
    -# specified type.
    -%stmts::nesting_by = ();
    -
    -#########################################################################
    -# ROUTINES
    -
    -#####
    -# Reads an entire file, and returns all the top-level structures found.
    -# If specified, a given function will be called after every statement
    -# (usually this is for resetting !! comments and such).
    -#####
    -sub read_file {
    -  my ($filename, $every_stmt) = @_;
    -  stmts::open_file ($filename);
    -
    -  my ($stmt, $struct, @rval);
    -  my @toplevel = ();
    -  while ((@rval = stmts::read_stmt ()) [0]) {
    -    push @toplevel, $rval[1] if !defined $stmts::topnest && ref $rval[1];
    -    &$every_stmt () if defined $every_stmt;
    -  }
    -
    -  return @toplevel;
    -}
    -
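For orientation, here is a hedged usage sketch (mine, not from the deleted f90doc driver) of how a caller is expected to drive read_file. The file name and both handlers are hypothetical; the two package variables are the hooks documented at the top of this file.

    # Hedged sketch: driving stmts::read_file with the !!-comment hooks.
    require "stmts.pl";

    my @saved;                                           # accumulated !! text
    $stmts::bangbang = sub { push @saved, $_[0] };       # catch "!!" comments
    $stmts::comments = sub { join "\n", splice @saved }; # hand them back once

    my @toplevel = stmts::read_file ("example.f90",      # hypothetical input
                                     sub { });           # per-statement hook
    foreach my $unit (@toplevel) {
        print "$unit->{'type'} $unit->{'name'}\n";
    }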
    -#####
    -# Starts reading the specified filename.
    -#####
    -sub open_file {
    -   my ($filename) = @_;
    -   $stmts::FILE = "";
    -
    -   open IN, $filename
    -     or die "Couldn't open $filename";
    -   $stmts::{'FILE' . $stmts::nfile} = $stmts::{'IN'};
    -}
    -
    -#####
    -# Cleans up from reading the current file.
    -# This is automatically called by read_line, so most don't have to worry
    -# about it.
    -# Returns false if there are no files left.
    -#####
    -sub close_file {
    -   close IN;
    -   $stmts::nfile--;
    -   if ($stmts::nfile > 0) {
    -      # CHECK--does this still do the desired thing, in light of open_file?
    -      $stmts::{'IN'} = $stmts::{'FILE' . $stmts::nfile};
    -      return 1;
    -   } else {
    -      # Clean up strings.
    -      @stmts::strings = ();
    -      return 0;
    -   }
    -}
    -
    -#####
    -# Reads a line of Fortran 90 doing whatever it takes.  This may involve
    -# reading multiple lines from the current file, walking into files, etc.
    -# INCLUDE is parsed at this level.
    -# Note that the returned string may have various cases (lc isn't called).
    -#####
    -sub read_line {
    -
    -ALLOVERAGAIN:
    -  my $line;
    -  if ($stmts::leftover ne '') {
    -    $line = $stmts::leftover;
    -    $stmts::leftover = '';
    -  } else {
-    $line = <IN>;
    -    until (defined $line) {
    -      return "" unless close_file ();
-      $line = <IN>;
    -    }
    -    chomp $line;
    -
    -    substr ($line, 0, 1) = '!' if $stmts::fixed_form && $line =~ /^\S/;
    -  }
    -
    -  # This is used for fixed-form continuations.
    -  my $lastlen = length $line;
    -
    -  my $continue = 0;
    -
    -  while (1) {
    -    # Grab doubled comments (!!) if requested.
    -    if ($stmts::bangbang && $line =~ /^([^"'!]|('[^']*')|("[^"]*"))*(!!.*)$/) {
    -      $line = substr ($line, 0, length ($line) - length ($4));
    -      &$stmts::bangbang ($4);
    -    }
    -
    -    # Delete comments.
    -    elsif ($line =~ /^([^"'!]|(\'[^']*')|("[^"]*"))*(!.*)$/) {
    -      $line = substr ($line, 0, length ($line) - length ($4));
    -    }
    -
    -    # Fixed-form continuations.
    -    if ($stmts::fixed_form) {
    -
    -      # Check next line for continuation mark.
-      $stmts::leftover = <IN>;
    -      $stmts::leftover = '' unless defined $stmts::leftover;
    -      chomp $stmts::leftover;
    -      substr ($stmts::leftover, 0, 1) = '!' if $stmts::leftover =~ /^\S/;
    -      if ($stmts::leftover =~ /^\s....\S/) {
    -
    -        # Pad previous line with spaces if it had less than 72 characters.
    -        $line .= ' ' x (72-$lastlen) if $lastlen < 72;
    -
    -        # Add next (continuation) line to the line.
    -        $line .= substr ($stmts::leftover, 6);
    -        $lastlen = length $stmts::leftover;
    -        
    -        # Continue on to check the next line.
    -        $stmts::leftover = '';
    -        next;
    -      }
    -      
    -    # Free-form continuations.
    -    } elsif ($continue || $line =~ /&\s*$/) {
    -      $line = $` if $line =~ /&\s*$/;
-      my $rest = <IN>;
    -      chomp $rest;
    -      $rest = $' if $rest =~ /^\s*&/;
    -      $line = "$line$rest";
    -      # Blank lines don't stop the continuation.
    -      $continue = ($rest =~ /^\s*(?:!.*)?$/);
    -      next;
    -    }
    -
    -    last;
    -  }
    -
    -  # Semicolons.
    -  if ($line =~ /^([^;]*);(.*)$/) {
    -    $line = $1;
    -    if ($stmts::leftover eq '') {
    -      $stmts::leftover = $2;
    -    } else {
    -      $stmts::leftover .= ";$2";
    -    }
    -  }
    -
    -  # Replace strings to avoid confusion.
    -  my @quotes;
    -  while ($line =~ / " ([^"]|"")* " | ' ([^']|'')* ' /xg) {
    -    push @quotes, [length $`, length $&, $&];
    -  }
    -  for my $quote (reverse @quotes) {
    -    ## Process in reverse order so that $start is preserved despite replacement
    -    my ($start, $length, $string) = @$quote;
    -    push @stmts::strings, $string;
    -    substr ($line, $start, $length) = "\'" . $#stmts::strings . "\'";
    -  }
    -
    -  # Get rid of spaces on either end.
    -  $line = utils::trim ($line);
    -
    -  goto ALLOVERAGAIN if $line eq '';
    -
    -  #print "read line `$line'\n";
    -
    -  return $line;
    -}
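The free-form continuation rule above is the subtle part of read_line. This sketch is mine, not from the deleted sources: it replays the same ampersand joining over an in-memory array instead of a filehandle (the blank-line handling via $continue is omitted for brevity).

    # Hedged sketch: free-form '&' continuation joining from read_line.
    my @src  = ("x = 1 + &", "    & 2 + &", "    & 3");
    my $line = shift @src;
    while ($line =~ /&\s*$/) {
        $line = $`;                     # drop the trailing ampersand
        my $rest = shift @src;
        $rest = $' if $rest =~ /^\s*&/; # drop a leading ampersand, if any
        $line .= $rest;
    }
    print "$line\n";                    # x = 1 +  2 +  3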
    -
    -#####
    -# Returns the physical value for the given string number.
    -#####
    -sub get_string {
    -   my ($n) = @_;
    -   return $stmts::strings[$n];
    -}
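The string masking that read_line performs, and that get_string undoes, is worth a standalone run. This is a self-contained replay of the loop above (the sample Fortran line is mine); each literal is swapped for a quoted index so later regexes never trip over quote characters.

    # Hedged sketch: the string-masking scheme from read_line/get_string.
    my (@quotes, @strings);
    my $line = q{print *, 'don''t panic', "ok!"};
    while ($line =~ / " ([^"]|"")* " | ' ([^']|'')* ' /xg) {
        push @quotes, [length $`, length $&, $&];   # position, length, text
    }
    for my $quote (reverse @quotes) {   # reverse keeps earlier offsets valid
        my ($start, $length, $string) = @$quote;
        push @strings, $string;
        substr ($line, $start, $length) = "'" . $#strings . "'";
    }
    print "$line\n";    # print *, '1', '0'  -- placeholders, not literals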
    -
    -#####
    -# Reads a Fortran 90 statement from the current input.
    -# Checks for proper nesting, etc., and keeps tracks of what's in what.
    -# Possible results:
    -#    ('?', $the_line)
    -#    ('program', \%structure)
    -#    ('endprogram', \%structure)
    -#    ('module', \%structure)
    -#    ('endmodule', \%structure)
    -#    ('subroutine', \%structure)
    -#    ('endsubroutine', \%structure)
    -#    ('function', \%structure)
    -#    ('endfunction', \%structure)
    -#    ('type', \%structure)
    -#    ('endtype', \%structure)
    -#    ('interface', \%structure)
    -#    ('endinterface', \%structure)
    -#    ('var', \%struct1, \%struct2, ...)
    -#    ('contains', \%parent)
    -#    ('public', $name1, $name2, ...)          empty means global default
    -#    ('private', $name1, $name2, ...)         empty means global default
    -#    ('optional', $name1, $name2, ...)
    -#    ('call', $arg1, $arg2, ...)              currently args are unparsed
    -#####
    -sub read_stmt {
    -   my ($line) = read_line ();
    -   if (! $line) {
    -      die "File ended while still nested" if @stmts::nesting;
    -      return ("", "");
    -   }
    -
    -   # MODULE PROCEDURE (must be before module)
    -   if ($line =~ /^module\s+procedure\s+(\w.*)$/i) {
    -      die "module procedure outside of interface block" unless defined $stmts::topnest && $stmts::topnest->{'type'} eq "interface" && $stmts::topnest->{'name'} ne "";
    -      my (@list) = split (/\s*,\s*/, utils::trim ($1));
    -      my ($p);
    -      foreach $p (@list) {
    -         die "Invalid module procedure `$p'" unless $p =~ /^\w+$/;
    -         new_struct ({
    -            'type'   => "mprocedure",
    -            'name'   => $p,
    -            hashed_comments ()
    -         });
    -      }
    -      return ("mprocedure", @list);
    -   }
    -
    -   # MODULE/PROGRAM
    -   elsif ($line =~ /^(module|program)(?:\s+(\w+))?$/i) {
    -      die "$1 begun not at top level" if defined $stmts::topnest;
    -      return new_nest ({
    -         'type' => lc $1,
    -         'name' => (defined $2 ? $2 : ''),
    -         hashed_comments ()
    -      });
    -   }
    -
    -   # END MODULE/SUBROUTINE/FUNCTION/PROGRAM/TYPE/INTERFACE, or general END
    -   elsif ($line =~ /^end\s*(?:(module|subroutine|function|program|type|interface)(?:\s+(\w+))?)?$/i) {
    -      die "END statement outside of any nesting" unless defined $stmts::topnest;
    -      my $top = $stmts::topnest;
    -
    -      # We do some special "fixing up" for modules, which resolves named
    -      # references (module procedures) and computes publicity.
    -      #
    -      # Note that end_nest will ensure that the type of thing ended matches
    -      # the thing the user says it is ending, so we don't have to worry about
    -      # that.
    -      if ($top->{'type'} eq "module") {
    -
    -        # Set publicity (visibility) of objects within the module.
    -
    -        # First, the explicitly set ones.
    -        my $name;
    -        foreach $name (@{$top->{'publiclist'}}) {
    -          do_attrib ($name, "vis", 'public', "visibility");
    -        }
    -        foreach $name (@{$top->{'privatelist'}}) {
    -          do_attrib ($name, "vis", 'private', "visibility");
    -        }
    -
    -        # Second, the globally set ones (those obeying the default).
    -        my $obj;
    -        $top->{'defaultvis'} = "public" unless exists $top->{'defaultvis'};
    -        foreach $obj (@{$top->{'ocontains'}}) {
    -          $obj->{'vis'} = $top->{'defaultvis'} unless exists $obj->{'vis'};
    -        }
    -
    -        # Traverse (arbitrarily deeply) nested structures.
    -        sub traverse {
    -          my ($node) = @_;
    -          my $top = $stmts::topnest;   # HAVE NO IDEA WHY THIS IS NEEDED
    -          
    -          # Graduate nested MODULE PROCEDURE (mprocedure) to point to the
    -          # appropriate thing (either a function or a subroutine with that
    -          # name).
    -          if ($node->{'type'} eq "mprocedure") {
    -            die "Couldn't find module procedure $node->{'name'} (nothing with that name in module $top->{'name'})"
    -              unless exists $top->{'contains'}->{lc $node->{'name'}};
    -            
    -            my ($possibles) =
    -              $top->{'contains'}->{lc $node->{'name'}};
    -            die "Couldn't find module procedure $node->{'name'} in module $top->{'name'} (wrong type)"
    -              if !exists $possibles->{'subroutine'}
    -              && !exists $possibles->{'function'};
    -            die "Found both a subroutine and function to match module procedure $node->{'name'} in module $top->{'name'}"
    -              if exists $possibles->{'subroutine'}
    -              && exists $possibles->{'function'};
    -            
    -            if (exists $possibles->{'subroutine'}) {
    -              $node->{'bind'} = $possibles->{'subroutine'};
    -            } else {
    -              $node->{'bind'} = $possibles->{'function'};
    -            }
    -          }
    -
    -          # Recurse.
    -          map { traverse ($_) } @{$node->{'ocontains'}}
    -          if exists $node->{'ocontains'};
    -        }
    -        map { traverse ($_) } @{$top->{'ocontains'}};
    -      }
    -
    -      my @return_val = end_nest ($1, $2);
    -
    -      # Subroutines and functions in interface blocks must be noted at the
    -      # top level.  We do this with "interface" structures with the names
    -      # of the actual contained routines (unless this is already the
    -      # case).  Make sense?
    -      if ($top->{'type'} eq "interface" && $top->{'name'} eq "") {
    -          my $sub;
    -          foreach $sub (@{$top->{'ocontains'}}) {
    -              next if $sub->{'name'} eq $top->{'name'} ||
    -                      $sub->{'type'} eq "mprocedure";
    -
    -              my %copy = %$top;
    -              $copy{'name'} = $sub->{'name'};
    -              new_nest (\%copy);
    -              my $old_within = $sub->{'within'};
    -              new_struct ($sub);
    -              $sub->{'within'} = $old_within;
    -              end_nest ('interface', $sub->{'name'});
    -          }
    -      }
    -
    -      return @return_val;
    -   }
    -
    -   # SUBROUTINE/FUNCTION
    -   elsif ($line =~ /^(?:(.+?)\s+)?(subroutine|function)\s+(\w+)\s*(\([^()]*\))?(?:\s*result\s*\(\s*(\w+)\s*\))?$/i) {
    -      my ($type, $name, $parmstr, $rtype, $result) =
    -         (lc $2, $3,    $4,       $1,     $5);
    -
    -      die "Start of $type $name before `contains' section of $stmts::topnest->{'type'} $stmts::topnest->{'name'}"
    -          if defined $stmts::topnest && ! $stmts::topnest->{'incontains'} &&
    -             $stmts::topnest->{'type'} ne "interface";
    -      if (exists $stmts::nesting_by{'subroutine'} ||
    -          exists $stmts::nesting_by{'function'}) {
    -         my $n = 0;
    -         $n += scalar @{$stmts::nesting_by{'subroutine'}}
    -            if exists $stmts::nesting_by{'subroutine'};
    -         $n += scalar @{$stmts::nesting_by{'function'}}
    -            if exists $stmts::nesting_by{'function'};
    -#FIXME  #die "Routine nested in routine nested in routine" if $n > 1;
    -      }
    -
    -      $parmstr = "()" unless defined $parmstr;
    -      $parmstr = utils::trim (substr ($parmstr, 1, length ($parmstr) - 2));
    -      my (@parms);
    -      if ($parmstr) {
    -         @parms = split (/\s*,\s*/, $parmstr);
    -         my ($parm);
    -         foreach $parm (@parms) {
    -            die "Parameter `$parm' is not just a word or *"
    -              unless $parm =~ /^\w+|\*$/;
-            ## * as a final argument allows the caller to specify a statement
-            ## label to jump to, as an alternative return address.  (Legacy Fortran!)
    -            ## Thanks to Art Olin for this info.
    -         }
    -      } else {
    -         @parms = ();
    -      }
    -
    -      my $struct = {
    -         'type'      => $type,
    -         'name'      => $name,
    -         'parms'     => \@parms,
    -         hashed_comments ()
    -      };
    -      new_nest ($struct);
    -
    -      $struct->{'result'} = $result if defined $result;
    -
    -      $rtype = "" unless defined $rtype;
    -      while ($rtype =~ /(?:^|\s+)(recursive|pure|elemental)$/i ||
    -             $rtype =~ /^(recursive|pure|elemental)(?:\s+|$)/i) {
    -        $rtype = $` . $'; # actually whichever is not blank
    -        $struct->{lc $1} = 1;
    -      }
    -      if ($rtype ne '') {
    -        $struct->{'rtype'} = parse_type ($rtype);
    -        new_struct ({
    -          'type'        => 'var',
    -          'name'        => (defined $result ? $result : $name),
    -          'vartype'     => $struct->{'rtype'},
    -          'comments'    => ''
    -        });
    -      }
    -
    -      return ($type, $struct);
    -   }
    -
    -   # TYPE definition (must go before variable declarations)
    -   elsif ($line =~ /^type(?:\s+|\s*(,.*)?::\s*)(\w+)$/i) {
    -     my $struct = new_nest ({
    -       'type' => 'type',
    -       'name' => $2,
    -       hashed_comments ()
    -     });
    -     if (defined $1) {
    -       my $attrib = utils::trim (substr ($1, 1));
    -       if ($attrib =~ /^(public|private)$/i) {
    -         $struct->{'vis'} = lc $attrib;
    -       } elsif ($attrib) {
    -         warn "Invalid attribute `$attrib' for derived-type declaration--should be just public or private";
    -       }
    -     }
    -     return $struct;
    -   }
    -
    -   # INTERFACE block (for overloading) or statement (for definition of external)
    -   elsif ($line =~ /^interface(?:\s+(\S.+))?$/i) {
    -       return new_nest ({
    -           'type' => 'interface',
    -           'name' => (defined $1 ? $1 : ""),
    -           hashed_comments ()
    -       });
    -   }
    -
    -   # CONTAINS
    -   elsif ($line =~ /^contains$/i) {
    -      die "`contains' found at top level" unless defined $stmts::topnest;
    -      die "`contains' found in $stmts::topnest->{'type'} $stmts::topnest->{'name'}" unless exists $stmts::topnest->{'incontains'};
    -      die "Multiple `contains' found in same scope"
    -         if $stmts::topnest->{'incontains'};
    -      die "`contains' found in interface definition"
    -         if $stmts::topnest->{'interface'};
    -      $stmts::topnest->{'incontains'} = 1;
    -      return ("contains", $stmts::topnest);
    -   }
    -
    -   # PUBLIC/PRIVATE/SEQUENCE
    -   elsif ($line =~ /^(public|private|sequence)(?=\s+[^=(]|::|$)(\s*::\s*)?/i) {
    -     my ($what, $rest) = (lc $1, $');
    -
    -     if (defined $stmts::topnest && $stmts::topnest->{'type'} eq "type") {
    -       die "public statement not allowed in a type declaration"
    -         if $what eq 'public';
    -       die "$1 cannot be qualified inside type declaration" if $rest;
    -       $stmts::topnest->{$what . 'type'} = 1;
    -       return ($what);
    -     } else {
    -       die "sequence statement only allowed immediately inside type declaration"
    -         if $1 eq 'sequence';
    -
    -       die "$1 statement not immediately inside a module or type declaration"
    -         unless defined $stmts::topnest && $stmts::topnest->{'type'} eq "module";
    -       if ($rest eq "") {  # Unqualified
    -         die "Unqualified $what in addition to unqualified " .
    -           $stmts::topnest->{'defaultvis'}
    -         if exists $stmts::topnest->{'defaultvis'};
    -         $stmts::topnest->{'defaultvis'} = $what;
    -         return ($what);
    -         
    -       } else {  # Qualified
    -         my @namelist = map {
    -           die "Invalid name `$_' specified in $what statement"
    -             unless /^\s*(\w+)(?:\s*(\([^()]+\)))?\s*$/i;
    -           $1 . (defined $2 ? $2 : "");
    -         } (split ',', $rest);
    -         push @{$stmts::topnest->{"${what}list"}}, @namelist;
    -         return ($what, @namelist);
    -       }
    -     }
    -   }
    -
    -    # OPTIONAL
    -    elsif ($line =~ /^optional(\s+|\s*::\s*)((\w|\s|,)+)$/i) {
    -        my $name;
    -        my @namelist = split (/\s*,\s*/, utils::trim ($2));
    -        foreach $name (@namelist) {
    -            do_attrib ($name, "optional", 1, "optional attribute");
    -        }
    -        return ('optional', @namelist);
    -    }
    -
    -   # Variable declarations
    -   elsif ($line =~ /^(integer|real|double\s*precision|character|complex|logical|type)\s*(\(|\s\w|[:,*])/i) {
    -      my ($vartype, $rest) = parse_part_as_type ($line);
    -      my (@attribs, @right);
    -      if ($rest =~ /^(.*)\:\:(.*)/) {
    -         my ($a, $b) = ($1, $2);
    -         @attribs = map (( utils::trim ($_) ), utils::balsplit (",", $a));
    -         @right = map (( utils::trim ($_) ), utils::balsplit (",", $b));
    -      } else {
    -         @attribs = ();
    -         @right = map (( &utils::trim ($_) ), utils::balsplit (",", $rest));
    -      }
    -      my ($r, @structs);
    -      foreach $r (@right) {
    -          my ($rl, $rassign) = &utils::balsplit ("=", $r);
    -          my ($rll, $starpart) = &utils::balsplit ("*", $rl);
    -          if (defined $starpart) {
    -            die "Sorry, I don't support 'character var*kind' yet; use 'character*kind var' instead";
    -          }
    -          $rll =~ /^ (\w+) (\s* \(.*\))? \s* $/x
    -              or die "Invalid variable declaration `$rll'";
    -          my ($name, $dimension) = ($1, $2);
    -          my ($initop, $initial);
    -          if (defined $rassign) {
-            # The leading = was already split off by balsplit; a leading > here means =>.
    -            $rassign =~ /^ (>?) \s* (.*) $/x
    -              or die "Invalid variable initialization `= $rassign'";
    -            ($initop, $initial) = ("=" . $1, $2);
    -          }
    -
    -          my $struct;
    -          $struct = {
    -              'type'        => 'var',
    -              'name'        => $name,
    -              'vartype'     => $vartype,
    -              hashed_comments ()
    -          };
    -          if (defined $initial) {
    -            $struct->{'initop'} = $initop;
    -            $struct->{'initial'} = expr_parse::parse_expr ($initial);
    -          }
    -          new_struct ($struct);
    -          push @structs, $struct;
    -
    -          my @attribs_copy = @attribs;
    -          push @attribs_copy, "dimension $dimension" if defined $dimension;
    -
    -          my ($attrib, @tempattribs);
    -          foreach $attrib (@attribs_copy) {
    -              if ($attrib =~ /^(public|private)$/i) {
    -                  $attrib = lc $attrib;
    -                  $struct->{'vis'} = $attrib;
    -              } elsif ($attrib =~ /^optional$/i) {
    -                  $attrib = lc $attrib;
    -                  $struct->{$attrib} = 1;
    -              } elsif ($attrib) {
    -                  warn "Unrecognized attribute `$attrib'"
    -                      unless $stmts::disable_warns;
    -                  push @tempattribs, $attrib;
    -              }
    -          }
    -
    -          $struct->{'tempattribs'} = \@tempattribs;
    -      }
    -
    -      return ('var', @structs);
    -   }
    -
    -   # USE
    -   elsif ($line =~ /^use\s+(\w+)($|,\s*)/i) {
    -      die "`use' found at top level" unless defined $stmts::topnest;
    -      die "`use' found in $stmts::topnest->{'type'} $stmts::topnest->{'name'}" unless exists $stmts::topnest->{'uses'};
    -      my $extra = length $' ? $' : undef;
    -      push @{$stmts::topnest->{'uses'}}, [$1, $extra];
    -
    -      return ('use', $1, $extra);
    -   }
    -   
    -   # CALL or IF (...) CALL [hack--xxx]
    -   elsif ($line =~ /^(?:if\s*\(.*\)\s*)?call\s+(\w+)\s*(?:\(\s*(.*?)\s*\))?$/i) {
    -      die "`call' found at top level" unless defined $stmts::topnest;
    -      die "`call' found in $stmts::topnest->{'type'} $stmts::topnest->{'name'}" unless exists $stmts::topnest->{'calls'};
    -      $stmts::topnest->{'calls'}->{$1} = 1;
    -      my @args = ();
    -      @args = split /\s*,\s*/, $2 if defined $2;
    -      return ('call', @args);
    -   }
    -   
    -   # Unrecognized statement
    -   else {
    -      if ($line =~ /^\w+/) {
    -         warn "Unrecognized statement beginning with word $&" unless $stmts::disable_warns;
    -      } else {
    -         warn "Unrecognized statement" unless $stmts::disable_warns;
    -      }
    -      return ('?', $line);
    -   }
    -}
    -
    -#####
    -# Returns a list that would fit right into a hash table you're making.  If
    -# there are no comments, returns the empty list.  The entry is called
    -# 'comments'.
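     -# A minimal sketch of a call (illustrative, mirroring new_struct below):
     -#     my $struct = { 'type' => 'var', hashed_comments () };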
    -#####
    -sub hashed_comments {
    -   if ($stmts::comments) {
    -      return ( 'comments', &$stmts::comments () );
    -   } else {
    -      return ();
    -   }
    -}
    -
    -#####
    -# Makes note of a new structure.  Called by new_nest, for example.
    -#####
    -sub new_struct {
    -   my ($struct) = @_;
    -   my $type = $struct->{'type'};
    -
    -   die "Basic structure must be found at a nesting level"
    -     unless defined $stmts::topnest;
    -
    -   if (exists ($stmts::topnest->{'contains'}->{lc $struct->{'name'}})) {
    -      die "Redefinition of $type $struct->{'name'} in $stmts::topnest->{'type'} $stmts::topnest->{'name'}"
    -         if exists ($stmts::topnest->{'contains'}->{lc $struct->{'name'}}->{$type});
    -      $stmts::topnest->{'contains'}->{lc $struct->{'name'}}->{$type} = $struct;
    -   } else {
    -      $stmts::topnest->{'contains'}->{lc $struct->{'name'}} =
    -         { $type => $struct };
    -   }
    -   push @{$stmts::topnest->{'ocontains'}}, $struct;
    -   $struct->{'within'} = $stmts::topnest;
    -}
    -
    -#####
    -# Starts a new nesting level represented by the given structure.  The
    -# structure must define the 'type' and 'name' entries.  You should not
    -# define the 'contains' or 'defaultvis' entry.
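     -# A minimal sketch of a call (illustrative, not from the original source):
     -#     new_nest ({ 'type' => 'module', 'name' => 'my_mod' });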
    -#####
    -sub new_nest {
    -   my ($struct) = @_;
    -   my ($type) = $struct->{'type'};
    -
    -   $struct->{'contains'} = { };
    -   $struct->{'ocontains'} = [ ];
    -
    -   # Program unit
    -   if ($type eq "subroutine" || $type eq "function" || $type eq "module" || $type eq "program") {
    -     $struct->{'incontains'} = 0;
    -     $struct->{'uses'} = [ ];
    -     $struct->{'interface'} = 0 if $type eq "subroutine" || $type eq "function";
    -   }
    -
    -   # Program unit with code
    -   if ($type eq "subroutine" || $type eq "function" || $type eq "program") {
    -     $struct->{'calls'} = { };
    -   }
    -
    -   if (defined $stmts::topnest) {
    -      my ($toptype) = $stmts::topnest->{'type'};
    -      if ($toptype eq "interface" && ($struct->{'type'} eq "subroutine" || $struct->{'type'} eq "function")) {
    -         $struct->{'interface'} = 1;
    -      } else {
    -         die "Nesting in $toptype not allowed" unless $toptype eq "subroutine" || $toptype eq "function" || $toptype eq "module" || $toptype eq "program";
    -      }
    -      new_struct ($struct) unless $struct->{'name'} eq "";
    -   }
    -   push @stmts::nesting, $struct;
    -   if (exists ($stmts::nesting_by{$type})) {
    -      push @{$stmts::nesting_by{$type}}, $struct;
    -   } else {
    -      $stmts::nesting_by{$type} = [ $struct ];
    -   }
    -   $stmts::topnest = $struct;
    -   return ( $type, $struct );
    -}
    -
    -#####
    -# Ends the current nesting level.  Optionally, you can pass the 'type' that
    -# it's supposed to be as the first argument.  Optionally, you can pass the
    -# 'name' it should have after that (as the second argument).
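     -# E.g. (illustrative): end_nest ("module", "my_mod") checks that the level
     -# being closed really is module my_mod before popping it.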
    -#####
    -sub end_nest {
    -  my ($type, $name) = @_;
    -  $type = lc $type if defined $type;
    -  unless (defined $stmts::topnest) {
    -    if (defined $name && defined $type) {
    -      die "Ended $type $name at top level";
    -    } elsif (defined $type) {
    -      die "Ended unnamed $type at top level";
    -    } else {
    -      die "END statement at top level";
    -    }
    -  }
    -  my ($struct) = pop @stmts::nesting;
    -  die "Ended $type while in $struct->{'type'} $struct->{'name'}"
    -    if defined $type && $type ne $struct->{'type'};
    -  die "Ended $name while in $struct->{'type'} $struct->{'name'}"
    -    if defined $name && $name !~ /^\Q$struct->{'name'}\E$/i;
    -  if (@stmts::nesting) {
    -    $stmts::topnest = $stmts::nesting[$#stmts::nesting];
    -  } else {
    -    $stmts::topnest = undef;
    -  }
    -  pop @{$stmts::nesting_by{$struct->{'type'}}};
    -  return ( "end" . (defined $type ? $type : ''), $struct );
    -}
    -
    -#####
    -# Parses the basic type that prefixes the given string.
    -# Returns (parsed type, string portion remaining).
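     -# E.g. (illustrative): given "integer(kind=4) x", this returns the parsed
     -# integer type and the remaining string "x".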
    -#####
    -sub parse_part_as_type {
    -  my ($str) = @_;
    -
    -  $str =~ /^integer|real|double\s*precision|character|complex|logical|type/i
    -    or die "parse_part_as_type: Invalid input `$str'";
    -  my ($base, $rest) = ($&, $');
    -
    -  my $level = 0;
    -  ## Wait till we are outside of all parens and see a letter, colon, or comma.
    -  while ($rest =~ /[()a-zA-Z_:,]/g) {
    -    if ($& eq '(') {
    -      $level++;
    -    } elsif ($& eq ')') {
    -      $level--;
    -      die "Unbalanced parens (too many )'s)" if $level < 0;
    -    } elsif ($level == 0) {
    -      return (parse_type ($base . $`), $& . $');
    -    }
    -  }
    -  
    -  die "Couldn't split into type and rest for `$str'";
    -
    -# Some old, presumably less-efficient code:
    -#  my ($level, $len) = (0, length ($str));
    -#  my ($i, $c);
    -#  for ($i = length ($&); $i < $len; $i++) {
    -#    $c = substr ($str, $i, 1);
    -#    if ($c eq "(") {
    -#      $level++;
    -#    } elsif ($c eq ")") {
    -#      $level--;
    -#      die "Unbalanced parens (too many )'s)" if $level < 0;
    -#    } elsif ($level == 0 && $c =~ /^\w|:|,$/) {
    -#      last;
    -#    }
    -#  }
    -#  return (parse_type (substr ($str, 0, $i)), substr ($str, $i));
    -}
    -
    -#####
    -# Parses a basic type, creating a type structure for it:
    -#     integer [( [kind=] kind_val )]
    -#     real [( [kind=] kind_val )]
    -#     double precision                  (no kind is allowed)
    -#     complex [( [kind=] kind_val )]
    -#     character [( char_stuff )]
    -#     logical [( [kind=] kind_val )]
    -#     type (type_name)
    -#
    -# integer*number, real*number, complex*number, and logical*number are also
    -# supported as nonstandard Fortran extensions for kind specification.
    -# "number" can either be a direct integer or an expression in parentheses.
    -# 
    -# char_stuff is empty or (stuff), where stuff is one of:
    -#     len_val [, [kind=] kind_val]
    -#     kind=kind_val [, [len=] len_val]
    -#     len=len_val [, kind=kind_val]
    -# kind_val and len_val are expressions; len_val can also be just `*'.
    -# 
    -# The length can also be specified using the nonstandard Fortran extension
    -# character*number.  If number is `*', it must be in parentheses (indeed,
    -# any expression other than a number must be in parentheses).
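     -#
     -# Illustrative examples (not from the original source):
     -#     parse_type ("integer(kind=4)")  -- integer with an explicit kind of 4
     -#     parse_type ("character*80")     -- character with a length of 80
     -#     parse_type ("type(box)")        -- derived type named box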
    -#####
    -sub parse_type {
    -  my ($str) = @_;
    -
    -  # print "Parsing type: $str\n";
    -
    -  $str = utils::trim ($str);
    -  $str =~ /^(integer|real|double\s*precision|complex|character|logical|type)
    -    \s* (?: \( (.*) \) | \* \s* (\d+ | \(.*\)) )?$/ix
    -    or die "Invalid type `$str'";
    -  my $base = lc $1;
    -
    -  if ($base =~ /^double\s*precision$/) {
    -    die "double precision cannot have kind specification"
    -      if defined $2 || defined $3;
    -    return $typing::double_precision;
    -  }
    -
    -  if (defined $2 || defined $3) {
    -    my $star = defined $3;
    -    my $args = utils::trim ($star ? $3 : $2);
    -
    -    if ($base eq 'type') {
    -      die "type$args invalid--use type($args)" if $star;
    -      die "type(w) for non-word w" unless $args =~ /^\w+$/;
    -      return typing::make_type ($base, $args);
    -    } elsif ($base eq 'character') {
    -      my ($kind, $len, $rest);
    -      if ($star) {
    -        if ($args =~ /^\(\s*\*\s*\)$/) {
    -          $len = '*';
    -        } else {
    -          $len = expr_parse::parse_expr ($args);
    -        }
    -      } elsif ($args =~ /^kind\s*=\s*/i) {
    -        $args = substr ($args, length ($&));
    -        ($kind, $rest) = expr_parse::parse_part_as_expr ($args);
    -        if (defined $rest) {
    -          $rest = utils::trim ($rest);
    -          $rest =~ s/^len\s*=\s*//i;
    -          $len = ($rest eq '*' ? '*' : expr_parse::parse_expr ($rest));
    -        }
    -      } elsif ($args =~ /^len\s*=\s*/i) {
    -        $args = substr ($args, length ($&));
    -        if (substr ($args, 0, 1) eq '*') {
    -          $len = '*';
    -          $rest = $args;
    -          $rest =~ s/^\*\s*,// or $rest = undef;
    -        } else {
    -          ($len, $rest) = expr_parse::parse_part_as_expr ($args);
    -        }
    -        if (defined $rest) {
    -          $rest = utils::trim ($rest);
    -          $rest =~ /^kind\s*=\s*/
    -            or die "kind= specifier needed when len= specifier is given";
    -          $rest = substr ($rest, length ($&));
    -          $kind = expr_parse::parse_expr ($rest);
    -        }
    -      } else {  # len
    -        if (substr ($args, 0, 1) eq '*') {
    -          $len = "*";
    -          $rest = $args;
    -          $rest =~ s/^\*\s*,// or $rest = undef;
    -        } else {
    -          ($len, $rest) = expr_parse::parse_part_as_expr ($args);
    -        }
    -        if (defined $rest) {
    -          $rest = utils::trim ($rest);
    -          $rest = substr ($rest, length ($&)) if $rest =~ /^kind\s*=\s*/i;
    -          $kind = expr_parse::parse_expr ($rest);
    -        }
    -      }
    -      return typing::make_character_type ($kind, $len);
    -    } else {
    -      $args =~ s/^kind\s*=\s*//i unless $star;
    -      return typing::make_type ($base, expr_parse::parse_expr ($args));
    -    }
    -  } else {
    -    die "type without (type-name) after it" if $base eq 'type';
    -    die "No default type for `$base'"
    -      unless exists $typing::default_type{$base};
    -    return $typing::default_type{$base};
    -  }
    -}
    -
    -sub do_attrib {
    -    my ($name, $attrib, $val, $attribname) = @_;
    -    my ($struct);
    -    foreach $struct (values %{$stmts::topnest->{'contains'}->{lc $name}}) {
    -        die "Redefining $attribname of $struct->{'type'} $name from " .
    -            "$struct->{$attrib} to $val" if exists $struct->{$attrib};
    -        $struct->{$attrib} = $val;
    -    }
    -}
    -
    -1;
    diff --git a/Tools/F_scripts/f90doc/typing.pl b/Tools/F_scripts/f90doc/typing.pl
    deleted file mode 100644
    index 9347b8bbb16..00000000000
    --- a/Tools/F_scripts/f90doc/typing.pl
    +++ /dev/null
    @@ -1,516 +0,0 @@
    -package typing;
    -
    -use strict;
    -
    -# Stores the type of each variable.
    -$typing::typeof = "";
    -# Stack: one typeof per scope.
    -@typing::typeofs = ();
    -
    -# Stores the definition of each type.
    -$typing::typedef = "";
    -# Stack: one typedef per scope.
    -@typing::typedefs = ();
    -
    -# Stores the definition of each function/operator.
    -$typing::code = "";
    -# Stack: one code per scope.
    -@typing::codes = ();
    -
    -
    -# DOUBLE PRECISION type.
    -$typing::double_precision = typing::make_type ('real', 8, "double precision");
    -
    -# Default character kind.
    -$typing::default_character_kind = 1;
    -
    -# Default types.
    -%typing::default_type = (
    -  'complex' => typing::make_type ('complex', 8, "complex"),
    -  'integer' => typing::make_type ('integer', 4, "integer"),
    -  'logical' => typing::make_type ('logical', 1, "logical"),
    -  'real'    => typing::make_type ('real', 4, "real"),
    -);
    -$typing::default_type{'character'} = typing::make_character_type ();
    -
    -# Types with wild sub and any other info (just a base defined).
    -$typing::wild_type = {
    -   'complex'   => typing::make_type ('complex'),
    -   'real'      => typing::make_type ('real'),
    -   'integer'   => typing::make_type ('integer'),
    -   'logical'   => typing::make_type ('logical'),
    -   'character' => typing::make_type ('character')
    -};
    -
    -
    -# Precedence of operations; based on that which is in expr_parse.y.
    -# Higher precedence indicated by larger number.
    -$typing::precedence = {
    -  '.eqv.'  => 1,
    -  '.neqv.' => 1,
    -  '.or.'   => 2,
    -  '.and.'  => 3,
    -  '.not.'  => 4,
    -  '<'      => 5,
    -  '>'      => 5,
    -  '<='     => 5,
    -  '>='     => 5,
    -  '=='     => 5,
    -  '/='     => 5,
    -  '//'     => 6,
    -  '+'      => 7,
    -  '-'      => 7,
    -  'u+'     => 8,
    -  'u-'     => 8,
    -  '*'      => 9,
    -  '/'      => 9,
    -  '**'     => 10,
    -  '%'      => 11,
    -  '%call'  => 11,
    -  '%colon' => 30, # this is a guess
    -  '%namedarg' => 30, # this is a guess
    -  '%array' => 40,    # as in "forty days and forty nights," which means
    -  '%const' => 40,    #    "a long time," here we use 40 as an approx. to infty.
    -  '%var'   => 40,
    -  '%do'    => 40,
    -};
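     -# E.g. (illustrative): expr_to_f90 below parenthesizes a child whose
     -# precedence is lower than its parent's, so the tree for (a + b) * c keeps
     -# its parentheses while the one for a + b * c does not.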
    -
    -#####
    -# Starts a new scope.  If this is a top-level scope, initializes the codes
    -# to intrinsics and the like.
    -#####
    -sub new_scope {
    -   my ($newtypeof, $newtypedef, $newcode);
    -
    -   if (@typing::typeofs) {
    -      $typing::typeof = utils::copy_hash ($typing::typeof);
    -      $typing::typedef = utils::copy_hash ($typing::typedef);
    -      $typing::code = utils::copy_hash ($typing::code);
    -   } else {
    -      $typing::typeof = {};
    -      $typing::typedef = {};
    -      $typing::code = {};
    -      $typing::code{"//"} = [ {
    -         'parms' => [ $typing::wild_type{'character'},
    -                      $typing::wild_type{'character'} ],
    -         'return' => $typing::wild_type{'character'}
    -      } ];
    -      my ($int, $real, $logical, $char) = ( $typing::wild_type{'integer'},
    -         $typing::wild_type{'real'}, $typing::wild_type{'logical'},
    -         $typing::wild_type{'character'} );
    -      my ($op);
    -      foreach $op ("+", "-", "*", "/") {
    -         $typing::code->{$op} = [
    -            { 'parms' => [ $int, $int ], 'return' => $int },
    -            { 'parms' => [ $real, $int ], 'return' => $real },
    -            { 'parms' => [ $int, $real ], 'return' => $real },
    -            { 'parms' => [ $real, $real ], 'return' => $real }
    -         ];
    -      }
    -      $typing::code->{"**"} = [
    -         { 'parms' => [ $int, $int ], 'return' => $int },
    -         { 'parms' => [ $real, $int ], 'return' => $real },
    -         { 'parms' => [ $int, $real ], 'return' => $real },
    -         { 'parms' => [ $real, $real ], 'return' => $real },
    -      ];
    -      foreach $op ("u+", "u-") {
    -         $typing::code->{$op} = [
    -            { 'parms' => [ $int ], 'return' => $int },
    -            { 'parms' => [ $real ], 'return' => $real }
    -         ];
    -      }
    -      foreach $op ("<", "<=", "==", "/=", ">", ">=") {
    -         $typing::code->{$op} = [
    -            { 'parms' => [ $int, $int ], 'return' => $logical },
    -            { 'parms' => [ $real, $int ], 'return' => $logical },
    -            { 'parms' => [ $int, $real ], 'return' => $logical },
    -            { 'parms' => [ $real, $real ], 'return' => $logical },
    -            { 'parms' => [ $char, $char ], 'return' => $logical }
    -         ];
    -      }
    -      foreach $op (".or.", ".and.", ".eqv.", ".neqv.") {
    -         $typing::code->{$op} = [
    -            { 'parms' => [ $logical, $logical ], 'return' => $logical }
    -         ];
    -      }
    -      $typing::code->{".not."} = [
    -         { 'parms' => [ $logical ], 'return' => $logical }
    -      ];
    -      $typing::code->{"//"} = [
    -         { 'parms' => [ $char, $char ], 'return' => $char }
    -      ];
    -   }
    -
    -   push @typing::typeofs, $typing::typeof;
    -   push @typing::typedefs, $typing::typedef;
    -   push @typing::codes, $typing::code;
    -}
    -
    -#####
    -# Ends an old scope.
    -#####
    -sub end_scope {
    -   pop @typing::typeofs;
    -   pop @typing::typedefs;
    -   pop @typing::codes;
    -
    -   if ($typing::typeofs) {
    -      $typing::typeof = $typing::typeofs[$#typing::typeofs];
    -      $typing::typedef = $typing::typedefs[$#typing::typedefs];
    -      $typing::code = $typing::codes[$#typing::codes];
    -   }
    -}
    -
    -#####
    -# Creates a new type with specified base and sub.
    -# Note that sub corresponds to kind for built-in types.
    -# sub can be left out for a wild type.
    -# A third argument, print, can specify how the type should print.  Used for
    -# default types, double precision, etc.
    -#####
    -sub make_type {
    -  my ($base, $sub, $print) = @_;
    -  my $type = { 'base' => $base };
    -  $type->{'sub'} = $sub if $sub;
    -  $type->{'print'} = $print;
    -  return $type;
    -}
    -
    -#####
    -# Creates a new complex type with specified types of "sides."
    -#####
    -sub make_complex_type {
    -  my ($type1, $type2) = @_;
    -  my ($base1, $base2) = ($type1->{'base'}, $type2->{'base'});
    -  die "Complex constant must have real and/or integer parts, but I found types $base1 and $base2"
    -    unless ($base1 eq 'integer' || $base1 eq 'real') &&
    -           ($base2 eq 'integer' || $base2 eq 'real');
    -  my $which;
    -  # From Metcalf and Reed's Fortran 90 Explained, if one of the types is an
    -  # integer then the kind of the complex is the kind of the other type.
    -  if ($base1 eq 'integer') {
    -    $which = $type2;
    -  } elsif ($base2 eq 'integer') {
    -    $which = $type1;
    -  } else {
    -    if ($type1->{'sub'} > $type2->{'sub'}) {
    -      $which = $type1;
    -    } else {
    -      $which = $type2;
    -    }
    -  }
    -  return {
    -    'base'    => 'complex',
    -    'sub'     => $which
    -  };
    -}
    -
    -#####
    -# Creates a new character type with specified sub (kind) and len.
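     -# E.g. (illustrative): make_character_type (undef, "*") yields a
     -# character(len=*) type with the default kind.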
    -#####
    -sub make_character_type {
    -  my ($sub, $len) = @_;
    -  $sub = $typing::default_character_kind unless defined $sub;
    -  $sub = [ "%const", $typing::default_type{'integer'}, $sub ] unless ref $sub;
    -  $len = "1" unless defined $len;
    -  $len = [ "%const", $typing::default_type{'integer'}, $len ]
    -    unless ref $len || $len eq "*";
    -  return {
    -    'base' => 'character',
    -    'sub'  => $sub,
    -    'len'  => $len
    -  };
    -}
    -
    -#####
    -# Returns true iff the given type was created to be the default of its kind.
    -# This has no meaning for compound types (hence it returns false).  For
    -# characters, there's a slight bug in that it will say that the type was
    -# created default even if you specify the default explicitly.  No biggie.
    -# Note that the defaultness is only for the KIND, not the LENGTH.
    -# 
    -# I could fix the above-mentioned problem by storing a 'default' entry just for
    -# the default types.  Then is_default_kind just translates to an exists test.
    -# This is much simpler and avoids the weird checks for double precision numbers
    -# (0.0d0 ==> don't show a kind.  This is really "default").  This would be
    -# kinda nice but 'default' is probably the wrong word.
    -#####
    -sub is_default_kind {
    -   my ($type) = @_;
    -
    -   if ($type->{'base'} eq "character") {
    -     my ($top, @rest) = @{$type->{'sub'}};
    -     return ($top eq "%const" && $rest[0] eq $typing::default_type{'integer'}
    -          && $rest[1] == $typing::default_character_kind);
    -   } else {
    -      return (exists $typing::default_type{$type->{'base'}} && $typing::default_type{$type->{'base'}} eq $type);
    -   }
    -}
    -
    -#####
    -# Converts the given type to a string, written in Fortran 90 code.
    -# Only displays the kind if it was specified explicitly.  Slight bug:
    -# if you say character (kind=1) :: c, then it will print character :: c.
    -# (This is only for characters with default kind.  For other types with
    -# default kind explicitly specified, it is printed.)
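     -# Illustrative examples: the default integer prints as "integer", while an
     -# integer with an explicit kind of 8 prints as "integer (kind=8)".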
    -#####
    -sub type_to_f90 {
    -  my ($type) = @_;
    -
    -  # This covers the case where the kind is the default, except for characters.
    -  return $type->{'print'} if defined $type->{'print'};
    -
    -  my $mods = "";
    -  if ($type->{'base'} eq "character") {
    -    if ($type->{'len'} eq "*") {
    -      $mods = "len=*";
    -    } elsif ($type->{'len'}->[0] ne "%const" ||
    -             $type->{'len'}->[1] != $typing::default_type{'integer'} ||
    -             $type->{'len'}->[2] ne "1") {
    -      $mods = "len=" . expr_to_f90 ($type->{'len'});
    -    }
    -    unless (is_default_kind ($type)) {
    -      $mods .= ", " unless $mods eq '';
    -      $mods .= "kind=" . expr_to_f90 ($type->{'sub'});
    -    }
    -  } elsif ($type->{'base'} eq "type") {
    -    $mods = "$type->{'sub'}";
    -  } else {
    -    $mods = "kind=" . expr_to_f90 ($type->{'sub'});
    -  }
    -  $mods = " ($mods)" unless $mods eq '';
    -  return $type->{'base'} . $mods;
    -}
    -
    -#####
    -# Converts an expression right back to a string, doing "no" conversion (i.e.,
    -# output is in Fortran 90).  Optionally returns the precedence of the outmost
    -# operation in the expression (see $typing::precedence).
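     -# E.g. (illustrative): the tree ["+", ["%var", "a"], ["%var", "b"]] renders
     -# as "a + b".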
    -#####
    -sub expr_to_f90 {
    -  my ($exprptr) = @_;
    -  my ($op, @children) = @$exprptr;
    -
    -  die "Unrecognized operation $op",%$op," (has no precedence?)"
    -    unless exists $typing::precedence->{$op};
    -  my $prec = $typing::precedence->{$op};
    -
    -  my $answer;
    -  if ($op eq "%") {
    -    my ($struct, $elem) = @children;
    -    my ($s, $sprec) = expr_to_f90 ($struct);
    -    $s = "($s)" if $prec > $sprec;
    -    $answer = "$s%$elem";
    -  } elsif ($op eq "%var") {
    -    $answer = $children[0];
    -  } elsif ($op eq "%const") {
    -    my ($type, $val) = @children;
    -    if ($type->{'base'} eq 'complex') {
    -      if (!is_default_kind ($type->{'sub'})) {
    -        my ($k1, $k2) = ("", "");
    -        $k1 = "_$type->{'sub'}->{'sub'}" unless $val->[0] =~ /D[+-]?\d+$/i;
    -        $k2 = "_$type->{'sub'}->{'sub'}" unless $val->[1] =~ /D[+-]?\d+$/i;
    -        $answer = "($val->[0]$k1, $val->[1]$k2)";
    -      } else {
    -        $answer = "($val->[0], $val->[1])";
    -      }
    -    } elsif (is_default_kind ($type) || $val =~ /D[+-]?\d+$/i) {
    -      $answer = $val;
    -    } else {
    -      $answer = "${val}_$type->{'sub'}";
    -    }
    -  } elsif ($op eq "%array") {
    -    $answer = "(/ " . join (", ", map { (expr_to_f90 ($_))[0] } @children)
    -            . " /)";
    -  } elsif ($op eq "%colon") {
    -    my ($left, $right) = @children;
    -    $left = (expr_to_f90 ($left))[0] if $left ne '';
    -    $right = (expr_to_f90 ($right))[0] if $right ne '';
    -    $answer = $left . ":" . $right;  # : has ultimately low precedence
    -  } elsif ($op eq "%namedarg") {
    -    my ($left, $right) = @children;
    -    $answer = $left . " = " .
    -              (expr_to_f90 ($right))[0];  # = has ultimately low precedence
    -  } elsif ($op eq "%do") {
    -    my ($child, $var, @args) = @children;
    -    $answer = "(" . expr_to_f90 ($child) . ", " . $var . " = " .
    -              join (", ", map { (expr_to_f90 ($_))[0] } @args) . ")";
    -  } elsif ($op eq "%call") {
    -    ($op, @children) = @children;
    -    my ($s, $sprec) = expr_to_f90 ($op);
    -    $s = "($s)" if $prec > $sprec;
    -    $answer = "$s (" . join (", ", map ((expr_to_f90 ($_))[0], @children))
    -      . ")";
    -  } elsif (scalar @children == 1) {
    -    $op = substr ($op, 1) if substr ($op, 0, 1) eq 'u';
    -    my ($s, $sprec) = expr_to_f90 ($children[0]);
    -    $s = "($s)" if $prec > $sprec;
    -    $answer = "$op$s";
    -  } elsif (scalar @children == 2) {
    -    my ($s1, $sprec1) = expr_to_f90 ($children[0]);
    -    $s1 = "($s1)" if $prec > $sprec1;
    -    my ($s2, $sprec2) = expr_to_f90 ($children[1]);
    -    $s2 = "($s2)" if $prec > $sprec2;
    -    $answer = "$s1 $op $s2";
    -  } else {
    -    die "expr_to_f90: Unrecognized operation $op with " . (scalar @children) .
    -      " children";
    -  }
    -
    -  if (wantarray) {
    -    return ($answer, $prec);
    -  } else {
    -    return $answer;
    -  }
    -}
    -
    -#####
    -# Computes the type of the given expression (which is passed by reference).
    -# Returns a reference to the actual type.
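     -# E.g. (illustrative): a "%const" node such as
     -#     ["%const", $typing::default_type{'integer'}, "1"]
     -# simply yields the type stored in the node (here the default integer).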
    -#####
    -sub expr_type {
    -   my ($exprptr) = @_;
    -   my ($op, @children) = @$exprptr;
    -
    -   if ($op eq "%") {
    -      my ($struct, $elem) = @children;
    -      my ($type) = expr_type ($struct);
    -      die "expr_type: \%$elem failed: left part is not a compound type" unless $type->{'base'} eq "type";
    -      my ($typedef) = $typing::typedef->{$type->{'sub'}};
    -      my ($elemtype) = $typedef->{$elem};
    -      die "expr_type: \%$elem failed: left part does not include $elem" unless $elemtype;
    -      return $elemtype;
    -   } elsif ($op eq "%var") {
    -      my ($var) = @children;
    -      my ($vartype) = $typing::typeof->{$var};
    -      die "expr_type: Variable $var undefined" unless $vartype;
    -      return $vartype;
    -   } elsif ($op eq "%const") {
    -      my ($type, $val) = @children;
    -      return $type;
    -   } elsif ($op eq "%array") {
    -      # HERE
    -   } elsif ($op eq "%colon") {
    -      my ($string, $left, $right) = @children;
    -      my ($stringtype) = expr_type ($string);
    -      die "expr_type: colon notation for non-character string" if $stringtype->{'base'} ne "character";
    -      die "expr_type: colon notation for character array" if $stringtype->{'dimension'};
    -      return typing::make_character_type ($stringtype->{'sub'}, "*");
    -   } elsif ($op eq "%call") {
    -      ($op, @children) = @children;
    -      my ($subop, @subchildren) = @$op;
    -      if ($subop eq "%var") {
    -         ($op) = @subchildren;
    -         # Fall through: we allow overloaded function name in this special case.
    -      } else {
    -         # Function call without overloading or an array reference.
    -         my ($optype) = expr_type ($op);
    -
    -         if ($optype->{'dimension'}) {  # array reference
    -            return make_type ($optype->{'base'}, $optype->{'sub'});
    -         } else {
    -            die "expr_type: Array/function call for something that is neither" unless $optype->{'base'} eq "interface";
    -            # HERE function call without overloading.
    -         }
    -      }
    -   }
    -
    -   my ($opcodes) = $typing::code->{$op};
    -   die "Operation/function $op undefined" unless $opcodes;
    -   my (@childtypes) = ();
    -   my ($child);
    -   foreach $child (@children) {
    -      print "childtypes was: @childtypes\n";
    -      print "type of $child is ", expr_type ($child), "\n";
    -      push @childtypes, expr_type ($child);
    -      print "childtypes is now: @childtypes\n";
    -   }
    -   my ($opcode);
    -   foreach $opcode (@$opcodes) {
    -      print "children: @children\n";
    -      print "childtypes: @childtypes\n";
    -      if (typing::subtypes_list (\@childtypes, $opcode->{'parms'})) {
    -         my ($parm);
    -         my ($ret) = $opcode->{'return'};
    -         if ($ret->{'base'} eq "character" && ! $ret->{'len'}) {
    -            $ret->{'len'} = 0;
    -find_len:
    -            foreach $parm (@$opcode->{'parms'}) {
    -               if ($parm->{'base'} eq $ret->{'base'}) {
    -                  if ($parm->{'len'} eq "*") {
    -                     $ret->{'len'} = "*";
    -                     last find_len;
    -                  } else {
    -                     $ret->{'len'} += $parm->{'len'};
    -                  }
    -               }
    -            }
    -         }
    -         if ($ret->{'sub'}) {
    -            return $ret;
    -         } else {
    -            # Make intrinsic type's kind: look for all parameters with the same
    -            # base type, and use the maximum kind out of those.
    -            my ($maxkind) = -1;
    -            foreach $parm (@$opcode->{'parms'}) {
    -               if ($parm->{'base'} eq $ret->{'base'}) {
    -                  $maxkind = $parm->{'sub'} if $maxkind < $parm->{'sub'};
    -               }
    -            }
    -            die "expr_type: Internal error caused by new_scope" if $maxkind < 0;
    -            return { %$ret, 'sub' => $maxkind };
    -         }
    -      }
    -   }
    -   die "Operation/function $op defined but not for this (these) type(s)";
    -}
    -
    -#####
    -# Returns if first type is a subtype of the second type.
    -# This currently only supports intrinsic types (integer*4 subtypes integer*?).
    -#####
    -sub subtypes {
    -   my ($t1, $t2) = @_;
    -   return 0 if $t1->{'base'} ne $t2->{'base'};
    -   if ($t1->{'base'} eq "type") {
    -      return 0 if $t1->{'sub'} eq $t2->{'sub'};
    -   } else {
    -      if ($t1->{'base'} eq "character") {
    -         if ($t1->{'len'}) {
    -            return 0 unless $t1->{'len'};
    -            return 0 if $t2->{'len'} != $t1->{'len'};
    -         }
    -      }
    -      if ($t1->{'base'} eq "interface") {
    -         # HERE fill this in when I do function types ("interface").
    -      }
    -      if ($t1->{'sub'}) {
    -         return 0 unless $t1->{'sub'};
    -         return 0 if $t2->{'sub'} ne $t1->{'sub'};
    -      }
    -   }
    -   return 1;
    -}
    -
    -#####
    -# Returns if first type is a subtype of the second type, where the first
    -# and second type are (conceptually) tuples.  That is, the lengths must be
    -# equal, and each element must subtype the corresponding element.
    -# The lists are passed as references.
    -#####
    -sub subtypes_list {
    -   my ($l1ptr, $l2ptr) = @_;
    -   my (@l1) = @$l1ptr;
    -   my (@l2) = @$l2ptr;
    -   return 0 if $#l1 != $#l2;
    -
    -   print "l1 is: @l1\n";
    -   print "l2 is: @l2\n";
    -
    -   my ($i);
    -   for ($i = 0; $i <= $#l1; $i++) {
    -      print "calling subtypes with $l1[$i] and $l2[$i]\n";
    -      return 0 unless typing::subtypes ($l1[$i], $l2[$i]);
    -   }
    -   return 1;
    -}
    diff --git a/Tools/F_scripts/f90doc/utils.pl b/Tools/F_scripts/f90doc/utils.pl
    deleted file mode 100644
    index 8e409f0db1c..00000000000
    --- a/Tools/F_scripts/f90doc/utils.pl
    +++ /dev/null
    @@ -1,87 +0,0 @@
    -package utils;
    -
    -use strict;
    -
    -sub copy_list {
    -   my ($listref) = @_;
    -   my @list;
    -   @list = @$listref;
    -   \@list;
    -}
    -
    -sub copy_hash {
    -   my ($hashref) = @_;
    -   my %hash;
    -   %hash = %$hashref;
    -   \%hash;
    -}
    -
    -sub hash2str {
    -   my ($hash) = @_;
    -   my ($key, $s);
    -   $s = "{\n";
    -   foreach $key (keys %$hash) {
    -      $s .= "   $key => $hash->{$key}\n";
    -   }
    -   $s .= "}";
    -}
    -
    -sub trim {
    -   my ($s) = @_;
    -   $s =~ s/^\s*//;
    -   $s =~ s/\s*$//;
    -   $s;
    -}
    -
    -# balsplit (sep, string) splits string into pieces divided by sep when
    -# sep is "outside" ()s.  Returns a list just like split.
    -sub balsplit {
    -   my ($sep, $str) = @_;
    -   my ($i, $c);
    -   my ($len, $level, $left) = (length ($str), 0, 0);
    -   my (@list) = ();
    -
    -   for ($i = 0; $i < $len; $i++) {
    -      $c = substr ($str, $i, 1);
    -      if ($c eq "(") {
    -         $level++;
    -      } elsif ($c eq ")") {
    -         $level--;
    -         die "balsplit: Unbalanced parens (too many )'s)" if $level < 0;
    -      } elsif ($c eq $sep && $level == 0) {
    -         push (@list, substr ($str, $left, $i-$left));
    -         $left = $i + 1;
    -      }
    -   }
    -
    -   push (@list, substr ($str, $left));
    -   return @list;
    -}
    -
    -# Takes the first word of each element of the list.
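     -# E.g. (illustrative): leftword (["foo bar", "baz qux"]) returns ("foo", "baz").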
    -sub leftword {
    -   my ($listref) = @_;
    -   my @out = ();
    -   my ($x);
    -   foreach $x (@$listref) {
    -      $x =~ s/^\s*//;
    -      $x =~ /^\w*/;
    -      push (@out, $&);
    -   }
    -   @out;
    -}
    -
    -sub remove_blanks {
    -   my ($listref) = @_;
    -   my @out = ();
    -   my ($x);
    -   foreach $x (@$listref) {
    -      push (@out, $x) unless $x =~ /^\s*$/;
    -   }
    -   @out;
    -}
    -
    -sub do_nothing {
    -}
    -
    -1;
    diff --git a/Tools/F_scripts/fcheck.py b/Tools/F_scripts/fcheck.py
    index 20033f85ac9..f5be4efd726 100755
    --- a/Tools/F_scripts/fcheck.py
    +++ b/Tools/F_scripts/fcheck.py
    @@ -1,4 +1,4 @@
    -#!/usr/bin/env python
    +#!/usr/bin/env python3
     
     # a simple routine to parse Fortran files and make sure that things are
     # declared double precision, and constants are of the form 1.0_dp_t or
    @@ -122,9 +122,3 @@ def visit(argFiles, dirname, files):
     
             if (badFile == 1):
                 print " "
    -
    -
    -
    -
    -
    -
    diff --git a/Tools/F_scripts/find_files_vpath.py b/Tools/F_scripts/find_files_vpath.py
    index c9dd5485930..a52d0f28f3d 100755
    --- a/Tools/F_scripts/find_files_vpath.py
    +++ b/Tools/F_scripts/find_files_vpath.py
    @@ -1,12 +1,10 @@
    -#!/usr/bin/env python
    +#!/usr/bin/env python3
     
     """
      Take a vpath and a list of files, and find where in the vpath the
      first occurrence of each file appears.
     """
     
    -from __future__ import print_function
    -
     import sys
     import os
     import argparse
    diff --git a/Tools/F_scripts/findparams.py b/Tools/F_scripts/findparams.py
    index 70280b134de..79d698ade8d 100755
    --- a/Tools/F_scripts/findparams.py
    +++ b/Tools/F_scripts/findparams.py
    @@ -1,6 +1,4 @@
    -#!/usr/bin/env python
    -
    -from __future__ import print_function
    +#!/usr/bin/env python3
     
     import sys
     import os
    diff --git a/Tools/F_scripts/makebuildinfo.py b/Tools/F_scripts/makebuildinfo.py
    index e5f206339b2..4d08a571145 100755
    --- a/Tools/F_scripts/makebuildinfo.py
    +++ b/Tools/F_scripts/makebuildinfo.py
    @@ -1,4 +1,4 @@
    -#!/usr/bin/env python
    +#!/usr/bin/env python3
     
     # a simple script that writes the build_info.f90 file that is used
     # to store information for the job_info file that we store in plotfiles.
    diff --git a/Tools/F_scripts/write_probin.py b/Tools/F_scripts/write_probin.py
    index 10ec4489066..54729eb5f5e 100755
    --- a/Tools/F_scripts/write_probin.py
    +++ b/Tools/F_scripts/write_probin.py
    @@ -1,4 +1,4 @@
    -#!/usr/bin/env python
    +#!/usr/bin/env python3
     
     """This routine parses plain-text parameter files that list runtime
     parameters for use in our codes.  The general format of a parameter
    @@ -24,8 +24,6 @@
     
     """
     
    -from __future__ import print_function
    -
     import os
     import sys
     import argparse
    diff --git a/Tools/GNUMake/Make.defs b/Tools/GNUMake/Make.defs
    index db1ce350e54..f3f712816a6 100644
    --- a/Tools/GNUMake/Make.defs
    +++ b/Tools/GNUMake/Make.defs
    @@ -1,8 +1,3 @@
    -# Check python version
    -my_python_version := $(word 2, $(shell python --version 2>&1))
    -ifneq ($(firstword $(sort 2.7 $(my_python_version))), 2.7)
    -  $(error Python >= 2.7 required! Your version is $(my_python_version))
    -endif
     
     ifneq (,$(findstring ~,$(AMREX_HOME)))
        $(warning *** AMREX_HOME string contains ~ and make will not like it. So it is replaced.)
    @@ -762,6 +757,7 @@ else ifeq ($(USE_CUDA),TRUE)
             LINK_WITH_FORTRAN_COMPILER=TRUE
         endif
     
    +    $(info Loading $(AMREX_HOME)/Tools/GNUMake/comps/nvcc.mak...)
         include $(AMREX_HOME)/Tools/GNUMake/comps/nvcc.mak
     
         ifeq ($(USE_MPI),TRUE)
    @@ -971,17 +967,17 @@ endif
     F90CACHE =
     
     ifeq ($(TP_PROFILING),VTUNE)
    -  $(into Loading $(AMREX_HOME)/Tools/GNUMake/tools/Make.vtune
     +  $(info Loading $(AMREX_HOME)/Tools/GNUMake/tools/Make.vtune)
       include        $(AMREX_HOME)/Tools/GNUMake/tools/Make.vtune
     endif
     
     ifeq ($(TP_PROFILING),CRAYPAT)
    -  $(into Loading $(AMREX_HOME)/Tools/GNUMake/tools/Make.craypat
     +  $(info Loading $(AMREX_HOME)/Tools/GNUMake/tools/Make.craypat)
       include        $(AMREX_HOME)/Tools/GNUMake/tools/Make.craypat
     endif
     
     ifeq ($(TP_PROFILING),FORGE)
    -  $(into Loading $(AMREX_HOME)/Tools/GNUMake/tools/Make.forge
     +  $(info Loading $(AMREX_HOME)/Tools/GNUMake/tools/Make.forge)
       include        $(AMREX_HOME)/Tools/GNUMake/tools/Make.forge
     endif
     
    diff --git a/Tools/GNUMake/Make.machines b/Tools/GNUMake/Make.machines
    index 6903ba05125..738461965d0 100644
    --- a/Tools/GNUMake/Make.machines
    +++ b/Tools/GNUMake/Make.machines
    @@ -60,11 +60,22 @@ ifdef OLCF_ROCM_ROOT
         which_site := olcf
         which_computer := crusher
       endif
    +
    +  ifeq ($(findstring frontier, $(host_name)), frontier)
    +    which_site := olcf
    +    which_computer := frontier
    +  endif
     endif
     
    -ifeq ($(findstring theta, $(host_name)), theta)
    -  which_site := alcf
    -  which_computer := theta
    +ifeq ($(findstring alcf.anl.gov, $(host_name)),alcf.anl.gov)
    +  ifeq ($(findstring theta, $(host_name)), theta)
    +    which_site := alcf
    +    which_computer := theta
    +  endif
    +  ifeq ($(findstring polaris, $(host_name)), polaris)
    +    which_site := alcf
    +    which_computer := polaris
    +  endif
     endif
     
     ifeq ($(findstring sierra, $(host_name)), sierra)
    diff --git a/Tools/GNUMake/Make.rules b/Tools/GNUMake/Make.rules
    index 8b014678500..48ef6d9d3f8 100644
    --- a/Tools/GNUMake/Make.rules
    +++ b/Tools/GNUMake/Make.rules
    @@ -441,7 +441,7 @@ $(tmpEXETempDir)/%.F.orig: %.F
     # & --> *
     $(tmpEXETempDir)/%-cppd.h: %.H
     	@if [ ! -d $(tmpEXETempDir) ]; then mkdir -p $(tmpEXETempDir); fi
    -	$(SILENT) $(CC) $(CPPFLAGS) -DAMREX_TYPECHECK $(includes) -E -P -x c -std=c99 $< -o $@
    +	$(SILENT) $(CC) $(CPPFLAGS) -DAMREX_TYPECHECK $(includes) -E -P -x c -std=c11 $< -o $@
     	@$(SHELL) -ec 'sed -i -e '\''s/amrex::Real/$(amrex_real)/g'\'' $@ ; \
     	               sed -i -e '\''s/amrex_real/$(amrex_real)/g'\''  $@ ; \
     	               sed -i -e '\''s/amrex_particle_real/$(amrex_particle_real)/g'\''  $@ ; \
    @@ -512,9 +512,14 @@ endif
     # e.g. libraries, simply do "make print-libraries".  This will
     # print out the value.
     print-%:
    -	@echo $* is '$($*)'
    +	@echo $* is "$($*)"
     	@echo '    origin = $(origin $*)'
    -	@echo '     value = $(value  $*)'
    +	@echo '     value = $(subst ','"'"',$(value  $*))'
    +# We need to use subst on the result of $(value) because it contains single
     +# quotes.  The shell command echo does not like things like 'x'$(filter-out)'y',
     +# because what it sees is 'x', $(filter-out), and 'y'.  With the substitution, it
    +# will see 'x', "'", '$(filter-out)', "'", and 'y', with $(filter-out) inside a
    +# pair of single quotes.
     
     .PHONY: help
     help:
    diff --git a/Tools/GNUMake/comps/armclang.mak b/Tools/GNUMake/comps/armclang.mak
    index efe4a718106..d2826cb1134 100644
    --- a/Tools/GNUMake/comps/armclang.mak
    +++ b/Tools/GNUMake/comps/armclang.mak
    @@ -57,18 +57,18 @@ ifeq ($(WARN_ERROR),TRUE)
     endif
     
     # disable some warnings
    -CXXFLAGS += -Wno-pass-failed -Wno-c++17-extensions
    +CXXFLAGS += -Wno-c++17-extensions
     
     ########################################################################
     
     ifdef CXXSTD
       CXXSTD := $(strip $(CXXSTD))
     else
    -  CXXSTD := c++14
    +  CXXSTD := c++17
     endif
     
     CXXFLAGS += -std=$(CXXSTD)
    -CFLAGS   += -std=c99
    +CFLAGS   += -std=c11
     
     FMODULES = -J$(fmoddir) -I $(fmoddir)
     
    diff --git a/Tools/GNUMake/comps/cray.mak b/Tools/GNUMake/comps/cray.mak
    index 85a1133e412..cf484e6ec38 100644
    --- a/Tools/GNUMake/comps/cray.mak
    +++ b/Tools/GNUMake/comps/cray.mak
    @@ -53,10 +53,10 @@ else
         # CCE <= 8. So we adjust some flags to achieve similar optimization. See
         # this page:
         # http://pubs.cray.com/content/S-5212/9.0/cray-compiling-environment-cce-release-overview/cce-900-software-enhancements
    -    CXXFLAGS += -O2 -ffast-math #-fsave-loopmark -fsave-decompile
    -    CFLAGS   += -O2 -ffast-math #-fsave-loopmark -fsave-decompile
    -    FFLAGS   += -O2 -h list=a
    -    F90FLAGS += -O2 -h list=a
    +    CXXFLAGS += -O3 -ffast-math #-fsave-loopmark -fsave-decompile
    +    CFLAGS   += -O3 -ffast-math #-fsave-loopmark -fsave-decompile
    +    FFLAGS   += -O3 -h list=a
    +    F90FLAGS += -O3 -h list=a
       else
         GENERIC_COMP_FLAGS += -h list=a
     
    @@ -73,15 +73,15 @@ endif
     ifdef CXXSTD
       CXXSTD := $(strip $(CXXSTD))
     else
    -  CXXSTD := c++14
    +  CXXSTD := c++17
     endif
     
     ifeq ($(CRAY_IS_CLANG_BASED),TRUE)
       CXXFLAGS += -std=$(CXXSTD)
    -  CFLAGS   += -std=c99
    +  CFLAGS   += -std=c11
     else
       CXXFLAGS += -h std=$(CXXSTD)
    -  CFLAGS   += -h c99
    +  CFLAGS   += -h c11
     endif
     
     F90FLAGS += -N 255 -em
    @@ -119,10 +119,6 @@ else
       endif
     endif
     
    -ifeq ($(CRAY_IS_CLANG_BASED),TRUE)
    -  CXXFLAGS += -Wno-pass-failed -Wno-c++17-extensions
    -endif
    -
     CXXFLAGS += $(GENERIC_COMP_FLAGS)
     CFLAGS   += $(GENERIC_COMP_FLAGS)
     FFLAGS   += $(GENERIC_COMP_FLAGS)
    diff --git a/Tools/GNUMake/comps/dpcpp.mak b/Tools/GNUMake/comps/dpcpp.mak
    index d2f7f72108e..33c05fc0c7a 100644
    --- a/Tools/GNUMake/comps/dpcpp.mak
    +++ b/Tools/GNUMake/comps/dpcpp.mak
    @@ -36,8 +36,6 @@ else
     
     endif
     
    -CXXFLAGS += -Wno-pass-failed # disable this warning
    -
     ifeq ($(WARN_ALL),TRUE)
       warning_flags = -Wall -Wextra -Wno-sign-compare -Wunreachable-code -Wnull-dereference
       warning_flags += -Wfloat-conversion -Wextra-semi
    @@ -71,7 +69,7 @@ else
     endif
     
     CXXFLAGS += -Wno-error=sycl-strict -fsycl
    -CFLAGS   += -std=c99
    +CFLAGS   += -std=c11
     
      ifneq ($(DEBUG),TRUE)  # There is currently a bug where the DEBUG build will crash.
     ifeq ($(DPCPP_AOT),TRUE)
    diff --git a/Tools/GNUMake/comps/gnu.mak b/Tools/GNUMake/comps/gnu.mak
    index 10510f30a8d..2d67d418717 100644
    --- a/Tools/GNUMake/comps/gnu.mak
    +++ b/Tools/GNUMake/comps/gnu.mak
    @@ -38,23 +38,23 @@ ifeq ($(EXPORT_DYNAMIC),TRUE)
       GENERIC_GNU_FLAGS += -rdynamic -fno-omit-frame-pointer
     endif
     
    -gcc_major_ge_5 = $(shell expr $(gcc_major_version) \>= 5)
    -gcc_major_ge_6 = $(shell expr $(gcc_major_version) \>= 6)
    -gcc_major_ge_7 = $(shell expr $(gcc_major_version) \>= 7)
     gcc_major_ge_8 = $(shell expr $(gcc_major_version) \>= 8)
     gcc_major_ge_9 = $(shell expr $(gcc_major_version) \>= 9)
     gcc_major_ge_10 = $(shell expr $(gcc_major_version) \>= 10)
     gcc_major_ge_11 = $(shell expr $(gcc_major_version) \>= 11)
    +gcc_major_ge_12 = $(shell expr $(gcc_major_version) \>= 12)
    +
    +ifneq ($(gcc_major_ge_8),1)
    +  $(error GCC < 8 not supported)
    +endif
     
     ifeq ($(THREAD_SANITIZER),TRUE)
       GENERIC_GNU_FLAGS += -fsanitize=thread
     endif
     ifeq ($(FSANITIZER),TRUE)
       GENERIC_GNU_FLAGS += -fsanitize=address -fsanitize=undefined
    -  ifeq ($(gcc_major_ge_8),1)
    -    GENERIC_GNU_FLAGS += -fsanitize=pointer-compare -fsanitize=pointer-subtract
    -    GENERIC_GNU_FLAGS += -fsanitize=builtin -fsanitize=pointer-overflow
    -  endif
    +  GENERIC_GNU_FLAGS += -fsanitize=pointer-compare -fsanitize=pointer-subtract
    +  GENERIC_GNU_FLAGS += -fsanitize=builtin -fsanitize=pointer-overflow
     endif
     
     ifeq ($(USE_OMP),TRUE)
    @@ -97,7 +97,7 @@ else
     endif
     
     ifeq ($(WARN_ALL),TRUE)
    -  warning_flags = -Wall -Wextra
    +  warning_flags = -Wall -Wextra -Wlogical-op -Wfloat-conversion -Wnull-dereference -Wmisleading-indentation -Wduplicated-cond -Wduplicated-branches
     
       ifeq ($(WARN_SIGN_COMPARE),FALSE)
         warning_flags += -Wno-sign-compare
    @@ -108,27 +108,15 @@ ifeq ($(WARN_ALL),TRUE)
         warning_flags += -Wpedantic
       endif
     
    -  ifeq ($(gcc_major_ge_6),1)
    -    warning_flags += -Wnull-dereference
    -  endif
    -
    -  ifeq ($(gcc_major_ge_5),1)
    -    warning_flags += -Wfloat-conversion
    -  endif
    -
       ifneq ($(WARN_SHADOW),FALSE)
         warning_flags += -Wshadow
       endif
     
    -  ifeq ($(gcc_major_version),7)
    -    warning_flags += -Wno-array-bounds
    -  endif
    -
        ifeq ($(gcc_major_ge_10),1)
         warning_flags += -Wextra-semi
       endif
     
    -  CXXFLAGS += $(warning_flags) -Woverloaded-virtual
    +  CXXFLAGS += $(warning_flags) -Woverloaded-virtual -Wnon-virtual-dtor
       CFLAGS += $(warning_flags)
     endif
     
    @@ -157,21 +145,12 @@ endif
     
     ifdef CXXSTD
       CXXSTD := $(strip $(CXXSTD))
    -  ifeq ($(shell expr $(gcc_major_version) \< 5),1)
    -    ifneq ($(NO_CONFIG_CHECKING),TRUE)
    -      ifeq ($(CXXSTD),c++14)
    -        $(error C++14 support requires GCC 5 or newer.)
    -      endif
    -    endif
    -  endif
       CXXFLAGS += -std=$(CXXSTD)
     else
    -  ifeq ($(gcc_major_version),5)
    -    CXXFLAGS += -std=c++14
    -  endif
    +  CXXFLAGS += -std=c++17
     endif
     
    -CFLAGS   += -std=gnu99
    +CFLAGS   += -std=c11
     
     ########################################################################
     
    diff --git a/Tools/GNUMake/comps/hip.mak b/Tools/GNUMake/comps/hip.mak
    index d94f8f3c66f..6005409f9cc 100644
    --- a/Tools/GNUMake/comps/hip.mak
    +++ b/Tools/GNUMake/comps/hip.mak
    @@ -23,7 +23,7 @@ endif
     
     # Generic flags, always used
     CXXFLAGS = -std=$(CXXSTD) -m64
    -CFLAGS   = -std=c99 -m64
    +CFLAGS   = -std=c11 -m64
     
     FFLAGS   = -ffixed-line-length-none -fno-range-check -fno-second-underscore
     F90FLAGS = -ffree-line-length-none -fno-range-check -fno-second-underscore -fimplicit-none
    @@ -86,8 +86,6 @@ ifeq ($(HIP_COMPILER),clang)
     
       endif
     
    -  CXXFLAGS += -Wno-pass-failed  # disable this warning
    -
       ifeq ($(WARN_ALL),TRUE)
         warning_flags = -Wall -Wextra -Wunreachable-code -Wnull-dereference
         warning_flags += -Wfloat-conversion -Wextra-semi
    @@ -109,7 +107,7 @@ ifeq ($(HIP_COMPILER),clang)
     
       # Generic HIP info
       ROC_PATH=$(realpath $(dir $(HIP_PATH)))
    -  SYSTEM_INCLUDE_LOCATIONS += $(HIP_PATH)/include
    +  SYSTEM_INCLUDE_LOCATIONS += $(ROC_PATH)/include $(HIP_PATH)/include
     
       # rocRand
       SYSTEM_INCLUDE_LOCATIONS += $(ROC_PATH)/include/hiprand $(ROC_PATH)/include/rocrand
    @@ -122,13 +120,12 @@ ifeq ($(HIP_COMPILER),clang)
       # rocThrust - Header only
       # SYSTEM_INCLUDE_LOCATIONS += $(ROC_PATH)/include/rocthrust
     
    -  ifeq ($(USE_ROCTX),TRUE)
       # rocTracer
    -  CXXFLAGS += -DAMREX_USE_ROCTX
    -  HIPCC_FLAGS += -DAMREX_USE_ROCTX
    -  SYSTEM_INCLUDE_LOCATIONS += $(ROC_PATH)/include/roctracer $(ROC_PATH)/include/rocprofiler
    -  LIBRARY_LOCATIONS += $(ROC_PATH)/lib
    -  LIBRARIES += -lroctracer64 -lroctx64
    +  ifeq ($(USE_ROCTX),TRUE)
    +    CXXFLAGS += -DAMREX_USE_ROCTX
    +    HIPCC_FLAGS += -DAMREX_USE_ROCTX
    +    LIBRARY_LOCATIONS += $(ROC_PATH)/lib
    +    LIBRARIES += -Wl,--rpath=$(ROC_PATH)/lib -lroctracer64 -lroctx64
       endif
     
       # hipcc passes a lot of unused arguments to clang
    diff --git a/Tools/GNUMake/comps/intel.mak b/Tools/GNUMake/comps/intel.mak
    index 0c4d6e30b2a..2341192d163 100644
    --- a/Tools/GNUMake/comps/intel.mak
    +++ b/Tools/GNUMake/comps/intel.mak
    @@ -39,21 +39,12 @@ endif
     
     ifdef CXXSTD
       CXXSTD := $(strip $(CXXSTD))
    -  ifneq ($(firstword $(sort 17.0 $(intel_version))), 17.0)
    -    ifeq ($(CXXSTD),c++14)
    -      $(error C++14 support requires Intel icpc 17.0 or newer.)
    -    endif
    -  endif
       CXXFLAGS += -std=$(CXXSTD)
     else
    -  ifeq ($(firstword $(sort 17.0 $(intel_version))), 17.0)
    -    CXXFLAGS += -std=c++14
    -  else
    -    $(error Intel icpc 17.0 or newer is required.)
    -  endif
    +  CXXFLAGS += -std=c++17
     endif
     
    -CFLAGS   += -std=c99
    +CFLAGS   += -std=c11
     
     F90FLAGS += -implicitnone
     
    @@ -64,11 +55,7 @@ FMODULES = -module $(fmoddir) -I$(fmoddir)
     GENERIC_COMP_FLAGS =
     
     ifeq ($(USE_OMP),TRUE)
    -  ifeq ($(firstword $(sort 16.0 $(intel_version))), 16.0) 
    -    GENERIC_COMP_FLAGS += -qopenmp
    -  else
    -    GENERIC_COMP_FLAGS += -openmp
    -  endif
    +  GENERIC_COMP_FLAGS += -qopenmp
     endif
     
     CXXFLAGS += $(GENERIC_COMP_FLAGS) -pthread
    diff --git a/Tools/GNUMake/comps/llvm-flang.mak b/Tools/GNUMake/comps/llvm-flang.mak
    index 58a0a06b64e..c9abdaaaeeb 100644
    --- a/Tools/GNUMake/comps/llvm-flang.mak
    +++ b/Tools/GNUMake/comps/llvm-flang.mak
    @@ -43,11 +43,11 @@ endif
     ifdef CXXSTD
       CXXSTD := $(strip $(CXXSTD))
     else
    -  CXXSTD := c++14
    +  CXXSTD := c++17
     endif
     
     CXXFLAGS += -std=$(CXXSTD)
    -CFLAGS   += -std=c99
    +CFLAGS   += -std=c11
     
     FMODULES = -J$(fmoddir) -I $(fmoddir)
     
    diff --git a/Tools/GNUMake/comps/llvm.mak b/Tools/GNUMake/comps/llvm.mak
    index 2bf710c0d94..ead1d9290c2 100644
    --- a/Tools/GNUMake/comps/llvm.mak
    +++ b/Tools/GNUMake/comps/llvm.mak
    @@ -50,7 +50,7 @@ ifeq ($(WARN_ALL),TRUE)
         warning_flags += -Wshadow
       endif
     
    -  CXXFLAGS += $(warning_flags) -Woverloaded-virtual
    +  CXXFLAGS += $(warning_flags) -Woverloaded-virtual -Wnon-virtual-dtor
       CFLAGS += $(warning_flags)
     endif
     
    @@ -60,18 +60,18 @@ ifeq ($(WARN_ERROR),TRUE)
     endif
     
     # disable some warnings
    -CXXFLAGS += -Wno-pass-failed -Wno-c++17-extensions
    +CXXFLAGS += -Wno-c++17-extensions
     
     ########################################################################
     
     ifdef CXXSTD
       CXXSTD := $(strip $(CXXSTD))
     else
    -  CXXSTD := c++14
    +  CXXSTD := c++17
     endif
     
     CXXFLAGS += -std=$(CXXSTD)
    -CFLAGS   += -std=c99
    +CFLAGS   += -std=c11
     
     FFLAGS   += -ffixed-line-length-none -fno-range-check -fno-second-underscore
     F90FLAGS += -ffree-line-length-none -fno-range-check -fno-second-underscore -fimplicit-none
    diff --git a/Tools/GNUMake/comps/nag.mak b/Tools/GNUMake/comps/nag.mak
    index faaf0db7155..55ec14b0620 100644
    --- a/Tools/GNUMake/comps/nag.mak
    +++ b/Tools/GNUMake/comps/nag.mak
    @@ -52,17 +52,12 @@ endif
     
     ifdef CXXSTD
       CXXSTD := $(strip $(CXXSTD))
    -  ifeq ($(shell expr $(gcc_major_version) \< 5),1)
    -    ifeq ($(CXXSTD),c++14)
    -      $(error C++14 support requires GCC 5 or newer.)
    -    endif
    -  endif
       CXXFLAGS += -std=$(CXXSTD)
     else
    -  CXXFLAGS += -std=c++14
    +  CXXFLAGS += -std=c++17
     endif
     
    -CFLAGS   += -std=gnu99
    +CFLAGS   += -std=c11
     
     FFLAGS   += -mismatch
     F90FLAGS += -mismatch -u
    diff --git a/Tools/GNUMake/comps/nvcc.mak b/Tools/GNUMake/comps/nvcc.mak
    index 9d9bf90ce51..f52dfeb6c86 100644
    --- a/Tools/GNUMake/comps/nvcc.mak
    +++ b/Tools/GNUMake/comps/nvcc.mak
    @@ -10,21 +10,11 @@ else
       nvcc_minor_version := 9
     endif
     
    -# Disallow CUDA toolkit versions < 10
    +# Disallow CUDA toolkit versions < 11
     
    -nvcc_major_lt_10 = $(shell expr $(nvcc_major_version) \< 10)
    -ifeq ($(nvcc_major_lt_10),1)
    -  $(error Your nvcc version is $(nvcc_version). This is unsupported. Please use CUDA toolkit version 10.0 or newer.)
    -endif
    -
    -nvcc_forward_unknowns = 0
    -ifeq ($(shell expr $(nvcc_major_version) \= 10),1)
    -ifeq ($(shell expr $(nvcc_minor_version) \>= 2),1)
    -  nvcc_forward_unknowns = 1
    -endif
    -endif
    -ifeq ($(shell expr $(nvcc_major_version) \>= 11),1)
    -  nvcc_forward_unknowns = 1
    +nvcc_major_lt_11 = $(shell expr $(nvcc_major_version) \< 11)
    +ifeq ($(nvcc_major_lt_11),1)
    +  $(error Your nvcc version is $(nvcc_version). This is unsupported. Please use CUDA toolkit version 11.0 or newer.)
     endif
     
     ifeq ($(shell expr $(nvcc_major_version) \= 11),1)
    @@ -34,24 +24,6 @@ ifeq ($(shell expr $(nvcc_minor_version) \= 0),1)
     endif
     endif
     
    -ifeq ($(shell expr $(nvcc_major_version) \< 11),1)
    -  # -MMD -MP not supported in < 11
    -  USE_LEGACY_DEPFLAGS = TRUE
    -  DEPFLAGS =
    -endif
    -
    -ifeq ($(shell expr $(nvcc_major_version) \< 10),1)
    -  # -MM not supported in < 10
    -  LEGACY_DEPFLAGS = -M
    -endif
    -
    -ifeq ($(shell expr $(nvcc_major_version) \= 10),1)
    -ifeq ($(shell expr $(nvcc_minor_version) \= 0),1)
    -  # -MM not supported in 10.0
    -  LEGACY_DEPFLAGS = -M
    -endif
    -endif
    -
     #
     # nvcc compiler driver does not always accept pgc++
     # as a host compiler at present. However, if we're using
    @@ -72,16 +44,14 @@ endif
     
     ifeq ($(lowercase_nvcc_host_comp),gnu)
     
    -  ifeq ($(shell expr $(gcc_major_version) \< 5),1)
    -    ifneq ($(NO_CONFIG_CHECKING),TRUE)
    -      $(error C++14 support requires GCC 5 or newer.)
    -    endif
    +  ifeq ($(shell expr $(gcc_major_version) \< 8),1)
    +    $(error GCC >= 8 required.)
       endif
     
       ifdef CXXSTD
         CXXSTD := $(strip $(CXXSTD))
       else
    -    CXXSTD = c++14
    +    CXXSTD = c++17
       endif
       CXXFLAGS += -std=$(CXXSTD)
     
    @@ -95,27 +65,22 @@ ifeq ($(lowercase_nvcc_host_comp),gnu)
     else ifeq ($(lowercase_nvcc_host_comp),pgi)
       ifdef CXXSTD
         CXXSTD := $(strip $(CXXSTD))
    -    ifeq ($(shell expr $(gcc_major_version) \< 5),1)
    -      ifeq ($(CXXSTD),c++14)
    -        $(error C++14 support requires GCC 5 or newer.)
    -      endif
    -    endif
       else
    -    CXXSTD := c++14
    +    CXXSTD := c++17
       endif
     
       CXXFLAGS += -std=$(CXXSTD)
     
       NVCC_CCBIN ?= pgc++
     
    -  # In pgi.make, we use gcc_major_version to handle c++14 flag.
     +  # In pgi.make, we use gcc_major_version to handle the c++17 flag.
       CXXFLAGS_FROM_HOST := -ccbin=$(NVCC_CCBIN) -Xcompiler='$(CXXFLAGS)' --std=$(CXXSTD)
       CFLAGS_FROM_HOST := $(CXXFLAGS_FROM_HOST)
     else
       ifdef CXXSTD
         CXXSTD := $(strip $(CXXSTD))
       else
    -    CXXSTD := c++14
    +    CXXSTD := c++17
       endif
     
       NVCC_CCBIN ?= $(CXX)
    @@ -124,7 +89,7 @@ else
       CFLAGS_FROM_HOST := $(CXXFLAGS_FROM_HOST)
     endif
     
    -NVCC_FLAGS = -Wno-deprecated-gpu-targets -m64 -arch=compute_$(CUDA_ARCH) -code=sm_$(CUDA_ARCH) -maxrregcount=$(CUDA_MAXREGCOUNT) --expt-relaxed-constexpr --expt-extended-lambda
    +NVCC_FLAGS = -Wno-deprecated-gpu-targets -m64 -arch=compute_$(CUDA_ARCH) -code=sm_$(CUDA_ARCH) -maxrregcount=$(CUDA_MAXREGCOUNT) --expt-relaxed-constexpr --expt-extended-lambda --forward-unknown-to-host-compiler
     # This is to work around a bug with nvcc, see: https://github.com/kokkos/kokkos/issues/1473
     NVCC_FLAGS += -Xcudafe --diag_suppress=esa_on_defaulted_function_ignored
     
    @@ -154,11 +119,6 @@ endif
     
     NVCC_FLAGS += $(XTRA_NVCC_FLAGS)
     
    -ifeq ($(nvcc_forward_unknowns),1)
    -  NVCC_FLAGS += --forward-unknown-to-host-compiler
    -endif
    -
    -ifeq ($(shell expr $(nvcc_major_version) \>= 11),1)
     ifeq ($(GPU_ERROR_CAPTURE_THIS),TRUE)
       NVCC_FLAGS += --Werror ext-lambda-captures-this
     else
    @@ -166,7 +126,6 @@ ifeq ($(GPU_WARN_CAPTURE_THIS),TRUE)
       NVCC_FLAGS += --Wext-lambda-captures-this
     endif
     endif
    -endif
     
     nvcc_diag_error = 0
     ifeq ($(shell expr $(nvcc_major_version) \>= 12),1)
    diff --git a/Tools/GNUMake/comps/nvhpc.mak b/Tools/GNUMake/comps/nvhpc.mak
    index 49f815213f1..d76e7c9d36e 100644
    --- a/Tools/GNUMake/comps/nvhpc.mak
    +++ b/Tools/GNUMake/comps/nvhpc.mak
    @@ -94,19 +94,15 @@ endif
     # The logic here should be consistent with what's in nvcc.mak
     ifdef CXXSTD
       CXXSTD := $(strip $(CXXSTD))
    -  ifeq ($(shell expr $(gcc_major_version) \< 5),1)
    -    ifeq ($(CXXSTD),c++14)
    -      $(error C++14 support requires GCC 5 or newer.)
    -    endif
    +  ifeq ($(shell expr $(gcc_major_version) \< 8),1)
    +    $(error C++17 support requires GCC 8 or newer.)
       endif
       CXXFLAGS += -std=$(CXXSTD)
     else
    -  ifeq ($(gcc_major_version),5)
    -    CXXFLAGS += -std=c++14
    -  endif
    +  CXXFLAGS += -std=c++17
     endif
     
    -CFLAGS   += -c99
    +CFLAGS   += -c11
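     # (-c11 selects the C11 language standard in the NVHPC C compiler,
     # replacing the earlier -c99.)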
     
     CXXFLAGS += $(GENERIC_NVHPC_FLAGS)
     CFLAGS   += $(GENERIC_NVHPC_FLAGS)
    diff --git a/Tools/GNUMake/comps/pgi.mak b/Tools/GNUMake/comps/pgi.mak
    index 0cf50d77287..d2736c71a33 100644
    --- a/Tools/GNUMake/comps/pgi.mak
    +++ b/Tools/GNUMake/comps/pgi.mak
    @@ -87,20 +87,18 @@ endif
     
     # The logic here should be consistent with what's in nvcc.mak
     
    -ifeq ($(shell expr $(gcc_major_version) \< 5),1)
    -  $(error C++14 support requires GCC 5 or newer.)
    +ifeq ($(shell expr $(gcc_major_version) \< 8),1)
    +  $(error C++17 support requires GCC 8 or newer.)
     endif
     
     ifdef CXXSTD
       CXXSTD := $(strip $(CXXSTD))
       CXXFLAGS += -std=$(CXXSTD)
     else
    -  ifeq ($(gcc_major_version),5)
    -    CXXFLAGS += -std=c++14
    -  endif
    +  CXXFLAGS += -std=c++17
     endif
     
    -CFLAGS   += -c99
    +CFLAGS   += -c11
     
     CXXFLAGS += $(GENERIC_PGI_FLAGS)
     CFLAGS   += $(GENERIC_PGI_FLAGS)
    diff --git a/Tools/GNUMake/packages/Make.hdf5 b/Tools/GNUMake/packages/Make.hdf5
    index d09fe43a082..9d54463ce4e 100644
    --- a/Tools/GNUMake/packages/Make.hdf5
    +++ b/Tools/GNUMake/packages/Make.hdf5
    @@ -27,8 +27,9 @@ ifeq ($(USE_HDF5_ZFP),TRUE)
           ZFP_ABSPATH = $(abspath $(ZFP_HOME))
           H5Z_ABSPATH = $(abspath $(H5Z_HOME))
           INCLUDE_LOCATIONS += $(ZFP_ABSPATH)/include $(H5Z_ABSPATH)/include
    -      LIBRARY_LOCATIONS += $(ZFP_ABSPATH)/lib $(H5Z_ABSPATH)/lib
    +      LIBRARY_LOCATIONS += $(ZFP_ABSPATH)/lib $(ZFP_ABSPATH)/lib64 $(H5Z_ABSPATH)/lib
           LDFLAGS += -Xlinker -rpath -Xlinker $(ZFP_ABSPATH)/lib
    +      LDFLAGS += -Xlinker -rpath -Xlinker $(ZFP_ABSPATH)/lib64
         endif
       endif
     endif
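     # (Some distributions install ZFP's libraries under lib64 rather than
     # lib, so both directories are searched and added to the rpath.)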
    diff --git a/Tools/GNUMake/packages/Make.hypre b/Tools/GNUMake/packages/Make.hypre
    index 11e0690a67e..d2cc0d7c17a 100644
    --- a/Tools/GNUMake/packages/Make.hypre
    +++ b/Tools/GNUMake/packages/Make.hypre
    @@ -19,5 +19,5 @@ ifdef AMREX_HYPRE_HOME
     endif
     
     ifeq ($(USE_CUDA),TRUE)
    -  LIBRARIES += -lcusparse -lcurand
    +  LIBRARIES += -lcusparse -lcurand -lcublas
     endif
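     # (hypre built with CUDA support depends on cuBLAS as well as cuSPARSE
     # and cuRAND.)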
    diff --git a/Tools/GNUMake/sites/Make.alcf b/Tools/GNUMake/sites/Make.alcf
    index 324d419ccce..cf607596515 100644
    --- a/Tools/GNUMake/sites/Make.alcf
    +++ b/Tools/GNUMake/sites/Make.alcf
    @@ -8,3 +8,78 @@ ifeq ($(which_computer),theta)
         LIBRARIES += -lmpichf90
       endif
     endif
    +
    +ifeq ($(which_computer),$(filter $(which_computer),polaris))
    +
    +  ifdef PE_ENV
    +    ifneq ($(USE_GPU),TRUE)
    +      lowercase_peenv := $(shell echo $(PE_ENV) | tr A-Z a-z)
    +      ifneq ($(lowercase_peenv),$(lowercase_comp))
    +        has_compiler_mismatch = COMP=$(COMP) does not match PrgEnv-$(lowercase_peenv)
    +      endif
    +      ifeq ($(MAKECMDGOALS),)
    +        ifeq ($(lowercase_peenv),nvidia)
    +          $(error PrgEnv-nvidia cannot be used with CPU-only builds. Try PrgEnv-gnu instead.)
    +        endif
    +      endif
    +    endif
    +  endif
    +
    +  ifeq ($(USE_CUDA),TRUE)
    +    CFLAGS += -Xcompiler='$(wordlist 2,1024,$(shell cc -craype-verbose 2> /dev/null))'
    +    CXXFLAGS += -Xcompiler='$(wordlist 2,1024,$(shell CC -craype-verbose 2> /dev/null))'
    +  else ifeq ($(USE_MPI),FALSE)
    +    CFLAGS += $(wordlist 2,1024,$(shell cc -craype-verbose 2> /dev/null))
    +    CXXFLAGS += $(wordlist 2,1024,$(shell CC -craype-verbose 2> /dev/null))
    +  endif
    +
    +  ifeq ($(USE_MPI),TRUE)
    +    ifneq ($(USE_CUDA),TRUE)
    +      CC  = cc
    +      CXX = CC
    +      FC  = ftn
    +      F90 = ftn
    +      LIBRARIES += -lmpichf90
    +    endif
    +
    +    includes += $(shell CC --cray-print-opts=cflags)
    +  endif
    +
    +  ifeq ($(USE_CUDA),TRUE)
    +    CUDA_ARCH = 80
    +
    +    ifeq ($(USE_MPI), FALSE)
    +      includes += $(CRAY_CUDATOOLKIT_INCLUDE_OPTS)
    +    endif
    +
    +    comm := ,
    +    ifneq ($(BL_NO_FORT),TRUE)
    +      LIBRARIES += $(subst -Wl$(comm),-Xlinker=,$(shell ftn --cray-print-opts=libs))
    +    else
    +      LIBRARIES += $(subst -Wl$(comm),-Xlinker=,$(shell CC --cray-print-opts=libs))
    +    endif
    +
    +    ifneq ($(CUDA_ROOT),)
    +        SYSTEM_CUDA_PATH := $(CUDA_ROOT)
    +        COMPILE_CUDA_PATH := $(CUDA_ROOT)
    +    else ifneq ($(CUDA_HOME),)
    +        SYSTEM_CUDA_PATH := $(CUDA_HOME)
    +        COMPILE_CUDA_PATH := $(CUDA_HOME)
    +    else ifneq ($(CUDA_PATH),)
    +        SYSTEM_CUDA_PATH := $(CUDA_PATH)
    +        COMPILE_CUDA_PATH := $(CUDA_PATH)
    +    else ifneq ($(NVIDIA_PATH),)
    +        SYSTEM_CUDA_PATH := $(NVIDIA_PATH)/cuda
    +        COMPILE_CUDA_PATH := $(NVIDIA_PATH)/cuda
    +    else
    +        $(error No CUDA_ROOT, CUDA_HOME, CUDA_PATH, or NVIDIA_PATH found. Please load a cuda module.)
    +    endif
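    +    # Any of these locations can also be set explicitly, e.g. (path is
    +    # illustrative):  make USE_CUDA=TRUE CUDA_ROOT=/opt/cuda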
    +
    +    # Provide system configuration information.
    +
    +    GPUS_PER_NODE=4
    +    GPUS_PER_SOCKET=4
    +
    +  endif
    +
    +endif
    \ No newline at end of file
    diff --git a/Tools/GNUMake/sites/Make.nersc b/Tools/GNUMake/sites/Make.nersc
    index c8c938a627a..426b9525887 100644
    --- a/Tools/GNUMake/sites/Make.nersc
    +++ b/Tools/GNUMake/sites/Make.nersc
    @@ -25,11 +25,16 @@ ifeq ($(which_computer),$(filter $(which_computer),perlmutter))
       endif
     
       ifeq ($(USE_CUDA),TRUE)
    -      CFLAGS += -Xcompiler="$(wordlist 2,1024,$(shell cc -craype-verbose 2> /dev/null))"
    -      CXXFLAGS += -Xcompiler="$(wordlist 2,1024,$(shell CC -craype-verbose 2> /dev/null))"
    +    ifdef NPE_VERSION
    +      CFLAGS += -Xcompiler='$(filter-out -Wl%, $(wordlist 2,1024,$(shell mpicc -show 2> /dev/null)))'
    +      CXXFLAGS += -Xcompiler='$(filter-out -Wl%, $(wordlist 2,1024,$(shell mpicxx -show 2> /dev/null)))'
    +    else
    +      CFLAGS += -Xcompiler='$(wordlist 2,1024,$(shell cc -craype-verbose 2> /dev/null))'
    +      CXXFLAGS += -Xcompiler='$(wordlist 2,1024,$(shell CC -craype-verbose 2> /dev/null))'
    +    endif
       else ifeq ($(USE_MPI),FALSE)
    -      CFLAGS += $(wordlist 2,1024,$(shell cc -craype-verbose 2> /dev/null))
    -      CXXFLAGS += $(wordlist 2,1024,$(shell CC -craype-verbose 2> /dev/null))
    +    CFLAGS += $(wordlist 2,1024,$(shell cc -craype-verbose 2> /dev/null))
    +    CXXFLAGS += $(wordlist 2,1024,$(shell CC -craype-verbose 2> /dev/null))
       endif
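     # When NPE_VERSION is defined, the flags come from the MPICH wrappers
     # (mpicc/mpicxx -show) rather than the Cray wrappers, with -Wl,...
     # linker options filtered out of the compile-time -Xcompiler flags.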
     
       ifeq ($(USE_MPI),TRUE)
    @@ -41,7 +46,9 @@ ifeq ($(which_computer),$(filter $(which_computer),perlmutter))
           LIBRARIES += -lmpichf90
         endif
     
    -    includes += $(shell CC --cray-print-opts=cflags)
    +    ifndef NPE_VERSION
    +      includes += $(shell CC --cray-print-opts=cflags)
    +    endif
       endif
     
       ifeq ($(USE_CUDA),TRUE)
    @@ -51,11 +58,23 @@ ifeq ($(which_computer),$(filter $(which_computer),perlmutter))
           includes += $(CRAY_CUDATOOLKIT_INCLUDE_OPTS)
         endif
     
    +    ifdef NPE_VERSION
    +      includes += $(CRAY_CUDATOOLKIT_INCLUDE_OPTS)
    +    endif
    +
         comm := ,
         ifneq ($(BL_NO_FORT),TRUE)
    +      ifdef NPE_VERSION
    +        LIBRARIES += $(subst -Wl$(comm),-Xlinker=,$(wordlist 2,1024,$(shell mpifort -show)))
    +      else
             LIBRARIES += $(subst -Wl$(comm),-Xlinker=,$(shell ftn --cray-print-opts=libs))
    +      endif
         else
    +      ifdef NPE_VERSION
    +        LIBRARIES += $(subst -Wl$(comm),-Xlinker=,$(wordlist 2,1024,$(shell mpicxx -show)))
    +      else
             LIBRARIES += $(subst -Wl$(comm),-Xlinker=,$(shell CC --cray-print-opts=libs))
    +      endif
         endif
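         # In each branch, -Wl, is rewritten to -Xlinker= so that nvcc
         # forwards the linker options to the host linker unchanged.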
     
         ifneq ($(CUDA_ROOT),)
    diff --git a/Tools/GNUMake/sites/Make.nrel b/Tools/GNUMake/sites/Make.nrel
    index 68ac8e5116f..ca705698ea1 100644
    --- a/Tools/GNUMake/sites/Make.nrel
    +++ b/Tools/GNUMake/sites/Make.nrel
    @@ -40,27 +40,32 @@ else ifeq ($(which_computer), rhodes)
       endif
     endif
     
    -# Account for Intel-MPI, MPICH, OpenMPI, and HPE MPT
     ifeq ($(USE_MPI),TRUE)
    +  CXX := mpicxx
    +  CC  := mpicc
    +  FC  := mpif90
    +  F90 := mpif90
       ifeq ($(COMP), intel)
    -    CXX := mpiicpc
    -    CC  := mpiicc
    -    FC  := mpiifort
    -    F90 := mpiifort
    -  else
    -    CXX := mpicxx
    -    CC  := mpicc
    -    FC  := mpif90
    -    F90 := mpif90
    -    ifneq ($(findstring mpich, $(shell $(F90) -show 2>&1)),)
    -      mpif90_link_flags := $(shell $(F90) -link_info)
    -      LIBRARIES += $(wordlist 2,1024,$(mpif90_link_flags))
    -    else ifneq ($(findstring Open MPI, $(shell $(F90) -showme:version 2>&1)),)
    -      mpif90_link_flags := $(shell $(F90) -showme:link)
    -      LIBRARIES += $(mpif90_link_flags)
    -    else
    -      # MPT case (no option available to query link flags)
    -      LIBRARIES += -lmpi
    +    ifeq ($(which_computer), eagle)
    +        # Always assume MPT on Eagle
    +        export MPICXX_CXX := icpc
    +        export MPICC_CC   := icc
    +        export MPIF90_F90 := ifort
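    +        # (MPT's compiler wrappers honor the MPICC_CC/MPICXX_CXX/
    +        # MPIF90_F90 environment variables when picking the underlying
    +        # compilers.)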
    +    else ifeq ($(which_computer), rhodes)
    +        CXX := mpiicpc
    +        CC  := mpiicc
    +        FC  := mpiifort
    +        F90 := mpiifort
         endif
       endif
    +  ifneq ($(findstring mpich, $(shell $(F90) -show 2>&1)),)
    +    mpif90_link_flags := $(shell $(F90) -link_info)
    +    LIBRARIES += $(wordlist 2,1024,$(mpif90_link_flags))
    +  else ifneq ($(findstring Open MPI, $(shell $(F90) -showme:version 2>&1)),)
    +    mpif90_link_flags := $(shell $(F90) -showme:link)
    +    LIBRARIES += $(mpif90_link_flags)
    +  else
    +    # MPT case (no option available to query link flags)
    +    LIBRARIES += -lmpi
    +  endif
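    +  # The probes above distinguish MPICH (-show / -link_info), Open MPI
    +  # (-showme), and HPE MPT (no query option available; fall back to
    +  # -lmpi).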
     endif
    diff --git a/Tools/GNUMake/sites/Make.olcf b/Tools/GNUMake/sites/Make.olcf
    index 651971c6c95..69f557786df 100644
    --- a/Tools/GNUMake/sites/Make.olcf
    +++ b/Tools/GNUMake/sites/Make.olcf
    @@ -2,7 +2,7 @@
     # For Summit et al. at OLCF
     #
     
    -OLCF_MACHINES := summit ascent spock crusher
    +OLCF_MACHINES := summit ascent spock crusher frontier
     
     ifneq ($(which_computer), $(findstring $(which_computer), $(OLCF_MACHINES)))
       $(error Unknown OLCF computer, $(which_computer))
    @@ -60,7 +60,7 @@ ifeq ($(which_computer),spock)
         endif
         # for gpu aware mpi
         ifeq ($(USE_HIP),TRUE)
    -      LIBRARIES += $(PE_MPICH_GTL_DIR_gfx908) -lmpi_gtl_hsa
    +      LIBRARIES += $(PE_MPICH_GTL_DIR_amd_gfx908) -lmpi_gtl_hsa
         endif
       endif
     endif
    @@ -80,7 +80,27 @@ ifeq ($(which_computer),crusher)
         endif
         # for gpu aware mpi
         ifeq ($(USE_HIP),TRUE)
    -      LIBRARIES += -lmpi_gtl_hsa
    +      LIBRARIES += $(PE_MPICH_GTL_DIR_amd_gfx90a) -lmpi_gtl_hsa
    +    endif
    +  endif
    +endif
    +
    +ifeq ($(which_computer),frontier)
    +  ifeq ($(USE_HIP),TRUE)
    +    # MI250X
    +    AMD_ARCH=gfx90a
    +  endif
    +
    +  ifeq ($(USE_MPI),TRUE)
    +    includes += $(shell CC --cray-print-opts=cflags)
    +    ifneq ($(BL_NO_FORT),TRUE)
    +      LIBRARIES += $(shell ftn --cray-print-opts=libs)
    +    else
    +      LIBRARIES += $(shell CC --cray-print-opts=libs)
    +    endif
    +    # for gpu aware mpi
    +    ifeq ($(USE_HIP),TRUE)
    +      LIBRARIES += $(PE_MPICH_GTL_DIR_amd_gfx90a) -lmpi_gtl_hsa
         endif
       endif
     endif
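     # Usage sketch (hypothetical invocation) for Frontier's MI250X nodes:
     #   make USE_HIP=TRUE USE_MPI=TRUE
     # which selects gfx90a above and links the GTL library for GPU-aware MPI.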
    diff --git a/Tools/GNUMake/sites/Make.unknown b/Tools/GNUMake/sites/Make.unknown
    index 332a7a558de..2ecf6a50ddb 100644
    --- a/Tools/GNUMake/sites/Make.unknown
    +++ b/Tools/GNUMake/sites/Make.unknown
    @@ -29,6 +29,8 @@ ifeq ($(USE_MPI),TRUE)
     
       ifeq ($(LINK_WITH_FORTRAN_COMPILER),TRUE)
         MPI_OTHER_COMP := mpicxx
    +  else ifeq ($(BL_NO_FORT),TRUE)
    +    MPI_OTHER_COMP := mpicxx
       else
         MPI_OTHER_COMP := mpif90
       endif
    @@ -55,7 +57,10 @@ ifeq ($(USE_MPI),TRUE)
          mpi_link_flags := $(filter-out $(mpi_filter), $(mpi_link_flags))
       endif
     
    -  LIBRARIES += $(mpi_link_flags) $(mpicxx_link_libs)
    +  LIBRARIES += $(mpi_link_flags)
    +  ifneq ($(MPI_OTHER_COMP),mpicxx)
    +    LIBRARIES += $(mpicxx_link_libs)
    +  endif
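    +  # ($(mpicxx_link_libs) is skipped when MPI_OTHER_COMP is already
    +  # mpicxx, since $(mpi_link_flags) would then carry the same libraries.)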
     
       # OpenMPI specific flag
       # Uncomment if statement if flag causes issue with another compiler.
    diff --git a/Tools/Plotfile/CMakeLists.txt b/Tools/Plotfile/CMakeLists.txt
    index 44f99d9523c..9f8f066fbbb 100644
    --- a/Tools/Plotfile/CMakeLists.txt
    +++ b/Tools/Plotfile/CMakeLists.txt
    @@ -34,5 +34,5 @@ target_include_directories(fsnapshot PRIVATE ${CMAKE_CURRENT_SOURCE_DIR})
     target_sources(fsnapshot PRIVATE AMReX_PPMUtil.H AMReX_PPMUtil.cpp)
     if (AMReX_CUDA)
        set_source_files_properties(AMReX_PPMUtil.cpp PROPERTIES LANGUAGE CUDA)
    -   target_compile_features(fsnapshot PUBLIC cxx_std_14)
    +   target_compile_features(fsnapshot PUBLIC cxx_std_17)
     endif()
    diff --git a/Tools/Postprocessing/python/column_depth.py b/Tools/Postprocessing/python/column_depth.py
    index 3aff2ac4705..be17d6bb663 100755
    --- a/Tools/Postprocessing/python/column_depth.py
    +++ b/Tools/Postprocessing/python/column_depth.py
    @@ -1,4 +1,4 @@
    -#!/usr/bin/env python
    +#!/usr/bin/env python3
     import sys
     import numpy
     
    diff --git a/Tools/Postprocessing/python/conv_slopes.py b/Tools/Postprocessing/python/conv_slopes.py
    index f2fe5404aae..9f1a22e3960 100755
    --- a/Tools/Postprocessing/python/conv_slopes.py
    +++ b/Tools/Postprocessing/python/conv_slopes.py
    @@ -1,4 +1,4 @@
    -#!/usr/bin/env python
    +#!/usr/bin/env python3
     import sys
     import os
    -import commands
    +# The Python 2 "commands" module is gone in Python 3; subprocess
    +# provides the same getoutput/getstatusoutput helpers.
    +import subprocess as commands
    diff --git a/Tools/Postprocessing/python/dumpparthistory.py b/Tools/Postprocessing/python/dumpparthistory.py
    index 092f924423b..23f6d22d1a8 100755
    --- a/Tools/Postprocessing/python/dumpparthistory.py
    +++ b/Tools/Postprocessing/python/dumpparthistory.py
    @@ -1,4 +1,4 @@
    -#!/usr/bin/env python
    +#!/usr/bin/env python3
     
     # a simple routine to parse particle files and dump out the particle
     # histories into separate files (1 file per particle) so that they can
    @@ -96,8 +96,3 @@ def main(files):
             sys.exit(2)
     
         main(sys.argv[1:])
    -
    -
    -
    -
    -
    diff --git a/Tools/Postprocessing/python/test_helmeos.py b/Tools/Postprocessing/python/test_helmeos.py
    index 890a66aef77..824f369cf60 100755
    --- a/Tools/Postprocessing/python/test_helmeos.py
    +++ b/Tools/Postprocessing/python/test_helmeos.py
    @@ -1,4 +1,4 @@
    -#!/usr/bin/env python
    +#!/usr/bin/env python3
     #
     # a script showing how to use the helmeos module
     # it reads T, rho, X data from a sample data file, calculates abar and zbar
    diff --git a/Tools/Postprocessing/python/test_parseparticles.py b/Tools/Postprocessing/python/test_parseparticles.py
    index b9181af4d8a..8a85fe2faf6 100755
    --- a/Tools/Postprocessing/python/test_parseparticles.py
    +++ b/Tools/Postprocessing/python/test_parseparticles.py
    @@ -1,4 +1,4 @@
    -#!/usr/bin/env python
    +#!/usr/bin/env python3
     
     # simple script showing how to make plots of particles using the parseparticles
     # module
    @@ -92,4 +92,3 @@ def main(fileList):
     
     # this is for profiling
     #    cProfile.run("main(sys.argv[1:])","profile.tmp2")
    -
    diff --git a/Tools/Py_util/plotsinglevar.py b/Tools/Py_util/plotsinglevar.py
    index 616c516c805..bb1c2abacaa 100755
    --- a/Tools/Py_util/plotsinglevar.py
    +++ b/Tools/Py_util/plotsinglevar.py
    @@ -1,11 +1,9 @@
    -#!/usr/bin/env python
    +#!/usr/bin/env python3
     
     # a simple script to plot 2-d or 3-d BoxLib data using the matplotlib
     # library
     #
     
    -from __future__ import print_function
    -
     import matplotlib
     matplotlib.use('agg')
     
    diff --git a/Tools/Release/ppCleanup.py b/Tools/Release/ppCleanup.py
    index 109444daff3..2935d0c1983 100755
    --- a/Tools/Release/ppCleanup.py
    +++ b/Tools/Release/ppCleanup.py
    @@ -1,4 +1,4 @@
    -#!/usr/bin/env python
    +#!/usr/bin/env python3
     
     import os
     import shutil
    diff --git a/Tools/Release/ppCleanupDir.py b/Tools/Release/ppCleanupDir.py
    index befebc15f2d..2d8a598291d 100755
    --- a/Tools/Release/ppCleanupDir.py
    +++ b/Tools/Release/ppCleanupDir.py
    @@ -1,4 +1,4 @@
    -#!/usr/bin/env python
    +#!/usr/bin/env python3
     
     import os
     import shutil
    diff --git a/Tools/Release/release.py b/Tools/Release/release.py
    index 87de82e5a30..8f2b4d9d5dc 100755
    --- a/Tools/Release/release.py
    +++ b/Tools/Release/release.py
    @@ -1,4 +1,4 @@
    -#!/usr/bin/env python
    +#!/usr/bin/env python3
     
     import os
     import shutil
    diff --git a/Tools/libamrex/configure.py b/Tools/libamrex/configure.py
    index ac4b399a471..ebb3cd369f4 100755
    --- a/Tools/libamrex/configure.py
    +++ b/Tools/libamrex/configure.py
    @@ -1,12 +1,6 @@
    -#!/usr/bin/env python
    -
    -from __future__ import print_function
    +#!/usr/bin/env python3
     
     import sys
    -
    -if sys.version_info < (2, 7):
    -    sys.exit("ERROR: need python 2.7 or later for configure.py")
    -
     import argparse
     
     def configure(argv):
    diff --git a/Tools/libamrex/mkconfig.py b/Tools/libamrex/mkconfig.py
    index 30c54f285a2..21f66348891 100755
    --- a/Tools/libamrex/mkconfig.py
    +++ b/Tools/libamrex/mkconfig.py
    @@ -1,12 +1,6 @@
    -#!/usr/bin/env python
    -
    -from __future__ import print_function
    +#!/usr/bin/env python3
     
     import sys, re
    -
    -if sys.version_info < (2, 7):
    -    sys.exit("ERROR: need python 2.7 or later for mkconfig.py")
    -
     import argparse
     
     def doit(defines, undefines, comp, allow_diff_comp):
    diff --git a/Tools/libamrex/mkpkgconfig.py b/Tools/libamrex/mkpkgconfig.py
    index be91e8736a8..c8a626901da 100755
    --- a/Tools/libamrex/mkpkgconfig.py
    +++ b/Tools/libamrex/mkpkgconfig.py
    @@ -1,12 +1,6 @@
    -#!/usr/bin/env python
    -
    -from __future__ import print_function
    +#!/usr/bin/env python3
     
     import sys
    -
    -if sys.version_info < (2, 7):
    -    sys.exit("ERROR: need python 2.7 or later for mkpkgconfig.py")
    -
     import argparse
     
     def doit(prefix, version, cflags, libs, libpriv, fflags):
    diff --git a/Tools/libamrex/mkversionheader.py b/Tools/libamrex/mkversionheader.py
    index f2f6f8865f9..b1dbf0eb2ad 100755
    --- a/Tools/libamrex/mkversionheader.py
    +++ b/Tools/libamrex/mkversionheader.py
    @@ -1,12 +1,6 @@
    -#!/usr/bin/env python
    -
    -from __future__ import print_function
    +#!/usr/bin/env python3
     
     import sys, re
    -
    -if sys.version_info < (2, 7):
    -    sys.exit("ERROR: need python 2.7 or later for mkversionheader.py")
    -
     import argparse
     
     def doit(code, defines):
    diff --git a/Tools/typechecker/typechecker.py b/Tools/typechecker/typechecker.py
    index 2086b22d1b5..6035b7a6c15 100755
    --- a/Tools/typechecker/typechecker.py
    +++ b/Tools/typechecker/typechecker.py
    @@ -1,6 +1,4 @@
    -#!/usr/bin/env python
    -
    -from __future__ import print_function
    +#!/usr/bin/env python3
     
     import os
     import sys