From 63bd7230f1d82bcbd8d2be88ece78cfc5fd5cc32 Mon Sep 17 00:00:00 2001
From: Ahmed Mahmoud <ahmed.mahmoud@autodesk.com>
Date: Tue, 10 Jan 2023 15:40:23 -0500
Subject: [PATCH 01/24] setup

---
 apps/CMakeLists.txt             |   1 +
 apps/lbmMultiRes/CMakeLists.txt |  17 ++++
 apps/lbmMultiRes/lbmMultiRes.cu | 148 ++++++++++++++++++++++++++++++++
 3 files changed, 166 insertions(+)
 create mode 100644 apps/lbmMultiRes/CMakeLists.txt
 create mode 100644 apps/lbmMultiRes/lbmMultiRes.cu
diff --git a/apps/CMakeLists.txt b/apps/CMakeLists.txt
index 403a18e3..bd85d7a7 100644
--- a/apps/CMakeLists.txt
+++ b/apps/CMakeLists.txt
@@ -4,3 +4,4 @@ add_subdirectory("fractal")
 add_subdirectory("lbm")
 add_subdirectory("gameOfLife")
 add_subdirectory("poisson")
+add_subdirectory("lbmMultiRes")
diff --git a/apps/lbmMultiRes/CMakeLists.txt b/apps/lbmMultiRes/CMakeLists.txt
new file mode 100644
index 00000000..207cb795
--- /dev/null
+++ b/apps/lbmMultiRes/CMakeLists.txt
@@ -0,0 +1,17 @@
+cmake_minimum_required(VERSION 3.19 FATAL_ERROR)
+
+set (APP_NAME app-lbmMultiRes)
+file(GLOB_RECURSE SrcFiles lbmMultiRes.cu)
+
+add_executable(${APP_NAME} ${SrcFiles})
+
+target_link_libraries(${APP_NAME} 
+	PUBLIC libNeonSkeleton)
+
+set_target_properties(${APP_NAME} PROPERTIES 
+	CUDA_SEPARABLE_COMPILATION ON
+	CUDA_RESOLVE_DEVICE_SYMBOLS ON)
+set_target_properties(${APP_NAME} PROPERTIES FOLDER "apps")
+source_group(TREE ${CMAKE_CURRENT_LIST_DIR} PREFIX "${APP_NAME}" FILES ${SrcFiles})
+
+add_test(NAME ${APP_NAME} COMMAND ${APP_NAME})
\ No newline at end of file
diff --git a/apps/lbmMultiRes/lbmMultiRes.cu b/apps/lbmMultiRes/lbmMultiRes.cu
new file mode 100644
index 00000000..a345af3f
--- /dev/null
+++ b/apps/lbmMultiRes/lbmMultiRes.cu
@@ -0,0 +1,148 @@
+#include "Neon/Neon.h"
+#include "Neon/domain/mGrid.h"
+#include "Neon/skeleton/Skeleton.h"
+
+template <unsigned int DIM, unsigned int COMP>
+Neon::domain::Stencil create_stencil();
+
+template <>
+Neon::domain::Stencil create_stencil<2, 9>()
+{
+    std::vector<Neon::index_3d> stencil;
+    stencil.reserve(9);
+    for (int x = -1; x <= 1; ++x) {
+        for (int y = -1; y <= 1; ++y) {
+            stencil.emplace_back(Neon::index_3d(x, y, 0));
+        }
+    }
+    return Neon::domain::Stencil(stencil);
+}
+
+template <>
+Neon::domain::Stencil create_stencil<3, 19>()
+{
+    // filterCenterOut = false;
+    return Neon::domain::Stencil::s19_t(false);
+}
+
+template <typename Field>
+inline void exportVTI(const int t, Field& field)
+{
+    printf("\n Exporting Frame =%d", t);
+    int                precision = 4;
+    std::ostringstream oss;
+    oss << std::setw(precision) << std::setfill('0') << t;
+    std::string prefix = "lbm" + std::to_string(field.getCardinality()) + "D_";
+    std::string fname = prefix + oss.str();
+    field.ioToVtk(fname, "field");
+}
+
+/*Neon::set::Container setOmega(Neon::domain::mGrid& grid, int level)
+{
+    return grid.getContainer(
+        "SetOmega" + std::to_string(level), level,
+        [=](Neon::set::Loader& loader) {
+            //auto& local = field.load(loader, level, Neon::MultiResCompute::MAP);
+            return [=] NEON_CUDA_HOST_DEVICE(const typename Neon::domain::bGrid::Cell& cell) mutable {};
+        });
+}*/
+
+template <typename T>
+NEON_CUDA_HOST_DEVICE T computeOmega(T omega0, int level, int num_levels)
+{
+    int ilevel = num_levels - level - 1;
+    // scalbln(1.0, x) = 2^x
+    return 2 * omega0 / (scalbln(1.0, ilevel + 1) + (1. - scalbln(1.0, ilevel)) * omega0);
+}
+
+template <typename T>
+Neon::set::Container velocity(Neon::domain::mGrid& grid, int level, Neon::domain::mGrid::Field<T>& fin)
+{
+    return grid.getContainer(
+        "SetOmega" + std::to_string(level), level,
+        [=](Neon::set::Loader& loader) {
+            auto& flocal = fin.load(loader, level, Neon::MultiResCompute::MAP);
+            return [=] NEON_CUDA_HOST_DEVICE(const typename Neon::domain::bGrid::Cell& cell) mutable {};
+        });
+}
+
+template <typename T>
+void nonUniformTimestepRecursive(Neon::domain::mGrid&               grid,
+                                 T                                  omega0,
+                                 int                                level,
+                                 int                                max_level,
+                                 Neon::domain::mGrid::Field<T>&     fin,
+                                 Neon::domain::mGrid::Field<T>&     fout,
+                                 std::vector<Neon::set::Container>& containers)
+{
+}
+
+int main(int argc, char** argv)
+{
+    Neon::init();
+    if (Neon::sys::globalSpace::gpuSysObjStorage.numDevs() > 0) {
+        using T = double;
+
+        //Neon grid
+        auto             runtime = Neon::Runtime::stream;
+        std::vector<int> gpu_ids{0};
+        Neon::Backend    backend(gpu_ids, runtime);
+
+        constexpr int DIM = 2;
+        constexpr int COMP = (DIM == 2) ? 9 : 19;
+
+        const int dim_x = 4;
+        const int dim_y = 4;
+        const int dim_z = (DIM < 3) ? 1 : 4;
+
+        const Neon::index_3d grid_dim(dim_x, dim_y, dim_z);
+
+        const Neon::domain::mGridDescriptor descriptor({1, 1, 1});
+
+
+        Neon::domain::mGrid grid(
+            backend, grid_dim,
+            {[&](const Neon::index_3d id) -> bool {
+                 return true;
+             },
+             [&](const Neon::index_3d&) -> bool {
+                 return true;
+             },
+             [&](const Neon::index_3d&) -> bool {
+                 return true;
+             }},
+            create_stencil<DIM, COMP>(), descriptor);
+
+
+        //LBM problem
+        const int max_iter = 300;
+        const T   ulb = 0.01;
+        const T   Re = 20;
+        const T   clength = grid_dim.x;
+        const T   visclb = ulb * clength / Re;
+        const T   smagorinskyConstant = 0.02;
+        const T   omega = 1.0 / (3. * visclb + 0.5);
+
+        auto fin = grid.newField<T>("fin", COMP, 0);
+        auto fout = grid.newField<T>("fout", COMP, 0);
+
+        //TODO init fin and fout
+
+        fin.updateCompute();
+        fout.updateCompute();
+
+        std::vector<Neon::set::Container> containers;
+
+        nonUniformTimestepRecursive(grid, omega, 0, descriptor.getDepth(), fin, fout, containers);
+
+        Neon::skeleton::Skeleton skl(grid.getBackend());
+        skl.sequence(containers, "MultiResLBM");
+        //skl.ioToDot("MultiRes");
+
+        skl.run();
+
+        grid.getBackend().syncAll();
+        fin.updateIO();
+        fout.updateIO();
+    }
+}
\ No newline at end of file

From 22991cd7ba84019a2816bcfeab5c785373465f2a Mon Sep 17 00:00:00 2001
From: Ahmed Mahmoud <ahmed.mahmoud@autodesk.com>
Date: Thu, 12 Jan 2023 18:59:03 -0500
Subject: [PATCH 02/24] collide

---
 apps/lbmMultiRes/lbmMultiRes.cu | 197 +++++++++++++++++++++++++++++---
 1 file changed, 178 insertions(+), 19 deletions(-)

diff --git a/apps/lbmMultiRes/lbmMultiRes.cu b/apps/lbmMultiRes/lbmMultiRes.cu
index a345af3f..13ca1f45 100644
--- a/apps/lbmMultiRes/lbmMultiRes.cu
+++ b/apps/lbmMultiRes/lbmMultiRes.cu
@@ -2,7 +2,7 @@
 #include "Neon/domain/mGrid.h"
 #include "Neon/skeleton/Skeleton.h"
 
-template <unsigned int DIM, unsigned int COMP>
+template <unsigned int DIM, unsigned int Q>
 Neon::domain::Stencil create_stencil();
 
 template <>
@@ -25,6 +25,88 @@ Neon::domain::Stencil create_stencil<3, 19>()
     return Neon::domain::Stencil::s19_t(false);
 }
 
+NEON_CUDA_DEVICE_ONLY static constexpr char latticeVelocity2D[9][2] = {
+    {0, 0},
+    {0, -1},
+    {0, 1},
+    {-1, 0},
+    {-1, -1},
+    {-1, 1},
+    {1, 0},
+    {1, -1},
+    {1, 1}};
+
+NEON_CUDA_DEVICE_ONLY static constexpr char latticeVelocity3D[27][3] = {
+    {0, 0, 0},
+    {0, -1, 0},
+    {0, 1, 0},
+    {-1, 0, 0},
+    {-1, -1, 0},
+    {-1, 1, 0},
+    {1, 0, 0},
+    {1, -1, 0},
+    {1, 1, 0},
+
+    {0, 0, -1},
+    {0, -1, -1},
+    {0, 1, -1},
+    {-1, 0, -1},
+    {-1, -1, -1},
+    {-1, 1, -1},
+    {1, 0, -1},
+    {1, -1, -1},
+    {1, 1, -1},
+
+    {0, 0, 1},
+    {0, -1, 1},
+    {0, 1, 1},
+    {-1, 0, 1},
+    {-1, -1, 1},
+    {-1, 1, 1},
+    {1, 0, 1},
+    {1, -1, 1},
+    {1, 1, 1}
+
+};
+
+template <int DIM, int Q>
+struct latticeWeight
+{
+    NEON_CUDA_HOST_DEVICE __inline__ constexpr latticeWeight()
+        : t()
+    {
+        if constexpr (DIM == 2) {
+
+            for (int i = 0; i < Q; ++i) {
+                if (latticeVelocity2D[i][0] * latticeVelocity2D[i][0] +
+                        latticeVelocity2D[i][1] * latticeVelocity2D[i][1] <
+                    1.1f) {
+                    t[i] = 1.0f / 9.0f;
+                } else {
+                    t[i] = 1.0f / 36.0f;
+                }
+            }
+            t[0] = 4.0f / 9.0f;
+        }
+
+        if constexpr (DIM == 3) {
+            for (int i = 0; i < Q; ++i) {
+                if (latticeVelocity2D[i][0] * latticeVelocity2D[i][0] +
+                        latticeVelocity2D[i][1] * latticeVelocity2D[i][1] +
+                        latticeVelocity2D[i][2] * latticeVelocity2D[i][2] <
+                    1.1f) {
+                    t[i] = 2.0f / 36.0f;
+                } else {
+                    t[i] = 1.0f / 36.0f;
+                }
+            }
+            t[0] = 1.0f / 3.0f;
+        }
+    }
+    float t[Q];
+};
+
+
 template <typename Field>
 inline void exportVTI(const int t, Field& field)
 {
@@ -47,6 +129,7 @@ inline void exportVTI(const int t, Field& field)
         });
 }*/
 
+
 template <typename T>
 NEON_CUDA_HOST_DEVICE T computeOmega(T omega0, int level, int num_levels)
 {
@@ -55,31 +138,103 @@ NEON_CUDA_HOST_DEVICE T computeOmega(T omega0, int level, int num_levels)
     return 2 * omega0 / (scalbln(1.0, ilevel + 1) + (1. - scalbln(1.0, ilevel)) * omega0);
 }
 
-template <typename T>
-Neon::set::Container velocity(Neon::domain::mGrid& grid, int level, Neon::domain::mGrid::Field<T>& fin)
+template <typename T, int DIM, int Q>
+NEON_CUDA_HOST_DEVICE Neon::Vec_3d<T> velocity(const T* fin,
+                                               const T  rho)
+{
+    Neon::Vec_3d<T> vel(0, 0, 0);
+    if constexpr (DIM == 2) {
+        for (int i = 0; i < Q; ++i) {
+            const T f = fin[i];
+            for (int d = 0; d < DIM; ++d) {
+                vel.v[d] += f * latticeVelocity2D[i][d];
+            }
+        }
+    }
+
+    if constexpr (DIM == 3) {
+        for (int i = 0; i < Q; ++i) {
+            const T f = fin[i];
+            for (int d = 0; d < DIM; ++d) {
+                vel.v[d] += f * latticeVelocity3D[i][d];
+            }
+        }
+    }
+
+    for (int d = 0; d < DIM; ++d) {
+        vel.v[d] /= rho;
+    }
+    return vel;
+}
+
+template <typename T, int DIM, int Q>
+Neon::set::Container collide(Neon::domain::mGrid&                 grid,
+                             T                                    omega0,
+                             int                                  level,
+                             int                                  max_level,
+                             const Neon::domain::mGrid::Field<T>& fin,
+                             Neon::domain::mGrid::Field<T>&       fout)
 {
     return grid.getContainer(
-        "SetOmega" + std::to_string(level), level,
+        "Collide" + std::to_string(level), level,
         [=](Neon::set::Loader& loader) {
-            auto& flocal = fin.load(loader, level, Neon::MultiResCompute::MAP);
-            return [=] NEON_CUDA_HOST_DEVICE(const typename Neon::domain::bGrid::Cell& cell) mutable {};
+            const auto& in = fin.load(loader, level, Neon::MultiResCompute::MAP);
+            auto        out = fout.load(loader, level, Neon::MultiResCompute::MAP);
+            const T     omega = computeOmega(omega0, level, max_level);
+
+            return [=] NEON_CUDA_HOST_DEVICE(const typename Neon::domain::bGrid::Cell& cell) mutable {
+                constexpr auto t = latticeWeight<DIM, Q>();
+
+                //fin
+                T ins[Q];
+                for (int i = 0; i < Q; ++i) {
+                    ins[i] = in(cell, i);
+                }
+
+                //density
+                T rho = 0;
+                for (int i = 0; i < Q; ++i) {
+                    rho += ins[i];
+                }
+
+                //velocity
+                const Neon::Vec_3d<T> vel = velocity<T, DIM, Q>(ins, rho);
+
+
+                const T usqr = (3.0 / 2.0) * (vel.x * vel.x + vel.y * vel.y + vel.z * vel.z);
+                for (int i = 0; i < Q; ++i) {
+                    T cu = 0;
+                    for (int d = 0; d < DIM; ++d) {
+                        cu += latticeVelocity2D[i][d] * vel.v[d];
+                    }
+                    //equilibrium
+                    T feq = rho * t.t[i] * (1. + cu + 0.5 * cu * cu - usqr);
+
+                    //collide
+                    out(cell, i) = ins[i] - omega * (ins[i] - feq);
+                }
+            };
         });
 }
 
-template <typename T>
-void nonUniformTimestepRecursive(Neon::domain::mGrid&               grid,
-                                 T                                  omega0,
-                                 int                                level,
-                                 int                                max_level,
-                                 Neon::domain::mGrid::Field<T>&     fin,
-                                 Neon::domain::mGrid::Field<T>&     fout,
-                                 std::vector<Neon::set::Container>& containers)
+
+template <typename T, int DIM, int Q>
+void nonUniformTimestepRecursive(Neon::domain::mGrid&                 grid,
+                                 const T                              omega0,
+                                 const int                            level,
+                                 const int                            max_level,
+                                 const Neon::domain::mGrid::Field<T>& fin,
+                                 Neon::domain::mGrid::Field<T>&       fout,
+                                 std::vector<Neon::set::Container>&   containers)
 {
+    // 1) collision for all voxels at level L=ilevel
+    containers.push_back(collide<T, DIM, Q>(grid, omega0, level, max_level, fin, fout));
 }
 
 int main(int argc, char** argv)
 {
     Neon::init();
+
     if (Neon::sys::globalSpace::gpuSysObjStorage.numDevs() > 0) {
         using T = double;
 
@@ -89,7 +244,7 @@ int main(int argc, char** argv)
         Neon::Backend    backend(gpu_ids, runtime);
 
         constexpr int DIM = 2;
-        constexpr int COMP = (DIM == 2) ? 9 : 19;
+        constexpr int Q = (DIM == 2) ? 9 : 19;
 
         const int dim_x = 4;
         const int dim_y = 4;
@@ -111,7 +266,7 @@ int main(int argc, char** argv)
              [&](const Neon::index_3d&) -> bool {
                  return true;
              }},
-            create_stencil<DIM, COMP>(), descriptor);
+            create_stencil<DIM, Q>(), descriptor);
 
 
         //LBM problem
@@ -123,8 +278,8 @@ int main(int argc, char** argv)
         const T   smagorinskyConstant = 0.02;
         const T   omega = 1.0 / (3. * visclb + 0.5);
 
-        auto fin = grid.newField<T>("fin", COMP, 0);
-        auto fout = grid.newField<T>("fout", COMP, 0);
+        auto fin = grid.newField<T>("fin", Q, 0);
+        auto fout = grid.newField<T>("fout", Q, 0);
 
         //TODO init fin and fout
 
@@ -133,7 +288,11 @@ int main(int argc, char** argv)
 
         std::vector<Neon::set::Container> containers;
 
-        nonUniformTimestepRecursive(grid, omega, 0, descriptor.getDepth(), fin, fout, containers);
+        nonUniformTimestepRecursive<T, DIM, Q>(grid,
+                                               omega,
+                                               descriptor.getDepth() - 1,
+                                               descriptor.getDepth(),
+                                               fin, fout, containers);
 
         Neon::skeleton::Skeleton skl(grid.getBackend());
         skl.sequence(containers, "MultiResLBM");

From d3cfef2fdabcd361fe7f2e11077e3af54e3d45f2 Mon Sep 17 00:00:00 2001
From: Ahmed Mahmoud <ahmed.mahmoud@autodesk.com>
Date: Thu, 12 Jan 2023 19:09:56 -0500
Subject: [PATCH 03/24] steps

---
 apps/lbmMultiRes/lbmMultiRes.cu | 30 ++++++++++++++++++++++++++++--
 1 file changed, 28 insertions(+), 2 deletions(-)

diff --git a/apps/lbmMultiRes/lbmMultiRes.cu b/apps/lbmMultiRes/lbmMultiRes.cu
index 13ca1f45..bbf8f698 100644
--- a/apps/lbmMultiRes/lbmMultiRes.cu
+++ b/apps/lbmMultiRes/lbmMultiRes.cu
@@ -221,14 +221,40 @@ Neon::set::Container collide(Neon::domain::mGrid&                 grid,
 template <typename T, int DIM, int Q>
 void nonUniformTimestepRecursive(Neon::domain::mGrid&                 grid,
                                  const T                              omega0,
-                                 const int                            level,
+                                 const int                            ilevel,
                                  const int                            max_level,
                                  const Neon::domain::mGrid::Field<T>& fin,
                                  Neon::domain::mGrid::Field<T>&       fout,
                                  std::vector<Neon::set::Container>&   containers)
 {
     // 1) collision for all voxels at level L=ilevel
-    containers.push_back(collide<T, DIM, Q>(grid, omega0, level, max_level, fin, fout));
+    containers.push_back(collide<T, DIM, Q>(grid, omega0, ilevel, max_level, fin, fout));
+
+    // 2) Storing fine(ilevel) data for later "coalescence" pulled by the coarse(ilevel)
+
+    // 3) recurse down
+    if (ilevel != 0) {
+        nonUniformTimestepRecursive<T, DIM, Q>(grid, omega0, ilevel - 1, max_level, fin, fout, containers);
+    }
+
+    // 4) Streaming step that also performs the necessary "explosion" and "coalescence" steps.
+
+    // 5) stop
+    if (ilevel == max_level - 1) {
+        return;
+    }
+
+    // 6) collision for all voxels at level L = ilevel
+    containers.push_back(collide<T, DIM, Q>(grid, omega0, ilevel, max_level, fin, fout));
+
+    // 7) Storing fine(ilevel) data for later "coalescence" pulled by the coarse(ilevel)
+
+    // 8) recurse down
+    if (ilevel != 0) {
+        nonUniformTimestepRecursive<T, DIM, Q>(grid, omega0, ilevel - 1, max_level, fin, fout, containers);
+    }
+
+    // 9) Streaming step.
 }
 
 int main(int argc, char** argv)

From 8a83a62fd55af626ee6a45b7bbe29d8d47a901f6 Mon Sep 17 00:00:00 2001
From: Ahmed Mahmoud <ahmed.mahmoud@autodesk.com>
Date: Thu, 12 Jan 2023 19:49:03 -0500
Subject: [PATCH 04/24] streaming

---
 apps/lbmMultiRes/lbmMultiRes.cu | 153 ++++++++++++++++++++++++++------
 1 file changed, 124 insertions(+), 29 deletions(-)

diff --git a/apps/lbmMultiRes/lbmMultiRes.cu b/apps/lbmMultiRes/lbmMultiRes.cu
index bbf8f698..5972b220 100644
--- a/apps/lbmMultiRes/lbmMultiRes.cu
+++ b/apps/lbmMultiRes/lbmMultiRes.cu
@@ -119,16 +119,6 @@ inline void exportVTI(const int t, Field& field)
     field.ioToVtk(fname, "field");
 }
 
-/*Neon::set::Container setOmega(Neon::domain::mGrid& grid, int level)
-{
-    return grid.getContainer(
-        "SetOmega" + std::to_string(level), level,
-        [=](Neon::set::Loader& loader) {
-            //auto& local = field.load(loader, level, Neon::MultiResCompute::MAP);
-            return [=] NEON_CUDA_HOST_DEVICE(const typename Neon::domain::bGrid::Cell& cell) mutable {};
-        });
-}*/
-
 
 template <typename T>
 NEON_CUDA_HOST_DEVICE T computeOmega(T omega0, int level, int num_levels)
@@ -217,44 +207,149 @@ Neon::set::Container collide(Neon::domain::mGrid&                 grid,
         });
 }
 
+template <typename T, int DIM, int Q>
+Neon::set::Container stream(Neon::domain::mGrid&                 grid,
+                            int                                  level,
+                            const Neon::domain::mGrid::Field<T>& fpop_postcollision,
+                            Neon::domain::mGrid::Field<T>&       fpop_poststreaming)
+{
+    //TODO
+    //regular Streaming of the normal voxels at level L which are not interfaced with L+1 and L-1 levels.
+
+    return grid.getContainer(
+        "Stream" + std::to_string(level), level,
+        [=](Neon::set::Loader& loader) {
+            const auto& fpost_col = fpop_postcollision.load(loader, level, Neon::MultiResCompute::STENCIL);
+            auto        fpost_stm = fpop_poststreaming.load(loader, level, Neon::MultiResCompute::MAP);
+
+            return [=] NEON_CUDA_HOST_DEVICE(const typename Neon::domain::bGrid::Cell& cell) mutable {
+                for (int i = 0; i < Q; ++i) {
+
+                    //TODO filter out cells that interface with L+1 or L-1
+                    //TODO figure out the right direction to push/pull the data in the i-direction
+                    fpost_stm(cell, i) = fpost_col(cell, i);
+                }
+            };
+        });
+}
+
+template <typename T, int DIM, int Q>
+Neon::set::Container explosionPull(Neon::domain::mGrid&                 grid,
+                                   int                                  level,
+                                   const Neon::domain::mGrid::Field<T>& fpop_postcollision,
+                                   Neon::domain::mGrid::Field<T>&       fpop_poststreaming)
+{
+    //TODO
+    // Initiated by the fine level (hence "pull"), this function performs a coarse (level+1) to
+    // fine (level) communication or "explosion" by simply distributing copies of coarse grid onto the fine grid.
+    // In other words, this function updates the "halo" cells of the fine level by making copies of the coarse cell
+    // values.
+
+
+    return grid.getContainer(
+        "explosionPull" + std::to_string(level), level,
+        [=](Neon::set::Loader& loader) {
+            const auto& fpost_col = fpop_postcollision.load(loader, level, Neon::MultiResCompute::STENCIL_UP);
+            auto        fpost_stm = fpop_poststreaming.load(loader, level, Neon::MultiResCompute::MAP);
+
+            return [=] NEON_CUDA_HOST_DEVICE(const typename Neon::domain::bGrid::Cell& cell) mutable {
+                for (int i = 0; i < Q; ++i) {
+                }
+            };
+        });
+}
+
+
+template <typename T, int DIM, int Q>
+Neon::set::Container coalescencePull(Neon::domain::mGrid&           grid,
+                                     int                            level,
+                                     Neon::domain::mGrid::Field<T>& fpop_poststreaming)
+{
+    //TODO
+    // Initiated by the coarse level (hence "pull"), this function performs fine (level-1) to coarse
+    // (level) communication or "coalescence" by simply averaging the fine data stored in self.fpop_halo
+
+    return grid.getContainer(
+        "explosionPull" + std::to_string(level), level,
+        [=](Neon::set::Loader& loader) {
+            auto fpost_stm = fpop_poststreaming.load(loader, level, Neon::MultiResCompute::STENCIL_DOWN);
+
+            return [=] NEON_CUDA_HOST_DEVICE(const typename Neon::domain::bGrid::Cell& cell) mutable {
+                for (int i = 0; i < Q; ++i) {
+                }
+            };
+        });
+}
+
+template <typename T, int DIM, int Q>
+void stream(Neon::domain::mGrid&                 grid,
+            int                                  level,
+            const int                            max_level,
+            const Neon::domain::mGrid::Field<T>& fpop_postcollision,
+            Neon::domain::mGrid::Field<T>&       fpop_poststreaming,
+            std::vector<Neon::set::Container>&   containers)
+{
+    containers.push_back(stream<T, DIM, Q>(grid, level, fpop_postcollision, fpop_poststreaming));
+
+    /*
+    * Streaming for interface voxels that have
+    *  (i) coarser or (ii) finer neighbors at level+1 and level-1 and hence require
+    *  (i) "explosion" or (ii) coalescence
+    */
+    if (level != max_level - 1) {
+        /* Explosion: pull missing populations from coarser neighbors by copying coarse (level+1) to fine (level) 
+        * neighbors, initiated by the fine level ("Pull").
+        */
+        containers.push_back(explosionPull<T, DIM, Q>(grid, level, fpop_postcollision, fpop_poststreaming));
+    }
+
+    if (level != 0) {
+        /* Coalescence: pull missing populations from finer neighbors by "smart" averaging fine (level-1) 
+        * to coarse (level) communication, initiated by the coarse level ("Pull").
+        */
+        containers.push_back(coalescencePull<T, DIM, Q>(grid, level, fpop_poststreaming));
+    }
+}
 
 template <typename T, int DIM, int Q>
-void nonUniformTimestepRecursive(Neon::domain::mGrid&                 grid,
-                                 const T                              omega0,
-                                 const int                            ilevel,
-                                 const int                            max_level,
-                                 const Neon::domain::mGrid::Field<T>& fin,
-                                 Neon::domain::mGrid::Field<T>&       fout,
-                                 std::vector<Neon::set::Container>&   containers)
+void nonUniformTimestepRecursive(Neon::domain::mGrid&               grid,
+                                 const T                            omega0,
+                                 const int                          level,
+                                 const int                          max_level,
+                                 Neon::domain::mGrid::Field<T>&     fin,
+                                 Neon::domain::mGrid::Field<T>&     fout,
+                                 std::vector<Neon::set::Container>& containers)
 {
-    // 1) collision for all voxels at level L=ilevel
-    containers.push_back(collide<T, DIM, Q>(grid, omega0, ilevel, max_level, fin, fout));
+    // 1) collision for all voxels at level L=level
+    containers.push_back(collide<T, DIM, Q>(grid, omega0, level, max_level, fin, fout));
 
-    // 2) Storing fine(ilevel) data for later "coalescence" pulled by the coarse(ilevel)
+    // 2) Storing fine(level) data for later "coalescence" pulled by the coarse(level)
 
     // 3) recurse down
-    if (ilevel != 0) {
-        nonUniformTimestepRecursive<T, DIM, Q>(grid, omega0, ilevel - 1, max_level, fin, fout, containers);
+    if (level != 0) {
+        nonUniformTimestepRecursive<T, DIM, Q>(grid, omega0, level - 1, max_level, fin, fout, containers);
     }
 
     // 4) Streaming step that also performs the necessary "explosion" and "coalescence" steps.
+    stream<T, DIM, Q>(grid, level, max_level, fout, fin, containers);
 
     // 5) stop
-    if (ilevel == max_level - 1) {
+    if (level == max_level - 1) {
         return;
     }
 
-    // 6) collision for all voxels at level L = ilevel
-    containers.push_back(collide<T, DIM, Q>(grid, omega0, ilevel, max_level, fin, fout));
+    // 6) collision for all voxels at level L = level
+    containers.push_back(collide<T, DIM, Q>(grid, omega0, level, max_level, fin, fout));
 
-    // 7) Storing fine(ilevel) data for later "coalescence" pulled by the coarse(ilevel)
+    // 7) Storing fine(level) data for later "coalescence" pulled by the coarse(level)
 
     // 8) recurse down
-    if (ilevel != 0) {
-        nonUniformTimestepRecursive<T, DIM, Q>(grid, omega0, ilevel - 1, max_level, fin, fout, containers);
+    if (level != 0) {
+        nonUniformTimestepRecursive<T, DIM, Q>(grid, omega0, level - 1, max_level, fin, fout, containers);
     }
 
-    // 9) Streaming step.
+    // 9) Streaming step
+    stream<T, DIM, Q>(grid, level, max_level, fout, fin, containers);
 }
 
 int main(int argc, char** argv)

From e48ab8ec56ad619c9625c054ce2a86ad825591bc Mon Sep 17 00:00:00 2001
From: Ahmed Mahmoud <ahmed.mahmoud@autodesk.com>
Date: Mon, 16 Jan 2023 13:12:23 -0500
Subject: [PATCH 05/24] WIP streaming

---
 apps/lbmMultiRes/lbmMultiRes.cu               | 13 +++-
 .../Neon/domain/internal/bGrid/bCell.h        |  2 +-
 .../Neon/domain/internal/bGrid/bCell_imp.h    | 10 +++
 .../Neon/domain/internal/bGrid/bGrid.h        |  2 +-
 .../Neon/domain/internal/bGrid/bPartition.h   |  5 +-
 .../domain/internal/bGrid/bPartition_imp.h    | 25 +++---
 .../Neon/domain/internal/mGrid/mField_imp.h   |  2 +
 .../Neon/domain/internal/mGrid/mPartition.h   | 37 +++++++--
 .../domain/internal/mGrid/mPartition_imp.h    | 77 ++++++++++++++-----
 .../src/domain/internal/bGrid/bGrid.cpp       |  4 +-
 .../unit/sUt_multiRes/src/MultiResParent.h    |  6 +-
 .../unit/sUt_multiRes/src/MultiResSkeleton.h  |  6 +-
 12 files changed, 138 insertions(+), 51 deletions(-)

diff --git a/apps/lbmMultiRes/lbmMultiRes.cu b/apps/lbmMultiRes/lbmMultiRes.cu
index 5972b220..c222d4ab 100644
--- a/apps/lbmMultiRes/lbmMultiRes.cu
+++ b/apps/lbmMultiRes/lbmMultiRes.cu
@@ -223,11 +223,20 @@ Neon::set::Container stream(Neon::domain::mGrid&                 grid,
             auto        fpost_stm = fpop_poststreaming.load(loader, level, Neon::MultiResCompute::MAP);
 
             return [=] NEON_CUDA_HOST_DEVICE(const typename Neon::domain::bGrid::Cell& cell) mutable {
-                for (int i = 0; i < Q; ++i) {
+                auto parent = fpost_stm.getParent(cell);
+                bool hasChildren = fpost_stm.hasChildren(cell);
 
+                for (int i = 0; i < Q; ++i) {
                     //TODO filter out cells that interface with L+1 or L-1
                     //TODO figure out the right direction to push/pull the data in the i-direction
-                    fpost_stm(cell, i) = fpost_col(cell, i);
+                    // fpost_stm(cell, i) = fpost_col(cell, i);
+
+                    //if (parent.isActive()) {
+                    //    auto uncle = fpost_stm.setNghCell()
+                    //}
+
+                    if (hasChildren) {
+                    }
                 }
             };
         });
diff --git a/libNeonDomain/include/Neon/domain/internal/bGrid/bCell.h b/libNeonDomain/include/Neon/domain/internal/bGrid/bCell.h
index 96e09f68..4e5d2bdf 100644
--- a/libNeonDomain/include/Neon/domain/internal/bGrid/bCell.h
+++ b/libNeonDomain/include/Neon/domain/internal/bGrid/bCell.h
@@ -30,7 +30,7 @@ class bCell
     //i.e., each entry in the mask array store the state of 32 voxels
     static constexpr uint32_t sMaskSize = 32;
 
-    bCell() = default;
+    bCell();
     virtual ~bCell() = default;
 
     NEON_CUDA_HOST_DEVICE inline auto isActive() const -> bool;
diff --git a/libNeonDomain/include/Neon/domain/internal/bGrid/bCell_imp.h b/libNeonDomain/include/Neon/domain/internal/bGrid/bCell_imp.h
index b5539ea0..d3c14d5f 100644
--- a/libNeonDomain/include/Neon/domain/internal/bGrid/bCell_imp.h
+++ b/libNeonDomain/include/Neon/domain/internal/bGrid/bCell_imp.h
@@ -2,6 +2,16 @@
 
 namespace Neon::domain::internal::bGrid {
 
+NEON_CUDA_HOST_DEVICE inline bCell::bCell()
+    : mLocation(std::numeric_limits<Location::Integer>::max(),
+                std::numeric_limits<Location::Integer>::max(),
+                std::numeric_limits<Location::Integer>::max()),
+      mBlockID(std::numeric_limits<uint32_t>::max()),
+      mIsActive(false),
+      mBlockSize(-1)
+{
+}
+
 NEON_CUDA_HOST_DEVICE inline bCell::bCell(const Location& location)
 {
     mLocation = location;
diff --git a/libNeonDomain/include/Neon/domain/internal/bGrid/bGrid.h b/libNeonDomain/include/Neon/domain/internal/bGrid/bGrid.h
index 76e8ca29..2761ec90 100644
--- a/libNeonDomain/include/Neon/domain/internal/bGrid/bGrid.h
+++ b/libNeonDomain/include/Neon/domain/internal/bGrid/bGrid.h
@@ -94,7 +94,7 @@ class bGrid : public Neon::domain::interface::GridBaseTemplate<bGrid, bCell>
 
 
     auto getOrigins() const -> const Neon::set::MemSet_t<Neon::int32_3d>&;
-    auto getNeighbourBlocks() const -> const Neon::set::MemSet_t<uint32_t>&;
+    auto getNeighbourBlocks() const -> Neon::set::MemSet_t<uint32_t>&;
     auto getActiveMask() const -> Neon::set::MemSet_t<uint32_t>&;
     auto getBlockOriginTo1D() const -> Neon::domain::tool::PointHashTable<int32_t, uint32_t>&;
 
diff --git a/libNeonDomain/include/Neon/domain/internal/bGrid/bPartition.h b/libNeonDomain/include/Neon/domain/internal/bGrid/bPartition.h
index 2080cf42..755d5f20 100644
--- a/libNeonDomain/include/Neon/domain/internal/bGrid/bPartition.h
+++ b/libNeonDomain/include/Neon/domain/internal/bGrid/bPartition.h
@@ -65,8 +65,9 @@ class bPartition
 
    protected:
     inline NEON_CUDA_HOST_DEVICE auto pitch(const Cell& cell, int card) const -> uint32_t;
-    inline NEON_CUDA_HOST_DEVICE auto setNghCell(const Cell& cell, const nghIdx_t& offset) const -> Cell;
-    inline NEON_CUDA_HOST_DEVICE auto shmemPitch(Cell cell, const int card) const -> Cell::Location::Integer;
+    inline NEON_CUDA_HOST_DEVICE auto setNghCell(const Cell& cell, const nghIdx_t& offset, const uint32_t* neighbourBlocks) const -> Cell;
+    inline NEON_CUDA_HOST_DEVICE auto shmemPitch(Cell cell, const int card) const -> Cell::Location::Integer;    
+    inline NEON_CUDA_HOST_DEVICE auto getneighbourBlocksPtr(const Cell& cell) const -> const uint32_t*;
 
     Neon::DataView            mDataView;
     T*                        mMem;
diff --git a/libNeonDomain/include/Neon/domain/internal/bGrid/bPartition_imp.h b/libNeonDomain/include/Neon/domain/internal/bGrid/bPartition_imp.h
index 1bd87452..04e56645 100644
--- a/libNeonDomain/include/Neon/domain/internal/bGrid/bPartition_imp.h
+++ b/libNeonDomain/include/Neon/domain/internal/bGrid/bPartition_imp.h
@@ -110,9 +110,21 @@ inline NEON_CUDA_HOST_DEVICE auto bPartition<T, C>::pitch(const Cell& cell, int
         cell.pitch(card);
 }
 
+template <typename T, int C>
+NEON_CUDA_HOST_DEVICE inline auto bPartition<T, C>::getneighbourBlocksPtr(const Cell& cell) const -> const uint32_t*
+{
+    if (mSharedNeighbourBlocks != nullptr) {
+        return mSharedNeighbourBlocks;
+    } else {
+        return mNeighbourBlocks + (26 * cell.mBlockID);
+    }
+}
+
+
 template <typename T, int C>
 NEON_CUDA_HOST_DEVICE inline auto bPartition<T, C>::setNghCell(const Cell&     cell,
-                                                               const nghIdx_t& offset) const -> Cell
+                                                               const nghIdx_t& offset,
+                                                               const uint32_t* neighbourBlocks) const -> Cell
 {
     Cell ngh_cell(cell.mLocation.x + offset.x,
                   cell.mLocation.y + offset.y,
@@ -151,11 +163,7 @@ NEON_CUDA_HOST_DEVICE inline auto bPartition<T, C>::setNghCell(const Cell&     c
             ngh_cell.mLocation.z -= cell.mBlockSize;
         }
 
-        if (mSharedNeighbourBlocks != nullptr) {
-            ngh_cell.mBlockID = mSharedNeighbourBlocks[Cell::getNeighbourBlockID(block_offset)];
-        } else {
-            ngh_cell.mBlockID = mNeighbourBlocks[26 * cell.mBlockID + Cell::getNeighbourBlockID(block_offset)];
-        }
+        ngh_cell.mBlockID = neighbourBlocks[Cell::getNeighbourBlockID(block_offset)];
 
     } else {
         ngh_cell.mBlockID = cell.mBlockID;
@@ -191,7 +199,7 @@ NEON_CUDA_HOST_DEVICE inline auto bPartition<T, C>::nghVal(const Cell&     cell,
         Cell swirl_cell = cell.toSwirl();
         swirl_cell.mBlockSize = cell.mBlockSize;
 
-        Cell ngh_cell = setNghCell(swirl_cell, offset);
+        Cell ngh_cell = setNghCell(swirl_cell, offset, getneighbourBlocksPtr(swirl_cell));
         ngh_cell.mBlockSize = cell.mBlockSize;
         if (ngh_cell.mBlockID != std::numeric_limits<uint32_t>::max()) {
             //TODO maybe ngh_cell should be mapped to its memory layout
@@ -207,8 +215,7 @@ NEON_CUDA_HOST_DEVICE inline auto bPartition<T, C>::nghVal(const Cell&     cell,
         }
 
     } else {
-        Cell ngh_cell = setNghCell(cell, offset);
-        ngh_cell.mBlockSize = cell.mBlockSize;
+        Cell ngh_cell = setNghCell(cell, offset, getneighbourBlocksPtr(cell));
         if (ngh_cell.mBlockID != std::numeric_limits<uint32_t>::max()) {
             ret.isValid = ngh_cell.computeIsActive(mMask);
             if (ret.isValid) {
diff --git a/libNeonDomain/include/Neon/domain/internal/mGrid/mField_imp.h b/libNeonDomain/include/Neon/domain/internal/mGrid/mField_imp.h
index 91777e06..9f0d9936 100644
--- a/libNeonDomain/include/Neon/domain/internal/mGrid/mField_imp.h
+++ b/libNeonDomain/include/Neon/domain/internal/mGrid/mField_imp.h
@@ -65,6 +65,7 @@ mField<T, C>::mField(const std::string&         name,
                         active_mask.rawMem(gpuID, Neon::DeviceType::CPU),
                         (l == 0) ? nullptr : mData->grid->operator()(l - 1).getActiveMask().rawMem(gpuID, Neon::DeviceType::CPU),  //lower-level mask
                         (l == 0) ? nullptr : childBlockID.rawMem(gpuID, Neon::DeviceType::CPU),
+                        (l == int(descriptor.getDepth()) - 1) ? nullptr : mData->grid->operator()(l + 1).getNeighbourBlocks().rawMem(gpuID, Neon::DeviceType::CPU),  //parent neighbor
                         outsideVal,
                         stencil_ngh.rawMem(gpuID, Neon::DeviceType::CPU),
                         refFactorSet.rawMem(gpuID, Neon::DeviceType::CPU),
@@ -85,6 +86,7 @@ mField<T, C>::mField(const std::string&         name,
                         active_mask.rawMem(gpuID, Neon::DeviceType::CUDA),
                         (l == 0) ? nullptr : mData->grid->operator()(l - 1).getActiveMask().rawMem(gpuID, Neon::DeviceType::CUDA),  //lower-level mask
                         (l == 0) ? nullptr : childBlockID.rawMem(gpuID, Neon::DeviceType::CUDA),
+                        (l == int(descriptor.getDepth()) - 1) ? nullptr : mData->grid->operator()(l + 1).getNeighbourBlocks().rawMem(gpuID, Neon::DeviceType::CUDA),  //parent neighbor
                         outsideVal,
                         stencil_ngh.rawMem(gpuID, Neon::DeviceType::CUDA),
                         refFactorSet.rawMem(gpuID, Neon::DeviceType::CUDA),
diff --git a/libNeonDomain/include/Neon/domain/internal/mGrid/mPartition.h b/libNeonDomain/include/Neon/domain/internal/mGrid/mPartition.h
index b9b65141..7ba3709c 100644
--- a/libNeonDomain/include/Neon/domain/internal/mGrid/mPartition.h
+++ b/libNeonDomain/include/Neon/domain/internal/mGrid/mPartition.h
@@ -37,6 +37,7 @@ class mPartition : public Neon::domain::internal::bGrid::bPartition<T, C>
                         uint32_t*       mask,
                         uint32_t*       maskLowerLevel,
                         uint32_t*       childBlockID,
+                        uint32_t*       parentNeighbourBlocks,
                         T               defaultValue,
                         nghIdx_t*       stencilNghIndex,
                         int*            refFactors,
@@ -63,6 +64,7 @@ class mPartition : public Neon::domain::internal::bGrid::bPartition<T, C>
     NEON_CUDA_HOST_DEVICE inline auto getChild(const Cell&   parent_cell,
                                                Neon::int8_3d child) const -> Cell;
 
+
     /**
      * Given a child cell (as returned by getChild), return the value of this child 
      * @param childCell the child cell as returned by getChild 
@@ -79,13 +81,35 @@ class mPartition : public Neon::domain::internal::bGrid::bPartition<T, C>
     NEON_CUDA_HOST_DEVICE inline auto childVal(const Cell& childCell,
                                                int         card = 0) const -> const T&;
 
+    /**
+     * Check if the cell is refined i.e., has children 
+     * @param cell the cell i.e., parent at which the children are checked 
+     * @return 
+    */
+    NEON_CUDA_HOST_DEVICE inline auto hasChildren(const Cell& cell) const -> bool;
+
+
+    /**
+     * Get a cell that represents the parent of a given cell
+     * @param cell the child cell for which the parent is queried 
+    */
+    NEON_CUDA_HOST_DEVICE inline auto getParent(const Cell& cell) const -> Cell;
+
     /**
      * Given a cell (child), return the value of the parent 
      * @param eId the cell 
      * @param card the cardinality in case of vector-valued data           
     */
-    NEON_CUDA_HOST_DEVICE inline auto parent(const Cell& eId,
-                                             int         card) -> T&;
+    NEON_CUDA_HOST_DEVICE inline auto parentVal(const Cell& eId,
+                                                int         card) -> T&;
+
+    /**
+     * Given a cell (child), return the value of the parent 
+     * @param eId the cell 
+     * @param card the cardinality in case of vector-valued data           
+    */
+    NEON_CUDA_HOST_DEVICE inline auto parentVal(const Cell& eId,
+                                                int         card) const -> const T&;
 
     /**
      * check if the cell has a parent as defined by the user during the construction of the mGrid 
@@ -93,12 +117,8 @@ class mPartition : public Neon::domain::internal::bGrid::bPartition<T, C>
     */
     NEON_CUDA_HOST_DEVICE inline auto hasParent(const Cell& cell) const -> bool;
 
-    /**
-     * Check if the cell is refined i.e., has children 
-     * @param cell the cell i.e., parent at which the children are checked 
-     * @return 
-    */
-    NEON_CUDA_HOST_DEVICE inline auto hasChildren(const Cell& cell) const -> bool;
+    NEON_CUDA_HOST_DEVICE inline auto getUncle(const Cell&   cell,
+                                               Neon::int8_3d direction) const -> Cell;
 
     /**
      * Get the refinement factor i.e., number of children at each dimension
@@ -127,6 +147,7 @@ class mPartition : public Neon::domain::internal::bGrid::bPartition<T, C>
     Cell::Location* mParentLocalID;
     uint32_t*       mMaskLowerLevel;
     uint32_t*       mChildBlockID;
+    uint32_t*       mParentNeighbourBlocks;
     int*            mRefFactors;
     int*            mSpacing;
 };
diff --git a/libNeonDomain/include/Neon/domain/internal/mGrid/mPartition_imp.h b/libNeonDomain/include/Neon/domain/internal/mGrid/mPartition_imp.h
index 50a7eee3..73fcf4ff 100644
--- a/libNeonDomain/include/Neon/domain/internal/mGrid/mPartition_imp.h
+++ b/libNeonDomain/include/Neon/domain/internal/mGrid/mPartition_imp.h
@@ -31,6 +31,7 @@ mPartition<T, C>::mPartition(Neon::DataView  dataView,
                              uint32_t*       mask,
                              uint32_t*       maskLowerLevel,
                              uint32_t*       childBlockID,
+                             uint32_t*       parentNeighbourBlocks,
                              T               outsideValue,
                              nghIdx_t*       stencilNghIndex,
                              int*            refFactors,
@@ -43,6 +44,7 @@ mPartition<T, C>::mPartition(Neon::DataView  dataView,
       mParentLocalID(parentLocalID),
       mMaskLowerLevel(maskLowerLevel),
       mChildBlockID(childBlockID),
+      mParentNeighbourBlocks(parentNeighbourBlocks),
       mRefFactors(refFactors),
       mSpacing(spacing)
 {
@@ -101,29 +103,23 @@ inline NEON_CUDA_HOST_DEVICE auto mPartition<T, C>::childID(const Cell& cell) co
     return mChildBlockID[childPitch];
 }
 
-template <typename T, int C>
-NEON_CUDA_HOST_DEVICE inline auto mPartition<T, C>::hasParent(const Cell& cell) const -> bool
-{
-    if (mMemParent) {
-        return true;
-    }
-    return false;
-}
-
 template <typename T, int C>
 NEON_CUDA_HOST_DEVICE inline auto mPartition<T, C>::getChild(const Cell&   parent_cell,
                                                              Neon::int8_3d child) const -> Cell
 {
     Cell childCell;
-    childCell.mBlockID = childID(parent_cell);
-    childCell.mBlockSize = mRefFactors[mLevel - 1];
-    childCell.mLocation.x = child.x;
-    childCell.mLocation.y = child.y;
-    childCell.mLocation.z = child.z;
-    childCell.mIsActive = childCell.computeIsActive(mMaskLowerLevel);
+    if (hasChildren(parent_cell)) {
+        childCell.mBlockID = childID(parent_cell);
+        childCell.mBlockSize = mRefFactors[mLevel - 1];
+        childCell.mLocation.x = child.x;
+        childCell.mLocation.y = child.y;
+        childCell.mLocation.z = child.z;
+        childCell.mIsActive = childCell.computeIsActive(mMaskLowerLevel);
+    }
     return childCell;
 }
 
+
 template <typename T, int C>
 NEON_CUDA_HOST_DEVICE inline auto mPartition<T, C>::childVal(const Cell& childCell,
                                                              int         card) -> T&
@@ -176,17 +172,58 @@ NEON_CUDA_HOST_DEVICE inline auto mPartition<T, C>::hasChildren(const Cell& cell
 }
 
 template <typename T, int C>
-NEON_CUDA_HOST_DEVICE inline auto mPartition<T, C>::parent(const Cell& eId,
-                                                           int         card) -> T&
+NEON_CUDA_HOST_DEVICE inline auto mPartition<T, C>::getParent(const Cell& cell) const -> Cell
 {
+    Cell parentCell;
     if (mMemParent != nullptr) {
-        Cell parentCell;
-        parentCell.mBlockID = mParentBlockID[eId.mBlockID];
-        parentCell.mLocation = mParentLocalID[eId.mBlockID];
+        parentCell.mBlockID = mParentBlockID[cell.mBlockID];
+        parentCell.mLocation = mParentLocalID[cell.mBlockID];
         parentCell.mBlockSize = mRefFactors[mLevel + 1];
+        parentCell.mIsActive = true;
+    }
+    return parentCell;
+}
+
+template <typename T, int C>
+NEON_CUDA_HOST_DEVICE inline auto mPartition<T, C>::parentVal(const Cell& eId,
+                                                              int         card) -> T&
+{
+    auto parentCell = getParent(eId);
+    if (parentCell.isActive()) {
+        return mMemParent[this->pitch(parentCell, card)];
+    }
+}
+
+template <typename T, int C>
+NEON_CUDA_HOST_DEVICE inline auto mPartition<T, C>::parentVal(const Cell& eId,
+                                                              int         card) const -> const T&
+{
+    auto parentCell = getParent(eId);
+    if (parentCell.isActive()) {
         return mMemParent[this->pitch(parentCell, card)];
     }
 }
 
+template <typename T, int C>
+NEON_CUDA_HOST_DEVICE inline auto mPartition<T, C>::hasParent(const Cell& cell) const -> bool
+{
+    if (mMemParent) {
+        return true;
+    }
+    return false;
+}
+
+template <typename T, int C>
+NEON_CUDA_HOST_DEVICE inline auto mPartition<T, C>::getUncle(const Cell&   cell,
+                                                             Neon::int8_3d direction) const -> Cell
+{
+    Cell uncle = getParent(cell);
+    if (uncle.isActive()) {
+        uncle = this->setNghCell(uncle, direction, mParentNeighbourBlocks);
+        uncle.mIsActive = ngh_cell.mBlockID != std::numeric_limits<uint32_t>::max();
+    }
+    return uncle;
+}
+
 
 }  // namespace Neon::domain::internal::mGrid
\ No newline at end of file
diff --git a/libNeonDomain/src/domain/internal/bGrid/bGrid.cpp b/libNeonDomain/src/domain/internal/bGrid/bGrid.cpp
index 1f616ac4..0692f6a9 100644
--- a/libNeonDomain/src/domain/internal/bGrid/bGrid.cpp
+++ b/libNeonDomain/src/domain/internal/bGrid/bGrid.cpp
@@ -40,7 +40,7 @@ auto bGrid::isInsideDomain(const Neon::index_3d& idx) const -> bool
 
         Cell cell(static_cast<Cell::Location::Integer>((idx.x / mData->voxelSpacing) % mData->blockSize),
                   static_cast<Cell::Location::Integer>((idx.y / mData->voxelSpacing) % mData->blockSize),
-                  static_cast<Cell::Location::Integer>((idx.z / mData->voxelSpacing) % mData->blockSize));        
+                  static_cast<Cell::Location::Integer>((idx.z / mData->voxelSpacing) % mData->blockSize));
 
         cell.mBlockID = *itr;
         cell.mBlockSize = mData->blockSize;
@@ -123,7 +123,7 @@ auto bGrid::getStencilNghIndex() const -> const Neon::set::MemSet_t<nghIdx_t>&
 }
 
 
-auto bGrid::getNeighbourBlocks() const -> const Neon::set::MemSet_t<uint32_t>&
+auto bGrid::getNeighbourBlocks() const -> Neon::set::MemSet_t<uint32_t>&
 {
     return mData->mNeighbourBlocks;
 }
diff --git a/libNeonSkeleton/tests/unit/sUt_multiRes/src/MultiResParent.h b/libNeonSkeleton/tests/unit/sUt_multiRes/src/MultiResParent.h
index 59ce7e25..c631ee76 100644
--- a/libNeonSkeleton/tests/unit/sUt_multiRes/src/MultiResParent.h
+++ b/libNeonSkeleton/tests/unit/sUt_multiRes/src/MultiResParent.h
@@ -70,7 +70,7 @@ void MultiResParent()
                     return [=] NEON_CUDA_HOST_DEVICE(const Neon::domain::mGrid::Cell& cell) mutable {
                         if (xLocal.hasParent(cell)) {
                             hasParentLocal(cell, 0) = 1;
-                            xLocal(cell, 0) = xLocal.parent(cell, 0);
+                            xLocal(cell, 0) = xLocal.parentVal(cell, 0);
                         } else {
                             hasParentLocal(cell, 0) = -1;
                         }
@@ -184,11 +184,11 @@ void MultiResAtomicAddParent()
                         if (xLocal.hasParent(cell)) {
 
 #ifdef NEON_PLACE_CUDA_DEVICE
-                            atomicAdd(&xLocal.parent(cell, 0), xLocal(cell, 0));
+                            atomicAdd(&xLocal.parentVal(cell, 0), xLocal(cell, 0));
 #else
 
 #pragma omp atomic
-                            xLocal.parent(cell, 0) += xLocal(cell, 0);
+                            xLocal.parentVal(cell, 0) += xLocal(cell, 0);
 #endif
                         }
                     };
diff --git a/libNeonSkeleton/tests/unit/sUt_multiRes/src/MultiResSkeleton.h b/libNeonSkeleton/tests/unit/sUt_multiRes/src/MultiResSkeleton.h
index f4584800..324a7577 100644
--- a/libNeonSkeleton/tests/unit/sUt_multiRes/src/MultiResSkeleton.h
+++ b/libNeonSkeleton/tests/unit/sUt_multiRes/src/MultiResSkeleton.h
@@ -77,9 +77,9 @@ void MultiResSkeleton()
                     return [=] NEON_CUDA_HOST_DEVICE(const typename Neon::domain::bGrid::Cell& cell) mutable {
                         assert(local.hasParent(cell));
 
-                        local(cell, 0) = local.parent(cell, 0);
-                        local(cell, 1) = local.parent(cell, 1);
-                        local(cell, 2) = local.parent(cell, 2);
+                        local(cell, 0) = local.parentVal(cell, 0);
+                        local(cell, 1) = local.parentVal(cell, 1);
+                        local(cell, 2) = local.parentVal(cell, 2);
                     };
                 }));
         }

From 544c9d388fa7139d6026edfebbd657ad3222651f Mon Sep 17 00:00:00 2001
From: Ahmed Mahmoud <ahmed.mahmoud@autodesk.com>
Date: Tue, 17 Jan 2023 12:29:07 -0500
Subject: [PATCH 06/24] same-level stream filtering

---
 apps/lbmMultiRes/lbmMultiRes.cu               | 48 ++++++++++++-------
 .../Neon/domain/internal/mGrid/mPartition.h   |  8 ++++
 .../domain/internal/mGrid/mPartition_imp.h    | 16 ++++++-
 3 files changed, 53 insertions(+), 19 deletions(-)

diff --git a/apps/lbmMultiRes/lbmMultiRes.cu b/apps/lbmMultiRes/lbmMultiRes.cu
index c222d4ab..0b84b5ae 100644
--- a/apps/lbmMultiRes/lbmMultiRes.cu
+++ b/apps/lbmMultiRes/lbmMultiRes.cu
@@ -213,8 +213,8 @@ Neon::set::Container stream(Neon::domain::mGrid&                 grid,
                             const Neon::domain::mGrid::Field<T>& fpop_postcollision,
                             Neon::domain::mGrid::Field<T>&       fpop_poststreaming)
 {
-    //TODO
     //regular Streaming of the normal voxels at level L which are not interfaced with L+1 and L-1 levels.
+    //This is "pull" stream
 
     return grid.getContainer(
         "Stream" + std::to_string(level), level,
@@ -224,18 +224,29 @@ Neon::set::Container stream(Neon::domain::mGrid&                 grid,
 
             return [=] NEON_CUDA_HOST_DEVICE(const typename Neon::domain::bGrid::Cell& cell) mutable {
                 auto parent = fpost_stm.getParent(cell);
-                bool hasChildren = fpost_stm.hasChildren(cell);
 
-                for (int i = 0; i < Q; ++i) {
-                    //TODO filter out cells that interface with L+1 or L-1
-                    //TODO figure out the right direction to push/pull the data in the i-direction
-                    // fpost_stm(cell, i) = fpost_col(cell, i);
 
-                    //if (parent.isActive()) {
-                    //    auto uncle = fpost_stm.setNghCell()
-                    //}
+                //If this cell has children i.e., it is been refined, that we should not work on it
+                //because this cell is only there to allow query and not to operate on
+                if (!fpost_stm.hasChildren(cell)) {
+
+                    for (int8_t q = 0; q < Q; ++q) {
+                        Neon::int8_3d dir;
+                        if constexpr (DIM == 2) {
+                            dir = Neon::int8_3d(-latticeVelocity2D[q][0], -latticeVelocity2D[q][1], 0);
+                        }
+                        if constexpr (DIM == 3) {
+                            dir = Neon::int8_3d(-latticeVelocity3D[q][0], -latticeVelocity3D[q][1], -latticeVelocity3D[q][2]);
+                        }
+
+                        auto neighbor = fpost_col.nghVal(cell, dir, q, T(0));
 
-                    if (hasChildren) {
+                        if (neighbor.isValid) {
+                            //if the neighbor cell has children, then this 'cell' is interfacing with L-1 along q direction
+                            if (!fpost_stm.hasChildren(cell, dir)) {
+                                fpost_stm(cell, q) = neighbor.value;
+                            }
+                        }
                     }
                 }
             };
@@ -262,7 +273,7 @@ Neon::set::Container explosionPull(Neon::domain::mGrid&                 grid,
             auto        fpost_stm = fpop_poststreaming.load(loader, level, Neon::MultiResCompute::MAP);
 
             return [=] NEON_CUDA_HOST_DEVICE(const typename Neon::domain::bGrid::Cell& cell) mutable {
-                for (int i = 0; i < Q; ++i) {
+                for (int8_t q = 0; q < Q; ++q) {
                 }
             };
         });
@@ -369,16 +380,16 @@ int main(int argc, char** argv)
         using T = double;
 
         //Neon grid
-        auto             runtime = Neon::Runtime::stream;
+        auto             runtime = Neon::Runtime::openmp;
         std::vector<int> gpu_ids{0};
         Neon::Backend    backend(gpu_ids, runtime);
 
         constexpr int DIM = 2;
         constexpr int Q = (DIM == 2) ? 9 : 19;
 
-        const int dim_x = 4;
-        const int dim_y = 4;
-        const int dim_z = (DIM < 3) ? 1 : 4;
+        const int dim_x = 12;
+        const int dim_y = 12;
+        const int dim_z = (DIM < 3) ? 4 : 4;
 
         const Neon::index_3d grid_dim(dim_x, dim_y, dim_z);
 
@@ -388,16 +399,17 @@ int main(int argc, char** argv)
         Neon::domain::mGrid grid(
             backend, grid_dim,
             {[&](const Neon::index_3d id) -> bool {
-                 return true;
+                 return id.x > 7;
              },
-             [&](const Neon::index_3d&) -> bool {
-                 return true;
+             [&](const Neon::index_3d& id) -> bool {
+                 return id.x > 3;
              },
              [&](const Neon::index_3d&) -> bool {
                  return true;
              }},
             create_stencil<DIM, Q>(), descriptor);
 
+        //grid.topologyToVTK("lbm.vtk", false);
 
         //LBM problem
         const int max_iter = 300;
diff --git a/libNeonDomain/include/Neon/domain/internal/mGrid/mPartition.h b/libNeonDomain/include/Neon/domain/internal/mGrid/mPartition.h
index 7ba3709c..8995401c 100644
--- a/libNeonDomain/include/Neon/domain/internal/mGrid/mPartition.h
+++ b/libNeonDomain/include/Neon/domain/internal/mGrid/mPartition.h
@@ -88,6 +88,14 @@ class mPartition : public Neon::domain::internal::bGrid::bPartition<T, C>
     */
     NEON_CUDA_HOST_DEVICE inline auto hasChildren(const Cell& cell) const -> bool;
 
+    /**
+     * Check if a neighbor to 'cell' has children 
+     * @param cell 
+     * @param cell 
+     * @return 
+    */
+    NEON_CUDA_HOST_DEVICE inline auto hasChildren(const Cell& cell, const Neon::int8_3d nghDir) const -> bool;
+
 
     /**
      * Get a cell that represents the parent of a given cell
diff --git a/libNeonDomain/include/Neon/domain/internal/mGrid/mPartition_imp.h b/libNeonDomain/include/Neon/domain/internal/mGrid/mPartition_imp.h
index 73fcf4ff..986abe29 100644
--- a/libNeonDomain/include/Neon/domain/internal/mGrid/mPartition_imp.h
+++ b/libNeonDomain/include/Neon/domain/internal/mGrid/mPartition_imp.h
@@ -171,6 +171,20 @@ NEON_CUDA_HOST_DEVICE inline auto mPartition<T, C>::hasChildren(const Cell& cell
     return true;
 }
 
+template <typename T, int C>
+NEON_CUDA_HOST_DEVICE inline auto mPartition<T, C>::hasChildren(const Cell& cell, const Neon::int8_3d nghDir) const -> bool
+{
+    if (mMemChild == nullptr || mMaskLowerLevel == nullptr || mLevel == 0) {
+        return false;
+    }
+
+    Cell nghCell = this->setNghCell(cell, nghDir, this->getneighbourBlocksPtr(cell));
+    if (!nghCell.isActive()) {
+        return false;
+    }
+    return hasChildren(nghCell);
+}
+
 template <typename T, int C>
 NEON_CUDA_HOST_DEVICE inline auto mPartition<T, C>::getParent(const Cell& cell) const -> Cell
 {
@@ -220,7 +234,7 @@ NEON_CUDA_HOST_DEVICE inline auto mPartition<T, C>::getUncle(const Cell&   cell,
     Cell uncle = getParent(cell);
     if (uncle.isActive()) {
         uncle = this->setNghCell(uncle, direction, mParentNeighbourBlocks);
-        uncle.mIsActive = ngh_cell.mBlockID != std::numeric_limits<uint32_t>::max();
+        uncle.mIsActive = uncle.mBlockID != std::numeric_limits<uint32_t>::max();
     }
     return uncle;
 }

From fe52e7898363bfa19db6734e3bf9222310d42770 Mon Sep 17 00:00:00 2001
From: Ahmed Mahmoud <ahmed.mahmoud@autodesk.com>
Date: Tue, 17 Jan 2023 19:47:44 -0500
Subject: [PATCH 07/24] Explosion Pull

---
 apps/lbmMultiRes/lbmMultiRes.cu               | 72 ++++++++++++++++---
 .../Neon/domain/internal/bGrid/bPartition.h   |  3 +-
 .../domain/internal/bGrid/bPartition_imp.h    | 10 ++-
 .../Neon/domain/internal/mGrid/mPartition.h   |  5 ++
 .../domain/internal/mGrid/mPartition_imp.h    | 19 +++++
 5 files changed, 95 insertions(+), 14 deletions(-)

diff --git a/apps/lbmMultiRes/lbmMultiRes.cu b/apps/lbmMultiRes/lbmMultiRes.cu
index 0b84b5ae..dcf7ed64 100644
--- a/apps/lbmMultiRes/lbmMultiRes.cu
+++ b/apps/lbmMultiRes/lbmMultiRes.cu
@@ -120,6 +120,27 @@ inline void exportVTI(const int t, Field& field)
 }
 
 
+NEON_CUDA_HOST_DEVICE inline Neon::int8_3d explosionUnlceOffset(const Neon::domain::bGrid::Cell& cell, const Neon::int8_3d& q)
+{
+    //given a local index within a cell and a population direction (q)
+    //find the uncle's (the parent neighbor) offset from which the desired population (q) should be read
+    //this offset is wrt the cell containing the localID (i.e., the parent of localID)
+
+    auto off = [](const int8_t i, const int8_t j) {
+        //0, -1 --> -1
+        //1, -1 --> 0
+        //0, 1 --> 0
+        //0, 0 --> 0
+        //1, 1 --> 1
+        const int8_t s = i + j;
+        return (s <= 0) ? s : s - 1;
+    };
+
+    Neon::int8_3d offset(off(cell.mLocation.x, q.x), off(cell.mLocation.y, q.y), off(cell.mLocation.z, q.z));
+    return offset;
+}
+
+
 template <typename T>
 NEON_CUDA_HOST_DEVICE T computeOmega(T omega0, int level, int num_levels)
 {
@@ -223,9 +244,6 @@ Neon::set::Container stream(Neon::domain::mGrid&                 grid,
             auto        fpost_stm = fpop_poststreaming.load(loader, level, Neon::MultiResCompute::MAP);
 
             return [=] NEON_CUDA_HOST_DEVICE(const typename Neon::domain::bGrid::Cell& cell) mutable {
-                auto parent = fpost_stm.getParent(cell);
-
-
                 //If this cell has children i.e., it is been refined, that we should not work on it
                 //because this cell is only there to allow query and not to operate on
                 if (!fpost_stm.hasChildren(cell)) {
@@ -238,12 +256,10 @@ Neon::set::Container stream(Neon::domain::mGrid&                 grid,
                         if constexpr (DIM == 3) {
                             dir = Neon::int8_3d(-latticeVelocity3D[q][0], -latticeVelocity3D[q][1], -latticeVelocity3D[q][2]);
                         }
-
-                        auto neighbor = fpost_col.nghVal(cell, dir, q, T(0));
-
-                        if (neighbor.isValid) {
-                            //if the neighbor cell has children, then this 'cell' is interfacing with L-1 along q direction
-                            if (!fpost_stm.hasChildren(cell, dir)) {
+                        //if the neighbor cell has children, then this 'cell' is interfacing with L-1 (fine) along q direction
+                        if (!fpost_stm.hasChildren(cell, dir)) {
+                            auto neighbor = fpost_col.nghVal(cell, dir, q, T(0));
+                            if (neighbor.isValid) {
                                 fpost_stm(cell, q) = neighbor.value;
                             }
                         }
@@ -259,7 +275,6 @@ Neon::set::Container explosionPull(Neon::domain::mGrid&                 grid,
                                    const Neon::domain::mGrid::Field<T>& fpop_postcollision,
                                    Neon::domain::mGrid::Field<T>&       fpop_poststreaming)
 {
-    //TODO
     // Initiated by the fine level (hence "pull"), this function performs a coarse (level+1) to
     // fine (level) communication or "explosion" by simply distributing copies of coarse grid onto the fine grid.
     // In other words, this function updates the "halo" cells of the fine level by making copies of the coarse cell
@@ -274,6 +289,39 @@ Neon::set::Container explosionPull(Neon::domain::mGrid&                 grid,
 
             return [=] NEON_CUDA_HOST_DEVICE(const typename Neon::domain::bGrid::Cell& cell) mutable {
                 for (int8_t q = 0; q < Q; ++q) {
+                    //If this cell has children i.e., it is been refined, that we should not work on it
+                    //because this cell is only there to allow query and not to operate on
+                    if (!fpost_stm.hasChildren(cell)) {
+                        Neon::int8_3d dir;
+                        if constexpr (DIM == 2) {
+                            dir = Neon::int8_3d(-latticeVelocity2D[q][0], -latticeVelocity2D[q][1], 0);
+                        }
+                        if constexpr (DIM == 3) {
+                            dir = Neon::int8_3d(-latticeVelocity3D[q][0], -latticeVelocity3D[q][1], -latticeVelocity3D[q][2]);
+                        }
+
+                        //if the neighbor cell has children, then this 'cell' is interfacing with L-1 (fine) along q direction
+                        //we want to only work on cells that interface with L+1 (coarse) cell along q                        
+                        if (!fpost_stm.hasChildren(cell, dir)) {
+
+                            //try to query the cell along this direction (opposite of the population direction) as we do
+                            //in 'normal' streaming
+                            auto neighborCell = fpost_col.setNghCell(cell, dir);
+                            if (!neighborCell.isActive()) {
+                                //only if we can not do normal streaming, then we may have a coarser neighbor from which
+                                //we can read this pop
+
+                                //get the uncle direction/offset i.e., the neighbor of the cell's parent 
+                                //this direction/offset is wrt to the cell's parent 
+                                Neon::int8_3d uncleDir = explosionUnlceOffset(cell, dir);
+
+                                auto uncle = fpost_col.uncleVal(cell, uncleDir, q, T(0));
+                                if (uncle.isValid) {
+                                    fpost_stm(cell, q) = uncle.value;
+                                }
+                            }
+                        }
+                    }
                 }
             };
         });
@@ -290,7 +338,7 @@ Neon::set::Container coalescencePull(Neon::domain::mGrid&           grid,
     // (level) communication or "coalescence" by simply averaging the fine data stored in self.fpop_halo
 
     return grid.getContainer(
-        "explosionPull" + std::to_string(level), level,
+        "coalescencePull" + std::to_string(level), level,
         [=](Neon::set::Loader& loader) {
             auto fpost_stm = fpop_poststreaming.load(loader, level, Neon::MultiResCompute::STENCIL_DOWN);
 
@@ -372,6 +420,7 @@ void nonUniformTimestepRecursive(Neon::domain::mGrid&               grid,
     stream<T, DIM, Q>(grid, level, max_level, fout, fin, containers);
 }
 
+
 int main(int argc, char** argv)
 {
     Neon::init();
@@ -411,6 +460,7 @@ int main(int argc, char** argv)
 
         //grid.topologyToVTK("lbm.vtk", false);
 
+
         //LBM problem
         const int max_iter = 300;
         const T   ulb = 0.01;
diff --git a/libNeonDomain/include/Neon/domain/internal/bGrid/bPartition.h b/libNeonDomain/include/Neon/domain/internal/bGrid/bPartition.h
index 755d5f20..b2b29642 100644
--- a/libNeonDomain/include/Neon/domain/internal/bGrid/bPartition.h
+++ b/libNeonDomain/include/Neon/domain/internal/bGrid/bPartition.h
@@ -62,9 +62,10 @@ class bPartition
 
     NEON_CUDA_HOST_DEVICE inline Neon::index_3d mapToGlobal(const Cell& cell) const;
 
+    inline NEON_CUDA_HOST_DEVICE auto setNghCell(const Cell& cell, const nghIdx_t& offset) const -> Cell;
 
    protected:
-    inline NEON_CUDA_HOST_DEVICE auto pitch(const Cell& cell, int card) const -> uint32_t;
+    inline NEON_CUDA_HOST_DEVICE auto pitch(const Cell& cell, int card) const -> uint32_t;    
     inline NEON_CUDA_HOST_DEVICE auto setNghCell(const Cell& cell, const nghIdx_t& offset, const uint32_t* neighbourBlocks) const -> Cell;
     inline NEON_CUDA_HOST_DEVICE auto shmemPitch(Cell cell, const int card) const -> Cell::Location::Integer;    
     inline NEON_CUDA_HOST_DEVICE auto getneighbourBlocksPtr(const Cell& cell) const -> const uint32_t*;
diff --git a/libNeonDomain/include/Neon/domain/internal/bGrid/bPartition_imp.h b/libNeonDomain/include/Neon/domain/internal/bGrid/bPartition_imp.h
index 04e56645..c6681374 100644
--- a/libNeonDomain/include/Neon/domain/internal/bGrid/bPartition_imp.h
+++ b/libNeonDomain/include/Neon/domain/internal/bGrid/bPartition_imp.h
@@ -120,6 +120,11 @@ NEON_CUDA_HOST_DEVICE inline auto bPartition<T, C>::getneighbourBlocksPtr(const
     }
 }
 
+template <typename T, int C>
+NEON_CUDA_HOST_DEVICE inline auto bPartition<T, C>::setNghCell(const Cell& cell, const nghIdx_t& offset) const -> Cell
+{
+    return setNghCell(cell, offset, getneighbourBlocksPtr(cell));
+}
 
 template <typename T, int C>
 NEON_CUDA_HOST_DEVICE inline auto bPartition<T, C>::setNghCell(const Cell&     cell,
@@ -168,6 +173,7 @@ NEON_CUDA_HOST_DEVICE inline auto bPartition<T, C>::setNghCell(const Cell&     c
     } else {
         ngh_cell.mBlockID = cell.mBlockID;
     }
+    ngh_cell.mIsActive = (ngh_cell.mBlockID != std::numeric_limits<uint32_t>::max());
     return ngh_cell;
 }
 
@@ -201,7 +207,7 @@ NEON_CUDA_HOST_DEVICE inline auto bPartition<T, C>::nghVal(const Cell&     cell,
 
         Cell ngh_cell = setNghCell(swirl_cell, offset, getneighbourBlocksPtr(swirl_cell));
         ngh_cell.mBlockSize = cell.mBlockSize;
-        if (ngh_cell.mBlockID != std::numeric_limits<uint32_t>::max()) {
+        if (ngh_cell.isActive()) {
             //TODO maybe ngh_cell should be mapped to its memory layout
             ret.isValid = ngh_cell.computeIsActive(mMask);
             if (ret.isValid) {
@@ -216,7 +222,7 @@ NEON_CUDA_HOST_DEVICE inline auto bPartition<T, C>::nghVal(const Cell&     cell,
 
     } else {
         Cell ngh_cell = setNghCell(cell, offset, getneighbourBlocksPtr(cell));
-        if (ngh_cell.mBlockID != std::numeric_limits<uint32_t>::max()) {
+        if (ngh_cell.isActive()) {
             ret.isValid = ngh_cell.computeIsActive(mMask);
             if (ret.isValid) {
                 if (mIsInSharedMem) {
diff --git a/libNeonDomain/include/Neon/domain/internal/mGrid/mPartition.h b/libNeonDomain/include/Neon/domain/internal/mGrid/mPartition.h
index 8995401c..ab9d3042 100644
--- a/libNeonDomain/include/Neon/domain/internal/mGrid/mPartition.h
+++ b/libNeonDomain/include/Neon/domain/internal/mGrid/mPartition.h
@@ -128,6 +128,11 @@ class mPartition : public Neon::domain::internal::bGrid::bPartition<T, C>
     NEON_CUDA_HOST_DEVICE inline auto getUncle(const Cell&   cell,
                                                Neon::int8_3d direction) const -> Cell;
 
+    NEON_CUDA_HOST_DEVICE inline auto uncleVal(const Cell&   cell,
+                                               Neon::int8_3d direction,
+                                               int           card,
+                                               const T&      alternativeVal) const -> NghInfo<T>;
+
     /**
      * Get the refinement factor i.e., number of children at each dimension
      * @param level at which the refinement factor is queried      
diff --git a/libNeonDomain/include/Neon/domain/internal/mGrid/mPartition_imp.h b/libNeonDomain/include/Neon/domain/internal/mGrid/mPartition_imp.h
index 986abe29..9e427b7b 100644
--- a/libNeonDomain/include/Neon/domain/internal/mGrid/mPartition_imp.h
+++ b/libNeonDomain/include/Neon/domain/internal/mGrid/mPartition_imp.h
@@ -234,10 +234,29 @@ NEON_CUDA_HOST_DEVICE inline auto mPartition<T, C>::getUncle(const Cell&   cell,
     Cell uncle = getParent(cell);
     if (uncle.isActive()) {
         uncle = this->setNghCell(uncle, direction, mParentNeighbourBlocks);
+        uncle.mBlockSize = mRefFactors[mLevel + 1];
         uncle.mIsActive = uncle.mBlockID != std::numeric_limits<uint32_t>::max();
     }
     return uncle;
 }
 
+template <typename T, int C>
+NEON_CUDA_HOST_DEVICE inline auto mPartition<T, C>::uncleVal(const Cell&   cell,
+                                                             Neon::int8_3d direction,
+                                                             int           card,
+                                                             const T&      alternativeVal) const -> NghInfo<T>
+{
+    NghInfo<T> ret;
+    ret.value = alternativeVal;
+    ret.isValid = false;
+
+    Cell uncle = getUncle(cell, direction);
+    ret.isValid = uncle.isActive();
+    if (ret.isValid) {
+        ret.value = mMemParent[this->pitch(uncle, card)];
+    }
+    return ret;
+}
+
 
 }  // namespace Neon::domain::internal::mGrid
\ No newline at end of file

From d0c132ad12465f61e028ff78ffb48bd829c716a8 Mon Sep 17 00:00:00 2001
From: Ahmed Mahmoud <ahmed.mahmoud@autodesk.com>
Date: Tue, 24 Jan 2023 15:11:36 -0500
Subject: [PATCH 08/24] store() and coalescencePull()

---
 apps/lbmMultiRes/lbmMultiRes.cu               | 236 ++++++++++++++----
 .../Neon/core/types/vec/vec3d_integer.tdecl.h |   2 +
 .../Neon/core/types/vec/vec3d_integer.timp.h  |   7 +
 3 files changed, 190 insertions(+), 55 deletions(-)

diff --git a/apps/lbmMultiRes/lbmMultiRes.cu b/apps/lbmMultiRes/lbmMultiRes.cu
index dcf7ed64..0cfb6efe 100644
--- a/apps/lbmMultiRes/lbmMultiRes.cu
+++ b/apps/lbmMultiRes/lbmMultiRes.cu
@@ -69,6 +69,18 @@ NEON_CUDA_DEVICE_ONLY static constexpr char latticeVelocity3D[27][3] = {
 
 };
 
+
+template <int DIM>
+NEON_CUDA_HOST_DEVICE Neon::int8_3d getDir(const int8_t q)
+{
+    if constexpr (DIM == 2) {
+        return Neon::int8_3d(latticeVelocity2D[q][0], latticeVelocity2D[q][1], 0);
+    }
+    if constexpr (DIM == 3) {
+        return Neon::int8_3d(latticeVelocity3D[q][0], latticeVelocity3D[q][1], latticeVelocity3D[q][2]);
+    }
+}
+
 template <int DIM, int Q>
 struct latticeWeight
 {
@@ -119,8 +131,8 @@ inline void exportVTI(const int t, Field& field)
     field.ioToVtk(fname, "field");
 }
 
-
-NEON_CUDA_HOST_DEVICE inline Neon::int8_3d explosionUnlceOffset(const Neon::domain::bGrid::Cell& cell, const Neon::int8_3d& q)
+template <typename T>
+NEON_CUDA_HOST_DEVICE inline Neon::int8_3d unlceOffset(const T& cell, const Neon::int8_3d& q)
 {
     //given a local index within a cell and a population direction (q)
     //find the uncle's (the parent neighbor) offset from which the desired population (q) should be read
@@ -129,14 +141,14 @@ NEON_CUDA_HOST_DEVICE inline Neon::int8_3d explosionUnlceOffset(const Neon::doma
     auto off = [](const int8_t i, const int8_t j) {
         //0, -1 --> -1
         //1, -1 --> 0
-        //0, 1 --> 0
         //0, 0 --> 0
+        //0, 1 --> 0
         //1, 1 --> 1
         const int8_t s = i + j;
         return (s <= 0) ? s : s - 1;
     };
 
-    Neon::int8_3d offset(off(cell.mLocation.x, q.x), off(cell.mLocation.y, q.y), off(cell.mLocation.z, q.z));
+    Neon::int8_3d offset(off(cell.x, q.x), off(cell.y, q.y), off(cell.z, q.z));
     return offset;
 }
 
@@ -194,35 +206,38 @@ Neon::set::Container collide(Neon::domain::mGrid&                 grid,
             const T     omega = computeOmega(omega0, level, max_level);
 
             return [=] NEON_CUDA_HOST_DEVICE(const typename Neon::domain::bGrid::Cell& cell) mutable {
-                constexpr auto t = latticeWeight<DIM, Q>();
+                if (!in.hasChildren(cell)) {
 
-                //fin
-                T ins[Q];
-                for (int i = 0; i < Q; ++i) {
-                    ins[i] = in(cell, i);
-                }
+                    constexpr auto t = latticeWeight<DIM, Q>();
 
-                //density
-                T rho = 0;
-                for (int i = 0; i < Q; ++i) {
-                    rho += ins[i];
-                }
+                    //fin
+                    T ins[Q];
+                    for (int i = 0; i < Q; ++i) {
+                        ins[i] = in(cell, i);
+                    }
 
-                //velocity
-                const Neon::Vec_3d<T> vel = velocity<T, DIM, Q>(ins, rho);
+                    //density
+                    T rho = 0;
+                    for (int i = 0; i < Q; ++i) {
+                        rho += ins[i];
+                    }
 
+                    //velocity
+                    const Neon::Vec_3d<T> vel = velocity<T, DIM, Q>(ins, rho);
 
-                const T usqr = (3.0 / 2.0) * (vel.x * vel.x + vel.y * vel.y + vel.z * vel.z);
-                for (int i = 0; i < Q; ++i) {
-                    T cu = 0;
-                    for (int d = 0; d < DIM; ++d) {
-                        cu += latticeVelocity2D[i][d] * vel.v[d];
-                    }
-                    //equilibrium
-                    T feq = rho * t.t[i] * (1. + cu + 0.5 * cu * cu - usqr);
 
-                    //collide
-                    out(cell, i) = ins[i] - omega * (ins[i] - feq);
+                    const T usqr = (3.0 / 2.0) * (vel.x * vel.x + vel.y * vel.y + vel.z * vel.z);
+                    for (int i = 0; i < Q; ++i) {
+                        T cu = 0;
+                        for (int d = 0; d < DIM; ++d) {
+                            cu += latticeVelocity2D[i][d] * vel.v[d];
+                        }
+                        //equilibrium
+                        T feq = rho * t.t[i] * (1. + cu + 0.5 * cu * cu - usqr);
+
+                        //collide
+                        out(cell, i) = ins[i] - omega * (ins[i] - feq);
+                    }
                 }
             };
         });
@@ -244,18 +259,13 @@ Neon::set::Container stream(Neon::domain::mGrid&                 grid,
             auto        fpost_stm = fpop_poststreaming.load(loader, level, Neon::MultiResCompute::MAP);
 
             return [=] NEON_CUDA_HOST_DEVICE(const typename Neon::domain::bGrid::Cell& cell) mutable {
-                //If this cell has children i.e., it is been refined, that we should not work on it
+                //If this cell has children i.e., it is been refined, than we should not work on it
                 //because this cell is only there to allow query and not to operate on
                 if (!fpost_stm.hasChildren(cell)) {
 
                     for (int8_t q = 0; q < Q; ++q) {
-                        Neon::int8_3d dir;
-                        if constexpr (DIM == 2) {
-                            dir = Neon::int8_3d(-latticeVelocity2D[q][0], -latticeVelocity2D[q][1], 0);
-                        }
-                        if constexpr (DIM == 3) {
-                            dir = Neon::int8_3d(-latticeVelocity3D[q][0], -latticeVelocity3D[q][1], -latticeVelocity3D[q][2]);
-                        }
+                        const Neon::int8_3d dir = -getDir<DIM>(q);
+
                         //if the neighbor cell has children, then this 'cell' is interfacing with L-1 (fine) along q direction
                         if (!fpost_stm.hasChildren(cell, dir)) {
                             auto neighbor = fpost_col.nghVal(cell, dir, q, T(0));
@@ -288,20 +298,15 @@ Neon::set::Container explosionPull(Neon::domain::mGrid&                 grid,
             auto        fpost_stm = fpop_poststreaming.load(loader, level, Neon::MultiResCompute::MAP);
 
             return [=] NEON_CUDA_HOST_DEVICE(const typename Neon::domain::bGrid::Cell& cell) mutable {
-                for (int8_t q = 0; q < Q; ++q) {
-                    //If this cell has children i.e., it is been refined, that we should not work on it
-                    //because this cell is only there to allow query and not to operate on
-                    if (!fpost_stm.hasChildren(cell)) {
-                        Neon::int8_3d dir;
-                        if constexpr (DIM == 2) {
-                            dir = Neon::int8_3d(-latticeVelocity2D[q][0], -latticeVelocity2D[q][1], 0);
-                        }
-                        if constexpr (DIM == 3) {
-                            dir = Neon::int8_3d(-latticeVelocity3D[q][0], -latticeVelocity3D[q][1], -latticeVelocity3D[q][2]);
-                        }
+                //If this cell has children i.e., it is been refined, that we should not work on it
+                //because this cell is only there to allow query and not to operate on
+                if (!fpost_stm.hasChildren(cell)) {
+                    for (int8_t q = 0; q < Q; ++q) {
+
+                        const Neon::int8_3d dir = -getDir<DIM>(q);
 
                         //if the neighbor cell has children, then this 'cell' is interfacing with L-1 (fine) along q direction
-                        //we want to only work on cells that interface with L+1 (coarse) cell along q                        
+                        //we want to only work on cells that interface with L+1 (coarse) cell along q
                         if (!fpost_stm.hasChildren(cell, dir)) {
 
                             //try to query the cell along this direction (opposite of the population direction) as we do
@@ -311,9 +316,9 @@ Neon::set::Container explosionPull(Neon::domain::mGrid&                 grid,
                                 //only if we can not do normal streaming, then we may have a coarser neighbor from which
                                 //we can read this pop
 
-                                //get the uncle direction/offset i.e., the neighbor of the cell's parent 
-                                //this direction/offset is wrt to the cell's parent 
-                                Neon::int8_3d uncleDir = explosionUnlceOffset(cell, dir);
+                                //get the uncle direction/offset i.e., the neighbor of the cell's parent
+                                //this direction/offset is wrt to the cell's parent
+                                Neon::int8_3d uncleDir = unlceOffset(cell.mLocation, dir);
 
                                 auto uncle = fpost_col.uncleVal(cell, uncleDir, q, T(0));
                                 if (uncle.isValid) {
@@ -333,22 +338,124 @@ Neon::set::Container coalescencePull(Neon::domain::mGrid&           grid,
                                      int                            level,
                                      Neon::domain::mGrid::Field<T>& fpop_poststreaming)
 {
-    //TODO
-    // Initiated by the coarse level (hence "pull"), this function performs fine (level-1) to coarse
-    // (level) communication or "coalescence" by simply averaging the fine data stored in self.fpop_halo
+    // Initiated by the coarse level (hence "pull"), this function simply read the missing population
+    // across the interface between coarse<->fine boundary by reading the population prepare during the store()
 
     return grid.getContainer(
         "coalescencePull" + std::to_string(level), level,
+        [=](Neon::set::Loader& loader) {
+            auto fpost_stm = fpop_poststreaming.load(loader, level, Neon::MultiResCompute::STENCIL);
+
+            return [=] NEON_CUDA_HOST_DEVICE(const typename Neon::domain::bGrid::Cell& cell) mutable {
+                //If this cell has children i.e., it is been refined, than we should not work on it
+                //because this cell is only there to allow query and not to operate on
+                if (!fpost_stm.hasChildren(cell)) {
+
+                    for (int q = 0; q < Q; ++q) {
+                        const Neon::int8_3d dir = -getDir<DIM>(q);
+                        //if we have a neighbor at the same level that has been refined, then cell is on
+                        //the interface and this is where we should do the coalescence
+                        if (fpost_stm.hasChildren(cell, dir)) {
+                            auto neighbor = fpost_stm.nghVal(cell, dir, q, T(0));
+                            if (neighbor.isValid) {
+                                fpost_stm(cell, q) = neighbor.value;
+                            }
+                        }
+                    }
+                }
+            };
+        });
+}
+
+
+template <typename T, int DIM, int Q>
+Neon::set::Container store(Neon::domain::mGrid&           grid,
+                           int                            level,
+                           Neon::domain::mGrid::Field<T>& fpop_poststreaming)
+{
+    //Initiated by the coarse level (level), this function prepares and stores the fine (level - 1)
+    // information for further pulling initiated by the coarse (this) level invoked by coalescence_pull
+    //
+    //Where a coarse cell stores its information? at itself i.e., pull
+    //Where a coarse cell reads the needed info? from its children and neighbor cell's children (level -1)
+    //This function only operates on a coarse cell that has children.
+    //For such cell, we check its neighbor cells at the same level. If any of these neighbor has NO
+    //children, then we need to prepare something for them to be read during coalescence. What
+    //we prepare is some sort of averaged the data from the children (the cell's children and/or
+    //its neighbor's children)
+
+    return grid.getContainer(
+        "store" + std::to_string(level), level,
         [=](Neon::set::Loader& loader) {
             auto fpost_stm = fpop_poststreaming.load(loader, level, Neon::MultiResCompute::STENCIL_DOWN);
 
             return [=] NEON_CUDA_HOST_DEVICE(const typename Neon::domain::bGrid::Cell& cell) mutable {
-                for (int i = 0; i < Q; ++i) {
+                //if the cell is refined, we might need to store something in it for its neighbor
+                if (fpost_stm.hasChildren(cell)) {
+
+                    const int refFactor = fpost_stm.getRefFactor(level);
+
+                    bool should_accumelate =  ((int(fpost_stm(cell, 0)) % refFactor) != 0);
+
+                    fpost_stm(cell, 0) += 1;
+
+
+                    //for each direction aka for each neighbor
+                    //we skip the center here
+                    for (int8_t q = 1; q < Q; ++q) {
+                        const Neon::int8_3d q_dir = getDir<DIM>(q);
+
+                        //check if the neighbor in this direction has children
+                        if (!fpost_stm.hasChildren(cell, q_dir)) {
+                            //now, we know that there is actually something we need to store for this neighbor
+                            //in cell along q (q_dir) direction
+                            int num = 0;
+                            T   sum = 0;
+
+
+                            //for every neighbor cell including the center cell (i.e., cell)
+                            for (int8_t p = 0; p < Q; ++p) {
+                                const Neon::int8_3d p_dir = getDir<DIM>(p);
+
+                                //relative direction of q w.r.t p
+                                //i.e., in which direction we should move starting from p to land on q
+                                const Neon::int8_3d r_dir = q_dir - p_dir;
+
+                                //if this neighbor is refined
+                                if (fpost_stm.hasChildren(cell, p_dir)) {
+
+                                    //for each children of p
+                                    for (int8_t i = 0; i < refFactor; ++i) {
+                                        for (int8_t j = 0; j < refFactor; ++j) {
+                                            for (int8_t k = 0; k < refFactor; ++k) {
+                                                const Neon::int8_3d c(i, j, k);
+
+                                                //cq is coarse neighbor (i.e., uncle) that we need to go in order to read q
+                                                //for c (this is what we do for explosion but here we do this just for the check)
+                                                const Neon::int8_3d cq = unlceOffset(c, q_dir);
+                                                if (cq == r_dir) {
+                                                    num++;
+                                                    sum += fpost_stm.childVal(cell, c, q, 0).value;
+                                                }
+                                            }
+                                        }
+                                    }
+                                }
+                            }
+
+                            if (should_accumelate) {
+                                fpost_stm(cell, q) += sum / static_cast<T>(num * refFactor);
+                            } else {
+                                fpost_stm(cell, q) = sum / static_cast<T>(num * refFactor);
+                            }
+                        }
+                    }
                 }
             };
         });
 }
 
+
 template <typename T, int DIM, int Q>
 void stream(Neon::domain::mGrid&                 grid,
             int                                  level,
@@ -391,7 +498,11 @@ void nonUniformTimestepRecursive(Neon::domain::mGrid&               grid,
     // 1) collision for all voxels at level L=level
     containers.push_back(collide<T, DIM, Q>(grid, omega0, level, max_level, fin, fout));
 
-    // 2) Storing fine(level) data for later "coalescence" pulled by the coarse(level)
+    // 2) Storing fine (level - 1) data for later "coalescence" pulled by the coarse (level)
+    if (level != max_level) {
+        store<T, DIM, Q>(grid, level + 1, fout);
+    }
+
 
     // 3) recurse down
     if (level != 0) {
@@ -410,6 +521,9 @@ void nonUniformTimestepRecursive(Neon::domain::mGrid&               grid,
     containers.push_back(collide<T, DIM, Q>(grid, omega0, level, max_level, fin, fout));
 
     // 7) Storing fine(level) data for later "coalescence" pulled by the coarse(level)
+    if (level != max_level) {
+        store<T, DIM, Q>(grid, level + 1, fout);
+    }
 
     // 8) recurse down
     if (level != 0) {
@@ -474,6 +588,18 @@ int main(int argc, char** argv)
         auto fout = grid.newField<T>("fout", Q, 0);
 
         //TODO init fin and fout
+        for (int l = 0; l < descriptor.getDepth(); ++l) {
+            fin.forEachActiveCell(
+                l,
+                [&](const Neon::int32_3d, const int, T& val) {
+                    val = 0;
+                });
+            fout.forEachActiveCell(
+                l,
+                [&](const Neon::int32_3d, const int, T& val) {
+                    val = 0;
+                });
+        }
 
         fin.updateCompute();
         fout.updateCompute();
diff --git a/libNeonCore/include/Neon/core/types/vec/vec3d_integer.tdecl.h b/libNeonCore/include/Neon/core/types/vec/vec3d_integer.tdecl.h
index 358b086f..03a2c28b 100644
--- a/libNeonCore/include/Neon/core/types/vec/vec3d_integer.tdecl.h
+++ b/libNeonCore/include/Neon/core/types/vec/vec3d_integer.tdecl.h
@@ -391,6 +391,8 @@ class Vec_3d<IntegerType_ta, true, false>
 
     NEON_CUDA_HOST_DEVICE inline bool operator!=(const Integer other[self_t::num_axis]) const;
 
+    NEON_CUDA_HOST_DEVICE inline self_t operator-() const;
+
     /** Returns the most north-est-hi point buildable with A and B coordinates"
      * C = A >> B  is: C.v[i] = A.v[i] > B.v[i] ? A.v[i] : B.v[i]
      *   @param[in] B: second point for the operation.
diff --git a/libNeonCore/include/Neon/core/types/vec/vec3d_integer.timp.h b/libNeonCore/include/Neon/core/types/vec/vec3d_integer.timp.h
index 7118708b..df9d4d94 100644
--- a/libNeonCore/include/Neon/core/types/vec/vec3d_integer.timp.h
+++ b/libNeonCore/include/Neon/core/types/vec/vec3d_integer.timp.h
@@ -486,6 +486,13 @@ NEON_CUDA_HOST_DEVICE inline Vec_3d<IntegerType_ta, true, false> Vec_3d<IntegerT
 }
 
 
+template <typename IntegerType_ta>
+NEON_CUDA_HOST_DEVICE inline Vec_3d<IntegerType_ta, true, false> Vec_3d<IntegerType_ta, true, false>::operator-() const
+{
+    Vec_3d<Integer> res(-x, -y, -z);
+    return res;
+}
+
 template <typename IntegerType_ta>
 NEON_CUDA_HOST_DEVICE inline bool Vec_3d<IntegerType_ta, true, false>::operator>(const Vec_3d<IntegerType_ta, true, false>& B) const
 {

From 238e444ab7d5031c57595c340de49219c9b055c1 Mon Sep 17 00:00:00 2001
From: Ahmed Mahmoud <ahmed.mahmoud@autodesk.com>
Date: Thu, 26 Jan 2023 11:28:13 -0500
Subject: [PATCH 09/24] debugging

---
 apps/lbmMultiRes/lbmMultiRes.cu               | 114 ++++++++----------
 .../Neon/domain/internal/bGrid/bPartition.h   |  10 +-
 .../domain/internal/bGrid/bPartition_imp.h    |  10 +-
 .../domain/internal/mGrid/mPartition_imp.h    |   4 +-
 4 files changed, 65 insertions(+), 73 deletions(-)

diff --git a/apps/lbmMultiRes/lbmMultiRes.cu b/apps/lbmMultiRes/lbmMultiRes.cu
index 0cfb6efe..45a62864 100644
--- a/apps/lbmMultiRes/lbmMultiRes.cu
+++ b/apps/lbmMultiRes/lbmMultiRes.cu
@@ -119,18 +119,6 @@ struct latticeWeight
 };
 
 
-template <typename Field>
-inline void exportVTI(const int t, Field& field)
-{
-    printf("\n Exporting Frame =%d", t);
-    int                precision = 4;
-    std::ostringstream oss;
-    oss << std::setw(precision) << std::setfill('0') << t;
-    std::string prefix = "lbm" + std::to_string(field.getCardinality()) + "D_";
-    std::string fname = prefix + oss.str();
-    field.ioToVtk(fname, "field");
-}
-
 template <typename T>
 NEON_CUDA_HOST_DEVICE inline Neon::int8_3d unlceOffset(const T& cell, const Neon::int8_3d& q)
 {
@@ -301,7 +289,7 @@ Neon::set::Container explosionPull(Neon::domain::mGrid&                 grid,
                 //If this cell has children i.e., it is been refined, that we should not work on it
                 //because this cell is only there to allow query and not to operate on
                 if (!fpost_stm.hasChildren(cell)) {
-                    for (int8_t q = 0; q < Q; ++q) {
+                    for (int8_t q = 1; q < Q; ++q) {
 
                         const Neon::int8_3d dir = -getDir<DIM>(q);
 
@@ -311,7 +299,7 @@ Neon::set::Container explosionPull(Neon::domain::mGrid&                 grid,
 
                             //try to query the cell along this direction (opposite of the population direction) as we do
                             //in 'normal' streaming
-                            auto neighborCell = fpost_col.setNghCell(cell, dir);
+                            auto neighborCell = fpost_col.getNghCell(cell, dir);
                             if (!neighborCell.isActive()) {
                                 //only if we can not do normal streaming, then we may have a coarser neighbor from which
                                 //we can read this pop
@@ -344,14 +332,14 @@ Neon::set::Container coalescencePull(Neon::domain::mGrid&           grid,
     return grid.getContainer(
         "coalescencePull" + std::to_string(level), level,
         [=](Neon::set::Loader& loader) {
-            auto fpost_stm = fpop_poststreaming.load(loader, level, Neon::MultiResCompute::STENCIL);
+            auto fpost_stm = fpop_poststreaming.load(loader, level, Neon::MultiResCompute::STENCIL_DOWN);
 
             return [=] NEON_CUDA_HOST_DEVICE(const typename Neon::domain::bGrid::Cell& cell) mutable {
                 //If this cell has children i.e., it is been refined, than we should not work on it
                 //because this cell is only there to allow query and not to operate on
                 if (!fpost_stm.hasChildren(cell)) {
 
-                    for (int q = 0; q < Q; ++q) {
+                    for (int q = 1; q < Q; ++q) {
                         const Neon::int8_3d dir = -getDir<DIM>(q);
                         //if we have a neighbor at the same level that has been refined, then cell is on
                         //the interface and this is where we should do the coalescence
@@ -395,7 +383,7 @@ Neon::set::Container store(Neon::domain::mGrid&           grid,
 
                     const int refFactor = fpost_stm.getRefFactor(level);
 
-                    bool should_accumelate =  ((int(fpost_stm(cell, 0)) % refFactor) != 0);
+                    bool should_accumelate = ((int(fpost_stm(cell, 0)) % refFactor) != 0);
 
                     fpost_stm(cell, 0) += 1;
 
@@ -406,47 +394,51 @@ Neon::set::Container store(Neon::domain::mGrid&           grid,
                         const Neon::int8_3d q_dir = getDir<DIM>(q);
 
                         //check if the neighbor in this direction has children
-                        if (!fpost_stm.hasChildren(cell, q_dir)) {
-                            //now, we know that there is actually something we need to store for this neighbor
-                            //in cell along q (q_dir) direction
-                            int num = 0;
-                            T   sum = 0;
-
-
-                            //for every neighbor cell including the center cell (i.e., cell)
-                            for (int8_t p = 0; p < Q; ++p) {
-                                const Neon::int8_3d p_dir = getDir<DIM>(p);
-
-                                //relative direction of q w.r.t p
-                                //i.e., in which direction we should move starting from p to land on q
-                                const Neon::int8_3d r_dir = q_dir - p_dir;
-
-                                //if this neighbor is refined
-                                if (fpost_stm.hasChildren(cell, p_dir)) {
-
-                                    //for each children of p
-                                    for (int8_t i = 0; i < refFactor; ++i) {
-                                        for (int8_t j = 0; j < refFactor; ++j) {
-                                            for (int8_t k = 0; k < refFactor; ++k) {
-                                                const Neon::int8_3d c(i, j, k);
-
-                                                //cq is coarse neighbor (i.e., uncle) that we need to go in order to read q
-                                                //for c (this is what we do for explosion but here we do this just for the check)
-                                                const Neon::int8_3d cq = unlceOffset(c, q_dir);
-                                                if (cq == r_dir) {
-                                                    num++;
-                                                    sum += fpost_stm.childVal(cell, c, q, 0).value;
+                        auto neighborCell = fpost_stm.getNghCell(cell, q_dir);
+                        if (neighborCell.isActive()) {
+
+                            if (!fpost_stm.hasChildren(neighborCell)) {
+                                //now, we know that there is actually something we need to store for this neighbor
+                                //in cell along q (q_dir) direction
+                                int num = 0;
+                                T   sum = 0;
+
+
+                                //for every neighbor cell including the center cell (i.e., cell)
+                                for (int8_t p = 0; p < Q; ++p) {
+                                    const Neon::int8_3d p_dir = getDir<DIM>(p);
+
+                                    //relative direction of q w.r.t p
+                                    //i.e., in which direction we should move starting from p to land on q
+                                    const Neon::int8_3d r_dir = q_dir - p_dir;
+
+                                    //if this neighbor is refined
+                                    if (fpost_stm.hasChildren(cell, p_dir)) {
+
+                                        //for each children of p
+                                        for (int8_t i = 0; i < refFactor; ++i) {
+                                            for (int8_t j = 0; j < refFactor; ++j) {
+                                                for (int8_t k = 0; k < refFactor; ++k) {
+                                                    const Neon::int8_3d c(i, j, k);
+
+                                                    //cq is coarse neighbor (i.e., uncle) that we need to go in order to read q
+                                                    //for c (this is what we do for explosion but here we do this just for the check)
+                                                    const Neon::int8_3d cq = unlceOffset(c, q_dir);
+                                                    if (cq == r_dir) {
+                                                        num++;
+                                                        sum += fpost_stm.childVal(cell, c, q, 0).value;
+                                                    }
                                                 }
                                             }
                                         }
                                     }
                                 }
-                            }
 
-                            if (should_accumelate) {
-                                fpost_stm(cell, q) += sum / static_cast<T>(num * refFactor);
-                            } else {
-                                fpost_stm(cell, q) = sum / static_cast<T>(num * refFactor);
+                                if (should_accumelate) {
+                                    fpost_stm(cell, q) += sum / static_cast<T>(num * refFactor);
+                                } else {
+                                    fpost_stm(cell, q) = sum / static_cast<T>(num * refFactor);
+                                }
                             }
                         }
                     }
@@ -550,29 +542,29 @@ int main(int argc, char** argv)
         constexpr int DIM = 2;
         constexpr int Q = (DIM == 2) ? 9 : 19;
 
-        const int dim_x = 12;
-        const int dim_y = 12;
-        const int dim_z = (DIM < 3) ? 4 : 4;
+        const int dim_x = 6;
+        const int dim_y = 6;
+        const int dim_z = 4;
 
         const Neon::index_3d grid_dim(dim_x, dim_y, dim_z);
 
-        const Neon::domain::mGridDescriptor descriptor({1, 1, 1});
+        const Neon::domain::mGridDescriptor descriptor({1, 1});
 
 
         Neon::domain::mGrid grid(
             backend, grid_dim,
             {[&](const Neon::index_3d id) -> bool {
-                 return id.x > 7;
+                 return ((id.x == 2 && id.y == 2) ||
+                         (id.x == 3 && id.y == 2) ||
+                         (id.x == 2 && id.y == 3) ||
+                         (id.x == 3 && id.y == 3));
              },
              [&](const Neon::index_3d& id) -> bool {
-                 return id.x > 3;
-             },
-             [&](const Neon::index_3d&) -> bool {
                  return true;
              }},
             create_stencil<DIM, Q>(), descriptor);
 
-        //grid.topologyToVTK("lbm.vtk", false);
+        grid.topologyToVTK("lbm.vtk", false);
 
 
         //LBM problem
diff --git a/libNeonDomain/include/Neon/domain/internal/bGrid/bPartition.h b/libNeonDomain/include/Neon/domain/internal/bGrid/bPartition.h
index b2b29642..f907dfc2 100644
--- a/libNeonDomain/include/Neon/domain/internal/bGrid/bPartition.h
+++ b/libNeonDomain/include/Neon/domain/internal/bGrid/bPartition.h
@@ -49,7 +49,7 @@ class bPartition
     NEON_CUDA_HOST_DEVICE inline auto nghVal(const Cell& eId,
                                              uint8_t     nghID,
                                              int         card,
-                                             const T&    alternativeVal) const -> NghInfo<T>;
+                                             const T&    alternativeVal) const -> NghInfo<T>;    
 
 
     NEON_CUDA_HOST_DEVICE inline void loadInSharedMemory(const Cell&                cell,
@@ -62,12 +62,12 @@ class bPartition
 
     NEON_CUDA_HOST_DEVICE inline Neon::index_3d mapToGlobal(const Cell& cell) const;
 
-    inline NEON_CUDA_HOST_DEVICE auto setNghCell(const Cell& cell, const nghIdx_t& offset) const -> Cell;
+    inline NEON_CUDA_HOST_DEVICE auto getNghCell(const Cell& cell, const nghIdx_t& offset) const -> Cell;
 
    protected:
-    inline NEON_CUDA_HOST_DEVICE auto pitch(const Cell& cell, int card) const -> uint32_t;    
-    inline NEON_CUDA_HOST_DEVICE auto setNghCell(const Cell& cell, const nghIdx_t& offset, const uint32_t* neighbourBlocks) const -> Cell;
-    inline NEON_CUDA_HOST_DEVICE auto shmemPitch(Cell cell, const int card) const -> Cell::Location::Integer;    
+    inline NEON_CUDA_HOST_DEVICE auto pitch(const Cell& cell, int card) const -> uint32_t;
+    inline NEON_CUDA_HOST_DEVICE auto getNghCell(const Cell& cell, const nghIdx_t& offset, const uint32_t* neighbourBlocks) const -> Cell;
+    inline NEON_CUDA_HOST_DEVICE auto shmemPitch(Cell cell, const int card) const -> Cell::Location::Integer;
     inline NEON_CUDA_HOST_DEVICE auto getneighbourBlocksPtr(const Cell& cell) const -> const uint32_t*;
 
     Neon::DataView            mDataView;
diff --git a/libNeonDomain/include/Neon/domain/internal/bGrid/bPartition_imp.h b/libNeonDomain/include/Neon/domain/internal/bGrid/bPartition_imp.h
index c6681374..8b31e6a8 100644
--- a/libNeonDomain/include/Neon/domain/internal/bGrid/bPartition_imp.h
+++ b/libNeonDomain/include/Neon/domain/internal/bGrid/bPartition_imp.h
@@ -121,13 +121,13 @@ NEON_CUDA_HOST_DEVICE inline auto bPartition<T, C>::getneighbourBlocksPtr(const
 }
 
 template <typename T, int C>
-NEON_CUDA_HOST_DEVICE inline auto bPartition<T, C>::setNghCell(const Cell& cell, const nghIdx_t& offset) const -> Cell
+NEON_CUDA_HOST_DEVICE inline auto bPartition<T, C>::getNghCell(const Cell& cell, const nghIdx_t& offset) const -> Cell
 {
-    return setNghCell(cell, offset, getneighbourBlocksPtr(cell));
+    return getNghCell(cell, offset, getneighbourBlocksPtr(cell));
 }
 
 template <typename T, int C>
-NEON_CUDA_HOST_DEVICE inline auto bPartition<T, C>::setNghCell(const Cell&     cell,
+NEON_CUDA_HOST_DEVICE inline auto bPartition<T, C>::getNghCell(const Cell&     cell,
                                                                const nghIdx_t& offset,
                                                                const uint32_t* neighbourBlocks) const -> Cell
 {
@@ -205,7 +205,7 @@ NEON_CUDA_HOST_DEVICE inline auto bPartition<T, C>::nghVal(const Cell&     cell,
         Cell swirl_cell = cell.toSwirl();
         swirl_cell.mBlockSize = cell.mBlockSize;
 
-        Cell ngh_cell = setNghCell(swirl_cell, offset, getneighbourBlocksPtr(swirl_cell));
+        Cell ngh_cell = getNghCell(swirl_cell, offset, getneighbourBlocksPtr(swirl_cell));
         ngh_cell.mBlockSize = cell.mBlockSize;
         if (ngh_cell.isActive()) {
             //TODO maybe ngh_cell should be mapped to its memory layout
@@ -221,7 +221,7 @@ NEON_CUDA_HOST_DEVICE inline auto bPartition<T, C>::nghVal(const Cell&     cell,
         }
 
     } else {
-        Cell ngh_cell = setNghCell(cell, offset, getneighbourBlocksPtr(cell));
+        Cell ngh_cell = getNghCell(cell, offset, getneighbourBlocksPtr(cell));
         if (ngh_cell.isActive()) {
             ret.isValid = ngh_cell.computeIsActive(mMask);
             if (ret.isValid) {
diff --git a/libNeonDomain/include/Neon/domain/internal/mGrid/mPartition_imp.h b/libNeonDomain/include/Neon/domain/internal/mGrid/mPartition_imp.h
index 9e427b7b..77abe808 100644
--- a/libNeonDomain/include/Neon/domain/internal/mGrid/mPartition_imp.h
+++ b/libNeonDomain/include/Neon/domain/internal/mGrid/mPartition_imp.h
@@ -178,7 +178,7 @@ NEON_CUDA_HOST_DEVICE inline auto mPartition<T, C>::hasChildren(const Cell& cell
         return false;
     }
 
-    Cell nghCell = this->setNghCell(cell, nghDir, this->getneighbourBlocksPtr(cell));
+    Cell nghCell = this->getNghCell(cell, nghDir, this->getneighbourBlocksPtr(cell));
     if (!nghCell.isActive()) {
         return false;
     }
@@ -233,7 +233,7 @@ NEON_CUDA_HOST_DEVICE inline auto mPartition<T, C>::getUncle(const Cell&   cell,
 {
     Cell uncle = getParent(cell);
     if (uncle.isActive()) {
-        uncle = this->setNghCell(uncle, direction, mParentNeighbourBlocks);
+        uncle = this->getNghCell(uncle, direction, mParentNeighbourBlocks);
         uncle.mBlockSize = mRefFactors[mLevel + 1];
         uncle.mIsActive = uncle.mBlockID != std::numeric_limits<uint32_t>::max();
     }

From 1fa1d187325e4991150453dd45aa474bfd097a0c Mon Sep 17 00:00:00 2001
From: Ahmed Mahmoud <ahmed.mahmoud@autodesk.com>
Date: Thu, 26 Jan 2023 15:38:41 -0500
Subject: [PATCH 10/24] problem setup

---
 apps/lbmMultiRes/lbmMultiRes.cu               | 103 +++++++++++-------
 .../domain/internal/mGrid/mGridDescriptor.h   |  15 ++-
 2 files changed, 76 insertions(+), 42 deletions(-)

diff --git a/apps/lbmMultiRes/lbmMultiRes.cu b/apps/lbmMultiRes/lbmMultiRes.cu
index 45a62864..77cffefe 100644
--- a/apps/lbmMultiRes/lbmMultiRes.cu
+++ b/apps/lbmMultiRes/lbmMultiRes.cu
@@ -182,16 +182,16 @@ template <typename T, int DIM, int Q>
 Neon::set::Container collide(Neon::domain::mGrid&                 grid,
                              T                                    omega0,
                              int                                  level,
-                             int                                  max_level,
+                             int                                  num_levels,
                              const Neon::domain::mGrid::Field<T>& fin,
                              Neon::domain::mGrid::Field<T>&       fout)
 {
     return grid.getContainer(
-        "Collide" + std::to_string(level), level,
-        [=](Neon::set::Loader& loader) {
+        "collide_" + std::to_string(level), level,
+        [&, level, omega0, num_levels](Neon::set::Loader& loader) {
             const auto& in = fin.load(loader, level, Neon::MultiResCompute::MAP);
             auto        out = fout.load(loader, level, Neon::MultiResCompute::MAP);
-            const T     omega = computeOmega(omega0, level, max_level);
+            const T     omega = computeOmega(omega0, level, num_levels);
 
             return [=] NEON_CUDA_HOST_DEVICE(const typename Neon::domain::bGrid::Cell& cell) mutable {
                 if (!in.hasChildren(cell)) {
@@ -241,8 +241,8 @@ Neon::set::Container stream(Neon::domain::mGrid&                 grid,
     //This is "pull" stream
 
     return grid.getContainer(
-        "Stream" + std::to_string(level), level,
-        [=](Neon::set::Loader& loader) {
+        "stream_" + std::to_string(level), level,
+        [&, level](Neon::set::Loader& loader) {
             const auto& fpost_col = fpop_postcollision.load(loader, level, Neon::MultiResCompute::STENCIL);
             auto        fpost_stm = fpop_poststreaming.load(loader, level, Neon::MultiResCompute::MAP);
 
@@ -280,8 +280,8 @@ Neon::set::Container explosionPull(Neon::domain::mGrid&                 grid,
 
 
     return grid.getContainer(
-        "explosionPull" + std::to_string(level), level,
-        [=](Neon::set::Loader& loader) {
+        "Explosion_" + std::to_string(level), level,
+        [&, level](Neon::set::Loader& loader) {
             const auto& fpost_col = fpop_postcollision.load(loader, level, Neon::MultiResCompute::STENCIL_UP);
             auto        fpost_stm = fpop_poststreaming.load(loader, level, Neon::MultiResCompute::MAP);
 
@@ -330,9 +330,9 @@ Neon::set::Container coalescencePull(Neon::domain::mGrid&           grid,
     // across the interface between coarse<->fine boundary by reading the population prepare during the store()
 
     return grid.getContainer(
-        "coalescencePull" + std::to_string(level), level,
-        [=](Neon::set::Loader& loader) {
-            auto fpost_stm = fpop_poststreaming.load(loader, level, Neon::MultiResCompute::STENCIL_DOWN);
+        "Coalescence" + std::to_string(level), level,
+        [&, level](Neon::set::Loader& loader) {
+            auto& fpost_stm = fpop_poststreaming.load(loader, level, Neon::MultiResCompute::STENCIL_DOWN);
 
             return [=] NEON_CUDA_HOST_DEVICE(const typename Neon::domain::bGrid::Cell& cell) mutable {
                 //If this cell has children i.e., it is been refined, than we should not work on it
@@ -373,9 +373,9 @@ Neon::set::Container store(Neon::domain::mGrid&           grid,
     //its neighbor's children)
 
     return grid.getContainer(
-        "store" + std::to_string(level), level,
-        [=](Neon::set::Loader& loader) {
-            auto fpost_stm = fpop_poststreaming.load(loader, level, Neon::MultiResCompute::STENCIL_DOWN);
+        "store_" + std::to_string(level), level,
+        [&, level](Neon::set::Loader& loader) {
+            auto& fpost_stm = fpop_poststreaming.load(loader, level, Neon::MultiResCompute::STENCIL_DOWN);
 
             return [=] NEON_CUDA_HOST_DEVICE(const typename Neon::domain::bGrid::Cell& cell) mutable {
                 //if the cell is refined, we might need to store something in it for its neighbor
@@ -451,7 +451,7 @@ Neon::set::Container store(Neon::domain::mGrid&           grid,
 template <typename T, int DIM, int Q>
 void stream(Neon::domain::mGrid&                 grid,
             int                                  level,
-            const int                            max_level,
+            const int                            num_levels,
             const Neon::domain::mGrid::Field<T>& fpop_postcollision,
             Neon::domain::mGrid::Field<T>&       fpop_poststreaming,
             std::vector<Neon::set::Container>&   containers)
@@ -463,7 +463,7 @@ void stream(Neon::domain::mGrid&                 grid,
     *  (i) coarser or (ii) finer neighbors at level+1 and level-1 and hence require
     *  (i) "explosion" or (ii) coalescence
     */
-    if (level != max_level - 1) {
+    if (level != num_levels - 1) {
         /* Explosion: pull missing populations from coarser neighbors by copying coarse (level+1) to fine (level) 
         * neighbors, initiated by the fine level ("Pull").
         */
@@ -482,50 +482,64 @@ template <typename T, int DIM, int Q>
 void nonUniformTimestepRecursive(Neon::domain::mGrid&               grid,
                                  const T                            omega0,
                                  const int                          level,
-                                 const int                          max_level,
+                                 const int                          num_levels,
                                  Neon::domain::mGrid::Field<T>&     fin,
                                  Neon::domain::mGrid::Field<T>&     fout,
                                  std::vector<Neon::set::Container>& containers)
 {
     // 1) collision for all voxels at level L=level
-    containers.push_back(collide<T, DIM, Q>(grid, omega0, level, max_level, fin, fout));
+    containers.push_back(collide<T, DIM, Q>(grid, omega0, level, num_levels, fin, fout));
 
     // 2) Storing fine (level - 1) data for later "coalescence" pulled by the coarse (level)
-    if (level != max_level) {
-        store<T, DIM, Q>(grid, level + 1, fout);
+    if (level != num_levels - 1) {
+        containers.push_back(store<T, DIM, Q>(grid, level + 1, fout));
     }
 
 
     // 3) recurse down
     if (level != 0) {
-        nonUniformTimestepRecursive<T, DIM, Q>(grid, omega0, level - 1, max_level, fin, fout, containers);
+        nonUniformTimestepRecursive<T, DIM, Q>(grid, omega0, level - 1, num_levels, fin, fout, containers);
     }
 
     // 4) Streaming step that also performs the necessary "explosion" and "coalescence" steps.
-    stream<T, DIM, Q>(grid, level, max_level, fout, fin, containers);
+    stream<T, DIM, Q>(grid, level, num_levels, fout, fin, containers);
 
     // 5) stop
-    if (level == max_level - 1) {
+    if (level == num_levels - 1) {
         return;
     }
 
     // 6) collision for all voxels at level L = level
-    containers.push_back(collide<T, DIM, Q>(grid, omega0, level, max_level, fin, fout));
+    containers.push_back(collide<T, DIM, Q>(grid, omega0, level, num_levels, fin, fout));
 
     // 7) Storing fine(level) data for later "coalescence" pulled by the coarse(level)
-    if (level != max_level) {
-        store<T, DIM, Q>(grid, level + 1, fout);
+    if (level != num_levels - 1) {
+        containers.push_back(store<T, DIM, Q>(grid, level + 1, fout));
     }
 
     // 8) recurse down
     if (level != 0) {
-        nonUniformTimestepRecursive<T, DIM, Q>(grid, omega0, level - 1, max_level, fin, fout, containers);
+        nonUniformTimestepRecursive<T, DIM, Q>(grid, omega0, level - 1, num_levels, fin, fout, containers);
     }
 
     // 9) Streaming step
-    stream<T, DIM, Q>(grid, level, max_level, fout, fin, containers);
+    stream<T, DIM, Q>(grid, level, num_levels, fout, fin, containers);
 }
 
+Neon::float_3d mapToCube(Neon::index_3d p, Neon::index_3d dim)
+{
+    //map p to an axis-aligned cube from -1 to 1
+    Neon::float_3d half_dim = dim.newType<float>() * 0.5;
+    Neon::float_3d ret = (p.newType<float>() - half_dim) / half_dim;
+    return ret;
+}
+inline float sdfCube(Neon::float_3d p, float b = 1.0)
+{
+    Neon::float_3d d(std::abs(p.x) - b, std::abs(p.y) - b, std::abs(p.z) - b);
+    Neon::float_3d d_max(std::max(d.x, 0.f), std::max(d.y, 0.f), std::max(d.z, 0.f));
+    float          len = std::sqrt(d_max.x * d_max.x + d_max.y * d_max.y + d_max.z * d_max.z);
+    return std::min(std::max(d.x, std::max(d.y, d.z)), 0.f) + len;
+}
 
 int main(int argc, char** argv)
 {
@@ -541,30 +555,37 @@ int main(int argc, char** argv)
 
         constexpr int DIM = 2;
         constexpr int Q = (DIM == 2) ? 9 : 19;
+        constexpr int depth = 3;
 
-        const int dim_x = 6;
-        const int dim_y = 6;
-        const int dim_z = 4;
+        const Neon::index_3d grid_dim(24, 24, 24);
 
-        const Neon::index_3d grid_dim(dim_x, dim_y, dim_z);
 
-        const Neon::domain::mGridDescriptor descriptor({1, 1});
+        const Neon::domain::mGridDescriptor descriptor(depth);
+
+        float levelSDF[depth + 1];
+        levelSDF[0] = 0;
+        levelSDF[1] = -0.3;
+        levelSDF[2] = -0.6;
+        levelSDF[3] = -1.0;
 
 
         Neon::domain::mGrid grid(
             backend, grid_dim,
             {[&](const Neon::index_3d id) -> bool {
-                 return ((id.x == 2 && id.y == 2) ||
-                         (id.x == 3 && id.y == 2) ||
-                         (id.x == 2 && id.y == 3) ||
-                         (id.x == 3 && id.y == 3));
+                 return sdfCube(mapToCube(id, grid_dim)) <= levelSDF[0] &&
+                        sdfCube(mapToCube(id, grid_dim)) > levelSDF[1];
+             },
+             [&](const Neon::index_3d& id) -> bool {
+                 return sdfCube(mapToCube(id, grid_dim)) <= levelSDF[1] &&
+                        sdfCube(mapToCube(id, grid_dim)) > levelSDF[2];
              },
              [&](const Neon::index_3d& id) -> bool {
-                 return true;
+                 return sdfCube(mapToCube(id, grid_dim)) <= levelSDF[2] &&
+                        sdfCube(mapToCube(id, grid_dim)) > levelSDF[3];
              }},
             create_stencil<DIM, Q>(), descriptor);
 
-        grid.topologyToVTK("lbm.vtk", false);
+        //grid.topologyToVTK("lbm.vtk", false);
 
 
         //LBM problem
@@ -593,6 +614,8 @@ int main(int argc, char** argv)
                 });
         }
 
+        fin.ioToVtk("sdf", "f");
+
         fin.updateCompute();
         fout.updateCompute();
 
@@ -606,7 +629,7 @@ int main(int argc, char** argv)
 
         Neon::skeleton::Skeleton skl(grid.getBackend());
         skl.sequence(containers, "MultiResLBM");
-        //skl.ioToDot("MultiRes");
+        //skl.ioToDot("MultiResLBM", "", true);
 
         skl.run();
 
diff --git a/libNeonDomain/include/Neon/domain/internal/mGrid/mGridDescriptor.h b/libNeonDomain/include/Neon/domain/internal/mGrid/mGridDescriptor.h
index 0def37f6..71097d50 100644
--- a/libNeonDomain/include/Neon/domain/internal/mGrid/mGridDescriptor.h
+++ b/libNeonDomain/include/Neon/domain/internal/mGrid/mGridDescriptor.h
@@ -20,7 +20,7 @@ struct mGridDescriptor
      * @param levels as described above defaulted to 3 levels octree 
     */
     mGridDescriptor(std::initializer_list<int> log2RefFactors)
-        : mLog2RefFactors(log2RefFactors), mSpacing(log2RefFactors)
+        : mLog2RefFactors(log2RefFactors)
     {
         computeSpacing();
     }
@@ -30,8 +30,19 @@ struct mGridDescriptor
      * by 8^3 blocks i.e., block data structure 
     */
     mGridDescriptor()
-        : mLog2RefFactors({3}), mSpacing({2 * 2 * 2})
     {
+        mLog2RefFactors.resize(3, 1);
+        computeSpacing();
+    }
+
+    /**
+     * This constructor can be use for the default mGrid descriptor that defines a grid of single depth partitioned 
+     * by 8^3 blocks i.e., block data structure 
+    */
+    mGridDescriptor(int depth)
+    {
+        mLog2RefFactors.resize(depth, 1);
+        computeSpacing();
     }
 
 

From e54a0cb1871b8ce689b253b5b44e195949ca220e Mon Sep 17 00:00:00 2001
From: Ahmed Mahmoud <ahmed.mahmoud@autodesk.com>
Date: Thu, 26 Jan 2023 17:06:47 -0500
Subject: [PATCH 11/24] problem setup

---
 apps/lbmMultiRes/lbmMultiRes.cu | 124 ++++++++++++++++++++++++--------
 1 file changed, 96 insertions(+), 28 deletions(-)

diff --git a/apps/lbmMultiRes/lbmMultiRes.cu b/apps/lbmMultiRes/lbmMultiRes.cu
index 77cffefe..2251dd5f 100644
--- a/apps/lbmMultiRes/lbmMultiRes.cu
+++ b/apps/lbmMultiRes/lbmMultiRes.cu
@@ -218,7 +218,11 @@ Neon::set::Container collide(Neon::domain::mGrid&                 grid,
                     for (int i = 0; i < Q; ++i) {
                         T cu = 0;
                         for (int d = 0; d < DIM; ++d) {
-                            cu += latticeVelocity2D[i][d] * vel.v[d];
+                            if constexpr (DIM == 2) {
+                                cu += latticeVelocity2D[i][d] * vel.v[d];
+                            } else {
+                                cu += latticeVelocity3D[i][d] * vel.v[d];
+                            }
                         }
                         //equilibrium
                         T feq = rho * t.t[i] * (1. + cu + 0.5 * cu * cu - usqr);
@@ -526,21 +530,53 @@ void nonUniformTimestepRecursive(Neon::domain::mGrid&               grid,
     stream<T, DIM, Q>(grid, level, num_levels, fout, fin, containers);
 }
 
-Neon::float_3d mapToCube(Neon::index_3d p, Neon::index_3d dim)
-{
-    //map p to an axis-aligned cube from -1 to 1
-    Neon::float_3d half_dim = dim.newType<float>() * 0.5;
-    Neon::float_3d ret = (p.newType<float>() - half_dim) / half_dim;
-    return ret;
-}
-inline float sdfCube(Neon::float_3d p, float b = 1.0)
+
+inline float sdfCube(Neon::index_3d id, Neon::index_3d dim, float b = 1.0)
 {
+    auto mapToCube = [&](Neon::index_3d id) {
+        //map p to an axis-aligned cube from -1 to 1
+        Neon::float_3d half_dim = dim.newType<float>() * 0.5;
+        Neon::float_3d ret = (id.newType<float>() - half_dim) / half_dim;
+        return ret;
+    };
+    Neon::float_3d p = mapToCube(id);
+
     Neon::float_3d d(std::abs(p.x) - b, std::abs(p.y) - b, std::abs(p.z) - b);
     Neon::float_3d d_max(std::max(d.x, 0.f), std::max(d.y, 0.f), std::max(d.z, 0.f));
     float          len = std::sqrt(d_max.x * d_max.x + d_max.y * d_max.y + d_max.z * d_max.z);
     return std::min(std::max(d.x, std::max(d.y, d.z)), 0.f) + len;
 }
 
+template <typename T, int DIM, int Q>
+void postProcess(Neon::domain::mGrid&                 grid,
+                 const int                            num_levels,
+                 const Neon::domain::mGrid::Field<T>& fpop,
+                 Neon::domain::mGrid::Field<T>&       vel,
+                 Neon::domain::mGrid::Field<T>&       rho)
+{
+    fpop.updateIO();
+
+    for (int level = 0; level < num_levels; ++level) {
+        grid.getContainer(
+            "postProcess_" + std::to_string(level), level,
+            [&, level](Neon::set::Loader& loader) {
+                const auto& pop = fpop.load(loader, level, Neon::MultiResCompute::STENCIL);
+                auto        u = vel.load(loader, level, Neon::MultiResCompute::MAP);
+                auto        rh = rho.load(loader, level, Neon::MultiResCompute::MAP);
+
+                return [=] NEON_CUDA_HOST_DEVICE(const typename Neon::domain::bGrid::Cell& cell) mutable {
+
+                };
+            });
+    }
+
+    vel.updateIO();
+    rho.updateIO();
+
+    vel.ioToVtk("vel_", "vel");
+    rho.ioToVtk("rho_", "rho");
+}
+
 int main(int argc, char** argv)
 {
     Neon::init();
@@ -572,16 +608,16 @@ int main(int argc, char** argv)
         Neon::domain::mGrid grid(
             backend, grid_dim,
             {[&](const Neon::index_3d id) -> bool {
-                 return sdfCube(mapToCube(id, grid_dim)) <= levelSDF[0] &&
-                        sdfCube(mapToCube(id, grid_dim)) > levelSDF[1];
+                 return sdfCube(id, grid_dim) <= levelSDF[0] &&
+                        sdfCube(id, grid_dim) > levelSDF[1];
              },
              [&](const Neon::index_3d& id) -> bool {
-                 return sdfCube(mapToCube(id, grid_dim)) <= levelSDF[1] &&
-                        sdfCube(mapToCube(id, grid_dim)) > levelSDF[2];
+                 return sdfCube(id, grid_dim) <= levelSDF[1] &&
+                        sdfCube(id, grid_dim) > levelSDF[2];
              },
              [&](const Neon::index_3d& id) -> bool {
-                 return sdfCube(mapToCube(id, grid_dim)) <= levelSDF[2] &&
-                        sdfCube(mapToCube(id, grid_dim)) > levelSDF[3];
+                 return sdfCube(id, grid_dim) <= levelSDF[2] &&
+                        sdfCube(id, grid_dim) > levelSDF[3];
              }},
             create_stencil<DIM, Q>(), descriptor);
 
@@ -589,32 +625,62 @@ int main(int argc, char** argv)
 
 
         //LBM problem
-        const int max_iter = 300;
-        const T   ulb = 0.01;
-        const T   Re = 20;
-        const T   clength = grid_dim.x;
-        const T   visclb = ulb * clength / Re;
-        const T   smagorinskyConstant = 0.02;
-        const T   omega = 1.0 / (3. * visclb + 0.5);
+        const int             max_iter = 300;
+        const T               ulb = 0.02;
+        const T               Re = 100;
+        const T               clength = grid_dim.x;
+        const T               visclb = ulb * clength / Re;
+        const T               smagorinskyConstant = 0.02;
+        const T               omega = 1.0 / (3. * visclb + 0.5);
+        const Neon::double_3d ulid(1.0, 0., 0.);
 
         auto fin = grid.newField<T>("fin", Q, 0);
         auto fout = grid.newField<T>("fout", Q, 0);
 
+        auto vel = grid.newField<T>("vel", 3, 0);
+        auto rho = grid.newField<T>("rho", 1, 0);
+
         //TODO init fin and fout
+        constexpr auto t = latticeWeight<DIM, Q>();
+
+        auto init = [&](const Neon::int32_3d idx, const int q) {
+            T ret = t.t[q];
+
+            if (idx.x == 0 || idx.x == grid_dim.x - 1 ||
+                idx.y == 0 || idx.y == grid_dim.y - 1 ||
+                idx.z == 0 || idx.z == grid_dim.z - 1) {
+
+                if (idx.x == grid_dim.x - 1) {
+                    ret = 0;
+                    for (int d = 0; d < DIM; ++d) {
+                        if (DIM == 2) {
+                            ret += latticeVelocity2D[q][d] * ulid.v[d];
+                        } else {
+                            ret += latticeVelocity3D[q][d] * ulid.v[d];
+                        }
+                    }
+                    ret *= -6. * t.t[q] * ulb;
+                } else {
+                    ret = 0;
+                }
+            }
+            return ret;
+        };
+
         for (int l = 0; l < descriptor.getDepth(); ++l) {
             fin.forEachActiveCell(
                 l,
-                [&](const Neon::int32_3d, const int, T& val) {
-                    val = 0;
+                [&](const Neon::int32_3d idx, const int q, T& val) {
+                    val = init(idx, q);
                 });
             fout.forEachActiveCell(
                 l,
-                [&](const Neon::int32_3d, const int, T& val) {
-                    val = 0;
+                [&](const Neon::int32_3d idx, const int q, T& val) {
+                    val = init(idx, q);
                 });
         }
 
-        fin.ioToVtk("sdf", "f");
+        //fin.ioToVtk("sdf", "f");
 
         fin.updateCompute();
         fout.updateCompute();
@@ -631,7 +697,9 @@ int main(int argc, char** argv)
         skl.sequence(containers, "MultiResLBM");
         //skl.ioToDot("MultiResLBM", "", true);
 
-        skl.run();
+        for (int t = 0; t < max_iter; ++t) {
+            skl.run();
+        }
 
         grid.getBackend().syncAll();
         fin.updateIO();

From 31d7e97365c3d8492a35fb5b1b3c1a69a86bf267 Mon Sep 17 00:00:00 2001
From: Ahmed Mahmoud <ahmed.mahmoud@autodesk.com>
Date: Tue, 31 Jan 2023 11:11:21 -0500
Subject: [PATCH 12/24] init pop, export velocity and density, and boundary
 treatment

---
 apps/lbmMultiRes/lbmMultiRes.cu               | 426 +++++++++++-------
 .../domain/internal/bGrid/bPartition_imp.h    |   2 +-
 2 files changed, 272 insertions(+), 156 deletions(-)

diff --git a/apps/lbmMultiRes/lbmMultiRes.cu b/apps/lbmMultiRes/lbmMultiRes.cu
index 2251dd5f..44b8667d 100644
--- a/apps/lbmMultiRes/lbmMultiRes.cu
+++ b/apps/lbmMultiRes/lbmMultiRes.cu
@@ -2,6 +2,14 @@
 #include "Neon/domain/mGrid.h"
 #include "Neon/skeleton/Skeleton.h"
 
+enum CellType : int
+{
+    bounceBack = 0,
+    movingWall = 1,
+    bulk = 2,
+    undefined = 3,
+};
+
 template <unsigned int DIM, unsigned int Q>
 Neon::domain::Stencil create_stencil();
 
@@ -21,54 +29,55 @@ Neon::domain::Stencil create_stencil<2, 9>()
 template <>
 Neon::domain::Stencil create_stencil<3, 19>()
 {
-    // filterCenterOut = false;
     return Neon::domain::Stencil::s19_t(false);
 }
 
 NEON_CUDA_DEVICE_ONLY static constexpr char latticeVelocity2D[9][2] = {
-    {0, 0},
-    {0, -1},
-    {0, 1},
-    {-1, 0},
-    {-1, -1},
-    {-1, 1},
-    {1, 0},
-    {1, -1},
-    {1, 1}};
+    {0, 0},    //0 -> 0 || 0000 -> 0000
+    {0, -1},   //1 -> 2 || 0001 -> 0010
+    {0, 1},    //2 -> 1 || 0010 -> 0001
+    {-1, 0},   //3 -> 6 || 0100 -> 0110
+    {-1, -1},  //4 -> 8 || 0100 -> 1000
+    {-1, 1},   //5 -> 7 || 0101 -> 0111
+    {1, 0},    //6 -> 3 || 0110 -> 0011
+    {1, -1},   //7 -> 5 || 0111 -> 0101
+    {1, 1}};   //8 -> 4 || 1000 -> 0100
 
 NEON_CUDA_DEVICE_ONLY static constexpr char latticeVelocity3D[27][3] = {
-    {0, 0, 0},
-    {0, -1, 0},
-    {0, 1, 0},
-    {-1, 0, 0},
-    {-1, -1, 0},
-    {-1, 1, 0},
-    {1, 0, 0},
-    {1, -1, 0},
-    {1, 1, 0},
-
-    {0, 0, -1},
-    {0, -1, -1},
-    {0, 1, -1},
-    {-1, 0, -1},
-    {-1, -1, -1},
-    {-1, 1, -1},
-    {1, 0, -1},
-    {1, -1, -1},
-    {1, 1, -1},
-
-    {0, 0, 1},
-    {0, -1, 1},
-    {0, 1, 1},
-    {-1, 0, 1},
-    {-1, -1, 1},
-    {-1, 1, 1},
-    {1, 0, 1},
-    {1, -1, 1},
-    {1, 1, 1}
+    {0, 0, 0},    //0  -> 0   || 00000 -> 00000
+    {0, -1, 0},   //1  -> 2   || 00001 -> 00010
+    {0, 1, 0},    //2  -> 1   || 00010 -> 00001
+    {-1, 0, 0},   //3  -> 6   || 00100 -> 00110
+    {-1, -1, 0},  //4  -> 8   || 00100 -> 01000
+    {-1, 1, 0},   //5  -> 7   || 00101 -> 00111
+    {1, 0, 0},    //6  -> 3   || 00110 -> 00011
+    {1, -1, 0},   //7  -> 5   || 00111 -> 00101
+    {1, 1, 0},    //8  -> 4   || 01000 -> 00100
+
+    {0, 0, -1},    //9  -> 18 || 01000 -> 10010
+    {0, -1, -1},   //10 -> 20 || 01010 -> 10100
+    {0, 1, -1},    //11 -> 19 || 01011 -> 10011
+    {-1, 0, -1},   //12 -> 24 || 01100 -> 11000
+    {-1, -1, -1},  //13 -> 26 || 01101 -> 11010
+    {-1, 1, -1},   //14 -> 25 || 01110 -> 11001
+    {1, 0, -1},    //15 -> 21 || 01111 -> 10101
+    {1, -1, -1},   //16 -> 23 || 10000 -> 10111
+    {1, 1, -1},    //17 -> 22 || 10001 -> 10110
+
+    {0, 0, 1},    //18 -> 9   || 10010 -> 01001
+    {0, -1, 1},   //19 -> 11  || 10011 -> 01011
+    {0, 1, 1},    //20 -> 10  || 10100 -> 01010
+    {-1, 0, 1},   //21 -> 15  || 10101 -> 01111
+    {-1, -1, 1},  //22 -> 17  || 10110 -> 10001
+    {-1, 1, 1},   //23 -> 16  || 10111 -> 10000
+    {1, 0, 1},    //24 -> 12  || 11000 -> 01100
+    {1, -1, 1},   //25 -> 14  || 11001 -> 01110
+    {1, 1, 1}     //26 -> 13  || 11010 -> 01101
 
 };
-
+NEON_CUDA_DEVICE_ONLY static constexpr char latticeOppositeID[27] = {
+    //opposite q for 2d is a subset of what is in 3d so only use one
+    0, 2, 1, 6, 8, 7, 3, 5, 4, 18, 20, 19, 24, 26, 25, 21, 23, 22, 9, 11, 10, 15, 17, 16, 12, 14, 13};
 
 template <int DIM>
 NEON_CUDA_HOST_DEVICE Neon::int8_3d getDir(const int8_t q)
@@ -142,9 +151,9 @@ NEON_CUDA_HOST_DEVICE inline Neon::int8_3d unlceOffset(const T& cell, const Neon
 
 
 template <typename T>
-NEON_CUDA_HOST_DEVICE T computeOmega(T omega0, int level, int num_levels)
+NEON_CUDA_HOST_DEVICE T computeOmega(T omega0, int level, int numLevels)
 {
-    int ilevel = num_levels - level - 1;
+    int ilevel = numLevels - level - 1;
     // scalbln(1.0, x) = 2^x
     return 2 * omega0 / (scalbln(1.0, ilevel + 1) + (1. - scalbln(1.0, ilevel)) * omega0);
 }
@@ -179,56 +188,61 @@ NEON_CUDA_HOST_DEVICE Neon::Vec_3d<T> velocity(const T* fin,
 }
 
 template <typename T, int DIM, int Q>
-Neon::set::Container collide(Neon::domain::mGrid&                 grid,
-                             T                                    omega0,
-                             int                                  level,
-                             int                                  num_levels,
-                             const Neon::domain::mGrid::Field<T>& fin,
-                             Neon::domain::mGrid::Field<T>&       fout)
+Neon::set::Container collide(Neon::domain::mGrid&                        grid,
+                             T                                           omega0,
+                             int                                         level,
+                             int                                         numLevels,
+                             const Neon::domain::mGrid::Field<CellType>& cellType,
+                             const Neon::domain::mGrid::Field<T>&        fin,
+                             Neon::domain::mGrid::Field<T>&              fout)
 {
     return grid.getContainer(
         "collide_" + std::to_string(level), level,
-        [&, level, omega0, num_levels](Neon::set::Loader& loader) {
+        [&, level, omega0, numLevels](Neon::set::Loader& loader) {
+            const auto& type = cellType.load(loader, level, Neon::MultiResCompute::MAP);
             const auto& in = fin.load(loader, level, Neon::MultiResCompute::MAP);
             auto        out = fout.load(loader, level, Neon::MultiResCompute::MAP);
-            const T     omega = computeOmega(omega0, level, num_levels);
+            const T     omega = computeOmega(omega0, level, numLevels);
 
             return [=] NEON_CUDA_HOST_DEVICE(const typename Neon::domain::bGrid::Cell& cell) mutable {
-                if (!in.hasChildren(cell)) {
+                if (type(cell, 0) == CellType::bulk) {
 
-                    constexpr auto t = latticeWeight<DIM, Q>();
+                    if (!in.hasChildren(cell)) {
 
-                    //fin
-                    T ins[Q];
-                    for (int i = 0; i < Q; ++i) {
-                        ins[i] = in(cell, i);
-                    }
+                        constexpr auto t = latticeWeight<DIM, Q>();
 
-                    //density
-                    T rho = 0;
-                    for (int i = 0; i < Q; ++i) {
-                        rho += ins[i];
-                    }
+                        //fin
+                        T ins[Q];
+                        for (int i = 0; i < Q; ++i) {
+                            ins[i] = in(cell, i);
+                        }
 
-                    //velocity
-                    const Neon::Vec_3d<T> vel = velocity<T, DIM, Q>(ins, rho);
+                        //density
+                        T rho = 0;
+                        for (int i = 0; i < Q; ++i) {
+                            rho += ins[i];
+                        }
 
+                        //velocity
+                        const Neon::Vec_3d<T> vel = velocity<T, DIM, Q>(ins, rho);
 
-                    const T usqr = (3.0 / 2.0) * (vel.x * vel.x + vel.y * vel.y + vel.z * vel.z);
-                    for (int i = 0; i < Q; ++i) {
-                        T cu = 0;
-                        for (int d = 0; d < DIM; ++d) {
-                            if constexpr (DIM == 2) {
-                                cu += latticeVelocity2D[i][d] * vel.v[d];
-                            } else {
-                                cu += latticeVelocity3D[i][d] * vel.v[d];
+
+                        const T usqr = (3.0 / 2.0) * (vel.x * vel.x + vel.y * vel.y + vel.z * vel.z);
+                        for (int i = 0; i < Q; ++i) {
+                            T cu = 0;
+                            for (int d = 0; d < DIM; ++d) {
+                                if constexpr (DIM == 2) {
+                                    cu += latticeVelocity2D[i][d] * vel.v[d];
+                                } else {
+                                    cu += latticeVelocity3D[i][d] * vel.v[d];
+                                }
                             }
-                        }
-                        //equilibrium
-                        T feq = rho * t.t[i] * (1. + cu + 0.5 * cu * cu - usqr);
+                            //equilibrium
+                            T feq = rho * t.t[i] * (1. + cu + 0.5 * cu * cu - usqr);
 
-                        //collide
-                        out(cell, i) = ins[i] - omega * (ins[i] - feq);
+                            //collide
+                            out(cell, i) = ins[i] - omega * (ins[i] - feq);
+                        }
                     }
                 }
             };
@@ -236,10 +250,11 @@ Neon::set::Container collide(Neon::domain::mGrid&                 grid,
 }
 
 template <typename T, int DIM, int Q>
-Neon::set::Container stream(Neon::domain::mGrid&                 grid,
-                            int                                  level,
-                            const Neon::domain::mGrid::Field<T>& fpop_postcollision,
-                            Neon::domain::mGrid::Field<T>&       fpop_poststreaming)
+Neon::set::Container stream(Neon::domain::mGrid&                        grid,
+                            int                                         level,
+                            const Neon::domain::mGrid::Field<CellType>& cellType,
+                            const Neon::domain::mGrid::Field<T>&        postCollision,
+                            Neon::domain::mGrid::Field<T>&              postStreaming)
 {
     //regular Streaming of the normal voxels at level L which are not interfaced with L+1 and L-1 levels.
     //This is "pull" stream
@@ -247,22 +262,28 @@ Neon::set::Container stream(Neon::domain::mGrid&                 grid,
     return grid.getContainer(
         "stream_" + std::to_string(level), level,
         [&, level](Neon::set::Loader& loader) {
-            const auto& fpost_col = fpop_postcollision.load(loader, level, Neon::MultiResCompute::STENCIL);
-            auto        fpost_stm = fpop_poststreaming.load(loader, level, Neon::MultiResCompute::MAP);
+            const auto& type = cellType.load(loader, level, Neon::MultiResCompute::STENCIL);
+            const auto& fpost_col = postCollision.load(loader, level, Neon::MultiResCompute::STENCIL);
+            auto        fpost_stm = postStreaming.load(loader, level, Neon::MultiResCompute::MAP);
 
             return [=] NEON_CUDA_HOST_DEVICE(const typename Neon::domain::bGrid::Cell& cell) mutable {
-                //If this cell has children i.e., it is been refined, than we should not work on it
-                //because this cell is only there to allow query and not to operate on
-                if (!fpost_stm.hasChildren(cell)) {
+                if (type(cell, 0) == CellType::bulk) {
+                    //If this cell has children i.e., it is been refined, than we should not work on it
+                    //because this cell is only there to allow query and not to operate on
+                    if (!fpost_stm.hasChildren(cell)) {
 
-                    for (int8_t q = 0; q < Q; ++q) {
-                        const Neon::int8_3d dir = -getDir<DIM>(q);
+                        for (int8_t q = 0; q < Q; ++q) {
+                            const Neon::int8_3d dir = -getDir<DIM>(q);
 
-                        //if the neighbor cell has children, then this 'cell' is interfacing with L-1 (fine) along q direction
-                        if (!fpost_stm.hasChildren(cell, dir)) {
-                            auto neighbor = fpost_col.nghVal(cell, dir, q, T(0));
-                            if (neighbor.isValid) {
-                                fpost_stm(cell, q) = neighbor.value;
+                            //if the neighbor cell has children, then this 'cell' is interfacing with L-1 (fine) along q direction
+                            if (!fpost_stm.hasChildren(cell, dir)) {
+
+                                if (type.nghVal(cell, dir, 0, CellType::undefined).value == CellType::bulk) {
+                                    fpost_stm(cell, q) = fpost_col.nghVal(cell, dir, q, T(0)).value;
+                                } else {
+                                    const int8_t opposte_q = latticeOppositeID[q];
+                                    fpost_stm(cell, q) = fpost_col(cell, opposte_q) + fpost_col.nghVal(cell, dir, opposte_q, T(0)).value;
+                                }
                             }
                         }
                     }
@@ -274,8 +295,8 @@ Neon::set::Container stream(Neon::domain::mGrid&                 grid,
 template <typename T, int DIM, int Q>
 Neon::set::Container explosionPull(Neon::domain::mGrid&                 grid,
                                    int                                  level,
-                                   const Neon::domain::mGrid::Field<T>& fpop_postcollision,
-                                   Neon::domain::mGrid::Field<T>&       fpop_poststreaming)
+                                   const Neon::domain::mGrid::Field<T>& postCollision,
+                                   Neon::domain::mGrid::Field<T>&       postStreaming)
 {
     // Initiated by the fine level (hence "pull"), this function performs a coarse (level+1) to
     // fine (level) communication or "explosion" by simply distributing copies of coarse grid onto the fine grid.
@@ -286,8 +307,8 @@ Neon::set::Container explosionPull(Neon::domain::mGrid&                 grid,
     return grid.getContainer(
         "Explosion_" + std::to_string(level), level,
         [&, level](Neon::set::Loader& loader) {
-            const auto& fpost_col = fpop_postcollision.load(loader, level, Neon::MultiResCompute::STENCIL_UP);
-            auto        fpost_stm = fpop_poststreaming.load(loader, level, Neon::MultiResCompute::MAP);
+            const auto& fpost_col = postCollision.load(loader, level, Neon::MultiResCompute::STENCIL_UP);
+            auto        fpost_stm = postStreaming.load(loader, level, Neon::MultiResCompute::MAP);
 
             return [=] NEON_CUDA_HOST_DEVICE(const typename Neon::domain::bGrid::Cell& cell) mutable {
                 //If this cell has children i.e., it is been refined, that we should not work on it
@@ -328,7 +349,7 @@ Neon::set::Container explosionPull(Neon::domain::mGrid&                 grid,
 template <typename T, int DIM, int Q>
 Neon::set::Container coalescencePull(Neon::domain::mGrid&           grid,
                                      int                            level,
-                                     Neon::domain::mGrid::Field<T>& fpop_poststreaming)
+                                     Neon::domain::mGrid::Field<T>& postStreaming)
 {
     // Initiated by the coarse level (hence "pull"), this function simply read the missing population
     // across the interface between coarse<->fine boundary by reading the population prepare during the store()
@@ -336,7 +357,7 @@ Neon::set::Container coalescencePull(Neon::domain::mGrid&           grid,
     return grid.getContainer(
         "Coalescence" + std::to_string(level), level,
         [&, level](Neon::set::Loader& loader) {
-            auto& fpost_stm = fpop_poststreaming.load(loader, level, Neon::MultiResCompute::STENCIL_DOWN);
+            auto& fpost_stm = postStreaming.load(loader, level, Neon::MultiResCompute::STENCIL_DOWN);
 
             return [=] NEON_CUDA_HOST_DEVICE(const typename Neon::domain::bGrid::Cell& cell) mutable {
                 //If this cell has children i.e., it is been refined, than we should not work on it
@@ -363,7 +384,7 @@ Neon::set::Container coalescencePull(Neon::domain::mGrid&           grid,
 template <typename T, int DIM, int Q>
 Neon::set::Container store(Neon::domain::mGrid&           grid,
                            int                            level,
-                           Neon::domain::mGrid::Field<T>& fpop_poststreaming)
+                           Neon::domain::mGrid::Field<T>& postStreaming)
 {
     //Initiated by the coarse level (level), this function prepares and stores the fine (level - 1)
     // information for further pulling initiated by the coarse (this) level invoked by coalescence_pull
@@ -379,7 +400,7 @@ Neon::set::Container store(Neon::domain::mGrid&           grid,
     return grid.getContainer(
         "store_" + std::to_string(level), level,
         [&, level](Neon::set::Loader& loader) {
-            auto& fpost_stm = fpop_poststreaming.load(loader, level, Neon::MultiResCompute::STENCIL_DOWN);
+            auto& fpost_stm = postStreaming.load(loader, level, Neon::MultiResCompute::STENCIL_DOWN);
 
             return [=] NEON_CUDA_HOST_DEVICE(const typename Neon::domain::bGrid::Cell& cell) mutable {
                 //if the cell is refined, we might need to store something in it for its neighbor
@@ -453,81 +474,83 @@ Neon::set::Container store(Neon::domain::mGrid&           grid,
 
 
 template <typename T, int DIM, int Q>
-void stream(Neon::domain::mGrid&                 grid,
-            int                                  level,
-            const int                            num_levels,
-            const Neon::domain::mGrid::Field<T>& fpop_postcollision,
-            Neon::domain::mGrid::Field<T>&       fpop_poststreaming,
-            std::vector<Neon::set::Container>&   containers)
+void stream(Neon::domain::mGrid&                        grid,
+            int                                         level,
+            const int                                   numLevels,
+            const Neon::domain::mGrid::Field<CellType>& cellType,
+            const Neon::domain::mGrid::Field<T>&        postCollision,
+            Neon::domain::mGrid::Field<T>&              postStreaming,
+            std::vector<Neon::set::Container>&          containers)
 {
-    containers.push_back(stream<T, DIM, Q>(grid, level, fpop_postcollision, fpop_poststreaming));
+    containers.push_back(stream<T, DIM, Q>(grid, level, cellType, postCollision, postStreaming));
 
     /*
     * Streaming for interface voxels that have
     *  (i) coarser or (ii) finer neighbors at level+1 and level-1 and hence require
     *  (i) "explosion" or (ii) coalescence
     */
-    if (level != num_levels - 1) {
+    if (level != numLevels - 1) {
         /* Explosion: pull missing populations from coarser neighbors by copying coarse (level+1) to fine (level) 
         * neighbors, initiated by the fine level ("Pull").
         */
-        containers.push_back(explosionPull<T, DIM, Q>(grid, level, fpop_postcollision, fpop_poststreaming));
+        containers.push_back(explosionPull<T, DIM, Q>(grid, level, postCollision, postStreaming));
     }
 
     if (level != 0) {
         /* Coalescence: pull missing populations from finer neighbors by "smart" averaging fine (level-1) 
         * to coarse (level) communication, initiated by the coarse level ("Pull").
         */
-        containers.push_back(coalescencePull<T, DIM, Q>(grid, level, fpop_poststreaming));
+        containers.push_back(coalescencePull<T, DIM, Q>(grid, level, postStreaming));
     }
 }
 
 template <typename T, int DIM, int Q>
-void nonUniformTimestepRecursive(Neon::domain::mGrid&               grid,
-                                 const T                            omega0,
-                                 const int                          level,
-                                 const int                          num_levels,
-                                 Neon::domain::mGrid::Field<T>&     fin,
-                                 Neon::domain::mGrid::Field<T>&     fout,
-                                 std::vector<Neon::set::Container>& containers)
+void nonUniformTimestepRecursive(Neon::domain::mGrid&                        grid,
+                                 const T                                     omega0,
+                                 const int                                   level,
+                                 const int                                   numLevels,
+                                 const Neon::domain::mGrid::Field<CellType>& cellType,
+                                 Neon::domain::mGrid::Field<T>&              fin,
+                                 Neon::domain::mGrid::Field<T>&              fout,
+                                 std::vector<Neon::set::Container>&          containers)
 {
     // 1) collision for all voxels at level L=level
-    containers.push_back(collide<T, DIM, Q>(grid, omega0, level, num_levels, fin, fout));
+    containers.push_back(collide<T, DIM, Q>(grid, omega0, level, numLevels, cellType, fin, fout));
 
     // 2) Storing fine (level - 1) data for later "coalescence" pulled by the coarse (level)
-    if (level != num_levels - 1) {
+    if (level != numLevels - 1) {
         containers.push_back(store<T, DIM, Q>(grid, level + 1, fout));
     }
 
 
     // 3) recurse down
     if (level != 0) {
-        nonUniformTimestepRecursive<T, DIM, Q>(grid, omega0, level - 1, num_levels, fin, fout, containers);
+        nonUniformTimestepRecursive<T, DIM, Q>(grid, omega0, level - 1, numLevels, cellType, fin, fout, containers);
     }
 
     // 4) Streaming step that also performs the necessary "explosion" and "coalescence" steps.
-    stream<T, DIM, Q>(grid, level, num_levels, fout, fin, containers);
+    stream<T, DIM, Q>(grid, level, numLevels, cellType, fout, fin, containers);
 
     // 5) stop
-    if (level == num_levels - 1) {
+    if (level == numLevels - 1) {
         return;
     }
 
     // 6) collision for all voxels at level L = level
-    containers.push_back(collide<T, DIM, Q>(grid, omega0, level, num_levels, fin, fout));
+    containers.push_back(collide<T, DIM, Q>(grid, omega0, level, numLevels, cellType, fin, fout));
 
     // 7) Storing fine(level) data for later "coalescence" pulled by the coarse(level)
-    if (level != num_levels - 1) {
+    if (level != numLevels - 1) {
         containers.push_back(store<T, DIM, Q>(grid, level + 1, fout));
     }
 
     // 8) recurse down
     if (level != 0) {
-        nonUniformTimestepRecursive<T, DIM, Q>(grid, omega0, level - 1, num_levels, fin, fout, containers);
+        nonUniformTimestepRecursive<T, DIM, Q>(grid, omega0, level - 1, numLevels, cellType, fin, fout, containers);
     }
 
     // 9) Streaming step
-    stream<T, DIM, Q>(grid, level, num_levels, fout, fin, containers);
+    stream<T, DIM, Q>(grid, level, numLevels, cellType, fout, fin, containers);
 }
 
 
@@ -548,30 +571,94 @@ inline float sdfCube(Neon::index_3d id, Neon::index_3d dim, float b = 1.0)
 }
 
 template <typename T, int DIM, int Q>
-void postProcess(Neon::domain::mGrid&                 grid,
-                 const int                            num_levels,
-                 const Neon::domain::mGrid::Field<T>& fpop,
-                 Neon::domain::mGrid::Field<T>&       vel,
-                 Neon::domain::mGrid::Field<T>&       rho)
+void postProcess(Neon::domain::mGrid&                        grid,
+                 const int                                   numLevels,
+                 const Neon::domain::mGrid::Field<T>&        fpop,
+                 const Neon::domain::mGrid::Field<CellType>& cellType,
+                 Neon::domain::mGrid::Field<T>&              vel,
+                 Neon::domain::mGrid::Field<T>&              rho)
 {
-    fpop.updateIO();
+    //fpop.updateIO();
+
+    for (int level = 0; level < numLevels; ++level) {
+        auto container =
+            grid.getContainer(
+                "postProcess_" + std::to_string(level), level,
+                [&, level](Neon::set::Loader& loader) {
+                    const auto& pop = fpop.load(loader, level, Neon::MultiResCompute::STENCIL);
+                    const auto& type = cellType.load(loader, level, Neon::MultiResCompute::MAP);
+                    auto&       u = vel.load(loader, level, Neon::MultiResCompute::MAP);
+                    auto&       rh = rho.load(loader, level, Neon::MultiResCompute::MAP);
 
-    for (int level = 0; level < num_levels; ++level) {
-        grid.getContainer(
-            "postProcess_" + std::to_string(level), level,
-            [&, level](Neon::set::Loader& loader) {
-                const auto& pop = fpop.load(loader, level, Neon::MultiResCompute::STENCIL);
-                auto        u = vel.load(loader, level, Neon::MultiResCompute::MAP);
-                auto        rh = rho.load(loader, level, Neon::MultiResCompute::MAP);
 
-                return [=] NEON_CUDA_HOST_DEVICE(const typename Neon::domain::bGrid::Cell& cell) mutable {
+                    return [=] NEON_CUDA_HOST_DEVICE(const typename Neon::domain::bGrid::Cell& cell) mutable {
+                        constexpr auto t = latticeWeight<DIM, Q>();
 
-                };
-            });
+                        if (type(cell, 0) == CellType::bulk) {
+
+                            //fin
+                            T ins[Q];
+                            for (int i = 0; i < Q; ++i) {
+                                ins[i] = pop(cell, i);
+                            }
+
+                            //density
+                            T r = 0;
+                            for (int i = 0; i < Q; ++i) {
+                                r += ins[i];
+                            }
+                            rh(cell, 0) = r;
+
+                            //velocity
+                            const Neon::Vec_3d<T> vel = velocity<T, DIM, Q>(ins, r);
+
+                            u(cell, 0) = vel.v[0];
+                            u(cell, 1) = vel.v[1];
+                            u(cell, 2) = vel.v[2];
+
+
+                            /*T st[Q];
+                            T r = 0;
+                            if (!pop.hasChildren(cell)) {
+                                assert(!pop.hasChildren(cell));
+                                for (int8_t q = 0; q < Q; ++q) {
+                                    const Neon::int8_3d dir = -getDir<DIM>(q);
+                                    if (!pop.hasChildren(cell, dir)) {
+                                        auto neighbor = pop.nghVal(cell, dir, q, T(0));
+                                        if (neighbor.isValid) {
+                                            st[q] = neighbor.value;
+                                            r += st[q];
+                                        }
+                                    }
+                                }
+
+                                rh(cell, 0) = r;
+
+                                const Neon::Vec_3d<T> vel = velocity<T, DIM, Q>(ins, rho);
+
+                                for (int d = 0; d < DIM; ++d) {
+                                    u(cell, d) = vel.v[d];
+                                }
+                            }*/
+                        }
+                        if (type(cell, 0) == CellType::movingWall) {
+                            rh(cell, 0) = 1.0;
+
+                            for (int d = 0; d < DIM; ++d) {
+                                int i = (d == 0) ? 3 : ((d == 1) ? 1 : 9);
+                                u(cell, d) = pop(cell, i) / (6.0 * 1.0 / 18.0);
+                            }
+                        }
+                    };
+                });
+
+        container.run(0);
     }
 
-    vel.updateIO();
-    rho.updateIO();
+    grid.getBackend().syncAll();
+
+    //vel.updateIO();
+    //rho.updateIO();
 
     vel.ioToVtk("vel_", "vel");
     rho.ioToVtk("rho_", "rho");
@@ -593,7 +680,7 @@ int main(int argc, char** argv)
         constexpr int Q = (DIM == 2) ? 9 : 19;
         constexpr int depth = 3;
 
-        const Neon::index_3d grid_dim(24, 24, 24);
+        const Neon::index_3d grid_dim(24, 24, 4);
 
 
         const Neon::domain::mGridDescriptor descriptor(depth);
@@ -632,15 +719,38 @@ int main(int argc, char** argv)
         const T               visclb = ulb * clength / Re;
         const T               smagorinskyConstant = 0.02;
         const T               omega = 1.0 / (3. * visclb + 0.5);
-        const Neon::double_3d ulid(1.0, 0., 0.);
+        const Neon::double_3d ulid(ulb, 0., 0.);
 
+        //alloc fields
         auto fin = grid.newField<T>("fin", Q, 0);
         auto fout = grid.newField<T>("fout", Q, 0);
+        auto cellType = grid.newField<CellType>("CellType", 1, CellType::bulk);
 
         auto vel = grid.newField<T>("vel", 3, 0);
         auto rho = grid.newField<T>("rho", 1, 0);
 
-        //TODO init fin and fout
+        //classify voxels
+        for (int l = 0; l < descriptor.getDepth(); ++l) {
+            cellType.forEachActiveCell(
+                l,
+                [&](const Neon::int32_3d idx, const int q, CellType& val) {
+                    val = CellType::bulk;
+                    if (idx.x == 0 || idx.x == grid_dim.x - 1 ||
+                        idx.y == 0 || idx.y == grid_dim.y - 1 ||
+                        idx.z == 0 || idx.z == grid_dim.z - 1) {
+
+                        val = CellType::bounceBack;
+
+                        if (idx.y == grid_dim.y - 1) {
+                            val = CellType::movingWall;
+                        }
+                    }
+                });
+        }
+        cellType.updateCompute();
+
+
+        // init fin and fout
         constexpr auto t = latticeWeight<DIM, Q>();
 
         auto init = [&](const Neon::int32_3d idx, const int q) {
@@ -650,7 +760,7 @@ int main(int argc, char** argv)
                 idx.y == 0 || idx.y == grid_dim.y - 1 ||
                 idx.z == 0 || idx.z == grid_dim.z - 1) {
 
-                if (idx.x == grid_dim.x - 1) {
+                if (idx.y == grid_dim.y - 1) {
                     ret = 0;
                     for (int d = 0; d < DIM; ++d) {
                         if (DIM == 2) {
@@ -679,30 +789,36 @@ int main(int argc, char** argv)
                     val = init(idx, q);
                 });
         }
-
-        //fin.ioToVtk("sdf", "f");
-
         fin.updateCompute();
         fout.updateCompute();
 
-        std::vector<Neon::set::Container> containers;
+        //fin.ioToVtk("fin", "f");
 
+        postProcess<T, DIM, Q>(grid, descriptor.getDepth(), fout, cellType, vel, rho);
+
+        //skeleton
+        std::vector<Neon::set::Container> containers;
         nonUniformTimestepRecursive<T, DIM, Q>(grid,
                                                omega,
                                                descriptor.getDepth() - 1,
                                                descriptor.getDepth(),
+                                               cellType,
                                                fin, fout, containers);
 
         Neon::skeleton::Skeleton skl(grid.getBackend());
         skl.sequence(containers, "MultiResLBM");
         //skl.ioToDot("MultiResLBM", "", true);
 
+        //execution
         for (int t = 0; t < max_iter; ++t) {
             skl.run();
+            if (t % 10 == 0) {
+                postProcess<T, DIM, Q>(grid, descriptor.getDepth(), fout, cellType, vel, rho);
+            }
         }
 
         grid.getBackend().syncAll();
-        fin.updateIO();
-        fout.updateIO();
+        //fin.updateIO();
+        //fout.updateIO();
     }
 }
\ No newline at end of file
diff --git a/libNeonDomain/include/Neon/domain/internal/bGrid/bPartition_imp.h b/libNeonDomain/include/Neon/domain/internal/bGrid/bPartition_imp.h
index 8b31e6a8..338617c2 100644
--- a/libNeonDomain/include/Neon/domain/internal/bGrid/bPartition_imp.h
+++ b/libNeonDomain/include/Neon/domain/internal/bGrid/bPartition_imp.h
@@ -13,7 +13,7 @@ bPartition<T, C>::bPartition()
       mNeighbourBlocks(nullptr),
       mOrigin(nullptr),
       mMask(nullptr),
-      mOutsideValue(0),
+      mOutsideValue(T()),
       mStencilNghIndex(nullptr),
       mIsInSharedMem(false),
       mMemSharedMem(nullptr),

From 0294cbd8766fa21f53f19f91b2722abc2b8332b8 Mon Sep 17 00:00:00 2001
From: Ahmed Mahmoud <ahmed.mahmoud@autodesk.com>
Date: Tue, 31 Jan 2023 18:01:52 -0500
Subject: [PATCH 13/24] problem setup

---
 apps/lbmMultiRes/lbmMultiRes.cu | 75 +++++++++++++++++++--------------
 1 file changed, 44 insertions(+), 31 deletions(-)

diff --git a/apps/lbmMultiRes/lbmMultiRes.cu b/apps/lbmMultiRes/lbmMultiRes.cu
index 44b8667d..7ee8de08 100644
--- a/apps/lbmMultiRes/lbmMultiRes.cu
+++ b/apps/lbmMultiRes/lbmMultiRes.cu
@@ -27,9 +27,9 @@ Neon::domain::Stencil create_stencil<2, 9>()
 }
 
 template <>
-Neon::domain::Stencil create_stencil<3, 19>()
+Neon::domain::Stencil create_stencil<3, 27>()
 {
-    return Neon::domain::Stencil::s19_t(false);
+    return Neon::domain::Stencil::s27_t(false);
 }
 
 NEON_CUDA_DEVICE_ONLY static constexpr char latticeVelocity2D[9][2] = {
@@ -112,9 +112,9 @@ struct latticeWeight
 
         if constexpr (DIM == 3) {
             for (int i = 0; i < Q; ++i) {
-                if (latticeVelocity2D[i][0] * latticeVelocity2D[i][0] +
-                        latticeVelocity2D[i][1] * latticeVelocity2D[i][1] +
-                        latticeVelocity2D[i][2] * latticeVelocity2D[i][2] <
+                if (latticeVelocity3D[i][0] * latticeVelocity3D[i][0] +
+                        latticeVelocity3D[i][1] * latticeVelocity3D[i][1] +
+                        latticeVelocity3D[i][2] * latticeVelocity3D[i][2] <
                     1.1f) {
                     t[i] = 2.0f / 36.0f;
                 } else {
@@ -554,7 +554,7 @@ void nonUniformTimestepRecursive(Neon::domain::mGrid&                        gri
 }
 
 
-inline float sdfCube(Neon::index_3d id, Neon::index_3d dim, float b = 1.0)
+inline float sdfCube(Neon::index_3d id, Neon::index_3d dim, Neon::float_3d b = {1.0, 1.0, 1.0})
 {
     auto mapToCube = [&](Neon::index_3d id) {
         //map p to an axis-aligned cube from -1 to 1
@@ -564,10 +564,12 @@ inline float sdfCube(Neon::index_3d id, Neon::index_3d dim, float b = 1.0)
     };
     Neon::float_3d p = mapToCube(id);
 
-    Neon::float_3d d(std::abs(p.x) - b, std::abs(p.y) - b, std::abs(p.z) - b);
+    Neon::float_3d d(std::abs(p.x) - b.x, std::abs(p.y) - b.y, std::abs(p.z) - b.z);
+
     Neon::float_3d d_max(std::max(d.x, 0.f), std::max(d.y, 0.f), std::max(d.z, 0.f));
     float          len = std::sqrt(d_max.x * d_max.x + d_max.y * d_max.y + d_max.z * d_max.z);
-    return std::min(std::max(d.x, std::max(d.y, d.z)), 0.f) + len;
+    float          val = std::min(std::max(d.x, std::max(d.y, d.z)), 0.f) + len;
+    return val;
 }
 
 template <typename T, int DIM, int Q>
@@ -575,6 +577,7 @@ void postProcess(Neon::domain::mGrid&                        grid,
                  const int                                   numLevels,
                  const Neon::domain::mGrid::Field<T>&        fpop,
                  const Neon::domain::mGrid::Field<CellType>& cellType,
+                 const int                                   iteration,
                  Neon::domain::mGrid::Field<T>&              vel,
                  Neon::domain::mGrid::Field<T>&              rho)
 {
@@ -660,8 +663,8 @@ void postProcess(Neon::domain::mGrid&                        grid,
     //vel.updateIO();
     //rho.updateIO();
 
-    vel.ioToVtk("vel_", "vel");
-    rho.ioToVtk("rho_", "rho");
+    vel.ioToVtk("vel_" + std::to_string(iteration), "vel");
+    rho.ioToVtk("rho_" + std::to_string(iteration), "rho");
 }
 
 int main(int argc, char** argv)
@@ -676,35 +679,41 @@ int main(int argc, char** argv)
         std::vector<int> gpu_ids{0};
         Neon::Backend    backend(gpu_ids, runtime);
 
-        constexpr int DIM = 2;
-        constexpr int Q = (DIM == 2) ? 9 : 19;
+        constexpr int DIM = 3;
+        constexpr int Q = (DIM == 2) ? 9 : 27;
         constexpr int depth = 3;
 
-        const Neon::index_3d grid_dim(24, 24, 4);
+        const Neon::domain::mGridDescriptor descriptor(depth);
 
+        //const Neon::index_3d grid_dim(144, 144, 144);
+        //float                levelSDF[depth + 1];
+        //levelSDF[0] = 0;
+        //levelSDF[1] = -28.0 / 144.0;
+        //levelSDF[2] = -56.0 / 144.0;
+        //levelSDF[3] = -1.0;
 
-        const Neon::domain::mGridDescriptor descriptor(depth);
 
-        float levelSDF[depth + 1];
+        const Neon::index_3d grid_dim(24, 24, 24);
+        float                levelSDF[depth + 1];
         levelSDF[0] = 0;
-        levelSDF[1] = -0.3;
-        levelSDF[2] = -0.6;
+        levelSDF[1] = -8 / 24.0;
+        levelSDF[2] = -16 / 24.0;
         levelSDF[3] = -1.0;
 
 
         Neon::domain::mGrid grid(
             backend, grid_dim,
             {[&](const Neon::index_3d id) -> bool {
-                 return sdfCube(id, grid_dim) <= levelSDF[0] &&
-                        sdfCube(id, grid_dim) > levelSDF[1];
+                 return sdfCube(id, grid_dim - 1) <= levelSDF[0] &&
+                        sdfCube(id, grid_dim - 1) > levelSDF[1];
              },
              [&](const Neon::index_3d& id) -> bool {
-                 return sdfCube(id, grid_dim) <= levelSDF[1] &&
-                        sdfCube(id, grid_dim) > levelSDF[2];
+                 return sdfCube(id, grid_dim - 1) <= levelSDF[1] &&
+                        sdfCube(id, grid_dim - 1) > levelSDF[2];
              },
              [&](const Neon::index_3d& id) -> bool {
-                 return sdfCube(id, grid_dim) <= levelSDF[2] &&
-                        sdfCube(id, grid_dim) > levelSDF[3];
+                 return sdfCube(id, grid_dim - 1) <= levelSDF[2] &&
+                        sdfCube(id, grid_dim - 1) > levelSDF[3];
              }},
             create_stencil<DIM, Q>(), descriptor);
 
@@ -712,12 +721,11 @@ int main(int argc, char** argv)
 
 
         //LBM problem
-        const int             max_iter = 300;
-        const T               ulb = 0.02;
-        const T               Re = 100;
+        const int             max_iter = 3000;
+        const T               ulb = 0.04;
+        const T               Re = 1000;
         const T               clength = grid_dim.x;
         const T               visclb = ulb * clength / Re;
-        const T               smagorinskyConstant = 0.02;
         const T               omega = 1.0 / (3. * visclb + 0.5);
         const Neon::double_3d ulid(ulb, 0., 0.);
 
@@ -788,14 +796,18 @@ int main(int argc, char** argv)
                 [&](const Neon::int32_3d idx, const int q, T& val) {
                     val = init(idx, q);
                 });
+            vel.forEachActiveCell(l, [&](const Neon::int32_3d, const int, T& val) {
+                val = 0;
+            });
+            rho.forEachActiveCell(l, [&](const Neon::int32_3d, const int, T& val) {
+                val = 0;
+            });
         }
         fin.updateCompute();
         fout.updateCompute();
 
         //fin.ioToVtk("fin", "f");
 
-        postProcess<T, DIM, Q>(grid, descriptor.getDepth(), fout, cellType, vel, rho);
-
         //skeleton
         std::vector<Neon::set::Container> containers;
         nonUniformTimestepRecursive<T, DIM, Q>(grid,
@@ -812,9 +824,10 @@ int main(int argc, char** argv)
         //execution
         for (int t = 0; t < max_iter; ++t) {
             skl.run();
-            if (t % 10 == 0) {
-                postProcess<T, DIM, Q>(grid, descriptor.getDepth(), fout, cellType, vel, rho);
+            if (t % 100 == 0) {
+                postProcess<T, DIM, Q>(grid, descriptor.getDepth(), fout, cellType, t, vel, rho);
             }
+            
         }
 
         grid.getBackend().syncAll();

From 6c7b0db83adac7b408545697b723fca355924ffa Mon Sep 17 00:00:00 2001
From: Ahmed Mahmoud <ahmed.mahmoud@autodesk.com>
Date: Tue, 31 Jan 2023 19:05:25 -0500
Subject: [PATCH 14/24] remove 2d-related bits

---
 apps/lbmMultiRes/lbmMultiRes.cu | 237 ++++++++++++++------------------
 1 file changed, 106 insertions(+), 131 deletions(-)

diff --git a/apps/lbmMultiRes/lbmMultiRes.cu b/apps/lbmMultiRes/lbmMultiRes.cu
index 7ee8de08..47036ad7 100644
--- a/apps/lbmMultiRes/lbmMultiRes.cu
+++ b/apps/lbmMultiRes/lbmMultiRes.cu
@@ -10,29 +10,7 @@ enum CellType : int
     undefined = 3,
 };
 
-template <unsigned int DIM, unsigned int Q>
-Neon::domain::Stencil create_stencil();
-
-template <>
-Neon::domain::Stencil create_stencil<2, 9>()
-{
-    std::vector<Neon::index_3d> stencil;
-    stencil.reserve(9);
-    for (int x = -1; x <= 1; ++x) {
-        for (int y = -1; y <= 1; ++y) {
-            stencil.emplace_back(Neon::index_3d(x, y, 0));
-        }
-    }
-    return Neon::domain::Stencil(stencil);
-}
-
-template <>
-Neon::domain::Stencil create_stencil<3, 27>()
-{
-    return Neon::domain::Stencil::s27_t(false);
-}
-
-NEON_CUDA_DEVICE_ONLY static constexpr char latticeVelocity2D[9][2] = {
+/*NEON_CUDA_DEVICE_ONLY static constexpr char latticeVelocity2D[9][2] = {
     {0, 0},    //0 -> 0 || 0000 -> 0000
     {0, -1},   //1 -> 2 || 0001 -> 0010
     {0, 1},    //2 -> 1 || 0010 -> 0001
@@ -41,9 +19,9 @@ NEON_CUDA_DEVICE_ONLY static constexpr char latticeVelocity2D[9][2] = {
     {-1, 1},   //5 -> 7 || 0101 -> 0111
     {1, 0},    //6 -> 3 || 0110 -> 0011
     {1, -1},   //7 -> 5 || 0111 -> 0101
-    {1, 1}};   //8 -> 4 || 1000 -> 0100
+    {1, 1}};   //8 -> 4 || 1000 -> 0100*/
 
-NEON_CUDA_DEVICE_ONLY static constexpr char latticeVelocity3D[27][3] = {
+/*NEON_CUDA_DEVICE_ONLY static constexpr char latticeVelocity3D[27][3] = {
     {0, 0, 0},    //0  -> 0   || 00000 -> 00000
     {0, -1, 0},   //1  -> 2   || 00001 -> 00010
     {0, 1, 0},    //2  -> 1   || 00010 -> 00001
@@ -75,53 +53,70 @@ NEON_CUDA_DEVICE_ONLY static constexpr char latticeVelocity3D[27][3] = {
     {1, 1, 1}     //26 -> 13  || 11010 -> 01101
 
 };
+
 NEON_CUDA_DEVICE_ONLY static constexpr char latticeOppositeID[27] = {
     //opposite q for 2d is a subset of what is in 3d so only use one
-    0, 2, 1, 6, 8, 7, 3, 5, 4, 18, 20, 19, 24, 26, 25, 21, 23, 22, 9, 11, 10, 15, 17, 16, 12, 14, 13};
+    0, 2, 1, 6, 8, 7, 3, 5, 4, 18, 20, 19, 24, 26, 25, 21, 23, 22, 9, 11, 10, 15, 17, 16, 12, 14, 13};*/
+
+NEON_CUDA_DEVICE_ONLY static constexpr char latticeVelocity3D[19][3] = {
+    {0, 0, 0},    //0  -> 0   || 00000 -> 00000
+    {0, -1, 0},   //1  -> 2   || 00001 -> 00010
+    {0, 1, 0},    //2  -> 1   || 00010 -> 00001
+    {-1, 0, 0},   //3  -> 6   || 00100 -> 00110
+    {-1, -1, 0},  //4  -> 8   || 00100 -> 01000
+    {-1, 1, 0},   //5  -> 7   || 00101 -> 00111
+    {1, 0, 0},    //6  -> 3   || 00110 -> 00011
+    {1, -1, 0},   //7  -> 5   || 00111 -> 00101
+    {1, 1, 0},    //8  -> 4   || 01000 -> 00100
+
+    {0, 0, -1},   //9  -> 18 || 01000 -> 10010
+    {0, -1, -1},  //10 -> 20 || 01010 -> 10100
+    {0, 1, -1},   //11 -> 19 || 01011 -> 10011
+    {-1, 0, -1},  //12 -> 24 || 01100 -> 11000
+    //{-1, -1, -1},//13 -> 26 || 01101 -> 11010
+    //{-1, 1, -1}, //14 -> 25 || 01110 -> 11001
+    {1, 0, -1},  //15 -> 21 || 01111 -> 10101
+    //{1, -1, -1}, //16 -> 23 || 10000 -> 10111
+    //{1, 1, -1},  //17 -> 22 || 10001 -> 10110
+
+
+    {0, 0, 1},   //18 -> 9   || 10010 -> 01001
+    {0, -1, 1},  //19 -> 11  || 10011 -> 01011
+    {0, 1, 1},   //20 -> 10  || 10100 -> 01010
+    {-1, 0, 1},  //21 -> 15  || 10101 -> 01111
+    //{-1, -1, 1},//22 -> 17  || 10110 -> 10001
+    //{-1, 1, 1}, //23 -> 16  || 10111 -> 10000
+    {1, 0, 1},  //24 -> 12  || 11000 -> 01100
+    //{1, -1, 1}, //25 -> 14  || 11001 -> 01110
+    //{1, 1, 1}   //26 -> 13  || 11010 -> 01101
+
+};
+NEON_CUDA_DEVICE_ONLY static constexpr char latticeOppositeID[19] = {
+    //opposite q for 2d is a subset of what is in 3d so only use one
+    0, 2, 1, 6, 8, 7, 3, 5, 4, 18, 20, 19, 24, 21, 9, 11, 10, 15, 12};
+
 
-template <int DIM>
 NEON_CUDA_HOST_DEVICE Neon::int8_3d getDir(const int8_t q)
 {
-    if constexpr (DIM == 2) {
-        return Neon::int8_3d(latticeVelocity2D[q][0], latticeVelocity2D[q][1], 0);
-    }
-    if constexpr (DIM == 3) {
-        return Neon::int8_3d(latticeVelocity3D[q][0], latticeVelocity3D[q][1], latticeVelocity3D[q][2]);
-    }
+    return Neon::int8_3d(latticeVelocity3D[q][0], latticeVelocity3D[q][1], latticeVelocity3D[q][2]);
 }
 
-template <int DIM, int Q>
+template <int Q>
 struct latticeWeight
 {
     NEON_CUDA_HOST_DEVICE __inline__ constexpr latticeWeight()
         : t()
     {
-        if constexpr (DIM == 2) {
-
-            for (int i = 0; i < Q; ++i) {
-                if (latticeVelocity2D[i][0] * latticeVelocity2D[i][0] +
-                        latticeVelocity2D[i][1] * latticeVelocity2D[i][1] <
-                    1.1f) {
-                    t[i] = 1.0f / 9.0f;
-                } else {
-                    t[i] = 1.0f / 36.0f;
-                }
+        t[0] = 1.0f / 3.0f;
+        for (int i = 1; i < Q; ++i) {
+            if (latticeVelocity3D[i][0] * latticeVelocity3D[i][0] +
+                    latticeVelocity3D[i][1] * latticeVelocity3D[i][1] +
+                    latticeVelocity3D[i][2] * latticeVelocity3D[i][2] <
+                1.1f) {
+                t[i] = 2.0f / 36.0f;
+            } else {
+                t[i] = 1.0f / 36.0f;
             }
-            t[0] = 4.0f / 9.0f;
-        }
-
-        if constexpr (DIM == 3) {
-            for (int i = 0; i < Q; ++i) {
-                if (latticeVelocity3D[i][0] * latticeVelocity3D[i][0] +
-                        latticeVelocity3D[i][1] * latticeVelocity3D[i][1] +
-                        latticeVelocity3D[i][2] * latticeVelocity3D[i][2] <
-                    1.1f) {
-                    t[i] = 2.0f / 36.0f;
-                } else {
-                    t[i] = 1.0f / 36.0f;
-                }
-            }
-            t[0] = 1.0f / 3.0f;
         }
     }
     float t[Q];
@@ -158,36 +153,26 @@ NEON_CUDA_HOST_DEVICE T computeOmega(T omega0, int level, int numLevels)
     return 2 * omega0 / (scalbln(1.0, ilevel + 1) + (1. - scalbln(1.0, ilevel)) * omega0);
 }
 
-template <typename T, int DIM, int Q>
+template <typename T, int Q>
 NEON_CUDA_HOST_DEVICE Neon::Vec_3d<T> velocity(const T* fin,
                                                const T  rho)
 {
     Neon::Vec_3d<T> vel(0, 0, 0);
-    if constexpr (DIM == 2) {
-        for (int i = 0; i < Q; ++i) {
-            const T f = fin[i];
-            for (int d = 0; d < DIM; ++d) {
-                vel.v[d] += f * latticeVelocity2D[i][d];
-            }
-        }
-    }
 
-    if constexpr (DIM == 3) {
-        for (int i = 0; i < Q; ++i) {
-            const T f = fin[i];
-            for (int d = 0; d < DIM; ++d) {
-                vel.v[d] += f * latticeVelocity3D[i][d];
-            }
+    for (int i = 0; i < Q; ++i) {
+        const T f = fin[i];
+        for (int d = 0; d < 3; ++d) {
+            vel.v[d] += f * latticeVelocity3D[i][d];
         }
     }
 
-    for (int d = 0; d < DIM; ++d) {
+    for (int d = 0; d < 3; ++d) {
         vel.v[d] /= rho;
     }
     return vel;
 }
 
-template <typename T, int DIM, int Q>
+template <typename T, int Q>
 Neon::set::Container collide(Neon::domain::mGrid&                        grid,
                              T                                           omega0,
                              int                                         level,
@@ -209,7 +194,7 @@ Neon::set::Container collide(Neon::domain::mGrid&                        grid,
 
                     if (!in.hasChildren(cell)) {
 
-                        constexpr auto t = latticeWeight<DIM, Q>();
+                        constexpr auto t = latticeWeight<Q>();
 
                         //fin
                         T ins[Q];
@@ -224,18 +209,14 @@ Neon::set::Container collide(Neon::domain::mGrid&                        grid,
                         }
 
                         //velocity
-                        const Neon::Vec_3d<T> vel = velocity<T, DIM, Q>(ins, rho);
+                        const Neon::Vec_3d<T> vel = velocity<T, Q>(ins, rho);
 
 
                         const T usqr = (3.0 / 2.0) * (vel.x * vel.x + vel.y * vel.y + vel.z * vel.z);
                         for (int i = 0; i < Q; ++i) {
                             T cu = 0;
-                            for (int d = 0; d < DIM; ++d) {
-                                if constexpr (DIM == 2) {
-                                    cu += latticeVelocity2D[i][d] * vel.v[d];
-                                } else {
-                                    cu += latticeVelocity3D[i][d] * vel.v[d];
-                                }
+                            for (int d = 0; d < 3; ++d) {
+                                cu += latticeVelocity3D[i][d] * vel.v[d];
                             }
                             //equilibrium
                             T feq = rho * t.t[i] * (1. + cu + 0.5 * cu * cu - usqr);
@@ -249,7 +230,7 @@ Neon::set::Container collide(Neon::domain::mGrid&                        grid,
         });
 }
 
-template <typename T, int DIM, int Q>
+template <typename T, int Q>
 Neon::set::Container stream(Neon::domain::mGrid&                        grid,
                             int                                         level,
                             const Neon::domain::mGrid::Field<CellType>& cellType,
@@ -273,7 +254,7 @@ Neon::set::Container stream(Neon::domain::mGrid&                        grid,
                     if (!fpost_stm.hasChildren(cell)) {
 
                         for (int8_t q = 0; q < Q; ++q) {
-                            const Neon::int8_3d dir = -getDir<DIM>(q);
+                            const Neon::int8_3d dir = -getDir(q);
 
                             //if the neighbor cell has children, then this 'cell' is interfacing with L-1 (fine) along q direction
                             if (!fpost_stm.hasChildren(cell, dir)) {
@@ -292,7 +273,7 @@ Neon::set::Container stream(Neon::domain::mGrid&                        grid,
         });
 }
 
-template <typename T, int DIM, int Q>
+template <typename T, int Q>
 Neon::set::Container explosionPull(Neon::domain::mGrid&                 grid,
                                    int                                  level,
                                    const Neon::domain::mGrid::Field<T>& postCollision,
@@ -316,7 +297,7 @@ Neon::set::Container explosionPull(Neon::domain::mGrid&                 grid,
                 if (!fpost_stm.hasChildren(cell)) {
                     for (int8_t q = 1; q < Q; ++q) {
 
-                        const Neon::int8_3d dir = -getDir<DIM>(q);
+                        const Neon::int8_3d dir = -getDir(q);
 
                         //if the neighbor cell has children, then this 'cell' is interfacing with L-1 (fine) along q direction
                         //we want to only work on cells that interface with L+1 (coarse) cell along q
@@ -346,7 +327,7 @@ Neon::set::Container explosionPull(Neon::domain::mGrid&                 grid,
 }
 
 
-template <typename T, int DIM, int Q>
+template <typename T, int Q>
 Neon::set::Container coalescencePull(Neon::domain::mGrid&           grid,
                                      int                            level,
                                      Neon::domain::mGrid::Field<T>& postStreaming)
@@ -365,7 +346,7 @@ Neon::set::Container coalescencePull(Neon::domain::mGrid&           grid,
                 if (!fpost_stm.hasChildren(cell)) {
 
                     for (int q = 1; q < Q; ++q) {
-                        const Neon::int8_3d dir = -getDir<DIM>(q);
+                        const Neon::int8_3d dir = -getDir(q);
                         //if we have a neighbor at the same level that has been refined, then cell is on
                         //the interface and this is where we should do the coalescence
                         if (fpost_stm.hasChildren(cell, dir)) {
@@ -381,7 +362,7 @@ Neon::set::Container coalescencePull(Neon::domain::mGrid&           grid,
 }
 
 
-template <typename T, int DIM, int Q>
+template <typename T, int Q>
 Neon::set::Container store(Neon::domain::mGrid&           grid,
                            int                            level,
                            Neon::domain::mGrid::Field<T>& postStreaming)
@@ -416,7 +397,7 @@ Neon::set::Container store(Neon::domain::mGrid&           grid,
                     //for each direction aka for each neighbor
                     //we skip the center here
                     for (int8_t q = 1; q < Q; ++q) {
-                        const Neon::int8_3d q_dir = getDir<DIM>(q);
+                        const Neon::int8_3d q_dir = getDir(q);
 
                         //check if the neighbor in this direction has children
                         auto neighborCell = fpost_stm.getNghCell(cell, q_dir);
@@ -431,7 +412,7 @@ Neon::set::Container store(Neon::domain::mGrid&           grid,
 
                                 //for every neighbor cell including the center cell (i.e., cell)
                                 for (int8_t p = 0; p < Q; ++p) {
-                                    const Neon::int8_3d p_dir = getDir<DIM>(p);
+                                    const Neon::int8_3d p_dir = getDir(p);
 
                                     //relative direction of q w.r.t p
                                     //i.e., in which direction we should move starting from p to land on q
@@ -473,7 +454,7 @@ Neon::set::Container store(Neon::domain::mGrid&           grid,
 }
 
 
-template <typename T, int DIM, int Q>
+template <typename T, int Q>
 void stream(Neon::domain::mGrid&                        grid,
             int                                         level,
             const int                                   numLevels,
@@ -482,7 +463,7 @@ void stream(Neon::domain::mGrid&                        grid,
             Neon::domain::mGrid::Field<T>&              postStreaming,
             std::vector<Neon::set::Container>&          containers)
 {
-    containers.push_back(stream<T, DIM, Q>(grid, level, cellType, postCollision, postStreaming));
+    containers.push_back(stream<T, Q>(grid, level, cellType, postCollision, postStreaming));
 
     /*
     * Streaming for interface voxels that have
@@ -493,18 +474,18 @@ void stream(Neon::domain::mGrid&                        grid,
         /* Explosion: pull missing populations from coarser neighbors by copying coarse (level+1) to fine (level) 
         * neighbors, initiated by the fine level ("Pull").
         */
-        containers.push_back(explosionPull<T, DIM, Q>(grid, level, postCollision, postStreaming));
+        containers.push_back(explosionPull<T, Q>(grid, level, postCollision, postStreaming));
     }
 
     if (level != 0) {
         /* Coalescence: pull missing populations from finer neighbors by "smart" averaging fine (level-1) 
         * to coarse (level) communication, initiated by the coarse level ("Pull").
         */
-        containers.push_back(coalescencePull<T, DIM, Q>(grid, level, postStreaming));
+        containers.push_back(coalescencePull<T, Q>(grid, level, postStreaming));
     }
 }
 
-template <typename T, int DIM, int Q>
+template <typename T, int Q>
 void nonUniformTimestepRecursive(Neon::domain::mGrid&                        grid,
                                  const T                                     omega0,
                                  const int                                   level,
@@ -515,21 +496,21 @@ void nonUniformTimestepRecursive(Neon::domain::mGrid&                        gri
                                  std::vector<Neon::set::Container>&          containers)
 {
     // 1) collision for all voxels at level L=level
-    containers.push_back(collide<T, DIM, Q>(grid, omega0, level, numLevels, cellType, fin, fout));
+    containers.push_back(collide<T, Q>(grid, omega0, level, numLevels, cellType, fin, fout));
 
     // 2) Storing fine (level - 1) data for later "coalescence" pulled by the coarse (level)
     if (level != numLevels - 1) {
-        containers.push_back(store<T, DIM, Q>(grid, level + 1, fout));
+        containers.push_back(store<T, Q>(grid, level + 1, fout));
     }
 
 
     // 3) recurse down
     if (level != 0) {
-        nonUniformTimestepRecursive<T, DIM, Q>(grid, omega0, level - 1, numLevels, cellType, fin, fout, containers);
+        nonUniformTimestepRecursive<T, Q>(grid, omega0, level - 1, numLevels, cellType, fin, fout, containers);
     }
 
     // 4) Streaming step that also performs the necessary "explosion" and "coalescence" steps.
-    stream<T, DIM, Q>(grid, level, numLevels, cellType, fout, fin, containers);
+    stream<T, Q>(grid, level, numLevels, cellType, fout, fin, containers);
 
     // 5) stop
     if (level == numLevels - 1) {
@@ -537,20 +518,20 @@ void nonUniformTimestepRecursive(Neon::domain::mGrid&                        gri
     }
 
     // 6) collision for all voxels at level L = level
-    containers.push_back(collide<T, DIM, Q>(grid, omega0, level, numLevels, cellType, fin, fout));
+    containers.push_back(collide<T, Q>(grid, omega0, level, numLevels, cellType, fin, fout));
 
     // 7) Storing fine(level) data for later "coalescence" pulled by the coarse(level)
     if (level != numLevels - 1) {
-        containers.push_back(store<T, DIM, Q>(grid, level + 1, fout));
+        containers.push_back(store<T, Q>(grid, level + 1, fout));
     }
 
     // 8) recurse down
     if (level != 0) {
-        nonUniformTimestepRecursive<T, DIM, Q>(grid, omega0, level - 1, numLevels, cellType, fin, fout, containers);
+        nonUniformTimestepRecursive<T, Q>(grid, omega0, level - 1, numLevels, cellType, fin, fout, containers);
     }
 
     // 9) Streaming step
-    stream<T, DIM, Q>(grid, level, numLevels, cellType, fout, fin, containers);
+    stream<T, Q>(grid, level, numLevels, cellType, fout, fin, containers);
 }
 
 
@@ -572,7 +553,7 @@ inline float sdfCube(Neon::index_3d id, Neon::index_3d dim, Neon::float_3d b = {
     return val;
 }
 
-template <typename T, int DIM, int Q>
+template <typename T, int Q>
 void postProcess(Neon::domain::mGrid&                        grid,
                  const int                                   numLevels,
                  const Neon::domain::mGrid::Field<T>&        fpop,
@@ -595,7 +576,7 @@ void postProcess(Neon::domain::mGrid&                        grid,
 
 
                     return [=] NEON_CUDA_HOST_DEVICE(const typename Neon::domain::bGrid::Cell& cell) mutable {
-                        constexpr auto t = latticeWeight<DIM, Q>();
+                        constexpr auto t = latticeWeight<Q>();
 
                         if (type(cell, 0) == CellType::bulk) {
 
@@ -613,7 +594,7 @@ void postProcess(Neon::domain::mGrid&                        grid,
                             rh(cell, 0) = r;
 
                             //velocity
-                            const Neon::Vec_3d<T> vel = velocity<T, DIM, Q>(ins, r);
+                            const Neon::Vec_3d<T> vel = velocity<T, Q>(ins, r);
 
                             u(cell, 0) = vel.v[0];
                             u(cell, 1) = vel.v[1];
@@ -625,7 +606,7 @@ void postProcess(Neon::domain::mGrid&                        grid,
                             if (!pop.hasChildren(cell)) {
                                 assert(!pop.hasChildren(cell));
                                 for (int8_t q = 0; q < Q; ++q) {
-                                    const Neon::int8_3d dir = -getDir<DIM>(q);
+                                    const Neon::int8_3d dir = -getDir(q);
                                     if (!pop.hasChildren(cell, dir)) {
                                         auto neighbor = pop.nghVal(cell, dir, q, T(0));
                                         if (neighbor.isValid) {
@@ -637,9 +618,9 @@ void postProcess(Neon::domain::mGrid&                        grid,
 
                                 rh(cell, 0) = r;
 
-                                const Neon::Vec_3d<T> vel = velocity<T, DIM, Q>(ins, rho);
+                                const Neon::Vec_3d<T> vel = velocity<T, Q>(ins, rho);
 
-                                for (int d = 0; d < DIM; ++d) {
+                                for (int d = 0; d < 3; ++d) {
                                     u(cell, d) = vel.v[d];
                                 }
                             }*/
@@ -647,7 +628,7 @@ void postProcess(Neon::domain::mGrid&                        grid,
                         if (type(cell, 0) == CellType::movingWall) {
                             rh(cell, 0) = 1.0;
 
-                            for (int d = 0; d < DIM; ++d) {
+                            for (int d = 0; d < 3; ++d) {
                                 int i = (d == 0) ? 3 : ((d == 1) ? 1 : 9);
                                 u(cell, d) = pop(cell, i) / (6.0 * 1.0 / 18.0);
                             }
@@ -679,8 +660,7 @@ int main(int argc, char** argv)
         std::vector<int> gpu_ids{0};
         Neon::Backend    backend(gpu_ids, runtime);
 
-        constexpr int DIM = 3;
-        constexpr int Q = (DIM == 2) ? 9 : 27;
+        constexpr int Q = 19;
         constexpr int depth = 3;
 
         const Neon::domain::mGridDescriptor descriptor(depth);
@@ -715,7 +695,7 @@ int main(int argc, char** argv)
                  return sdfCube(id, grid_dim - 1) <= levelSDF[2] &&
                         sdfCube(id, grid_dim - 1) > levelSDF[3];
              }},
-            create_stencil<DIM, Q>(), descriptor);
+            Neon::domain::Stencil::s19_t(false), descriptor);
 
         //grid.topologyToVTK("lbm.vtk", false);
 
@@ -759,7 +739,7 @@ int main(int argc, char** argv)
 
 
         // init fin and fout
-        constexpr auto t = latticeWeight<DIM, Q>();
+        constexpr auto t = latticeWeight<Q>();
 
         auto init = [&](const Neon::int32_3d idx, const int q) {
             T ret = t.t[q];
@@ -770,12 +750,8 @@ int main(int argc, char** argv)
 
                 if (idx.y == grid_dim.y - 1) {
                     ret = 0;
-                    for (int d = 0; d < DIM; ++d) {
-                        if (DIM == 2) {
-                            ret += latticeVelocity2D[q][d] * ulid.v[d];
-                        } else {
-                            ret += latticeVelocity3D[q][d] * ulid.v[d];
-                        }
+                    for (int d = 0; d < 3; ++d) {
+                        ret += latticeVelocity3D[q][d] * ulid.v[d];
                     }
                     ret *= -6. * t.t[q] * ulb;
                 } else {
@@ -810,12 +786,12 @@ int main(int argc, char** argv)
 
         //skeleton
         std::vector<Neon::set::Container> containers;
-        nonUniformTimestepRecursive<T, DIM, Q>(grid,
-                                               omega,
-                                               descriptor.getDepth() - 1,
-                                               descriptor.getDepth(),
-                                               cellType,
-                                               fin, fout, containers);
+        nonUniformTimestepRecursive<T, Q>(grid,
+                                          omega,
+                                          descriptor.getDepth() - 1,
+                                          descriptor.getDepth(),
+                                          cellType,
+                                          fin, fout, containers);
 
         Neon::skeleton::Skeleton skl(grid.getBackend());
         skl.sequence(containers, "MultiResLBM");
@@ -825,9 +801,8 @@ int main(int argc, char** argv)
         for (int t = 0; t < max_iter; ++t) {
             skl.run();
             if (t % 100 == 0) {
-                postProcess<T, DIM, Q>(grid, descriptor.getDepth(), fout, cellType, t, vel, rho);
+                postProcess<T, Q>(grid, descriptor.getDepth(), fout, cellType, t, vel, rho);
             }
-            
         }
 
         grid.getBackend().syncAll();

From 4e1e083b0209cce029836eea425744ab3a2da7df Mon Sep 17 00:00:00 2001
From: Ahmed Mahmoud <ahmed.mahmoud@autodesk.com>
Date: Tue, 31 Jan 2023 20:59:51 -0500
Subject: [PATCH 15/24] fix initialization and stream bugs

---
 apps/lbmMultiRes/lbmMultiRes.cu | 187 ++++++++++++++------------------
 1 file changed, 80 insertions(+), 107 deletions(-)

diff --git a/apps/lbmMultiRes/lbmMultiRes.cu b/apps/lbmMultiRes/lbmMultiRes.cu
index 47036ad7..385400b6 100644
--- a/apps/lbmMultiRes/lbmMultiRes.cu
+++ b/apps/lbmMultiRes/lbmMultiRes.cu
@@ -59,41 +59,32 @@ NEON_CUDA_DEVICE_ONLY static constexpr char latticeOppositeID[27] = {
     0, 2, 1, 6, 8, 7, 3, 5, 4, 18, 20, 19, 24, 26, 25, 21, 23, 22, 9, 11, 10, 15, 17, 16, 12, 14, 13};*/
 
 NEON_CUDA_DEVICE_ONLY static constexpr char latticeVelocity3D[19][3] = {
-    {0, 0, 0},    //0  -> 0   || 00000 -> 00000
-    {0, -1, 0},   //1  -> 2   || 00001 -> 00010
-    {0, 1, 0},    //2  -> 1   || 00010 -> 00001
-    {-1, 0, 0},   //3  -> 6   || 00100 -> 00110
-    {-1, -1, 0},  //4  -> 8   || 00100 -> 01000
-    {-1, 1, 0},   //5  -> 7   || 00101 -> 00111
-    {1, 0, 0},    //6  -> 3   || 00110 -> 00011
-    {1, -1, 0},   //7  -> 5   || 00111 -> 00101
-    {1, 1, 0},    //8  -> 4   || 01000 -> 00100
-
-    {0, 0, -1},   //9  -> 18 || 01000 -> 10010
-    {0, -1, -1},  //10 -> 20 || 01010 -> 10100
-    {0, 1, -1},   //11 -> 19 || 01011 -> 10011
-    {-1, 0, -1},  //12 -> 24 || 01100 -> 11000
-    //{-1, -1, -1},//13 -> 26 || 01101 -> 11010
-    //{-1, 1, -1}, //14 -> 25 || 01110 -> 11001
-    {1, 0, -1},  //15 -> 21 || 01111 -> 10101
-    //{1, -1, -1}, //16 -> 23 || 10000 -> 10111
-    //{1, 1, -1},  //17 -> 22 || 10001 -> 10110
-
-
-    {0, 0, 1},   //18 -> 9   || 10010 -> 01001
-    {0, -1, 1},  //19 -> 11  || 10011 -> 01011
-    {0, 1, 1},   //20 -> 10  || 10100 -> 01010
-    {-1, 0, 1},  //21 -> 15  || 10101 -> 01111
-    //{-1, -1, 1},//22 -> 17  || 10110 -> 10001
-    //{-1, 1, 1}, //23 -> 16  || 10111 -> 10000
-    {1, 0, 1},  //24 -> 12  || 11000 -> 01100
-    //{1, -1, 1}, //25 -> 14  || 11001 -> 01110
-    //{1, 1, 1}   //26 -> 13  || 11010 -> 01101
+    {0, 0, 0},    //0  -> 0
+    {0, -1, 0},   //1  -> 2
+    {0, 1, 0},    //2  -> 1
+    {-1, 0, 0},   //3  -> 6
+    {-1, -1, 0},  //4  -> 8
+    {-1, 1, 0},   //5  -> 7
+    {1, 0, 0},    //6  -> 3
+    {1, -1, 0},   //7  -> 5
+    {1, 1, 0},    //8  -> 4
+
+    {0, 0, -1},   //9  -> 14
+    {0, -1, -1},  //10 -> 16
+    {0, 1, -1},   //11 -> 15
+    {-1, 0, -1},  //12 -> 18
+    {1, 0, -1},   //13 -> 17
+
+    {0, 0, 1},   //14 -> 9
+    {0, -1, 1},  //15 -> 11
+    {0, 1, 1},   //16 -> 10
+    {-1, 0, 1},  //17 -> 13
+    {1, 0, 1},   //18 -> 12
 
 };
 NEON_CUDA_DEVICE_ONLY static constexpr char latticeOppositeID[19] = {
     //opposite q for 2d is a subset of what is in 3d so only use one
-    0, 2, 1, 6, 8, 7, 3, 5, 4, 18, 20, 19, 24, 21, 9, 11, 10, 15, 12};
+    0, 2, 1, 6, 8, 7, 3, 5, 4, 14, 16, 15, 18, 17, 9, 11, 10, 13, 12};
 
 
 NEON_CUDA_HOST_DEVICE Neon::int8_3d getDir(const int8_t q)
@@ -258,12 +249,15 @@ Neon::set::Container stream(Neon::domain::mGrid&                        grid,
 
                             //if the neighbor cell has children, then this 'cell' is interfacing with L-1 (fine) along q direction
                             if (!fpost_stm.hasChildren(cell, dir)) {
-
-                                if (type.nghVal(cell, dir, 0, CellType::undefined).value == CellType::bulk) {
-                                    fpost_stm(cell, q) = fpost_col.nghVal(cell, dir, q, T(0)).value;
-                                } else {
-                                    const int8_t opposte_q = latticeOppositeID[q];
-                                    fpost_stm(cell, q) = fpost_col(cell, opposte_q) + fpost_col.nghVal(cell, dir, opposte_q, T(0)).value;
+                                auto nghType = type.nghVal(cell, dir, 0, CellType::undefined);
+
+                                if (nghType.isValid) {
+                                    if (nghType.value == CellType::bulk) {
+                                        fpost_stm(cell, q) = fpost_col.nghVal(cell, dir, q, T(0)).value;
+                                    } else {
+                                        const int8_t opposte_q = latticeOppositeID[q];
+                                        fpost_stm(cell, q) = fpost_col(cell, opposte_q) + fpost_col.nghVal(cell, dir, opposte_q, T(0)).value;
+                                    }
                                 }
                             }
                         }
@@ -578,59 +572,36 @@ void postProcess(Neon::domain::mGrid&                        grid,
                     return [=] NEON_CUDA_HOST_DEVICE(const typename Neon::domain::bGrid::Cell& cell) mutable {
                         constexpr auto t = latticeWeight<Q>();
 
-                        if (type(cell, 0) == CellType::bulk) {
+                        if (!pop.hasChildren(cell)) {
+                            if (type(cell, 0) == CellType::bulk) {
 
-                            //fin
-                            T ins[Q];
-                            for (int i = 0; i < Q; ++i) {
-                                ins[i] = pop(cell, i);
-                            }
-
-                            //density
-                            T r = 0;
-                            for (int i = 0; i < Q; ++i) {
-                                r += ins[i];
-                            }
-                            rh(cell, 0) = r;
-
-                            //velocity
-                            const Neon::Vec_3d<T> vel = velocity<T, Q>(ins, r);
-
-                            u(cell, 0) = vel.v[0];
-                            u(cell, 1) = vel.v[1];
-                            u(cell, 2) = vel.v[2];
-
-
-                            /*T st[Q];
-                            T r = 0;
-                            if (!pop.hasChildren(cell)) {
-                                assert(!pop.hasChildren(cell));
-                                for (int8_t q = 0; q < Q; ++q) {
-                                    const Neon::int8_3d dir = -getDir(q);
-                                    if (!pop.hasChildren(cell, dir)) {
-                                        auto neighbor = pop.nghVal(cell, dir, q, T(0));
-                                        if (neighbor.isValid) {
-                                            st[q] = neighbor.value;
-                                            r += st[q];
-                                        }
-                                    }
+                                //fin
+                                T ins[Q];
+                                for (int i = 0; i < Q; ++i) {
+                                    ins[i] = pop(cell, i);
                                 }
 
+                                //density
+                                T r = 0;
+                                for (int i = 0; i < Q; ++i) {
+                                    r += ins[i];
+                                }
                                 rh(cell, 0) = r;
 
-                                const Neon::Vec_3d<T> vel = velocity<T, Q>(ins, rho);
+                                //velocity
+                                const Neon::Vec_3d<T> vel = velocity<T, Q>(ins, r);
+
+                                u(cell, 0) = vel.v[0];
+                                u(cell, 1) = vel.v[1];
+                                u(cell, 2) = vel.v[2];
+                            }
+                            if (type(cell, 0) == CellType::movingWall) {
+                                rh(cell, 0) = 1.0;
 
                                 for (int d = 0; d < 3; ++d) {
-                                    u(cell, d) = vel.v[d];
+                                    int i = (d == 0) ? 3 : ((d == 1) ? 1 : 9);
+                                    u(cell, d) = pop(cell, i) / (6.0 * 1.0 / 18.0);
                                 }
-                            }*/
-                        }
-                        if (type(cell, 0) == CellType::movingWall) {
-                            rh(cell, 0) = 1.0;
-
-                            for (int d = 0; d < 3; ++d) {
-                                int i = (d == 0) ? 3 : ((d == 1) ? 1 : 9);
-                                u(cell, d) = pop(cell, i) / (6.0 * 1.0 / 18.0);
                             }
                         }
                     };
@@ -665,22 +636,22 @@ int main(int argc, char** argv)
 
         const Neon::domain::mGridDescriptor descriptor(depth);
 
-        //const Neon::index_3d grid_dim(144, 144, 144);
-        //float                levelSDF[depth + 1];
-        //levelSDF[0] = 0;
-        //levelSDF[1] = -28.0 / 144.0;
-        //levelSDF[2] = -56.0 / 144.0;
-        //levelSDF[3] = -1.0;
-
-
-        const Neon::index_3d grid_dim(24, 24, 24);
+        const Neon::index_3d grid_dim(144, 144, 144);
         float                levelSDF[depth + 1];
         levelSDF[0] = 0;
-        levelSDF[1] = -8 / 24.0;
-        levelSDF[2] = -16 / 24.0;
+        levelSDF[1] = -28.0 / 144.0;
+        levelSDF[2] = -56.0 / 144.0;
         levelSDF[3] = -1.0;
 
 
+        //const Neon::index_3d grid_dim(24, 24, 24);
+        //float                levelSDF[depth + 1];
+        //levelSDF[0] = 0;
+        //levelSDF[1] = -8 / 24.0;
+        //levelSDF[2] = -16 / 24.0;
+        //levelSDF[3] = -1.0;
+
+
         Neon::domain::mGrid grid(
             backend, grid_dim,
             {[&](const Neon::index_3d id) -> bool {
@@ -741,21 +712,23 @@ int main(int argc, char** argv)
         // init fin and fout
         constexpr auto t = latticeWeight<Q>();
 
-        auto init = [&](const Neon::int32_3d idx, const int q) {
+        auto init = [&](const Neon::int32_3d idx, const int q, const int l) {
             T ret = t.t[q];
 
-            if (idx.x == 0 || idx.x == grid_dim.x - 1 ||
-                idx.y == 0 || idx.y == grid_dim.y - 1 ||
-                idx.z == 0 || idx.z == grid_dim.z - 1) {
+            if (l == 0) {
+                if (idx.x == 0 || idx.x == grid_dim.x - 1 ||
+                    idx.y == 0 || idx.y == grid_dim.y - 1 ||
+                    idx.z == 0 || idx.z == grid_dim.z - 1) {
 
-                if (idx.y == grid_dim.y - 1) {
-                    ret = 0;
-                    for (int d = 0; d < 3; ++d) {
-                        ret += latticeVelocity3D[q][d] * ulid.v[d];
+                    if (idx.y == grid_dim.y - 1) {
+                        ret = 0;
+                        for (int d = 0; d < 3; ++d) {
+                            ret += latticeVelocity3D[q][d] * ulid.v[d];
+                        }
+                        ret *= -6. * t.t[q] * ulb;
+                    } else {
+                        ret = 0;
                     }
-                    ret *= -6. * t.t[q] * ulb;
-                } else {
-                    ret = 0;
                 }
             }
             return ret;
@@ -765,12 +738,12 @@ int main(int argc, char** argv)
             fin.forEachActiveCell(
                 l,
                 [&](const Neon::int32_3d idx, const int q, T& val) {
-                    val = init(idx, q);
+                    val = init(idx, q, l);
                 });
             fout.forEachActiveCell(
                 l,
                 [&](const Neon::int32_3d idx, const int q, T& val) {
-                    val = init(idx, q);
+                    val = init(idx, q, l);
                 });
             vel.forEachActiveCell(l, [&](const Neon::int32_3d, const int, T& val) {
                 val = 0;
@@ -795,7 +768,7 @@ int main(int argc, char** argv)
 
         Neon::skeleton::Skeleton skl(grid.getBackend());
         skl.sequence(containers, "MultiResLBM");
-        //skl.ioToDot("MultiResLBM", "", true);
+        // skl.ioToDot("MultiResLBM", "", true);
 
         //execution
         for (int t = 0; t < max_iter; ++t) {

From 999efd4295b114b8c014f11b9198c0b8e91a88ed Mon Sep 17 00:00:00 2001
From: Ahmed Mahmoud <ahmed.mahmoud@autodesk.com>
Date: Thu, 2 Feb 2023 11:18:03 -0500
Subject: [PATCH 16/24] better multi-res ioToVTK and minor fixes to multi-res
 LBM

---
 apps/lbmMultiRes/lbmMultiRes.cu               |  63 +++---
 .../Neon/domain/internal/mGrid/mField.h       |   8 +-
 .../Neon/domain/internal/mGrid/mField_imp.h   | 179 +++++++++++++++---
 .../Neon/domain/internal/mGrid/mGrid.h        |   1 -
 .../src/domain/internal/mGrid/mGrid.cpp       | 144 --------------
 .../tests/unit/gUt_mGrid/src/gUt_mGrid.cpp    |   4 +-
 .../unit/sUt_multiRes/src/MultiResChild.h     |   5 +-
 .../unit/sUt_multiRes/src/MultiResDemo.h      |   6 +-
 .../tests/unit/sUt_multiRes/src/MultiResMap.h |   5 +-
 .../unit/sUt_multiRes/src/MultiResParent.h    |   5 +-
 .../unit/sUt_multiRes/src/MultiResSkeleton.h  |   5 +-
 .../unit/sUt_multiRes/src/MultiResStencil.h   |   5 +-
 12 files changed, 207 insertions(+), 223 deletions(-)

diff --git a/apps/lbmMultiRes/lbmMultiRes.cu b/apps/lbmMultiRes/lbmMultiRes.cu
index 385400b6..49463060 100644
--- a/apps/lbmMultiRes/lbmMultiRes.cu
+++ b/apps/lbmMultiRes/lbmMultiRes.cu
@@ -2,6 +2,7 @@
 #include "Neon/domain/mGrid.h"
 #include "Neon/skeleton/Skeleton.h"
 
+
 enum CellType : int
 {
     bounceBack = 0,
@@ -209,6 +210,8 @@ Neon::set::Container collide(Neon::domain::mGrid&                        grid,
                             for (int d = 0; d < 3; ++d) {
                                 cu += latticeVelocity3D[i][d] * vel.v[d];
                             }
+                            cu *= 3.0;
+
                             //equilibrium
                             T feq = rho * t.t[i] * (1. + cu + 0.5 * cu * cu - usqr);
 
@@ -556,7 +559,7 @@ void postProcess(Neon::domain::mGrid&                        grid,
                  Neon::domain::mGrid::Field<T>&              vel,
                  Neon::domain::mGrid::Field<T>&              rho)
 {
-    //fpop.updateIO();
+    grid.getBackend().syncAll();
 
     for (int level = 0; level < numLevels; ++level) {
         auto container =
@@ -612,11 +615,17 @@ void postProcess(Neon::domain::mGrid&                        grid,
 
     grid.getBackend().syncAll();
 
-    //vel.updateIO();
-    //rho.updateIO();
+    if (grid.getBackend().runtime() == Neon::Runtime::stream) {
+        vel.updateIO();
+        rho.updateIO();
+    }
+
+    int                precision = 4;
+    std::ostringstream suffix;
+    suffix << std::setw(precision) << std::setfill('0') << iteration;
 
-    vel.ioToVtk("vel_" + std::to_string(iteration), "vel");
-    rho.ioToVtk("rho_" + std::to_string(iteration), "rho");
+    vel.ioToVtk("Velocity_" + suffix.str());
+    rho.ioToVtk("Density_" + suffix.str());
 }
 
 int main(int argc, char** argv)
@@ -636,22 +645,22 @@ int main(int argc, char** argv)
 
         const Neon::domain::mGridDescriptor descriptor(depth);
 
-        const Neon::index_3d grid_dim(144, 144, 144);
-        float                levelSDF[depth + 1];
-        levelSDF[0] = 0;
-        levelSDF[1] = -28.0 / 144.0;
-        levelSDF[2] = -56.0 / 144.0;
-        levelSDF[3] = -1.0;
-
-
-        //const Neon::index_3d grid_dim(24, 24, 24);
+        //const Neon::index_3d grid_dim(144, 144, 144);
         //float                levelSDF[depth + 1];
         //levelSDF[0] = 0;
-        //levelSDF[1] = -8 / 24.0;
-        //levelSDF[2] = -16 / 24.0;
+        //levelSDF[1] = -28.0 / 144.0;
+        //levelSDF[2] = -56.0 / 144.0;
         //levelSDF[3] = -1.0;
 
 
+        const Neon::index_3d grid_dim(24, 24, 24);
+        float                levelSDF[depth + 1];
+        levelSDF[0] = 0;
+        levelSDF[1] = -8 / 24.0;
+        levelSDF[2] = -16 / 24.0;
+        levelSDF[3] = -1.0;
+
+
         Neon::domain::mGrid grid(
             backend, grid_dim,
             {[&](const Neon::index_3d id) -> bool {
@@ -668,14 +677,11 @@ int main(int argc, char** argv)
              }},
             Neon::domain::Stencil::s19_t(false), descriptor);
 
-        //grid.topologyToVTK("lbm.vtk", false);
-
-
         //LBM problem
         const int             max_iter = 3000;
         const T               ulb = 0.04;
-        const T               Re = 1000;
-        const T               clength = grid_dim.x;
+        const T               Re = 50;
+        const T               clength = grid_dim.x * std::pow(2, -(descriptor.getDepth() - 1));
         const T               visclb = ulb * clength / Re;
         const T               omega = 1.0 / (3. * visclb + 0.5);
         const Neon::double_3d ulid(ulb, 0., 0.);
@@ -685,8 +691,9 @@ int main(int argc, char** argv)
         auto fout = grid.newField<T>("fout", Q, 0);
         auto cellType = grid.newField<CellType>("CellType", 1, CellType::bulk);
 
-        auto vel = grid.newField<T>("vel", 3, 0);
-        auto rho = grid.newField<T>("rho", 1, 0);
+        auto vel = grid.newField<T>("vel", 3, -50);
+        auto rho = grid.newField<T>("rho", 1, -50);
+
 
         //classify voxels
         for (int l = 0; l < descriptor.getDepth(); ++l) {
@@ -725,7 +732,7 @@ int main(int argc, char** argv)
                         for (int d = 0; d < 3; ++d) {
                             ret += latticeVelocity3D[q][d] * ulid.v[d];
                         }
-                        ret *= -6. * t.t[q] * ulb;
+                        ret *= -6. * t.t[q];
                     } else {
                         ret = 0;
                     }
@@ -755,8 +762,6 @@ int main(int argc, char** argv)
         fin.updateCompute();
         fout.updateCompute();
 
-        //fin.ioToVtk("fin", "f");
-
         //skeleton
         std::vector<Neon::set::Container> containers;
         nonUniformTimestepRecursive<T, Q>(grid,
@@ -773,13 +778,11 @@ int main(int argc, char** argv)
         //execution
         for (int t = 0; t < max_iter; ++t) {
             skl.run();
-            if (t % 100 == 0) {
+            if (t % 10 == 0) {
                 postProcess<T, Q>(grid, descriptor.getDepth(), fout, cellType, t, vel, rho);
             }
         }
 
-        grid.getBackend().syncAll();
-        //fin.updateIO();
-        //fout.updateIO();
+        postProcess<T, Q>(grid, descriptor.getDepth(), fout, cellType, max_iter, vel, rho);
     }
 }
\ No newline at end of file
diff --git a/libNeonDomain/include/Neon/domain/internal/mGrid/mField.h b/libNeonDomain/include/Neon/domain/internal/mGrid/mField.h
index 4458b1e0..0c0405bd 100644
--- a/libNeonDomain/include/Neon/domain/internal/mGrid/mField.h
+++ b/libNeonDomain/include/Neon/domain/internal/mGrid/mField.h
@@ -88,9 +88,11 @@ class mField
                            Neon::computeMode_t::computeMode_e                                            mode = Neon::computeMode_t::computeMode_e::par) -> void;
 
 
-    auto ioToVtk(const std::string& fileName,
-                 const std::string& FieldName,
-                 Neon::IoFileType   ioFileType = Neon::IoFileType::ASCII) const -> void;
+    auto ioToVtk(std::string fileName,
+                 bool        outputLevels = true,
+                 bool        outputBlockID = true,
+                 bool        outputVoxelID = true,
+                 bool        filterOverlaps = true) const -> void;
 
     auto load(Neon::set::Loader loader, int level, Neon::MultiResCompute compute) -> typename xField<T, C>::Partition&;
 
diff --git a/libNeonDomain/include/Neon/domain/internal/mGrid/mField_imp.h b/libNeonDomain/include/Neon/domain/internal/mGrid/mField_imp.h
index 9f0d9936..1df303fd 100644
--- a/libNeonDomain/include/Neon/domain/internal/mGrid/mField_imp.h
+++ b/libNeonDomain/include/Neon/domain/internal/mGrid/mField_imp.h
@@ -262,39 +262,172 @@ auto mField<T, C>::load(Neon::set::Loader     loader,
 
 
 template <typename T, int C>
-auto mField<T, C>::ioToVtk(const std::string& fileName,
-                           const std::string& fieldName,
-                           Neon::IoFileType   ioFileType) const -> void
+auto mField<T, C>::ioToVtk(std::string fileName,
+                           bool        outputLevels,
+                           bool        outputBlockID,
+                           bool        outputVoxelID,
+                           bool        filterOverlaps) const -> void
 {
-    const auto& descriptor = mData->grid->getDescriptor();
+    auto l0Dim = mData->grid->getDimension(0);
+
+    std::ofstream file(fileName + ".vtk");
+    file << "# vtk DataFile Version 2.0\n";
+    file << "mGrid\n";
+    file << "ASCII\n";
+    file << "DATASET UNSTRUCTURED_GRID\n";
+    file << "POINTS " << (l0Dim.rMax() + 1) * (l0Dim.rMax() + 1) * (l0Dim.rMax() + 1) << " float \n";
+    for (int z = 0; z < l0Dim.rMax() + 1; ++z) {
+        for (int y = 0; y < l0Dim.rMax() + 1; ++y) {
+            for (int x = 0; x < l0Dim.rMax() + 1; ++x) {
+                file << x << " " << y << " " << z << "\n";
+            }
+        }
+    }
 
-    for (int l = 0; l < descriptor.getDepth(); ++l) {
+    uint64_t num_cells = 0;
+
+    auto mapTo1D = [&](int x, int y, int z) {
+        return x +
+               y * (l0Dim.rMax() + 1) +
+               z * (l0Dim.rMax() + 1) * (l0Dim.rMax() + 1);
+    };
+
+    enum class Op : int
+    {
+        Count = 0,
+        OutputTopology = 1,
+        OutputLevels = 2,
+        OutputBlockID = 3,
+        OutputVoxelID = 4,
+        OutputData = 5,
+    };
+
+    auto desc = mData->grid->getDescriptor();
+    auto card = (*this)(0).getCardinality();
+
+    auto loopOverActiveBlocks = [&](const Op op) {
+        for (int l = 0; l < desc.getDepth(); ++l) {
+            const int refFactor = desc.getRefFactor(l);
+            const int childSpacing = desc.getSpacing(l - 1);
+
+            (*(mData->grid))(l).getBlockOriginTo1D().forEach([&](const Neon::int32_3d blockOrigin, const uint32_t blockIdx) {
+                // TODO need to figure out which device owns this block
+                SetIdx devID(0);
+
+
+                for (int z = 0; z < refFactor; z++) {
+                    for (int y = 0; y < refFactor; y++) {
+                        for (int x = 0; x < refFactor; x++) {
+                            Cell cell(static_cast<Cell::Location::Integer>(x),
+                                      static_cast<Cell::Location::Integer>(y),
+                                      static_cast<Cell::Location::Integer>(z));
+                            cell.mBlockID = blockIdx;
+                            cell.mBlockSize = refFactor;
+
+                            if (cell.computeIsActive(
+                                    (*(mData->grid))(l).getActiveMask().rawMem(devID, Neon::DeviceType::CPU))) {
+
+                                Neon::int32_3d corner(blockOrigin.x + x * childSpacing,
+                                                      blockOrigin.y + y * childSpacing,
+                                                      blockOrigin.z + z * childSpacing);
+
+                                bool draw = true;
+                                if (filterOverlaps && l != 0) {
+                                    auto cornerIDIter = (*(mData->grid))(l - 1).getBlockOriginTo1D().getMetadata(corner);
+                                    if (cornerIDIter) {
+                                        draw = false;
+                                    }
+                                }
+
+                                if (draw) {
+                                    if (op == Op::Count) {
+                                        num_cells++;
+                                    } else if (op == Op::OutputTopology) {
+
+                                        file << "8 ";
+                                        //x,y,z
+                                        file << mapTo1D(corner.x, corner.y, corner.z) << " ";
+                                        //+x,y,z
+                                        file << mapTo1D(corner.x + childSpacing, corner.y, corner.z) << " ";
+
+                                        //x,+y,z
+                                        file << mapTo1D(corner.x, corner.y + childSpacing, corner.z) << " ";
+
+                                        //+x,+y,z
+                                        file << mapTo1D(corner.x + childSpacing, corner.y + childSpacing, corner.z) << " ";
+
+                                        //x,y,+z
+                                        file << mapTo1D(corner.x, corner.y, corner.z + childSpacing) << " ";
+
+                                        //+x,y,+z
+                                        file << mapTo1D(corner.x + childSpacing, corner.y, corner.z + childSpacing) << " ";
+
+                                        //x,+y,+z
+                                        file << mapTo1D(corner.x, corner.y + childSpacing, corner.z + childSpacing) << " ";
+
+                                        //+x,+y,+z
+                                        file << mapTo1D(corner.x + childSpacing, corner.y + childSpacing, corner.z + childSpacing) << " ";
+                                        file << "\n";
+                                    } else if (op == Op::OutputLevels) {
+                                        file << l << "\n";
+                                    } else if (op == Op::OutputBlockID) {
+                                        file << blockIdx << "\n";
+                                    } else if (op == Op::OutputVoxelID) {
+                                        file << x + y * refFactor + z * refFactor * refFactor
+                                             << "\n";
+                                    } else if (op == Op::OutputData) {
+                                        for (int c = 0; c < card; ++c) {
+                                            file << (*this)(l).getPartition(Neon::Execution::host, devID, Neon::DataView::STANDARD)(cell, c)<< "\n";
+                                        }
+                                    }
+                                }
+                            }
+                        }
+                    }
+                }
+            });
+        }
+    };
 
-        double spacing = double(descriptor.getSpacing(l - 1));
+    loopOverActiveBlocks(Op::Count);
 
-        auto iovtk = IoToVTK<int, T>(fileName + "_level" + std::to_string(l),
-                                     mData->grid->getDimension(l) + 1,
-                                     {spacing, spacing, spacing},
-                                     {0, 0, 0},
-                                     ioFileType);
+    file << "CELLS " << num_cells << " " << num_cells * 9 << " \n";
 
+    loopOverActiveBlocks(Op::OutputTopology);
 
-        iovtk.addField(
-            [&](Neon::index_3d idx, int card) -> T {
-                idx = descriptor.toBaseIndexSpace(idx, l);
+    file << "CELL_TYPES " << num_cells << " \n";
+    for (uint64_t i = 0; i < num_cells; ++i) {
+        file << 11 << "\n";
+    }
 
-                if (mData->grid->isInsideDomain(idx, l)) {
-                    return getReference(idx, card, l);
-                } else {
-                    return mData->fields[l].getOutsideValue();
-                }
-            },
-            mData->fields[l].getCardinality(), fieldName, ioToVTKns::VtiDataType_e::voxel);
+    file << "CELL_DATA " << num_cells << " \n";
 
-        iovtk.flushAndClear();
+    //data
+    file << "SCALARS " << (*this)(0).getName() << " float " << card << "\n";
+    file << "LOOKUP_TABLE default \n";
+    loopOverActiveBlocks(Op::OutputData);
+
+    if (outputLevels) {
+        file << "SCALARS Level int 1 \n";
+        file << "LOOKUP_TABLE default \n";
+        loopOverActiveBlocks(Op::OutputLevels);
+    }
+
+    if (outputBlockID) {
+        file << "SCALARS BlockID int 1 \n";
+        file << "LOOKUP_TABLE default \n";
+        loopOverActiveBlocks(Op::OutputBlockID);
     }
-}
 
+    if (outputVoxelID) {
+        file << "SCALARS VoxelID int 1 \n";
+        file << "LOOKUP_TABLE default \n";
+        loopOverActiveBlocks(Op::OutputVoxelID);
+    }
+
+
+    file.close();
+}
 
 template <typename T, int C>
 auto mField<T, C>::getSharedMemoryBytes(const int32_t stencilRadius, int level) const -> size_t
diff --git a/libNeonDomain/include/Neon/domain/internal/mGrid/mGrid.h b/libNeonDomain/include/Neon/domain/internal/mGrid/mGrid.h
index 982b0040..72b50d14 100644
--- a/libNeonDomain/include/Neon/domain/internal/mGrid/mGrid.h
+++ b/libNeonDomain/include/Neon/domain/internal/mGrid/mGrid.h
@@ -175,7 +175,6 @@ class mGrid
     auto getDescriptor() const -> const mGridDescriptor&;
     auto getRefFactors() const -> const Neon::set::MemSet_t<int>&;
     auto getLevelSpacing() const -> const Neon::set::MemSet_t<int>&;
-    void topologyToVTK(std::string fileName, bool filterOverlaps) const;
     auto getBackend() const -> const Backend&;
     auto getBackend() -> Backend&;
 
diff --git a/libNeonDomain/src/domain/internal/mGrid/mGrid.cpp b/libNeonDomain/src/domain/internal/mGrid/mGrid.cpp
index 5848e3f2..285b92e0 100644
--- a/libNeonDomain/src/domain/internal/mGrid/mGrid.cpp
+++ b/libNeonDomain/src/domain/internal/mGrid/mGrid.cpp
@@ -596,148 +596,4 @@ auto mGrid::getBackend() -> Backend&
     return mData->backend;
 }
 
-void mGrid::topologyToVTK(std::string fileName, bool filterOverlaps) const
-{
-
-    std::ofstream file(fileName);
-    file << "# vtk DataFile Version 2.0\n";
-    file << "mGrid\n";
-    file << "ASCII\n";
-    file << "DATASET UNSTRUCTURED_GRID\n";
-    file << "POINTS " << (getDimension(0).rMax() + 1) * (getDimension(0).rMax() + 1) * (getDimension(0).rMax() + 1) << " float \n";
-    for (int z = 0; z < getDimension(0).rMax() + 1; ++z) {
-        for (int y = 0; y < getDimension(0).rMax() + 1; ++y) {
-            for (int x = 0; x < getDimension(0).rMax() + 1; ++x) {
-                file << x << " " << y << " " << z << "\n";
-            }
-        }
-    }
-
-    uint64_t num_cells = 0;
-
-    auto mapTo1D = [&](int x, int y, int z) {
-        return x +
-               y * (getDimension(0).rMax() + 1) +
-               z * (getDimension(0).rMax() + 1) * (getDimension(0).rMax() + 1);
-    };
-
-    enum class Op : int
-    {
-        Count = 0,
-        OutputTopology = 1,
-        OutputLevels = 2,
-        OutputBlockID = 3,
-        OutputVoxelID = 4,
-    };
-
-    auto loopOverActiveBlocks = [&](const Op op) {
-        for (int l = 0; l < mData->mDescriptor.getDepth(); ++l) {
-            const int refFactor = mData->mDescriptor.getRefFactor(l);
-            const int childSpacing = mData->mDescriptor.getSpacing(l - 1);
-
-            mData->grids[l].getBlockOriginTo1D().forEach([&](const Neon::int32_3d blockOrigin, const uint32_t blockIdx) {
-                // TODO need to figure out which device owns this block
-                SetIdx devID(0);
-
-
-                for (int z = 0; z < refFactor; z++) {
-                    for (int y = 0; y < refFactor; y++) {
-                        for (int x = 0; x < refFactor; x++) {
-                            Cell cell(static_cast<Cell::Location::Integer>(x),
-                                      static_cast<Cell::Location::Integer>(y),
-                                      static_cast<Cell::Location::Integer>(z));
-                            cell.mBlockID = blockIdx;
-                            cell.mBlockSize = refFactor;
-
-                            if (cell.computeIsActive(
-                                    mData->grids[l].getActiveMask().rawMem(devID, Neon::DeviceType::CPU))) {
-
-                                Neon::int32_3d corner(blockOrigin.x + x * childSpacing,
-                                                      blockOrigin.y + y * childSpacing,
-                                                      blockOrigin.z + z * childSpacing);
-
-                                bool draw = true;
-                                if (filterOverlaps && l != 0) {
-                                    auto cornerIDIter = mData->grids[l - 1].getBlockOriginTo1D().getMetadata(corner);
-                                    if (cornerIDIter) {
-                                        draw = false;
-                                    }
-                                }
-
-                                if (draw) {
-                                    if (op == Op::Count) {
-                                        num_cells++;
-                                    } else if (op == Op::OutputTopology) {
-
-                                        file << "8 ";
-                                        //x,y,z
-                                        file << mapTo1D(corner.x, corner.y, corner.z) << " ";
-                                        //+x,y,z
-                                        file << mapTo1D(corner.x + childSpacing, corner.y, corner.z) << " ";
-
-                                        //x,+y,z
-                                        file << mapTo1D(corner.x, corner.y + childSpacing, corner.z) << " ";
-
-                                        //+x,+y,z
-                                        file << mapTo1D(corner.x + childSpacing, corner.y + childSpacing, corner.z) << " ";
-
-                                        //x,y,+z
-                                        file << mapTo1D(corner.x, corner.y, corner.z + childSpacing) << " ";
-
-                                        //+x,y,+z
-                                        file << mapTo1D(corner.x + childSpacing, corner.y, corner.z + childSpacing) << " ";
-
-                                        //x,+y,+z
-                                        file << mapTo1D(corner.x, corner.y + childSpacing, corner.z + childSpacing) << " ";
-
-                                        //+x,+y,+z
-                                        file << mapTo1D(corner.x + childSpacing, corner.y + childSpacing, corner.z + childSpacing) << " ";
-                                        file << "\n";
-                                    } else if (op == Op::OutputLevels) {
-                                        file << l << "\n";
-                                    } else if (op == Op::OutputBlockID) {
-                                        file << blockIdx << "\n";
-                                    } else if (op == Op::OutputVoxelID) {
-                                        file << blockIdx << "\n";
-                                    }
-                                }
-                            }
-                        }
-                    }
-                }
-            });
-        }
-    };
-
-    loopOverActiveBlocks(Op::Count);
-
-    file << "CELLS " << num_cells << " " << num_cells * 9 << " \n";
-
-    loopOverActiveBlocks(Op::OutputTopology);
-
-    file << "CELL_TYPES " << num_cells << " \n";
-    for (uint64_t i = 0; i < num_cells; ++i) {
-        file << 11 << "\n";
-    }
-
-    file << "CELL_DATA " << num_cells << " \n";
-
-    file << "SCALARS Level int 1 \n";
-    file << "LOOKUP_TABLE default \n";
-    loopOverActiveBlocks(Op::OutputLevels);
-
-
-    file << "SCALARS BlockID int 1 \n";
-    file << "LOOKUP_TABLE default \n";
-    loopOverActiveBlocks(Op::OutputBlockID);
-
-    //file << "SCALARS VoxelID int 1 \n";
-    //file << "LOOKUP_TABLE default \n";
-    //loopOverActiveBlocks(Op::OutputVoxelID);
-
-
-    file.close();
-}
-
-
 }  // namespace Neon::domain::internal::mGrid
\ No newline at end of file
diff --git a/libNeonDomain/tests/unit/gUt_mGrid/src/gUt_mGrid.cpp b/libNeonDomain/tests/unit/gUt_mGrid/src/gUt_mGrid.cpp
index bbf3af07..f854bf15 100644
--- a/libNeonDomain/tests/unit/gUt_mGrid/src/gUt_mGrid.cpp
+++ b/libNeonDomain/tests/unit/gUt_mGrid/src/gUt_mGrid.cpp
@@ -29,8 +29,6 @@ TEST(mGrid, multiRes)
             Neon::domain::Stencil::s7_Laplace_t(),
             descriptor);
 
-        grid.topologyToVTK("bGrid112.vtk", false);
-
         auto field = grid.newField<float>("myField", 1, 0);
 
         for (int l = 0; l < descriptor.getDepth(); ++l) {
@@ -42,7 +40,7 @@ TEST(mGrid, multiRes)
                 Neon::computeMode_t::computeMode_e::seq);
         }
 
-        field.ioToVtk("f", "f");
+        field.ioToVtk("f");
     }
 }
 
diff --git a/libNeonSkeleton/tests/unit/sUt_multiRes/src/MultiResChild.h b/libNeonSkeleton/tests/unit/sUt_multiRes/src/MultiResChild.h
index efec28db..abbec0f8 100644
--- a/libNeonSkeleton/tests/unit/sUt_multiRes/src/MultiResChild.h
+++ b/libNeonSkeleton/tests/unit/sUt_multiRes/src/MultiResChild.h
@@ -32,8 +32,7 @@ void MultiResChild()
              }},
             Neon::domain::Stencil::s7_Laplace_t(),
             descriptor);
-        //grid.topologyToVTK("grid112.vtk", false);
-
+        
         auto XField = grid.newField<Type>("XField", 1, -1);
         auto isRefinedField = grid.newField<Type>("isRefined", 1, -1);
 
@@ -56,7 +55,7 @@ void MultiResChild()
             XField.updateCompute();
             isRefinedField.updateCompute();
         }
-        //XField.ioToVtk("f", "f");
+        //XField.ioToVtk("f");
 
         for (int level = 0; level < descriptor.getDepth(); ++level) {
 
diff --git a/libNeonSkeleton/tests/unit/sUt_multiRes/src/MultiResDemo.h b/libNeonSkeleton/tests/unit/sUt_multiRes/src/MultiResDemo.h
index a23b979b..a9f40076 100644
--- a/libNeonSkeleton/tests/unit/sUt_multiRes/src/MultiResDemo.h
+++ b/libNeonSkeleton/tests/unit/sUt_multiRes/src/MultiResDemo.h
@@ -128,9 +128,7 @@ void MultiResDemo()
     for (int i = 0; i < descriptor.getDepth(); ++i) {
         s << descriptor.getLog2RefFactor(i);
     }
-
-    grid.topologyToVTK(s.str() + ".vtk", false);
-
+        
     auto field = grid.newField<float>("myField", 1, -10000);
 
     for (int l = 0; l < descriptor.getDepth(); ++l) {
@@ -184,7 +182,7 @@ void MultiResDemo()
 
     field.updateIO();
 
-    field.ioToVtk(s.str(), "f");
+    field.ioToVtk(s.str());
 }
 
 
diff --git a/libNeonSkeleton/tests/unit/sUt_multiRes/src/MultiResMap.h b/libNeonSkeleton/tests/unit/sUt_multiRes/src/MultiResMap.h
index 46070ea8..009b31dd 100644
--- a/libNeonSkeleton/tests/unit/sUt_multiRes/src/MultiResMap.h
+++ b/libNeonSkeleton/tests/unit/sUt_multiRes/src/MultiResMap.h
@@ -30,8 +30,7 @@ void MultiResSingleMap()
              }},
             Neon::domain::Stencil::s7_Laplace_t(),
             descriptor);
-        //grid.topologyToVTK("grid111.vtk", false);
-
+        
         auto XField = grid.newField<Type>("XField", 1, -1);
         auto YField = grid.newField<Type>("YField", 1, -1);
 
@@ -54,7 +53,7 @@ void MultiResSingleMap()
             XField.updateCompute();
             YField.updateCompute();
         }
-        //XField.ioToVtk("f", "f");
+        //XField.ioToVtk("f");
 
 
         for (int level = 0; level < descriptor.getDepth(); ++level) {
diff --git a/libNeonSkeleton/tests/unit/sUt_multiRes/src/MultiResParent.h b/libNeonSkeleton/tests/unit/sUt_multiRes/src/MultiResParent.h
index c631ee76..0066e316 100644
--- a/libNeonSkeleton/tests/unit/sUt_multiRes/src/MultiResParent.h
+++ b/libNeonSkeleton/tests/unit/sUt_multiRes/src/MultiResParent.h
@@ -154,8 +154,7 @@ void MultiResAtomicAddParent()
              }},
             Neon::domain::Stencil::s7_Laplace_t(),
             descriptor);
-        //grid.topologyToVTK("grid111.vtk", false);
-
+        
         auto XField = grid.newField<Type>("XField", 1, -1);
 
 
@@ -171,7 +170,7 @@ void MultiResAtomicAddParent()
         if (bk.runtime() == Neon::Runtime::stream) {
             XField.updateCompute();
         }
-        //XField.ioToVtk("f", "f");
+        //XField.ioToVtk("f");
 
 
         for (int level = 0; level < descriptor.getDepth(); ++level) {
diff --git a/libNeonSkeleton/tests/unit/sUt_multiRes/src/MultiResSkeleton.h b/libNeonSkeleton/tests/unit/sUt_multiRes/src/MultiResSkeleton.h
index 324a7577..9b978977 100644
--- a/libNeonSkeleton/tests/unit/sUt_multiRes/src/MultiResSkeleton.h
+++ b/libNeonSkeleton/tests/unit/sUt_multiRes/src/MultiResSkeleton.h
@@ -29,8 +29,7 @@ void MultiResSkeleton()
              }},
             Neon::domain::Stencil::s7_Laplace_t(),
             descriptor);
-        //grid.topologyToVTK("grid111.vtk", false);
-
+        
         auto field = grid.newField<Type>("field", 3, -1);
 
 
@@ -46,7 +45,7 @@ void MultiResSkeleton()
         if (bk.runtime() == Neon::Runtime::stream) {
             field.updateCompute();
         }
-        //field.ioToVtk("f", "f");
+        //field.ioToVtk("f");
 
         std::vector<Neon::set::Container> containers;
 
diff --git a/libNeonSkeleton/tests/unit/sUt_multiRes/src/MultiResStencil.h b/libNeonSkeleton/tests/unit/sUt_multiRes/src/MultiResStencil.h
index e4a1f72e..3000d6bf 100644
--- a/libNeonSkeleton/tests/unit/sUt_multiRes/src/MultiResStencil.h
+++ b/libNeonSkeleton/tests/unit/sUt_multiRes/src/MultiResStencil.h
@@ -29,8 +29,7 @@ void MultiResSameLevelStencil()
              }},
             Neon::domain::Stencil::s7_Laplace_t(),
             descriptor);
-        //grid.topologyToVTK("grid111.vtk", false);
-
+        
         auto XField = grid.newField<Type>("XField", 1, -1);
         auto YField = grid.newField<Type>("YField", 1, -1);
 
@@ -53,7 +52,7 @@ void MultiResSameLevelStencil()
             XField.updateCompute();
             YField.updateCompute();
         }
-        //XField.ioToVtk("f", "f");
+        //XField.ioToVtk("f");
 
 
         for (int level = 0; level < descriptor.getDepth(); ++level) {

From b09c79c4a4b598274f00d73a3f1c0d4b7f01eb6b Mon Sep 17 00:00:00 2001
From: Ahmed Mahmoud <ahmed.mahmoud@autodesk.com>
Date: Thu, 2 Feb 2023 15:23:26 -0500
Subject: [PATCH 17/24] matching single resolution results

---
 apps/lbmMultiRes/lbmMultiRes.cu               | 32 ++++++-------------
 .../src/domain/internal/bGrid/bGrid.cpp       | 10 +++---
 2 files changed, 15 insertions(+), 27 deletions(-)

diff --git a/apps/lbmMultiRes/lbmMultiRes.cu b/apps/lbmMultiRes/lbmMultiRes.cu
index 49463060..7e0a0743 100644
--- a/apps/lbmMultiRes/lbmMultiRes.cu
+++ b/apps/lbmMultiRes/lbmMultiRes.cu
@@ -11,17 +11,6 @@ enum CellType : int
     undefined = 3,
 };
 
-/*NEON_CUDA_DEVICE_ONLY static constexpr char latticeVelocity2D[9][2] = {
-    {0, 0},    //0 -> 0 || 0000 -> 0000
-    {0, -1},   //1 -> 2 || 0001 -> 0010
-    {0, 1},    //2 -> 1 || 0010 -> 0001
-    {-1, 0},   //3 -> 6 || 0100 -> 0110
-    {-1, -1},  //4 -> 8 || 0100 -> 1000
-    {-1, 1},   //5 -> 7 || 0101 -> 0111
-    {1, 0},    //6 -> 3 || 0110 -> 0011
-    {1, -1},   //7 -> 5 || 0111 -> 0101
-    {1, 1}};   //8 -> 4 || 1000 -> 0100*/
-
 /*NEON_CUDA_DEVICE_ONLY static constexpr char latticeVelocity3D[27][3] = {
     {0, 0, 0},    //0  -> 0   || 00000 -> 00000
     {0, -1, 0},   //1  -> 2   || 00001 -> 00010
@@ -617,7 +606,7 @@ void postProcess(Neon::domain::mGrid&                        grid,
 
     if (grid.getBackend().runtime() == Neon::Runtime::stream) {
         vel.updateIO();
-        rho.updateIO();
+        //rho.updateIO();
     }
 
     int                precision = 4;
@@ -625,7 +614,7 @@ void postProcess(Neon::domain::mGrid&                        grid,
     suffix << std::setw(precision) << std::setfill('0') << iteration;
 
     vel.ioToVtk("Velocity_" + suffix.str());
-    rho.ioToVtk("Density_" + suffix.str());
+    //rho.ioToVtk("Density_" + suffix.str());
 }
 
 int main(int argc, char** argv)
@@ -636,7 +625,7 @@ int main(int argc, char** argv)
         using T = double;
 
         //Neon grid
-        auto             runtime = Neon::Runtime::openmp;
+        auto             runtime = Neon::Runtime::stream;
         std::vector<int> gpu_ids{0};
         Neon::Backend    backend(gpu_ids, runtime);
 
@@ -652,7 +641,6 @@ int main(int argc, char** argv)
         //levelSDF[2] = -56.0 / 144.0;
         //levelSDF[3] = -1.0;
 
-
         const Neon::index_3d grid_dim(24, 24, 24);
         float                levelSDF[depth + 1];
         levelSDF[0] = 0;
@@ -678,10 +666,10 @@ int main(int argc, char** argv)
             Neon::domain::Stencil::s19_t(false), descriptor);
 
         //LBM problem
-        const int             max_iter = 3000;
-        const T               ulb = 0.04;
-        const T               Re = 50;
-        const T               clength = grid_dim.x * std::pow(2, -(descriptor.getDepth() - 1));
+        const int             max_iter = 5000;
+        const T               ulb = 0.02;
+        const T               Re = 100;
+        const T               clength = (grid_dim.x * std::pow(2, -(descriptor.getDepth() - 1)) - 1);
         const T               visclb = ulb * clength / Re;
         const T               omega = 1.0 / (3. * visclb + 0.5);
         const Neon::double_3d ulid(ulb, 0., 0.);
@@ -691,8 +679,8 @@ int main(int argc, char** argv)
         auto fout = grid.newField<T>("fout", Q, 0);
         auto cellType = grid.newField<CellType>("CellType", 1, CellType::bulk);
 
-        auto vel = grid.newField<T>("vel", 3, -50);
-        auto rho = grid.newField<T>("rho", 1, -50);
+        auto vel = grid.newField<T>("vel", 3, 0);
+        auto rho = grid.newField<T>("rho", 1, 0);
 
 
         //classify voxels
@@ -778,7 +766,7 @@ int main(int argc, char** argv)
         //execution
         for (int t = 0; t < max_iter; ++t) {
             skl.run();
-            if (t % 10 == 0) {
+            if (t % 100 == 0) {
                 postProcess<T, Q>(grid, descriptor.getDepth(), fout, cellType, t, vel, rho);
             }
         }
diff --git a/libNeonDomain/src/domain/internal/bGrid/bGrid.cpp b/libNeonDomain/src/domain/internal/bGrid/bGrid.cpp
index 0692f6a9..69479186 100644
--- a/libNeonDomain/src/domain/internal/bGrid/bGrid.cpp
+++ b/libNeonDomain/src/domain/internal/bGrid/bGrid.cpp
@@ -72,14 +72,14 @@ auto bGrid::setReduceEngine(Neon::sys::patterns::Engine eng) -> void
     }
 }
 
-auto bGrid::getLaunchParameters(Neon::DataView                         dataView,
+auto bGrid::getLaunchParameters([[maybe_unused]] Neon::DataView        dataView,
                                 [[maybe_unused]] const Neon::index_3d& blockSize,
                                 const size_t&                          sharedMem) const -> Neon::set::LaunchParameters
 {
-    if (dataView != Neon::DataView::STANDARD) {
-        NEON_WARNING("Requesting LaunchParameters on {} data view but bGrid only supports Standard data view on a single GPU",
-                     Neon::DataViewUtil::toString(dataView));
-    }
+    //if (dataView != Neon::DataView::STANDARD) {
+    //    NEON_WARNING("Requesting LaunchParameters on {} data view but bGrid only supports Standard data view on a single GPU",
+    //                 Neon::DataViewUtil::toString(dataView));
+    //}
 
     const Neon::int32_3d cuda_block(mData->blockSize,
                                     mData->blockSize,

From 99e1869fc8e690ba37b84665c22a2b14f77fbb8d Mon Sep 17 00:00:00 2001
From: Ahmed Mahmoud <ahmed.mahmoud@autodesk.com>
Date: Fri, 3 Feb 2023 19:09:43 -0500
Subject: [PATCH 18/24] fix dependency for stencil up/down

---
 .../Neon/domain/internal/mGrid/mField_imp.h        | 14 +++++++++-----
 .../tests/unit/sUt_multiRes/src/MultiResDemo.h     |  1 +
 2 files changed, 10 insertions(+), 5 deletions(-)

diff --git a/libNeonDomain/include/Neon/domain/internal/mGrid/mField_imp.h b/libNeonDomain/include/Neon/domain/internal/mGrid/mField_imp.h
index 1df303fd..563b0b89 100644
--- a/libNeonDomain/include/Neon/domain/internal/mGrid/mField_imp.h
+++ b/libNeonDomain/include/Neon/domain/internal/mGrid/mField_imp.h
@@ -216,12 +216,14 @@ auto mField<T, C>::load(Neon::set::Loader     loader,
             break;
         }
         case Neon::MultiResCompute::STENCIL_UP: {
-            loader.load(operator()(level + 1), Neon::Compute::MAP);
+            const auto& parent = operator()(level + 1);
+            loader.load(parent, Neon::Compute::STENCIL);
             return loader.load(operator()(level), Neon::Compute::MAP);
             break;
         }
         case Neon::MultiResCompute::STENCIL_DOWN: {
-            loader.load(operator()(level - 1), Neon::Compute::MAP);
+            const auto& child = operator()(level - 1);
+            loader.load(child, Neon::Compute::STENCIL);
             return loader.load(operator()(level), Neon::Compute::MAP);
             break;
         }
@@ -246,12 +248,14 @@ auto mField<T, C>::load(Neon::set::Loader     loader,
             break;
         }
         case Neon::MultiResCompute::STENCIL_UP: {
-            loader.load(operator()(level + 1), Neon::Compute::MAP);
+            const auto& parent = operator()(level + 1);
+            loader.load(parent, Neon::Compute::STENCIL);
             return loader.load(operator()(level), Neon::Compute::MAP);
             break;
         }
         case Neon::MultiResCompute::STENCIL_DOWN: {
-            loader.load(operator()(level - 1), Neon::Compute::MAP);
+            const auto& child = operator()(level - 1);
+            loader.load(child, Neon::Compute::STENCIL);
             return loader.load(operator()(level), Neon::Compute::MAP);
             break;
         }
@@ -377,7 +381,7 @@ auto mField<T, C>::ioToVtk(std::string fileName,
                                              << "\n";
                                     } else if (op == Op::OutputData) {
                                         for (int c = 0; c < card; ++c) {
-                                            file << (*this)(l).getPartition(Neon::Execution::host, devID, Neon::DataView::STANDARD)(cell, c)<< "\n";
+                                            file << (*this)(l).getPartition(Neon::Execution::host, devID, Neon::DataView::STANDARD)(cell, c) << "\n";
                                         }
                                     }
                                 }
diff --git a/libNeonSkeleton/tests/unit/sUt_multiRes/src/MultiResDemo.h b/libNeonSkeleton/tests/unit/sUt_multiRes/src/MultiResDemo.h
index a9f40076..21b65ed1 100644
--- a/libNeonSkeleton/tests/unit/sUt_multiRes/src/MultiResDemo.h
+++ b/libNeonSkeleton/tests/unit/sUt_multiRes/src/MultiResDemo.h
@@ -39,6 +39,7 @@ inline float sdfDodecahedron(Neon::float_3d p, float r = 1.0)
 
 inline float sdfMenger(Neon::float_3d p)
 {
+    //https://github.com/tovacinni/sdf-explorer/blob/master/data-files/sdf/Fractal/Menger.glsl
     auto mod = [](float x, float y) { return x - y * floor(x / y); };
 
     auto length = [&](const Neon::float_3d& x) -> float {

From 743eeff082787a3b599275033f13a9f29de747d0 Mon Sep 17 00:00:00 2001
From: Ahmed Mahmoud <ahmed.mahmoud@autodesk.com>
Date: Mon, 6 Feb 2023 19:18:18 -0500
Subject: [PATCH 19/24] multi-res lbm minor fixes

---
 apps/lbmMultiRes/lbmMultiRes.cu | 118 ++++++++++++++++++++++++++------
 1 file changed, 98 insertions(+), 20 deletions(-)

diff --git a/apps/lbmMultiRes/lbmMultiRes.cu b/apps/lbmMultiRes/lbmMultiRes.cu
index 7e0a0743..a9d58fe4 100644
--- a/apps/lbmMultiRes/lbmMultiRes.cu
+++ b/apps/lbmMultiRes/lbmMultiRes.cu
@@ -314,9 +314,10 @@ Neon::set::Container explosionPull(Neon::domain::mGrid&                 grid,
 
 
 template <typename T, int Q>
-Neon::set::Container coalescencePull(Neon::domain::mGrid&           grid,
-                                     int                            level,
-                                     Neon::domain::mGrid::Field<T>& postStreaming)
+Neon::set::Container coalescencePull(Neon::domain::mGrid&                 grid,
+                                     int                                  level,
+                                     const Neon::domain::mGrid::Field<T>& postCollision,
+                                     Neon::domain::mGrid::Field<T>&       postStreaming)
 {
     // Initiated by the coarse level (hence "pull"), this function simply read the missing population
     // across the interface between coarse<->fine boundary by reading the population prepare during the store()
@@ -324,7 +325,8 @@ Neon::set::Container coalescencePull(Neon::domain::mGrid&           grid,
     return grid.getContainer(
         "Coalescence" + std::to_string(level), level,
         [&, level](Neon::set::Loader& loader) {
-            auto& fpost_stm = postStreaming.load(loader, level, Neon::MultiResCompute::STENCIL_DOWN);
+            const auto& fpost_col = postCollision.load(loader, level, Neon::MultiResCompute::STENCIL);
+            auto&       fpost_stm = postStreaming.load(loader, level, Neon::MultiResCompute::MAP);
 
             return [=] NEON_CUDA_HOST_DEVICE(const typename Neon::domain::bGrid::Cell& cell) mutable {
                 //If this cell has children i.e., it is been refined, than we should not work on it
@@ -336,7 +338,7 @@ Neon::set::Container coalescencePull(Neon::domain::mGrid&           grid,
                         //if we have a neighbor at the same level that has been refined, then cell is on
                         //the interface and this is where we should do the coalescence
                         if (fpost_stm.hasChildren(cell, dir)) {
-                            auto neighbor = fpost_stm.nghVal(cell, dir, q, T(0));
+                            auto neighbor = fpost_col.nghVal(cell, dir, q, T(0));
                             if (neighbor.isValid) {
                                 fpost_stm(cell, q) = neighbor.value;
                             }
@@ -351,7 +353,7 @@ Neon::set::Container coalescencePull(Neon::domain::mGrid&           grid,
 template <typename T, int Q>
 Neon::set::Container store(Neon::domain::mGrid&           grid,
                            int                            level,
-                           Neon::domain::mGrid::Field<T>& postStreaming)
+                           Neon::domain::mGrid::Field<T>& postCollision)
 {
     //Initiated by the coarse level (level), this function prepares and stores the fine (level - 1)
     // information for further pulling initiated by the coarse (this) level invoked by coalescence_pull
@@ -367,17 +369,18 @@ Neon::set::Container store(Neon::domain::mGrid&           grid,
     return grid.getContainer(
         "store_" + std::to_string(level), level,
         [&, level](Neon::set::Loader& loader) {
-            auto& fpost_stm = postStreaming.load(loader, level, Neon::MultiResCompute::STENCIL_DOWN);
+            auto& fpost_col = postCollision.load(loader, level, Neon::MultiResCompute::STENCIL_DOWN);
 
             return [=] NEON_CUDA_HOST_DEVICE(const typename Neon::domain::bGrid::Cell& cell) mutable {
                 //if the cell is refined, we might need to store something in it for its neighbor
-                if (fpost_stm.hasChildren(cell)) {
+                if (fpost_col.hasChildren(cell)) {
 
-                    const int refFactor = fpost_stm.getRefFactor(level);
+                    const int refFactor = fpost_col.getRefFactor(level);
 
-                    bool should_accumelate = ((int(fpost_stm(cell, 0)) % refFactor) != 0);
+                    bool should_accumelate = ((int(fpost_col(cell, 0)) % refFactor) != 0);
 
-                    fpost_stm(cell, 0) += 1;
+                    //fpost_col(cell, 0) += 1;
+                    fpost_col(cell, 0) = (int(fpost_col(cell, 0)) + 1) % refFactor;
 
 
                     //for each direction aka for each neighbor
@@ -386,10 +389,10 @@ Neon::set::Container store(Neon::domain::mGrid&           grid,
                         const Neon::int8_3d q_dir = getDir(q);
 
                         //check if the neighbor in this direction has children
-                        auto neighborCell = fpost_stm.getNghCell(cell, q_dir);
+                        auto neighborCell = fpost_col.getNghCell(cell, q_dir);
                         if (neighborCell.isActive()) {
 
-                            if (!fpost_stm.hasChildren(neighborCell)) {
+                            if (!fpost_col.hasChildren(neighborCell)) {
                                 //now, we know that there is actually something we need to store for this neighbor
                                 //in cell along q (q_dir) direction
                                 int num = 0;
@@ -405,7 +408,7 @@ Neon::set::Container store(Neon::domain::mGrid&           grid,
                                     const Neon::int8_3d r_dir = q_dir - p_dir;
 
                                     //if this neighbor is refined
-                                    if (fpost_stm.hasChildren(cell, p_dir)) {
+                                    if (fpost_col.hasChildren(cell, p_dir)) {
 
                                         //for each children of p
                                         for (int8_t i = 0; i < refFactor; ++i) {
@@ -418,7 +421,7 @@ Neon::set::Container store(Neon::domain::mGrid&           grid,
                                                     const Neon::int8_3d cq = unlceOffset(c, q_dir);
                                                     if (cq == r_dir) {
                                                         num++;
-                                                        sum += fpost_stm.childVal(cell, c, q, 0).value;
+                                                        sum += fpost_col.childVal(cell, c, q, 0).value;
                                                     }
                                                 }
                                             }
@@ -427,9 +430,9 @@ Neon::set::Container store(Neon::domain::mGrid&           grid,
                                 }
 
                                 if (should_accumelate) {
-                                    fpost_stm(cell, q) += sum / static_cast<T>(num * refFactor);
+                                    fpost_col(cell, q) += sum / static_cast<T>(num * refFactor);
                                 } else {
-                                    fpost_stm(cell, q) = sum / static_cast<T>(num * refFactor);
+                                    fpost_col(cell, q) = sum / static_cast<T>(num * refFactor);
                                 }
                             }
                         }
@@ -467,7 +470,7 @@ void stream(Neon::domain::mGrid&                        grid,
         /* Coalescence: pull missing populations from finer neighbors by "smart" averaging fine (level-1) 
         * to coarse (level) communication, initiated by the coarse level ("Pull").
         */
-        containers.push_back(coalescencePull<T, Q>(grid, level, postStreaming));
+        containers.push_back(coalescencePull<T, Q>(grid, level, postCollision, postStreaming));
     }
 }
 
@@ -641,6 +644,7 @@ int main(int argc, char** argv)
         //levelSDF[2] = -56.0 / 144.0;
         //levelSDF[3] = -1.0;
 
+        //const Neon::index_3d grid_dim(48, 48, 48);
         const Neon::index_3d grid_dim(24, 24, 24);
         float                levelSDF[depth + 1];
         levelSDF[0] = 0;
@@ -666,7 +670,7 @@ int main(int argc, char** argv)
             Neon::domain::Stencil::s19_t(false), descriptor);
 
         //LBM problem
-        const int             max_iter = 5000;
+        const int             max_iter = 20000;
         const T               ulb = 0.02;
         const T               Re = 100;
         const T               clength = (grid_dim.x * std::pow(2, -(descriptor.getDepth() - 1)) - 1);
@@ -750,6 +754,79 @@ int main(int argc, char** argv)
         fin.updateCompute();
         fout.updateCompute();
 
+        //init fields
+        for (int level = 0; level < descriptor.getDepth(); ++level) {
+            auto container =
+                grid.getContainer(
+                    "Init_" + std::to_string(level), level,
+                    [&fin, &fout, &cellType, &vel, &rho, level, grid_dim, ulid, Q](Neon::set::Loader& loader) {
+                        auto& in = fin.load(loader, level, Neon::MultiResCompute::MAP);
+                        auto& out = fout.load(loader, level, Neon::MultiResCompute::MAP);
+                        auto& type = cellType.load(loader, level, Neon::MultiResCompute::MAP);
+                        auto& u = vel.load(loader, level, Neon::MultiResCompute::MAP);
+                        auto& rh = rho.load(loader, level, Neon::MultiResCompute::MAP);
+
+                        return [=] NEON_CUDA_HOST_DEVICE(const typename Neon::domain::bGrid::Cell& cell) mutable {
+                            constexpr auto t = latticeWeight<Q>();
+
+                            //velocity and density
+                            /*u(cell, 0) = 0;
+                            u(cell, 1) = 0;
+                            u(cell, 2) = 0;
+                            rh(cell, 0) = 0;
+                            type(cell, 0) = CellType::bulk;*/
+
+                            if (!in.hasChildren(cell)) {
+                                /*const Neon::index_3d idx = in.mapToGlobal(cell);
+
+                                //pop
+                                for (int q = 0; q < Q; ++q) {
+                                    T pop_init_val = t.t[q];
+
+                                    if (level == 0) {
+                                        if (idx.x == 0 || idx.x == grid_dim.x - 1 ||
+                                            idx.y == 0 || idx.y == grid_dim.y - 1 ||
+                                            idx.z == 0 || idx.z == grid_dim.z - 1) {
+                                            type(cell, 0) = CellType::bounceBack;
+
+                                            if (idx.y == grid_dim.y - 1) {
+                                                type(cell, 0) = CellType::movingWall;
+                                                pop_init_val = 0;
+                                                for (int d = 0; d < 3; ++d) {
+                                                    pop_init_val += latticeVelocity3D[q][d] * ulid.v[d];
+                                                }
+                                                pop_init_val *= -6. * t.t[q];
+                                            } else {
+                                                pop_init_val = 0;
+                                            }
+                                        }
+                                    }
+
+                                    out(cell, q) = pop_init_val;
+                                    in(cell, q) = pop_init_val;
+                                }*/
+
+
+                            } else {
+                                in(cell, 0) = 0;
+                                out(cell, 0) = 0;
+
+                                /*for (int q = 0; q < Q; ++q) {
+                                    in(cell, q) = 0;
+                                    out(cell, q) = 0;
+                                }*/
+                            }
+                        };
+                    });
+
+            container.run(0);
+        }
+
+        //rho.updateIO();
+        //vel.updateIO();
+        //fin.updateIO();
+        //fout.updateIO();
+
         //skeleton
         std::vector<Neon::set::Container> containers;
         nonUniformTimestepRecursive<T, Q>(grid,
@@ -765,8 +842,9 @@ int main(int argc, char** argv)
 
         //execution
         for (int t = 0; t < max_iter; ++t) {
+            printf("\n Iteration = %d", t);
             skl.run();
-            if (t % 100 == 0) {
+            if (t % 20 == 0) {
                 postProcess<T, Q>(grid, descriptor.getDepth(), fout, cellType, t, vel, rho);
             }
         }

From b80e4e6f05a2141aa304362d54edd89e04103d5b Mon Sep 17 00:00:00 2001
From: Ahmed Mahmoud <ahmed.mahmoud@autodesk.com>
Date: Tue, 7 Feb 2023 19:03:26 -0500
Subject: [PATCH 20/24] debugging multi-res lbm

---
 apps/lbmMultiRes/lbmMultiRes.cu               | 107 +++---------------
 .../domain/internal/mGrid/mPartition_imp.h    |   2 +-
 2 files changed, 17 insertions(+), 92 deletions(-)

diff --git a/apps/lbmMultiRes/lbmMultiRes.cu b/apps/lbmMultiRes/lbmMultiRes.cu
index a9d58fe4..3e897dde 100644
--- a/apps/lbmMultiRes/lbmMultiRes.cu
+++ b/apps/lbmMultiRes/lbmMultiRes.cu
@@ -105,7 +105,7 @@ struct latticeWeight
 
 
 template <typename T>
-NEON_CUDA_HOST_DEVICE inline Neon::int8_3d unlceOffset(const T& cell, const Neon::int8_3d& q)
+NEON_CUDA_HOST_DEVICE inline Neon::int8_3d uncleOffset(const T& cell, const Neon::int8_3d& q)
 {
     //given a local index within a cell and a population direction (q)
     //find the uncle's (the parent neighbor) offset from which the desired population (q) should be read
@@ -298,7 +298,9 @@ Neon::set::Container explosionPull(Neon::domain::mGrid&                 grid,
 
                                 //get the uncle direction/offset i.e., the neighbor of the cell's parent
                                 //this direction/offset is wrt to the cell's parent
-                                Neon::int8_3d uncleDir = unlceOffset(cell.mLocation, dir);
+                                Neon::int8_3d uncleDir = uncleOffset(cell.mLocation, dir);
+
+                                auto uncleLoc = fpost_col.getUncle(cell, uncleDir);
 
                                 auto uncle = fpost_col.uncleVal(cell, uncleDir, q, T(0));
                                 if (uncle.isValid) {
@@ -418,7 +420,7 @@ Neon::set::Container store(Neon::domain::mGrid&           grid,
 
                                                     //cq is coarse neighbor (i.e., uncle) that we need to go in order to read q
                                                     //for c (this is what we do for explosion but here we do this just for the check)
-                                                    const Neon::int8_3d cq = unlceOffset(c, q_dir);
+                                                    const Neon::int8_3d cq = uncleOffset(c, q_dir);
                                                     if (cq == r_dir) {
                                                         num++;
                                                         sum += fpost_col.childVal(cell, c, q, 0).value;
@@ -607,10 +609,10 @@ void postProcess(Neon::domain::mGrid&                        grid,
 
     grid.getBackend().syncAll();
 
-    if (grid.getBackend().runtime() == Neon::Runtime::stream) {
-        vel.updateIO();
-        //rho.updateIO();
-    }
+
+    vel.updateIO();
+    //rho.updateIO();
+
 
     int                precision = 4;
     std::ostringstream suffix;
@@ -644,8 +646,7 @@ int main(int argc, char** argv)
         //levelSDF[2] = -56.0 / 144.0;
         //levelSDF[3] = -1.0;
 
-        //const Neon::index_3d grid_dim(48, 48, 48);
-        const Neon::index_3d grid_dim(24, 24, 24);
+        const Neon::index_3d grid_dim(48, 48, 48);
         float                levelSDF[depth + 1];
         levelSDF[0] = 0;
         levelSDF[1] = -8 / 24.0;
@@ -687,73 +688,6 @@ int main(int argc, char** argv)
         auto rho = grid.newField<T>("rho", 1, 0);
 
 
-        //classify voxels
-        for (int l = 0; l < descriptor.getDepth(); ++l) {
-            cellType.forEachActiveCell(
-                l,
-                [&](const Neon::int32_3d idx, const int q, CellType& val) {
-                    val = CellType::bulk;
-                    if (idx.x == 0 || idx.x == grid_dim.x - 1 ||
-                        idx.y == 0 || idx.y == grid_dim.y - 1 ||
-                        idx.z == 0 || idx.z == grid_dim.z - 1) {
-
-                        val = CellType::bounceBack;
-
-                        if (idx.y == grid_dim.y - 1) {
-                            val = CellType::movingWall;
-                        }
-                    }
-                });
-        }
-        cellType.updateCompute();
-
-
-        // init fin and fout
-        constexpr auto t = latticeWeight<Q>();
-
-        auto init = [&](const Neon::int32_3d idx, const int q, const int l) {
-            T ret = t.t[q];
-
-            if (l == 0) {
-                if (idx.x == 0 || idx.x == grid_dim.x - 1 ||
-                    idx.y == 0 || idx.y == grid_dim.y - 1 ||
-                    idx.z == 0 || idx.z == grid_dim.z - 1) {
-
-                    if (idx.y == grid_dim.y - 1) {
-                        ret = 0;
-                        for (int d = 0; d < 3; ++d) {
-                            ret += latticeVelocity3D[q][d] * ulid.v[d];
-                        }
-                        ret *= -6. * t.t[q];
-                    } else {
-                        ret = 0;
-                    }
-                }
-            }
-            return ret;
-        };
-
-        for (int l = 0; l < descriptor.getDepth(); ++l) {
-            fin.forEachActiveCell(
-                l,
-                [&](const Neon::int32_3d idx, const int q, T& val) {
-                    val = init(idx, q, l);
-                });
-            fout.forEachActiveCell(
-                l,
-                [&](const Neon::int32_3d idx, const int q, T& val) {
-                    val = init(idx, q, l);
-                });
-            vel.forEachActiveCell(l, [&](const Neon::int32_3d, const int, T& val) {
-                val = 0;
-            });
-            rho.forEachActiveCell(l, [&](const Neon::int32_3d, const int, T& val) {
-                val = 0;
-            });
-        }
-        fin.updateCompute();
-        fout.updateCompute();
-
         //init fields
         for (int level = 0; level < descriptor.getDepth(); ++level) {
             auto container =
@@ -770,14 +704,14 @@ int main(int argc, char** argv)
                             constexpr auto t = latticeWeight<Q>();
 
                             //velocity and density
-                            /*u(cell, 0) = 0;
+                            u(cell, 0) = 0;
                             u(cell, 1) = 0;
                             u(cell, 2) = 0;
                             rh(cell, 0) = 0;
-                            type(cell, 0) = CellType::bulk;*/
+                            type(cell, 0) = CellType::bulk;
 
                             if (!in.hasChildren(cell)) {
-                                /*const Neon::index_3d idx = in.mapToGlobal(cell);
+                                const Neon::index_3d idx = in.mapToGlobal(cell);
 
                                 //pop
                                 for (int q = 0; q < Q; ++q) {
@@ -804,17 +738,10 @@ int main(int argc, char** argv)
 
                                     out(cell, q) = pop_init_val;
                                     in(cell, q) = pop_init_val;
-                                }*/
-
-
+                                }
                             } else {
                                 in(cell, 0) = 0;
                                 out(cell, 0) = 0;
-
-                                /*for (int q = 0; q < Q; ++q) {
-                                    in(cell, q) = 0;
-                                    out(cell, q) = 0;
-                                }*/
                             }
                         };
                     });
@@ -822,10 +749,8 @@ int main(int argc, char** argv)
             container.run(0);
         }
 
-        //rho.updateIO();
-        //vel.updateIO();
-        //fin.updateIO();
-        //fout.updateIO();
+        grid.getBackend().syncAll();
+
 
         //skeleton
         std::vector<Neon::set::Container> containers;
diff --git a/libNeonDomain/include/Neon/domain/internal/mGrid/mPartition_imp.h b/libNeonDomain/include/Neon/domain/internal/mGrid/mPartition_imp.h
index 77abe808..fd264720 100644
--- a/libNeonDomain/include/Neon/domain/internal/mGrid/mPartition_imp.h
+++ b/libNeonDomain/include/Neon/domain/internal/mGrid/mPartition_imp.h
@@ -233,7 +233,7 @@ NEON_CUDA_HOST_DEVICE inline auto mPartition<T, C>::getUncle(const Cell&   cell,
 {
     Cell uncle = getParent(cell);
     if (uncle.isActive()) {
-        uncle = this->getNghCell(uncle, direction, mParentNeighbourBlocks);
+        uncle = this->getNghCell(uncle, direction, (mParentNeighbourBlocks + (26 * uncle.mBlockID)));
         uncle.mBlockSize = mRefFactors[mLevel + 1];
         uncle.mIsActive = uncle.mBlockID != std::numeric_limits<uint32_t>::max();
     }

From 42c8a752bdd9c36f137b5f509a8640506906b536 Mon Sep 17 00:00:00 2001
From: Ahmed Mahmoud <ahmed.mahmoud@autodesk.com>
Date: Wed, 8 Feb 2023 07:50:39 -0500
Subject: [PATCH 21/24] fix windows compilation error

---
 apps/lbmMultiRes/lbmMultiRes.cu | 28 ++++++++++++++--------------
 1 file changed, 14 insertions(+), 14 deletions(-)

diff --git a/apps/lbmMultiRes/lbmMultiRes.cu b/apps/lbmMultiRes/lbmMultiRes.cu
index 3e897dde..e2edfcf9 100644
--- a/apps/lbmMultiRes/lbmMultiRes.cu
+++ b/apps/lbmMultiRes/lbmMultiRes.cu
@@ -639,20 +639,20 @@ int main(int argc, char** argv)
 
         const Neon::domain::mGridDescriptor descriptor(depth);
 
-        //const Neon::index_3d grid_dim(144, 144, 144);
-        //float                levelSDF[depth + 1];
-        //levelSDF[0] = 0;
-        //levelSDF[1] = -28.0 / 144.0;
-        //levelSDF[2] = -56.0 / 144.0;
-        //levelSDF[3] = -1.0;
-
-        const Neon::index_3d grid_dim(48, 48, 48);
+        const Neon::index_3d grid_dim(144, 144, 144);
         float                levelSDF[depth + 1];
         levelSDF[0] = 0;
-        levelSDF[1] = -8 / 24.0;
-        levelSDF[2] = -16 / 24.0;
+        levelSDF[1] = -28.0 / 144.0;
+        levelSDF[2] = -56.0 / 144.0;
         levelSDF[3] = -1.0;
 
+        //const Neon::index_3d grid_dim(48, 48, 48);
+        //float                levelSDF[depth + 1];
+        //levelSDF[0] = 0;
+        //levelSDF[1] = -8 / 24.0;
+        //levelSDF[2] = -16 / 24.0;
+        //levelSDF[3] = -1.0;
+
 
         Neon::domain::mGrid grid(
             backend, grid_dim,
@@ -690,10 +690,12 @@ int main(int argc, char** argv)
 
         //init fields
         for (int level = 0; level < descriptor.getDepth(); ++level) {
+            constexpr auto t = latticeWeight<Q>();
+
             auto container =
                 grid.getContainer(
                     "Init_" + std::to_string(level), level,
-                    [&fin, &fout, &cellType, &vel, &rho, level, grid_dim, ulid, Q](Neon::set::Loader& loader) {
+                    [&fin, &fout, &cellType, &vel, &rho, level, grid_dim, ulid, Q, t](Neon::set::Loader& loader) {
                         auto& in = fin.load(loader, level, Neon::MultiResCompute::MAP);
                         auto& out = fout.load(loader, level, Neon::MultiResCompute::MAP);
                         auto& type = cellType.load(loader, level, Neon::MultiResCompute::MAP);
@@ -701,8 +703,6 @@ int main(int argc, char** argv)
                         auto& rh = rho.load(loader, level, Neon::MultiResCompute::MAP);
 
                         return [=] NEON_CUDA_HOST_DEVICE(const typename Neon::domain::bGrid::Cell& cell) mutable {
-                            constexpr auto t = latticeWeight<Q>();
-
                             //velocity and density
                             u(cell, 0) = 0;
                             u(cell, 1) = 0;
@@ -769,7 +769,7 @@ int main(int argc, char** argv)
         for (int t = 0; t < max_iter; ++t) {
             printf("\n Iteration = %d", t);
             skl.run();
-            if (t % 20 == 0) {
+            if (t % 100 == 0) {
                 postProcess<T, Q>(grid, descriptor.getDepth(), fout, cellType, t, vel, rho);
             }
         }

From da517f8eb77eb6716535222b23c9cd681655abc8 Mon Sep 17 00:00:00 2001
From: Ahmed Mahmoud <ahmed.mahmoud@autodesk.com>
Date: Thu, 9 Feb 2023 12:00:55 -0500
Subject: [PATCH 22/24] bug fix in store()

---
 apps/lbmMultiRes/lbmMultiRes.cu | 20 +++++++++++++++-----
 1 file changed, 15 insertions(+), 5 deletions(-)

diff --git a/apps/lbmMultiRes/lbmMultiRes.cu b/apps/lbmMultiRes/lbmMultiRes.cu
index e2edfcf9..e4a8634a 100644
--- a/apps/lbmMultiRes/lbmMultiRes.cu
+++ b/apps/lbmMultiRes/lbmMultiRes.cu
@@ -405,6 +405,7 @@ Neon::set::Container store(Neon::domain::mGrid&           grid,
                                 for (int8_t p = 0; p < Q; ++p) {
                                     const Neon::int8_3d p_dir = getDir(p);
 
+                                    const auto p_cell = fpost_col.getNghCell(cell, p_dir);
                                     //relative direction of q w.r.t p
                                     //i.e., in which direction we should move starting from p to land on q
                                     const Neon::int8_3d r_dir = q_dir - p_dir;
@@ -422,8 +423,11 @@ Neon::set::Container store(Neon::domain::mGrid&           grid,
                                                     //for c (this is what we do for explosion but here we do this just for the check)
                                                     const Neon::int8_3d cq = uncleOffset(c, q_dir);
                                                     if (cq == r_dir) {
-                                                        num++;
-                                                        sum += fpost_col.childVal(cell, c, q, 0).value;
+                                                        auto childVal = fpost_col.childVal(p_cell, c, q, 0);
+                                                        if (childVal.isValid) {
+                                                            num++;
+                                                            sum += childVal.value;
+                                                        }
                                                     }
                                                 }
                                             }
@@ -647,31 +651,35 @@ int main(int argc, char** argv)
         levelSDF[3] = -1.0;
 
         //const Neon::index_3d grid_dim(48, 48, 48);
-        //float                levelSDF[depth + 1];
+        //float levelSDF[depth + 1];
         //levelSDF[0] = 0;
         //levelSDF[1] = -8 / 24.0;
         //levelSDF[2] = -16 / 24.0;
         //levelSDF[3] = -1.0;
 
+        //const Neon::index_3d grid_dim(12, 12, 4);
 
         Neon::domain::mGrid grid(
             backend, grid_dim,
             {[&](const Neon::index_3d id) -> bool {
                  return sdfCube(id, grid_dim - 1) <= levelSDF[0] &&
                         sdfCube(id, grid_dim - 1) > levelSDF[1];
+                 //return id.x < 4 && id.z == 0;
              },
              [&](const Neon::index_3d& id) -> bool {
                  return sdfCube(id, grid_dim - 1) <= levelSDF[1] &&
                         sdfCube(id, grid_dim - 1) > levelSDF[2];
+                 //return id.x >= 4 && id.x < 8 && id.z == 0;
              },
              [&](const Neon::index_3d& id) -> bool {
                  return sdfCube(id, grid_dim - 1) <= levelSDF[2] &&
                         sdfCube(id, grid_dim - 1) > levelSDF[3];
+                 //return id.z == 0;
              }},
             Neon::domain::Stencil::s19_t(false), descriptor);
 
         //LBM problem
-        const int             max_iter = 20000;
+        const int             max_iter = 200000;
         const T               ulb = 0.02;
         const T               Re = 100;
         const T               clength = (grid_dim.x * std::pow(2, -(descriptor.getDepth() - 1)) - 1);
@@ -752,6 +760,8 @@ int main(int argc, char** argv)
         grid.getBackend().syncAll();
 
 
+        //vel.ioToVtk("test");
+
         //skeleton
         std::vector<Neon::set::Container> containers;
         nonUniformTimestepRecursive<T, Q>(grid,
@@ -769,7 +779,7 @@ int main(int argc, char** argv)
         for (int t = 0; t < max_iter; ++t) {
             printf("\n Iteration = %d", t);
             skl.run();
-            if (t % 100 == 0) {
+            if (t % 200 == 0) {
                 postProcess<T, Q>(grid, descriptor.getDepth(), fout, cellType, t, vel, rho);
             }
         }

From 7623d74765343b785d37dacba759f3ed594eb7f6 Mon Sep 17 00:00:00 2001
From: Ahmed Mahmoud <ahmed.mahmoud@autodesk.com>
Date: Thu, 9 Feb 2023 15:57:52 -0500
Subject: [PATCH 23/24] parameter tuning

---
 apps/lbmMultiRes/lbmMultiRes.cu | 22 +++++++---------------
 1 file changed, 7 insertions(+), 15 deletions(-)

diff --git a/apps/lbmMultiRes/lbmMultiRes.cu b/apps/lbmMultiRes/lbmMultiRes.cu
index e4a8634a..099d8f1a 100644
--- a/apps/lbmMultiRes/lbmMultiRes.cu
+++ b/apps/lbmMultiRes/lbmMultiRes.cu
@@ -643,11 +643,11 @@ int main(int argc, char** argv)
 
         const Neon::domain::mGridDescriptor descriptor(depth);
 
-        const Neon::index_3d grid_dim(144, 144, 144);
+        const Neon::index_3d grid_dim(160, 160, 160);
         float                levelSDF[depth + 1];
         levelSDF[0] = 0;
-        levelSDF[1] = -28.0 / 144.0;
-        levelSDF[2] = -56.0 / 144.0;
+        levelSDF[1] = -31.0 / 160.0;
+        levelSDF[2] = -64 / 160.0;
         levelSDF[3] = -1.0;
 
         //const Neon::index_3d grid_dim(48, 48, 48);
@@ -657,32 +657,27 @@ int main(int argc, char** argv)
         //levelSDF[2] = -16 / 24.0;
         //levelSDF[3] = -1.0;
 
-        //const Neon::index_3d grid_dim(12, 12, 4);
-
         Neon::domain::mGrid grid(
             backend, grid_dim,
             {[&](const Neon::index_3d id) -> bool {
                  return sdfCube(id, grid_dim - 1) <= levelSDF[0] &&
                         sdfCube(id, grid_dim - 1) > levelSDF[1];
-                 //return id.x < 4 && id.z == 0;
              },
              [&](const Neon::index_3d& id) -> bool {
                  return sdfCube(id, grid_dim - 1) <= levelSDF[1] &&
                         sdfCube(id, grid_dim - 1) > levelSDF[2];
-                 //return id.x >= 4 && id.x < 8 && id.z == 0;
              },
              [&](const Neon::index_3d& id) -> bool {
                  return sdfCube(id, grid_dim - 1) <= levelSDF[2] &&
                         sdfCube(id, grid_dim - 1) > levelSDF[3];
-                 //return id.z == 0;
              }},
             Neon::domain::Stencil::s19_t(false), descriptor);
 
         //LBM problem
         const int             max_iter = 200000;
-        const T               ulb = 0.02;
-        const T               Re = 100;
-        const T               clength = (grid_dim.x * std::pow(2, -(descriptor.getDepth() - 1)) - 1);
+        const T               ulb = 0.04;
+        const T               Re = 1000;
+        const T               clength = T(grid.getDimension(descriptor.getDepth() - 1).x);
         const T               visclb = ulb * clength / Re;
         const T               omega = 1.0 / (3. * visclb + 0.5);
         const Neon::double_3d ulid(ulb, 0., 0.);
@@ -694,8 +689,7 @@ int main(int argc, char** argv)
 
         auto vel = grid.newField<T>("vel", 3, 0);
         auto rho = grid.newField<T>("rho", 1, 0);
-
-
+              
         //init fields
         for (int level = 0; level < descriptor.getDepth(); ++level) {
             constexpr auto t = latticeWeight<Q>();
@@ -760,8 +754,6 @@ int main(int argc, char** argv)
         grid.getBackend().syncAll();
 
 
-        //vel.ioToVtk("test");
-
         //skeleton
         std::vector<Neon::set::Container> containers;
         nonUniformTimestepRecursive<T, Q>(grid,

From 5631a0cbd532ae26c962b23920f6bbb21a99bdf7 Mon Sep 17 00:00:00 2001
From: Ahmed Mahmoud <ahmed.mahmoud@autodesk.com>
Date: Tue, 14 Feb 2023 08:09:46 -0500
Subject: [PATCH 24/24] docs

---
 .../Neon/domain/internal/mGrid/mPartition.h   | 22 ++++++++++++++-----
 1 file changed, 17 insertions(+), 5 deletions(-)

diff --git a/libNeonDomain/include/Neon/domain/internal/mGrid/mPartition.h b/libNeonDomain/include/Neon/domain/internal/mGrid/mPartition.h
index ab9d3042..5921d36c 100644
--- a/libNeonDomain/include/Neon/domain/internal/mGrid/mPartition.h
+++ b/libNeonDomain/include/Neon/domain/internal/mGrid/mPartition.h
@@ -83,16 +83,14 @@ class mPartition : public Neon::domain::internal::bGrid::bPartition<T, C>
 
     /**
      * Check if the cell is refined i.e., has children 
-     * @param cell the cell i.e., parent at which the children are checked 
-     * @return 
+     * @param cell the cell i.e., parent at which the children are checked      
     */
     NEON_CUDA_HOST_DEVICE inline auto hasChildren(const Cell& cell) const -> bool;
 
     /**
      * Check if a neighbor to 'cell' has children 
-     * @param cell 
-     * @param cell 
-     * @return 
+     * @param cell the main cell 
+     * @param nghDir the direction relative to cell 
     */
     NEON_CUDA_HOST_DEVICE inline auto hasChildren(const Cell& cell, const Neon::int8_3d nghDir) const -> bool;
 
@@ -125,9 +123,23 @@ class mPartition : public Neon::domain::internal::bGrid::bPartition<T, C>
     */
     NEON_CUDA_HOST_DEVICE inline auto hasParent(const Cell& cell) const -> bool;
 
+    /**
+     * The uncle of a cell at level L is a cell at level L+1 and is a neighbor to the cell's parent. 
+     * This function returns the uncle of a given cell in a certain direction w.r.t the cell's parent
+     * @param cell the main cell at level L 
+     * @param direction the direction w.r.t the parent of cell      
+    */
     NEON_CUDA_HOST_DEVICE inline auto getUncle(const Cell&   cell,
                                                Neon::int8_3d direction) const -> Cell;
 
+    /**
+     * The uncle of a cell at level L is a cell at level L+1 and is a neighbor to the cell's parent. 
+     * This function returns the value of a give cell in a certain direction w.r.t the cell's parent along a certain cardinality.  
+     * @param cell the main cell at level L 
+     * @param direction the direction w.r.t the parent of cell      
+     * @param card the cardinality 
+     * @param alternativeVal alternative value in case the uncle does not exist. 
+    */
     NEON_CUDA_HOST_DEVICE inline auto uncleVal(const Cell&   cell,
                                                Neon::int8_3d direction,
                                                int           card,