loop tiling for step_curl using cache-oblivious recursive algorithm (N…

…anoComp#1655) * loop tiling for step_curl using cache-oblivious recursive algorithm * minor fixes * check that the sum of all the tiles adds up to the chunk volume * switch from subdomain to gv when calling STEP_CURL * replace manual computation of pixels in grid_volume with private function call * lower mininum tile volume * move loop over tiles outside of loop over components and use new tile_split function * add undocumented parameter for specifying base case of recursion to Simulation constructor
mawc2019 · Nov 3, 2021 · 61599ec · 61599ec
1 parent fbda172
commit 61599ec
Show file tree

Hide file tree

Showing 6 changed files with 143 additions and 75 deletions.
diff --git a/python/simulation.py b/python/simulation.py
@@ -953,6 +953,7 @@ def __init__(self,
                  progress_interval=4,
                  subpixel_tol=1e-4,
                  subpixel_maxeval=100000,
+                 loop_tile_base=0,
                  ensure_periodicity=True,
                  num_chunks=0,
                  Courant=0.5,
@@ -1206,6 +1207,7 @@ def __init__(self,
         self.eps_averaging = eps_averaging
         self.subpixel_tol = subpixel_tol
         self.subpixel_maxeval = subpixel_maxeval
+        self.loop_tile_base = loop_tile_base
         self.ensure_periodicity = ensure_periodicity
         self.extra_materials = extra_materials
         self.default_material = default_material
@@ -1921,7 +1923,8 @@ def init_sim(self):
             self.structure,
             self.m if self.is_cylindrical else 0,
             self.k_point.z if self.special_kz and self.k_point else 0,
-            not self.accurate_fields_near_cylorigin
+            not self.accurate_fields_near_cylorigin,
+            self.loop_tile_base
         )
 
         if self.force_all_components and self.dimensions != 1:

diff --git a/src/fields.cpp b/src/fields.cpp
@@ -29,7 +29,8 @@ using namespace std;
 
 namespace meep {
 
-fields::fields(structure *s, double m, double beta, bool zero_fields_near_cylorigin)
+fields::fields(structure *s, double m, double beta, bool zero_fields_near_cylorigin,
+               int loop_tile_base)
     : S(s->S), gv(s->gv), user_volume(s->user_volume), v(s->v), m(m), beta(beta),
       working_on(&times_spent) {
   shared_chunks = s->shared_chunks;
@@ -57,7 +58,8 @@ fields::fields(structure *s, double m, double beta, bool zero_fields_near_cylori
   typedef fields_chunk *fields_chunk_ptr;
   chunks = new fields_chunk_ptr[num_chunks];
   for (int i = 0; i < num_chunks; i++)
-    chunks[i] = new fields_chunk(s->chunks[i], outdir, m, beta, zero_fields_near_cylorigin, i);
+    chunks[i] = new fields_chunk(s->chunks[i], outdir, m, beta,
+                                 zero_fields_near_cylorigin, i, loop_tile_base);
   FOR_FIELD_TYPES(ft) {
     typedef realnum *realnum_ptr;
     comm_blocks[ft] = new realnum_ptr[num_chunks * num_chunks];
@@ -196,8 +198,40 @@ fields_chunk::~fields_chunk() {
   if (new_s && new_s->refcount-- <= 1) delete new_s; // delete if not shared
 }
 
+static void split_into_tiles(grid_volume gvol, std::vector<grid_volume> *result,
+                             const size_t loop_tile_base) {
+  if (gvol.nowned_min() < loop_tile_base) {
+    result->push_back(gvol);
+    return;
+  }
+
+  int best_split_point;
+  direction best_split_direction;
+  gvol.tile_split(best_split_point, best_split_direction);
+  grid_volume left_gvol = gvol.split_at_fraction(false, best_split_point, best_split_direction);
+  split_into_tiles(left_gvol, result, loop_tile_base);
+  grid_volume right_gvol = gvol.split_at_fraction(true, best_split_point, best_split_direction);
+  split_into_tiles(right_gvol, result, loop_tile_base);
+  return;
+}
+
+// First check that the tile volumes gvs do not intersect and that they add
+// up to the chunk's total grid_volume gv
+static void check_tiles(grid_volume gv, const std::vector<grid_volume> &gvs) {
+  grid_volume vol_intersection;
+  for (size_t i = 0; i < gvs.size(); i++)
+    for (size_t j = i + 1; j < gvs.size(); j++)
+      if (gvs[i].intersect_with(gvs[j], &vol_intersection))
+        meep::abort("gvs[%zu] intersects with gvs[%zu]\n", i, j);
+  size_t sum = 0;
+  for (const auto& sub_gv : gvs) { sum += sub_gv.nowned_min(); }
+  size_t v_grid_points = 1;
+  LOOP_OVER_DIRECTIONS(gv.dim, d) { v_grid_points *= gv.num_direction(d); }
+  if (sum != v_grid_points) meep::abort("v_grid_points = %zu, sum(tiles) = %zu\n", v_grid_points, sum);
+}
+
 fields_chunk::fields_chunk(structure_chunk *the_s, const char *od, double m, double beta,
-                           bool zero_fields_near_cylorigin, int chunkidx)
+                           bool zero_fields_near_cylorigin, int chunkidx, int loop_tile_base)
     : gv(the_s->gv), v(the_s->v), m(m), zero_fields_near_cylorigin(zero_fields_near_cylorigin),
       beta(beta) {
   s = the_s;
@@ -210,6 +244,12 @@ fields_chunk::fields_chunk(structure_chunk *the_s, const char *od, double m, dou
   Courant = s->Courant;
   dt = s->dt;
   dft_chunks = NULL;
+  if (loop_tile_base > 0) {
+    split_into_tiles(gv, &gvs, loop_tile_base);
+    check_tiles(gv, gvs);
+  } else {
+    gvs.push_back(gv);
+  }
   FOR_FIELD_TYPES(ft) {
     polarization_state *cur = NULL;
     pol[ft] = NULL;
@@ -265,6 +305,7 @@ fields_chunk::fields_chunk(const fields_chunk &thef, int chunkidx) : gv(thef.gv)
   Courant = thef.Courant;
   dt = thef.dt;
   dft_chunks = NULL;
+  gvs = thef.gvs;
   FOR_FIELD_TYPES(ft) {
     polarization_state *cur = NULL;
     for (polarization_state *ocur = thef.pol[ft]; ocur; ocur = ocur->next) {

diff --git a/src/meep.hpp b/src/meep.hpp
@@ -1448,6 +1448,7 @@ class fields_chunk {
 
   double a, Courant, dt; // resolution a, Courant number, and timestep dt=Courant/a
   grid_volume gv;
+  std::vector<grid_volume> gvs; // subdomains for cache-tiled execution of step_curl
   volume v;
   double m;                        // angular dependence in cyl. coords
   bool zero_fields_near_cylorigin; // fields=0 m pixels near r=0 for stability
@@ -1460,7 +1461,7 @@ class fields_chunk {
   int chunk_idx;
 
   fields_chunk(structure_chunk *, const char *outdir, double m, double beta,
-               bool zero_fields_near_cylorigin, int chunkidx);
+               bool zero_fields_near_cylorigin, int chunkidx, int loop_tile_base);
 
   fields_chunk(const fields_chunk &, int chunkidx);
   ~fields_chunk();
@@ -1693,7 +1694,8 @@ class fields {
   bool components_allocated;
 
   // fields.cpp methods:
-  fields(structure *, double m = 0, double beta = 0, bool zero_fields_near_cylorigin = true);
+  fields(structure *, double m = 0, double beta = 0, bool zero_fields_near_cylorigin = true,
+         int loop_tile_base = 0);
   fields(const fields &);
   ~fields();
   bool equal_layout(const fields &f) const;

diff --git a/src/meep/vec.hpp b/src/meep/vec.hpp
@@ -1027,6 +1027,8 @@ class grid_volume {
 
   std::complex<double> get_split_costs(direction d, int split_point,
                                        bool frag_cost) const;
+  void tile_split(int &best_split_point,
+                  direction &best_split_direction) const;
   void find_best_split(int desired_chunks, bool frag_cost,
                        int &best_split_point, direction &best_split_direction,
                        double &left_effort_fraction) const;

diff --git a/src/step_db.cpp b/src/step_db.cpp
@@ -44,83 +44,85 @@ bool fields_chunk::step_db(field_type ft) {
 
   if (ft != B_stuff && ft != D_stuff) meep::abort("bug - step_db should only be called for B or D");
 
-  DOCMP FOR_FT_COMPONENTS(ft, cc) {
-    if (f[cc][cmp]) {
-      const component c_p = plus_component[cc], c_m = minus_component[cc];
-      const direction d_deriv_p = plus_deriv_direction[cc];
-      const direction d_deriv_m = minus_deriv_direction[cc];
-      const direction d_c = component_direction(cc);
-      const bool have_p = have_plus_deriv[cc];
-      const bool have_m = have_minus_deriv[cc];
-      const direction dsig0 = cycle_direction(gv.dim, d_c, 1);
-      const direction dsig = s->sigsize[dsig0] > 1 ? dsig0 : NO_DIRECTION;
-      const direction dsigu0 = cycle_direction(gv.dim, d_c, 2);
-      const direction dsigu = s->sigsize[dsigu0] > 1 ? dsigu0 : NO_DIRECTION;
-      ptrdiff_t stride_p = have_p ? gv.stride(d_deriv_p) : 0;
-      ptrdiff_t stride_m = have_m ? gv.stride(d_deriv_m) : 0;
-      realnum *f_p = have_p ? f[c_p][cmp] : NULL;
-      realnum *f_m = have_m ? f[c_m][cmp] : NULL;
-      realnum *the_f = f[cc][cmp];
+  for (const auto& sub_gv : gvs) {
+    DOCMP FOR_FT_COMPONENTS(ft, cc) {
+      if (f[cc][cmp]) {
+        const component c_p = plus_component[cc], c_m = minus_component[cc];
+        const direction d_deriv_p = plus_deriv_direction[cc];
+        const direction d_deriv_m = minus_deriv_direction[cc];
+        const direction d_c = component_direction(cc);
+        const bool have_p = have_plus_deriv[cc];
+        const bool have_m = have_minus_deriv[cc];
+        const direction dsig0 = cycle_direction(gv.dim, d_c, 1);
+        const direction dsig = s->sigsize[dsig0] > 1 ? dsig0 : NO_DIRECTION;
+        const direction dsigu0 = cycle_direction(gv.dim, d_c, 2);
+        const direction dsigu = s->sigsize[dsigu0] > 1 ? dsigu0 : NO_DIRECTION;
+        ptrdiff_t stride_p = have_p ? gv.stride(d_deriv_p) : 0;
+        ptrdiff_t stride_m = have_m ? gv.stride(d_deriv_m) : 0;
+        realnum *f_p = have_p ? f[c_p][cmp] : NULL;
+        realnum *f_m = have_m ? f[c_m][cmp] : NULL;
+        realnum *the_f = f[cc][cmp];
 
-      if (dsig != NO_DIRECTION && s->conductivity[cc][d_c] && !f_cond[cc][cmp]) {
-        f_cond[cc][cmp] = new realnum[gv.ntot()];
-        memset(f_cond[cc][cmp], 0, sizeof(realnum) * gv.ntot());
-      }
-      if (dsigu != NO_DIRECTION && !f_u[cc][cmp]) {
-        f_u[cc][cmp] = new realnum[gv.ntot()];
-        memcpy(f_u[cc][cmp], the_f, gv.ntot() * sizeof(realnum));
-        allocated_u = true;
-      }
+        if (dsig != NO_DIRECTION && s->conductivity[cc][d_c] && !f_cond[cc][cmp]) {
+          f_cond[cc][cmp] = new realnum[gv.ntot()];
+          memset(f_cond[cc][cmp], 0, sizeof(realnum) * gv.ntot());
+        }
+        if (dsigu != NO_DIRECTION && !f_u[cc][cmp]) {
+          f_u[cc][cmp] = new realnum[gv.ntot()];
+          memcpy(f_u[cc][cmp], the_f, gv.ntot() * sizeof(realnum));
+          allocated_u = true;
+        }
 
-      if (ft == D_stuff) { // strides are opposite sign for H curl
-        stride_p = -stride_p;
-        stride_m = -stride_m;
-      }
+        if (ft == D_stuff) { // strides are opposite sign for H curl
+          stride_p = -stride_p;
+          stride_m = -stride_m;
+        }
 
-      if (gv.dim == Dcyl) switch (d_c) {
-          case R:
-            f_p = NULL; // im/r Fz term will be handled separately
-            break;
-          case P: break; // curl works normally for phi component
-          case Z: {
-            f_m = NULL; // im/r Fr term will be handled separately
+        if (gv.dim == Dcyl) switch (d_c) {
+            case R:
+              f_p = NULL; // im/r Fz term will be handled separately
+              break;
+            case P: break; // curl works normally for phi component
+            case Z: {
+              f_m = NULL; // im/r Fr term will be handled separately
 
-            /* Here we do a somewhat cool hack: the update of the z
-               component gives a 1/r d(r Fp)/dr term, rather than
-               just the derivative dg/dr expected in step_curl.
-               Rather than duplicating all of step_curl to handle
-               this bloody derivative, however, we define a new
-               array f_rderiv_int which is the integral of 1/r d(r Fp)/dr,
-               so that we can pass it to the unmodified step_curl
-               and get the correct derivative.  (More precisely,
-               the derivative and integral are replaced by differences
-               and sums, but you get the idea). */
-            if (!f_rderiv_int) f_rderiv_int = new realnum[gv.ntot()];
-            realnum ir0 = gv.origin_r() * gv.a + 0.5 * gv.iyee_shift(c_p).in_direction(R);
-            for (int iz = 0; iz <= gv.nz(); ++iz)
-              f_rderiv_int[iz] = 0;
-            int sr = gv.nz() + 1;
-            for (int ir = 1; ir <= gv.nr(); ++ir) {
-              realnum rinv = 1.0 / ((ir + ir0) - 0.5);
-              for (int iz = 0; iz <= gv.nz(); ++iz) {
-                ptrdiff_t idx = ir * sr + iz;
-                f_rderiv_int[idx] =
-                    f_rderiv_int[idx - sr] +
-                    rinv * (f_p[idx] * (ir + ir0) - f_p[idx - sr] * ((ir - 1) + ir0));
+              /* Here we do a somewhat cool hack: the update of the z
+                 component gives a 1/r d(r Fp)/dr term, rather than
+                 just the derivative dg/dr expected in step_curl.
+                 Rather than duplicating all of step_curl to handle
+                 this bloody derivative, however, we define a new
+                 array f_rderiv_int which is the integral of 1/r d(r Fp)/dr,
+                 so that we can pass it to the unmodified step_curl
+                 and get the correct derivative.  (More precisely,
+                 the derivative and integral are replaced by differences
+                 and sums, but you get the idea). */
+              if (!f_rderiv_int) f_rderiv_int = new realnum[gv.ntot()];
+              realnum ir0 = gv.origin_r() * gv.a + 0.5 * gv.iyee_shift(c_p).in_direction(R);
+              for (int iz = 0; iz <= gv.nz(); ++iz)
+                f_rderiv_int[iz] = 0;
+              int sr = gv.nz() + 1;
+              for (int ir = 1; ir <= gv.nr(); ++ir) {
+                realnum rinv = 1.0 / ((ir + ir0) - 0.5);
+                for (int iz = 0; iz <= gv.nz(); ++iz) {
+                  ptrdiff_t idx = ir * sr + iz;
+                  f_rderiv_int[idx] =
+                      f_rderiv_int[idx - sr] +
+                      rinv * (f_p[idx] * (ir + ir0) - f_p[idx - sr] * ((ir - 1) + ir0));
+                }
               }
+              f_p = f_rderiv_int;
+              break;
             }
-            f_p = f_rderiv_int;
-            break;
+            default: meep::abort("bug - non-cylindrical field component in Dcyl");
           }
-          default: meep::abort("bug - non-cylindrical field component in Dcyl");
-        }
 
-      STEP_CURL(the_f, cc, f_p, f_m, stride_p, stride_m,
-                gv, gv.little_owned_corner0(cc), gv.big_corner(),
-                Courant, dsig, s->sig[dsig],
-                s->kap[dsig], s->siginv[dsig], f_u[cc][cmp], dsigu, s->sig[dsigu], s->kap[dsigu],
-                s->siginv[dsigu], dt, s->conductivity[cc][d_c], s->condinv[cc][d_c],
-                f_cond[cc][cmp]);
+        STEP_CURL(the_f, cc, f_p, f_m, stride_p, stride_m,
+                  gv, sub_gv.little_owned_corner0(cc), sub_gv.big_corner(),
+                  Courant, dsig, s->sig[dsig],
+                  s->kap[dsig], s->siginv[dsig], f_u[cc][cmp], dsigu, s->sig[dsigu], s->kap[dsigu],
+                  s->siginv[dsigu], dt, s->conductivity[cc][d_c], s->condinv[cc][d_c],
+                  f_cond[cc][cmp]);
+      }
     }
   }
 

diff --git a/src/vec.cpp b/src/vec.cpp
@@ -978,6 +978,24 @@ static double cost_diff(int desired_chunks, std::complex<double> costs) {
   return right_cost - left_cost;
 }
 
+void grid_volume::tile_split(int &best_split_point,
+                             direction &best_split_direction) const {
+  const size_t ntot_thresh = 10;
+  if (ntot() < ntot_thresh) {
+    best_split_point = 0;
+    best_split_direction = NO_DIRECTION;
+  } else if (nx() > 1) {
+    best_split_point = nx() / 2;
+    best_split_direction = X;
+  } else if (ny() > 1) {
+    best_split_point = ny() / 2;
+    best_split_direction = Y;
+  } else {
+    best_split_point = nz() / 2;
+    best_split_direction = Z;
+  }
+}
+
 void grid_volume::find_best_split(int desired_chunks, bool fragment_cost,
                                   int &best_split_point,
                                   direction &best_split_direction,