From 3eabcef8c90049e08449bbe39f5f876956f3875e Mon Sep 17 00:00:00 2001 From: "Steven G. Johnson" Date: Thu, 26 Aug 2021 10:54:59 -0400 Subject: [PATCH] updated multi-threading branch (#1628) * multithreading branch by smartalecH * compilation fixes * rm pragma unroll statements, which shouldn't be needed * parallelize update_dft * use MPI_Init_thread with openmp * fix idx_dft counter for parallel loop * IVEC_LOOP_COUNTER macro * disable non-thread-safe PLOOP for epsilon averaging * simplifications, fixes * make sure step_generic_stride1.cpp is re-generated * nevermind, step_generic.cpp dependency is sufficient * comment formatting Co-authored-by: Alec Hammond --- .gitignore | 1 + doc/docs/Parallel_Meep.md | 2 +- src/anisotropic_averaging.cpp | 7 +++ src/dft.cpp | 7 +-- src/initialize.cpp | 2 + src/meep/vec.hpp | 107 +++++++++++++++++++++++++++++++++- src/mympi.cpp | 9 +++ src/step.cpp | 1 + src/step_generic.cpp | 86 +++++++++++++-------------- src/structure.cpp | 5 +- src/susceptibility.cpp | 6 +- tests/test_loop.cpp | 80 +++++++++++++++++++++++++ 12 files changed, 258 insertions(+), 55 deletions(-) create mode 100644 tests/test_loop.cpp diff --git a/.gitignore b/.gitignore index 1a667d31c..edefa69da 100644 --- a/.gitignore +++ b/.gitignore @@ -104,6 +104,7 @@ tests/pw-source-ll tests/ring-ll tests/stress_tensor tests/symmetry +tests/test_loop tests/three_d tests/two_dimensional tests/user-defined-material diff --git a/doc/docs/Parallel_Meep.md b/doc/docs/Parallel_Meep.md index 0228e4b38..5180828da 100644 --- a/doc/docs/Parallel_Meep.md +++ b/doc/docs/Parallel_Meep.md @@ -53,7 +53,7 @@ However, there is an alternative strategy for parallelization. If you have many The `divide_parallel_processes` feature can be useful for large supercomputers which typically restrict the total number of jobs that can be executed but do not restrict the size of each job, or for large-scale optimization where many separate simulations are coupled by an optimization algorithm. Note that when using this feature using the [Python interface](Python_User_Interface.md), only the output of the subgroup belonging to the master process of the entire simulation is shown in the standard output. (In C++, the master process from *every* subgroup prints to standard output.) -Meep also supports [thread-level parallelism](https://en.wikipedia.org/wiki/Task_parallelism) (i.e., multi-threading) on a single, shared-memory, multi-core machine for multi-frequency [near-to-far field](Python_User_Interface.md#near-to-far-field-spectra) computations. Meep does not currently use thread-level parallelism for the time stepping although this feature may be added in the future (see [Issue \#228](https://github.com/NanoComp/meep/issues/228)). +Meep also supports [thread-level parallelism](https://en.wikipedia.org/wiki/Task_parallelism) (i.e., multi-threading) on a single, shared-memory, multi-core machine for multi-frequency [near-to-far field](Python_User_Interface.md#near-to-far-field-spectra) computations. Meep does not currently use thread-level parallelism for the time stepping although this feature may be added in the future (see [Issue \#228](https://github.com/NanoComp/meep/issues/228)). 
### Optimization Studies of Parallel Simulations diff --git a/src/anisotropic_averaging.cpp b/src/anisotropic_averaging.cpp index 5b80245be..958dd06d3 100644 --- a/src/anisotropic_averaging.cpp +++ b/src/anisotropic_averaging.cpp @@ -2,6 +2,10 @@ #include "meep_internals.hpp" +/* This file contains routines to compute the "average" or "effective" + dielectric constant for a pixel, using an anisotropic averaging + procedure described in our papers (similar to MPB). */ + using namespace std; namespace meep { @@ -226,6 +230,9 @@ void structure_chunk::set_chi1inv(component c, material_function &medium, double trivial_val[3] = {0, 0, 0}; trivial_val[idiag] = 1.0; ivec shift1(unit_ivec(gv.dim, component_direction(c)) * (ft == E_stuff ? 1 : -1)); + // TODO: make this loop thread-safe and change to PLOOP_OVER_VOL + // Note that we *cannot* make it thread-safe if `medium` is not thread-safe, + // e.g. if it calls back to Python. LOOP_OVER_VOL(gv, c, i) { double chi1invrow[3], chi1invrow_offdiag[3]; IVEC_LOOP_ILOC(gv, here); diff --git a/src/dft.cpp b/src/dft.cpp index cf5177caa..d27447e7b 100644 --- a/src/dft.cpp +++ b/src/dft.cpp @@ -230,8 +230,8 @@ void dft_chunk::update_dft(double time) { int numcmp = fc->f[c][1] ? 2 : 1; - size_t idx_dft = 0; - LOOP_OVER_IVECS(fc->gv, is, ie, idx) { + PLOOP_OVER_IVECS(fc->gv, is, ie, idx) { + size_t idx_dft = IVEC_LOOP_COUNTER; double w; if (include_dV_and_interp_weights) { w = IVEC_LOOP_WEIGHT(s0, s1, e0, e1, dV0 + dV1 * loop_i2); @@ -261,7 +261,6 @@ void dft_chunk::update_dft(double time) { for (int i = 0; i < Nomega; ++i) dft[Nomega * idx_dft + i] += std::complex<realnum>{fr * dft_phase[i].real(), fr * dft_phase[i].imag()}; } - idx_dft++; } } @@ -1269,4 +1268,4 @@ void fields::get_mode_mode_overlap(void *mode1_data, void *mode2_data, dft_flux get_overlap(mode1_data, mode2_data, flux, 0, overlaps); } -} // namespace meep +} // namespace meep \ No newline at end of file diff --git a/src/initialize.cpp b/src/initialize.cpp index eae6bd596..1050bb051 100644 --- a/src/initialize.cpp +++ b/src/initialize.cpp @@ -149,6 +149,8 @@ void fields::initialize_field(component c, complex<double> func(const vec &)) { void fields_chunk::initialize_field(component c, complex<double> func(const vec &)) { if (f[c][0]) { + // note: this loop is not thread-safe unless func is, which + // isn't true if func e.g. calls back to Python LOOP_OVER_VOL(gv, c, i) { IVEC_LOOP_LOC(gv, here); complex<double> val = func(here); diff --git a/src/meep/vec.hpp b/src/meep/vec.hpp index 553d40faa..1c11ce2c3 100644 --- a/src/meep/vec.hpp +++ b/src/meep/vec.hpp @@ -148,7 +148,7 @@ component first_field_component(field_type ft); for (meep::direction d = dim == meep::Dcyl ? meep::Z : meep::X; \ d < (dim == meep::Dcyl ? meep::NO_DIRECTION : meep::R); d = meep::direction(d + 1)) -// loop over indices idx from is to ie (inclusive) in gv + #define LOOP_OVER_IVECS(gv, is, ie, idx) \ for (ptrdiff_t loop_is1 = (is).yucky_val(0), loop_is2 = (is).yucky_val(1), \ loop_is3 = (is).yucky_val(2), loop_n1 = ((ie).yucky_val(0) - loop_is1) / 2 + 1, \ loop_n2 = ((ie).yucky_val(1) - loop_is2) / 2 + 1, \ loop_n3 = ((ie).yucky_val(2) - loop_is3) / 2 + 1, \ @@ -186,6 +186,64 @@ component first_field_component(field_type ft); loop_ibound++) \ LOOP_OVER_IVECS(gv, loop_notowned_is, loop_notowned_ie, idx) +/* The following work identically to the LOOP_* macros above, + but employ shared-memory parallelism using OpenMP. Mainly + used in step_generic and a few other time-critical loops.
*/ + +/* For the parallel implementation, we introduce 2 dummy loops, one at the beginning + and one at the end, in order to "trick" OpenMP into letting us define our local variables + without having to change any other code in the main codebase. We can then do + a collapse over all three main loops. */ + +#define CHUNK_OPENMP _Pragma("omp parallel for") + +// the most generic use case, where the user +// can specify a custom OpenMP clause +#define PLOOP_OVER_IVECS_C(gv, is, ie, idx, clause) \ +for(ptrdiff_t loop_is1 = (is).yucky_val(0), loop_is2 = (is).yucky_val(1), \ + loop_is3 = (is).yucky_val(2), loop_n1 = ((ie).yucky_val(0) - loop_is1) / 2 + 1, \ + loop_n2 = ((ie).yucky_val(1) - loop_is2) / 2 + 1, \ + loop_n3 = ((ie).yucky_val(2) - loop_is3) / 2 + 1, \ + loop_d1 = (gv).yucky_direction(0), loop_d2 = (gv).yucky_direction(1), \ + loop_d3 = (gv).yucky_direction(2), \ + loop_s1 = (gv).stride((meep::direction)loop_d1), \ + loop_s2 = (gv).stride((meep::direction)loop_d2), \ + loop_s3 = (gv).stride((meep::direction)loop_d3), \ + idx0 = (is - (gv).little_corner()).yucky_val(0) / 2 * loop_s1 + \ + (is - (gv).little_corner()).yucky_val(1) / 2 * loop_s2 + \ + (is - (gv).little_corner()).yucky_val(2) / 2 * loop_s3, \ + dummy_first=0;dummy_first<1;dummy_first++) \ +_Pragma(clause) \ + for (ptrdiff_t loop_i1 = 0; loop_i1 < loop_n1; loop_i1++) \ + for (ptrdiff_t loop_i2 = 0; loop_i2 < loop_n2; loop_i2++) \ + for (ptrdiff_t loop_i3 = 0; loop_i3 < loop_n3; loop_i3++) \ + for (ptrdiff_t idx = idx0 + loop_i1*loop_s1 + loop_i2*loop_s2 + \ + loop_i3*loop_s3, dummy_last=0;dummy_last<1;dummy_last++) + +// For the main timestepping events, we know +// we want to do a simple collapse +#define PLOOP_OVER_IVECS(gv, is, ie, idx) \ + /*master_printf("Entered ploop\n");*/ \ + PLOOP_OVER_IVECS_C(gv, is, ie, idx, "omp parallel for collapse(3)") + +#define PLOOP_OVER_VOL(gv, c, idx) \ + PLOOP_OVER_IVECS(gv, (gv).little_corner() + (gv).iyee_shift(c), \ + (gv).big_corner() + (gv).iyee_shift(c), idx) + +#define PLOOP_OVER_VOL_OWNED(gv, c, idx) \ + PLOOP_OVER_IVECS(gv, (gv).little_owned_corner(c), (gv).big_corner(), idx) + +#define PLOOP_OVER_VOL_OWNED0(gv, c, idx) \ + PLOOP_OVER_IVECS(gv, (gv).little_owned_corner0(c), (gv).big_corner(), idx) + +#define PLOOP_OVER_VOL_NOTOWNED(gv, c, idx) \ + for (ivec loop_notowned_is((gv).dim, 0), loop_notowned_ie((gv).dim, 0); \ + loop_notowned_is == zero_ivec((gv).dim);) \ + for (int loop_ibound = 0; \ + (gv).get_boundary_icorners(c, loop_ibound, &loop_notowned_is, &loop_notowned_ie); \ + loop_ibound++) \ + PLOOP_OVER_IVECS(gv, loop_notowned_is, loop_notowned_ie, idx) + #define LOOPS_ARE_STRIDE1(gv) ((gv).stride((gv).yucky_direction(2)) == 1) // The following work identically to the LOOP_* macros above, @@ -199,8 +257,10 @@ component first_field_component(field_type ft); // should only use these macros where that is true! (Basically, // all of this is here to support performance hacks of step_generic.)
-#if !defined(__INTEL_COMPILER) && !defined(__clang__) && (defined(__GNUC__) || defined(__GNUG__)) +#if !defined(__INTEL_COMPILER) && !defined(__clang__) && !defined(_OPENMP) && (defined(__GNUC__) || defined(__GNUG__)) #define IVDEP _Pragma("GCC ivdep") +#elif defined(_OPENMP) +#define IVDEP _Pragma("omp simd") #elif defined(__INTEL_COMPILER) #define IVDEP _Pragma("ivdep") #else @@ -244,6 +304,46 @@ component first_field_component(field_type ft); loop_ibound++) \ S1LOOP_OVER_IVECS(gv, loop_notowned_is, loop_notowned_ie, idx) +/* The following is the stride-optimized version of the parallel loops from above. + We can use simd vectorization in addition to the usual par for optimization */ +// loop over indices idx from is to ie (inclusive) in gv +#define PS1LOOP_OVER_IVECS(gv, is, ie, idx) \ +for(ptrdiff_t loop_is1 = (is).yucky_val(0), loop_is2 = (is).yucky_val(1), \ + loop_is3 = (is).yucky_val(2), loop_n1 = ((ie).yucky_val(0) - loop_is1) / 2 + 1, \ + loop_n2 = ((ie).yucky_val(1) - loop_is2) / 2 + 1, \ + loop_n3 = ((ie).yucky_val(2) - loop_is3) / 2 + 1, \ + loop_d1 = (gv).yucky_direction(0), loop_d2 = (gv).yucky_direction(1), \ + loop_s1 = (gv).stride((meep::direction)loop_d1), \ + loop_s2 = (gv).stride((meep::direction)loop_d2), loop_s3 = 1, \ + idx0 = (is - (gv).little_corner()).yucky_val(0) / 2 * loop_s1 + \ + (is - (gv).little_corner()).yucky_val(1) / 2 * loop_s2 + \ + (is - (gv).little_corner()).yucky_val(2) / 2 * loop_s3, \ + dummy_first=0;dummy_first<1;dummy_first++) \ +_Pragma("omp parallel for collapse(2)") \ + for (ptrdiff_t loop_i1 = 0; loop_i1 < loop_n1; loop_i1++) \ + for (ptrdiff_t loop_i2 = 0; loop_i2 < loop_n2; loop_i2++) \ + _Pragma("omp simd") \ + for (ptrdiff_t loop_i3 = 0; loop_i3 < loop_n3; loop_i3++) \ + for (ptrdiff_t idx = idx0 + loop_i1 * loop_s1 + loop_i2 * loop_s2 + loop_i3, dummy_last=0;dummy_last<1;dummy_last++) + +#define PS1LOOP_OVER_VOL(gv, c, idx) \ + PS1LOOP_OVER_IVECS(gv, (gv).little_corner() + (gv).iyee_shift(c), \ + (gv).big_corner() + (gv).iyee_shift(c), idx) + +#define PS1LOOP_OVER_VOL_OWNED(gv, c, idx) \ + PS1LOOP_OVER_IVECS(gv, (gv).little_owned_corner(c), (gv).big_corner(), idx) + +#define PS1LOOP_OVER_VOL_OWNED0(gv, c, idx) \ + PS1LOOP_OVER_IVECS(gv, (gv).little_owned_corner0(c), (gv).big_corner(), idx) + +#define PS1LOOP_OVER_VOL_NOTOWNED(gv, c, idx) \ + for (ivec loop_notowned_is((gv).dim, 0), loop_notowned_ie((gv).dim, 0); \ + loop_notowned_is == meep::zero_ivec((gv).dim);) \ + for (int loop_ibound = 0; \ + (gv).get_boundary_icorners(c, loop_ibound, &loop_notowned_is, &loop_notowned_ie); \ + loop_ibound++) \ + PS1LOOP_OVER_IVECS(gv, loop_notowned_is, loop_notowned_ie, idx) + #define IVEC_LOOP_AT_BOUNDARY \ ((loop_s1 != 0 && (loop_i1 == 0 || loop_i1 == loop_n1 - 1)) || \ (loop_s2 != 0 && (loop_i2 == 0 || loop_i2 == loop_n2 - 1)) || \ @@ -276,6 +376,9 @@ component first_field_component(field_type ft); (IVEC_LOOP_WEIGHT1(s0, s1, e0, e1, 3) * \ (IVEC_LOOP_WEIGHT1(s0, s1, e0, e1, 2) * ((dV)*IVEC_LOOP_WEIGHT1(s0, s1, e0, e1, 1)))) +// equivalent to incrementing a counter, starting from 0, in the loop body +#define IVEC_LOOP_COUNTER ((loop_i1 * loop_n2 + loop_i2) * loop_n3 + loop_i3) + inline signed_direction flip(signed_direction d) { signed_direction d2 = d; d2.flipped = !d.flipped; diff --git a/src/mympi.cpp b/src/mympi.cpp index 647bb395c..6570a34dd 100644 --- a/src/mympi.cpp +++ b/src/mympi.cpp @@ -184,7 +184,14 @@ void set_zero_subnormals(bool iszero) initialize::initialize(int &argc, char **&argv) { #ifdef HAVE_MPI +#ifdef _OPENMP + int provided; 
+ MPI_Init_thread(&argc, &argv, MPI_THREAD_FUNNELED, &provided); + if (provided < MPI_THREAD_FUNNELED && omp_get_num_threads() > 1) + abort("MPI does not support multi-threaded execution"); +#else MPI_Init(&argc, &argv); +#endif int major, minor; MPI_Get_version(&major, &minor); if (verbosity > 0) @@ -214,6 +221,8 @@ initialize::~initialize() { double wall_time(void) { #ifdef HAVE_MPI return MPI_Wtime(); +#elif defined(_OPENMP) + return omp_get_wtime(); #elif HAVE_GETTIMEOFDAY struct timeval tv; gettimeofday(&tv, 0); diff --git a/src/step.cpp b/src/step.cpp index 88f19e9b2..efeb6b625 100644 --- a/src/step.cpp +++ b/src/step.cpp @@ -141,6 +141,7 @@ void fields::step() { void fields::phase_material() { bool changed = false; if (is_phasing()) { + CHUNK_OPENMP for (int i = 0; i < num_chunks; i++) if (chunks[i]->is_mine()) { chunks[i]->phase_material(phasein_time); diff --git a/src/step_generic.cpp b/src/step_generic.cpp index e0e7bab48..ab0cd33dd 100644 --- a/src/step_generic.cpp +++ b/src/step_generic.cpp @@ -4,7 +4,7 @@ #define RPR realnum *restrict -/* These macros get into the guts of the LOOP_OVER_VOL loops to +/* These macros get into the guts of the PLOOP_OVER_VOL loops to efficiently construct the index k into a PML sigma array. Basically, k needs to increment by 2 for each increment of one of LOOP's for-loops, starting at the appropriate corner of the grid_volume, @@ -87,25 +87,25 @@ void step_curl(RPR f, component c, const RPR g1, const RPR g2, if (cnd) { realnum dt2 = dt * 0.5; if (g2) { - LOOP_OVER_IVECS(gv, is, ie, i) { + PLOOP_OVER_IVECS(gv, is, ie, i) { f[i] = ((1 - dt2 * cnd[i]) * f[i] - dtdx * (g1[i + s1] - g1[i] + g2[i] - g2[i + s2])) * cndinv[i]; } } else { - LOOP_OVER_IVECS(gv, is, ie, i) { + PLOOP_OVER_IVECS(gv, is, ie, i) { f[i] = ((1 - dt2 * cnd[i]) * f[i] - dtdx * (g1[i + s1] - g1[i])) * cndinv[i]; } } } else { // no conductivity if (g2) { - LOOP_OVER_IVECS(gv, is, ie, i) { + PLOOP_OVER_IVECS(gv, is, ie, i) { f[i] -= dtdx * (g1[i + s1] - g1[i] + g2[i] - g2[i + s2]); } } else { - LOOP_OVER_IVECS(gv, is, ie, i) { f[i] -= dtdx * (g1[i + s1] - g1[i]); } + PLOOP_OVER_IVECS(gv, is, ie, i) { f[i] -= dtdx * (g1[i + s1] - g1[i]); } } } } @@ -114,7 +114,7 @@ void step_curl(RPR f, component c, const RPR g1, const RPR g2, if (cnd) { realnum dt2 = dt * 0.5; if (g2) { - LOOP_OVER_IVECS(gv, is, ie, i) { + PLOOP_OVER_IVECS(gv, is, ie, i) { DEF_ku; realnum fprev = fu[i]; fu[i] = @@ -124,7 +124,7 @@ void step_curl(RPR f, component c, const RPR g1, const RPR g2, } } else { - LOOP_OVER_IVECS(gv, is, ie, i) { + PLOOP_OVER_IVECS(gv, is, ie, i) { DEF_ku; realnum fprev = fu[i]; fu[i] = ((1 - dt2 * cnd[i]) * fprev - dtdx * (g1[i + s1] - g1[i])) * cndinv[i]; @@ -134,7 +134,7 @@ void step_curl(RPR f, component c, const RPR g1, const RPR g2, } else { // no conductivity if (g2) { - LOOP_OVER_IVECS(gv, is, ie, i) { + PLOOP_OVER_IVECS(gv, is, ie, i) { DEF_ku; realnum fprev = fu[i]; fu[i] -= dtdx * (g1[i + s1] - g1[i] + g2[i] - g2[i + s2]); @@ -142,7 +142,7 @@ void step_curl(RPR f, component c, const RPR g1, const RPR g2, } } else { - LOOP_OVER_IVECS(gv, is, ie, i) { + PLOOP_OVER_IVECS(gv, is, ie, i) { DEF_ku; realnum fprev = fu[i]; fu[i] -= dtdx * (g1[i + s1] - g1[i]); @@ -158,7 +158,7 @@ void step_curl(RPR f, component c, const RPR g1, const RPR g2, if (cnd) { realnum dt2 = dt * 0.5; if (g2) { - LOOP_OVER_IVECS(gv, is, ie, i) { + PLOOP_OVER_IVECS(gv, is, ie, i) { DEF_k; realnum fcnd_prev = fcnd[i]; fcnd[i] = @@ -168,7 +168,7 @@ void step_curl(RPR f, component c, const RPR g1, const RPR g2, } } else 
{ - LOOP_OVER_IVECS(gv, is, ie, i) { + PLOOP_OVER_IVECS(gv, is, ie, i) { DEF_k; realnum fcnd_prev = fcnd[i]; fcnd[i] = ((1 - dt2 * cnd[i]) * fcnd[i] - dtdx * (g1[i + s1] - g1[i])) * cndinv[i]; @@ -178,14 +178,14 @@ void step_curl(RPR f, component c, const RPR g1, const RPR g2, } else { // no conductivity (other than PML conductivity) if (g2) { - LOOP_OVER_IVECS(gv, is, ie, i) { + PLOOP_OVER_IVECS(gv, is, ie, i) { DEF_k; f[i] = ((kap[k] - sig[k]) * f[i] - dtdx * (g1[i + s1] - g1[i] + g2[i] - g2[i + s2])) * siginv[k]; } } else { - LOOP_OVER_IVECS(gv, is, ie, i) { + PLOOP_OVER_IVECS(gv, is, ie, i) { DEF_k; f[i] = ((kap[k] - sig[k]) * f[i] - dtdx * (g1[i + s1] - g1[i])) * siginv[k]; } @@ -198,7 +198,7 @@ void step_curl(RPR f, component c, const RPR g1, const RPR g2, realnum dt2 = dt * 0.5; if (g2) { //////////////////// MOST GENERAL CASE ////////////////////// - LOOP_OVER_IVECS(gv, is, ie, i) { + PLOOP_OVER_IVECS(gv, is, ie, i) { DEF_k; DEF_ku; realnum fprev = fu[i]; @@ -212,7 +212,7 @@ void step_curl(RPR f, component c, const RPR g1, const RPR g2, ///////////////////////////////////////////////////////////// } else { - LOOP_OVER_IVECS(gv, is, ie, i) { + PLOOP_OVER_IVECS(gv, is, ie, i) { DEF_k; DEF_ku; realnum fprev = fu[i]; @@ -225,7 +225,7 @@ void step_curl(RPR f, component c, const RPR g1, const RPR g2, } else { // no conductivity (other than PML conductivity) if (g2) { - LOOP_OVER_IVECS(gv, is, ie, i) { + PLOOP_OVER_IVECS(gv, is, ie, i) { DEF_k; DEF_ku; realnum fprev = fu[i]; @@ -235,7 +235,7 @@ void step_curl(RPR f, component c, const RPR g1, const RPR g2, } } else { - LOOP_OVER_IVECS(gv, is, ie, i) { + PLOOP_OVER_IVECS(gv, is, ie, i) { DEF_k; DEF_ku; realnum fprev = fu[i]; @@ -262,7 +262,7 @@ void step_beta(RPR f, component c, const RPR g, const grid_volume &gv, const ive KSTRIDE_DEF(dsigu, ku, is, gv); if (cndinv) { // conductivity + PML //////////////////// MOST GENERAL CASE ////////////////////// - LOOP_OVER_IVECS(gv, is, ie, i) { + PLOOP_OVER_IVECS(gv, is, ie, i) { DEF_k; DEF_ku; realnum df; @@ -274,7 +274,7 @@ void step_beta(RPR f, component c, const RPR g, const grid_volume &gv, const ive ///////////////////////////////////////////////////////////// } else { // PML only - LOOP_OVER_IVECS(gv, is, ie, i) { + PLOOP_OVER_IVECS(gv, is, ie, i) { DEF_k; DEF_ku; realnum df; @@ -285,7 +285,7 @@ void step_beta(RPR f, component c, const RPR g, const grid_volume &gv, const ive } else { // PML in f, no fu if (cndinv) { // conductivity + PML - LOOP_OVER_IVECS(gv, is, ie, i) { + PLOOP_OVER_IVECS(gv, is, ie, i) { DEF_k; realnum dfcnd = betadt * g[i] * cndinv[i]; fcnd[i] += dfcnd; @@ -293,7 +293,7 @@ void step_beta(RPR f, component c, const RPR g, const grid_volume &gv, const ive } } else { // PML only - LOOP_OVER_IVECS(gv, is, ie, i) { + PLOOP_OVER_IVECS(gv, is, ie, i) { DEF_k; f[i] += betadt * g[i] * siginv[k]; } @@ -304,7 +304,7 @@ void step_beta(RPR f, component c, const RPR g, const grid_volume &gv, const ive if (dsigu != NO_DIRECTION) { // fu, no PML in f KSTRIDE_DEF(dsigu, ku, is, gv); if (cndinv) { // conductivity, no PML - LOOP_OVER_IVECS(gv, is, ie, i) { + PLOOP_OVER_IVECS(gv, is, ie, i) { DEF_ku; realnum df; fu[i] += (df = betadt * g[i] * cndinv[i]); @@ -312,7 +312,7 @@ void step_beta(RPR f, component c, const RPR g, const grid_volume &gv, const ive } } else { // no conductivity or PML - LOOP_OVER_IVECS(gv, is, ie, i) { + PLOOP_OVER_IVECS(gv, is, ie, i) { DEF_ku; realnum df; fu[i] += (df = betadt * g[i]); @@ -322,10 +322,10 @@ void step_beta(RPR f, component c, const RPR g, const 
grid_volume &gv, const ive } else { // no PML, no fu if (cndinv) { // conductivity, no PML - LOOP_OVER_IVECS(gv, is, ie, i) { f[i] += betadt * g[i] * cndinv[i]; } + PLOOP_OVER_IVECS(gv, is, ie, i) { f[i] += betadt * g[i] * cndinv[i]; } } else { // no conductivity or PML - LOOP_OVER_IVECS(gv, is, ie, i) { f[i] += betadt * g[i]; } + PLOOP_OVER_IVECS(gv, is, ie, i) { f[i] += betadt * g[i]; } } } } @@ -386,7 +386,7 @@ void step_update_EDHB(RPR f, component fc, const grid_volume &gv, const ivec is, if (u1 && u2) { // 3x3 off-diagonal u if (chi3) { //////////////////// MOST GENERAL CASE ////////////////////// - LOOP_OVER_IVECS(gv, is, ie, i) { + PLOOP_OVER_IVECS(gv, is, ie, i) { realnum g1s = g1[i] + g1[i + s] + g1[i - s1] + g1[i + (s - s1)]; realnum g2s = g2[i] + g2[i + s] + g2[i - s2] + g2[i + (s - s2)]; realnum gs = g[i]; @@ -401,7 +401,7 @@ void step_update_EDHB(RPR f, component fc, const grid_volume &gv, const ivec is, ///////////////////////////////////////////////////////////// } else { - LOOP_OVER_IVECS(gv, is, ie, i) { + PLOOP_OVER_IVECS(gv, is, ie, i) { realnum gs = g[i]; realnum us = u[i]; DEF_kw; @@ -413,7 +413,7 @@ void step_update_EDHB(RPR f, component fc, const grid_volume &gv, const ivec is, } else if (u1) { // 2x2 off-diagonal u if (chi3) { - LOOP_OVER_IVECS(gv, is, ie, i) { + PLOOP_OVER_IVECS(gv, is, ie, i) { realnum g1s = g1[i] + g1[i + s] + g1[i - s1] + g1[i + (s - s1)]; realnum gs = g[i]; realnum us = u[i]; @@ -425,7 +425,7 @@ void step_update_EDHB(RPR f, component fc, const grid_volume &gv, const ivec is, } } else { - LOOP_OVER_IVECS(gv, is, ie, i) { + PLOOP_OVER_IVECS(gv, is, ie, i) { realnum gs = g[i]; realnum us = u[i]; DEF_kw; @@ -441,7 +441,7 @@ void step_update_EDHB(RPR f, component fc, const grid_volume &gv, const ivec is, else { // diagonal u if (chi3) { if (g1 && g2) { - LOOP_OVER_IVECS(gv, is, ie, i) { + PLOOP_OVER_IVECS(gv, is, ie, i) { realnum g1s = g1[i] + g1[i + s] + g1[i - s1] + g1[i + (s - s1)]; realnum g2s = g2[i] + g2[i + s] + g2[i - s2] + g2[i + (s - s2)]; realnum gs = g[i]; @@ -454,7 +454,7 @@ void step_update_EDHB(RPR f, component fc, const grid_volume &gv, const ivec is, } } else if (g1) { - LOOP_OVER_IVECS(gv, is, ie, i) { + PLOOP_OVER_IVECS(gv, is, ie, i) { realnum g1s = g1[i] + g1[i + s] + g1[i - s1] + g1[i + (s - s1)]; realnum gs = g[i]; realnum us = u[i]; @@ -469,7 +469,7 @@ void step_update_EDHB(RPR f, component fc, const grid_volume &gv, const ivec is, meep::abort("bug - didn't swap off-diagonal terms!?"); } else { - LOOP_OVER_IVECS(gv, is, ie, i) { + PLOOP_OVER_IVECS(gv, is, ie, i) { realnum gs = g[i]; realnum us = u[i]; DEF_kw; @@ -480,7 +480,7 @@ void step_update_EDHB(RPR f, component fc, const grid_volume &gv, const ivec is, } } else if (u) { - LOOP_OVER_IVECS(gv, is, ie, i) { + PLOOP_OVER_IVECS(gv, is, ie, i) { realnum gs = g[i]; realnum us = u[i]; DEF_kw; @@ -490,7 +490,7 @@ void step_update_EDHB(RPR f, component fc, const grid_volume &gv, const ivec is, } } else { - LOOP_OVER_IVECS(gv, is, ie, i) { + PLOOP_OVER_IVECS(gv, is, ie, i) { DEF_kw; realnum fwprev = fw[i], kapwkw = kapw[kw], sigwkw = sigw[kw]; fw[i] = g[i]; @@ -502,7 +502,7 @@ void step_update_EDHB(RPR f, component fc, const grid_volume &gv, const ivec is, else { /////////////// no PML (no fw) /////////////////// if (u1 && u2) { // 3x3 off-diagonal u if (chi3) { - LOOP_OVER_IVECS(gv, is, ie, i) { + PLOOP_OVER_IVECS(gv, is, ie, i) { realnum g1s = g1[i] + g1[i + s] + g1[i - s1] + g1[i + (s - s1)]; realnum g2s = g2[i] + g2[i + s] + g2[i - s2] + g2[i + (s - s2)]; realnum gs = g[i]; 
@@ -513,7 +513,7 @@ void step_update_EDHB(RPR f, component fc, const grid_volume &gv, const ivec is, } } else { - LOOP_OVER_IVECS(gv, is, ie, i) { + PLOOP_OVER_IVECS(gv, is, ie, i) { realnum gs = g[i]; realnum us = u[i]; f[i] = (gs * us + OFFDIAG(u1, g1, s1) + OFFDIAG(u2, g2, s2)); @@ -522,7 +522,7 @@ void step_update_EDHB(RPR f, component fc, const grid_volume &gv, const ivec is, } else if (u1) { // 2x2 off-diagonal u if (chi3) { - LOOP_OVER_IVECS(gv, is, ie, i) { + PLOOP_OVER_IVECS(gv, is, ie, i) { realnum g1s = g1[i] + g1[i + s] + g1[i - s1] + g1[i + (s - s1)]; realnum gs = g[i]; realnum us = u[i]; @@ -531,7 +531,7 @@ void step_update_EDHB(RPR f, component fc, const grid_volume &gv, const ivec is, } } else { - LOOP_OVER_IVECS(gv, is, ie, i) { + PLOOP_OVER_IVECS(gv, is, ie, i) { realnum gs = g[i]; realnum us = u[i]; f[i] = (gs * us + OFFDIAG(u1, g1, s1)); @@ -544,7 +544,7 @@ void step_update_EDHB(RPR f, component fc, const grid_volume &gv, const ivec is, else { // diagonal u if (chi3) { if (g1 && g2) { - LOOP_OVER_IVECS(gv, is, ie, i) { + PLOOP_OVER_IVECS(gv, is, ie, i) { realnum g1s = g1[i] + g1[i + s] + g1[i - s1] + g1[i + (s - s1)]; realnum g2s = g2[i] + g2[i + s] + g2[i - s2] + g2[i + (s - s2)]; realnum gs = g[i]; @@ -554,7 +554,7 @@ void step_update_EDHB(RPR f, component fc, const grid_volume &gv, const ivec is, } } else if (g1) { - LOOP_OVER_IVECS(gv, is, ie, i) { + PLOOP_OVER_IVECS(gv, is, ie, i) { realnum g1s = g1[i] + g1[i + s] + g1[i - s1] + g1[i + (s - s1)]; realnum gs = g[i]; realnum us = u[i]; @@ -566,7 +566,7 @@ void step_update_EDHB(RPR f, component fc, const grid_volume &gv, const ivec is, meep::abort("bug - didn't swap off-diagonal terms!?"); } else { - LOOP_OVER_IVECS(gv, is, ie, i) { + PLOOP_OVER_IVECS(gv, is, ie, i) { realnum gs = g[i]; realnum us = u[i]; f[i] = (gs * us) * calc_nonlinear_u(gs * gs, gs, us, chi2[i], chi3[i]); @@ -574,14 +574,14 @@ void step_update_EDHB(RPR f, component fc, const grid_volume &gv, const ivec is, } } else if (u) { - LOOP_OVER_IVECS(gv, is, ie, i) { + PLOOP_OVER_IVECS(gv, is, ie, i) { realnum gs = g[i]; realnum us = u[i]; f[i] = (gs * us); } } else - LOOP_OVER_IVECS(gv, is, ie, i) { f[i] = g[i]; } + PLOOP_OVER_IVECS(gv, is, ie, i) { f[i] = g[i]; } } } } diff --git a/src/structure.cpp b/src/structure.cpp index 576b2f539..ceb11f221 100644 --- a/src/structure.cpp +++ b/src/structure.cpp @@ -801,12 +801,13 @@ void structure_chunk::set_chi3(component c, material_function &epsilon) { if (!chi1inv[c][component_direction(c)]) { // require chi1 if we have chi3 chi1inv[c][component_direction(c)] = new realnum[gv.ntot()]; - for (size_t i = 0; i < gv.ntot(); ++i) + for (size_t i = 0; i < gv.ntot(); i++) chi1inv[c][component_direction(c)][i] = 1.0; } if (!chi3[c]) chi3[c] = new realnum[gv.ntot()]; bool trivial = true; + // note: not thread-safe if epsilon is not thread-safe, e.g. 
if it calls back to Python LOOP_OVER_VOL(gv, c, i) { IVEC_LOOP_LOC(gv, here); chi3[c][i] = epsilon.chi3(c, here); @@ -837,7 +838,7 @@ void structure_chunk::set_chi2(component c, material_function &epsilon) { if (!chi1inv[c][component_direction(c)]) { // require chi1 if we have chi2 chi1inv[c][component_direction(c)] = new realnum[gv.ntot()]; - for (size_t i = 0; i < gv.ntot(); ++i) + for (size_t i = 0; i < gv.ntot(); i++) chi1inv[c][component_direction(c)][i] = 1.0; } diff --git a/src/susceptibility.cpp b/src/susceptibility.cpp index ae7f9dc8c..7e9fa30fb 100644 --- a/src/susceptibility.cpp +++ b/src/susceptibility.cpp @@ -225,7 +225,7 @@ void lorentzian_susceptibility::update_P(realnum *W[NUM_FIELD_COMPONENTS][2], SWAP(const realnum *, s1, s2); } if (s1 && s2) { // 3x3 anisotropic - LOOP_OVER_VOL_OWNED(gv, c, i) { + PLOOP_OVER_VOL_OWNED(gv, c, i) { // s[i] != 0 check is a bit of a hack to work around // some instabilities that occur near the boundaries // of materials; see PR #666 @@ -239,7 +239,7 @@ void lorentzian_susceptibility::update_P(realnum *W[NUM_FIELD_COMPONENTS][2], } } else if (s1) { // 2x2 anisotropic - LOOP_OVER_VOL_OWNED(gv, c, i) { + PLOOP_OVER_VOL_OWNED(gv, c, i) { if (s[i] != 0) { // see above realnum pcur = p[i]; p[i] = gamma1inv * (pcur * (2 - omega0dtsqr_denom) - gamma1 * pp[i] + @@ -249,7 +249,7 @@ void lorentzian_susceptibility::update_P(realnum *W[NUM_FIELD_COMPONENTS][2], } } else { // isotropic - LOOP_OVER_VOL_OWNED(gv, c, i) { + PLOOP_OVER_VOL_OWNED(gv, c, i) { realnum pcur = p[i]; p[i] = gamma1inv * (pcur * (2 - omega0dtsqr_denom) - gamma1 * pp[i] + omega0dtsqr * (s[i] * w[i])); diff --git a/tests/test_loop.cpp b/tests/test_loop.cpp new file mode 100644 index 000000000..e6bba8257 --- /dev/null +++ b/tests/test_loop.cpp @@ -0,0 +1,80 @@ +/* +Steps: + [x] Set up initial benchmark framework + [ ] Set up initial iterator class + [ ] Fill in iterator class + [ ] Debug iterator class + [ ] Parallelize iterator class + [ ] Debug parallel version + [ ] Integrate into main repo + [ ] Finalize test + [ ] Try different vectorizations and do basic timing + [ ] Set up 3 actual test scripts + [ ] Benchmark added benefit with new scripts +*/ + +#include +#include +#include +#include + +#include +using namespace meep; +using std::complex; + +#define PLOOP_OVER_IVECS(gv, is, ie, idx) \ +for(ptrdiff_t loop_is1 = (is).yucky_val(0), loop_is2 = (is).yucky_val(1), \ + loop_is3 = (is).yucky_val(2), loop_n1 = ((ie).yucky_val(0) - loop_is1) / 2 + 1, \ + loop_n2 = ((ie).yucky_val(1) - loop_is2) / 2 + 1, \ + loop_n3 = ((ie).yucky_val(2) - loop_is3) / 2 + 1, \ + loop_d1 = (gv).yucky_direction(0), loop_d2 = (gv).yucky_direction(1), \ + loop_d3 = (gv).yucky_direction(2), \ + loop_s1 = (gv).stride((meep::direction)loop_d1), \ + loop_s2 = (gv).stride((meep::direction)loop_d2), \ + loop_s3 = (gv).stride((meep::direction)loop_d3), \ + idx0 = (is - (gv).little_corner()).yucky_val(0) / 2 * loop_s1 + \ + (is - (gv).little_corner()).yucky_val(1) / 2 * loop_s2 + \ + (is - (gv).little_corner()).yucky_val(2) / 2 * loop_s3, \ + dummy_first=0;dummy_first<1;dummy_first++) \ +_Pragma("omp parallel for collapse(3)") \ + for (ptrdiff_t loop_i1 = 0; loop_i1 < loop_n1; loop_i1++) \ + for (ptrdiff_t loop_i2 = 0; loop_i2 < loop_n2; loop_i2++) \ + for (ptrdiff_t loop_i3 = 0; loop_i3 < loop_n3; loop_i3++) \ + for (ptrdiff_t idx = idx0 + loop_i1*loop_s1 + loop_i2*loop_s2 + \ + loop_i3*loop_s3, dummy_last=0;dummy_last<1;dummy_last++) + +#define PLOOP_OVER_VOL(gv, c, idx) \ + PLOOP_OVER_IVECS(gv, (gv).little_corner()
+ (gv).iyee_shift(c), \ + (gv).big_corner() + (gv).iyee_shift(c), idx) + +double one(const vec &) { return 1.0; } + +int test_normal_loop() { + double a = 5.0; + + grid_volume gv = vol3d(3.0, 3.0, 1.0, a); + structure s1(gv, one, pml(0.1), meep::identity(), 0); + fields f(&s1); + f.add_point_source(Ez, 0.8, 1.6, 0.0, 4.0, vec(1.299, 1.401, 0.0), 1.0); + f.step(); + for (int i = 0; i < f.num_chunks; i++) { + master_printf("\n---------------------------------\n"); + master_printf("%d",LOOPS_ARE_STRIDE1(f.chunks[i]->gv)); + master_printf("current chunk: %d\n",i); + master_printf("\n---------------------------------\n"); + + PLOOP_OVER_VOL(f.chunks[i]->gv, Ez, idx) { + int my_thread=omp_get_thread_num(); + if (my_thread == 0) + master_printf("%td ",idx); + } + } + return 1; +} + +int main(int argc, char **argv) { + initialize mpi(argc, argv); + verbosity = 0; + master_printf("Testing 2D...\n"); + + if (!test_normal_loop()) abort("error in test_normal_loop\n"); + + master_printf("\n\n\n"); + return 0; +} \ No newline at end of file
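
Note on the PLOOP macros: as an illustration of the "dummy loop" trick described in the vec.hpp comment above, here is a minimal, self-contained sketch of the same pattern outside of Meep (the array f, the extents n1/n2/n3, and the strides are invented for illustration; this is not Meep code). A one-iteration outer loop gives the bookkeeping variables a home without breaking the perfect nesting that omp parallel for collapse(3) requires, a one-iteration innermost loop declares the per-iteration index, and the flat counter mirrors IVEC_LOOP_COUNTER, which is what lets update_dft drop its serial idx_dft++ counter. Compile with, e.g., g++ -fopenmp ploop_sketch.cpp.

#include <cstddef>
#include <cstdio>
#include <vector>

int main() {
  const std::ptrdiff_t n1 = 4, n2 = 3, n3 = 5;        // hypothetical loop extents
  const std::ptrdiff_t s1 = n2 * n3, s2 = n3, s3 = 1; // strides of a contiguous 3-d array
  std::vector<double> f(n1 * n2 * n3, 0.0);

  // one-iteration "dummy" outer loop: a place to declare local variables without
  // breaking the perfect nesting that collapse(3) requires
  for (std::ptrdiff_t idx0 = 0, dummy_first = 0; dummy_first < 1; dummy_first++) {
#pragma omp parallel for collapse(3)
    for (std::ptrdiff_t i1 = 0; i1 < n1; i1++)
      for (std::ptrdiff_t i2 = 0; i2 < n2; i2++)
        for (std::ptrdiff_t i3 = 0; i3 < n3; i3++)
          // one-iteration inner loop declares idx, like the macro's trailing loop
          for (std::ptrdiff_t idx = idx0 + i1 * s1 + i2 * s2 + i3 * s3, dummy_last = 0;
               dummy_last < 1; dummy_last++) {
            // equivalent of IVEC_LOOP_COUNTER: the 0-based iteration number, computed
            // from the loop indices so it stays correct when iterations run on
            // different threads (no shared counter to increment)
            std::ptrdiff_t counter = (i1 * n2 + i2) * n3 + i3;
            f[idx] = double(counter);
          }
  }

  std::printf("f.back() = %g (expect %g)\n", f.back(), double(n1 * n2 * n3 - 1));
  return 0;
}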
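
Note on MPI initialization: the mympi.cpp hunk switches to MPI_Init_thread when OpenMP is enabled. Below is a minimal, standalone sketch of that handshake, not the patch's code; the file name and the use of omp_get_max_threads() for the thread-count query are my own choices. MPI_THREAD_FUNNELED promises that only the main thread makes MPI calls, which is sufficient here because the OpenMP loops in this patch never call MPI from worker threads. Build with something like mpicxx -fopenmp mpi_omp_init.cpp.

#include <mpi.h>
#include <omp.h>
#include <cstdio>

int main(int argc, char **argv) {
  int provided;
  // MPI_THREAD_FUNNELED: the process may be multi-threaded, but only the thread
  // that called MPI_Init_thread will make MPI calls.
  MPI_Init_thread(&argc, &argv, MPI_THREAD_FUNNELED, &provided);

  // omp_get_max_threads() reports how many threads a parallel region would use,
  // even when called outside of one.
  if (provided < MPI_THREAD_FUNNELED && omp_get_max_threads() > 1) {
    std::fprintf(stderr, "MPI does not support multi-threaded execution\n");
    MPI_Abort(MPI_COMM_WORLD, 1);
  }

  int rank = 0;
  MPI_Comm_rank(MPI_COMM_WORLD, &rank);
#pragma omp parallel
  {
#pragma omp master
    std::printf("rank %d is running %d OpenMP threads\n", rank, omp_get_num_threads());
  }

  MPI_Finalize();
  return 0;
}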