
updated multi-threading branch (#1628)
* multithreading branch by smartalecH

* compilation fixes

* rm pragma unroll statements, which shouldn't be needed

* parallelize update_dft

* use MPI_Init_thread with openmp

* fix idx_dft counter for parallel loop

* IVEC_LOOP_COUNTER macro

* disable non-thread-safe PLOOP for epsilon averaging

* simplifications, fixes

* make sure step_generic_stride1.cpp is re-generated

* nevermind, step_generic.cpp dependency is sufficient

* comment formatting

Co-authored-by: Alec Hammond <ahammond36@gatech.edu>
stevengj and Alec Hammond authored Aug 26, 2021
1 parent 21c94c0 commit 3eabcef
Showing 12 changed files with 258 additions and 55 deletions.
1 change: 1 addition & 0 deletions .gitignore
@@ -104,6 +104,7 @@ tests/pw-source-ll
tests/ring-ll
tests/stress_tensor
tests/symmetry
tests/test_loop
tests/three_d
tests/two_dimensional
tests/user-defined-material
2 changes: 1 addition & 1 deletion doc/docs/Parallel_Meep.md
@@ -53,7 +53,7 @@ However, there is an alternative strategy for parallelization. If you have many

The `divide_parallel_processes` feature can be useful for large supercomputers which typically restrict the total number of jobs that can be executed but do not restrict the size of each job, or for large-scale optimization where many separate simulations are coupled by an optimization algorithm. Note that when using this feature via the [Python interface](Python_User_Interface.md), only the output of the subgroup containing the master process of the entire simulation is shown in the standard output. (In C++, the master process from *every* subgroup prints to standard output.)

Meep also supports [thread-level parallelism](https://en.wikipedia.org/wiki/Task_parallelism) (i.e., multi-threading) on a single, shared-memory, multi-core machine for multi-frequency [near-to-far field](Python_User_Interface.md#near-to-far-field-spectra) computations. Meep does not currently use thread-level parallelism for the time stepping although this feature may be added in the future (see [Issue \#228](https://github.com/NanoComp/meep/issues/228)).
Meep also supports [thread-level parallelism](https://en.wikipedia.org/wiki/Task_parallelism) (i.e., multi-threading) on a single, shared-memory, multi-core machine for multi-frequency [near-to-far field](Python_User_Interface.md#near-to-far-field-spectra) computations. Meep does not currently use thread-level parallelism for the time stepping although this feature may be added in the future (see [Issue \#228](https://github.com/NanoComp/meep/issues/228)).

### Optimization Studies of Parallel Simulations

7 changes: 7 additions & 0 deletions src/anisotropic_averaging.cpp
@@ -2,6 +2,10 @@

#include "meep_internals.hpp"

/* This file contains routines to compute the "average" or "effective"
dielectric constant for a pixel, using an anisotropic averaging
procedure described in our papers (similar to MPB). */

using namespace std;

namespace meep {
@@ -226,6 +230,9 @@ void structure_chunk::set_chi1inv(component c, material_function &medium,
double trivial_val[3] = {0, 0, 0};
trivial_val[idiag] = 1.0;
ivec shift1(unit_ivec(gv.dim, component_direction(c)) * (ft == E_stuff ? 1 : -1));
// TODO: make this loop thread-safe and change to PLOOP_OVER_VOL
// Note that we *cannot* make it thread-safe if `medium` is not thread-safe,
// e.g. if it calls back to Python.
LOOP_OVER_VOL(gv, c, i) {
double chi1invrow[3], chi1invrow_offdiag[3];
IVEC_LOOP_ILOC(gv, here);
7 changes: 3 additions & 4 deletions src/dft.cpp
@@ -230,8 +230,8 @@ void dft_chunk::update_dft(double time) {

int numcmp = fc->f[c][1] ? 2 : 1;

size_t idx_dft = 0;
LOOP_OVER_IVECS(fc->gv, is, ie, idx) {
PLOOP_OVER_IVECS(fc->gv, is, ie, idx) {
size_t idx_dft = IVEC_LOOP_COUNTER;
double w;
if (include_dV_and_interp_weights) {
w = IVEC_LOOP_WEIGHT(s0, s1, e0, e1, dV0 + dV1 * loop_i2);
@@ -261,7 +261,6 @@ void dft_chunk::update_dft(double time) {
for (int i = 0; i < Nomega; ++i)
dft[Nomega * idx_dft + i] += std::complex<realnum>{fr * dft_phase[i].real(), fr * dft_phase[i].imag()};
}
idx_dft++;
}
}
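The key change in this hunk is that the per-point DFT index is no longer accumulated with a shared `idx_dft++`; each iteration now recomputes it from the loop indices via `IVEC_LOOP_COUNTER`, which is what makes the loop safe to run under `PLOOP_OVER_IVECS`. A minimal standalone sketch of the same idea (not Meep code; all names here are made up for illustration):

```cpp
// Standalone sketch: why a shared running counter is not thread-safe, and
// how deriving the flattened index from the loop indices -- the role of
// IVEC_LOOP_COUNTER -- fixes it.
#include <cstdio>
#include <vector>

int main() {
  const int n1 = 4, n2 = 3, n3 = 5;
  std::vector<double> out(n1 * n2 * n3, 0.0);

  // NOT thread-safe: a shared "counter++" would race across threads:
  //   size_t counter = 0; ... out[counter++] = ...;

  // Thread-safe: each iteration computes its own index, exactly like
  // IVEC_LOOP_COUNTER = (loop_i1 * loop_n2 + loop_i2) * loop_n3 + loop_i3.
#pragma omp parallel for collapse(3)
  for (int i1 = 0; i1 < n1; ++i1)
    for (int i2 = 0; i2 < n2; ++i2)
      for (int i3 = 0; i3 < n3; ++i3) {
        const int idx = (i1 * n2 + i2) * n3 + i3;
        out[idx] = idx; // independent write, no shared state
      }

  std::printf("out[last] = %g\n", out.back()); // prints 59
  return 0;
}
```

With a shared running counter the iteration-to-index mapping would depend on thread scheduling; deriving the index from (i1, i2, i3) makes every iteration independent.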

@@ -1269,4 +1268,4 @@ void fields::get_mode_mode_overlap(void *mode1_data, void *mode2_data, dft_flux
get_overlap(mode1_data, mode2_data, flux, 0, overlaps);
}

} // namespace meep
} // namespace meep
2 changes: 2 additions & 0 deletions src/initialize.cpp
@@ -149,6 +149,8 @@ void fields::initialize_field(component c, complex<double> func(const vec &)) {

void fields_chunk::initialize_field(component c, complex<double> func(const vec &)) {
if (f[c][0]) {
// note: this loop is not thread-safe unless func is, which
// isn't true if func e.g. calls back to Python
LOOP_OVER_VOL(gv, c, i) {
IVEC_LOOP_LOC(gv, here);
complex<double> val = func(here);
107 changes: 105 additions & 2 deletions src/meep/vec.hpp
@@ -148,7 +148,7 @@ component first_field_component(field_type ft);
for (meep::direction d = dim == meep::Dcyl ? meep::Z : meep::X; \
d < (dim == meep::Dcyl ? meep::NO_DIRECTION : meep::R); d = meep::direction(d + 1))

// loop over indices idx from is to ie (inclusive) in gv

#define LOOP_OVER_IVECS(gv, is, ie, idx) \
for (ptrdiff_t loop_is1 = (is).yucky_val(0), loop_is2 = (is).yucky_val(1), \
loop_is3 = (is).yucky_val(2), loop_n1 = ((ie).yucky_val(0) - loop_is1) / 2 + 1, \
@@ -186,6 +186,64 @@ component first_field_component(field_type ft);
loop_ibound++) \
LOOP_OVER_IVECS(gv, loop_notowned_is, loop_notowned_ie, idx)

/* The following macros work identically to the LOOP_* macros above,
   but employ shared-memory parallelism using OpenMP. They are mainly
   used in step_generic and a few other time-critical loops. */

/* For the parallel implementation, we introduce two dummy loops, one at the beginning
   and one at the end, in order to "trick" OpenMP into letting us declare our local variables
   without having to change any other code in the main codebase. We can then
   collapse over all three main loops. */

#define CHUNK_OPENMP _Pragma("omp parallel for")

// the most generic use case where the user
// can specify a custom clause
#define PLOOP_OVER_IVECS_C(gv, is, ie, idx, clause) \
for(ptrdiff_t loop_is1 = (is).yucky_val(0), loop_is2 = (is).yucky_val(1), \
loop_is3 = (is).yucky_val(2), loop_n1 = ((ie).yucky_val(0) - loop_is1) / 2 + 1, \
loop_n2 = ((ie).yucky_val(1) - loop_is2) / 2 + 1, \
loop_n3 = ((ie).yucky_val(2) - loop_is3) / 2 + 1, \
loop_d1 = (gv).yucky_direction(0), loop_d2 = (gv).yucky_direction(1), \
loop_d3 = (gv).yucky_direction(2), \
loop_s1 = (gv).stride((meep::direction)loop_d1), \
loop_s2 = (gv).stride((meep::direction)loop_d2), \
loop_s3 = (gv).stride((meep::direction)loop_d3), \
idx0 = (is - (gv).little_corner()).yucky_val(0) / 2 * loop_s1 + \
(is - (gv).little_corner()).yucky_val(1) / 2 * loop_s2 + \
(is - (gv).little_corner()).yucky_val(2) / 2 * loop_s3, \
dummy_first=0;dummy_first<1;dummy_first++) \
_Pragma(clause) \
for (ptrdiff_t loop_i1 = 0; loop_i1 < loop_n1; loop_i1++) \
for (ptrdiff_t loop_i2 = 0; loop_i2 < loop_n2; loop_i2++) \
for (ptrdiff_t loop_i3 = 0; loop_i3 < loop_n3; loop_i3++) \
for (ptrdiff_t idx = idx0 + loop_i1*loop_s1 + loop_i2*loop_s2 + \
loop_i3*loop_s3, dummy_last=0;dummy_last<1;dummy_last++)
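A standalone sketch of the single-iteration "dummy loop" trick described in the comment above (outside Meep; all names hypothetical, and strides omitted for brevity): the macro declares its setup variables in the init-statement of a one-pass `for`, so the following `_Pragma` can still attach directly to the collapsed loop nest, and a second one-pass `for` at the bottom gives the body a per-iteration `idx`.

```cpp
// Standalone sketch of the "dummy loop" pattern used by PLOOP_OVER_IVECS_C.
#include <cstdio>

#define MY_PLOOP(n1, n2, n3, idx)                                       \
  for (long my_n1 = (n1), my_n2 = (n2), my_n3 = (n3), dummy_first = 0;  \
       dummy_first < 1; dummy_first++)                                  \
    _Pragma("omp parallel for collapse(3)")                             \
    for (long my_i1 = 0; my_i1 < my_n1; my_i1++)                        \
      for (long my_i2 = 0; my_i2 < my_n2; my_i2++)                      \
        for (long my_i3 = 0; my_i3 < my_n3; my_i3++)                    \
          for (long idx = (my_i1 * my_n2 + my_i2) * my_n3 + my_i3,      \
                    dummy_last = 0;                                     \
               dummy_last < 1; dummy_last++)

int main() {
  double sum = 0;
  MY_PLOOP(2, 3, 4, idx) {
    // the loop body sees a per-iteration idx, just like the Meep macros
    #pragma omp atomic
    sum += idx;
  }
  std::printf("sum = %g\n", sum); // 0 + 1 + ... + 23 = 276
  return 0;
}
```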

// For the main timestepping events, we know
// we want to do a simple collapse
#define PLOOP_OVER_IVECS(gv, is, ie, idx) \
PLOOP_OVER_IVECS_C(gv, is, ie, idx, "omp parallel for collapse(3)")

#define PLOOP_OVER_VOL(gv, c, idx) \
PLOOP_OVER_IVECS(gv, (gv).little_corner() + (gv).iyee_shift(c), \
(gv).big_corner() + (gv).iyee_shift(c), idx)

#define PLOOP_OVER_VOL_OWNED(gv, c, idx) \
PLOOP_OVER_IVECS(gv, (gv).little_owned_corner(c), (gv).big_corner(), idx)

#define PLOOP_OVER_VOL_OWNED0(gv, c, idx) \
PLOOP_OVER_IVECS(gv, (gv).little_owned_corner0(c), (gv).big_corner(), idx)

#define PLOOP_OVER_VOL_NOTOWNED(gv, c, idx) \
for (ivec loop_notowned_is((gv).dim, 0), loop_notowned_ie((gv).dim, 0); \
loop_notowned_is == zero_ivec((gv).dim);) \
for (int loop_ibound = 0; \
(gv).get_boundary_icorners(c, loop_ibound, &loop_notowned_is, &loop_notowned_ie); \
loop_ibound++) \
PLOOP_OVER_IVECS(gv, loop_notowned_is, loop_notowned_ie, idx)

#define LOOPS_ARE_STRIDE1(gv) ((gv).stride((gv).yucky_direction(2)) == 1)

// The following work identically to the LOOP_* macros above,
@@ -199,8 +257,10 @@ component first_field_component(field_type ft);
// should only use these macros where that is true! (Basically,
// all of this is here to support performance hacks of step_generic.)

#if !defined(__INTEL_COMPILER) && !defined(__clang__) && (defined(__GNUC__) || defined(__GNUG__))
#if !defined(__INTEL_COMPILER) && !defined(__clang__) && !defined(_OPENMP) && (defined(__GNUC__) || defined(__GNUG__))
#define IVDEP _Pragma("GCC ivdep")
#elif defined(_OPENMP)
#define IVDEP _Pragma("omp simd")
#elif defined(__INTEL_COMPILER)
#define IVDEP _Pragma("ivdep")
#else
@@ -244,6 +304,46 @@ component first_field_component(field_type ft);
loop_ibound++) \
S1LOOP_OVER_IVECS(gv, loop_notowned_is, loop_notowned_ie, idx)

/* The following are the stride-optimized versions of the parallel loops above.
   We can use SIMD vectorization in addition to the usual parallel-for optimization. */
// loop over indices idx from is to ie (inclusive) in gv
#define PS1LOOP_OVER_IVECS(gv, is, ie, idx) \
for(ptrdiff_t loop_is1 = (is).yucky_val(0), loop_is2 = (is).yucky_val(1), \
loop_is3 = (is).yucky_val(2), loop_n1 = ((ie).yucky_val(0) - loop_is1) / 2 + 1, \
loop_n2 = ((ie).yucky_val(1) - loop_is2) / 2 + 1, \
loop_n3 = ((ie).yucky_val(2) - loop_is3) / 2 + 1, \
loop_d1 = (gv).yucky_direction(0), loop_d2 = (gv).yucky_direction(1), \
loop_s1 = (gv).stride((meep::direction)loop_d1), \
loop_s2 = (gv).stride((meep::direction)loop_d2), loop_s3 = 1, \
idx0 = (is - (gv).little_corner()).yucky_val(0) / 2 * loop_s1 + \
(is - (gv).little_corner()).yucky_val(1) / 2 * loop_s2 + \
(is - (gv).little_corner()).yucky_val(2) / 2 * loop_s3, \
dummy_first=0;dummy_first<1;dummy_first++) \
_Pragma("omp parallel for collapse(2)") \
for (ptrdiff_t loop_i1 = 0; loop_i1 < loop_n1; loop_i1++) \
for (ptrdiff_t loop_i2 = 0; loop_i2 < loop_n2; loop_i2++) \
_Pragma("omp simd") \
for (ptrdiff_t loop_i3 = 0; loop_i3 < loop_n3; loop_i3++) \
for (ptrdiff_t idx = idx0 + loop_i1 * loop_s1 + loop_i2 * loop_s2 + loop_i3, dummy_last=0;dummy_last<1;dummy_last++)
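A standalone sketch (not Meep code; array names are made up) of the combination `PS1LOOP_OVER_IVECS` relies on: the two outer loops are collapsed and distributed across threads, while the unit-stride innermost loop is handed to the vectorizer with `omp simd`.

```cpp
// Standalone sketch of "parallel for collapse(2)" over the outer loops plus
// "omp simd" on the contiguous (stride-1) inner loop.
#include <cstdio>
#include <vector>

int main() {
  const long n1 = 8, n2 = 16, n3 = 1024; // n3 is the contiguous direction
  std::vector<double> a(n1 * n2 * n3, 1.0), b(n1 * n2 * n3, 2.0);

#pragma omp parallel for collapse(2)
  for (long i1 = 0; i1 < n1; ++i1)
    for (long i2 = 0; i2 < n2; ++i2) {
#pragma omp simd
      for (long i3 = 0; i3 < n3; ++i3) {
        const long idx = (i1 * n2 + i2) * n3 + i3; // stride 1 in i3
        a[idx] += 0.5 * b[idx];
      }
    }

  std::printf("a[0] = %g\n", a[0]); // 1 + 0.5*2 = 2
  return 0;
}
```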

#define PS1LOOP_OVER_VOL(gv, c, idx) \
PS1LOOP_OVER_IVECS(gv, (gv).little_corner() + (gv).iyee_shift(c), \
(gv).big_corner() + (gv).iyee_shift(c), idx)

#define PS1LOOP_OVER_VOL_OWNED(gv, c, idx) \
PS1LOOP_OVER_IVECS(gv, (gv).little_owned_corner(c), (gv).big_corner(), idx)

#define PS1LOOP_OVER_VOL_OWNED0(gv, c, idx) \
PS1LOOP_OVER_IVECS(gv, (gv).little_owned_corner0(c), (gv).big_corner(), idx)

#define PS1LOOP_OVER_VOL_NOTOWNED(gv, c, idx) \
for (ivec loop_notowned_is((gv).dim, 0), loop_notowned_ie((gv).dim, 0); \
loop_notowned_is == meep::zero_ivec((gv).dim);) \
for (int loop_ibound = 0; \
(gv).get_boundary_icorners(c, loop_ibound, &loop_notowned_is, &loop_notowned_ie); \
loop_ibound++) \
PS1LOOP_OVER_IVECS(gv, loop_notowned_is, loop_notowned_ie, idx)

#define IVEC_LOOP_AT_BOUNDARY \
((loop_s1 != 0 && (loop_i1 == 0 || loop_i1 == loop_n1 - 1)) || \
(loop_s2 != 0 && (loop_i2 == 0 || loop_i2 == loop_n2 - 1)) || \
@@ -276,6 +376,9 @@ component first_field_component(field_type ft);
(IVEC_LOOP_WEIGHT1(s0, s1, e0, e1, 3) * \
(IVEC_LOOP_WEIGHT1(s0, s1, e0, e1, 2) * ((dV)*IVEC_LOOP_WEIGHT1(s0, s1, e0, e1, 1))))

// equivalent to incrementing a counter, starting from 0, in the loop body
#define IVEC_LOOP_COUNTER ((loop_i1 * loop_n2 + loop_i2) * loop_n3 + loop_i3)

inline signed_direction flip(signed_direction d) {
signed_direction d2 = d;
d2.flipped = !d.flipped;
9 changes: 9 additions & 0 deletions src/mympi.cpp
@@ -184,7 +184,14 @@ void set_zero_subnormals(bool iszero)

initialize::initialize(int &argc, char **&argv) {
#ifdef HAVE_MPI
#ifdef _OPENMP
int provided;
MPI_Init_thread(&argc, &argv, MPI_THREAD_FUNNELED, &provided);
if (provided < MPI_THREAD_FUNNELED && omp_get_num_threads() > 1)
abort("MPI does not support multi-threaded execution");
#else
MPI_Init(&argc, &argv);
#endif
int major, minor;
MPI_Get_version(&major, &minor);
if (verbosity > 0)
@@ -214,6 +221,8 @@ initialize::~initialize() {
double wall_time(void) {
#ifdef HAVE_MPI
return MPI_Wtime();
#elif defined(_OPENMP)
return omp_get_wtime();
#elif HAVE_GETTIMEOFDAY
struct timeval tv;
gettimeofday(&tv, 0);
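The changes above request `MPI_THREAD_FUNNELED`, the threading level at which OpenMP worker threads may exist as long as only the main thread makes MPI calls, and fall back to `omp_get_wtime()` for wall-clock timing when MPI is absent. A minimal standalone hybrid MPI + OpenMP sketch of the same initialization pattern (not Meep code):

```cpp
// Standalone sketch: request MPI_THREAD_FUNNELED and verify the level the
// MPI library actually provides before running OpenMP threads.
#include <cstdio>
#include <mpi.h>
#include <omp.h>

int main(int argc, char **argv) {
  int provided;
  MPI_Init_thread(&argc, &argv, MPI_THREAD_FUNNELED, &provided);
  if (provided < MPI_THREAD_FUNNELED) {
    std::fprintf(stderr, "MPI library lacks MPI_THREAD_FUNNELED support\n");
    MPI_Abort(MPI_COMM_WORLD, 1);
  }

  int rank;
  MPI_Comm_rank(MPI_COMM_WORLD, &rank);

  double t0 = omp_get_wtime(); // OpenMP wall clock, as in the new wall_time()
#pragma omp parallel
  {
    // compute-only region: no MPI calls from worker threads (FUNNELED)
  }
  if (rank == 0)
    std::printf("parallel region took %g s\n", omp_get_wtime() - t0);

  MPI_Finalize();
  return 0;
}
```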
1 change: 1 addition & 0 deletions src/step.cpp
@@ -141,6 +141,7 @@ void fields::step() {
void fields::phase_material() {
bool changed = false;
if (is_phasing()) {
CHUNK_OPENMP
for (int i = 0; i < num_chunks; i++)
if (chunks[i]->is_mine()) {
chunks[i]->phase_material(phasein_time);
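`CHUNK_OPENMP` (defined as `_Pragma("omp parallel for")` in vec.hpp above) distributes the loop over chunks across threads; this is safe because each `phase_material` call touches only its own chunk's data. A simplified standalone sketch of the same pattern, using a hypothetical `toy_chunk` type in place of a Meep `fields_chunk`:

```cpp
// Standalone sketch: independent per-chunk updates parallelized with a plain
// "omp parallel for", mirroring the CHUNK_OPENMP usage in phase_material().
#include <vector>

struct toy_chunk {            // stand-in for a fields_chunk (hypothetical)
  std::vector<double> f;
  void phase_material(int t) {
    for (double &x : f) x *= 1.0 + 1e-6 * t; // chunk-local update only
  }
};

int main() {
  std::vector<toy_chunk> chunks(8, toy_chunk{std::vector<double>(1000, 1.0)});
#pragma omp parallel for
  for (int i = 0; i < (int)chunks.size(); i++)
    chunks[i].phase_material(10); // each iteration writes only chunks[i]
  return 0;
}
```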