
updated multi-threading branch (#1628)
* multithreading branch by smartalecH

* compilation fixes

* rm pragma unroll statements, which shouldn't be needed

* parallelize update_dft

* use MPI_Init_thread with openmp

* fix idx_dft counter for parallel loop

* IVEC_LOOP_COUNTER macro

* disable non-thread-safe PLOOP for epsilon averaging

* simplifications, fixes

* make sure step_generic_stride1.cpp is re-generated

* nevermind, step_generic.cpp dependency is sufficient

* comment formatting

Co-authored-by: Alec Hammond <ahammond36@gatech.edu>
stevengj and Alec Hammond authored Aug 26, 2021
1 parent 21c94c0 commit 3eabcef
Showing 12 changed files with 258 additions and 55 deletions.
1 change: 1 addition & 0 deletions .gitignore
@@ -104,6 +104,7 @@ tests/pw-source-ll
tests/ring-ll
tests/stress_tensor
tests/symmetry
tests/test_loop
tests/three_d
tests/two_dimensional
tests/user-defined-material
2 changes: 1 addition & 1 deletion doc/docs/Parallel_Meep.md
@@ -53,7 +53,7 @@ However, there is an alternative strategy for parallelization. If you have many

The `divide_parallel_processes` feature can be useful for large supercomputers which typically restrict the total number of jobs that can be executed but do not restrict the size of each job, or for large-scale optimization where many separate simulations are coupled by an optimization algorithm. Note that when using this feature via the [Python interface](Python_User_Interface.md), only the output of the subgroup containing the master process of the entire simulation is shown in the standard output. (In C++, the master process from *every* subgroup prints to standard output.)

Meep also supports [thread-level parallelism](https://en.wikipedia.org/wiki/Task_parallelism) (i.e., multi-threading) on a single, shared-memory, multi-core machine for multi-frequency [near-to-far field](Python_User_Interface.md#near-to-far-field-spectra) computations. Meep does not currently use thread-level parallelism for the time stepping although this feature may be added in the future (see [Issue \#228](https://github.com/NanoComp/meep/issues/228)).
Meep also supports [thread-level parallelism](https://en.wikipedia.org/wiki/Task_parallelism) (i.e., multi-threading) on a single, shared-memory, multi-core machine for multi-frequency [near-to-far field](Python_User_Interface.md#near-to-far-field-spectra) computations. Meep does not currently use thread-level parallelism for the time stepping although this feature may be added in the future (see [Issue \#228](https://github.com/NanoComp/meep/issues/228)).

### Optimization Studies of Parallel Simulations

7 changes: 7 additions & 0 deletions src/anisotropic_averaging.cpp
@@ -2,6 +2,10 @@

#include "meep_internals.hpp"

/* This file contains routines to compute the "average" or "effective"
dielectric constant for a pixel, using an anisotropic averaging
procedure described in our papers (similar to MPB). */

using namespace std;

namespace meep {
@@ -226,6 +230,9 @@ void structure_chunk::set_chi1inv(component c, material_function &medium,
double trivial_val[3] = {0, 0, 0};
trivial_val[idiag] = 1.0;
ivec shift1(unit_ivec(gv.dim, component_direction(c)) * (ft == E_stuff ? 1 : -1));
// TODO: make this loop thread-safe and change to PLOOP_OVER_VOL
// Note that we *cannot* make it thread-safe if `medium` is not thread-safe,
// e.g. if it calls back to Python.
LOOP_OVER_VOL(gv, c, i) {
double chi1invrow[3], chi1invrow_offdiag[3];
IVEC_LOOP_ILOC(gv, here);
7 changes: 3 additions & 4 deletions src/dft.cpp
@@ -230,8 +230,8 @@ void dft_chunk::update_dft(double time) {

int numcmp = fc->f[c][1] ? 2 : 1;

size_t idx_dft = 0;
LOOP_OVER_IVECS(fc->gv, is, ie, idx) {
PLOOP_OVER_IVECS(fc->gv, is, ie, idx) {
size_t idx_dft = IVEC_LOOP_COUNTER;
double w;
if (include_dV_and_interp_weights) {
w = IVEC_LOOP_WEIGHT(s0, s1, e0, e1, dV0 + dV1 * loop_i2);
@@ -261,7 +261,6 @@ void dft_chunk::update_dft(double time) {
for (int i = 0; i < Nomega; ++i)
dft[Nomega * idx_dft + i] += std::complex<realnum>{fr * dft_phase[i].real(), fr * dft_phase[i].imag()};
}
idx_dft++;
}
}
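The key change in this hunk is that the per-point DFT index is no longer accumulated with a shared `idx_dft++`; each iteration now recomputes it from the loop indices via `IVEC_LOOP_COUNTER`, which is what makes the loop safe to run under `PLOOP_OVER_IVECS`. A minimal standalone sketch of the same idea (not Meep code; all names here are made up for illustration):

```cpp
// Standalone sketch: why a shared running counter is not thread-safe, and
// how deriving the flattened index from the loop indices -- the role of
// IVEC_LOOP_COUNTER -- fixes it.
#include <cstdio>
#include <vector>

int main() {
  const int n1 = 4, n2 = 3, n3 = 5;
  std::vector<double> out(n1 * n2 * n3, 0.0);

  // NOT thread-safe: a shared "counter++" would race across threads:
  //   size_t counter = 0; ... out[counter++] = ...;

  // Thread-safe: each iteration computes its own index, exactly like
  // IVEC_LOOP_COUNTER = (loop_i1 * loop_n2 + loop_i2) * loop_n3 + loop_i3.
#pragma omp parallel for collapse(3)
  for (int i1 = 0; i1 < n1; ++i1)
    for (int i2 = 0; i2 < n2; ++i2)
      for (int i3 = 0; i3 < n3; ++i3) {
        const int idx = (i1 * n2 + i2) * n3 + i3;
        out[idx] = idx; // independent write, no shared state
      }

  std::printf("out[last] = %g\n", out.back()); // prints 59
  return 0;
}
```

With a shared running counter the iteration-to-index mapping would depend on thread scheduling; deriving the index from (i1, i2, i3) makes every iteration independent.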

@@ -1269,4 +1268,4 @@ void fields::get_mode_mode_overlap(void *mode1_data, void *mode2_data, dft_flux
get_overlap(mode1_data, mode2_data, flux, 0, overlaps);
}

} // namespace meep
} // namespace meep
2 changes: 2 additions & 0 deletions src/initialize.cpp
@@ -149,6 +149,8 @@ void fields::initialize_field(component c, complex<double> func(const vec &)) {

void fields_chunk::initialize_field(component c, complex<double> func(const vec &)) {
if (f[c][0]) {
// note: this loop is not thread-safe unless func is, which
// isn't true if func e.g. calls back to Python
LOOP_OVER_VOL(gv, c, i) {
IVEC_LOOP_LOC(gv, here);
complex<double> val = func(here);
107 changes: 105 additions & 2 deletions src/meep/vec.hpp
@@ -148,7 +148,7 @@ component first_field_component(field_type ft);
for (meep::direction d = dim == meep::Dcyl ? meep::Z : meep::X; \
d < (dim == meep::Dcyl ? meep::NO_DIRECTION : meep::R); d = meep::direction(d + 1))

// loop over indices idx from is to ie (inclusive) in gv

#define LOOP_OVER_IVECS(gv, is, ie, idx) \
for (ptrdiff_t loop_is1 = (is).yucky_val(0), loop_is2 = (is).yucky_val(1), \
loop_is3 = (is).yucky_val(2), loop_n1 = ((ie).yucky_val(0) - loop_is1) / 2 + 1, \
@@ -186,6 +186,64 @@ component first_field_component(field_type ft);
loop_ibound++) \
LOOP_OVER_IVECS(gv, loop_notowned_is, loop_notowned_ie, idx)

/* The following macros work identically to the LOOP_* macros above,
   but employ shared-memory parallelism using OpenMP. They are mainly
   used in step_generic and a few other time-critical loops. */

/* For the parallel implementation, we introduce two dummy loops, one at the beginning
   and one at the end, in order to "trick" OpenMP into letting us declare our local variables
   without having to change any other code in the main codebase. We can then
   collapse over all three main loops. */

#define CHUNK_OPENMP _Pragma("omp parallel for")

// the most generic use case where the user
// can specify a custom clause
#define PLOOP_OVER_IVECS_C(gv, is, ie, idx, clause) \
for(ptrdiff_t loop_is1 = (is).yucky_val(0), loop_is2 = (is).yucky_val(1), \
loop_is3 = (is).yucky_val(2), loop_n1 = ((ie).yucky_val(0) - loop_is1) / 2 + 1, \
loop_n2 = ((ie).yucky_val(1) - loop_is2) / 2 + 1, \
loop_n3 = ((ie).yucky_val(2) - loop_is3) / 2 + 1, \
loop_d1 = (gv).yucky_direction(0), loop_d2 = (gv).yucky_direction(1), \
loop_d3 = (gv).yucky_direction(2), \
loop_s1 = (gv).stride((meep::direction)loop_d1), \
loop_s2 = (gv).stride((meep::direction)loop_d2), \
loop_s3 = (gv).stride((meep::direction)loop_d3), \
idx0 = (is - (gv).little_corner()).yucky_val(0) / 2 * loop_s1 + \
(is - (gv).little_corner()).yucky_val(1) / 2 * loop_s2 + \
(is - (gv).little_corner()).yucky_val(2) / 2 * loop_s3, \
dummy_first=0;dummy_first<1;dummy_first++) \
_Pragma(clause) \
for (ptrdiff_t loop_i1 = 0; loop_i1 < loop_n1; loop_i1++) \
for (ptrdiff_t loop_i2 = 0; loop_i2 < loop_n2; loop_i2++) \
for (ptrdiff_t loop_i3 = 0; loop_i3 < loop_n3; loop_i3++) \
for (ptrdiff_t idx = idx0 + loop_i1*loop_s1 + loop_i2*loop_s2 + \
loop_i3*loop_s3, dummy_last=0;dummy_last<1;dummy_last++)
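A standalone sketch of the single-iteration "dummy loop" trick described in the comment above (outside Meep; all names hypothetical, and strides omitted for brevity): the macro declares its setup variables in the init-statement of a one-pass `for`, so the following `_Pragma` can still attach directly to the collapsed loop nest, and a second one-pass `for` at the bottom gives the body a per-iteration `idx`.

```cpp
// Standalone sketch of the "dummy loop" pattern used by PLOOP_OVER_IVECS_C.
#include <cstdio>

#define MY_PLOOP(n1, n2, n3, idx)                                       \
  for (long my_n1 = (n1), my_n2 = (n2), my_n3 = (n3), dummy_first = 0;  \
       dummy_first < 1; dummy_first++)                                  \
    _Pragma("omp parallel for collapse(3)")                             \
    for (long my_i1 = 0; my_i1 < my_n1; my_i1++)                        \
      for (long my_i2 = 0; my_i2 < my_n2; my_i2++)                      \
        for (long my_i3 = 0; my_i3 < my_n3; my_i3++)                    \
          for (long idx = (my_i1 * my_n2 + my_i2) * my_n3 + my_i3,      \
                    dummy_last = 0;                                     \
               dummy_last < 1; dummy_last++)

int main() {
  double sum = 0;
  MY_PLOOP(2, 3, 4, idx) {
    // the loop body sees a per-iteration idx, just like the Meep macros
    #pragma omp atomic
    sum += idx;
  }
  std::printf("sum = %g\n", sum); // 0 + 1 + ... + 23 = 276
  return 0;
}
```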

// For the main timestepping events, we know
// we want to do a simple collapse
#define PLOOP_OVER_IVECS(gv, is, ie, idx) \
PLOOP_OVER_IVECS_C(gv, is, ie, idx, "omp parallel for collapse(3)")

#define PLOOP_OVER_VOL(gv, c, idx) \
PLOOP_OVER_IVECS(gv, (gv).little_corner() + (gv).iyee_shift(c), \
(gv).big_corner() + (gv).iyee_shift(c), idx)

#define PLOOP_OVER_VOL_OWNED(gv, c, idx) \
PLOOP_OVER_IVECS(gv, (gv).little_owned_corner(c), (gv).big_corner(), idx)

#define PLOOP_OVER_VOL_OWNED0(gv, c, idx) \
PLOOP_OVER_IVECS(gv, (gv).little_owned_corner0(c), (gv).big_corner(), idx)

#define PLOOP_OVER_VOL_NOTOWNED(gv, c, idx) \
for (ivec loop_notowned_is((gv).dim, 0), loop_notowned_ie((gv).dim, 0); \
loop_notowned_is == zero_ivec((gv).dim);) \
for (int loop_ibound = 0; \
(gv).get_boundary_icorners(c, loop_ibound, &loop_notowned_is, &loop_notowned_ie); \
loop_ibound++) \
PLOOP_OVER_IVECS(gv, loop_notowned_is, loop_notowned_ie, idx)

#define LOOPS_ARE_STRIDE1(gv) ((gv).stride((gv).yucky_direction(2)) == 1)

// The following work identically to the LOOP_* macros above,
@@ -199,8 +257,10 @@ component first_field_component(field_type ft);
// should only use these macros where that is true! (Basically,
// all of this is here to support performance hacks of step_generic.)

#if !defined(__INTEL_COMPILER) && !defined(__clang__) && (defined(__GNUC__) || defined(__GNUG__))
#if !defined(__INTEL_COMPILER) && !defined(__clang__) && !defined(_OPENMP) && (defined(__GNUC__) || defined(__GNUG__))
#define IVDEP _Pragma("GCC ivdep")
#elif defined(_OPENMP)
#define IVDEP _Pragma("omp simd")
#elif defined(__INTEL_COMPILER)
#define IVDEP _Pragma("ivdep")
#else
@@ -244,6 +304,46 @@ component first_field_component(field_type ft);
loop_ibound++) \
S1LOOP_OVER_IVECS(gv, loop_notowned_is, loop_notowned_ie, idx)

/* The following are the stride-optimized versions of the parallel loops above.
   We can use SIMD vectorization in addition to the usual parallel-for optimization. */
// loop over indices idx from is to ie (inclusive) in gv
#define PS1LOOP_OVER_IVECS(gv, is, ie, idx) \
for(ptrdiff_t loop_is1 = (is).yucky_val(0), loop_is2 = (is).yucky_val(1), \
loop_is3 = (is).yucky_val(2), loop_n1 = ((ie).yucky_val(0) - loop_is1) / 2 + 1, \
loop_n2 = ((ie).yucky_val(1) - loop_is2) / 2 + 1, \
loop_n3 = ((ie).yucky_val(2) - loop_is3) / 2 + 1, \
loop_d1 = (gv).yucky_direction(0), loop_d2 = (gv).yucky_direction(1), \
loop_s1 = (gv).stride((meep::direction)loop_d1), \
loop_s2 = (gv).stride((meep::direction)loop_d2), loop_s3 = 1, \
idx0 = (is - (gv).little_corner()).yucky_val(0) / 2 * loop_s1 + \
(is - (gv).little_corner()).yucky_val(1) / 2 * loop_s2 + \
(is - (gv).little_corner()).yucky_val(2) / 2 * loop_s3, \
dummy_first=0;dummy_first<1;dummy_first++) \
_Pragma("omp parallel for collapse(2)") \
for (ptrdiff_t loop_i1 = 0; loop_i1 < loop_n1; loop_i1++) \
for (ptrdiff_t loop_i2 = 0; loop_i2 < loop_n2; loop_i2++) \
_Pragma("omp simd") \
for (ptrdiff_t loop_i3 = 0; loop_i3 < loop_n3; loop_i3++) \
for (ptrdiff_t idx = idx0 + loop_i1 * loop_s1 + loop_i2 * loop_s2 + loop_i3, dummy_last=0;dummy_last<1;dummy_last++)
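A standalone sketch (not Meep code; array names are made up) of the combination `PS1LOOP_OVER_IVECS` relies on: the two outer loops are collapsed and distributed across threads, while the unit-stride innermost loop is handed to the vectorizer with `omp simd`.

```cpp
// Standalone sketch of "parallel for collapse(2)" over the outer loops plus
// "omp simd" on the contiguous (stride-1) inner loop.
#include <cstdio>
#include <vector>

int main() {
  const long n1 = 8, n2 = 16, n3 = 1024; // n3 is the contiguous direction
  std::vector<double> a(n1 * n2 * n3, 1.0), b(n1 * n2 * n3, 2.0);

#pragma omp parallel for collapse(2)
  for (long i1 = 0; i1 < n1; ++i1)
    for (long i2 = 0; i2 < n2; ++i2) {
#pragma omp simd
      for (long i3 = 0; i3 < n3; ++i3) {
        const long idx = (i1 * n2 + i2) * n3 + i3; // stride 1 in i3
        a[idx] += 0.5 * b[idx];
      }
    }

  std::printf("a[0] = %g\n", a[0]); // 1 + 0.5*2 = 2
  return 0;
}
```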

#define PS1LOOP_OVER_VOL(gv, c, idx) \
PS1LOOP_OVER_IVECS(gv, (gv).little_corner() + (gv).iyee_shift(c), \
(gv).big_corner() + (gv).iyee_shift(c), idx)

#define PS1LOOP_OVER_VOL_OWNED(gv, c, idx) \
PS1LOOP_OVER_IVECS(gv, (gv).little_owned_corner(c), (gv).big_corner(), idx)

#define PS1LOOP_OVER_VOL_OWNED0(gv, c, idx) \
PS1LOOP_OVER_IVECS(gv, (gv).little_owned_corner0(c), (gv).big_corner(), idx)

#define PS1LOOP_OVER_VOL_NOTOWNED(gv, c, idx) \
for (ivec loop_notowned_is((gv).dim, 0), loop_notowned_ie((gv).dim, 0); \
loop_notowned_is == meep::zero_ivec((gv).dim);) \
for (int loop_ibound = 0; \
(gv).get_boundary_icorners(c, loop_ibound, &loop_notowned_is, &loop_notowned_ie); \
loop_ibound++) \
PS1LOOP_OVER_IVECS(gv, loop_notowned_is, loop_notowned_ie, idx)

#define IVEC_LOOP_AT_BOUNDARY \
((loop_s1 != 0 && (loop_i1 == 0 || loop_i1 == loop_n1 - 1)) || \
(loop_s2 != 0 && (loop_i2 == 0 || loop_i2 == loop_n2 - 1)) || \
@@ -276,6 +376,9 @@ component first_field_component(field_type ft);
(IVEC_LOOP_WEIGHT1(s0, s1, e0, e1, 3) * \
(IVEC_LOOP_WEIGHT1(s0, s1, e0, e1, 2) * ((dV)*IVEC_LOOP_WEIGHT1(s0, s1, e0, e1, 1))))

// equivalent to incrementing a counter, starting from 0, in the loop body
#define IVEC_LOOP_COUNTER ((loop_i1 * loop_n2 + loop_i2) * loop_n3 + loop_i3)

inline signed_direction flip(signed_direction d) {
signed_direction d2 = d;
d2.flipped = !d.flipped;
9 changes: 9 additions & 0 deletions src/mympi.cpp
@@ -184,7 +184,14 @@ void set_zero_subnormals(bool iszero)

initialize::initialize(int &argc, char **&argv) {
#ifdef HAVE_MPI
#ifdef _OPENMP
int provided;
MPI_Init_thread(&argc, &argv, MPI_THREAD_FUNNELED, &provided);
if (provided < MPI_THREAD_FUNNELED && omp_get_num_threads() > 1)
abort("MPI does not support multi-threaded execution");
#else
MPI_Init(&argc, &argv);
#endif
int major, minor;
MPI_Get_version(&major, &minor);
if (verbosity > 0)
@@ -214,6 +221,8 @@ initialize::~initialize() {
double wall_time(void) {
#ifdef HAVE_MPI
return MPI_Wtime();
#elif defined(_OPENMP)
return omp_get_wtime();
#elif HAVE_GETTIMEOFDAY
struct timeval tv;
gettimeofday(&tv, 0);
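The changes above request `MPI_THREAD_FUNNELED`, the threading level at which OpenMP worker threads may exist as long as only the main thread makes MPI calls, and fall back to `omp_get_wtime()` for wall-clock timing when MPI is absent. A minimal standalone hybrid MPI + OpenMP sketch of the same initialization pattern (not Meep code):

```cpp
// Standalone sketch: request MPI_THREAD_FUNNELED and verify the level the
// MPI library actually provides before running OpenMP threads.
#include <cstdio>
#include <mpi.h>
#include <omp.h>

int main(int argc, char **argv) {
  int provided;
  MPI_Init_thread(&argc, &argv, MPI_THREAD_FUNNELED, &provided);
  if (provided < MPI_THREAD_FUNNELED) {
    std::fprintf(stderr, "MPI library lacks MPI_THREAD_FUNNELED support\n");
    MPI_Abort(MPI_COMM_WORLD, 1);
  }

  int rank;
  MPI_Comm_rank(MPI_COMM_WORLD, &rank);

  double t0 = omp_get_wtime(); // OpenMP wall clock, as in the new wall_time()
#pragma omp parallel
  {
    // compute-only region: no MPI calls from worker threads (FUNNELED)
  }
  if (rank == 0)
    std::printf("parallel region took %g s\n", omp_get_wtime() - t0);

  MPI_Finalize();
  return 0;
}
```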
1 change: 1 addition & 0 deletions src/step.cpp
@@ -141,6 +141,7 @@ void fields::step() {
void fields::phase_material() {
bool changed = false;
if (is_phasing()) {
CHUNK_OPENMP
for (int i = 0; i < num_chunks; i++)
if (chunks[i]->is_mine()) {
chunks[i]->phase_material(phasein_time);
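`CHUNK_OPENMP` (defined as `_Pragma("omp parallel for")` in vec.hpp above) distributes the loop over chunks across threads; this is safe because each `phase_material` call touches only its own chunk's data. A simplified standalone sketch of the same pattern, using a hypothetical `toy_chunk` type in place of a Meep `fields_chunk`:

```cpp
// Standalone sketch: independent per-chunk updates parallelized with a plain
// "omp parallel for", mirroring the CHUNK_OPENMP usage in phase_material().
#include <vector>

struct toy_chunk {            // stand-in for a fields_chunk (hypothetical)
  std::vector<double> f;
  void phase_material(int t) {
    for (double &x : f) x *= 1.0 + 1e-6 * t; // chunk-local update only
  }
};

int main() {
  std::vector<toy_chunk> chunks(8, toy_chunk{std::vector<double>(1000, 1.0)});
#pragma omp parallel for
  for (int i = 0; i < (int)chunks.size(); i++)
    chunks[i].phase_material(10); // each iteration writes only chunks[i]
  return 0;
}
```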